In [None]:
!pip install sentence-transformers

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Read the job-description and resume tables from disk.
job_df = pd.read_csv('/Users/apsarasrinivasan/Downloads/job_descriptions.csv')
resume_df = pd.read_csv('/Users/apsarasrinivasan/Downloads/resumes.csv')

# Free-text columns that are merged into one document per row.
job_text_columns = [
    'Company_Overview',
    'Responsibilities',
    'Required_Qualifications',
    'Preferred_Qualifications',
]
resume_text_columns = [
    'Professional_Summary',
    'Work_Experience',
    'Projects',
    'Certifications',
    'Education',
    'Skills',
]

# Missing cells become empty strings so the space-join never sees NaN.
job_text_parts = job_df[job_text_columns].fillna('')
job_df['combined_text'] = job_text_parts.agg(' '.join, axis=1)

resume_text_parts = resume_df[resume_text_columns].fillna('')
resume_df['combined_text'] = resume_text_parts.agg(' '.join, axis=1)

# Baseline encoder, loaded here only to produce embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every resume and every job description.
resume_documents = resume_df['combined_text'].tolist()
job_documents = job_df['combined_text'].tolist()
resume_embeddings = model.encode(resume_documents, convert_to_numpy=True)
job_embeddings = model.encode(job_documents, convert_to_numpy=True)

# Cosine similarity of every resume against every job:
# rows are labelled by resume ID, columns by job ID.
similarity_matrix = cosine_similarity(resume_embeddings, job_embeddings)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=resume_df['ID'],
    columns=job_df['ID'],
)

# Generate training pairs (positives): for every job, the resumes with the
# highest similarity scores are labelled as matches (label=1.0).
TOP_K_POSITIVES = 3  # number of top-scoring resumes taken per job

# Build the resume ID -> text lookup once instead of filtering resume_df with a
# boolean mask for every pair. keep='first' mirrors taking the first matching
# row when an ID is repeated.
resume_text_by_id = (
    resume_df.drop_duplicates(subset='ID', keep='first')
    .set_index('ID')['combined_text']
    .to_dict()
)

train_examples_all = []
# Walking ID and text together avoids a full scan of job_df per job.
# The resulting list order is unchanged (jobs in table order, resumes by
# descending score), so a seeded split of this list selects the same positions.
for job_id, job_text in zip(job_df['ID'], job_df['combined_text']):
    top_resume_ids = (
        similarity_df[job_id]
        .sort_values(ascending=False)
        .head(TOP_K_POSITIVES)
        .index
    )
    for resume_id in top_resume_ids:
        train_examples_all.append(
            InputExample(texts=[resume_text_by_id[resume_id], job_text], label=1.0)
        )

# Add random negatives: one label=0.0 pair per existing example, built from a
# randomly chosen resume and a randomly chosen job.
import random

NEGATIVE_SAMPLING_SEED = 42  # fixed so the same negatives are drawn on every run

negative_rng = random.Random(NEGATIVE_SAMPLING_SEED)
resume_pool = resume_df['combined_text'].tolist()
job_pool = job_df['combined_text'].tolist()

# A random pair can coincide with one already labelled as a match; adding it
# again with label 0.0 would give the same texts two contradictory labels.
positive_pairs = {
    tuple(example.texts)
    for example in train_examples_all
    if example.label == 1.0
}

n_negatives = len(train_examples_all)
# Bounded retries: on a tiny dataset where almost every pair is a positive the
# loop stops instead of spinning forever (fewer negatives are added then).
max_attempts = 20 * n_negatives
negative_examples = []
attempts = 0
while len(negative_examples) < n_negatives and attempts < max_attempts:
    attempts += 1
    candidate = (negative_rng.choice(resume_pool), negative_rng.choice(job_pool))
    if candidate in positive_pairs:
        continue
    negative_examples.append(InputExample(texts=list(candidate), label=0.0))

train_examples_all.extend(negative_examples)

# Hold out 20% of the labelled pairs for evaluation. The fixed random_state
# keeps the partition positions identical between runs.
# NOTE(review): a model fine-tuned elsewhere presumably used the train half of
# this same split - confirm before changing the seed, the ratio, or list order.
split_parts = train_test_split(
    train_examples_all,
    test_size=0.2,
    random_state=42,
)
train_examples, test_examples = split_parts

In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

# 1. Load the fine-tuned model
model = SentenceTransformer('/Users/apsarasrinivasan/fine_tuned_cv_model')

# 2. Use the test_examples produced by the train/test split.
# Each `InputExample` in test_examples has: texts=[resume, job], label=1.0 or 0.0
if not test_examples:
    raise ValueError("test_examples is empty; run the cell that builds the train/test split first")

MATCH_THRESHOLD = 0.5  # cosine similarity at or above this is predicted as a match

# 3. Encode all resumes and all jobs in two batched calls instead of calling
# model.encode twice per example, then compute the cosine similarity of each
# (resume, job) row pair.
eval_resume_texts = [example.texts[0] for example in test_examples]
eval_job_texts = [example.texts[1] for example in test_examples]
eval_resume_vecs = model.encode(eval_resume_texts, convert_to_numpy=True)
eval_job_vecs = model.encode(eval_job_texts, convert_to_numpy=True)

resume_norms = np.linalg.norm(eval_resume_vecs, axis=1)
job_norms = np.linalg.norm(eval_job_vecs, axis=1)
# A zero vector gets similarity 0 rather than a division-by-zero NaN.
resume_norms[resume_norms == 0] = 1.0
job_norms[job_norms == 0] = 1.0
sim_scores = np.einsum('ij,ij->i', eval_resume_vecs, eval_job_vecs) / (resume_norms * job_norms)

y_true = [int(example.label) for example in test_examples]  # Convert float label to int
y_pred = (sim_scores >= MATCH_THRESHOLD).astype(int).tolist()  # Classification threshold

# 4. Print classification metrics
print(classification_report(y_true, y_pred, target_names=['No Match', 'Match']))

              precision    recall  f1-score   support

    No Match       0.97      0.98      0.97      2952
       Match       0.98      0.97      0.97      3048

    accuracy                           0.97      6000
   macro avg       0.97      0.97      0.97      6000
weighted avg       0.97      0.97      0.97      6000



In [None]:
# Environment diagnostic: print the path of the Python interpreter that is
# running this kernel.
import sys
print(sys.executable)

In [7]:
# Generate top-5 ranked resumes per job
def rank_top_resumes(similarity_df, job_ids, top_k=5):
    """Return the `top_k` highest-scoring resumes for each job as one table.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Similarity scores indexed by resume ID, with one column per job ID.
    job_ids : iterable
        Job IDs to rank; each must be a column of `similarity_df`.
    top_k : int, default 5
        Number of resumes kept per job.

    Returns
    -------
    pandas.DataFrame
        Columns `Job_ID`, `Resume_ID`, `Similarity_Score`; rows grouped by job
        in the order of `job_ids` and sorted by descending score within a job.
        Empty (with the same columns) when `job_ids` is empty.
    """
    output_columns = ['Job_ID', 'Resume_ID', 'Similarity_Score']
    per_job_tables = []
    for job_id in job_ids:
        top_rows = (
            similarity_df[job_id]
            .sort_values(ascending=False)
            .head(top_k)
            .rename_axis('Resume_ID')
            .reset_index(name='Similarity_Score')
        )
        top_rows.insert(0, 'Job_ID', job_id)
        per_job_tables.append(top_rows)

    # pd.concat raises on an empty list, so return an empty table explicitly.
    if not per_job_tables:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(per_job_tables, ignore_index=True)[output_columns]


combined_df = rank_top_resumes(similarity_df, job_df['ID'], top_k=5)
combined_df.head(5)

Unnamed: 0,Job_ID,Resume_ID,Similarity_Score
0,JD0001,R2941,0.646704
1,JD0001,R6887,0.635832
2,JD0001,R2796,0.601652
3,JD0001,R2937,0.592615
4,JD0001,R6312,0.589408


In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Location of the saved fine-tuned checkpoint; `model` is rebound to it here.
fine_tuned_model_dir = '/Users/apsarasrinivasan/fine_tuned_cv_model'
model = SentenceTransformer(fine_tuned_model_dir)

In [9]:
# Rebuild the `combined_text` column of both tables (resumes first, then jobs)
# by space-joining their free-text columns; missing cells count as ''.
resume_text_fields = [
    'Professional_Summary',
    'Work_Experience',
    'Projects',
    'Certifications',
    'Education',
    'Skills',
]
job_text_fields = [
    'Company_Overview',
    'Responsibilities',
    'Required_Qualifications',
    'Preferred_Qualifications',
]

for frame, text_fields in ((resume_df, resume_text_fields), (job_df, job_text_fields)):
    frame['combined_text'] = frame[text_fields].fillna('').agg(' '.join, axis=1)

# Re-embed both corpora with the model currently bound to `model`.
resume_corpus = resume_df['combined_text'].tolist()
job_corpus = job_df['combined_text'].tolist()

resume_embeddings = model.encode(resume_corpus, convert_to_tensor=False)
job_embeddings = model.encode(job_corpus, convert_to_tensor=False)

In [10]:
# Cosine similarity of every resume against every job, from the embeddings
# computed above; rows are labelled by resume ID, columns by job ID.
similarity_matrix = cosine_similarity(resume_embeddings, job_embeddings)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=resume_df['ID'],
    columns=job_df['ID'],
)

In [13]:
# For each job: sort its similarity column in descending order, label the
# columns, tag the rows with the job ID, and keep the five best resumes.
ranking_columns = ['Job_ID', 'Resume_ID', 'Similarity_Score']

all_rankings = [
    similarity_df[job_id]
    .sort_values(ascending=False)
    .reset_index()
    .set_axis(['Resume_ID', 'Similarity_Score'], axis=1)
    .assign(Job_ID=job_id)
    .head(5)
    for job_id in job_df['ID']
]

# Stack the per-job tables and put the job ID first.
combined_df = pd.concat(all_rankings, ignore_index=True).loc[:, ranking_columns]
combined_df.head()

Unnamed: 0,Job_ID,Resume_ID,Similarity_Score
0,JD0001,R6312,0.976339
1,JD0001,R4572,0.973102
2,JD0001,R2936,0.972355
3,JD0001,R5479,0.970639
4,JD0001,R1381,0.968446
