In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, InputExample, evaluation, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# Load the job-description and resume tables (per the original note, provided by Person 1).
# The directory defaults to the original author's location but can be redirected
# by setting the CV_DATA_DIR environment variable, so the notebook is runnable on
# other machines without editing this cell.
DATA_DIR = os.environ.get('CV_DATA_DIR', '/Users/apsarasrinivasan/Downloads')

job_df = pd.read_csv(os.path.join(DATA_DIR, 'job_descriptions.csv'))
resume_df = pd.read_csv(os.path.join(DATA_DIR, 'resumes.csv'))

In [3]:
# Text columns that are merged into a single 'combined_text' field per row.
JOB_TEXT_COLUMNS = [
    'Company_Overview',
    'Responsibilities',
    'Required_Qualifications',
    'Preferred_Qualifications',
]
RESUME_TEXT_COLUMNS = [
    'Professional_Summary',
    'Work_Experience',
    'Projects',
    'Certifications',
    'Education',
    'Skills',
]


def _join_text_columns(frame, columns):
    """Return one string per row: the given columns joined by single spaces.

    Missing values are replaced with '' before joining, so a missing field
    contributes an empty token rather than raising.
    """
    return frame[columns].fillna('').agg(' '.join, axis=1)


# One combined document per job posting and per resume
job_df['combined_text'] = _join_text_columns(job_df, JOB_TEXT_COLUMNS)
resume_df['combined_text'] = _join_text_columns(resume_df, RESUME_TEXT_COLUMNS)

In [4]:
# Instantiate the pretrained Sentence-BERT encoder by its model name
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every combined resume / job document; results are requested as tensors
# NOTE(review): long documents may be truncated by the model's maximum sequence
# length — confirm whether that matters for full resumes / postings.
resume_embeddings = model.encode(
    sentences=resume_df['combined_text'].tolist(),
    convert_to_tensor=True,
)
job_embeddings = model.encode(
    sentences=job_df['combined_text'].tolist(),
    convert_to_tensor=True,
)

# Copy both tensors to the CPU and expose them as NumPy arrays for scikit-learn
resume_embeddings_np, job_embeddings_np = (
    tensor.cpu().numpy() for tensor in (resume_embeddings, job_embeddings)
)

Downloading .gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

Downloading model_O1.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

Downloading model_O2.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

Downloading model_O3.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

Downloading model_O4.onnx:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Downloading model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

Downloading (…)el_qint8_avx512.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

Downloading (…)nt8_avx512_vnni.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

Downloading model_quint8_avx2.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

Downloading openvino_model.bin:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

Downloading openvino_model.xml:   0%|          | 0.00/211k [00:00<?, ?B/s]

Downloading (…)_qint8_quantized.bin:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

Downloading (…)_qint8_quantized.xml:   0%|          | 0.00/368k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
# Pairwise cosine similarity: one row per resume embedding, one column per job embedding
similarity_matrix = cosine_similarity(
    X=resume_embeddings_np,
    Y=job_embeddings_np,
)

In [6]:
# Wrap the raw matrix in a labelled frame: rows are keyed by resume ID and
# columns by job ID, so similarity_df[job_id] yields that job's score per resume.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=resume_df['ID'],
    columns=job_df['ID'],
)

In [7]:
# Build training dataset (positive matches only).
# For every job, the TOP_K_POSITIVES resumes with the highest similarity score
# are paired with the job text and labelled 1.0.
# NOTE(review): the scores come from similarity_df, which earlier cells compute
# from this same model's embeddings, so these are pseudo-labels rather than
# human relevance judgments — confirm this is intended.
TOP_K_POSITIVES = 3

# One-time ID -> text lookups replace a full boolean-mask scan of the frame for
# every job and every selected resume (quadratic in the number of rows).
# drop_duplicates keeps the first row per ID, matching the previous `.values[0]`.
job_text_by_id = job_df.drop_duplicates('ID').set_index('ID')['combined_text'].to_dict()
resume_text_by_id = resume_df.drop_duplicates('ID').set_index('ID')['combined_text'].to_dict()

train_examples_all = []

for job_id in job_df['ID']:
    job_text = job_text_by_id[job_id]
    # nlargest selects the top-k without sorting the entire column
    top_resume_ids = similarity_df[job_id].nlargest(TOP_K_POSITIVES).index
    for resume_id in top_resume_ids:
        train_examples_all.append(
            InputExample(texts=[resume_text_by_id[resume_id], job_text], label=1.0)
        )

In [8]:
# Add random negative examples: one (resume, job) pair labelled 0.0 per positive.
#
# Differences from naive random sampling:
#   * a seeded generator makes the negative set reproducible across runs;
#   * pairs that already appear as positives are rejected, so the same text
#     pair never carries both a 1.0 and a 0.0 label;
#   * the list is rebuilt as positives + negatives, so re-running this cell
#     does not keep doubling the number of negatives.
NEGATIVE_SAMPLING_SEED = 42
negative_rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

positive_examples = [ex for ex in train_examples_all if ex.label == 1.0]
positive_pairs = {tuple(ex.texts) for ex in positive_examples}

candidate_resume_texts = resume_df['combined_text'].tolist()
candidate_job_texts = job_df['combined_text'].tolist()

n_negatives_wanted = len(positive_examples)
# Bound the rejection loop so a tiny dataset (where almost every pair is a
# positive) cannot spin forever.
max_attempts = 20 * n_negatives_wanted
attempts = 0

negative_examples = []
while len(negative_examples) < n_negatives_wanted and attempts < max_attempts:
    attempts += 1
    resume_text = candidate_resume_texts[negative_rng.integers(len(candidate_resume_texts))]
    job_text = candidate_job_texts[negative_rng.integers(len(candidate_job_texts))]
    if (resume_text, job_text) in positive_pairs:
        continue  # would contradict an existing positive label
    negative_examples.append(InputExample(texts=[resume_text, job_text], label=0.0))

if len(negative_examples) < n_negatives_wanted:
    print(
        f"Warning: sampled only {len(negative_examples)} of "
        f"{n_negatives_wanted} requested negative examples"
    )

train_examples_all = positive_examples + negative_examples

In [9]:
# Hold out a fixed fraction of the labelled pairs for evaluation; the seed
# keeps the partition identical from run to run.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

train_examples, test_examples = train_test_split(
    train_examples_all,
    random_state=SPLIT_SEED,
    test_size=TEST_FRACTION,
)

In [10]:
# Batches of training pairs, reshuffled each epoch
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

# Training objective: cosine-similarity loss bound to the model being fine-tuned
train_loss = losses.CosineSimilarityLoss(model=model)

In [11]:
# Fine-tune the model on the labelled (resume, job) pairs, then persist it.
# Hyperparameters are named so they can be tuned in one place.
NUM_EPOCHS = 2
WARMUP_STEPS = 100

# Output location defaults to the original author's directory but can be
# redirected with the CV_MODEL_DIR environment variable, so the notebook does
# not depend on one user's home folder.
MODEL_OUTPUT_DIR = os.environ.get('CV_MODEL_DIR', '/Users/apsarasrinivasan/fine_tuned_cv_model')

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
)

# Save the fine-tuned model
model.save(MODEL_OUTPUT_DIR)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1500 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1500 [00:00<?, ?it/s]

In [12]:
# Score the model on the held-out pairs.
# NOTE(review): the held-out labels come from the same labelled pool as the
# training pairs, so this measures agreement with those labels rather than
# with independent relevance judgments — interpret the score accordingly.
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    examples=test_examples,
)
model.evaluate(evaluator=evaluator)

np.float64(0.8561226942850587)

In [13]:
# Generate top-5 ranked resumes per job.
#
# The similarity scores are recomputed here from the model's *current* weights.
# The earlier `similarity_df` was built before fine-tuning, so ranking from it
# would ignore the fine-tuned model entirely; it is left untouched and a
# separate `similarity_ft_df` is used for the final rankings.
TOP_K_RANKED = 5
RANKING_COLUMNS = ['Job_ID', 'Resume_ID', 'Similarity_Score']

resume_embeddings_ft = model.encode(resume_df['combined_text'].tolist(), convert_to_numpy=True)
job_embeddings_ft = model.encode(job_df['combined_text'].tolist(), convert_to_numpy=True)

similarity_ft_df = pd.DataFrame(
    cosine_similarity(resume_embeddings_ft, job_embeddings_ft),
    index=resume_df['ID'],
    columns=job_df['ID'],
)

all_rankings = []

for job_id in job_df['ID']:
    # nlargest returns the k best-scoring resumes in descending order
    top_ranked = (
        similarity_ft_df[job_id]
        .nlargest(TOP_K_RANKED)
        .rename_axis('Resume_ID')
        .reset_index(name='Similarity_Score')
    )
    top_ranked['Job_ID'] = job_id
    all_rankings.append(top_ranked)

if all_rankings:
    combined_df = pd.concat(all_rankings, ignore_index=True)[RANKING_COLUMNS]
else:
    # pd.concat raises on an empty list; return an empty, correctly shaped frame instead
    combined_df = pd.DataFrame(columns=RANKING_COLUMNS)