In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [None]:
# Load datasets 
resumes_directory = 'data/Resume Dataset/Resume/Resume_With_Skills.csv'
jobs_directory = 'data/Linkedin Job Postings (2023-2024)/cleaned_JD_with_skills.csv'
resumes = pd.read_csv(resumes_directory)
jobs = pd.read_csv(jobs_directory)

# print column names
print(resumes.columns)
print(jobs.columns)

Index(['ID', 'Resume_str', 'Resume_html', 'Category', 'Clean_Resume', 'skills',
       'soft_skills'],
      dtype='object')
Index(['title', 'description', 'skills_desc', 'combined_skills_desc',
       'Clean_JD', 'skills'],
      dtype='object')


In [12]:
resumes["text"] = (
    resumes["Clean_Resume"] + " " +
    resumes["skills"].fillna("") + " " +
    resumes["soft_skills"].fillna("")
)

jobs["text"] = (
    jobs["Clean_JD"] + " " +
    jobs["skills"].fillna("") + " " +
    jobs["skills_desc"].fillna("")
)

In [None]:
# Load pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
def get_embeddings_in_batches(text_list, batch_size=16):
    all_embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        all_embeddings.append(batch_embeddings)
    return np.vstack(all_embeddings)

In [None]:
resumes["text"] = resumes["text"].astype(str).fillna("")
jobs["text"] = jobs["text"].astype(str).fillna("")

In [17]:
# Compute embeddings
resume_embeddings = get_embeddings(resumes["text"].tolist())
job_embeddings = get_embeddings(jobs["text"].tolist())

# Rank Jobs

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def rank_jobs(resume_text, job_embeddings, jobs, top_n=10):
    # Embed the resume
    resume_embedding = get_embeddings([resume_text])
    
    # Compute cosine similarity
    similarities = cosine_similarity(resume_embedding, job_embeddings)
    
    # Get top N jobs
    top_indices = similarities.argsort()[0][-top_n:][::-1]
    top_scores = similarities[0, top_indices]
    
    return jobs.iloc[top_indices], top_scores

# Example resume of a teacher
resume_text = """
I am a teacher with 5 years of experience in a primary school. I have a passion for teaching and I love working with children. I have a bachelor's degree in education and I am certified to teach in the state of California. I have experience teaching math, science, and English. I am patient, caring, and dedicated to helping my students succeed.
"""
top_jobs, scores = rank_jobs(resume_text, job_embeddings, jobs)

# Display results
print("Top Job Matches:")
for job, score in zip(top_jobs["title"], scores):
    print(f"{job}: {score:.2f}")


Top Job Matches:
SAP Basis: 0.75
Contract Specialist: 0.73
Database Administrator Manager: 0.73
Technical Architect: 0.73
Technical Architect: 0.73
Technical Architect: 0.73
Scrum master: 0.73
High Ticket Enrollment Coach (Sales position): 0.73
Water Damage Restoration Labor Specialist: 0.72
Construction Specialist Labor - Paint Drywall Carpentry: 0.72
