In [15]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import textwrap

In [16]:
df_resumes = pd.read_csv('../PreProcessingResumes/processed_data/Resume.csv')
df_resumes.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,hr administrator marketing associate hr admini...,HR
1,22323967,hr specialist us hr operations summary versati...,HR
2,33176873,hr director summary over years experience in r...,HR
3,27018550,hr specialist summary dedicated driven and dyn...,HR
4,17812897,hr manager skill highlights hr skills hr depar...,HR


In [17]:
df_resumes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2483 entries, 0 to 2482
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2483 non-null   int64 
 1   Resume_str  2483 non-null   object
 2   Category    2483 non-null   object
dtypes: int64(1), object(2)
memory usage: 58.3+ KB


In [18]:
df_jobs = pd.read_csv('../PreProcessingJobs/processed_data/JobDescription.csv')
df_jobs.head()

Unnamed: 0,job_id,title,description
0,921716,Marketing Coordinator,job descriptiona leading real estate firm in n...
1,1829192,Mental Health Therapist/Counselor,at aspen therapy and wellness we are committed...
2,10998357,Assitant Restaurant Manager,the national exemplar is accepting application...
3,23221523,Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trusts and...
4,91700727,Economic Development and Planning Intern,job summary the economic development planning ...


In [19]:
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122545 entries, 0 to 122544
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   job_id       122545 non-null  int64 
 1   title        122545 non-null  object
 2   description  122545 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.8+ MB


### Resume & Job Description Matching System

In [20]:
# Convert resumes column to a list
resumes = df_resumes["Resume_str"].tolist()
    
# Encode category labels as integers
encoder = LabelEncoder()
labels = encoder.fit_transform(df_resumes["Category"])
category_names = encoder.classes_.tolist()

In [21]:
# Load the SBERT model for generating sentence embeddings
sbert_model = SentenceTransformer('all-MiniLM-L12-v2')

In [22]:
resumes_embed = sbert_model.encode(resumes, show_progress_bar=True)

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

### Test

In [23]:
# Select a random job posting from the DataFrame
job_sample = df_jobs.sample(n=1).iloc[0]

job_desc = job_sample['description']

In [24]:
# Uncomment this block to select a specific job posting by job_id

# Select the job with the given job_id
# job_sample = df_jobs[df_jobs['job_id'] == 3901356495].iloc[0]  

# job_desc = job_sample['description']  

In [25]:
# Generate the embedding for the job description using the sentence-transformers model
job_embed = sbert_model.encode([job_desc], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
# Compute cosine similarity between the job description and each resume, returning a flat array of similarity scores
similarity_vector = cosine_similarity(job_embed, resumes_embed).flatten()

In [27]:
top_n = 5  # Number of top matches to retrieve

# Get indices of top_n resumes with highest similarity scores
top_matches = similarity_vector.argsort()[::-1][:top_n]

results = []
for rank, cv_idx in enumerate(top_matches, start=1):
    results.append({
        'rank':             rank,
        'job_id':           job_sample['job_id'],
        'job_title':        job_sample['title'],
        'cv_id':            df_resumes.iloc[cv_idx]['ID'],      
        'cv_category':      df_resumes.iloc[cv_idx]['Category'],
        'similarity_score': similarity_vector[cv_idx] 
    })

df_matches = pd.DataFrame(results)
df_matches

Unnamed: 0,rank,job_id,job_title,cv_id,cv_category,similarity_score
0,1,3898178264,Data Protection Senior Associate,85766635,HR,0.557325
1,2,3898178264,Data Protection Senior Associate,84512719,BUSINESS-DEVELOPMENT,0.542097
2,3,3898178264,Data Protection Senior Associate,35325329,INFORMATION-TECHNOLOGY,0.530056
3,4,3898178264,Data Protection Senior Associate,18072085,ACCOUNTANT,0.525723
4,5,3898178264,Data Protection Senior Associate,13115648,TEACHER,0.519477


In [28]:
description = job_sample['description']
print("\n".join(textwrap.wrap(description, width=140)))

ey focuses on high ethical standards and integrity among its employees and expects all candidates to demonstrate these qualities at ey you
ll have the chance to build a career as unique as you are with the global scale support inclusive culture and technology to become the best
version of you and we re counting on your unique voice and perspective to help ey become even better too join us and build an exceptional
experience for yourself and a better working world for all location flexible anywhere in the us data protection and privacy senior associate
ethics compliance and risk management ecrm supports our people in managing the risks that arise during our daily working lives we work
closely with all parts of the organization to identify manage and monitor risk providing coordinated advice and assistance on independence
conflicts compliance regulatory policy security issues as well as dealing with claims and any queries regarding ethics the opportunity we
are operating in an increasing