## Importing libraries

Run the `1_analysis_preprocesing` notebook before this one; it creates the processed data file that is loaded below.

You can create multiple variants of the data by adding or removing preprocessing steps, depending on the experiment.

In [1]:
import pandas as pd
import numpy as np

### Load data

Load data from the processed files. 

In [3]:
# Alternative input file (disabled):
# df = pd.read_csv('../data/processed/content.csv')
# Load the final checkpoint CSV from the processed-data directory.
df = pd.read_csv('../data/processed/6_checkpoint_final_data.csv')

In [4]:
# List the available columns before selecting the ones used below.
df.columns

Index(['career_objective', 'skills', 'educational_institution_name',
       'degree_names', 'passing_years', 'educational_results', 'result_types',
       'major_field_of_studies', 'professional_company_names', 'start_dates',
       'end_dates', 'related_skils_in_job', 'positions', 'responsibilities',
       'extra_curricular_activity_types',
       'extra_curricular_organization_names', 'role_positions', 'languages',
       'proficiency_levels', 'certification_providers', 'certification_skills',
       'job_position_name', 'educational_requirements',
       'experiencere_requirement', 'age_requirement', 'skills_required',
       'matched_score', 'resume_id', 'resume_text', 'job_id', 'jobs_text',
       'total_experience', 'min_experience_required', 'experience_match',
       'skills_matching_ratio', 'matching_words_count',
       'Resume_Cluster_KMeans'],
      dtype='object')

In [5]:
# Keep only the ground-truth score plus the id and text of each resume/job pair.
full_data = df.loc[:, ['matched_score', 'resume_id', 'resume_text', 'job_id', 'jobs_text']]

In [6]:
# Display the selected columns (one row per resume/job pair).
full_data

Unnamed: 0,matched_score,resume_id,resume_text,job_id,jobs_text
0,0.850000,1,big data analytics working database warehouse ...,1,senior software engineer b.sc computer science...
1,0.750000,2,fresher looking join data analyst junior data ...,2,machine learning (ml) engineer m.sc computer s...
2,0.416667,3,"['software development', 'machine learning', '...",3,"executive/ senior executive trade marketing, h..."
3,0.760000,4,obtain position fastpaced business office envi...,4,business development executive bachelor/honors...
4,0.650000,5,professional accountant outstanding work ethic...,5,senior io engineer bachelor science (bsc) comp...
...,...,...,...,...,...
9539,0.683333,217,"['mathematical modelling', 'machine learning',...",21,data engineer bachelor science (bsc) data plat...
9540,0.650000,24,expertise eda modeler. like learn data contain...,13,executive/ sr. executive bachelor science (bsc...
9541,0.650000,235,looking role related application development m...,16,executive vat bba accounting finance mushak fo...
9542,0.650000,267,"['machine learning', 'natural language process...",9,asst. manager/ manger (administrative) bachelo...


In [7]:
# Split the paired table into a resume-side and a job-side view (id + text each).
resumes = full_data.loc[:, ["resume_id", "resume_text"]]
jobs = full_data.loc[:, ["job_id", "jobs_text"]]

In [8]:
# Pull out just the text columns as Series; the original row index is kept.
resume_strings = resumes.loc[:, "resume_text"]
job_strings = jobs.loc[:, "jobs_text"]

In [9]:
# Each text appears once per resume/job pairing, so keep only the first
# occurrence of every distinct resume text and job text.
# NOTE(review): de-duplication is done on the text, not on the id. Later cells
# treat position i of these Series as resume_id / job_id i + 1, which holds only
# if first-occurrence order equals id order and every id has a distinct text
# — TODO confirm against the data.
resume_strings = resume_strings.drop_duplicates()
job_strings = job_strings.drop_duplicates()

### Using tfidf to vectorize resume and job strings:

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Default settings: no custom tokenizer, n-gram range, or stop-word list is passed.
tfidf = TfidfVectorizer()

In [12]:
# Learn the vocabulary and IDF weights from the resume texts, then reuse the
# same fitted vectorizer for the jobs so both matrices share one feature space.
# NOTE(review): because fitting uses resumes only, terms that occur solely in
# job texts are absent from the vocabulary and dropped by transform()
# — confirm this is intended.
resume_tfidf = tfidf.fit_transform(resume_strings.values)
job_tfidf = tfidf.transform(job_strings.values)

# Both shapes are (number of unique texts, vocabulary size).
print(resume_tfidf.shape)
print(job_tfidf.shape)

(344, 4952)
(28, 4952)


The TF-IDF vocabulary, fitted on the resume texts, contains 4,952 terms.

### Using Cosine Similarity to recommend best jobs for a resume:

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity: one row per unique resume text, one column per unique job text.
sim_matrix = cosine_similarity(resume_tfidf, job_tfidf)

In [14]:
# Inspect the similarity values (NumPy abbreviates the printed array).
sim_matrix

array([[0.00860245, 0.06484749, 0.        , ..., 0.01361648, 0.0356808 ,
        0.00658609],
       [0.07416246, 0.12443126, 0.01252075, ..., 0.03407852, 0.02458509,
        0.00327719],
       [0.06435247, 0.06346508, 0.        , ..., 0.        , 0.        ,
        0.00740992],
       ...,
       [0.02351784, 0.01843194, 0.04668449, ..., 0.20131289, 0.06628729,
        0.05270651],
       [0.0951178 , 0.13746047, 0.        , ..., 0.01088929, 0.        ,
        0.00508909],
       [0.0265125 , 0.04179   , 0.00427706, ..., 0.00531457, 0.01679644,
        0.00447793]])

In [15]:
top_n = 3

# For the first five rows of sim_matrix, rank the columns by descending
# similarity and keep the best `top_n`. Adding 1 turns 0-based positions into
# the 1-based numbers reported as ids.
# NOTE(review): assumes row/column position + 1 equals resume_id / job_id — TODO confirm.
recommendations = [
    {
        "resume_id": row + 1,
        "top_recommendations": [col + 1 for col in sim_matrix[row].argsort()[::-1][:top_n]],
    }
    for row in range(5)
]

In [16]:
# Show the sample recommendations as a small table (plain-text DataFrame repr).
print(f"Sample Resumes and Their Top-3 Job Recommendations:\n{pd.DataFrame(recommendations)}")

Sample Resumes and Their Top-3 Job Recommendations:
   resume_id top_recommendations
0          1        [21, 20, 10]
1          2         [21, 2, 25]
2          3         [9, 25, 21]
3          4         [9, 12, 25]
4          5        [16, 22, 17]


In [17]:
# Number of unique resume texts, i.e. the number of rows in the TF-IDF matrix.
resume_strings.shape[0]

344

## Custom evaluation metrics

We have created custom evaluation metrics that measure what fraction of our top-n recommendations also appear in the top-n jobs of the original dataset, ranked by matched score.

In [18]:
def generate_weights(n):
    """
    Build `n` rank weights that fall linearly from 10 down to 5.

    n: number of weights to produce
    Returns a list of floats rounded to 2 decimals, highest weight first.
    A single weight is [10.0]; any n that is not greater than 1 (and not
    equal to 1) yields an empty list.
    """
    if n == 1:
        return [10.0]
    if not n > 1:
        return []

    # Walk the steps from the top (n - 1) down to 0 so the list is already descending.
    span = n - 1
    return [round(5 + (10 - 5) * (step / span), 2) for step in range(n - 1, -1, -1)]

In [19]:
def custom_metric(full_data, top_n, sim_matrix, resume_ids=None, job_ids=None):
    """
    Compare top-N similarity-based recommendations with the ground-truth top-N.

    For every row of `sim_matrix` the `top_n` highest-scoring columns are taken
    as the predicted jobs. They are compared with the `top_n` jobs that have the
    highest 'matched_score' for the same resume in `full_data`.

    full_data: DataFrame with 'resume_id', 'job_id', and 'matched_score' columns
    top_n: Number of top jobs to consider (must be >= 1)
    sim_matrix: 2-D similarity matrix, one row per resume and one column per job
    resume_ids: Optional sequence with the resume_id of each sim_matrix row.
        Defaults to 1..n_rows, i.e. row i is treated as resume_id i + 1.
        When the matrix was built from de-duplicated texts, the real ids can be
        passed instead, e.g. full_data.loc[resume_strings.index, "resume_id"].
    job_ids: Optional sequence with the job_id of each sim_matrix column.
        Defaults to 1..n_cols, i.e. column j is treated as job_id j + 1.

    Returns a list with one dict per sim_matrix row holding 'resume_id',
    'predicted', 'golden', 'weighted_score' (rank-weighted overlap, normalized
    to [0, 1] and rounded to 4 decimals), 'simple_score' (overlap / top_n) and
    'weights'.

    Raises ValueError if top_n < 1 or if resume_ids / job_ids do not match the
    shape of sim_matrix.
    """
    # top_n <= 0 would otherwise surface as an opaque ZeroDivisionError below.
    if top_n < 1:
        raise ValueError(f"top_n must be >= 1, got {top_n!r}")

    n_resumes, n_jobs = sim_matrix.shape

    # Map matrix positions to ids; the defaults reproduce the position + 1 convention.
    if resume_ids is None:
        resume_ids = list(range(1, n_resumes + 1))
    else:
        resume_ids = np.asarray(resume_ids).tolist()
    if job_ids is None:
        job_ids = list(range(1, n_jobs + 1))
    else:
        job_ids = np.asarray(job_ids).tolist()

    if len(resume_ids) != n_resumes:
        raise ValueError(
            f"resume_ids has {len(resume_ids)} entries but sim_matrix has {n_resumes} rows"
        )
    if len(job_ids) != n_jobs:
        raise ValueError(
            f"job_ids has {len(job_ids)} entries but sim_matrix has {n_jobs} columns"
        )

    # Descending rank weights from 10 to 5: a hit at rank 1 counts more than one at rank N.
    weights = generate_weights(top_n)
    weights_sum = sum(weights)

    results = []
    for row, resume_id in enumerate(resume_ids):
        # Column positions of the top-N most similar jobs, best first, mapped to job ids.
        top_positions = sim_matrix[row].argsort()[::-1][:top_n]
        top_jobs = [job_ids[pos] for pos in top_positions]

        # Ground-truth top-N job ids for this resume, ordered by matched_score.
        golden_top_jobs = (
            full_data[full_data["resume_id"] == resume_id]
            .sort_values(by="matched_score", ascending=False)[:top_n]["job_id"]
            .tolist()
        )
        golden_set = set(golden_top_jobs)

        # Fraction of the N slots that hit the ground truth, ignoring rank.
        simple_score = len(set(top_jobs) & golden_set) / top_n

        # Rank-aware score: each hit earns the weight of the rank it was predicted at.
        weighted_score = 0.0
        for rank, pred_job in enumerate(top_jobs):
            if pred_job in golden_set:
                weighted_score += weights[rank]
        normalized_score = weighted_score / weights_sum

        results.append({
            "resume_id": resume_id,
            "predicted": top_jobs,
            "golden": golden_top_jobs,
            "weighted_score": round(normalized_score, 4),
            "simple_score": simple_score,
            "weights": weights
        })

    return results


In [20]:
# Score the top-10 recommendations for every row of sim_matrix.
results = custom_metric(full_data, 10, sim_matrix)

In [21]:
# Turn the list of per-resume dicts into a DataFrame for display and aggregation.
# Note: this rebinds `results` from a list to a DataFrame.
results = pd.DataFrame(results)
results

Unnamed: 0,resume_id,predicted,golden,weighted_score,simple_score,weights
0,1,"[21, 20, 10, 2, 6, 25, 15, 27, 12, 11]","[1, 15, 25, 13, 2, 6, 14, 18, 10, 23]",0.5185,0.5,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
1,2,"[21, 2, 25, 1, 24, 6, 12, 15, 26, 27]","[2, 10, 25, 6, 15, 14, 11, 9, 20, 18]",0.4221,0.4,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
2,3,"[9, 25, 21, 1, 2, 20, 23, 13, 6, 5]","[23, 2, 14, 1, 6, 15, 25, 20, 5, 11]",0.6667,0.7,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
3,4,"[9, 12, 25, 24, 21, 6, 18, 8, 22, 19]","[9, 17, 22, 10, 16, 26, 3, 12, 19, 4]",0.4000,0.4,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
4,5,"[16, 22, 17, 24, 9, 12, 25, 19, 6, 8]","[22, 9, 17, 16, 26, 11, 3, 10, 14, 15]",0.4815,0.4,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
...,...,...,...,...,...,...
339,340,"[21, 2, 6, 15, 25, 1, 11, 13, 10, 20]","[10, 15, 23, 25, 6, 3, 11, 14, 16, 2]",0.6223,0.6,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
340,341,"[9, 28, 18, 7, 11, 13, 1, 8, 12, 10]","[2, 9, 7, 8, 11, 10, 18, 14, 20, 24]",0.6148,0.6,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
341,342,"[26, 19, 9, 13, 16, 18, 12, 27, 11, 28]","[19, 3, 17, 9, 26, 22, 18, 23, 16, 11]",0.6519,0.6,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."
342,343,"[21, 25, 2, 6, 15, 1, 20, 24, 12, 5]","[15, 23, 10, 2, 1, 14, 11, 6, 25, 5]",0.6221,0.6,"[10.0, 9.44, 8.89, 8.33, 7.78, 7.22, 6.67, 6.1..."


In [22]:
# Average rank-weighted score over all evaluated resumes.
sum(results["weighted_score"]) / len(results)

0.5304706395348832

In [23]:
# Average plain overlap score over all evaluated resumes.
sum(results["simple_score"]) / len(results)

0.5113372093023254

With the content-based system, on average about 51% of the top-10 recommendations match the original top-10 job matches for a resume (simple score 0.511); the rank-weighted score is slightly higher at 0.530.