In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sparse

# Data file locations.
# DATA_DIR is a machine-specific absolute path; it is defined once here so that
# running the notebook on another machine means editing a single constant.
DATA_DIR = 'C:/Users/ashua/Desktop/Inelligent Job Recomendation Engine/data'
resume_cleaned = f'{DATA_DIR}/resumes/resume_cleaned.csv'
resume_skill_features = f'{DATA_DIR}/Feature Engineering Data/resume_skill_features.csv'
job_skill_features = f'{DATA_DIR}/Feature Engineering Data/job_skill_features.csv'
postings_cleaned = f'{DATA_DIR}/Jobs/postings_cleaned.csv'
# Output CSV for the hybrid score matrix (relative to the working directory).
hybrid_model_scores = 'compatibility_score_matrix_final.csv'

# Weights used to blend the text-similarity score and the skill-match score.
text_weight = 0.6
skill_weight = 0.4

print("Setup complete. Constants and Imports defined.")

Setup complete. Constants and Imports defined.


In [2]:
# Load the cleaned resumes, the engineered skill features for resumes and jobs,
# and the job postings.
print("--- Loading Data & Preparing Hybrid Model Scoring---")
try:
    resume_df = pd.read_csv(resume_cleaned)
    resume_features_df = pd.read_csv(resume_skill_features)
    job_skill_features_df = pd.read_csv(job_skill_features)
    # Only the job id and the description text are read from the postings file.
    postings_df = pd.read_csv(postings_cleaned, usecols=['job_id','description'])
except FileNotFoundError as e:
    # The builtin is spelled FileNotFoundError; the previous lowercase spelling
    # raised NameError while matching the handler, so this message never printed.
    print(f"Error: {e}. Please check the file paths.")
    # Re-raise instead of exit(): the run stops here with a traceback, and the
    # kernel is not torn down.
    raise
print(f"Data Loaded Successfully. Resumes: {len(resume_features_df)}, Jobs: {len(postings_df)}")

--- Loading Data & Preparing Hybrid Model Scoring---
Data Loaded Successfully. Resumes: 2484, Jobs: 122124


In [3]:
# TF-IDF text vectorization and text similarity score.
# Missing text is replaced by the empty string so the vectorizer receives only str.
resume_text = resume_df['Resume_str'].fillna('')
posting_text = postings_df['description'].fillna('')

print("\n   Fiting TF-IDF Vectorizer...")
# Vocabulary is capped at 2000 terms; tokens are purely alphabetic words of at
# least three letters, with English stop words removed.
tfidf = TfidfVectorizer(max_features=2000, stop_words='english', token_pattern=r'\b[a-zA-Z]{3,}\b')

# Fit on all text (resumes + job postings) so both sides share one vocabulary
# and one set of IDF weights.
all_text = pd.concat([resume_text, posting_text])
tfidf.fit(all_text)

# Transform the texts into sparse TF-IDF vectors.
resume_tfidf = tfidf.transform(resume_text)
posting_tfidf = tfidf.transform(posting_text)
print(f"   TF-IDF Vectorization Complete. Resume TF-IDF shape: {resume_tfidf.shape}, Posting TF-IDF shape: {posting_tfidf.shape}")

# Calculate cosine similarity between every resume and every posting.
# The escape is "\n" (newline); the previous "/n" was printed literally.
print("\n calculating Text Similarity Scores...")

# Rows follow resume_df order, columns follow postings_df order.
# NOTE(review): the result is a full resumes x postings matrix, so memory use
# grows with the product of the two counts - confirm it fits before a full run.
text_similarity_matrix = cosine_similarity(resume_tfidf, posting_tfidf)
print(f'   Text Similarity Calculation Complete. Shape: {text_similarity_matrix.shape}')


   Fiting TF-IDF Vectorizer...
   TF-IDF Vectorization Complete. Resume TF-IDF shape: (2484, 2000), Posting TF-IDF shape: (122124, 2000)
/n calculating Text Similarity Scores...
   Text Similarity Calculation Complete. Shape: (2484, 122124)


In [4]:
# Skill match score.
# Attach the skill feature columns to every posting. The left join keeps every
# row of postings_df, so the columns of the resulting matrix line up with
# postings_df['job_id']. validate='many_to_one' makes the merge raise
# immediately if job_skill_features_df contains a duplicate job_id, which would
# otherwise silently multiply posting rows and break that alignment.
posting_df = postings_df.merge(
    job_skill_features_df, on='job_id', how='left', validate='many_to_one'
)
# Postings without a matching skill row get NaN from the left join; treat them
# as having no skills. Note that fillna(0) applies to every column of
# posting_df, including 'description'.
posting_df = posting_df.fillna(0)

# Skill columns are every column of job_skill_features_df except 'job_id'
# (the original note describes them as 35 binary indicators - TODO confirm).
# resume_features_df must expose the same column names.
skil_cols = job_skill_features_df.columns.drop(['job_id']).tolist()
resume_skills = resume_features_df[skil_cols].values
posting_skills = posting_df[skil_cols].values

# Calculate the skill match score (normalized dot product): entry (i, j) is the
# dot product of resume i's and posting j's skill vectors.
print("\n Calculating Skill Match Scores...")
skill_match_matrix = np.dot(resume_skills, posting_skills.T)

# Divide by the number of skill columns. This keeps scores in [0, 1] provided
# the skill features are 0/1 indicators - TODO confirm.
normalized_skill_match_matrix = skill_match_matrix / len(skil_cols)
print(f"   Skill Match Matrix Shape: {normalized_skill_match_matrix.shape}")


 Calculating Skill Match Scores...
   Skill Match Matrix Shape: (2484, 122124)


In [5]:
# Hybrid model scoring and saving.
# The f prefix is required for the weights to be interpolated; without it the
# placeholders were printed literally.
print(f"\n Calculating Hybrid Model Scores (Text weight = {text_weight}, Skill weight = {skill_weight})...")
# Combine the two score matrices element-wise using the configured weights.
# NOTE(review): text_similarity_matrix rows come from resume_df while the skill
# matrix rows come from resume_features_df; this assumes both frames list the
# same resumes in the same order - TODO confirm.
hybrid_score_matrix = (text_weight * text_similarity_matrix) + (skill_weight * normalized_skill_match_matrix)

# Final DataFrame for the hybrid scores: one row per resume ID, one column per job_id.
hybrid_scores_df = pd.DataFrame(hybrid_score_matrix, index=resume_df['ID'], columns=postings_df['job_id'])

# Save to CSV (path is relative to the current working directory).
hybrid_scores_df.to_csv(hybrid_model_scores)
print(f"\n Hybrid Model Scoring Complete. Scores saved to {hybrid_model_scores}")
print(f"   Hybrid Score Matrix Shape: {hybrid_scores_df.shape}")


 Calculating Hybrid Model Scores (Text weight = {text_weight}, Skill weight = {skill_weight})...

 Hybrid Model Scoring Complete. Scores saved to compatibility_score_matrix_final.csv
   Hybrid Score Matrix Shape: (2484, 122124)
