In [103]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# # Download necessary NLP resources
# nltk.download('punkt')
# nltk.download('stopwords')


In [104]:

# 📌 Load the job postings dataset into a DataFrame.
# The path is relative to the notebook's working directory; replace
# 'Downloads/jobs.csv' with the actual file location if it differs.
df = pd.read_csv('Downloads/jobs.csv')

In [105]:
# Preview the first rows to check which columns were loaded.
df.head()


Unnamed: 0.1,Unnamed: 0,Job Salary,Job Experience Required,Key Skills,Role Category,Functional Area,Industry,Job Title
0,0,Not Disclosed by Recruiter,5 - 10 yrs,Media Planning| Digital Media,Advertising,"Marketing , Advertising , MR , PR , Media Plan...","Advertising, PR, MR, Event Management",Media Planning Executive/Manager
1,1,Not Disclosed by Recruiter,2 - 5 yrs,pre sales| closing| software knowledge| clien...,Retail Sales,"Sales , Retail , Business Development","IT-Software, Software Services",Sales Executive/Officer
2,2,Not Disclosed by Recruiter,0 - 1 yrs,Computer science| Fabrication| Quality check|...,R&D,"Engineering Design , R&D","Recruitment, Staffing",R&D Executive
3,3,"2,00,000 - 4,00,000 PA.",0 - 5 yrs,Technical Support,Admin/Maintenance/Security/Datawarehousing,"IT Software - Application Programming , Mainte...","IT-Software, Software Services",Technical Support Engineer
4,4,Not Disclosed by Recruiter,2 - 5 yrs,manual testing| test engineering| test cases|...,Programming & Design,IT Software - QA & Testing,"IT-Software, Software Services",Testing Engineer


In [106]:

# 📌 Data Preprocessing
# Only 'Job Title' and 'Key Skills' are kept for the recommender, so they are
# selected directly. Any other column (index artefacts, salary, ...) is dropped
# by this selection, which means cleaning it beforehand would be wasted work.
#
# The cell is written so that it can be re-run safely: it only needs the two
# text columns to exist in `df`, which is still true after the first run.
df = (
    df[['Job Title', 'Key Skills']]
    # Missing titles/skills become empty strings so the concatenation below
    # never produces NaN.
    .fillna('')
    # Guarantee a 0..n-1 index so row labels coincide with row positions in
    # any matrix computed from this frame.
    .reset_index(drop=True)
    # 'content' is the text used for vectorisation: title followed by skills.
    .assign(content=lambda jobs: jobs['Job Title'] + ' ' + jobs['Key Skills'])
)


In [107]:

# 🔹 (A) Content-Based Filtering (CBF)
# Turn each job's combined text into a TF-IDF vector, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['content'])

# Cosine similarity between every pair of jobs. Passing a single matrix
# compares it against itself, giving one row and one column per job.
cosine_sim = cosine_similarity(tfidf_matrix)

In [108]:

# Recommendation function using CBF
def recommend_jobs(job_title, top_n=5):
    """Return the jobs most similar to the first job whose title contains ``job_title``.

    Parameters
    ----------
    job_title : str
        Text to look for in the 'Job Title' column. It is matched literally
        (not as a regular expression) and case-insensitively.
    top_n : int, default 5
        Maximum number of similar jobs to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` with the columns 'Job Title',
        'Key Skills' and 'content', ordered from most to least similar.
        The matched job itself is never part of the result.

    Raises
    ------
    IndexError
        If no job title contains ``job_title``.
    """
    # regex=False: a title such as "C++" must be searched as plain text, not
    # compiled as a pattern.
    title_matches = df['Job Title'].str.contains(
        job_title, case=False, na=False, regex=False
    )

    # Work with row *positions*, because cosine_sim is a plain array that
    # knows nothing about the DataFrame's index labels.
    match_positions = np.flatnonzero(title_matches.to_numpy())
    if match_positions.size == 0:
        raise IndexError(f"No job title contains {job_title!r}")
    query_pos = int(match_positions[0])

    scores = np.asarray(cosine_sim[query_pos])
    # Stable sort keeps ties in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly instead of assuming it sorts first:
    # duplicates can tie with it, and an all-zero vector scores 0 with itself.
    ranked = ranked[ranked != query_pos][:max(top_n, 0)]

    return df.iloc[ranked][['Job Title', 'Key Skills', 'content']]


# # 🔹 Content-Based Filtering (CBF) function (without 'Job Salary')
# def recommend_jobs(job_title, top_n=5):
#     idx = df[df['Job Title'].str.contains(job_title, case=False, na=False)].index[0]
#     sim_scores = list(enumerate(cosine_sim[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
#     job_indices = [i[0] for i in sim_scores]
    
#     # Returning only the available columns
#     return df.iloc[job_indices][['Job Title', 'Key Skills']]



In [109]:

# 🔹 (B) Collaborative Filtering (CF)
# Placeholder user-job interactions: one entry per (user, job) pair with the
# rating that user gave the job.
ratings_dict = dict(
    user_id=[1, 1, 1, 2, 2, 3, 3, 4],
    job_id=[101, 102, 103, 101, 104, 102, 105, 103],
    rating=[5, 4, 3, 5, 2, 4, 3, 5],
)
ratings = pd.DataFrame.from_dict(ratings_dict)

# Surprise needs the rating bounds and the columns in (user, item, rating) order.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'job_id', 'rating']], reader)


In [110]:

# Use SVD for Matrix Factorization
# SVD initialises its factors randomly; a fixed seed makes the fitted model
# (and therefore every predicted rating) reproducible across kernel restarts.
model = SVD(random_state=42)

# 3-fold cross-validation; keep the returned metrics so they can be inspected
# instead of being thrown away.
cv_results = cross_validate(model, data, cv=3)

# Refit on every available rating for use in the prediction cells.
trainset = data.build_full_trainset()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22735534d10>

In [111]:
# Show which columns remain in df after preprocessing.
print(df.columns)


Index(['Job Title', 'Key Skills', 'content'], dtype='object')


In [112]:

# Predict job rating for a user
def predict_job_rating(user_id, job_id):
    """Return the model's estimated rating of ``job_id`` for ``user_id``.

    The value is the ``est`` attribute of the prediction produced by the
    module-level ``model``.
    """
    prediction = model.predict(user_id, job_id)
    return prediction.est

# 🔹 (C) Hybrid Filtering (Weighted Combination of CBF & CF)
def hybrid_recommendation(user_id, job_title, top_n=5, cf_weight=0.5, rating_scale=(1, 5)):
    """Re-rank content-based matches using the user's predicted ratings.

    Parameters
    ----------
    user_id :
        User identifier passed to ``predict_job_rating``.
    job_title : str
        Query passed to ``recommend_jobs``.
    top_n : int, default 5
        Number of content-based candidates to fetch and re-rank.
    cf_weight : float, default 0.5
        Share of the final score taken from the predicted rating; the
        content-based rank receives ``1 - cf_weight``.
    rating_scale : tuple, default (1, 5)
        (min, max) of the rating scale, used to bring predicted ratings onto
        the same 0..1 range as the content score.

    Returns
    -------
    pandas.DataFrame
        Columns 'Job Title', 'Key Skills' and 'Predicted Rating', sorted by
        the blended score, best first.
    """
    candidates = recommend_jobs(job_title, top_n=top_n)

    # Remember each candidate's own index label before renumbering: that label
    # identifies the job, whereas the renumbered index is only its rank.
    job_ids = list(candidates.index)
    cbf_jobs = candidates.reset_index(drop=True)

    cbf_jobs['Predicted Rating'] = pd.Series(
        [predict_job_rating(user_id, job_id) for job_id in job_ids],
        index=cbf_jobs.index,
        dtype='float64',
    )

    # Bring both signals onto 0..1 so the weights mean what they say.
    low, high = rating_scale
    span = (high - low) or 1
    cf_score = (cbf_jobs['Predicted Rating'] - low) / span

    # Candidates arrive ordered from most to least similar, so rank 0 must get
    # the highest content score (1.0) and the last rank the lowest (1/n).
    n_candidates = max(len(cbf_jobs), 1)
    content_score = 1 - cbf_jobs.index.to_series() / n_candidates

    cbf_jobs['Final Score'] = cf_weight * cf_score + (1 - cf_weight) * content_score

    # Stable sort: on equal scores the more similar job stays ahead.
    ordered = cbf_jobs.sort_values(by='Final Score', ascending=False, kind='stable')
    return ordered[['Job Title', 'Key Skills', 'Predicted Rating']]


In [113]:

# 🔹 Example Usage:
# Print the content-based matches for a sample job-title query.
print("📌 Content-Based Recommendations:")
print(recommend_jobs("Data Scientist"))

# Print the hybrid ranking of the same query for user 1.
print("\n📌 Hybrid Recommendations for User 1:")
print(hybrid_recommendation(1, "Data Scientist"))

📌 Content-Based Recommendations:
                                               Job Title  \
20127  Roles and responsibilities  Functional Respons...   
9593                                        Data Analyst   
4763   DESIGNATION  - Associate Manager- P&S DOMAIN K...   
14335         Associate/Senior Associate -(NonTechnical)   
25776                                       Data Analyst   

                                              Key Skills  \
20127  Project Management|Excel Powerpoint|Time Serie...   
9593    Machine Learning| Data Mining| Big Data| Data...   
4763   sales|rfis|rfps|presales|solution consulting|t...   
14335   Risk management| SAS| Consulting| Financial s...   
25776   Presentation Skills| Business Solutions| Proc...   

                                                 content  
20127  Roles and responsibilities  Functional Respons...  
9593   Data Analyst  Machine Learning| Data Mining| B...  
4763   DESIGNATION  - Associate Manager- P&S DOMAIN K...  
14335  As