In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_recall_fscore_support, mean_absolute_error

# Load job dataset (Assume CSV with 'job_id', 'Job Salary', 'Job Experience Required', 'Key Skills', 'Job Title')
jobs = pd.read_csv('Downloads/jobs.csv')
jobs.head()

Unnamed: 0.1,Unnamed: 0,Job Salary,Job Experience Required,Key Skills,Role Category,Functional Area,Industry,Job Title
0,0,Not Disclosed by Recruiter,5 - 10 yrs,Media Planning| Digital Media,Advertising,"Marketing , Advertising , MR , PR , Media Plan...","Advertising, PR, MR, Event Management",Media Planning Executive/Manager
1,1,Not Disclosed by Recruiter,2 - 5 yrs,pre sales| closing| software knowledge| clien...,Retail Sales,"Sales , Retail , Business Development","IT-Software, Software Services",Sales Executive/Officer
2,2,Not Disclosed by Recruiter,0 - 1 yrs,Computer science| Fabrication| Quality check|...,R&D,"Engineering Design , R&D","Recruitment, Staffing",R&D Executive
3,3,"2,00,000 - 4,00,000 PA.",0 - 5 yrs,Technical Support,Admin/Maintenance/Security/Datawarehousing,"IT Software - Application Programming , Mainte...","IT-Software, Software Services",Technical Support Engineer
4,4,Not Disclosed by Recruiter,2 - 5 yrs,manual testing| test engineering| test cases|...,Programming & Design,IT Software - QA & Testing,"IT-Software, Software Services",Testing Engineer


In [57]:

# Generate random user-job interactions
num_users = 100
num_jobs = 50

data = {
    "user_id": np.random.randint(1, num_users+1, size=500),
    "job_id": np.random.randint(101, 101+num_jobs, size=500),
    "applied": np.random.choice([0, 1], size=500, p=[0.3, 0.7]),  # 70% applied
    "rating": np.random.randint(1, 6, size=500)  # Ratings between 1 and 5
}

interactions_df = pd.DataFrame(data)
interactions_df.to_csv("user_interactions.csv", index=False)

print("Dummy user_interactions.csv created!")


Dummy user_interactions.csv created!


In [39]:


# Add job_id column
jobs.insert(0, 'job_id', range(1, len(jobs) + 1))
print(jobs.columns)

Index(['job_id', 'Unnamed: 0', 'Job Salary', 'Job Experience Required',
       'Key Skills', 'Role Category', 'Functional Area', 'Industry',
       'Job Title'],
      dtype='object')


In [40]:

# Preprocessing
if 'Unnamed: 0' in jobs.columns:
    jobs.drop(columns=['Unnamed: 0'], inplace=True)
  # Drop unnecessary column
jobs.replace('Not Disclosed by Recruiter', np.nan, inplace=True)  # Handle missing values

# Process salary column
jobs['Job Salary'] = jobs['Job Salary'].str.replace(',', '', regex=True)
jobs['Job Salary'] = jobs['Job Salary'].str.extract(r'(\d+)', expand=False).astype(float)

# Process experience requirements
jobs[['Min Experience', 'Max Experience']] = jobs['Job Experience Required'].str.extract(r'(\d+)\s*-\s*(\d+)', expand=True).astype(float)
jobs.drop(columns=['Job Experience Required'], inplace=True)


# Combine title and key skills for content-based filtering
jobs['text'] = jobs['Job Title'].fillna('') + " " + jobs['Key Skills'].fillna('')

# Load user interactions (Assume CSV with 'user_id', 'job_id', 'applied', 'rating')
interactions = pd.read_csv('user_interactions.csv')


In [49]:

##### CONTENT-BASED FILTERING #####
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(jobs['text'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def content_based_recommend(job_id, top_n=5):
    idx = jobs[jobs['job_id'] == job_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    job_indices = [i[0] for i in sim_scores]
    return jobs.iloc[job_indices][['job_id', 'Job Title']]

In [50]:


##### COLLABORATIVE FILTERING #####
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(interactions[['user_id', 'job_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)
svd = SVD()
svd.fit(trainset)

def collaborative_recommend(user_id, job_ids, top_n=5):
    predictions = [svd.predict(user_id, job) for job in job_ids]
    predictions.sort(key=lambda x: x.est, reverse=True)
    return [pred.iid for pred in predictions[:top_n]]

In [53]:


##### HYBRID RECOMMENDATION #####
def hybrid_recommend(user_id, job_id, top_n=5):
    content_recs = content_based_recommend(job_id, top_n=top_n)['job_id'].tolist()
    collab_recs = collaborative_recommend(user_id, jobs['job_id'].tolist(), top_n=top_n)
    hybrid_recs = list(set(content_recs + collab_recs))[:top_n]
    return jobs[jobs['job_id'].isin(hybrid_recs)][['job_id', 'Job Title']]

In [55]:


##### EVALUATION METRICS #####
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

def evaluate_recommendations(user_id, top_n=5):
    true_jobs = interactions[(interactions['user_id'] == user_id) & (interactions['rating'] >= 4)]['job_id'].tolist()
    recommended_jobs = collaborative_recommend(user_id, jobs['job_id'].tolist(), top_n=top_n)
    
    if not recommended_jobs:
        return 0, 0, 0
    
    y_true = [1 if job in true_jobs else 0 for job in recommended_jobs]
    y_pred = [1] * len(recommended_jobs)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    return precision, recall, f1

##### TESTING #####
user_test = 1
job_test = jobs['job_id'].iloc[0]
print("Content-Based:")
print(content_based_recommend(job_test))
print("Collaborative:")
print(collaborative_recommend(user_test, jobs['job_id'].tolist()))
print("Hybrid:")
print(hybrid_recommend(user_test, job_test))

precision, recall, f1 = evaluate_recommendations(user_test)
print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"RMSE: {rmse}, MAE: {mae}")



RMSE: 1.4579
MAE:  1.2662
Content-Based:
       job_id                         Job Title
23113   23114  Media Planning Executive/Manager
18305   18306  Media Planning Executive/Manager
3151     3152  Media Planning Executive/Manager
9025     9026  Media Planning Executive/Manager
5376     5377  Media Planning Executive/Manager
Collaborative:
[124, 138, 136, 106, 118]
Hybrid:
       job_id                         Job Title
135       136                Restaurant Manager
5376     5377  Media Planning Executive/Manager
9025     9026  Media Planning Executive/Manager
18305   18306  Media Planning Executive/Manager
23113   23114  Media Planning Executive/Manager
Precision: 0.0, Recall: 0.0, F1-Score: 0.0
RMSE: 1.4579409937389216, MAE: 1.26620347965549
