In [None]:
# Install the required package
!pip install scikit-surprise
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, SVD # Now this line should work
from surprise.model_selection import train_test_split
from surprise import accuracy

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357296 sha256=66dee057a1a670cada494ce159ed3c4c2b38b46fd15d23717d251d13310ead52
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [None]:
# Toy user profiles: one row per user, with a comma-separated skills string,
# years of experience and a home location.
users_data = pd.DataFrame(
    [
        (1, 'Python, Machine Learning, SQL', 3, 'New York'),
        (2, 'Java, Spring, Hibernate', 5, 'San Francisco'),
        (3, 'Python, Data Analysis, Excel', 2, 'Los Angeles'),
    ],
    columns=['user_id', 'skills', 'experience_years', 'location'],
)
# Toy job postings: one row per job, with a title, a comma-separated list of
# required skills, the minimum years of experience and the job location.
jobs_data = pd.DataFrame(
    [
        (101, 'Data Scientist', 'Python, SQL, Machine Learning', 2, 'New York'),
        (102, 'Backend Developer', 'Java, Spring, Hibernate', 4, 'San Francisco'),
        (103, 'Data Analyst', 'Python, Excel, Data Analysis', 1, 'Los Angeles'),
    ],
    columns=['job_id', 'job_title', 'job_skills', 'experience_required', 'location'],
)

**Step 3: Content-Based Filtering (Skills Matching)**

**Text Vectorization:**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize user and job skills.
#
# Skills are stored as comma-separated phrases, so each phrase is treated as
# one token. The default TfidfVectorizer token pattern would instead split
# "Machine Learning" into two unrelated words and silently drop one-character
# skills such as "C" or "R" (it only keeps tokens of two or more word
# characters).
def split_skills(skills_text):
    """Split a comma-separated skills string into a list of trimmed skill names.

    Empty entries (e.g. from a trailing comma) are discarded.
    """
    return [skill.strip() for skill in skills_text.split(',') if skill.strip()]


# token_pattern=None because the custom tokenizer replaces it; the default
# lowercase=True still applies, so matching is case-insensitive.
vectorizer = TfidfVectorizer(tokenizer=split_skills, token_pattern=None)
# Fit the vocabulary on job skills, then project both jobs and users onto it.
# User skills that appear in no job posting are ignored by transform().
job_skills_matrix = vectorizer.fit_transform(jobs_data['job_skills'])
user_skills_matrix = vectorizer.transform(users_data['skills'])

**Cosine Similarity:**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity of the TF-IDF skill vectors:
# one row per user, one column per job.
skills_similarity = cosine_similarity(
    user_skills_matrix,
    job_skills_matrix,
)
# Display the user-by-job similarity matrix
skills_similarity

array([[1.        , 0.        , 0.16163636],
       [0.        , 1.        , 0.        ],
       [0.16163636, 0.        , 1.        ]])

**Step 4: Adding Experience and Location Matching**

**Experience Matching:**

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Normalize experience onto one shared [0, 1] scale.
#
# The scaler is fitted on the user AND job experience values together. Fitting
# on users alone maps any job requirement outside the users' range to a value
# outside [0, 1] (e.g. a 1-year requirement became -0.333), which in turn
# pushed the similarity below out of [0, 1] and produced negative scores.
scaler = MinMaxScaler()
user_experience = users_data[['experience_years']].to_numpy(dtype=float)
job_experience = jobs_data[['experience_required']].to_numpy(dtype=float)
# Plain arrays are used for fit/transform so the differing column names of the
# two frames do not matter.
scaler.fit(np.vstack([user_experience, job_experience]))
users_data['experience_years_normalized'] = scaler.transform(user_experience).ravel()
jobs_data['experience_required_normalized'] = scaler.transform(job_experience).ravel()
# Experience similarity = 1 - absolute difference of the normalized values.
# Broadcasting a column of users against a row of jobs gives a
# (n_users, n_jobs) matrix whose entries all lie in [0, 1].
experience_similarity = 1 - np.abs(
    users_data['experience_years_normalized'].values[:, None]
    - jobs_data['experience_required_normalized'].values[None, :]
)

**Location Matching:**

In [None]:
# Exact-match location indicator: entry [u, j] is 1 when user u and job j are
# in the same location, otherwise 0.
user_locations = users_data['location'].tolist()
job_locations = jobs_data['location'].tolist()
location_similarity = np.array([
    [int(user_location == job_location) for job_location in job_locations]
    for user_location in user_locations
])

**Step 5: Combining Similarities**

In [None]:
# Relative importance of each signal (tune as needed):
#   alpha -> skills, beta -> experience, gamma -> location
alpha, beta, gamma = 0.5, 0.3, 0.2
# Scale each similarity matrix by its weight, then add them element-wise
weighted_skills = alpha * skills_similarity
weighted_experience = beta * experience_similarity
weighted_location = gamma * location_similarity
combined_similarity = weighted_skills + weighted_experience + weighted_location
# Display the blended user-by-job score matrix
combined_similarity

array([[ 9.00000000e-01,  2.00000000e-01,  1.80818179e-01],
       [ 3.33066907e-17,  9.00000000e-01, -1.00000000e-01],
       [ 3.80818179e-01,  1.00000000e-01,  9.00000000e-01]])

**Step 6: Generate Job Recommendations**

In [None]:
# Number of jobs to recommend per user
top_n = 3
# Sort each user's row ascending, flip it to descending, and keep the first
# top_n job positions.
ranked_job_positions = np.argsort(combined_similarity, axis=1)[:, ::-1]
recommendations = ranked_job_positions[:, :top_n]

# Print each user's recommended jobs, best match first
for user, recommended_jobs in zip(users_data['user_id'], recommendations):
    print(f"Top {top_n} recommendations for user {user}:")
    print(jobs_data.iloc[recommended_jobs][['job_id', 'job_title']])


Top 3 recommendations for user 1:
   job_id          job_title
0     101     Data Scientist
1     102  Backend Developer
2     103       Data Analyst
Top 3 recommendations for user 2:
   job_id          job_title
1     102  Backend Developer
0     101     Data Scientist
2     103       Data Analyst
Top 3 recommendations for user 3:
   job_id          job_title
2     103       Data Analyst
0     101     Data Scientist
1     102  Backend Developer


**Step 7: Collaborative Filtering (Matrix Factorization)**

**Create Interaction Data:**

In [None]:
# Toy interaction log: one (user_id, job_id, rating) record per interaction.
# The rating may be an implicit signal derived from clicks or applications.
interaction_data = pd.DataFrame(
    [
        (1, 101, 5),
        (1, 102, 3),
        (2, 102, 4),
        (2, 103, 5),
        (3, 101, 2),
    ],
    columns=['user_id', 'job_id', 'rating'],
)

**Matrix Factorization Using SVD:**

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
# Fixed seed so the train/test split and the SVD factor initialisation are the
# same on every run; without it the RMSE and all downstream predictions change
# each time the notebook is executed.
RANDOM_STATE = 42
# Define reader to load interaction data (ratings are on a 1-5 scale)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(interaction_data[['user_id', 'job_id', 'rating']], reader)
# Train-test split: hold out 20% of the interactions for evaluation
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)
# Use SVD for matrix factorization
model = SVD(random_state=RANDOM_STATE)
model.fit(trainset)
# Predict the held-out ratings and report the root-mean-square error
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 1.1032


1.103189895271453

**Generating Recommendations:**

In [None]:
# Estimate how a single user would rate a single job
user_id, job_id = 1, 103
predicted_rating = model.predict(user_id, job_id)
print(
    f"Predicted rating for user {user_id} on job {job_id}: {predicted_rating.est}"
)

# Rank all jobs for one user by the model's estimated rating
def recommend_jobs_for_user(user_id, model, jobs_data, n_recommendations=3):
    """Return the rows of ``jobs_data`` with the highest estimated ratings.

    Args:
        user_id: Identifier of the user, passed to ``model.predict``.
        model: Object whose ``predict(user_id, job_id)`` returns a value with
            an ``est`` attribute (the estimated rating).
        jobs_data: DataFrame with a ``job_id`` column; every job is scored.
        n_recommendations: Maximum number of rows to return.

    Returns:
        The selected rows of ``jobs_data`` ordered from highest to lowest
        estimated rating, keeping their original index labels.
    """
    estimated_ratings = []
    for candidate_job_id in jobs_data['job_id'].values:
        estimated_ratings.append(model.predict(user_id, candidate_job_id).est)
    # argsort is ascending, so reverse it to put the best-rated job first
    best_first = np.argsort(estimated_ratings)[::-1]
    return jobs_data.iloc[best_first[:n_recommendations]]

# Show the three jobs with the highest predicted rating for user 1
recommend_jobs_for_user(1, model, jobs_data, n_recommendations=3)


Predicted rating for user 1 on job 103: 4.250361115917192


Unnamed: 0,job_id,job_title,job_skills,experience_required,location,experience_required_normalized
0,101,Data Scientist,"Python, SQL, Machine Learning",2,New York,0.0
2,103,Data Analyst,"Python, Excel, Data Analysis",1,Los Angeles,-0.333333
1,102,Backend Developer,"Java, Spring, Hibernate",4,San Francisco,0.666667


**Step 8: Hybrid Approach**

In [None]:
# Combine content-based similarity with collaborative filtering
content_scores = combined_similarity[0]  # Content-based scores for user 1 (row 0)
# Raw collaborative estimates are on the reader's rating scale (1-5 here)
collab_ratings = np.array([model.predict(1, job_id).est for job_id in jobs_data['job_id']])
# Rescale the estimates to [0, 1] so they are comparable with the content
# scores; blending raw 1-5 ratings with ~[0, 1] similarities lets the
# collaborative term dominate far beyond its nominal weight.
rating_min, rating_max = reader.rating_scale
collab_scores = (collab_ratings - rating_min) / (rating_max - rating_min)
# Weighted combination
alpha = 0.6  # Weight for collaborative filtering (content gets 1 - alpha)
hybrid_scores = alpha * collab_scores + (1 - alpha) * content_scores
# Recommend top jobs based on hybrid scores (highest score first)
recommended_jobs = np.argsort(hybrid_scores)[::-1]
jobs_data.iloc[recommended_jobs]

Unnamed: 0,job_id,job_title,job_skills,experience_required,location,experience_required_normalized
0,101,Data Scientist,"Python, SQL, Machine Learning",2,New York,0.0
2,103,Data Analyst,"Python, Excel, Data Analysis",1,Los Angeles,-0.333333
1,102,Backend Developer,"Java, Spring, Hibernate",4,San Francisco,0.666667
