In [49]:
# !pip install pandas
# !pip install scikit-learn

In [None]:
import random
import csv

# Skills pool
skills_list = [
    "Python", "Java", "C++", "JavaScript", "TypeScript", "React", "Angular", "Node.js",
    "Spring Boot", "Django", "FastAPI", "SQL", "MongoDB", "AWS", "Docker", "Kubernetes",
    "Machine Learning", "Deep Learning", "NLP", "Data Engineering"
]

num_users = 100
num_jobs = 250
applications_per_user = (1, 4)  # min, max number of jobs a user can apply to

# ----- Generate jobs.csv -----
with open("jobs.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["job_id", "skills", "description", "num_of_applies"])
    
    for job_id in range(101, 101 + num_jobs):
        job_skills = random.sample(skills_list, random.randint(3, 6))
        desc = f"{random.choice(job_skills)} Developer"
        num_applies = random.randint(5, 50)
        writer.writerow([job_id, ",".join(job_skills), desc, num_applies])

# ----- Generate users.csv -----
with open("users.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "skills"])
    
    for user_id in range(1, num_users + 1):
        user_skills = random.sample(skills_list, random.randint(3, 6))
        writer.writerow([user_id, ",".join(user_skills)])

# ----- Generate applications.csv -----
with open("applications.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "job_id"])
    
    for user_id in range(1, num_users + 1):
        applied_jobs = random.sample(range(101, 101 + num_jobs), random.randint(*applications_per_user))
        for job_id in applied_jobs:
            writer.writerow([user_id, job_id])


In [None]:
import pandas as pd


In [52]:
jobs = pd.read_csv("jobs.csv")
users = pd.read_csv("users.csv")
applies = pd.read_csv("applications.csv")

In [53]:
jobs.head()

Unnamed: 0,job_id,skills,description,num_of_applies
0,101,"Django,Angular,TypeScript,Docker,SQL",Django Developer,25
1,102,"Node.js,TypeScript,C++,Java",Java Developer,8
2,103,"FastAPI,Python,Deep Learning,Docker",Docker Developer,46
3,104,"Deep Learning,Data Engineering,SQL,Angular,Java",Angular Developer,20
4,105,"Python,Django,FastAPI",Python Developer,13


In [54]:
users.head()

Unnamed: 0,user_id,skills
0,1,"Java,React,Spring Boot,MongoDB,Machine Learnin..."
1,2,"Machine Learning,FastAPI,Python,NLP,Kubernetes..."
2,3,"Docker,C++,NLP,FastAPI,Python"
3,4,"FastAPI,Angular,JavaScript,Node.js"
4,5,"Docker,SQL,MongoDB,React"


In [55]:
applies.head()

Unnamed: 0,user_id,job_id
0,1,305
1,1,123
2,1,118
3,1,103
4,2,120


In [56]:
n_users = len(users)
n_jobs = len(jobs)
n_applies = jobs['num_of_applies'].sum()
users_applied = applies['user_id'].count()

print(f"Number of users: {n_users}")
print(f"Number of jobs: {n_jobs}")
print(f"Total number of job applications: {n_applies}")
print(f"Total number of users who applied for jobs: {users_applied}")
print(f"Avergae number of applications per job: {n_applies / n_jobs:.2f}")
print(f"Average number of applications per user: {users_applied / n_users:.2f}")

Number of users: 100
Number of jobs: 250
Total number of job applications: 6898
Total number of users who applied for jobs: 259
Avergae number of applications per job: 27.59
Average number of applications per user: 2.59


In [57]:
user_freq = applies[["user_id", "job_id"]].groupby("user_id").count().reset_index()
user_freq.columns = ["user_id", "num_of_applies"]
user_freq.head()

Unnamed: 0,user_id,num_of_applies
0,1,4
1,2,3
2,3,1
3,4,2
4,5,1


In [58]:
from scipy.sparse import csr_matrix
import numpy as np

def create_matrix(df):
    
    N = len(df['user_id'].unique())
    M = len(df['job_id'].unique())

    user_mapper = dict(zip(np.unique(df["user_id"]), list(range(N))))
    job_mapper = dict(zip(np.unique(df["job_id"]), list(range(M))))

    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["user_id"])))
    job_inv_mapper = dict(zip(list(range(M)), np.unique(df["job_id"])))

    user_index = [user_mapper[i] for i in df['user_id']]
    job_index = [job_mapper[i] for i in df['job_id']]

    X = csr_matrix((np.ones(len(user_index)), (user_index, job_index)), shape=(N, M))

    return X, user_mapper, job_mapper, user_inv_mapper, job_inv_mapper

X, user_mapper, job_mapper, user_inv_mapper, job_inv_mapper = create_matrix(applies)

In [59]:
from sklearn.neighbors import NearestNeighbors

def find_similar_jobs(job_id, X, k, job_mapper, job_inv_mapper):
    model = NearestNeighbors(metric='cosine', algorithm='brute')
    model.fit(X.T)

    job_index = job_mapper[job_id]
    distances, indices = model.kneighbors(X.T[job_index], n_neighbors=k+1)

    similar_jobs = []
    for i in range(1, len(distances.flatten())):
        similar_jobs.append((job_inv_mapper[indices.flatten()[i]], distances.flatten()[i]))
    
    return similar_jobs

In [60]:
def recommend_jobs(user_id, X, user_mapper, job_mapper, job_inv_mapper, n_recommendations=5):
    if user_id not in user_mapper:
        print(f"User ID {user_id} not found.")
        return []

    user_index = user_mapper[user_id]
    user_applied_jobs = X[user_index].nonzero()[1]

    all_recommendations = {}
    for job_index in user_applied_jobs:
        job_id = job_inv_mapper[job_index]
        similar_jobs = find_similar_jobs(job_id, X, k=n_recommendations, job_mapper=job_mapper, job_inv_mapper=job_inv_mapper)
        
        for sim_job_id, score in similar_jobs:
            if sim_job_id not in user_applied_jobs and sim_job_id not in all_recommendations:
                all_recommendations[sim_job_id] = score

    sorted_recommendations = sorted(all_recommendations.items(), key=lambda x: x[1])[:n_recommendations]
    
    return sorted_recommendations

In [61]:
user_id = 3

In [62]:
for val in applies[applies['user_id'] == user_id]["job_id"].values:
    print(jobs[jobs['job_id'] == val][['skills']].values)

[['Java,Node.js,TypeScript,C++,Docker']]


In [63]:

sorted_recommendations = recommend_jobs(user_id, X, user_mapper, job_mapper, job_inv_mapper, n_recommendations=3)
print(f"Recommendations for User ID {user_id}:")
for job_id, score in sorted_recommendations:
    job_info = jobs[jobs['job_id'] == job_id].iloc[0]
    print(f"Job ID: {job_id}, Skills: {job_info['skills']}, Description: {job_info['description']}, Similarity Score: {score:.4f}")

Recommendations for User ID 3:
Job ID: 102, Skills: Node.js,TypeScript,C++,Java, Description: Java Developer, Similarity Score: 1.0000
Job ID: 104, Skills: Deep Learning,Data Engineering,SQL,Angular,Java, Description: Angular Developer, Similarity Score: 1.0000
Job ID: 103, Skills: FastAPI,Python,Deep Learning,Docker, Description: Docker Developer, Similarity Score: 1.0000


In [64]:
jobs.head()

Unnamed: 0,job_id,skills,description,num_of_applies
0,101,"Django,Angular,TypeScript,Docker,SQL",Django Developer,25
1,102,"Node.js,TypeScript,C++,Java",Java Developer,8
2,103,"FastAPI,Python,Deep Learning,Docker",Docker Developer,46
3,104,"Deep Learning,Data Engineering,SQL,Angular,Java",Angular Developer,20
4,105,"Python,Django,FastAPI",Python Developer,13


In [65]:
users.head()

Unnamed: 0,user_id,skills
0,1,"Java,React,Spring Boot,MongoDB,Machine Learnin..."
1,2,"Machine Learning,FastAPI,Python,NLP,Kubernetes..."
2,3,"Docker,C++,NLP,FastAPI,Python"
3,4,"FastAPI,Angular,JavaScript,Node.js"
4,5,"Docker,SQL,MongoDB,React"


In [66]:
jobs["skills"].head()

0               Django,Angular,TypeScript,Docker,SQL
1                        Node.js,TypeScript,C++,Java
2                FastAPI,Python,Deep Learning,Docker
3    Deep Learning,Data Engineering,SQL,Angular,Java
4                              Python,Django,FastAPI
Name: skills, dtype: object

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize job skills
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(","))
job_skill_matrix = vectorizer.fit_transform(jobs["skills"])

# Create a recommendation function
def recommend_jobs(user_id, top_k=5):
    # Get user skills
    user_row = users[users["user_id"] == user_id].iloc[0]
    user_skills = user_row["skills"] # e.g., "Python,Java,SQL"
    
    # Vectorize user skills
    user_vec = vectorizer.transform([user_skills])
    
    # Compute similarity with all jobs
    similarity_scores = cosine_similarity(user_vec, job_skill_matrix).flatten()
    
    # Put into dataframe
    job_scores = pd.DataFrame({
        "job_id": jobs["job_id"],
        "score": similarity_scores
    })
    
    # Remove already applied jobs
    applied = applies[applies["user_id"] == user_id]["job_id"].tolist()
    job_scores = job_scores[~job_scores["job_id"].isin(applied)]
    
    # Sort by similarity (and optionally num_of_applies)
    job_scores = job_scores.merge(jobs[["job_id", "num_of_applies"]], on="job_id")
    job_scores["final_score"] = job_scores["score"] + 0.01 * job_scores["num_of_applies"]
    
    # Top K recommendations
    recommendations = job_scores.sort_values("final_score", ascending=False).head(top_k)
    return recommendations

# Example: Recommend for user 1
print(recommend_jobs(2, top_k=5))


     job_id     score  num_of_applies  final_score
141     245  0.695583              42     1.115583
142     246  0.688878              37     1.058878
197     301  0.635973              42     1.055973
199     303  0.597894              45     1.047894
19      121  0.510528              49     1.000528


