In [33]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data Cleaning (Specific to Text Processing):

In [44]:

# reading in student data
student_df = pd.read_csv('/content/drive/My Drive/data/Cleaned_Member_Registration_Dataset.csv')

# reading in job data
job_df = pd.read_csv('/content/drive/My Drive/data/internships_clean.csv')

# categorizing student internship job count
def categorize_internship_count(count):
    if count == 0:
        return "No internship experience"
    elif count == 1:
        return "At least one internship experience"
    elif count == 2:
        return "At least two internship experiences"
    else:
        return "Advanced internship experience"

student_df['intern_experience_text'] = student_df['intern_job_count'].apply(categorize_internship_count)

# categorizing grad year
def categorize_by_graduation_year(row, current_year=2024):
    years_left = row['grad_year'] - current_year

    if years_left == 4:
        return 'Freshman'
    elif years_left == 3:
        return 'Sophomore'
    elif years_left == 2:
        return 'Junior'
    elif years_left == 1:
        return 'Senior'
    elif years_left == 0:
        return 'Graduate'
    else:
        return 'None'

# applying categorization function
student_df['year'] = student_df.apply(categorize_by_graduation_year, axis=1)

print(student_df.head())


   gender  grad_year transfer                            major   minor  \
0    Male       2024       No  Mathematics/Applied Mathematics     NaN   
1    Male       2025       No        Statistics & Data Science     NaN   
2  Female       2025       No  Mathematics/Applied Mathematics  French   
3  Female       2025       No        Statistics & Data Science     NaN   
4    Male       2025      Yes        Statistics & Data Science     NaN   

  returning_member  intern_job_count Java Python    R  ...  \
0               No                 1  Yes    Yes   No  ...   
1              Yes                 1  Yes    Yes  Yes  ...   
2               No                 0   No    Yes  Yes  ...   
3              Yes                 0   No    Yes  Yes  ...   
4              Yes                 0   No    Yes  Yes  ...   

  Number.of.completed.data.science.projects  \
0                                         2   
1                                         1   
2                                        

Combining all the columns to create just one column called 'text' with all the text in the dataset

In [45]:
# combining student_df columns
features = ['year', 'major', 'minor', 'intern_experience_text', 'Career.Goal', 'internship_or_full_time', 'Prefer.Remote', 'Top.desired.location', 'Top.desired.state', 'Industry.Preferences', 'Data_Science_Technologies', 'Data.Science.Skills', 'Merged_Languages', 'Packages']

student_df['text'] = ''
for feature in features:
    student_df[feature] = student_df[feature].fillna('')  # Fill NA values
    student_df['text'] += student_df[feature] + ' '       # Concatenate into 'text'

student_text = student_df[['ID', 'text']].copy()
print(student_text.head())

# combining job_df columns
job_features = ['Company', 'Job.Title', 'Location', 'Job.Type', 'Skill.1', 'Skill.2', 'Skill.3','Skill.4', 'Skill.5', 'Skill.6', 'ExperienceQualifications', 'ClassYearQualifications']

job_df['text'] = ''
for feature in job_features:
    job_df[feature] = job_df[feature].fillna('')
    job_df['text'] += job_df[feature] + ' '
job_df['text'] = job_df['text'].str.strip()

job_text = job_df[['text']].copy()# new job text with combined text
print(job_text.head())


   ID                                               text
0   1  Graduate Mathematics/Applied Mathematics  At l...
1   2  Senior Statistics & Data Science  At least one...
2   3  Senior Mathematics/Applied Mathematics French ...
3   4  Senior Statistics & Data Science  No internshi...
4   5  Senior Statistics & Data Science  No internshi...
                                                text
0  Zscaler Data Science Intern- Undergrad (Summer...
1  RYTE Corporation R&D Data Scientist Intern (Al...
2  SOMFY Group Alternance Data Analyst (H/F) Clus...
3  RYTE Corporation MLOps Engineer Intern (Altern...
4  RYTE Corporation Data Scientist (NLP & ML) Int...


### Tokenizing

In [46]:
# tokenizing and preprocessing function
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation
    tokens = word_tokenize(text)  # tokenize
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]  # Remove stopwords and stem
    return ' '.join(tokens)  # join tokens back into a string

In [47]:
# apply preprocessing
student_text['preprocessed'] = student_text['text'].apply(preprocess)
job_text['preprocessed'] = job_text['text'].apply(preprocess)

TF-IDF Vectorization

In [48]:
# combining all text
all_text = pd.concat([student_text['preprocessed'], job_text['preprocessed']], ignore_index=True)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

student_tfidf = tfidf_matrix[:len(student_text)]
job_tfidf = tfidf_matrix[len(student_text):]


In [49]:
# looking at features
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print(tfidf_df.head())

    ab  abarca  abstatt  academ  acceler  account  accuweath  act  actuari  \
0  0.0     0.0      0.0     0.0      0.0      0.0        0.0  0.0      0.0   
1  0.0     0.0      0.0     0.0      0.0      0.0        0.0  0.0      0.0   
2  0.0     0.0      0.0     0.0      0.0      0.0        0.0  0.0      0.0   
3  0.0     0.0      0.0     0.0      0.0      0.0        0.0  0.0      0.0   
4  0.0     0.0      0.0     0.0      0.0      0.0        0.0  0.0      0.0   

   adarga  ...  work  world    wrangl  write  york  zeta  zrich  zscaler  \
0     0.0  ...   0.0    0.0  0.171118    0.0   0.0   0.0    0.0      0.0   
1     0.0  ...   0.0    0.0  0.191242    0.0   0.0   0.0    0.0      0.0   
2     0.0  ...   0.0    0.0  0.000000    0.0   0.0   0.0    0.0      0.0   
3     0.0  ...   0.0    0.0  0.225778    0.0   0.0   0.0    0.0      0.0   
4     0.0  ...   0.0    0.0  0.184752    0.0   0.0   0.0    0.0      0.0   

   zuora  zurich  
0    0.0     0.0  
1    0.0     0.0  
2    0.0     0.0 

### Updated Model Using Heuristic Score (w/ more accurate dataset) :

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Data Cleaning

In [13]:
# Load datasets
students_df = pd.read_csv("/content/drive/My Drive/data/Cleaned_Member_Registration_Dataset.csv")
internships_df = pd.read_csv("/content/drive/My Drive/data/internships_clean.csv")

print(students_df)

# categorizing student internship job count
def categorize_internship_count(count):
    if count == 0:
        return "No internship experience"
    elif count == 1:
        return "At least one internship experience"
    elif count == 2:
        return "At least two internship experiences"
    else:
        return "Advanced internship experience"

students_df['intern_experience_text'] = students_df['intern_job_count'].apply(categorize_internship_count)

# categorizing grad year
def categorize_by_graduation_year(row, current_year=2024):
    years_left = row['grad_year'] - current_year

    if years_left == 4:
        return 'Freshman'
    elif years_left == 3:
        return 'Sophomore'
    elif years_left == 2:
        return 'Junior'
    elif years_left == 1:
        return 'Senior'
    elif years_left == 0:
        return 'Graduate'
    else:
        return 'None'

# applying categorization function
students_df['year'] = students_df.apply(categorize_by_graduation_year, axis=1)

# category_counts = students_df.groupby(["year"]).size().reset_index(name="Count")
# print(category_counts)

     gender  grad_year transfer                                  major  \
0      Male       2024       No        Mathematics/Applied Mathematics   
1      Male       2025       No              Statistics & Data Science   
2    Female       2025       No        Mathematics/Applied Mathematics   
3    Female       2025       No              Statistics & Data Science   
4      Male       2025      Yes              Statistics & Data Science   
..      ...        ...      ...                                    ...   
476    Male       2027       No  Computer Science/Computer Engineering   
477    Male       2026      Yes              Statistics & Data Science   
478    Male       2028       No  Computer Science/Computer Engineering   
479    Male       2026       No              Statistics & Data Science   
480    Male       2027       No  Computer Science/Computer Engineering   

      minor returning_member  intern_job_count Java Python    R  ...  \
0       NaN               No           

In [14]:
# Text preprocessing function
def preprocess_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)


In [15]:
# Helper function to calculate heuristic score
def calculate_score(student, internship):
    score = 0

    # Combine location preferences
    student_location = preprocess_text(f"{student['Prefer.Remote']} {student['Top.desired.location']} {student['Top.desired.state']}")
    internship_location = preprocess_text(internship["Location"])

    # Calculate cosine similarity for location
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([student_location, internship_location])
    location_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    score += location_similarity * 10  # Weighted score for location

    # Compare intern experience text with job qualifications and year qualifications
    intern_experience_text = preprocess_text(student["intern_experience_text"])
    job_qualifications = preprocess_text(internship["ExperienceQualifications"])
    class_year_qualifications = preprocess_text(internship["ClassYearQualifications"])
    student_year = preprocess_text(student["year"])

    combined_qualifications = f"{job_qualifications} {class_year_qualifications}"
    combined_student_info = f"{intern_experience_text} {student_year}"

    tfidf_matrix = vectorizer.fit_transform([combined_student_info, combined_qualifications])
    qualifications_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    score += qualifications_similarity * 15  # Weighted score for qualifications

    # Add cosine similarity between skills and job requirements
    student_skills = preprocess_text(
        " ".join([str(student[col]) for col in ["Data.Science.Skills", "Data_Science_Technologies", "Packages", "Merged_Languages"]])
    )
    job_skills = preprocess_text(
        " ".join([str(internship[f"Skill.{i}"]) for i in range(1, 7)])
    )
    tfidf_matrix = vectorizer.fit_transform([student_skills, job_skills])
    skills_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    score += skills_similarity * 20  # Weighted score for skills

    return score

In [15]:
# Create labeled dataset for ML
labeled_data = []

for _, student in students_df.iterrows():
    for _, internship in internships_df.iterrows():
        score = calculate_score(student, internship)
        labeled_data.append({
            "student_id": student["ID"],
            "internship_id": internship["Job.Title"],
            "score": score
        })

labeled_df = pd.DataFrame(labeled_data)

In [16]:
# Prepare data for training
X = labeled_df[["student_id", "internship_id"]]
y = labeled_df["score"]

# One-hot encode categorical features
X = pd.get_dummies(X, columns=["student_id", "internship_id"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple ML model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate model
print("Training score:", model.score(X_train, y_train))
print("Test score:", model.score(X_test, y_test))

# Predict scores for unseen data
predictions = model.predict(X_test)

# Save the labeled data and predictions
labeled_df.to_csv("labeled_student_internship_scores.csv", index=False)
pd.DataFrame({"Actual": y_test, "Predicted": predictions}).to_csv("predictions.csv", index=False)

print("Labeled dataset and predictions saved.")

Training score: 0.7891858093300057
Test score: 0.1321704230925702
Labeled dataset and predictions saved.


#### Giving Recommendations (Using ML Model):

In [52]:
import random

student_id = random.randint(1, 481)
recommendation_input = pd.DataFrame({
    "student_id": [student_id] * len(internships_df),
    "internship_id": internships_df["Job.Title"]
})

# One-hot encode the input to match training data format
recommendation_input_encoded = pd.get_dummies(recommendation_input, columns=["student_id", "internship_id"])

# Ensure all columns match the model's training columns
missing_cols = set(X_train.columns) - set(recommendation_input_encoded.columns)
for col in missing_cols:
    recommendation_input_encoded[col] = 0  # Add missing columns as zeros

recommendation_input_encoded = recommendation_input_encoded[X_train.columns]


In [28]:
# Predict scores
predicted_scores = model.predict(recommendation_input_encoded)

# Add the predictions back to internships for ranking
recommendations = internships_df.copy()
recommendations["predicted_score"] = predicted_scores

# Sort internships by predicted score
recommendations = recommendations.sort_values(by="predicted_score", ascending=False)
print("Top recommendations for student_id =", student_id)
print(recommendations[["Job.Title", "predicted_score"]].head(10))

Top recommendations for student_id = 423
                                             Job.Title  predicted_score
198                    Data Analyst - Internship (H/F)        13.509969
78                 Stage - Ingénieur véhicule autonome        11.255456
109                  Data Science Intern (Summer 2023)        11.164977
64                    Intern, Data Science & Analytics        10.838065
83            STAGE 6 mois - Data Scientist Junior H/F        10.266527
24            STAGE 6 mois - Data Scientist Junior H/F        10.266527
91             Data Science & Analytics, Summer Intern         9.878242
146  Summer Internship - Data Science / Advanced An...         9.743716
38     Website & Data analyses - Internship EU student         9.652124
20   Intern Associate Data Analyst – Undergrad – Au...         9.447068


### Calculating Cosine Similarity (Using get_recommendation function from hw):

In [54]:

# Calculate Cosine Similarity
cosine_sim_matrix = cosine_similarity(student_tfidf, job_tfidf)

# Extract student IDs as a list or array
student_ids = student_df['ID'].astype(str).values  # Ensure student IDs are strings

# Extract job titles as a list or array
job_titles = job_df['Job.Title'].values  # Replace 'Job.Title' with the actual column name

# Convert cosine similarity matrix to a DataFrame for easier manipulation
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=student_ids,  # Use student IDs as the index
    columns=job_titles  # Use job titles as the columns
)

# Define the give_recommendation function
def give_recommendation(student_id, top_n=10):
    if student_id not in cosine_sim_df.index:
        return f"No recommendations available for student ID: {student_id}"

    # Get similarity scores for the student and sort them
    scores = cosine_sim_df.loc[student_id]
    similar_jobs = scores.sort_values(ascending=False).iloc[:top_n]

    # Return a DataFrame of recommendations
    return pd.DataFrame({
        'Top Recommended Jobs': similar_jobs.index,
        'Similarity Score': similar_jobs.values
    })

# Example Usage
# Replace 'student_1' with an actual student ID from your dataset
recommendations = give_recommendation('113')
print(recommendations)

                                Top Recommended Jobs  Similarity Score
0              Engineering Intern (Data Engineering)          0.431924
1                    Internship: Data Analyst Intern          0.338066
2  Summer Internship, Data Scientist | Content Bu...          0.312964
3                Machine Learning Engineering Intern          0.307369
4                       Computational Biology Intern          0.301560
5                    Internship: Data Science Intern          0.297810
6                        Machine Learning Internship          0.291151
7  Machine Learning Engineer 2023 Internship Prog...          0.288446
8                           Data Science Intern 2023          0.287838
9                  Life Science Data Engineer Intern          0.270764


Code for finding student with the highest similarity score

In [19]:
# Flatten the cosine similarity matrix into a long-form DataFrame
cosine_sim_long = cosine_sim_df.stack().reset_index()
cosine_sim_long.columns = ['Student_ID', 'Job_Title', 'Cosine_Similarity']

# Find the student-job pair with the highest cosine similarity
max_similarity_row = cosine_sim_long.loc[cosine_sim_long['Cosine_Similarity'].idxmax()]

# Extract the relevant details
max_student_id = max_similarity_row['Student_ID']
max_job_title = max_similarity_row['Job_Title']
max_similarity_score = max_similarity_row['Cosine_Similarity']

# Print the student with the highest cosine similarity score
print(f"Student with the highest cosine similarity:")
print(f"Student ID: {max_student_id}")
print(f"Job Title: {max_job_title}")
print(f"Cosine Similarity Score: {max_similarity_score}")

Student with the highest cosine similarity:
Student ID: 113
Job Title: Engineering Intern (Data Engineering)
Cosine Similarity Score: 0.43192413248406536


avg of highest simliarity score of all students

In [55]:
# Find the highest cosine similarity score for each student
highest_similarity_scores = cosine_sim_df.max(axis=1)

# Calculate the average of the highest similarity scores
average_highest_similarity = highest_similarity_scores.mean()

# Output the results
print(f"Average of the highest similarity scores: {average_highest_similarity:.4f}")

# Optionally, you can also print the student with the highest similarity score:
max_student_similarity = highest_similarity_scores.max()
max_student_id = highest_similarity_scores.idxmax()

print(f"Student with the highest similarity score: ID {max_student_id} with a similarity score of {max_student_similarity:.4f}")

Average of the highest similarity scores: 0.2401
Student with the highest similarity score: ID 113 with a similarity score of 0.4319


### Recommendations pt2

In [18]:
# Predict scores for unseen data
predictions = model.predict(X_test)

# Save the labeled data and predictions
labeled_df.to_csv("labeled_student_internship_scores.csv", index=False)
pd.DataFrame({"Actual": y_test, "Predicted": predictions}).to_csv("predictions.csv", index=False)

# Showcase recommendations for specific students
def recommend_for_student(student_id, top_n=10):
    student = students_df[students_df["ID"] == student_id].iloc[0]
    recommendations = []

    for _, internship in internships_df.iterrows():
        score = calculate_score(student, internship)
        recommendations.append({
            "internship": internship["Job.Title"],
            "score": score
        })

    recommendations = sorted(recommendations, key=lambda x: x["score"], reverse=True)
    return recommendations[:top_n]


NameError: name 'model' is not defined

In [36]:
# Example usage
student_id = random.randint(1, 481)
student_id = 335
top_recommendations = recommend_for_student(student_id)
recs = pd.DataFrame(top_recommendations)
print("Top recommendations for student_id =", student_id)
print(recs)
print("Labeled dataset and predictions saved.")


Top recommendations for student_id = 335
                                          internship      score
0                      [Internship] Machine Learning  16.325998
1                          Data Scientist Internship  16.119493
2                Business Data Analyst Intern (APAC)  15.981898
3                                Data Analyst Intern  15.742661
4                        Data Scientist - Internship  15.719672
5  Machine Learning Engineer 2023 Internship Prog...  15.465690
6                       AI Research Scientist Intern  15.298530
7             Intern, Data Engineering (Summer 2023)  15.265013
8                          Data Analytics Internship  14.763490
9                              📈 NLP Research Intern  14.610683
Labeled dataset and predictions saved.


### Finding the Maximum Heuristic Score Possible

In [41]:
def find_max_score():
    max_score = 0
    ideal_student = {
        "Prefer.Remote": "remote",
        "Top.desired.location": "US",
        "Top.desired.state": "SF",
        "intern_experience_text": "at least one internship experience",
        "year": "junior",
        "Data.Science.Skills": "Big Data, Data Analytics",
        "Data_Science_Technologies": "Git",
        "Packages": "numpy",
        "Merged_Languages": "python"
    }

    ideal_internship = {
        "Location": "remote, US",
        "ExperienceQualifications": "at least one internship experience",
        "ClassYearQualifications": "junior",
        "Skill.1": "Git",
        "Skill.2": "Data Analytics",
        "Skill.3": "SQL",
        "Skill.4": "Big Data",
        "Skill.5": "numpy",
        "Skill.6": "python"
    }

    score = calculate_score(pd.Series(ideal_student), pd.Series(ideal_internship))
    return score


In [42]:
max_score = find_max_score()
print("Maximum possible score:", max_score)
# print("Score components:", score_components)

Maximum possible score: 40.20397196794686


### Fitting to Random Forest Model using Heuristic Score (before changing dataset values)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Helper function to calculate heuristic score
def calculate_score(student, internship):
    score = 0

    # Increase score if preferred location matches
    if student["Top.desired.location"] in internship["Location"]:
        score += 10

    # Increase score if major matches job qualifications
    if student["major"] in internship["ExperienceQualifications"]:
        score += 5

    # Add cosine similarity between skills and job requirements
    skills = " ".join([str(student[skill]) for skill in ["Python", "R", "SQL", "Java"]])
    job_skills = " ".join([str(internship[skill]) for skill in ["Skill.1",
                                                                "Skill.2",
                                                                "Skill.3",
                                                                "Skill.4",
                                                                "Skill.5",
                                                                "Skill.6"]])
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([skills, job_skills])
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    score += similarity * 20  # Weighted score for similarity

    return score


Training score: 0.8137118108898125
Test score: 0.22392760674041512
Labeled dataset and predictions saved.


In [None]:
# Create labeled dataset for ML
labeled_data = []

for _, student in student_df.iterrows():
    for _, internship in job_df.iterrows():
        score = calculate_score(student, internship)
        labeled_data.append({
            "student_id": student["ID"],
            "internship_id": internship["Job.Title"],
            "score": score
        })

labeled_df = pd.DataFrame(labeled_data)

In [None]:
# Prepare data for training
X = labeled_df[["student_id", "internship_id"]]
y = labeled_df["score"]

# One-hot encode categorical features
X = pd.get_dummies(X, columns=["student_id", "internship_id"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple ML model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate model
print("Training score:", model.score(X_train, y_train))
print("Test score:", model.score(X_test, y_test))

# Predict scores for unseen data
predictions = model.predict(X_test)

# Save the labeled data and predictions
labeled_df.to_csv("labeled_student_internship_scores.csv", index=False)
pd.DataFrame({"Actual": y_test, "Predicted": predictions}).to_csv("predictions.csv", index=False)

print("Labeled dataset and predictions saved.")

### Code for possibly finding the performance of model using metrics (e.g. Precision and Recall):

In [None]:

# Placeholder DataFrames: Replace with your actual data
student_recommendations = pd.DataFrame({
    'student_id': [421, 422, 423],  # Replace with your student IDs
    'recommended_jobs': [['Job1', 'Job2', 'Job3'], ['Job2', 'Job4', 'Job6'], ['Job5', 'Job7', 'Job8']]
})

true_relevant_jobs = pd.DataFrame({
    'student_id': [421, 422, 423],  # Replace with your student IDs
    'relevant_jobs': [['Job1', 'Job5'], ['Job2', 'Job3'], ['Job7', 'Job8', 'Job9']]
})

# Evaluation Function for Precision@N and Recall@N
def evaluate_recommender(recommendations, ground_truth, top_n=3):
    precisions = []
    recalls = []

    for idx, student_id in enumerate(ground_truth['student_id']):
        # Get recommended and relevant jobs for the current student.
        recommended = recommendations.loc[recommendations['student_id'] == student_id, 'recommended_jobs'].values[0][:top_n]
        relevant = ground_truth.loc[ground_truth['student_id'] == student_id, 'relevant_jobs'].values[0]

        # Calculate hits for precision and recall.
        hits = len(set(recommended) & set(relevant))
        precisions.append(hits / len(recommended) if recommended else 0)
        recalls.append(hits / len(relevant) if relevant else 0)

    # Average Precision and Recall
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)

    return avg_precision, avg_recall

# Evaluate the recommender system
top_n = 3  # Adjust this as needed
precision, recall = evaluate_recommender(student_recommendations, true_relevant_jobs, top_n=top_n)

# Print Results
print(f"Precision@{top_n}: {precision:.2f}")
print(f"Recall@{top_n}: {recall:.2f}")


### SOURCES:

https://www.itransition.com/machine-learning/recommendation-systems