In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


## Content Based Filtering

### User Dataset - Key Skills 

In [5]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(user_dataset['key_skills_str'].values.astype('U'))
tfidf_matrix.shape

(1097, 2065)

In [6]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.02408797, 0.        , ..., 0.06121443, 0.01629229,
       0.007909  ])

In [7]:
user_dataset = user_dataset.reset_index()
titles = user_dataset['userid']
indices = pd.Series(user_dataset.index, index=user_dataset['userid'])
indices.head(2)

userid
1001    0
1002    1
dtype: int64

In [8]:
def get_recommendations(title): #this is getting similar users based on their key skills
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

get_recommendations("1001").head(10)

894     1847
996     1946
981     1931
256     1231
868     1821
970     1920
1089    2039
714     1667
417     1375
493     1444
Name: userid, dtype: object

###  User Dataset - Career Objective

In [9]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(user_dataset['career_objective'].values.astype('U'))
tfidf_matrix.shape

(1097, 4673)

In [10]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.        , 0.        , ..., 0.00592012, 0.        ,
       0.0596521 ])

In [11]:
user_dataset = user_dataset.reset_index()
titles = user_dataset['userid']
indices = pd.Series(user_dataset.index, index=user_dataset['userid'])
indices.head(2)

userid
1001    0
1002    1
dtype: int64

In [12]:
def get_recommendations(title): #this is getting similar users based on their career objectives
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

get_recommendations("1001").head(10)

894     1847
996     1946
93      1087
110     1102
180     1161
20      1021
914     1867
602     1555
801     1754
1022    1972
Name: userid, dtype: object

In [13]:
user_dataset["career_objective"].loc[user_dataset['userid'] == '1001']

0    Computer Engineering student with good technic...
Name: career_objective, dtype: object

In [14]:
user_dataset.loc[user_dataset['userid'] == '1847']

Unnamed: 0.1,level_0,index,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
894,894,894,842,1847,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


### User Dataset - career objective and key skills


In [15]:
#for user
#creating the dataset
cf_dataset = pd.DataFrame()
cf_dataset['userid'] = user_dataset['userid']
cf_dataset['degree_1'] = user_dataset['degree_1']
cf_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
#cf_dataset['career_objective'] = user_dataset['career_objective']
cf_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_dataset.head(5)

cf_dataset['all'] = cf_dataset['degree_1'] + cf_dataset['degree_1_specializations'] + cf_dataset['key_skills_str']
cf_dataset

#for course
#creating the dataset
cf_dataset2 = pd.DataFrame()
cf_dataset2['sr_'] = course_dataset['sr_']
cf_dataset2['degree_1'] = course_dataset['degree_1']
cf_dataset2['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_dataset2['key_skills_str'] = course_dataset['key_skills_str']
cf_dataset2.head(5)

cf_dataset2['all'] = cf_dataset2['degree_1'] + cf_dataset2['degree_1_specializations'] + cf_dataset2['key_skills_str']
cf_dataset2

Unnamed: 0,sr_,degree_1,degree_1_specializations,key_skills_str,all
0,1001,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
1,1002,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
2,1003,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
3,1004,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
4,1005,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
...,...,...,...,...,...
9995,10996,B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
9996,10997,B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
9997,10998,M TeCh,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",M TeCh Electronics Telecommunication Engineer...
9998,10999,B.E.,Electronics Telecommunication Engineering,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",B.E.Electronics Telecommunication Engineering...


In [16]:
#vectorize 
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(cf_dataset['all'].values.astype('U'))

#cosine similarity using linear kernel
cosine_sim = cosine_similarity(count_matrix, count_matrix)

#
cf_dataset = cf_dataset.reset_index()
users = cf_dataset['userid']
#print("Users: " + users)
indices = pd.Series(cf_dataset.index, index=cf_dataset['userid'])
indices.head(10)

#function to get recommendations
def get_recommendations(user): #this is getting similar users based on their career objectives???
    idx = indices[user]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    user_indices = [i[0] for i in sim_scores]
    return users.iloc[user_indices]

In [17]:
recs = get_recommendations("1001")
recs.head(10)

894     1847
996     1946
256     1231
512     1464
868     1821
970     1920
1089    2039
714     1667
100     1092
201     1182
Name: userid, dtype: object

In [18]:
get_recommendations("1847").head(10)

996    1946
0      1001
427    1381
546    1499
646    1599
704    1657
855    1808
869    1822
957    1907
971    1921
Name: userid, dtype: object

In [19]:
cf_dataset["key_skills_str"].loc[cf_dataset['userid'] == '1001']

0    C,  Java,  Keras,  Flask,  Deep Learning,  Sel...
Name: key_skills_str, dtype: object

Upon checking the above, the filter is checking all the users' career objectives and key skill. And returns similar users. 

User 1001 has career objective a and key skills list b. 

User 1847 (the first recommendation for user 1001) has career objective a and key skills list b.

User 1847 (the second recommendation for user 1001) has career objective a and key skills list b.

User 1231 (the fourth recommendation for user 1001) has career objective NEW and key skills list NEW with some matches like Keras.

### Course Dataset -  degree 1, degree 1 specialization and key skills

In [20]:
#creating the dataset
cf_dataset2 = pd.DataFrame()
cf_dataset2['sr_'] = course_dataset['sr_']
cf_dataset2['degree_1'] = course_dataset['degree_1']
cf_dataset2['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_dataset2['key_skills_str'] = course_dataset['key_skills_str']
cf_dataset2.head(5)

Unnamed: 0,sr_,degree_1,degree_1_specializations,key_skills_str
0,1001,B.E.,Mechanical,CATIA
1,1002,B.E.,Mechanical,CATIA
2,1003,B.E.,Mechanical,CATIA
3,1004,B.E.,Mechanical,CATIA
4,1005,B.E.,Mechanical,CATIA


In [21]:
cf_dataset2['all'] = cf_dataset2['degree_1'] + cf_dataset2['degree_1_specializations'] + cf_dataset2['key_skills_str']
cf_dataset2

Unnamed: 0,sr_,degree_1,degree_1_specializations,key_skills_str,all
0,1001,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
1,1002,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
2,1003,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
3,1004,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
4,1005,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
...,...,...,...,...,...
9995,10996,B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
9996,10997,B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
9997,10998,M TeCh,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",M TeCh Electronics Telecommunication Engineer...
9998,10999,B.E.,Electronics Telecommunication Engineering,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",B.E.Electronics Telecommunication Engineering...


In [22]:
#vectorize 
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(cf_dataset2['all'].values.astype('U'))

#cosine similarity using linear kernel
cosine_sim = cosine_similarity(count_matrix, count_matrix)

#
cf_dataset2 = cf_dataset2.reset_index()
courses = cf_dataset2['sr_']
indices = pd.Series(cf_dataset2.index, index=cf_dataset2['sr_'])


#function to get recommendations
def get_recommendations(course): 
    idx = indices[course]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    return courses.iloc[course_indices]

In [23]:
get_recommendations(1001).head(10)

1     1002
2     1003
3     1004
4     1005
5     1006
6     1007
7     1008
8     1009
9     1010
10    1011
Name: sr_, dtype: int64

In [24]:
get_recommendations(1003).head(10)

1     1002
2     1003
3     1004
4     1005
5     1006
6     1007
7     1008
8     1009
9     1010
10    1011
Name: sr_, dtype: int64

In [25]:
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 1001])  #key_skills_str

0    B.E.MechanicalCATIA
Name: all, dtype: object


In [26]:
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 1002])
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 1003])

1    B.E.MechanicalCATIA
Name: all, dtype: object
2    B.E.MechanicalCATIA
Name: all, dtype: object


In [27]:
#trying a different course
get_recommendations(10996).head(10)

172    1173
209    1210
212    1213
213    1214
214    1215
225    1226
226    1227
230    1231
231    1232
435    1436
Name: sr_, dtype: int64

In [28]:
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 10996])
print(cf_dataset2["key_skills_str"].loc[cf_dataset2['sr_'] == 1173])
print(cf_dataset2["key_skills_str"].loc[cf_dataset2['sr_'] == 1210])

9995    B.E.Electronics  Telecommunication Engineering...
Name: all, dtype: object
172    EmbeddedC, MATLAB, Cprogramming, Keil
Name: key_skills_str, dtype: object
209    EmbeddedC, MATLAB, Cprogramming, Keil
Name: key_skills_str, dtype: object


### Final Content-Based Filtering Algorithm

In [29]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['degree_1'] = user_dataset['degree_1']
cf_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['degree_1'] + cf_user_dataset['degree_1_specializations'] + cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = cf_user_dataset['description'] + cf_course_dataset['description']

#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(comb['description'].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#create indices for the courses is using series
cf_course_dataset = cf_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

#function to get content-filtered recommendations
def get_course_cf_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

In [30]:
#getting the similar course recs for user 1001
get_course_cf_recommendations(1001).head(10)

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
894,894,1895,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
996,996,1997,"MIT,Pune",B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
93,93,1094,MITCOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
256,256,1257,MITAOE,B.E.,Mechanical,"ProE,CATIA","B.E.MechanicalProE,CATIA"
653,653,1654,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
941,941,1942,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
180,180,1181,MITAOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
428,428,1429,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
91,91,1092,MITAOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
123,123,1124,MITAOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."


In [31]:
user_dataset[user_dataset["userid"] == "1001"]

Unnamed: 0.1,level_0,index,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [32]:
course_dataset[course_dataset["sr_"] == 1895]

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
894,894,1895,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


## For Evaluation 

In [33]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['degree_1'] = user_dataset['degree_1']
cf_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['degree_1'] + cf_user_dataset['degree_1_specializations'] + cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = cf_user_dataset['description'] + cf_course_dataset['description'] 
comb["degree_1"] = cf_course_dataset['degree_1']
comb["sr_"] = cf_course_dataset['sr_']
comb["degree_1_specializations"] = cf_course_dataset['degree_1_specializations']
comb["key_skills_str"] = cf_course_dataset['key_skills_str']


#load and preprocess data
data = comb
corpus = comb["description"].tolist()
X_train, X_test = train_test_split(data, test_size=0.2)

#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(X_train["description"].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#generate recommendations for each test user
recommendations = []

#user_dataset = llm_user_dataset.dropna()

for i in range(len(cf_user_dataset)):
    #user_input = cf_user_dataset.iloc[i]["description"]
    user_index = cf_user_dataset.iloc[i]["description"] #.index(user_input)
    recommended_items = course_cosine_sim[i].argsort()[:-6:-1]
    recommendations.append(recommended_items)

#Compute precision and recall for degree
relevant_items = []
for i in range(len(cf_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1"] == X_test.iloc[i]["degree_1"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print("Evaluation for Degree: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

#Compute precision and recall for degree spec
relevant_items = []
for i in range(len(cf_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1_specializations"] == X_test.iloc[i]["degree_1_specializations"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Degree Specializations: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

#Compute precision and recall for key skills
relevant_items = []
for i in range(len(cf_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["key_skills_str"] == X_test.iloc[i]["key_skills_str"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Key Skills: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

Evaluation for Degree: 
Accuracy: 0.001
Precision: 0.922
Recall: 0.001
F1 Score: 0.001

Evaluation for Degree Specializations: 
Accuracy: 0.001
Precision: 0.252
Recall: 0.000
F1 Score: 0.001

Evaluation for Key Skills: 
Accuracy: 0.003
Precision: 0.081
Recall: 0.001
F1 Score: 0.001
