In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

### Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("../data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("../data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("../data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Metrics: Precision (Also prints accuracy, recall and f1-score)

In [5]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['degree_1'] = user_dataset['degree_1']
cf_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['degree_1'] + cf_user_dataset['degree_1_specializations'] + cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = cf_user_dataset['description'] + cf_course_dataset['description'] 
comb["degree_1"] = cf_course_dataset['degree_1']
comb["sr_"] = cf_course_dataset['sr_']
comb["degree_1_specializations"] = cf_course_dataset['degree_1_specializations']
comb["key_skills_str"] = cf_course_dataset['key_skills_str']


#load and preprocess data
data = comb
corpus = comb["description"].tolist()
X_train, X_test = train_test_split(data, test_size=0.2)

#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(X_train["description"].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#generate recommendations for each test user
recommendations = []

#user_dataset = llm_user_dataset.dropna()

for i in range(len(cf_user_dataset)):
    #user_input = cf_user_dataset.iloc[i]["description"]
    user_index = cf_user_dataset.iloc[i]["description"] #.index(user_input)
    recommended_items = course_cosine_sim[i].argsort()[:-6:-1]
    recommendations.append(recommended_items)

#Compute precision and recall for degree
relevant_items = []
for i in range(len(cf_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1"] == X_test.iloc[i]["degree_1"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print("Evaluation for Degree: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

#Compute precision and recall for degree spec
relevant_items = []
for i in range(len(cf_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1_specializations"] == X_test.iloc[i]["degree_1_specializations"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Degree Specializations: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

#Compute precision and recall for key skills
relevant_items = []
for i in range(len(cf_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["key_skills_str"] == X_test.iloc[i]["key_skills_str"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Key Skills: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

Evaluation for Degree: 
Accuracy: 0.001
Precision: 0.913
Recall: 0.001
F1 Score: 0.001

Evaluation for Degree Specializations: 
Accuracy: 0.001
Precision: 0.322
Recall: 0.001
F1 Score: 0.001

Evaluation for Key Skills: 
Accuracy: 0.003
Precision: 0.046
Recall: 0.000
F1 Score: 0.001


### Checking Course and User Details

In [6]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['degree_1'] = user_dataset['degree_1']
cf_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['degree_1'] + cf_user_dataset['degree_1_specializations'] + cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = cf_user_dataset['description'] + cf_course_dataset['description']

#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(comb['description'].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#create indices for the courses is using series
cf_course_dataset = cf_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

#function to get content-filtered recommendations
def get_course_cf_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

In [7]:
#getting the similar course recs for user 1001
get_course_cf_recommendations(1001).head(10)

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
894,894,1895,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
996,996,1997,"MIT,Pune",B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
93,93,1094,MITCOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
256,256,1257,MITAOE,B.E.,Mechanical,"ProE,CATIA","B.E.MechanicalProE,CATIA"
653,653,1654,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
941,941,1942,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
180,180,1181,MITAOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
428,428,1429,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
91,91,1092,MITAOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
123,123,1124,MITAOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."


In [8]:
user_dataset[user_dataset["userid"] == "1001"]

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [9]:
course_dataset[course_dataset["sr_"] == 1895]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
894,1895,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [10]:
course_dataset[course_dataset["sr_"] == 1997]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
996,1997,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","['EmbeddedC, MATLAB, Cprogramming, Keil']","EmbeddedC, MATLAB, Cprogramming, Keil"


In [11]:
course_dataset[course_dataset["sr_"] == 1094]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
93,1094,B.E.,Mechanical,MITCOE,"['AutoCAD, PROE']","AutoCAD, PROE"


In [12]:
course_dataset[course_dataset["sr_"] == 1257]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
256,1257,B.E.,Mechanical,MITAOE,"['ProE,CATIA']","ProE,CATIA"


In [13]:
course_dataset[course_dataset["sr_"] == 1654]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
653,1654,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [14]:
course_dataset[course_dataset["sr_"] == 1942]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
941,1942,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [15]:
course_dataset[course_dataset["sr_"] == 1181]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
180,1181,B.E.,Computer Science & Engineering,MITAOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [16]:
course_dataset[course_dataset["sr_"] == 1429]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
428,1429,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [17]:
course_dataset[course_dataset["sr_"] == 1092]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
91,1092,B.E.,Mechanical,MITAOE,"['AutoCAD, PROE']","AutoCAD, PROE"


In [18]:
course_dataset[course_dataset["sr_"] == 1124]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
123,1124,B.E.,Computer Science & Engineering,MITAOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"
