In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import squarify
import seaborn as sns
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split
import random
import tiktoken
import openai
import pickle
from typing import List, Dict, Tuple
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)

## Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


## Collaborative Filtering

In [5]:
#Collaborative Filtering is based on the idea that users similar to a me can be used to predict how much I will 
#like a particular product or service those users have used/experienced but I have not.
#I will not be implementing Collaborative Filtering from scratch. Instead, I will use the Surprise library 
#that used extremely powerful algorithms like Singular Value Decomposition (SVD) to minimise RMSE (Root Mean Square Error)
#and give great recommendations.
#Implementation of SVD for surprise library is given on this link

Lower values of RMSE indicate better fit. RMSE is a good measure of how accurately the model predicts the response. It's the most important criterion for fit if the main purpose of the model is prediction.
Based on a rule of thumb, it can be said that RMSE values between **0.2 and 0.5** shows that the model can relatively predict the data accurately.

There is no correct value for MSE. Simply put, the lower the value the better and 0 means the model is perfect.

MAE: A metric that tells us the **mean absolute difference** between the predicted values and the actual values in a dataset. The lower the MAE, the better a model fits a dataset.

### Using Surprise and testing with different algorithms (SVD, KnnBasic, KnnBaseline, KnnWithMeans, KnnWithZScore) to find the best one to use based on MAE and RMAE scores. 

In [6]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

#create dict for different models 
models=[SVD(), KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore()] 
results = {} #to store the scores

#perform cross validation of MAE and RMSE for all models
for model in models:
    #kfold set to 5
    crossval_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    #saving and renaming appropraitely
    result = pd.DataFrame.from_dict(crossval_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE', 'fit_time': 'Fit Time', 'test_time': 'Test Time'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result
    
#printing all models results
all_models = pd.DataFrame.from_dict(results)
all_models.T.sort_values(by='RMSE') #models sorted by RMSE

Unnamed: 0,MAE,RMSE,Fit Time,Test Time
matrix_factorization.SVD,1.095515,1.350589,0.035295,0.033734
knns.KNNBaseline,1.273543,1.542233,0.004437,0.059345
knns.KNNWithMeans,1.277918,1.548912,0.003092,0.023132
knns.KNNWithZScore,1.277385,1.549418,0.007281,0.036641
knns.KNNBasic,1.312221,1.580766,0.002072,0.047676


####  Using the SVD model to generate collaborative filtering recommendations since it has the least MAE and RMSE results

In [7]:
#Since the svd recommender is making recommmendations from the ratings dataset, 
#its recommemnding courses the users have already done
def svd_cf_recommendations(user):
    
    #using the "best" parameters found using the gridsearchcv method from experiments notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset)
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    
    #save into dataframe
    rating_predictions = pd.DataFrame(predictions)
    #print(rating_predictions)
    
    #find the predictions for the given user
    user_predictions = rating_predictions[rating_predictions['uid'] == user].\
                         sort_values(by="est", ascending = False)
        
    #removing courses already done by user (assuming they are rated 1 to 5)  
    finalcourserecs = [] #to store the final course recs
    
    #uid is the user and iid is the course id
    for i, row in user_predictions.iterrows():
        #get rating details
        rating_dets = ratings_df[(ratings_df['course_id'] == row[1]) & (ratings_df['user_id'] == user)]
        
        if rating_dets["rating"].values[0] == 0:
            #print("Not rated!")
            finalcourserecs.append(row[1])
    
    return(finalcourserecs)

In [8]:
svd_cf_recommendations(1001) #input is a user and output is a course!

#1001 is a user (shown above no course is 1001)
#2082 is course since no user rating is found (check above)

[2246,
 2258,
 2335,
 2184,
 2300,
 2223,
 2370,
 2357,
 2412,
 2196,
 2241,
 2390,
 2320,
 2393,
 2005,
 2245,
 2312,
 2048,
 2100,
 2136,
 2221,
 2084,
 2270,
 2291,
 2243,
 2303,
 2091,
 2124]

In [9]:
ratings_df[ratings_df['user_id']== 1001]

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
20,2002,1001,3
40,2003,1001,4
60,2004,1001,2
80,2005,1001,0
...,...,...,...
8377,2420,1001,0
8397,2421,1001,4
8417,2422,1001,0
8437,2423,1001,2


In [10]:
ratings_df[ratings_df['course_id']== 2150]

Unnamed: 0,course_id,user_id,rating
2980,2150,1001,4
2981,2150,1002,5
2982,2150,1003,3
2983,2150,1004,2
2984,2150,1005,4
2985,2150,1006,5
2986,2150,1007,3
2987,2150,1008,4
2988,2150,1009,4
2989,2150,1010,5


In [11]:
svd_cf_recommendations(2150) #svd doeesnt work with input of courses

[]

In [12]:
svd_cf_recommendations("2150") #svd doeesnt work with input of courses

[]

## Content Based Filtering

### Using User and Course Dataset in Sim Matrix

In [13]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['degree_1'] = user_dataset['degree_1']
cf_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['degree_1'] + cf_user_dataset['degree_1_specializations'] + cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science & Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E.Computer Science & EngineeringComputer Eng...
1,1002,B.E.,Computer Science & Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E.Computer Science & EngineeringInterested i...
2,1003,B.E.,Computer Science & Engineering,Missing,Missing,B.E.Computer Science & EngineeringMissingMissing
3,1004,B.E.,Computer Science & Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E.Computer Science & EngineeringCurrently a ...
4,1005,B.E.,Computer Science & Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E.Computer Science & EngineeringTo have a gr...


In [14]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
0,1001,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
1,1002,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
2,1003,MITAOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
3,1004,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
4,1005,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA


In [15]:
#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = cf_user_dataset['description'] + cf_course_dataset['description']

In [16]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(comb['description'].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#create indices for the courses is using series
cf_course_dataset = cf_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

#function to get content-filtered recommendations
def get_course_cf_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

In [17]:
#getting the similar course recs for user 1001
get_course_cf_recommendations(1001).head(10)

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
894,894,1895,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
996,996,1997,"MIT,Pune",B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
93,93,1094,MITCOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
256,256,1257,MITAOE,B.E.,Mechanical,"ProE,CATIA","B.E.MechanicalProE,CATIA"
653,653,1654,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
941,941,1942,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
180,180,1181,MITAOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
428,428,1429,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
91,91,1092,MITAOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
123,123,1124,MITAOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."


In [18]:
user_dataset[user_dataset["userid"] == "1001"]

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [19]:
course_dataset[course_dataset["sr_"] == 1895]

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
894,894,1895,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


## Large Language Model (LLM) 

### Using GPT-4 (TfidVectorizer and Cosine Similarity)

In [20]:
#get the tokeniser of a specific model in the OpenAI API gpt-4
enc = tiktoken.encoding_for_model("gpt-4")

#set up OpenAI API credentials
openai.api_key = "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"

In [21]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
llm_user_dataset = pd.DataFrame()
llm_user_dataset['userid'] = user_dataset['userid']
llm_user_dataset['degree_1'] = user_dataset['degree_1']
llm_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
llm_user_dataset['career_objective'] = user_dataset['career_objective']
llm_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
llm_user_dataset.head(5)

#store the career objective and key skills in description
llm_user_dataset['description'] = llm_user_dataset['degree_1'] + llm_user_dataset['degree_1_specializations'] 
llm_user_dataset.head(5)

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science & Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E.Computer Science & Engineering
1,1002,B.E.,Computer Science & Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E.Computer Science & Engineering
2,1003,B.E.,Computer Science & Engineering,Missing,Missing,B.E.Computer Science & Engineering
3,1004,B.E.,Computer Science & Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E.Computer Science & Engineering
4,1005,B.E.,Computer Science & Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E.Computer Science & Engineering


In [22]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
llm_course_dataset = pd.DataFrame()
llm_course_dataset['sr_'] = course_dataset['sr_']
llm_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
llm_course_dataset['degree_1'] = course_dataset['degree_1']
llm_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
llm_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
llm_course_dataset.head(5)

#store the career objective and key skills in description
llm_course_dataset['description'] = llm_course_dataset['degree_1'] + llm_course_dataset['degree_1_specializations'] 

In [23]:
#load and preprocess data
data = llm_course_dataset
#data = data.dropna()
corpus = data["description"].tolist()

#extract features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

#compute pairwise similarities using cosine similarity
similarity_matrix = cosine_similarity(X)

#generate recommendations based on user input
def llm_recommender(user):
    course_recommendations = []
    
    user_input = llm_user_dataset["description"].loc[llm_user_dataset["userid"] == user].values[0] 
    
    user_index = corpus.index(user_input)
    recommendations = similarity_matrix[user_index].argsort()[:-6:-1]
    
    
    # Print top 5 recommendations
    #print("Top 5 Recommendations:")
    for i, index in enumerate(recommendations):
        #print(f"{i+1}. {data.loc[index]['sr_']}: {data.loc[index]['description']}")
        course_recommendations.append(data.loc[index]['sr_'])
    
    #return the course ids
    return course_recommendations

In [24]:
llm_recs1 = llm_recommender("1001")

In [25]:
llm_recs1

[7408, 2954, 8949, 8948, 8947]

### Using GPT3 (KNN & embeddings)

In [26]:
#establish a cache of embeddings to avoid recomputing
#cache is a dict of tuples (text, engine) -> embedding, saved as a pickle file

#set path to embedding cache
embedding_cache_path_to_load = "embeddingsCache.pkl"
embedding_cache_path_to_save = "embeddingsCache.pkl"

#load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path_to_load)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path_to_save, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

#function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(string: str, engine: str = "text-embedding-ada-002", embedding_cache=embedding_cache) -> list:
    
    if (string, engine) not in embedding_cache.keys():
        embedding_cache[(string, engine)] = get_embedding(string, engine)
        with open(embedding_cache_path_to_save, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
            
    return embedding_cache[(string, engine)]

In [27]:
#gets the knn recommendations
def get_knn_recommendations(user, engine: str = "text-embedding-ada-002",) -> List[int]:
    #get the user details
    user_dets = cf_user_dataset["description"].loc[cf_user_dataset["userid"] == user]

    #create the corpus
    df = pd.concat([user_dets,cf_course_dataset["description"].loc[:]]).reset_index(drop=True)
    corpus = df.tolist()
        
    #get embeddings for all strings
    embeddings = [embedding_from_string(string, engine=engine) for string in corpus]
    
    #get the embedding of the source string
    query_embedding = embeddings[0]
    
    #get distances between the source embedding and other embeddings (function from embeddings_utils.py)
    distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
    
    #get indices of nearest neighbors (function from embeddings_utils.py)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    #return corpus, indices_of_nearest_neighbors[1:31] #dont need to get the corpus here since it is only used for the evaluation
    return indices_of_nearest_neighbors[1:11]
    
llm_recs2 = get_knn_recommendations("1001")

In [28]:
llm_recs2.tolist()

[9235, 3568, 4774, 8452, 8451, 3299, 3906, 8450, 8448, 8447]

## The Success Rate Predictor 

In [29]:
#implement predictor using the user key skills and info and the predicted courses
def calculate_success_rate(user, recommendations):
    success_rate = []
    
    #get the users required info
    #user_details = user_dataset[user_dataset["userid"] == user]
    user_degree = user_dataset["degree_1"].loc[user_dataset["userid"] == user].values[0]
    user_spec = user_dataset["degree_1_specializations"].loc[user_dataset["userid"] == user].values[0]
    user_keyskills = user_dataset["key_skills_str"].loc[user_dataset["userid"] == user].values[0]

    #turn into list 
    user_degree = user_degree.split("-")
    user_spec = user_spec.split("-")
    user_keyskills = user_keyskills.split(",")
    user_keyskills = (map(lambda x: x.lower().strip(), user_keyskills))
    user_keyskills = list(user_keyskills)
    

    #get the recommmendations
    #compare the user details with the recommendations
    for i, row in recommendations.iterrows():
        #print course code
        #print("Checking Course ", i)
        
        #reset the percentages
        apdegree = 0
        apspec = 0
        apkeyskills = 0
        
        #get current course data
        curr_degree = row[1]
        curr_spec = row[2]
        curr_keyskills = row[4] #row 3 using final, might change to row 2 if we remove campus
        
        #turn into list 
        curr_degree = curr_degree.split("-")
        curr_spec = curr_spec.split("-")
        #curr_keyskills = curr_keyskills.split(",")
        curr_keyskills = curr_keyskills.strip().split(",")
        curr_keyskills = (map(lambda x: x.lower().strip(), curr_keyskills))
        curr_keyskills = list(curr_keyskills)
        
        #the percentages according to the academic success
        pdegree = 15
        pspec = 35
        pkeyskills = 50
        
        #compare degree
        if (user_degree == curr_degree):
            apdegree = pdegree
            
        #compare degree specs
        if (user_spec == curr_spec):
            
            apspec = pspec
        
        #compare keyskills 
        ptemp = pkeyskills / len(curr_keyskills) #this will be the percentages of each individual key skill
                
        for currk in curr_keyskills:   
            
            #in the case of missing currk
            if currk == "missing":
                print("There is not enough information!")

            #for case course 2258
            #course 2258 has a keyskill "C CPP" instead of "C" and "CPP". 
            #And its not being counted as a match since it takes them as one thing
            if currk == "c cpp":
                currk = "c"
                curr_keyskills.append("cpp")
                
                #adjust the percentages
                ptemp = pkeyskills / len(curr_keyskills)
                
            #add new if for cprogrammming same as c
            if currk == "cprogramming":
                currk = "c"

            if currk in user_keyskills:
                apkeyskills = apkeyskills + ptemp
                
        #calculate the total success rate and round to 2dp
        total = round(apdegree + apspec + apkeyskills, 2)
        
        #print("The Percentages: ", pdegree, pspec, pkeyskills)
        #print("Course: ", i, "SuccessRate: ", total)
        success_rate.append(total)
                    
    return success_rate

## The Final Recommender 

In [30]:
#gets course details of svd recommendations
def get_course_details(svd_courses): 
    
    #dataframe for final recommendations 
    finalRecs = pd.DataFrame()
    
    #iterate the list
    for i in svd_courses:
        
        #if course found in couse dataset then store details in finalRecs dataframe
        if i in course_dataset["sr_"]:
            course_details = course_dataset[course_dataset["sr_"] == i]
            finalRecs = finalRecs.append(course_details)

            #drop unneccessary columns 
            finalRecs.drop(["Unnamed: 0", "key_skills"], axis = 1, inplace = True)
        
    #return useful information
    return finalRecs[["sr_", "degree_1", "degree_1_specializations", "campus", "key_skills_str"]]

In [31]:
#removes courses that the user has already taken by checking the ratings in the ratings dataframe
def drop_courses_taken(user, cf):
    finalcourserecs = [] #to store the final course recs

    for i, row in cf.iterrows():
        #print(row[1])

        if row[1] in ratings_df["course_id"].values:
            #print("Course is found in the ratings dataframe!")
            #check if rated
            rating_dets = ratings_df[(ratings_df['course_id'] == row[1]) & (ratings_df['user_id'] == user)]
            #print(rating_dets)
            if rating_dets["rating"].values[0] != 0:
                cf.drop([i], inplace=True)
                finalcourserecs.append(row[1])
    return cf

In [32]:
def final_recommender(user):
    
    svd_courses = [] #pd.DataFrame()
    user_recs = pd.DataFrame()
    course_recs = pd.DataFrame()
    
    #get collaborative filtering recommendations
    svd_courses = svd_cf_recommendations(user)
    
    #get content based filtering recommendations
    cf_recs = get_course_cf_recommendations(user)
    
    #get llm1 recommendations
    llm1_courses = llm_recommender(str(user))
    #llm1_recs = pd.DataFrame(llm_recs1, columns=['sr_'])
    llm1_recs = get_course_details(llm1_courses)
    
    #get llm1 (knn) recommendations
    llm2_courses = get_knn_recommendations(str(user))
    #llm2_recs = pd.DataFrame(llm_recs2.tolist(), columns=['sr_'])
    llm2_recs = get_course_details(llm2_courses.tolist())

    
    #adjust columns as necessary
    cf_recs = cf_recs.loc[:, ["sr_","degree_1","degree_1_specializations","campus", "key_skills_str", "index","description"]]

    #if there are no similar users with ratings for the svd, recommendations will be made solely on content-based filtering
    if len(svd_courses) == 0:
        #there are no svd recs
        have_svd = False
        print("No SVD Recommednations!")
    else:
        #get course details of svd recs
        have_svd = True
        
        #check and drop courses if taken by user 
        cf_recs = drop_courses_taken(user, cf_recs)
        svd_recs = get_course_details(svd_courses)
        
    #making a filter column to show where the recommendations were generated from
    cf_recs["Filter"] = "Content-Based"
    llm1_recs["Filter"] = "LLM (gpt-4)"
    llm2_recs["Filter"] = "LLM (KNN)"
    
    #calculate the success rates and save in Success Rate column
    cf_success_rate = calculate_success_rate(str(user), cf_recs)     
    cf_recs['Success Rate'] = cf_success_rate
    
    llm1_success_rate = calculate_success_rate(str(user), llm1_recs)  
    llm1_recs['Success Rate'] = llm1_success_rate
    
    llm2_success_rate = calculate_success_rate(str(user), llm2_recs)  
    llm2_recs['Success Rate'] = llm2_success_rate
    
    #sort to get the courses with the highest success rate
    cf_recs = cf_recs.sort_values(by=['Success Rate'], ascending=False) 
    llm1_recs = llm1_recs.sort_values(by=['Success Rate'], ascending=False)
    llm2_recs = llm2_recs.sort_values(by=['Success Rate'], ascending=False)

    #displaying
    #get final recommendations using concat to get the top 5 recs from both recommenders
    #IF THERE ARE NO SVD MAKE THIS ILOC 10
    #if there are svd recommendations do the same for the svd recommendations
    if have_svd == True:
        
        svd_recs["Filter"] = "Collaborative"
        
        #calculate svd success rate
        svd_success_rate = calculate_success_rate(str(user), svd_recs) 
        svd_recs['Success Rate'] = svd_success_rate 
        
        svd_recs = svd_recs.sort_values(by=['Success Rate'], ascending=False)
    
        #get final recommendations using concat to get the top 5 recs from both recommenders
        final = pd.concat([cf_recs.iloc[:5], svd_recs.iloc[:5]], ignore_index=True, sort=False)
    else:
        final = cf_recs.head(10)
        
    final = pd.concat([final, llm1_recs.iloc[:2]], ignore_index=True, sort=False)
    final = pd.concat([final, llm2_recs.iloc[:2]], ignore_index=True, sort=False)

    #rename index and set index to it
    final = final.rename(columns = {final.columns[0]: "Course Code"})
    final.set_index("Course Code", inplace = True)

    #drop unnecessary colummns
    final.drop(["index", "description"], axis = 1, inplace = True)

    #rename other columns
    final.rename(columns = {"degree_1":"Degree", "degree_1_specializations": "Degree Specializations", "campus": "Campus", "key_skills": "Key Skills Actual", "key_skills_str": "Key Skills", "Filter": "Filter"}, inplace = True)

    print("Course Recommmendations for User", user, ":")
    final = final.reset_index()

    return final

In [33]:
user_dataset[user_dataset["userid"] == "1001"]

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [34]:
final_recommender(1001)

Course Recommmendations for User 1001 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1895,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
1,1394,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
2,1934,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
3,1188,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
4,1957,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
5,2221,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
6,2223,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
7,2136,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
8,2196,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
9,2370,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,JAVA",Collaborative,65.0


In [35]:
final_recommender(1002)

Course Recommmendations for User 1002 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1896,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,66.67
1,1428,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,66.67
2,1212,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,66.67
3,1245,B.E.,Mechanical,MITAOE,CATIA,Content-Based,15.0
4,1765,B.E.,Mechanical,MITCOE,CATIA,Content-Based,15.0
5,2141,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,66.67
6,2221,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,66.67
7,2151,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,66.67
8,2370,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,JAVA",Collaborative,40.0
9,2186,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"C,JAVA",Collaborative,25.0


In [36]:
final_recommender(1003)

Course Recommmendations for User 1003 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1006,B.E.,Mechanical,MITCOE,CATIA,Content-Based,15
1,1007,B.E.,Mechanical,MITAOE,CATIA,Content-Based,15
2,1271,B.E.,Mechanical,MITCOE,AUTOCAD,Content-Based,15
3,1066,B.E.,Mechanical,MITCOE,AUTOCAD,Content-Based,15
4,1064,B.E.,Mechanical,MITCOE,AUTOCAD,Content-Based,15
5,2385,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,50
6,2143,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,50
7,2148,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,50
8,2138,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,50
9,2068,B.E.,Mechanical,MITAOE,CATIA,Collaborative,15


In [37]:
final_recommender(1009)

Course Recommmendations for User 1009 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1947,B.E.,Computer Science & Engineering,MITCOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Content-Based,87.5
1,1697,B.E.,Computer Science & Engineering,MITCOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Content-Based,87.5
2,1420,B.E.,Computer Science & Engineering,MITCOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Content-Based,87.5
3,1419,B.E.,Computer Science & Engineering,MITAOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Content-Based,87.5
4,1404,B.E.,Computer Science & Engineering,MITAOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Content-Based,87.5
5,2409,B.E.,Computer Science & Engineering,MITAOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Collaborative,87.5
6,2143,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,83.33
7,2135,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,83.33
8,2191,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,83.33
9,2142,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,83.33


In [38]:
final_recommender(1010)

Course Recommmendations for User 1010 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1904,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
1,1709,B.E.,Computer Science & Engineering,MITAOE,"Java, JavasCript, CPP, Laravel Phpframework ,...",Content-Based,50
2,1545,B.E.,Mechanical,MITCOE,CATIA,Content-Based,15
3,2069,B.E.,Mechanical,MITAOE,CATIA,Content-Based,15
4,2068,B.E.,Mechanical,MITAOE,CATIA,Content-Based,15
5,2210,B.E.,Computer Science & Engineering,MITAOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Collaborative,50
6,2032,B.E.,Mechanical,MITAOE,"CAD,CAM",Collaborative,15
7,2047,B.E.,Mechanical,MITCOE,"ProE,CATIA",Collaborative,15
8,2073,B.E.,Mechanical,MITCOE,AUTOCAD,Collaborative,15
9,2035,B.E.,Mechanical,MITAOE,"CAD,CAM",Collaborative,15


## TESTING 

In [39]:
final_recommender(1100)

No SVD Recommednations!
Course Recommmendations for User 1100 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1085,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,JAVA",Content-Based,15
1,1176,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,PCB design",Content-Based,15
2,1863,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,PCB design",Content-Based,15
3,1861,B.E.,Electronics Telecommunication Engineering,MITAOE,"C,PCB design",Content-Based,15
4,1621,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","C,PCB design",Content-Based,15
5,1620,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","C,PCB design",Content-Based,15
6,1610,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","C,PCB design",Content-Based,15
7,1449,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","C,PCB design",Content-Based,15
8,1421,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","C,PCB design",Content-Based,15
9,1383,B.E.,Electronics Telecommunication Engineering,MITCOE,"C,PCB design",Content-Based,15


In [40]:
final_recommender(1555)

No SVD Recommednations!
Course Recommmendations for User 1555 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1721,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
1,1941,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
2,1392,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
3,1380,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
4,1196,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
5,1195,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
6,1194,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
7,1187,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
8,1186,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33
9,1185,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,83.33


In [41]:
final_recommender(1666)

No SVD Recommednations!
Course Recommmendations for User 1666 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1149,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
1,1183,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
2,1678,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
3,1674,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
4,1667,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
5,1666,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
6,1664,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
7,1651,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
8,1650,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50
9,1649,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,50


In [42]:
final_recommender(2000) 

No SVD Recommednations!
Course Recommmendations for User 2000 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1219,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
1,2010,B.E.,Electronics Telecommunication Engineering,MITCOE,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
2,1988,B.E.,Electronics Telecommunication Engineering,MITAOE,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
3,1731,B.E.,Electronics Telecommunication Engineering,MITAOE,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
4,1981,B.E.,Electronics Telecommunication Engineering,MITAOE,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
5,2002,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
6,1224,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
7,1757,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
8,1980,B.E.,Electronics Telecommunication Engineering,MITCOE,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15
9,1990,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",Content-Based,15


In [43]:
final_recommender(1777) #for the GPT4 REC

No SVD Recommednations!
Course Recommmendations for User 1777 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1018,B.E.,Mechanical,MITCOE,AUTOCAD,Content-Based,15.0
1,1064,B.E.,Mechanical,MITCOE,AUTOCAD,Content-Based,15.0
2,1307,B.E.,Mechanical,MITCOE,"AutoCAD, PROE",Content-Based,15.0
3,1306,B.E.,Mechanical,MITCOE,"AutoCAD, PROE",Content-Based,15.0
4,1305,B.E.,Mechanical,MITAOE,"AutoCAD, PROE",Content-Based,15.0
5,1287,B.E.,Mechanical,MITCOE,"AutoCAD, PROE",Content-Based,15.0
6,1263,B.E.,Mechanical,MITCOE,"AutoCAD, PROE",Content-Based,15.0
7,1262,B.E.,Mechanical,MITCOE,"AutoCAD, PROE",Content-Based,15.0
8,1258,B.E.,Mechanical,MITAOE,"AutoCAD, PROE",Content-Based,15.0
9,1106,B.E.,Mechanical,MITAOE,"AutoCAD, PROE",Content-Based,15.0
