In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import squarify
import seaborn as sns
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split
import random

## Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


## Collaborative Filtering

In [5]:
#Collaborative Filtering is based on the idea that users similar to a me can be used to predict how much I will 
#like a particular product or service those users have used/experienced but I have not.
#I will not be implementing Collaborative Filtering from scratch. Instead, I will use the Surprise library 
#that used extremely powerful algorithms like Singular Value Decomposition (SVD) to minimise RMSE (Root Mean Square Error)
#and give great recommendations.
#Implementation of SVD for surprise library is given on this link

Lower values of RMSE indicate better fit. RMSE is a good measure of how accurately the model predicts the response. It's the most important criterion for fit if the main purpose of the model is prediction.
Based on a rule of thumb, it can be said that RMSE values between **0.2 and 0.5** shows that the model can relatively predict the data accurately.

There is no correct value for MSE. Simply put, the lower the value the better and 0 means the model is perfect.

MAE: A metric that tells us the **mean absolute difference** between the predicted values and the actual values in a dataset. The lower the MAE, the better a model fits a dataset.

### Using Surprise and testing with different algorithms (SVD, KnnBasic, KnnBaseline, KnnWithMeans, KnnWithZScore) to find the best one to use based on MAE and RMAE scores. 

In [6]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

#create dict for different models 
models=[SVD(), KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore()] 
results = {} #to store the scores

#perform cross validation of MAE and RMSE for all models
for model in models:
    #kfold set to 5
    crossval_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    #saving and renaming appropraitely
    result = pd.DataFrame.from_dict(crossval_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE', 'fit_time': 'Fit Time', 'test_time': 'Test Time'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result
    
#printing all models results
all_models = pd.DataFrame.from_dict(results)
all_models.T.sort_values(by='RMSE') #models sorted by RMSE

Unnamed: 0,MAE,RMSE,Fit Time,Test Time
matrix_factorization.SVD,1.121702,1.484131,0.078996,0.054432
knns.KNNWithMeans,1.291982,1.653239,0.008013,0.055196
knns.KNNBaseline,1.296408,1.65473,0.014246,0.134936
knns.KNNWithZScore,1.301111,1.669038,0.013036,0.065534
knns.KNNBasic,1.324,1.678584,0.004605,0.086612


####  Using the SVD model to generate collaborative filtering recommendations since it has the least MAE and RMSE results

In [7]:
#Since the svd recommender is making recommmendations from the ratings dataset, 
#its recommemnding courses the users have already done
def svd_cf_recommendations(user):
    
    #usisng the "best" parameters found using the gridsearchcv method from experiments notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset)
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    
    #save into dataframe
    rating_predictions = pd.DataFrame(predictions)
    #print(rating_predictions)
    
    #find the predictions for the given user
    user_predictions = rating_predictions[rating_predictions['uid'] == user].\
                         sort_values(by="est", ascending = False)
        
    #removing courses already done by user (assuming they are rated 1 to 5)  
    finalcourserecs = [] #to store the final course recs
    
    #uid is the user and iid is the course id
    for i, row in user_predictions.iterrows():
        # Select Multiple Conditions using Multiple Columns
        rating_dets = ratings_df[(ratings_df['course_id'] == row[1]) & (ratings_df['user_id'] == user)]
        
        if rating_dets["rating"].values[0] == 0:
            #print("Not rated!")
            finalcourserecs.append(row[1])
    
    return(finalcourserecs)

In [8]:
svd_cf_recommendations(1001) #input is a user and output is a course!

#1001 is a user (shown above no course is 1001)
#2082 is course since no user rating is found (check above)

[2294,
 2202,
 2147,
 2252,
 2076,
 2396,
 2241,
 2184,
 2084,
 2357,
 2005,
 2393,
 2016,
 2245,
 2390,
 2161,
 2330,
 2297,
 2243,
 2407,
 2303,
 2353,
 2317,
 2307,
 2100,
 2269,
 2221,
 2157,
 2281,
 2136,
 2197,
 2116,
 2232,
 2124]

In [9]:
ratings_df[ratings_df['user_id']== 1001]

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
20,2002,1001,3
40,2003,1001,4
60,2004,1001,2
80,2005,1001,0
...,...,...,...
8380,2420,1001,0
8400,2421,1001,4
8420,2422,1001,0
8440,2423,1001,2


In [10]:
ratings_df[ratings_df['course_id']== 2150]

Unnamed: 0,course_id,user_id,rating
2980,2150,1001,4
2981,2150,1002,5
2982,2150,1003,3
2983,2150,1004,2
2984,2150,1005,4
2985,2150,1006,5
2986,2150,1007,3
2987,2150,1008,4
2988,2150,1009,4
2989,2150,1010,5


In [11]:
svd_cf_recommendations(2150) #svd doeesnt work with input of courses

[]

In [12]:
svd_cf_recommendations("2150") #svd doeesnt work with input of courses

[]

## Content Based Filtering

### User Dataset - Based on Career Objective and Key Skills

In [13]:
user_dataset.head(2)

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."


In [14]:
#creating a new dataset dataset using the career objetcive, key skills and their user id
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

Unnamed: 0,userid,career_objective,key_skills_str,description
0,1001,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
1,1002,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",Interested in working under company offering A...
2,1003,Missing,Missing,MissingMissing
3,1004,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",Currently a final year student of Computer Eng...
4,1005,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",To have a growth oriented and challenging care...


In [15]:
#vectorize using countvectorize that converts into a matrix of token counts
user_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
user_count_matrix = user_count.fit_transform(cf_user_dataset['description'].values.astype('U'))

#cosine similarity between the the matrix itself (normalized dot product of X and Y)
user_cosine_sim = cosine_similarity(user_count_matrix, user_count_matrix)

#create indices for the user is using series
cf_user_dataset = cf_user_dataset.reset_index()
user_ids = cf_user_dataset
indices = pd.Series(cf_user_dataset.index, index=cf_user_dataset['userid'])


#function to get content-filtered recommendations
def get_user_cf_recommendations(user):
    
    #get index of user
    user_id = indices[user]
    
    #find the most similar 30 users using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(user_cosine_sim[user_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    user_indices = [i[0] for i in sim_scores]
    
    return user_ids.iloc[user_indices]

In [16]:
#getting the similar users recs for user 1001
get_user_cf_recommendations("1001")

Unnamed: 0,index,userid,career_objective,key_skills_str,description
894,894,1847,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
996,996,1946,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
93,93,1087,Dedicated and passionate computer engineering ...,"Java, Python, Machine Learning, CPP, Andro...",Dedicated and passionate computer engineering ...
256,256,1231,To enhance my expertise in the field of softwa...,"NLP, Java, Neural Networks, Keras, Python,...",To enhance my expertise in the field of softwa...
201,201,1182,Missing,"C, Data Analysis, Java, Neural Networks, D...","MissingC, Data Analysis, Java, Neural Netwo..."
180,180,1161,I am a student programmer currently seeking ex...,"Python, Data Science, Artificial Intelligenc...",I am a student programmer currently seeking ex...
91,91,1085,Team oriented individual with strong communica...,"C, Statistics, Java, MS Office, Python, D...",Team oriented individual with strong communica...
464,464,1416,Missing,"Database, Embedded C, Machine Learning","MissingDatabase, Embedded C, Machine Learning"
981,981,1931,Missing,"Deep Learning, Selenium, Cpp","MissingDeep Learning, Selenium, Cpp"
165,165,1146,Missing,"Data Structures, Android, Algorithms, Probl...","MissingData Structures, Android, Algorithms,..."


In [17]:
get_user_cf_recommendations("1847").head(10)

Unnamed: 0,index,userid,career_objective,key_skills_str,description
996,996,1946,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
0,0,1001,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
93,93,1087,Dedicated and passionate computer engineering ...,"Java, Python, Machine Learning, CPP, Andro...",Dedicated and passionate computer engineering ...
981,981,1931,Missing,"Deep Learning, Selenium, Cpp","MissingDeep Learning, Selenium, Cpp"
110,110,1102,Dedicated and Passionate computer engineering ...,"C, Java, Javascript, cpp, Object-Oriented ...",Dedicated and Passionate computer engineering ...
256,256,1231,To enhance my expertise in the field of softwa...,"NLP, Java, Neural Networks, Keras, Python,...",To enhance my expertise in the field of softwa...
180,180,1161,I am a student programmer currently seeking ex...,"Python, Data Science, Artificial Intelligenc...",I am a student programmer currently seeking ex...
91,91,1085,Team oriented individual with strong communica...,"C, Statistics, Java, MS Office, Python, D...",Team oriented individual with strong communica...
165,165,1146,Missing,"Data Structures, Android, Algorithms, Probl...","MissingData Structures, Android, Algorithms,..."
201,201,1182,Missing,"C, Data Analysis, Java, Neural Networks, D...","MissingC, Data Analysis, Java, Neural Netwo..."


### Using User and Course Dataset in Sim Matrix

In [18]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['degree_1'] = user_dataset['degree_1']
cf_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['degree_1'] + cf_user_dataset['degree_1_specializations'] + cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science & Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E.Computer Science & EngineeringComputer Eng...
1,1002,B.E.,Computer Science & Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E.Computer Science & EngineeringInterested i...
2,1003,B.E.,Computer Science & Engineering,Missing,Missing,B.E.Computer Science & EngineeringMissingMissing
3,1004,B.E.,Computer Science & Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E.Computer Science & EngineeringCurrently a ...
4,1005,B.E.,Computer Science & Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E.Computer Science & EngineeringTo have a gr...


In [19]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
0,1001,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
1,1002,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
2,1003,MITAOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
3,1004,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
4,1005,MITCOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA


In [20]:
#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = cf_user_dataset['description'] + cf_user_dataset['description']

In [21]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(comb['description'].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#create indices for the courses is using series
cf_course_dataset = cf_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

#function to get content-filtered recommendations
def get_course_cf_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:501]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

In [38]:
#getting the similar course recs for user 1001
cf = get_course_cf_recommendations(1001)

In [39]:
cf

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
894,894,1895,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."
996,996,1997,"MIT,Pune",B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
93,93,1094,MITCOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
201,201,1202,MIT WPU,M TeCh,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",M TeCh Electronics Telecommunication Engineer...
256,256,1257,MITAOE,B.E.,Mechanical,"ProE,CATIA","B.E.MechanicalProE,CATIA"
...,...,...,...,...,...,...,...
20,20,1021,MITAOE,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
914,914,1915,"MIT,Pune",B.E.,Electronics Telecommunication Engineering,"C,PCB design",B.E.Electronics Telecommunication Engineering...
369,369,1370,MITCOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"
39,39,1040,MITAOE,B.E.,Mechanical,"AutoCAD, PROE","B.E.MechanicalAutoCAD, PROE"


In [42]:
cf[cf["sr_"] == 2087]

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description


In [43]:
def drop_courses_taken(user, cf):
    finalcourserecs = [] #to store the final course recs

    for i, row in cf.iterrows():
        #print(row[1])

        if row[1] in ratings_df["course_id"].values:
            #print("Course is found in the ratings dataframe!")
            #check if rated
            rating_dets = ratings_df[(ratings_df['course_id'] == row[1]) & (ratings_df['user_id'] == user)]
            #print(rating_dets)
            if rating_dets["rating"].values[0] != 0:
                cf.drop([i], inplace=True)
                finalcourserecs.append(row[1])
    return cf

In [45]:
cf = drop_courses_taken(1001, cf)

In [32]:
user = 1001
finalcourserecs = [] #to store the final course recs

for i, row in cf.iterrows():
    #print(row[1])
    
    if row[1] in ratings_df["course_id"].values:
        print("Course is found in the ratings dataframe!")
        #check if rated
        rating_dets = ratings_df[(ratings_df['course_id'] == row[1]) & (ratings_df['user_id'] == user)]
        #print(rating_dets)
        if rating_dets["rating"].values[0] != 0:
            cf.drop([i], inplace=True)
            finalcourserecs.append(row[1])
    #else:
        #print("Course not in ratings dataframe")

Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dataframe!
Course is found in the ratings dat

In [33]:
finalcourserecs

[2090,
 2003,
 2006,
 2010,
 2015,
 2022,
 2026,
 2028,
 2030,
 2038,
 2040,
 2044,
 2047,
 2049,
 2051,
 2056,
 2057,
 2058,
 2062,
 2065,
 2067,
 2068,
 2069,
 2075,
 2078,
 2079,
 2025,
 2042,
 2029,
 2035,
 2036,
 2087]

In [36]:
cf[cf["sr_"] == 1895]

Unnamed: 0,index,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
894,894,1895,MITCOE,B.E.,Computer Science & Engineering,"C, Java, CPP, HTML, CMStool, MYSQL","B.E.Computer Science & EngineeringC, Java, CPP..."


In [28]:
course_dataset[course_dataset["sr_"] == 1895]

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
894,894,1895,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


## The Hybrid Recommender 

This recommender will take as input the user and output a number of course recommendations for the user using a hybrid approach. The Collaborative Filtering model will be SVD and the Content Filtering is done using a CountVectorizer and similarity matrix.

To get the recommendations, courses that have been previously taken and considered completed by the user will be removed from the recommendations list.

Furthermore, the predictor will be developed after this is complete. 

Finally, the cluster technique might not be used since collaborative filtering was implemented using surprise

In [29]:
#gets course details of svd recommendations
def get_course_details(svd_courses): 
    
    #dataframe for final recommendations 
    finalRecs = pd.DataFrame()
    
    #iterate the list
    for i in svd_courses:
        
        #if course found in couse dataset then store details in finalRecs dataframe
        if i in course_dataset["sr_"]:
            course_details = course_dataset[course_dataset["sr_"] == i]
            finalRecs = finalRecs.append(course_details)

            #drop unneccessary columns 
            finalRecs.drop(["Unnamed: 0", "key_skills"], axis = 1, inplace = True)
        
    #return useful information
    return finalRecs[["sr_", "degree_1", "degree_1_specializations", "campus", "key_skills_str"]]

In [30]:
def hybrid_recommender(user):
    
    svd_courses = [] #pd.DataFrame()
    user_recs = pd.DataFrame()
    course_recs = pd.DataFrame()
    
    #get collaborative filtering recommendations
    svd_courses = svd_cf_recommendations(user)
    
    #get content based filtering recommendations
    cf_recs = get_course_cf_recommendations(user)
    
    #adjust columns as necessary
    cf_recs = cf_recs.loc[:, ["sr_","degree_1","degree_1_specializations","campus", "key_skills_str", "index","description"]]
    
    #if there are no similar users with ratings for the svd, recommendations will be made solely on content-based filtering
    if len(svd_courses) == 0: #there are no svd recs
        have_svd = False
    else:
        have_svd = True
        svd_recs = get_course_details(svd_courses)
        
    #making a filter column to show where the recommendations were generated from
    cf_recs["Filter"] = "Content-Based"
    
    #calculate the success rates and save in Success Rate column
    cf_success_rate = calculate_success_rate(str(user), cf_recs) 
    
    cf_recs['Success Rate'] = cf_success_rate
    #sort to get the courses with the highest success rate
    cf_recs = cf_recs.sort_values(by=['Success Rate'], ascending=False) 
    
    #if there are svd recommendations do the same for the svd recommendations
    if have_svd == True:
        
        svd_recs["Filter"] = "Collaborative"
        svd_success_rate = calculate_success_rate(str(user), svd_recs) 
        svd_recs['Success Rate'] = svd_success_rate 
        svd_recs = svd_recs.sort_values(by=['Success Rate'], ascending=False)
    
        #get final recommendations using concat to get the top 5 recs from both recommenders
        final = pd.concat([cf_recs.iloc[:5], svd_recs.iloc[:5]], ignore_index=True, sort=False)
    else:
        final = cf_recs.head(10)
       

    #rename index and set index to it
    final = final.rename(columns = {final.columns[0]: "Course Code"})
    final.set_index("Course Code", inplace = True)

    #drop unnecessary colummns
    final.drop(["index", "description"], axis = 1, inplace = True)

    #rename other columns
    final.rename(columns = {"degree_1":"Degree", "degree_1_specializations": "Degree Specializations", "campus": "Campus", "key_skills": "Key Skills Actual", "key_skills_str": "Key Skills", "Filter": "Filter"}, inplace = True)

    print("Course Recommmendations for User", user, ":")
    final = final.reset_index()

    return final

In [31]:
hybrid_recommender(2002)

NameError: name 'calculate_success_rate' is not defined

## The Success Rate Predictor 

In [None]:
#gets the percentages of the three items
def get_percentage(length):
    
    if length == 0:
        pdegree = 0
        pspec = 0
        pkeyskills = 0
        print("There are no key skills!")
        
        return pdegree, pspec, pkeyskills
    
    if length == 1 or length == 2:
        pdegree = 30 
        pspec = 30
        pkeyskills = 40 
        
        return pdegree, pspec, pkeyskills
    
    elif length > 2 and length <= 4:
        pdegree = 25 
        pspec = 25
        pkeyskills = 50 
        
        return pdegree, pspec, pkeyskills
    
    elif length > 4 and length <= 9:
        pdegree = 20 
        pspec = 20
        pkeyskills = 60
        
        return pdegree, pspec, pkeyskills
    
    else:
        pdegree = 15 
        pspec = 15
        pkeyskills = 70

        return pdegree, pspec, pkeyskills

In [None]:
#implement predictor using the user key skills and info and the predicted courses
def calculate_success_rate(user, recommendations):
    success_rate = []
    
    #get the users required info
    #user_details = user_dataset[user_dataset["userid"] == user]
    user_degree = user_dataset["degree_1"].loc[user_dataset["userid"] == user].values[0]
    user_spec = user_dataset["degree_1_specializations"].loc[user_dataset["userid"] == user].values[0]
    user_keyskills = user_dataset["key_skills_str"].loc[user_dataset["userid"] == user].values[0]

    #turn into list 
    user_degree = user_degree.split("-")
    user_spec = user_spec.split("-")
    user_keyskills = user_keyskills.split(",")
    user_keyskills = (map(lambda x: x.lower().strip(), user_keyskills))
    user_keyskills = list(user_keyskills)
    

    #get the recommmendations
    #compare the user details with the recommendations
    for i, row in recommendations.iterrows():
        #print course code
        #print("Checking Course ", i)
        
        #reset the percentages
        apdegree = 0
        apspec = 0
        apkeyskills = 0
        
        #get current course data
        curr_degree = row[1]
        curr_spec = row[2]
        curr_keyskills = row[4] #row 3 using final, might change to row 2 if we remove campus
        
        #turn into list 
        curr_degree = curr_degree.split("-")
        curr_spec = curr_spec.split("-")
        #curr_keyskills = curr_keyskills.split(",")
        curr_keyskills = curr_keyskills.strip().split(",")
        curr_keyskills = (map(lambda x: x.lower().strip(), curr_keyskills))
        curr_keyskills = list(curr_keyskills)

        #get percentages
        pdegree, pspec, pkeyskills = get_percentage(len(curr_keyskills))
        
        #compare degree
        if (user_degree == curr_degree):
            apdegree = pdegree
            
        #compare degree specs
        if (user_spec == curr_spec):
            
            apspec = pspec
        
        #compare keyskills 
        ptemp = pkeyskills / len(curr_keyskills) #this will be the percentages of each individual key skill
                
        for currk in curr_keyskills:   
            
            #in the case of missing currk
            if currk == "missing":
                print("There is not enough information!")

            #for case course 2258
            #course 2258 has a keyskill "C CPP" instead of "C" and "CPP". 
            #And its not being counted as a match since it takes them as one thing
            if currk == "c cpp":
                currk = "c"
                curr_keyskills.append("cpp")
                
                #adjust the percentages
                ptemp = pkeyskills / len(curr_keyskills)
                
            #add new if for cprogrammming same as c
            if currk == "cprogramming":
                currk = "c"

            if currk in user_keyskills:
                apkeyskills = apkeyskills + ptemp
                
        #calculate the total success rate and round to 2dp
        total = round(apdegree + apspec + apkeyskills, 2)
        
        #print("The Percentages: ", pdegree, pspec, pkeyskills)
        #print("Course: ", i, "SuccessRate: ", total)
        success_rate.append(total)
                    
    return success_rate