In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split, GridSearchCV
import random

### Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("../data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("../data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("../data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Metrics: RMSE, MSE & MAE

In [5]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

#data is set to the training dataset
data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)


In [6]:
#algo
svd = SVD()

#cross validate with kfold set to 5
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
svd.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(svd.test(testset))
accuracy.mse(svd.test(testset))
accuracy.mae(svd.test(testset))

print('\nPredict Tests: ')
print(svd.predict(1001, 2001))
print(svd.predict(2001, 1001))
print(svd.predict(1001, 88))
print(svd.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
svd.test(testset)[:2]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3450  1.3655  1.3350  1.3859  1.4045  1.3672  0.0256  
MAE (testset)     1.0733  1.1215  1.0782  1.1261  1.1482  1.1095  0.0290  
Fit time          0.03    0.03    0.03    0.04    0.03    0.03    0.00    
Test time         0.01    0.01    0.00    0.10    0.00    0.02    0.04    

Accuracy on the testset:
RMSE: 1.2674
MSE: 1.6063
MAE:  0.9950

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 4.35   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.41   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.41   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=4.3505108981606595, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.772439308595188, details={'was_impossible': False})]

In [7]:
params = {"n_factors": range(10, 100, 20),
         "n_epochs": [5, 10, 20],
         "lr_all": [0.002, 0.005],
         "reg_all": [0.2, 0.5]}

gsSVD = GridSearchCV(SVD, params, measures = ["RMSE", "MAE"], cv = 5, n_jobs = -1)
gsSVD.fit(data)

print(f'\nRMSE Best Parameters: {gsSVD.best_params["rmse"]}')
print(f'RMSE Best Score: {gsSVD.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsSVD.best_params["mae"]}')
print(f'MAE Best Score: {gsSVD.best_score["mae"]}')


RMSE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score: 1.4999093392883065
MAE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score: 1.2658359738788394


In [8]:
finalSVD = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
predictions = finalSVD.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)


Updated Accuracy: 
RMSE: 1.3034
MSE: 1.6988
MAE:  1.0938


1.0938094408567685

### Checking Course and User Details

In [9]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

In [10]:
#Since the svd recommender is making recommmendations from the ratings dataset, 
#its recommemnding courses the users have already done
def svd_cf_recommendations(user):
    
    #using the "best" parameters found using the gridsearchcv method from experiments notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset)
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    
    #save into dataframe
    rating_predictions = pd.DataFrame(predictions)
    #print(rating_predictions)
    
    #find the predictions for the given user
    user_predictions = rating_predictions[rating_predictions['uid'] == user].\
                         sort_values(by="est", ascending = False)
        
    #removing courses already done by user (assuming they are rated 1 to 5)  
    finalcourserecs = [] #to store the final course recs
    
    #uid is the user and iid is the course id
    for i, row in user_predictions.iterrows():
        #get rating details
        rating_dets = ratings_df[(ratings_df['course_id'] == row[1]) & (ratings_df['user_id'] == user)]
        
        if rating_dets["rating"].values[0] == 0:
            #print("Not rated!")
            finalcourserecs.append(row[1])
    
    return(finalcourserecs)

In [11]:
svd_cf_recommendations(1001) #input is a user and output is a course!svdrecs
#1001 is a user (shown above no course is 1001)
#2082 is course since no user rating is found (check above)

[2274,
 2266,
 2133,
 2176,
 2385,
 2407,
 2172,
 2355,
 2215,
 2245,
 2281,
 2316,
 2286,
 2197,
 2186,
 2424,
 2343,
 2226,
 2221,
 2350,
 2106,
 2420,
 2178,
 2157,
 2275,
 2297,
 2129]

In [12]:
user_dataset[user_dataset["userid"] == "1001"]

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [13]:
course_dataset[course_dataset["sr_"] == 2274]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1273,2274,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"['LoRateChnology, C, MathematiCaltoolMATLAB, b...","LoRateChnology, C, MathematiCaltoolMATLAB, bas..."


In [14]:
course_dataset[course_dataset["sr_"] == 2266]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1265,2266,B.E.,Electronics Telecommunication Engineering,MITAOE,"['AmazonWebServiCes, C CPP, Arduino, MongoDB, ...","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li..."


In [15]:
course_dataset[course_dataset["sr_"] == 2133]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1132,2133,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","['C,PCB design']","C,PCB design"


In [16]:
course_dataset[course_dataset["sr_"] == 2176]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1175,2176,B.E.,Mechanical,MITAOE,"['SOLIDWORKS, AUTOCAD, CREO']","SOLIDWORKS, AUTOCAD, CREO"


In [17]:
course_dataset[course_dataset["sr_"] == 2385]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1384,2385,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


In [18]:
course_dataset[course_dataset["sr_"] == 2407]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1406,2407,B.E.,Computer Science & Engineering,MITCOE,"['CPPProgramming, Core JAVA, CProgramming, MyS...","CPPProgramming, Core JAVA, CProgramming, MySql"


In [19]:
course_dataset[course_dataset["sr_"] == 2172]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1171,2172,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"
