In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split, GridSearchCV
import random

### Calling the Saved Dataset

In [2]:
ratings_df = pd.read_csv("../data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


### Using Surprise and testing with different algorithms (SVD, KnnBasic, KnnBaseline, KnnWithMeans, KnnWithZScore) to find the best model to use based on MAE, MSE and RMSE scores. 

In [3]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

#data is set to the training dataset
data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

#create dict for different models 
models=[SVD(), KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore()] 
results = {} #to store the scores

#perform cross validation of MAE and RMSE for all models
for model in models:
    #kfold set to 5
    crossval_scores = cross_validate(model, data, measures=["MAE", "MSE", "RMSE"], cv=5, n_jobs=-1)  
    
    #saving and renaming appropraitely
    result = pd.DataFrame.from_dict(crossval_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_mse': 'MSE', 'test_rmse': 'RMSE', 'fit_time': 'Fit Time', 'test_time': 'Test Time'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result
    
#printing all models results
all_models = pd.DataFrame.from_dict(results)
all_models.T.sort_values(by='RMSE') #models sorted by RMSE

Unnamed: 0,MAE,MSE,RMSE,Fit Time,Test Time
matrix_factorization.SVD,1.095831,1.838189,1.355601,0.052512,0.042773
knns.KNNWithZScore,1.288861,2.437361,1.561071,0.008011,0.037642
knns.KNNWithMeans,1.289591,2.444511,1.563372,0.005192,0.035532
knns.KNNBaseline,1.289648,2.445601,1.563634,0.007668,0.075572
knns.KNNBasic,1.326404,2.547307,1.595991,0.003122,0.05924


## Testing with Different Parameters 
### SVD

####  SVD - Normal

In [4]:
#algo
svd = SVD()

#cross validate with kfold set to 5
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
svd.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(svd.test(testset))
accuracy.mse(svd.test(testset))
accuracy.mae(svd.test(testset))

print('\nPredict Tests: ')
print(svd.predict(1001, 2001))
print(svd.predict(2001, 1001))
print(svd.predict(1001, 88))
print(svd.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
svd.test(testset)[:2]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3504  1.3860  1.3919  1.3797  1.3247  1.3665  0.0253  
MAE (testset)     1.0979  1.1282  1.1315  1.1171  1.0834  1.1116  0.0184  
Fit time          0.04    0.04    0.03    0.03    0.03    0.03    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    

Accuracy on the testset:
RMSE: 1.2891
MSE: 1.6618
MAE:  1.0225

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 4.35   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.45   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.45   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=4.353560016318715, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.446839209306194, details={'was_impossible': False})]

#### SVD - Using GridSearchCV 

In [5]:
params = {"n_factors": range(10, 100, 20),
         "n_epochs": [5, 10, 20],
         "lr_all": [0.002, 0.005],
         "reg_all": [0.2, 0.5]}

gsSVD = GridSearchCV(SVD, params, measures = ["RMSE", "MAE"], cv = 5, n_jobs = -1)
gsSVD.fit(data)

print(f'\nRMSE Best Parameters: {gsSVD.best_params["rmse"]}')
print(f'RMSE Best Score: {gsSVD.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsSVD.best_params["mae"]}')
print(f'MAE Best Score: {gsSVD.best_score["mae"]}')


RMSE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score: 1.4934298075625496
MAE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score: 1.2601665019592034


##### Using the Best Hyperparameters found above: 

In [6]:
finalSVD = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
predictions = finalSVD.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)


Updated Accuracy: 
RMSE: 1.3031
MSE: 1.6981
MAE:  1.0905


1.0904704727314765

### KNNBasic

##### KNNBasic - user-based cosine 

In [7]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the cosine similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.5182
MSE: 2.3048
MAE:  1.2637

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.61   {'actual_k': 13, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=1.611605676774573, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.5289737132318595, details={'actual_k': 13, 'was_impossible': False})]

#### KNNBasic - item-based cosine 

In [8]:
# #build train and train train set
# trainset = data.build_full_trainset()

# #fitting
# #using user-based cosine similarity
# params = {
#     "name": "cosine",
#     "user_based": False,  # Compute  similarities between users
# }
# algo = KNNBasic(sim_options = params)
# algo.fit(trainset)

# #test the test set using .test() 
# print('\nAccuracy on the testset:')
# accuracy.rmse(algo.test(testset))
# accuracy.mse(algo.test(testset))
# accuracy.mae(algo.test(testset))

# print('\nPredict Tests: ')
# print(algo.predict(1001, 2001))
# print(algo.predict(2001, 1001))
# print(algo.predict(1001, 88))
# print(algo.predict(1001, 5))

# print('\nPredict Using TestSet list: ')
# testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
# algo.test(testset)[:2]

#### KNNBasic - user-based msd 

In [9]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2310
MSE: 1.5153
MAE:  1.0143

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.58   {'actual_k': 13, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.581927554427891, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.9306927428442013, details={'actual_k': 13, 'was_impossible': False})]

#### KNNBasic - item-based msd 

In [10]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": False,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7233
MSE: 0.5232
MAE:  0.4594

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.73   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.732994515075002, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8066892628359166, details={'actual_k': 40, 'was_impossible': False})]

#### KNNBasic - cosine, user, shrinkage

In [11]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": True,
    "shrinkage": 0 #no shrinkage
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2310
MSE: 1.5153
MAE:  1.0143

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.58   {'actual_k': 13, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.581927554427891, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.9306927428442013, details={'actual_k': 13, 'was_impossible': False})]

#### KNNBasic - msd, user, shrinkage

In [12]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": False,
    "shrinkage": 0 #shrinkage is 0
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7233
MSE: 0.5232
MAE:  0.4594

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.73   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.732994515075002, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8066892628359166, details={'actual_k': 40, 'was_impossible': False})]

#### KNNBasic - Using GridSearchCV 

In [13]:
#doing this with name is msd or cosine causes float or zero division error
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

gsKB = GridSearchCV(KNNBasic, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKB.fit(data)

print(f'\nRMSE Best Parameters: {gsKB.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKB.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKB.best_params["mae"]}')
print(f'MAE Best Score: {gsKB.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.1075416632105777
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.7969604131264807


#### Using the Best Hyperparameters found above: 

In [14]:
finalKB = KNNBasic(method = "als", n_epochs = 5, user_based = False, min_support = 3)
predictions = finalKB.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2310
MSE: 1.5153
MAE:  1.0143


1.0143496611571836

### KNNBaseline

#### KNNBaseline - Using GridSearchCV

In [15]:
params = { 'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "reg_u": [10, 12, 15, 18],
                            "reg_i": [8, 10, 12, 15],
                            "learning_rate": [0.00005, 0.05]}}

gsKBL = GridSearchCV(KNNBaseline, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKBL.fit(data)

print(f'\nRMSE Best Parameters: {gsKBL.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKBL.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKBL.best_params["mae"]}')
print(f'MAE Best Score: {gsKBL.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 5e-05}}
RMSE Best Score: 1.109026431602136
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 5e-05}}
MAE Best Score: 0.8163136378518405


#### Using the Best Hyperparameters found above: 

In [16]:
finalKBL = KNNBaseline(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8)
predictions = finalKBL.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2027
MSE: 1.4464
MAE:  0.9871


0.9870788643440845

### KNNWithMeans 

#### KNNWithMeans - Normal 

In [17]:
#algo
algo = KNNWithMeans()

#cross validate with kfold set to 5
cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5800  1.5612  1.5394  1.5538  1.5862  1.5641  0.0171  
MAE (testset)     1.3137  1.2816  1.2699  1.2746  1.3206  1.2921  0.0209  
Fit time          0.01    0.01    0.01    0.00    0.00    0.01    0.00    
Test time         0.03    0.02    0.03    0.02    0.02    0.02    0.00    
Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2026
MSE: 1.4462
MAE:  0.9866

Predict Tests: 
user: 1001       item: 2001       r_ui = No

[Prediction(uid=1001, iid=2001, r_ui=5, est=2.5475672227773507, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.1378665113491957, details={'actual_k': 13, 'was_impossible': False})]

##### KNNWithMeans - user-based cosine 

In [18]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2026
MSE: 1.4462
MAE:  0.9866

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.55   {'actual_k': 13, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.5475672227773507, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.1378665113491957, details={'actual_k': 13, 'was_impossible': False})]

##### KNNWithMeans - item-based cosine 

In [19]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2026
MSE: 1.4462
MAE:  0.9866

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.55   {'actual_k': 13, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.5475672227773507, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.1378665113491957, details={'actual_k': 13, 'was_impossible': False})]

##### KNNWithMeans - user-based MSD 

In [20]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based msd similarity
parans = {
    "name": "msd",
    "user_based": True,  # Compute similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2026
MSE: 1.4462
MAE:  0.9866

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.55   {'actual_k': 13, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.5475672227773507, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.1378665113491957, details={'actual_k': 13, 'was_impossible': False})]

##### KNNWithMeans - item-based MSD 

In [21]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "msd",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2026
MSE: 1.4462
MAE:  0.9866

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.55   {'actual_k': 13, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.5475672227773507, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.1378665113491957, details={'actual_k': 13, 'was_impossible': False})]

#### KNNWithMeans - Using GridSearchCV

In [22]:
#doing this with name is msd or cosine causes float or zero division error
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

gsKWM = GridSearchCV(KNNWithMeans, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKWM.fit(data)

print(f'\nRMSE Best Parameters: {gsKWM.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKWM.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKWM.best_params["mae"]}')
print(f'MAE Best Score: {gsKWM.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.116860717933178
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.8366732758334721


#### Using the Best Hyperparameters found above: 

In [23]:
finalKWM = KNNWithMeans(method = "als", n_epochs = 5, user_based = False, min_support = 3)
predictions = finalKWM.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2026
MSE: 1.4462
MAE:  0.9866


0.9866269845228031

### KNNWithZScore 

#### KNNWithZScore - Using GridSearchCV

In [24]:
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "reg_u": [10, 12, 15],
                            "reg_i": [8, 10, 12],
                            "learning_rate": [0.0005, 0.05]}}

gsKZS = GridSearchCV(KNNWithZScore, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKZS.fit(data)

print(f'\nRMSE Best Parameters: {gsKZS.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKZS.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKZS.best_params["mae"]}')
print(f'MAE Best Score: {gsKZS.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
RMSE Best Score: 1.1440461127218748
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
MAE Best Score: 0.8554530139413365


#### Using the Best Hyperparameters found above: 

In [25]:
finalKZS = KNNWithZScore(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8, learning_rate = 0.0005)
predictions = finalKZS.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2044
MSE: 1.4506
MAE:  0.9866


0.9865817610999927