In [None]:
'''
    ***** RECOMMENDATION SYSTEM USING SURPRISE LIBRARY for User/Movie Rating *****

A. This notebook has implementation of recommendation system for Movie Rating database using SURPRISE library with following 
    algorithms:
    a. SVD (Singular value decomposition)
    b. SVD with GridSearchCV
    c. KNNBaseline default
    d. KNNBaseline item-item similarity
    e. KNNBaseline user-user similarity
    
B. This notebook has different implementation of loading/referring data as supported by Surprise Library such as:
    a. Dataset.load_builtin
    b. Dataset.load_from_df
    
C. Also, this notebook has different implementation of splitting data into training and testing dataset such as:
    a. model_selection.train_test_split
    b. build_full_trainset
    c. build_anti_testset (from build_full_trainset)
    
'''

In [1]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
import pandas as pd
import pickle
from surprise import KNNBaseline
from surprise.model_selection import LeaveOneOut
from collections import defaultdict
import itertools

In [11]:
# Part-1 - Load the 'ml-100k' data and split into trainign and testing data

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25)

In [12]:
# Part-2 - Train the model using SVD algo with use of 'GridSearchCV' configuration and evaluate model accuracy 

param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

algo = gs.best_estimator['rmse']
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

0.9189217722158433
{'n_factors': 140, 'n_epochs': 90, 'lr_all': 0.005, 'reg_all': 0.1}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9073  0.9134  0.9043  0.9119  0.9115  0.9097  0.0034  
MAE (testset)     0.7181  0.7231  0.7137  0.7213  0.7184  0.7189  0.0032  
Fit time          34.84   34.70   34.69   34.65   34.71   34.72   0.07    
Test time         0.19    0.19    0.19    0.19    0.19    0.19    0.00    


{'test_rmse': array([0.90734656, 0.91341269, 0.90432237, 0.91191643, 0.91146375]),
 'test_mae': array([0.71811855, 0.72308998, 0.71368492, 0.72127343, 0.71843863]),
 'fit_time': (34.843223571777344,
  34.695209980010986,
  34.68920612335205,
  34.645188331604004,
  34.70618510246277),
 'test_time': (0.18900394439697266,
  0.1880033016204834,
  0.18800067901611328,
  0.18500685691833496,
  0.18800115585327148)}

In [3]:
'''
Part - 3 - Train the model using SVD algorithm and test the model in different combination like 
a. test with the build model
b. test with built model pickle file
c. test set of records via dataframe with model

'''

algo = SVD(n_factors=160, n_epochs=2, lr_all=0.005, reg_all=0.1)
algo.fit(trainset)

with open('movie_recomm_svd_pkl.pkl', 'wb') as fid:
    pickle.dump(algo, fid)

test_pred = algo.test(testset)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)

print("***** SVD Model Prediction Result *****")
accuracy.rmse(test_pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(df.head())


with open('movie_recomm_svd_pkl.pkl', 'rb') as fid:
    sv = pickle.load(fid)

test_pred = sv.test(testset)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)

print("\n***** SVD Model Prediction Result via model file*****")
accuracy.rmse(test_pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(df.head())


with open('movie_recomm_svd_pkl.pkl', 'rb') as fid:
    sv = pickle.load(fid)


ratings_dict = {'movieId': [735, 642],
                'userID': [916, 848],
                'rating': [4, 5]}

df = pd.DataFrame(ratings_dict)

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userID', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.95)

pred = sv.test(testset)
odf = pd.DataFrame(pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
odf['err'] = abs(odf.est - odf.rui)

print("\n***** SVD Model Prediction Result via model file for Two record*****")
accuracy.rmse(pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(odf.head())

***** SVD Model Prediction Result *****
RMSE: 0.9848
MAE:  0.7896
   uid  iid  rui       est                    details       err
0  560  845  3.0  3.361925  {'was_impossible': False}  0.361925
1  592  174  5.0  4.386536  {'was_impossible': False}  0.613464
2   56  575  3.0  3.189188  {'was_impossible': False}  0.189188
3  446  690  2.0  3.287849  {'was_impossible': False}  1.287849
4  506  274  4.0  3.684122  {'was_impossible': False}  0.315878

***** SVD Model Prediction Result via model file*****
RMSE: 0.9848
MAE:  0.7896
   uid  iid  rui       est                    details       err
0  560  845  3.0  3.361925  {'was_impossible': False}  0.361925
1  592  174  5.0  4.386536  {'was_impossible': False}  0.613464
2   56  575  3.0  3.189188  {'was_impossible': False}  0.189188
3  446  690  2.0  3.287849  {'was_impossible': False}  1.287849
4  506  274  4.0  3.684122  {'was_impossible': False}  0.315878

***** SVD Model Prediction Result via model file for Two record*****
RMSE: 1.0890
MA

In [8]:
# Part-4 - Train the model using KNNBaseline item-item similarity

sim_options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(trainset)

test_pred = simsAlgo.test(testset)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)
print("****************KNNBaseline item-item similarity: Accuracy Score *****************")
accuracy.rmse(test_pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(df.head().to_string())


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
****************KNNBaseline item-item similarity: Accuracy Score *****************
RMSE: 0.9162
MAE:  0.7156
   uid   iid  rui       est                                    details       err
0  181  1340  1.0  1.777545                  {'was_impossible': False}  0.777545
1  336   153  5.0  3.270047  {'actual_k': 40, 'was_impossible': False}  1.729953
2   62    83  5.0  3.462178  {'actual_k': 40, 'was_impossible': False}  1.537822
3  739   327  5.0  3.279885  {'actual_k': 13, 'was_impossible': False}  1.720115
4  425   318  2.0  3.810771  {'actual_k': 40, 'was_impossible': False}  1.810771


In [9]:
# Part-5 - Train the model using KNNBaseline User-User similarity and get the Top-10 movies predictions for each user 

sim_options = {'name': 'pearson_baseline', 'user_based': True}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(trainset)

test_pred = simsAlgo.test(testset)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)

print("******* KNNBaseline User-User similarity: Accuracy Score*****************")
accuracy.rmse(test_pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(df.head().to_string())


def GetTopN(predictions, n=10, minimumRating=4.0):
    topN = defaultdict(list)
    for uid, iid, rui, est, _ in predictions:
        if (est >= minimumRating):
            topN[int(uid)].append((int(iid), est))

    for uid, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(uid)] = ratings[:n]

    return topN

topN = GetTopN(test_pred, n=10)
print('****** Top 10 Predictions for first 10 users ****')
dict(itertools.islice(topN.items(), 10))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
******* KNNBaseline User-User similarity: Accuracy Score*****************
RMSE: 0.9238
MAE:  0.7225
   uid   iid  rui       est                                    details       err
0  181  1340  1.0  1.777545                  {'was_impossible': False}  0.777545
1  336   153  5.0  3.458159  {'actual_k': 40, 'was_impossible': False}  1.541841
2   62    83  5.0  3.745602  {'actual_k': 40, 'was_impossible': False}  1.254398
3  739   327  5.0  3.495446  {'actual_k': 40, 'was_impossible': False}  1.504554
4  425   318  2.0  3.925493  {'actual_k': 40, 'was_impossible': False}  1.925493
****** Top 10 Predictions for first 10 users ****


{18: [(169, 4.69006224904971),
  (856, 4.607218694198554),
  (127, 4.571555471494918),
  (100, 4.4720104882680545),
  (134, 4.458416852266545),
  (659, 4.441713779090256),
  (23, 4.434993507021739),
  (197, 4.396284515092783),
  (48, 4.3749865570894695),
  (496, 4.34764945384162)],
 625: [(408, 4.231678598378699),
  (172, 4.171059303951712),
  (173, 4.104274648270247),
  (498, 4.039330324317177),
  (134, 4.004436085221441)],
 119: [(718, 4.809910263158032),
  (64, 4.776799919448484),
  (50, 4.722321581760029),
  (83, 4.5276930357122485),
  (684, 4.498663413617713),
  (144, 4.494899965447647),
  (23, 4.450937272594064),
  (194, 4.448273040041289),
  (172, 4.431699954302403),
  (87, 4.375356540723753)],
 462: [(272, 5),
  (136, 4.971706748169559),
  (181, 4.757288147148477),
  (22, 4.702595476835327),
  (346, 4.121539935396404),
  (328, 4.077950074461162),
  (271, 4.026578737380373)],
 90: [(474, 5),
  (1125, 5),
  (654, 4.979176831343555),
  (127, 4.9745097419814455),
  (185, 4.87357636

In [10]:
'''
Part- 6 - Train the model using KNNBaseline item-item similarity. 

Trainign and testing Data is generated using  'build_full_trainset' and 'build_anti_testset' api respectively.

'''


fullTrainSet = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(fullTrainSet)

test_pred = simsAlgo.test(testset)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)

print("**************** KNNBaseline item-item similarity: Accuracy Score with Test Set *****************")
accuracy.rmse(test_pred, verbose=True)
print(df.head().to_string())


bigTestSet = fullTrainSet.build_anti_testset()
test_pred = simsAlgo.test(bigTestSet)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)
print("\n**************** KNNBaseline item-item similarity: Accuracy Score with Anti-Testset *****************")
accuracy.rmse(test_pred, verbose=True)
df.head()

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
**************** KNNBaseline item-item similarity: Accuracy Score with Test Set *****************
RMSE: 0.4796
   uid   iid  rui       est                                    details       err
0  181  1340  1.0  1.000000   {'actual_k': 1, 'was_impossible': False}  0.000000
1  336   153  5.0  4.147630  {'actual_k': 40, 'was_impossible': False}  0.852370
2   62    83  5.0  4.409072  {'actual_k': 40, 'was_impossible': False}  0.590928
3  739   327  5.0  4.472095  {'actual_k': 18, 'was_impossible': False}  0.527905
4  425   318  2.0  3.107934  {'actual_k': 40, 'was_impossible': False}  1.107934

**************** KNNBaseline item-item similarity: Accuracy Score with Anti-Testset *****************
RMSE: 0.6404


Unnamed: 0,uid,iid,rui,est,details,err
0,196,302,3.52986,3.702845,"{'actual_k': 21, 'was_impossible': False}",0.172985
1,196,377,3.52986,2.69864,"{'actual_k': 11, 'was_impossible': False}",0.83122
2,196,51,3.52986,2.793005,"{'actual_k': 16, 'was_impossible': False}",0.736855
3,196,346,3.52986,3.298763,"{'actual_k': 20, 'was_impossible': False}",0.231097
4,196,474,3.52986,4.492609,"{'actual_k': 23, 'was_impossible': False}",0.962749
