# `Recommender System Models`

In [None]:
import os
import random
import numpy as np
import sys
import pandas as pd
import warnings
import time
import json
from surprise import Dataset, Reader
from surprise import BaselineOnly
from surprise import NormalPredictor
from surprise import KNNBaseline, KNNWithMeans, KNNBasic, KNNWithZScore
from surprise import SVD, SVDpp
from surprise import NMF
from surprise import CoClustering
from surprise import dump
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import GridSearchCV
from scipy.sparse.linalg import svds

# Display options: never truncate columns or rows when a frame is printed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')

# Set seed
seed_value = 42
os.environ['Recommender'] = str(seed_value)
# PYTHONHASHSEED is the variable Python itself consults for hash seeding;
# set it as well so processes launched later from this kernel inherit it.
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# Set path
path = r'D:\AmazonReviews\Data'
os.chdir(path)

# Read data (the file's own first row is skipped; names are assigned below)
df = pd.read_csv('Clothing_Shoes_and_Jewelry.csv', header=None, skiprows=[0],
                 low_memory=False)
df = df.drop_duplicates()

# Name columns
# NOTE(review): in the sample rows the column labelled 'reviewerID' holds
# product-style codes and 'item' holds reviewer-style codes, which suggests
# the two names may be swapped relative to the file -- confirm the column
# order. The names are left unchanged because the filtering below is tuned
# to the current labelling.
df.columns = ['reviewerID','item','rating','timestamp']

print('Sample observations:')
df.head()

Sample observations:


Unnamed: 0,reviewerID,item,rating,timestamp
0,871167042,A3OT9BYASFGU2X,4.0,1398470400
1,871167042,A28GK1G2KDXHRP,5.0,1397692800
2,871167042,A3NFXFEKW8OK0E,5.0,1397606400
3,871167042,A3I6G5TKBVJEK9,5.0,1397520000
4,871167042,A1A7Y1M8AJWNZ8,5.0,1396224000


In [None]:
# Define a function to examine the data
def data_summary(df):
    '''Print the shape of a frame and a per-column overview.
    Args:
      df: the DataFrame to describe
    Returns:
      None; the row/column counts and a table of missing-value counts,
      dtypes and unique-value counts are printed.
    '''
    n_rows, n_cols = df.shape
    print('Number of Rows: {}, Columns: {}'.format(n_rows, n_cols))
    # One row per column of df, one column per statistic
    overview = pd.DataFrame({
        'Number of Missing Values': df.isnull().sum(),
        'Data type of variable': df.dtypes,
        'Number of Unique Values': df.nunique(),
    })
    print(overview)

print('Initial Data Summary:')
# data_summary prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data_summary(df)

Initial Data Summary:
Number of Rows: 31697963, Columns: 4
            Number of Missing Values Data type of variable  \
reviewerID                         0                object   
item                               0                object   
rating                             0               float64   
timestamp                          0                 int64   

            Number of Unique Values  
reviewerID                  2681297  
item                       12483678  
rating                            5  
timestamp                      5393  
None


In [None]:
# The timestamp column is removed; only ids and ratings are kept
df = df.drop(columns=['timestamp'])

# Number of unique reviewer id and product id in the data
n_unique_reviewers = df['reviewerID'].nunique()
n_unique_items = df['item'].nunique()
print('Number of unique reviewers in initial set:', n_unique_reviewers)
print('Number of unique items in initial set:', n_unique_items)

Number of unique reviewers in initial set: 2681297
Number of unique items in initial set: 12483678


In [None]:
# Examine reviewers: the ten reviewerID values with the most rows
reviewers_top10 = (
    df.groupby('reviewerID')
      .size()
      .sort_values(ascending=False)
      .head(10)
)
print('Reviewers with highest number of ratings in initial set:')
print(reviewers_top10)

Reviewers with highest number of ratings in initial set:
reviewerID
B00M4NF9H0    13067
B00F3EDEOC    11268
B00GAPACC0    11241
B000HTICY0    11119
B00SU5244M    10431
B00SU52460    10431
B00X8AMIDG    10321
B00WRGPHY4    10321
B00WBHSMOQ    10266
B00VJOT0CK    10235
dtype: int64


In [None]:
# Examine items: the ten item values with the most rows
items_top10 = (
    df.groupby('item')
      .size()
      .sort_values(ascending=False)
      .head(10)
)
print('Items with highest number of ratings in initial set:')
print(items_top10)

Items with highest number of ratings in initial set:
item
ALFRMOGTO1K4M     654
A2OWR2PL3DLWS4    494
A2OS7CFQI4DWPT    486
AVU1ILDDYW301     460
A3W5A9X3HHACQT    402
A2RYWPOL4NN2KG    400
A3W4D8XOGLWUN5    395
A1RRMZKOMZ2M7J    392
A3LV42NBFM829N    389
A2QDOJFFLFGF18    387
dtype: int64


In [None]:
# Create new integer id for item due to sparsity.
# Ids follow descending rating count: the most frequently rated item gets 0,
# the next 1, and so on (ties keep the order value_counts returns).
item_counts = df['item'].value_counts(dropna=True, sort=True)
item_id_lookup = pd.Series(range(len(item_counts)), index=item_counts.index)

# Look each row's item up directly instead of building a lookup frame and
# left-joining it back onto the full ratings table.
df = df.assign(item_id=df['item'].map(item_id_lookup))

# Rows that differed only by timestamp are identical once that column is
# gone, so de-duplicate again.
df = df.drop_duplicates()

del item_counts, item_id_lookup

In [None]:
# Create new integer id for reviewerID due to sparsity.
# Ids follow descending rating count: the reviewerID with the most rows gets
# 0, the next 1, and so on (ties keep the order value_counts returns).
reviewer_counts = df['reviewerID'].value_counts(dropna=True, sort=True)
reviewer_id_lookup = pd.Series(range(len(reviewer_counts)),
                               index=reviewer_counts.index)

# Look each row's reviewerID up directly instead of left-joining a lookup
# frame. reviewer_id is a function of reviewerID, so adding it cannot create
# duplicate rows and no further de-duplication is needed here.
df = df.assign(reviewer_id=df['reviewerID'].map(reviewer_id_lookup))

del reviewer_counts, reviewer_id_lookup

# Create key for merging new integer id variables for later join
df[['item', 'item_id', 'reviewerID', 'reviewer_id']].to_csv(
    'Clothing_Shoes_and_Jewelry_idMatch.csv', index=False)

# Drop unnecessary keys: only the integer ids and the rating are kept
df = df.drop(['item', 'reviewerID'], axis=1)

In [None]:
# Filter to greater than or equal to 1500 due to sparsity
min_ratings = 1500
reviewer_count = df.reviewer_id.value_counts()
df = df[df.reviewer_id.isin(reviewer_count[reviewer_count >= min_ratings].index)]
df = df.drop_duplicates()

del reviewer_count

# len(df) counts rating rows, not reviewers, so the label says "ratings"
print('Number of ratings from reviewers with {} or more ratings: '.format(min_ratings),
      len(df))
print('Number of unique reviewers: ', df['reviewer_id'].nunique())
print('Number of unique items: ', df['item_id'].nunique())

# Count ratings by rating value (1-5)
for i in range(1, 6):
    print('Number of ratings with value {0} = {1}'.format(
        i, df[df['rating'] == i].shape[0]))

Number of reviewers with 1500 or more ratings:  3113643
Number of unique reviewers:  1138
Number of unique items:  1757869
Number of reviewers who rated 1 rating = 171570
Number of reviewers who rated 2 rating = 149382
Number of reviewers who rated 3 rating = 252204
Number of reviewers who rated 4 rating = 552539
Number of reviewers who rated 5 rating = 1987948


In [None]:
# Examine reviewers: the ten reviewer_id values with the most rows after filtering
reviewers_top10 = (
    df.groupby('reviewer_id')
      .size()
      .sort_values(ascending=False)
      .head(10)
)
print('Reviewers with highest number of ratings in filtered set:')
print(reviewers_top10)

del reviewers_top10

Reviewers with highest number of ratings in filtered set:
reviewer_id
0    12934
1    11232
2    11204
3    11082
4    10345
5    10345
6    10235
7    10234
8    10181
9    10151
dtype: int64


In [None]:
# Examine items: the ten item_id values with the most rows after filtering
items_top10 = (
    df.groupby('item_id')
      .size()
      .sort_values(ascending=False)
      .head(10)
)
print('Items with highest number of ratings filtered set:')
print(items_top10)

del items_top10

Items with highest number of ratings filtered set:
item_id
914     84
1654    82
1770    81
1723    72
2750    67
2564    66
2463    64
149     63
2334    63
1115    62
dtype: int64


# Modeling

In [None]:
# Create Recommendation Systems using Surprise
# Set path for results: the working directory is changed, so the relative
# file names used when saving results below resolve inside this folder
path = r'D:\AmazonReviews\Models'
os.chdir(path)

- Load data using reader
- Iterate over all algorithms: `BaselineOnly()`, `KNNBaseline()`, `KNNBasic()`, `KNNWithMeans()`, `KNNWithZScore()`, `CoClustering()`, `SVD()`, `SVDpp()`, `NMF()`, `NormalPredictor()`
- Cross validation
- Model results
- Create df with results  

In [None]:
# Load data using reader (ratings are on a 1-5 scale)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['reviewer_id', 'item_id', 'rating']], reader)

# Iterate over all algorithms
print('Time for iterating through different algorithms..')
search_time_start = time.time()
benchmark = []
for algorithm in [BaselineOnly(), KNNBaseline(), KNNBasic(), KNNWithMeans(),
                  KNNWithZScore(), CoClustering(), SVD(), SVDpp(), NMF(),
                  NormalPredictor()]:
    # Cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3,
                             verbose=False, n_jobs=-1)

    # Model results: average every reported measure over the folds and tag
    # the row with the algorithm's class name. A plain dict is used because
    # Series.append is no longer available in pandas 2.x.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)
print('Finished iterating through different algorithms:',
      time.time() - search_time_start)

# Create df with results, best (lowest) mean test RMSE first
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

print('Results from testing different algorithms:')
print(surprise_results)

Time for iterating through different algorithms..
Evaluating RMSE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0217  1.0219  1.0199  1.0212  0.0009  
Fit time          3.25    3.29    3.73    3.43    0.22    
Test time         5.93    6.04    5.57    5.85    0.20    
Evaluating RMSE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9013  0.8998  0.9004  0.9005  0.0006  
Fit time          4.22    4.47    3.64    4.11    0.35    
Test time         13.11   12.34   12.28   12.58   0.38    
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9297  0.9289  0.9298  0.9295  0.0004  
Fit time          0.30    0.30    0.31    0.30    0.01    
Test time         10.31   10.50   10.31   10.37   0.09    
Evaluating RMSE of algorithm KNNWithMeans on 3 split(s).

               

In [None]:
# Save df: write the benchmark table to CSV in the current working directory
surprise_results.to_csv('results_algorithms.csv')

# The table is not used again after being saved
del surprise_results

In [None]:
# Partition data for train/test sets (80/20). The split is seeded so every
# model below is fitted and scored on the same partition across re-runs.
train, test = train_test_split(data, test_size=0.2, random_state=seed_value)

## SVDpp with lowest RMSE

In [None]:
print('Train/predict using SVDpp default parameters for 3 epochs:')
print('\n')

# 3-fold cross validation of SVDpp limited to 3 epochs, timed end to end
print('Time for iterating through SVDpp default parameters..')
search_time_start = time.time()
algo = SVDpp(n_epochs=3, random_state=seed_value)
cv = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=False,
                    n_jobs=-1)
print('Finished iterating through SVDpp default parameters:',
      time.time() - search_time_start)
print('\n')

print('Cross validation results:')
# Iterate over key/value pairs in cv results dict
for key, value in cv.items():
    print(key, ' : ', value)
print('\n')

# Fit this SVDpp configuration on the train set and predict the test set
predictions = algo.fit(train).test(test)
print('RMSE from fit best parameters on train predict on test:')
# accuracy.rmse prints the score itself unless verbose is off; silence it so
# the value is reported once.
print(accuracy.rmse(predictions, verbose=False))

# Save predictions and algorithm
dump.dump('./SVDpp_3epochs_DefaultParamModel_file', predictions, algo)
# predictions, algo = dump.load('./SVDpp_3epochs_DefaultParamModel_file')

Train/predict using SVDpp default parameters for 3 epochs:


Time for iterating through SVDpp default parameters..
Finished iterating through SVDpp default parameters: 10790.404549121857


Cross validation results:
test_rmse  :  [1.02346779 1.02116729 1.02138424]
test_mae  :  [0.74395792 0.74263317 0.74233523]
fit_time  :  (9776.340163707733, 9688.727854728699, 9698.9595682621)
test_time  :  (986.8202083110809, 991.8307194709778, 990.1156959533691)


RMSE from fit best parameters on train predict on test:
RMSE: 1.0069
1.0068639867569864




In [None]:
# Examine results from predictions
def get_Ir(reviewerID):
    '''Count the ratings a reviewer has in the train set.
    Args:
      reviewerID: raw id of the reviewer
    Returns:
      Number of items the reviewer rated in train, or 0 when the reviewer
      is not known to the train set
    '''
    try:
        inner_uid = train.to_inner_uid(reviewerID)
        rated = train.ur[inner_uid]
    except ValueError:  # Raised when the raw id is not part of the train set
        return 0
    return len(rated)
    
def get_Ri(itemID):
    '''Count the ratings an item has in the train set.
    Args:
      itemID: raw id of the item
    Returns:
      Number of reviewers that rated the item in train, or 0 when the item
      is not known to the train set
    '''
    try:
        inner_iid = train.to_inner_iid(itemID)
        raters = train.ir[inner_iid]
    except ValueError:  # Raised when the raw id is not part of the train set
        return 0
    return len(raters)

In [None]:
# Make df of prediction results: one row per test-set prediction
df1 = pd.DataFrame(predictions,
                   columns=['reviewerID', 'itemID', 'rui', 'est', 'details'])

# Apply functions: train-set rating counts for the reviewer (Iu) and the
# item (Ui), plus the absolute error of the estimate
df1 = df1.assign(
    Iu=df1['reviewerID'].apply(get_Ir),
    Ui=df1['itemID'].apply(get_Ri),
    err=(df1['est'] - df1['rui']).abs(),
)

# Save prediction results
df1.to_csv('predictions_SVDpp_DefaultParamModel.csv')

In [None]:
# Find best predictions: the ten rows with the smallest absolute error
best_predictions = df1.sort_values(by='err').head(10)
print('Best 10 predictions:')
print(best_predictions)

Best 10 predictions:
        reviewerID  itemID  rui  est                    details    Iu  Ui  err
292684          57  132683  5.0  5.0  {'was_impossible': False}  5228   7  0.0
412902          26  133174  5.0  5.0  {'was_impossible': False}  6694  13  0.0
503655          20   49464  5.0  5.0  {'was_impossible': False}  7711  18  0.0
583961           4   46139  5.0  5.0  {'was_impossible': False}  8266  14  0.0
412894          13   44945  5.0  5.0  {'was_impossible': False}  8029  23  0.0
134615          11  134037  5.0  5.0  {'was_impossible': False}  8008  13  0.0
412861           9   82916  5.0  5.0  {'was_impossible': False}  8103  11  0.0
503667          47   53259  5.0  5.0  {'was_impossible': False}  5464  17  0.0
534582          54   58833  5.0  5.0  {'was_impossible': False}  5175  18  0.0
503671           9   60836  5.0  5.0  {'was_impossible': False}  8103  19  0.0


In [None]:
# Find worst predictions: the ten rows with the largest absolute error
worst_predictions = df1.sort_values(by='err').tail(10)
print('Worst 10 predictions:')
print(worst_predictions)

# Release the per-model objects before the next model is evaluated
del predictions, df1, best_predictions, worst_predictions

Worst 10 predictions:
        reviewerID    itemID  rui       est                    details    Iu  \
489076          47     19923  1.0  4.809839  {'was_impossible': False}  5464   
2731            53  12238590  1.0  4.819778  {'was_impossible': False}  5217   
287045          53   4228354  1.0  4.819778  {'was_impossible': False}  5217   
182869          53  12238679  1.0  4.819778  {'was_impossible': False}  5217   
568727         318   5585473  1.0  4.846666  {'was_impossible': False}  2282   
144436         318   5560304  1.0  4.846666  {'was_impossible': False}  2282   
206325         318   1803591  1.0  4.846666  {'was_impossible': False}  2282   
367809         318   5577052  1.0  4.846666  {'was_impossible': False}  2282   
88049           79    742450  1.0  4.846758  {'was_impossible': False}  4522   
610915         815     51958  1.0  4.856628  {'was_impossible': False}  1410   

        Ui       err  
489076   1  3.809839  
2731     0  3.819778  
287045   0  3.819778  
18286

## SVDpp HPO using Grid Search

In [None]:
# Define parameters for grid search: 3 x 3 x 3 combinations at a fixed
# epoch count and seed
param_grid = dict(
    n_epochs=[10],
    n_factors=[30, 40, 50],
    lr_all=[7e-4, 7e-3, 7e-2],
    reg_all=[2e-4, 2e-3, 2e-2],
    random_state=[seed_value],
)

# Show the grid as the cell's output
print('Grid search parameters:')
param_grid

Grid search parameters:


{'n_epochs': [10],
 'n_factors': [30, 40, 50],
 'lr_all': [0.0007, 0.007, 0.07],
 'reg_all': [0.0002, 0.002, 0.02],
 'random_state': [42]}

In [None]:
# Run grid search: 3-fold CV over param_grid, scored on RMSE and MAE
gs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=6,
)
print('Time for iterating grid search parameters..')
search_time_start = time.time()
gs.fit(data)
print('Finished iterating grid search parameters:',
      time.time() - search_time_start)
print('\n')

# Best score and the parameter combination that produced it
print('Lowest RMSE from Grid Search:')
print(gs.best_score['rmse'])
print('\n')
print('Parameters of Model with lowest RMSE from Grid Search:')
print(gs.best_params['rmse'])

Time for iterating grid search parameters..
Finished iterating grid search parameters: 949831.3826599121


Lowest RMSE from Grid Search:
0.8762233407560768


Parameters of Model with lowest RMSE from Grid Search:
{'n_epochs': 10, 'n_factors': 50, 'lr_all': 0.07, 'reg_all': 0.02, 'random_state': 42}


In [None]:
# Save results to df: one row per parameter combination, lowest mean RMSE first
results_df = (
    pd.DataFrame.from_dict(gs.cv_results)
      .sort_values('mean_test_rmse', ascending=True)
)

print('SVDpp GridSearch HPO Cross Validation Results:')
print(results_df.head())
print('\n')
results_df.to_csv('SVDpp_gridSearch_cvResults.csv', index=False)

del results_df

SVDpp GridSearch HPO Cross Validation Results:
    split0_test_rmse  split1_test_rmse  split2_test_rmse  mean_test_rmse  \
26          0.877091          0.876967          0.874612        0.876223   
17          0.877104          0.877309          0.874793        0.876402   
8           0.877473          0.877841          0.875282        0.876866   
25          0.880114          0.879673          0.877702        0.879163   
16          0.879928          0.879870          0.877736        0.879178   

    std_test_rmse  rank_test_rmse  split0_test_mae  split1_test_mae  \
26       0.001140               1         0.545362         0.545235   
17       0.001141               2         0.544410         0.544866   
8        0.001130               3         0.544078         0.544394   
25       0.001049               4         0.529889         0.529740   
16       0.001020               5         0.529289         0.529523   

    split2_test_mae  mean_test_mae  std_test_mae  rank_test_mae  \
26

In [None]:
# Fit model with the lowest rmse
algo = gs.best_estimator['rmse']

# Fit the best SVDpp model from the grid search on train and predict on test
predictions = algo.fit(train).test(test)
print('RMSE from fit best parameters on train predict on test:')
# accuracy.rmse prints the score itself unless verbose is off; silence it so
# the value is reported once.
print(accuracy.rmse(predictions, verbose=False))

# Save predictions and algorithm
dump.dump('./SVDpp_bestGrid_Model_file', predictions, algo)
#predictions, algo = dump.load('./SVDpp_bestGrid_Model_file')

# Make df of prediction results: one row per test-set prediction
df1 = pd.DataFrame(predictions, columns=['reviewerID', 'itemID', 'rui', 'est',
                                         'details'])

# Apply functions: train-set rating counts for the reviewer (Iu) and the
# item (Ui), plus the absolute error of the estimate
df1['Iu'] = df1.reviewerID.apply(get_Ir)
df1['Ui'] = df1.itemID.apply(get_Ri)
df1['err'] = abs(df1.est - df1.rui)

# Save prediction results
df1.to_csv('predictions_SVDpp_gridSearch.csv')

RMSE from fit best parameters on train predict on test:
RMSE: 0.8576
0.8576483755216049


In [None]:
# Find best predictions: the ten rows with the smallest absolute error
best_predictions = df1.sort_values(by='err').head(10)

print('Best 10 predictions:')
print(best_predictions)

Best 10 predictions:
        reviewerID   itemID  rui  est                    details    Iu  Ui  \
347899           9    33304  5.0  5.0  {'was_impossible': False}  8099  18   
422779        1089  5166669  5.0  5.0  {'was_impossible': False}  1245   1   
422809         130   466573  5.0  5.0  {'was_impossible': False}  3227   1   
422810          69  4697707  5.0  5.0  {'was_impossible': False}  4721   1   
198168         193    16186  5.0  5.0  {'was_impossible': False}  2756  33   
422829         178    35614  5.0  5.0  {'was_impossible': False}  2778  25   
198151         826   276748  5.0  5.0  {'was_impossible': False}  1448   3   
422844         315    30133  5.0  5.0  {'was_impossible': False}  2291  22   
198131         181     8448  5.0  5.0  {'was_impossible': False}  2823  36   
422855         178     5879  5.0  5.0  {'was_impossible': False}  2778  29   

        err  
347899  0.0  
422779  0.0  
422809  0.0  
422810  0.0  
198168  0.0  
422829  0.0  
198151  0.0  
422844  

In [None]:
# Find worst predictions: the ten rows with the largest absolute error
worst_predictions = df1.sort_values(by='err').tail(10)

print('Worst 10 predictions:')
print(worst_predictions)

# Release the per-model objects before the next model is evaluated
del predictions, df1, best_predictions, worst_predictions

Worst 10 predictions:
        reviewerID   itemID  rui  est                    details    Iu  Ui  \
252471         346   176955  1.0  5.0  {'was_impossible': False}  2210   4   
81224          404   343482  1.0  5.0  {'was_impossible': False}  2077   1   
210979         780  1006842  1.0  5.0  {'was_impossible': False}  1487   1   
426144         300   268750  1.0  5.0  {'was_impossible': False}  2292   1   
540253         779  5326776  1.0  5.0  {'was_impossible': False}  1491   1   
250479         609   647550  1.0  5.0  {'was_impossible': False}  1644   3   
554305        1122    27112  1.0  5.0  {'was_impossible': False}  1214   1   
391391         429   149590  1.0  5.0  {'was_impossible': False}  2015   2   
10982          946  4517207  1.0  5.0  {'was_impossible': False}  1313   1   
463644         401   844653  1.0  5.0  {'was_impossible': False}  2049   1   

        err  
252471  4.0  
81224   4.0  
210979  4.0  
426144  4.0  
540253  4.0  
250479  4.0  
554305  4.0  
391391 

## KNNBaseline with second lowest RMSE, using Alternating Least Squares

In [None]:
print('Train/predict using KNNBaseline default parameters with Alternating Least Squares for 3 epochs:')
print('\n')

# Baseline estimates are computed with ALS, limited to 3 epochs
bsl_options = dict(method='als', n_epochs=3)

# Show the baseline configuration
print('Baselines estimates configuration:')
print(bsl_options)
print('\n')

# Textual description of the model; this is a fixed string, not read back
# from an estimator
print('Model parameters:')
print('KNNBaseline(k=40, min_k=1, bsl_options=bsl_options, verbose=False)')

Train/predict using KNNBaseline default parameters with Alternating Least Squares for 3 epochs:


Baselines estimates configuration:
{'method': 'als', 'n_epochs': 3}


Model parameters:
KNNBaseline(k=40, min_k=1, bsl_options=bsl_options, verbose=False)


In [None]:
# Cross validation: 3 folds, RMSE and MAE, timed end to end
print('Time for iterating through KNNBaseline default parameters epochs=3 using ALS..')
search_time_start = time.time()
algo = KNNBaseline(bsl_options=bsl_options)
cv = cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=False,
    n_jobs=-1,
)
print('Finished iterating through KNNBaseline default parameters epochs=3 using ALS:',
      time.time() - search_time_start)
print('\n')

print('Cross validation results:')
# One line per entry of the cross-validation result dict
for measure, per_fold in cv.items():
    print(measure, ' : ', per_fold)

Time for iterating through KNNBaseline default parameters epochs=3 using ALS..
Finished iterating through KNNBaseline default parameters epochs=3 using ALS: 37.01580023765564


Cross validation results:
test_rmse  :  [0.89967443 0.89968674 0.89972243]
test_mae  :  [0.53176603 0.53147945 0.53162604]
fit_time  :  (1.4181976318359375, 1.3012800216674805, 1.3005609512329102)
test_time  :  (10.874441862106323, 10.71552562713623, 10.45514440536499)


In [None]:
# Fit the KNNBaseline configured above on train and predict on test
predictions = algo.fit(train).test(test)
print('RMSE from fit best parameters on train predict on test:')
# accuracy.rmse prints the score itself unless verbose is off; silence it so
# the value is reported once.
print(accuracy.rmse(predictions, verbose=False))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE from fit best parameters on train predict on test:
RMSE: 0.8855
0.8855366849115467


In [None]:
# Save predictions and algorithm
dump.dump('./KNNBaseline_3epochs_DefaultParamModel_file', predictions, algo)
# predictions, algo = dump.load('./KNNBaseline_3epochs_DefaultParamModel_file')

# Make df of prediction results: one row per test-set prediction
df1 = pd.DataFrame(predictions,
                   columns=['reviewerID', 'itemID', 'rui', 'est', 'details'])

# Apply functions: train-set rating counts for the reviewer (Iu) and the
# item (Ui), plus the absolute error of the estimate
df1 = df1.assign(
    Iu=df1['reviewerID'].apply(get_Ir),
    Ui=df1['itemID'].apply(get_Ri),
    err=(df1['est'] - df1['rui']).abs(),
)

# Save prediction results
df1.to_csv('predictions_KNNBaseline_DefaultParamModel.csv')

In [None]:
# Find best predictions: the ten rows with the smallest absolute error
best_predictions = df1.sort_values(by='err').head(10)

print('Best 10 predictions:')
print(best_predictions)

Best 10 predictions:
        reviewerID   itemID  rui  est  \
0              191   868548  1.0  1.0   
310933         171  2317128  5.0  5.0   
310951          17   694716  5.0  5.0   
310952         189   103420  5.0  5.0   
310972         671  4249082  5.0  5.0   
310973         276  1700820  5.0  5.0   
310976         192  1262170  5.0  5.0   
310978         559  1331040  5.0  5.0   
310983        1036  1350025  1.0  1.0   
310986          63   371765  5.0  5.0   

                                          details    Iu  Ui  err  
0        {'actual_k': 1, 'was_impossible': False}  2808   1  0.0  
310933   {'actual_k': 1, 'was_impossible': False}  2804   1  0.0  
310951   {'actual_k': 3, 'was_impossible': False}  7915   3  0.0  
310952  {'actual_k': 14, 'was_impossible': False}  2797  14  0.0  
310972   {'actual_k': 1, 'was_impossible': False}  1515   1  0.0  
310973   {'actual_k': 1, 'was_impossible': False}  2409   1  0.0  
310976   {'actual_k': 1, 'was_impossible': False}  2792   

In [None]:
# Find worst predictions: the ten rows with the largest absolute error
worst_predictions = df1.sort_values(by='err').tail(10)

print('Worst 10 predictions:')
print(worst_predictions)

# Release the per-model objects before the next model is evaluated
del predictions, df1, best_predictions, worst_predictions

Worst 10 predictions:
        reviewerID   itemID  rui  est  \
354495         242     5845  1.0  5.0   
177973         127  1727714  5.0  1.0   
256990         576   524049  5.0  1.0   
490072         638    31093  1.0  5.0   
52555          108  1692340  1.0  5.0   
264925         566  1473905  5.0  1.0   
559845         936  1287067  1.0  5.0   
328255         344   423858  5.0  1.0   
182097         421    48088  1.0  5.0   
205794          43   885292  1.0  5.0   

                                         details    Iu  Ui  err  
354495  {'actual_k': 6, 'was_impossible': False}  2540   6  4.0  
177973  {'actual_k': 1, 'was_impossible': False}  3346   1  4.0  
256990  {'actual_k': 2, 'was_impossible': False}  1714   3  4.0  
490072  {'actual_k': 1, 'was_impossible': False}  1531   2  4.0  
52555   {'actual_k': 1, 'was_impossible': False}  3602   1  4.0  
264925  {'actual_k': 1, 'was_impossible': False}  1728   3  4.0  
559845  {'actual_k': 1, 'was_impossible': False}  1362   1  4.0 

## KNNBaseline HPO using Grid Search

In [None]:
# HPO using grid search
print('KNNBaseline HPO using Grid Search Minimized:')
print('\n')

# Define parameters for grid search; bsl_options and sim_options are nested
# grids of their own
# NOTE(review): confirm KNNBaseline actually consumes random_state
param_grid = dict(
    bsl_options=dict(method=['als', 'sgd']),
    k=[40, 45, 50],
    min_k=[30, 35],
    random_state=[seed_value],
    sim_options=dict(
        name=['pearson_baseline'],
        min_support=[5, 10],
        shrinkage=[0, 100],
    ),
)

# Show the grid as the cell's output
print('Grid search parameters:')
param_grid

KNNBaseline HPO using Grid Search Minimized:


Grid search parameters:


{'bsl_options': {'method': ['als', 'sgd']},
 'k': [40, 45, 50],
 'min_k': [30, 35],
 'random_state': [42],
 'sim_options': {'name': ['pearson_baseline'],
  'min_support': [5, 10],
  'shrinkage': [0, 100]}}

In [None]:
# Run grid search: 3-fold CV over param_grid, scored on RMSE and MAE
gs = GridSearchCV(
    KNNBaseline,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    joblib_verbose=-1,
    n_jobs=-1,
)
print('Start time for iterating grid search parameters..')
search_time_start = time.time()
gs.fit(data)
print('Finished iterating grid search parameters:',
      time.time() - search_time_start)
print('\n')

# Best score found by the search
print('Model with lowest RMSE:')
print(gs.best_score['rmse'])
print('\n')
# Parameter combination that produced the best score
print('Parameters with the lowest RMSE:')
print(gs.best_params['rmse'])

Start time for iterating grid search parameters..


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  3.0min


Finished iterating grid search parameters: 607.8373460769653


Model with lowest RMSE:
1.011069858119585


Parameters with the lowest RMSE:
{'bsl_options': {'method': 'sgd'}, 'k': 50, 'min_k': 30, 'random_state': 42, 'sim_options': {'name': 'pearson_baseline', 'min_support': 10, 'shrinkage': 0, 'user_based': True}}


[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  9.9min finished


In [None]:
# No model is fitted or scored in this cell: at this point `algo` still
# refers to the KNNBaseline built before the grid search, so scoring it
# would report the earlier model's RMSE rather than the tuned one. The tuned
# estimator is taken from gs.best_estimator and evaluated in the next cell.

# Save results to df: one row per parameter combination, lowest mean RMSE first
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df = results_df.sort_values('mean_test_rmse', ascending=True)
print('KNNBaseline GridSearch HPO Cross Validation Results:')
print(results_df.head())
results_df.to_csv('KNNBaseline_gridSearch_cvResults_minimized.csv', index=False)

del results_df

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE from fit best parameters on train predict on test:
RMSE: 0.8855
0.8855366849115467


KNNBaseline GridSearch HPO Cross Validation Results:
    split0_test_rmse  split1_test_rmse  split2_test_rmse  mean_test_rmse  \
42          1.011580          1.012132          1.009497        1.011070   
40          1.011580          1.012132          1.009497        1.011070   
41          1.011581          1.012132          1.009498        1.011070   
43          1.011581          1.012133          1.009498        1.011070   
34          1.011583          1.012133          1.009500        1.011072   

    std_test_rmse  rank_test_rmse  split0_test_mae  split1_test_mae  \
42       0.001135               1         0.747531         0.747711   
40       0.001135               2         0.747532         0.747711   
41       0.001135               3         0.747532         0.747711   
43       0.0

In [None]:
# Fit model with the lowest rmse
algo = gs.best_estimator['rmse']

# Fit the best KNNBaseline model from the grid search on train and predict on test
predictions = algo.fit(train).test(test)
print('RMSE from fit best parameters on train predict on test:')
# accuracy.rmse prints the score itself unless verbose is off; silence it so
# the value is reported once.
print(accuracy.rmse(predictions, verbose=False))
print('\n')

# Save predictions and algorithm
dump.dump('./KNNBaseline_bestGrid_Model_file', predictions, algo)
# predictions, algo = dump.load('./KNNBaseline_bestGrid_Model_file')

# Make df of prediction results: one row per test-set prediction
df1 = pd.DataFrame(predictions, columns=['reviewerID', 'itemID', 'rui', 'est',
                                         'details'])

# Apply functions: train-set rating counts for the reviewer (Iu) and the
# item (Ui), plus the absolute error of the estimate
df1['Iu'] = df1.reviewerID.apply(get_Ir)
df1['Ui'] = df1.itemID.apply(get_Ri)
df1['err'] = abs(df1.est - df1.rui)

# Save prediction results
df1.to_csv('predictions_KNNBaseline_gridSearch_minimized.csv')

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE from fit best parameters on train predict on test:
RMSE: 0.9999
0.9998522259616496




In [None]:
# Find best predictions: the ten rows with the smallest absolute error
best_predictions = df1.sort_values(by='err').head(10)

print('Best 10 predictions:')
print(best_predictions)

Best 10 predictions:
        reviewerID  itemID  rui  est  \
594691         182   17152  5.0  5.0   
568813         695    3031  1.0  1.0   
567495         182   16843  5.0  5.0   
171839         187   14333  5.0  5.0   
587673         121    9306  5.0  5.0   
314436         182    8531  5.0  5.0   
431599         699   20342  5.0  5.0   
322575         690    4686  5.0  5.0   
222042         311    8067  5.0  5.0   
588936         690    3584  5.0  5.0   

                                          details    Iu  Ui  err  
594691  {'actual_k': 32, 'was_impossible': False}  2800  32  0.0  
568813  {'actual_k': 33, 'was_impossible': False}  1537  37  0.0  
567495  {'actual_k': 32, 'was_impossible': False}  2800  32  0.0  
171839  {'actual_k': 33, 'was_impossible': False}  2746  35  0.0  
587673   {'actual_k': 1, 'was_impossible': False}  3461  16  0.0  
314436  {'actual_k': 32, 'was_impossible': False}  2800  32  0.0  
431599   {'actual_k': 1, 'was_impossible': False}  1551  22  0.0  
32

In [None]:
# Find worst predictions: the ten rows with the largest absolute error
worst_predictions = df1.sort_values(by='err').tail(10)

print('Worst 10 predictions:')
print(worst_predictions)

# Release the per-model objects before the next model is evaluated
del predictions, df1, best_predictions, worst_predictions

Worst 10 predictions:
        reviewerID  itemID  rui       est  \
335092         228   18242  1.0  4.893678   
144926          65   10940  1.0  4.893753   
386679         273   30896  1.0  4.895199   
470195         247    1636  1.0  4.906812   
508612         278  355983  1.0  4.938696   
360091         815   51958  1.0  4.959069   
401014        1084   23461  1.0  4.973029   
65206          172  290086  1.0  5.000000   
100280         992   18024  1.0  5.000000   
41341          322   28375  1.0  5.000000   

                                         details    Iu  Ui       err  
335092  {'actual_k': 0, 'was_impossible': False}  2615   5  3.893678  
144926  {'actual_k': 0, 'was_impossible': False}  4919  23  3.893753  
386679  {'actual_k': 0, 'was_impossible': False}  2363  11  3.895199  
470195  {'actual_k': 1, 'was_impossible': False}  2536  14  3.906812  
508612  {'actual_k': 1, 'was_impossible': False}  2389   8  3.938696  
360091  {'actual_k': 0, 'was_impossible': False}  1427  

## SVD with third lowest RMSE

In [None]:
print('Train/predict using SVD default parameters for 3 epochs:')
print('\n')

# 3-fold cross validation of SVD restricted to 3 epochs, timed end to end
print('Time for iterating through SVD default parameters..')
search_time_start = time.time()
algo = SVD(n_epochs=3, random_state=seed_value)
cv = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=False,
                    n_jobs=-1)
print('Finished iterating through SVD default parameters:',
      time.time() - search_time_start)
print('\n')

print('Cross validation results:')
# Iterate over key/value pairs in cv results dict
for key, value in cv.items():
    print(key, ' : ', value)
print('\n')

# Fit the default-parameter SVD on the train set and predict the test set
# (no hyperparameter search has been run for this model)
predictions = algo.fit(train).test(test)
print('RMSE from fit default parameters on train predict on test:')
# accuracy.rmse prints its own rounded 'RMSE: ...' line unless verbose=False;
# silencing it reports the score once, at full precision
print(accuracy.rmse(predictions, verbose=False))

# Save predictions and algorithm
dump.dump('./SVD_3epochs_DefaultParamModel_file', predictions, algo)
# predictions, algo = dump.load('./SVD_3epochs_DefaultParamModel_file')

# Make df of prediction results
df1 = pd.DataFrame(predictions, columns=['reviewerID', 'itemID', 'rui', 'est',
                                         'details'])

# Apply the helper functions defined earlier in the notebook to each reviewer /
# item id, and compute the absolute error of every prediction
df1['Iu'] = df1.reviewerID.apply(get_Ir)
df1['Ui'] = df1.itemID.apply(get_Ri)
df1['err'] = abs(df1.est - df1.rui)

# Save prediction results
df1.to_csv('predictions_SVD_DefaultParamModel.csv')

Train/predict using SVD default parameters for 3 epochs:


Time for iterating through SVD default parameters..
Finished iterating through SVD default parameters: 60.38068866729736


Cross validation results:
test_rmse  :  [1.07224262 1.07074391 1.0702068 ]
test_mae  :  [0.81337178 0.811829   0.81112873]
fit_time  :  (19.490893840789795, 18.908312559127808, 19.21304702758789)
test_time  :  (6.697888374328613, 6.923274278640747, 6.702215909957886)


RMSE from fit best parameters on train predict on test:
RMSE: 1.0521
1.0520844922805106


In [None]:
# Ten predictions with the smallest absolute error ('err' sorted ascending)
best_predictions = df1.sort_values('err').head(10)

print('Best 10 predictions:', best_predictions, sep='\n')

Best 10 predictions:
        reviewerID  itemID  rui  est                    details    Iu  Ui  err
182456           6   92811  5.0  5.0  {'was_impossible': False}  8174  17  0.0
356382          10   55751  5.0  5.0  {'was_impossible': False}  8084  19  0.0
196237          61   62184  5.0  5.0  {'was_impossible': False}  5037  20  0.0
505090           4   29447  5.0  5.0  {'was_impossible': False}  8266  14  0.0
356542           4  174687  5.0  5.0  {'was_impossible': False}  8266  13  0.0
70773            8   65607  5.0  5.0  {'was_impossible': False}  8143  14  0.0
10464           94   49731  5.0  5.0  {'was_impossible': False}  3950  21  0.0
70730            6   47724  5.0  5.0  {'was_impossible': False}  8174  24  0.0
356830         184   14825  5.0  5.0  {'was_impossible': False}  2766  35  0.0
356839          39   67864  5.0  5.0  {'was_impossible': False}  5786  18  0.0


In [None]:
# Ten predictions with the largest absolute error: after an ascending sort on
# 'err' they are the last ten rows
worst_predictions = df1.sort_values('err').tail(10)

print('Worst 10 predictions:', worst_predictions, sep='\n')

# Release the prediction objects now that they have been inspected
del predictions, df1, best_predictions, worst_predictions

Worst 10 predictions:
        reviewerID    itemID  rui       est                    details    Iu  \
574212        1054     77926  1.0  4.808597  {'was_impossible': False}  1279   
2731            53  12238590  1.0  4.822271  {'was_impossible': False}  5217   
182869          53  12238679  1.0  4.822271  {'was_impossible': False}  5217   
287045          53   4228354  1.0  4.822271  {'was_impossible': False}  5217   
610915         815     51958  1.0  4.836683  {'was_impossible': False}  1410   
144436         318   5560304  1.0  4.842536  {'was_impossible': False}  2282   
568727         318   5585473  1.0  4.842536  {'was_impossible': False}  2282   
206325         318   1803591  1.0  4.842536  {'was_impossible': False}  2282   
367809         318   5577052  1.0  4.842536  {'was_impossible': False}  2282   
177358         224   3441477  1.0  4.847012  {'was_impossible': False}  2600   

        Ui       err  
574212   1  3.808597  
2731     0  3.822271  
182869   0  3.822271  
28704

## SVD HPO using Grid Search

In [None]:
# Candidate values tried for each SVD hyperparameter during the grid search;
# random_state holds a single value so every fit uses the notebook seed
param_grid = dict(
    n_epochs=[30, 35, 40, 45, 50, 55, 60, 65, 70],
    n_factors=[20, 25, 30, 35, 40, 45, 50],
    lr_all=[0.002, 0.003, 0.004, 0.005, 0.006, 0.007],
    reg_all=[0.0001, 0.001, 0.01, 0.02, 0.03, 0.04],
    random_state=[seed_value],
)

# Print the heading, then leave the dict as the cell's last expression so the
# notebook displays it
print('SVD HPO Grid search parameters:')
param_grid

SVD HPO Grid search parameters:


{'n_epochs': [30, 35, 40, 45, 50, 55, 60, 65, 70],
 'n_factors': [20, 25, 30, 35, 40, 45, 50],
 'lr_all': [0.002, 0.003, 0.004, 0.005, 0.006, 0.007],
 'reg_all': [0.0001, 0.001, 0.01, 0.02, 0.03, 0.04],
 'random_state': [42]}

In [None]:
# 3-fold grid search of SVD over param_grid, scoring RMSE and MAE
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    joblib_verbose=-1,
    n_jobs=-1,
)
print('Time for iterating grid search parameters..')
search_time_start = time.time()
gs.fit(data)
print('Finished iterating grid search parameters:',
      time.time() - search_time_start)
print('\n')

# Best RMSE found by the search
print('Lowest RMSE from Grid Search:', gs.best_score['rmse'], sep='\n')
print('\n')

# Parameter combination that achieved it
print('Parameters of Model with lowest RMSE from Grid Search:',
      gs.best_params['rmse'], sep='\n')

Time for iterating grid search parameters..


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed: 44.0min
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed: 117.4min
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed: 218.4min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 364.5min
[Parallel(n_jobs=-1)]: Done 2560 tasks      | elapsed: 561.4min
[Parallel(n_jobs=-1)]: Done 3496 tasks      | elapsed: 825.0min
[Parallel(n_jobs=-1)]: Done 4576 tasks      | elapsed: 1175.2min
[Parallel(n_jobs=-1)]: Done 5800 tasks      | elapsed: 1606.4min


Finished iterating grid search parameters: 120666.43906760216


Lowest RMSE from Grid Search:
0.8751145898635144


Parameters of Model with lowest RMSE from Grid Search:
{'n_epochs': 70, 'n_factors': 50, 'lr_all': 0.007, 'reg_all': 0.02, 'random_state': 42}


[Parallel(n_jobs=-1)]: Done 6804 out of 6804 | elapsed: 2010.8min finished


In [None]:
# Tabulate every grid-search trial, lowest mean test RMSE first
results_df = pd.DataFrame(gs.cv_results).sort_values('mean_test_rmse')
print('SVD GridSearch HPO Cross Validation Results:')
print(results_df.head())
print('\n')

# Persist the full table, then drop the frame from the namespace
results_df.to_csv('SVD_gridSearch_cvResults.csv', index=False)

del results_df

SVD GridSearch HPO Cross Validation Results:
      split0_test_rmse  split1_test_rmse  split2_test_rmse  mean_test_rmse  \
2265          0.875581          0.875944          0.873819        0.875115   
2264          0.875565          0.876027          0.873893        0.875162   
2229          0.875793          0.876274          0.873673        0.875247   
2228          0.875805          0.876348          0.873693        0.875282   
2193          0.876021          0.876184          0.873846        0.875350   

      std_test_rmse  rank_test_rmse  split0_test_mae  split1_test_mae  \
2265       0.000928               1         0.549068         0.549599   
2264       0.000917               2         0.540922         0.541435   
2229       0.001130               3         0.549002         0.549727   
2228       0.001145               4         0.540729         0.541484   
2193       0.001066               5         0.549033         0.549468   

      split2_test_mae  mean_test_mae  std_test_

In [None]:
# Estimator configured with the parameters that gave the lowest RMSE in the grid search
algo = gs.best_estimator['rmse']

# Fit on the train set and predict the test set with the best SVD model
predictions = algo.fit(train).test(test)
print('RMSE from fit best parameters on train predict on test:')
# accuracy.rmse prints its own rounded 'RMSE: ...' line unless verbose=False;
# silencing it reports the score once, at full precision
print(accuracy.rmse(predictions, verbose=False))
print('\n')

# Save predictions and algorithm
dump.dump('./SVD_bestGrid_Model_file', predictions, algo)
# predictions, algo = dump.load('./SVD_bestGrid_Model_file')

# Make df of prediction results
df1 = pd.DataFrame(predictions, columns=['reviewerID', 'itemID', 'rui', 'est',
                                         'details'])

# Apply the helper functions defined earlier in the notebook to each reviewer /
# item id, and compute the absolute error of every prediction
df1['Iu'] = df1.reviewerID.apply(get_Ir)
df1['Ui'] = df1.itemID.apply(get_Ri)
df1['err'] = abs(df1.est - df1.rui)

# Save prediction results
df1.to_csv('predictions_SVD_gridSearch.csv')

RMSE from fit best parameters on train predict on test:
RMSE: 0.8509
0.8509006630864451




In [None]:
# Ten predictions with the smallest absolute error ('err' sorted ascending)
best_predictions = df1.sort_values('err').head(10)

print('Best 10 predictions:', best_predictions, sep='\n')

Best 10 predictions:
        reviewerID   itemID  rui  est                    details    Iu  Ui  \
531122         135  2690287  5.0  5.0  {'was_impossible': False}  3216   1   
405632         455   574393  5.0  5.0  {'was_impossible': False}  1954   2   
282541          27    53228  1.0  1.0  {'was_impossible': False}  6638  22   
187281          31    47707  5.0  5.0  {'was_impossible': False}  6482  22   
405650         178    37615  5.0  5.0  {'was_impossible': False}  2784  25   
235759         455  3117342  5.0  5.0  {'was_impossible': False}  1954   1   
59156           47    47468  5.0  5.0  {'was_impossible': False}  5464  19   
228886          31    64068  5.0  5.0  {'was_impossible': False}  6482  21   
603482         217    68855  5.0  5.0  {'was_impossible': False}  2643   5   
212835          31    64499  5.0  5.0  {'was_impossible': False}  6482  20   

        err  
531122  0.0  
405632  0.0  
282541  0.0  
187281  0.0  
405650  0.0  
235759  0.0  
59156   0.0  
228886  

In [None]:
# Ten predictions with the largest absolute error: after an ascending sort on
# 'err' they are the last ten rows
worst_predictions = df1.sort_values('err').tail(10)

print('Worst 10 predictions:', worst_predictions, sep='\n')

# Release the dataset, the train/test objects and the prediction objects
del data, train, test, predictions, df1, best_predictions, worst_predictions

Worst 10 predictions:
        reviewerID   itemID  rui  est                    details    Iu  Ui  \
197725         890   129653  1.0  5.0  {'was_impossible': False}  1375   5   
532082         225     4678  1.0  5.0  {'was_impossible': False}  2594  24   
139164         802    76288  1.0  5.0  {'was_impossible': False}  1468   5   
511453          25   690900  1.0  5.0  {'was_impossible': False}  6848   3   
365581         273  1425906  1.0  5.0  {'was_impossible': False}  2382   1   
501522         599  1183676  1.0  5.0  {'was_impossible': False}  1665   1   
315136         458   190428  1.0  5.0  {'was_impossible': False}  1983   1   
621093         493    97874  1.0  5.0  {'was_impossible': False}  1827   5   
78050          628   766692  1.0  5.0  {'was_impossible': False}  1583   1   
255399         322   414659  1.0  5.0  {'was_impossible': False}  2277   1   

        err  
197725  4.0  
532082  4.0  
139164  4.0  
511453  4.0  
365581  4.0  
501522  4.0  
315136  4.0  
621093 

## RecSys Methods without Surprise
### Create the rating matrix with items and reviewers

In [None]:
# Reviewer x item matrix of ratings; reviewer/item pairs without a rating become 0
ratingsMat = pd.pivot_table(df, index=['reviewer_id'], columns = 'item_id',
                            values = 'rating').fillna(0)

print('Ratings matrix information:')
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray 'None' line to the output
ratingsMat.info()
print('\n')
print('Dimensions of rating matrix:', ratingsMat.shape)
print('\n')

# Cells holding a rating (non-zero) versus all cells of the matrix
rating_nonZero = np.count_nonzero(ratingsMat)
print('Number of non zero ratings:', rating_nonZero)

rating_possible = ratingsMat.shape[0] * ratingsMat.shape[1]
print('Number of possible ratings:', rating_possible)
print('\n')

# Share of filled cells, as a percentage
density = (rating_nonZero/rating_possible) *100
print ('Density of rating matrix: {:4.3f}%'.format(density))

Ratings matrix information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1138 entries, 0 to 1137
Columns: 1757869 entries, 0 to 12479886
dtypes: float64(1757869)
memory usage: 14.9 GB
None


Dimensions of rating matrix: (1138, 1757869)


Number of non zero ratings: 3109309
Number of possible ratings: 2000454922


Density of rating matrix: 0.155%


In [None]:
# NOTE(review): this import rebinds the name train_test_split, which the first
# cell imported from surprise.model_selection; cells run after this one get the
# sklearn function
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of df as the test set, seeded for repeatability
train, test = train_test_split(df, test_size=0.2, random_state=seed_value)

print('Dimensions of train set:', train.shape)
print('Dimensions of test set:', test.shape)
print('\n')

# Number of rows per rating value in each split
print('Count of different ratings in train set:',
      train['rating'].value_counts().to_string(), sep='\n')
print('\n')
print('Count of different ratings in test set:',
      test['rating'].value_counts().to_string(), sep='\n')

Dimensions of train set:  (2490914, 3)
Dimensions of test set:  (622729, 3)


Count of different ratings in train set:
5.0    1590666
4.0     441962
3.0     201629
1.0     137217
2.0     119440


Count of different ratings in test set:
5.0    397282
4.0    110577
3.0     50575
1.0     34353
2.0     29942


## Create SVD Based Recommendation System using SciPy

In [None]:
# Label the rows 0..n-1 as 'reviewer_index'. Assigning the index directly avoids
# inserting (and then removing) an extra column in a very wide frame.
ratingsMat.index = pd.RangeIndex(ratingsMat.shape[0], name='reviewer_index')

# Truncated SVD with k = 6 singular values/vectors. svds is handed the
# underlying ndarray: its documented inputs are ndarrays, sparse matrices and
# LinearOperators, not DataFrames.
U, sigma, Vt = svds(ratingsMat.values, k = 6)

# Construct a diagonal matrix in SVD
sigma = np.diag(sigma)

# Predicted rating: rank-6 reconstruction U * sigma * Vt of the rating matrix
reviewers_predRating = np.dot(np.dot(U, sigma), Vt)

# Same item columns as ratingsMat; rows are positional (0..n-1)
ratingPred = pd.DataFrame(reviewers_predRating, columns = ratingsMat.columns)

In [None]:
# Recommend the items with the highest predicted rating
def recommend_items(reviewerID, ratingsMat, ratingPred, num_recommendations):
    """Print the unrated items with the highest predicted rating for one reviewer.

    Parameters
    ----------
    reviewerID : int
        1-based position of the reviewer; row ``reviewerID - 1`` of both frames
        is used.
    ratingsMat : pandas.DataFrame
        Observed ratings, one row per reviewer and one column per item. A value
        of 0 is treated as "not rated".
    ratingPred : pandas.DataFrame
        Predicted ratings, read at the same row position and aligned to
        ``ratingsMat`` by item label.
    num_recommendations : int
        Number of rows printed.

    Raises
    ------
    IndexError
        If ``reviewerID`` is outside ``1..len(ratingsMat)``. Without this check
        a value below 1 would be turned into a negative position and silently
        select a reviewer counted from the end of the matrix.
    """
    if not 1 <= reviewerID <= len(ratingsMat):
        raise IndexError(
            'reviewerID must be between 1 and {}, got {}'.format(len(ratingsMat),
                                                                 reviewerID))
    reviewer_idx = reviewerID - 1

    # Get and sort the reviewer's rating
    sorted_reviewer_rating = ratingsMat.iloc[reviewer_idx].sort_values(ascending=False)
    sorted_reviewer_predictions = ratingPred.iloc[reviewer_idx].sort_values(ascending=False)

    # Concatenate rating with predicted rating (aligned on the item label)
    tmp = pd.concat([sorted_reviewer_rating, sorted_reviewer_predictions],
                     axis=1)
    tmp.index.name = 'Recommended Items'
    tmp.columns = ['reviewer_rating', 'reviewer_predictions']

    # Keep only items the reviewer has not rated, best prediction first
    tmp = tmp.loc[tmp.reviewer_rating == 0]
    tmp = tmp.sort_values('reviewer_predictions', ascending=False)

    print('\nBelow are the recommended items for reviewer(reviewer_id = {}):\n'.format(reviewerID))
    print(tmp.head(num_recommendations))

In [None]:
# Print the top 10 recommendations for reviewers 1, 2 and 3
num_recommendations = 10
for reviewerID in (1, 2, 3):
    recommend_items(reviewerID, ratingsMat, ratingPred, num_recommendations)


Below are the recommended items for reviewer(reviewer_id = 1):

                   reviewer_rating  reviewer_predictions
Recommended Items                                       
240543                         0.0              5.056791
8072                           0.0              5.054071
170198                         0.0              5.053674
170199                         0.0              5.053278
8036                           0.0              5.053161
24462                          0.0              5.052726
674966                         0.0              5.052351
499498                         0.0              5.052351
675194                         0.0              5.052351
280915                         0.0              5.052351

Below are the recommended items for reviewer(reviewer_id = 2):

                   reviewer_rating  reviewer_predictions
Recommended Items                                       
89917                          0.0              3.440140
217073         

In [None]:
# Evaluate the SciPy SVD Collaborative recommender model
# Per-item averages of the actual and the predicted ratings, kept for inspection
rmse_df = pd.concat([ratingsMat.mean(), ratingPred.mean()], axis=1)
rmse_df.columns = ['Avg_actual_rating', 'Avg_predicted_rating']
rmse_df['item_index'] = np.arange(0, rmse_df.shape[0], 1)

# RMSE over the ratings that actually exist (non-zero cells of ratingsMat).
# Comparing the per-item column averages instead, with the zeros of unrated
# pairs included, produces a near-zero figure that does not measure per-rating
# error and cannot be compared with the RMSE reported for the Surprise models.
# NOTE: ratingPred was reconstructed from this same matrix, so this is still an
# in-sample error.
observed = ratingsMat.values
rated_rows, rated_cols = np.nonzero(observed)
residuals = ratingPred.values[rated_rows, rated_cols] - observed[rated_rows, rated_cols]

RMSE = round(float(np.sqrt(np.mean(residuals ** 2))), 10)
print('\nRMSE of SciPy SVD Model = {} \n'.format(RMSE))

RMSE of SciPy SVD Model = 0.0069640134


## Popularity Model

In [None]:
# Recommendation score of an item = number of reviewer_id entries it has in the
# train set; the count is stored under the column name 'rating'
train_grouped = (
    train.groupby('item_id')['reviewer_id']
         .count()
         .rename('rating')
         .reset_index()
)

# Highest score first; equal scores are ordered by ascending item_id
train_sort = train_grouped.sort_values(by=['rating', 'item_id'],
                                       ascending=[False, True])

# Rank 1 = highest score; method='first' breaks ties by row order
train_sort['rank'] = train_sort['rating'].rank(method='first', ascending=False)

In [None]:
# The five highest-ranked items form the recommendation list
popularity_recommendations = train_sort.head(5)
print('\nTop 5 recommendations ', popularity_recommendations, sep='\n')


Top 5 recommendations 
      item_id  rating  rank
806       914      70   1.0
1511     1770      63   2.0
1412     1654      61   3.0
1467     1723      58   4.0
2148     2564      54   5.0




In [None]:
# Use popularity based recommender model to make predictions
def recommend(reviewer_id, recommendations=None):
    """Return the popularity recommendations labelled with a reviewer id.

    Parameters
    ----------
    reviewer_id : int
        Reviewer the recommendations are generated for. The same list is
        returned for every reviewer; only the label differs.
    recommendations : pandas.DataFrame, optional
        Frame of recommended items. Defaults to the notebook-level
        ``popularity_recommendations``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``reviewer_id`` as the first column, followed by the
        columns of ``recommendations``. The input frame is left untouched.
    """
    if recommendations is None:
        recommendations = popularity_recommendations

    # drop() returns a new frame, so the shared recommendation list (a slice of
    # train_sort) is never written to. Removing any reviewer_id column already
    # present keeps insert() from failing on a duplicate name.
    reviewer_recommendations = recommendations.drop(columns='reviewer_id',
                                                    errors='ignore')

    # Put reviewer_id first regardless of the input's column order
    reviewer_recommendations.insert(0, 'reviewer_id', reviewer_id)

    return reviewer_recommendations

# Print the popularity recommendations for a few example reviewer ids
find_recom = [1, 100, 200]
for i in find_recom:
    heading = 'The list of recommendations for the reviewer_id: %d\n' % i
    print(heading)
    print(recommend(i))

The list of recommendations for the reviewer_id: 1

      reviewer_id  item_id  rating  rank
806             1      914      70   1.0
1511            1     1770      63   2.0
1412            1     1654      61   3.0
1467            1     1723      58   4.0
2148            1     2564      54   5.0
The list of recommendations for the reviewer_id: 100

      reviewer_id  item_id  rating  rank
806           100      914      70   1.0
1511          100     1770      63   2.0
1412          100     1654      61   3.0
1467          100     1723      58   4.0
2148          100     2564      54   5.0
The list of recommendations for the reviewer_id: 200

      reviewer_id  item_id  rating  rank
806           200      914      70   1.0
1511          200     1770      63   2.0
1412          200     1654      61   3.0
1467          200     1723      58   4.0
2148          200     2564      54   5.0
