In [54]:
#imports basics
import pandas as pd
import numpy as np
from tqdm import tqdm


#imports for training
from surprise import SVD, NMF, KNNBasic, KNNWithMeans, Dataset, Reader, accuracy
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate
from sklearn.ensemble import VotingRegressor

In [2]:
#loading the data
df =  pd.read_csv('../data/processed/preprocessed-data.csv')

for the purpose of the recommendation system we want to train different models and see which one performs best.
The following models will be used
- SVD
- User Based Collaborative Filtering
- Item Based Collaborative Filtering

In [57]:
#creating train and test split

reader = Reader(rating_scale=(1, 10))

trainset, testset = train_test_split(df[['user_id','isbn','rating']], test_size=0.25, random_state=0, shuffle=True)
trainset.index = range(len(trainset))
testset.index = range(len(testset))
trainset = Dataset.load_from_df(trainset, reader)
testset = testset.values.tolist()



In [123]:
# Define the models and their hyperparameters

algorithms = [
    {
        "description": 'SVD(n_factors=50)',
        "model": SVD(n_factors=50)
    },
    {
        "description": 'NMF(n_factors=15)',
        "model": NMF(n_factors=15)
    },
    {
        "description": 'KNNBasic(k=20, sim=cosine, item-based)',
        "model": KNNBasic(k=20, sim_options = {'name': 'cosine', 'user_based': False })
    },
    {
        "description": 'KNNBasic(k=20, sim=pearson, user-based)',
        "model": KNNBasic(k=20, sim_options = {'name': 'pearson', 'user_based': True})
    }
]


In [5]:
# Iterate over selected algorithms based on 5-fold CV on training set



benchmark = []
i = 0

for algorithm in tqdm(algorithms):
    # Perform cross validation
    results = cross_validate(algorithm['model'], trainset, measures=['RMSE','MAE'], cv=5, verbose=False)

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([algorithm['description']], index=['Algorithm']))
    benchmark.append(tmp)
    i+=1
benchmark_df = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

  tmp = tmp.append(pd.Series([algorithm['description']], index=['Algorithm']))
  tmp = tmp.append(pd.Series([algorithm['description']], index=['Algorithm']))
 50%|█████     | 2/4 [00:47<00:49, 24.90s/it]

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([algorithm['description']], index=['Algorithm']))
 75%|███████▌  | 3/4 [00:53<00:16, 16.27s/it]

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([algorithm['description']], index=['Algorithm']))
100%|██████████| 4/4 [05:50<00:00, 87.53s/it] 


Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVD(n_factors=50),1.623591,1.258437,2.815966,0.112731
"KNNBasic(k=20, sim=cosine, item-based)",1.806229,1.3418,0.571958,0.448292
"KNNBasic(k=20, sim=pearson, user-based)",1.857445,1.444097,57.79787,1.213157
NMF(n_factors=15),2.486901,2.077223,5.898164,0.135045


In [44]:
knn = algorithms[2]['model']
svd = algorithms[0]['model']

In [102]:

knn.fit(trainset.build_full_trainset())
#testset

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fea96418460>

In [65]:
#why does it always say it doesnt know ???
res = knn.test(testset)

In [None]:
only_possible = list()
for i in range(len(res)):
    if res[i][4]['was_impossible'] == True:
        continue
    only_possible.append(res[i])


RMSE: 1.6049


1.6048897579631096

In [115]:
uid = str(225232)  # raw user id (as in the ratings file). They are **strings**!
iid = "0671867156"  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=6, verbose=True)

user: 225232     item: 0671867156 r_ui = 6.00   est = 7.99   {'was_impossible': False}


In [98]:
only_possible

[Prediction(uid=160401, iid='0786867647', r_ui=5, est=7.856605245325942, details={'actual_k': 7, 'was_impossible': False}),
 Prediction(uid=136491, iid='0385337116', r_ui=9, est=9.401116926707193, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=229011, iid='0141000198', r_ui=7, est=5.857359418326955, details={'actual_k': 14, 'was_impossible': False}),
 Prediction(uid=22521, iid='0767914767', r_ui=8, est=5.5, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=260897, iid='0805063897', r_ui=8, est=6.441228674618633, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid=225232, iid='0671867156', r_ui=6, est=7.000000000000001, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=30594, iid='0345417623', r_ui=10, est=6.679707819466202, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid=62558, iid='0380789035', r_ui=8, est=7.001294019245009, details={'actual_k': 6, 'was_impossible': False}),
 Prediction(uid=178522

In [87]:
only_possible
sum = 0
for i in range(len(only_possible)):
    actual = only_possible[i][2]
    predicted = only_possible[i][3]
    if actual-predicted < 0:
        sum += (-1)*(actual-predicted)
    else:
        sum += actual-predicted

mae = sum/len(only_possible)
print(mae)


1.3012010033993138


In [25]:
trainset.df

Unnamed: 0,user_id,isbn,rating
0,207350,0515129941,5
1,116800,044023722X,8
2,95173,0446610178,1
3,188676,0316569321,8
4,72839,0345423135,6
...,...,...,...
71680,73564,0345413369,9
71681,227859,0345351525,10
71682,8067,0553574639,10
71683,272263,0812550706,8
