In [43]:
from surprise import SVD
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
mydata = pd.read_csv('subset100k.csv')
num = len(mydata['user_id'].unique())

In [44]:
# 0 means the hotel_cluster is clikced(interested) but not booked
# 1 means the hotel_cluster is booked
m = np.zeros((num,100))

In [45]:
#Build the user-item Matrix
userid = mydata['user_id'].unique()
hotelcluster = mydata['hotel_cluster'].unique()
for index, row in mydata.iterrows():
    rowNum = np.where(userid==row['user_id'])
    colNum = np.where(hotelcluster==row['hotel_cluster'])
    if m[rowNum,colNum]==0:
        m[rowNum,colNum]=row['is_booking']+1
    else:
        m[rowNum,colNum] = m[rowNum,colNum]+row['is_booking']
print (m.max())

63.0


In [46]:
#generate user_id, hotel_cluster_id, rating matrix for svd
data = []
for i in range(num):
    for j in range(100):
        if m[i, j] > 0:
            data.append([i,j,int(m[i,j])])
data = np.asarray(data)
#print (data.shape)
df = pd.DataFrame(data, columns = ('userID', 'itemID', 'rating'))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1241363 entries, 0 to 1241362
Data columns (total 3 columns):
userID    1241363 non-null int64
itemID    1241363 non-null int64
rating    1241363 non-null int64
dtypes: int64(3)
memory usage: 28.4 MB


In [47]:
#import relative functions
import timeit
from surprise import SVD
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.accuracy import rmse
from surprise import evaluate, print_perf
from surprise import GridSearch
from surprise import dump
from surprise import accuracy

In [48]:
#define function whole_svd()
def whole_svd(k):
    algo = SVD(n_factors = k,n_epochs = 10, lr_all = 0.005, reg_all = 0.05)
    upperBound = m.max()
    reader = Reader(rating_scale=(0,upperBound))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    data.split(5)
    result = []
    for trainset, testset in data.folds():
        algo.train(trainset)
        predictions_svd = algo.test(testset)

#    trainset = data.build_full_trainset() # build the train dataset from the whole dataset
#    algo.train(trainset)

        error = accuracy.rmse(predictions_svd, verbose=True) 
        result.append(error)
        
    return np.mean(result)


In [49]:
#implement the whole_svd function
#with specified k
k=15
start_time = timeit.default_timer()
error = whole_svd(k)
elapsed = timeit.default_timer() - start_time
print (error)
print("elapsed time: %f" % (elapsed))

RMSE: 0.5542
RMSE: 0.5754
RMSE: 0.5472
RMSE: 0.5534
RMSE: 0.5625
0.558545806838
elapsed time: 86.876175


In [55]:
#define function whole_svdpp()
def whole_svdpp(k):
    algo = SVDpp(n_factors = k,n_epochs = 10, lr_all = 0.005, reg_all = 0.05)
    upperBound = m.max()
    reader = Reader(rating_scale=(0,upperBound))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    data.split(5)
    result = []
    for trainset, testset in data.folds():
        algo.train(trainset)
        predictions_svdpp = algo.test(testset)

#    trainset = data.build_full_trainset() # build the train dataset from the whole dataset
#    algo.train(trainset)

        error = accuracy.rmse(predictions_svdpp, verbose=True) 
        result.append(error)
        
    return np.mean(result), predictions_svdpp

In [51]:
#implement the whole_svdpp function
#with specified k
k=14
start_time = timeit.default_timer()
error = whole_svdpp(k)
elapsed = timeit.default_timer() - start_time
print(error)
print("elapsed time: %f" % (elapsed))

RMSE: 0.5433
RMSE: 0.5665
RMSE: 0.5588
RMSE: 0.5626
RMSE: 0.5603
0.55831564217
elapsed time: 805.430916


In [52]:
#baseline function by default use reg_all = 0.02, lr_all = 0.005, n_epochs = 20, same as SVD++ and SVD
from surprise import BaselineOnly
def baseline():
    bsl_options = {'method': 'sgd',
                   'reg':0.02,
                   'learning_rate': .005,
                   'n_epochs': 10   
               }
    algo = BaselineOnly(bsl_options=bsl_options)
    upperBound = m.max()
    reader = Reader(rating_scale=(0,upperBound))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    data.split(5)
    result = []
    for trainset, testset in data.folds():
        algo.train(trainset)
        predictions_baseline = algo.test(testset)

#    trainset = data.build_full_trainset() # build the train dataset from the whole dataset
#    algo.train(trainset)

        error = accuracy.rmse(predictions_baseline, verbose=True) 
        result.append(error)
        
    return np.mean(result)


In [53]:
error_baseline = baseline()
print ('baseline error is %f'%(error_baseline))

Estimating biases using sgd...
RMSE: 0.5537
Estimating biases using sgd...
RMSE: 0.5517
Estimating biases using sgd...
RMSE: 0.5640
Estimating biases using sgd...
RMSE: 0.5679
Estimating biases using sgd...
RMSE: 0.5558
baseline error is 0.558610


In [None]:
#test coverage for the SVDpp algorithm by comparing top5 predicted rated items to original users' top5 rated items

In [54]:
df = mydata[['user_id','hotel_cluster','is_booking']]
df = df.groupby(['user_id', 'hotel_cluster'],as_index=False)['is_booking'].sum()
df = df.rename(columns={'user_id': 'userID', 'hotel_cluster': 'itemID', 'is_booking':'rating'})
df['rating'] = df['rating']+1
df.describe()

Unnamed: 0,userID,itemID,rating
count,1241363.0,1241363.0,1241363.0
mean,601087.6,48.7443,1.201218
std,347470.5,29.17846,0.5801284
min,53.0,0.0,1.0
25%,300585.0,23.0,1.0
50%,599160.0,48.0,1.0
75%,902293.0,73.0,1.0
max,1198757.0,99.0,63.0


In [56]:
k=14 #tuned best factor k
error, predictions_svdpp = whole_svdpp(k)

RMSE: 0.5486
RMSE: 0.5459
RMSE: 0.5548
RMSE: 0.5715
RMSE: 0.5698


In [57]:
df_svdpp = pd.DataFrame(predictions_svdpp, columns=['uid', 'iid', 'rui', 'est', 'details']) 
df_svdpp['err'] = abs(df_svdpp.est - df_svdpp.rui)
print (error)
df_svdpp.describe()

0.558127572938


Unnamed: 0,uid,iid,rui,est,err
count,248272.0,248272.0,248272.0,248272.0,248272.0
mean,600409.1,48.650557,1.201372,1.241348,0.34006
std,347881.5,29.121891,0.590417,0.153457,0.457236
min,53.0,0.0,1.0,0.881552,3e-06
25%,298601.8,23.0,1.0,1.14953,0.154541
50%,597697.0,48.0,1.0,1.220235,0.233937
75%,902364.0,73.0,1.0,1.299412,0.35363
max,1198757.0,99.0,63.0,4.099155,61.159061


In [58]:
#Build the rate Matrix
result = np.zeros((num,100))
userid = mydata['user_id'].unique()
hotelcluster = mydata['hotel_cluster'].unique()
for index, row in df_svdpp.iterrows():
    rowNum = np.where(userid==row['uid'])
    colNum = np.where(hotelcluster==row['iid'])
    result[rowNum,colNum]=row['est']
print (result.max())

4.09915510376


In [59]:
#Count the number of same recommendations given by our result compared to the original matrix
count = 0
for rowNum in range(result.shape[0]):
    recommend = set(np.argsort(result[rowNum,:])[-5:,])
    Mrecommend = set(np.argsort(m[rowNum,:])[-5:,])
    if (recommend.intersection(Mrecommend)) != set([]):
        count+=1
print (count)

76964
