In [1]:
import recommender as rs
from surprise import KNNBasic, SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, PredefinedKFold
import pandas as pd

In [2]:
train_df = pd.read_csv("data/Book_reviews/Book_reviews/BX-Book-Ratings-train.csv", delimiter=";", header=0)
test_df = pd.read_csv("data/Book_reviews/Book_reviews/BX-Book-Ratings-test.csv", delimiter=";", header=0)

In [3]:
folds_files = [("data/data/u1.base", "data/data/u1.test")]
data = Dataset.load_from_folds(folds_files, reader=Reader(rating_scale=(1, 6), sep='\t'))

# algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
algo = SVD()
pkf = PredefinedKFold()
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    surprise_user_based = accuracy.rmse(predictions, verbose=True)

RMSE: 0.9508


In [4]:
folds_files = [("data/data/u1.base", "data/data/u1.test")]
data = Dataset.load_from_folds(folds_files, reader=Reader(rating_scale=(1, 6), sep='\t'))

algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
pkf = PredefinedKFold()
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    surprise_item_based = accuracy.rmse(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0491


In [5]:
input_ = rs.InputReader("data/data/u1.base", "data/data/u1.test")
input_.parse("train", "\t", False, False)
input_.parse("test", "\t", False, False)

cf = rs.CF(input_)
recommendation = cf.recommend("user", "1", "user-based", k=40,
                              simi_th=0, n=10, keep_nonzero_topk=True)
user_based_rmse              = cf.user_based_rmse(-1, 40, 0, False) #算全部 
user_based_rmse_keep_nonzero = cf.user_based_rmse(-1, 40, 0, True)  #不要算全部
item_based_rmse              = cf.item_based_rmse(-1, 40, 0, False) #算全部
item_based_rmse_keep_nonzero = cf.item_based_rmse(-1, 40, 0, True)  #不要算全部

read as a vector of triplets (train) ... 0.016558 sec
Init SP matrix (alloc & set) ... 0.000719 sec

# readers (rows m) : 943 , # books (cols n) : 1650
# non zero element: 80000
density = 0.0514155

read as a vector of triplets (test) ... 0.003379 sec
length of test data : 20000
calculate weighted sum of top k item vectors ... 0.143784 sec
# non-zero score recommendation items : 1515
top n recommendation items for the given user : result_n=1515
272 : 4.26498
100 : 4.03409
275 : 3.94558
258 : 3.88044
10 : 3.81758
242 : 3.81579
14 : 3.777
273 : 3.60222
274 : 3.52159
255 : 3.26698
test time ... 21.2393 sec
avg ratio : 0.025
rmse : 1.06021
baseline rmse : 1.15368
test time ... 4.67114 sec
avg ratio : 0.0016
rmse : 1.02243
baseline rmse : 1.15368
test time ... 27.6183 sec
avg ratio : 0.02335
rmse : 1.01612
baseline rmse : 1.15368
test time ... 3.77429 sec
avg ratio : 0.0016
rmse : 1.01082
baseline rmse : 1.15368


In [6]:
pd.DataFrame([[user_based_rmse, user_based_rmse_keep_nonzero, surprise_user_based],
              [item_based_rmse, item_based_rmse_keep_nonzero, surprise_item_based]],
             index=['user-based', 'item-based'], columns=['all_sim', 'only-nonzero_sim', 'surprise']).round(4)

Unnamed: 0,all_sim,only-nonzero_sim,surprise
user-based,1.0602,1.0224,0.9508
item-based,1.0161,1.0108,1.0491


In [7]:
input_ = rs.InputReader("data/data/u1.base", "data/data/u1.test")
input_.parse("train", "\t", False, False)
input_.parse("test", "\t", False, False)

cf = rs.CF(input_)
recommendation = cf.recommend("user", "1", "user-based", -1, 0, 10, False)

read as a vector of triplets (train) ... 0.018295 sec
Init SP matrix (alloc & set) ... 0.000826 sec

# readers (rows m) : 943 , # books (cols n) : 1650
# non zero element: 80000
density = 0.0514155

read as a vector of triplets (test) ... 0.003935 sec
length of test data : 20000
calculate weighted sum of top k item vectors ... 0.008137 sec
# non-zero score recommendation items : 1515
top n recommendation items for the given user : result_n=1515
272 : 4.24212
100 : 4.11221
275 : 3.96882
242 : 3.94684
14 : 3.87806
258 : 3.85336
10 : 3.85014
273 : 3.50525
274 : 3.47559
255 : 3.29722


In [8]:
recommendation = cf.recommend("item", "1", "item-based", -1, 0, 10, False)

calculate weighted sum of top k user vectors ... 0.007318 sec
# non-zero score recommendation items : 560
top n recommendation items for the given user : result_n=560
12 : 4.33329
9 : 4.31992
10 : 4.24414
14 : 4.15256
4 : 4.09586
7 : 4.04158
8 : 3.9044
11 : 3.53397
5 : 3.23545
3 : 2.96947


In [9]:
folds_files = [("data/Book_reviews/Book_reviews/BX-Book-Ratings-train.csv",
                "data/Book_reviews/Book_reviews/BX-Book-Ratings-test.csv")]
data = Dataset.load_from_folds(folds_files, reader=Reader(rating_scale=(0, 11), sep=';', skip_lines=1))

# algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
algo = SVD()
pkf = PredefinedKFold()
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    surprise_user_based = accuracy.rmse(predictions, verbose=True)

RMSE: 3.5068


In [10]:
input_ = rs.InputReader("data/Book_reviews/Book_reviews/BX-Book-Ratings-train.csv",
                        "data/Book_reviews/Book_reviews/BX-Book-Ratings-test.csv")
input_.parse("train", ";", True, True)
input_.parse("test", ";", True, True)

cf = rs.CF(input_)

read as a vector of triplets (train) ... 0.442058 sec
Init SP matrix (alloc & set) ... 0.021199 sec

# readers (rows m) : 99285 , # books (cols n) : 320139
# non zero element: 1034802
density = 0.000601549

read as a vector of triplets (test) ... 0.024665 sec
length of test data : 114978


In [11]:
user_based_rmse_keep_nonzero = cf.user_based_rmse(-1, 40, 0, True)  #不要算全部

test time ... 84.8664 sec
avg ratio : 0.313825
rmse : 4.16334
baseline rmse : 3.86146


In [12]:
item_based_rmse_keep_nonzero = cf.item_based_rmse(-1, 40, 0, True)  #不要算全部

test time ... 142.018 sec
avg ratio : 0.313825
rmse : 3.70015
baseline rmse : 3.86146


In [13]:
user_based_rmse              = cf.user_based_rmse(-1, -1, 0, True)

test time ... 84.5335 sec
avg ratio : 0.313825
rmse : 4.16304
baseline rmse : 3.86146


In [14]:
item_based_rmse              = cf.item_based_rmse(-1, -1, 0, True)

test time ... 141.065 sec
avg ratio : 0.313825
rmse : 3.69594
baseline rmse : 3.86146


In [15]:
pd.DataFrame([[user_based_rmse, user_based_rmse_keep_nonzero],
              [item_based_rmse, item_based_rmse_keep_nonzero]],
             index=['user-based', 'item-based'], columns=['all_sim', 'only-nonzero_sim']).round(4)

Unnamed: 0,all_sim,only-nonzero_sim
user-based,4.163,4.1633
item-based,3.6959,3.7001


In [16]:
cf.recommend("user", "1", "user-based", -1, 0, 10, False)

calculate weighted sum of top k item vectors ... 0.044973 sec
# non-zero score recommendation items : 0
top n recommendation items for the given user : result_n=0


[]

In [17]:
cf.recommend("item", "1", "item-based", -1, 0, 10, False)

calculate weighted sum of top k user vectors ... 0.044668 sec
# non-zero score recommendation items : 0
top n recommendation items for the given user : result_n=0


[]