# Books Recommendation with SVD and Collaborative Filtering

## Import library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, accuracy, SVD
import pickle, gzip, pickletools

## Import dataset

In [6]:
df = pd.read_csv("./input/ratings.csv")
df = pd.DataFrame({
    "user_id": df["UserID"],
    "book_id": df["BookID"],
    "rating": df["Rating"]
})
df

Unnamed: 0,user_id,book_id,rating
0,51e49f25-397d-43a1-a807-005933626d2e,1283,3
1,51e49f25-397d-43a1-a807-005933626d2e,11,3
2,51e49f25-397d-43a1-a807-005933626d2e,207,3
3,51e49f25-397d-43a1-a807-005933626d2e,312,3
4,51e49f25-397d-43a1-a807-005933626d2e,824,4
...,...,...,...
124836,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,861,4
124837,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,449,3
124838,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,664,3
124839,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,208,3


In [3]:
print(df["book_id"].unique().shape)

(886,)


### Choose books with a certain number of ratings

In [7]:
df1 = df.drop("rating", axis=1)
df1 = pd.DataFrame(df1.groupby(["book_id"]).count())
df1.sort_values(by='user_id')

Unnamed: 0_level_0,user_id
book_id,Unnamed: 1_level_1
32,102
33,102
36,102
915,102
922,102
...,...
11,451
234,451
224,451
569,451


In [9]:
df2 = df1.loc[df1["user_id"] < 100]
df2

Unnamed: 0_level_0,user_id
book_id,Unnamed: 1_level_1


In [None]:
obsecure = df2.index.to_list()
obsecure

In [None]:
index_names = df[df["book_id"].isin(obsecure)].index
df = df.drop(index_names)
df

## Split data

Split data into 80% training and 20% testing

In [10]:
trainset, testset = train_test_split(df, test_size=0.2)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (99872, 3)
Test set size:  (24969, 3)


In [11]:
reader = Reader(line_format="user item rating", rating_scale=(1,5))
train_data = Dataset.load_from_df(trainset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [12]:
test_data = test_data.build_full_trainset()
test_data = test_data.build_testset()

In [13]:
train_data = train_data.build_full_trainset()

## Train Model

Train a SVD model with 200 latent features

In [14]:
model = SVD(n_factors=100, n_epochs=30, reg_all=0.02, verbose=True)

In [15]:
model.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f7b00447590>

## Test Model

In [16]:
pred_test = model.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.6928


0.6928317750807189

In [17]:
df_predictions = pd.DataFrame(pred_test, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_predictions['err'] = abs(df_predictions.est - df_predictions.rui)

df_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,err
0,0c850f92-460b-4abc-b851-4bcd0ccbc94e,116,4.0,4.212945,{'was_impossible': False},0.212945
1,0c850f92-460b-4abc-b851-4bcd0ccbc94e,421,2.0,3.097976,{'was_impossible': False},1.097976
2,0c850f92-460b-4abc-b851-4bcd0ccbc94e,6,2.0,2.576427,{'was_impossible': False},0.576427
3,0c850f92-460b-4abc-b851-4bcd0ccbc94e,60,5.0,3.388168,{'was_impossible': False},1.611832
4,0c850f92-460b-4abc-b851-4bcd0ccbc94e,608,3.0,1.919026,{'was_impossible': False},1.080974


## Find Best Prediction

In [18]:
best_predictions = df_predictions.sort_values(by='err')[:10]
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
16936,7bdb2ac3-5c7a-4ec6-a945-5ec837e06eda,343,1.0,1.0,{'was_impossible': False},0.0
16931,7bdb2ac3-5c7a-4ec6-a945-5ec837e06eda,926,1.0,1.0,{'was_impossible': False},0.0
6532,70decac3-bd50-47a4-8e45-33c10f5c6926,587,5.0,5.0,{'was_impossible': False},0.0
6537,70decac3-bd50-47a4-8e45-33c10f5c6926,254,5.0,5.0,{'was_impossible': False},0.0
6485,70decac3-bd50-47a4-8e45-33c10f5c6926,751,5.0,5.0,{'was_impossible': False},0.0
6487,70decac3-bd50-47a4-8e45-33c10f5c6926,64,5.0,5.0,{'was_impossible': False},0.0
22375,7a8b2147-a889-4c10-964a-19c23993f63a,271,5.0,5.0,{'was_impossible': False},0.0
18769,78f3a39c-08d2-4dce-ad38-0d47c8348a72,219,5.0,5.0,{'was_impossible': False},0.0
16898,7bdb2ac3-5c7a-4ec6-a945-5ec837e06eda,638,1.0,1.0,{'was_impossible': False},0.0
16900,7bdb2ac3-5c7a-4ec6-a945-5ec837e06eda,109,1.0,1.0,{'was_impossible': False},0.0


## Find Worst Prediction

In [19]:
worst_predictions = df_predictions.sort_values(by='err')[-10:]
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
16884,7bdb2ac3-5c7a-4ec6-a945-5ec837e06eda,824,5.0,1.626474,{'was_impossible': False},3.373526
13019,ddc3cbab-703b-4d49-b80b-89e3b884c802,627,1.0,4.41676,{'was_impossible': False},3.41676
5722,85803de9-6591-4883-8f67-ad6dd5a405bc,335,1.0,4.490394,{'was_impossible': False},3.490394
17685,593b3f3d-1983-4f20-b84e-6cc56117a4d5,593,1.0,4.653574,{'was_impossible': False},3.653574
18822,fb41a9bc-3277-46c8-962a-e3ef58cdd421,179,1.0,4.747558,{'was_impossible': False},3.747558
11762,f418f082-1417-4186-844e-517e64c4a66d,752,5.0,1.089636,{'was_impossible': False},3.910364
16923,7bdb2ac3-5c7a-4ec6-a945-5ec837e06eda,220,5.0,1.064182,{'was_impossible': False},3.935818
18010,4bad6b13-4387-4b44-9524-39d74b736a6b,754,1.0,4.937207,{'was_impossible': False},3.937207
24097,d6494f6e-01b6-409e-a039-dc7b213300f8,783,1.0,4.956767,{'was_impossible': False},3.956767
18050,4bad6b13-4387-4b44-9524-39d74b736a6b,277,1.0,5.0,{'was_impossible': False},4.0


## Now fitting to full data (excluding books with < 100 ratings)

In [20]:
data = Dataset.load_from_df(df, reader)
data = data.build_full_trainset()

model.fit(data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f7b00447590>

In [21]:
model.qi.shape

(886, 100)

In [22]:
pd.DataFrame(model.qi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.043023,-0.319651,-0.066558,-0.077602,-0.150253,0.297176,-0.125661,-0.001561,0.289335,0.061583,...,0.029865,0.072919,-0.043275,0.177479,0.191562,-0.208198,0.024303,0.459184,-0.068537,0.029893
1,-0.214452,-0.101327,0.086615,0.048393,-0.054353,0.206596,-0.007356,0.046981,0.159609,-0.095499,...,0.276162,0.119729,0.019788,-0.188448,0.072627,0.011838,-0.143431,-0.059901,0.109201,-0.108005
2,0.225720,-0.167290,0.222587,-0.191411,0.314358,0.184888,0.353982,0.299889,-0.027567,0.025194,...,0.212401,-0.037135,0.049537,-0.044625,0.074143,-0.299344,-0.149682,0.046827,0.242303,0.112694
3,0.124207,-0.019404,-0.065053,0.078118,-0.147969,0.006256,-0.203950,-0.257661,-0.279606,-0.088558,...,-0.267170,0.266495,-0.116434,0.021859,-0.132600,-0.211786,-0.188322,0.407447,-0.023852,0.065913
4,-0.060680,-0.279450,-0.139020,-0.044696,0.081966,0.160476,0.039470,-0.079144,0.120323,0.240208,...,0.162350,-0.094781,-0.142539,-0.001717,0.187580,-0.155628,-0.253873,0.090758,0.097053,-0.016212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,0.168362,0.047393,0.205747,0.061471,0.003274,-0.104947,0.275797,-0.211896,0.062653,-0.039534,...,0.258914,0.045883,-0.018999,0.173652,0.035382,0.120457,0.141245,0.047813,-0.033542,-0.094441
882,-0.078118,-0.055733,0.177521,0.153407,-0.073289,-0.354206,-0.058046,-0.092125,-0.096062,0.049735,...,-0.143682,0.056484,0.060044,0.212731,0.187492,0.047430,0.360591,0.139575,0.030008,-0.231229
883,0.137348,0.281783,0.245553,-0.009197,0.225086,-0.208190,-0.006119,-0.038175,0.139033,-0.118896,...,0.214449,-0.040270,-0.212041,0.079835,0.049267,0.195690,-0.047314,0.146207,0.164586,-0.083914
884,0.080326,-0.145351,0.353249,0.004022,0.151557,-0.088666,0.051644,-0.005518,0.213196,-0.276439,...,-0.090508,0.117698,-0.025483,-0.083395,0.053403,0.174225,-0.196947,-0.064747,-0.174062,-0.138868


# Dumping model to file

In [23]:
with gzip.open("svd_model. h5", "wb") as f:
    pickled = pickle.dumps(model)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)