# Books Recommendation with SVD and Collaborative Filtering

## Import library

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset
import pickle, gzip, pickletools
from svd import SVD

## Import dataset

In [16]:
df = pd.read_csv("./input/ratings.csv")
df = pd.DataFrame({
    "user_id": df["UserID"],
    "book_id": df["BookID"],
    "rating": df["Rating"]
})
df

Unnamed: 0,user_id,book_id,rating
0,51e49f25-397d-43a1-a807-005933626d2e,1283,3
1,51e49f25-397d-43a1-a807-005933626d2e,11,3
2,51e49f25-397d-43a1-a807-005933626d2e,207,3
3,51e49f25-397d-43a1-a807-005933626d2e,312,3
4,51e49f25-397d-43a1-a807-005933626d2e,824,4
...,...,...,...
124836,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,861,4
124837,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,449,3
124838,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,664,3
124839,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,208,3


In [17]:
print(df["book_id"].unique().shape)

(886,)


## Split data

Split data into 80% training and 20% testing

In [18]:
trainset, testset = train_test_split(df, test_size=0.2)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (99872, 3)
Test set size:  (24969, 3)


In [19]:
reader = Reader(line_format="user item rating", rating_scale=(1,5))
train_data = Dataset.load_from_df(trainset, reader)

In [20]:
train_data = train_data.build_full_trainset()

## Define MAE as Evaluation Metric

In [21]:
def mae(testset, pred):
    if len(testset) != len(pred):
        raise ValueError("testset and pred must have the same length")

    n = len(testset)
    total_error = 0

    for i in range(n):
        total_error += abs(testset[i] - pred[i])

    return total_error / n

## Train and Test Model

### Train a SVD model with latent features = 50, reg = 0.02

In [22]:
model_1 = SVD(n_factors=50, n_epochs=30, reg=0.02, verbose=True)

In [23]:
model_1.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7f6774205d10>

Test model

In [47]:
pred_test = model_1.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))

MAE: 0.4866184484758057


### Train a SVD model with latent features = 50, reg = 0.05

In [25]:
model_2 = SVD(n_factors=50, n_epochs=30, reg=0.05, verbose=True)

In [26]:
model_2.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7f6777a56950>

Test model

In [46]:
pred_test = model_2.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))

MAE: 0.4850823124677568


### Train a SVD model with latent features = 100, reg = 0.02

In [28]:
model_3 = SVD(n_factors=100, n_epochs=30, reg=0.02, verbose=True)

In [29]:
model_3.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7f6774d48a90>

Test model

In [45]:
pred_test = model_3.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))

MAE: 0.4872577746132833


### Train a SVD model with latent features = 100, reg = 0.05

In [37]:
model_4 = SVD(n_factors=100, n_epochs=30, reg=0.05, verbose=True)

In [38]:
model_4.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7f6775f70790>

Test model

In [44]:
pred_test = model_4.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))

MAE: 0.4825398286199963


In [39]:
model_2.qi.shape

(886, 50)

# Dumping model to file

In [42]:
with gzip.open("../svd_model.pkl.gz", "wb") as f:
    pickled = pickle.dumps(model_4)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)