# Books Recommendation with SVD and Collaborative Filtering

## Import library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset
import pickle, gzip, pickletools
from svd import SVD

## Import dataset

In [2]:
df = pd.read_csv("./input/ratings.csv")
df

Unnamed: 0,user_id,book_id,rating
0,51e49f25-397d-43a1-a807-005933626d2e,1283,3
1,51e49f25-397d-43a1-a807-005933626d2e,11,3
2,51e49f25-397d-43a1-a807-005933626d2e,207,3
3,51e49f25-397d-43a1-a807-005933626d2e,312,3
4,51e49f25-397d-43a1-a807-005933626d2e,824,4
...,...,...,...
124836,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,861,4
124837,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,449,3
124838,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,664,3
124839,b06db5a6-f1ca-4f2a-ab7f-90109157e4be,208,3


In [4]:
print(df["book_id"].unique().shape)

(886,)


## Split data

Split data into 80% training and 20% testing

In [5]:
trainset, testset = train_test_split(df, test_size=0.2)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (99872, 3)
Test set size:  (24969, 3)


In [6]:
reader = Reader(line_format="user item rating", rating_scale=(1,5))
train_data = Dataset.load_from_df(trainset, reader)

In [7]:
train_data = train_data.build_full_trainset()

## Define MAE and RMSE as Evaluation Metric

In [8]:
def mae(testset, pred):
    if len(testset) != len(pred):
        raise ValueError("testset and pred must have the same length")

    n = len(testset)
    total_error = 0

    for i in range(n):
        total_error += abs(testset[i] - pred[i])

    return total_error / n

def rmse(testset, pred):
    if len(testset) != len(pred):
        raise ValueError("testset and pred must have the same length")


    test_arr = np.array(testset)
    pred_arr = np.array(pred)
    return np.sqrt(np.mean((pred-test_arr) ** 2))

## Train and Test Model

### Train a SVD model with latent features = 50, reg = 0

In [15]:
model_1 = SVD(n_factors=50, n_epochs=30, reg=0, verbose=True)

In [16]:
model_1.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7f7a009aa2d0>

Test model

In [55]:
pred_test = model_1.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))
print("RMSE:", rmse(testset["rating"].to_list(), pred_test))

MAE: 0.5174847224037403
RMSE: 0.740731690582656


### Train a SVD model with latent features = 50, reg = 0.02

In [12]:
model_2 = SVD(n_factors=50, n_epochs=30, reg=0.02, verbose=True)

In [20]:
model_2.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7f7a02260590>

Test model

In [27]:
pred_test = model_2.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))
print("RMSE:", rmse(testset["rating"].to_list(), pred_test))

MAE: 0.5053119666947016
RMSE: 0.7308640702676806


### Train a SVD model with latent features = 50, reg = 0.05

In [11]:
model_3 = SVD(n_factors=50, n_epochs=30, reg=0.05, verbose=True)

In [12]:
model_3.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7fc1c4df2590>

Test model

In [13]:
pred_test = model_3.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))
print("RMSE:", rmse(testset["rating"].to_list(), pred_test))

MAE: 0.48163084870594247
RMSE: 0.6890159542892594


### Train a SVD model with latent features = 100, reg = 0

In [None]:
model_4 = SVD(n_factors=100, n_epochs=30, reg=0, verbose=True)

In [None]:
model_4.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7f7a009ab150>

Test model

In [54]:
pred_test = model_4.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))
print("RMSE:", rmse(testset["rating"].to_list(), pred_test))

MAE: 0.5077931699215673
RMSE: 0.732955821409966


### Train a SVD model with latent features = 100, reg = 0.02

In [14]:
model_5 = SVD(n_factors=100, n_epochs=30, reg=0.02, verbose=True)

In [15]:
model_5.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7fc1c3f98050>

Test model

In [16]:
pred_test = model_5.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))
print("RMSE:", rmse(testset["rating"].to_list(), pred_test))

MAE: 0.4845280925566546
RMSE: 0.6996565784382274


### Train a SVD model with latent features = 100, reg = 0.05

In [17]:
model_6 = SVD(n_factors=100, n_epochs=30, reg=0.05, verbose=True)

In [18]:
model_6.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<svd.SVD at 0x7fc1c3f8f290>

Test model

In [19]:
pred_test = model_6.predict(testset)

print("MAE:", mae(testset["rating"].to_list(), pred_test))
print("RMSE:", rmse(testset["rating"].to_list(), pred_test))

MAE: 0.4783985941403227
RMSE: 0.6862631430955026


# Dumping model to file

In [56]:
with gzip.open("svd_model.pkl.gz", "wb") as f:
    pickled = pickle.dumps(model_6)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)