# Test Data

In [316]:
import pandas as pd

In [317]:
# user_id = [1, 2, 3, 4, 5]
# book_id = [101, 102, 103, 104, 105]
# rating = [4, 3, 5, 2, 4]

# Create DataFrame
# df = pd.DataFrame({
#     "user_id": user_id,
#     "book_id": book_id,
#     "rating": rating
# })

# print(df)

df = pd.read_csv("./input/ratings.csv")
df = pd.DataFrame({
    "user_id": df["UserID"],
    "book_id": df["BookID"],
    "rating": df["Rating"]
})

# Split Data

In [318]:
from sklearn.model_selection import train_test_split

In [319]:
trainset, testset = train_test_split(df, test_size=0.2)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (99872, 3)
Test set size:  (24969, 3)


# Implement SVD Surprise

In [320]:
from surprise import Dataset, Reader, accuracy, SVD

In [321]:
reader = Reader(line_format="user item rating", rating_scale=(1,5))
train_data = Dataset.load_from_df(trainset, reader)
test_data = Dataset.load_from_df(testset, reader)

test_data = test_data.build_full_trainset()
test_data = test_data.build_testset()

train_data = train_data.build_full_trainset()

In [322]:
model = SVD(n_factors=3, n_epochs=10, reg_all=0.02, verbose=True)

In [323]:
model.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f771bcede50>

In [324]:
pred_test = model.test(test_data)
accuracy.mae(pred_test)

MAE:  0.5043


0.5043383745522805

In [325]:
model.pu

array([[ 8.60460296e-03,  1.75130133e-02,  1.08934466e-01],
       [-5.08471077e-02,  5.28859174e-03,  6.60989753e-02],
       [ 1.50425339e-01, -6.66902807e-02, -1.24281347e-01],
       [-3.44061659e-02,  4.18509999e-02,  1.23411265e-01],
       [-1.44268077e-02,  3.83870137e-03, -8.06854420e-04],
       [ 6.00090919e-02,  1.22185267e-02, -1.67288812e-01],
       [-9.38640185e-02, -1.36520787e-01, -3.54679713e-03],
       [ 3.56625751e-02,  1.51364311e-02,  1.39875824e-01],
       [ 2.84722100e-03, -2.19481293e-02, -9.29817852e-03],
       [-6.25380203e-02,  1.62660192e-01, -1.59955945e-02],
       [-5.77702006e-02,  5.70858645e-02, -8.79344833e-02],
       [-1.19166410e-01, -4.17995132e-02,  1.37166988e-01],
       [ 4.80592423e-02, -4.12562222e-02, -6.82399177e-02],
       [-1.69035454e-03, -1.58507249e-02,  3.68231583e-02],
       [-3.22278403e-02, -6.76245391e-02,  1.30980792e-02],
       [-5.41918304e-02,  2.85173085e-02,  3.27341985e-02],
       [-1.15010055e-01,  1.32272963e-01

# Personal SVD

In [326]:
import numpy as np

In [327]:
def mae(testset, pred):
    if len(testset) != len(pred):
        raise ValueError("testset and pred must have the same length")

    n = len(testset)
    total_error = 0

    for i in range(n):
        total_error += abs(testset[i] - pred[i])

    return total_error / n

In [328]:
class PersonalSVD():
    def __init__(self, n_factors=100, n_epochs=20, init_mean=0,
                 init_std_dev=.1, lr=.005,
                 reg=.02, verbose=False):
        
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.init_mean = init_mean
        self.init_std_dev = init_std_dev
        self.verbose=verbose
        self.lr = lr
        self.reg = reg
    
    def fit(self, trainset):
        self.trainset = trainset

        # (re) Initialise baselines
        self.bu = self.bi = None
        self.sgd(trainset)
        
        return self

    def sgd(self, trainset):
        rng = np.random.mtrand._rand

        # user biases
        bu = np.zeros(trainset.n_users, dtype=np.double)
        # item biases
        bi = np.zeros(trainset.n_items, dtype=np.double)
        # user factors
        pu = np.random.normal(loc=self.init_mean, scale=self.init_std_dev, size=(trainset.n_users, self.n_factors))
        # item factors
        qi = np.random.normal(loc=self.init_mean, scale=self.init_std_dev, size=(trainset.n_items, self.n_factors))

        n_factors = self.n_factors
        lr = self.lr
        reg = self.reg
        global_mean = self.trainset.global_mean
        # global_mean = 0
        
        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print("Processing epoch {}".format(current_epoch))

            for u, i, r in trainset.all_ratings():
                # compute current error
                dot = 0  # <q_i, p_u>
                for f in range(n_factors):
                    dot += qi[i, f] * pu[u, f]
                err = r - (global_mean + bu[u] + bi[i] + dot)

                bu[u] += lr * (err - reg * bu[u])
                bi[i] += lr * (err - reg * bi[i])

                # update factors
                for f in range(n_factors):
                    puf = pu[u, f]
                    qif = qi[i, f]
                    pu[u, f] += lr * (err * qif - reg * puf)
                    qi[i, f] += lr * (err * puf - reg * qif)
        
        self.bu = np.asarray(bu)
        self.bi = np.asarray(bi)
        self.pu = np.asarray(pu)
        self.qi = np.asarray(qi)
  
    def estimate(self, u, i):
        known_user = self.trainset.knows_user(u)
        known_item = self.trainset.knows_item(i)

        est = self.trainset.global_mean

        if known_user:
            est += self.bu[u]

        if known_item:
            est += self.bi[i]

        if known_user and known_item:
            est += np.dot(self.qi[i], self.pu[u])

        return est
    
    def predict(self, testset):
        pred = []

        for index, row in testset.iterrows():
            user_id = row['user_id']
            item_id = row['book_id']
          
            inner_uid = self.trainset.to_inner_uid(user_id)
            inner_iid = self.trainset.to_inner_iid(item_id)
            
            est = self.estimate(inner_uid, inner_iid)
            
            lower_bound, higher_bound = self.trainset.rating_scale
            est = min(higher_bound, est)
            est = max(lower_bound, est)
            
            pred.append(est)
        
        return pred


In [329]:
personalSVD = PersonalSVD(n_factors=3, n_epochs=10, reg=0.02, verbose=True)
personalSVD.fit(train_data)

# print("bu")
# print(personalSVD.bu)
# print("bi")
# print(personalSVD.bi)
# print("pu")
# print(personalSVD.pu)
# print("qi")
# print(personalSVD.qi)

pred_test = personalSVD.predict(testset)

print(mae(testset["rating"].to_list(), pred_test))

# print(testset)
# print(test_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
0.5045082203538539
