# Load Data

In [1]:
import pandas as pd

In [2]:
# Read the ratings export and keep only the three columns used downstream,
# renamed to snake_case (user_id, book_id, rating).
# For a quick smoke test, a small hand-built DataFrame with those same three
# columns can be substituted for the CSV.
df = (
    pd.read_csv("./input/ratings.csv")
    .loc[:, ["UserID", "BookID", "Rating"]]
    .rename(columns={"UserID": "user_id", "BookID": "book_id", "Rating": "rating"})
)

# Split Data

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the ratings for evaluation. The fixed random_state makes the
# shuffle deterministic, so the same rows land in each split after a kernel
# restart and the metrics computed from them are comparable between runs.
trainset, testset = train_test_split(df, test_size=0.2, random_state=42)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (99872, 3)
Test set size:  (24969, 3)


# SVD with the Surprise Library

In [5]:
from surprise import Dataset, Reader, accuracy, SVD

In [6]:
# Wrap both pandas splits for Surprise using one shared Reader (ratings 1-5).
reader = Reader(line_format="user item rating", rating_scale=(1, 5))

# Training side: a Surprise Trainset built from every row of the train split.
train_data = Dataset.load_from_df(trainset, reader).build_full_trainset()

# Evaluation side: the test split is turned into a trainset first and then
# flattened with build_testset() into the form model.test() is given later.
test_data = (
    Dataset.load_from_df(testset, reader)
    .build_full_trainset()
    .build_testset()
)

In [7]:
# Reference model from Surprise: 3 latent factors, 10 SGD epochs, the same
# regularisation strength (0.02) for every parameter, with per-epoch logging.
model = SVD(
    n_factors=3,
    n_epochs=10,
    reg_all=0.02,
    verbose=True,
)

In [8]:
# Train the Surprise model on the training split. The call is the cell's last
# expression, so the fitted model's repr is displayed after the epoch log.
model.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f263e877d90>

In [9]:
# Score the fitted Surprise model on the held-out ratings. accuracy.mae prints
# the MAE and, being the last expression, its return value is displayed too.
pred_test = model.test(test_data)
accuracy.mae(pred_test)

MAE:  0.5067


0.5066919966754352

In [10]:
# Inspect the user-factor matrix learned by the Surprise model
# (presumably one row per inner user id and n_factors columns - confirm).
# NOTE(review): this displays the whole array; a slice such as model.pu[:5]
# would keep the notebook output short.
model.pu

array([[ 9.14742428e-03,  1.19771456e-01,  1.35526348e-02],
       [-1.70947036e-01, -1.04158399e-01,  1.35403689e-01],
       [-5.32645342e-01,  2.32233836e-01,  1.50647436e-01],
       [-7.92280770e-02, -3.41521073e-02,  5.98471161e-02],
       [-5.80561117e-02, -4.18143731e-02,  6.09218253e-02],
       [ 1.86268703e-01,  3.06532600e-02, -2.77370298e-01],
       [-6.79833533e-02, -3.04382400e-02,  6.81356892e-02],
       [-4.19048360e-02, -1.73932231e-02, -1.40603460e-01],
       [-1.19875235e-01, -1.44965380e-02, -1.71999446e-01],
       [-1.41327519e-01,  8.38330575e-02,  2.77833897e-02],
       [-5.07352936e-02,  6.22968558e-02,  1.06132288e-01],
       [ 2.31777405e-01,  2.21901974e-01, -3.90609340e-01],
       [-1.45079163e-01,  1.09969865e-01, -1.93499561e-01],
       [-1.25463988e-01,  1.77446857e-02,  6.03929216e-02],
       [-5.10133065e-02, -9.07056056e-02, -3.96105868e-02],
       [ 4.88433744e-02,  1.83598414e-02, -1.28607722e-01],
       [ 5.52751844e-02,  1.99088028e-01

# Personal SVD

In [11]:
import numpy as np

In [12]:
def mae(testset, pred):
    """Return the mean absolute error between true and predicted ratings.

    Values are paired strictly by position, so any sized iterables can be
    passed: lists, tuples, NumPy arrays, or pandas Series. Index-based access
    (``testset[i]``) is avoided on purpose because on a pandas Series it is a
    label lookup, which silently mispairs values when the index is shuffled
    (as it is after a random train/test split).

    Args:
        testset: Sized iterable of ground-truth ratings.
        pred: Sized iterable of predicted ratings, same length as ``testset``.

    Returns:
        The average of ``abs(true - predicted)`` over all pairs.

    Raises:
        ValueError: If the two inputs differ in length, or if they are empty
            (the mean of zero errors is undefined).
    """
    n = len(testset)
    if n != len(pred):
        raise ValueError("testset and pred must have the same length")
    if n == 0:
        raise ValueError("testset and pred must not be empty")

    total_error = sum(abs(actual - estimate) for actual, estimate in zip(testset, pred))
    return total_error / n

In [17]:
class PersonalSVD():
    """Biased matrix-factorisation recommender trained with plain SGD.

    A rating of item ``i`` by user ``u`` is estimated as::

        global_mean + bu[u] + bi[i] + dot(qi[i], pu[u])

    The object passed to ``fit`` must expose the attributes and methods used
    below: ``n_users``, ``n_items``, ``global_mean``, ``rating_scale``,
    ``all_ratings()``, ``knows_user()``, ``knows_item()``, ``to_inner_uid()``
    and ``to_inner_iid()``.

    After ``fit`` the learned parameters are available as ``bu`` (user
    biases), ``bi`` (item biases), ``pu`` (user factors) and ``qi`` (item
    factors).
    """

    def __init__(self, n_factors=100, n_epochs=20, init_mean=0,
                 init_std_dev=.1, lr=.005,
                 reg=.02, verbose=False, random_state=None):
        """Store the hyper-parameters; no training happens here.

        Args:
            n_factors: Number of latent factors per user and per item.
            n_epochs: Number of passes over the training ratings.
            init_mean: Mean of the normal distribution used to initialise
                the factor matrices.
            init_std_dev: Standard deviation of that distribution.
            lr: SGD learning rate, shared by biases and factors.
            reg: L2 regularisation strength, shared by biases and factors.
            verbose: If true, print a line at the start of every epoch.
            random_state: Optional seed for the factor initialisation. With
                ``None`` (the default) the global NumPy random state is
                used, exactly as before this parameter existed.
        """
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.init_mean = init_mean
        self.init_std_dev = init_std_dev
        self.verbose = verbose
        self.lr = lr
        self.reg = reg
        self.random_state = random_state

    def fit(self, trainset):
        """Train on ``trainset`` and return ``self`` so calls can be chained.

        The trainset is kept on the instance because ``estimate`` and
        ``predict`` need its id mappings, global mean and rating scale.
        """
        self.trainset = trainset

        # (Re)initialise baselines so a refit never reuses stale biases.
        self.bu = self.bi = None
        self.sgd(trainset)

        return self

    def sgd(self, trainset):
        """Run stochastic gradient descent and store bu, bi, pu and qi."""
        # Seeded runs get their own generator; unseeded runs keep drawing
        # from the global NumPy state. getattr keeps instances created before
        # random_state existed (e.g. unpickled models) working.
        seed = getattr(self, "random_state", None)
        rng = np.random if seed is None else np.random.RandomState(seed)

        # user biases
        bu = np.zeros(trainset.n_users, dtype=np.double)
        # item biases
        bi = np.zeros(trainset.n_items, dtype=np.double)
        # user factors
        pu = rng.normal(loc=self.init_mean, scale=self.init_std_dev,
                        size=(trainset.n_users, self.n_factors))
        # item factors
        qi = rng.normal(loc=self.init_mean, scale=self.init_std_dev,
                        size=(trainset.n_items, self.n_factors))

        lr = self.lr
        reg = self.reg
        global_mean = self.trainset.global_mean

        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print("Processing epoch {}".format(current_epoch))

            for u, i, r in trainset.all_ratings():
                # Row views: in-place updates below write straight into pu/qi.
                pu_u = pu[u]
                qi_i = qi[i]

                # compute current error
                err = r - (global_mean + bu[u] + bi[i] + np.dot(qi_i, pu_u))

                bu[u] += lr * (err - reg * bu[u])
                bi[i] += lr * (err - reg * bi[i])

                # Both steps are computed from the pre-update factors before
                # either row is modified, so the update is simultaneous.
                pu_step = lr * (err * qi_i - reg * pu_u)
                qi_step = lr * (err * pu_u - reg * qi_i)
                pu_u += pu_step
                qi_i += qi_step

        self.bu = bu
        self.bi = bi
        self.pu = pu
        self.qi = qi

    def estimate(self, u, i):
        """Return the unclipped rating estimate for inner ids ``u`` and ``i``.

        Terms that depend on an id the trainset does not know are skipped, so
        a completely unknown pair falls back to the global mean.
        """
        known_user = self.trainset.knows_user(u)
        known_item = self.trainset.knows_item(i)

        est = self.trainset.global_mean

        if known_user:
            est += self.bu[u]

        if known_item:
            est += self.bi[i]

        if known_user and known_item:
            est += np.dot(self.qi[i], self.pu[u])

        return est

    @staticmethod
    def _inner_id_or_unknown(to_inner, raw_id):
        """Map a raw id to its inner id, or to a placeholder if unseen.

        ``to_inner`` is expected to raise ``ValueError`` for ids that were not
        in the training data; the placeholder string is then rejected by the
        ``knows_user`` / ``knows_item`` checks in ``estimate``.
        """
        try:
            return to_inner(raw_id)
        except ValueError:
            return "UKN__" + str(raw_id)

    def predict(self, testset):
        """Predict a rating for every row of ``testset``.

        Args:
            testset: pandas DataFrame with raw ids in the ``user_id`` and
                ``book_id`` columns.

        Returns:
            A list with one estimate per row, in row order, clipped to the
            trainset's rating scale. Users or books that never appeared in
            the training data no longer abort the whole call; they are
            estimated from whichever terms are still available.
        """
        lower_bound, higher_bound = self.trainset.rating_scale
        pred = []

        # Iterating the columns (rather than iterrows) keeps each id in its
        # own dtype instead of upcasting the whole row to a common one.
        for user_id, item_id in zip(testset["user_id"], testset["book_id"]):
            inner_uid = self._inner_id_or_unknown(self.trainset.to_inner_uid, user_id)
            inner_iid = self._inner_id_or_unknown(self.trainset.to_inner_iid, item_id)

            est = self.estimate(inner_uid, inner_iid)

            est = min(higher_bound, est)
            est = max(lower_bound, est)

            pred.append(est)

        return pred


In [18]:
# Train the hand-written model on the Surprise trainset (fit returns the model
# itself), then score it on the held-out pandas split with the local mae().
personalSVD = PersonalSVD(n_factors=100, n_epochs=30, reg=0.02, verbose=True).fit(train_data)

pred_test = personalSVD.predict(testset)

print(mae(testset["rating"].to_list(), pred_test))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
0.48524154279510295


# Export model

In [15]:
import pickle, gzip, pickletools

In [19]:
# Persist the trained model: pickle it, strip redundant pickle opcodes with
# pickletools.optimize, and write the bytes through a gzip stream.
with gzip.open("svd_model.pkl.gz", "wb") as archive:
    archive.write(pickletools.optimize(pickle.dumps(personalSVD)))