In [1]:
import pandas as pd
import numpy as np

## Load dataset

In [None]:
"""
Load json files into pandas dataframes
"""
def load_dataset(file_path):
    data_frame = pd.read_json(file_path, lines=True)
    print(">" * 50)
    print("Loading dataset from: ", file_path)
    print("Number of users: ", len(data_frame['user_id'].unique()))
    print("Number of items: ", len(data_frame['item_id'].unique()))
    print("Number of ratings: ", len(data_frame))
    print(">" * 50)
    return data_frame

train_df = load_dataset('goodreads_reviews_young_adult_train.json') # training data
val_df = load_dataset('goodreads_reviews_young_adult_val.json') # validation data
test_df = load_dataset('goodreads_reviews_young_adult_test.json') # test data

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Loading dataset from:  goodreads_reviews_young_adult_train.json
Number of users:  175518
Number of items:  78670
Number of ratings:  1433940
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Loading dataset from:  goodreads_reviews_young_adult_val.json
Number of users:  111805
Number of items:  50707
Number of ratings:  477980
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Loading dataset from:  goodreads_reviews_young_adult_test.json
Number of users:  111807
Number of items:  50783
Number of ratings:  477980
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


In [3]:
# see the data
train_df.head()

Unnamed: 0,user_id,item_id,review_id,rating
0,6b6166a96cc86d35e638243d7733eb86,7171637,f1075451f04436d47b0abc5e7116599f,0
1,91ff00cc3a6a988904fb4420fac6d63c,428263,00d4600d1ff2726b72aa513abdc9f954,3
2,da1a22733f209a30704ab4f55ca2af91,1217100,34a93cfc946ef38e9a0add45eae4b67f,1
3,599ce0c6a5a5cf980c38f20c24dd4711,9462775,cd30039367407745b4c33e7f98d73ce7,3
4,4fd894ab67e02dfcffb797c7380f5d87,7857408,bec621eb2e788344b173aeb594b3b553,3


Map user_id and item_id to a sequence index, so that it ranges from [0, N]

In [4]:
def map_index(train_df):
    user_id = train_df['user_id'].tolist()
    item_id = train_df['item_id'].tolist()
    sorted_user_id = sorted(set(user_id))
    sorted_item_id = sorted(set(item_id))
    userid_map = {user_id:i for i, user_id in enumerate(sorted_user_id)}
    itemid_map = {item_id:i for i, item_id in enumerate(sorted_item_id)}
    train_df['map_userid'] = train_df['user_id'].apply(lambda x: userid_map[x])
    train_df['map_itemid'] = train_df['item_id'].apply(lambda x: itemid_map[x])

    return train_df, userid_map, itemid_map

train_df, userid_map, itemid_map = map_index(train_df)
train_df.head()

Unnamed: 0,user_id,item_id,review_id,rating,map_userid,map_itemid
0,6b6166a96cc86d35e638243d7733eb86,7171637,f1075451f04436d47b0abc5e7116599f,0,73818,15771
1,91ff00cc3a6a988904fb4420fac6d63c,428263,00d4600d1ff2726b72aa513abdc9f954,3,100227,3308
2,da1a22733f209a30704ab4f55ca2af91,1217100,34a93cfc946ef38e9a0add45eae4b67f,1,149341,6932
3,599ce0c6a5a5cf980c38f20c24dd4711,9462775,cd30039367407745b4c33e7f98d73ce7,3,61424,20436
4,4fd894ab67e02dfcffb797c7380f5d87,7857408,bec621eb2e788344b173aeb594b3b553,3,54879,17123


## 1. Explore biases

In [5]:
# Compute the global bias
def compute_global_bias(train_df):
    return train_df['rating'].mean()

# Compute the user bias of a specific user
def compute_user_bias(train_df, bg, user_id):
    user_bias = train_df[train_df['user_id'] == user_id]['rating'].mean() - bg
    return user_bias

# Compute the item bias of a specific item
def compute_item_bias(train_df, bg, item_id):
    item_bias = train_df[train_df['item_id'] == item_id]['rating'].mean() - bg
    return item_bias

bg = compute_global_bias(train_df)
print("Global bias: ", bg)
user_bias = compute_user_bias(train_df, bg, '91ceb82d91493506532feb02ce751ce7')
print("The user specific bias of user id = “91ceb82d91493506532feb02ce751ce7”: ", user_bias)
item_bias = compute_item_bias(train_df, bg, 6931234)
print("The item specific bias of item id = “6931234”: ", item_bias)

Global bias:  3.7634559326052694
The user specific bias of user id = “91ceb82d91493506532feb02ce751ce7”:  -0.9974984857967586
The item specific bias of item id = “6931234”:  -0.24732690034720495


## 2. Implement the regularized latent factor model without bias using SGD

Function for computing RMSE

In [6]:
def RMSE(groundtruth, prediction):
    sum = 0
    for gt, pred in zip(groundtruth, prediction):
        sum += (gt - pred) ** 2
    mse = sum / len(groundtruth)
    rmse = np.sqrt(mse)
    return rmse

Implement the regularized latent factor model without bias using SGD

In [7]:
# Class for the latent factor model without bias
class LFMwoBias:
    def __init__(self, k, lr, lambda_1, lambda_2, userid_map, itemid_map):
        self.k = k # number of latent factors
        self.lr = lr # learning rate
        # regularization hyperparameters
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        # user_id and item_id mapping
        self.userid_map = userid_map
        self.itemid_map = itemid_map
        # initialize P and Q by random normal distribution
        self.P = np.random.normal(loc=0, scale=1, size=(len(itemid_map), k))
        self.Q = np.random.normal(loc=0, scale=1, size=(len(userid_map), k))
    
    # use SGD to train the model
    def fit(self, train_df, epochs):
        for epoch in range(epochs):
            for row in train_df.itertuples():
                user_id = self.userid_map[row.user_id]
                item_id = self.itemid_map[row.item_id]
                r = row.rating

                # compute the prediction and loss
                pred_r = np.dot(self.P[item_id], self.Q[user_id])
                loss = r - pred_r

                # compute the gradient
                gradient_p = -2 * loss * self.Q[user_id] + 2 * self.lambda_2 * self.P[item_id]
                gradient_q = -2 * loss * self.P[item_id] + 2 * self.lambda_1 * self.Q[user_id]

                # update P and Q
                self.P[item_id] = self.P[item_id] - self.lr * gradient_p
                self.Q[user_id] = self.Q[user_id] - self.lr * gradient_q

            # compute the RMSE for each epoch
            preds = []
            gts = train_df['rating'].tolist()
            for row in train_df.itertuples():
                user_id = self.userid_map[row.user_id]
                item_id = self.itemid_map[row.item_id]
                preds.append(np.dot(self.P[item_id], self.Q[user_id]))
            rmse = RMSE(gts, preds)
            print("Epoch: ", epoch+1, ", RMSE: ", rmse)

    # predict the rating of a user-item pair
    def predict(self, user_id, item_id):
        # if the user_id or item_id is not in the training set, initialize P and Q by random normal distribution
        if user_id not in self.userid_map:
            q = np.random.normal(loc=0, scale=1, size=(self.k))
        else:
            q = self.Q[self.userid_map[user_id]] 
        if item_id not in self.itemid_map:
            p = np.random.normal(loc=0, scale=1, size=(self.k))
        else:
            p = self.P[self.itemid_map[item_id]]
        pred = np.dot(p, q)
        return pred


train the model when k=8

In [8]:
model_k8 = LFMwoBias(k=8, lr=0.01, lambda_1=0.3, lambda_2=0.3, userid_map=userid_map, itemid_map=itemid_map)
model_k8.fit(train_df, epochs=10)

Epoch:  1 , RMSE:  3.0879025118024415
Epoch:  2 , RMSE:  2.17980890384628
Epoch:  3 , RMSE:  1.7761222980429023
Epoch:  4 , RMSE:  1.5446455886159298
Epoch:  5 , RMSE:  1.3969658314302287
Epoch:  6 , RMSE:  1.2967033655904485
Epoch:  7 , RMSE:  1.2255497662474968
Epoch:  8 , RMSE:  1.1733345139315552
Epoch:  9 , RMSE:  1.1339969382748734
Epoch:  10 , RMSE:  1.1037152964730346


train the model when k=4

In [9]:
model_k4 = LFMwoBias(k=4, lr=0.01, lambda_1=0.3, lambda_2=0.3, userid_map=userid_map, itemid_map=itemid_map)
model_k4.fit(train_df, epochs=10)

Epoch:  1 , RMSE:  3.4885440718829694
Epoch:  2 , RMSE:  2.380794571128425
Epoch:  3 , RMSE:  1.9247162951569834
Epoch:  4 , RMSE:  1.6662961478223834
Epoch:  5 , RMSE:  1.4989942621595547
Epoch:  6 , RMSE:  1.3829382083093162
Epoch:  7 , RMSE:  1.2988218008216301
Epoch:  8 , RMSE:  1.2360227695993369
Epoch:  9 , RMSE:  1.1881047827695514
Epoch:  10 , RMSE:  1.1508967165336603


train the model when k=16

In [10]:
model_k16 = LFMwoBias(k=16, lr=0.01, lambda_1=0.3, lambda_2=0.3, userid_map=userid_map, itemid_map=itemid_map)
model_k16.fit(train_df, epochs=10)

Epoch:  1 , RMSE:  2.915461639496118
Epoch:  2 , RMSE:  2.032617240158386
Epoch:  3 , RMSE:  1.6617923330559556
Epoch:  4 , RMSE:  1.4585213787914093
Epoch:  5 , RMSE:  1.3314851000112553
Epoch:  6 , RMSE:  1.245524438502207
Epoch:  7 , RMSE:  1.1841870349073593
Epoch:  8 , RMSE:  1.1387453237883296
Epoch:  9 , RMSE:  1.1041206665819667
Epoch:  10 , RMSE:  1.0771488528101754


Compute the RMSE for each value of k on the validation data.

In [11]:
# calculate the RMSE of a model on a dataset
def cal_rmse(model, df):
    preds = []
    gts = df['rating'].tolist()
    for row in df.itertuples():
        user_id = row.user_id
        item_id = row.item_id
        pred = model.predict(user_id, item_id)
        preds.append(pred)
    rmse = RMSE(gts, preds)
    return rmse

In [12]:
rmse_k4 = cal_rmse(model_k4, val_df)
rmse_k8 = cal_rmse(model_k8, val_df)
rmse_k16 = cal_rmse(model_k16, val_df)

print("RMSE on the validation set of k=4: ", rmse_k4)
print("RMSE on the validation set of k=8: ", rmse_k8)
print("RMSE on the validation set of k=16: ", rmse_k16)

RMSE on the validation set of k=4:  1.8063713084919952
RMSE on the validation set of k=8:  1.8883039935503667
RMSE on the validation set of k=16:  2.007465162823557


The model when k=4 has the best RMSE.

Compute its RMSE on the test data.

In [13]:
model_best = model_k4
rmse = cal_rmse(model_best, test_df)
print("RMSE of the best model on the test set: ", rmse)

RMSE of the best model on the test set:  1.799751940361155


## 3. Implement the regularized latent factor model with bias using SGD

Compute user bias and item bias for all users and items

In [14]:
global_bias = compute_global_bias(train_df)
train_df["user_bias"] = train_df.groupby('user_id')['rating'].transform(lambda x: x.mean() - global_bias)
train_df["item_bias"] = train_df.groupby('item_id')['rating'].transform(lambda x: x.mean() - global_bias)

In [15]:
train_df.head()

Unnamed: 0,user_id,item_id,review_id,rating,map_userid,map_itemid,user_bias,item_bias
0,6b6166a96cc86d35e638243d7733eb86,7171637,f1075451f04436d47b0abc5e7116599f,0,73818,15771,-1.096789,0.343885
1,91ff00cc3a6a988904fb4420fac6d63c,428263,00d4600d1ff2726b72aa513abdc9f954,3,100227,3308,-0.172547,-0.142382
2,da1a22733f209a30704ab4f55ca2af91,1217100,34a93cfc946ef38e9a0add45eae4b67f,1,149341,6932,-1.528162,0.027971
3,599ce0c6a5a5cf980c38f20c24dd4711,9462775,cd30039367407745b4c33e7f98d73ce7,3,61424,20436,-0.529081,-0.173936
4,4fd894ab67e02dfcffb797c7380f5d87,7857408,bec621eb2e788344b173aeb594b3b553,3,54879,17123,-0.319011,-1.263456


Implement the regularized latent factor model with bias using SGD

In [16]:
# Class for the latent factor model with bias
class LFMwBias:
    def __init__(self, k, lr, lambda_1, lambda_2, lambda_3, lambda_4, userid_map, itemid_map):

        self.k = k # number of latent factors
        self.lr = lr # learning rate
        # regularization hyperparameters
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.lambda_3 = lambda_3
        self.lambda_4 = lambda_4
        # user_id and item_id mapping
        self.userid_map = userid_map
        self.itemid_map = itemid_map
        # initialize P and Q by random normal distribution
        self.P = np.random.normal(loc=0, scale=1, size=(len(itemid_map), k))
        self.Q = np.random.normal(loc=0, scale=1, size=(len(userid_map), k))

    # use SGD to train the model
    def fit(self, train_df, epochs):
        self.global_bias = compute_global_bias(train_df)
        df_user_bias = train_df.groupby('user_id').agg({'user_bias': 'max'})
        df_item_bias = train_df.groupby('item_id').agg({'item_bias': 'max'})
        self.user_bias = {user_id: bias for user_id, bias in zip(df_user_bias.index, df_user_bias['user_bias'])}
        self.item_bias = {item_id: bias for item_id, bias in zip(df_item_bias.index, df_item_bias['item_bias'])}
        
        for epoch in range(epochs):
            for row in train_df.itertuples():
                ori_user_id = row.user_id
                ori_item_id = row.item_id
                user_id = self.userid_map[ori_user_id]
                item_id = self.itemid_map[ori_item_id]
                r = row.rating

                # get the user bias and item bias
                b_user = self.user_bias[ori_user_id]
                b_item = self.item_bias[ori_item_id]

                # compute the prediction and loss
                pred_r = np.dot(self.P[item_id], self.Q[user_id]) + self.global_bias + b_user + b_item
                loss = r - pred_r

                # compute the gradient
                gradient_p = -2 * loss * self.Q[user_id] + 2 * self.lambda_2 * self.P[item_id]
                gradient_q = -2 * loss * self.P[item_id] + 2 * self.lambda_1 * self.Q[user_id]
                gradient_b_user = - 2 * loss + 2 * self.lambda_3 * b_user
                gradient_b_item = -2 * loss + 2 * self.lambda_4 * b_item

                # update P, Q, user_bias and item_bias
                self.P[item_id] = self.P[item_id] - self.lr * gradient_p
                self.Q[user_id] = self.Q[user_id] - self.lr * gradient_q
                self.user_bias[ori_user_id] = b_user - self.lr * gradient_b_user
                self.item_bias[ori_item_id] = b_item - self.lr * gradient_b_item

            # compute the RMSE for each epoch
            preds = []
            gts = train_df['rating'].tolist()
            for row in train_df.itertuples():
                ori_user_id = row.user_id
                ori_item_id = row.item_id
                user_id = self.userid_map[ori_user_id]
                item_id = self.itemid_map[ori_item_id]
                b_user = self.user_bias[ori_user_id]
                b_item = self.item_bias[ori_item_id]
                pred = np.dot(self.P[item_id], self.Q[user_id]) + self.global_bias + b_user + b_item
                preds.append(pred)
            rmse = RMSE(gts, preds)
            print("Epoch: ", epoch+1, ", RMSE: ", rmse)

    # predict the rating of a user-item pair
    def predict(self, user_id, item_id):
        # if the user_id or item_id is not in the training set, initialize P and Q by random normal distribution
        # if the user_id or item_id is not in the training set, initialize user_bias and item_bias by 0
        if user_id not in self.userid_map:
            q = np.random.normal(loc=0, scale=1, size=(self.k))
            b_user = 0
        else:
            q = self.Q[self.userid_map[user_id]] 
            b_user = self.user_bias[user_id]
        if item_id not in self.itemid_map:
            p = np.random.normal(loc=0, scale=1, size=(self.k))
            b_item = 0
        else:
            p = self.P[self.itemid_map[item_id]]
            b_item = self.item_bias[item_id]
        pred = np.dot(p, q) + self.global_bias + b_user + b_item
        return pred


train the model when k=8

In [17]:
model_k8 = LFMwBias(k=8, lr=0.01, lambda_1=0.3, lambda_2=0.3, lambda_3=0.3, lambda_4=0.3, userid_map=userid_map, itemid_map=itemid_map)
model_k8.fit(train_df, epochs=10)

Epoch:  1 , RMSE:  1.0741372910830298
Epoch:  2 , RMSE:  0.9773417704528685
Epoch:  3 , RMSE:  0.94712139775978
Epoch:  4 , RMSE:  0.9348071364424908
Epoch:  5 , RMSE:  0.929070574668833
Epoch:  6 , RMSE:  0.9261940058972807
Epoch:  7 , RMSE:  0.9247056768372163
Epoch:  8 , RMSE:  0.9239454774921092
Epoch:  9 , RMSE:  0.9235890568712566
Epoch:  10 , RMSE:  0.9234654446951402


Report the learned user-specific bias of the user with user id= “91ceb82d91493506532feb02ce751ce7” , and the learned item-specific bias of the item with item id = “6931234”.

In [18]:
print("user bias of “91ceb82d91493506532feb02ce751ce7”:", model_k8.user_bias['91ceb82d91493506532feb02ce751ce7'])
print("item bias of “6931234”:", model_k8.item_bias[6931234])

user bias of “91ceb82d91493506532feb02ce751ce7”: -0.5628730990492314
item bias of “6931234”: -0.16827633953398888


train the model when k=4

In [19]:
model_k4 = LFMwBias(k=4, lr=0.01, lambda_1=0.3, lambda_2=0.3, lambda_3=0.3, lambda_4=0.3, userid_map=userid_map, itemid_map=itemid_map)
model_k4.fit(train_df, epochs=10)

Epoch:  1 , RMSE:  1.0457490098646074
Epoch:  2 , RMSE:  0.9846275246526599
Epoch:  3 , RMSE:  0.9629834774516193
Epoch:  4 , RMSE:  0.9529038558526138
Epoch:  5 , RMSE:  0.9475175061120047
Epoch:  6 , RMSE:  0.9443802171303045
Epoch:  7 , RMSE:  0.9424394016151174
Epoch:  8 , RMSE:  0.9411832079655854
Epoch:  9 , RMSE:  0.9403404573329069
Epoch:  10 , RMSE:  0.9397579076650767


train the model when k=16

In [20]:
model_k16 = LFMwBias(k=16, lr=0.01, lambda_1=0.3, lambda_2=0.3, lambda_3=0.3, lambda_4=0.3, userid_map=userid_map, itemid_map=itemid_map)
model_k16.fit(train_df, epochs=10)

Epoch:  1 , RMSE:  1.0481965364430295
Epoch:  2 , RMSE:  0.937822938078228
Epoch:  3 , RMSE:  0.9115104027617754
Epoch:  4 , RMSE:  0.9031351153421378
Epoch:  5 , RMSE:  0.9002586247128521
Epoch:  6 , RMSE:  0.8994553357498708
Epoch:  7 , RMSE:  0.8995409036164567
Epoch:  8 , RMSE:  0.9000363080531647
Epoch:  9 , RMSE:  0.9007245191804905
Epoch:  10 , RMSE:  0.9014987193034989


Compute the RMSE for each value of k on the validation data.

In [21]:
rmse_k4 = cal_rmse(model_k4, val_df)
rmse_k8 = cal_rmse(model_k8, val_df)
rmse_k16 = cal_rmse(model_k16, val_df)

print("RMSE on the validation set of k=4: ", rmse_k4)
print("RMSE on the validation set of k=8: ", rmse_k8)
print("RMSE on the validation set of k=16: ", rmse_k16)

RMSE on the validation set of k=4:  1.1640240712787042
RMSE on the validation set of k=8:  1.1971190191156318
RMSE on the validation set of k=16:  1.2608739517781484


The model when k=4 has the best RMSE.

Compute its RMSE on the test data.

In [22]:
model_best = model_k4
rmse = cal_rmse(model_best, test_df)
print("RMSE on the test set: ", rmse)

RMSE on the test set:  1.1613597168338194
