In [1]:
import pandas as pd
import numpy as np

df = pd.read_json('ratings.jsonl', lines=True)
df.head()
df.tail()

Unnamed: 0,UserId,ItemId,Timestamp,Rating
659715,3d7e93cbd0,450f41b5d3,2017-02-18 00:10:19,7
659716,3d7e93cbd0,80d1dae630,2017-06-09 22:46:29,7
659717,7804b284a3,0759d2567b,2014-12-28 04:43:49,4
659718,6648728db7,28fb7af42b,2013-06-03 00:57:27,1
659719,e7293de34b,f72dcb10e8,2013-03-09 13:37:35,6


In [2]:
df.drop(columns =["Timestamp"], inplace = True)
df.head()

Unnamed: 0,UserId,ItemId,Rating
0,c4ca4238a0,91766eac45,8
1,c81e728d9d,5c739554f7,9
2,c81e728d9d,48f6d7ce7c,8
3,c81e728d9d,e9318d627a,1
4,a87ff679a2,17e6357973,8


In [4]:
#unique users
unique_users = df['UserId'].unique()
print(len(unique_users))
#unique items
unique_items = df['ItemId'].unique()
print(len(unique_items))

ratings = df['Rating'].values

user_to_index = {user: i for i, user in enumerate(unique_users)}
item_to_index = {item: i for i, item in enumerate(unique_items)}

#vectorize unique users and items
user_indices = df['UserId'].map(user_to_index).values
item_indices = df['ItemId'].map(item_to_index).values   


51671
29674


In [5]:
#hyperparameters
learning_rate = 0.0095 #Learning rate
num_epochs = 19        #Number of epochs
num_factors = 15       #Number of latent factors
lambda_l2 = 0.095      #Regularization parameter for L2
lambda_l1 = 0.01       #Regularization parameter for L1


#number of unique users and items
num_users = len(unique_users)
num_items = len(unique_items)

#initialize user and item matrices with a uniform distribution based on the xavier initialization
np.random.seed(12)
user_matrix = np.random.uniform(-np.sqrt(6 / (num_users + num_factors)), np.sqrt(6 / (num_users + num_factors)), size=(num_users, num_factors))
item_matrix = np.random.uniform(-np.sqrt(6 / (num_items + num_factors)), np.sqrt(6 / (num_items + num_factors)), size=(num_items, num_factors))

#dataset ratings mean
ratings_mean = np.mean(ratings)

#initialize user and item biases as an array of zeros
user_bias = np.zeros(num_users)
item_bias = np.zeros(num_items)

#batch size 
batch_size = 400 

#number of batches
batch_num = int(len(ratings) / batch_size) 

#training loop, for each epoch, shuffle the ratings
for epoch in range(num_epochs):
    
    #shuffle the ratings, to avoid overfitting
    shuffled = np.random.choice(len(ratings), len(ratings), replace=False)

    #for each batch, update the user and item matrices and biases
    for batch in range(batch_num):

        #get the batch indices, by taking the batch size and multiplying it by the batch number
        batch_indices = shuffled[batch * batch_size: (batch + 1) * batch_size]
        
        #get the user and item indices and ratings for the batch by taking the batch indices
        user_batch = user_indices[batch_indices]
        item_batch = item_indices[batch_indices]
        rating_batch = ratings[batch_indices]

        #get the user and item matrices for the batch
        user_batch_matrix = user_matrix[user_batch, :]
        item_batch_matrix = item_matrix[item_batch, :]
        
        #get the dot product of the user and item matrices, to get the predicted ratings
        product = np.sum(user_batch_matrix * item_batch_matrix, axis=1)

        #get the predicted ratings by adding the user and item biases and the ratings mean to the dot product
        rating_hat = product + user_bias[user_batch] + item_bias[item_batch] + ratings_mean

        #calculate the error by subtracting the predicted ratings from the actual ratings
        error = rating_batch - rating_hat

        #calculate the gradients for the user and item biases, by multiplying the learning rate by the error, adding the regularization terms and the biases
        user_bias[user_batch] += learning_rate * (error - lambda_l2 * user_bias[user_batch] - lambda_l1 * np.sign(user_bias[user_batch]))
        item_bias[item_batch] += learning_rate * (error - lambda_l2 * item_bias[item_batch] - lambda_l1 * np.sign(item_bias[item_batch]))

        #calculate the gradients for the user and item matrices, by multiplying the learning rate by the error, adding the regularization terms and the matrices
        user_matrix[user_batch, :] += learning_rate * (error[:, np.newaxis] * item_matrix[item_batch, :] - lambda_l2 * user_matrix[user_batch, :] - lambda_l1 * np.sign(user_matrix[user_batch, :]))
        item_matrix[item_batch, :] += learning_rate * (error[:, np.newaxis] * user_matrix[user_batch, :] - lambda_l2 * item_matrix[item_batch, :] - lambda_l1 * np.sign(item_matrix[item_batch, :]))


In [8]:
#read targets.csv
df_targets = pd.read_csv("targets.csv")



#function to round the predicted rating to the nearest integer if the decimal part is greater than or equal to 0.95
def custom_round(value):
    decimal_part = value - int(value)
    if decimal_part >= 0.95:
        return int(value) + 1
    else:
        return value

#create new column fro the rating predictions
df_targets['Rating'] = 0


#for each pair (user,item) in the targets.csv file, predict the rating and print it
for _, row in df_targets.iterrows():

    #get the user and item indices
    user_index = user_to_index[row['UserId']]

    if  row['ItemId'] in item_to_index:
        item_index = item_to_index[row['ItemId']]
    
        #predict the rating by taking the dot product of the user and item vectors and adding the user and item biases and the ratings mean
        predicted_rating = np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean
    
    else:
        predicted_rating = 6
    #if the predicted rating is less than 1, set it to 1
    if predicted_rating < 1:
        predicted_rating = 1

    df_targets.at[_, 'Rating'] = predicted_rating

df

Unnamed: 0,UserId,ItemId,Rating
0,0006246bee,01d2404d4c,6.947333
1,0006246bee,03d43fdf92,6.545059
2,0006246bee,0808a9666b,7.010156
3,0006246bee,0a5d7dd6f6,7.263846
4,0006246bee,0bab4a8104,6.366271


In [11]:
#by user id, sort the predicted ratings in descending order
df_targets.sort_values(by=['UserId', 'Rating'], ascending=[True, False], inplace=True)
df_targets.tail()

Unnamed: 0,UserId,ItemId,Rating
616189,fffffe98d0,e5076931c7,5.550633
616194,fffffe98d0,f06dda3e12,5.48375
616105,fffffe98d0,0a86517759,5.414629
616180,fffffe98d0,c3a34bf48a,5.366539
616151,fffffe98d0,7479705807,4.801435


In [15]:
#put the predicted ratings in a csv file, called submission.csv
df_targets.drop(columns =["Rating"], inplace = True)
df_targets.to_csv('submission.csv', index=False)
