In [124]:
import pandas as pd
import numpy as np

df = pd.read_json('ratings.jsonl', lines=True)
df.head()
df.tail()

Unnamed: 0,UserId,ItemId,Timestamp,Rating
659715,3d7e93cbd0,450f41b5d3,2017-02-18 00:10:19,7
659716,3d7e93cbd0,80d1dae630,2017-06-09 22:46:29,7
659717,7804b284a3,0759d2567b,2014-12-28 04:43:49,4
659718,6648728db7,28fb7af42b,2013-06-03 00:57:27,1
659719,e7293de34b,f72dcb10e8,2013-03-09 13:37:35,6


In [125]:
df.drop(columns =["Timestamp"], inplace = True)
df.head()

Unnamed: 0,UserId,ItemId,Rating
0,c4ca4238a0,91766eac45,8
1,c81e728d9d,5c739554f7,9
2,c81e728d9d,48f6d7ce7c,8
3,c81e728d9d,e9318d627a,1
4,a87ff679a2,17e6357973,8


In [126]:
#unique users
unique_users = df['UserId'].unique()
print(len(unique_users))
#unique items
unique_items = df['ItemId'].unique()
print(len(unique_items))

ratings = df['Rating'].values

user_to_index = {user: i for i, user in enumerate(unique_users)}
item_to_index = {item: i for i, item in enumerate(unique_items)}

#vectorize unique users and items
user_indices = df['UserId'].map(user_to_index).values
item_indices = df['ItemId'].map(item_to_index).values   


51671
29674


In [127]:
#hyperparameters
learning_rate = 0.0095 #Learning rate
num_epochs = 19        #Number of epochs
num_factors = 15       #Number of latent factors
lambda_l2 = 0.095      #Regularization parameter for L2
lambda_l1 = 0.01       #Regularization parameter for L1


#number of unique users and items
num_users = len(unique_users)
num_items = len(unique_items)

#initialize user and item matrices with a uniform distribution based on the xavier initialization
np.random.seed(12)
user_matrix = np.random.uniform(-np.sqrt(6 / (num_users + num_factors)), np.sqrt(6 / (num_users + num_factors)), size=(num_users, num_factors))
item_matrix = np.random.uniform(-np.sqrt(6 / (num_items + num_factors)), np.sqrt(6 / (num_items + num_factors)), size=(num_items, num_factors))

#dataset ratings mean
ratings_mean = np.mean(ratings)

#initialize user and item biases as an array of zeros
user_bias = np.zeros(num_users)
item_bias = np.zeros(num_items)

#batch size 
batch_size = 400 

#number of batches
batch_num = int(len(ratings) / batch_size) 

#training loop, for each epoch, shuffle the ratings
for epoch in range(num_epochs):
    
    #shuffle the ratings, to avoid overfitting
    shuffled = np.random.choice(len(ratings), len(ratings), replace=False)

    #for each batch, update the user and item matrices and biases
    for batch in range(batch_num):

        #get the batch indices, by taking the batch size and multiplying it by the batch number
        batch_indices = shuffled[batch * batch_size: (batch + 1) * batch_size]
        
        #get the user and item indices and ratings for the batch by taking the batch indices
        user_batch = user_indices[batch_indices]
        item_batch = item_indices[batch_indices]
        rating_batch = ratings[batch_indices]

        #get the user and item matrices for the batch
        user_batch_matrix = user_matrix[user_batch, :]
        item_batch_matrix = item_matrix[item_batch, :]
        
        #get the dot product of the user and item matrices, to get the predicted ratings
        product = np.sum(user_batch_matrix * item_batch_matrix, axis=1)

        #get the predicted ratings by adding the user and item biases and the ratings mean to the dot product
        rating_hat = product + user_bias[user_batch] + item_bias[item_batch] + ratings_mean

        #calculate the error by subtracting the predicted ratings from the actual ratings
        error = rating_batch - rating_hat

        #calculate the gradients for the user and item biases, by multiplying the learning rate by the error, adding the regularization terms and the biases
        user_bias[user_batch] += learning_rate * (error - lambda_l2 * user_bias[user_batch] - lambda_l1 * np.sign(user_bias[user_batch]))
        item_bias[item_batch] += learning_rate * (error - lambda_l2 * item_bias[item_batch] - lambda_l1 * np.sign(item_bias[item_batch]))

        #calculate the gradients for the user and item matrices, by multiplying the learning rate by the error, adding the regularization terms and the matrices
        user_matrix[user_batch, :] += learning_rate * (error[:, np.newaxis] * item_matrix[item_batch, :] - lambda_l2 * user_matrix[user_batch, :] - lambda_l1 * np.sign(user_matrix[user_batch, :]))
        item_matrix[item_batch, :] += learning_rate * (error[:, np.newaxis] * user_matrix[user_batch, :] - lambda_l2 * item_matrix[item_batch, :] - lambda_l1 * np.sign(item_matrix[item_batch, :]))


In [146]:
#read targets.csv
df_targets = pd.read_csv("targets.csv")

In [147]:
contentDf = pd.read_json('content.jsonl', lines=True)

imdbRating = contentDf[['ItemId', 'imdbRating']]

df_targets = pd.merge(df_targets, imdbRating, on='ItemId')

imdbVotes = contentDf[['ItemId', 'imdbVotes']]

#tirar , de imdbVotes
imdbVotes['imdbVotes'] = imdbVotes['imdbVotes'].str.replace(',', '')

df_targets = pd.merge(df_targets, imdbVotes, on='ItemId')

# Metascore = contentDf[['ItemId', 'Metascore']]


# df_targets = pd.merge(df_targets, Metascore, on='ItemId')


df_targets.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdbVotes['imdbVotes'] = imdbVotes['imdbVotes'].str.replace(',', '')


Unnamed: 0,UserId,ItemId,imdbRating,imdbVotes
616195,dc4a4d9c5a,06f4f4fd71,8.0,12359
616196,e9e133003c,06f4f4fd71,8.0,12359
616197,b097988cc1,6026d6b5e8,6.7,4448
616198,be19050432,6026d6b5e8,6.7,4448
616199,cea09ffe70,6026d6b5e8,6.7,4448


In [130]:
# # "Ratings": [{"Source": "Internet Movie Database", "Value": "6.7/10"}, {"Source": "Rotten Tomatoes", "Value": "12%"}, {"Source": "Metacritic", "Value": "29/100"}]
# #i want to get a mean of this ratings in 10 scale, keeping in mind maybe theres "N/A" values

# ratingsEnsemble = contentDf[['ItemId', 'Ratings']]


# # Função para normalizar os ratings para uma escala de 0 a 10
# def normalize_rating(value):
#     if '/' in value:
#         numerator, denominator = value.split('/')
#         return float(numerator) / float(denominator) * 10
#     elif '%' in value:
#         return float(value.strip('%')) / 10
#     else:
#         return None

# # Função para calcular a média dos ratings de uma linha
# def calculate_average_ratings(ratings):
#     normalized_ratings = []
#     for rating in ratings:
#         normalized_value = normalize_rating(rating['Value'])
#         if normalized_value is not None:
#             normalized_ratings.append(normalized_value)

#     if normalized_ratings:
#         return sum(normalized_ratings) / len(normalized_ratings)
#     else:
#         return None

# # Supondo que você já tenha o DataFrame 'ratingsEnsemble'
# # Vamos aplicar a função a cada linha do DataFrame
# ratingsEnsemble['AverageRating'] = ratingsEnsemble['Ratings'].apply(calculate_average_ratings)


# ratingsEnsemble.drop(columns =["Ratings"], inplace = True)
# # Exibir as primeiras linhas para verificação

# df_targets = pd.merge(df_targets, ratingsEnsemble, on='ItemId')

# df_targets.tail()



In [131]:

# #create new column fro the rating predictions
# df_targets['Rating'] = 0


# #for each pair (user,item) in the targets.csv file, predict the rating and print it
# for _, row in df_targets.iterrows():

#     #get the user and item indices
#     user_index = user_to_index[row['UserId']]

#     if  row['ItemId'] in item_to_index:
#         item_index = item_to_index[row['ItemId']]
    
#         if row['AverageRating'] != None:
#             # if row['Metascore'] != "N/A":
#             #     predicted_rating = (3 * (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) + float(row['imdbRating']) + float(row['Metascore']) / 10) / 5
#             # else:
#                 predicted_rating = ((np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) + float(row['AverageRating'])) / 2
          
#         else:
#             # if row['Metascore'] != "N/A":
#             #     predicted_rating = (2 * (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) + float(row['Metascore']) / 10) / 3
#             # else:
#                 predicted_rating = (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean)
    
#     else:
#         if row['AverageRating'] != None:
#             predicted_rating = float(row['AverageRating'])
#         # elif row['Metascore'] != "N/A":
#         #     predicted_rating = float(row['Metascore']) / 10
#         else:
#             predicted_rating = 6
#     #if the predicted rating is less than 1, set it to 1
#     if float(predicted_rating) < 1:
#         predicted_rating = 1

#     df_targets.at[_, 'Rating'] = predicted_rating

# df

In [148]:

#create new column fro the rating predictions
df_targets['Rating'] = 0


#for each pair (user,item) in the targets.csv file, predict the rating and print it
for _, row in df_targets.iterrows():

    #get the user and item indices
    user_index = user_to_index[row['UserId']]

    if  row['ItemId'] in item_to_index:
        item_index = item_to_index[row['ItemId']]
    
        if row['imdbRating'] != "N/A":
            # if row['imdbVotes'] != "N/A":
            #     predicted_rating = (2 * (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) + float(row['imdbRating']) + float(row['imdbVotes']) ) / 4
            # else:
                predicted_rating = (2 * (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) + float(row['imdbRating'])) / 3
          
        else:
            # if row['imdbVotes'] != "N/A":
            #     predicted_rating = (2 * (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) + float(row['imdbVotes'])) / 3
            # else:
                predicted_rating = (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean)
    
    else:
        if row['imdbRating'] != "N/A":
            predicted_rating = float(row['imdbRating'])
        # elif row['imdbVotes'] != "N/A":
        #     predicted_rating = float(row['imdbVotes'])
        else:
            predicted_rating = 6
    #if the predicted rating is less than 1, set it to 1
    if float(predicted_rating) < 1:
        predicted_rating = 1

    df_targets.at[_, 'Rating'] = predicted_rating

df

Unnamed: 0,UserId,ItemId,Rating
0,c4ca4238a0,91766eac45,8
1,c81e728d9d,5c739554f7,9
2,c81e728d9d,48f6d7ce7c,8
3,c81e728d9d,e9318d627a,1
4,a87ff679a2,17e6357973,8
...,...,...,...
659715,3d7e93cbd0,450f41b5d3,7
659716,3d7e93cbd0,80d1dae630,7
659717,7804b284a3,0759d2567b,4
659718,6648728db7,28fb7af42b,1


In [None]:

# #create new column fro the rating predictions
# df_targets['Rating'] = 0


# #for each pair (user,item) in the targets.csv file, predict the rating and print it
# for _, row in df_targets.iterrows():

#     #get the user and item indices
#     user_index = user_to_index[row['UserId']]

#     if  row['ItemId'] in item_to_index:
#         item_index = item_to_index[row['ItemId']]
    
#         if row['imdbRating'] != "N/A":
#             if row['imdbVotes'] != "N/A":
#                 predicted_rating = (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) * float(row['imdbRating']) * float(row['imdbVotes']) 
#             else:
#                 predicted_rating = (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) * float(row['imdbRating'])
          
#         else:
#             if row['imdbVotes'] != "N/A":
#                 predicted_rating = (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean) * float(row['imdbVotes'])
#             else:
#                 predicted_rating = (np.dot(user_matrix[user_index, :], item_matrix[item_index, :].T) + user_bias[user_index] + item_bias[item_index] + ratings_mean)
    
#     else:
#         if row['imdbRating'] != "N/A":
#             predicted_rating = float(row['imdbRating'])
#         elif row['imdbVotes'] != "N/A":
#             predicted_rating = float(row['imdbVotes'])
#         else:
#             predicted_rating = 6
#     #if the predicted rating is less than 1, set it to 1
#     if float(predicted_rating) < 1:
#         predicted_rating = 1

#     df_targets.at[_, 'Rating'] = predicted_rating
# df

In [150]:
#by user id, sort the predicted ratings in descending order
df_targets.sort_values(by=['UserId', 'Rating'], ascending=[True, False], inplace=True)
df_targets.head()

Unnamed: 0,UserId,ItemId,imdbRating,imdbVotes,Rating
204,0006246bee,1e5bdbcb76,8.1,475339,30527480.0
2434,0006246bee,c1ee6829f5,8.0,430209,26421170.0
2813,0006246bee,dcba99a1a6,7.7,325573,18063610.0
1325,0006246bee,80d1dae630,6.6,219796,9215517.0
2173,0006246bee,aad36aac60,7.2,190857,9186862.0


In [151]:
#put the predicted ratings in a csv file, called submission.csv
df_targets.drop(columns =["Rating", 'imdbRating', 'imdbVotes'], inplace = True)
df_targets.to_csv('submission.csv', index=False)
