In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

number_of_users, number_of_movies = (10000, 1000)

data_pd = pd.read_csv('../../data/data_train.csv')

train_size = 0.9

train_pd, test_pd = train_test_split(data_pd, train_size=train_size, random_state=42)

def extract_users_items_labels(data_pd):
    users, movies = \
        [np.squeeze(arr) for arr in np.split(data_pd.Id.str.extract('r(\d+)_c(\d+)').values.astype(int) - 1, 2, axis=-1)]
    labels = data_pd.Prediction.values
    return users, movies, labels

train_users, train_movies, train_labels = extract_users_items_labels(train_pd)

movies_rated_by_user_u = {}
for train_user, train_movie in zip(train_users, train_movies):
    if train_user in movies_rated_by_user_u.keys():
        movies_rated_by_user_u[train_user].append(train_movie)
    else:
        movies_rated_by_user_u[train_user] = [train_movie]

test_users, test_movies, test_labels = extract_users_items_labels(test_pd)

rmse = lambda x, y: math.sqrt(mean_squared_error(x, y))

def extract_prediction_from_full_matrix(reconstructed_matrix, users=test_users, movies=test_movies):
    # returns predictions for the users-movies combinations specified based on a full m \times n matrix
    assert(len(users) == len(movies)), "users-movies combinations specified should have equal length"
    predictions = np.zeros(len(test_users))

    for i, (user, movie) in enumerate(zip(users, movies)):
        predictions[i] = reconstructed_matrix[user][movie]

    return predictions


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm.notebook import tqdm

#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Using device:', device)

# Parameters
batch_size = 64
num_epochs = 100
show_validation_score_every_epochs = 1
embedding_size = 200
learning_rate = 7e-4
weight_decay = 7e-5
mean_init = 0.2
std_init = 0.001

class SVDPP(nn.Module):
    def __init__(self, number_of_users, number_of_movies, embedding_size, global_mean, is_train_user_arr, is_train_movie_arr, movies_rated_by_user_u, mean_init, std_init):
        super().__init__()
        self.Bu = nn.Embedding(number_of_users, 1)
        nn.init.normal_(self.Bu.weight, mean=mean_init, std=std_init)
        self.Bi = nn.Embedding(number_of_movies, 1)
        nn.init.normal_(self.Bu.weight, mean=mean_init, std=std_init)
        self.P = nn.Embedding(number_of_users, embedding_size)
        nn.init.normal_(self.P.weight, mean=mean_init, std=std_init)
        self.Q = nn.Embedding(number_of_movies, embedding_size)
        nn.init.normal_(self.Q.weight, mean=mean_init, std=std_init)
        self.Y = nn.Embedding(number_of_movies, embedding_size)
        nn.init.normal_(self.Y.weight, mean=mean_init, std=std_init)
        self.global_mean = torch.tensor(global_mean, requires_grad=False)
        self.mask_unknown_users = nn.Embedding.from_pretrained(torch.FloatTensor(is_train_user_arr), freeze=True)
        self.mask_unknown_movies = nn.Embedding.from_pretrained(torch.FloatTensor(is_train_movie_arr), freeze=True)
        self.movies_rated_by_user_u = movies_rated_by_user_u

    def get_y(self, users):
        js_list = [self.movies_rated_by_user_u[u] for u in users.cpu().numpy()]
        js_lengths = [len(js) for js in js_list]
        js_list_concatted = np.hstack(tuple(js_list))
        temp = self.Y(torch.tensor(js_list_concatted, device=device))
        last_index = 0
        y = []
        for js_length in js_lengths:
            current_tensor = temp[last_index:last_index+js_length, :].sum(dim=0)
            current_tensor = current_tensor.div(np.sqrt(js_length))
            y.append(current_tensor)
            last_index += js_length
        y = torch.stack(y, dim=0)
        return y

    def forward(self, users, movies):
        if self.training:
            bu = self.Bu(users)
            bi = self.Bi(movies)
            gm = self.global_mean
            p = self.P(users)
            q = self.Q(movies)
            y = self.get_y(users)
            result = q.mul(p+y).sum(dim=1) + gm + torch.squeeze(bi) + torch.squeeze(bu)
            return result
        else:
            users_mask = self.mask_unknown_users(users)
            movies_mask = self.mask_unknown_movies(movies)
            bu = users_mask * self.Bu(users)
            bi = movies_mask * self.Bi(movies)
            gm = self.global_mean
            p = users_mask * self.P(users)
            q = movies_mask * self.Q(movies)
            y = movies_mask * users_mask * self.get_y(users)
            result = q.mul(p+y).sum(dim=1) + gm + torch.squeeze(bi) + torch.squeeze(bu)
            return result


Using device: cuda


In [3]:
def mse_loss(predictions, labels):
    return torch.mean((predictions - labels) ** 2)

# Build Dataloaders
train_users_torch = torch.tensor(train_users, device=device)
train_movies_torch = torch.tensor(train_movies, device=device)
train_labels_torch = torch.tensor(train_labels, device=device)

train_dataloader = DataLoader(
    TensorDataset(train_users_torch, train_movies_torch, train_labels_torch),
    batch_size=batch_size)

test_users_torch = torch.tensor(test_users, device=device)
test_movies_torch = torch.tensor(test_movies, device=device)

test_dataloader = DataLoader(
    TensorDataset(test_users_torch, test_movies_torch),
    batch_size=batch_size)

global_mean = np.mean(train_labels)

is_train_user_arr = np.zeros((number_of_users, 1))
is_train_user_arr[train_users, 0] = 1.0

is_train_movie_arr = np.zeros((number_of_movies, 1))
is_train_movie_arr[train_movies, 0] = 1.0

model = SVDPP(number_of_users, number_of_movies, embedding_size, global_mean, is_train_user_arr, is_train_movie_arr, movies_rated_by_user_u, mean_init, std_init).to(device)

optimizer = optim.Adam(model.parameters(),
                       lr=learning_rate,
                       weight_decay=weight_decay)

train_rmse_values = []
test_rmse_values = []
step = 0
with tqdm(total=len(train_dataloader) * num_epochs) as pbar:
    for epoch in range(num_epochs):
        for users_batch, movies_batch, labels_batch in train_dataloader:
            optimizer.zero_grad()

            predictions_batch = model(users_batch, movies_batch)

            loss = mse_loss(predictions_batch, labels_batch)

            loss.backward()

            optimizer.step()

            pbar.update(1)

            step += 1

        if epoch % show_validation_score_every_epochs == 0:
            model.eval()
            
            with torch.no_grad():
                all_train_predictions = []
                for users_batch, movies_batch, _ in train_dataloader:
                    predictions_batch = model(users_batch, movies_batch)
                    all_train_predictions.extend(predictions_batch.detach().cpu().numpy().tolist())

                all_test_predictions = []
                for users_batch, movies_batch in test_dataloader:
                    predictions_batch = model(users_batch, movies_batch)
                    all_test_predictions.extend(predictions_batch.detach().cpu().numpy().tolist())

            train_rmse = rmse(train_labels, all_train_predictions)
            test_rmse = rmse(test_labels, all_test_predictions)
            print('Epoch: {:3d}, Train RMSE: {:.4f}, Test RMSE: {:.4f}'.format(epoch, train_rmse, test_rmse))
            train_rmse_values.append(train_rmse)
            test_rmse_values.append(test_rmse)

            model.train()


HBox(children=(FloatProgress(value=0.0, max=1655100.0), HTML(value='')))

Epoch:   0, Train RMSE: 1.0238, Test RMSE: 1.0326
Epoch:   1, Train RMSE: 0.9782, Test RMSE: 0.9966
Epoch:   2, Train RMSE: 0.9659, Test RMSE: 0.9901
Epoch:   3, Train RMSE: 0.9585, Test RMSE: 0.9872
Epoch:   4, Train RMSE: 0.9535, Test RMSE: 0.9853
Epoch:   5, Train RMSE: 0.9497, Test RMSE: 0.9839
Epoch:   6, Train RMSE: 0.9468, Test RMSE: 0.9830
Epoch:   7, Train RMSE: 0.9445, Test RMSE: 0.9823
Epoch:   8, Train RMSE: 0.9426, Test RMSE: 0.9819
Epoch:   9, Train RMSE: 0.9411, Test RMSE: 0.9816
Epoch:  10, Train RMSE: 0.9396, Test RMSE: 0.9814
Epoch:  11, Train RMSE: 0.9383, Test RMSE: 0.9812
Epoch:  12, Train RMSE: 0.9370, Test RMSE: 0.9810
Epoch:  13, Train RMSE: 0.9360, Test RMSE: 0.9810
Epoch:  14, Train RMSE: 0.9351, Test RMSE: 0.9809
Epoch:  15, Train RMSE: 0.9344, Test RMSE: 0.9809
Epoch:  16, Train RMSE: 0.9337, Test RMSE: 0.9809
Epoch:  17, Train RMSE: 0.9331, Test RMSE: 0.9808
Epoch:  18, Train RMSE: 0.9326, Test RMSE: 0.9808
Epoch:  19, Train RMSE: 0.9324, Test RMSE: 0.9808


# Ideas

1) Use different regularization techniques other than L2. e.g. set prior on latent vectors and add KL divergence to loss

2) Make use of nonlinearity

3) Instead of dividing by np.sqrt(len(number_of_rates)), learn this function

4) Add additional features: Number of movies this user watched, number of users this movie was watched by, clustering features, each user's frequencies for different ratings, each movies frequencies for different ratings

5) Add implicit features for movies.
