In [2]:
random_state = 42
import numpy as np
np.random.seed(random_state)
import torch
torch.manual_seed(random_state)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

number_of_users, number_of_movies = (10000, 1000)

data_pd = pd.read_csv('../../data/data_train.csv')

train_size = 0.9

train_pd, test_pd = train_test_split(data_pd, train_size=train_size, random_state=random_state)

def extract_users_items_labels(data_pd):
    users, movies = \
        [np.squeeze(arr) for arr in np.split(data_pd.Id.str.extract('r(\d+)_c(\d+)').values.astype(int) - 1, 2, axis=-1)]
    labels = data_pd.Prediction.values
    return users, movies, labels

train_users, train_movies, train_labels = extract_users_items_labels(train_pd)
test_users, test_movies, test_labels = extract_users_items_labels(test_pd)

movies_rated_by_user_u = {}
for train_user, train_movie in zip(train_users, train_movies):
    if train_user in movies_rated_by_user_u.keys():
        movies_rated_by_user_u[train_user].append(train_movie)
    else:
        movies_rated_by_user_u[train_user] = [train_movie]
maximum_number_of_ratings_per_user = max([len(movies) for user, movies in movies_rated_by_user_u.items()])

train_is_known_user = np.ones(train_users.shape[0]).reshape(-1, 1)
train_is_known_movie = np.ones(train_movies.shape[0]).reshape(-1, 1)

test_is_known_user = np.zeros(test_users.shape[0])
test_is_known_movie = np.zeros(test_movies.shape[0])

train_users_frozenset = frozenset(train_users)
train_movies_frozenset = frozenset(train_movies)

test_is_known_user = np.array([1.0 if user in train_users_frozenset else 0.0 for user in test_users]).reshape(-1, 1)
test_is_known_movie = np.array([1.0 if movie in train_movies_frozenset else 0.0 for movie in test_movies]).reshape(-1, 1)

train_movies_rated_by_this_user = np.zeros((train_users.shape[0], maximum_number_of_ratings_per_user))
test_movies_rated_by_this_user = np.zeros((test_users.shape[0], maximum_number_of_ratings_per_user))

train_sqrt_of_number_of_movies_rated_by_this_user = np.zeros(train_users.shape[0]).reshape(-1, 1)
test_sqrt_of_number_of_movies_rated_by_this_user = np.zeros(test_users.shape[0]).reshape(-1, 1)

for index, train_user in enumerate(train_users):
    movies_rated_by_this_user = movies_rated_by_user_u[train_user]
    train_movies_rated_by_this_user[index, :len(movies_rated_by_this_user)] = np.array(movies_rated_by_this_user) + 1
    train_sqrt_of_number_of_movies_rated_by_this_user[index, 0] = np.sqrt(len(movies_rated_by_this_user))

for index, test_user in enumerate(test_users):
    movies_rated_by_this_user = movies_rated_by_user_u[test_user]
    test_movies_rated_by_this_user[index, :len(movies_rated_by_this_user)] = np.array(movies_rated_by_this_user) + 1
    test_sqrt_of_number_of_movies_rated_by_this_user[index, 0] = np.sqrt(len(movies_rated_by_this_user))

rmse = lambda x, y: np.sqrt(mean_squared_error(x, y))

def extract_prediction_from_full_matrix(reconstructed_matrix, users=test_users, movies=test_movies):
    # returns predictions for the users-movies combinations specified based on a full m \times n matrix
    assert(len(users) == len(movies)), "users-movies combinations specified should have equal length"
    predictions = np.zeros(len(test_users))

    for i, (user, movie) in enumerate(zip(users, movies)):
        predictions[i] = reconstructed_matrix[user][movie]

    return predictions


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm.notebook import tqdm

#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Using device:', device)

# Parameters
batch_size = 64
num_epochs = 100
show_validation_score_every_epochs = 1
embedding_size = 200
learning_rate = 7e-4
weight_decay = 7e-5
mean_init = 0.2
std_init = 0.001

class SVDPP(nn.Module):
    def __init__(self, number_of_users, number_of_movies, embedding_size, global_mean, mean_init, std_init):
        super().__init__()
        self.Bu = nn.Embedding(number_of_users, 1)
        nn.init.normal_(self.Bu.weight, mean=mean_init, std=std_init)
        self.Bi = nn.Embedding(number_of_movies, 1)
        nn.init.normal_(self.Bu.weight, mean=mean_init, std=std_init)
        self.P = nn.Embedding(number_of_users, embedding_size)
        nn.init.normal_(self.P.weight, mean=mean_init, std=std_init)
        self.Q = nn.Embedding(number_of_movies, embedding_size)
        nn.init.normal_(self.Q.weight, mean=mean_init, std=std_init)
        self.Y = nn.Embedding(number_of_movies + 1, embedding_size, padding_idx=0) # Made this 1-indexed to save memory in GPU. (To pad movies_rated_by_this_user with zeros.)
        nn.init.normal_(self.Y.weight, mean=mean_init, std=std_init)
        self.global_mean = torch.tensor(global_mean, requires_grad=False)

    def forward(self, users, movies, is_known_user, is_known_movie, movies_rated_by_this_user, sqrt_of_number_of_movies_rated_by_this_user):
        if self.training:
            bu = self.Bu(users)
            bi = self.Bi(movies)
            gm = self.global_mean
            p = self.P(users)
            q = self.Q(movies)
            y = self.Y(movies_rated_by_this_user).sum(dim=1).div(sqrt_of_number_of_movies_rated_by_this_user)
            result = q.mul(p+y).sum(dim=1) + gm + torch.squeeze(bi) + torch.squeeze(bu)
            return result
        else:
            bu = is_known_user * self.Bu(users)
            bi = is_known_movie * self.Bi(movies)
            gm = self.global_mean
            p = is_known_user * self.P(users)
            q = is_known_movie * self.Q(movies)
            y = is_known_user * is_known_movie * self.Y(movies_rated_by_this_user).sum(dim=1).div(sqrt_of_number_of_movies_rated_by_this_user)
            result = q.mul(p+y).sum(dim=1) + gm + torch.squeeze(bi) + torch.squeeze(bu)
            return result



Using device: cuda


In [4]:
def mse_loss(predictions, labels):
    return torch.mean((predictions - labels) ** 2)

# Build Dataloaders
train_users_torch = torch.tensor(train_users, device=device)
train_movies_torch = torch.tensor(train_movies, device=device)
train_labels_torch = torch.tensor(train_labels, device=device)
train_is_known_user_torch = torch.tensor(train_is_known_user, device=device)
train_is_known_movie_torch = torch.tensor(train_is_known_movie, device=device)
train_movies_rated_by_this_user_torch = torch.tensor(train_movies_rated_by_this_user, device=device, dtype=torch.int64)
train_sqrt_of_number_of_movies_rated_by_this_user_torch = torch.tensor(train_sqrt_of_number_of_movies_rated_by_this_user, device=device)

train_dataloader = DataLoader(
    TensorDataset(train_users_torch, train_movies_torch, train_labels_torch, train_is_known_user_torch, train_is_known_movie_torch, train_movies_rated_by_this_user_torch, train_sqrt_of_number_of_movies_rated_by_this_user_torch),
    batch_size=batch_size)

test_users_torch = torch.tensor(test_users, device=device)
test_movies_torch = torch.tensor(test_movies, device=device)
test_is_known_user_torch = torch.tensor(test_is_known_user, device=device)
test_is_known_movie_torch = torch.tensor(test_is_known_movie, device=device)
test_movies_rated_by_this_user_torch = torch.tensor(test_movies_rated_by_this_user, device=device, dtype=torch.int64)
test_sqrt_of_number_of_movies_rated_by_this_user_torch = torch.tensor(test_sqrt_of_number_of_movies_rated_by_this_user, device=device)

test_dataloader = DataLoader(
    TensorDataset(test_users_torch, test_movies_torch, test_is_known_user_torch, test_is_known_movie_torch, test_movies_rated_by_this_user_torch, test_sqrt_of_number_of_movies_rated_by_this_user_torch),
    batch_size=batch_size)

global_mean = np.mean(train_labels)

model = SVDPP(number_of_users, number_of_movies, embedding_size, global_mean, mean_init, std_init).to(device)

optimizer = optim.Adam(model.parameters(),
                       lr=learning_rate,
                       weight_decay=weight_decay)

train_rmse_values = []
test_rmse_values = []
step = 0
with tqdm(total=len(train_dataloader) * num_epochs) as pbar:
    for epoch in range(num_epochs):
        for users_batch, movies_batch, labels_batch, is_known_user_batch, is_known_movie_batch, movies_rated_by_this_user_batch, sqrt_of_number_of_movies_rated_by_this_user_batch in train_dataloader:
            optimizer.zero_grad()

            predictions_batch = model(users_batch, movies_batch, is_known_user_batch, is_known_movie_batch, movies_rated_by_this_user_batch, sqrt_of_number_of_movies_rated_by_this_user_batch)

            loss = mse_loss(predictions_batch, labels_batch)

            loss.backward()

            optimizer.step()

            pbar.update(1)

            step += 1

        if epoch % show_validation_score_every_epochs == 0:
            model.eval()
            
            with torch.no_grad():
                all_train_predictions = []
                for users_batch, movies_batch, _, is_known_user_batch, is_known_movie_batch, movies_rated_by_this_user_batch, sqrt_of_number_of_movies_rated_by_this_user_batch in train_dataloader:
                    predictions_batch = model(users_batch, movies_batch, is_known_user_batch, is_known_movie_batch, movies_rated_by_this_user_batch, sqrt_of_number_of_movies_rated_by_this_user_batch)
                    all_train_predictions.extend(predictions_batch.detach().cpu().numpy().tolist())

                all_test_predictions = []
                for users_batch, movies_batch, is_known_user_batch, is_known_movie_batch, movies_rated_by_this_user_batch, sqrt_of_number_of_movies_rated_by_this_user_batch in test_dataloader:
                    predictions_batch = model(users_batch, movies_batch, is_known_user_batch, is_known_movie_batch, movies_rated_by_this_user_batch, sqrt_of_number_of_movies_rated_by_this_user_batch)
                    all_test_predictions.extend(predictions_batch.detach().cpu().numpy().tolist())

            train_rmse = rmse(train_labels, all_train_predictions)
            test_rmse = rmse(test_labels, all_test_predictions)
            print('Epoch: {:3d}, Train RMSE: {:.4f}, Test RMSE: {:.4f}'.format(epoch, train_rmse, test_rmse))
            train_rmse_values.append(train_rmse)
            test_rmse_values.append(test_rmse)

            model.train()


HBox(children=(FloatProgress(value=0.0, max=1655100.0), HTML(value='')))

Epoch:   0, Train RMSE: 1.0675, Test RMSE: 1.0752
Epoch:   1, Train RMSE: 0.9833, Test RMSE: 0.9992
Epoch:   2, Train RMSE: 0.9678, Test RMSE: 0.9906
Epoch:   3, Train RMSE: 0.9598, Test RMSE: 0.9871
Epoch:   4, Train RMSE: 0.9542, Test RMSE: 0.9850
Epoch:   5, Train RMSE: 0.9504, Test RMSE: 0.9838
Epoch:   6, Train RMSE: 0.9475, Test RMSE: 0.9829
Epoch:   7, Train RMSE: 0.9451, Test RMSE: 0.9824
Epoch:   8, Train RMSE: 0.9434, Test RMSE: 0.9820
Epoch:   9, Train RMSE: 0.9419, Test RMSE: 0.9819
Epoch:  10, Train RMSE: 0.9405, Test RMSE: 0.9817
Epoch:  11, Train RMSE: 0.9392, Test RMSE: 0.9814
Epoch:  12, Train RMSE: 0.9379, Test RMSE: 0.9812
Epoch:  13, Train RMSE: 0.9369, Test RMSE: 0.9811
Epoch:  14, Train RMSE: 0.9361, Test RMSE: 0.9812
Epoch:  15, Train RMSE: 0.9353, Test RMSE: 0.9810
Epoch:  16, Train RMSE: 0.9346, Test RMSE: 0.9809
Epoch:  17, Train RMSE: 0.9339, Test RMSE: 0.9808
Epoch:  18, Train RMSE: 0.9335, Test RMSE: 0.9810
Epoch:  19, Train RMSE: 0.9331, Test RMSE: 0.9810


# Ideas

1) Use different regularization techniques other than L2. e.g. set prior on latent vectors and add KL divergence to loss

2) Make use of nonlinearity

3) Instead of dividing by np.sqrt(len(number_of_rates)), learn this function

4) Add additional features: Number of movies this user watched, number of users this movie was watched by, clustering features, each user's frequencies for different ratings, each movies frequencies for different ratings

5) Add implicit features for movies.

6) Try classification
