In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Dimensions of the full user x movie rating matrix.
number_of_users = 10000
number_of_movies = 1000

# Ratings table; its `Id` and `Prediction` columns are read further down.
data_pd = pd.read_csv('../../data/data_train.csv')

# Fraction of the rows used for training; the remainder is held out.
train_size = 0.9

# A fixed random_state keeps the split identical from run to run.
train_pd, test_pd = train_test_split(
    data_pd,
    train_size=train_size,
    random_state=42,
)

def extract_users_items_predictions(data_pd):
    """Split a ratings table into zero-based user indices, movie indices and ratings.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Table with an ``Id`` column of strings containing ``r<row>_c<col>``
        (1-based numbers) and a ``Prediction`` column.

    Returns
    -------
    users : numpy.ndarray
        1-D integer array holding the ``r`` number of every row, minus one.
    movies : numpy.ndarray
        1-D integer array holding the ``c`` number of every row, minus one.
    predictions : numpy.ndarray
        Values of the ``Prediction`` column, in the same row order.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    row_col = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Take the two columns directly instead of split + squeeze, so that a
    # single-row table still yields 1-D arrays rather than 0-d scalars.
    users, movies = row_col[:, 0], row_col[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

train_users, train_movies, train_predictions = extract_users_items_predictions(train_pd)

# For every user present in the training split, collect the movies they rated
# (zero-based indices, in the order the ratings appear in train_pd).
movies_rated_by_user_u = {}
for train_user, train_movie in zip(train_users, train_movies):
    movies_rated_by_user_u.setdefault(train_user, []).append(train_movie)

test_users, test_movies, test_predictions = extract_users_items_predictions(test_pd)

def rmse(x, y):
    """Square root of the mean squared error between ``x`` and ``y``."""
    return math.sqrt(mean_squared_error(x, y))


def get_score(predictions, target_values=test_predictions):
    """Score ``predictions`` by RMSE against ``target_values``.

    ``target_values`` defaults to the held-out ratings ``test_predictions``
    as they were bound when this function was defined.
    """
    return rmse(predictions, target_values)

def extract_prediction_from_full_matrix(reconstructed_matrix, users=test_users, movies=test_movies):
    """Read the predictions for given (user, movie) pairs out of a full matrix.

    Parameters
    ----------
    reconstructed_matrix
        Full m x n matrix of predicted ratings, indexable as
        ``reconstructed_matrix[user][movie]``.
    users, movies
        Equal-length sequences of row and column indices; pair ``i`` is
        ``(users[i], movies[i])``. They default to the held-out pairs
        ``test_users`` / ``test_movies``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one prediction per requested pair, in order.

    Raises
    ------
    AssertionError
        If ``users`` and ``movies`` differ in length.
    """
    assert(len(users) == len(movies)), "users-movies combinations specified should have equal length"
    # Size the output by the pairs actually requested, not by the global test
    # set, so that custom ``users`` / ``movies`` of any length work.
    predictions = np.zeros(len(users))

    for i, (user, movie) in enumerate(zip(users, movies)):
        predictions[i] = reconstructed_matrix[user][movie]

    return predictions

# Dense user x movie matrix of ratings: observed training entries hold their
# rating, every other cell is filled with the mean training rating.
mean_train_rating = np.mean(train_pd.Prediction.values)
data = np.full((number_of_users, number_of_movies), mean_train_rating)
# 0 -> unobserved value, 1 -> observed value
mask = np.zeros((number_of_users, number_of_movies))

# Write all observed entries at once via paired integer-array indexing.
data[train_users, train_movies] = train_predictions
mask[train_users, train_movies] = 1

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('Using device:', device)

# Hyper-parameters
batch_size = 1024
num_epochs = 25
show_validation_score_every_epochs = 1
embedding_size = 16
learning_rate = 1e-3


class SVDPP(nn.Module):
    """SVD++-style rating model with bias terms and implicit-feedback factors.

    For a (user ``u``, movie ``i``) pair the prediction is

        global_mean + Bu[u] + Bi[i] + Q[i] . (P[u] + y(u))

    where ``y(u)`` is the sum of the ``Y`` embeddings of the movies listed for
    ``u`` in ``movies_rated_by_user_u``, divided by the square root of how many
    there are (see ``get_y``).

    In evaluation mode every user-dependent term is multiplied by the user's
    row of ``is_train_user_arr`` and every movie-dependent term by the movie's
    row of ``is_train_movie_arr``, so indices flagged 0 there contribute
    nothing beyond the remaining terms.

    Parameters
    ----------
    number_of_users, number_of_movies : int
        Number of rows of the user and movie embedding tables.
    embedding_size : int
        Width of the ``P``, ``Q`` and ``Y`` embeddings.
    global_mean : float
        Constant offset added to every prediction.
    is_train_user_arr, is_train_movie_arr : array-like
        2-D arrays with one row per user / movie index, used as multipliers in
        evaluation mode (the notebook passes 0/1 flags of shape ``(n, 1)``).
    movies_rated_by_user_u : mapping
        Maps a user index to the sequence of movie indices used for ``y(u)``.
    std_init : float
        Standard deviation of the zero-mean normal initialisation applied to
        every trainable embedding table.
    """

    def __init__(self, number_of_users, number_of_movies, embedding_size, global_mean, is_train_user_arr, is_train_movie_arr, movies_rated_by_user_u, std_init):
        super().__init__()

        def make_table(rows, width):
            # Create one embedding table and initialise *that* table; going
            # through a helper rules out initialising the wrong table by
            # copy-paste (previously Bi was never given the std_init init).
            table = nn.Embedding(rows, width)
            nn.init.normal_(table.weight, mean=0.0, std=std_init)
            return table

        self.Bu = make_table(number_of_users, 1)                # user bias
        self.Bi = make_table(number_of_movies, 1)               # movie bias
        self.P = make_table(number_of_users, embedding_size)    # user factors
        self.Q = make_table(number_of_movies, embedding_size)   # movie factors
        self.Y = make_table(number_of_movies, embedding_size)   # implicit-feedback factors
        # Plain tensor attribute (neither a parameter nor a buffer).
        self.global_mean = torch.tensor(global_mean)
        # Frozen lookup tables for the evaluation-mode multipliers.
        self.mask_unknown_users = nn.Embedding.from_pretrained(torch.FloatTensor(is_train_user_arr), freeze=True)
        self.mask_unknown_movies = nn.Embedding.from_pretrained(torch.FloatTensor(is_train_movie_arr), freeze=True)
        self.movies_rated_by_user_u = movies_rated_by_user_u

    def get_y(self, users):
        """Return the implicit-feedback vector ``y(u)`` for every user in ``users``.

        Parameters
        ----------
        users : torch.Tensor
            1-D tensor of user indices.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(len(users), embedding_size)``. Row ``k`` is the
            sum of the ``Y`` embeddings of the movies listed for ``users[k]``
            divided by the square root of their count. Users that have no
            entry (or an empty entry) in ``movies_rated_by_user_u`` get a row
            of zeros instead of raising ``KeyError`` or producing NaN.
        """
        # Use the module's own device rather than a notebook-level global.
        device = self.Y.weight.device
        # Explicit int64 keeps the concatenated index array integral even when
        # some users contribute an empty list.
        rated_per_user = [
            np.asarray(self.movies_rated_by_user_u.get(u, ()), dtype=np.int64)
            for u in users.cpu().numpy()
        ]
        if not rated_per_user:
            return self.Y.weight.new_zeros((0, self.Y.embedding_dim))

        # One embedding lookup for the whole batch, then slice it per user.
        flat_movies = torch.as_tensor(np.concatenate(rated_per_user), device=device)
        embedded = self.Y(flat_movies)

        rows = []
        start = 0
        for rated in rated_per_user:
            count = len(rated)
            row = embedded[start:start + count, :].sum(dim=0)
            if count > 0:
                # Skipped for count == 0: the sum is already zero and the
                # division would turn it into 0/0 = NaN.
                row = row.div(np.sqrt(count))
            rows.append(row)
            start += count
        return torch.stack(rows, dim=0)

    def forward(self, users, movies):
        """Predict one rating per (``users[k]``, ``movies[k]``) pair.

        Parameters
        ----------
        users, movies : torch.Tensor
            1-D index tensors of equal length.

        Returns
        -------
        torch.Tensor
            1-D tensor of predictions, one per pair.
        """
        bu = self.Bu(users)
        bi = self.Bi(movies)
        p = self.P(users)
        q = self.Q(movies)
        y = self.get_y(users)

        if not self.training:
            # Evaluation only: scale each term by the per-user / per-movie
            # multipliers so that flagged-0 indices drop out.
            users_mask = self.mask_unknown_users(users)
            movies_mask = self.mask_unknown_movies(movies)
            bu = users_mask * bu
            bi = movies_mask * bi
            p = users_mask * p
            q = movies_mask * q
            y = movies_mask * users_mask * y

        return q.mul(p + y).sum(dim=1) + self.global_mean + torch.squeeze(bi) + torch.squeeze(bu)


Using device: cpu


In [None]:
def mse_loss(predictions, target):
    """Mean of the element-wise squared difference between ``predictions`` and ``target``."""
    squared_error = (predictions - target).pow(2)
    return squared_error.mean()

# Move the training index / rating arrays onto the compute device once.
train_users_torch, train_movies_torch, train_predictions_torch = (
    torch.tensor(column, device=device)
    for column in (train_users, train_movies, train_predictions)
)
train_dataset = TensorDataset(train_users_torch, train_movies_torch, train_predictions_torch)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# Held-out (user, movie) pairs; their true ratings stay on the NumPy side.
test_users_torch, test_movies_torch = (
    torch.tensor(column, device=device)
    for column in (test_users, test_movies)
)
test_dataset = TensorDataset(test_users_torch, test_movies_torch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Mean training rating, used by the model as a constant offset.
global_mean = np.mean(train_predictions)

# Column vectors flagging which users / movies occur in the training split
# (1.0 = seen, 0.0 = never seen).
is_train_user_arr = np.zeros((number_of_users, 1))
is_train_user_arr[train_users, 0] = 1.0
is_train_movie_arr = np.zeros((number_of_movies, 1))
is_train_movie_arr[train_movies, 0] = 1.0

# Standard deviation passed to the model for initialising its embeddings.
std_init = 0.1

model = SVDPP(
    number_of_users=number_of_users,
    number_of_movies=number_of_movies,
    embedding_size=embedding_size,
    global_mean=global_mean,
    is_train_user_arr=is_train_user_arr,
    is_train_movie_arr=is_train_movie_arr,
    movies_rated_by_user_u=movies_rated_by_user_u,
    std_init=std_init,
)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# TensorBoard writer used to record the loss curves for later inspection.
svdpp_logdir = './tensorboard/svdpp'
writer = SummaryWriter(svdpp_logdir)

# Training loop: one optimizer update per mini-batch for `num_epochs` passes
# over `train_dataloader`, with the training loss logged to TensorBoard at
# every step and the held-out RMSE logged every 100 steps.
# `step` counts optimizer updates across all epochs.
step = 0
with tqdm(total=len(train_dataloader) * num_epochs) as pbar:
    for epoch in range(num_epochs):
        for users_batch, movies_batch, target_predictions_batch in train_dataloader:
            # Clear the gradients left over from the previous update.
            optimizer.zero_grad()

            predictions_batch = model(users_batch, movies_batch)

            loss = mse_loss(predictions_batch, target_predictions_batch)

            loss.backward()

            optimizer.step()

            # The loss is logged under the pre-increment step, so the first
            # point is at step 0.
            writer.add_scalar('loss', loss, step)
            pbar.update(1)
            step += 1

            # Periodic evaluation on the held-out pairs. The interval is the
            # literal 100 steps; `show_validation_score_every_epochs` is not
            # consulted anywhere in this loop.
            if step % 100 == 0:
                # Switch to the model's evaluation branch (masked terms).
                model.eval()

                with torch.no_grad():
                    all_predictions = []
                    # NOTE: this inner loop reuses the names `users_batch`,
                    # `movies_batch` and `predictions_batch` of the training
                    # loop; the outer `for` rebinds them on its next iteration.
                    for users_batch, movies_batch in test_dataloader:
                        predictions_batch = model(users_batch, movies_batch)
                        all_predictions.append(predictions_batch)

                all_predictions = torch.cat(all_predictions)

                # RMSE against get_score's default targets (`test_predictions`).
                reconstuction_rmse = get_score(all_predictions.cpu().numpy())
                # The number shown after "loss is" is this held-out RMSE, not
                # the training loss.
                pbar.set_description('At step {:3d} loss is {:.4f}'.format(step, reconstuction_rmse))

                writer.add_scalar('reconstuction_rmse', reconstuction_rmse, step)

                # Back to training mode before the next update.
                model.train()