In [1]:
import pandas as pd
import numpy as np
import math
import random
import warnings

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.init as init

warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
movies_df = pd.read_csv('data/movies_data.csv')
ratings_df = pd.read_csv('data/ratings_data.csv')

In [3]:
ratings_df = pd.merge(left=ratings_df, right=movies_df[['title','id']], left_on='movieId', right_on='id', how='left').dropna()

In [4]:
movieid_to_idx = {mid: idx for idx, mid in enumerate(ratings_df['movieId'].unique())}
ratings_df['lookup_id'] = ratings_df['movieId'].map(movieid_to_idx)

In [5]:
movie_name_to_loopup_id = {row['title']:row['lookup_id'] for idx, row in ratings_df[['title', 'lookup_id']].drop_duplicates().iterrows()}
idx_to_movie_name = {idx:name for name, idx in movie_name_to_loopup_id.items()}

In [6]:
class CustomDataset(Dataset):
    def __init__(self, ratings_df: pd.DataFrame):
        super().__init__()
        self.users = ratings_df['userId'].values
        self.movies = ratings_df['lookup_id'].values
        self.ratings = ratings_df['rating'].values


    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, index):
        user = torch.tensor(self.users[index], dtype=torch.int32)
        movie = torch.tensor(self.movies[index], dtype=torch.int32)
        rating = torch.tensor(self.ratings[index], dtype=torch.float32)

        return user, movie, rating

In [7]:
class MatrixFactorization(nn.Module):
    def __init__(self, embed_dim, n_users, n_movies):
        super().__init__()

        self.embed_dim = embed_dim
        self.user_matrix = nn.Embedding(n_users, embed_dim)     # u x d
        self.movies_matrix = nn.Embedding(n_movies, embed_dim)  # m x d

        # Initialize parameters to small random numbers
        self._initialize_weights()

    def _initialize_weights(self):
        # Xavier initialization for embeddings
        nn.init.xavier_uniform_(self.user_matrix.weight)
        nn.init.xavier_uniform_(self.movies_matrix.weight)

    def forward(self, users, movies):
        users = self.user_matrix(users)         # b x d
        movies = self.movies_matrix(movies)     # b x d
        
        affinities = torch.sum(users * movies, dim=1) / math.sqrt(self.embed_dim)  # b x 1

        return affinities


In [8]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()

    def forward(self, predictions, targets):
        return torch.sqrt(torch.mean((predictions - targets) ** 2))


In [9]:
batch_size = 1024 * 256
train_set = CustomDataset(ratings_df)
train_loader = DataLoader(train_set, batch_size=batch_size ,shuffle=True)


In [10]:
num_users = ratings_df['userId'].nunique() 
num_movies = ratings_df['movieId'].nunique() 

model = MatrixFactorization(4, num_users, num_movies).to(device)
loss_fn = RMSELoss()

In [None]:
Epocs = 20
lambda_reg = 1e-5  # Regularization strength

for i in range(Epocs):
    

    if i < Epocs//2:
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-1)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

    for _, (users, movies, y) in enumerate(train_loader):
    
        pred_ratings = model(users.to(device), movies.to(device))
        loss = loss_fn(pred_ratings.to(device), y.to(device))

        
        l2_reg = sum(param.pow(2).sum() for param in model.parameters())/sum(p.nelement() for p in model.parameters())  
        loss = loss + lambda_reg * l2_reg

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [58]:
torch.save(model.state_dict(), "parameters/collaboative_model.pth")

In [None]:
model.load_state_dict(torch.load("parameters/collaboative_model.pth"))

<All keys matched successfully>

In [12]:
my_list = [
    ['The Sixth Sense 1999', 5],
    ['Se7en 1995', 5],
    ['Men in Black 1997', 4],
]
watched_movies = [movie[0] for movie in my_list]
my_list = [ [movie_name_to_loopup_id[movie[0]], movie[1]]   for movie in my_list]

In [13]:
model.movies_matrix.weight.requires_grad = False

In [14]:
def get_counts(name):
    return movies_df[movies_df['title'] == name]['vote_count'].values[0]
idx_to_movie_name = {idx:name for name, idx in movie_name_to_loopup_id.items()}

In [15]:
lambda_reg = 1e-4  # Regularization strength
class NewUser(nn.Module):
    def __init__(self, user):
        super().__init__()

        self.seen_movies = torch.tensor([movie[0] for movie in user], dtype=torch.int32).to(device)
        self.given_ratings = torch.tensor([movie[1] for movie in user], dtype=torch.float32).to(device)
        self.given_ratings = self.given_ratings - self.given_ratings.mean()

        self.movie_embeddings = model.movies_matrix(self.seen_movies)

        
        self.user = torch.randn((1, model.embed_dim), requires_grad=True, device=device)
        
        
        init.xavier_normal_(self.user)
        self.user = self.user.detach().requires_grad_()

        self.epocs = 500
        self.train_user()

    def train_user(self):
        model.movies_matrix.weight.requires_grad = False
        
        optimizer = torch.optim.Adam([self.user], lr=1e-1)

        for i in range(self.epocs):
            
            affinity = torch.sum(self.user * self.movie_embeddings, dim=1) / math.sqrt(model.embed_dim)

            l2_reg = self.user.sum().pow(2) / model.embed_dim
            loss = loss_fn(affinity, self.given_ratings) + lambda_reg * l2_reg

            
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        print(f'Final user Loss: {loss.item():.4f}')

    def recommed(self, n_movies = 5):

        with torch.no_grad():

            user_matrix = model.user_matrix.weight.detach().cpu().numpy()
            cur_user = self.user.detach().cpu().numpy()

            # Getting User Similarities
            matrix_norms = np.linalg.norm(user_matrix, axis= 1)
            vector_norm = np.linalg.norm(cur_user, axis = 1)
            dot_products = (user_matrix @ cur_user.T).flatten() 
            cosine_similarities = dot_products / (matrix_norms * vector_norm)

            # Selecting Top 20 Users
            topk = 20
            ids = np.argsort(-cosine_similarities)[:topk]
            similar_users = ratings_df[ratings_df['userId'].isin(ids)]
            similar_users = similar_users.pivot(index='userId', columns='title', values= 'rating')
            movie_names = similar_users.columns.tolist()

            # Ordering movies based on Avg votes of the top 20 similar users
            avg_votes = np.nanmean(similar_users.values, axis = 0)
            num_votes = (np.sum(~np.isnan(similar_users.values), axis = 0) >= topk * 0.75)
            filtered_votes = num_votes*avg_votes
            top_movies = np.argsort(-filtered_votes).tolist()[:10]
            x = [movie_names[id] for id in top_movies if movie_names[id] not in watched_movies]
        return x


In [16]:
n = NewUser(my_list)

Final user Loss: 0.0417


In [17]:
x = n.recommed()
x

['20,000 Leagues Under the Sea 1997',
 'Men in Black II 2002',
 'Once Were Warriors 1994',
 'Point Break 1991',
 'Rush Hour 1998',
 'Scarface 1983',
 'Scary Movie 2000',
 'Secret Window 2004',
 'Sleepless in Seattle 1993',
 'Solaris 1972']