In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import os
import sys
import json

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

DATASET_LOCATION = 'data/ml-latest-small'
PARAMETER_FILE = 'parameters/model_small.pth'

device = 'cpu'

In [2]:
# Data Preprocessing
def load_data():
    movies = pd.read_csv(f'{DATASET_LOCATION}/movies.csv')
    ratings = pd.read_csv(f'{DATASET_LOCATION}/ratings.csv')
    return movies, ratings

def preprocess_data(ratings):
    user_ids = ratings['userId'].unique()
    movie_ids = ratings['movieId'].unique()

    user_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
    movie_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}

    ratings['userId'] = ratings['userId'].map(user_to_index)
    ratings['movieId'] = ratings['movieId'].map(movie_to_index)

    train_data, test_data = train_test_split(ratings, test_size=0.2)
    return train_data, test_data, len(user_ids), len(movie_ids)

In [3]:
# Model Definition
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

    def forward(self, user_indices, item_indices):
        user_embedding = self.user_embedding(user_indices)
        item_embedding = self.item_embedding(item_indices)
        return (user_embedding * item_embedding).sum(1)
    
    def get_item_embedding(self, item_index):
        return self.item_embedding(item_index)

# Dataset Definition
class RatingsDataset(Dataset):
    def __init__(self, ratings):
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        row = self.ratings.iloc[idx]
        return row['userId'], row['movieId'], row['rating']

In [4]:
# Training
def train_model(train_data, num_users, num_items, embedding_dim=50, epochs=10, lr=0.01):
    model = MatrixFactorization(num_users, num_items, embedding_dim).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_dataset = RatingsDataset(train_data)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for user_indices, item_indices, ratings in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            user_indices = user_indices.long().to(device)
            item_indices = item_indices.long().to(device)
            ratings = ratings.float().to(device)

            optimizer.zero_grad()
            outputs = model(user_indices, item_indices)
            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}')
    
    # Save the model
    torch.save(model.state_dict(), PARAMETER_FILE)
    return model

# Evaluation
def evaluate_model(model, test_data):
    model.eval()
    test_dataset = RatingsDataset(test_data)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    total_loss = 0
    criterion = nn.MSELoss()
    with torch.no_grad():
        for user_indices, item_indices, ratings in test_loader:
            user_indices = user_indices.long().to(device)
            item_indices = item_indices.long().to(device)
            ratings = ratings.float().to(device)

            outputs = model(user_indices, item_indices)
            loss = criterion(outputs, ratings)
            total_loss += loss.item()
    print(f'Test Loss: {total_loss/len(test_loader)}')

In [5]:
# Find similar movies
def find_similar_movies(model, movie_id, movie_to_index, index_to_movie, top_k=5):
    model.eval()
    movie_index = torch.tensor([movie_to_index[movie_id]]).to(device)
    movie_embedding = model.get_item_embedding(movie_index).detach().cpu().numpy()

    all_movie_embeddings = model.item_embedding.weight.detach().cpu().numpy()
    similarities = np.dot(all_movie_embeddings, movie_embedding.T).flatten()
    similar_movie_indices = similarities.argsort()[-top_k-1:-1][::-1]

    similar_movies = [index_to_movie[idx] for idx in similar_movie_indices]
    return similar_movies

In [6]:
# Main Script
if __name__ == "__main__":
    movies, ratings = load_data()
    train_data, test_data, num_users, num_items = preprocess_data(ratings)

    model = None

    if not os.path.isfile(PARAMETER_FILE):
        model = train_model(train_data, num_users, num_items, embedding_dim=50)
    else:
        model = MatrixFactorization(num_users, num_items, embedding_dim=50)
    
    # Load the model for evaluation
    model.load_state_dict(torch.load(PARAMETER_FILE, weights_only=True))
    model.to(device)
    evaluate_model(model, test_data)
    
    # Create index to movie mapping
    movie_to_index = {movie_id: index for index, movie_id in enumerate(movies['movieId'].unique())}
    index_to_movie = {index: movie_id for movie_id, index in movie_to_index.items()}

    all_movie_ids = pd.read_csv(f'{DATASET_LOCATION}/ratings.csv')["movieId"].unique()

    recommendations = {}

    links = pd.read_csv(f'{DATASET_LOCATION}/links.csv')
    links['tmdbId'] = links['tmdbId'].fillna(0).astype(int)

for id in all_movie_ids:
    if id in movie_to_index:
        try:
            tmdb_id = int(links.loc[links['movieId'] == id]['tmdbId'].values[0])
            current_recommendation = find_similar_movies(model, id, movie_to_index, index_to_movie, top_k=15)
            current_tmdb_recommendation = [links.loc[links['movieId'] == movie_id]['tmdbId'].values[0] for movie_id in current_recommendation]
            recommendations[tmdb_id] = list(map(int, current_tmdb_recommendation))
        except RuntimeError as e:
            print(f"RuntimeError processing movie {id}: {e}")
        except Exception as e:
            print(f"Error processing movie {id}: {e}")
    else:
        print(f"Movie ID {id} not found in movie_to_index")

with open("../app/recommendations/recommendations.json", "w") as f:
    json.dump(recommendations, f)

Epoch 1/10: 100%|██████████| 1261/1261 [00:07<00:00, 160.25it/s]


Epoch 1/10, Loss: 37.272869165316166


Epoch 2/10: 100%|██████████| 1261/1261 [00:07<00:00, 167.42it/s]


Epoch 2/10, Loss: 6.9295042391148565


Epoch 3/10: 100%|██████████| 1261/1261 [00:07<00:00, 167.00it/s]


Epoch 3/10, Loss: 2.10066028779504


Epoch 4/10: 100%|██████████| 1261/1261 [00:07<00:00, 169.80it/s]


Epoch 4/10, Loss: 1.6063616857180796


Epoch 5/10: 100%|██████████| 1261/1261 [00:07<00:00, 172.91it/s]


Epoch 5/10, Loss: 2.028037469364366


Epoch 6/10: 100%|██████████| 1261/1261 [00:07<00:00, 171.31it/s]


Epoch 6/10, Loss: 2.0816815279477368


Epoch 7/10: 100%|██████████| 1261/1261 [00:07<00:00, 173.01it/s]


Epoch 7/10, Loss: 1.584652452833779


Epoch 8/10: 100%|██████████| 1261/1261 [00:07<00:00, 168.49it/s]


Epoch 8/10, Loss: 1.2783865281081597


Epoch 9/10: 100%|██████████| 1261/1261 [00:07<00:00, 179.96it/s]


Epoch 9/10, Loss: 1.2089920645191214


Epoch 10/10: 100%|██████████| 1261/1261 [00:06<00:00, 182.61it/s]


Epoch 10/10, Loss: 1.1393502782867788
Test Loss: 2.5475297196756435
Error processing movie 190183: index out of range in self
Error processing movie 191005: index out of range in self
Error processing movie 193565: index out of range in self
Error processing movie 193567: index out of range in self
Error processing movie 193571: index out of range in self
Error processing movie 193573: index out of range in self
Error processing movie 193579: index out of range in self
Error processing movie 193581: index out of range in self
Error processing movie 193583: index out of range in self
Error processing movie 193585: index out of range in self
Error processing movie 193587: index out of range in self
Error processing movie 193609: index out of range in self
Error processing movie 190207: index out of range in self
Error processing movie 190209: index out of range in self
Error processing movie 190213: index out of range in self
Error processing movie 190215: index out of range in self
Erro