In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load MovieLens dataset
movies_df = pd.read_csv('ml-latest-small/movies.csv')
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

# Display first few rows
print(movies_df.head())
print(ratings_df.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
# Label encoding of user and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings_df['userId'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movieId'] = movie_encoder.fit_transform(ratings_df['movieId'])

# Get number of unique users and movies
n_users = len(user_encoder.classes_)
n_movies = len(movie_encoder.classes_)

print(f'Number of users: {n_users}')
print(f'Number of movies: {n_movies}')

Number of users: 610
Number of movies: 9724


In [6]:
class MovieLensDataset(Dataset):
    def __init__(self, ratings_df, n_users, n_movies):
        self.users = ratings_df['userId'].values
        self.movies = ratings_df['movieId'].values
        self.ratings = ratings_df['rating'].values

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return torch.tensor(self.users[idx], dtype=torch.long), \
               torch.tensor(self.movies[idx], dtype=torch.long), \
               torch.tensor(self.ratings[idx], dtype=torch.float32)

In [7]:
class NeuralCollaborativeFiltering(nn.Module):
    def __init__(self, n_users, n_movies, embedding_dim=50, hidden_dim=128):
        super(NeuralCollaborativeFiltering, self).__init__()
        
        # Embedding layers for users and movies
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)
        
        # MLP layers for neural network
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, user, movie):
        # Get embeddings
        user_embedded = self.user_embedding(user)
        movie_embedded = self.movie_embedding(movie)
        
        # Concatenate embeddings
        x = torch.cat([user_embedded, movie_embedded], dim=1)
        
        # Pass through MLP
        return self.mlp(x).squeeze()

In [8]:
# Split data into training and validation sets
train_df, val_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Create DataLoader for train and validation
train_dataset = MovieLensDataset(train_df, n_users, n_movies)
val_dataset = MovieLensDataset(val_df, n_users, n_movies)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [9]:
# Initialize the model
model = NeuralCollaborativeFiltering(n_users, n_movies, embedding_dim=50, hidden_dim=128)

# Define loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error for regression task
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    
    for user, movie, rating in train_loader:
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = model(user, movie)
        
        # Compute loss
        loss = criterion(output, rating)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Training Loss: {avg_train_loss:.4f}')

Epoch 1/10, Training Loss: 1.2431
Epoch 2/10, Training Loss: 0.8219
Epoch 3/10, Training Loss: 0.7524
Epoch 4/10, Training Loss: 0.7066
Epoch 5/10, Training Loss: 0.6682
Epoch 6/10, Training Loss: 0.6391
Epoch 7/10, Training Loss: 0.6130
Epoch 8/10, Training Loss: 0.5875
Epoch 9/10, Training Loss: 0.5648
Epoch 10/10, Training Loss: 0.5430


In [10]:
# Evaluate the model on the validation set
model.eval()
total_val_loss = 0

with torch.no_grad():
    for user, movie, rating in val_loader:
        output = model(user, movie)
        loss = criterion(output, rating)
        total_val_loss += loss.item()

avg_val_loss = total_val_loss / len(val_loader)
print(f'Validation Loss: {avg_val_loss:.4f}')

Validation Loss: 0.8690


In [11]:
import numpy as np

# Function to calculate RMSE and MAE
def evaluate_metrics(model, val_loader):
    model.eval()
    total_squared_error = 0
    total_absolute_error = 0
    total_samples = 0
    
    with torch.no_grad():
        for user, movie, rating in val_loader:
            # Forward pass
            output = model(user, movie)
            
            # Calculate squared error and absolute error
            squared_error = (output - rating) ** 2
            absolute_error = torch.abs(output - rating)
            
            # Accumulate total errors
            total_squared_error += squared_error.sum().item()
            total_absolute_error += absolute_error.sum().item()
            total_samples += len(rating)
    
    # Compute RMSE and MAE
    rmse = np.sqrt(total_squared_error / total_samples)
    mae = total_absolute_error / total_samples
    
    return rmse, mae

# Evaluate RMSE and MAE on validation data
rmse, mae = evaluate_metrics(model, val_loader)
print(f'Validation RMSE: {rmse:.4f}')
print(f'Validation MAE: {mae:.4f}')

Validation RMSE: 0.9326
Validation MAE: 0.7154
