In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [5]:
# 1. Load & merge dataset
# Input files (MovieLens-style CSVs). For the smaller dataset, point these at
# './data/movies.csv' and './data/ratings.csv' instead.
MOVIES_CSV = './data/32m/movies.csv'
RATINGS_CSV = './data/32m/ratings.csv'

movies = pd.read_csv(MOVIES_CSV)
ratings = pd.read_csv(RATINGS_CSV)

# Attach the movie columns to every rating row (inner join on movieId),
# then discard rows that are exact duplicates.
df = ratings.merge(movies, on='movieId').drop_duplicates()

In [6]:
# 2. Genre processing & one-hot encoding
# The pipe-delimited genre string takes only a limited number of distinct values,
# so encode each distinct string once and broadcast the flags back to the rating
# rows with a join. This replaces a per-row Python lambda that was executed once
# per genre over the whole ratings table.
genre_combos = pd.Series(df['genres'].unique())
genre_flags = genre_combos.str.get_dummies(sep='|')
genre_flags.index = genre_combos.values  # key the flag table by the raw genre string

# '(no genres listed)' is a placeholder, not a genre: rows carrying only it end up
# with every genre flag equal to 0.
genre_flags = genre_flags.drop(columns=['(no genres listed)'], errors='ignore')

# Kept as a set of genre names; later cells build the clustering feature list from it.
all_genres = set(genre_flags.columns)

# Add one 0/1 column per genre and drop the raw string column.
df = df.join(genre_flags, on='genres').drop(columns=['genres'])

In [7]:
# 3. Movie and user stats
# Per-movie and per-user rating aggregates, broadcast back onto every rating row.
# NOTE(review): these are computed over all rows, before the train/test split in
# step 8, so the rating-derived features include ratings that later land in the
# test split.
ratings_by_movie = df.groupby('movieId')['rating']
ratings_by_user = df.groupby('userId')['rating']

df['avg_rating_movie'] = ratings_by_movie.transform('mean')
df['num_ratings_movie'] = ratings_by_movie.transform('count')
df['avg_rating_user'] = ratings_by_user.transform('mean')
df['num_ratings_user'] = ratings_by_user.transform('count')

In [8]:
# 4. Extract year from title
# Take the first parenthesised 4-digit number in the title as the year;
# titles without one get NaN and those rows are removed.
parenthesised_year = df['title'].str.extract(r'\((\d{4})\)', expand=False)
df['year'] = parenthesised_year.astype(float)
df.dropna(subset=['year'], inplace=True)

In [9]:
# 5. Clustering movies based on genres + stats
# Number of movie clusters. Fixed here; an elbow or silhouette analysis could be
# used to choose it automatically.
N_MOVIE_CLUSTERS = 8

# `all_genres` is a set, whose iteration order for strings can differ between
# kernel sessions. Sorting gives the feature matrix a stable column order so the
# clustering input is the same on every run.
features_for_clustering = sorted(all_genres) + ['avg_rating_movie', 'num_ratings_movie', 'year']

# One row per movie. The selected columns are presumably constant within a movie
# (genre flags, per-movie aggregates, year), so the first row is taken.
movie_features = df.groupby('movieId')[features_for_clustering].first()

# Standardise so that no single feature dominates the Euclidean distances.
scaler_clust = StandardScaler()
X_clust = scaler_clust.fit_transform(movie_features)

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the result independent of the installed version.
kmeans = KMeans(n_clusters=N_MOVIE_CLUSTERS, n_init=10, random_state=42)
movie_features['cluster'] = kmeans.fit_predict(X_clust)

# Propagate each movie's cluster label back onto the rating rows.
movie_cluster_map = movie_features['cluster'].to_dict()
df['movie_cluster'] = df['movieId'].map(movie_cluster_map)

In [None]:
# 6. Encode categorical variables
# Map raw user / movie / cluster ids to contiguous 0-based indices, numbered in
# order of first appearance, so they can be used as embedding row indices.
distinct_users = df['userId'].unique()
distinct_movies = df['movieId'].unique()
distinct_clusters = df['movie_cluster'].unique()

user2idx = dict(zip(distinct_users, range(len(distinct_users))))
movie2idx = dict(zip(distinct_movies, range(len(distinct_movies))))
cluster2idx = dict(zip(distinct_clusters, range(len(distinct_clusters))))

df['user_idx'] = df['userId'].map(user2idx)
df['movie_idx'] = df['movieId'].map(movie2idx)
df['cluster_idx'] = df['movie_cluster'].map(cluster2idx)

In [11]:
# 7. Normalize continuous features
# Columns fed to the network as dense inputs; standardised in place on `df`.
# NOTE(review): the scaler is fit on all rows, before the train/test split in
# step 8, so its statistics include rows that later land in the test split.
continuous_feats = ['avg_rating_movie', 'num_ratings_movie', 'year', 'avg_rating_user', 'num_ratings_user']
scaler_feat = StandardScaler().fit(df[continuous_feats])
df[continuous_feats] = scaler_feat.transform(df[continuous_feats])

In [12]:
# 8. Split train-test
# 80/20 split with a fixed seed, stratified on the user index so each user's
# ratings are divided between the two parts in the same proportion.
train_df, test_df = train_test_split(
    df,
    stratify=df['user_idx'],
    test_size=0.2,
    random_state=42,
)

In [23]:
# 9. Dataset class
class MovieDataset(Dataset):
    """Tensor-backed dataset built from a ratings DataFrame.

    Reads the 'user_idx', 'movie_idx' and 'cluster_idx' columns as int64
    tensors, the columns named in the notebook-level `continuous_feats` list as
    a float32 feature matrix, and 'rating' as the float32 target.

    Each item is ``((user, movie, cluster, features), rating)``.
    """

    def __init__(self, df):
        def column_tensor(columns, dtype):
            # Copy the selected column(s) out of the frame into a tensor of `dtype`.
            return torch.tensor(df[columns].values, dtype=dtype)

        self.users = column_tensor('user_idx', torch.long)
        self.movies = column_tensor('movie_idx', torch.long)
        self.clusters = column_tensor('cluster_idx', torch.long)
        self.features = column_tensor(continuous_feats, torch.float32)
        self.ratings = column_tensor('rating', torch.float32)

    def __len__(self):
        # One sample per rating row.
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        model_inputs = (
            self.users[idx],
            self.movies[idx],
            self.clusters[idx],
            self.features[idx],
        )
        return model_inputs, self.ratings[idx]

train_dataset = MovieDataset(train_df)
test_dataset = MovieDataset(test_df)

# Both loaders share batch size and worker settings; only the training loader
# shuffles.
loader_config = {'batch_size': 512, 'num_workers': 0}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_config)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_config)

In [24]:
# 10. Neural Collaborative Filtering Model with tuning
class NCF(nn.Module):
    """Embedding + MLP rating regressor.

    User, movie and movie-cluster indices are each embedded into `emb_size`
    dimensions, concatenated with the dense feature vector, and passed through
    a 256 -> 128 -> 64 -> 1 MLP (BatchNorm + LeakyReLU after each hidden layer,
    Dropout after the first two).

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        n_clusters: Number of rows in the cluster embedding table.
        emb_size: Width of each of the three embeddings.
        n_features: Width of the dense feature vector passed to `forward`.
            When None (the default) it falls back to
            ``len(continuous_feats)`` from the notebook namespace, which is the
            original behaviour; passing it explicitly lets the model be built
            without that global being defined.
    """

    def __init__(self, n_users, n_movies, n_clusters, emb_size=64, n_features=None):
        super().__init__()
        if n_features is None:
            # Backward-compatible default: size taken from the notebook-level list.
            n_features = len(continuous_feats)

        self.user_emb = nn.Embedding(n_users, emb_size)
        self.movie_emb = nn.Embedding(n_movies, emb_size)
        self.cluster_emb = nn.Embedding(n_clusters, emb_size)

        # Layer order is kept fixed so state-dict keys (fc_layers.<i>.*) stay
        # compatible with previously saved checkpoints.
        self.fc_layers = nn.Sequential(
            nn.Linear(emb_size*3 + n_features, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.1),

            nn.Linear(64, 1)
        )

    def forward(self, user, movie, cluster, features):
        """Return one predicted rating per sample.

        `user`, `movie` and `cluster` are index tensors for the embedding
        tables; `features` is concatenated with the embeddings along dim 1, so
        it must be 2-D with `n_features` columns. The trailing size-1 output
        dimension is squeezed away.
        """
        u = self.user_emb(user)
        m = self.movie_emb(movie)
        c = self.cluster_emb(cluster)
        x = torch.cat([u, m, c, features], dim=1)
        out = self.fc_layers(x).squeeze(1)
        return out

In [25]:
# 11. Setup device, model, loss, optimizer, scheduler
# Seed PyTorch before the model is created so that weight initialisation (and
# the shuffling order drawn from the default generator) is repeatable between
# runs. GPU kernels may still introduce small nondeterminism.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Embedding table sizes come from the id -> index maps built in step 6.
model = NCF(len(user2idx), len(movie2idx), len(cluster2idx), emb_size=64).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Halve the learning rate when the monitored loss has not improved for 3 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [26]:
# 12. Training loop with early stopping
def train_eval(model, train_loader, val_loader, epochs=50, patience=7):
    """Train `model`, validating after every epoch and stopping on a plateau.

    Uses the notebook-level `device`, `criterion`, `optimizer` and `scheduler`.
    After each epoch the per-sample average train and validation losses are
    printed and the scheduler is stepped with the validation loss. Whenever the
    validation loss beats the best value so far, the weights are saved to
    'best_ncf_model.pth'.

    Args:
        model: Network called as ``model(user, movie, cluster, feats)``.
        train_loader: Yields ``((user, movie, cluster, feats), ratings)`` batches.
        val_loader: Same batch layout, used for validation.
        epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without improvement.

    Returns:
        None.
    """
    def _batch_to_device(batch):
        # Unpack one loader batch and move every tensor onto the target device.
        (user, movie, cluster, feats), ratings = batch
        return (
            user.to(device),
            movie.to(device),
            cluster.to(device),
            feats.to(device),
            ratings.to(device),
        )

    best_loss = float('inf')
    epochs_without_gain = 0

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        weighted_train_total = 0
        for batch in train_loader:
            user, movie, cluster, feats, ratings = _batch_to_device(batch)
            optimizer.zero_grad()
            batch_loss = criterion(model(user, movie, cluster, feats), ratings)
            batch_loss.backward()
            optimizer.step()
            # Weight each batch loss by its size so the epoch figure is a
            # per-sample average (assumes the criterion returns a batch mean).
            weighted_train_total += batch_loss.item() * ratings.size(0)
        train_loss = weighted_train_total / len(train_loader.dataset)

        # ---- validation pass (no gradients) ----
        model.eval()
        weighted_val_total = 0
        with torch.no_grad():
            for batch in val_loader:
                user, movie, cluster, feats, ratings = _batch_to_device(batch)
                batch_loss = criterion(model(user, movie, cluster, feats), ratings)
                weighted_val_total += batch_loss.item() * ratings.size(0)
        val_loss = weighted_val_total / len(val_loader.dataset)

        scheduler.step(val_loss)
        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")

        if val_loss < best_loss:
            # New best: remember it, reset the counter and checkpoint the weights.
            best_loss = val_loss
            epochs_without_gain = 0
            torch.save(model.state_dict(), 'best_ncf_model.pth')
            continue

        epochs_without_gain += 1
        if epochs_without_gain >= patience:
            print("Early stopping")
            break

In [None]:
# 13. Train the model
# Early stopping, LR scheduling and checkpoint selection all react to the
# validation loss, so validating on the test split would bias the final test
# metrics. Hold out a slice of the *training* dataset for validation instead and
# leave the test split untouched until the final evaluation.
VAL_FRACTION = 0.1
n_val = max(1, int(len(train_dataset) * VAL_FRACTION))
fit_subset, val_subset = torch.utils.data.random_split(
    train_dataset,
    [len(train_dataset) - n_val, n_val],
    generator=torch.Generator().manual_seed(42),  # fixed seed -> same hold-out every run
)
fit_loader = DataLoader(fit_subset, batch_size=512, shuffle=True, num_workers=0)
val_loader = DataLoader(val_subset, batch_size=512, shuffle=False, num_workers=0)

train_eval(model, fit_loader, val_loader, epochs=50)

# 14. Evaluation metrics
def evaluate(model, loader):
    """Print MAE, RMSE, MAPE and SMAPE of `model` over `loader`.

    Predictions are clamped to the [0.5, 5.0] rating range before scoring.
    Uses the notebook-level `device`. Batches must have the layout
    ``((user, movie, cluster, feats), ratings)``.

    Returns:
        None; the four metrics are printed.
    """
    eps = 1e-8  # guards the percentage-error denominators against division by zero
    n = 0
    total_mae = 0
    total_rmse = 0
    total_mape = 0
    total_smape = 0

    model.eval()
    with torch.no_grad():
        for (user, movie, cluster, feats), ratings in loader:
            user, movie, cluster, feats, ratings = (
                tensor.to(device) for tensor in (user, movie, cluster, feats, ratings)
            )
            # ratings scale clamp
            preds = model(user, movie, cluster, feats).clamp(min=0.5, max=5.0)

            residual = preds - ratings
            abs_err = residual.abs()

            # Accumulate per-batch sums; the averages are formed after the loop.
            n += ratings.size(0)
            total_mae += abs_err.sum().item()
            total_rmse += (residual ** 2).sum().item()
            total_mape += (abs_err / (ratings + eps)).sum().item()
            total_smape += (2 * abs_err / (preds.abs() + ratings.abs() + eps)).sum().item()

    print(f"Test MAE: {total_mae / n:.4f}")
    print(f"Test RMSE: {(total_rmse / n) ** 0.5:.4f}")
    print(f"Test MAPE: {(total_mape / n)*100:.2f}%")
    print(f"Test SMAPE: {(total_smape / n)*100:.2f}%")


In [None]:
# 15. Load best model and evaluate
# map_location remaps the saved tensors onto the current `device`, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
best_state = torch.load('best_ncf_model.pth', map_location=device)
model.load_state_dict(best_state)
evaluate(model, test_loader)
