In [22]:
import os
import sys

# Starting point for the search: the directory the kernel is running in.
current_dir = os.getcwd()

# Climb towards the filesystem root and stop at the first directory named 'src'.
path = current_dir
src_path = None

while src_path is None:
    if os.path.basename(path) == "src":
        src_path = path
    else:
        parent = os.path.dirname(path)
        if parent == path:
            # dirname() stops changing once the root is reached: nothing found.
            break
        path = parent

# Make modules under 'src' importable, without inserting the entry twice.
if src_path is not None and src_path not in sys.path:
    sys.path.insert(0, src_path)


import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Local imports
from helpers.data_downloader import download_ml1m_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
class AutoRecEncoder(nn.Module):
    """Single-hidden-layer autoencoder.

    ``encoder`` projects ``input_dim`` features to ``latent_dim``; ``decoder``
    projects the ReLU-activated code back to ``input_dim``.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        # Layers are created encoder-first so that seeded initialisation and
        # the state_dict key order stay the same.
        self.encoder = nn.Linear(in_features=input_dim, out_features=latent_dim)
        self.decoder = nn.Linear(in_features=latent_dim, out_features=input_dim)

    def forward(self, x):
        """Return ``(x_hat, z)``: the reconstruction of ``x`` and its latent code."""
        hidden = self.encoder(x)
        z = F.relu(hidden)
        return self.decoder(z), z


In [24]:
class NCF(nn.Module):
    """Neural collaborative filtering head over a user latent and an item latent.

    Two branches are combined by a final linear layer:

    * GMF: a linear layer applied to the element-wise product of the latents.
    * MLP: a stack of ``Linear`` + ``ReLU`` layers applied to their concatenation.

    Args:
        latent_dim: Size of each latent vector.
        mlp_layers: Iterable of hidden widths for the MLP branch. May be empty,
            in which case the MLP branch passes the concatenated latents
            through unchanged.
    """

    def __init__(self, latent_dim, mlp_layers):
        super().__init__()
        # Materialise once so any iterable (tuple, generator, ...) is accepted.
        mlp_layers = list(mlp_layers)

        # GMF
        self.gmf = nn.Linear(latent_dim, latent_dim)

        # MLP
        mlp_modules = []
        input_dim = latent_dim * 2
        for h in mlp_layers:
            mlp_modules.append(nn.Linear(input_dim, h))
            mlp_modules.append(nn.ReLU())
            input_dim = h
        self.mlp = nn.Sequential(*mlp_modules)

        # Final prediction. `input_dim` now holds the MLP branch's output width:
        # the last hidden width, or 2 * latent_dim when `mlp_layers` is empty
        # (indexing `mlp_layers[-1]` would raise IndexError in that case).
        self.output = nn.Linear(latent_dim + input_dim, 1)

    def forward(self, user_z, item_z):
        """Score each (user, item) pair.

        Args:
            user_z: ``(batch, latent_dim)`` user latents.
            item_z: ``(batch, latent_dim)`` item latents.

        Returns:
            Tensor of shape ``(batch,)`` with one score per pair.
        """
        gmf_out = self.gmf(user_z * item_z)
        mlp_out = self.mlp(torch.cat([user_z, item_z], dim=1))
        concat = torch.cat([gmf_out, mlp_out], dim=1)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also drop the batch axis for a batch of one and return a 0-d tensor.
        return self.output(concat).squeeze(-1)

In [25]:
class HybridAutoRecNCF(nn.Module):
    """Two AutoRec encoders feeding an NCF scoring head.

    The user encoder takes vectors of length ``num_items``; the item encoder
    takes vectors of length ``num_users``. Their latent codes are scored
    pairwise by ``ncf``.
    """

    def __init__(self, num_users, num_items, latent_dim, mlp_layers):
        super().__init__()

        self.user_autorec = AutoRecEncoder(input_dim=num_items, latent_dim=latent_dim)
        self.item_autorec = AutoRecEncoder(input_dim=num_users, latent_dim=latent_dim)

        self.ncf = NCF(latent_dim=latent_dim, mlp_layers=mlp_layers)

    def forward(self, user_vecs, item_vecs, user_ids, item_ids):
        """Return ``(pred, user_recon, item_recon)`` for a batch of pairs.

        Row ``i`` of ``user_vecs`` (batch_size, num_items) and row ``i`` of
        ``item_vecs`` (batch_size, num_users) describe the same (user, item)
        pair, so the latents are already aligned and need no lookup.
        ``user_ids`` and ``item_ids`` are accepted but not read here.
        """
        user_recon, user_z = self.user_autorec(user_vecs)
        item_recon, item_z = self.item_autorec(item_vecs)
        return self.ncf(user_z, item_z), user_recon, item_recon


In [26]:
def reconstruction_loss(pred, target, mask):
    """Mean squared error over the entries selected by ``mask``.

    Args:
        pred: Reconstructed values.
        target: Reference values, same shape as ``pred``.
        mask: Weights of the same shape; entries with weight 0 are ignored.

    Returns:
        Scalar tensor ``sum(((pred - target) * mask) ** 2) / sum(mask)``, or a
        zero-valued scalar when ``sum(mask)`` is 0.
    """
    squared_error = torch.sum(((pred - target) * mask) ** 2)
    observed = torch.sum(mask)
    if observed == 0:
        # Nothing is selected, so the numerator is exactly zero. Returning it
        # avoids the 0/0 = NaN that would poison the loss and gradients,
        # while staying attached to the graph.
        return squared_error
    return squared_error / observed


In [27]:
def interaction_loss(pred, rating):
    """Mean squared error between predicted and reference ratings."""
    return F.mse_loss(input=pred, target=rating, reduction="mean")


In [28]:
def total_loss(pred, rating,
               user_recon, user_vecs, user_mask,
               item_recon, item_vecs, item_mask,
               alpha=1.0, beta=1.0):
    """Weighted sum of both reconstruction losses and the rating loss.

    Computes ``alpha * (user_reconstruction + item_reconstruction)
    + beta * interaction``.
    """
    reconstruction = (
        reconstruction_loss(user_recon, user_vecs, user_mask)
        + reconstruction_loss(item_recon, item_vecs, item_mask)
    )
    rating_term = interaction_loss(pred, rating)
    return alpha * reconstruction + beta * rating_term


In [29]:
def train(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(user_vecs, item_vecs, user_ids, item_ids)``
            and returning ``(pred, user_recon, item_recon)``.
        dataloader: Yields 7-tuples ``(user_ids, item_ids, ratings, user_vecs,
            item_vecs, user_mask, item_mask)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The sample-weighted mean of the per-batch losses, or NaN when the
        loader yields no samples.
    """
    model.train()
    loss_sum = 0.0
    num_samples = 0

    for batch in dataloader:
        (user_ids, item_ids, ratings,
         user_vecs, item_vecs,
         user_mask, item_mask) = batch

        # reshape(-1) always yields a 1-D tensor; squeeze() would produce a
        # 0-d tensor for a batch of one.
        user_ids = user_ids.to(device).reshape(-1)
        item_ids = item_ids.to(device).reshape(-1)
        ratings = ratings.to(device).reshape(-1)
        user_vecs = user_vecs.to(device)
        item_vecs = item_vecs.to(device)
        user_mask = user_mask.to(device)
        item_mask = item_mask.to(device)

        optimizer.zero_grad()

        pred, user_recon, item_recon = model(
            user_vecs, item_vecs, user_ids, item_ids
        )
        # Keep predictions and ratings the same shape for any batch size.
        pred = pred.reshape(-1)

        loss = total_loss(
            pred, ratings,
            user_recon, user_vecs, user_mask,
            item_recon, item_vecs, item_mask
        )

        loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch does not count as much
        # as a full one; evaluate() averages its loss the same way.
        batch_len = ratings.numel()
        loss_sum += loss.item() * batch_len
        num_samples += batch_len

    if num_samples == 0:
        # No data: report NaN instead of raising ZeroDivisionError.
        return float("nan")
    return loss_sum / num_samples


In [30]:
# The data folder is addressed relative to the parent of the working directory.
data_dir = os.path.join(
    os.path.dirname(os.getcwd()),
    '..',
    'data',
)
data_path = os.path.join(data_dir, 'ml-1m', 'ratings.dat')

print(data_path)
print(data_dir)

# Fetch the dataset only when the ratings file is not already on disk.
if not os.path.exists(data_path):
    download_ml1m_dataset(data_dir=data_dir)

def load_ml_1m_data(data_path = data_path) -> pd.DataFrame:
    """Read a '::'-separated ratings file into a DataFrame.

    The file is parsed without a header row into the columns ``user_id``,
    ``item_id``, ``rating`` and ``timestamp``. The default path is the
    module-level ``data_path`` as it was when this function was defined.
    """
    banner = "=" * 70
    print(banner)
    print("Loading MovieLens 1M Dataset")
    print(banner)
    print(f"Data path: {data_path}")

    # Column order here is also the column order in the file.
    column_dtypes = {
        'user_id': np.int32,
        'item_id': np.int32,
        'rating': np.float32,
        'timestamp': np.int32,
    }

    return pd.read_csv(
        data_path,
        sep='::',
        header=None,
        names=list(column_dtypes),
        dtype=column_dtypes,
        engine='python',  # a multi-character separator needs the python engine
    )


# Read the ratings file and report how many rows were loaded.
print("\nLoading ratings data...")
ratings_df = load_ml_1m_data()
print(f"✓ Successfully loaded {len(ratings_df):,} ratings")
print("=" * 70)



/Users/abbas/Documents/Codes/thesis/recommender/src/../data/ml-1m/ratings.dat
/Users/abbas/Documents/Codes/thesis/recommender/src/../data

Loading ratings data...
Loading MovieLens 1M Dataset
Data path: /Users/abbas/Documents/Codes/thesis/recommender/src/../data/ml-1m/ratings.dat
✓ Successfully loaded 1,000,209 ratings


In [31]:
# Data preprocessing: remap IDs and create train/test split
def preprocess_data(ratings_df, test_size=0.2, random_state=42):
    """Remap IDs to contiguous 0-based indices and build train/test splits.

    Args:
        ratings_df: DataFrame with at least ``user_id``, ``item_id`` and
            ``rating`` columns.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(train_mat, test_mat, train_df, test_df, num_users, num_items)``.
        The matrices are float32 arrays of shape ``(num_users, num_items)``
        holding the rating of each interaction in the split and 0 elsewhere.
        The DataFrames hold the remapped ``user_id``, ``item_id`` and
        ``rating`` columns.
    """
    # Work on a copy so the caller's frame is not modified. The local is not
    # called `data`, which would shadow the `torch.utils.data` alias.
    interactions = ratings_df.copy()

    # Remap user IDs to be contiguous (0-indexed)
    unique_users = sorted(interactions['user_id'].unique())
    user_map = {old_id: new_id for new_id, old_id in enumerate(unique_users)}
    interactions['user_id'] = interactions['user_id'].map(user_map)
    num_users = len(unique_users)

    # Remap item IDs to be contiguous (0-indexed)
    unique_items = sorted(interactions['item_id'].unique())
    item_map = {old_id: new_id for new_id, old_id in enumerate(unique_items)}
    interactions['item_id'] = interactions['item_id'].map(item_map)
    num_items = len(unique_items)

    # Split into train and test
    train_df, test_df = train_test_split(
        interactions[['user_id', 'item_id', 'rating']],
        test_size=test_size,
        random_state=random_state
    )

    def build_matrix(frame):
        """Scatter one split's ratings into a dense (num_users, num_items) array."""
        matrix = np.zeros((num_users, num_items), dtype=np.float32)
        rows = frame['user_id'].to_numpy(dtype=np.int64)
        cols = frame['item_id'].to_numpy(dtype=np.int64)
        # One indexed assignment replaces a Python-level iterrows() loop. For a
        # repeated (row, col) pair the last value is kept, as in the loop.
        matrix[rows, cols] = frame['rating'].to_numpy(dtype=np.float32)
        return matrix

    train_mat = build_matrix(train_df)
    test_mat = build_matrix(test_df)

    return train_mat, test_mat, train_df, test_df, num_users, num_items

print("Preprocessing data...")
(train_mat, test_mat,
 train_df, test_df,
 num_users, num_items) = preprocess_data(ratings_df, test_size=0.2, random_state=42)

# Summary of the split sizes.
print(f"✓ Preprocessing complete!")
print(f"  - Users: {num_users}, Items: {num_items}")
print(f"  - Train interactions: {len(train_df):,}")
print(f"  - Test interactions: {len(test_df):,}")
print("=" * 70)


Preprocessing data...
✓ Preprocessing complete!
  - Users: 6040, Items: 3706
  - Train interactions: 800,167
  - Test interactions: 200,042


In [32]:
# Dataset class for hybrid model
class HybridDataset(data.Dataset):
    """One sample per interaction, paired with the user's and item's rating vectors."""

    def __init__(self, rating_matrix, interactions_df, mask_target=True):
        """
        Args:
            rating_matrix: (num_users, num_items) rating matrix
            interactions_df: DataFrame with columns ['user_id', 'item_id', 'rating']
            mask_target: When True (default), the sample's own (user, item)
                entry is zeroed in the returned vectors and masks, so the
                rating being predicted is not part of the model's input.
                Pass False to return the rows and columns unmodified.
        """
        self.rating_matrix = rating_matrix
        self.interactions_df = interactions_df.reset_index(drop=True)
        self.mask_target = mask_target

        # Create masks (1 where rating exists, 0 otherwise)
        self.user_mask = (rating_matrix > 0).astype(np.float32)
        self.item_mask = (rating_matrix.T > 0).astype(np.float32)  # Transpose for items

        # Plain arrays make per-sample lookup much cheaper than
        # DataFrame.iloc, which builds a Series on every call.
        self._user_ids = self.interactions_df['user_id'].to_numpy(dtype=np.int64)
        self._item_ids = self.interactions_df['item_id'].to_numpy(dtype=np.int64)
        self._ratings = self.interactions_df['rating'].to_numpy(dtype=np.float32)

    def __len__(self):
        """Number of interactions."""
        return len(self.interactions_df)

    def __getitem__(self, idx):
        """Return ``(user_id, item_id, rating, user_vec, item_vec, user_mask, item_mask)``.

        The first three are 1-element tensors. ``user_vec`` / ``user_mask``
        have length num_items and ``item_vec`` / ``item_mask`` have length
        num_users.
        """
        user_id = int(self._user_ids[idx])
        item_id = int(self._item_ids[idx])
        rating = float(self._ratings[idx])

        # torch.tensor always copies, so the edits below never touch the
        # stored matrix or masks.
        user_vec = torch.tensor(self.rating_matrix[user_id], dtype=torch.float32)
        item_vec = torch.tensor(self.rating_matrix[:, item_id], dtype=torch.float32)
        user_mask = torch.tensor(self.user_mask[user_id], dtype=torch.float32)
        item_mask = torch.tensor(self.item_mask[item_id], dtype=torch.float32)

        if self.mask_target:
            # Hide the label from the inputs. Left in place, the encoders
            # would see the rating they are asked to predict. The mask entry
            # is cleared too, so a reconstruction loss that uses these masks
            # does not treat the hidden entry as an observed 0.
            user_vec[item_id] = 0.0
            user_mask[item_id] = 0.0
            item_vec[user_id] = 0.0
            item_mask[user_id] = 0.0

        return (
            torch.LongTensor([user_id]),
            torch.LongTensor([item_id]),
            torch.FloatTensor([rating]),
            user_vec,
            item_vec,
            user_mask,
            item_mask
        )

# Create datasets
train_dataset = HybridDataset(train_mat, train_df)
# Held-out pairs are scored from the *training* matrix: the user and item
# vectors fed to the encoders must not contain test ratings, and
# evaluate_ranking_metrics also encodes from train_mat.
test_dataset = HybridDataset(train_mat, test_df)

print(f"✓ Datasets created")
print(f"  - Train samples: {len(train_dataset):,}")
print(f"  - Test samples: {len(test_dataset):,}")


✓ Datasets created
  - Train samples: 800,167
  - Test samples: 200,042


In [33]:
# Mini-batch size shared by both loaders.
batch_size = 256

# Training batches are reshuffled; loading happens in the main process.
train_loader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=0
)

# Evaluation batches keep the dataset order.
test_loader = data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=0
)

print(f"✓ Data loaders created (batch_size={batch_size})")


✓ Data loaders created (batch_size=256)


In [35]:
# Pick the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Architecture hyper-parameters.
latent_dim = 64
mlp_layers = [128, 64]

model = HybridAutoRecNCF(num_users, num_items, latent_dim, mlp_layers)
model = model.to(device)

# Adam with a small L2 penalty applied through weight_decay.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

print(f"✓ Model initialized")
print(f"  - Latent dim: {latent_dim}")
print(f"  - MLP layers: {mlp_layers}")
print(f"  - Total parameters: {sum(p.numel() for p in model.parameters()):,}")

print(f"model architecture: {model}")

Using device: cpu
✓ Model initialized
  - Latent dim: 64
  - MLP layers: [128, 64]
  - Total parameters: 1,286,419
model architecture: HybridAutoRecNCF(
  (user_autorec): AutoRecEncoder(
    (encoder): Linear(in_features=3706, out_features=64, bias=True)
    (decoder): Linear(in_features=64, out_features=3706, bias=True)
  )
  (item_autorec): AutoRecEncoder(
    (encoder): Linear(in_features=6040, out_features=64, bias=True)
    (decoder): Linear(in_features=64, out_features=6040, bias=True)
  )
  (ncf): NCF(
    (gmf): Linear(in_features=64, out_features=64, bias=True)
    (mlp): Sequential(
      (0): Linear(in_features=128, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=64, bias=True)
      (3): ReLU()
    )
    (output): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [36]:
# Helper functions for HitRate and NDCG
def hit(gt_items, pred_items):
    """Return 1.0 when at least one ground-truth item is in ``pred_items``, else 0.0."""
    if not len(gt_items):
        return 0.0
    for candidate in gt_items:
        if candidate in pred_items:
            return 1.0
    return 0.0

def ndcg(gt_items, pred_items):
    """Binary-relevance NDCG of ``pred_items`` against ``gt_items``.

    Position ``p`` (0-based) carries the discount ``1 / log2(p + 2)``. The
    ideal DCG assumes ``min(len(gt_items), len(pred_items))`` hits at the top.
    Returns 0.0 when ``gt_items`` is empty or the ideal DCG is 0.
    """
    if len(gt_items) == 0:
        return 0.0

    # Discount attached to every position of the ranked list.
    discounts = [1.0 / np.log2(position + 2) for position in range(len(pred_items))]

    # DCG: add up the discounts of the positions holding a relevant item.
    dcg = sum(
        (gain for gain, item in zip(discounts, pred_items) if item in gt_items),
        0.0,
    )

    # IDCG: the best case puts every attainable hit at the head of the list.
    ideal_hits = min(len(gt_items), len(pred_items))
    idcg = sum(discounts[:ideal_hits])

    if idcg == 0:
        return 0.0
    return dcg / idcg

# Evaluation function for RMSE
def evaluate(model, dataloader, device):
    """Compute the mean total loss and the rating RMSE over ``dataloader``.

    Args:
        model: Module called as ``model(user_vecs, item_vecs, user_ids, item_ids)``
            and returning ``(pred, user_recon, item_recon)``.
        dataloader: Yields 7-tuples ``(user_ids, item_ids, ratings, user_vecs,
            item_vecs, user_mask, item_mask)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, rmse)`` as Python floats. ``mean_loss`` is the
        sample-weighted mean of the per-batch total loss. ``rmse`` is the
        square root of the mean squared rating error over all samples. Both
        are NaN when the loader yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    squared_error_sum = 0.0
    num_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            (user_ids, item_ids, ratings,
             user_vecs, item_vecs,
             user_mask, item_mask) = batch

            # reshape(-1) always yields a 1-D tensor; squeeze() would produce
            # a 0-d tensor for a batch of one, on which len() raises.
            user_ids = user_ids.to(device).reshape(-1)
            item_ids = item_ids.to(device).reshape(-1)
            ratings = ratings.to(device).reshape(-1)
            user_vecs = user_vecs.to(device)
            item_vecs = item_vecs.to(device)
            user_mask = user_mask.to(device)
            item_mask = item_mask.to(device)

            pred, user_recon, item_recon = model(
                user_vecs, item_vecs, user_ids, item_ids
            )
            pred = pred.reshape(-1)

            loss = total_loss(
                pred, ratings,
                user_recon, user_vecs, user_mask,
                item_recon, item_vecs, item_mask
            )

            batch_len = ratings.numel()
            loss_sum += loss.item() * batch_len
            # Accumulate the squared error itself. Averaging per-batch RMSEs
            # is not the dataset RMSE: by Jensen's inequality the mean of
            # square roots never exceeds the square root of the mean.
            squared_error_sum += F.mse_loss(pred, ratings, reduction="sum").item()
            num_samples += batch_len

    if num_samples == 0:
        # No data: report NaN instead of raising ZeroDivisionError.
        return float("nan"), float("nan")

    return loss_sum / num_samples, (squared_error_sum / num_samples) ** 0.5

# Evaluation function for HitRate and NDCG
def _score_pairs_factored(model, train_mat_tensor, num_users, num_items, chunk_size):
    """Score all (user, item) pairs, encoding every user and every item once."""
    predictions = np.zeros((num_users, num_items), dtype=np.float32)
    if num_users == 0 or num_items == 0:
        return predictions

    def encode(encoder, rows):
        # The encoders return (reconstruction, latent); only the latent is kept.
        latents = [
            encoder(rows[start:start + chunk_size])[1]
            for start in range(0, rows.shape[0], chunk_size)
        ]
        return torch.cat(latents, dim=0)

    user_z = encode(model.user_autorec, train_mat_tensor[:num_users])
    # An item's input vector is its column of the rating matrix.
    item_z = encode(model.item_autorec, train_mat_tensor.t()[:num_items])

    for u_start in range(0, num_users, chunk_size):
        u_block = user_z[u_start:u_start + chunk_size]
        for i_start in range(0, num_items, chunk_size):
            i_block = item_z[i_start:i_start + chunk_size]
            n_u, n_i = u_block.shape[0], i_block.shape[0]
            # Pair every user of the block with every item of the block.
            pair_u = u_block.unsqueeze(1).expand(n_u, n_i, -1).reshape(n_u * n_i, -1)
            pair_i = i_block.unsqueeze(0).expand(n_u, n_i, -1).reshape(n_u * n_i, -1)
            scores = model.ncf(pair_u, pair_i).reshape(n_u, n_i)
            predictions[u_start:u_start + n_u, i_start:i_start + n_i] = scores.cpu().numpy()
    return predictions


def _score_pairs_generic(model, train_mat_tensor, num_users, num_items, chunk_size, device):
    """Score all (user, item) pairs through ``model``'s forward(), one item at a time."""
    predictions = np.zeros((num_users, num_items), dtype=np.float32)
    for start in range(0, num_users, chunk_size):
        end = min(start + chunk_size, num_users)
        rows = end - start
        user_vecs = train_mat_tensor[start:end]  # (rows, num_items)
        user_ids = torch.arange(start, end, device=device)
        for item_id in range(num_items):
            # Every user in the chunk is paired with the same item column.
            item_vecs = train_mat_tensor[:, item_id].unsqueeze(0).expand(rows, -1)
            item_ids = torch.full((rows,), item_id, device=device)
            pred, _, _ = model(user_vecs, item_vecs, user_ids, item_ids)
            predictions[start:end, item_id] = pred.reshape(-1).cpu().numpy()
    return predictions


def evaluate_ranking_metrics(model, train_mat, test_mat, num_users, num_items, top_k=10, device=None):
    """
    Evaluate HitRate@K and NDCG@K metrics.

    For each user:
    1. Get predictions for all items using training data
    2. Mask out items seen in training
    3. Get top-K items
    4. Calculate HR and NDCG based on test items

    Users with no test items are skipped.

    Args:
        model: Scoring model. When it exposes ``user_autorec``, ``item_autorec``
            and ``ncf`` submodules, each user and item is encoded once and only
            ``ncf`` runs per pair. This assumes forward() is exactly that
            composition, as in HybridAutoRecNCF; scores may differ from
            forward() by floating-point rounding. Any other model is scored
            through its forward().
        train_mat: (num_users, num_items) array of training ratings, 0 = unseen.
        test_mat: (num_users, num_items) array of held-out ratings, 0 = none.
        num_users: Number of users to score.
        num_items: Number of items to score.
        top_k: Length of the recommendation list.
        device: Device to run on; defaults to CUDA when available, else CPU.

    Returns:
        ``(mean_HR, mean_NDCG)`` over users with at least one test item, or
        ``(0.0, 0.0)`` when there is no such user.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.eval()

    train_mat_tensor = torch.as_tensor(train_mat, dtype=torch.float32, device=device)

    # Get training mask
    train_mask = (train_mat > 0)

    # Users/items are processed in chunks to bound memory use.
    chunk_size = 256
    factored = all(
        hasattr(model, name) for name in ("user_autorec", "item_autorec", "ncf")
    )

    with torch.no_grad():
        if factored:
            # Re-running both encoders for every (user chunk, item) pair repeats
            # the same loop-invariant work thousands of times.
            predictions = _score_pairs_factored(
                model, train_mat_tensor, num_users, num_items, chunk_size
            )
        else:
            predictions = _score_pairs_generic(
                model, train_mat_tensor, num_users, num_items, chunk_size, device
            )

    # Mask out items seen in training so they rank last.
    predictions[train_mask] = -1e10

    HR_list = []
    NDCG_list = []

    # Calculate metrics for each user
    for user_id in range(num_users):
        # Get ground truth items for this user (items rated in test set)
        test_items = set(np.where(test_mat[user_id] > 0)[0])

        # Skip if user has no test items
        if len(test_items) == 0:
            continue

        # Get top-K items for this user, best first
        top_k_indices = np.argsort(predictions[user_id])[-top_k:][::-1]
        top_k_items = top_k_indices.tolist()

        # Calculate metrics
        HR_list.append(hit(test_items, top_k_items))
        NDCG_list.append(ndcg(test_items, top_k_items))

    # Calculate average metrics
    if len(HR_list) == 0:
        return 0.0, 0.0

    mean_HR = np.mean(HR_list)
    mean_NDCG = np.mean(NDCG_list)

    return mean_HR, mean_NDCG


In [None]:
# Training loop with metrics tracking
num_epochs = 20
best_test_rmse = float('inf')
best_epoch = 0
top_k = 10

# Create model save directory
model_dir = os.path.join(os.path.dirname(os.getcwd()), '..', 'models')
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'HybridAutoRecNCF.pth')

# Track metrics for visualization
training_history = {
    'epoch': [],
    'train_loss': [],
    'test_loss': [],
    'test_rmse': [],
    'hit_rate': [],
    'ndcg': []
}

print("=" * 70)
print("Starting Training")
print("=" * 70)
print(f"Epochs: {num_epochs}")
print(f"Batch size: {batch_size}")
print(f"Learning rate: {optimizer.param_groups[0]['lr']}")
print(f"Top-K for ranking metrics: {top_k}")
print("=" * 70)

for epoch in range(num_epochs):
    # Training
    train_loss = train(model, train_loader, optimizer, device)

    # Evaluation (RMSE)
    test_loss, test_rmse = evaluate(model, test_loader, device)

    # Evaluation (HitRate and NDCG) - compute every epoch or every few epochs
    # Note: This is computationally expensive, so we do it every epoch
    # You can modify to compute less frequently if needed
    print(f"  Computing ranking metrics (HR@{top_k}, NDCG@{top_k})...")
    hit_rate, ndcg_score = evaluate_ranking_metrics(
        model, train_mat, test_mat, num_users, num_items, top_k=top_k, device=device
    )

    # Save best model based on RMSE.
    # NOTE(review): the checkpoint is selected on the test split, so the
    # reported best RMSE is optimistic; a separate validation split would
    # avoid that.
    improved = test_rmse < best_test_rmse
    if improved:
        best_test_rmse = test_rmse
        best_epoch = epoch
        torch.save(model.state_dict(), model_path)

    # Track metrics
    training_history['epoch'].append(epoch + 1)
    training_history['train_loss'].append(train_loss)
    training_history['test_loss'].append(test_loss)
    training_history['test_rmse'].append(test_rmse)
    training_history['hit_rate'].append(hit_rate)
    training_history['ndcg'].append(ndcg_score)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Test Loss: {test_loss:.4f} | Test RMSE: {test_rmse:.4f}")
    print(f"  HR@{top_k}: {hit_rate:.4f} | NDCG@{top_k}: {ndcg_score:.4f}")
    # Report a save only when one actually happened. `epoch == best_epoch` is
    # also true at epoch 0 when nothing was saved (e.g. a NaN RMSE), because
    # best_epoch starts at 0.
    if improved:
        print(f"  ✓ Best model saved (RMSE: {best_test_rmse:.4f})")
    print("-" * 70)

print("=" * 70)
print("Training Complete!")
print(f"Best model at epoch {best_epoch+1} with RMSE: {best_test_rmse:.4f}")
print(f"Model saved to: {model_path}")
print("=" * 70)


Starting Training
Epochs: 20
Batch size: 256
Learning rate: 0.001
Top-K for ranking metrics: 10
  Computing ranking metrics (HR@10, NDCG@10)...


In [None]:
# Visualization of training metrics
def _draw_metric_panel(ax, epochs, values, fmt, label, ylabel, title,
                       mark_best_epoch=False, clamp_ylim=False):
    """Draw one metric-vs-epoch curve on ``ax`` with shared labelling and ticks."""
    ax.plot(epochs, values, fmt, label=label, linewidth=2, markersize=6)
    if mark_best_epoch:
        ax.axvline(x=best_epoch + 1, color='r', linestyle='--', alpha=0.7, label=f'Best Epoch ({best_epoch + 1})')
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend()
    # Show every second tick once there are more than ten epochs.
    ax.set_xticks(epochs[::2] if len(epochs) > 10 else epochs)
    if clamp_ylim:
        ax.set_ylim([0, max(1.0, max(values) * 1.1)])


fig, axes = plt.subplots(1, 3, figsize=(18, 5))

epochs = training_history['epoch']

# Plot 1: RMSE, with the best epoch marked
_draw_metric_panel(
    axes[0], epochs, training_history['test_rmse'], 'b-o',
    label='Test RMSE', ylabel='RMSE', title='Test RMSE Over Epochs',
    mark_best_epoch=True,
)

# Plot 2: Hit Rate
_draw_metric_panel(
    axes[1], epochs, training_history['hit_rate'], 'g-o',
    label=f'HR@{top_k}', ylabel='Hit Rate', title=f'Hit Rate@{top_k} Over Epochs',
    clamp_ylim=True,
)

# Plot 3: NDCG
_draw_metric_panel(
    axes[2], epochs, training_history['ndcg'], 'm-o',
    label=f'NDCG@{top_k}', ylabel='NDCG', title=f'NDCG@{top_k} Over Epochs',
    clamp_ylim=True,
)

plt.tight_layout()
plt.show()

# Print final metrics
print("\n" + "=" * 70)
print("Final Training Metrics")
print("=" * 70)
print(f"Best RMSE: {best_test_rmse:.4f} (Epoch {best_epoch + 1})")
print(f"Final HR@{top_k}: {training_history['hit_rate'][-1]:.4f}")
print(f"Final NDCG@{top_k}: {training_history['ndcg'][-1]:.4f}")
print(f"Best HR@{top_k}: {max(training_history['hit_rate']):.4f} (Epoch {training_history['hit_rate'].index(max(training_history['hit_rate'])) + 1})")
print(f"Best NDCG@{top_k}: {max(training_history['ndcg']):.4f} (Epoch {training_history['ndcg'].index(max(training_history['ndcg'])) + 1})")
print("=" * 70)
