In [1]:
import os
import sys

# Starting directory for the search (the notebook's working directory).
current_dir = os.getcwd()

# Climb from the working directory towards the filesystem root, stopping at
# the first directory that is itself named 'src'.
src_path = None
path = current_dir

while os.path.basename(path) != "src":
    parent = os.path.dirname(path)
    if parent == path:
        # dirname() no longer changes the path: we hit the root without a match.
        break
    path = parent
else:
    # Loop ended because the basename matched, not because of the break.
    src_path = path

# Make the local packages importable, without duplicating the entry on re-run.
if src_path is not None:
    if src_path not in sys.path:
        sys.path.insert(0, src_path)


import time
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

# local imports
from utils.ncfdata import NCFData
from helpers.ncf_model import NCF
from helpers import download_ml1m_dataset
from helpers.ranking_metrics import hit, ndcg
from utils.load_all_data import load_all_data
from utils.ml_to_ncf import preprocess_ml1m_to_ncf_format

In [None]:
def train_and_evaluate_embedding_size(factor_num, epochs=10):
    """
    Train a GMF model with a specific embedding size and return its best HR and NDCG.

    This is a simplified training loop meant for quickly comparing embedding
    sizes. It depends on notebook-level state that must already be defined when
    it is called: ``user_num``, ``item_num``, ``num_layers``, ``dropout_rate``,
    ``learning_rate``, ``device``, ``top_k``, ``train_dataset``,
    ``train_loader``, ``test_loader`` and ``evaluate_metrics``.

    Args:
        factor_num: Embedding size, passed to ``NCF`` as ``factor_num``.
        epochs: Number of training epochs. Metrics are evaluated after every
            5th epoch and after the last epoch.

    Returns:
        Tuple ``(best_hr, best_ndcg)``. Each value is the maximum seen over all
        evaluation points; the two maxima are tracked independently and may
        come from different epochs. Both are ``0.0`` when no evaluation ran
        (``epochs <= 0``). No RMSE is computed.

    Note:
        The progress message labels the metrics "@10", while the cutoff that is
        actually passed to ``evaluate_metrics`` is the notebook-level ``top_k``.
    """
    print(f"  Training with embedding size: {factor_num}...")

    # Fresh, untrained GMF model for this embedding size
    test_model = NCF(
        user_num=user_num,
        item_num=item_num,
        factor_num=factor_num,
        num_layers=num_layers,
        dropout=dropout_rate,
        model_name='GMF',
        GMF_model=None,
        MLP_model=None
    )

    # Decide once whether to run on the GPU. The string comparison is kept
    # as-is so device placement stays in step with whatever evaluate_metrics
    # does with the same `device` value.
    use_cuda = device == 'cuda' and torch.cuda.is_available()
    if use_cuda:
        test_model = test_model.cuda()

    # Setup optimizer and loss (the model output is fed to a with-logits loss)
    test_optimizer = optim.Adam(test_model.parameters(), lr=learning_rate)
    test_loss_function = nn.BCEWithLogitsLoss()

    best_ndcg = 0.0
    best_hr = 0.0

    for epoch in range(epochs):
        test_model.train()
        # presumably redraws the negative samples for this epoch -- confirm in NCFData
        train_dataset.ng_sample()

        for user, item, label in train_loader:
            label = label.float()  # BCEWithLogitsLoss needs float targets
            if use_cuda:
                user, item, label = user.cuda(), item.cuda(), label.cuda()

            test_optimizer.zero_grad()
            prediction = test_model(user, item)
            loss = test_loss_function(prediction, label)
            loss.backward()
            test_optimizer.step()

        # Evaluate every 5th epoch and always after the final one
        if (epoch + 1) % 5 == 0 or epoch == epochs - 1:
            test_model.eval()

            # No gradients are needed for ranking metrics; skip graph building
            with torch.no_grad():
                HR, NDCG = evaluate_metrics(test_model, test_loader, top_k, device)

            # Track best metrics
            if NDCG > best_ndcg:
                best_ndcg = NDCG
            if HR > best_hr:
                best_hr = HR

    print(f" HR@10: {best_hr:.4f}, NDCG@10: {best_ndcg:.4f}")
    return best_hr, best_ndcg

# Embedding sizes compared in the sweep
embedding_sizes = [8, 16, 32, 64, 128]
print(f"\nTraining models with different embedding sizes: {embedding_sizes}")
print("Note: This may take some time. Using reduced epochs for speed.")

# Parallel result lists: position i holds the metric for embedding_sizes[i]
hr_results, ndcg_results = [], []

for size in embedding_sizes:
    # One full train/evaluate run per size, 10 epochs each to keep the sweep short
    hr_val, ndcg_val = train_and_evaluate_embedding_size(size, epochs=10)
    hr_results += [hr_val]
    ndcg_results += [ndcg_val]

In [None]:
# Two-panel figure: HR@10 (left) and NDCG@10 (right) against embedding size.
# NOTE(review): `plt` is not imported in the visible import cell -- confirm that
# matplotlib.pyplot is imported as `plt` before this cell runs.
fig_embedding, axes = plt.subplots(1, 2, figsize=(18, 5))

# One entry per panel, left to right:
# (metric label, metric values, line/marker format, marker fill colour)
panel_specs = [
    ('HR@10', hr_results, 'g-o', 'lightgreen'),
    ('NDCG@10', ndcg_results, 'b-o', 'lightblue'),
]

for ax, (metric_name, metric_values, line_fmt, marker_fill) in zip(axes, panel_specs):
    ax.plot(embedding_sizes, metric_values, line_fmt, linewidth=2, markersize=8,
            markerfacecolor=marker_fill, markeredgewidth=2)
    ax.set_xlabel('Embedding Size', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric_name, fontsize=12, fontweight='bold')
    ax.set_title(f'{metric_name} vs Embedding Size', fontsize=13, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.set_xticks(embedding_sizes)

    # Pad the y-range by 5% on each side; if every value is 0 the range is
    # degenerate, so leave matplotlib's autoscaling in place instead.
    y_low, y_high = min(metric_values) * 0.95, max(metric_values) * 1.05
    if y_low < y_high:
        ax.set_ylim([y_low, y_high])

    # Add value labels on points. The loop variables are deliberately NOT named
    # `ndcg`/`hit`: those names are the metric functions imported at the top of
    # the notebook, and rebinding them here would break later evaluation.
    for emb_size, metric_value in zip(embedding_sizes, metric_values):
        ax.annotate(f'{metric_value:.3f}', (emb_size, metric_value),
                    textcoords="offset points", xytext=(0, 10),
                    ha='center', fontsize=9)

plt.suptitle('Figure 4.2: Impact of Embedding Size on Model Performance',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()

# Build the output path once and reuse it for saving and for the message
figure_path = os.path.join(model_path, 'figure_4.2_embedding_size.png')
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
print(f"\n✓ Figure 4.2 saved to: {figure_path}")
plt.show()

# Print the banner and column header of the embedding-size summary table.
# NOTE(review): the header has an 'RMSE' column, but the sweep in this notebook
# (train_and_evaluate_embedding_size) only returns HR and NDCG -- confirm that
# the rows printed for this table have an RMSE value, or drop the column.
print("\n" + "=" * 70)
print("Embedding Size Impact Summary")
print("=" * 70)
print(f"{'Embedding Size':<15} {'RMSE':<10} {'HR@10':<10} {'NDCG@10':<10}")
print("-" * 70)