In [1]:
# Import AutoRec model and data utilities
from utils.model import AutoRec
from utils.autorecdata import AutoRecData
from utils.preprocessor import PreProcessor
# Import external libraries
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm

print("=" * 70)
print("Imports Successful")
print("=" * 70)

Imports Successful


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
data_dir = os.path.join(os.path.dirname(os.getcwd()), '..', 'data', 'ml-1m')
data_path = os.path.join(data_dir, 'ratings.dat')

# Check if file exists
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Data file not found at: {data_path}\n"
        f"Please ensure the MovieLens 1M dataset is downloaded and extracted."
    )

def load_ml_1m_data(data_path = data_path) -> pd.DataFrame:  
    print("=" * 70)
    print("Loading MovieLens 1M Dataset")
    print("=" * 70)
    print(f"Data path: {data_path}")
    return pd.read_csv(
        data_path,
        sep='::',
        header=None,
        names=['user_id', 'item_id', 'rating', 'timestamp'],
        engine='python',  # Explicitly use python engine to avoid warning
        dtype={
            'user_id': np.int32,
            'item_id': np.int32,
            'rating': np.float32,
            'timestamp': np.int32
        }
    )


# Load ratings data with proper engine to avoid warnings
print("\nLoading ratings data...")

ratings_df = load_ml_1m_data()

print(f"✓ Successfully loaded {len(ratings_df):,} ratings")
print("=" * 70)


Loading ratings data...
Loading MovieLens 1M Dataset
Data path: /Users/abbas/Documents/Codes/thesis/NCF/src/../data/ml-1m/ratings.dat
✓ Successfully loaded 1,000,209 ratings


In [3]:
# Create PreProcessor instance
preprocessor = PreProcessor()

# Preprocess the data: split into train/test and create rating matrices
print("\nSplitting data into train/test sets and creating rating matrices...")
print("=" * 70)
train_mat, test_mat, num_users, num_items = preprocessor.preprocess_ml1m_data(
    ratings_df, 
    test_size=0.2,  # 20% for testing
    random_state=42  # For reproducibility
)

print(f"✓ Data preprocessing complete!")
print("=" * 70)


Splitting data into train/test sets and creating rating matrices...
✓ Data preprocessing complete!


In [4]:
train_set = AutoRecData(data=train_mat)
test_set = AutoRecData(data=test_mat)

In [5]:
train_loader = data.DataLoader(
    dataset=train_set,
    batch_size=256,
    shuffle=True,
    num_workers=0,
)
print('Train loader created')
print("=" * 70)
test_loader = data.DataLoader(
    dataset=test_set, batch_size=len(test_set), shuffle=False, num_workers=0
)
print('Test loader created')
print("=" * 70)

Train loader created
Test loader created


In [6]:
# Set device (CPU or CUDA if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = AutoRec(
    num_users=num_users,
    num_items=num_items,
    num_hidden_units=500,
).to(device)

Using device: cpu


In [7]:
loss_f = nn.MSELoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=0.01,
)

best_epoch, best_rmse = 0, np.inf
best_hr_at_10, best_ndcg_at_10 = 0.0, 0.0


print(model)

AutoRec(
  (encoder): Sequential(
    (0): Linear(in_features=3706, out_features=500, bias=True)
    (1): Sigmoid()
  )
  (decoder): Sequential(
    (0): Linear(in_features=500, out_features=3706, bias=True)
  )
)


In [None]:
from utils.helper import get_metrics, get_ranking_metrics
for epoch in tqdm(range(40)):
    model.train()
    for input_vec in train_loader:
        input_mask = (input_vec > 0).to(device)
        input_vec = input_vec.float().to(device)

        model.zero_grad()
        reconstruction = model(input_vec)
        loss = loss_f(reconstruction * input_mask, input_vec * input_mask)
        loss.backward()
        optimizer.step()

    model.eval()
    rmse = get_metrics(model=model, train_set=train_set, test_set=test_set, device=device)
    hr_at_10, ndcg_at_10 = get_ranking_metrics(
        model=model, 
        train_set=train_set, 
        test_set=test_set, 
        top_k=10, 
        device=device
    )

    print(f"[Epoch {epoch}]:: RMSE: {rmse:.6f}, HR@10: {hr_at_10:.6f}, NDCG@10: {ndcg_at_10:.6f}")

    print("=" * 70)

    if rmse < best_rmse:
        best_rmse, best_epoch = rmse, epoch
        best_hr_at_10, best_ndcg_at_10 = hr_at_10, ndcg_at_10
        # Save best model
        model_dir = os.path.join(os.path.dirname(os.getcwd()), '..', 'models')
        os.makedirs(model_dir, exist_ok=True)
        best_model_path = os.path.join(model_dir, 'AutoRec-best.pth')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'rmse': rmse,
            'hr_at_10': hr_at_10,
            'ndcg_at_10': ndcg_at_10,
            'num_users': num_users,
            'num_items': num_items,
            'num_hidden_units': 500,
        }, best_model_path)

    print(f"Done. Best epoch {best_epoch}, best_rmse: {best_rmse:.6f}.")
    print(epoch)
    print("=" * 70)

# Save final model after training
print("\n" + "=" * 70)
print("Saving final model...")
print("=" * 70)
model_dir = os.path.join(os.path.dirname(os.getcwd()), '..', 'models')
os.makedirs(model_dir, exist_ok=True)
final_model_path = os.path.join(model_dir, 'AutoRec.pth')
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'rmse': rmse,
    'hr_at_10': hr_at_10,
    'ndcg_at_10': ndcg_at_10,
    'num_users': num_users,
    'num_items': num_items,
    'num_hidden_units': 500,
    'best_epoch': best_epoch,
    'best_rmse': best_rmse,
    'best_hr_at_10': best_hr_at_10,
    'best_ndcg_at_10': best_ndcg_at_10,
}, final_model_path)
print(f"✓ Final model saved to: {final_model_path}")
print(f"✓ Best model (epoch {best_epoch}) saved to: {os.path.join(model_dir, 'AutoRec-best.pth')}")
print(f"\nBest metrics:")
print(f"  RMSE: {best_rmse:.6f}")
print(f"  HR@10: {best_hr_at_10:.6f}")
print(f"  NDCG@10: {best_ndcg_at_10:.6f}")
print("=" * 70)

  2%|▎         | 1/40 [00:01<00:57,  1.47s/it]

[Epoch 0]:: RMSE: 2.500757, HR@10: 0.658496, NDCG@10: 0.192537
Done. Best epoch 0, best_rmse: 2.500757.
0


  5%|▌         | 2/40 [00:02<00:52,  1.39s/it]

[Epoch 1]:: RMSE: 2.316331, HR@10: 0.650547, NDCG@10: 0.188294
Done. Best epoch 1, best_rmse: 2.316331.
1


  8%|▊         | 3/40 [00:04<00:50,  1.35s/it]

[Epoch 2]:: RMSE: 2.261732, HR@10: 0.653528, NDCG@10: 0.187431
Done. Best epoch 2, best_rmse: 2.261732.
2


 10%|█         | 4/40 [00:05<00:48,  1.35s/it]

[Epoch 3]:: RMSE: 2.260826, HR@10: 0.654521, NDCG@10: 0.186811
Done. Best epoch 3, best_rmse: 2.260826.
3


 12%|█▎        | 5/40 [00:06<00:46,  1.32s/it]

[Epoch 4]:: RMSE: 2.268200, HR@10: 0.646572, NDCG@10: 0.187500
Done. Best epoch 3, best_rmse: 2.260826.
4


 15%|█▌        | 6/40 [00:08<00:44,  1.32s/it]

[Epoch 5]:: RMSE: 2.264537, HR@10: 0.649884, NDCG@10: 0.185480
Done. Best epoch 3, best_rmse: 2.260826.
5


 18%|█▊        | 7/40 [00:09<00:43,  1.33s/it]

[Epoch 6]:: RMSE: 2.266682, HR@10: 0.655018, NDCG@10: 0.187909
Done. Best epoch 3, best_rmse: 2.260826.
6


 20%|██        | 8/40 [00:10<00:41,  1.31s/it]

[Epoch 7]:: RMSE: 2.264211, HR@10: 0.650712, NDCG@10: 0.181934
Done. Best epoch 3, best_rmse: 2.260826.
7


 22%|██▎       | 9/40 [00:12<00:41,  1.33s/it]

[Epoch 8]:: RMSE: 2.254955, HR@10: 0.653528, NDCG@10: 0.186513
Done. Best epoch 8, best_rmse: 2.254955.
8


 25%|██▌       | 10/40 [00:13<00:40,  1.34s/it]

[Epoch 9]:: RMSE: 2.271948, HR@10: 0.648890, NDCG@10: 0.184257
Done. Best epoch 8, best_rmse: 2.254955.
9


 28%|██▊       | 11/40 [00:14<00:38,  1.32s/it]

[Epoch 10]:: RMSE: 2.262995, HR@10: 0.654853, NDCG@10: 0.187126
Done. Best epoch 8, best_rmse: 2.254955.
10


 30%|███       | 12/40 [00:15<00:36,  1.31s/it]

[Epoch 11]:: RMSE: 2.265923, HR@10: 0.646903, NDCG@10: 0.183589
Done. Best epoch 8, best_rmse: 2.254955.
11


 32%|███▎      | 13/40 [00:17<00:35,  1.30s/it]

[Epoch 12]:: RMSE: 2.297369, HR@10: 0.659656, NDCG@10: 0.185724
Done. Best epoch 8, best_rmse: 2.254955.
12


 35%|███▌      | 14/40 [00:18<00:34,  1.33s/it]

[Epoch 13]:: RMSE: 2.281664, HR@10: 0.654521, NDCG@10: 0.185325
Done. Best epoch 8, best_rmse: 2.254955.
13


 38%|███▊      | 15/40 [00:20<00:33,  1.36s/it]

[Epoch 14]:: RMSE: 2.271489, HR@10: 0.653528, NDCG@10: 0.188381
Done. Best epoch 8, best_rmse: 2.254955.
14


 40%|████      | 16/40 [00:21<00:32,  1.35s/it]

[Epoch 15]:: RMSE: 2.265495, HR@10: 0.644253, NDCG@10: 0.180806
Done. Best epoch 8, best_rmse: 2.254955.
15


 42%|████▎     | 17/40 [00:22<00:30,  1.33s/it]

[Epoch 16]:: RMSE: 2.320478, HR@10: 0.653528, NDCG@10: 0.183247
Done. Best epoch 8, best_rmse: 2.254955.
16


 45%|████▌     | 18/40 [00:24<00:29,  1.34s/it]

[Epoch 17]:: RMSE: 2.290147, HR@10: 0.660318, NDCG@10: 0.187755
Done. Best epoch 8, best_rmse: 2.254955.
17


 48%|████▊     | 19/40 [00:25<00:28,  1.34s/it]

[Epoch 18]:: RMSE: 2.269303, HR@10: 0.651871, NDCG@10: 0.185319
Done. Best epoch 8, best_rmse: 2.254955.
18


In [None]:
# Example: Load saved model for inference
# Uncomment and run this cell to load a saved model

# from utils.model import AutoRec
# import torch
# 
# # Load checkpoint
# model_dir = os.path.join(os.path.dirname(os.getcwd()), '..', 'models')
# checkpoint_path = os.path.join(model_dir, 'AutoRec.pth')  # or 'AutoRec-best.pth' for best model
# 
# checkpoint = torch.load(checkpoint_path, map_location='cpu')
# 
# # Recreate model with saved parameters
# loaded_model = AutoRec(
#     num_users=checkpoint['num_users'],
#     num_items=checkpoint['num_items'],
#     num_hidden_units=checkpoint['num_hidden_units']
# )
# 
# # Load model weights
# loaded_model.load_state_dict(checkpoint['model_state_dict'])
# loaded_model.eval()
# 
# print(f"Model loaded from: {checkpoint_path}")
# print(f"Epoch: {checkpoint['epoch']}")
# print(f"RMSE: {checkpoint['rmse']:.6f}")
# print(f"HR@10: {checkpoint['hr_at_10']:.6f}")
# print(f"NDCG@10: {checkpoint['ndcg_at_10']:.6f}")
# 
# # Now you can use loaded_model for inference
