In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Base project directory.
# Set the NCF_PROJECT_DIR environment variable to run the notebook from another
# machine or checkout; when it is unset the original local path is used.
BASE = Path(
    os.environ.get(
        "NCF_PROJECT_DIR",
        r"C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project",
    )
)

# Sub-directories holding the indexed splits and the per-user candidate pools.
SPLITS = BASE / "splits"
CANDS = BASE / "candidates"

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [2]:
# Indexed interaction splits: training interactions plus held-out targets.
train_idx, val_tgt, test_tgt = (
    pd.read_parquet(SPLITS / fname)
    for fname in (
        "train_indexed.parquet",
        "val_targets_indexed.parquet",
        "test_targets_indexed.parquet",
    )
)

# Per-user candidate pools that the evaluation ranks.
cand_val, cand_test = (
    pd.read_parquet(CANDS / fname) for fname in ("val.parquet", "test.parquet")
)

print("Train rows:", len(train_idx))
print("Val users:", len(pd.unique(val_tgt["uid"])))
print("Test users:", len(pd.unique(test_tgt["uid"])))

# Embedding table sizes: one slot per id from 0 up to the largest id in train.
num_users, num_items = (int(train_idx[col].max()) + 1 for col in ("uid", "iid"))

print("num_users =", num_users)
print("num_items =", num_items)

Train rows: 697181
Val users: 64301
Test users: 63492
num_users = 64541
num_items = 26531


In [3]:
class NCFDataset(Dataset):
    """Pointwise training set with on-the-fly negative sampling.

    Each element corresponds to one row of ``df`` (a positive ``(uid, iid)``
    pair) followed by ``num_neg`` negative items for the same user. Negatives
    are drawn uniformly from ``[0, num_items)`` with the global NumPy RNG and
    re-drawn until they are not among that user's positives in ``df``.

    Args:
        df: frame with integer-convertible ``uid`` and ``iid`` columns.
        num_items: exclusive upper bound of the item ids to sample from.
        num_neg: number of negatives generated per positive.

    Raises:
        ValueError: if ``num_neg > 0`` and some user's positives cover every
            id in ``[0, num_items)``; rejection sampling could never succeed
            for that user and ``__getitem__`` would loop forever.
    """

    def __init__(self, df, num_items, num_neg=4):
        self.users = df["uid"].values.astype(np.int64)
        self.items = df["iid"].values.astype(np.int64)
        self.num_items = num_items
        self.num_neg = num_neg

        # Build user->positive set to avoid sampling positives
        user_pos = {}
        for u, i in zip(self.users, self.items):
            user_pos.setdefault(u, set()).add(i)
        self.user_pos = user_pos

        # Fail fast instead of hanging inside the sampling loop later on.
        if num_neg > 0:
            for u, pos in user_pos.items():
                if len(pos) < num_items:
                    continue  # at least one id in range is free
                in_range = sum(1 for i in pos if 0 <= i < num_items)
                if in_range >= num_items:
                    raise ValueError(
                        f"user {int(u)} has interacted with all {num_items} items; "
                        "no negative item can be sampled"
                    )

    def __len__(self):
        """Number of positive interactions (one dataset element each)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(users, items, labels)`` tensors of length ``1 + num_neg``.

        Position 0 is the positive pair (label 1.0); the remaining positions
        are sampled negatives (label 0.0) for the same user.
        """
        u = self.users[idx]
        i = self.items[idx]

        # Positive sample
        user_list = [u]
        item_list = [i]
        label_list = [1.0]

        # Negative samples: rejection-sample until the item is not a positive.
        pos_items = self.user_pos[u]
        for _ in range(self.num_neg):
            j = np.random.randint(0, self.num_items)
            while j in pos_items:
                j = np.random.randint(0, self.num_items)
            user_list.append(u)
            item_list.append(j)
            label_list.append(0.0)

        return (
            torch.tensor(user_list, dtype=torch.long),
            torch.tensor(item_list, dtype=torch.long),
            torch.tensor(label_list, dtype=torch.float32),
        )

In [4]:
class NCF(nn.Module):
    """MLP-based collaborative filtering scorer.

    A user embedding and an item embedding (each ``emb_dim`` wide) are
    concatenated, passed through a stack of ``Linear -> ReLU`` layers whose
    widths are given by ``mlp_dims``, projected to a single unit and squashed
    with a sigmoid.
    """

    def __init__(self, num_users, num_items, emb_dim=64, mlp_dims=(128, 64, 32)):
        super().__init__()

        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)

        # Layer widths, starting from the concatenated embedding size.
        widths = [2 * emb_dim, *mlp_dims]
        tower = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            tower += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.mlp = nn.Sequential(*tower)

        self.out = nn.Linear(widths[-1], 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        """Return sigmoid scores for index tensors ``user`` and ``item``.

        The trailing singleton dimension of the output layer is squeezed away.
        """
        pair = torch.cat((self.user_emb(user), self.item_emb(item)), dim=-1)
        logit = self.out(self.mlp(pair))
        return self.sigmoid(logit).squeeze(-1)

In [5]:
# Training hyper-parameters.
BATCH_SIZE = 512
NUM_EPOCHS = 5
NEG_PER_POS = 4
LR = 1e-3
SEED = 42

# Seed every source of randomness used below so that a fresh-kernel run is
# repeatable: NumPy drives the negative sampling inside NCFDataset, torch
# drives weight initialisation and DataLoader shuffling.
np.random.seed(SEED)
torch.manual_seed(SEED)

train_ds = NCFDataset(train_idx, num_items=num_items, num_neg=NEG_PER_POS)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

model = NCF(num_users, num_items).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# The model already applies a sigmoid, so plain BCE on probabilities is used.
criterion = nn.BCELoss()

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for users, items, labels in tqdm(train_loader, desc=f"Epoch {epoch}"):
        # Each dataset element holds 1 positive + NEG_PER_POS negatives;
        # flatten the (batch, 1 + NEG_PER_POS) tensors into one long batch.
        users = users.view(-1).to(device)
        items = items.view(-1).to(device)
        labels = labels.view(-1).to(device)

        optimizer.zero_grad()
        preds = model(users, items)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch} - loss: {total_loss / len(train_loader):.4f}")

Epoch 1: 100%|██████████| 1362/1362 [01:19<00:00, 17.05it/s]


Epoch 1 - loss: 0.3901


Epoch 2: 100%|██████████| 1362/1362 [01:16<00:00, 17.74it/s]


Epoch 2 - loss: 0.3355


Epoch 3: 100%|██████████| 1362/1362 [01:13<00:00, 18.63it/s]


Epoch 3 - loss: 0.3157


Epoch 4: 100%|██████████| 1362/1362 [01:17<00:00, 17.55it/s]


Epoch 4 - loss: 0.3017


Epoch 5: 100%|██████████| 1362/1362 [01:16<00:00, 17.92it/s]

Epoch 5 - loss: 0.2927





In [6]:
def eval_split(model, cand_df, tgt_df, topk=10, max_users=None):
    """Evaluate a scoring model on a val/test split using per-user candidate pools.

    ``cand_df`` (columns ``uid``, ``candidates``) is inner-joined with
    ``tgt_df`` (columns ``uid``, ``iid``) on ``uid``. For every joined row the
    model scores all candidates of that user, the candidates are ranked by
    descending score and the top ``topk`` are compared with the target item.

    Rows whose target is not contained in the candidate list are skipped and
    do NOT enter the averages, so the reported numbers are conditional on the
    target being in the pool (the skipped count is printed).

    Args:
        model: module called as ``model(users, items)`` on long tensors and
            returning one score per pair.
        cand_df: candidate pools, one iterable of item ids per row.
        tgt_df: held-out targets.
        topk: cut-off used for all metrics.
        max_users: if given, only the first ``max_users`` joined rows are used.

    Returns:
        dict with keys ``"HR"``, ``"NDCG"``, ``"Precision"`` and ``"F1"``.
        When no row could be evaluated every metric is ``0.0`` (instead of the
        NaN that averaging an empty list would produce).
    """
    model.eval()

    # Score on the device the model actually lives on instead of relying on a
    # notebook-level global; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    tgt = tgt_df[["uid", "iid"]].rename(columns={"iid": "target_iid"})
    df = cand_df.merge(tgt, on="uid", how="inner")

    if max_users is not None:
        df = df.iloc[:max_users]

    hits, ndcgs, precs = [], [], []
    skipped = 0

    for row in tqdm(df.itertuples(), total=len(df), desc="Eval users"):
        uid = int(row.uid)
        cands = [int(c) for c in row.candidates]
        target = int(row.target_iid)

        # skip if target not in candidate pool
        if target not in cands:
            skipped += 1
            continue

        # Embedding lookups need integer (long) indices; be explicit rather
        # than trusting dtype inference on the candidate list.
        items = torch.tensor(cands, dtype=torch.long, device=model_device)
        users = torch.full_like(items, uid)

        with torch.no_grad():
            scores = model(users, items).cpu().numpy()

        ranking_idx = np.argsort(-scores)
        topk_items = [cands[i] for i in ranking_idx[:topk]]

        hit = int(target in topk_items)
        hits.append(hit)

        if hit:
            # Single relevant item: DCG is 1/log2(rank + 1) and the ideal DCG is 1.
            rank = topk_items.index(target) + 1
            ndcgs.append(1 / np.log2(rank + 1))
        else:
            ndcgs.append(0)

        precs.append(hit / topk)

    if hits:
        hr = float(np.mean(hits))
        ndcg = float(np.mean(ndcgs))
        prec = float(np.mean(precs))
    else:
        # Nothing was evaluated (empty join or every target missing from its
        # pool): report zeros rather than NaN plus a NumPy RuntimeWarning.
        hr = ndcg = prec = 0.0
    recall = hr  # exactly one target per row, so recall@k equals the hit rate
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0

    print(f"Evaluated users: {len(hits)}, skipped={skipped}")
    print(f"HR@{topk}: {hr:.4f}")
    print(f"NDCG@{topk}: {ndcg:.4f}")
    print(f"Prec@{topk}: {prec:.4f}")
    print(f"F1@{topk}: {f1:.4f}")

    return {"HR": hr, "NDCG": ndcg, "Precision": prec, "F1": f1}

In [7]:
# Rank each user's candidate pool with the trained model and report top-10
# metrics, first on the validation targets and then on the test targets.
print("===== Validation =====")
val_metrics = eval_split(model=model, cand_df=cand_val, tgt_df=val_tgt, topk=10)

print()
print("===== Test =====")
test_metrics = eval_split(model=model, cand_df=cand_test, tgt_df=test_tgt, topk=10)

print()
print("Done.")

===== Validation =====


Eval users: 100%|██████████| 64301/64301 [00:10<00:00, 6188.91it/s]


Evaluated users: 39990, skipped=24311
HR@10: 0.0240
NDCG@10: 0.0096
Prec@10: 0.0024
F1@10: 0.0044

===== Test =====


Eval users: 100%|██████████| 63492/63492 [00:09<00:00, 6583.47it/s]

Evaluated users: 37464, skipped=26028
HR@10: 0.0078
NDCG@10: 0.0033
Prec@10: 0.0008
F1@10: 0.0014

Done.



