In [1]:
import pandas as pd
import numpy as np

data_path = "../data/KuaiSAR_final"

# inter = pd.read_csv(data_path + '/rec_inter.csv')
use_cols = ['user_id', 'item_id', 'timestamp', 'click', 'like', 'follow', 'search']
df = pd.read_csv(data_path + '/rec_inter.csv', usecols=use_cols)

## data cleaning & preprocessing

In [2]:
# data cleaning & preprocessing

for c in ["click","like","follow","search"]:
    df[c] = df[c].fillna(0).astype(np.int8)

# only keep recommendation interaction(none search)
df = df[df['search'] == 0]

df['pos'] = ((df['click'] + df['like'] + df['follow']) > 0).astype(np.int8)

# timestamp
ts = pd.to_numeric(df['timestamp'], errors='coerce')
df = df[ts.notna()].copy()
df['ts'] = ts.astype('int64')

In [3]:
# ==========================================
# 1. Iterative Filtering (Core Stability)
# ==========================================
# Keep top K items to avoid OOM and reduce noise
target_item_count = 50000
min_user_inter = 5

print(f"Original: {len(df)} interactions")

# Filter Items first (Keep Top 50k)
item_counts = df['item_id'].value_counts()
if len(item_counts) > target_item_count:
    top_items = item_counts.head(target_item_count).index
    df_filtered = df[df['item_id'].isin(top_items)].copy()
else:
    df_filtered = df.copy()

# Filter Users (Keep >= 5 interactions)
# We might need a loop because removing users might reduce item counts, and vice versa
# But for simplicity, one pass usually works well enough for coursework
user_counts = df_filtered['user_id'].value_counts()
valid_users = user_counts[user_counts >= min_user_inter].index
df_filtered = df_filtered[df_filtered['user_id'].isin(valid_users)].copy()

print(f"Filtered: {len(df_filtered)} interactions")
print(f"Users: {df_filtered['user_id'].nunique()}, Items: {df_filtered['item_id'].nunique()}")

# ==========================================
# 2. ID Remapping
# ==========================================
unique_users = df_filtered['user_id'].unique()
unique_items = df_filtered['item_id'].unique()

user2idx = {uid: i for i, uid in enumerate(unique_users)}
item2idx = {iid: i for i, iid in enumerate(unique_items)}

df_filtered['user_idx'] = df_filtered['user_id'].map(user2idx)
df_filtered['item_idx'] = df_filtered['item_id'].map(item2idx)

num_users = len(unique_users)
num_items = len(unique_items)

# ==========================================
# 3. Train/Test Split
# ==========================================
df_filtered = df_filtered.sort_values(['user_idx', 'ts'])
grouped = df_filtered.groupby('user_idx')
test = df_filtered.loc[grouped.tail(1).index]
train = df_filtered.drop(test.index)

print(f"Train: {len(train)}, Test: {len(test)}")

Original: 7461153 interactions
Filtered: 2621974 interactions
Users: 23936, Items: 50000
Train: 2598038, Test: 23936


## Most Popular Baseline

In [None]:
# ==========================================
# 1. Define Reusable Evaluation Function
# ==========================================
def evaluate_model(model_name, test_df, topk_preds, K=50):
    """
    test_df: DataFrame with 'user_idx' and 'item_idx' (ground truth)
    topk_preds: dict or Series, user_idx -> list of top K item_indices
    """
    hits = []
    ndcgs = []
    
    # Convert predictions to a dict for fast lookup if it isn't already
    if not isinstance(topk_preds, dict):
        pred_dict = topk_preds.to_dict()
    else:
        pred_dict = topk_preds

    for _, row in test_df.iterrows():
        u = row['user_idx']
        gt = row['item_idx']
        
        # Get recommendations for this user, default to empty if missing
        recs = pred_dict.get(u, [])
        
        # HR@K
        if gt in recs:
            hits.append(1)
            # NDCG@K
            rank = recs.index(gt)
            ndcgs.append(1.0 / np.log2(rank + 2))
        else:
            hits.append(0)
            ndcgs.append(0.0)
            
    hr = np.mean(hits)
    ndcg = np.mean(ndcgs)
    print(f"[{model_name}] HR@{K}: {hr:.4f}  NDCG@{K}: {ndcg:.4f}")
    return hr, ndcg

# ==========================================
# 2. Run Most Popular Baseline
# ==========================================
# Calculate popularity on TRAIN set only (avoid data leakage)
# Using weighted popularity as you did before
train['w'] = (1*train['click'] + 2*train['like'] + 3*train['follow']).astype(np.int16)
pop_scores = train.groupby('item_idx')['w'].sum().sort_values(ascending=False)

# Get global Top-K list
K = 50
global_topk = pop_scores.index[:K].tolist()

# Assign same topk to all test users
most_pop_preds = {u: global_topk for u in test['user_idx'].unique()}

# Evaluate
hr_MP, ndcg_MP = evaluate_model("MostPopular", test, most_pop_preds, K=50)

[MostPopular] HR@50: 0.0304  NDCG@50: 0.0089


## Item CF

In [7]:
import torch
from sklearn.preprocessing import normalize

# Setup Device (GPU/MPS/CPU)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Prepare Indices and Values from Train Data
indices = torch.tensor([train['user_idx'].values, train['item_idx'].values], dtype=torch.long)
values = torch.tensor(train['w'].values, dtype=torch.float32)
shape = torch.Size((num_users, num_items))


# Construct User-Item Sparse Matrix
# user_item_mat: (Users x Items)
user_item_mat = torch.sparse_coo_tensor(indices, values, shape, device=device)
print(f"Sparse User-Item Matrix Shape: {user_item_mat.shape}")

Using CPU
Sparse User-Item Matrix Shape: torch.Size([23936, 50000])


In [8]:
# Calculate Item-Item Similarity (Cosine)

# Transpose to (Items x Users)
item_user_mat = user_item_mat.t()

item_user_dense = item_user_mat.to_dense()
item_norms = torch.norm(item_user_dense, p=2, dim=1, keepdim=True)
item_norms[item_norms == 0] = 1e-9
item_user_norm = item_user_dense / item_norms
sim_matrix = torch.mm(item_user_norm, item_user_norm.t()) # (Items x Items)

sim_matrix.fill_diagonal_(0)


tensor([[0.0000, 0.0418, 0.0415,  ..., 0.0000, 0.0000, 0.0000],
        [0.0418, 0.0000, 0.0805,  ..., 0.0000, 0.0000, 0.0000],
        [0.0415, 0.0805, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [None]:
# Generate Recommendations
from tqdm import tqdm
                                                         

# 5. Generate Recommendations
item_cf_preds = {}
K = 50
test_users_arr = test['user_idx'].unique()
batch_size = 200 # Adjust based on memory

for i in tqdm(range(0, len(test_users_arr), batch_size), desc="Predicting"):
    batch_uids = test_users_arr[i : i + batch_size]
    batch_uids_tensor = torch.tensor(batch_uids, device=device)
    
    # Get History: (Batch x Items)
    # index_select works on dense or sparse (if supported)
    # converting batch history to dense for calculation
    batch_hist = user_item_mat.index_select(0, batch_uids_tensor).to_dense()
    
    # Score: (Batch x Items) * (Items x Items) -> (Batch x Items)
    scores = torch.mm(batch_hist, sim_matrix)
    
    # Mask seen items
    scores = scores - 9999.0 * batch_hist
    
    # Top-K
    _, topk_indices = torch.topk(scores, k=K, dim=1)
    
    # Store
    topk_cpu = topk_indices.cpu().numpy()
    for idx, u in enumerate(batch_uids):
        item_cf_preds[u] = topk_cpu[idx].tolist()

# 6. Evaluate
hr_IC, ndcg_IC = evaluate_model("ItemCF", test, item_cf_preds, K=50)

Predicting: 100%|██████████| 120/120 [08:44<00:00,  4.37s/it]


[ItemCF_Torch] HR@50: 0.0547  NDCG@50: 0.0160


(0.05472927807486631, 0.01601873472111061)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random

# ==========================================
# 1. NeuMF Dataset with Negative Sampling
# ==========================================
class NeuMFDataset(Dataset):
    def __init__(self, train_df, num_items, num_neg=4):
        self.users = torch.tensor(train_df['user_idx'].values, dtype=torch.long)
        self.items = torch.tensor(train_df['item_idx'].values, dtype=torch.long)
        self.num_items = num_items
        self.num_neg = num_neg
        
        # Pre-compute a set of interacted items for each user for fast negative sampling
        # set lookup is O(1)
        self.user_item_set = train_df.groupby('user_idx')['item_idx'].apply(set).to_dict()

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        # Positive sample
        u = self.users[idx]
        i = self.items[idx]
        
        # Negative sampling
        # We need to find an item 'j' that user 'u' has NOT seen
        item_indices = []
        labels = []
        
        # Add Positive
        item_indices.append(i)
        labels.append(1.0)
        
        # Add Negatives
        interacted_items = self.user_item_set.get(u.item(), set())
        
        for _ in range(self.num_neg):
            j = random.randint(0, self.num_items - 1)
            # Simple rejection sampling
            while j in interacted_items:
                j = random.randint(0, self.num_items - 1)
            
            item_indices.append(torch.tensor(j, dtype=torch.long))
            labels.append(0.0)
            
        return u, torch.stack(item_indices), torch.tensor(labels, dtype=torch.float32)

# Create Dataset and DataLoader
# num_neg=4 is standard for NCF/NeuMF
train_dataset = NeuMFDataset(train, num_items=num_items, num_neg=4)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

print("NeuMF Dataset Ready.")

NeuMF Dataset Ready.


In [11]:
# ==========================================
# 2. NeuMF Model Architecture
# ==========================================
class NeuMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super(NeuMF, self).__init__()
        
        # --- GMF Part ---
        self.gmf_user_embed = nn.Embedding(num_users, embedding_dim)
        self.gmf_item_embed = nn.Embedding(num_items, embedding_dim)
        
        # --- MLP Part ---
        self.mlp_user_embed = nn.Embedding(num_users, embedding_dim)
        self.mlp_item_embed = nn.Embedding(num_items, embedding_dim)
        
        # MLP Layers: Input(2*dim) -> dim -> dim/2 -> 1
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, embedding_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(embedding_dim, embedding_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # --- Prediction Layer ---
        # Concatenate GMF output (dim) and MLP output (dim/2)
        self.output = nn.Linear(embedding_dim + embedding_dim // 2, 1)
        
        # Init weights (Optional but recommended)
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=0.01)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, user, item):
        # GMF
        gmf_u = self.gmf_user_embed(user)
        gmf_i = self.gmf_item_embed(item)
        gmf_vector = gmf_u * gmf_i # Element-wise product
        
        # MLP
        mlp_u = self.mlp_user_embed(user)
        mlp_i = self.mlp_item_embed(item)
        mlp_input = torch.cat([mlp_u, mlp_i], dim=-1)
        mlp_vector = self.mlp(mlp_input)
        
        # Concat & Predict
        combined = torch.cat([gmf_vector, mlp_vector], dim=-1)
        prediction = self.output(combined)
        
        # Note: We don't use Sigmoid here because we use BCEWithLogitsLoss later
        return prediction.squeeze()

# Instantiate Model
model = NeuMF(num_users, num_items, embedding_dim=32).to(device)
print(model)

NeuMF(
  (gmf_user_embed): Embedding(23936, 32)
  (gmf_item_embed): Embedding(50000, 32)
  (mlp_user_embed): Embedding(23936, 32)
  (mlp_item_embed): Embedding(50000, 32)
  (mlp): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
  )
  (output): Linear(in_features=48, out_features=1, bias=True)
)


In [12]:
# ==========================================
# 3. Training Loop
# ==========================================
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 5 # For demo, you can increase to 10-20
model.train()

for epoch in range(epochs):
    total_loss = 0
    
    # Progress bar for each epoch
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    
    for u, items, labels in pbar:
        u, items, labels = u.to(device), items.to(device), labels.to(device)
        
        # Flatten input for batch processing
        # items shape: (batch, 1+num_neg) -> (batch * (1+num_neg))
        # We repeat user '1+num_neg' times to match items
        batch_size_curr = u.shape[0]
        num_samples = items.shape[1]
        
        u_flat = u.repeat_interleave(num_samples)
        items_flat = items.view(-1)
        labels_flat = labels.view(-1)
        
        optimizer.zero_grad()
        preds = model(u_flat, items_flat)
        loss = criterion(preds, labels_flat)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': loss.item()})

print("Training Finished.")

# ==========================================
# 4. Generate Recommendations (Inference)
# ==========================================
model.eval()
neumf_preds = {}
test_users = test['user_idx'].unique()
K = 50

# For NeuMF, scoring every item for every user is SLOW.
# Strategy:
# 1. Compute user embeddings and item embeddings.
# 2. But MLP makes it hard to do simple dot product.
# 3. So we usually do Batch Inference on ALL items for each test user (or a candidate set).
# Given we have 50k items, we can score all 50k for each test user.

all_items = torch.arange(num_items, device=device)

with torch.no_grad():
    for u in tqdm(test_users, desc="NeuMF Prediction"):
        # Create input: user u repeated 50000 times
        u_tensor = torch.tensor([u], device=device).repeat(num_items)
        
        # Predict scores for all items
        scores = model(u_tensor, all_items)
        
        # Mask seen items (Optional but recommended)
        # For simplicity in demo, we might skip masking or do it simple
        # Here we just take Top-K directly
        
        # Top-K
        _, top_indices = torch.topk(scores, K)
        neumf_preds[u] = top_indices.cpu().tolist()

# Evaluate
evaluate_model("NeuMF", test, neumf_preds, K=50)

Epoch 1/5: 100%|██████████| 10149/10149 [02:02<00:00, 82.59it/s, loss=0.29] 
Epoch 2/5: 100%|██████████| 10149/10149 [01:51<00:00, 91.16it/s, loss=0.266]
Epoch 3/5: 100%|██████████| 10149/10149 [01:53<00:00, 89.72it/s, loss=0.188]
Epoch 4/5: 100%|██████████| 10149/10149 [01:49<00:00, 92.63it/s, loss=0.244] 
Epoch 5/5: 100%|██████████| 10149/10149 [01:53<00:00, 89.38it/s, loss=0.213]


Training Finished.


NeuMF Prediction: 100%|██████████| 23936/23936 [02:13<00:00, 179.73it/s]


[NeuMF] HR@50: 0.0656  NDCG@50: 0.0186


(0.06563335561497326, 0.018584764755696204)