In [None]:
import pandas as pd
import numpy as np

data_path = "../data/KuaiSAR_final"

# Read the recommendation interaction log, loading only the columns
# listed in use_cols.
use_cols = [
    'user_id', 'item_id', 'timestamp',
    'click', 'like', 'follow', 'search',
]
df = pd.read_csv(f"{data_path}/rec_inter.csv", usecols=use_cols)

In [2]:
# data cleaning & preprocessing

# Behaviour flags: a missing value is treated as 0, and the columns are
# stored as int8.
for c in ["click", "like", "follow", "search"]:
    df[c] = df[c].fillna(0).astype(np.int8)

# Keep only recommendation interactions (rows whose search flag is 0)
df = df[df['search'] == 0]

# A row is positive when at least one of click / like / follow is set
df['pos'] = ((df['click'] + df['like'] + df['follow']) > 0).astype(np.int8)

# timestamp: coerce to numeric and drop rows that cannot be parsed
ts = pd.to_numeric(df['timestamp'], errors='coerce')
valid_ts = ts.notna()
df = df[valid_ts].copy()
# Cast only the surviving values. Casting the full coerced series raises as
# soon as it contains a single NaN, which would defeat the filter above.
df['ts'] = ts[valid_ts].astype('int64')

In [3]:
# ==========================================
# 1. Filter Sparse Users (Cold-start)
# ==========================================
# A user is retained only if they have at least `min_interactions`
# positive rows; only positive rows of retained users are kept.
min_interactions = 5
is_positive = df['pos'].eq(1)
user_counts = df.loc[is_positive].groupby('user_id').size()
valid_users = user_counts.loc[user_counts.ge(min_interactions)].index
keep_rows = is_positive & df['user_id'].isin(valid_users)
df_filtered = df.loc[keep_rows].copy()

print(f"Original users: {df['user_id'].nunique()}, After filter: {df_filtered['user_id'].nunique()}")

# ==========================================
# 2. ID Remapping (Critical for Matrix/DL models)
# ==========================================
# Raw ids are replaced by contiguous 0-based indices, numbered in order
# of first appearance in df_filtered.
unique_users = df_filtered['user_id'].unique()
unique_items = df_filtered['item_id'].unique()

num_users = len(unique_users)
num_items = len(unique_items)

# Forward lookups: raw id -> index
user2idx = dict(zip(unique_users, range(num_users)))
item2idx = dict(zip(unique_items, range(num_items)))

# Reverse lookups: index -> raw id (for later lookup if needed)
idx2user = dict(enumerate(unique_users))
idx2item = dict(enumerate(unique_items))

# Attach the new index columns
df_filtered['user_idx'] = df_filtered['user_id'].map(user2idx)
df_filtered['item_idx'] = df_filtered['item_id'].map(item2idx)

print(f"Num Users: {num_users}, Num Items: {num_items}")

# ==========================================
# 3. Train/Test Split (Leave-One-Out)
# ==========================================
# Order every user's interactions by timestamp
df_filtered = df_filtered.sort_values(by=['user_idx', 'ts'])

# Per user: the final row after sorting is held out for test,
# every other row stays in train.
grouped = df_filtered.groupby('user_idx')
held_out_index = grouped.tail(1).index
test = df_filtered.loc[held_out_index]
train = df_filtered.drop(index=held_out_index)

print(f"Train samples: {len(train)}, Test samples: {len(test)}")

Original users: 25876, After filter: 23836
Num Users: 23836, Num Items: 1232678
Train samples: 3747178, Test samples: 23836


In [5]:
# ==========================================
# 1. Define Reusable Evaluation Function
# ==========================================
def evaluate_model(model_name, test_df, topk_preds, K=50):
    """
    Compute and print HR@K and NDCG@K for a single-ground-truth test set.

    model_name: label used in the printed summary line
    test_df: DataFrame with 'user_idx' and 'item_idx' (ground truth),
             one row per held-out interaction
    topk_preds: dict or Series, user_idx -> ranked sequence (list, tuple or
                array) of item_indices, best first
    K: cutoff; only the first K recommendations of each user are scored

    Returns (hr, ndcg): the mean hit indicator and the mean of
    1 / log2(rank + 2) over all rows of test_df (rank is 0-based; a miss
    contributes 0). Both are NaN when test_df has no rows.
    """
    # Convert predictions to a dict for fast lookup if it isn't already
    if not isinstance(topk_preds, dict):
        pred_dict = topk_preds.to_dict()
    else:
        pred_dict = topk_preds

    # Plain column lists avoid the per-row Series construction (and dtype
    # upcasting) that iterrows() performs.
    users = test_df['user_idx'].tolist()
    truths = test_df['item_idx'].tolist()

    hits = []
    ndcgs = []
    for u, gt in zip(users, truths):
        # Users without predictions get an empty list. list() accepts arrays
        # as well as lists, and [:K] enforces the cutoff so the reported
        # metric really is @K even if more items were supplied.
        recs = list(pred_dict.get(u, []))[:K]
        try:
            rank = recs.index(gt)
        except ValueError:
            # Ground truth not among the top K
            hits.append(0)
            ndcgs.append(0.0)
        else:
            hits.append(1)
            ndcgs.append(1.0 / np.log2(rank + 2))

    if hits:
        hr = np.mean(hits)
        ndcg = np.mean(ndcgs)
    else:
        # Empty test set: report NaN without numpy's "mean of empty slice" warning
        hr = ndcg = float('nan')

    print(f"[{model_name}] HR@{K}: {hr:.4f}  NDCG@{K}: {ndcg:.4f}")
    return hr, ndcg

# ==========================================
# 2. Run Most Popular Baseline
# ==========================================
# Calculate popularity on TRAIN set only (avoid data leakage)
# Weighted popularity per interaction: click = 1, like = 2, follow = 3
train['w'] = (1*train['click'] + 2*train['like'] + 3*train['follow']).astype(np.int16)
pop_scores = train.groupby('item_idx')['w'].sum().sort_values(ascending=False)

# Get global Top-K list (the K items with the largest summed weight)
K = 50
global_topk = pop_scores.index[:K].tolist()

# Assign same topk to all test users (every user shares the one list object)
most_pop_preds = {u: global_topk for u in test['user_idx'].unique()}

# Evaluate with the same K that sized the list, so the cutoff and the
# printed metric label cannot drift apart when K is changed.
hr, ndcg = evaluate_model("MostPopular", test, most_pop_preds, K=K)

[MostPopular] HR@50: 0.0199  NDCG@50: 0.0058
