In [1]:
import os
import pandas as pd
import numpy as np

data_path = "../data/KuaiSAR_final"

# Load only the columns the notebook needs from the recommendation
# interaction log: identifiers, the event timestamp, and the feedback flags.
use_cols = ['user_id', 'item_id', 'timestamp', 'click', 'like', 'follow', 'search']
df = pd.read_csv(f"{data_path}/rec_inter.csv", usecols=use_cols)

In [2]:
# Data cleaning & preprocessing

# Feedback flags: a missing value is treated as 0, stored compactly as int8.
for c in ["click", "like", "follow", "search"]:
    df[c] = df[c].fillna(0).astype(np.int8)

# Only keep recommendation interactions (rows where the search flag is 0).
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = df[df['search'] == 0].copy()

# An interaction counts as positive when any of click / like / follow is set.
df['pos'] = ((df['click'] + df['like'] + df['follow']) > 0).astype(np.int8)

# Timestamp: non-numeric values are coerced to NaN and those rows are dropped.
ts = pd.to_numeric(df['timestamp'], errors='coerce')
has_ts = ts.notna()
df = df[has_ts].copy()
# Cast only the surviving values. Casting the whole `ts` series would raise
# as soon as it contains a single NaN, i.e. exactly when rows were dropped.
df['ts'] = ts[has_ts].astype('int64')

In [3]:
# Leave-one-out split over positive interactions only:
# each user's positive with the largest `ts` is held out as the test row.
pos_df = df.loc[df['pos'] == 1].copy()
if len(pos_df) == 0:
    raise ValueError("No positive interactions found in the dataset")

# idxmax returns, per user, the row label of the latest positive interaction.
idx_test = pos_df.groupby('user_id')['ts'].idxmax()
test = pos_df.loc[idx_test]
train = pos_df.drop(index=idx_test)

# Per-row popularity weight: click counts 1, like counts 2, follow counts 3.
train['w'] = (train['click'] + train['like'] * 2 + train['follow'] * 3).astype(np.int16)

# Size of the recommendation list.
K = 50

# Items ordered by summed training weight, heaviest first; keep the first K.
pop = (
    train.groupby('item_id')['w']
    .sum()
    .sort_values(ascending=False)
)
topk = pop.head(K).index.tolist()

In [4]:
# Baseline model: Most Popular.
# Every test row is scored against the same `topk` list built from `train`.
# NOTE(review): items a user already interacted with in `train` are not
# removed from the list before scoring - confirm this is intended.

# 0-based position of each item inside topk; items outside the list map to NaN.
rank_map = pd.Series(np.arange(len(topk)), index=topk)
test['rank'] = test['item_id'].map(rank_map)

# A hit is a held-out item that appears in the top-K list.
is_hit = test['rank'].notna() & test['rank'].lt(K)
hr = is_hit.mean()

# Single relevant item per row, so the gain is 1 / log2(rank + 2) on a hit, else 0.
gains = np.where(is_hit, 1.0 / np.log2(test['rank'] + 2), 0.0)
ndcg = gains.mean()

print(f"Users in test: {len(test)}")
print(f"HR@{K}: {hr:.4f}  NDCG@{K}: {ndcg:.4f}")

Users in test: 25318
HR@50: 0.0190  NDCG@50: 0.0058
