In [12]:
import os, shutil
from pathlib import Path

BASE = Path(r"C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project")

for d in ["splits", "candidates"]:
    p = BASE / d
    if p.exists():
        shutil.rmtree(p)
        print("[clean] removed", p)
    else:
        print("[clean] not found", p)

[clean] removed C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\splits
[clean] removed C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\candidates


In [13]:
import pandas as pd
import numpy as np

CSV_PATH = BASE / "Books_rating.csv"
assert CSV_PATH.exists(), f"Missing file: {CSV_PATH}"

df_raw = pd.read_csv(CSV_PATH)
print("Raw columns:", list(df_raw.columns))

df = pd.DataFrame({
    "userId":    df_raw["User_id"],
    "itemId":    df_raw["Id"],
    "rating":    pd.to_numeric(df_raw["review/score"], errors="coerce"),
    "timestamp": pd.to_datetime(df_raw["review/time"], errors="coerce"),
})

df["timestamp"] = df["timestamp"].fillna(pd.Timestamp(2000,1,1))
df["timestamp"] = (df["timestamp"].astype("int64") // 10**9)

df = df[(df["rating"] >= 1) & (df["rating"] <= 5)]

print(df.head(), df.dtypes)

def kcore_filter(df, u_col="userId", i_col="itemId", k_user=5, k_item=5, max_iters=20, verbose=True):
    for it in range(max_iters):
        n0, u0, i0 = len(df), df[u_col].nunique(), df[i_col].nunique()
        uf = df[u_col].value_counts()
        vf = df[i_col].value_counts()
        df = df[df[u_col].isin(uf[uf >= k_user].index)]
        df = df[df[i_col].isin(vf[vf >= k_item].index)]
        n1, u1, i1 = len(df), df[u_col].nunique(), df[i_col].nunique()
        if verbose:
            print(f"[k-core {it+1}] rows {n0:,}->{n1:,}, users {u0:,}->{u1:,}, items {i0:,}->{i1:,}")
        if n1 == n0:
            break
    return df

df = kcore_filter(df, k_user=5, k_item=5)
print(f"[after k-core] users={df['userId'].nunique():,}, items={df['itemId'].nunique():,}, rows={len(df):,}")

ratings_csv = BASE / "ratings.csv"
df.to_csv(ratings_csv, index=False)
print(f"[saved] {ratings_csv} ~ {len(df):,} rows")

Raw columns: ['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness', 'review/score', 'review/time', 'review/summary', 'review/text']
           userId      itemId  rating  timestamp
0   AVCGYZL8FQQTD  1882931173     4.0          0
1  A30TK6U7DNS82R  0826414346     5.0          1
2  A3UH4UZ4RSVO82  0826414346     5.0          1
3  A2MVUWT453QH61  0826414346     4.0          1
4  A22X4XUPKF66MR  0826414346     4.0          1 userId        object
itemId        object
rating       float64
timestamp      int64
dtype: object
[k-core 1] rows 3,000,000->1,077,091, users 1,008,972->82,519, items 221,998->69,986
[k-core 2] rows 1,077,091->977,301, users 82,519->77,225, items 69,986->29,668
[k-core 3] rows 977,301->951,522, users 77,225->70,381, items 29,668->28,742
[k-core 4] rows 951,522->944,212, users 70,381->70,121, items 28,742->27,027
[k-core 5] rows 944,212->941,724, users 70,121->69,534, items 27,027->26,952
[k-core 6] rows 941,724->940,863, users 69,534->69,496, items 2

In [14]:
def time_aware_loo_split(
    ratings_csv: str,
    out_dir: str,
    rating_threshold: float = 4.0,
    min_positives: int = 2,
    also_csv: bool = False,
):
    out = Path(out_dir); (out / "splits").mkdir(parents=True, exist_ok=True)
    ratings = pd.read_csv(ratings_csv)

    need = {"userId","itemId","rating","timestamp"}
    missing = need - set(ratings.columns)
    if missing:
        raise ValueError(f"ratings.csv missing columns: {missing}")

    unit = "ms" if float(ratings["timestamp"].max()) > 1e12 else "s"
    ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit=unit)

    pos = ratings[ratings["rating"] >= rating_threshold].copy()

    pos = pos.drop_duplicates(["userId","itemId"], keep="first")

    cnt = pos.groupby("userId")["itemId"].transform("size")
    pos = pos[cnt >= min_positives].copy()

    pos = pos.sort_values(["userId","ts"], kind="mergesort")
    pos["n"]   = pos.groupby("userId")["userId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()
    pos["split"] = "train"
    pos.loc[pos["idx"] == pos["n"]-1, "split"] = "test"
    pos.loc[(pos["n"] >= 3) & (pos["idx"] == pos["n"]-2), "split"] = "val"

    train = pos[pos["split"]=="train"][["userId","itemId","ts"]].reset_index(drop=True)
    val_targets  = (pos[pos["split"]=="val"][["userId","itemId","ts"]]
                    .rename(columns={"itemId":"val_item","ts":"ts_val"}).reset_index(drop=True))
    test_targets = (pos[pos["split"]=="test"][["userId","itemId","ts"]]
                    .rename(columns={"itemId":"test_item","ts":"ts_test"}).reset_index(drop=True))

    uids = pd.DataFrame(sorted(train["userId"].unique()), columns=["userId"]); uids["uid"] = range(len(uids))
    iids = pd.DataFrame(sorted(train["itemId"].unique()), columns=["itemId"]); iids["iid"] = range(len(iids))

    val_idx  = (val_targets.merge(uids, on="userId", how="inner")
                          .merge(iids, left_on="val_item", right_on="itemId", how="left")
                          .drop(columns=["itemId"]))
    test_idx = (test_targets.merge(uids, on="userId", how="inner")
                          .merge(iids, left_on="test_item", right_on="itemId", how="left")
                          .drop(columns=["itemId"]))

    sp = out / "splits"
    train.to_parquet(sp / "train.parquet", index=False)
    if len(val_targets):  val_targets.to_parquet(sp / "val_targets.parquet", index=False)
    test_targets.to_parquet(sp / "test_targets.parquet", index=False)
    uids.to_parquet(sp / "user_id_map.parquet", index=False)
    iids.to_parquet(sp / "item_id_map.parquet", index=False)
    (train.merge(uids, on="userId", how="inner")
          .merge(iids, on="itemId", how="inner")
          .drop(columns=["userId","itemId"])
          .to_parquet(sp / "train_indexed.parquet", index=False))
    if len(val_idx):   val_idx.to_parquet(sp / "val_targets_indexed.parquet", index=False)
    test_idx.to_parquet(sp / "test_targets_indexed.parquet", index=False)

    cold_val  = int(val_idx["iid"].isna().sum()) if len(val_idx) else 0
    cold_test = int(test_idx["iid"].isna().sum()) if "iid" in test_idx else 0
    stats = f"""Time-aware LOO split summary
Users (TRAIN map): {len(uids):,}
Items (TRAIN map): {len(iids):,}
TRAIN positives : {len(train):,}
VAL users       : {val_idx["uid"].nunique() if len(val_idx) else 0:,}
TEST users      : {test_idx["uid"].nunique() if len(test_idx) else 0:,}
Cold-start VAL items : {cold_val}
Cold-start TEST items: {cold_test}
"""
    (sp / "stats.txt").write_text(stats, encoding="utf-8")
    print(stats)

In [15]:
import pandas as pd
import numpy as np
from pathlib import Path

def _detect_ts_unit(ts_series):
    vmax = float(ts_series.max())
    return "ms" if vmax > 1e12 else "s"

def time_aware_loo_split(
    ratings_csv: str,
    out_dir: str,
    rating_threshold: float = 4.0,
    min_positives: int = 2,
    also_csv: bool = False,
):
    out = Path(out_dir); (out / "splits").mkdir(parents=True, exist_ok=True)
    ratings = pd.read_csv(ratings_csv)

    need = {"userId","itemId","rating","timestamp"}
    missing = need - set(ratings.columns)
    if missing:
        raise ValueError(f"ratings.csv missing columns: {missing}")

    unit = _detect_ts_unit(ratings["timestamp"])
    ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit=unit)

    pos = ratings[ratings["rating"] >= rating_threshold].copy()
    pos = pos.sort_values(["userId","ts"], kind="mergesort")

    pos = ratings.loc[ratings["rating"] >= rating_threshold,
                      ["userId", "itemId", "timestamp"]].copy()

    pos = pos.drop_duplicates(["userId", "itemId"], keep="first")

    ts_unit = "ms" if float(pos["timestamp"].max()) > 1e12 else "s"
    pos["ts"] = pd.to_datetime(pos["timestamp"], unit=ts_unit, errors="coerce")

    cnt = pos.groupby("userId")["itemId"].transform("size")
    pos = pos[cnt >= min_positives].copy()

    pos = pos.sort_values(["userId", "ts"], kind="mergesort")
    pos["n"]   = pos.groupby("userId")["itemId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()
    pos["split"] = "train"
    pos.loc[pos["idx"] == pos["n"] - 1, "split"] = "test"
    pos.loc[(pos["n"] >= 3) & (pos["idx"] == pos["n"] - 2), "split"] = "val"

    pos = pos.sort_values(["userId","ts"], kind="mergesort")
    pos["n"]   = pos.groupby("userId")["userId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()
    pos["split"] = "train"
    pos.loc[pos["idx"] == pos["n"]-1, "split"] = "test"
    pos.loc[(pos["n"] >= 3) & (pos["idx"] == pos["n"]-2), "split"] = "val"

    train = pos[pos["split"]=="train"][["userId","itemId","ts"]].reset_index(drop=True)
    val_targets  = pos[pos["split"]=="val"][["userId","itemId","ts"]].rename(columns={"itemId":"val_item","ts":"ts_val"}).reset_index(drop=True)
    test_targets = pos[pos["split"]=="test"][["userId","itemId","ts"]].rename(columns={"itemId":"test_item","ts":"ts_test"}).reset_index(drop=True)

    uids = pd.DataFrame(sorted(train["userId"].unique()), columns=["userId"])
    uids["uid"] = range(len(uids))
    iids = pd.DataFrame(sorted(train["itemId"].unique()), columns=["itemId"])
    iids["iid"] = range(len(iids))

    val_idx  = (val_targets.merge(uids, on="userId", how="inner")
                          .merge(iids, left_on="val_item", right_on="itemId", how="left")
                          .drop(columns=["itemId"]))
    test_idx = (test_targets.merge(uids, on="userId", how="inner")
                          .merge(iids, left_on="test_item", right_on="itemId", how="left")
                          .drop(columns=["itemId"]))

    sp = out / "splits"
    train.to_parquet(sp / "train.parquet", index=False)
    if len(val_targets):  val_targets.to_parquet(sp / "val_targets.parquet", index=False)
    test_targets.to_parquet(sp / "test_targets.parquet", index=False)
    uids.to_parquet(sp / "user_id_map.parquet", index=False)
    iids.to_parquet(sp / "item_id_map.parquet", index=False)
    (train.merge(uids, on="userId", how="inner")
          .merge(iids, on="itemId", how="inner")
          .drop(columns=["userId","itemId"])
          .to_parquet(sp / "train_indexed.parquet", index=False))
    if len(val_idx):   val_idx.to_parquet(sp / "val_targets_indexed.parquet", index=False)
    test_idx.to_parquet(sp / "test_targets_indexed.parquet", index=False)

    cold_val  = int(val_idx["iid"].isna().sum()) if len(val_idx) else 0
    cold_test = int(test_idx["iid"].isna().sum()) if "iid" in test_idx else 0
    stats = f"""Time-aware LOO split summary
Users (TRAIN map): {len(uids):,}
Items (TRAIN map): {len(iids):,}
TRAIN positives : {len(train):,}
VAL users       : {val_idx["uid"].nunique() if len(val_idx) else 0:,}
TEST users      : {test_idx["uid"].nunique() if len(test_idx) else 0:,}
Cold-start VAL items : {cold_val}
Cold-start TEST items: {cold_test}
"""
    (sp / "stats.txt").write_text(stats, encoding="utf-8")
    print(stats)

time_aware_loo_split(
    ratings_csv=str(BASE/"ratings.csv"),
    out_dir=str(BASE),
    rating_threshold=3.0,
    min_positives=3,
    also_csv=False,
)

Time-aware LOO split summary
Users (TRAIN map): 64,541
Items (TRAIN map): 26,497
TRAIN positives : 697,181
VAL users       : 64,541
TEST users      : 64,541
Cold-start VAL items : 257
Cold-start TEST items: 1078



In [16]:
import pandas as pd
import numpy as np
from pathlib import Path

def _detect_ts_unit(ts_series: pd.Series) -> str:
    vmax = float(ts_series.max())
    return "ms" if vmax > 1e12 else "s"

def time_aware_loo_split(
    ratings_csv: str,
    out_dir: str,
    rating_threshold: float = 3.0,
    min_positives: int = 3,
    also_csv: bool = False,
):
    out = Path(out_dir); (out / "splits").mkdir(parents=True, exist_ok=True)
    ratings = pd.read_csv(ratings_csv)

    need = {"userId","itemId","rating","timestamp"}
    missing = need - set(ratings.columns)
    if missing:
        raise ValueError(f"ratings.csv missing columns: {missing}")


    try:
        unit = _detect_ts_unit(ratings["timestamp"])
        ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit=unit)
    except Exception:

        ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit="s", origin="unix", errors="ignore")

    pos = ratings[ratings["rating"] >= rating_threshold].copy()
    pos = pos.sort_values(["userId","itemId","ts"], kind="mergesort").drop_duplicates(["userId","itemId"], keep="first")
    pos = pos.groupby("userId").filter(lambda g: len(g) >= min_positives)

    pos = pos.sort_values(["userId","ts"], kind="mergesort")
    pos["n"]   = pos.groupby("userId")["itemId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()

    pos["split"] = "train"
    pos.loc[pos["idx"]==pos["n"]-1, "split"] = "test"
    pos.loc[(pos["n"]>=3) & (pos["idx"]==pos["n"]-2), "split"] = "val"

    train = pos[pos["split"]=="train"][["userId","itemId","ts"]].reset_index(drop=True)
    val_targets  = pos[pos["split"]=="val"][["userId","itemId","ts"]].rename(columns={"itemId":"val_item","ts":"ts_val"}).reset_index(drop=True)
    test_targets = pos[pos["split"]=="test"][["userId","itemId","ts"]].rename(columns={"itemId":"test_item","ts":"ts_test"}).reset_index(drop=True)

    uids = pd.DataFrame(sorted(train["userId"].unique()), columns=["userId"]); uids["uid"]=range(len(uids))
    iids = pd.DataFrame(sorted(train["itemId"].unique()), columns=["itemId"]); iids["iid"]=range(len(iids))

    train_idx = (train.merge(uids, on="userId").merge(iids, on="itemId"))
    val_idx   = (val_targets.merge(uids, on="userId", how="inner")
                           .merge(iids, left_on="val_item", right_on="itemId", how="left")
                           .drop(columns=["itemId"]))
    test_idx  = (test_targets.merge(uids, on="userId", how="inner")
                             .merge(iids, left_on="test_item", right_on="itemId", how="left")
                             .drop(columns=["itemId"]))

    sp = Path(out_dir) / "splits"
    train.to_parquet(sp/"train.parquet", index=False)
    val_targets.to_parquet(sp/"val_targets.parquet", index=False)
    test_targets.to_parquet(sp/"test_targets.parquet", index=False)
    uids.to_parquet(sp/"user_id_map.parquet", index=False)
    iids.to_parquet(sp/"item_id_map.parquet", index=False)
    train_idx.to_parquet(sp/"train_indexed.parquet", index=False)
    val_idx.to_parquet(sp/"val_targets_indexed.parquet", index=False)
    test_idx.to_parquet(sp/"test_targets_indexed.parquet", index=False)

    if also_csv:
        for p in ["train","val_targets","test_targets","user_id_map","item_id_map","train_indexed","val_targets_indexed","test_targets_indexed"]:
            pd.read_parquet(sp/f"{p}.parquet").to_csv(sp/f"{p}.csv", index=False)

    cold_val  = int(val_idx["iid"].isna().sum()) if "iid" in val_idx.columns else 0
    cold_test = int(test_idx["iid"].isna().sum()) if "iid" in test_idx.columns else 0
    stats = f"""Time-aware LOO split summary
Users (TRAIN map): {len(uids)}
Items (TRAIN map): {len(iids)}
TRAIN positives : {len(train)}
VAL users       : {len(val_targets['userId'].unique())}
TEST users      : {len(test_targets['userId'].unique())}
Cold-start VAL items : {cold_val}
Cold-start TEST items: {cold_test}
"""
    (sp/"stats.txt").write_text(stats, encoding="utf-8")
    print(stats)

# Run split on the normalized Kaggle CSV
time_aware_loo_split(
    ratings_csv=str(BASE/"ratings.csv"),
    out_dir=str(BASE),
    rating_threshold=3.0,
    min_positives=3,
    also_csv=False
)

Time-aware LOO split summary
Users (TRAIN map): 64541
Items (TRAIN map): 26531
TRAIN positives : 697181
VAL users       : 64541
TEST users      : 64541
Cold-start VAL items : 240
Cold-start TEST items: 1049



In [17]:
import pandas as pd

SPLITS = BASE / "splits"
train_idx = pd.read_parquet(SPLITS/"train_indexed.parquet")
val_idx   = pd.read_parquet(SPLITS/"val_targets_indexed.parquet")
test_idx  = pd.read_parquet(SPLITS/"test_targets_indexed.parquet")

train_items = set(train_idx["iid"].unique())
val_keep  = val_idx[val_idx["iid"].isin(train_items)].copy()
test_keep = test_idx[test_idx["iid"].isin(train_items)].copy()

val_keep.to_parquet(SPLITS/"val_targets_indexed.parquet", index=False)
test_keep.to_parquet(SPLITS/"test_targets_indexed.parquet", index=False)
print("[covered] kept val:", len(val_keep), " / test:", len(test_keep))


[covered] kept val: 64301  / test: 63492


In [18]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.sparse import csr_matrix

SPLITS = BASE / "splits"
train_idx = pd.read_parquet(SPLITS/"train_indexed.parquet")   # [uid, iid, ts]
val_idx   = pd.read_parquet(SPLITS/"val_targets_indexed.parquet")   # [uid, val_item(iid), ts_val]
test_idx  = pd.read_parquet(SPLITS/"test_targets_indexed.parquet")  # [uid, test_item(iid), ts_test]

U = int(train_idx["uid"].max()) + 1
I = int(train_idx["iid"].max()) + 1

user_seen = train_idx.groupby("uid")["iid"].apply(set).to_dict()

def candidate_coverage(cand_df, targets_df, tgt_col="iid"):
    df = cand_df.merge(targets_df[["uid", tgt_col]], on="uid", how="inner")
    df = df[df[tgt_col].notna()]
    return np.mean([(int(t) in set(c)) for t, c in zip(df[tgt_col], df["candidates"])])

In [19]:
train_idx_sorted = train_idx.sort_values(["uid","ts"]).groupby("uid").tail(200)

R = csr_matrix(
    (np.ones(len(train_idx_sorted), dtype=np.float32),
     (train_idx_sorted["uid"].astype(int).values,
      train_idx_sorted["iid"].astype(int).values)),
    shape=(U, I),
    dtype=np.float32
)

C = (R.T @ R).tocsr()
C.setdiag(0); C.eliminate_zeros()

M_SIM = 50
item_sim_map = {}
for iid in range(I):
    a, b = C.indptr[iid], C.indptr[iid+1]
    if a == b:
        item_sim_map[iid] = []
        continue
    neigh = C.indices[a:b]
    vals  = C.data[a:b]
    if len(neigh) > M_SIM:
        top = np.argpartition(-vals, M_SIM)[:M_SIM]
        neigh, vals = neigh[top], vals[top]
    order = np.argsort(-vals)
    item_sim_map[iid] = neigh[order].tolist()

print(f"[item-sim] built for I={I:,}. Example of item 0:", item_sim_map.get(0, [])[:10])

[item-sim] built for I=26,531. Example of item 0: [22824, 8, 8131, 20798, 486, 14822, 22726, 22580, 13332, 4209]


In [20]:
import pandas as pd
import numpy as np
from tqdm import tqdm

item_pop_series = train_idx['iid'].value_counts().astype(float)
item_pop_series = item_pop_series / item_pop_series.max()
item_pop = item_pop_series.to_dict()
item_popular = sorted(item_pop.items(), key=lambda x: -x[1])  # 兜底

def score_items_for_user(uid, recent=15, sim_per_item=50, alpha=1.0, beta=0.2):

    seen = user_seen.get(uid, set())
    recent_items = (train_idx.loc[train_idx["uid"]==uid]
                             .sort_values("ts", ascending=False)
                             .head(recent)["iid"].astype(int).tolist())
    scores = {}
    for j in recent_items:
        for i in item_sim_map.get(j, [])[:sim_per_item]:
            if i in seen:
                continue
            scores[i] = scores.get(i, 0.0) + alpha
    for i in list(scores.keys())[:5000]:
        scores[i] += beta * item_pop.get(int(i), 0.0)
    return scores

def build_pool_for_user(uid, K=50):
    seen = user_seen.get(uid, set())
    scores = score_items_for_user(uid, recent=15, sim_per_item=50, alpha=1.0, beta=0.2)
    if not scores:
        pop_list = [int(i) for i,_ in item_popular if int(i) not in seen]
        return pop_list[:K]
    top = sorted(scores.items(), key=lambda x: -x[1])[:K]
    return [int(i) for i,_ in top]

def make_candidates(user_ids, K=50):
    rows = []
    for u in tqdm(user_ids):
        rows.append({"uid": int(u), "candidates": build_pool_for_user(int(u), K=K)})
    return pd.DataFrame(rows)

users_val  = sorted(val_idx["uid"].unique())
users_test = sorted(test_idx["uid"].unique())

cand_val  = make_candidates(users_val,  K=50)
cand_test = make_candidates(users_test, K=50)

OUT = BASE / "candidates"; OUT.mkdir(parents=True, exist_ok=True)
cand_val.to_parquet(OUT/"val.parquet",  index=False)
cand_test.to_parquet(OUT/"test.parquet", index=False)
print("[saved candidates] ->", OUT)

100%|██████████| 64301/64301 [01:47<00:00, 600.38it/s]
100%|██████████| 63492/63492 [01:24<00:00, 752.49it/s]


[saved candidates] -> C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\candidates


In [21]:
cand_val  = pd.read_parquet(BASE/"candidates/val.parquet")
cand_test = pd.read_parquet(BASE/"candidates/test.parquet")

has_iid_val  = "iid" in val_idx.columns
has_iid_test = "iid" in test_idx.columns
print("val has iid:", has_iid_val, "test has iid:", has_iid_test)

val_cov  = candidate_coverage(cand_val,  val_idx,  "iid") if has_iid_val  else float("nan")
test_cov = candidate_coverage(cand_test, test_idx, "iid") if has_iid_test else float("nan")
print(f"Candidate coverage  val={val_cov:.2%}  test={test_cov:.2%}")

val has iid: True test has iid: True
Candidate coverage  val=62.19%  test=59.01%
