# Data Preprocessing

### Time Aware LOO Split

In [4]:
# Time-aware Leave-One-Out (LOO) split for MovieLens ratings
import pandas as pd
import numpy as np
from pathlib import Path

def _detect_ts_unit(ts_series):
    vmax = float(ts_series.max())
    return "ms" if vmax > 1e12 else "s"

def time_aware_loo_split(
    ratings_csv: str,
    out_dir: str,
    rating_threshold: float = 4.0,
    min_positives: int = 2,
    also_csv: bool = False,
):
    out = Path(out_dir); (out / "splits").mkdir(parents=True, exist_ok=True)
    ratings = pd.read_csv(ratings_csv)

    # 1) Basic checks
    need = {"userId","movieId","rating","timestamp"}
    missing = need - set(ratings.columns)
    if missing:
        raise ValueError(f"ratings.csv missing columns: {missing}")

    # 2) Normalize timestamp and keep implicit positives
    unit = _detect_ts_unit(ratings["timestamp"])
    ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit=unit)

    pos = ratings[ratings["rating"] >= rating_threshold].copy()
    # Drop duplicate (user,item) -> keep earliest
    pos = pos.sort_values(["userId","ts","movieId"], kind="mergesort")
    pos = pos.drop_duplicates(["userId","movieId"], keep="first")

    # Keep users with at least min_positives
    pos = pos[pos.groupby("userId")["movieId"].transform("size") >= min_positives].copy()

    # 3) Rank by time per user, assign splits: last=test, second last=val (if >=3), rest=train
    pos["n"] = pos.groupby("userId")["movieId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()
    pos["split"] = "train"
    pos.loc[pos["idx"] == pos["n"]-1, "split"] = "test"
    pos.loc[(pos["n"]>=3) & (pos["idx"] == pos["n"]-2), "split"] = "val"

    train = pos[pos["split"]=="train"][["userId","movieId","ts"]].reset_index(drop=True)
    val_targets  = pos[pos["split"]=="val"][["userId","movieId","ts"]].rename(
        columns={"movieId":"val_item","ts":"ts_val"}).reset_index(drop=True)
    test_targets = pos[pos["split"]=="test"][["userId","movieId","ts"]].rename(
        columns={"movieId":"test_item","ts":"ts_test"}).reset_index(drop=True)

    # 4) Build ID maps from TRAIN only (contiguous 0..U-1 and 0..I-1)
    uids = pd.DataFrame(sorted(train["userId"].unique()), columns=["userId"])
    uids["uid"] = range(len(uids))
    iids = pd.DataFrame(sorted(train["movieId"].unique()), columns=["movieId"])
    iids["iid"] = range(len(iids))

    # 5) Also provide indexed versions (useful for MF/ALS)
    train_idx = (train.merge(uids, on="userId", how="inner")
                      .merge(iids, on="movieId", how="inner"))
    val_idx = None
    if len(val_targets):
        val_idx = (val_targets.merge(uids, on="userId", how="inner")
                              .merge(iids, left_on="val_item", right_on="movieId", how="left")
                              .drop(columns=["movieId"]))
    test_idx = (test_targets.merge(uids, on="userId", how="inner")
                               .merge(iids, left_on="test_item", right_on="movieId", how="left")
                               .drop(columns=["movieId"]))

    # 6) Save outputs
    sp = out / "splits"
    train.to_parquet(sp / "train.parquet", index=False)
    if len(val_targets):
        val_targets.to_parquet(sp / "val_targets.parquet", index=False)
    test_targets.to_parquet(sp / "test_targets.parquet", index=False)
    uids.to_parquet(sp / "user_id_map.parquet", index=False)
    iids.to_parquet(sp / "item_id_map.parquet", index=False)
    train_idx.to_parquet(sp / "train_indexed.parquet", index=False)
    if val_idx is not None:
        val_idx.to_parquet(sp / "val_targets_indexed.parquet", index=False)
    test_idx.to_parquet(sp / "test_targets_indexed.parquet", index=False)

    if also_csv:
        train.to_csv(sp / "train.csv", index=False)
        if len(val_targets): val_targets.to_csv(sp / "val_targets.csv", index=False)
        test_targets.to_csv(sp / "test_targets.csv", index=False)
        uids.to_csv(sp / "user_id_map.csv", index=False)
        iids.to_csv(sp / "item_id_map.csv", index=False)
        train_idx.to_csv(sp / "train_indexed.csv", index=False)
        if val_idx is not None:
            val_idx.to_csv(sp / "val_targets_indexed.csv", index=False)
        test_idx.to_csv(sp / "test_targets_indexed.csv", index=False)

    # 7) Quick stats + cold-start counts
    cold_val = int(val_idx["iid"].isna().sum()) if val_idx is not None and "iid" in val_idx else 0
    cold_test = int(test_idx["iid"].isna().sum()) if "iid" in test_idx else 0
    stats = f"""Time-aware LOO split summary
Users (TRAIN map): {len(uids)}
Items (TRAIN map): {len(iids)}
TRAIN positives  : {len(train)}
VAL users        : {len(val_targets["userId"].unique()) if len(val_targets) else 0}
TEST users       : {len(test_targets["userId"].unique())}
Cold-start VAL items  : {cold_val}
Cold-start TEST items : {cold_test}
"""
    (sp / "stats.txt").write_text(stats, encoding="utf-8")
    print(stats)


### Call Time Aware LOO Split

In [6]:
time_aware_loo_split(
    ratings_csv=r"C:\Users\abdul\ece1508gp\movielens_dataset\ratings.csv",
    out_dir=r"C:\Users\abdul\ece1508gp\movielens_dataset",
    rating_threshold=4.0,   # implicit “like” threshold
    min_positives=2,        # drop users with <2 positives
    also_csv=True           # optional CSV mirrors
)

Time-aware LOO split summary
Users (TRAIN map): 200466
Items (TRAIN map): 54711
TRAIN positives  : 15537362
VAL users        : 200143
TEST users       : 200466
Cold-start VAL items  : 221
Cold-start TEST items : 257



# Load splits + quick helpers

In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.sparse import csr_matrix

SPLITS = Path(r"C:\Users\abdul\ece1508gp\movielens_dataset\splits")

train   = pd.read_parquet(SPLITS / "train_indexed.parquet")         # [uid, iid, ts]
val     = pd.read_parquet(SPLITS / "val_targets_indexed.parquet")   # [userId, uid, val_item, iid, ts_val]
test    = pd.read_parquet(SPLITS / "test_targets_indexed.parquet")  # [userId, uid, test_item, iid, ts_test]


U = int(train["uid"].max()) + 1
I = int(train["iid"].max()) + 1

# users×items implicit matrix
R = csr_matrix((np.ones(len(train), dtype=np.float32),
                (train["uid"].astype(int), train["iid"].astype(int))),
               shape=(U, I))

# fast lookup: items seen in TRAIN per user
user_seen = train.groupby("uid")["iid"].apply(set).to_dict()

# evaluation helper: candidate coverage (how often held-out item is in the pool)
def candidate_coverage(cand_df, targets_df, tgt_col="iid"):
    # Join to bring the target iid next to the candidate list
    df = cand_df.merge(targets_df[["uid", tgt_col]], on="uid", how="inner")
    # Drop cold-start rows where iid is NaN (item not in TRAIN map)
    df = df[df[tgt_col].notna()]
    # Fast and robust: zip the target and the candidate list
    return np.mean([int(t) in set(c) for t, c in zip(df[tgt_col], df["candidates"])])

# Artifact A: Popularity (train-only)

In [10]:
# TRAIN-ONLY popularity
pop = (train.groupby("iid").size()
                 .sort_values(ascending=False))

def top_pop_unseen(u_seen, P=50):
    out = []
    for iid in pop.index:
        if iid not in u_seen:
            out.append(int(iid))
            if len(out) >= P:
                break
    return out

# Build candidate pools (popularity-only to start)

In [4]:
from tqdm import tqdm
import pyarrow as pa, pyarrow.parquet as pq

def build_pool_for_user(uid, P=50, K=100):
    seen = user_seen.get(uid, set())
    C = top_pop_unseen(seen, P=P)
    # de-dup + drop seen (pop already avoids seen, but keep it here for safety)
    C = [iid for i, iid in enumerate(C) if iid not in seen and C.index(iid) == i]
    return C[:K]

def make_candidates(user_ids, P=50, K=100):
    rows = []
    for u in tqdm(user_ids):
        rows.append({"uid": int(u), "candidates": build_pool_for_user(int(u), P=P, K=K)})
    return pd.DataFrame(rows)

users_val  = sorted(val_idx["uid"].unique())  if len(val_idx)  else []
users_test = sorted(test_idx["uid"].unique()) if len(test_idx) else []

cand_val  = make_candidates(users_val,  P=50, K=100)
cand_test = make_candidates(users_test, P=50, K=100)

# save
OUT = SPLITS.parent / "candidates"
OUT.mkdir(exist_ok=True)
cand_val.to_parquet(OUT / "val.parquet", index=False)
cand_test.to_parquet(OUT / "test.parquet", index=False)

print("Wrote:", OUT)

NameError: name 'val_idx' is not defined

# Check candidate coverage

In [None]:
val_cov  = candidate_coverage(cand_val,  val_idx,  "iid") if len(cand_val)  else float("nan")
test_cov = candidate_coverage(cand_test, test_idx, "iid") if len(cand_test) else float("nan")
print(f"Candidate coverage  val={val_cov:.2%}  test={test_cov:.2%}")

Candidate coverage  val=14.96%  test=13.80%
