# **Load & basic preprocessing**

In [24]:
import os
import pickle
from typing import List, Tuple, Dict
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.sparse import csr_matrix, vstack
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, normalize
import hnswlib
import time

In [25]:
# Input CSV paths (relative to the working directory); all four are read by load_data().
EVENTS_FILE = "events.csv"  # interaction log
ITEM_PROP1_FILE = "item_properties_part1.csv"  # item properties, first part
ITEM_PROP2_FILE = "item_properties_part2.csv"  # item properties, second part (concatenated with part 1)
CATEGORY_TREE_FILE = "category_tree.csv"  # category -> parent category table

In [26]:
# Directory for pickled intermediates (item-neighbor cache, evaluation checkpoint/final results).
CACHE_DIR = "cache_reco"
# Create it up front so later writes never fail on a missing directory; no-op if it already exists.
os.makedirs(CACHE_DIR, exist_ok=True)

In [27]:
# Model hyper-parameters.
SVD_DIM = 64                 # embedding dim for CB after TruncatedSVD
ITEM_NN_TOPK = 200           # compute top-k item neighbors in item-based CF
CF_NEIGHBORS_TOPK = 50      # when scoring per user, use top 50 similar items
HNSW_M = 64                  # `M` argument passed to hnswlib's init_index
HNSW_EF_CONSTRUCTION = 200   # `ef_construction` argument passed to hnswlib's init_index
HNSW_EF_SEARCH = 100         # query-time `ef`, applied with Index.set_ef

In [28]:
# Train/test split and evaluation settings.
MIN_INTERACTIONS_ACTIVE_USER = 1  # keep users with at least this many interactions before the train/test split
TRAIN_TEST_LAST_N = 3             # last N interactions per user go to test
ALPHAS = [0.1, 0.3, 0.5, 0.7, 0.9]  # CF weights tried during evaluation (CB gets 1 - alpha)
K_EVAL = 5                        # recommendation list length used for the metrics

In [29]:
def save_pickle(obj, path):
    """Serialize `obj` with pickle into the file at `path`, overwriting it if present."""
    with open(path, mode="wb") as sink:
        pickle.dump(obj, file=sink)

In [30]:
def load_pickle(path):
    """Read and return the pickled object stored at `path`.

    Unpickling can execute arbitrary code, so only use this on files this
    notebook wrote itself.
    """
    with open(path, mode="rb") as source:
        payload = pickle.load(source)
    return payload

In [31]:
def load_data():
    """Read the four input CSV files.

    Returns:
        (events, item_props, category_tree) where `item_props` is the two
        item-property parts stacked row-wise with a fresh 0..n-1 index.
    """
    events = pd.read_csv(EVENTS_FILE)
    property_parts = [pd.read_csv(part) for part in (ITEM_PROP1_FILE, ITEM_PROP2_FILE)]
    category_tree = pd.read_csv(CATEGORY_TREE_FILE)
    item_props = pd.concat(property_parts, axis=0, ignore_index=True)
    return events, item_props, category_tree

In [32]:
def preprocess_events(events: pd.DataFrame) -> pd.DataFrame:
    """Filter, type and weight the raw event log.

    Args:
        events: frame with at least `visitorid`, `event` and `timestamp`
            columns. `timestamp` may be epoch milliseconds or already a
            datetime column (naive or timezone-aware).

    Returns:
        A new frame (the input is not modified) that keeps only
        view/addtocart/transaction rows, has a datetime `timestamp`, gains an
        integer `weight` column (view=1, addtocart=4, transaction=10) and is
        sorted by (`visitorid`, `timestamp`) with a fresh 0..n-1 index.
    """
    # keep valid event types
    events = events[events['event'].isin(['view', 'addtocart', 'transaction'])].copy()
    # Parse epoch-millisecond timestamps. The pandas dtype helper is used instead of
    # np.issubdtype because NumPy cannot interpret pandas extension dtypes such as
    # timezone-aware datetimes.
    if not pd.api.types.is_datetime64_any_dtype(events['timestamp']):
        # unparseable values become NaT rather than raising
        events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms', errors='coerce')
    # map event type to weights (implicit feedback)
    weight_map = {'view': 1, 'addtocart': 4, 'transaction': 10}
    events['weight'] = events['event'].map(weight_map)
    # sort by visitor + timestamp so later per-user slicing sees chronological order
    events = events.sort_values(['visitorid', 'timestamp']).reset_index(drop=True)
    return events

# **Train/Test split by last-N per user**

In [33]:
def train_test_split_by_last_n(events: pd.DataFrame, n_last=3) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Hold out each user's last `n_last` interactions as the test set.

    Users with more than `n_last` interactions contribute their final `n_last`
    rows (by `timestamp`) to test and everything earlier to train. Users with
    `n_last` or fewer interactions stay entirely in train. Rows whose
    `visitorid` is missing are dropped, as a groupby on that column would do.

    Args:
        events: frame with `visitorid` and `timestamp` columns.
        n_last: number of trailing interactions per user to hold out;
            0 puts every row in train.

    Returns:
        (train_df, test_df), both ordered by (`visitorid`, `timestamp`) with a
        fresh 0..n-1 index. Either may be empty; an empty input yields two
        empty frames instead of raising.
    """
    known_users = events[events['visitorid'].notna()]
    ordered = known_users.sort_values(['visitorid', 'timestamp'])
    per_user = ordered.groupby('visitorid', sort=False)

    # Position of every row inside its user's history, counted from both ends.
    from_start = per_user.cumcount()
    from_end = per_user.cumcount(ascending=False)
    history_len = from_start + from_end + 1

    # A row is held out when it is among the last n_last rows of a user who has
    # strictly more than n_last rows in total.
    is_test = (history_len > n_last) & (from_end < n_last)

    train_df = ordered[~is_test].reset_index(drop=True)
    test_df = ordered[is_test].reset_index(drop=True)
    print(f"Train: {len(train_df):,} rows, Test: {len(test_df):,} rows, Unique users: {events['visitorid'].nunique():,}")
    return train_df, test_df

# **Build item features matrix (one-hot categories) then SVD -> dense embeddings**

In [34]:
def build_item_category_matrix(item_props: pd.DataFrame, category_tree: pd.DataFrame) -> pd.DataFrame:
    """Build a one-hot item x category matrix including ancestor categories.

    Args:
        item_props: long-format frame with `itemid`, `property` and `value`
            columns; only rows with property == 'categoryid' are used and
            their `value` must be castable to int.
        category_tree: frame with `categoryid` and `parentid` columns; a
            missing `parentid` marks a root category.

    Returns:
        DataFrame indexed by itemid (sorted, unique) with one column per
        category id; a cell is truthy when the item belongs to that category
        directly or through any ancestor.
    """
    # Filter category properties: property == 'categoryid'
    cats = (
        item_props.loc[item_props['property'] == 'categoryid', ['itemid', 'value']]
        .rename(columns={'value': 'categoryid'})
    )
    cats['categoryid'] = cats['categoryid'].astype(int)
    # The same (item, category) pair can appear many times; one copy is enough.
    cats = cats.drop_duplicates()

    # child -> parent map. Root rows (missing parent) are dropped rather than filled
    # with 0, so a real category id 0 is not mistaken for "no parent".
    linked = category_tree.dropna(subset=['parentid'])
    parent_map = dict(zip(linked['categoryid'].astype(int).tolist(),
                          linked['parentid'].astype(int).tolist()))

    def get_parents(cat_id):
        """Return the ancestors of `cat_id`, nearest first; stops on a cycle."""
        chain = []
        seen = set()
        while cat_id in parent_map and cat_id not in seen:
            seen.add(cat_id)
            cat_id = parent_map[cat_id]
            chain.append(cat_id)
        return chain

    # Walk the tree once per distinct category instead of once per property row.
    ancestors = {c: get_parents(c) for c in cats['categoryid'].unique().tolist()}
    extra_rows = [
        (item_id, ancestor)
        for item_id, cat_id in zip(cats['itemid'].tolist(), cats['categoryid'].tolist())
        for ancestor in ancestors[cat_id]
    ]

    if extra_rows:
        extra = pd.DataFrame(extra_rows, columns=['itemid', 'categoryid'])
        cats = pd.concat([cats, extra], ignore_index=True).drop_duplicates()

    # pivot one-hot, then collapse multiple rows per item into a single row
    item_features = pd.get_dummies(cats.set_index('itemid')['categoryid']).groupby(level=0).max()
    return item_features

In [35]:
def build_cb_embeddings(item_features: pd.DataFrame, svd_dim: int = SVD_DIM) -> Tuple[np.ndarray, dict, np.ndarray]:
    """Compress one-hot item features into L2-normalised dense embeddings.

    Args:
        item_features: DataFrame indexed by itemid with one column per category.
        svd_dim: requested embedding width; capped at (n_columns - 1), or 1
            when there is a single column.

    Returns:
        (embeddings, itemid_to_idx, idx_to_itemid): float32 array with one row
        per item, a dict from itemid to row position, and the array of itemids
        in row order.
    """
    item_order = item_features.index.to_list()
    itemid_to_idx = dict(zip(item_order, range(len(item_order))))
    idx_to_itemid = np.array(item_order)

    sparse_features = csr_matrix(item_features.values)
    n_columns = sparse_features.shape[1]
    if n_columns > 1:
        max_components = n_columns - 1
    else:
        max_components = 1
    reducer = TruncatedSVD(n_components=min(svd_dim, max_components), random_state=42)
    dense = reducer.fit_transform(sparse_features)

    # Unit-length rows matter for the cosine-space HNSW index built on top.
    unit_rows = normalize(dense)
    return unit_rows.astype('float32'), itemid_to_idx, idx_to_itemid

# **Build HNSW index for CB embeddings**

In [36]:
def build_hnsw_index(emb: np.ndarray, ef_construction=HNSW_EF_CONSTRUCTION, M=HNSW_M, ef_search=HNSW_EF_SEARCH):
    """Create a cosine-space hnswlib index holding every row of `emb`.

    Rows are labelled 0..n-1, so a label returned by a query is a row position
    in `emb`. `ef_construction` and `M` are passed to `init_index`;
    `ef_search` is applied with `set_ef`.
    """
    row_count, width = emb.shape
    index = hnswlib.Index(space='cosine', dim=width)
    index.init_index(max_elements=row_count, ef_construction=ef_construction, M=M)
    index.add_items(emb, np.arange(row_count))
    index.set_ef(ef_search)
    return index

# **Item-based CF: build item-user sparse matrix and nearest neighbors (top-k)**

In [37]:
def build_item_user_matrix(train_df: pd.DataFrame) -> Tuple[csr_matrix, np.ndarray, Dict[int,int], Dict[int,int]]:
    """Assemble the sparse item x user interaction matrix used for item-based CF.

    Items and users are numbered in order of first appearance in `train_df`.
    Repeated (item, user) pairs have their weights summed by the sparse
    constructor.

    Returns:
        (matrix, idx_to_item, item_to_idx, user_to_idx): a float32 CSR matrix of
        shape (n_items, n_users), the itemid for every row, a dict itemid -> row
        index and a dict visitorid -> column index.
    """
    user_ids = train_df['visitorid'].unique()
    item_ids = train_df['itemid'].unique()
    user_to_idx = dict(zip(user_ids, range(len(user_ids))))
    item_to_idx = dict(zip(item_ids, range(len(item_ids))))

    # COO triplets: one entry per interaction row.
    row_positions = train_df['itemid'].map(item_to_idx).to_numpy()
    col_positions = train_df['visitorid'].map(user_to_idx).to_numpy()
    weights = train_df['weight'].to_numpy(dtype='float32')

    interactions = csr_matrix(
        (weights, (row_positions, col_positions)),
        shape=(len(item_ids), len(user_ids)),
    )
    return interactions, np.array(item_ids), item_to_idx, user_to_idx

In [38]:
def build_item_neighbors(item_user_sparse: csr_matrix, topk=ITEM_NN_TOPK, cache_path=None):
    """Compute (or load from cache) the top-k cosine neighbours of every item.

    Args:
        item_user_sparse: item x user sparse matrix; each row is one item.
        topk: neighbours per item; capped at the number of items.
        cache_path: optional pickle path. A cached result is reused only when
            both cached arrays have shape (n_items, min(topk, n_items)).

    Returns:
        (indices, sims): two arrays of shape (n_items, k). Row i lists the row
        positions of item i's nearest items and their cosine similarities
        (1 - cosine distance). Every item is also a candidate neighbour of
        itself.

    Note:
        The cache is validated by shape only. If the training data changes but
        the item count and topk stay the same, delete the cache file to force
        a rebuild. The cache is a pickle, so only point `cache_path` at files
        written by this function.
    """
    n_items, _ = item_user_sparse.shape
    n_neighbors = min(topk, n_items)
    expected_shape = (n_items, n_neighbors)

    # === 1) Try the existing cache ===
    if cache_path and os.path.exists(cache_path):
        try:
            print(f"🟢 Found existing cache: {cache_path}")
            item_neighbors_idx, item_neighbors_sims = load_pickle(cache_path)

            # Both arrays must match the current item count AND the requested k;
            # checking rows alone would silently reuse a cache built with another topk.
            idx_shape = tuple(np.shape(item_neighbors_idx))
            sims_shape = tuple(np.shape(item_neighbors_sims))
            if idx_shape == expected_shape and sims_shape == expected_shape:
                print(f"✅ Cache valid. Using cached neighbors ({n_items:,} items, topk={topk}).")
                return item_neighbors_idx, item_neighbors_sims
            else:
                print(f"⚠️ Cache mismatch: cached={idx_shape}, current={expected_shape}. Rebuilding...")
        except Exception as e:
            # A corrupt or foreign cache file must not abort the run; fall through to a rebuild.
            print(f"⚠️ Failed to load cache due to: {e}. Rebuilding...")

    # === 2) Rebuild when there is no usable cache ===
    print("Building item neighbors (item-based CF) ... ini mungkin memakan waktu.")
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="brute", n_jobs=-1)
    nn.fit(item_user_sparse)
    distances, indices = nn.kneighbors(item_user_sparse, return_distance=True)
    sims = 1.0 - distances

    # === 3) Persist the fresh result ===
    if cache_path:
        with open(cache_path, "wb") as f:
            pickle.dump((indices, sims), f)
        print(f"💾 Saved new cache to {cache_path}")

    return indices, sims

# **Recommendation functions (CF, CB, Hybrid)**

In [39]:
def get_item_based_cf_scores_for_user(user_id: int, user_item_matrix_useridx_map: Dict[int,int],
                                      user_item_matrix: csr_matrix,
                                      item_neighbors_idx: np.ndarray, item_neighbors_sims: np.ndarray,
                                      topk_scores=CF_NEIGHBORS_TOPK) -> pd.Series:
    """Score every item for one user from the neighbours of the items they touched.

    For each item the user interacted with, the similarities of that item's
    neighbours are added to the neighbours' scores, scaled by the user's
    interaction weight on the item.

    Args:
        user_id: visitor id to score.
        user_item_matrix_useridx_map: dict visitorid -> column index.
        user_item_matrix: item x user sparse matrix (rows are items).
        item_neighbors_idx: (n_items, k) neighbour row indices per item.
        item_neighbors_sims: (n_items, k) similarities aligned with the indices.
        topk_scores: use only the first `topk_scores` neighbours of each
            interacted item; None uses all k columns. This assumes the
            neighbour columns are ordered most-similar first (true for arrays
            produced by NearestNeighbors.kneighbors).

    Returns:
        Float32 Series of length n_items indexed by item ROW INDEX (not
        itemid); already-seen items are zeroed. An empty Series is returned
        for an unknown user or a user without interactions.
    """
    # map user -> column index
    if user_id not in user_item_matrix_useridx_map:
        return pd.Series(dtype=float)

    uidx = user_item_matrix_useridx_map[user_id]
    # get user's interacted items (nonzero rows in column uidx)
    user_col = user_item_matrix[:, uidx].toarray().flatten()  # length n_items
    interacted_indices = np.where(user_col > 0)[0]
    if len(interacted_indices) == 0:
        return pd.Series(dtype=float)

    n_items = item_neighbors_idx.shape[0]
    agg_scores = np.zeros(n_items, dtype='float32')

    # Honour the per-item neighbour cap (the parameter used to be accepted but ignored).
    if topk_scores is None:
        neighbor_limit = item_neighbors_idx.shape[1]
    else:
        neighbor_limit = max(int(topk_scores), 0)

    # For each interacted item, add neighbor similarities
    for it_idx in interacted_indices:
        neigh_idxs = item_neighbors_idx[it_idx, :neighbor_limit]
        neigh_sims = item_neighbors_sims[it_idx, :neighbor_limit]
        # accumulate weighted by user's interaction weight
        w = user_col[it_idx]
        agg_scores[neigh_idxs] += (neigh_sims * w)

    # zero out already-seen items
    agg_scores[interacted_indices] = 0.0
    # Series positions are item row indices; callers map them to itemids.
    return pd.Series(agg_scores)

In [40]:
def get_cb_scores_for_user_by_last_item(user_id: int, user_item_matrix_useridx_map: Dict[int,int],
                                        user_item_matrix: csr_matrix,
                                        itemid_to_emb_idx: Dict[int,int],
                                        emb_index: hnswlib.Index,
                                        emb_data: np.ndarray,
                                        idx_to_itemid_emb: np.ndarray,
                                        top_n=100) -> pd.Series:
    """Content-based scores for a user, anchored on one of their items.

    The anchor is the interacted item with the highest row index in
    `user_item_matrix`. The HNSW index is queried around the anchor's
    embedding and the neighbours' cosine similarities are returned.

    NOTE(review): the matrix carries no timestamps, so the "last" item is last
    by row index, not necessarily the most recent interaction.
    NOTE(review): despite its name, `itemid_to_emb_idx` is looked up with the
    item ROW INDEX of the anchor, so callers must pass a
    row-index -> embedding-index mapping.

    Args:
        user_id: visitor id to score.
        user_item_matrix_useridx_map: dict visitorid -> column index.
        user_item_matrix: item x user sparse matrix.
        itemid_to_emb_idx: mapping from item row index to embedding row.
        emb_index: hnswlib index whose labels are rows of `emb_data`.
        emb_data: embedding matrix, one row per item.
        idx_to_itemid_emb: array giving the identifier of each embedding row;
            these values become the index of the result.
        top_n: maximum number of neighbours to return.

    Returns:
        Series of similarities (1 - cosine distance) indexed by
        `idx_to_itemid_emb[label]`, never containing the anchor itself. Empty
        when the user is unknown, has no interactions, or the anchor has no
        embedding.
    """
    if user_id not in user_item_matrix_useridx_map:
        return pd.Series(dtype=float)

    uidx = user_item_matrix_useridx_map[user_id]
    user_col = user_item_matrix[:, uidx].toarray().flatten()
    interacted_indices = np.where(user_col > 0)[0]
    if len(interacted_indices) == 0:
        return pd.Series(dtype=float)

    last_idx = interacted_indices[-1]  # row index in the item-user matrix
    if last_idx not in itemid_to_emb_idx:
        # anchor has no embedding: nothing to recommend from content
        return pd.Series(dtype=float)

    emb_idx = itemid_to_emb_idx[last_idx]
    # ask for one extra neighbour because the anchor usually comes back too
    k = min(top_n + 1, emb_data.shape[0])
    labels, distances = emb_index.knn_query(emb_data[emb_idx], k=k)
    labels = np.asarray(labels[0])
    distances = np.asarray(distances[0])

    # Drop the anchor by label, not by position: items with identical embeddings
    # tie at distance 0, so the anchor is not guaranteed to be the first result.
    not_anchor = labels != emb_idx
    labels = labels[not_anchor][:top_n]
    distances = distances[not_anchor][:top_n]

    sims = 1.0 - distances  # convert dist->sim
    items_emb_ids = idx_to_itemid_emb[labels]
    scores = pd.Series(sims, index=items_emb_ids)
    return scores
In [41]:
def combine_cf_cb_scores(cf_scores: pd.Series, cb_scores: pd.Series, alpha=0.7):
    """Blend CF and CB scores into one ranked Series.

    Both inputs are aligned on the union of their indices (missing entries
    count as 0), each is min-max scaled to [0, 1] (a constant Series scales to
    all zeros), and the result is alpha * CF + (1 - alpha) * CB sorted from
    best to worst. Two empty inputs give an empty Series.
    """
    if cf_scores.empty and cb_scores.empty:
        return pd.Series(dtype=float)

    def _rescale(values):
        lowest, highest = values.min(), values.max()
        if highest - lowest < 1e-9:
            return pd.Series(0.0, index=values.index)
        return (values - lowest) / (highest - lowest)

    all_items = cf_scores.index.union(cb_scores.index)
    cf_part = _rescale(cf_scores.reindex(all_items).fillna(0.0))
    cb_part = _rescale(cb_scores.reindex(all_items).fillna(0.0))

    blended = alpha * cf_part + (1 - alpha) * cb_part
    return blended.sort_values(ascending=False)

# **Evaluation metrics**

In [42]:
def precision_recall_at_k(recommended: List[int], true_items: List[int], k=5) -> Tuple[float,float]:
    """Return (precision@k, recall@k) for one recommendation list.

    Precision divides the number of distinct hits in the first `k`
    recommendations by `k` itself (even when fewer than `k` items were
    recommended); recall divides by len(true_items). No recommendations gives
    (0.0, 0.0); no true items gives recall 0.0.
    """
    top_k = recommended[:k]
    if len(top_k) == 0:
        return 0.0, 0.0

    n_hits = len(set(top_k).intersection(set(true_items)))
    if len(true_items) > 0:
        recall = n_hits / len(true_items)
    else:
        recall = 0.0
    return n_hits / k, recall

In [43]:
def ndcg_at_k(recommended: List[int], true_items: List[int], k=5) -> float:
    """Binary-relevance NDCG@k for one recommendation list.

    Each of the first `k` recommendations found in `true_items` earns
    1 / log2(rank + 1) with rank starting at 1. The total is divided by the
    score of an ideal list holding min(len(true_items), k) hits at the top.
    Returns 0.0 when there are no true items.
    """
    gained = sum(
        (1.0 / np.log2(pos + 2) for pos, item in enumerate(recommended[:k]) if item in true_items),
        0.0,
    )
    n_ideal_hits = min(len(true_items), k)
    best_possible = sum(1.0 / np.log2(pos + 2) for pos in range(n_ideal_hits))
    if best_possible > 0:
        return gained / best_possible
    return 0.0

# **Full evaluation loop (item-based CF + CB hybrid)**

In [44]:
def _load_hybrid_eval_checkpoint(cache_path, signature):
    """Return a usable checkpoint dict from `cache_path`, or None to start fresh.

    A checkpoint is usable only when it was written for the same alphas, k and
    number of test users (`signature`) and carries the partial metric
    accumulators of the alpha that was in progress.
    """
    if not os.path.exists(cache_path):
        return None
    try:
        # NOTE: pickle can execute arbitrary code; this file is only ever written
        # by evaluate_hybrid_item_cf into the local cache directory.
        with open(cache_path, "rb") as f:
            checkpoint = pickle.load(f)
    except Exception as e:
        print(f"⚠️ Failed to load checkpoint due to: {e}. Starting fresh...")
        return None

    usable = (
        isinstance(checkpoint, dict)
        and checkpoint.get("signature") == signature
        and isinstance(checkpoint.get("partial"), dict)
        and isinstance(checkpoint.get("results"), list)
        and 0 <= checkpoint.get("alpha_index", -1) <= len(signature["alphas"])
        and 0 <= checkpoint.get("index", -1) <= signature["n_users"]
    )
    if not usable:
        print(f"⚠️ Checkpoint {cache_path} does not match this run. Starting fresh...")
        return None
    return checkpoint


def evaluate_hybrid_item_cf(
    train_df, test_df, item_features,
    emb, itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index,
    item_user_sparse, idx_to_itemid_itemuser,
    item_neighbors_idx, item_neighbors_sims,
    alpha_values, k=5
):
    """Evaluate the hybrid item-based CF + CB recommender for several alphas.

    For every test user and every alpha:
      1. CF scores: sum the neighbour similarities of each training item of
         the user (repeated interactions count repeatedly).
      2. CB scores: query the HNSW index around the user's last training item.
      3. Min-max scale both, blend as alpha * CF + (1 - alpha) * CB, remove
         items already seen in training and keep the top `k`.
      4. Accumulate precision, recall, hit rate, NDCG and catalogue coverage.

    Progress is checkpointed to CACHE_DIR every 1000 users, including the
    partial metric lists, so an interrupted run resumes with correct averages.
    A checkpoint written for different alphas, k or user count is ignored.

    Args:
        train_df, test_df: interaction frames with `visitorid` and `itemid`;
            `train_df` rows are expected in chronological order per user.
        item_features: only its length is used (coverage denominator).
        emb: embedding matrix queried through `emb_index`.
        itemuser_idx_to_emb_idx: dict CF row index -> embedding row.
        idx_to_itemid_emb_arr: itemid of every embedding row.
        emb_index: object exposing `knn_query(vector, k=...)`.
        item_user_sparse: unused; kept for interface compatibility.
        idx_to_itemid_itemuser: itemid of every CF row.
        item_neighbors_idx, item_neighbors_sims: (n_items, k_nn) neighbour
            arrays aligned with the CF rows.
        alpha_values: CF weights to evaluate.
        k: recommendation list length.

    Returns:
        DataFrame with one row per alpha and columns alpha, Precision@5,
        Recall@5, HitRate@5, Coverage, NDCG@5 (the "@5" labels are fixed
        column names; the metrics themselves use `k`).
    """
    CB_QUERY_K = 200        # neighbours requested from the CB index per user
    CHECKPOINT_EVERY = 1000  # users between two checkpoints

    print("Evaluating hybrid item-based CF + CB with safety checks + checkpointing...")

    # === Lookup tables, built once (replaces per-user list scans) ===
    itemuser_idx_to_itemid = np.array(idx_to_itemid_itemuser)
    cf_itemids = itemuser_idx_to_itemid.tolist()
    n_cf_items = len(cf_itemids)
    itemid_to_cf_idx = {}
    for cf_idx, itemid in enumerate(cf_itemids):
        itemid_to_cf_idx.setdefault(itemid, cf_idx)  # first occurrence, like list.index
    itemid_to_emb_idx = {}
    for cf_idx, emb_idx in itemuser_idx_to_emb_idx.items():
        itemid_to_emb_idx.setdefault(cf_itemids[cf_idx], emb_idx)
    emb_itemids = np.asarray(idx_to_itemid_emb_arr).tolist()
    n_emb_items = len(emb_itemids)
    neighbors_idx = np.asarray(item_neighbors_idx)
    neighbors_sims = np.asarray(item_neighbors_sims)

    true_items_per_user = test_df.groupby("visitorid")["itemid"].apply(list).to_dict()
    test_users = list(true_items_per_user.keys())
    total_users = len(test_users)
    print(f"Total users in test: {total_users:,}")

    # Training history of every test user, in train_df row order.
    train_items_per_user = (
        train_df[train_df["visitorid"].isin(test_users)]
        .groupby("visitorid", sort=False)["itemid"]
        .apply(list)
        .to_dict()
    )

    cache_path = os.path.join(CACHE_DIR, "hybrid_eval_checkpoint.pkl")
    final_path = os.path.join(CACHE_DIR, "hybrid_eval_final.pkl")
    signature = {
        "alphas": [float(a) for a in alpha_values],
        "k": int(k),
        "n_users": total_users,
    }

    def new_partial():
        """Fresh per-alpha metric accumulators."""
        return {"precisions": [], "recalls": [], "hit_rates": [], "ndcgs": [], "recommended": set()}

    def save_checkpoint(alpha_index, next_user, partial):
        """Atomically persist progress so an interrupted write cannot corrupt it."""
        tmp_path = cache_path + ".tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump({
                "signature": signature,
                "results": results,
                "index": next_user,
                "alpha_index": alpha_index,
                "partial": partial,
            }, f)
        os.replace(tmp_path, cache_path)

    cb_failures = {"count": 0, "first": None}

    def recommend(user_train_items, alpha):
        """Return the top-k recommended itemids for one user ([] when not scorable)."""
        if not user_train_items:
            return []

        # --- (1) CF scores ---
        seed_idxs = [itemid_to_cf_idx[i] for i in user_train_items if i in itemid_to_cf_idx]
        if not seed_idxs:
            return []
        cand = neighbors_idx[seed_idxs].ravel()
        cand_sims = neighbors_sims[seed_idxs].ravel()
        in_range = cand < n_cf_items
        cand, cand_sims = cand[in_range], cand_sims[in_range]
        if cand.size == 0:
            return []
        uniq, inverse = np.unique(cand, return_inverse=True)
        cf_scores = pd.Series(np.bincount(np.ravel(inverse), weights=cand_sims), index=uniq)

        # --- (2) CB scores around the last training item ---
        cb_map = {}
        anchor_emb_idx = itemid_to_emb_idx.get(user_train_items[-1])
        if anchor_emb_idx is not None:
            try:
                labels, distances = emb_index.knn_query(emb[anchor_emb_idx], k=min(CB_QUERY_K, len(emb)))
            except Exception as e:
                # Best effort: without CB candidates the user is scored by CF alone,
                # but the failure is counted and reported instead of being hidden.
                cb_failures["count"] += 1
                if cb_failures["first"] is None:
                    cb_failures["first"] = repr(e)
            else:
                for emb_i, sim in zip(np.asarray(labels[0]).tolist(), (1 - np.asarray(distances[0])).tolist()):
                    if emb_i < n_emb_items:
                        cf_idx = itemid_to_cf_idx.get(emb_itemids[emb_i])
                        if cf_idx is not None:
                            cb_map[cf_idx] = sim
        cb_scores = pd.Series(cb_map) if cb_map else pd.Series(dtype=float)

        # --- (3) Blend ---
        cf_scaled = (cf_scores - cf_scores.min()) / (cf_scores.max() - cf_scores.min() + 1e-9)
        cb_scaled = (cb_scores - cb_scores.min()) / (cb_scores.max() - cb_scores.min() + 1e-9)
        # Parentheses matter: alpha weights CF only, (1 - alpha) weights CB only.
        hybrid = (alpha * cf_scaled).add((1 - alpha) * cb_scaled, fill_value=0)
        hybrid = hybrid.drop(sorted(set(seed_idxs)), errors="ignore")
        # stable sort keeps tie-breaking deterministic across runs
        hybrid = hybrid.sort_values(ascending=False, kind="mergesort")

        top_idxs = [int(idx) for idx in hybrid.index[:k] if idx < n_cf_items]
        return [cf_itemids[idx] for idx in top_idxs]

    # === Resume from a compatible checkpoint, otherwise start from scratch ===
    checkpoint = _load_hybrid_eval_checkpoint(cache_path, signature)
    if checkpoint is not None:
        print(f"🟢 Found checkpoint: {cache_path}, resuming evaluation...")
        results = checkpoint["results"]
        start_idx = checkpoint["index"]
        alpha_start = checkpoint["alpha_index"]
        resumed_partial = checkpoint["partial"]
    else:
        print("🆕 Starting fresh evaluation from scratch...")
        results = []
        start_idx = 0
        alpha_start = 0
        resumed_partial = None

    total_alphas = len(alpha_values)

    # === Loop over alphas ===
    for a_i, alpha in enumerate(alpha_values[alpha_start:], start=alpha_start):
        print(f"Evaluating α={alpha} ({a_i+1}/{total_alphas})...")

        # When resuming, continue the interrupted alpha with its saved accumulators.
        if a_i == alpha_start and resumed_partial is not None:
            partial = resumed_partial
            user_start = start_idx
        else:
            partial = new_partial()
            user_start = 0
        cb_failures["count"], cb_failures["first"] = 0, None

        for offset, u in enumerate(tqdm(test_users[user_start:], total=total_users - user_start)):
            pos = user_start + offset
            true_items = true_items_per_user[u]
            rec_itemids = recommend(train_items_per_user.get(u), alpha)

            # === (4) Metrics (users without recommendations are skipped) ===
            if rec_itemids:
                precision, recall = precision_recall_at_k(rec_itemids, true_items, k=k)
                partial["precisions"].append(precision)
                partial["recalls"].append(recall)
                partial["hit_rates"].append(1 if precision > 0 else 0)
                partial["ndcgs"].append(ndcg_at_k(rec_itemids, true_items, k=k))
                partial["recommended"].update(rec_itemids)

            # === Checkpoint after every CHECKPOINT_EVERY users ===
            if (pos + 1) % CHECKPOINT_EVERY == 0:
                save_checkpoint(a_i, pos + 1, partial)
                print(f"💾 Checkpoint saved at user {pos + 1}/{total_users} for α={alpha}")

        if cb_failures["count"]:
            print(f"⚠️ CB query failed for {cb_failures['count']:,} users (first error: {cb_failures['first']})")

        # === Record the result for this alpha ===
        results.append({
            "alpha": alpha,
            "Precision@5": np.mean(partial["precisions"]) if partial["precisions"] else 0,
            "Recall@5": np.mean(partial["recalls"]) if partial["recalls"] else 0,
            "HitRate@5": np.mean(partial["hit_rates"]) if partial["hit_rates"] else 0,
            "Coverage": len(partial["recommended"]) / len(item_features) if len(item_features) > 0 else 0,
            "NDCG@5": np.mean(partial["ndcgs"]) if partial["ndcgs"] else 0
        })

        # Mark this alpha as done so a crash now does not repeat it.
        save_checkpoint(a_i + 1, 0, new_partial())

    # === Persist the final results ===
    with open(final_path, "wb") as f:
        pickle.dump(results, f)
    print(f"✅ Evaluation complete. Saved to {final_path}")

    # The checkpoint is no longer needed once the run has finished.
    if os.path.exists(cache_path):
        os.remove(cache_path)
        print("🧹 Removed temporary checkpoint file.")

    return pd.DataFrame(results)

# **Orchestrator main()**

In [45]:
def main():
    """Run the whole pipeline: load, split, build CB and CF models, evaluate, print.

    Reads the CSV inputs, keeps sufficiently active users, holds out each
    user's last interactions, restricts everything to items that have both
    training interactions and category features, builds the content
    embeddings + HNSW index and the item-based CF neighbours, then evaluates
    the hybrid for every alpha in ALPHAS and prints the result table.
    """
    print("Loading data...")
    events, item_props, category_tree = load_data()
    events = preprocess_events(events)

    # Keep only active users
    events = events[events['visitorid'].notnull()].copy()
    user_counts = events['visitorid'].value_counts()
    active_users_list = user_counts[user_counts >= MIN_INTERACTIONS_ACTIVE_USER].index
    events = events[events['visitorid'].isin(active_users_list)].copy()
    print(f"After filtering active users: total interactions {len(events):,}, unique users {events['visitorid'].nunique():,}")

    print("Train/test split (last N per user)...")
    train_df, test_df = train_test_split_by_last_n(events, n_last=TRAIN_TEST_LAST_N)

    # Build item feature matrix (content-based)
    print("Building item category matrix...")
    item_features = build_item_category_matrix(item_props, category_tree)

    # Synchronise items so CF and CB work on the same item domain
    train_items = set(train_df['itemid'].unique())
    feature_items = set(item_features.index)
    common_items = list(train_items & feature_items)

    print(f"Items in train: {len(train_items):,}")
    print(f"Items in features: {len(feature_items):,}")
    print(f"Common items after sync: {len(common_items):,}")

    # Keep only items present on both sides
    train_df = train_df[train_df['itemid'].isin(common_items)].copy()
    test_df = test_df[test_df['itemid'].isin(common_items)].copy()
    item_features = item_features.loc[common_items].copy()

    # Build CB embeddings
    print("Building CB embeddings (TruncatedSVD) ...")
    emb, itemid_to_emb_idx_map, idx_to_itemid_emb_arr = build_cb_embeddings(item_features, svd_dim=SVD_DIM)

    # Build the HNSW index for CB
    print("Building HNSW index on CB embeddings...")
    emb_index = build_hnsw_index(emb, ef_construction=HNSW_EF_CONSTRUCTION, M=HNSW_M, ef_search=HNSW_EF_SEARCH)

    # Build the item-user matrix for item-based CF
    print("Building item-user sparse matrix...")
    item_user_sparse, idx_to_itemid_itemuser, item_to_idx_itemuser, user_to_idx_itemuser = build_item_user_matrix(train_df)

    # Synchronise again: make sure CF rows and CB embeddings know the same items
    valid_items_cf = set(idx_to_itemid_itemuser)
    valid_items_cb = set(itemid_to_emb_idx_map.keys())
    # Kept as a set: the membership test below runs once per item, and testing
    # against a list would make that filter quadratic in the number of items.
    common_final_items = valid_items_cf & valid_items_cb
    print(f"Items in CF: {len(valid_items_cf):,}")
    print(f"Items in CB: {len(valid_items_cb):,}")
    print(f"Final common items (CF ∩ CB): {len(common_final_items):,}")

    # Restrict the CF matrix to items available on both sides
    keep_idx = [i for i, iid in enumerate(idx_to_itemid_itemuser) if iid in common_final_items]
    item_user_sparse = item_user_sparse[keep_idx, :]
    idx_to_itemid_itemuser = [idx_to_itemid_itemuser[i] for i in keep_idx]

    # Mapping CF row index -> embedding row index
    itemuser_idx_to_emb_idx = {}
    for it_idx, itemid in enumerate(idx_to_itemid_itemuser):
        if itemid in itemid_to_emb_idx_map:
            itemuser_idx_to_emb_idx[it_idx] = itemid_to_emb_idx_map[itemid]

    # Build item neighbours (item-based CF)
    neighbors_cache = os.path.join(CACHE_DIR, "item_neighbors.pkl")
    print("Building item neighbors (item-based CF) ... (may take time)")
    item_neighbors_idx, item_neighbors_sims = build_item_neighbors(item_user_sparse, topk=ITEM_NN_TOPK, cache_path=neighbors_cache)

    # Sanity print to spot an index mismatch between neighbours and CF rows
    print(f"item_neighbors_idx shape: {item_neighbors_idx.shape}, item_user_sparse items: {len(idx_to_itemid_itemuser)}")

    # Evaluate the hybrid (CF + CB)
    print("Starting evaluation of hybrid item-based CF + CB ...")
    results_df = evaluate_hybrid_item_cf(
        train_df, test_df, item_features,
        emb, itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index,
        item_user_sparse, idx_to_itemid_itemuser,
        item_neighbors_idx, item_neighbors_sims,
        alpha_values=ALPHAS, k=K_EVAL
    )

    print("\n===== Evaluation Results =====")
    print(results_df.to_string(index=False))

In [None]:
# Entry point: run the full pipeline when this code is executed as the main program.
if __name__ == "__main__":
    main()

Loading data...
After filtering active users: total interactions 2,756,101, unique users 1,407,580
Train/test split (last N per user)...
Train: 2,394,853 rows, Test: 361,248 rows, Unique users: 1,407,580
Building item category matrix...
Items in train: 229,547
Items in features: 417,053
Common items after sync: 180,664
Building CB embeddings (TruncatedSVD) ...
Building HNSW index on CB embeddings...
Building item-user sparse matrix...
Items in CF: 180,664
Items in CB: 180,664
Final common items (CF ∩ CB): 180,664
Building item neighbors (item-based CF) ... (may take time)
🟢 Found existing cache: cache_reco\item_neighbors.pkl
✅ Cache valid. Using cached neighbors (180,664 items, topk=200).
item_neighbors_idx shape: (180664, 200), item_user_sparse items: 180664
Starting evaluation of hybrid item-based CF + CB ...
Evaluating hybrid item-based CF + CB with safety checks + checkpointing...
Total users in test: 117,268
🟢 Found checkpoint: cache_reco\hybrid_eval_checkpoint.pkl, resuming evalu

  0%|          | 0/107268 [00:00<?, ?it/s]

💾 Checkpoint saved at user 11000/117268 for α=0.1
💾 Checkpoint saved at user 12000/117268 for α=0.1
💾 Checkpoint saved at user 13000/117268 for α=0.1
💾 Checkpoint saved at user 14000/117268 for α=0.1
💾 Checkpoint saved at user 15000/117268 for α=0.1
💾 Checkpoint saved at user 16000/117268 for α=0.1
💾 Checkpoint saved at user 17000/117268 for α=0.1
💾 Checkpoint saved at user 18000/117268 for α=0.1
💾 Checkpoint saved at user 19000/117268 for α=0.1
💾 Checkpoint saved at user 20000/117268 for α=0.1
💾 Checkpoint saved at user 21000/117268 for α=0.1
💾 Checkpoint saved at user 22000/117268 for α=0.1
💾 Checkpoint saved at user 23000/117268 for α=0.1
💾 Checkpoint saved at user 24000/117268 for α=0.1
💾 Checkpoint saved at user 25000/117268 for α=0.1
💾 Checkpoint saved at user 26000/117268 for α=0.1
💾 Checkpoint saved at user 27000/117268 for α=0.1
💾 Checkpoint saved at user 28000/117268 for α=0.1
💾 Checkpoint saved at user 29000/117268 for α=0.1
💾 Checkpoint saved at user 30000/117268 for α=0.1
