In [58]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Tuple, Dict
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import hnswlib
import time

In [59]:
# Input CSV paths (relative to the working directory); load_data() reads each with pd.read_csv.
EVENTS_FILE = "events.csv"
# The item properties come in two files that load_data() concatenates row-wise.
ITEM_PROP1_FILE = "item_properties_part1.csv"
ITEM_PROP2_FILE = "item_properties_part2.csv"
CATEGORY_TREE_FILE = "category_tree.csv"
# Directory for on-disk artefacts; the evaluation checkpoint file is written under it.
CACHE_DIR = "cache_reco_2"
# Side effect at cell execution time: create the cache directory if it does not exist yet.
os.makedirs(CACHE_DIR, exist_ok=True)

In [60]:
# Default `svd_dim` of build_cb_embeddings(): number of TruncatedSVD components.
SVD_DIM = 64
# Passed by main() as `topk` to build_item_neighbors().
ITEM_NN_TOPK = 200
# NOTE(review): no use of this constant is visible in this notebook section — confirm before relying on it.
CF_NEIGHBORS_TOPK = 50
# These three mirror the literal default arguments of build_hnsw_index() (M, ef_construction, ef_search).
HNSW_M = 64
HNSW_EF_CONSTRUCTION = 200
HNSW_EF_SEARCH = 100
# NOTE(review): no use of this constant is visible in this notebook section — confirm before relying on it.
MIN_INTERACTIONS_ACTIVE_USER = 1
# Passed by main() as `n_last` to train_test_split_by_last_n(): events held out per user.
TRAIN_TEST_LAST_N = 3
# Passed by main() as `k` to the evaluation: length of the recommendation list that is scored.
K_EVAL = 5

In [61]:
def save_pickle(obj, path):
    """Serialize ``obj`` with pickle and write it to ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten.
    """
    with open(path, mode="wb") as sink:
        pickle.dump(obj, sink)

In [62]:
def load_pickle(path):
    """Read the file at ``path`` and return the object pickled inside it."""
    # NOTE: unpickling can execute arbitrary code. Only load files that this
    # notebook (or another trusted process) wrote.
    with open(path, mode="rb") as source:
        restored = pickle.load(source)
    return restored

# **Data Loading & Preprocessing**

In [63]:
def load_data():
    """Read the input CSV files.

    Returns:
        (events, item_props, category_tree) where ``item_props`` is the two
        item-property files stacked row-wise with a fresh 0..n-1 index.
    """
    events_raw = pd.read_csv(EVENTS_FILE)
    property_parts = [pd.read_csv(part) for part in (ITEM_PROP1_FILE, ITEM_PROP2_FILE)]
    category_tree = pd.read_csv(CATEGORY_TREE_FILE)
    stacked_props = pd.concat(property_parts, axis=0, ignore_index=True)
    return events_raw, stacked_props, category_tree

In [64]:
def preprocess_events(events: pd.DataFrame) -> pd.DataFrame:
    """Filter and normalise the raw event log.

    Steps:
      * keep only 'view', 'addtocart' and 'transaction' events;
      * convert ``timestamp`` from epoch milliseconds to datetime unless the
        column already has a datetime dtype (unparseable values become NaT);
      * add a ``weight`` column (view=1, addtocart=3, transaction=5);
      * drop rows with a missing ``visitorid``, ``itemid`` or ``timestamp``;
      * cast ``visitorid`` and ``itemid`` to int.

    Rows with a missing/unparseable timestamp are dropped because the
    train/test split orders each user's events by timestamp: NaT sorts last
    and would otherwise be treated as the user's most recent events.

    Args:
        events: frame with at least the columns ``event``, ``timestamp``,
            ``visitorid`` and ``itemid``. It is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    tracked = events[events['event'].isin(['view', 'addtocart', 'transaction'])].copy()

    # is_datetime64_any_dtype also accepts tz-aware columns, on which
    # np.issubdtype raises a TypeError.
    if not pd.api.types.is_datetime64_any_dtype(tracked['timestamp']):
        tracked['timestamp'] = pd.to_datetime(tracked['timestamp'], unit='ms', errors='coerce')

    weight_map = {'view': 1.0, 'addtocart': 3.0, 'transaction': 5.0}
    tracked['weight'] = tracked['event'].map(weight_map)

    return (
        tracked
        .dropna(subset=['visitorid', 'itemid', 'timestamp'])
        .assign(
            visitorid=lambda frame: frame['visitorid'].astype(int),
            itemid=lambda frame: frame['itemid'].astype(int),
        )
    )

In [65]:
def train_test_split_by_last_n(events: pd.DataFrame, n_last=3) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Hold out each user's last ``n_last`` events (by timestamp) as test data.

    Users with ``n_last`` or fewer events contribute all of their events to
    the training set and nothing to the test set.

    The split is vectorised (one sort plus two group-wise counters) instead of
    a Python loop over users, and it returns empty frames rather than raising
    when there is nothing to hold out. ``n_last <= 0`` yields an empty test set.

    Args:
        events: frame with ``visitorid`` and ``timestamp`` columns. Rows with
            a missing ``visitorid`` are ignored.
        n_last: number of most recent events per user to hold out.

    Returns:
        (train_df, test_df), both ordered by visitorid and then timestamp, and
        keeping the original index labels.
    """
    # Stable sort keeps the input order among events that share a timestamp.
    ordered = (
        events
        .dropna(subset=['visitorid'])
        .sort_values(['visitorid', 'timestamp'], kind='stable')
    )
    per_user = ordered.groupby('visitorid', sort=False)

    # Position of every event counted from the start / the end of its user's history.
    from_start = per_user.cumcount().to_numpy()
    from_end = per_user.cumcount(ascending=False).to_numpy()
    history_len = from_start + from_end + 1

    # Plain numpy masks avoid index alignment, so duplicated index labels are fine.
    held_out = (history_len > n_last) & (from_end < n_last)
    return ordered[~held_out], ordered[held_out]

# **Content-Based Embedding**

In [66]:
def build_item_category_matrix(item_props: pd.DataFrame, category_tree: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-item category indicator matrix.

    Only rows of ``item_props`` whose ``property`` equals 'categoryid' are
    used; their ``value`` is cast to int and one-hot encoded.

    An item can appear in several 'categoryid' rows. Those rows are merged so
    the returned index is unique: an item is flagged for every category it was
    ever assigned. A duplicated index would otherwise produce duplicate
    embedding rows and an ambiguous itemid -> row mapping downstream.

    Args:
        item_props: frame with ``itemid``, ``property`` and ``value`` columns.
        category_tree: accepted for interface compatibility; not used here.

    Returns:
        DataFrame indexed by ``itemid`` (unique) with one indicator column per
        category id.
    """
    cats = (
        item_props.loc[item_props['property'] == 'categoryid', ['itemid', 'value']]
        .rename(columns={'value': 'categoryid'})
        .astype({'categoryid': int})
        .drop_duplicates()
    )
    pivot = pd.get_dummies(cats.set_index('itemid')['categoryid'])
    if not pivot.index.is_unique:
        # Collapse repeated items: a category is set if any of the item's rows set it.
        pivot = pivot.groupby(level=0).max()
    return pivot

In [67]:
def build_cb_embeddings(item_features: pd.DataFrame, svd_dim: int = SVD_DIM):
    """Reduce the item feature matrix to dense content-based embeddings.

    The feature values are projected with TruncatedSVD (``svd_dim``
    components, random_state=42) and the result is passed through sklearn's
    ``normalize`` with its default settings.

    Args:
        item_features: frame whose index holds the item ids, one row per
            embedding row.
        svd_dim: number of SVD components.

    Returns:
        (emb, itemid_to_idx, idx_to_itemid): the embedding matrix, a dict from
        item id to embedding row, and an array from embedding row to item id.
    """
    ordered_ids = item_features.index.to_list()

    reducer = TruncatedSVD(n_components=svd_dim, random_state=42)
    projected = reducer.fit_transform(item_features.values)
    unit_rows = normalize(projected)

    # If an id occurs more than once, the last position wins (as with a dict comprehension).
    row_of_item = dict(zip(ordered_ids, range(len(ordered_ids))))
    item_of_row = np.array(ordered_ids)
    return unit_rows, row_of_item, item_of_row

In [68]:
def build_hnsw_index(emb: np.ndarray, ef_construction=200, M=64, ef_search=100):
    """Create an hnswlib index (space='cosine') over the rows of ``emb``.

    Each row is inserted with its row number as the label, so labels returned
    by queries are row positions in ``emb``.

    Args:
        emb: 2-D array, one vector per row.
        ef_construction: forwarded to ``init_index``.
        M: forwarded to ``init_index``.
        ef_search: forwarded to ``set_ef`` after the items are added.

    Returns:
        The populated ``hnswlib.Index``.
    """
    n_vectors, n_dims = emb.shape
    ann = hnswlib.Index(space='cosine', dim=n_dims)
    ann.init_index(max_elements=n_vectors, ef_construction=ef_construction, M=M)
    row_labels = np.arange(n_vectors)
    ann.add_items(emb, row_labels)
    ann.set_ef(ef_search)
    return ann

# **Collaborative Filtering**

In [69]:
def build_item_user_matrix(train_df: pd.DataFrame):
    """Build the sparse item x user interaction matrix from the training events.

    Row/column numbers are assigned in order of first appearance in
    ``train_df``. Repeated (item, user) pairs are summed by ``csr_matrix``, so
    a cell holds the total event weight.

    The indices are computed with ``pd.factorize`` instead of iterating over
    every row with ``iterrows()``; the resulting mappings and matrix are the
    same.

    Args:
        train_df: frame with ``visitorid``, ``itemid`` and ``weight`` columns.
            Assumes ``visitorid``/``itemid`` contain no missing values
            (preprocess_events drops them).

    Returns:
        (item_user_sparse, idx_to_itemid, item_to_idx, user_to_idx):
        the CSR matrix of shape (n_items, n_users), the list mapping row
        number to item id, and dicts mapping item id / visitor id to their
        row / column number.
    """
    user_codes, users = pd.factorize(train_df['visitorid'].to_numpy())
    item_codes, items = pd.factorize(train_df['itemid'].to_numpy())

    user_to_idx = {u: i for i, u in enumerate(users)}
    item_to_idx = {it: j for j, it in enumerate(items)}

    weights = train_df['weight'].to_numpy()
    item_user_sparse = csr_matrix(
        (weights, (item_codes, user_codes)),
        shape=(len(items), len(users)),
    )
    idx_to_itemid = list(items)
    return item_user_sparse, idx_to_itemid, item_to_idx, user_to_idx

In [70]:
def build_item_neighbors(item_user_sparse, topk=200, cache_path=None):
    """Find each item's nearest items by cosine similarity of their user vectors.

    Args:
        item_user_sparse: item x user matrix; every row is one item.
        topk: neighbours requested per item. It is capped at the number of
            items, because ``kneighbors`` rejects ``n_neighbors`` larger than
            the number of fitted samples.
        cache_path: optional pickle file. A cached result is reused only if
            its shape matches the current matrix and ``topk``; otherwise it is
            recomputed and overwritten.

    Returns:
        (idxs, sims): two arrays of shape (n_items, k). ``idxs[i]`` holds the
        row numbers of item i's neighbours and ``sims[i]`` the matching cosine
        similarities (1 - cosine distance). The query set equals the fitted
        set, so an item is normally its own first neighbour.
    """
    n_items = item_user_sparse.shape[0]
    k = min(topk, n_items)

    if cache_path and os.path.exists(cache_path):
        cached = load_pickle(cache_path)
        usable = (
            isinstance(cached, tuple)
            and len(cached) == 2
            and np.shape(cached[0]) == (n_items, k)
            and np.shape(cached[1]) == (n_items, k)
        )
        if usable:
            return cached
        print(f"Ignoring stale neighbor cache at {cache_path}; recomputing.")

    if n_items == 0:
        # Nothing to fit: return empty results instead of letting sklearn raise.
        return np.empty((0, 0), dtype=np.int64), np.empty((0, 0), dtype=np.float64)

    model = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)
    model.fit(item_user_sparse)
    # kneighbors returns (distances, indices); similarity = 1 - cosine distance.
    distances, idxs = model.kneighbors(item_user_sparse, n_neighbors=k)
    sims = 1 - distances

    if cache_path:
        save_pickle((idxs, sims), cache_path)
    return idxs, sims

# **Adaptive Alpha**

In [71]:
def compute_adaptive_alpha(user_id, user_item_matrix, min_alpha=0.1, max_alpha=0.9, slope='linear'):
    """Scale the CF weight ``alpha`` with how many items a user has interacted with.

    Args:
        user_id: label looked up in ``user_item_matrix.index``.
        user_item_matrix: DataFrame with one row per user; a cell > 0 counts
            as an interaction.
        min_alpha: value returned for unknown users and users with at most one
            interaction; also the lower clip bound.
        max_alpha: upper clip bound.
        slope: 'sqrt' applies a square root to the activity fraction; any
            other value (including 'linear') uses the fraction as is.

    Returns:
        ``min_alpha`` unchanged for the early-exit cases, otherwise a float in
        [min_alpha, max_alpha].
    """
    known_user = user_id in user_item_matrix.index
    if not known_user:
        return min_alpha

    touched = (user_item_matrix.loc[user_id] > 0).sum()
    if touched <= 1:
        return min_alpha

    # Activity relative to the busiest user; the epsilon guards the division.
    busiest = (user_item_matrix > 0).sum(axis=1).max()
    share = touched / (busiest + 1e-9)
    scaled = np.sqrt(share) if slope == 'sqrt' else share

    blended = min_alpha + (max_alpha - min_alpha) * scaled
    return float(np.clip(blended, min_alpha, max_alpha))

# **Hybrid Recommender**

In [72]:
# Single-slot caches keyed by object identity. The recommender is called once
# per user with the same train_df / item list, so the expensive per-dataset
# lookups are built once instead of on every call. Each slot keeps a reference
# to the object it was built from (so its identity cannot be reused) and is
# rebuilt when a different object, or one with a different length, is passed.
_HYBRID_RECO_CACHE = {}


def _user_event_positions(train_df):
    """Return (visitorid -> positional row numbers in train_df, largest per-user event count)."""
    slot = _HYBRID_RECO_CACHE.get("train")
    if slot is None or slot[0] is not train_df or slot[1] != len(train_df):
        positions = train_df.groupby("visitorid").indices
        busiest = max((len(rows) for rows in positions.values()), default=0)
        slot = (train_df, len(train_df), positions, busiest)
        _HYBRID_RECO_CACHE["train"] = slot
    return slot[2], slot[3]


def _itemid_row_lookup(idx_to_itemid):
    """Return a dict mapping item id -> row number for the given row -> item id sequence."""
    slot = _HYBRID_RECO_CACHE.get("items")
    if slot is None or slot[0] is not idx_to_itemid or slot[1] != len(idx_to_itemid):
        lookup = {iid: i for i, iid in enumerate(idx_to_itemid)}
        slot = (idx_to_itemid, len(idx_to_itemid), lookup)
        _HYBRID_RECO_CACHE["items"] = slot
    return slot[2]


def _most_recent_embedded_row(train_df, user_positions, interacted_item_idxs,
                              idx_to_itemid, row_to_emb_idx):
    """Return the matrix row of the user's most recent training item that has an embedding, or None."""
    if user_positions is not None and "timestamp" in train_df.columns:
        row_of_item = {idx_to_itemid[r]: r for r in interacted_item_idxs}
        # Missing timestamps are treated as the oldest events.
        history = train_df.iloc[user_positions].sort_values(
            "timestamp", kind="stable", na_position="first"
        )
        for itemid in history["itemid"].to_numpy()[::-1]:
            row = row_of_item.get(itemid)
            if row is not None and row in row_to_emb_idx:
                return row
    # Without usable timestamps fall back to the highest interacted row number.
    fallback = interacted_item_idxs[-1]
    return fallback if fallback in row_to_emb_idx else None


def hybrid_weighted_recommend_adaptive(
    user_id, train_df,
    item_user_sparse, idx_to_itemid_itemuser,
    item_neighbors_idx, item_neighbors_sims,
    itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index, emb,
    user_to_idx_itemuser,
    top_n=5, min_alpha=0.1, max_alpha=0.9,
    cb_topk=200
):
    """Recommend items by blending item-item CF scores with content-based scores.

    The blend is ``alpha * CF + (1 - alpha) * CB`` after min-max scaling each
    part to [0, 1]. ``alpha`` grows linearly from ``min_alpha`` to
    ``max_alpha`` with the user's number of training events relative to the
    most active user; users with at most one event get ``min_alpha``.

    CF part: for every item the user interacted with, its neighbours'
    similarities are added, weighted by the user's interaction weight.
    CB part: nearest neighbours (via ``emb_index``) of the user's most recent
    training item that has an embedding.

    Args:
        user_id: visitor id to recommend for.
        train_df: training events with ``visitorid`` and ``itemid`` columns;
            ``timestamp`` is used, when present, to find the most recent item.
        item_user_sparse: item x user matrix.
        idx_to_itemid_itemuser: sequence mapping matrix row -> item id.
        item_neighbors_idx, item_neighbors_sims: per-row neighbour rows and
            similarities, as returned by build_item_neighbors. They must be
            computed on the same rows as ``item_user_sparse``.
        itemuser_idx_to_emb_idx: dict mapping matrix row -> embedding row.
        idx_to_itemid_emb_arr: array mapping embedding row -> item id.
        emb_index: object with ``knn_query(vector, k)`` returning
            (labels, distances), both with a leading batch axis.
        emb: embedding matrix.
        user_to_idx_itemuser: dict mapping visitor id -> matrix column.
        top_n: number of recommendations.
        min_alpha, max_alpha: bounds of the CF weight.
        cb_topk: number of content-based neighbours requested.

    Returns:
        Up to ``top_n`` item ids, best first, never including items the user
        already interacted with. ``[]`` for unknown users and for users with
        no positive interaction in the matrix.
    """
    # --- 0) user existence
    if user_id not in user_to_idx_itemuser:
        return []

    # --- 1) adaptive alpha from the user's number of training events
    positions_by_user, max_n = _user_event_positions(train_df)
    user_positions = positions_by_user.get(user_id)
    n_user_inter = 0 if user_positions is None else len(user_positions)
    if n_user_inter <= 1:
        alpha = min_alpha
    else:
        frac = n_user_inter / (max_n + 1e-9)
        alpha = float(np.clip(min_alpha + (max_alpha - min_alpha) * frac, min_alpha, max_alpha))

    # --- 2) rows of the items this user interacted with
    ucol = user_to_idx_itemuser[user_id]
    user_col = item_user_sparse[:, ucol].toarray().ravel()  # a single column is cheap to densify
    interacted_item_idxs = np.where(user_col > 0)[0]
    if len(interacted_item_idxs) == 0:
        return []

    # --- 3) CF scores: sum of neighbour similarities weighted by interaction weight
    n_items = item_neighbors_idx.shape[0]
    agg = np.zeros(n_items, dtype='float32')
    # Ignore rows the neighbour table does not cover instead of raising IndexError.
    scorable = interacted_item_idxs[interacted_item_idxs < n_items]
    for it_idx in scorable:
        agg[item_neighbors_idx[it_idx]] += item_neighbors_sims[it_idx] * float(user_col[it_idx])
    agg[scorable] = 0.0  # already-seen items must not be scored
    cf_scores_series = pd.Series(agg, index=np.arange(n_items))

    # --- 4) CB scores: neighbours of the most recent item that has an embedding
    cb_scores_series = pd.Series(dtype=float)
    seed_idx = _most_recent_embedded_row(
        train_df, user_positions, interacted_item_idxs,
        idx_to_itemid_itemuser, itemuser_idx_to_emb_idx,
    )
    kq = min(cb_topk, emb.shape[0])
    if seed_idx is not None and kq > 0:
        emb_idx = itemuser_idx_to_emb_idx[seed_idx]
        labels, distances = emb_index.knn_query(emb[emb_idx], k=kq)
        labels = np.asarray(labels[0]).astype(np.int64)
        distances = np.asarray(distances[0])
        # Remove the query item itself wherever it appears (it is not guaranteed to be first).
        not_self = labels != emb_idx
        sims = 1.0 - distances[not_self]
        emb_itemids = np.asarray(idx_to_itemid_emb_arr)[labels[not_self]]

        row_of_itemid = _itemid_row_lookup(idx_to_itemid_itemuser)
        cb_idx, cb_vals = [], []
        for iid, s in zip(emb_itemids, sims):
            row = row_of_itemid.get(iid)
            if row is not None:
                cb_idx.append(row)
                cb_vals.append(s)
        if cb_idx:
            cb_scores_series = pd.Series(cb_vals, index=cb_idx)

    # --- 5) scale both parts to [0, 1]
    def safe_minmax_ser(s):
        if s.empty:
            return s
        a = s.to_numpy(dtype='float32')
        mn, mx = a.min(), a.max()
        if mx - mn < 1e-9:
            # A constant series carries no ranking information.
            return pd.Series(0.0, index=s.index)
        return pd.Series((a - mn) / (mx - mn), index=s.index)

    cf_norm = safe_minmax_ser(cf_scores_series)
    cb_norm = safe_minmax_ser(cb_scores_series)
    # Several embedding rows can map to the same item; average them.
    cb_norm = cb_norm.groupby(cb_norm.index).mean()

    # --- 6) blend on the union of scored rows
    union_idx = cf_norm.index.union(cb_norm.index)
    cf_al = cf_norm.reindex(union_idx).fillna(0.0)
    cb_al = cb_norm.reindex(union_idx).fillna(0.0)
    hybrid = alpha * cf_al + (1 - alpha) * cb_al

    # Drop seen items and rows without a known item id *before* taking the top,
    # so filtering cannot shrink the result below top_n.
    hybrid = hybrid.drop(interacted_item_idxs, errors='ignore')
    hybrid = hybrid[hybrid.index < len(idx_to_itemid_itemuser)]
    # Stable sort makes the order of tied scores deterministic.
    top_idxs = hybrid.sort_values(ascending=False, kind='stable').head(top_n).index.tolist()
    return [idx_to_itemid_itemuser[i] for i in top_idxs]


# **Evaluation Adaptive**

In [73]:
def precision_recall_at_k(recommended_items, true_items, k=10):
    """Compute precision and recall of the top-k recommendations.

    Precision is hits / k (the denominator is always ``k``, even when fewer
    than ``k`` items were recommended); recall is hits / number of distinct
    true items. Returns (0.0, 0.0) when either input is empty.
    """
    if recommended_items and true_items:
        relevant = set(true_items)
        n_hits = len(relevant.intersection(recommended_items[:k]))
        recall = n_hits / len(relevant) if relevant else 0.0
        return n_hits / k, recall
    return 0.0, 0.0

In [74]:
def dcg_at_k(recommended_items, true_items, k=10):
    """Compute the binary Discounted Cumulative Gain of the top-k recommendations.

    Every recommended item that is in ``true_items`` contributes
    1 / log2(rank + 1), with ranks starting at 1.
    """
    relevant = set(true_items)
    gains = (
        1.0 / np.log2(position + 2)  # position is 0-based, hence +2
        for position, candidate in enumerate(recommended_items[:k])
        if candidate in relevant
    )
    return sum(gains, 0.0)

In [75]:
def ndcg_at_k(recommended_items, true_items, k=10):
    """Compute the binary Normalized DCG of the top-k recommendations.

    The ideal DCG assumes every *distinct* true item is ranked first, i.e. it
    sums the discounts of the first ``min(len(set(true_items)), k)`` positions.
    Counting distinct items matters because ``true_items`` may repeat an item
    (for example when a user's held-out events all concern the same item);
    repeats would otherwise inflate the ideal and understate the score.

    Returns:
        DCG / ideal DCG, or 0.0 when there are no true items (or ``k`` < 1).
    """
    ideal_hits = min(len(set(true_items)), k)
    if ideal_hits <= 0:
        return 0.0
    ideal_dcg = sum(1.0 / np.log2(position + 2) for position in range(ideal_hits))
    return dcg_at_k(recommended_items, true_items, k) / ideal_dcg

In [76]:
def evaluate_hybrid_item_cf_adaptive(
    train_df, test_df, item_features,
    emb, itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index,
    item_user_sparse, idx_to_itemid_itemuser,
    item_neighbors_idx, item_neighbors_sims,
    user_to_idx_itemuser,
    min_alpha=0.1, max_alpha=0.9, k=5,
    max_eval_users=10000, checkpoint_every=1000
):
    """Evaluate the adaptive hybrid recommender on the held-out events.

    For every evaluated user the top-``k`` recommendations are compared with
    the items in that user's ``test_df`` rows. Users for whom the recommender
    returns nothing are skipped and do not enter the averages.

    Progress is checkpointed to ``CACHE_DIR`` every ``checkpoint_every`` users.
    The checkpoint stores the per-user metric values and the set of
    recommended items, so a resumed run reports the same aggregates as an
    uninterrupted one. A checkpoint written for a different number of users or
    a different ``k`` (or in an older format) is ignored.

    Args:
        train_df, test_df: training / held-out events.
        item_features: accepted for interface compatibility; not used here.
        emb, itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index,
        item_user_sparse, idx_to_itemid_itemuser, item_neighbors_idx,
        item_neighbors_sims, user_to_idx_itemuser: forwarded unchanged to
            hybrid_weighted_recommend_adaptive.
        min_alpha, max_alpha: bounds of the adaptive CF weight.
        k: length of the recommendation list that is scored.
        max_eval_users: if truthy and smaller than the number of test users,
            evaluate a fixed-seed random sample of that size.
        checkpoint_every: checkpoint interval in users; falsy disables it.

    Returns:
        One-row DataFrame with Precision@k, Recall@k, HitRate@k, Coverage
        (share of matrix items recommended at least once) and NDCG@k.
    """
    print("Adaptive hybrid eval (memory-safe). Preparing users...")
    true_items_per_user = test_df.groupby("visitorid")["itemid"].apply(list).to_dict()
    all_users = list(true_items_per_user.keys())
    total_users = len(all_users)
    if max_eval_users and total_users > max_eval_users:
        rng = np.random.default_rng(42)  # fixed seed: the sample must be identical when resuming
        sampled_users = list(rng.choice(all_users, size=max_eval_users, replace=False))
        print(f"⚠️ Will evaluate on subset {len(sampled_users):,}/{total_users:,} users (max_eval_users={max_eval_users})")
    else:
        sampled_users = all_users
    n_eval = len(sampled_users)

    cache_path = os.path.join(CACHE_DIR, "hybrid_adaptive_eval_checkpoint.pkl")
    metric_names = ("precision", "recall", "hitrate", "ndcg")
    metrics = {name: [] for name in metric_names}
    all_recommended = set()
    start_idx = 0

    # Resume only from a checkpoint that belongs to this exact evaluation setup.
    if os.path.exists(cache_path):
        try:
            saved = load_pickle(cache_path)
            saved_metrics = saved.get("results")
            matches_run = (
                isinstance(saved_metrics, dict)
                and all(name in saved_metrics for name in metric_names)
                and saved.get("n_users") == n_eval
                and saved.get("k") == k
            )
            if matches_run:
                restored_metrics = {name: list(saved_metrics[name]) for name in metric_names}
                restored_items = set(saved.get("recommended", ()))
                restored_idx = int(saved.get("index", 0))
                metrics, all_recommended, start_idx = restored_metrics, restored_items, restored_idx
                print(f"Resuming evaluation from checkpoint at user index {start_idx}")
            else:
                print("Checkpoint does not match this run, starting fresh.")
        except Exception:
            print("Failed to load checkpoint, starting fresh.")

    for i in range(start_idx, n_eval):
        uid = sampled_users[i]
        true_items = true_items_per_user[uid]
        recs = hybrid_weighted_recommend_adaptive(
            uid, train_df,
            item_user_sparse, idx_to_itemid_itemuser,
            item_neighbors_idx, item_neighbors_sims,
            itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index, emb,
            user_to_idx_itemuser,
            top_n=k, min_alpha=min_alpha, max_alpha=max_alpha
        )

        if recs:
            p, r = precision_recall_at_k(recs, true_items, k=k)
            true_set = set(true_items)
            metrics["precision"].append(p)
            metrics["recall"].append(r)
            metrics["hitrate"].append(1.0 if any(it in true_set for it in recs) else 0.0)
            metrics["ndcg"].append(ndcg_at_k(recs, true_items, k=k))
            all_recommended.update(recs)
        elif i % 1000 == 0:
            # Users without recommendations are skipped; still report progress now and then.
            print(f"Processed {i}/{len(sampled_users)} users (no recs for uid={uid})")

        # Checkpoint regardless of whether this particular user produced recommendations.
        if checkpoint_every and (i + 1) % checkpoint_every == 0:
            save_pickle(
                {
                    "results": metrics,
                    "recommended": all_recommended,
                    "index": i + 1,
                    "n_users": n_eval,
                    "k": k,
                },
                cache_path,
            )
            print(f"💾 Checkpoint saved at user {i + 1}/{len(sampled_users)}")

    def _mean(values):
        return float(np.mean(values)) if values else 0.0

    results = {
        f"Precision@{k}": _mean(metrics["precision"]),
        f"Recall@{k}": _mean(metrics["recall"]),
        f"HitRate@{k}": _mean(metrics["hitrate"]),
        "Coverage": float(len(all_recommended) / max(1, len(idx_to_itemid_itemuser))),
        f"NDCG@{k}": _mean(metrics["ndcg"]),
    }

    # The run finished, so the checkpoint is no longer needed.
    if os.path.exists(cache_path):
        try:
            os.remove(cache_path)
        except OSError as err:
            print(f"Could not remove checkpoint {cache_path}: {err}")

    return pd.DataFrame([results])


# **Main Pipeline**

In [77]:
def main():
    """Run the whole pipeline: load, split, build CB and CF models, evaluate, print results.

    The item x user matrix is restricted to items that also have a
    content-based embedding *before* the item neighbours are computed. The
    neighbour table stores row numbers, so it must be built on exactly the
    rows that the recommender later indexes.
    """
    print("Loading data...")
    events, item_props, category_tree = load_data()
    events = preprocess_events(events)

    print("Train/Test Split...")
    train_df, test_df = train_test_split_by_last_n(events, TRAIN_TEST_LAST_N)

    print("Building features...")
    item_features = build_item_category_matrix(item_props, category_tree)
    emb, itemid_to_emb_idx_map, idx_to_itemid_emb_arr = build_cb_embeddings(item_features)
    emb_index = build_hnsw_index(
        emb,
        ef_construction=HNSW_EF_CONSTRUCTION,
        M=HNSW_M,
        ef_search=HNSW_EF_SEARCH,
    )

    print("Building CF...")
    item_user_sparse, idx_to_itemid_itemuser, _, user_to_idx_itemuser = build_item_user_matrix(train_df)

    # Synchronise CF and CB: keep only matrix rows whose item has an embedding.
    # Membership is tested against the dict, which is O(1) per item.
    keep_rows = [
        row for row, itemid in enumerate(idx_to_itemid_itemuser)
        if itemid in itemid_to_emb_idx_map
    ]
    print(f"Common items CF ∩ CB: {len(keep_rows):,}")

    item_user_sparse = item_user_sparse[keep_rows, :]
    idx_to_itemid_itemuser = [idx_to_itemid_itemuser[row] for row in keep_rows]

    # Neighbours are computed on the filtered matrix so their row numbers match it.
    item_neighbors_idx, item_neighbors_sims = build_item_neighbors(item_user_sparse, ITEM_NN_TOPK)

    # Map CF matrix row -> CB embedding row.
    itemuser_idx_to_emb_idx = {
        row: itemid_to_emb_idx_map[itemid]
        for row, itemid in enumerate(idx_to_itemid_itemuser)
    }

    print("Evaluating adaptive hybrid recommender...")
    results_df = evaluate_hybrid_item_cf_adaptive(
        train_df, test_df, item_features,
        emb, itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index,
        item_user_sparse, idx_to_itemid_itemuser,
        item_neighbors_idx, item_neighbors_sims,
        user_to_idx_itemuser,
        min_alpha=0.1, max_alpha=0.9, k=K_EVAL,
        max_eval_users=50000,
        checkpoint_every=5000
    )

    print("\n===== Final Adaptive Evaluation Results =====")
    print(results_df.to_string(index=False))

In [78]:
# Run the pipeline when this code is executed directly (as a script or notebook cell)
# rather than imported as a module.
if __name__ == "__main__":
    main()

Loading data...
Train/Test Split...
Building features...
Building CF...
Common items CF ∩ CB: 180,664
Evaluating adaptive hybrid recommender...
Adaptive hybrid eval (memory-safe). Preparing users...
⚠️ Will evaluate on subset 50,000/120,416 users (max_eval_users=50000)
Processed 2000/50000 users (no recs for uid=1062578)
💾 Checkpoint saved at user 5000/50000
💾 Checkpoint saved at user 10000/50000
Processed 10000/50000 users (no recs for uid=1349837)
Processed 11000/50000 users (no recs for uid=1399404)
💾 Checkpoint saved at user 15000/50000
💾 Checkpoint saved at user 20000/50000
💾 Checkpoint saved at user 25000/50000
💾 Checkpoint saved at user 30000/50000
Processed 31000/50000 users (no recs for uid=250712)
💾 Checkpoint saved at user 35000/50000
💾 Checkpoint saved at user 40000/50000
💾 Checkpoint saved at user 45000/50000
Processed 45000/50000 users (no recs for uid=669949)
💾 Checkpoint saved at user 50000/50000

===== Final Adaptive Evaluation Resul