In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Tuple, Dict
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import hnswlib
import time

In [2]:
# Input CSV files, given as relative paths (resolved against the current working directory).
EVENTS_FILE = "events.csv"
ITEM_PROP1_FILE = "item_properties_part1.csv"
ITEM_PROP2_FILE = "item_properties_part2.csv"
CATEGORY_TREE_FILE = "category_tree.csv"
# Directory for cached artifacts; created when this cell runs if it does not already exist.
CACHE_DIR = "cache_reco_2"
os.makedirs(CACHE_DIR, exist_ok=True)

In [3]:
# Pipeline hyperparameters.
SVD_DIM = 64  # default `svd_dim` (TruncatedSVD components) of build_cb_embeddings
ITEM_NN_TOPK = 200  # neighbours per item requested from build_item_neighbors in main()
CF_NEIGHBORS_TOPK = 50  # NOTE(review): not referenced elsewhere in the visible code
HNSW_M = 64  # same value as build_hnsw_index's default `M`
HNSW_EF_CONSTRUCTION = 200  # same value as build_hnsw_index's default `ef_construction`
HNSW_EF_SEARCH = 100  # same value as build_hnsw_index's default `ef_search`
MIN_INTERACTIONS_ACTIVE_USER = 1  # NOTE(review): not referenced elsewhere in the visible code
TRAIN_TEST_LAST_N = 3  # events per visitor held out by train_test_split_by_last_n in main()
K_EVAL = 5  # recommendation list length (`k`) used for evaluation in main()

In [4]:
def save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, replacing any existing file atomically.

    The payload is written to a sibling ``<path>.tmp`` file first and only
    moved over ``path`` once the dump has succeeded. A failed or interrupted
    dump therefore never leaves a truncated file at ``path`` (which a later
    ``os.path.exists`` cache check would happily try to load).

    Args:
        obj: Any picklable object.
        path: Destination file path (``str`` or ``os.PathLike``).

    Raises:
        Whatever ``open`` / ``pickle.dump`` raise; the temp file is removed
        and any previous content of ``path`` is left untouched in that case.
    """
    target = os.fspath(path)
    tmp_path = target + ".tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(obj, f)
        # os.replace overwrites the destination in a single step.
        os.replace(tmp_path, target)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [5]:
def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: unpickling can execute arbitrary code, so only use this on
    files this pipeline wrote itself, never on untrusted input.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload

# **Data Loading & Preprocessing**

In [6]:
def load_data():
    """Read the raw CSV inputs.

    Returns:
        ``(events, item_props, category_tree)`` where ``item_props`` is the
        two item-property parts stacked row-wise with a fresh 0..n-1 index.
    """
    events = pd.read_csv(EVENTS_FILE)
    property_parts = [pd.read_csv(part) for part in (ITEM_PROP1_FILE, ITEM_PROP2_FILE)]
    category_tree = pd.read_csv(CATEGORY_TREE_FILE)
    item_props = pd.concat(property_parts, axis=0, ignore_index=True)
    return events, item_props, category_tree

In [7]:
def preprocess_events(events: pd.DataFrame) -> pd.DataFrame:
    """Filter and normalise the raw event log.

    Steps:
      1. keep only ``view`` / ``addtocart`` / ``transaction`` events;
      2. convert ``timestamp`` from epoch milliseconds to datetimes unless the
         column already holds datetimes (naive or tz-aware);
      3. add a ``weight`` column (view=1, addtocart=3, transaction=5);
      4. drop rows with a missing visitor id, item id or timestamp;
      5. cast ``visitorid`` and ``itemid`` to ``int``.

    Rows whose timestamp could not be parsed are coerced to ``NaT`` and
    dropped: NaT sorts last, so keeping them would make such rows look like a
    visitor's most recent events in a chronological train/test split.

    Args:
        events: Frame with ``event``, ``timestamp``, ``visitorid`` and
            ``itemid`` columns. It is not modified.

    Returns:
        A new DataFrame with the cleaned rows and the extra ``weight`` column.
    """
    kept_event_types = ['view', 'addtocart', 'transaction']
    events = events[events['event'].isin(kept_event_types)].copy()
    if not pd.api.types.is_datetime64_any_dtype(events['timestamp']):
        events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms', errors='coerce')
    weight_map = {'view': 1.0, 'addtocart': 3.0, 'transaction': 5.0}
    events['weight'] = events['event'].map(weight_map)
    # .copy() so the column assignments below write to an independent frame.
    events = events.dropna(subset=['visitorid', 'itemid', 'timestamp']).copy()
    events['visitorid'] = events['visitorid'].astype(int)
    events['itemid'] = events['itemid'].astype(int)
    return events

In [8]:
def train_test_split_by_last_n(events: pd.DataFrame, n_last=3) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Hold out each visitor's last ``n_last`` events (by timestamp) as test data.

    Visitors with at most ``n_last`` events contribute all of their events to
    the training set and nothing to the test set.

    The split is computed with vectorised group operations instead of a
    Python loop over visitors, and it returns an empty test frame (rather than
    raising from ``pd.concat([])``) when no visitor has more than ``n_last``
    events. With ``n_last <= 0`` nothing is held out.

    Args:
        events: Frame with at least ``visitorid`` and ``timestamp`` columns.
        n_last: Number of most recent events per visitor to hold out.

    Returns:
        ``(train_df, test_df)``; both are ordered by visitor id and then
        timestamp and keep the original row labels and columns.
    """
    ordered = events.sort_values(['visitorid', 'timestamp'])
    per_visitor = ordered.groupby('visitorid', sort=False)
    # Number of events of the visitor each row belongs to.
    n_events = per_visitor['visitorid'].transform('size')
    # 0 for a visitor's most recent event, 1 for the one before, and so on.
    rank_from_end = per_visitor.cumcount(ascending=False)
    # Positional mask so a non-unique row index cannot break the selection.
    held_out = ((n_events > n_last) & (rank_from_end < n_last)).to_numpy()
    train_df = ordered[~held_out]
    test_df = ordered[held_out]
    return train_df, test_df

# **Content-Based Embedding**

In [9]:
def build_item_category_matrix(item_props: pd.DataFrame, category_tree: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-item multi-hot category indicator matrix.

    Only rows of ``item_props`` whose ``property`` equals ``'categoryid'`` are
    used. Repeated ``(itemid, categoryid)`` rows are collapsed, and an item
    that appears with several categories gets a single row with every one of
    those categories set. This guarantees a unique item index, so downstream
    item -> row lookups are unambiguous and each item is indexed once.

    Args:
        item_props: Frame with ``itemid``, ``property`` and ``value`` columns.
        category_tree: Currently unused; kept for interface compatibility.

    Returns:
        DataFrame indexed by ``itemid`` (sorted, unique) with one indicator
        column per category id.
    """
    is_category = item_props['property'] == 'categoryid'
    cats = (
        item_props.loc[is_category, ['itemid', 'value']]
        .rename(columns={'value': 'categoryid'})
        .astype({'categoryid': int})
        .drop_duplicates()
    )
    pivot = pd.get_dummies(cats.set_index('itemid')['categoryid'])
    # Merge the remaining duplicate index rows (items with several categories).
    pivot = pivot.groupby(level=0).max()
    return pivot

In [10]:
def build_cb_embeddings(item_features: pd.DataFrame, svd_dim: int = SVD_DIM):
    """Reduce the item feature matrix to normalised content embeddings.

    Args:
        item_features: Item feature frame; its index supplies the item ids.
        svd_dim: Number of TruncatedSVD components.

    Returns:
        ``(emb, itemid_to_idx, idx_to_itemid)``: the SVD output passed through
        sklearn's ``normalize`` (default settings), a dict mapping item id to
        embedding row, and an array mapping embedding row back to item id.
    """
    row_ids = item_features.index.to_list()
    reducer = TruncatedSVD(n_components=svd_dim, random_state=42)
    reduced = reducer.fit_transform(item_features.values)
    unit_emb = normalize(reduced)
    idx_to_itemid = np.array(row_ids)
    # For a repeated id the later row wins, as with a dict comprehension.
    itemid_to_idx = dict(zip(row_ids, range(len(row_ids))))
    return unit_emb, itemid_to_idx, idx_to_itemid

In [11]:
def build_hnsw_index(emb: np.ndarray, ef_construction=200, M=64, ef_search=100):
    """Create an hnswlib cosine-space index over the rows of ``emb``.

    Row ``i`` of ``emb`` is inserted under label ``i``; the query-time ``ef``
    is set to ``ef_search`` before the index is returned.
    """
    num_vectors, num_dims = emb.shape
    ann = hnswlib.Index(space='cosine', dim=num_dims)
    ann.init_index(max_elements=num_vectors, ef_construction=ef_construction, M=M)
    row_labels = np.arange(num_vectors)
    ann.add_items(emb, row_labels)
    ann.set_ef(ef_search)
    return ann

# **Collaborative Filtering**

In [12]:
def build_item_user_matrix(train_df: pd.DataFrame):
    """Build the sparse item x user interaction-weight matrix.

    Items and users are numbered in order of first appearance in
    ``train_df``. Repeated ``(item, user)`` pairs are summed by the sparse
    constructor. The codes are computed with ``pd.factorize`` instead of a
    Python-level ``iterrows`` loop, which yields the same matrix and mappings
    far faster on large event logs.

    Args:
        train_df: Frame with ``itemid``, ``visitorid`` and ``weight`` columns.

    Returns:
        ``(item_user_sparse, idx_to_itemid, item_to_idx, user_to_idx)`` where
        ``item_user_sparse`` is a CSR matrix of shape (n_items, n_users),
        ``idx_to_itemid`` is a list mapping row -> item id, and the two dicts
        map item id -> row and visitor id -> column.
    """
    item_codes, items = pd.factorize(train_df['itemid'].to_numpy())
    user_codes, users = pd.factorize(train_df['visitorid'].to_numpy())
    weights = train_df['weight'].to_numpy(dtype=float)
    item_user_sparse = csr_matrix(
        (weights, (item_codes, user_codes)),
        shape=(len(items), len(users)),
    )
    idx_to_itemid = list(items)
    item_to_idx = {it: j for j, it in enumerate(items)}
    user_to_idx = {u: i for i, u in enumerate(users)}
    return item_user_sparse, idx_to_itemid, item_to_idx, user_to_idx

In [13]:
def build_item_neighbors(item_user_sparse, topk=200, cache_path=None):
    """Find each item's nearest items by cosine similarity of interaction rows.

    Args:
        item_user_sparse: Item x user matrix (one row per item).
        topk: Neighbours to return per item. Clamped to the number of items,
            because ``kneighbors`` rejects ``n_neighbors`` larger than the
            number of fitted samples.
        cache_path: Optional pickle path. If the file exists it is loaded and
            returned as-is; otherwise the fresh result is written there.
            NOTE(review): the cache is not keyed by ``topk`` or by the input
            matrix, so a stale file is returned unchanged.

    Returns:
        ``(idxs, sims)``: arrays of shape (n_items, k) holding neighbour row
        indices and cosine similarities (``1 - cosine distance``). Each row is
        queried against the matrix it was fitted on, so an item can appear
        among its own neighbours.
    """
    if cache_path and os.path.exists(cache_path):
        return load_pickle(cache_path)
    n_neighbors = min(topk, item_user_sparse.shape[0])
    model = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)
    model.fit(item_user_sparse)
    # kneighbors returns (distances, indices); convert distance -> similarity.
    distances, idxs = model.kneighbors(item_user_sparse, n_neighbors=n_neighbors)
    sims = 1 - distances
    if cache_path:
        save_pickle((idxs, sims), cache_path)
    return idxs, sims

# **Adaptive Alpha**

In [14]:
def compute_adaptive_alpha(user_id, user_item_matrix, min_alpha=0.1, max_alpha=0.9, slope='linear'):
    """Scale a blending weight with how many items a user has interacted with.

    Args:
        user_id: Row label to look up in ``user_item_matrix``.
        user_item_matrix: Users x items frame; entries > 0 count as interactions.
        min_alpha: Value returned for unknown users and users with <= 1 item.
        max_alpha: Upper bound of the returned weight.
        slope: ``'sqrt'`` ramps with the square root of the user's share of
            the busiest user's item count; any other value ramps linearly.

    Returns:
        ``min_alpha`` unchanged for unknown / single-item users, otherwise a
        ``float`` clipped to ``[min_alpha, max_alpha]``.
    """
    known_user = user_id in user_item_matrix.index
    if not known_user:
        return min_alpha

    user_item_count = (user_item_matrix.loc[user_id] > 0).sum()
    if user_item_count <= 1:
        return min_alpha

    busiest_count = (user_item_matrix > 0).sum(axis=1).max()
    share = user_item_count / (busiest_count + 1e-9)
    # Only 'sqrt' changes the ramp; every other slope value is linear.
    ramp = np.sqrt(share) if slope == 'sqrt' else share
    alpha = min_alpha + (max_alpha - min_alpha) * ramp
    return float(np.clip(alpha, min_alpha, max_alpha))

# **Hybrid Recommender**

In [15]:
def hybrid_weighted_recommend_adaptive(
    user_id, train_df,
    item_user_sparse, idx_to_itemid_itemuser,
    item_neighbors_idx, item_neighbors_sims,
    itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index, emb,
    top_n=5, min_alpha=0.1, max_alpha=0.9, cb_topk=200
):
    """Recommend items by blending item-item CF scores with content-based scores.

    ``alpha`` (the CF weight) grows linearly from ``min_alpha`` to
    ``max_alpha`` with the user's number of distinct training items relative
    to the busiest user. This is the same linear ramp that
    ``compute_adaptive_alpha`` applies, but it is derived directly from
    ``train_df`` so no dense users x items pivot is built on every call.

    Differences from a naive pivot-based implementation:
      * the content-based seed is the user's most recently interacted item
        that has an embedding (ordered by ``timestamp`` when that column is
        present), not the item with the largest id;
      * the ANN query size is clamped to the number of embedded items;
      * the seed is removed from the ANN result by label instead of assuming
        it is always the first hit, and duplicate item labels are merged;
      * lookups use dict/set structures instead of repeated list scans.

    Note: each call still scans ``train_df`` (O(len(train_df))).

    Args:
        user_id: Visitor id to recommend for.
        train_df: Training events with ``visitorid`` and ``itemid`` columns
            (``timestamp`` optional).
        item_user_sparse: Unused; kept for interface compatibility.
        idx_to_itemid_itemuser: Sequence mapping CF row index -> item id.
        item_neighbors_idx: Per CF row, the row indices of its neighbours.
        item_neighbors_sims: Per CF row, the similarities of those neighbours.
        itemuser_idx_to_emb_idx: Mapping item id -> row of ``emb``.
        idx_to_itemid_emb_arr: Sequence mapping ``emb`` row -> item id.
        emb_index: ANN index exposing ``knn_query(vector, k=...)`` that returns
            ``(labels, distances)`` with one row per query.
        emb: Embedding matrix that ``emb_index`` was built from.
        top_n: Maximum number of recommendations.
        min_alpha: CF weight for users with at most one distinct item.
        max_alpha: Upper bound of the CF weight.
        cb_topk: Number of content-based neighbours to request.

    Returns:
        Up to ``top_n`` item ids the user has not interacted with, best
        first; ``[]`` if the user has no training events.
    """
    user_rows = train_df[train_df['visitorid'] == user_id]
    if user_rows.empty:
        return []
    if 'timestamp' in user_rows.columns:
        user_rows = user_rows.sort_values('timestamp', kind='mergesort')
    # Distinct items ordered by the user's latest interaction with each.
    history = user_rows['itemid'].drop_duplicates(keep='last').tolist()
    seen = set(history)

    # Adaptive CF weight (linear ramp on the distinct-item count).
    n_user_items = len(history)
    if n_user_items <= 1:
        alpha = min_alpha
    else:
        max_items = train_df.groupby('visitorid')['itemid'].nunique().max()
        frac = n_user_items / (max_items + 1e-9)
        alpha = float(np.clip(min_alpha + (max_alpha - min_alpha) * frac, min_alpha, max_alpha))

    # CF: accumulate neighbour similarities over the user's items.
    cf_row_of = {}
    for pos, iid in enumerate(idx_to_itemid_itemuser):
        cf_row_of.setdefault(iid, pos)  # first occurrence, like list.index
    cf_acc = {}
    for itemid in history:
        row = cf_row_of.get(itemid)
        if row is None:
            continue
        for nb_row, nb_sim in zip(item_neighbors_idx[row], item_neighbors_sims[row]):
            nb_item = idx_to_itemid_itemuser[nb_row]
            if nb_item in seen:
                continue
            cf_acc[nb_item] = cf_acc.get(nb_item, 0.0) + float(nb_sim)
    cf_scores = pd.Series(cf_acc, dtype=float) if cf_acc else pd.Series(dtype=float)
    if not cf_scores.empty:
        cf_scores = cf_scores / (cf_scores.max() + 1e-9)

    # CB: nearest embeddings to the most recent item that has an embedding.
    cb_scores = pd.Series(dtype=float)
    seed_item = next((it for it in reversed(history) if it in itemuser_idx_to_emb_idx), None)
    n_indexed = len(emb)
    if seed_item is not None and n_indexed > 0:
        seed_row = itemuser_idx_to_emb_idx[seed_item]
        k = max(1, min(int(cb_topk), n_indexed))
        labels, distances = emb_index.knn_query(emb[seed_row], k=k)
        labels = np.asarray(labels)[0]
        distances = np.asarray(distances)[0]
        not_seed = labels != seed_row
        if not_seed.any():
            cb_scores = pd.Series(
                1 - distances[not_seed],
                index=[idx_to_itemid_emb_arr[i] for i in labels[not_seed]],
                dtype=float,
            )
            # Several embedding rows may map to one item id; keep the best.
            cb_scores = cb_scores.groupby(level=0).max()
    if not cb_scores.empty:
        cb_scores = cb_scores / (cb_scores.max() + 1e-9)

    all_items = cf_scores.index.union(cb_scores.index)
    cf_vals = cf_scores.reindex(all_items).fillna(0)
    cb_vals = cb_scores.reindex(all_items).fillna(0)
    hybrid = alpha * cf_vals + (1 - alpha) * cb_vals
    hybrid = hybrid[~hybrid.index.isin(history)]
    recs = hybrid.sort_values(ascending=False).head(top_n).index.tolist()
    return recs

# **Evaluation Adaptive**

In [16]:
def evaluate_hybrid_item_cf_adaptive(
    train_df, test_df, item_features,
    emb, itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index,
    item_user_sparse, idx_to_itemid_itemuser,
    item_neighbors_idx, item_neighbors_sims,
    min_alpha=0.1, max_alpha=0.9, k=5
):
    """Average Precision@k, Recall@k and HitRate@k over all visitors in ``test_df``.

    For every visitor in ``test_df`` the adaptive hybrid recommender is asked
    for ``k`` items, which are compared with the visitor's held-out items.
    Hits are counted on distinct items, so recall is divided by the number of
    *distinct* held-out items; dividing by the raw event count would cap
    recall below 1 whenever a held-out item repeats.

    Args:
        train_df: Training events handed to the recommender.
        test_df: Held-out events with ``visitorid`` and ``itemid`` columns.
        item_features: Unused; kept for interface compatibility.
        emb, itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index,
        item_user_sparse, idx_to_itemid_itemuser, item_neighbors_idx,
        item_neighbors_sims: Forwarded unchanged to
            ``hybrid_weighted_recommend_adaptive``.
        min_alpha, max_alpha: Bounds of the adaptive CF weight.
        k: Recommendation list length and precision denominator.

    Returns:
        One-row DataFrame with the mean ``Precision``, ``Recall`` and
        ``HitRate`` columns.
    """
    true_items_per_user = test_df.groupby("visitorid")["itemid"].apply(list).to_dict()
    metrics = []
    for user_id, true_items in tqdm(true_items_per_user.items(), desc="Adaptive Eval"):
        recs = hybrid_weighted_recommend_adaptive(
            user_id, train_df,
            item_user_sparse, idx_to_itemid_itemuser,
            item_neighbors_idx, item_neighbors_sims,
            itemuser_idx_to_emb_idx, idx_to_itemid_emb_arr, emb_index, emb,
            top_n=k, min_alpha=min_alpha, max_alpha=max_alpha
        )
        relevant = set(true_items)
        hits = len(set(recs) & relevant)
        precision = hits / k
        recall = hits / len(relevant)
        hitrate = 1 if hits > 0 else 0
        metrics.append((precision, recall, hitrate))
    df = pd.DataFrame(metrics, columns=["Precision", "Recall", "HitRate"])
    return df.mean().to_frame().T

# **Main Pipeline**

In [17]:
def main():
    """Run the pipeline end to end: load, split, build models, evaluate, print.

    The configuration constants are passed explicitly to the builders.
    Default argument values are bound when a function is defined, so relying
    on them would silently ignore later edits to the constants.
    """
    print("Loading data...")
    events, item_props, category_tree = load_data()
    events = preprocess_events(events)

    print("Train/Test Split...")
    train_df, test_df = train_test_split_by_last_n(events, TRAIN_TEST_LAST_N)

    print("Building features...")
    item_features = build_item_category_matrix(item_props, category_tree)
    emb, itemid_to_emb_idx_map, idx_to_itemid_emb_arr = build_cb_embeddings(
        item_features, svd_dim=SVD_DIM
    )
    emb_index = build_hnsw_index(
        emb,
        ef_construction=HNSW_EF_CONSTRUCTION,
        M=HNSW_M,
        ef_search=HNSW_EF_SEARCH,
    )

    print("Building CF...")
    item_user_sparse, idx_to_itemid_itemuser, _, _ = build_item_user_matrix(train_df)
    item_neighbors_idx, item_neighbors_sims = build_item_neighbors(item_user_sparse, ITEM_NN_TOPK)

    print("Evaluating adaptive hybrid recommender...")
    results = evaluate_hybrid_item_cf_adaptive(
        train_df, test_df, item_features,
        emb, itemid_to_emb_idx_map, idx_to_itemid_emb_arr, emb_index,
        item_user_sparse, idx_to_itemid_itemuser,
        item_neighbors_idx, item_neighbors_sims,
        min_alpha=0.1, max_alpha=0.9, k=K_EVAL
    )
    print("\n===== Final Adaptive Evaluation Results =====")
    print(results.to_string(index=False))

In [None]:
# Run the full pipeline only when this code is executed as the main module, not when imported.
if __name__ == "__main__":
    main()

Loading data...
Train/Test Split...
