In [1]:
# ---------------------------
# 0) Imports
# ---------------------------
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import top_k_accuracy_score

from xgboost import XGBClassifier

In [2]:
# ---------------------------
# 1) Paths + Load data
# ---------------------------
CLIENTS_PATH = "data/clients.csv"
PRODUCTS_PATH = "data/products.csv"
TRANSACTIONS_PATH = "data/transactions.csv"
STOCKS_PATH = "data/stocks.csv"
STORES_PATH = "data/stores.csv"  # optional

MODEL_DIR = "recommender_artifacts"
os.makedirs(MODEL_DIR, exist_ok=True)

clients = pd.read_csv(CLIENTS_PATH)
products = pd.read_csv(PRODUCTS_PATH)
transactions = pd.read_csv(TRANSACTIONS_PATH)
stocks = pd.read_csv(STOCKS_PATH) if os.path.exists(STOCKS_PATH) else None
stores = pd.read_csv(STORES_PATH) if os.path.exists(STORES_PATH) else None

# Standardize dtypes
clients["ClientID"] = clients["ClientID"].astype(str)
products["ProductID"] = products["ProductID"].astype(str)
transactions["ClientID"] = transactions["ClientID"].astype(str)
transactions["ProductID"] = transactions["ProductID"].astype(str)

# Transaction fields
transactions["SaleTransactionDate"] = pd.to_datetime(transactions["SaleTransactionDate"], errors="coerce")
transactions["Quantity"] = pd.to_numeric(transactions.get("Quantity", 1), errors="coerce").fillna(1.0).clip(lower=0)
transactions["SalesNetAmountEuro"] = pd.to_numeric(transactions.get("SalesNetAmountEuro", 0), errors="coerce").fillna(0.0)

# Stocks
if stocks is not None:
    stocks["ProductID"] = stocks["ProductID"].astype(str)
    stocks["Quantity"] = pd.to_numeric(stocks.get("Quantity", 0), errors="coerce").fillna(0.0)

print("Loaded shapes:")
print(" clients:", clients.shape)
print(" products:", products.shape)
print(" transactions:", transactions.shape)
print(" stocks:", None if stocks is None else stocks.shape)
print(" stores:", None if stores is None else stores.shape)


Loaded shapes:
 clients: (424037, 7)
 products: (47458, 5)
 transactions: (1177175, 6)
 stocks: (16024, 3)
 stores: (606, 2)


In [3]:
# ---------------------------
# 2) Utilities
# ---------------------------
FEATURES_HIER = ["Category", "FamilyLevel1", "Universe", "FamilyLevel2"]
for c in FEATURES_HIER:
    if c not in products.columns:
        raise ValueError(f"products.csv missing required column: {c}")

def get_in_stock_ids():
    if stocks is None:
        return set(products["ProductID"].unique())
    return set(stocks.loc[stocks["Quantity"] > 0, "ProductID"].unique())

IN_STOCK_IDS = get_in_stock_ids()

def make_content_key(df):
    return df[["Category", "FamilyLevel1", "FamilyLevel2", "Universe"]].fillna("").agg(" | ".join, axis=1)

def exp_decay(days_ago, half_life_days):
    return np.exp(-days_ago / float(half_life_days))

In [4]:
# ============================================================
# PART A — CONTENT-BASED PRODUCT SIMILARITY MODEL (build/save/load)
# ============================================================

# ---------------------------
# 3) Build weighted content similarity (Category > FamilyLevel1 > Universe > FamilyLevel2)
# ---------------------------
def build_content_similarity(products_df, weights=(0.50, 0.30, 0.20)):
    """
    Builds similarity matrix with explicit weighting:
      S = w_cat*S_cat + w_fam1*S_fam1 + w_other*S_other
    where "other" uses Universe + FamilyLevel2 tokens.
    """
    w_cat, w_fam1, w_other = weights

    df = products_df.copy()
    df["ProductID"] = df["ProductID"].astype(str)

    # Text blocks
    df["cat_text"] = df["Category"].fillna("")
    df["fam1_text"] = df["FamilyLevel1"].fillna("")
    df["other_text"] = (df["Universe"].fillna("") + " " + df["FamilyLevel2"].fillna(""))

    vec_cat = TfidfVectorizer(lowercase=True)
    vec_fam1 = TfidfVectorizer(lowercase=True)
    vec_other = TfidfVectorizer(lowercase=True, ngram_range=(1, 2))

    X_cat = vec_cat.fit_transform(df["cat_text"])
    X_fam1 = vec_fam1.fit_transform(df["fam1_text"])
    X_other = vec_other.fit_transform(df["other_text"])

    S_cat = cosine_similarity(X_cat, X_cat)
    S_fam1 = cosine_similarity(X_fam1, X_fam1)
    S_other = cosine_similarity(X_other, X_other)

    S = w_cat * S_cat + w_fam1 * S_fam1 + w_other * S_other

    # Index mapping
    product_index = pd.Series(df.index, index=df["ProductID"])

    # content_key used later for dedup
    df["content_key"] = make_content_key(df)

    return {
        "products_model": df,
        "similarity_matrix": S,
        "product_index": product_index,
        "weights": {"W_CAT": w_cat, "W_FAM1": w_fam1, "W_OTHER": w_other},
        "vectorizers": {"vec_cat": vec_cat, "vec_fam1": vec_fam1, "vec_other": vec_other},
    }

def save_content_model(model_bundle, model_dir=MODEL_DIR):
    joblib.dump(model_bundle["products_model"], f"{model_dir}/products_model.joblib")
    joblib.dump(model_bundle["similarity_matrix"], f"{model_dir}/similarity_matrix.joblib")
    joblib.dump(model_bundle["product_index"], f"{model_dir}/product_index.joblib")
    joblib.dump(model_bundle["weights"], f"{model_dir}/weights.joblib")
    # Vectorizers optional for inference if you keep similarity_matrix, but save anyway
    joblib.dump(model_bundle["vectorizers"]["vec_cat"], f"{model_dir}/vec_cat.joblib")
    joblib.dump(model_bundle["vectorizers"]["vec_fam1"], f"{model_dir}/vec_fam1.joblib")
    joblib.dump(model_bundle["vectorizers"]["vec_other"], f"{model_dir}/vec_other.joblib")

def load_content_model(model_dir=MODEL_DIR):
    required = ["products_model.joblib", "similarity_matrix.joblib", "product_index.joblib", "weights.joblib"]
    if not all(os.path.exists(f"{model_dir}/{f}") for f in required):
        return None

    products_model = joblib.load(f"{model_dir}/products_model.joblib")
    similarity_matrix = joblib.load(f"{model_dir}/similarity_matrix.joblib")
    product_index = joblib.load(f"{model_dir}/product_index.joblib")
    weights = joblib.load(f"{model_dir}/weights.joblib")
    return {
        "products_model": products_model,
        "similarity_matrix": similarity_matrix,
        "product_index": product_index,
        "weights": weights,
    }

# Build or load content model (built on in-stock catalog snapshot)
content_model = load_content_model(MODEL_DIR)
if content_model is None:
    products_in_stock = products[products["ProductID"].isin(IN_STOCK_IDS)].copy()
    content_model = build_content_similarity(products_in_stock, weights=(0.50, 0.30, 0.20))
    save_content_model(content_model, MODEL_DIR)
    print("✅ Built + saved content model.")
else:
    print("✅ Loaded content model from disk.")

products_model = content_model["products_model"]
S = content_model["similarity_matrix"]
product_index_model = content_model["product_index"]


def content_candidates_from_seed(seed_product_id, per_seed_k=80):
    """
    Return candidates similar to a seed product from the saved similarity model.
    Output includes ProductID + FEATURES_HIER + sim_score + content_key
    """
    pid = str(seed_product_id)
    if pid not in product_index_model:
        return pd.DataFrame(columns=["ProductID", "sim_score"] + FEATURES_HIER + ["content_key"])

    idx = product_index_model[pid]
    # idx is label index of products_model; need row position in similarity matrix
    row_pos = products_model.index.get_loc(idx)

    sims = S[row_pos]
    sim_scores = pd.Series(sims, index=products_model.index).sort_values(ascending=False)

    # exclude itself
    sim_scores = sim_scores.drop(index=idx, errors="ignore")

    top_idx = sim_scores.head(per_seed_k).index
    out = products_model.loc[top_idx, ["ProductID"] + FEATURES_HIER + ["content_key"]].copy()
    out["sim_score"] = sim_scores.loc[top_idx].values
    return out.reset_index(drop=True)


✅ Loaded content model from disk.


In [5]:
# ============================================================
# PART B — PREDICTION MODEL (XGBOOST) FOR NEXT FamilyLevel2
# ============================================================

# ---------------------------
# 4) Build modeling table with a true "next purchase" target
# ---------------------------
tx = transactions.merge(products[["ProductID"] + FEATURES_HIER], on="ProductID", how="left")

client_cols = [c for c in ["ClientSegment", "ClientCountry", "ClientGender", "Age", "ClientOptINEmail", "ClientOptINPhone"] if c in clients.columns]
tx = tx.merge(clients[["ClientID"] + client_cols], on="ClientID", how="left")

if stores is not None and "StoreID" in tx.columns and "StoreID" in stores.columns and "StoreCountry" in stores.columns:
    tx = tx.merge(stores[["StoreID", "StoreCountry"]], on="StoreID", how="left")

tx = tx.sort_values(["ClientID", "SaleTransactionDate"])
tx["TargetNextFamilyLevel2"] = tx.groupby("ClientID")["FamilyLevel2"].shift(-1)

df_model = tx.dropna(subset=["TargetNextFamilyLevel2"]).copy()
df_model = df_model[df_model["SaleTransactionDate"].notna()].copy()

print("Model rows with next target:", df_model.shape[0])

Model rows with next target: 872246


In [6]:
# ---------------------------
# 5) Feature engineering (history up to time t)
# ---------------------------
HALF_LIFE_DAYS = 90.0  # recency decay for client history

df_model["PrevDate"] = df_model.groupby("ClientID")["SaleTransactionDate"].shift(1)
df_model["RecencyDays"] = (df_model["SaleTransactionDate"] - df_model["PrevDate"]).dt.days
df_model["RecencyDays"] = pd.to_numeric(df_model["RecencyDays"], errors="coerce")
df_model["RecencyDays"] = df_model["RecencyDays"].fillna(df_model["RecencyDays"].median())

df_model["CumQty"] = df_model.groupby("ClientID")["Quantity"].cumsum().shift(1).fillna(0.0)
df_model["CumSpend"] = df_model.groupby("ClientID")["SalesNetAmountEuro"].cumsum().shift(1).fillna(0.0)
df_model["CumTxns"] = df_model.groupby("ClientID").cumcount()  # number of previous txns
df_model["AvgBasket"] = (df_model["CumSpend"] / df_model["CumTxns"].replace(0, np.nan)).fillna(0.0)

# Last purchased attributes
for col in ["Category", "FamilyLevel1", "Universe", "FamilyLevel2"]:
    df_model[f"Last_{col}"] = df_model.groupby("ClientID")[col].shift(1)

# Top FamilyLevel1 affinity features (recency-weighted)
top_fam1 = df_model["FamilyLevel1"].value_counts().head(15).index.tolist()

global_max_date = df_model["SaleTransactionDate"].max()
days_ago = (global_max_date - df_model["SaleTransactionDate"]).dt.days.clip(lower=0).fillna(0)
df_model["EventW"] = exp_decay(days_ago, HALF_LIFE_DAYS) * df_model["Quantity"]

for fam in top_fam1:
    m = (df_model["FamilyLevel1"] == fam).astype(float)
    colname = f"CntFam1_{fam}"
    df_model[colname] = (m * df_model["EventW"])
    df_model[colname] = df_model.groupby("ClientID")[colname].cumsum().shift(1).fillna(0.0)

# Clean age
if "Age" in df_model.columns:
    df_model["Age"] = pd.to_numeric(df_model["Age"], errors="coerce")
    df_model["Age"] = df_model["Age"].fillna(df_model["Age"].median() if df_model["Age"].notna().any() else 0.0)
else:
    df_model["Age"] = 0.0

In [7]:
# ---------------------------
# 6) Train XGBoost (FIXED with LabelEncoder)
# ---------------------------
num_features = ["RecencyDays", "CumQty", "CumSpend", "CumTxns", "AvgBasket", "Age"] + [f"CntFam1_{fam}" for fam in top_fam1]
cat_features = [c for c in (
    ["ClientSegment", "ClientCountry", "ClientGender", "ClientOptINEmail", "ClientOptINPhone"] +
    [f"Last_{x}" for x in ["Category", "FamilyLevel1", "Universe", "FamilyLevel2"]]
) if c in df_model.columns]

X = df_model[num_features + cat_features].copy()
y = df_model["TargetNextFamilyLevel2"].astype(str)

# Time-based split (last 20% by date as test)
df_model = df_model.sort_values("SaleTransactionDate")
split_idx = int(len(df_model) * 0.8)
train_idx = df_model.index[:split_idx]
test_idx = df_model.index[split_idx:]

X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_test, y_test = X.loc[test_idx], y.loc[test_idx]

# Robustness: filter test to labels seen in train
train_classes = set(y_train.unique())
mask = y_test.isin(train_classes)
X_test, y_test = X_test.loc[mask], y_test.loc[mask]

# Encode target labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
num_class = len(le.classes_)

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", "passthrough", num_features),
    ]
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    num_class=num_class,
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

clf = Pipeline(steps=[("prep", preprocess), ("model", xgb)])
clf.fit(X_train, y_train_enc)

probs = clf.predict_proba(X_test)
labels = np.arange(num_class)  # 0..num_class-1

top1 = top_k_accuracy_score(y_test_enc, probs, k=1, labels=labels)
top3 = top_k_accuracy_score(y_test_enc, probs, k=3, labels=labels)
top5 = top_k_accuracy_score(y_test_enc, probs, k=5, labels=labels)

print(f"XGB Top-1: {top1:.4f} | Top-3: {top3:.4f} | Top-5: {top5:.4f}")

# We'll decode predicted class indices using le.classes_
xgb_classes = le.classes_


XGB Top-1: 0.1165 | Top-3: 0.2288 | Top-5: 0.3161


In [8]:
# ============================================================
# PART C — CLIENT PROFILE + FINAL TWO-STAGE RECOMMENDER
# ============================================================

# ---------------------------
# 7) Client profile builder (priority: Category > FamilyLevel1 > Universe > FamilyLevel2)
# ---------------------------
W_PROFILE = {"Category": 4.0, "FamilyLevel1": 2.5, "Universe": 1.5, "FamilyLevel2": 1.0}

txp = transactions.merge(products[["ProductID"] + FEATURES_HIER], on="ProductID", how="left")

def build_client_profile(client_id, half_life_days=HALF_LIFE_DAYS):
    cid = str(client_id)
    df = txp[txp["ClientID"].astype(str) == cid].copy()
    if df.empty:
        return {}

    max_date = txp["SaleTransactionDate"].max()
    days_ago = (max_date - df["SaleTransactionDate"]).dt.days.clip(lower=0).fillna(0)
    base_w = df["Quantity"] * exp_decay(days_ago, half_life_days)

    profile = {}
    for feat in FEATURES_HIER:
        s = df.groupby(feat).apply(lambda g: float(base_w.loc[g.index].sum()))
        s = (s * W_PROFILE[feat]).sort_values(ascending=False)
        profile[feat] = s
    return profile

def profile_boost(profile, candidates_df):
    b = pd.Series(0.0, index=candidates_df.index)
    if not profile:
        return b
    for feat in FEATURES_HIER:
        pref = profile.get(feat)
        if pref is None or pref.empty:
            continue
        b += candidates_df[feat].fillna("").map(pref).fillna(0.0)
    return b

In [9]:
# ---------------------------
# 8) Predict top-N families for a client (from latest available feature-row)
# ---------------------------
def get_latest_feature_row_for_client(client_id):
    cid = str(client_id)
    sub = df_model[df_model["ClientID"].astype(str) == cid].copy()
    if sub.empty:
        return None
    sub = sub.sort_values("SaleTransactionDate")
    return sub.iloc[[-1]]

def predict_top_families_for_client(client_id, top_n=5):
    last_row = get_latest_feature_row_for_client(client_id)
    if last_row is None:
        return []
    X_last = last_row[num_features + cat_features]
    p = clf.predict_proba(X_last)[0]
    top_idx = np.argsort(p)[::-1][:top_n]
    fams = le.inverse_transform(top_idx)
    probs_ = p[top_idx]
    return list(zip(fams, probs_))


In [10]:
# ---------------------------
# 9) Final recommender: Candidate gen (XGB families) + Rerank (content + profile)
# ---------------------------
def recommend_for_client(
    client_id,
    top_k=10,
    top_families_n=5,
    n_seed_products=3,
    per_seed_k=80,
    dedup_by_content=True,
):
    cid = str(client_id)

    # Already bought items
    bought = set(transactions.loc[transactions["ClientID"] == cid, "ProductID"].astype(str).unique())

    # Candidate generation: predicted next families
    fam_preds = predict_top_families_for_client(cid, top_n=top_families_n)
    pred_fams = [f for f, _ in fam_preds]

    # Candidate pool from products, filter constraints
    cand = products.copy()
    cand = cand[cand["ProductID"].isin(IN_STOCK_IDS)]
    cand = cand[~cand["ProductID"].isin(bought)]
    cand = cand[cand["FamilyLevel2"].astype(str).isin(list(map(str, pred_fams)))].copy()

    # If empty: fallback to popularity in stock
    if cand.empty:
        pop = transactions.groupby("ProductID")["Quantity"].sum().sort_values(ascending=False)
        cand = products[products["ProductID"].isin(IN_STOCK_IDS)].copy()
        cand = cand[~cand["ProductID"].isin(bought)]
        cand["pop_score"] = cand["ProductID"].map(pop).fillna(0.0)
        cand["content_key"] = make_content_key(cand)
        if dedup_by_content:
            cand = cand.sort_values("pop_score", ascending=False).drop_duplicates("content_key", keep="first")
        return cand.sort_values("pop_score", ascending=False).head(top_k)[
            ["ProductID"] + FEATURES_HIER + ["content_key", "pop_score"]
        ].reset_index(drop=True)

    # Add content_key
    cand["content_key"] = make_content_key(cand)

    # XGB family probability (same prob for all products within that family)
    fam_prob = {str(f): float(p) for f, p in fam_preds}
    cand["xgb_family_prob"] = cand["FamilyLevel2"].astype(str).map(fam_prob).fillna(0.0)

    # Profile boost
    prof = build_client_profile(cid)
    cand["profile_boost"] = profile_boost(prof, cand)

    # Content similarity score (aggregate from recent seed products)
    hist = transactions[transactions["ClientID"] == cid].sort_values("SaleTransactionDate")
    recent = hist["ProductID"].astype(str).tail(50).tolist()

    # pick most recent unique seeds
    seeds = []
    seen = set()
    for pid in reversed(recent):
        if pid in seen:
            continue
        seen.add(pid)
        seeds.append(pid)
        if len(seeds) == n_seed_products:
            break

    sim_acc = {}
    for s in seeds:
        recs = content_candidates_from_seed(s, per_seed_k=per_seed_k)
        for _, r in recs.iterrows():
            pid = str(r["ProductID"])
            sim_acc[pid] = sim_acc.get(pid, 0.0) + float(r["sim_score"])

    cand["content_sim"] = cand["ProductID"].map(sim_acc).fillna(0.0)

    # Final score (tunable)
    cand["final_score"] = (
        2.0 * cand["xgb_family_prob"] +
        1.0 * cand["profile_boost"] +
        1.0 * cand["content_sim"]
    )

    cand = cand.sort_values("final_score", ascending=False)

    # Deduplicate by "content" at recommendation time
    if dedup_by_content:
        cand = cand.drop_duplicates("content_key", keep="first")

    return cand.head(top_k)[
        ["ProductID"] + FEATURES_HIER + ["content_key", "xgb_family_prob", "profile_boost", "content_sim", "final_score"]
    ].reset_index(drop=True)


In [11]:
# ---------------------------
# 10) Demo: run for one client
# ---------------------------
example_client = transactions["ClientID"].iloc[0]
print("Example client:", example_client)

print("\nTop predicted next families:")
print(predict_top_families_for_client(example_client, top_n=5))

print("\nRecommendations:")
print(recommend_for_client(example_client, top_k=10))

Example client: 8119209481417068505

Top predicted next families:
[('Adidas Rugby Shorts', 0.08971345), ('Canterbury Advantage', 0.06792515), ('Spalding NBA Official Game Ball', 0.06647961), ('Adidas Torpedo', 0.0469231), ('Canterbury Vapodri', 0.045692395)]

Recommendations:
             ProductID    Category FamilyLevel1 Universe  \
0  7813157068856617209  Basketball         Ball      Men   
1  5840373649174391597  Basketball         Ball    Women   
2  8706332538369958162       Rugby       Shorts      Men   
3  1110391261318430111       Rugby       Shorts    Women   
4  3715889193417410749       Rugby       Shorts    Women   
5  1684662842964204861       Rugby         Ball      Men   
6   950317252666201704       Rugby       Shorts      Men   
7   314618247339651600       Rugby         Ball    Women   
8  7042499222040039915       Rugby       Jersey      Men   
9  4552681534700227474       Rugby       Jersey    Women   

                      FamilyLevel2  \
0  Spalding NBA Official

  s = df.groupby(feat).apply(lambda g: float(base_w.loc[g.index].sum()))
  s = df.groupby(feat).apply(lambda g: float(base_w.loc[g.index].sum()))
  s = df.groupby(feat).apply(lambda g: float(base_w.loc[g.index].sum()))
  s = df.groupby(feat).apply(lambda g: float(base_w.loc[g.index].sum()))


In [12]:
import os, json, joblib

BUNDLE_DIR = "streamlit_model_bundle"
os.makedirs(BUNDLE_DIR, exist_ok=True)

def save_streamlit_bundle(bundle_dir=BUNDLE_DIR):
    # --- 1) Prediction model ---
    joblib.dump(clf, f"{bundle_dir}/xgb_pipeline.joblib")
    joblib.dump(le, f"{bundle_dir}/label_encoder.joblib")

    # --- 2) Content model ---
    joblib.dump(products_model, f"{bundle_dir}/products_model.joblib")
    joblib.dump(S, f"{bundle_dir}/similarity_matrix.joblib")
    joblib.dump(product_index_model, f"{bundle_dir}/product_index.joblib")

    # --- 3) Reference data for app ---
    # Keep these smaller if you can; Streamlit can load big files but slower.
    joblib.dump(products, f"{bundle_dir}/products.joblib")
    joblib.dump(transactions, f"{bundle_dir}/transactions.joblib")
    if stocks is not None:
        joblib.dump(stocks, f"{bundle_dir}/stocks.joblib")
    else:
        joblib.dump(list(IN_STOCK_IDS), f"{bundle_dir}/in_stock_ids.joblib")

    # --- 4) Feature lists & params needed at inference ---
    artifacts = {
        "num_features": num_features,
        "cat_features": cat_features,
        "top_fam1": top_fam1,
        "features_hier": FEATURES_HIER,
        "half_life_days": HALF_LIFE_DAYS,
        "profile_weights": W_PROFILE,
        "content_weights": content_model.get("weights", {}),
        # final scoring weights (match your recommender)
        "final_score_weights": {"xgb_family_prob": 2.0, "profile_boost": 1.0, "content_sim": 1.0},
        "num_class": int(len(le.classes_)),
    }

    with open(f"{bundle_dir}/artifacts.json", "w", encoding="utf-8") as f:
        json.dump(artifacts, f, indent=2)

    print(f"✅ Saved Streamlit bundle to: {bundle_dir}")

save_streamlit_bundle()


✅ Saved Streamlit bundle to: streamlit_model_bundle
