In [2]:
# =========================
# ecommerce_propensity_notebook.ipynb (extract)
# =========================

# 0. Imports
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

# 1. Load dataset (example: Online Retail II or Retail Rocket)
# The notebook expects a single events table with these columns:
#   session_id, user_id, event_type (view, add_to_cart, purchase), product_id,
#   category, price, event_time, product_title, brand
# Point EVENTS_CSV at your own file, or join multiple tables into this shape first.
EVENTS_CSV = r"C:\Users\Abhinesh\OneDrive\Desktop\sem 5 - 6\Field Project\retailrocket_customer_model.csv"  # <-- replace path

# event_time is parsed while reading: the session-duration and recency features
# further down subtract timestamps and use the .dt accessor, which only works on
# datetime64 columns (read_csv would otherwise leave the column as plain text).
# NOTE(review): assumes event_time holds parseable date strings; if the file stores
# epoch numbers, convert with pd.to_datetime(..., unit=...) instead — confirm.
events = pd.read_csv(EVENTS_CSV, parse_dates=["event_time"])
events.head()

# 2. Basic session-level aggregation & label creation
# One row per session: event counts by type, product/price summaries, and the
# binary target "did this session contain a purchase?".

def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or None if there is none."""
    modes = values.mode()
    return modes.iat[0] if not modes.empty else None

# Boolean event-type flags turn the per-type counts into plain sums, so the whole
# table comes from one named aggregation instead of a Python-level apply per session.
events_flagged = events.assign(
    event_time=pd.to_datetime(events["event_time"]),  # unchanged if already datetime64
    _is_view=events["event_type"].eq("view"),
    _is_addtocart=events["event_type"].eq("add_to_cart"),
    _is_purchase=events["event_type"].eq("purchase"),
)

session_agg = (
    events_flagged
    .groupby("session_id")
    .agg(
        # "first" is the first non-null user_id seen in the session
        user_id=("user_id", "first"),
        session_start=("event_time", "min"),
        session_end=("event_time", "max"),
        n_events=("event_type", "size"),
        n_views=("_is_view", "sum"),
        n_addtocart=("_is_addtocart", "sum"),
        n_purchases=("_is_purchase", "sum"),
        unique_products=("product_id", "nunique"),
        avg_price=("price", "mean"),
        top_category=("category", _most_frequent),
    )
    .reset_index()
)

# Label: purchase happened in this session (binary)
session_agg["label_purchase"] = (session_agg["n_purchases"] > 0).astype(int)
session_agg["session_duration_sec"] = (
    (session_agg["session_end"] - session_agg["session_start"]).dt.total_seconds().fillna(0)
)

# Quick class balance check
print(session_agg["label_purchase"].value_counts(normalize=True))

# 3. Feature engineering — enrichment with user history (RFM-like + lag)
# Every history feature is built from the user's EARLIER sessions only. Aggregating
# over all of a user's events would count the current session's own purchases in
# lifetime_purchases and leak the label into the features.
# Column names (lifetime_*, first_event, last_event, recency_days) are kept so the
# modelling and export steps below keep working.
events = events.sort_values("event_time")

# Per-session price totals, needed to average prices over earlier sessions only.
session_price = (
    events.groupby("session_id")["price"]
    .agg(price_sum="sum", price_count="count")
    .reset_index()
)

history = session_agg.merge(session_price, on="session_id", how="left")
# Coerce dtypes so the cumulative sums and date arithmetic below are well defined
# even if session_agg carries object-typed columns.
history["session_start"] = pd.to_datetime(history["session_start"])
history["session_end"] = pd.to_datetime(history["session_end"])
for count_col in ("n_events", "n_purchases"):
    history[count_col] = pd.to_numeric(history[count_col])

# Chronological order within each user: a running total then means "up to and
# including this session". "Earlier" is decided by session_start, so overlapping
# sessions of one user are treated as strictly sequential (approximation).
history = history.sort_values(["user_id", "session_start"])
by_user = history.groupby("user_id", sort=False)


def _prior_total(col):
    """Sum of `col` over the same user's earlier sessions (current session excluded)."""
    return by_user[col].cumsum() - history[col]


prior_price_sum = _prior_total("price_sum")
prior_price_count = _prior_total("price_count")

session = history.assign(
    lifetime_events=_prior_total("n_events"),
    lifetime_purchases=_prior_total("n_purchases"),
    # undefined (NaN) until the user has at least one earlier priced event
    lifetime_avg_price=prior_price_sum / prior_price_count.where(prior_price_count > 0),
    first_event=by_user["session_start"].transform("min"),
    # end of the user's previous session; NaT for a user's first session
    last_event=by_user["session_end"].shift(1),
)

# create recency in days (from the previous session's end to this session's start);
# 9999 marks "no earlier session", negative gaps from overlapping sessions clip to 0
session["recency_days"] = (
    (session["session_start"] - session["last_event"]).dt.days.clip(lower=0).fillna(9999)
)

# "No history" / "no priced events" is encoded as 0 for the numeric model inputs only;
# timestamps and top_category keep their missing values instead of being set to 0.
zero_fill_cols = ["avg_price", "lifetime_events", "lifetime_purchases", "lifetime_avg_price"]
session[zero_fill_cols] = session[zero_fill_cols].fillna(0)

# Drop the helper columns and restore session_agg's row order.
session = session.drop(columns=["price_sum", "price_count"]).sort_index()

# 4. Prepare modeling dataset
feature_cols = [
    # behaviour inside the session itself
    "n_events", "n_views", "n_addtocart", "unique_products", "avg_price", "session_duration_sec",
    # user-level columns merged onto the session table
    "lifetime_events", "lifetime_purchases", "lifetime_avg_price", "recency_days",
]
X = session.loc[:, feature_cols]
y = session.loc[:, "label_purchase"]

# Train-test split (stratified): hold out 20% of the sessions while keeping the
# purchase rate the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 5. Baseline model: Logistic Regression
# Numeric columns are median-imputed, then standardised.
numeric_transformer = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, feature_cols)]
)

# class_weight="balanced" re-weights the classes inversely to their frequency.
log_clf = LogisticRegression(class_weight="balanced", max_iter=1000)
pipe_log = Pipeline([("pre", preprocessor), ("clf", log_clf)])
pipe_log.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive (purchase) class.
proba_log = pipe_log.predict_proba(X_test)[:, 1]
print("Logistic ROC-AUC:", roc_auc_score(y_test, proba_log))
pred_log = pipe_log.predict(X_test)
print(classification_report(y_test, pred_log))

# 6. Stronger model: XGBoost
# scale_pos_weight is the negative/positive ratio of the training labels.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
xgb = XGBClassifier(
    n_estimators=200,
    use_label_encoder=False,
    eval_metric="logloss",
    scale_pos_weight=n_negative / n_positive,
    random_state=42,
)
pipe_xgb = Pipeline([("pre", preprocessor), ("xgb", xgb)])
pipe_xgb.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive (purchase) class.
proba_xgb = pipe_xgb.predict_proba(X_test)[:, 1]
print("XGBoost ROC-AUC:", roc_auc_score(y_test, proba_xgb))
pred_xgb = pipe_xgb.predict(X_test)
print(classification_report(y_test, pred_xgb))

# 7. Precision@K (for business budget)
def precision_at_k(y_true, y_scores, k):
    """Fraction of positives among the k highest-scored rows.

    Args:
        y_true: binary labels (Series, list or array), positionally aligned
            with ``y_scores``.
        y_scores: model scores; higher means more likely positive.
        k: a fraction in (0, 1) of the rows, or an absolute number of rows (>= 1).

    Returns:
        Mean of ``y_true`` over the selected rows as a float; NaN when there
        are no rows at all.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be positive, got {k!r}")

    # Plain arrays: selection is purely positional, whatever index y_true carries.
    labels = np.asarray(y_true)
    scores = np.asarray(y_scores)
    n_rows = len(scores)
    if n_rows == 0:
        return float("nan")

    topn = int(n_rows * k) if k < 1 else int(k)
    # Keep at least one row: with topn == 0 the slice [-0:] would select EVERY row
    # and silently report the overall base rate. Never ask for more rows than exist.
    topn = min(max(topn, 1), n_rows)

    top_idx = np.argsort(scores)[-topn:]
    return float(labels[top_idx].mean())

print("Precision@1% (XGB):", precision_at_k(y_test.reset_index(drop=True), proba_xgb, 0.01))

# 8. Save model artifact for dashboard export
# joblib.dump does not create missing parent folders, so make sure models/ exists
# before writing (a fresh checkout would otherwise fail here).
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(pipe_xgb, "models/propensity_xgb.joblib")

# 9. Explainability (SHAP)
# The explainer is built on the fitted XGBoost step alone, so it is given the
# preprocessed test matrix rather than the raw X_test.
preproc = pipe_xgb.named_steps["pre"]
X_test_trans = preproc.transform(X_test)
xgb_model = pipe_xgb.named_steps["xgb"]
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test_trans)
# The preprocessor has a single numeric branch over feature_cols, so the transformed
# columns are labelled with feature_cols in the same order.
feature_names = feature_cols
shap.summary_plot(shap_values, X_test_trans, feature_names=feature_names, show=False)
# show=False keeps the figure open so it can be saved; create reports/ first because
# savefig does not create missing folders.
Path("reports").mkdir(parents=True, exist_ok=True)
plt.savefig("reports/shap_summary.png", bbox_inches="tight")

# 10. Build product embeddings for content-based recommendations
# Use product title and brand; fall back to category
# One row per product_id: de-duplicating on the id (first occurrence in `events`
# wins) keeps the index unique even when a product appears with varying
# title/brand/category values. Rows without a product_id are dropped.
products = (
    events[["product_id", "product_title", "brand", "category"]]
    .dropna(subset=["product_id"])
    .drop_duplicates(subset="product_id")
    .set_index("product_id")
)
# astype(str) keeps the concatenation working when a column is not text
# (e.g. numeric category ids); text columns pass through unchanged.
text_parts = [
    products[col].fillna("").astype(str)
    for col in ("product_title", "brand", "category")
]
products["text"] = (text_parts[0] + " " + text_parts[1] + " " + text_parts[2]).str.lower()

# TF-IDF vectorization (simple): uni- and bi-grams, terms seen in at least two
# products, vocabulary capped at 5000 features. Row i of product_matrix belongs
# to products.index[i].
tfidf = TfidfVectorizer(min_df=2, max_features=5000, ngram_range=(1,2))
product_matrix = tfidf.fit_transform(products["text"].values)

# 11. Recommendation function (content-based)
# Precompute cosine similarities or compute on the fly
def recommend_products_for_product(product_id, topn=10):
    """Return up to ``topn`` product ids most similar to ``product_id``.

    Similarity is the cosine similarity between rows of the module-level
    ``product_matrix``; row i is taken to belong to ``products.index[i]``.

    Args:
        product_id: id to look up in ``products.index``.
        topn: maximum number of recommendations to return.

    Returns:
        List of product ids ordered from most to least similar, never containing
        ``product_id`` itself or the same id twice. Empty when the product is
        unknown or ``topn`` is not positive.
    """
    # Vectorised position lookup; avoids building a Python list of the whole
    # index on every call and also works if the index holds repeated ids.
    positions = np.flatnonzero(products.index == product_id)
    if positions.size == 0 or topn <= 0:
        return []

    sims = cosine_similarity(product_matrix[positions[0]], product_matrix).ravel()

    recs = []
    seen = set()
    # Walk the ranking from most to least similar, skipping the query product and
    # ids already collected, until topn recommendations are gathered.
    for row in np.argsort(sims)[::-1]:
        candidate = products.index[row]
        if candidate == product_id or candidate in seen:
            continue
        seen.add(candidate)
        recs.append(candidate)
        if len(recs) == topn:
            break
    return recs

# Example: recommendations for the first product in the catalogue
sample_pid = products.index[0]
print("Recs for", sample_pid, recommend_products_for_product(sample_pid, 5))

# 12. Generate dashboard CSV: predicted propensity + top 3 recommendations per session/user
# The product attached to each session is the last product_id recorded in it
# (events ordered by event_time within the session); it seeds the
# "next-product recommendation".
events_by_session_time = events.sort_values(["session_id", "event_time"])
last_product = (
    events_by_session_time
    .groupby("session_id")
    .last()
    .reset_index()
    .loc[:, ["session_id", "product_id"]]
)
session = session.merge(last_product, on="session_id", how="left")

# Purchase probability (positive-class column) for every session in the table.
session["pred_proba"] = pipe_xgb.predict_proba(session[feature_cols])[:, 1]
# build top3 recs
def top3_recs_for_session(row):
    """Return three recommendations seeded by the row's ``product_id``.

    Gives an empty list when the id is missing or not present in
    ``products.index``.
    """
    last_pid = row["product_id"]
    has_known_product = (not pd.isna(last_pid)) and (last_pid in products.index)
    if has_known_product:
        return recommend_products_for_product(last_pid, topn=3)
    return []
# One list of up to three recommended product ids per session row.
session["top3_recs"] = session.apply(top3_recs_for_session, axis=1)

# export for dashboard
dashboard_export = session[["session_id","user_id","session_start","pred_proba","label_purchase","top3_recs","top_category","lifetime_purchases"]]
# to_csv does not create missing folders, so make sure dashboard/ exists first.
Path("dashboard").mkdir(parents=True, exist_ok=True)
dashboard_export.to_csv("dashboard/predictions_with_recs.csv", index=False)

# 13. Save TF-IDF and products for use in production
# Neither joblib.dump nor to_csv creates missing folders, so make sure models/
# exists before writing.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(tfidf, "models/tfidf_products.joblib")
# The index (product_id) is written as the first CSV column.
products.to_csv("models/products.csv")

# 14. Evaluation summary (basic)
# Share of purchasing sessions in the hold-out set, followed by a sweep over
# decision thresholds applied to the XGBoost probabilities.
print("Overall conversion rate (test):", y_test.mean())
for thresh in (0.2, 0.3, 0.4, 0.5):
    # 1 where the predicted probability reaches the threshold, else 0
    y_pred_thresh = (proba_xgb >= thresh).astype(int)
    print(f"Thresh {thresh}: precision={precision_score(y_test, y_pred_thresh):.3f}, recall={recall_score(y_test, y_pred_thresh):.3f}, f1={f1_score(y_test, y_pred_thresh):.3f}")

# 15. Example: Top customers to target (by proba * lifetime value proxy)
# If you have customer LTV or avg_order_value, multiply prob by value to rank.
# Value proxy = average price x number of purchases, with missing values read as 0.
avg_price_filled = session["lifetime_avg_price"].fillna(0)
purchases_filled = session["lifetime_purchases"].fillna(0)
session["value_proxy"] = avg_price_filled * purchases_filled

# The +1 keeps sessions with a zero value proxy ranked by probability alone.
session["expected_value"] = session["pred_proba"] * (session["value_proxy"] + 1)

# The 100 sessions with the highest expected value, best first.
top_to_target = session.sort_values(by="expected_value", ascending=False).iloc[:100]
preview_cols = ["session_id", "user_id", "pred_proba", "value_proxy", "expected_value"]
top_to_target.loc[:, preview_cols].head()

# End of notebook


ModuleNotFoundError: No module named 'pandas'