# AI/ML Models — Customer Churn & Demand Forecasting

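Trains two models on the Gold layer using scikit-learn and Prophet, both tracked via MLflow.
The churn model is also registered in the Unity Catalog Model Registry; the demand forecast is saved to a Gold table and logged to MLflow as a results table.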

| Model | Algorithm | Purpose |
|---|---|---|
| Customer Churn Predictor | GradientBoosting (sklearn) | Identify at-risk customers |
| Demand Forecaster | Prophet | Monthly revenue forecast per region |

**Prereqs**: Run notebooks 00–05 first.

## 1 — Configuration & Installs

In [None]:
%pip install scikit-learn mlflow prophet --quiet
# Restart the Python process after the install above so later cells import the
# newly installed package versions.
dbutils.library.restartPython()

In [None]:
import mlflow
from pyspark.sql import functions as F

# Point the MLflow model registry at Unity Catalog ("databricks-uc").
mlflow.set_registry_uri("databricks-uc")

# Every schema name is derived from whichever catalog the session currently uses.
CATALOG = spark.catalog.currentCatalog()
GOLD, SILVER, MODELS = (
    f"{CATALOG}.retail_{layer}" for layer in ("gold", "silver", "models")
)

# Make sure the schema that will hold the registered models exists, then describe it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {MODELS}")
spark.sql(f"COMMENT ON SCHEMA {MODELS} IS 'ML models for retail analytics'")

# Echo the resolved configuration so the cell output records what was targeted.
print(
    "\n".join(
        f"{label:<8}: {value}"
        for label, value in (
            ("Catalog", CATALOG),
            ("Gold", GOLD),
            ("Models", MODELS),
            ("MLflow", "Unity Catalog"),
        )
    )
)

---
## MODEL 1: Customer Churn Prediction

**Objective**: Predict which customers are likely to churn (become "At Risk", "Needs Attention", or "Lost") so marketing can intervene.

**Algorithm**: scikit-learn GradientBoostingClassifier (serverless compatible — SparkML is blocked).

**Label**: Binary — 1 = at-risk/lost, 0 = healthy.

### 2 — Prepare Training Data

In [None]:
import pandas as pd
import numpy as np

# Read the customer RFM table from the Gold layer, attach a binary churn label
# and bring the modelling columns into pandas.
# NOTE(review): churn_label is computed directly from rfm_segment. If rfm_segment
# is itself derived from the r/f/m scores selected below, the features can
# reconstruct the label (label leakage) — confirm against the Gold-layer logic.
df_rfm = spark.table(f"{GOLD}.gold_customer_rfm")

# Segments treated as churned (label 1); every other segment is labelled 0.
churn_segments = ("At Risk", "Needs Attention", "Lost")
is_churned = F.when(F.col("rfm_segment").isin(*churn_segments), 1).otherwise(0)

training_columns = [
    "customer_key",
    "recency_days",
    "frequency",
    F.col("monetary").alias("lifetime_value"),  # exposed downstream as lifetime_value
    "avg_order_value",
    "r_score",
    "f_score",
    "m_score",
    "rfm_score",
    "customer_lifetime_days",
    "market_segment",
    "customer_region",
    "churn_label",
]

labelled_rfm = df_rfm.withColumn("churn_label", is_churned)
pdf = labelled_rfm.select(*training_columns).toPandas()

print(f"Total customers: {len(pdf):,}")
print("\nChurn distribution:")
print(pdf["churn_label"].value_counts())

### 3 — Build scikit-learn Pipeline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report

# Model inputs: two categorical descriptors plus nine numeric RFM measures.
categorical_features = ["market_segment", "customer_region"]
numeric_features = [
    "recency_days",
    "frequency",
    "lifetime_value",
    "avg_order_value",
    "r_score",
    "f_score",
    "m_score",
    "rfm_score",
    "customer_lifetime_days",
]
all_features = [*numeric_features, *categorical_features]

# Copy the feature matrix so later edits to `pdf` do not touch it.
X = pdf[all_features].copy()
y = pdf["churn_label"].values

# Hold out 20% of customers for evaluation. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

n_numeric, n_categorical = len(numeric_features), len(categorical_features)
print(f"Train: {len(X_train):,} rows")
print(f"Test:  {len(X_test):,} rows")
print(f"Features: {len(all_features)} ({n_numeric} numeric + {n_categorical} categorical)")

### 4 — Train & Evaluate with MLflow

In [None]:
from mlflow.models.signature import infer_signature

# Preprocessing: standardise the numeric columns and map each categorical column
# to integer codes; categories not seen during fit are encoded as -1.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    categorical_features,
)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Gradient-boosted trees on top of the preprocessed features.
gbt_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.1,
    "random_state": 42,
}
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("classifier", GradientBoostingClassifier(**gbt_params)),
    ]
)

# The experiment lives under the current user's workspace folder.
current_user = spark.sql("SELECT current_user()").collect()[0][0]
experiment_name = f"/Users/{current_user}/retail_churn_experiment"
mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name="churn_gbt_sklearn_v1") as run:
    # Autologging is enabled with model logging switched off; the fitted
    # pipeline is logged explicitly further down with a signature.
    mlflow.sklearn.autolog(log_models=False)

    pipeline.fit(X_train, y_train)

    # Hard predictions plus the probability of the second class (label 1).
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Hold-out metrics.
    auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    test_metrics = {"test_auc": auc, "test_f1": f1, "test_accuracy": acc}
    mlflow.log_metrics(test_metrics)

    mlflow.log_param("features", str(all_features))
    mlflow.log_param("algorithm", "GradientBoostingClassifier")

    # Log the whole pipeline (preprocessing + classifier) with an input/output
    # signature and a three-row input example.
    signature = infer_signature(X_train, y_pred)
    mlflow.sklearn.log_model(
        pipeline,
        "model",
        signature=signature,
        input_example=X_train.head(3),
    )

    banner = "=" * 50
    print(f"\n{banner}")
    print(f"  AUC-ROC  : {auc:.4f}")
    print(f"  F1 Score : {f1:.4f}")
    print(f"  Accuracy : {acc:.4f}")
    print(f"  Run ID   : {run.info.run_id}")
    print(banner)
    print("\n" + classification_report(y_test, y_pred, target_names=["Healthy", "Churn"]))

### 5 — Register Model in Unity Catalog

In [None]:
# Register the "model" artifact of the training run under the models schema.
model_name = f"{MODELS}.customer_churn_model"
model_uri = f"runs:/{run.info.run_id}/model"
registered = mlflow.register_model(model_uri=model_uri, name=model_name)

print(
    f"✓ Model registered: {model_name}\n"
    f"  Version: {registered.version}\n"
    f"  Source:  {registered.source}"
)

In [None]:
# Tag the newly registered version with the "champion" alias (intended for
# production use).
from mlflow import MlflowClient

client = MlflowClient()
champion_version = registered.version
client.set_registered_model_alias(
    name=model_name,
    alias="champion",
    version=champion_version,
)
print(f"✓ Alias 'champion' set to version {champion_version}")

### 6 — Feature Importance & Explainability

In [None]:
# Pull the fitted gradient-boosting estimator back out of the pipeline.
gbt_model = pipeline.named_steps["classifier"]
importances = gbt_model.feature_importances_

# The preprocessor emits the numeric columns first, then one ordinal-encoded
# column per categorical feature, so this ordering lines up with `importances`.
feature_names = [*numeric_features, *categorical_features]

# One row per feature, most important first.
fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
fi_df = fi_df.sort_values(by="importance", ascending=False)

print("Feature Importance:")
display(spark.createDataFrame(fi_df))

### 7 — Score All Customers (Batch Inference)

In [None]:
# Score every customer in `pdf` with the in-memory pipeline. This includes the
# rows the model was trained on, so these are not hold-out predictions.
X_all = pdf[all_features].copy()
pdf["churn_prediction"] = pipeline.predict(X_all)
churn_proba = pipeline.predict_proba(X_all)[:, 1]
pdf["churn_probability"] = np.round(churn_proba, 4)

# Bucket the probabilities into four tiers. The outer edges sit just outside
# [0, 1] so that exactly 0.0 and 1.0 still fall into a bucket.
tier_edges = [-0.01, 0.4, 0.6, 0.8, 1.01]
tier_names = ["Low", "Medium", "High", "Critical"]
pdf["risk_tier"] = pd.cut(pdf["churn_probability"], bins=tier_edges, labels=tier_names)

# Columns published to the Gold table.
output_columns = [
    "customer_key",
    "market_segment",
    "customer_region",
    "recency_days",
    "frequency",
    "lifetime_value",
    "rfm_score",
    "churn_label",
    "churn_prediction",
    "churn_probability",
    "risk_tier",
]
churn_scores_pdf = pdf[output_columns].copy()

# risk_tier is a pandas categorical; cast it to plain strings before handing
# the frame to Spark, then overwrite the Gold table (schema included).
scores_table = f"{GOLD}.gold_churn_scores"
churn_scores_sdf = spark.createDataFrame(churn_scores_pdf.astype({"risk_tier": str}))
(
    churn_scores_sdf.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(scores_table)
)

print(f"✓ Churn scores saved to {scores_table}")
print(f"  Total customers scored: {len(churn_scores_pdf):,}")
print("\nRisk tier distribution:")

# Per-tier customer count, mean probability and mean lifetime value,
# riskiest tier first.
tier_summary = (
    spark.table(scores_table)
    .groupBy("risk_tier")
    .agg(
        F.count("*").alias("customers"),
        F.round(F.avg("churn_probability"), 3).alias("avg_churn_prob"),
        F.round(F.avg("lifetime_value"), 0).alias("avg_ltv"),
    )
    .orderBy(F.desc("avg_churn_prob"))
)
display(tier_summary)

---
## MODEL 2: Monthly Demand Forecast (per Region)

**Objective**: Forecast monthly net revenue by region for the next 6 months.

**Approach**: Use Prophet on each region's time series (pandas-based, serverless compatible).

### 8 — Prepare Monthly Revenue Data

In [None]:
# Aggregate net revenue to one row per (year_month, region) and bring it to pandas.
monthly_sales = spark.table(f"{GOLD}.gold_monthly_sales")
monthly_pdf = (
    monthly_sales
    .groupBy("year_month", "region")
    .agg(F.sum("net_revenue").alias("net_revenue"))
    .toPandas()
)

# Reshape into the `ds` (date) / `y` (value) columns used for the Prophet fit.
# assumes year_month is a string such as "YYYY-MM", so appending "-01" gives
# the first day of the month — TODO confirm against the Gold table
monthly_pdf["ds"] = pd.to_datetime(monthly_pdf["year_month"] + "-01")
monthly_pdf = (
    monthly_pdf
    .rename(columns={"net_revenue": "y"})
    .sort_values(["region", "ds"])
)

region_names = sorted(monthly_pdf["region"].unique())
first_month, last_month = monthly_pdf["ds"].min(), monthly_pdf["ds"].max()
print(f"Regions: {region_names}")
print(f"Date range: {first_month} → {last_month}")
print(f"Total rows: {len(monthly_pdf):,}")

### 9 — Forecast per Region with Prophet

In [None]:
from prophet import Prophet
import warnings
# Fit one Prophet model per region, forecast the next months, and persist the
# combined history + forecast to the Gold layer.
FORECAST_MONTHS = 6   # horizon, in month-start ("MS") periods
MIN_HISTORY_ROWS = 2  # Prophet refuses to fit on fewer than 2 non-NaN observations

all_forecasts = []

# Suppress library warnings only while fitting/predicting. A global
# filterwarnings("ignore") would also hide warnings from every later cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for region in sorted(monthly_pdf["region"].unique()):
        region_df = monthly_pdf[monthly_pdf["region"] == region][["ds", "y"]].copy()

        # Skip regions that cannot be fitted instead of aborting every region.
        usable_rows = int(region_df["y"].notna().sum())
        if usable_rows < MIN_HISTORY_ROWS:
            print(f"  ✗ {region}: only {usable_rows} usable months of history — skipped")
            continue

        m = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False,
            changepoint_prior_scale=0.05,
        )
        m.fit(region_df)

        future = m.make_future_dataframe(periods=FORECAST_MONTHS, freq="MS")
        forecast = m.predict(future)

        # Left-join the actuals back on so history rows carry both y and yhat.
        result = forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].merge(region_df, on="ds", how="left")
        result["region"] = region

        # Classify by date rather than by "y is missing": a historical month
        # with missing revenue is still history, not a forecast.
        last_history_date = region_df["ds"].max()
        result["is_forecast"] = (result["ds"] > last_history_date).map(
            {True: "forecast", False: "actual"}
        )
        all_forecasts.append(result)

        print(f"  ✓ {region}: {len(region_df)} months history → {FORECAST_MONTHS} months forecast")

if not all_forecasts:
    # pd.concat on an empty list fails with an unhelpful message; be explicit.
    raise ValueError("No region had enough history to fit a demand forecast")

forecast_pdf = pd.concat(all_forecasts, ignore_index=True)

# Save to Gold
forecast_sdf = spark.createDataFrame(forecast_pdf[["region", "ds", "y", "yhat", "yhat_lower", "yhat_upper", "is_forecast"]])
forecast_sdf.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(f"{GOLD}.gold_demand_forecast")

print(f"\n✓ Forecasts saved to {GOLD}.gold_demand_forecast")
display(
    spark.table(f"{GOLD}.gold_demand_forecast")
    .filter(F.col("is_forecast") == "forecast")
    .orderBy("region", "ds")
)

### 10 — Log Forecast to MLflow

In [None]:
# Record the forecast table and an in-sample error metric in a dedicated
# MLflow experiment under the current user's workspace folder.
experiment_name = f"/Users/{spark.sql('SELECT current_user()').collect()[0][0]}/retail_demand_forecast"
mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name="demand_forecast_prophet_v1") as run:
    mlflow.log_table(forecast_pdf, artifact_file="forecast_results.json")

    # MAPE over rows that have actuals. These are in-sample fitted values, not
    # a hold-out evaluation.
    actuals = forecast_pdf[forecast_pdf["is_forecast"] == "actual"].copy()
    abs_error = (actuals["y"] - actuals["yhat"]).abs()
    # Divide by |y| so negative revenue cannot produce a negative error, and
    # blank out rows where y == 0, whose percentage error is undefined and
    # would otherwise turn the mean into inf.
    actuals["ape"] = (abs_error / actuals["y"].abs()).where(actuals["y"] != 0)
    mape = actuals["ape"].mean() * 100  # mean() skips the NaN rows

    # With no usable actuals the mean is NaN; skip the metric rather than log NaN.
    if pd.notna(mape):
        mlflow.log_metric("mape_pct", round(mape, 2))
    mlflow.log_param("forecast_horizon_months", 6)
    mlflow.log_param("regions", str(sorted(forecast_pdf["region"].unique().tolist())))

    print(f"  MAPE: {mape:.2f}%")
    print(f"  Run ID: {run.info.run_id}")

## 11 — ML Summary

In [None]:
# Final recap: where the churn model and the two Gold outputs live, with row counts.
churn_scores_table = f"{GOLD}.gold_churn_scores"
demand_forecast_table = f"{GOLD}.gold_demand_forecast"

print(f"Churn model  : {MODELS}.customer_churn_model")
scored_rows = spark.table(churn_scores_table).count()
print(f"  Scores     : {churn_scores_table}  ({scored_rows:,} rows)")
forecast_rows = spark.table(demand_forecast_table).count()
print(f"Forecast     : {demand_forecast_table}  ({forecast_rows:,} rows)")

---
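Both models are trained: the churn model is registered in Unity Catalog, and the demand forecasts are saved to the Gold layer and logged to MLflow. Continue with `07_ai_agents.ipynb`.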