# 05 — Train Model (Logistic Regression, 30-day churn)

This notebook trains a **baseline churn model** on the labeled dataset.

## Inputs
- `data/processed/model_dataset_label_30d.parquet` (features + churn_label)

## Outputs
- `artifacts/churn_model_v1.joblib` (sklearn pipeline)
- `artifacts/feature_list.json` (feature order used during training)
- `artifacts/train_meta_v1.json` (metrics + dataset info)

## Notes
- This is a baseline model meant to be easy to interpret and deploy.
- The pipeline standardizes numeric features and trains Logistic Regression with class balancing.


In [None]:
from pathlib import Path
import sys
import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix
)
import joblib

# Project root is taken to be the parent of the current working directory
# (presumably the notebook runs from a subfolder of the project -- confirm).
PROJECT_ROOT = Path.cwd().parent
# Expose the project root on the import path for later imports.
sys.path.append(str(PROJECT_ROOT))

# Input dataset location and output directory for trained artifacts.
DATASET_PATH = PROJECT_ROOT.joinpath(
    "data", "processed", "model_dataset_label_30d.parquet"
)
ARTIFACT_DIR = PROJECT_ROOT.joinpath("artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True)

print(f"DATASET_PATH: {DATASET_PATH}")
print(f"ARTIFACT_DIR: {ARTIFACT_DIR}")


In [None]:
# Read the labeled modelling table from parquet into memory.
df = pd.read_parquet(DATASET_PATH)
print(f"dataset shape: {df.shape}")

# Preview the first rows (rendered as the cell output).
df.head()


In [None]:
# Column names used throughout the notebook.
TARGET = "churn_label"
ID_COL = "external_customerkey"

# Label sanity: class counts (NaN included) and the positive rate.
label_col = df[TARGET]
print(label_col.value_counts(dropna=False))
print("label mean:", float(label_col.mean()))

# Missingness overview: the 15 columns with the highest share of NaN values.
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False).head(15)


In [None]:
# Build X/y
# The label is cast to int; astype(int) raises if the label column holds NaN.
y = df[TARGET].astype(int)

# Every column except the identifier and the label is a candidate feature.
non_feature_cols = [c for c in (ID_COL, TARGET) if c in df.columns]
X = df.drop(columns=non_feature_cols).copy()

# Drop task-misaligned features (optional)
DROP_FEATURES = ["total_revenue", "avg_order_value"]
X = X.drop(columns=DROP_FEATURES, errors="ignore")

# Keep only numeric columns FIRST, then clean them.
# Filling before selecting would also write 0 into non-numeric columns that
# are discarded anyway; that can fail for dtypes that cannot hold 0 (e.g.
# categoricals) and lets fillna's dtype handling influence which columns
# count as numeric.
X = X.select_dtypes(include=[np.number])

# Treat +/-inf like missing values, then impute every gap with 0.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# Drop zero-variance columns (safety): a column with a single distinct value
# carries no signal.
zero_std = X.columns[X.nunique(dropna=False) <= 1].tolist()
if zero_std:
    X = X.drop(columns=zero_std)
    print("Dropped zero-std cols:", zero_std)

print("X shape:", X.shape)
X.dtypes.head()


In [None]:
# Random, label-stratified 80/20 split used as the baseline evaluation.
# (Use the timesplit eval pipeline for leakage-safe validation.)
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Report the size and positive rate of each partition.
for part_tag, part_X, part_y in (
    ("train:", X_train, y_train),
    ("test :", X_test, y_test),
):
    print(part_tag, part_X.shape, "label mean:", float(part_y.mean()))


In [None]:
# Baseline pipeline: standardize the features, then fit a class-balanced
# logistic regression using the liblinear solver.
scaler_step = StandardScaler()
clf_step = LogisticRegression(
    class_weight="balanced",
    solver="liblinear",
    max_iter=2000,
)
model = Pipeline(steps=[("scaler", scaler_step), ("clf", clf_step)])

# Fit on the training partition only; the fitted pipeline is the cell output.
model.fit(X_train, y_train)


In [None]:
# Baseline evaluation on the held-out partition.
# Column 1 of predict_proba is taken as the score for the positive class.
y_proba = model.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics.
roc = roc_auc_score(y_test, y_proba)
pr = average_precision_score(y_test, y_proba)
print("ROC_AUC:", roc)
print("PR_AUC :", pr)

# Hard predictions at a 0.5 probability cut-off.
y_pred_50 = (y_proba >= 0.5).astype(int)
report_50 = classification_report(y_test, y_pred_50)
print("\nReport @0.5:\n", report_50)

# Confusion matrix at the same cut-off (rendered as the cell output).
confusion_matrix(y_test, y_pred_50)


In [None]:
# Persist the fitted pipeline, the training feature order, and run metadata.
MODEL_PATH = ARTIFACT_DIR / "churn_model_v1.joblib"
FEATURES_PATH = ARTIFACT_DIR / "feature_list.json"
META_PATH = ARTIFACT_DIR / "train_meta_v1.json"

# 1) Fitted sklearn pipeline.
joblib.dump(model, MODEL_PATH)

# 2) Column order seen during training.
feature_list = [*X_train.columns]
FEATURES_PATH.write_text(
    json.dumps(feature_list, indent=2),
    encoding="utf-8",
)

# 3) Metrics and dataset info, cast to plain Python types for JSON.
meta = dict(
    dataset_path=str(DATASET_PATH),
    n_rows=int(len(df)),
    n_features=int(X.shape[1]),
    dropped_zero_std=zero_std,
    dropped_features=DROP_FEATURES,
    roc_auc=float(roc),
    pr_auc=float(pr),
    train_rows=int(len(X_train)),
    test_rows=int(len(X_test)),
    label_mean=float(y.mean()),
)
META_PATH.write_text(json.dumps(meta, indent=2), encoding="utf-8")

for saved_path in (MODEL_PATH, FEATURES_PATH, META_PATH):
    print("Saved:", saved_path)
