In [2]:
# 06_financial_advice_engine — Cell 1: Load artifacts

import joblib
from pathlib import Path

# Directory holding the serialized artifacts, relative to this notebook.
MODEL_DIR = Path("../models")

# Logical artifact name -> file on disk.
paths = {
    "xgb_reg": MODEL_DIR / "savings_xgb_reg.pkl",
    "xgb_reg_cols": MODEL_DIR / "savings_xgb_reg_columns.pkl",
    "scaler": MODEL_DIR / "scaler_05B.pkl",
    "kmeans": MODEL_DIR / "kmeans_05B.pkl",
    "goal_clf": MODEL_DIR / "savings_goal_xgb.pkl",
    "goal_clf_cols": MODEL_DIR / "savings_goal_xgb_columns.pkl",
}

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only point MODEL_DIR at artifacts produced by trusted notebooks.

# required — any failure here should stop the notebook, so no try/except
xgb_reg = joblib.load(paths["xgb_reg"])
xgb_features = joblib.load(paths["xgb_reg_cols"])
scaler = joblib.load(paths["scaler"])
kmeans = joblib.load(paths["kmeans"])

# optional — the engine still works without the goal classifier, so loading is
# best-effort. The model and its column list are only usable together, hence
# both are reset if either load fails. The reason is reported instead of being
# swallowed so that a version mismatch is distinguishable from a missing file.
try:
    goal_clf = joblib.load(paths["goal_clf"])
    goal_features = joblib.load(paths["goal_clf_cols"])
except Exception as exc:
    goal_clf, goal_features = None, None
    print(f"⚠️ Goal classifier not loaded ({type(exc).__name__}: {exc})")

print("✅ Loaded artifacts:")
print(f" - XGB reg: {paths['xgb_reg'].name} (features={len(xgb_features)})")
print(f" - Scaler:  {paths['scaler'].name}")
print(f" - KMeans:  {paths['kmeans'].name}")
print(" - Goal clf:", "loaded" if goal_clf is not None else "not loaded")


✅ Loaded artifacts:
 - XGB reg: savings_xgb_reg.pkl (features=25)
 - Scaler:  scaler_05B.pkl
 - KMeans:  kmeans_05B.pkl
 - Goal clf: loaded


In [3]:
# 06_financial_advice_engine — Cell 2: One-hot + align utility

import pandas as pd

def one_hot_align(df_in: pd.DataFrame, required_cols: list[str], cat_cols=("Occupation","City_Tier")) -> pd.DataFrame:
    """
    Dummy-encode the categorical columns and project onto `required_cols`.

    Parameters
    ----------
    df_in : frame to encode; it is copied, never modified.
    required_cols : column names the result must have, in this exact order.
    cat_cols : categorical column names to encode; names absent from `df_in`
        are skipped.

    Returns
    -------
    A frame with exactly `required_cols` as columns. Any required column that
    does not exist after encoding is created and filled with 0; columns not
    listed in `required_cols` are dropped.
    """
    # Restrict the encoding to the categoricals this frame actually carries.
    categorical = [name for name in cat_cols if name in df_in.columns]
    encoded = pd.get_dummies(df_in.copy(), columns=categorical, dtype=int)

    # Anything still absent after encoding gets a constant-zero column.
    absent = [name for name in required_cols if name not in encoded.columns]
    for name in absent:
        encoded[name] = 0

    # Select (and thereby order) exactly the required columns.
    return encoded.loc[:, required_cols]


In [4]:
# 06_financial_advice_engine — Cell 3: Predict savings with XGB and attach

import numpy as np
import pandas as pd

def predict_savings_xgb(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Predict savings with the loaded regressor and attach the result.

    The input is aligned to `xgb_features` via `one_hot_align`, scored with
    `xgb_reg`, floored at 0 and rounded to two decimals.

    Returns a copy of `df_raw` with a new `Pred_Savings_XGB` column. When the
    frame also has a `Desired_Savings` column, the mean absolute error between
    the two is printed as a quick sanity check.
    """
    design = one_hot_align(df_raw, xgb_features)
    raw_pred = xgb_reg.predict(design)
    # Savings cannot be negative; two decimals keeps the value at cent precision.
    savings = np.maximum(raw_pred, 0).round(2)

    result = df_raw.copy()
    result["Pred_Savings_XGB"] = savings

    # Nothing to evaluate against unless ground truth is present.
    if "Desired_Savings" not in result.columns:
        return result

    abs_err = np.abs(result["Pred_Savings_XGB"] - result["Desired_Savings"])
    mae = float(np.mean(abs_err))
    print(f"MAE vs Desired_Savings: {mae:.2f}")
    return result

In [5]:
# 06_financial_advice_engine — Cell 4: Assign cluster & persona

import pandas as pd
import numpy as np

# Features used for clustering (same flavor as your 05 notebook, with model signal).
# The list order is the column order handed to scaler.transform / kmeans.predict.
CLUSTER_FEATURES = [
    "Income",
    "Disposable_Income",
    "Pred_Savings_XGB",  # model-driven signal
    "Groceries",
    "Transport",
    "Entertainment",
]

# Persona label keyed by savings rank (0 = lowest-saving cluster).
PERSONA_NAMES = {
    0: "Budget-conscious majority",
    1: "Comfortable middle",
    2: "Affluent elite",
}


def _cluster_savings_rank(labelled: pd.DataFrame) -> dict:
    """Return {cluster_id: rank}, rank 0 being the lowest-savings cluster."""
    savings_pos = CLUSTER_FEATURES.index("Pred_Savings_XGB")
    centers = getattr(kmeans, "cluster_centers_", None)
    centers = None if centers is None else np.asarray(centers)

    if centers is not None and centers.ndim == 2 and centers.shape[1] == len(CLUSTER_FEATURES):
        # Rank on the fitted centroids so the mapping does not depend on which
        # rows happen to be in this batch (a single-row input would otherwise
        # always be rank 0).
        # NOTE(review): assumes the centroid columns follow CLUSTER_FEATURES
        # order and that the scaler is order-preserving per feature, so scaled
        # centroid order equals raw-unit order — confirm against notebook 05B.
        ordered = np.argsort(centers[:, savings_pos], kind="stable").tolist()
    else:
        # Fallback when centroids are unavailable: rank by per-cluster mean of
        # the current batch (batch-dependent).
        ordered = (
            labelled.groupby("Cluster")["Pred_Savings_XGB"]
                    .mean()
                    .sort_values()           # low → mid → high
                    .index.tolist()
        )
    return {cluster_id: rank for rank, cluster_id in enumerate(ordered)}


def assign_cluster_and_persona(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Uses loaded scaler + kmeans to assign cluster labels, then maps personas by
    ranking clusters on Pred_Savings_XGB (low→mid→high).

    The ranking is taken from the fitted k-means centroids when they are
    available, so the same cluster always receives the same persona no matter
    how many rows are scored; otherwise it falls back to per-cluster means of
    the rows in `df_in`.

    Parameters
    ----------
    df_in : frame containing every column in CLUSTER_FEATURES.

    Returns
    -------
    A copy of `df_in` with `Cluster` (int label) and `Persona` (name from
    PERSONA_NAMES; NaN if a cluster's rank has no entry there) appended.

    Raises
    ------
    KeyError
        If any column in CLUSTER_FEATURES is missing from `df_in`.
    """
    missing = [c for c in CLUSTER_FEATURES if c not in df_in.columns]
    if missing:
        raise KeyError(f"Missing required columns for clustering: {missing}")

    X = df_in[CLUSTER_FEATURES].astype(float)
    X_scaled = scaler.transform(X)
    labels = kmeans.predict(X_scaled).astype(int)

    out = df_in.copy()
    out["Cluster"] = labels

    # cluster id → savings rank → persona name
    rank_map = _cluster_savings_rank(out)
    out["Persona"] = out["Cluster"].map(rank_map).map(PERSONA_NAMES)

    return out


In [6]:
# 06_financial_advice_engine — Cell 5: Goal achievement classification (optional)

import numpy as np
import pandas as pd

def classify_goal(df_in: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    """
    Uses loaded goal_clf (+ goal_features) to estimate the probability of achieving
    the user's savings goal. Adds:
      - Goal_Prob: probability of positive class (achieve), rounded to 3 decimals
      - Goal_Label: 'Likely' if Goal_Prob >= threshold, else 'Unlikely'
    If goal_clf is not loaded, returns a copy of the input unchanged.

    Parameters
    ----------
    df_in : frame to score; it is aligned to `goal_features` via `one_hot_align`.
    threshold : cut-off applied to Goal_Prob (default 0.5).

    Returns
    -------
    A copy of `df_in`, with Goal_Prob and Goal_Label appended when the
    classifier is available.
    """
    if goal_clf is None or goal_features is None:
        print("ℹ️ Goal classifier not loaded; skipping classification.")
        return df_in.copy()

    # One-hot align to classifier feature space
    Xg = one_hot_align(df_in, goal_features)

    # Some classifiers expose predict_proba; fall back to decision_function if needed
    if hasattr(goal_clf, "predict_proba"):
        # Column 1 is taken as the positive ("achieve") class.
        probs = goal_clf.predict_proba(Xg)[:, 1]
    elif hasattr(goal_clf, "decision_function"):
        # Logistic squash of the margin as a rough probability proxy. Unlike a
        # per-batch min-max, each row's value depends only on its own score
        # (so single-row input works) and 0.5 corresponds to a score of 0.
        # The tanh form is the sigmoid written so large |score| cannot overflow.
        scores = np.asarray(goal_clf.decision_function(Xg), dtype=float)
        probs = 0.5 * (1.0 + np.tanh(0.5 * scores))
    else:
        # Last resort: binary prediction only
        probs = np.asarray(goal_clf.predict(Xg), dtype=float)

    out = df_in.copy()
    out["Goal_Prob"] = np.round(probs, 3)
    out["Goal_Label"] = np.where(out["Goal_Prob"] >= threshold, "Likely", "Unlikely")
    return out


In [7]:
# 06_financial_advice_engine — Cell 6: Run the advice engine end-to-end

import pandas as pd

def run_advice_engine(df_input: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full advice pipeline on `df_input`.

    Stages, applied in order, each receiving the previous stage's output:
      1) predict_savings_xgb          — predicted savings (XGB)
      2) assign_cluster_and_persona   — cluster & persona (scaler + kmeans)
      3) classify_goal                — goal likelihood (no-op if goal_clf wasn't loaded)

    Returns the frame produced by the last stage: the input plus the columns
    the stages append.
    """
    stages = (predict_savings_xgb, assign_cluster_and_persona, classify_goal)

    frame = df_input
    for stage in stages:
        frame = stage(frame)
    return frame
