In [17]:
import pandas as pd, os, glob

# Load the same raw dataset used earlier
df = pd.read_csv("../data/personal_finance_expenses.csv")
print("Data shape:", df.shape)
display(df.head(3))

# Peek at available trained model files (from 01–04)
model_dir = "../models"
if os.path.isdir(model_dir):
    model_files = sorted(
        os.path.basename(p)
        for p in glob.glob(os.path.join(model_dir, "*"))
        if p.endswith((".pkl", ".joblib"))
    )
    print(f"Models found in {model_dir}:", model_files)
else:
    print("⚠️ Models directory not found at ../models — create it or adjust the path.")


Data shape: (20000, 27)


Unnamed: 0,Income,Age,Dependents,Occupation,City_Tier,Rent,Loan_Repayment,Insurance,Groceries,Transport,...,Desired_Savings,Disposable_Income,Potential_Savings_Groceries,Potential_Savings_Transport,Potential_Savings_Eating_Out,Potential_Savings_Entertainment,Potential_Savings_Utilities,Potential_Savings_Healthcare,Potential_Savings_Education,Potential_Savings_Miscellaneous
0,44637.249636,49,0,Self_Employed,Tier_1,13391.174891,0.0,2206.490129,6658.768341,2636.970696,...,6200.537192,11265.627707,1685.696222,328.895281,465.769172,195.15132,678.292859,67.682471,0.0,85.735517
1,26858.596592,34,2,Retired,Tier_2,5371.719318,0.0,869.522617,2818.44446,1543.018778,...,1923.176434,9676.818733,540.306561,119.347139,141.866089,234.131168,286.668408,6.603212,56.306874,97.388606
2,50367.605084,35,1,Student,Tier_3,7555.140763,4612.103386,2201.80005,6313.222081,3221.396403,...,7050.360422,13891.450624,1466.073984,473.549752,410.857129,459.965256,488.383423,7.290892,106.653597,138.542422


Models found in ../models: ['kmeans_05B.pkl', 'savings_goal_xgb.pkl', 'savings_goal_xgb_columns.pkl', 'savings_xgb_reg.pkl', 'savings_xgb_reg_columns.pkl', 'scaler_05B.pkl']


In [18]:
# Cell 2 — XGBoost Regression: load, align features, predict, attach

import joblib
import numpy as np
import pandas as pd

# 1) Load XGB regression model + its feature list
xgb_model_path = "../models/savings_xgb_reg.pkl"
xgb_cols_path  = "../models/savings_xgb_reg_columns.pkl"

xgb_model = joblib.load(xgb_model_path)
xgb_features = joblib.load(xgb_cols_path)
print("✅ Loaded XGBoost Regression")
print(f"XGB expects {len(xgb_features)} features")

# 2) One-hot encode and align
df_xgb_enc = pd.get_dummies(df, columns=["Occupation","City_Tier"], dtype=int)

# add any missing training columns as 0; keep exact order
for col in xgb_features:
    if col not in df_xgb_enc.columns:
        df_xgb_enc[col] = 0

X_xgb = df_xgb_enc[xgb_features]
print("Input shape for XGB:", X_xgb.shape)

# 3) Predict
xgb_pred = xgb_model.predict(X_xgb)
xgb_pred = np.maximum(xgb_pred, 0).round(2)

df["Pred_Savings_XGB"] = xgb_pred

# 4) Quick evaluation vs ground truth (if available)
if "Desired_Savings" in df.columns:
    mae_xgb = np.mean(np.abs(df["Pred_Savings_XGB"] - df["Desired_Savings"]))
    print(f"XGBoost Regression — MAE vs Desired_Savings: {mae_xgb:.2f}")

# 5) Preview
df.filter(items=["Income","Disposable_Income","Desired_Savings","Pred_Savings_XGB"]).head(5)


✅ Loaded XGBoost Regression
XGB expects 25 features
Input shape for XGB: (20000, 25)
XGBoost Regression — MAE vs Desired_Savings: 507.92


Unnamed: 0,Income,Disposable_Income,Desired_Savings,Pred_Savings_XGB
0,44637.249636,11265.627707,6200.537192,5703.589844
1,26858.596592,9676.818733,1923.176434,1743.199951
2,50367.605084,13891.450624,7050.360422,6398.379883
3,101455.600247,31617.953615,16694.965136,15308.830078
4,24875.283548,6265.700532,1874.099434,1682.400024


In [19]:
# Cell 3 — Clustering with model signal (XGBoost prediction)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd

# 0) sanity check
assert "Pred_Savings_XGB" in df.columns, "Run Cell 6 first to compute Pred_Savings_XGB."

# 1) choose features (same flavor as 05, but swap in model prediction)
cluster_features = [
    "Income",
    "Disposable_Income",
    "Pred_Savings_XGB",     # <-- model-driven signal
    "Groceries",
    "Transport",
    "Entertainment",
]

# drop any rows with NaNs in the features (should be none, but safe)
X = df[cluster_features].dropna().copy()

# 2) scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3) cluster (k=3 to match your earlier segments)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=20)
labels = kmeans.fit_predict(X_scaled)

# 4) attach back to df (align indices)
df.loc[X.index, "cluster_k3_model"] = labels.astype(int)

# 5) evaluate + profile (in original units)
sil = silhouette_score(X_scaled, labels)
sizes = pd.Series(labels).value_counts().sort_index()
profile_model = df.loc[X.index, cluster_features + ["cluster_k3_model"]].groupby("cluster_k3_model").mean().round(2)
profile_model["Count"] = sizes

print(f"KMeans (k=3) with XGB signal — silhouette: {sil:.3f}\n")
print("Cluster sizes:")
display(sizes.to_frame("count"))

print("\nCluster centers (original units):")
display(profile_model)


KMeans (k=3) with XGB signal — silhouette: 0.588

Cluster sizes:


Unnamed: 0,count
0,14571
1,4704
2,725



Cluster centers (original units):


Unnamed: 0_level_0,Income,Disposable_Income,Pred_Savings_XGB,Groceries,Transport,Entertainment,Count
cluster_k3_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,24222.11,6007.87,1752.589966,3033.09,1573.03,836.7,14571
1.0,73609.21,19009.61,8832.360352,9206.31,4785.1,2575.8,4704
2.0,182774.85,49635.19,31028.060547,22912.8,11944.21,6439.95,725


In [20]:
# Cell 4 — Add personas to existing clusters (no recompute, no saving)

# Must have the cluster column from Cell 3
if "cluster_k3_model" not in df.columns:
    raise KeyError("cluster_k3_model not found — run Cell 3 first.")

# clean dtype
df["cluster_k3_model"] = df["cluster_k3_model"].astype("int64")

# Pick a ranking metric if available (for data-driven persona ordering)
metric = "Pred_Savings_XGB" if "Pred_Savings_XGB" in df.columns else (
    "Disposable_Income" if "Disposable_Income" in df.columns else None
)

if metric is not None:
    order = (
        df.groupby("cluster_k3_model")[metric]
          .mean()
          .sort_values()             # low → mid → high
          .index.tolist()
    )
    rank_map = {cl: rank for rank, cl in enumerate(order)}
    persona_names = {
        0: "Budget-conscious majority",
        1: "Comfortable middle",
        2: "Affluent elite",
    }
    df["persona_model"] = df["cluster_k3_model"].map(rank_map).map(persona_names)
else:
    # Fallback: fixed mapping if no metric exists
    df["persona_model"] = df["cluster_k3_model"].map({
        0: "Budget-conscious majority",
        1: "Comfortable middle",
        2: "Affluent elite",
    })

# Quick checks
print("Counts by cluster:")
print(df["cluster_k3_model"].value_counts().sort_index())
print("\nCounts by persona:")
print(df["persona_model"].value_counts(dropna=False))


Counts by cluster:
cluster_k3_model
0    14571
1     4704
2      725
Name: count, dtype: int64

Counts by persona:
persona_model
Budget-conscious majority    14571
Comfortable middle            4704
Affluent elite                 725
Name: count, dtype: int64


In [21]:
import joblib

# after fitting KMeans + scaler in 05B
joblib.dump(kmeans, "../models/kmeans_05B.pkl")
joblib.dump(scaler, "../models/scaler_05B.pkl")

print("✅ saved kmeans_05B.pkl and scaler_05B.pkl")


✅ saved kmeans_05B.pkl and scaler_05B.pkl
