In [1]:
import pandas as pd, os, glob

# Load the same raw dataset used earlier
df = pd.read_csv("../data/personal_finance_expenses.csv")
print("Data shape:", df.shape)
display(df.head(3))

# Peek at available trained model files (from 01–04)
model_dir = "../models"
if os.path.isdir(model_dir):
    model_files = sorted(
        os.path.basename(p)
        for p in glob.glob(os.path.join(model_dir, "*"))
        if p.endswith((".pkl", ".joblib"))
    )
    print(f"Models found in {model_dir}:", model_files)
else:
    print("⚠️ Models directory not found at ../models — create it or adjust the path.")


Data shape: (20000, 27)


Unnamed: 0,Income,Age,Dependents,Occupation,City_Tier,Rent,Loan_Repayment,Insurance,Groceries,Transport,...,Desired_Savings,Disposable_Income,Potential_Savings_Groceries,Potential_Savings_Transport,Potential_Savings_Eating_Out,Potential_Savings_Entertainment,Potential_Savings_Utilities,Potential_Savings_Healthcare,Potential_Savings_Education,Potential_Savings_Miscellaneous
0,44637.249636,49,0,Self_Employed,Tier_1,13391.174891,0.0,2206.490129,6658.768341,2636.970696,...,6200.537192,11265.627707,1685.696222,328.895281,465.769172,195.15132,678.292859,67.682471,0.0,85.735517
1,26858.596592,34,2,Retired,Tier_2,5371.719318,0.0,869.522617,2818.44446,1543.018778,...,1923.176434,9676.818733,540.306561,119.347139,141.866089,234.131168,286.668408,6.603212,56.306874,97.388606
2,50367.605084,35,1,Student,Tier_3,7555.140763,4612.103386,2201.80005,6313.222081,3221.396403,...,7050.360422,13891.450624,1466.073984,473.549752,410.857129,459.965256,488.383423,7.290892,106.653597,138.542422


Models found in ../models: ['savings_goal_dt.pkl', 'savings_goal_dt_columns.pkl', 'savings_goal_xgb.pkl', 'savings_goal_xgb_columns.pkl', 'savings_linear_reg.pkl', 'savings_linear_reg_columns.pkl', 'savings_xgb_reg.pkl', 'savings_xgb_reg_columns.pkl']


In [2]:
import joblib

# pick one model for now
model_path = "../models/savings_linear_reg.pkl"
cols_path  = "../models/savings_linear_reg_columns.pkl"

# load model + the training columns
lin_model = joblib.load(model_path)
lin_features = joblib.load(cols_path)

print("✅ Loaded linear regression model")
print("Expected features:", lin_features)


✅ Loaded linear regression model
Expected features: ['Income', 'Age', 'Dependents', 'Rent', 'Loan_Repayment', 'Insurance', 'Groceries', 'Transport', 'Eating_Out', 'Entertainment', 'Utilities', 'Healthcare', 'Education', 'Miscellaneous', 'Desired_Savings_Percentage', 'Disposable_Income', 'total_expenses', 'estimated_savings', 'Occupation_Professional', 'Occupation_Retired', 'Occupation_Self_Employed', 'Occupation_Student', 'City_Tier_Tier_1', 'City_Tier_Tier_2', 'City_Tier_Tier_3']


In [3]:
# ensure total_expenses & estimated_savings exist (recompute if not present)
if "total_expenses" not in df.columns:
    expense_cols = ["Rent","Loan_Repayment","Insurance","Groceries","Transport",
                    "Eating_Out","Entertainment","Utilities","Healthcare","Education","Miscellaneous"]
    df["total_expenses"] = df[expense_cols].sum(axis=1)

if "estimated_savings" not in df.columns:
    df["estimated_savings"] = df["Income"] - df["total_expenses"]

# one-hot encode Occupation and City_Tier
df_enc = pd.get_dummies(df, columns=["Occupation","City_Tier"])

# align with training features (add any missing cols, keep order)
for col in lin_features:
    if col not in df_enc.columns:
        df_enc[col] = 0  # fill missing with 0

X_lin = df_enc[lin_features]

print("Input shape for linear model:", X_lin.shape)
X_lin.head(3)


Input shape for linear model: (20000, 25)


Unnamed: 0,Income,Age,Dependents,Rent,Loan_Repayment,Insurance,Groceries,Transport,Eating_Out,Entertainment,...,Disposable_Income,total_expenses,estimated_savings,Occupation_Professional,Occupation_Retired,Occupation_Self_Employed,Occupation_Student,City_Tier_Tier_1,City_Tier_Tier_2,City_Tier_Tier_3
0,44637.249636,49,0,13391.174891,0.0,2206.490129,6658.768341,2636.970696,1651.801726,1536.184255,...,11265.627707,33371.621929,11265.627707,False,False,True,False,True,False,False
1,26858.596592,34,2,5371.719318,0.0,869.522617,2818.44446,1543.018778,649.378103,1050.241738,...,9676.818733,17181.777859,9676.818733,False,True,False,False,False,True,False
2,50367.605084,35,1,7555.140763,4612.103386,2201.80005,6313.222081,3221.396403,1513.814376,1723.306926,...,13891.450624,36476.154459,13891.450624,False,False,False,True,False,False,True


In [4]:
# Cell 4 — predict with Linear Regression and attach to df

import numpy as np

# predict desired savings (or whatever target the linear model was trained on)
lin_pred = lin_model.predict(X_lin)

# optional: cleanups (no negatives, round to 2 decimals)
lin_pred = np.maximum(lin_pred, 0).round(2)

# add to dataframe
df["Pred_Savings_LR"] = lin_pred

# quick evaluation against actual Desired_Savings if available
if "Desired_Savings" in df.columns:
    mae = np.mean(np.abs(df["Pred_Savings_LR"] - df["Desired_Savings"]))
    print(f"Linear Regression — MAE vs Desired_Savings: {mae:.2f}")

# preview
df[["Income","Disposable_Income","Desired_Savings","Pred_Savings_LR"]].head(5)


Linear Regression — MAE vs Desired_Savings: 1159.26


Unnamed: 0,Income,Disposable_Income,Desired_Savings,Pred_Savings_LR
0,44637.249636,11265.627707,6200.537192,6664.96
1,26858.596592,9676.818733,1923.176434,2491.88
2,50367.605084,13891.450624,7050.360422,7742.26
3,101455.600247,31617.953615,16694.965136,16551.06
4,24875.283548,6265.700532,1874.099434,1468.63


In [5]:
# Cell 5 — Decision Tree: load, align features, predict, attach

import joblib
import numpy as np
import pandas as pd

# 1) Load DT model + its feature list
dt_model_path = "../models/savings_goal_dt.pkl"
dt_cols_path  = "../models/savings_goal_dt_columns.pkl"

dt_model = joblib.load(dt_model_path)
dt_features = joblib.load(dt_cols_path)
print("✅ Loaded Decision Tree")
print(f"DT expects {len(dt_features)} features")

# 2) Ensure engineered columns exist (same as we did for linear)
if "total_expenses" not in df.columns:
    expense_cols = ["Rent","Loan_Repayment","Insurance","Groceries","Transport",
                    "Eating_Out","Entertainment","Utilities","Healthcare","Education","Miscellaneous"]
    df["total_expenses"] = df[expense_cols].sum(axis=1)

if "estimated_savings" not in df.columns:
    df["estimated_savings"] = df["Income"] - df["total_expenses"]

# 3) One-hot encode and align to the DT's training feature set
df_dt_enc = pd.get_dummies(df, columns=["Occupation","City_Tier"])

# add any missing training columns as 0; keep exact order
for col in dt_features:
    if col not in df_dt_enc.columns:
        df_dt_enc[col] = 0

X_dt = df_dt_enc[dt_features]

print("Input shape for DT:", X_dt.shape)

# 4) Predict
dt_pred = dt_model.predict(X_dt)
dt_pred = np.maximum(dt_pred, 0).round(2)

df["Pred_Savings_DT"] = dt_pred

# 5) Quick evaluation vs ground truth (if available)
if "Desired_Savings" in df.columns:
    mae_dt = np.mean(np.abs(df["Pred_Savings_DT"] - df["Desired_Savings"]))
    print(f"Decision Tree — MAE vs Desired_Savings: {mae_dt:.2f}")

# 6) Preview
df[["Income","Disposable_Income","Desired_Savings","Pred_Savings_LR","Pred_Savings_DT"]].head(5)


✅ Loaded Decision Tree
DT expects 15 features
Input shape for DT: (20000, 15)
Decision Tree — MAE vs Desired_Savings: 4981.89


Unnamed: 0,Income,Disposable_Income,Desired_Savings,Pred_Savings_LR,Pred_Savings_DT
0,44637.249636,11265.627707,6200.537192,6664.96,1
1,26858.596592,9676.818733,1923.176434,2491.88,1
2,50367.605084,13891.450624,7050.360422,7742.26,1
3,101455.600247,31617.953615,16694.965136,16551.06,1
4,24875.283548,6265.700532,1874.099434,1468.63,1


In [6]:
# Cell 6 — XGBoost Regression: load, align features, predict, attach

import joblib
import numpy as np
import pandas as pd

# 1) Load XGB regression model + its feature list
xgb_model_path = "../models/savings_xgb_reg.pkl"
xgb_cols_path  = "../models/savings_xgb_reg_columns.pkl"

xgb_model = joblib.load(xgb_model_path)
xgb_features = joblib.load(xgb_cols_path)
print("✅ Loaded XGBoost Regression")
print(f"XGB expects {len(xgb_features)} features")

# 2) One-hot encode and align
df_xgb_enc = pd.get_dummies(df, columns=["Occupation","City_Tier"], dtype=int)

# add any missing training columns as 0; keep exact order
for col in xgb_features:
    if col not in df_xgb_enc.columns:
        df_xgb_enc[col] = 0

X_xgb = df_xgb_enc[xgb_features]
print("Input shape for XGB:", X_xgb.shape)

# 3) Predict
xgb_pred = xgb_model.predict(X_xgb)
xgb_pred = np.maximum(xgb_pred, 0).round(2)

df["Pred_Savings_XGB"] = xgb_pred

# 4) Quick evaluation vs ground truth (if available)
if "Desired_Savings" in df.columns:
    mae_xgb = np.mean(np.abs(df["Pred_Savings_XGB"] - df["Desired_Savings"]))
    print(f"XGBoost Regression — MAE vs Desired_Savings: {mae_xgb:.2f}")

# 5) Preview
df[["Income","Disposable_Income","Desired_Savings",
    "Pred_Savings_LR","Pred_Savings_DT","Pred_Savings_XGB"]].head(5)


✅ Loaded XGBoost Regression
XGB expects 25 features
Input shape for XGB: (20000, 25)
XGBoost Regression — MAE vs Desired_Savings: 53.42


Unnamed: 0,Income,Disposable_Income,Desired_Savings,Pred_Savings_LR,Pred_Savings_DT,Pred_Savings_XGB
0,44637.249636,11265.627707,6200.537192,6664.96,1,6270.790039
1,26858.596592,9676.818733,1923.176434,2491.88,1,1947.060059
2,50367.605084,13891.450624,7050.360422,7742.26,1,6974.390137
3,101455.600247,31617.953615,16694.965136,16551.06,1,16702.539062
4,24875.283548,6265.700532,1874.099434,1468.63,1,1862.880005


In [7]:
# Cell 7 — Clustering with model signal (XGBoost prediction)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd

# 0) sanity check
assert "Pred_Savings_XGB" in df.columns, "Run Cell 6 first to compute Pred_Savings_XGB."

# 1) choose features (same flavor as 05, but swap in model prediction)
cluster_features = [
    "Income",
    "Disposable_Income",
    "Pred_Savings_XGB",     # <-- model-driven signal
    "Groceries",
    "Transport",
    "Entertainment",
]

# drop any rows with NaNs in the features (should be none, but safe)
X = df[cluster_features].dropna().copy()

# 2) scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3) cluster (k=3 to match your earlier segments)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=20)
labels = kmeans.fit_predict(X_scaled)

# 4) attach back to df (align indices)
df.loc[X.index, "cluster_k3_model"] = labels.astype(int)

# 5) evaluate + profile (in original units)
sil = silhouette_score(X_scaled, labels)
sizes = pd.Series(labels).value_counts().sort_index()
profile_model = df.loc[X.index, cluster_features + ["cluster_k3_model"]].groupby("cluster_k3_model").mean().round(2)
profile_model["Count"] = sizes

print(f"KMeans (k=3) with XGB signal — silhouette: {sil:.3f}\n")
print("Cluster sizes:")
display(sizes.to_frame("count"))

print("\nCluster centers (original units):")
display(profile_model)


KMeans (k=3) with XGB signal — silhouette: 0.590

Cluster sizes:


Unnamed: 0,count
0,14677
1,4625
2,698



Cluster centers (original units):


Unnamed: 0_level_0,Income,Disposable_Income,Pred_Savings_XGB,Groceries,Transport,Entertainment,Count
cluster_k3_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,24404.17,6058.91,2022.589966,3055.32,1584.71,842.53,14677
1.0,74453.49,19242.19,9815.919922,9314.1,4840.53,2608.61,4625
2.0,185075.24,50179.89,34969.609375,23198.73,12096.15,6513.48,698


In [10]:
# Cell 8 - Compare 05A mock vs 05B model-driven clusters

import pandas as pd

# 1) Load mock clusters from 05A and rename the column
df_mock = pd.read_csv("../outputs/05_clustered_data.csv", usecols=["cluster_k3"]).rename(
    columns={"cluster_k3": "cluster_k3_mock"}
)

# 2) Join with current df (which has cluster_k3_model)
df_compare = df.join(df_mock, how="inner")  # index-based join; both are 20k rows in same order

# 3) Crosstab: counts
ct = pd.crosstab(df_compare["cluster_k3_mock"], df_compare["cluster_k3_model"])
print("Counts (rows = 05A mock, cols = 05B model-driven):")
display(ct)

# 4) Row-wise percentages (easier to read)
ct_pct = (ct.div(ct.sum(axis=1), axis=0) * 100).round(1)
print("Row % (how each 05A cluster maps into 05B):")
display(ct_pct)


Counts (rows = 05A mock, cols = 05B model-driven):


cluster_k3_model,0.0,1.0,2.0
cluster_k3_mock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,14571,0,0
1,0,20,698
2,106,4605,0


Row % (how each 05A cluster maps into 05B):


cluster_k3_model,0.0,1.0,2.0
cluster_k3_mock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,100.0,0.0,0.0
1,0.0,2.8,97.2
2,2.3,97.7,0.0


In [11]:
# Cell 9 — remap model clusters to match 05A personas and save 05B output

import pandas as pd

# mapping based on crosstab majority:
# mock 0 -> model 0
# mock 1 -> model 2
# mock 2 -> model 1
remap = {0: 0, 1: 2, 2: 1}

df["cluster_k3_model_remap"] = df["cluster_k3_model"].map(remap).astype("Int64")

# optional: add persona names consistent with 05A
persona = {0: "Budget-conscious majority", 1: "Comfortable middle", 2: "Affluent elite"}
df["persona_model"] = df["cluster_k3_model_remap"].map(persona)

# quick check: how many in each remapped cluster
print("Counts by remapped cluster:")
print(df["cluster_k3_model_remap"].value_counts().sort_index())

# save model-driven clustering output
out_path = "../outputs/05B_clustered_model.csv"
df.to_csv(out_path, index=False)
print(f"✅ Saved model-driven clusters (remapped) to: {out_path}")


Counts by remapped cluster:
cluster_k3_model_remap
0    14677
1      698
2     4625
Name: count, dtype: Int64
✅ Saved model-driven clusters (remapped) to: ../outputs/05B_clustered_model.csv


In [12]:
import joblib

# after fitting KMeans + scaler in 05B
joblib.dump(kmeans, "../models/kmeans_05B.pkl")
joblib.dump(scaler, "../models/scaler_05B.pkl")

print("✅ saved kmeans_05B.pkl and scaler_05B.pkl")


✅ saved kmeans_05B.pkl and scaler_05B.pkl
