In [2]:
#### Improved end-to-end script for Healthcare Cost & Insurance Pricing
#### - fixes GridSearch for XGBoost
#### - uses Pipelines & ColumnTransformer
#### - tries log-transform of target
#### - compares Linear, DecisionTree, RandomForest, XGBoost
#### - KMeans-based risk segmentation and data-driven premium multipliers
#### - exports CSVs + saves plots for Power BI

# Core Python libraries
import os
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn (ML pipeline)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans

# Business impact simulation
import random

# Version check (to fix OneHotEncoder sparse vs sparse_output issue)
import sklearn
from packaging import version

# User-specific paths
# ---------------------------
DATA_PATH = r"C:\Users\HP\Downloads\Yfinance\1_intro_portfolio_analysis\code\insurance.csv"
BASE_DIR = os.path.dirname(DATA_PATH)
OUT_DIR = os.path.join(BASE_DIR, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

print("DATA_PATH:", DATA_PATH)
print("OUT_DIR:", OUT_DIR)


DATA_PATH: C:\Users\HP\Downloads\Yfinance\1_intro_portfolio_analysis\code\insurance.csv
OUT_DIR: C:\Users\HP\Downloads\Yfinance\1_intro_portfolio_analysis\code\outputs


In [3]:
# Step 0: Load data
# ---------------------------
df = pd.read_csv(DATA_PATH)
print("Loaded rows:", df.shape[0])
df = df.drop_duplicates().reset_index(drop=True)
print("After dropping duplicates:", df.shape[0])

Loaded rows: 1338
After dropping duplicates: 1337


In [4]:
# Step 1: Quick EDA snapshots
# ---------------------------
print(df.describe(include='all').T[['count','mean','std','min','50%','75%','max']])

# Distribution of target
plt.figure(figsize=(6,4))
plt.hist(df['charges'], bins=40)
plt.title('Charges Distribution (raw)')
plt.xlabel('charges')
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "charges_hist_raw.png"))
plt.close()

# Log-transform distribution (diagnostic)
plt.figure(figsize=(6,4))
plt.hist(np.log1p(df['charges']), bins=40)
plt.title('Charges Distribution (log1p)')
plt.xlabel('log1p(charges)')
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "charges_hist_log.png"))
plt.close()

           count          mean           std        min        50%  \
age       1337.0     39.222139     14.044333       18.0       39.0   
sex         1337           NaN           NaN        NaN        NaN   
bmi       1337.0     30.663452      6.100468      15.96       30.4   
children  1337.0      1.095737      1.205571        0.0        1.0   
smoker      1337           NaN           NaN        NaN        NaN   
region      1337           NaN           NaN        NaN        NaN   
charges   1337.0  13279.121487  12110.359656  1121.8739  9386.1613   

                  75%          max  
age              51.0         64.0  
sex               NaN          NaN  
bmi              34.7        53.13  
children          2.0          5.0  
smoker            NaN          NaN  
region            NaN          NaN  
charges   16657.71745  63770.42801  


In [5]:
# Step 2: Feature/target split
# ---------------------------
X = df.drop(columns=['charges'])
y = df['charges'].values

# define categorical and numerical columns
cat_cols = ['sex','smoker','region']
num_cols = ['age','bmi','children']

In [6]:
# Step 3: Preprocessing pipelines
# ---------------------------
cat_transformer = OneHotEncoder(drop='first', sparse_output=False)
num_transformer = StandardScaler()

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
], remainder='drop')

pipe_lr = Pipeline([
    ('preproc', preprocessor),
    ('est', LinearRegression())
])

pipe_rf = Pipeline([
    ('preproc', preprocessor),
    ('est', RandomForestRegressor(random_state=42, n_jobs=-1))
])

pipe_dt = Pipeline([
    ('preproc', preprocessor),
    ('est', DecisionTreeRegressor(random_state=42))
])

pipe_xgb = Pipeline([
    ('preproc', preprocessor),
    ('est', XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror'))
])


In [7]:
# Step 4: Train/test split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# ---------------------------
# Helper: metrics function
# ---------------------------
def metrics_report(y_true, y_pred, label="Model"):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{label}: MAE={mae:.1f}, RMSE={rmse:.1f}, R2={r2:.4f}")
    return {'mae': mae, 'rmse': rmse, 'r2': r2}

In [8]:
# Step 5: Baseline Linear Regression (raw target)
# ---------------------------
pipe_lr.fit(X_train, y_train)
y_pred_lr = pipe_lr.predict(X_test)
m_lr = metrics_report(y_test, y_pred_lr, "LinearRegression (raw)")

LinearRegression (raw): MAE=4177.0, RMSE=5956.3, R2=0.8069


In [9]:
# Step 6: Tree-based models with GridSearch
# ---------------------------
# Decision Tree
dt_params = {'est__max_depth': [3,5,7,9, None],
             'est__min_samples_leaf': [1,5,10]}
grid_dt = GridSearchCV(pipe_dt, dt_params, cv=5, scoring='r2', n_jobs=-1)
grid_dt.fit(X_train, y_train)
best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
m_dt = metrics_report(y_test, y_pred_dt, f"DecisionTree (best)")

# Random Forest
rf_params = {'est__n_estimators':[100,300],
             'est__max_depth':[6,10,None],
             'est__min_samples_leaf':[1,3,6]}
grid_rf = GridSearchCV(pipe_rf, rf_params, cv=4, scoring='r2', n_jobs=-1)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
m_rf = metrics_report(y_test, y_pred_rf, f"RandomForest (best)")

# XGBoost
xgb_params = {
    'est__n_estimators': [100,300],
    'est__max_depth': [3,5],
    'est__learning_rate': [0.1, 0.05]
}
grid_xgb = GridSearchCV(pipe_xgb, xgb_params, cv=4, scoring='r2', n_jobs=-1)
grid_xgb.fit(X_train, y_train)
best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
m_xgb = metrics_report(y_test, y_pred_xgb, f"XGBRegressor (best)")


DecisionTree (best): MAE=2556.0, RMSE=4296.2, R2=0.8996
RandomForest (best): MAE=2399.5, RMSE=4246.0, R2=0.9019
XGBRegressor (best): MAE=2463.8, RMSE=4232.5, R2=0.9025


In [10]:
# Step 7: Try log1p target (stabilize skew)
# ---------------------------
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Linear on log-target
pipe_lr_log = Pipeline([
    ('preproc', preprocessor),
    ('est', LinearRegression())
])
pipe_lr_log.fit(X_train, y_train_log)
y_pred_lr_log = pipe_lr_log.predict(X_test)
y_pred_lr_log_inv = np.expm1(y_pred_lr_log)
m_lr_log = metrics_report(y_test, y_pred_lr_log_inv, "LinearRegression (log-target)")

# XGBoost on log-target
pipe_xgb_log = Pipeline([
    ('preproc', preprocessor),
    ('est', XGBRegressor(random_state=42, objective='reg:squarederror', n_jobs=-1))
])
grid_xgb_log = GridSearchCV(pipe_xgb_log, xgb_params, cv=4, scoring='r2', n_jobs=-1)
grid_xgb_log.fit(X_train, y_train_log)
best_xgb_log = grid_xgb_log.best_estimator_
y_pred_xgb_log = best_xgb_log.predict(X_test)
y_pred_xgb_log_inv = np.expm1(y_pred_xgb_log)
m_xgb_log = metrics_report(y_test, y_pred_xgb_log_inv, "XGBRegressor (log-target)")

LinearRegression (log-target): MAE=3755.9, RMSE=7197.0, R2=0.7181
XGBRegressor (log-target): MAE=2038.5, RMSE=4355.4, R2=0.8968


In [12]:
# Step 8: Summarize model comparison
# ---------------------------
results = pd.DataFrame({
    'model': ['Linear_raw','DecisionTree','RandomForest','XGB_raw','Linear_log','XGB_log'],
    'mae': [m_lr['mae'], m_dt['mae'], m_rf['mae'], m_xgb['mae'], m_lr_log['mae'], m_xgb_log['mae']],
    'rmse':[m_lr['rmse'], m_dt['rmse'], m_rf['rmse'], m_xgb['rmse'], m_lr_log['rmse'], m_xgb_log['rmse']],
    'r2':[m_lr['r2'], m_dt['r2'], m_rf['r2'], m_xgb['r2'], m_lr_log['r2'], m_xgb_log['r2']]
})
print("\nModel comparison on test set:")
print(results.sort_values('rmse'))

results.to_csv(os.path.join(OUT_DIR, "model_comparison.csv"), index=False)


Model comparison on test set:
          model          mae         rmse        r2
3       XGB_raw  2463.770826  4232.451958  0.902514
2  RandomForest  2399.533283  4245.974394  0.901890
1  DecisionTree  2556.031340  4296.198087  0.899555
5       XGB_log  2038.498131  4355.418965  0.896767
0    Linear_raw  4177.045561  5956.342894  0.806929
4    Linear_log  3755.924576  7197.032607  0.718119


In [13]:
# Step 9: Feature importances (from chosen tree-based model)
# ---------------------------
# Determine best model by RMSE
best_row = results.loc[results['rmse'].idxmin()]
print("\nBest model by RMSE:", best_row['model'])

# Fit preprocessor on full X to get feature names
preproc = preprocessor.fit(X)
num_names = num_cols
cat_ohe = preproc.named_transformers_['cat']
cat_names = cat_ohe.get_feature_names_out(cat_cols).tolist()
all_feature_names = num_names + cat_names

# Pick chosen estimator object
if best_row['model'] in ('XGB_raw','XGB_log'):
    chosen = best_xgb if best_row['model']=='XGB_raw' else best_xgb_log
elif best_row['model']=='RandomForest':
    chosen = best_rf
elif best_row['model']=='DecisionTree':
    chosen = best_dt
else:
    chosen = best_rf  # fallback

if hasattr(chosen.named_steps['est'], 'feature_importances_'):
    imp = chosen.named_steps['est'].feature_importances_
    feat_imp = pd.DataFrame({'feature': all_feature_names, 'importance': imp})
    feat_imp = feat_imp.sort_values('importance', ascending=False)
    feat_imp.to_csv(os.path.join(OUT_DIR, "feature_importances.csv"), index=False)
    print("\nTop features:\n", feat_imp.head(10))
else:
    print("Chosen model has no feature_importances_ attribute.")


Best model by RMSE: XGB_raw

Top features:
             feature  importance
4        smoker_yes    0.830691
1               bmi    0.099102
0               age    0.044029
2          children    0.010777
3          sex_male    0.004733
7  region_southwest    0.004719
5  region_northwest    0.003636
6  region_southeast    0.002313


In [14]:
# Step 10: Risk segmentation (KMeans) + premium multipliers
# ---------------------------
df_seg = df.copy()
# ensure smoker is binary
if df_seg['smoker'].dtype == 'object':
    df_seg['smoker_bin'] = df_seg['smoker'].map({'yes':1,'no':0})
else:
    df_seg['smoker_bin'] = df_seg['smoker']

seg_features = df_seg[['age','bmi','smoker_bin']].fillna(0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
seg_scaled = sc.fit_transform(seg_features)

kmeans = KMeans(n_clusters=3, random_state=42)
df_seg['risk_cluster'] = kmeans.fit_predict(seg_scaled)

cluster_summary = df_seg.groupby('risk_cluster').agg(
    n_customers = ('charges','count'),
    avg_charges = ('charges','mean'),
    median_charges = ('charges','median')
).reset_index().sort_values('avg_charges', ascending=False)

# Label clusters High/Medium/Low by sorted avg_charges
labels = ['High','Medium','Low'][:len(cluster_summary)]
cluster_summary['label'] = labels
label_map = dict(zip(cluster_summary['risk_cluster'], cluster_summary['label']))
df_seg['risk_label'] = df_seg['risk_cluster'].map(label_map)

low_avg = cluster_summary[cluster_summary['label']=='Low']['avg_charges'].values[0]
cluster_summary['multiplier_vs_low'] = cluster_summary['avg_charges'] / low_avg

BASE_PREMIUM = 20000
cluster_summary['suggested_premium'] = (BASE_PREMIUM * cluster_summary['multiplier_vs_low']).round(0).astype(int)

cluster_summary.to_csv(os.path.join(OUT_DIR, "cluster_premium_suggestion.csv"), index=False)
df_seg.to_csv(os.path.join(OUT_DIR, "insurance_with_risk_and_preds.csv"), index=False)


In [15]:
# Step 11: Save plots
# ---------------------------
# Actual vs Pred (best model)
if best_row['model']=='XGB_log':
    best_preds = y_pred_xgb_log_inv
elif best_row['model']=='XGB_raw':
    best_preds = y_pred_xgb
elif best_row['model']=='RandomForest':
    best_preds = y_pred_rf
elif best_row['model']=='DecisionTree':
    best_preds = y_pred_dt
else:
    best_preds = y_pred_lr

plt.figure(figsize=(6,6))
plt.scatter(y_test, best_preds, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r')
plt.xlabel('Actual charges')
plt.ylabel('Predicted charges')
plt.title('Actual vs Predicted (best model)')
plt.savefig(os.path.join(OUT_DIR, "actual_vs_pred_best.png"))
plt.close()

# Risk label counts
plt.figure(figsize=(6,4))
df_seg['risk_label'].value_counts().plot(kind='bar')
plt.title("Count by Risk Label")
plt.xlabel("Risk Label")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "risk_label_counts.png"))
plt.close()

print("\nAll artifacts saved to:", OUT_DIR)
print("Files include: model_comparison.csv, feature_importances.csv (if available), cluster_premium_suggestion.csv, insurance_with_risk_and_preds.csv, and plots.")


All artifacts saved to: C:\Users\HP\Downloads\Yfinance\1_intro_portfolio_analysis\code\outputs
Files include: model_comparison.csv, feature_importances.csv (if available), cluster_premium_suggestion.csv, insurance_with_risk_and_preds.csv, and plots.


In [16]:
# Load
# ----------------------------
df = pd.read_csv(DATA_PATH).drop_duplicates().reset_index(drop=True)
print("Rows:", len(df))

Rows: 1337


In [17]:
# Features / Target
# ----------------------------
target = "charges"
X = df.drop(columns=[target])
y = df[target].values

num_cols = ['age','bmi','children']
cat_cols = ['sex','smoker','region']

# OneHotEncoder compatibility
if version.parse(sklearn.__version__) >= version.parse("1.2"):
    cat_encoder = OneHotEncoder(drop='first', sparse_output=False)
else:
    cat_encoder = OneHotEncoder(drop='first', sparse=False)

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', cat_encoder, cat_cols)
])


In [18]:
# Train/test split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# ----------------------------
# Baseline: Linear Regression
# ----------------------------
pipe_lr = Pipeline([('pre', preprocessor), ('lr', LinearRegression())])
pipe_lr.fit(X_train, y_train)
y_pred_lr = pipe_lr.predict(X_test)

In [19]:
# Stronger model: tuned XGBoost (typical go-to)
# ----------------------------
pipe_xgb = Pipeline([('pre', preprocessor),
                     ('xgb', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1))])
xgb_params = {
    'xgb__n_estimators': [100, 300],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.1, 0.05]
}
grid = GridSearchCV(pipe_xgb, xgb_params, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=0)
grid.fit(X_train, y_train)
best_xgb = grid.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# ----------------------------
# Baseline metrics
# ----------------------------
def metrics(y_true, y_pred):
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred)
    }

m_lr = metrics(y_test, y_pred_lr)
m_xgb = metrics(y_test, y_pred_xgb)

print("Baseline Linear MAE:", m_lr['mae'], "XGB MAE:", m_xgb['mae'])


Baseline Linear MAE: 4177.045561036326 XGB MAE: 2463.7708259092815


In [20]:
# Simulate 10% improvement scenario
# Option A: if XGB provides ~10% MAE improvement naturally, use it.
# Option B: simulate an improvement by shrinking residuals by 10% from baseline.
# We'll compute both and show results.
# ----------------------------
# Prepare test dataframe for reporting
X_test_idx = X_test.reset_index(drop=True).copy()
test_df = X_test_idx.copy()
test_df['actual'] = y_test
test_df['pred_baseline'] = y_pred_lr  # linear baseline
test_df['pred_xgb'] = y_pred_xgb

# Residuals baseline
test_df['resid_baseline'] = test_df['actual'] - test_df['pred_baseline']

# Simulated improved predictions by shrinking baseline residuals by 10%
shrink_factor = 0.9  # residual retained; 10% improvement => residual becomes 0.9*resid
test_df['pred_sim10pct'] = test_df['actual'] - shrink_factor * test_df['resid_baseline']
# Note: if resid_baseline = actual - pred, then pred_sim = actual - 0.9*resid = pred + 0.1*resid -> closer by 10%

# Another simulation: 10% improvement on MAE applied to XGB (if you want)
# compute XGB residuals and shrink them similarly
test_df['resid_xgb'] = test_df['actual'] - test_df['pred_xgb']
test_df['pred_xgb_sim10'] = test_df['actual'] - 0.9 * test_df['resid_xgb']


In [21]:
# Pricing & financial simulation parameters
# ----------------------------
LOADING = 0.20  # 20% loading on predicted claims => premium = pred * (1 + LOADING)
BASE_PREMIUM_FACTOR = 1.0 + LOADING

def compute_financials(df_preds_col):
    """
    Input: df with columns 'actual' and a predictions column name string
    Returns aggregated financial metrics and per-policy series
    """
    pred = df_preds_col
    premiums = test_df[pred] * BASE_PREMIUM_FACTOR
    claims = test_df['actual']
    profit_per_policy = premiums - claims
    total_premiums = premiums.sum()
    total_claims = claims.sum()
    total_profit = profit_per_policy.sum()
    loss_ratio = total_claims / total_premiums
    return {
        'total_premiums': total_premiums,
        'total_claims': total_claims,
        'total_profit': total_profit,
        'loss_ratio': loss_ratio,
        'per_policy': pd.DataFrame({
            'pred': test_df[pred],
            'premium': premiums,
            'claim': claims,
            'profit': profit_per_policy
        })
    }

# Compute for scenarios
scenarios = {}
scenarios['baseline_lr'] = compute_financials('pred_baseline')
scenarios['xgb'] = compute_financials('pred_xgb')
scenarios['sim10_from_lr'] = compute_financials('pred_sim10pct')
scenarios['sim10_from_xgb'] = compute_financials('pred_xgb_sim10')





In [22]:
# Summarize results (per-scenario)
# ----------------------------
summary_rows = []
for name, s in scenarios.items():
    summary_rows.append({
        'scenario': name,
        'total_premiums': s['total_premiums'],
        'total_claims': s['total_claims'],
        'total_profit': s['total_profit'],
        'loss_ratio': s['loss_ratio'],
        'mae': mean_absolute_error(test_df['actual'], test_df['pred_baseline'] if name=='baseline_lr' else (test_df['pred_xgb'] if name=='xgb' else (test_df['pred_sim10pct'] if name=='sim10_from_lr' else test_df['pred_xgb_sim10'])))
    })
summary = pd.DataFrame(summary_rows)
summary.to_csv(os.path.join(OUT_DIR, "financial_simulation_summary.csv"), index=False)
print("\nFinancial summary saved to outputs. Summary:\n", summary)

# ----------------------------
# Quantify impact of 10% MAE improvement
# e.g., improvement relative to baseline_lr:
# ----------------------------
base = scenarios['baseline_lr']
improved = scenarios['sim10_from_lr']
delta_profit = improved['total_profit'] - base['total_profit']
delta_loss_ratio = base['loss_ratio'] - improved['loss_ratio']  # positive means loss ratio reduced
print("\nImpact of simulated 10% MAE improvement (from baseline LR):")
print(f" Δ total_profit (improved - baseline) = {delta_profit:,.2f}")
print(f" Δ loss_ratio (baseline - improved) = {delta_loss_ratio:.4f} (reduction in loss ratio)")

# Scale to a portfolio of 10000 policies (if this test sample is representative)
scale_factor = 10000 / len(test_df)
print("\nScaling to 10,000 policies (approx):")
print("Estimated Δ profit for 10,000 policies:", delta_profit * scale_factor)
print("Estimated Δ loss ratio improvement (absolute):", delta_loss_ratio)


Financial summary saved to outputs. Summary:
          scenario  total_premiums  total_claims   total_profit  loss_ratio  \
0     baseline_lr    4.453115e+06  3.824898e+06  628216.955636    0.858926   
1             xgb    4.615802e+06  3.824898e+06  790904.363515    0.828653   
2   sim10_from_lr    4.466791e+06  3.824898e+06  641893.220590    0.856297   
3  sim10_from_xgb    4.613210e+06  3.824898e+06  788311.711065    0.829119   

           mae  
0  4177.045561  
1  2463.770826  
2  3759.341005  
3  2217.393743  

Impact of simulated 10% MAE improvement (from baseline LR):
 Δ total_profit (improved - baseline) = 13,676.26
 Δ loss_ratio (baseline - improved) = 0.0026 (reduction in loss ratio)

Scaling to 10,000 policies (approx):
Estimated Δ profit for 10,000 policies: 510308.39381190436
Estimated Δ loss ratio improvement (absolute): 0.00262983076504264


In [23]:
# Export Power BI–ready files
# ----------------------------
# Merge per-policy columns into a single table for Power BI
output_table = test_df.copy()
output_table['premium_baseline'] = output_table['pred_baseline'] * BASE_PREMIUM_FACTOR
output_table['premium_xgb'] = output_table['pred_xgb'] * BASE_PREMIUM_FACTOR
output_table['premium_sim10'] = output_table['pred_sim10pct'] * BASE_PREMIUM_FACTOR
output_table['profit_baseline'] = output_table['premium_baseline'] - output_table['actual']
output_table['profit_xgb'] = output_table['premium_xgb'] - output_table['actual']
output_table['profit_sim10'] = output_table['premium_sim10'] - output_table['actual']

output_table.to_csv(os.path.join(OUT_DIR, "powerbi_insurance_pricing_table.csv"), index=False)
print("Per-policy table exported for Power BI.")

# Save a few diagnostic plots
plt.figure(figsize=(6,4))
plt.hist(test_df['actual'] - test_df['pred_baseline'], bins=40)
plt.title("Residuals: actual - baseline_pred")
plt.savefig(os.path.join(OUT_DIR, "residuals_baseline_hist.png"))
plt.close()

plt.figure(figsize=(6,4))
plt.hist(test_df['actual'] - test_df['pred_sim10pct'], bins=40)
plt.title("Residuals after 10% shrink (simulated)")
plt.savefig(os.path.join(OUT_DIR, "residuals_sim10_hist.png"))
plt.close()

plt.figure(figsize=(6,4))
plt.scatter(test_df['pred_baseline'], test_df['actual'], alpha=0.5, label='baseline')
plt.scatter(test_df['pred_xgb'], test_df['actual'], alpha=0.5, label='xgb')
plt.legend()
plt.title("Predicted vs Actual (baseline vs xgb)")
plt.savefig(os.path.join(OUT_DIR, "pred_vs_actual_compare.png"))
plt.close()

print("Plots saved to outputs. Done.")

Per-policy table exported for Power BI.


Plots saved to outputs. Done.


In [28]:
# ================================
# Step 9: Predict charges with user input
# ================================

# Refit best model (let's assume XGBoost performed best)
best_model = pipe_xgb.fit(X, y)

# Function to predict insurance charges
def predict_charges_userinput():
    """
    Ask user for input and predict insurance charges.
    """
    # Take inputs from user
    age = int(input("Enter Age: "))
    bmi = float(input("Enter BMI: "))
    children = int(input("Enter number of Children: "))
    sex = input("Enter Sex (male/female): ").lower()
    smoker = input("Smoker? (yes/no): ").lower()
    region = input("Enter Region (northeast/northwest/southeast/southwest): ").lower()
    
    # Create dataframe
    input_data = pd.DataFrame([{
        "age": age,
        "bmi": bmi,
        "children": children,
        "sex": sex,
        "smoker": smoker,
        "region": region
    }])
    
    # Predict charges
    prediction = best_model.predict(input_data)[0]
    print("\nPredicted Insurance Charges: $", round(prediction, 2))

# Run the function
predict_charges_userinput()


Predicted Insurance Charges: $ 44593.98


In [None]:
#find weight or importance of inputs