This notebook performs **formal statistical analysis** to:

* Quantify relationships between variables
* Test hypotheses about cost drivers
* Measure effect sizes and significance
* Provide statistically defensible conclusions

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Plot appearance: matplotlib style sheet, then the seaborn color palette
# (order kept as in the original cell).
plt.style.use("seaborn-v0_8")
sns.set_palette("Set2")

# Render floats in displayed pandas objects with four decimal places.
pd.options.display.float_format = "{:.4f}".format

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../data/raw/insurance.csv"

# Read the whole file into a single DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first five rows.
df.head(5)

In [None]:
# Column names grouped by type: numeric measurements vs. categorical labels.
numerical_cols = [
    "age",
    "bmi",
    "children",
    "charges",
]
categorical_cols = [
    "sex",
    "smoker",
    "region",
]

(numerical_cols, categorical_cols)

In [None]:
# Pairwise Pearson correlations, restricted to the numeric columns.
numeric_frame = df.loc[:, numerical_cols]
corr_matrix = numeric_frame.corr(method="pearson")

corr_matrix

In [None]:
# Correlations are bounded by [-1, 1]. Pinning the color scale to that range
# keeps the diverging "coolwarm" map centered on zero, so blue always means
# negative and red always means positive. With the auto-scaled default, the
# smallest correlation in the table is drawn as the deepest blue even when it
# is positive.
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",  # two decimals in every annotated cell
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)
plt.title("Pearson Correlation Matrix")
plt.show()

**H₀:** Mean charges are equal for smokers and non-smokers
<br>
**H₁:** Mean charges differ between smokers and non-smokers

In [None]:
# Split the charges column by smoking status.
is_smoker = df["smoker"] == "yes"
is_non_smoker = df["smoker"] == "no"
smokers = df.loc[is_smoker, "charges"]
non_smokers = df.loc[is_non_smoker, "charges"]

# equal_var=False requests Welch's t-test, which does not assume that the
# two groups share the same variance.
welch_result = stats.ttest_ind(smokers, non_smokers, equal_var=False)
t_stat, p_value = welch_result

(t_stat, p_value)

In [None]:
# Report the mean charge of each group, one per line, with thousands
# separators and two decimals.
print(
    f"Mean (Smokers): {smokers.mean():,.2f}",
    f"Mean (Non-Smokers): {non_smokers.mean():,.2f}",
    sep="\n",
)

In [None]:
def cohens_d(x, y):
    """Return Cohen's d for two independent samples using the pooled standard deviation.

    The effect size is ``(mean(x) - mean(y)) / s_pooled`` where ``s_pooled`` is
    the square root of the pooled *sample* variance (``ddof=1``).

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples (pandas Series, NumPy array, list, ...). Missing
        values (NaN) are dropped before any statistic is computed.

    Returns
    -------
    float
        Cohen's d; positive when ``x`` has the larger mean. NaN when either
        sample has fewer than two non-missing observations.
    """
    # Work on plain float arrays so the result does not depend on the input
    # container: pandas' .var() defaults to ddof=1 while NumPy's defaults to
    # ddof=0, and the pooled formula below requires ddof=1.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Drop missing values so the sample sizes match the data actually used
    # for the means and variances.
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance is undefined for fewer than two observations.
        return float("nan")

    pooled_var = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / (nx + ny - 2)
    return (x.mean() - y.mean()) / np.sqrt(pooled_var)

# Effect size of smoking on charges (smokers relative to non-smokers).
cohens_d(x=smokers, y=non_smokers)

**Effect size (Cohen’s d)** — conventional rule-of-thumb thresholds for |d|:

* 0.2 = small
* 0.5 = medium
* 0.8+ = large

In [None]:
# One-way ANOVA: one sample of charges per distinct region value, in order
# of first appearance in the data.
region_samples = [
    df.loc[df["region"] == region_name, "charges"]
    for region_name in df["region"].unique()
]
anova_result = stats.f_oneway(*region_samples)

anova_result

In [None]:
# Tukey HSD post-hoc test: pairwise comparison of mean charges between
# regions at the alpha=0.05 significance level.
# `pairwise_tukeyhsd` is imported in the notebook's top import cell so that
# all dependencies are declared in one place.
tukey = pairwise_tukeyhsd(
    endog=df["charges"],
    groups=df["region"],
    alpha=0.05
)

# Show the results table as the cell output (rich display) rather than
# printing its plain-text form, matching how model.summary() is shown.
tukey.summary()

In [None]:
# Cross-tabulate observation counts: rows are sex, columns are smoker status.
contingency = pd.crosstab(index=df["sex"], columns=df["smoker"])

contingency

In [None]:
# Chi-square test of independence on the sex x smoker contingency table.
chi2_outcome = stats.chi2_contingency(contingency)

# Unpack: test statistic, p-value, degrees of freedom, expected frequencies.
chi2, p, dof, expected = chi2_outcome

(chi2, p)

In [None]:
# Multiple linear regression of charges on all predictors; C(...) marks a
# term as categorical in the formula.
ols_formula = "charges ~ age + bmi + children + C(sex) + C(smoker) + C(region)"
ols_spec = smf.ols(formula=ols_formula, data=df)
model = ols_spec.fit()

model.summary()

In [None]:
# Distribution of the OLS residuals, with a kernel density estimate overlaid.
sns.histplot(model.resid, kde=True)
# The residual series is unnamed, so label the x-axis explicitly; otherwise
# the figure does not say what is being counted.
plt.xlabel("Residual")
plt.ylabel("Count")
plt.title("Residual Distribution")
plt.show()

In [None]:
# Residuals against fitted values; the dashed red line marks zero residual.
sns.scatterplot(x=model.fittedvalues, y=model.resid)
plt.axhline(0, color="red", linestyle="--")
# Both plotted series are unnamed, so the axes must be labelled explicitly.
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.title("Residuals vs Fitted Values")
plt.show()

In [None]:
# Only the R-squared entry is computed from the fitted model; the remaining
# entries are hand-written findings.
r_squared = round(model.rsquared, 3)

stat_summary = {
    "Strongest Cost Driver": "Smoking Status",
    "BMI Impact": "Moderate, significant",
    "Age Impact": "Significant, linear",
    "Gender Effect": "Small / mixed",
    "Region Effect": "Statistically weak",
    "Model R-squared": r_squared,
}

# One row per finding, in a single "Finding" column.
summary_table = pd.DataFrame.from_dict(
    stat_summary,
    orient="index",
    columns=["Finding"],
)
summary_table

## **Key Conclusions**

* Smoking has an impact that is **both statistically and economically significant**
* BMI and age are consistent secondary predictors
* Regional and gender effects are marginal
* Classical OLS assumptions are partially violated → motivates ML models
