This notebook performs **initial exploratory data analysis (EDA)** to understand:

* Dataset structure and schema
* Data quality (missing values, duplicates)
* Variable distributions
* Outliers and skewness
* Early signals of cost drivers


In [None]:
# Location of the raw insurance CSV, relative to this notebook.
# Update path if running in Colab.
# NOTE(review): `pd` must already be imported by an earlier cell; no import
# is visible in this part of the notebook, so confirm before Run All.
DATA_PATH = "../data/raw/insurance.csv"

# Read the whole file into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(5)

In [None]:
# Report the dataset dimensions: number of records, then number of columns.
n_rows, n_cols = df.shape
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")

In [None]:
# Print the schema summary: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Show the column labels so later cells can reference them by exact name.
df.columns

In [None]:
# Count missing entries per column (a column total of 0 means it is complete).
df.isna().sum()

In [None]:
# Visualise where values are missing: one cell per (row, column) entry,
# shaded by whether that entry is null. The colour bar is omitted because
# the mask is boolean.
missing_mask = df.isna()
ax = sns.heatmap(missing_mask, cbar=False)
ax.set_title("Missing Value Heatmap")
plt.show()

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated().sum()

In [None]:
# Display the repeated rows themselves so they can be inspected before
# deciding whether to drop them.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Distribution of policyholder age: 30-bin histogram with a KDE overlay.
ax = sns.histplot(data=df, x="age", bins=30, kde=True)
ax.set_title("Age Distribution")
plt.show()

In [None]:
# Distribution of BMI: 30-bin histogram with a KDE overlay.
ax = sns.histplot(data=df, x="bmi", bins=30, kde=True)
ax.set_title("BMI Distribution")
plt.show()

In [None]:
# Distribution of the target variable. More bins (50) than the other
# histograms are used to resolve the tail of the charges distribution.
ax = sns.histplot(data=df, x="charges", bins=50, kde=True)
ax.set_title("Insurance Charges Distribution")
plt.show()

In [None]:
# Boxplot of the target variable to expose its spread and extreme values.
sns.boxplot(x=df["charges"])
# The title previously contained mojibake ("â€“"): a UTF-8 en dash that had
# been decoded as cp1252. The escape sequence keeps the source ASCII-only so
# the dash cannot be corrupted by another encoding round-trip.
plt.title("Insurance Charges \u2013 Boxplot")
plt.show()

In [None]:
# Number of records per value of the `sex` column.
ax = sns.countplot(data=df, x="sex")
ax.set_title("Gender Distribution")
plt.show()

In [None]:
# Number of records per value of the `smoker` column.
ax = sns.countplot(data=df, x="smoker")
ax.set_title("Smoker vs Non-Smoker")
plt.show()

In [None]:
# Number of records per value of the `region` column.
ax = sns.countplot(data=df, x="region")
ax.set_title("Regional Distribution")
plt.show()

In [None]:
# Compare the distribution of charges between the `smoker` groups.
ax = sns.boxplot(data=df, x="smoker", y="charges")
ax.set_title("Insurance Charges by Smoking Status")
plt.show()

In [None]:
# Compare the distribution of charges between the `sex` groups.
ax = sns.boxplot(data=df, x="sex", y="charges")
ax.set_title("Insurance Charges by Gender")
plt.show()

In [None]:
# Compare the distribution of charges across the `region` groups.
ax = sns.boxplot(data=df, x="region", y="charges")
ax.set_title("Insurance Charges by Region")
plt.show()

In [None]:
# Flag outliers in `charges` with the 1.5 * IQR rule: a value is an outlier
# when it lies more than 1.5 interquartile ranges below the first quartile
# or above the third quartile.
charges = df["charges"]

Q1 = charges.quantile(0.25)
Q3 = charges.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Strict inequalities: values exactly on a fence are not counted as outliers.
is_outlier = (charges < lower_fence) | (charges > upper_fence)
outliers = df[is_outlier]

# Number of flagged records.
len(outliers)

In [None]:
# Rule-of-thumb cutoff: an absolute sample skewness above 1 is commonly
# treated as "highly skewed".
SKEW_THRESHOLD = 1.0
charges_skew = df["charges"].skew()

# Headline findings of the EDA, collected into a one-column table.
# Values are computed from the data where possible so the summary cannot
# silently disagree with the dataset that was actually loaded.
eda_summary = {
    "Total Records": df.shape[0],
    "No Missing Values": df.isnull().sum().sum() == 0,
    # Derived from the sample skewness of `charges` instead of a hard-coded True.
    "Highly Skewed Target": bool(abs(charges_skew) > SKEW_THRESHOLD),
    # Qualitative judgement from the charges-by-smoker boxplot; not computed here.
    "Major Cost Driver (Initial)": "Smoking Status",
    # Count of records flagged by the 1.5 * IQR rule (uses `outliers` from the
    # outlier-detection cell, which must run first).
    "Outliers Present": outliers.shape[0]
}

pd.DataFrame.from_dict(eda_summary, orient="index", columns=["Value"])

**Key Takeaways**

* Insurance charges are **non-normally distributed**
* Smoking status shows a **clear structural break in costs**
* BMI and age likely have **nonlinear effects**
* Dataset quality is high and suitable for advanced modeling