In [None]:
# === IMPORT LIBRARIES ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# === LOAD CLEANED DATA ===
# The path is relative, so it resolves against the current working directory.
insurance_df = pd.read_csv('../data/insurance_cleaned.csv')

# === 1. BASIC DATA INSPECTION ===
# Print the shape, render the head of the frame, then list per-column
# dtypes and per-column counts of missing values.
print("=== Dataset Overview ===")
print(f"Shape: {insurance_df.shape}\n")

print("First 5 rows:")
display(insurance_df.head())

for heading, summary in (
    ("\nData types:", insurance_df.dtypes),
    ("\nMissing values:", insurance_df.isna().sum()),
):
    print(heading)
    print(summary)

# === 2. UNIVARIATE ANALYSIS ===
# Numeric features: one histogram with a KDE overlay per column, laid out
# on a 2x2 grid (axes are visited row by row to match the column order).
numeric_cols = ['age', 'bmi', 'children', 'charges']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, col in zip(axes.flat, numeric_cols):
    sns.histplot(insurance_df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
plt.show()

# Categorical features: one count plot per column, side by side.
cat_cols = ['sex', 'smoker', 'region']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, col in zip(axes, cat_cols):
    sns.countplot(x=col, data=insurance_df, ax=ax)
    ax.set_title(f'Count by {col}')
fig.tight_layout()
plt.show()

# === 3. BIVARIATE ANALYSIS ===
# Charges vs categorical features: one box plot of 'charges' per
# categorical column, drawn on a single row of three axes.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, col in zip(axes, cat_cols):
    sns.boxplot(x=col, y='charges', data=insurance_df, ax=ax)
    ax.set_title(f'Charges by {col}')
fig.tight_layout()
plt.show()

# Scatter plots for numeric features: pairwise grid with KDE curves on
# the diagonal. pairplot builds its own figure, so none is created here.
numeric_view = insurance_df[numeric_cols]
sns.pairplot(numeric_view, diag_kind='kde')
plt.show()

# === 4. CORRELATION ANALYSIS ===
# Pairwise correlations restricted to the numeric columns, drawn as an
# annotated heatmap whose diverging colormap is centered on zero.
corr_matrix = insurance_df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# === 5. OUTLIER DETECTION ===
# One horizontal box plot per column; points drawn beyond the whiskers
# are the candidates to inspect as outliers.
outlier_cols = ['age', 'bmi', 'charges']
fig, axes = plt.subplots(1, len(outlier_cols), figsize=(15, 5))
for ax, col in zip(axes, outlier_cols):
    sns.boxplot(x=insurance_df[col], ax=ax)
    ax.set_title(f'Outliers in {col}')
fig.tight_layout()
plt.show()

# === 6. ADVANCED ANALYSIS ===
# Age vs Charges and BMI vs Charges, each colored by smoker status.
# Every (feature, title) pair gets its own figure, shown before the next
# one is created.
for feature, title in (
    ('age', 'Age vs Charges by Smoking Status'),
    ('bmi', 'BMI vs Charges by Smoking Status'),
):
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=insurance_df, x=feature, y='charges', hue='smoker', alpha=0.7)
    plt.title(title)
    plt.show()