# Layer 1: Exploratory Data Analysis

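This notebook explores the raw insurance claims dataset before any processing: data quality (missing values), distributions of key variables, and correlations between numeric features.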

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'whitegrid' style for the plots in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Read the raw claims CSV, report its dimensions, then preview the first rows
raw_csv_path = '../data/raw/insurance_claims.csv'
df = pd.read_csv(raw_csv_path)
print(f"Shape: {df.shape}")
df.head()

## Data Quality Overview

In [None]:
# Per-column null counts and the percentage of rows they represent
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Summary table, columns with the highest null percentage first
(
    missing.to_frame(name='Missing')
    .assign(Percentage=missing_pct)
    .sort_values(by='Percentage', ascending=False)
)

## Distribution of Key Variables

In [None]:
# 2x2 grid: three histograms plus a bar chart of fraud_reported counts
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# (column, grid position, panel title) for each histogram, in draw order
histogram_panels = [
    ('age', (0, 0), 'Age Distribution'),
    ('policy_annual_premium', (0, 1), 'Premium Distribution'),
    ('total_claim_amount', (1, 0), 'Claim Amount Distribution'),
]
for col_name, grid_pos, panel_title in histogram_panels:
    panel_ax = axes[grid_pos]
    df[col_name].hist(ax=panel_ax, bins=20)
    panel_ax.set_title(panel_title)

# fraud_reported is plotted as value counts rather than a histogram
fraud_ax = axes[1, 1]
df['fraud_reported'].value_counts().plot(kind='bar', ax=fraud_ax)
fraud_ax.set_title('Fraud Distribution')

plt.tight_layout()

## Correlation Analysis

In [None]:
# Pairwise correlations, restricted to the numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr = df.loc[:, numeric_cols].corr()

# Annotated heatmap on a diverging palette centred at zero, drawn on an
# explicitly created axes (separate names so the earlier fig/axes are kept)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Feature Correlations')