# 00: Exploratory Data Analysis

This notebook provides an initial exploration of the auto insurance dataset to:
- Understand data quality and completeness
- Identify demographic distributions and potential bias patterns
- Explore relationships between features and the target variable
- Inform preprocessing decisions


In [None]:
# Install dependencies (for Google Colab)
!pip install fairlearn seaborn -q


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot appearance: seaborn's white-grid theme and a wider default canvas
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Seed NumPy's global random generator so repeated runs give the same draws
np.random.seed(42)


## 1. Load Data


In [None]:
# Load the dataset
# Note: update DATA_PATH if loading from Google Drive or a different location
DATA_PATH = Path('../data/AutoInsurance.csv')

# Fail early with an actionable message (including the resolved absolute path)
# rather than a bare traceback from inside pandas.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}' (resolved to '{DATA_PATH.resolve()}'). "
        "Update DATA_PATH to point at AutoInsurance.csv."
    )

df = pd.read_csv(DATA_PATH)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
df.head()


## 2. Data Quality & Missing Values


In [None]:
# Tally nulls per column and express each tally as a share of all rows
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Put both measures side by side, indexed by column name
missing_df = pd.concat(
    [missing.rename('Missing Count'), missing_pct.rename('Missing Percentage')],
    axis=1,
)

# Show only the columns that actually have gaps, worst offenders first
missing_df.loc[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)


## 3. Target Variable Distribution


In [None]:
# Examine target variable (Response)
target_col = 'Response'  # Adjust if different

if target_col in df.columns:
    # Compute the class counts once; they feed both the printed table and the plot
    target_counts = df[target_col].value_counts()
    target_pct = df[target_col].value_counts(normalize=True) * 100

    print("Target variable distribution:")
    print(target_counts)
    print("\nTarget variable distribution (%):")
    print(target_pct)

    # Explicit figure/axes so the labels are attached to this plot, not to
    # whatever figure pyplot happens to consider "current".
    fig, ax = plt.subplots(figsize=(8, 5))
    target_counts.plot(kind='bar', ax=ax)
    # Labels follow target_col so they stay correct if the column name is changed
    ax.set_title(f'Distribution of {target_col} Variable')
    ax.set_ylabel('Count')
    ax.set_xlabel(target_col)
    ax.tick_params(axis='x', labelrotation=0)
    fig.tight_layout()
    plt.show()
else:
    # Say so explicitly instead of silently producing no output
    print(
        f"Column '{target_col}' not found in dataset; "
        f"available columns: {list(df.columns)}"
    )


## 4. Demographic Distributions (Protected Attributes)


In [None]:
# Analyze demographic distributions
# Identify protected attributes
protected_attributes = ['Gender', 'Age']  # Add others as needed

# Size the figure from the attributes that are actually present, so a missing
# column does not leave a blank panel in the grid.
available_attributes = [attr for attr in protected_attributes if attr in df.columns]
missing_attributes = [attr for attr in protected_attributes if attr not in df.columns]
if missing_attributes:
    print(f"Skipping attributes not found in dataset: {missing_attributes}")

if available_attributes:
    # squeeze=False always returns a 2-D array of axes, so one attribute and
    # several attributes are handled by the same code path.
    fig, axes = plt.subplots(
        1, len(available_attributes), figsize=(15, 5), squeeze=False
    )
    for ax, attr in zip(axes.ravel(), available_attributes):
        df[attr].value_counts().plot(kind='bar', ax=ax)
        ax.set_title(f'Distribution of {attr}')
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    plt.show()
else:
    # plt.subplots(1, 0) would raise, so report the situation instead
    print("None of the protected attributes are present in the dataset; nothing to plot.")


## 5. Cross-Tabulation: Demographics vs Target


In [None]:
# Examine relationships between protected attributes and target
# This helps identify potential bias patterns early

if target_col not in df.columns:
    # Without this guard, df[target_col] below raises KeyError
    print(f"Column '{target_col}' not found in dataset; skipping cross-tabulation.")
else:
    for attr in protected_attributes:
        if attr not in df.columns:
            print(f"\nSkipping '{attr}': column not found in dataset.")
            continue

        print(f"\n{attr} vs {target_col}:")
        # normalize='index' scales each row to sum to 1; * 100 turns that into
        # the percentage of each target value within one attribute group.
        crosstab = pd.crosstab(df[attr], df[target_col], normalize='index') * 100
        print(crosstab)
        print()

        # Visualize on an explicit axes so labels attach to this figure only
        fig, ax = plt.subplots()
        crosstab.plot(kind='bar', stacked=True, ax=ax)
        ax.set_title(f'{attr} vs {target_col} Distribution')
        ax.set_ylabel('Percentage')
        ax.set_xlabel(attr)
        ax.tick_params(axis='x', labelrotation=45)
        ax.legend(title=target_col)
        fig.tight_layout()
        plt.show()
        # Release the figure so looping over many attributes does not pile up open figures
        plt.close(fig)


## 6. Summary Statistics


In [None]:
# Summary statistics for numerical features.
# Called with no arguments, so pandas' default column selection applies:
# per the pandas docs, a frame with mixed dtypes is summarized over its
# numeric columns only. Left as the cell's last expression so the notebook
# renders the resulting table.
df.describe()


## 7. Key Findings & Notes for Preprocessing

**Findings** (TODO: replace each prompt below with the actual observation once the notebook has been run):
- Document key observations about data quality
- Note any demographic imbalances
- Identify features that may need special handling
- Document initial bias patterns observed

**Next Steps:**
- Proceed to `01_preprocessing.ipynb` for data cleaning and preparation
