# 01: Data Preprocessing

This notebook handles data preparation for the insurance claim prediction models:
- Load and clean the dataset
- Check for missing values and duplicate rows; cap outliers
- Encode categorical features
- Address class imbalance by computing class weights (resampling is deferred to the training notebooks)
- Split into train/validation/test sets
- Save processed data for downstream notebooks


In [None]:
# Install dependencies (for Google Colab)
!pip install fairlearn seaborn -q


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle
from pathlib import Path

# Seed NumPy's global random number generator so reruns are repeatable
np.random.seed(seed=42)

# Make sure the output folder for processed artifacts is present
# (no error if it already exists; missing parent folders are not created)
Path('../results').mkdir(parents=False, exist_ok=True)


## 1. Data Loading

In [None]:
# Read the raw CSV into memory
file_path = '../data/AutoInsurance.csv'
df = pd.read_csv(file_path)

# `df` stays untouched as the raw reference; every later transformation
# operates on this independent copy
df_processed = df.copy()

n_rows, n_cols = df_processed.shape
print(f"✓ Loaded dataset: {n_rows} rows, {n_cols} columns")
print("\nFirst few rows:")
display(df_processed.head())

## 2. Data Quality Checks

In [None]:
# Report missing values and exact duplicate rows, then summarise the frame
banner = "="*60
print(banner)
print("DATA QUALITY CHECKS")
print(banner)

# Missing values: per-column counts are reported only; nothing is imputed
# or dropped in this cell
missing = df_processed.isnull().sum()
if missing.sum() == 0:
    print("\n✓ No missing values found")
else:
    print(f"\n⚠ Missing values found:")
    print(missing[missing > 0])

# Duplicates: rows identical across ALL columns are removed (first occurrence kept).
# NOTE(review): this runs while the 'Customer' identifier column is still present,
# so two rows only count as duplicates if their IDs match as well.
duplicates = df_processed.duplicated().sum()
if duplicates == 0:
    print("\n✓ No duplicate rows found")
else:
    print(f"\n⚠ Warning: {duplicates} duplicate rows found")
    print("Removing duplicates...")
    # The original index labels are kept (no reset), so gaps may appear in the index
    df_processed = df_processed.drop_duplicates()
    print(f"✓ Removed duplicates. New shape: {df_processed.shape}")

# Shape and dtype overview
row_count, column_count = df_processed.shape
dtype_counts = df_processed.dtypes.value_counts().to_dict()
print(f"\n✓ Dataset shape: {row_count} rows × {column_count} columns")
print(f"✓ Data types: {dtype_counts}")

## 3. Data Transformation
### 3.1 Drop Identifier Column

In [None]:
# The 'Customer' column is treated as a row identifier with no predictive
# value, so it is removed from the working frame when present
if 'Customer' not in df_processed.columns:
    print("'Customer' column not found")
else:
    df_processed = df_processed.drop(columns=['Customer'])
    print("✓ Dropped 'Customer' column")
    print(f"Remaining columns: {df_processed.shape[1]}")

### 3.2 Date Feature Extraction

In [None]:
# Replace the raw 'Effective To Date' column with year / month / day-of-week features
date_col = 'Effective To Date'

if date_col not in df_processed.columns:
    print("'Effective To Date' column not found")
else:
    # Parse once; pandas infers the date format from the values
    effective_dates = pd.to_datetime(df_processed[date_col])

    # Drop the raw column and append the derived features at the end
    # (dayofweek: Monday=0 ... Sunday=6)
    df_processed = df_processed.drop(columns=[date_col]).assign(
        Effective_Year=effective_dates.dt.year,
        Effective_Month=effective_dates.dt.month,
        Effective_DayOfWeek=effective_dates.dt.dayofweek,
    )

    print("✓ Extracted temporal features: Year, Month, DayOfWeek")
    first_year = df_processed['Effective_Year'].min()
    last_year = df_processed['Effective_Year'].max()
    print(f"Date range: {first_year} to {last_year}")

### 3.3 Define and Separate Protected Attributes

In [None]:
# Attributes tracked separately for fairness evaluation.
# They remain in the feature matrix as well (the encoding cell does not
# remove them), so models can be compared with and without them later.
protected_attributes_list = [
    'Gender', 'EmploymentStatus', 'Education',
    'Marital Status', 'Location Code', 'State', 'Income',
]

# Confirm every protected attribute is actually present in the working frame.
# A missing attribute only produces a warning here.
available_columns = set(df_processed.columns)
missing_protected = [
    attr for attr in protected_attributes_list if attr not in available_columns
]

if not missing_protected:
    print("✓ All protected attributes found in dataset")
    print(f"Protected attributes: {protected_attributes_list}")
else:
    print(f"⚠ Warning: Missing protected attributes: {missing_protected}")

### 3.4 Target Variable Extraction and Encoding

In [None]:
# Extract the target column and encode it as integers
target_col = 'Response'

if target_col not in df_processed.columns:
    raise ValueError(f"Target column '{target_col}' not found!")

y = df_processed[target_col].copy()

# Show the raw class distribution (counts and percentages)
print("="*60)
print("TARGET VARIABLE DISTRIBUTION")
print("="*60)
target_dist = y.value_counts()
print(target_dist)
print("\nPercentages:")
print(target_dist / len(y) * 100)

# LabelEncoder assigns integer codes to the sorted class labels
# (expected: No=0, Yes=1).
label_encoder = LabelEncoder()

# Keep the ORIGINAL row labels on the encoded target. Without `index=y.index`
# the Series would get a fresh 0..n-1 index, which no longer matches the feature
# frame once duplicate rows have been dropped (the index then has gaps), and any
# later label-based alignment of features and target would silently be wrong.
y_encoded = pd.Series(
    label_encoder.fit_transform(y),
    index=y.index,
    name=target_col,
)
assert y_encoded.index.equals(df_processed.index), "Encoded target is not aligned with df_processed"

class_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}
print(f"\n✓ Encoded target: {class_mapping}")

### 3.5 Feature-Target Separation

In [None]:
# Separate features from target
X = df_processed.drop(columns=[target_col]).copy()

# Identify column types.
# 'number' selects every numeric dtype, not just int64/float64. This matters for
# the Effective_Year/Month/DayOfWeek features: in pandas >= 2.0 the `.dt` accessors
# return int32, so a hard-coded ['int64', 'float64'] filter silently leaves them
# out of `numerical_cols` (and out of the saved metadata).
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

print(f"Features shape: {X.shape}")
print(f"\nColumn types:")
print(f"  - Numerical: {len(numerical_cols)} columns")
print(f"  - Categorical: {len(categorical_cols)} columns")

print(f"\nCategorical columns ({len(categorical_cols)}):")
print(categorical_cols)
print(f"\nNumerical columns ({len(numerical_cols)}):")
print(numerical_cols)

## 4. Feature Engineering
### 4.1 Outlier Capping

In [None]:
# Handle outliers for key numerical features.
# Strategy: winsorize, i.e. clip each listed column to its [1st, 99th] percentile range.
#
# NOTE(review): the percentiles are computed on the full feature frame, before the
# train/validation/test split performed later in this notebook, so validation and
# test rows influence the caps. Consider deriving the caps from the training split
# only and storing them for reuse.

outlier_cols = ['Income', 'Total Claim Amount', 'Customer Lifetime Value']
outlier_stats = {}  # per-column caps and counts; written to the metadata file later

for col in outlier_cols:
    # Columns absent from X are skipped silently
    if col not in X.columns:
        continue

    # Percentile bounds used as caps
    p01 = X[col].quantile(0.01)
    p99 = X[col].quantile(0.99)

    # Count the values that will be changed (strictly outside the bounds);
    # must be done before clipping
    outliers_below = (X[col] < p01).sum()
    outliers_above = (X[col] > p99).sum()

    # Cap in place of the original column (no backup column is kept: the
    # uncapped values remain available in df_processed)
    X[col] = X[col].clip(lower=p01, upper=p99)

    outlier_stats[col] = {
        'p01': p01,
        'p99': p99,
        'outliers_capped_below': outliers_below,
        'outliers_capped_above': outliers_above
    }

    print(f"\n{col}:")
    print(f"  Capped at [{p01:.2f}, {p99:.2f}]")
    print(f"  Values capped below: {outliers_below}")
    print(f"  Values capped above: {outliers_above}")

print("\n✓ Outlier handling complete")

### 4.2 Zero-income Indicator

In [None]:
# Special handling for Income: ~25% are zeros
# Option 1: Create indicator flag for zero income
# Option 2: Treat as missing and impute
# Option 3: Keep as-is but be aware it's a distinct group

# For now, create an indicator flag (Option 1)
if 'Income' in X.columns:
    # NOTE(review): this runs after outlier capping, where Income is clipped at its
    # 1st percentile. The flag is only meaningful while that lower cap is 0; if the
    # 1st percentile were ever positive, zeros would already have been raised and
    # this flag would be all 0.
    X['Income_IsZero'] = (X['Income'] == 0).astype(int)
    zero_income_count = X['Income_IsZero'].sum()
    zero_income_pct = (zero_income_count / len(X)) * 100
    print(f"Income zeros: {zero_income_count} ({zero_income_pct:.1f}%)")
    print("✓ Created 'Income_IsZero' indicator flag")

    # The new binary column passes through one-hot encoding untouched because it is
    # not listed in `categorical_cols`. It is registered in `numerical_cols` here
    # because that list is also used after this cell (encoding summary and saved
    # preprocessing metadata) and would otherwise be stale. The membership check
    # keeps the cell safe to re-run.
    if 'Income_IsZero' not in numerical_cols:
        numerical_cols.append('Income_IsZero')

### 4.3 Categorical Encoding

In [None]:
# One-hot encode categorical features with pd.get_dummies
#
# DECISION: drop_first=True eliminates perfect multicollinearity
# - One level per categorical feature is dropped and becomes the reference/baseline.
#   get_dummies drops the FIRST level in sorted order.
# - Example (assuming Gender takes the values 'F' and 'M'): 'F' is dropped, so the
#   remaining column Gender_M=1 means male and Gender_M=0 means female (baseline).
# - Why: Logistic Regression (the baseline model) can have convergence issues when
#   the dummies of a feature always sum to 1. Tree ensembles such as Random Forest
#   are not affected, so this encoding works for both model families.
# - Alternative: drop_first=False keeps full dummy sets but can cause issues with
#   linear models.
#
# DECISION: Protected attributes are INCLUDED in model features
# - Allows comparing model behavior with/without protected attributes, and studying
#   whether models actually use them for prediction.
# - Models excluding these features can be trained later for comparison experiments.
# - For production fairness-aware models you would typically EXCLUDE protected
#   attributes to avoid direct discrimination; inclusion here is for research and
#   comparison purposes.
# - Protected attributes are tracked separately for fairness evaluation regardless
#   of whether they are used as features.

print("Encoding categorical features...")
print(f"Before encoding: {X.shape}")

# Only the columns listed in `categorical_cols` are expanded; every other column
# of X (including engineered ones) passes through unchanged.
X_encoded = pd.get_dummies(
    X,
    columns=categorical_cols,
    drop_first=True,  # Remove one category per feature (eliminates perfect multicollinearity)
    dtype=int
)

# Count the dummy columns from what actually happened to X rather than from
# `numerical_cols`, which may not list every pass-through column.
n_passthrough_cols = X.shape[1] - len(categorical_cols)
n_dummy_cols = X_encoded.shape[1] - n_passthrough_cols

print(f"After encoding: {X_encoded.shape}")
print(f"New columns created: {n_dummy_cols}")
print("\n✓ Categorical encoding complete (reference categories dropped to avoid multicollinearity)")

## 5. Data Splitting

In [None]:
# Create stratified splits (~70% train, ~15% validation, 15% test).
# Stratification keeps the class distribution the same in every split.
#
# Features/target are split first; protected attributes are then selected with the
# resulting index labels so that all three structures refer to the same rows.

# First split: train+val (85%) vs test (15%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.15,
    random_state=42,
    stratify=y_encoded
)

# Row labels of each side of the first split
train_val_indices = X_temp.index
test_indices = X_test.index

# Second split: train (~70% of total) vs val (~15% of total).
# 0.176 approximates 0.15 / 0.85; the literal is kept so that split membership
# stays identical to earlier runs of this notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.176,
    random_state=42,
    stratify=y_temp
)

# Row labels of each side of the second split
train_indices = X_train.index
val_indices = X_val.index

# Protected attributes come from df_processed (i.e. BEFORE outlier capping and
# encoding) and are selected by label, not re-split randomly.
protected_df = df_processed[protected_attributes_list].copy()

protected_temp = protected_df.loc[train_val_indices]
protected_test = protected_df.loc[test_indices]
protected_train = protected_df.loc[train_indices]
protected_val = protected_df.loc[val_indices]

print("="*60)
print("DATA SPLIT SUMMARY")
print("="*60)
print(f"Training set:   {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X_encoded)*100:.1f}%)")
print(f"Validation set: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(X_encoded)*100:.1f}%)")
print(f"Test set:       {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X_encoded)*100:.1f}%)")
print(f"\nTotal features: {X_train.shape[1]}")

# Verify split alignment. Equal lengths alone cannot detect misaligned rows, so
# the index labels are compared as well.
print("\n" + "="*60)
print("VERIFYING SPLIT ALIGNMENT")
print("="*60)
assert len(protected_train) == len(X_train), f"Train split mismatch! {len(protected_train)} != {len(X_train)}"
assert len(protected_val) == len(X_val), f"Validation split mismatch! {len(protected_val)} != {len(X_val)}"
assert len(protected_test) == len(X_test), f"Test split mismatch! {len(protected_test)} != {len(X_test)}"
assert protected_train.index.equals(X_train.index), "Train rows of protected attributes differ from X_train"
assert protected_val.index.equals(X_val.index), "Validation rows of protected attributes differ from X_val"
assert protected_test.index.equals(X_test.index), "Test rows of protected attributes differ from X_test"
assert len(X_train) + len(X_val) + len(X_test) == len(X_encoded), "Splits do not cover the full dataset"
print("✓ All splits aligned correctly")

# train_test_split pairs X and y by position, so the target is always matched to the
# right rows. Its index LABELS can still differ from X's; warn, because label-based
# operations downstream (concat, .loc, joins) would then misalign.
if not (y_train.index.equals(X_train.index)
        and y_val.index.equals(X_val.index)
        and y_test.index.equals(X_test.index)):
    print("⚠ Warning: target index labels differ from feature index labels "
          "(positional alignment is intact; avoid label-based joins)")

# Check target distribution in each split
print("\n" + "="*60)
print("TARGET DISTRIBUTION BY SPLIT")
print("="*60)
for split_name, y_split in [('Train', y_train), ('Validation', y_val), ('Test', y_test)]:
    dist = pd.Series(y_split).value_counts(normalize=True) * 100
    print(f"\n{split_name}:")
    for label, pct in dist.items():
        label_name = label_encoder.inverse_transform([label])[0]
        print(f"  {label_name}: {pct:.1f}%")

## 6. Scaling & Preparation
### 6.1 Feature Scaling

In [None]:
# Standardise every feature using statistics learned from the training split only.
# One-hot columns are scaled too: not strictly required, but it keeps all inputs
# on a comparable scale for magnitude-sensitive algorithms.
scaler = StandardScaler()
scaler.fit(X_train)


def _to_scaled_frame(features):
    """Apply the fitted scaler and restore the frame's column names and row index."""
    return pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )


# Training, validation and test sets all go through the same fitted scaler
X_train_scaled = _to_scaled_frame(X_train)
X_val_scaled = _to_scaled_frame(X_val)
X_test_scaled = _to_scaled_frame(X_test)

print("✓ Feature scaling complete (all features scaled for consistency)")
# Sanity check: averages of the per-column mean / std on the training split
train_mean_of_means = X_train_scaled.mean().mean()
train_mean_of_stds = X_train_scaled.std().mean()
print(f"Scaled training set mean: {train_mean_of_means:.6f}")
print(f"Scaled training set std: {train_mean_of_stds:.6f}")

### 6.2 Addressing Class Imbalance

In [None]:
# The target is imbalanced (noted as ~85.7% No vs 14.3% Yes, ratio 5.98:1).
# Ways to deal with it:
#   1. SMOTE oversampling (applied later, during training)
#   2. Class weights passed to the models
#   3. Leave the data untouched
#
# This cell only prepares option 2: 'balanced' weights derived from the
# training labels. The data itself is not resampled here.
from sklearn.utils.class_weight import compute_class_weight

observed_classes = np.unique(y_train)
class_weights = compute_class_weight(
    'balanced',
    classes=observed_classes,
    y=y_train
)
# Map encoded class -> weight, for use as a model's class_weight argument
class_weight_dict = dict(zip(observed_classes, class_weights))

print("Class weights for balanced training:")
for class_idx in class_weight_dict:
    weight = class_weight_dict[class_idx]
    class_name = label_encoder.inverse_transform([class_idx])[0]
    print(f"  {class_name} (class {class_idx}): {weight:.3f}")

print("\n⚠ Note: Actual resampling (SMOTE) can be applied during model training")
print("For now, we'll save the data as-is and handle imbalance in training notebooks")

## 7. Saving

In [None]:
# Persist every processed artifact to the results directory
import json

# --- Sanity checks before anything is written --------------------------------
print("="*60)
print("FINAL VERIFICATION BEFORE SAVING")
print("="*60)

# Protected attributes must have as many rows as the matching feature split
for split_label, protected_part, feature_part in [
    ('Train', protected_train, X_train),
    ('Validation', protected_val, X_val),
    ('Test', protected_test, X_test),
]:
    assert len(protected_part) == len(feature_part), \
        f"{split_label} split mismatch! {len(protected_part)} != {len(feature_part)}"

# Targets must have as many rows as the matching feature split
for split_suffix, target_part, feature_part in [
    ('train', y_train, X_train),
    ('val', y_val, X_val),
    ('test', y_test, X_test),
]:
    assert len(target_part) == len(feature_part), \
        f"y_{split_suffix} length mismatch with X_{split_suffix}"

# One column per declared protected attribute
assert protected_train.shape[1] == len(protected_attributes_list), "Protected attributes column mismatch"

print("✓ All data structures verified and aligned")
print("\n" + "="*60)

# --- Pickled artifacts ---------------------------------------------------------
results_dir = Path('../results')
results_dir.mkdir(exist_ok=True)

# File name -> object, written in this order. The feature matrices saved are the
# SCALED versions; targets and protected attributes are saved as split.
pickle_artifacts = {
    'X_train.pkl': X_train_scaled,
    'X_val.pkl': X_val_scaled,
    'X_test.pkl': X_test_scaled,
    'y_train.pkl': y_train,
    'y_val.pkl': y_val,
    'y_test.pkl': y_test,
    'protected_train.pkl': protected_train,
    'protected_val.pkl': protected_val,
    'protected_test.pkl': protected_test,
    'scaler.pkl': scaler,
    'feature_names.pkl': X_train_scaled.columns.tolist(),
    'label_encoder.pkl': label_encoder,
    'class_weights.pkl': class_weight_dict,
}
for artifact_name, artifact in pickle_artifacts.items():
    with open(results_dir / artifact_name, 'wb') as f:
        pickle.dump(artifact, f)

# --- Human-readable metadata ---------------------------------------------------
# Outlier statistics are stringified key-by-key so they serialise to JSON
outlier_stats_as_text = {
    column: {str(stat_name): str(stat_value) for stat_name, stat_value in stats.items()}
    for column, stats in outlier_stats.items()
}
# Ratio of class-0 to class-1 rows in the training target
n_class0 = len(y_train[y_train==0])
n_class1 = len(y_train[y_train==1])

metadata = {
    'preprocessing_date': pd.Timestamp.now().isoformat(),
    'original_shape': df.shape,
    'processed_shape': X_train_scaled.shape,
    'target_column': target_col,
    'protected_attributes': protected_attributes_list,
    'categorical_columns': categorical_cols,
    'numerical_columns': numerical_cols,
    'outlier_handling': 'Capped at 1st and 99th percentiles',
    'outlier_stats': outlier_stats_as_text,
    'class_imbalance_ratio': n_class0 / n_class1,
    'train_size': len(X_train),
    'val_size': len(X_val),
    'test_size': len(X_test),
    'features_created': list(X_train_scaled.columns)
}

# default=str is the fallback for any value json cannot serialise natively
with open(results_dir / 'preprocessing_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2, default=str)

# --- Report ----------------------------------------------------------------------
print("="*60)
print("SAVED PROCESSED DATA")
print("="*60)
for saved_line in [
    "✓ Feature matrices: X_train.pkl, X_val.pkl, X_test.pkl",
    "✓ Target variables: y_train.pkl, y_val.pkl, y_test.pkl",
    "✓ Protected attributes: protected_train.pkl, protected_val.pkl, protected_test.pkl",
    "✓ Scaler: scaler.pkl",
    "✓ Feature names: feature_names.pkl",
    "✓ Label encoder: label_encoder.pkl",
    "✓ Class weights: class_weights.pkl",
    "✓ Metadata: preprocessing_metadata.json",
]:
    print(saved_line)
print(f"\nAll files saved to: {results_dir.absolute()}")

## 8. Preprocessing Summary & Verification

In [None]:
# Print a recap of what this notebook produced
rule = "="*60
print(rule)
print("PREPROCESSING SUMMARY")
print(rule)

# 1. Shapes: raw frame vs. number of model features
print("\n1. Data Shape:")
print(f"   Original: {df.shape}")
print(f"   Processed: {X_train_scaled.shape[1]} features")

# 2. Transformations (listed unconditionally, in the order they were applied)
applied_steps = [
    "Dropped 'Customer' identifier column",
    "Converted 'Effective To Date' to temporal features",
    "Handled outliers (capped at percentiles)",
    "Created 'Income_IsZero' indicator",
    f"Encoded {len(categorical_cols)} categorical features",
    "Scaled all features (StandardScaler)",
]
print("\n2. Transformations Applied:")
for step in applied_steps:
    print(f"   ✓ {step}")

# 3. Split sizes
print("\n3. Data Splits:")
for split_label, split_features in [('Train', X_train), ('Validation', X_val), ('Test', X_test)]:
    print(f"   {split_label}: {len(split_features):,} samples")

# 4. Protected attributes tracked for fairness evaluation
print("\n4. Protected Attributes Tracked:")
for attr in protected_attributes_list:
    print(f"   - {attr}")

# 5. Pointers to the next notebook
print("\n5. Next Steps:")
print("   → Proceed to notebook 02_baseline_models.ipynb")
print("   → Load saved data from ../results/ directory")