# Notebook 02: Exploratory Data Analysis

Covers: univariate and bivariate analysis, temporal patterns, STL decomposition, age-group distributions, and state and district rankings.

In [None]:
import sys
from pathlib import Path
project_root = Path.cwd().parent
if str(project_root) not in sys.path: sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.seasonal import STL

from src import config
from src.preprocessing import load_processed
from src.viz import plot_stl_decomposition, generate_summary_table, generate_state_ranking_table

# Notebook-wide presentation settings: show up to 50 DataFrame columns before
# pandas truncates the output, and use the seaborn white-grid matplotlib style.
pd.options.display.max_columns = 50
plt.style.use('seaborn-v0_8-whitegrid')
print('Imports loaded')

In [None]:
# Load the merged dataset through the project's load_processed helper and
# print a quick sanity summary: row count plus the number of distinct values
# in the 'district' and 'state' columns.
df = load_processed(config.MERGED_DATA_FILE)
print(f'Rows: {len(df):,}, Districts: {df["district"].nunique()}, States: {df["state"].nunique()}')

## 1. Univariate Analysis - Metric Distributions

In [None]:
# Histogram of every metric on a 2x3 grid, one panel per metric, with the mean
# marked by a dashed red line. `metrics` is reused by the correlation cell.
metrics = [
    config.METRIC_ENROLMENT_TOTAL,
    config.METRIC_TOTAL_UPDATES,
    config.METRIC_UPDATE_TO_ENROLMENT_RATIO,
    config.METRIC_ENROLMENT_VELOCITY,
    config.METRIC_ENROLMENT_VOLATILITY,
]
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = axes.flatten()
for ax, m in zip(panels, metrics):
    # A metric that is absent from df, or has no non-null values, gets its
    # panel hidden rather than an empty frame or a "Mean: nan" line.
    data = df[m].dropna() if m in df.columns else None
    if data is None or data.empty:
        ax.axis('off')
        continue
    mean_value = data.mean()
    # Values above the 99th percentile are dropped from the histogram only, so
    # a long right tail does not flatten the bars; the mean uses all values.
    ax.hist(data[data <= data.quantile(0.99)], bins=50, edgecolor='white')
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.1f}')
    ax.set_title(m.replace('_', ' ').title())
    ax.legend(fontsize=8)
# Hide every grid cell that has no metric assigned to it (currently the sixth).
for spare_ax in panels[len(metrics):]:
    spare_ax.axis('off')
plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'distributions.png', dpi=150)
plt.show()

## 2. Bivariate Analysis - Correlations

In [None]:
# Correlation matrix of the metrics that are actually present in df, drawn as
# an annotated heatmap with a diverging colour map centred on zero.
available_metrics = [name for name in metrics if name in df.columns]
corr = df[available_metrics].corr()

plt.figure(figsize=(10, 8))
heat_ax = sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
)
heat_ax.set_title('Correlation Matrix')
plt.tight_layout()

corr_png = config.FIGURES_DIR / 'correlation.png'
plt.savefig(corr_png, dpi=150)
plt.show()

In [None]:
# Scatter: Enrolment vs Updates, coloured by state.
# At most 5000 rows are plotted to keep the interactive figure responsive.
# The sample is seeded so that re-running the notebook regenerates the same
# saved HTML figure instead of a different random subset each time.
scatter_sample = df.sample(n=min(5000, len(df)), random_state=42)
fig = px.scatter(
    scatter_sample,
    x=config.METRIC_ENROLMENT_TOTAL,
    y=config.METRIC_TOTAL_UPDATES,
    color='state',
    opacity=0.5,
    title='Enrolment vs Updates',
)
fig.update_layout(template='plotly_white')
fig.write_html(config.FIGURES_DIR / 'enrol_vs_updates.html')
fig.show()

## 3. Temporal Patterns

In [None]:
# Sum the enrolment metric over all rows sharing a date, then plot the
# resulting series as an interactive line chart.
enrolment_by_date = df.groupby('date')[config.METRIC_ENROLMENT_TOTAL].sum()
daily = enrolment_by_date.reset_index()

fig = px.line(
    daily,
    x='date',
    y=config.METRIC_ENROLMENT_TOTAL,
    title='Daily Enrolment Trend',
).update_layout(template='plotly_white')

daily_trend_html = config.FIGURES_DIR / 'daily_trend.html'
fig.write_html(daily_trend_html)
fig.show()

In [None]:
# STL decomposition of the enrolment metric, delegated to the project's
# plotting helper; the returned figure is saved and then shown.
# NOTE(review): period=7 presumably means weekly seasonality on daily data;
# confirm against plot_stl_decomposition.
stl_fig = plot_stl_decomposition(
    df,
    config.METRIC_ENROLMENT_TOTAL,
    period=7,
)
stl_png = config.FIGURES_DIR / 'stl_decomposition.png'
stl_fig.savefig(stl_png, dpi=150)
plt.show()

## 4. Age Group Distribution

In [None]:
# One pie chart per age-group column family, each showing the share of the
# column totals. Panel order (left to right) matches the original layout.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (columns to total, slice labels, panel title) for each panel.
# The first two titles were previously swapped: the `demo_`-prefixed columns
# were titled as enrolment and the unprefixed `age_*` columns as demographic
# updates. NOTE(review): this is inferred from the column prefixes; confirm
# against the preprocessing schema.
# NOTE(review): the biometric labels ('0-5', '5+') look inconsistent with the
# column name 'bio_age_17_'; left unchanged until the schema is confirmed.
age_panels = [
    (['demo_age_5_17', 'demo_age_17_'], ['5-17', '17+'], 'Demographic Updates Age'),
    (['age_0_5', 'age_5_17', 'age_18_greater'], ['0-5', '5-17', '18+'], 'Enrolment Age Groups'),
    (['bio_age_5', 'bio_age_17_'], ['0-5', '5+'], 'Biometric Updates Age'),
]

for panel_ax, (cols, labels, title) in zip(axes, age_panels):
    # Require every column of the group, not just the first one, so a
    # partially present group is skipped instead of raising KeyError.
    missing = [c for c in cols if c not in df.columns]
    if missing:
        print(f'Skipping "{title}": missing columns {missing}')
        panel_ax.axis('off')
        continue
    totals = [df[c].sum() for c in cols]
    # A pie needs a positive total to compute slice fractions.
    if not sum(totals) > 0:
        print(f'Skipping "{title}": no positive totals to plot')
        panel_ax.axis('off')
        continue
    panel_ax.pie(totals, labels=labels, autopct='%1.1f%%')
    panel_ax.set_title(title)

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'age_distribution.png', dpi=150)
plt.show()

## 5. State and District Rankings

In [None]:
# Build the state ranking table with the project helper, limited to 10 rows
# via top_n, and render it below a plain-text heading.
# NOTE(review): the heading says "BY ENROLMENT", but the ranking criterion is
# decided inside generate_state_ranking_table; confirm they agree.
# `display` is not imported here; presumably the IPython notebook built-in.
state_rank = generate_state_ranking_table(df, top_n=10)
print('TOP 10 STATES BY ENROLMENT')
display(state_rank)

In [None]:
# Horizontal bar chart of the 15 states with the largest summed enrolment.
enrolment_by_state = df.groupby('state')[config.METRIC_ENROLMENT_TOTAL].sum()
st = enrolment_by_state.nlargest(15).reset_index()

fig = px.bar(
    st,
    x=config.METRIC_ENROLMENT_TOTAL,
    y='state',
    orientation='h',
    title='Top 15 States by Enrolment',
).update_layout(
    # Order the bars by their total so the largest state sits at the top.
    yaxis=dict(categoryorder='total ascending'),
    template='plotly_white',
)

state_ranking_html = config.FIGURES_DIR / 'state_ranking.html'
fig.write_html(state_ranking_html)
fig.show()

In [None]:
# District summary: build the summary table with the project helper, limited
# to 20 rows via top_n, and render it below a plain-text heading.
# NOTE(review): how districts are ranked and which columns are returned are
# decided inside generate_summary_table; not visible from this notebook.
district_summary = generate_summary_table(df, top_n=20)
print('TOP 20 DISTRICTS')
display(district_summary)

In [None]:
# Final marker cell: report where the figures written by the cells above live.
print('EDA Complete - figures saved to:', config.FIGURES_DIR)