# Notebook 03: Advanced Analytics

Covers: anomaly detection, ASI (Aadhaar Stress Index) calculation, inclusion-risk detection, saturation and service-load analysis, and forecasting with Prophet

In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of sys.path
# (once) so the `src` package imported below can be resolved.
# NOTE(review): assumes the notebook is launched from a folder directly under
# the project root - confirm.
project_root = Path.cwd().parent
if str(project_root) not in sys.path: sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from src import config
from src.preprocessing import load_processed, save_processed
from src.metrics import calculate_asi, detect_inclusion_risk, calculate_saturation_status, identify_imbalanced_districts, rank_service_load, detect_volatility_spikes, calculate_all_metrics
from src.models import AnomalyDetector, EnrolmentForecaster, detect_anomalies_in_dataframe
from src.viz import plot_asi_choropleth, plot_anomaly_scatter, plot_forecast, plot_inclusion_risk_map, generate_summary_table

print('Imports loaded')

In [None]:
# Read the dataset stored at config.MERGED_DATA_FILE and report its size
df = load_processed(config.MERGED_DATA_FILE)
row_count = len(df)
district_count = df["district"].nunique()
print(f'Loaded {row_count:,} rows, {district_count} districts')

## 1. Anomaly Detection with Isolation Forest

In [None]:
# Restrict the configured anomaly features to columns present in this dataset
features = [name for name in config.ANOMALY_FEATURES if name in df.columns]
print(f'Using features: {features}')

# Run detection; the returned frame is expected to carry an `is_anomaly` column
df = detect_anomalies_in_dataframe(df, features)
anomaly_flags = df["is_anomaly"]
print(f'\nAnomalies detected: {anomaly_flags.sum():,} ({anomaly_flags.mean()*100:.1f}%)')

In [None]:
# Anomaly scatter for the enrolment-total and total-updates metrics,
# exported as standalone HTML and rendered inline
fig = plot_anomaly_scatter(df, config.METRIC_ENROLMENT_TOTAL, config.METRIC_TOTAL_UPDATES)
scatter_path = config.FIGURES_DIR / 'anomaly_scatter.html'
fig.write_html(scatter_path)
fig.show()

In [None]:
# Count flagged rows per (state, district) and list the largest counts first
flagged_rows = df[df['is_anomaly'] == 1]
anomaly_districts = (
    flagged_rows.groupby(['state', 'district'])
    .size()
    .reset_index(name='anomaly_count')
)
anomaly_districts = anomaly_districts.sort_values('anomaly_count', ascending=False)
print('Top 20 districts by anomaly count:')
display(anomaly_districts.head(20))

## 2. ASI (Aadhaar Stress Index) Calculation

In [None]:
# Score every district (district=None) and request the national entry as well
asi_scores = calculate_asi(df, district=None, include_national=True)

# Turn the {name: score} mapping into a table ordered from highest score down
score_rows = list(asi_scores.items())
asi_df = pd.DataFrame(score_rows, columns=['district', 'asi_score'])
asi_df = asi_df.sort_values('asi_score', ascending=False)

# The 'NATIONAL' entry is reported on its own and kept out of the district ranking
national_asi = asi_scores.get("NATIONAL", 0)
districts_only = asi_df[asi_df['district'] != 'NATIONAL']
print(f'National ASI: {national_asi:.1f}')
print(f'\nTop 20 districts by ASI:')
display(districts_only.head(20))

In [None]:
# Add ASI to main dataframe.
# This notebook later saves the enriched frame back to config.MERGED_DATA_FILE,
# the same file it loads from, so on a re-run (or when this cell is executed
# twice) `df` may already contain `asi_score`. Merging onto it would produce
# `asi_score_x` / `asi_score_y` and the fillna below would raise KeyError,
# so any stale column is dropped first.
district_asi = asi_df[asi_df['district'] != 'NATIONAL']
df = df.drop(columns=['asi_score'], errors='ignore')
df = df.merge(district_asi, on='district', how='left')
df['asi_score'] = df['asi_score'].fillna(50)  # Default for missing

# ASI distribution across districts, with the threshold of 60 marked
fig = px.histogram(district_asi, x='asi_score', nbins=30, title='ASI Score Distribution')
fig.add_vline(x=60, line_dash='dash', line_color='red', annotation_text='Threshold=60')
fig.update_layout(template='plotly_white')
fig.write_html(config.FIGURES_DIR / 'asi_distribution.html')
fig.show()

In [None]:
# ASI choropleth/bar chart: export to HTML, then render inline
fig = plot_asi_choropleth(df)
choropleth_path = config.FIGURES_DIR / 'asi_choropleth.html'
fig.write_html(choropleth_path)
fig.show()

## 3. Inclusion Risk Detection

In [None]:
# Run inclusion-risk detection, then roll up per (state, district):
# a district counts as at risk when any of its rows is flagged
df = detect_inclusion_risk(df)

risk_by_district = df.groupby(['state', 'district'])['inclusion_risk']
risk_summary = risk_by_district.any().reset_index()
at_risk_count = risk_summary['inclusion_risk'].sum()
at_risk_pct = at_risk_count/len(risk_summary)*100
print(f'Districts at inclusion risk: {at_risk_count} ({at_risk_pct:.1f}%)')

In [None]:
# Risk breakdown: number of districts flagged by each individual risk column.
# Districts are keyed by (state, district), the same key this notebook uses
# for its other per-district roll-ups; grouping by district name alone would
# merge same-named districts from different states and undercount.
risk_cols = ['risk_low_velocity', 'risk_high_ratio_low_growth', 'risk_zero_enrolments']
for col in risk_cols:
    # Skip risk columns that are not present in the dataframe
    if col in df.columns:
        count = df.groupby(['state', 'district'])[col].any().sum()
        print(f'{col}: {count} districts')

In [None]:
# Inclusion risk map: export to HTML, then render inline
fig = plot_inclusion_risk_map(df)
risk_map_path = config.FIGURES_DIR / 'inclusion_risk.html'
fig.write_html(risk_map_path)
fig.show()

## 4. Saturation and Service Load Analysis

In [None]:
# Compute saturation status, then show how many rows fall under each value
df = calculate_saturation_status(df)
print('Saturation status distribution:')
saturation_counts = df['saturation_status'].value_counts()
print(saturation_counts)

In [None]:
# Compute balance status, then show how many rows fall under each value
df = identify_imbalanced_districts(df)
print('\nBalance status distribution:')
balance_counts = df['balance_status'].value_counts()
print(balance_counts)

In [None]:
# Top 20 high-pressure districts
# Ranks service load via rank_service_load and displays the result.
# NOTE(review): top_n=20 presumably caps the result at the 20 highest-load
# districts - confirm against src.metrics.rank_service_load.
high_pressure = rank_service_load(df, top_n=20)
print('\nTop 20 High-Pressure Districts:')
display(high_pressure)

## 5. Time Series Forecasting with Prophet

In [None]:
# Build the national-level series: one row per date, summing the
# enrolment-total and total-updates metrics over all rows for that date
daily_totals_spec = {
    config.METRIC_ENROLMENT_TOTAL: 'sum',
    config.METRIC_TOTAL_UPDATES: 'sum',
}
daily = df.groupby('date').agg(daily_totals_spec).reset_index()
print(f'Time series: {len(daily)} days')

In [None]:
# Fit a forecaster on the daily enrolment totals
horizon_days = 30
forecaster = EnrolmentForecaster(horizon=horizon_days)
forecaster.fit(daily, target_col=config.METRIC_ENROLMENT_TOTAL)

# Request a forecast over the same number of periods as the horizon
forecast = forecaster.forecast(periods=horizon_days)
forecast_len = len(forecast)
print(f'Forecast generated for {forecast_len} days')

In [None]:
# Backtest with test_days=30 and report the returned error metrics
metrics = forecaster.backtest(daily, test_days=30)
print(f'\nBacktest Results:')
# (label, key in `metrics`, format spec, trailing unit) for each reported line
report_layout = (
    ('MAPE', 'mape', '.2f', '%'),
    ('RMSE', 'rmse', ',.0f', ''),
    ('MAE', 'mae', ',.0f', ''),
)
for label, key, spec, unit in report_layout:
    print(f'  {label}: {metrics[key]:{spec}}{unit}')

In [None]:
# Plot the enrolment forecast against the daily series; export and render
fig = plot_forecast(daily, forecast)
enrolment_forecast_path = config.FIGURES_DIR / 'enrolment_forecast.html'
fig.write_html(enrolment_forecast_path)
fig.show()

In [None]:
# Repeat the fit/forecast steps with the total-updates metric as the target
updates_horizon = 30
forecaster_updates = EnrolmentForecaster(horizon=updates_horizon)
forecaster_updates.fit(daily, target_col=config.METRIC_TOTAL_UPDATES)
forecast_updates = forecaster_updates.forecast(periods=updates_horizon)

# Plot, export to HTML, and render inline
fig = plot_forecast(daily, forecast_updates, title='Total Updates Forecast')
updates_forecast_path = config.FIGURES_DIR / 'updates_forecast.html'
fig.write_html(updates_forecast_path)
fig.show()

## 6. Save Enhanced Dataset

In [None]:
# Write the dataframe, including the analytics columns added above,
# back to config.MERGED_DATA_FILE
save_processed(df, config.MERGED_DATA_FILE)

print('\nAdvanced Analytics Complete!')
column_total = len(df.columns)
print(f'Dataset saved with {column_total} columns')
print(f'\nNew columns added:')
new_cols = ['is_anomaly', 'anomaly_score', 'asi_score', 'inclusion_risk', 'saturation_status', 'balance_status']
# List only the expected analytics columns that are actually present
present_cols = [name for name in new_cols if name in df.columns]
for c in present_cols:
    print(f'  - {c}')