# Notebook 04: Final Report

**Project Vande - Aadhaar Analytics Competition Report**

This notebook contains the executive summary, key findings, visualizations, policy recommendations, and the methodology and code appendices.

In [None]:
import sys, inspect
from pathlib import Path
# Put the parent of the current working directory on sys.path (once) so the
# `src` package imported below can be resolved.
# NOTE(review): this presumes the notebook is launched from a sub-directory
# of the project (e.g. notebooks/) - confirm.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

from src import config
from src.preprocessing import load_processed
from src.metrics import calculate_asi, detect_inclusion_risk
from src.models import AnomalyDetector, EnrolmentForecaster
from src.viz import generate_pdf_report, generate_summary_table

# Load the merged dataset referenced by config.MERGED_DATA_FILE; every cell
# below reads from this `df`.
df = load_processed(config.MERGED_DATA_FILE)
row_count = len(df)
print(f'Data loaded: {row_count:,} rows')

---
## Executive Summary

This report presents comprehensive analytics on Aadhaar enrolment and update patterns across India.

In [None]:
# Key Statistics
# Prints the headline figures of the report: date coverage, geographic
# coverage, total enrolments/updates and the mean update-to-enrolment ratio.
banner_rule = '=' * 60
print(banner_rule)
print('PROJECT VANDE - EXECUTIVE SUMMARY')
print(banner_rule)

period_start = df["date"].min().date()
period_end = df["date"].max().date()
print(f'\nAnalysis Period: {period_start} to {period_end}')
print(f'States Covered: {df["state"].nunique()}')

# District names are not unique nationwide (e.g. Aurangabad exists in both
# Bihar and Maharashtra), so count distinct (state, district) pairs instead
# of distinct district names. Rows missing either key are ignored, matching
# nunique()'s treatment of NaN.
district_count = len(df[['state', 'district']].dropna().drop_duplicates())
print(f'Districts Analyzed: {district_count}')

total_enrolments = df[config.METRIC_ENROLMENT_TOTAL].sum()
total_updates = df[config.METRIC_TOTAL_UPDATES].sum()
# Mean of the per-row ratio (not total updates / total enrolments).
mean_update_ratio = df[config.METRIC_UPDATE_TO_ENROLMENT_RATIO].mean()
print(f'\nTotal Enrolments: {total_enrolments:,.0f}')
print(f'Total Updates: {total_updates:,.0f}')
print(f'Average Update Ratio: {mean_update_ratio:.2f}')

---
## Key Findings

In [None]:
# Prints up to five headline findings. Findings 1-3 each depend on an
# optional column (asi_score, is_anomaly, inclusion_risk) and are skipped
# when that column is absent from df; findings 4-5 are always printed.
print('KEY FINDINGS')
print('-' * 40)

# District names are not unique nationwide (e.g. Aurangabad exists in both
# Bihar and Maharashtra). Per-district aggregates therefore key on the
# (state, district) pair so same-named districts are not merged.
district_keys = ['state', 'district']

# Finding 1: ASI
if 'asi_score' in df.columns:
    # A district counts as "high stress" when its mean ASI exceeds 60.
    district_asi = df.groupby(district_keys)['asi_score'].mean()
    high_stress = (district_asi > 60).sum()
    print(f'1. {high_stress} districts have ASI > 60 (high stress)')

# Finding 2: Anomalies
if 'is_anomaly' in df.columns:
    # Share of rows flagged, expressed as a percentage.
    anomaly_pct = df['is_anomaly'].mean() * 100
    print(f'2. {anomaly_pct:.1f}% of records flagged as anomalies')

# Finding 3: Inclusion risk
if 'inclusion_risk' in df.columns:
    # A district is at risk if any of its rows carries a truthy flag.
    risk_districts = df.groupby(district_keys)['inclusion_risk'].any().sum()
    print(f'3. {risk_districts} districts at inclusion risk')

# Finding 4: Top performers
enrolment_by_state = df.groupby('state')[config.METRIC_ENROLMENT_TOTAL].sum()
top_state = enrolment_by_state.idxmax()
print(f'4. Top state by enrolment: {top_state}')

# Finding 5: Update pressure
# Percentage of rows whose update-to-enrolment ratio exceeds 1.5.
high_ratio = (df[config.METRIC_UPDATE_TO_ENROLMENT_RATIO] > 1.5).mean() * 100
print(f'5. {high_ratio:.1f}% of records have update ratio > 1.5')

---
## Visualizations

In [None]:
# Trend visualization
# Sum the enrolment metric per date and draw it as a filled line chart.
daily = df.groupby('date')[config.METRIC_ENROLMENT_TOTAL].sum()
trend_dates = daily.index
trend_totals = daily.values

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(trend_dates, trend_totals, 'b-', linewidth=1)
ax.fill_between(trend_dates, trend_totals, alpha=0.3)

# Labels and title.
ax.set_xlabel('Date')
ax.set_ylabel('Enrolments')
ax.set_title(
    'National Daily Enrolment Trend',
    fontsize=14,
    fontweight='bold',
)

plt.tight_layout()
plt.show()

In [None]:
# State comparison
# Total the enrolment metric per state, keep the ten largest, and sort
# ascending so the biggest bar is drawn at the top of the horizontal chart.
enrolment_by_state = df.groupby('state')[config.METRIC_ENROLMENT_TOTAL].sum()
state_totals = enrolment_by_state.nlargest(10).sort_values()

fig, ax = plt.subplots(figsize=(10, 6))
state_totals.plot(kind='barh', ax=ax, color='steelblue')
ax.set_xlabel('Total Enrolments')
ax.set_title(
    'Top 10 States by Total Enrolment',
    fontsize=14,
    fontweight='bold',
)

plt.tight_layout()
plt.show()

In [None]:
# ASI distribution
# Histogram of the mean ASI score per district, with the threshold of 60
# marked as a dashed red line. Skipped when df has no asi_score column.
if 'asi_score' in df.columns:
    # District names are not unique nationwide (e.g. Aurangabad exists in
    # both Bihar and Maharashtra); group on (state, district) so same-named
    # districts are not averaged together into a single histogram entry.
    asi_by_district = df.groupby(['state', 'district'])['asi_score'].mean()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(asi_by_district, bins=30, edgecolor='white', alpha=0.7)
    ax.axvline(60, color='red', linestyle='--', label='Threshold=60')
    ax.set_title('ASI Score Distribution by District', fontsize=14, fontweight='bold')
    ax.set_xlabel('ASI Score')
    ax.set_ylabel('Number of Districts')
    ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Top districts table
# Build the table with the project helper (top_n=15), then render it with
# the notebook's display() under a short heading.
summary = generate_summary_table(
    df,
    top_n=15,
)
summary_heading = '\nTop 15 Districts Summary:'
print(summary_heading)
display(summary)

---
## Policy Recommendations

In [None]:
# Static text: the six policy recommendations, printed verbatim beneath a
# banner. Nothing here is computed from df.
recommendations_text = '''
1. RESOURCE ALLOCATION: Focus resources on high-ASI districts
   identified in this analysis.

2. INCLUSION INITIATIVES: Implement targeted outreach in
   districts with low enrolment velocity and zero-enrolment days.

3. SERVICE CAPACITY: Increase service capacity in districts
   with high update-to-enrolment ratios (>1.5).

4. MONITORING: Deploy continuous monitoring for districts
   flagged as anomalous to prevent service disruptions.

5. SATURATION MANAGEMENT: Develop differentiated strategies
   for saturated vs. high-growth districts.

6. FORECASTING: Use 30-day forecasts for proactive capacity
   planning at state and district levels.
'''
print('POLICY RECOMMENDATIONS', '=' * 60, recommendations_text, sep='\n')

---
## Methodology Appendix

In [None]:
# Static text: methodology appendix (data sources, derived metrics, models
# and the ASI formula), printed verbatim beneath a banner.
# NOTE(review): the column names listed under "Enrolment data" carry a
# demo_ prefix while those under "Demographic updates" carry none, and
# bio_age_5 differs in shape from the other age-band names - confirm these
# against the preprocessing code before publishing.
methodology_text = '''
DATA SOURCES:
- Enrolment data: demo_age_5_17, demo_age_17_
- Demographic updates: age_0_5, age_5_17, age_18_greater
- Biometric updates: bio_age_5, bio_age_17_

DERIVED METRICS:
- enrolment_total = demo_age_5_17 + demo_age_17_
- total_updates = demographic_updates_total + biometric_updates_total
- update_to_enrolment_ratio = total_updates / max(enrolment_total, 1)
- enrolment_velocity = diff(enrolment).rolling(7).mean()
- enrolment_volatility = enrolment.rolling(7).std()

MODELS:
- Anomaly Detection: IsolationForest (contamination=0.05)
- Forecasting: Facebook Prophet (horizon=30 days)

ASI FORMULA:
ASI = (volatility*0.30 + ratio*0.30 + anomaly*0.25 + forecast*0.15) * 100
'''
print('METHODOLOGY', '-' * 40, methodology_text, sep='\n')

---
## Code Appendix

In [None]:
# Source code for key functions
# Banner first, then the source text of calculate_asi as returned by
# inspect.getsource.
print('KEY FUNCTION: calculate_asi', '=' * 60, sep='\n')
asi_source = inspect.getsource(calculate_asi)
print(asi_source)

In [None]:
# Banner first, then the source text of detect_inclusion_risk as returned by
# inspect.getsource.
print('KEY FUNCTION: detect_inclusion_risk', '=' * 60, sep='\n')
inclusion_risk_source = inspect.getsource(detect_inclusion_risk)
print(inclusion_risk_source)

In [None]:
# Banner first, then the source text of the AnomalyDetector.predict method
# as returned by inspect.getsource.
print('KEY CLASS: AnomalyDetector.predict', '=' * 60, sep='\n')
predict_source = inspect.getsource(AnomalyDetector.predict)
print(predict_source)

In [None]:
# Banner first, then the source text of the EnrolmentForecaster.forecast
# method as returned by inspect.getsource.
print('KEY CLASS: EnrolmentForecaster.forecast', '=' * 60, sep='\n')
forecast_source = inspect.getsource(EnrolmentForecaster.forecast)
print(forecast_source)

---
## Generate PDF Report

In [None]:
# Generate PDF
# Hand the full dataset to the project's PDF generator and report the value
# it returns as the saved location.
report_title = 'Project Vande Analytics Report'
pdf_path = generate_pdf_report(df, title=report_title)
print(f'\nPDF report saved to: {pdf_path}')

In [None]:
# Closing banner with the local wall-clock time at which this cell ran.
footer_rule = '=' * 60
print(f'\n{footer_rule}')
print('REPORT COMPLETE')
print(footer_rule)
generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f'Generated: {generated_at}')