# üáÆüá≥ UIDAI Identity Lifecycle Health Analysis

## Team UIDAI_1545 | IET Lucknow

---

### 🎯 Problem Statement

> **"Building India's Identity Lifecycle Health Index: Predicting Aadhaar Data Staleness to Prevent DBT Leakages and Authentication Failures"**

---

### 📊 Metrics We'll Calculate

| Metric | Purpose |
|--------|--------|
| **IFI** - Identity Freshness Index | Measures data staleness risk |
| **CLCR** - Child Lifecycle Capture Rate | Tracks mandatory child updates |
| **TAES** - Temporal Access Equity Score | Measures weekend access equity |
| **UCR** - Update Completeness Ratio | Geographic service coverage |
| **AAUP** - Age-Adjusted Update Propensity | Population-normalized comparison |

## 1. Setup & Configuration

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in the notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11
plt.rcParams['figure.dpi'] = 100

# Load configuration.
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of with the locale default (e.g. cp1252 on Windows).
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo two of the loaded thresholds as a quick sanity check
print("‚úÖ Configuration loaded")
print(f"  IFI Critical Threshold: {config['analysis']['ifi_bands']['critical']}")
print(f"  TAES Acceptable: {config['analysis']['taes_acceptable']}")

## 2. Data Loading & Cleaning

In [None]:
# Import custom modules
import sys
sys.path.insert(0, '.')
from src.state_mapping import standardize_dataframe_states
from src.metrics import calculate_ifi, calculate_clcr, calculate_taes, calculate_lifecycle_gap

# Load all datasets
print("üìÅ Loading datasets...")


def _list_csv_files(folder):
    """Return the ``*.csv`` files directly inside *folder*, sorted by path.

    Sorting makes the concatenation order (and therefore the row order of the
    combined frame) independent of the filesystem's listing order.

    Raises:
        FileNotFoundError: if no CSV file is found, so a wrong or empty data
            directory is reported by name instead of surfacing later as
            pandas' generic "No objects to concatenate" error.
    """
    files = sorted(folder.glob('*.csv'))
    if not files:
        raise FileNotFoundError(f"No CSV files found in '{folder}'")
    return files


# Each raw dataset is a folder of CSV parts; malformed rows are skipped
# (on_bad_lines='skip') and the parts are stacked into a single frame.

# Enrolment
enrol_path = Path('data/raw/Enrolment')
enrol_files = _list_csv_files(enrol_path)
enrol_dfs = [pd.read_csv(f, on_bad_lines='skip') for f in enrol_files]
enrolment_df = pd.concat(enrol_dfs, ignore_index=True)
print(f"  ‚úì Enrolment: {len(enrolment_df):,} rows")

# Demographic
demo_path = Path('data/raw/Demographic')
demo_files = _list_csv_files(demo_path)
demo_dfs = [pd.read_csv(f, on_bad_lines='skip') for f in demo_files]
demographic_df = pd.concat(demo_dfs, ignore_index=True)
print(f"  ‚úì Demographic: {len(demographic_df):,} rows")

# Biometric
bio_path = Path('data/raw/Biometric')
bio_files = _list_csv_files(bio_path)
bio_dfs = [pd.read_csv(f, on_bad_lines='skip') for f in bio_files]
biometric_df = pd.concat(bio_dfs, ignore_index=True)
print(f"  ‚úì Biometric: {len(biometric_df):,} rows")

# Population (single external file; the printed count is its number of rows)
population_df = pd.read_csv('data/external/state_population.csv')
print(f"  ‚úì Population: {len(population_df)} states")

print(f"\nüìä Total Records: {len(enrolment_df) + len(demographic_df) + len(biometric_df):,}")

In [None]:
# Run each dataset's 'state' column through the project helper
# standardize_dataframe_states (imported from src.state_mapping).
print("üîß Standardizing state names...")

enrolment_df, demographic_df, biometric_df = (
    standardize_dataframe_states(frame, 'state')
    for frame in (enrolment_df, demographic_df, biometric_df)
)

# Report how many distinct state values remain in each dataset
for dataset_name, frame in (
    ('Enrolment', enrolment_df),
    ('Demographic', demographic_df),
    ('Biometric', biometric_df),
):
    print(f"  ‚úì Unique states in {dataset_name}: {frame['state'].nunique()}")

In [None]:
# Preprocess data: parse dates, add per-row totals, add calendar features
print("‚öôÔ∏è Preprocessing...")

# Dates are parsed as day-month-year; values that do not match become NaT
for frame in (enrolment_df, demographic_df, biometric_df):
    frame['date'] = pd.to_datetime(frame['date'], format='%d-%m-%Y', errors='coerce')

# Per-row totals across the age-band columns of each dataset
enrolment_df['total_enrolments'] = (
    enrolment_df['age_0_5']
    + enrolment_df['age_5_17']
    + enrolment_df['age_18_greater']
)
demographic_df['total_demo_updates'] = (
    demographic_df['demo_age_5_17'] + demographic_df['demo_age_17_']
)
biometric_df['total_bio_updates'] = (
    biometric_df['bio_age_5_17'] + biometric_df['bio_age_17_']
)

# Calendar features (enrolment data only): weekday name and a
# Saturday/Sunday flag (dayofweek 5 and 6)
enrol_dates = enrolment_df['date']
enrolment_df['weekday'] = enrol_dates.dt.day_name()
enrolment_df['is_weekend'] = enrol_dates.dt.dayofweek.ge(5)

print("‚úÖ Data preprocessed")
print(f"  Date range: {enrol_dates.min()} to {enrol_dates.max()}")

## 3. Data Overview

In [None]:
# One-row-per-dataset overview: record count, summed totals, distinct
# states and distinct districts
banner = "=" * 60
print(banner)
print("üìä DATA SUMMARY")
print(banner)

# Display name -> (frame, name of its per-row total column), in display order
dataset_totals = {
    'Enrolment': (enrolment_df, 'total_enrolments'),
    'Demographic': (demographic_df, 'total_demo_updates'),
    'Biometric': (biometric_df, 'total_bio_updates'),
}

summary_data = {
    'Dataset': list(dataset_totals),
    'Records': [len(frame) for frame, _ in dataset_totals.values()],
    'Total Count': [frame[total_col].sum() for frame, total_col in dataset_totals.values()],
    'States': [frame['state'].nunique() for frame, _ in dataset_totals.values()],
    'Districts': [frame['district'].nunique() for frame, _ in dataset_totals.values()],
}

summary_df = pd.DataFrame(summary_data)
# Thousands separators for readability; this turns both columns into strings
summary_df['Records'] = summary_df['Records'].map("{:,}".format)
summary_df['Total Count'] = summary_df['Total Count'].map("{:,.0f}".format)
display(summary_df)

---

## 4. Metric 1: Identity Freshness Index (IFI)

> **Question: Where is Aadhaar data most likely stale?**

```
IFI = (Demographic Updates + Biometric Updates) / Total Enrolments
```

In [None]:
# Calculate IFI by state:
#   IFI = (demographic updates + biometric updates) / total enrolments
enrol_state = enrolment_df.groupby('state')['total_enrolments'].sum().reset_index()
demo_state = demographic_df.groupby('state')['total_demo_updates'].sum().reset_index()
bio_state = biometric_df.groupby('state')['total_bio_updates'].sum().reset_index()

# Left joins keep every state that has enrolments; a state with no update
# rows gets 0 updates rather than being dropped.
ifi_df = enrol_state.merge(demo_state, on='state', how='left')
ifi_df = ifi_df.merge(bio_state, on='state', how='left')
ifi_df = ifi_df.fillna(0)

ifi_df['total_updates'] = ifi_df['total_demo_updates'] + ifi_df['total_bio_updates']
# Zero enrolments would divide by zero; such states are given IFI = 0
ifi_df['ifi'] = ifi_df['total_updates'] / ifi_df['total_enrolments'].replace(0, np.nan)
ifi_df['ifi'] = ifi_df['ifi'].fillna(0)

# Categorize into risk bands.
# pd.cut intervals are right-closed, so without include_lowest the first band
# is (0, 0.2] and a state with IFI exactly 0 would get no risk label at all.
# include_lowest=True makes the first band [0, 0.2].
ifi_df['ifi_risk'] = pd.cut(
    ifi_df['ifi'],
    bins=[0, 0.2, 0.4, 0.6, float('inf')],
    labels=['üî¥ Critical', 'üü° At Risk', 'üü¢ Healthy', 'üîµ Optimal'],
    include_lowest=True,
)

# Lowest (stalest) states first
ifi_df = ifi_df.sort_values('ifi', ascending=True)

# National average: ratio of the national sums, not the mean of state ratios
national_enrolments = ifi_df['total_enrolments'].sum()
national_ifi = ifi_df['total_updates'].sum() / national_enrolments if national_enrolments else 0.0
print(f"üìä National Average IFI: {national_ifi:.2f}")

In [None]:
# Visualization: IFI Rankings (first 25 rows of ifi_df, colored by risk band)
fig, ax = plt.subplots(figsize=(14, 10))

# Risk label -> bar color; keys must match the labels stored in 'ifi_risk'
colors = {
    'üî¥ Critical': '#dc3545',
    'üü° At Risk': '#ffc107',
    'üü¢ Healthy': '#28a745',
    'üîµ Optimal': '#007bff'
}

plot_data = ifi_df.head(25).copy()
# Rows without a recognised risk label fall back to grey
bar_colors = [colors.get(str(r), '#888') for r in plot_data['ifi_risk']]

bars = ax.barh(plot_data['state'], plot_data['ifi'], color=bar_colors)

# Value labels at the end of each bar. The padding is given in points, so the
# labels sit next to the bars whatever the scale of the IFI values is (a fixed
# offset in data units drifts far from the bars when values are small).
# Two decimals match the national-average label and resolve the 0.2-wide bands.
ax.bar_label(bars, fmt='%.2f', padding=4, fontsize=9)

ax.axvline(x=national_ifi, color='red', linestyle='--', linewidth=2, label=f'National Avg: {national_ifi:.2f}')
ax.set_xlabel('Identity Freshness Index (IFI)', fontweight='bold')
ax.set_ylabel('State', fontweight='bold')
ax.set_title('Which States Need Identity Refresh Campaigns?', fontsize=16, fontweight='bold')
ax.legend()

plt.tight_layout()
# savefig does not create missing directories
Path('visualizations').mkdir(parents=True, exist_ok=True)
plt.savefig('visualizations/ifi_rankings.png', dpi=300, bbox_inches='tight')
plt.show()

---

## 5. Metric 2: Child Lifecycle Capture Rate (CLCR)

> **Question: Are children getting their mandatory biometric updates?**

```
CLCR = Bio Updates (5-17) / (Enrolments 5-17 × 20%)
```

In [None]:
# Calculate CLCR:
#   CLCR = biometric updates (5-17) / (enrolments (5-17) * expected update rate)
enrol_child = enrolment_df.groupby('state')['age_5_17'].sum().reset_index()
bio_child = biometric_df.groupby('state')['bio_age_5_17'].sum().reset_index()

# Keep every state that has enrolments; missing biometric counts become 0
clcr_df = pd.merge(enrol_child, bio_child, on='state', how='left').fillna(0)

# Expected share of enrolled 5-17 year olds that should have updated (from config)
expected_rate = config['analysis']['expected_child_update_rate']
clcr_df['expected_updates'] = clcr_df['age_5_17'] * expected_rate

# A zero denominator is masked to NaN, and the resulting ratio is reported as 0
nonzero_expected = clcr_df['expected_updates'].where(clcr_df['expected_updates'] != 0)
clcr_df['clcr'] = clcr_df['bio_age_5_17'].div(nonzero_expected).fillna(0)

# States below target (CLCR < 1.0), worst first
below_target = clcr_df.loc[clcr_df['clcr'].lt(1.0)].sort_values('clcr')
print(f"‚ö†Ô∏è States below CLCR target: {len(below_target)}")
display(below_target[['state', 'age_5_17', 'bio_age_5_17', 'clcr']].head(10))

In [None]:
# Visualization: Child Lifecycle Gap (the 20 states with the lowest CLCR)
fig, ax = plt.subplots(figsize=(14, 8))

clcr_plot = clcr_df.sort_values('clcr').head(20).copy()
# Shortfall against the target of 1.0; states at or above target have gap 0
clcr_plot['gap'] = 1 - clcr_plot['clcr'].clip(upper=1)

# Red when below target, green otherwise
colors = ['#dc3545' if g > 0 else '#28a745' for g in clcr_plot['gap']]

# Bars are capped at 2.0 so one very large ratio cannot flatten the others
ax.barh(clcr_plot['state'], clcr_plot['clcr'].clip(upper=2), color=colors)
ax.axvline(x=1.0, color='black', linestyle='--', linewidth=2, label='Target (1.0)')

ax.set_xlabel('Child Lifecycle Capture Rate (CLCR)', fontweight='bold')
ax.set_ylabel('State', fontweight='bold')
ax.set_title('Are Children Getting Mandatory Biometric Updates?', fontsize=16, fontweight='bold')
ax.legend()

plt.tight_layout()
# savefig does not create missing directories
Path('visualizations').mkdir(parents=True, exist_ok=True)
plt.savefig('visualizations/clcr_gap.png', dpi=300, bbox_inches='tight')
plt.show()

---

## 6. Metric 3: Temporal Access Equity Score (TAES)

> **Question: Which states penalize working citizens with weekend service gaps?**

```
TAES = Weekend Daily Average / Weekday Daily Average
```

In [None]:
# Calculate TAES:
#   TAES = average daily enrolments on weekends / average on weekdays
# Start from one row per (state, date) with that day's total enrolments.
daily_enrol = (
    enrolment_df
    .groupby(['state', 'date', 'is_weekend'])['total_enrolments']
    .sum()
    .reset_index()
)

on_weekend = daily_enrol['is_weekend']

weekend_avg = (
    daily_enrol[on_weekend]
    .groupby('state')['total_enrolments']
    .mean()
    .rename('weekend_avg')
    .reset_index()
)

weekday_avg = (
    daily_enrol[~on_weekend]
    .groupby('state')['total_enrolments']
    .mean()
    .rename('weekday_avg')
    .reset_index()
)

# Outer join keeps states seen on only one kind of day; the missing side is 0
taes_df = pd.merge(weekend_avg, weekday_avg, on='state', how='outer').fillna(0)

# A zero weekday average is masked to NaN and the ratio reported as 0;
# ratios are capped at 1.5
nonzero_weekday = taes_df['weekday_avg'].replace(0, np.nan)
taes_df['taes'] = (taes_df['weekend_avg'] / nonzero_weekday).fillna(0).clip(upper=1.5)
taes_df['weekend_drop'] = (1 - taes_df['taes']) * 100

# National TAES: ratio of the summed state averages (uncapped)
national_taes = taes_df['weekend_avg'].sum() / taes_df['weekday_avg'].sum()
print(f"üìä National TAES: {national_taes:.2f} ({(1-national_taes)*100:.0f}% weekend drop)")

In [None]:
# Visualization: TAES (the 20 states with the lowest score)
fig, ax = plt.subplots(figsize=(14, 8))

taes_plot = taes_df.sort_values('taes').head(20).copy()
# Acceptability threshold comes from config so chart and config stay in sync
threshold = config['analysis']['taes_acceptable']

# Red below the threshold, green at or above it
colors = ['#dc3545' if t < threshold else '#28a745' for t in taes_plot['taes']]

ax.barh(taes_plot['state'], taes_plot['taes'], color=colors)
ax.axvline(x=threshold, color='orange', linestyle='--', linewidth=2, 
           label=f'Acceptable Threshold ({threshold})')

ax.set_xlabel('Temporal Access Equity Score (TAES)', fontweight='bold')
ax.set_ylabel('State', fontweight='bold')
ax.set_title('Which States Penalize Working Citizens with Weekend Gaps?', fontsize=16, fontweight='bold')
ax.legend()

plt.tight_layout()
# savefig does not create missing directories
Path('visualizations').mkdir(parents=True, exist_ok=True)
plt.savefig('visualizations/taes_weekend.png', dpi=300, bbox_inches='tight')
plt.show()

---

## 7. Trivariate Analysis: Lifecycle Gap

> **Question: In states with high child enrolments, are we seeing proportional biometric updates?**

This is the **flagship differentiation analysis** that most teams will miss.

In [None]:
# Calculate Lifecycle Gap:
#   gap = child (5-17) share of enrolments - child (5-17) share of biometric updates
enrol_agg = enrolment_df.groupby('state').agg({
    'age_5_17': 'sum',
    'age_18_greater': 'sum',
    'total_enrolments': 'sum'
}).reset_index()

# Same zero-denominator handling as child_bio_share below: a state with no
# enrolments gets a share of 0 instead of NaN.
enrol_agg['child_share'] = enrol_agg['age_5_17'] / enrol_agg['total_enrolments'].replace(0, np.nan)
enrol_agg['child_share'] = enrol_agg['child_share'].fillna(0)

bio_agg = biometric_df.groupby('state').agg({
    'bio_age_5_17': 'sum',
    'bio_age_17_': 'sum',
    'total_bio_updates': 'sum'
}).reset_index()

bio_agg['child_bio_share'] = bio_agg['bio_age_5_17'] / bio_agg['total_bio_updates'].replace(0, np.nan)
bio_agg['child_bio_share'] = bio_agg['child_bio_share'].fillna(0)

# Left join: a state with enrolments but no biometric rows at all is the most
# extreme gap case, so it must stay in the analysis (with 0 biometric updates)
# rather than be dropped by an inner join.
lifecycle_df = enrol_agg.merge(bio_agg, on='state', how='left').fillna(0)
lifecycle_df['lifecycle_gap'] = lifecycle_df['child_share'] - lifecycle_df['child_bio_share']

# Identify problem states: children are more than 35% of enrolments AND their
# share of biometric updates trails that by more than 5 percentage points
lifecycle_df['quadrant'] = 'Normal'
lifecycle_df.loc[
    (lifecycle_df['child_share'] > 0.35) & (lifecycle_df['lifecycle_gap'] > 0.05), 
    'quadrant'
] = '‚ö†Ô∏è LIFECYCLE GAP'

gap_states = lifecycle_df[lifecycle_df['quadrant'] == '‚ö†Ô∏è LIFECYCLE GAP']
print(f"üö® States with Lifecycle Gap: {len(gap_states)}")
display(gap_states[['state', 'child_share', 'child_bio_share', 'lifecycle_gap']])

In [None]:
# Visualization: Lifecycle Gap Scatter
# x = child share of enrolments, y = child share of biometric updates,
# marker size scales with the state's total enrolments.
fig, ax = plt.subplots(figsize=(12, 10))

# Red for flagged states, green for the rest
colors = ['#dc3545' if q == '‚ö†Ô∏è LIFECYCLE GAP' else '#28a745' for q in lifecycle_df['quadrant']]
# Marker area between 50 and 550 points^2, relative to the largest state
sizes = (lifecycle_df['total_enrolments'] / lifecycle_df['total_enrolments'].max() * 500) + 50

scatter = ax.scatter(
    lifecycle_df['child_share'], 
    lifecycle_df['child_bio_share'],
    c=colors,
    s=sizes,
    alpha=0.7,
    edgecolors='white',
    linewidth=1
)

# Diagonal (perfect balance). It spans at least 0..0.6 and is extended to the
# largest plotted share so it never stops short of the data. 0.6 is passed
# first so a NaN maximum (empty frame) leaves the default in place.
diag_end = max(0.6, lifecycle_df['child_share'].max(), lifecycle_df['child_bio_share'].max())
ax.plot([0, diag_end], [0, diag_end], 'k--', alpha=0.5, label='Perfect Balance')

# Annotate gap states
for _, row in gap_states.iterrows():
    ax.annotate(row['state'], (row['child_share'], row['child_bio_share']),
                fontsize=8, alpha=0.8)

ax.set_xlabel('Child Share of Enrolments', fontweight='bold')
ax.set_ylabel('Child Share of Biometric Updates', fontweight='bold')
ax.set_title('Where Are Lifecycle Transitions Failing?\n(States below diagonal have lifecycle gap)', 
             fontsize=14, fontweight='bold')
ax.legend()

plt.tight_layout()
# savefig does not create missing directories
Path('visualizations').mkdir(parents=True, exist_ok=True)
plt.savefig('visualizations/lifecycle_gap.png', dpi=300, bbox_inches='tight')
plt.show()

---

## 8. Composite State Rankings

In [None]:
# Merge all metrics into one row per state (states come from ifi_df)
composite = ifi_df[['state', 'ifi', 'ifi_risk', 'total_enrolments']].copy()
composite = composite.merge(clcr_df[['state', 'clcr']], on='state', how='left')
composite = composite.merge(taes_df[['state', 'taes']], on='state', how='left')

# Fill gaps in the numeric columns only. 'ifi_risk' is a categorical column
# (it comes from pd.cut) and 0 is not one of its categories, so a frame-wide
# fillna(0) can raise on it; an unlabelled risk band is left as missing.
numeric_cols = ['ifi', 'total_enrolments', 'clcr', 'taes']
composite[numeric_cols] = composite[numeric_cols].fillna(0)

# Calculate composite score: each metric is capped at 1 and weighted
# 40% IFI, 30% CLCR, 30% TAES, so the score lies in [0, 1]
composite['composite'] = (
    composite['ifi'].clip(upper=1) * 0.40 +
    composite['clcr'].clip(upper=1) * 0.30 +
    composite['taes'].clip(upper=1) * 0.30
)

# Lowest scores (highest intervention priority) first
composite = composite.sort_values('composite', ascending=True)

print("="*70)
print("üö® PRIORITY INTERVENTION STATES (Lowest Composite Scores)")
print("="*70)
display(composite.head(15)[['state', 'composite', 'ifi', 'clcr', 'taes', 'total_enrolments']])

In [None]:
# Save results
# to_csv does not create missing directories
Path('data/processed').mkdir(parents=True, exist_ok=True)
composite.to_csv('data/processed/state_metrics_clean.csv', index=False)
print("‚úÖ Results saved to: data/processed/state_metrics_clean.csv")

---

## 9. Key Findings & Recommendations

In [None]:
# Executive summary: headline volumes, national metrics, number of states
# needing intervention, and the recommendation list
print("="*70)
print("üìä EXECUTIVE SUMMARY")
print("="*70)

print(f"\nüìà Total Records Analyzed: {len(enrolment_df) + len(demographic_df) + len(biometric_df):,}")
print(f"üìà Total Enrolments: {enrolment_df['total_enrolments'].sum():,}")
print(f"üìà Total Demographic Updates: {demographic_df['total_demo_updates'].sum():,}")
print(f"üìà Total Biometric Updates: {biometric_df['total_bio_updates'].sum():,}")

print(f"\nüéØ National IFI: {national_ifi:.2f}")
print(f"üéØ National TAES: {national_taes:.2f} ({(1-national_taes)*100:.0f}% weekend drop)")

# States whose composite score is below 0.3
critical_states = composite[composite['composite'] < 0.3]
print(f"\n‚ö†Ô∏è States Requiring Immediate Intervention: {len(critical_states)}")

# The TAES cut-off quoted in recommendation 3 is read from config (the same
# value the TAES chart uses) so the text cannot drift from the analysis.
taes_threshold = config['analysis']['taes_acceptable']

print("\n" + "="*70)
print("üéØ TOP RECOMMENDATIONS")
print("="*70)
print("\n1. Deploy mobile update camps in states with lowest IFI")
print("2. Partner with state education depts for school biometric drives")
print(f"3. Extend weekend hours in urban districts with TAES < {taes_threshold:.2f}")
print("4. Launch SMS awareness campaign for stale-data districts")
print("5. Create national Identity Health Dashboard for monitoring")

---

## 10. Export for Submission

In [None]:
print("üìÅ Saved Visualizations:")
for f in Path('visualizations').glob('*.png'):
    print(f"  ‚Ä¢ {f.name}")

print("\nüìÅ Saved Data:")
for f in Path('data/processed').glob('*.csv'):
    print(f"  ‚Ä¢ {f.name}")

print("\n‚úÖ Analysis complete. Ready for submission!")