# üáÆüá≥ UIDAI Aadhaar Insights - Data Exploration

**Problem Statement**: Unlocking Societal Trends in Aadhaar Enrolment and Updates

This notebook covers:
1. Loading all datasets (Biometric, Demographic, Enrolment)
2. Data cleaning and preparation
3. Initial statistics and exploration
4. First visualizations

## 1. Setup & Imports

In [None]:
# Core imports
import pandas as pd
import numpy as np
import os
from pathlib import Path
from glob import glob

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Settings
# Show every column when a frame is displayed; cap displayed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# The matplotlib style is applied before the seaborn palette; keep this order.
# NOTE(review): the 'seaborn-v0_8-*' style names need a recent matplotlib
# (3.6 or newer, as far as I know) — confirm the environment's version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Paths
# All dataset folders are resolved relative to the current working directory.
BASE_PATH = Path('.')
BIOMETRIC_PATH = BASE_PATH / 'api_data_aadhar_biometric'
DEMOGRAPHIC_PATH = BASE_PATH / 'api_data_aadhar_demographic'
ENROLMENT_PATH = BASE_PATH / 'api_data_aadhar_enrolment'

print("‚úÖ Libraries loaded successfully!")

## 2. Load All Datasets

In [None]:
def load_all_csvs(folder_path, dataset_name):
    """Load and combine all CSV files from a folder.

    Parameters
    ----------
    folder_path : str or pathlib.Path
        Directory that directly contains the ``*.csv`` files.
    dataset_name : str
        Label used only in the progress messages.

    Returns
    -------
    pandas.DataFrame
        Rows of every CSV, concatenated in sorted file-name order with a
        fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If the folder contains no ``*.csv`` files (wrong working directory,
        data not downloaded, ...).
    """
    # Accept plain strings as well as Path objects.
    folder_path = Path(folder_path)
    csv_files = sorted(glob(str(folder_path / '*.csv')))
    print(f"\nüìÇ Loading {dataset_name}: Found {len(csv_files)} files")

    # pd.concat() on an empty list fails with an opaque "No objects to
    # concatenate"; fail early with a message that names the folder instead.
    if not csv_files:
        raise FileNotFoundError(
            f"No CSV files found for the {dataset_name} dataset in '{folder_path}'"
        )

    dfs = []
    for i, file in enumerate(csv_files, start=1):
        print(f"   Loading file {i}/{len(csv_files)}: {os.path.basename(file)}")
        dfs.append(pd.read_csv(file))

    combined = pd.concat(dfs, ignore_index=True)
    print(f"   ‚úÖ Total rows: {len(combined):,}")
    return combined

# Read the three dataset folders, framed by banner lines
print("=" * 60, "üöÄ LOADING AADHAAR DATASETS", "=" * 60, sep="\n")

df_biometric = load_all_csvs(BIOMETRIC_PATH, "Biometric")
df_demographic = load_all_csvs(DEMOGRAPHIC_PATH, "Demographic")
df_enrolment = load_all_csvs(ENROLMENT_PATH, "Enrolment")

# Closing banner with the row count summed over all three frames
print(
    "",
    "=" * 60,
    f"üìä TOTAL RECORDS LOADED: {sum(len(frame) for frame in (df_biometric, df_demographic, df_enrolment)):,}",
    "=" * 60,
    sep="\n",
)

## 3. Explore Dataset Structures

In [None]:
# Biometric dataset at a glance: dimensions, column names, then the first rows
print("=" * 60, "üìã BIOMETRIC DATASET", "=" * 60, sep="\n")
print(
    f"Shape: {df_biometric.shape}",
    f"Columns: {list(df_biometric.columns)}",
    sep="\n",
)
print("\nSample data:")
# Bare last expression so the notebook renders the frame as a table
df_biometric.head()

In [None]:
# Demographic dataset at a glance: dimensions, column names, then the first rows
print("=" * 60, "üìã DEMOGRAPHIC DATASET", "=" * 60, sep="\n")
print(
    f"Shape: {df_demographic.shape}",
    f"Columns: {list(df_demographic.columns)}",
    sep="\n",
)
print("\nSample data:")
# Bare last expression so the notebook renders the frame as a table
df_demographic.head()

In [None]:
# Enrolment dataset at a glance: dimensions, column names, then the first rows
print("=" * 60, "üìã ENROLMENT DATASET", "=" * 60, sep="\n")
print(
    f"Shape: {df_enrolment.shape}",
    f"Columns: {list(df_enrolment.columns)}",
    sep="\n",
)
print("\nSample data:")
# Bare last expression so the notebook renders the frame as a table
df_enrolment.head()

## 4. Data Cleaning & Preparation

In [None]:
def clean_dataset(df, name):
    """Clean and prepare dataset.

    Works on a copy, so ``df`` itself is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least ``date`` (``DD-MM-YYYY`` text), ``state``,
        ``district`` and ``pincode`` columns.
    name : str
        Label used only in the progress messages.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``date`` parsed to datetime, the derived columns
        ``day_of_week``, ``day_num``, ``week`` and ``month``, trimmed and
        title-cased ``state``/``district``, and ``pincode`` as zero-padded
        six-character text (missing pincodes stay missing).

    Raises
    ------
    ValueError
        If a ``date`` value does not match ``%d-%m-%Y``.
    """
    print(f"\nüßπ Cleaning {name} dataset...")

    # Create a copy
    df_clean = df.copy()

    # Convert date to datetime
    df_clean['date'] = pd.to_datetime(df_clean['date'], format='%d-%m-%Y')

    # Add useful time columns
    df_clean['day_of_week'] = df_clean['date'].dt.day_name()
    df_clean['day_num'] = df_clean['date'].dt.dayofweek  # Monday == 0
    df_clean['week'] = df_clean['date'].dt.isocalendar().week
    df_clean['month'] = df_clean['date'].dt.month

    # Clean state names (standardize)
    df_clean['state'] = df_clean['state'].str.strip().str.title()
    df_clean['district'] = df_clean['district'].str.strip().str.title()

    # Convert pincode to zero-padded text.
    raw_pincode = df_clean['pincode']
    has_pincode = raw_pincode.notna()
    pincode_text = raw_pincode.astype(str)
    if pd.api.types.is_float_dtype(raw_pincode):
        # A numeric column holding NaN is read as float, so 110001 would be
        # rendered as '110001.0'; drop the artificial fractional part.
        pincode_text = pincode_text.str.replace(r'\.0$', '', regex=True)
    # Keep missing pincodes missing instead of turning them into the text
    # '000nan', which would otherwise be counted as a real pincode.
    df_clean['pincode'] = pincode_text.str.zfill(6).where(has_pincode)

    print(f"   ‚úÖ Cleaned {len(df_clean):,} rows")
    return df_clean

# Run the cleaning step on each raw frame, in the same order as they were loaded
df_biometric_clean, df_demographic_clean, df_enrolment_clean = [
    clean_dataset(raw_frame, label)
    for raw_frame, label in (
        (df_biometric, "Biometric"),
        (df_demographic, "Demographic"),
        (df_enrolment, "Enrolment"),
    )
]

print("\n‚úÖ All datasets cleaned!")

In [None]:
# Per-row totals across the age-bucket columns of each cleaned dataset

# Biometric: the two age buckets
df_biometric_clean['total_bio'] = (
    df_biometric_clean['bio_age_5_17'] + df_biometric_clean['bio_age_17_']
)

# Demographic: the two age buckets
df_demographic_clean['total_demo'] = (
    df_demographic_clean['demo_age_5_17'] + df_demographic_clean['demo_age_17_']
)

# Enrolment: the three age buckets
df_enrolment_clean['total_enrol'] = (
    df_enrolment_clean['age_0_5']
    + df_enrolment_clean['age_5_17']
    + df_enrolment_clean['age_18_greater']
)

print("‚úÖ Added total count columns")
# Grand total of each new column, one line per dataset
for label, frame, total_col in (
    ("\nBiometric total authentications", df_biometric_clean, 'total_bio'),
    ("Demographic total updates", df_demographic_clean, 'total_demo'),
    ("Enrolment total", df_enrolment_clean, 'total_enrol'),
):
    print(f"{label}: {frame[total_col].sum():,}")

## 5. Basic Statistics

In [None]:
# Date range and geographic coverage
print("="*60)
print("üìÖ DATE RANGE")
print("="*60)
print(f"Biometric:   {df_biometric_clean['date'].min().date()} to {df_biometric_clean['date'].max().date()}")
print(f"Demographic: {df_demographic_clean['date'].min().date()} to {df_demographic_clean['date'].max().date()}")
print(f"Enrolment:   {df_enrolment_clean['date'].min().date()} to {df_enrolment_clean['date'].max().date()}")

# Geographic figures below come from the enrolment dataset only.
# A district is identified by its (state, district) pair, the same key the
# district ranking groups by: district names can repeat across states (e.g.
# Aurangabad exists in both Bihar and Maharashtra), so counting names alone
# would merge distinct districts and under-report the total.
n_districts = len(df_enrolment_clean[['state', 'district']].dropna().drop_duplicates())

print("\n" + "="*60)
print("üó∫Ô∏è  GEOGRAPHIC COVERAGE")
print("="*60)
print(f"Unique States: {df_enrolment_clean['state'].nunique()}")
print(f"Unique Districts: {n_districts}")
print(f"Unique Pincodes: {df_enrolment_clean['pincode'].nunique()}")

## 6. Top States & Districts Analysis

In [None]:
# Top 10 States by Enrolment
state_totals = df_enrolment_clean.groupby('state')['total_enrol'].sum()
top_states_enrol = state_totals.sort_values(ascending=False).head(10)

# Horizontal bar chart: one bar per state, shaded by its total
fig = px.bar(
    x=top_states_enrol.values,
    y=top_states_enrol.index,
    color=top_states_enrol.values,
    color_continuous_scale='Viridis',
    orientation='h',
    labels=dict(x='Total Enrolments', y='State'),
    title='üèÜ Top 10 States by Aadhaar Enrolments',
)
# Order the state axis by bar total rather than by input order
fig.update_layout(
    height=500,
    showlegend=False,
    yaxis=dict(categoryorder='total ascending'),
)
fig.show()

In [None]:
# Top 10 Districts by Enrolment
# Group on (state, district) so same-named districts in different states stay apart
district_totals = df_enrolment_clean.groupby(['state', 'district'])['total_enrol'].sum()
top_districts = district_totals.sort_values(ascending=False).head(10).reset_index()

# Axis label of the form "District, State"
top_districts['location'] = top_districts['district'] + ', ' + top_districts['state']

fig = px.bar(
    top_districts,
    x='total_enrol',
    y='location',
    color='total_enrol',
    color_continuous_scale='Plasma',
    orientation='h',
    labels=dict(total_enrol='Total Enrolments', location='District'),
    title='üèÜ Top 10 Districts by Aadhaar Enrolments',
)
# Order the district axis by bar total rather than by input order
fig.update_layout(
    height=500,
    showlegend=False,
    yaxis=dict(categoryorder='total ascending'),
)
fig.show()

## 7. Time-based Analysis

In [None]:
# Daily trend for all three datasets: one total per calendar date
daily_bio = df_biometric_clean.groupby('date')['total_bio'].sum().reset_index()
daily_demo = df_demographic_clean.groupby('date')['total_demo'].sum().reset_index()
daily_enrol = df_enrolment_clean.groupby('date')['total_enrol'].sum().reset_index()

# Three stacked panels, one per dataset
fig = make_subplots(
    rows=3,
    cols=1,
    vertical_spacing=0.1,
    subplot_titles=(
        'üìä Daily Biometric Authentications',
        'üìù Daily Demographic Updates',
        'üìã Daily Enrolments',
    ),
)

# (daily frame, value column, trace name, line colour) for panels 1..3
trace_specs = [
    (daily_bio, 'total_bio', 'Biometric', '#2ecc71'),
    (daily_demo, 'total_demo', 'Demographic', '#3498db'),
    (daily_enrol, 'total_enrol', 'Enrolment', '#e74c3c'),
]
for row_number, (daily, value_col, trace_name, colour) in enumerate(trace_specs, start=1):
    fig.add_trace(
        go.Scatter(
            x=daily['date'],
            y=daily[value_col],
            mode='lines+markers',
            name=trace_name,
            line=dict(color=colour),
        ),
        row=row_number,
        col=1,
    )

fig.update_layout(
    title_text='üìÖ Daily Trends - All Aadhaar Activities',
    height=800,
    showlegend=False,
)
fig.show()

In [None]:
# Day of Week Analysis
dow_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# Sum enrolments per weekday name, then arrange the rows Monday..Sunday
enrol_by_day_name = df_enrolment_clean.groupby('day_of_week')['total_enrol'].sum()
dow_enrol = enrol_by_day_name.reindex(dow_order)

fig = px.bar(
    x=dow_enrol.index,
    y=dow_enrol.values,
    color=dow_enrol.values,
    color_continuous_scale='RdYlGn',
    labels=dict(x='Day', y='Total Enrolments'),
    title='üìÖ Enrolments by Day of Week',
)
fig.update_layout(showlegend=False)
fig.show()

print("\nüîç Key Insight: Check which days have highest/lowest activity for resource planning")

## 8. Age Group Analysis

In [None]:
# Age distribution in Enrolments: display label -> column total
age_totals = {
    label: df_enrolment_clean[column].sum()
    for label, column in (
        ('Age 0-5 (Infants)', 'age_0_5'),
        ('Age 5-17 (Children)', 'age_5_17'),
        ('Age 18+ (Adults)', 'age_18_greater'),
    )
}

# Donut chart of the three age-group totals
fig = px.pie(
    names=list(age_totals.keys()),
    values=list(age_totals.values()),
    hole=0.4,
    color_discrete_sequence=px.colors.qualitative.Set2,
    title='üë• Enrolment Distribution by Age Group',
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

# Same figures as plain text
print("\nüìä Total by Age Group:")
for age, count in age_totals.items():
    print(f"   {age}: {count:,}")

In [None]:
# Age distribution by State (Top 10 states)
# Relies on top_states_enrol from the state ranking cell
top_10_states = list(top_states_enrol.index)
age_columns = ['age_0_5', 'age_5_17', 'age_18_greater']

# Keep only rows of the top-10 states, then total each age bucket per state
in_top_10 = df_enrolment_clean['state'].isin(top_10_states)
state_age = (
    df_enrolment_clean[in_top_10]
    .groupby('state')[age_columns]
    .sum()
    .reset_index()
)

# Grouped bars: one cluster per state, one bar per age bucket
fig = px.bar(
    state_age,
    x='state',
    y=age_columns,
    barmode='group',
    labels=dict(value='Count', state='State', variable='Age Group'),
    title='üë• Age Group Distribution by Top 10 States',
)
fig.update_layout(legend_title='Age Group')
fig.show()

## 9. Save Cleaned Data for Next Phases

In [None]:
# Make sure the output folder exists (no error if it is already there)
OUTPUT_PATH = BASE_PATH / 'processed_data'
OUTPUT_PATH.mkdir(exist_ok=True)

# Write each cleaned frame as parquet (faster and smaller than CSV)
for parquet_name, clean_frame in (
    ('biometric_clean.parquet', df_biometric_clean),
    ('demographic_clean.parquet', df_demographic_clean),
    ('enrolment_clean.parquet', df_enrolment_clean),
):
    clean_frame.to_parquet(OUTPUT_PATH / parquet_name, index=False)

# Report what was written and how many rows each file holds
print("‚úÖ Cleaned datasets saved to 'processed_data/' folder")
print(f"   üìÅ biometric_clean.parquet  ({len(df_biometric_clean):,} rows)")
print(f"   üìÅ demographic_clean.parquet ({len(df_demographic_clean):,} rows)")
print(f"   üìÅ enrolment_clean.parquet  ({len(df_enrolment_clean):,} rows)")

## 10. Summary & Key Findings

In [None]:
# Create summary table

# A district is identified by its (state, district) pair, the same key the
# district ranking groups by: district names can repeat across states (e.g.
# Aurangabad exists in both Bihar and Maharashtra), so counting names alone
# would merge distinct districts and under-report the total.
districts_covered = len(df_enrolment_clean[['state', 'district']].dropna().drop_duplicates())

# Date range and geographic coverage are taken from the enrolment dataset only.
summary = {
    'Metric': [
        'Total Records (All datasets)',
        'Biometric Authentications',
        'Demographic Updates', 
        'New Enrolments',
        'Date Range',
        'States Covered',
        'Districts Covered',
        'Pincodes Covered'
    ],
    'Value': [
        f"{len(df_biometric_clean) + len(df_demographic_clean) + len(df_enrolment_clean):,}",
        f"{df_biometric_clean['total_bio'].sum():,}",
        f"{df_demographic_clean['total_demo'].sum():,}",
        f"{df_enrolment_clean['total_enrol'].sum():,}",
        f"{df_enrolment_clean['date'].min().date()} to {df_enrolment_clean['date'].max().date()}",
        f"{df_enrolment_clean['state'].nunique()}",
        f"{districts_covered}",
        f"{df_enrolment_clean['pincode'].nunique()}"
    ]
}

summary_df = pd.DataFrame(summary)
print("="*60)
print("üìä PHASE 1 SUMMARY")
print("="*60)
# to_string() keeps the table as plain text, without the index column
print(summary_df.to_string(index=False))

print("\n" + "="*60)
print("‚úÖ PHASE 1 COMPLETE!")
print("="*60)
print("""
What we accomplished:
‚úÖ Loaded all CSV files (~5M records)
‚úÖ Cleaned and prepared data
‚úÖ Added useful columns (day_of_week, totals, etc.)
‚úÖ Created initial visualizations
‚úÖ Saved cleaned data for next phases

Next: Phase 2 - Deep Pattern Analysis
- Temporal patterns (trends over time)
- Geographic patterns (state/district analysis)
- Anomaly detection
- Correlation analysis
""")