# SECTION 1: DATA LOADING & SETUP

In [None]:
import zipfile
import pandas as pd

base_path = '/content/drive/MyDrive/'


def load_zipped_csv(zip_path):
    """Read every CSV member of a ZIP archive into one DataFrame.

    The archives ship their data as CSV members whose names carry a row-range
    suffix (e.g. ``..._0_500000.csv``), so an archive may hold several parts.
    All parts are read in sorted-name order and concatenated; an archive with a
    single CSV yields exactly that file's rows.

    Args:
        zip_path: Path to the ZIP archive.

    Returns:
        A DataFrame with the rows of all CSV members and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the archive contains no ``.csv`` member.
    """
    with zipfile.ZipFile(zip_path) as archive:
        csv_names = sorted(name for name in archive.namelist() if name.endswith('.csv'))
        if not csv_names:
            # Fail with a clear message instead of an IndexError from `[0]`.
            raise FileNotFoundError(f"No CSV file found inside {zip_path}")

        parts = []
        for csv_name in csv_names:
            print(f"    Found CSV inside: {csv_name}")
            with archive.open(csv_name) as member:
                parts.append(pd.read_csv(member))

    frame = pd.concat(parts, ignore_index=True)
    print(f"    ✓ Loaded: {frame.shape[0]:,} rows × {frame.shape[1]} columns\n")
    return frame


print("Loading datasets from ZIP files...\n")

# Load enrolment data
print("1/3 Loading enrolment data...")
enrol = load_zipped_csv(base_path + 'api_data_aadhar_enrolment.zip')

# Load demographic update data
print("2/3 Loading demographic update data...")
demo_update = load_zipped_csv(base_path + 'api_data_aadhar_demographic.zip')

# Load biometric update data
print("3/3 Loading biometric update data...")
bio_update = load_zipped_csv(base_path + 'api_data_aadhar_biometric.zip')

print("="*70)
print("🎉 ALL DATASETS LOADED SUCCESSFULLY!")
print("="*70)


Loading datasets from ZIP files...

1/3 Loading enrolment data...
    Found CSV inside: api_data_aadhar_enrolment/api_data_aadhar_enrolment_0_500000.csv
    ✓ Loaded: 500,000 rows × 7 columns

2/3 Loading demographic update data...
    Found CSV inside: api_data_aadhar_demographic/api_data_aadhar_demographic_0_500000.csv
    ✓ Loaded: 500,000 rows × 6 columns

3/3 Loading biometric update data...
    Found CSV inside: api_data_aadhar_biometric/api_data_aadhar_biometric_0_500000.csv
    ✓ Loaded: 500,000 rows × 6 columns

🎉 ALL DATASETS LOADED SUCCESSFULLY!


In [None]:
# Structural overview of the enrolment dataset: column names, a three-row
# preview rendered by the notebook, and the dtype pandas inferred per column.
rule = "=" * 70
print(rule, "DATASET 1: ENROLMENT DATA", rule, sep="\n")

print("\nColumns:", list(enrol.columns), sep="\n")

print("\nFirst 3 rows:")
display(enrol.iloc[:3])

print("\nData types:", enrol.dtypes, sep="\n")


DATASET 1: ENROLMENT DATA

Columns:
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

First 3 rows:


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12



Data types:
date              object
state             object
district          object
pincode            int64
age_0_5            int64
age_5_17           int64
age_18_greater     int64
dtype: object


In [None]:
# Structural overview of the demographic-update dataset: column names and a
# three-row preview rendered by the notebook.
rule = "=" * 70
print(rule, "DATASET 2: DEMOGRAPHIC UPDATE DATA", rule, sep="\n")

print("\nColumns:", list(demo_update.columns), sep="\n")

print("\nFirst 3 rows:")
display(demo_update.iloc[:3])


DATASET 2: DEMOGRAPHIC UPDATE DATA

Columns:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

First 3 rows:


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765


In [None]:
# Structural overview of the biometric-update dataset: column names and a
# three-row preview rendered by the notebook.
rule = "=" * 70
print(rule, "DATASET 3: BIOMETRIC UPDATE DATA", rule, sep="\n")

print("\nColumns:", list(bio_update.columns), sep="\n")

print("\nFirst 3 rows:")
display(bio_update.iloc[:3])


DATASET 3: BIOMETRIC UPDATE DATA

Columns:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

First 3 rows:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091


In [None]:

print("=== QUICK DATA QUALITY CHECK ===\n")

# The raw `date` columns hold DD-MM-YYYY text.
DATE_FORMAT = '%d-%m-%Y'


def _date_span(frame):
    """Return (earliest, latest, unparsed) for frame['date'].

    min()/max() on the raw strings compare text, which orders by day-of-month
    first (e.g. "01-04-2025" sorts before "02-03-2025"), so the column is
    parsed to datetimes before the extremes are taken.

    Args:
        frame: DataFrame with a `date` column of DD-MM-YYYY strings.

    Returns:
        Tuple of (earliest, latest) formatted as DD-MM-YYYY, or 'n/a' for both
        when no value parses, plus the count of non-null values that did not
        match the expected format.
    """
    parsed = pd.to_datetime(frame['date'], format=DATE_FORMAT, errors='coerce')
    # Values that were present but failed to parse (excludes genuine nulls).
    unparsed = int(parsed.isna().sum() - frame['date'].isna().sum())
    if parsed.notna().sum() == 0:
        return 'n/a', 'n/a', unparsed
    return parsed.min().strftime(DATE_FORMAT), parsed.max().strftime(DATE_FORMAT), unparsed


datasets = {
    'Enrolment': enrol,
    'Demographic': demo_update,
    'Biometric': bio_update,
}

# Check date ranges (chronological, not lexicographic)
print("Date ranges:")
for label, frame in datasets.items():
    earliest, latest, unparsed = _date_span(frame)
    note = f"  ({unparsed:,} unparseable dates ignored)" if unparsed else ""
    print(f"  {label}: {earliest} to {latest}{note}")

# Check for missing values (total null cells across all columns)
print(f"\nMissing values:")
for label, frame in datasets.items():
    print(f"  {label}: {frame.isnull().sum().sum()}")

# Check number of unique locations (enrolment data only)
# NOTE(review): the recorded run reports 54 distinct states, which looks higher
# than the number of Indian states/UTs — presumably spelling variants; confirm
# and normalise before aggregating by state.
print(f"\nGeographic coverage:")
print(f"  States: {enrol['state'].nunique()}")
print(f"  Districts: {enrol['district'].nunique()}")
print(f"  PIN codes: {enrol['pincode'].nunique()}")

# Grand totals: sum of every age-bucket count column in each dataset
print(f"\nTotal counts:")
print(f"  Total enrolments: {enrol[['age_0_5', 'age_5_17', 'age_18_greater']].sum().sum():,}")
print(f"  Total demo updates: {demo_update[['demo_age_5_17', 'demo_age_17_']].sum().sum():,}")
print(f"  Total bio updates: {bio_update[['bio_age_5_17', 'bio_age_17_']].sum().sum():,}")


=== QUICK DATA QUALITY CHECK ===

Date ranges:
  Enrolment: 01-04-2025 to 30-09-2025
  Demographic: 01-03-2025 to 31-10-2025
  Biometric: 01-03-2025 to 19-09-2025

Missing values:
  Enrolment: 0
  Demographic: 0
  Biometric: 0

Geographic coverage:
  States: 54
  Districts: 971
  PIN codes: 19302

Total counts:
  Total enrolments: 3,301,026
  Total demo updates: 14,295,026
  Total bio updates: 48,726,989


In [None]:

import os
import pickle

# Persist the three raw DataFrames together so a later session can restore
# them with a single pickle load instead of re-reading the ZIP archives.
# NOTE: only unpickle this file from a location you trust — pickle.load can
# execute arbitrary code embedded in the file.
save_path = '/content/drive/MyDrive/UIDAI_Hackathon_2026/'
os.makedirs(save_path, exist_ok=True)

# Save all dataframes
data_dict = {
    'enrol': enrol,
    'demo_update': demo_update,
    'bio_update': bio_update,
}

# Build the target path once so the file written and the path reported match.
pickle_file = os.path.join(save_path, 'datasets.pkl')
with open(pickle_file, 'wb') as f:
    pickle.dump(data_dict, f)

print(f"✓ Datasets saved to: {pickle_file}")


✓ Datasets saved to: /content/drive/MyDrive/UIDAI_Hackathon_2026/datasets.pkl
