In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os

RAW_PATH = "/content/drive/MyDrive/uidai_hackathon/data/raw"
os.listdir(RAW_PATH)


['api_data_aadhar_biometric_0_500000.csv',
 'api_data_aadhar_biometric_1500000_1861108.csv',
 'api_data_aadhar_biometric_1000000_1500000.csv',
 'api_data_aadhar_biometric_500000_1000000.csv',
 'api_data_aadhar_demographic_1000000_1500000.csv',
 'api_data_aadhar_demographic_2000000_2071700.csv',
 'api_data_aadhar_demographic_0_500000.csv',
 'api_data_aadhar_demographic_1500000_2000000.csv',
 'api_data_aadhar_demographic_500000_1000000.csv',
 'api_data_aadhar_enrolment_1000000_1006029.csv',
 'api_data_aadhar_enrolment_0_500000.csv',
 'api_data_aadhar_enrolment_500000_1000000.csv']

In [None]:
import pandas as pd
import numpy as np
from glob import glob


In [None]:
enrol_files = glob(f"{RAW_PATH}/*enrolment*.csv")
demo_files  = glob(f"{RAW_PATH}/*demographic*.csv")
bio_files   = glob(f"{RAW_PATH}/*biometric*.csv")

print("Enrolment files:", len(enrol_files))
print("Demographic files:", len(demo_files))
print("Biometric files:", len(bio_files))


Enrolment files: 3
Demographic files: 5
Biometric files: 4


In [None]:
enrol = pd.concat([pd.read_csv(f) for f in enrol_files], ignore_index=True)
demo  = pd.concat([pd.read_csv(f) for f in demo_files], ignore_index=True)
bio   = pd.concat([pd.read_csv(f) for f in bio_files], ignore_index=True)

print(enrol.shape, demo.shape, bio.shape)


(1006029, 7) (2071700, 6) (1861108, 6)


In [None]:
print("Enrolment columns:", enrol.columns.tolist())
print("Demographic columns:", demo.columns.tolist())
print("Biometric columns:", bio.columns.tolist())


Enrolment columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']
Demographic columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']
Biometric columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [None]:
for df in [enrol, demo, bio]:
    print(df["date"].isna().sum())


0
0
0


In [None]:
enrol_agg = enrol.groupby(["date", "state"], as_index=False).sum()
demo_agg  = demo.groupby(["date", "state"], as_index=False).sum()
bio_agg   = bio.groupby(["date", "state"], as_index=False).sum()

print(enrol_agg.shape, demo_agg.shape, bio_agg.shape)


(3393, 7) (4339, 6) (3984, 6)


In [None]:
# Ensure date dtype is consistent before merge
for df in [enrol_agg, demo_agg, bio_agg]:
    df["date"] = pd.to_datetime(df["date"], dayfirst=True)

# Now merge safely
merged = enrol_agg.merge(demo_agg, on=["date", "state"], how="left")
merged = merged.merge(bio_agg, on=["date", "state"], how="left")

merged.fillna(0, inplace=True)
merged.head()



Unnamed: 0,date,state,district_x,pincode_x,age_0_5,age_5_17,age_18_greater,district_y,pincode_y,demo_age_5_17,demo_age_17_,district,pincode,bio_age_5_17,bio_age_17_
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,0,0.0,0.0,0.0,0,0.0,0.0,0.0
1,2025-03-09,Bihar,SitamarhiSitamarhiPurbi ChamparanSitamarhiSita...,6723137,206,633,166,0,0.0,0.0,0.0,0,0.0,0.0,0.0
2,2025-03-09,Delhi,West DelhiWest Delhi,220077,122,53,57,0,0.0,0.0,0.0,0,0.0,0.0,0.0
3,2025-03-09,Haryana,FaridabadGurugram,243005,98,79,23,0,0.0,0.0,0.0,0,0.0,0.0,0.0
4,2025-03-09,Karnataka,Bengaluru UrbanBengaluru UrbanBengaluru UrbanB...,2240159,63,80,105,0,0.0,0.0,0.0,0,0.0,0.0,0.0


In [None]:
merged["total_enrolment"] = (
    merged["age_0_5"] +
    merged["age_5_17"] +
    merged["age_18_greater"]
)

merged["demographic_updates"] = (
    merged["demo_age_5_17"] +
    merged["demo_age_17_"]
)

merged["biometric_updates"] = (
    merged["bio_age_5_17"] +
    merged["bio_age_17_"]
)

merged["DUI"] = merged["demographic_updates"] / merged["total_enrolment"]
merged["BUBI"] = merged["biometric_updates"] / (
    merged["age_5_17"] + merged["age_18_greater"]
)

merged.replace([np.inf, -np.inf], 0, inplace=True)
merged.head()


Unnamed: 0,date,state,district_x,pincode_x,age_0_5,age_5_17,age_18_greater,district_y,pincode_y,demo_age_5_17,demo_age_17_,district,pincode,bio_age_5_17,bio_age_17_,total_enrolment,demographic_updates,biometric_updates,DUI,BUBI
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,0,0.0,0.0,0.0,0,0.0,0.0,0.0,109,0.0,0.0,0.0,0.0
1,2025-03-09,Bihar,SitamarhiSitamarhiPurbi ChamparanSitamarhiSita...,6723137,206,633,166,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1005,0.0,0.0,0.0,0.0
2,2025-03-09,Delhi,West DelhiWest Delhi,220077,122,53,57,0,0.0,0.0,0.0,0,0.0,0.0,0.0,232,0.0,0.0,0.0,0.0
3,2025-03-09,Haryana,FaridabadGurugram,243005,98,79,23,0,0.0,0.0,0.0,0,0.0,0.0,0.0,200,0.0,0.0,0.0,0.0
4,2025-03-09,Karnataka,Bengaluru UrbanBengaluru UrbanBengaluru UrbanB...,2240159,63,80,105,0,0.0,0.0,0.0,0,0.0,0.0,0.0,248,0.0,0.0,0.0,0.0


In [None]:
merged.describe()


Unnamed: 0,date,pincode_x,age_0_5,age_5_17,age_18_greater,pincode_y,demo_age_5_17,demo_age_17_,pincode,bio_age_5_17,bio_age_17_,total_enrolment,demographic_updates,biometric_updates,DUI,BUBI
count,3393,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3393.0,3251.0
mean,2025-10-14 21:16:36.286472192,153777900.0,1045.377247,507.039198,49.617742,237590900.0,911.402594,7899.001768,218168700.0,7516.252874,7894.153257,1602.034188,8810.404362,15410.41,15.964878,69.658072
min,2025-03-02 00:00:00,100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2025-09-17 00:00:00,3720622.0,13.0,2.0,0.0,2254343.0,2.0,17.0,1483840.0,3.0,5.0,19.0,20.0,9.0,1.789552,0.0
50%,2025-10-21 00:00:00,26905080.0,122.0,48.0,2.0,36709540.0,73.0,473.0,34946710.0,211.0,204.0,224.0,542.0,421.0,4.813934,14.356135
75%,2025-11-12 00:00:00,276862000.0,1223.0,285.0,19.0,337062100.0,863.0,7582.0,327773700.0,3911.0,4613.0,1592.0,8530.0,8756.0,9.182692,44.759871
max,2025-12-31 00:00:00,1275220000.0,73166.0,81078.0,8768.0,2230183000.0,65004.0,352128.0,2262708000.0,902096.0,776958.0,157866.0,379443.0,1426001.0,7616.5,14705.333333
std,,233463200.0,2554.15422,2229.347176,319.347286,393816600.0,2724.534871,20736.516023,359123600.0,39917.662317,39332.293533,4780.361617,23123.615836,76554.78,161.618778,482.343735


In [None]:
PROCESSED_PATH = "/content/drive/MyDrive/uidai_hackathon/data/processed"
os.makedirs(PROCESSED_PATH, exist_ok=True)

merged.to_csv(f"{PROCESSED_PATH}/lifecycle_aggregated.csv", index=False)
print("Saved lifecycle_aggregated.csv")


Saved lifecycle_aggregated.csv
