In [3]:
import pandas as pd
import glob
from pathlib import Path

# All raw inputs live in sibling folders under one raw-data root,
# addressed relative to the notebook's working directory.
RAW_ROOT = Path("../data/raw")

ENROL_PATH = RAW_ROOT / "enrolment"
DEMO_PATH = RAW_ROOT / "demographic_updates"
BIO_PATH = RAW_ROOT / "biometric_updates"

def load_all_csvs(folder_path):
    """Read every ``*.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result is
    reproducible; ``glob.glob`` on its own returns matches in arbitrary order.

    Parameters
    ----------
    folder_path : str or pathlib.Path
        Directory to search. Sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, re-indexed from 0.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` files.
    """
    # Accept plain strings as well as Path objects.
    folder = Path(folder_path)
    files = sorted(glob.glob(str(folder / "*.csv")))
    print(f"Found {len(files)} files in {folder_path}")
    if not files:
        # pd.concat([]) would raise a ValueError that does not name the folder.
        raise ValueError(f"No CSV files found in {folder_path}")
    df_list = [pd.read_csv(f) for f in files]
    return pd.concat(df_list, ignore_index=True)

# Load the three raw datasets in a fixed order: enrolment, demographic, biometric.
enrol_df, demo_df, bio_df = (
    load_all_csvs(ENROL_PATH),
    load_all_csvs(DEMO_PATH),
    load_all_csvs(BIO_PATH),
)



Found 3 files in ..\data\raw\enrolment
Found 5 files in ..\data\raw\demographic_updates
Found 4 files in ..\data\raw\biometric_updates


In [4]:
# Print each raw table's column names under its own heading.
for heading, frame in (
    ("ENROLMENT COLUMNS", enrol_df),
    ("\nDEMOGRAPHIC UPDATE COLUMNS", demo_df),
    ("\nBIOMETRIC UPDATE COLUMNS", bio_df),
):
    print(heading)
    print(frame.columns.tolist())


ENROLMENT COLUMNS
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

DEMOGRAPHIC UPDATE COLUMNS
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

BIOMETRIC UPDATE COLUMNS
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [5]:
# Show the first three rows of each raw table under its own heading.
for heading, frame in (
    ("\nENROLMENT SAMPLE", enrol_df),
    ("\nDEMOGRAPHIC SAMPLE", demo_df),
    ("\nBIOMETRIC SAMPLE", bio_df),
):
    print(heading)
    display(frame.head(3))



ENROLMENT SAMPLE


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12



DEMOGRAPHIC SAMPLE


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765



BIOMETRIC SAMPLE


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091


In [6]:
# Count NaN cells per column in each raw table.
# sep="\n" puts each Series on its own line. With print's default separator (" ")
# the first row of every Series was printed with a stray leading space, which
# misaligned it against the rows below.
print("Missing values check:\n")
print("Enrolment:", enrol_df.isna().sum(), sep="\n")
print("\nDemographic:", demo_df.isna().sum(), sep="\n")
print("\nBiometric:", bio_df.isna().sum(), sep="\n")


Missing values check:

Enrolment:
 date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

Demographic:
 date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

Biometric:
 date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64


In [10]:
# --- Fix column names ONCE ---
# Replace the trailing-underscore "17_" column names with an explicit "_plus" suffix.
demo_renames = {'demo_age_17_': 'demo_age_17_plus'}
bio_renames = {'bio_age_17_': 'bio_age_17_plus'}

demo_df = demo_df.rename(columns=demo_renames)
bio_df = bio_df.rename(columns=bio_renames)

# Confirm the resulting column names.
for renamed in (demo_df, bio_df):
    print(renamed.columns.tolist())


['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_plus']
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_plus']


In [11]:
# Each table is collapsed to one row per (date, state, district) by summing its
# count columns. pincode is neither a grouping key nor an aggregated column, so
# it does not appear in the results.
group_keys = ['date', 'state', 'district']

# --- Aggregate enrolment data ---
enrol_sums = dict.fromkeys(['age_0_5', 'age_5_17', 'age_18_greater'], 'sum')
enrol_agg = enrol_df.groupby(group_keys, as_index=False).agg(enrol_sums)

# --- Aggregate demographic updates ---
demo_sums = dict.fromkeys(['demo_age_5_17', 'demo_age_17_plus'], 'sum')
demo_agg = demo_df.groupby(group_keys, as_index=False).agg(demo_sums)

# --- Aggregate biometric updates ---
bio_sums = dict.fromkeys(['bio_age_5_17', 'bio_age_17_plus'], 'sum')
bio_agg = bio_df.groupby(group_keys, as_index=False).agg(bio_sums)


In [13]:
from pathlib import Path

# Output folder for the aggregated tables; created (with parents) if it is missing.
processed_path = Path("../data/processed")
processed_path.mkdir(parents=True, exist_ok=True)

# Write each aggregate to its own CSV, without the positional index column.
for out_name, agg_frame in (
    ("enrolment_aggregated.csv", enrol_agg),
    ("demographic_updates_aggregated.csv", demo_agg),
    ("biometric_updates_aggregated.csv", bio_agg),
):
    agg_frame.to_csv(processed_path / out_name, index=False)

print("Processed datasets saved successfully.")


Processed datasets saved successfully.


In [14]:
import os

# os.listdir returns entries in arbitrary order; sort them so the printed
# listing is the same on every run and platform.
print(sorted(os.listdir("../data/processed")))


['.gitkeep', 'biometric_updates_aggregated.csv', 'demographic_updates_aggregated.csv', 'enrolment_aggregated.csv']
