In [23]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [24]:

# Directory that holds the cleaned CSV inputs.
# Set the UIDAI_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location below is used unchanged.
BASE_PATH = os.environ.get(
    "UIDAI_DATA_DIR",
    "/Users/ayush/Library/Mobile Documents/com~apple~CloudDocs/Developer/UIDAI/02_Cleaned_Data",
)

# One frame per cleaned source file.
df_enrol = pd.read_csv(f"{BASE_PATH}/enrolment_cleaned.csv")
df_demo  = pd.read_csv(f"{BASE_PATH}/demographic_cleaned.csv")
df_bio   = pd.read_csv(f"{BASE_PATH}/biometric_cleaned.csv")


In [25]:
# Parse the 'date' column of every source frame in place and derive the
# calendar year and month used later as grouping keys.
# NOTE(review): dayfirst=True presumes day-first date strings — confirm
# against the cleaned CSVs.
for frame in (df_enrol, df_demo, df_bio):
    parsed_dates = pd.to_datetime(frame['date'], dayfirst=True)
    frame['date'] = parsed_dates
    frame['year'] = parsed_dates.dt.year
    frame['month'] = parsed_dates.dt.month


# Aggregate each dataset to the same grain

In [26]:
# Enrolment aggregation: one row per state / district / year / month,
# summing the three enrolment age-band counts.
enrol_keys = ['state', 'district', 'year', 'month']
enrol_counts = ['age_0_5', 'age_5_17', 'age_18_greater']

# as_index=False keeps the grouping keys as ordinary columns.
enrol_agg = df_enrol.groupby(enrol_keys, as_index=False)[enrol_counts].sum()


In [27]:
# Demographic-update aggregation: one row per state / district / year / month,
# summing the two demographic age-band counts.
demo_keys = ['state', 'district', 'year', 'month']
demo_counts = ['demo_age_5_17', 'demo_age_17_']

# as_index=False keeps the grouping keys as ordinary columns.
demo_agg = df_demo.groupby(demo_keys, as_index=False)[demo_counts].sum()


In [28]:
# Biometric-update aggregation: one row per state / district / year / month,
# summing the two biometric age-band counts.
bio_keys = ['state', 'district', 'year', 'month']
bio_counts = ['bio_age_5_17', 'bio_age_17_']

# as_index=False keeps the grouping keys as ordinary columns.
bio_agg = df_bio.groupby(bio_keys, as_index=False)[bio_counts].sum()


In [29]:
# Merge the three aggregates on the shared grain.
# Enrolment is the left side of both left joins, so only grain rows present in
# enrol_agg survive; update rows without a matching enrolment row are dropped.
join_keys = ['state', 'district', 'year', 'month']

merged = enrol_agg
for update_agg in (demo_agg, bio_agg):
    merged = merged.merge(update_agg, on=join_keys, how='left')

# Grain rows with no matching update record come back as NaN; count them as 0.
merged = merged.fillna(0)


In [30]:
# Preview the merged table; the recorded output shows only its first and last rows.
merged

Unnamed: 0,state,district,year,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,Andaman and Nicobar Islands,nicobar,2025,9,42,6,0,2.0,116.0,92.0,148.0
1,Andaman and Nicobar Islands,nicobar,2025,10,6,4,0,6.0,39.0,29.0,55.0
2,Andaman and Nicobar Islands,nicobar,2025,11,11,0,0,4.0,82.0,116.0,54.0
3,Andaman and Nicobar Islands,nicobar,2025,12,5,1,0,6.0,52.0,203.0,51.0
4,Andaman and Nicobar Islands,north and middle andaman,2025,9,38,2,0,8.0,167.0,531.0,175.0
...,...,...,...,...,...,...,...,...,...,...,...
4393,West Bengal,uttar dinajpur,2025,12,1913,619,66,1217.0,15427.0,3316.0,5535.0
4394,West Bengal,west midnapore,2025,9,243,26,0,81.0,1469.0,436.0,627.0
4395,West Bengal,west midnapore,2025,10,92,13,0,33.0,1064.0,280.0,505.0
4396,West Bengal,west midnapore,2025,11,90,6,0,61.0,1594.0,366.0,754.0


In [31]:
# Grain consistency: count rows that repeat an earlier
# state / district / year / month combination (0 means the grain is unique).
grain_keys = ['state', 'district', 'year', 'month']
repeated_grain = merged.duplicated(subset=grain_keys)
repeated_grain.sum()


np.int64(0)

In [32]:
# Replace negative values with 0 in every numeric column.
num_cols = merged.select_dtypes(include='number').columns

# clip(lower=0) raises anything below zero to zero and leaves other values
# (including NaN) untouched.
merged[num_cols] = merged[num_cols].clip(lower=0)


In [33]:
# Fill any missing numeric values with 0. The merge cell already ran
# fillna(0) on the whole frame, so this acts as a safeguard.
numeric_filled = merged[num_cols].fillna(0)
merged[num_cols] = numeric_filled


In [34]:
# Total enrolments: sum of the three enrolment age bands for each row.
merged['total_enrolments'] = (
    merged['age_0_5']
    .add(merged['age_5_17'])
    .add(merged['age_18_greater'])
)


In [35]:
# Total updates: demographic plus biometric counts across both age bands.
update_cols = ['demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_']

# Left-to-right accumulation, starting from the first column.
merged['total_updates'] = sum(
    (merged[name] for name in update_cols[1:]),
    merged[update_cols[0]],
)


In [36]:
# Adult system stress: demographic plus biometric updates in the 17+ band.
adult_demo_updates = merged['demo_age_17_']
adult_bio_updates = merged['bio_age_17_']
merged['adult_system_stress'] = adult_demo_updates.add(adult_bio_updates)


In [37]:
# Child system stress: demographic plus biometric updates in the 5-17 band.
child_demo_updates = merged['demo_age_5_17']
child_bio_updates = merged['bio_age_5_17']
merged['child_system_stress'] = child_demo_updates.add(child_bio_updates)


In [38]:
# Digital Friction Index: adult update volume relative to adult enrolments.
# Adding 1 to the enrolment count keeps the denominator non-zero for rows
# where age_18_greater is 0.
adult_enrolments_plus_one = merged['age_18_greater'] + 1
merged['digital_friction_index'] = merged['adult_system_stress'].div(
    adult_enrolments_plus_one
)


In [39]:
# Normalisation: store log(1 + index) in a separate column; the raw index is
# left unchanged.
merged['log_digital_friction'] = merged['digital_friction_index'].pipe(np.log1p)


In [40]:
# Outlier protection: flag rows whose friction index lies above the upper
# Tukey fence (Q3 + 1.5 * IQR). Only the high side is flagged.
friction = merged['digital_friction_index']
q1, q3 = friction.quantile([0.25, 0.75])
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr

merged['friction_outlier'] = friction.gt(upper_fence)


In [41]:

# Persist the engineered table alongside the cleaned inputs.
# The destination is built from BASE_PATH (set in the data-loading cell) so the
# data directory is defined in one place instead of being repeated here.
# index=False omits the row index from the file.
merged.to_csv(f"{BASE_PATH}/merged.csv", index=False)
