# Feature Engineering

In [264]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cm as cm
from matplotlib.patches import Patch
import matplotlib.ticker as ticker
import seaborn as sns

import time
import os

import math
import scipy.stats as stats

In [265]:
enrolment = pd.read_csv('Cleaned Datasets/cleaned_enrollment.csv')
biometric = pd.read_csv('Cleaned Datasets/cleaned_biometric.csv')
demography = pd.read_csv('Cleaned Datasets/cleaned_demographic.csv')

os.makedirs('Featured Datasets', exist_ok=True)

print("Successfully loaded the Cleaned Datasets")

Successfully loaded the Cleaned Datasets


In [266]:
def Dataset_info():
    print(enrolment.info())
    print('-'*50)
    print(demography.info())
    print('-'*50)
    print(biometric.info())

In [267]:
Dataset_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006007 entries, 0 to 1006006
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   date             1006007 non-null  object
 1   state            1006007 non-null  object
 2   district         1006007 non-null  object
 3   pincode          1006007 non-null  int64 
 4   age_0_5          1006007 non-null  int64 
 5   age_5_17         1006007 non-null  int64 
 6   age_18_greater   1006007 non-null  int64 
 7   total_enrolment  1006007 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 61.4+ MB
None
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071698 entries, 0 to 2071697
Data columns (total 7 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   date           object
 1   state          object
 2   district       object
 3   pincode        int64 
 4   demo_age_5_17  int64 
 5   demo_

In [268]:
demography['date'] = pd.to_datetime(demography['date'])
demography['pincode'] = demography['pincode'].astype(str)

enrolment['date'] = pd.to_datetime(enrolment['date'])
enrolment.rename(columns={"age_18_greater": "age_18+"}, inplace=True)
enrolment['pincode'] = enrolment['pincode'].astype(str)

biometric['date'] = pd.to_datetime(biometric['date'])
biometric['pincode'] = biometric['pincode'].astype(str)

In [269]:
output_dir = 'Featured Datasets'
os.makedirs(output_dir, exist_ok=True)

# ADD TEMPORAL FEATURES
enrolment['day'] = enrolment['date'].dt.day
enrolment['month'] = enrolment['date'].dt.month
enrolment['day_name'] = enrolment['date'].dt.day_name()
enrolment['is_weekend'] = enrolment['date'].dt.dayofweek.isin([5, 6]).astype(int)

# Include 'pincode' in keep list
en_cols = ['date', 'state', 'district', 'pincode', 'total_enrolment', 'age_5_17', 'day', 'month', 'day_name', 'is_weekend']
enrolment[en_cols].to_csv(f'{output_dir}/featured_enrolment.csv', index=False)

In [270]:
# 2. Demographic
demography['day'] = demography['date'].dt.day
demography['month'] = demography['date'].dt.month
demography['day_name'] = demography['date'].dt.day_name()
demography['is_weekend'] = demography['date'].dt.dayofweek.isin([5, 6]).astype(int)

de_cols = ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'total_updates', 'day', 'month', 'day_name', 'is_weekend']
demography[de_cols].to_csv(f'{output_dir}/featured_demographic.csv', index=False)

In [271]:
# 3. Biometric
biometric['day'] = biometric['date'].dt.day
biometric['month'] = biometric['date'].dt.month
biometric['day_name'] = biometric['date'].dt.day_name()
biometric['is_weekend'] = biometric['date'].dt.dayofweek.isin([5, 6]).astype(int)

bio_cols = ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'total_biometric', 'day', 'month', 'day_name', 'is_weekend']
biometric[bio_cols].to_csv(f'{output_dir}/featured_biometric.csv', index=False)

print("Featured Datasets re-saved")

Featured Datasets re-saved


In [272]:
Dataset_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006007 entries, 0 to 1006006
Data columns (total 12 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   date             1006007 non-null  datetime64[ns]
 1   state            1006007 non-null  object        
 2   district         1006007 non-null  object        
 3   pincode          1006007 non-null  object        
 4   age_0_5          1006007 non-null  int64         
 5   age_5_17         1006007 non-null  int64         
 6   age_18+          1006007 non-null  int64         
 7   total_enrolment  1006007 non-null  int64         
 8   day              1006007 non-null  int32         
 9   month            1006007 non-null  int32         
 10  day_name         1006007 non-null  object        
 11  is_weekend       1006007 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(4), object(4)
memory usage: 80.6+ MB
None
--------------------------------

In [273]:
print(enrolment['state'].unique())
print('-'*50)
print(demography['state'].unique())
print('-'*50)
print(biometric['state'].unique())

['Meghalaya' 'Karnataka' 'Uttar Pradesh' 'Bihar' 'Maharashtra' 'Haryana'
 'Rajasthan' 'Punjab' 'Delhi' 'Madhya Pradesh' 'West Bengal' 'Assam'
 'Uttarakhand' 'Gujarat' 'Andhra Pradesh' 'Tamil Nadu' 'Chhattisgarh'
 'Jharkhand' 'Nagaland' 'Manipur' 'Telangana' 'Tripura' 'Mizoram'
 'Jammu And Kashmir' 'Chandigarh' 'Sikkim' 'Odisha' 'Kerala'
 'Dadra And Nagar Haveli And Daman And Diu' 'Arunachal Pradesh'
 'Himachal Pradesh' 'Goa' 'Ladakh' 'Andaman And Nicobar Islands'
 'Puducherry' 'Lakshadweep']
--------------------------------------------------
['Uttar Pradesh' 'Andhra Pradesh' 'Gujarat' 'Rajasthan' 'Karnataka'
 'West Bengal' 'Telangana' 'Odisha' 'Maharashtra' 'Kerala' 'Bihar'
 'Tamil Nadu' 'Madhya Pradesh' 'Assam' 'Tripura' 'Arunachal Pradesh'
 'Punjab' 'Jharkhand' 'Delhi' 'Chandigarh' 'Chhattisgarh'
 'Jammu and Kashmir' 'Mizoram' 'Nagaland' 'Himachal Pradesh' 'Goa'
 'Haryana' 'Meghalaya' 'Uttarakhand' 'Manipur'
 'Dadra and Nagar Haveli and Daman and Diu' 'Puducherry' 'Sikkim' 'Ladakh'
 

In [274]:
updated_region_map = {
    'Jammu and Kashmir': 'North', 'Himachal Pradesh': 'North', 'Punjab': 'North', 
    'Chandigarh': 'North', 'Uttarakhand': 'North', 'Haryana': 'North', 'Delhi': 'North', 
    'Rajasthan': 'North', 'Uttar Pradesh': 'North', 'Bihar': 'East', 'Sikkim': 'NE', 
    'Arunachal Pradesh': 'NE', 'Nagaland': 'NE', 'Manipur': 'NE', 'Mizoram': 'NE', 
    'Tripura': 'NE', 'Meghalaya': 'NE', 'Assam': 'NE', 'West Bengal': 'East', 
    'Jharkhand': 'East', 'Odisha': 'East', 'Chhattisgarh': 'Central',
    'Madhya Pradesh': 'Central', 'Gujarat': 'West', 'Maharashtra': 'West', 
    'Andhra Pradesh': 'South', 'Karnataka': 'South', 'Goa': 'South',
    'Lakshadweep': 'South', 'Kerala': 'South', 'Tamil Nadu': 'South', 
    'Puducherry': 'South', 'Andaman and Nicobar Islands': 'South', 
    'Telangana': 'South', 'Ladakh': 'North',
    'Dadra and Nagar Haveli and Daman and Diu': 'West'
}
Dataset_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006007 entries, 0 to 1006006
Data columns (total 12 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   date             1006007 non-null  datetime64[ns]
 1   state            1006007 non-null  object        
 2   district         1006007 non-null  object        
 3   pincode          1006007 non-null  object        
 4   age_0_5          1006007 non-null  int64         
 5   age_5_17         1006007 non-null  int64         
 6   age_18+          1006007 non-null  int64         
 7   total_enrolment  1006007 non-null  int64         
 8   day              1006007 non-null  int32         
 9   month            1006007 non-null  int32         
 10  day_name         1006007 non-null  object        
 11  is_weekend       1006007 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(4), object(4)
memory usage: 80.6+ MB
None
--------------------------------

In [275]:
def finalize_aadhaar_dataset(df, youth_col, total_col):
    """
    Standardizes state names, pads pincodes, maps regions, 
    and generates Advanced KPIs (Quarter, Youth Ratio, High Intensity).
    """
    # Standardize State Names to match the map (Title case + small 'and')
    df['state'] = df['state'].str.title().str.replace(' And ', ' and ')
    df['region'] = df['state'].map(updated_region_map)
    
    # Handle Pincode padding (Clean .0 from floats if any, then zfill to 6)
    df['pincode'] = df['pincode'].astype(str).str.replace('.0', '', regex=False).str.zfill(6)
    
    # Add Quarter for seasonal analysis
    df['quarter'] = pd.to_datetime(df['date']).dt.quarter
    
    # KPI: Youth Ratio (Workload intensity from 5-17 group)
    df['youth_ratio'] = (df[youth_col] / (df[total_col] + 0.1)) * 100
    
    # Anomaly Detection: High Intensity Flag (Days in the top 10% volume for that district)
    # Important: This identifies operational bottlenecks
    q90 = df.groupby('district')[total_col].transform(lambda x: x.quantile(0.90))
    df['is_high_intensity'] = (df[total_col] > q90).astype(int)
    
    return df

In [276]:
# Note: We use the existing variables as they appear in your info()
enrolment = finalize_aadhaar_dataset(enrolment, 'age_5_17', 'total_enrolment')
demography = finalize_aadhaar_dataset(demography, 'demo_age_5_17', 'total_updates')
biometric = finalize_aadhaar_dataset(biometric, 'bio_age_5_17', 'total_biometric')

In [277]:
Dataset_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006007 entries, 0 to 1006006
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   date               1006007 non-null  datetime64[ns]
 1   state              1006007 non-null  object        
 2   district           1006007 non-null  object        
 3   pincode            1006007 non-null  object        
 4   age_0_5            1006007 non-null  int64         
 5   age_5_17           1006007 non-null  int64         
 6   age_18+            1006007 non-null  int64         
 7   total_enrolment    1006007 non-null  int64         
 8   day                1006007 non-null  int32         
 9   month              1006007 non-null  int32         
 10  day_name           1006007 non-null  object        
 11  is_weekend         1006007 non-null  int32         
 12  region             1006007 non-null  object        
 13  quarter            1006007 

In [293]:
# Ensure the directory exists
os.makedirs('Featured Datasets', exist_ok=True)

# Save the finalized DataFrames to the Featured Datasets folder
enrolment.to_csv('Featured Datasets/featured_enrolment.csv', index=False)
demography.to_csv('Featured Datasets/featured_demographic.csv', index=False)
biometric.to_csv('Featured Datasets/featured_biometric.csv', index=False)

print("Success: All featured datasets have been saved to the 'Featured Datasets' folder.")

Success: All featured datasets have been saved to the 'Featured Datasets' folder.


In [295]:
print(enrolment.columns)
print(demography.columns)
print(biometric.columns)


Index(['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17',
       'age_18+', 'total_enrolment', 'day', 'month', 'day_name', 'is_weekend',
       'region', 'quarter', 'youth_ratio', 'is_high_intensity'],
      dtype='object')
Index(['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17+',
       'total_updates', 'day', 'month', 'day_name', 'is_weekend', 'region',
       'quarter', 'youth_ratio', 'is_high_intensity'],
      dtype='object')
Index(['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17+',
       'total_biometric', 'day', 'month', 'day_name', 'is_weekend', 'region',
       'quarter', 'youth_ratio', 'is_high_intensity'],
      dtype='object')


In [297]:
EPS = 1e-6

# Youth and adult shares
enrolment['youth_share'] = (
    enrolment['age_5_17'] /
    (enrolment['total_enrolment'] + EPS)
)

enrolment['adult_share'] = (
    enrolment['age_18+'] /
    (enrolment['total_enrolment'] + EPS)
)

# Displacement score
enrolment['displacement_score'] = (
    enrolment['adult_share'] -
    enrolment['youth_share']
)


In [299]:
EPS = 1e-6

demography['youth_share'] = (
    demography['demo_age_5_17'] /
    (demography['total_updates'] + EPS)
)

demography['adult_share'] = (
    demography['demo_age_17+'] /
    (demography['total_updates'] + EPS)
)

demography['displacement_score'] = (
    demography['adult_share'] -
    demography['youth_share']
)


In [301]:
EPS = 1e-6

biometric['youth_share'] = (
    biometric['bio_age_5_17'] /
    (biometric['total_biometric'] + EPS)
)

biometric['adult_share'] = (
    biometric['bio_age_17+'] /
    (biometric['total_biometric'] + EPS)
)

biometric['displacement_score'] = (
    biometric['adult_share'] -
    biometric['youth_share']
)


In [303]:
print(enrolment[['youth_share', 'adult_share', 'displacement_score']].describe())
print(demography[['youth_share', 'adult_share', 'displacement_score']].describe())
print(biometric[['youth_share', 'adult_share', 'displacement_score']].describe())


        youth_share   adult_share  displacement_score
count  1.006007e+06  1.006007e+06        1.006007e+06
mean   2.439310e-01  1.591202e-02       -2.280190e-01
std    3.380805e-01  1.036686e-01        3.595063e-01
min    0.000000e+00  0.000000e+00       -1.000000e+00
25%    0.000000e+00  0.000000e+00       -4.666666e-01
50%    0.000000e+00  0.000000e+00        0.000000e+00
75%    4.999998e-01  0.000000e+00        0.000000e+00
max    1.000000e+00  1.000000e+00        1.000000e+00
        youth_share   adult_share  displacement_score
count  2.071698e+06  2.071698e+06        2.071698e+06
mean   1.123105e-01  8.866568e-01        7.743463e-01
std    1.831304e-01  1.853004e-01        3.670347e-01
min    0.000000e+00  0.000000e+00       -1.000000e+00
25%    0.000000e+00  8.404255e-01        6.808511e-01
50%    3.333333e-02  9.655172e-01        9.310345e-01
75%    1.578947e-01  9.999997e-01        9.999997e-01
max    1.000000e+00  1.000000e+00        1.000000e+00
        youth_share   adult_

In [305]:
EPS = 1e-6

def normalize_displacement_score(df, col='displacement_score', new_col='displacement_score_norm'):
    """
    Robust percentile normalization for bounded, skewed displacement scores.
    Scales values to [0, 1] using 5thâ€“95th percentiles.
    """
    low = df[col].quantile(0.05)
    high = df[col].quantile(0.95)

    df[new_col] = ((df[col] - low) / (high - low + EPS)).clip(0, 1)
    return df

In [307]:
enrolment = normalize_displacement_score(enrolment)
demography = normalize_displacement_score(demography)
biometric  = normalize_displacement_score(biometric)


In [309]:
print(enrolment['displacement_score_norm'].describe())
print(demography['displacement_score_norm'].describe())
print(biometric['displacement_score_norm'].describe())

count    1.006007e+06
mean     7.601007e-01
std      3.377473e-01
min      0.000000e+00
25%      5.333324e-01
50%      9.999990e-01
75%      9.999990e-01
max      1.000000e+00
Name: displacement_score_norm, dtype: float64
count    2.071698e+06
mean     7.966002e-01
std      2.818662e-01
min      0.000000e+00
25%      6.808504e-01
50%      9.310336e-01
75%      9.999988e-01
max      9.999991e-01
Name: displacement_score_norm, dtype: float64
count    1.861107e+06
mean     5.271516e-01
std      3.038650e-01
min      0.000000e+00
25%      3.333329e-01
50%      4.999996e-01
75%      7.499996e-01
max      9.999997e-01
Name: displacement_score_norm, dtype: float64


In [311]:
print(enrolment.info())
print(demography.info())
print(biometric.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006007 entries, 0 to 1006006
Data columns (total 20 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   date                     1006007 non-null  datetime64[ns]
 1   state                    1006007 non-null  object        
 2   district                 1006007 non-null  object        
 3   pincode                  1006007 non-null  object        
 4   age_0_5                  1006007 non-null  int64         
 5   age_5_17                 1006007 non-null  int64         
 6   age_18+                  1006007 non-null  int64         
 7   total_enrolment          1006007 non-null  int64         
 8   day                      1006007 non-null  int32         
 9   month                    1006007 non-null  int32         
 10  day_name                 1006007 non-null  object        
 11  is_weekend               1006007 non-null  int32         
 12  

In [313]:
# Save the finalized DataFrames to the Featured Datasets folder
enrolment.to_csv('Featured Datasets/featured_enrolment.csv', index=False)
demography.to_csv('Featured Datasets/featured_demographic.csv', index=False)
biometric.to_csv('Featured Datasets/featured_biometric.csv', index=False)