In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_csv('api_data_aadhar_biometric_0_500000.csv')
df1.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [3]:
df2 = pd.read_csv('api_data_aadhar_demographic_0_500000.csv')
df2.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [4]:
df3 = pd.read_csv('api_data_aadhar_enrolment_0_500000.csv')
df3.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


## Datasets have no Missing Values

In [5]:
df1.isnull().sum() 

date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64

In [6]:
df2.isnull().sum()

date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

In [7]:
df3.isnull().sum()

date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

### Data Cleaning

In [8]:
df1['state'].unique() # we have to remove duplicated states first

array(['Haryana', 'Bihar', 'Jammu and Kashmir', 'Tamil Nadu',
       'Maharashtra', 'Gujarat', 'Odisha', 'West Bengal', 'Kerala',
       'Rajasthan', 'Punjab', 'Himachal Pradesh', 'Uttar Pradesh',
       'Assam', 'Uttarakhand', 'Madhya Pradesh', 'Karnataka',
       'Andhra Pradesh', 'Telangana', 'Goa', 'Nagaland', 'Jharkhand',
       'Delhi', 'Chhattisgarh', 'Meghalaya', 'Chandigarh', 'Orissa',
       'Puducherry', 'Pondicherry', 'Manipur', 'Sikkim', 'Tripura',
       'Mizoram', 'Arunachal Pradesh', 'Ladakh',
       'Dadra and Nagar Haveli and Daman and Diu', 'Daman and Diu',
       'Andaman and Nicobar Islands', 'Andaman & Nicobar Islands',
       'Dadra and Nagar Haveli', 'Lakshadweep', 'Daman & Diu',
       'Dadra & Nagar Haveli', 'Jammu & Kashmir', 'WESTBENGAL',
       'andhra pradesh', 'Westbengal', 'West  Bengal', 'WEST BENGAL',
       'West Bangal', 'ODISHA', 'odisha', 'West bengal', 'west Bengal',
       'Uttaranchal', 'Chhatisgarh'], dtype=object)

In [9]:
df1['state'].unique()

array(['Haryana', 'Bihar', 'Jammu and Kashmir', 'Tamil Nadu',
       'Maharashtra', 'Gujarat', 'Odisha', 'West Bengal', 'Kerala',
       'Rajasthan', 'Punjab', 'Himachal Pradesh', 'Uttar Pradesh',
       'Assam', 'Uttarakhand', 'Madhya Pradesh', 'Karnataka',
       'Andhra Pradesh', 'Telangana', 'Goa', 'Nagaland', 'Jharkhand',
       'Delhi', 'Chhattisgarh', 'Meghalaya', 'Chandigarh', 'Orissa',
       'Puducherry', 'Pondicherry', 'Manipur', 'Sikkim', 'Tripura',
       'Mizoram', 'Arunachal Pradesh', 'Ladakh',
       'Dadra and Nagar Haveli and Daman and Diu', 'Daman and Diu',
       'Andaman and Nicobar Islands', 'Andaman & Nicobar Islands',
       'Dadra and Nagar Haveli', 'Lakshadweep', 'Daman & Diu',
       'Dadra & Nagar Haveli', 'Jammu & Kashmir', 'WESTBENGAL',
       'andhra pradesh', 'Westbengal', 'West  Bengal', 'WEST BENGAL',
       'West Bangal', 'ODISHA', 'odisha', 'West bengal', 'west Bengal',
       'Uttaranchal', 'Chhatisgarh'], dtype=object)

In [10]:
df3['state'].unique()

array(['Meghalaya', 'Karnataka', 'Uttar Pradesh', 'Bihar', 'Maharashtra',
       'Haryana', 'Rajasthan', 'Punjab', 'Delhi', 'Madhya Pradesh',
       'West Bengal', 'Assam', 'Uttarakhand', 'Gujarat', 'Andhra Pradesh',
       'Tamil Nadu', 'Chhattisgarh', 'Jharkhand', 'Nagaland', 'Manipur',
       'Telangana', 'Tripura', 'Mizoram', 'Jammu and Kashmir',
       'Chandigarh', 'Sikkim', 'Odisha', 'Kerala',
       'The Dadra And Nagar Haveli And Daman And Diu',
       'Arunachal Pradesh', 'Himachal Pradesh', 'Goa',
       'Jammu And Kashmir', 'Dadra and Nagar Haveli and Daman and Diu',
       'Ladakh', 'Andaman and Nicobar Islands', 'Orissa', 'Pondicherry',
       'Puducherry', 'Lakshadweep', 'Andaman & Nicobar Islands',
       'Dadra & Nagar Haveli', 'Dadra and Nagar Haveli', 'Daman and Diu',
       'WEST BENGAL', 'Jammu & Kashmir', 'West  Bengal', '100000',
       'Daman & Diu', 'West Bangal', 'Westbengal', 'West bengal',
       'andhra pradesh', 'ODISHA'], dtype=object)

In [11]:
def clean_names_state(df):
    df['state'] = df['state'].str.lower()
    df['state'] = df['state'].str.replace(r'[^a-z\s]', '', regex=True)
    df['state'] = df['state'].str.strip()

    state_mapping = {
    'orissa': 'odisha',
    'odisha': 'odisha',

    'jammu & kashmir': 'jammu and kashmir',
    'jammu and kashmir': 'jammu and kashmir',
    'jammu kashmir': 'jammu and kashmir',

    'andhra pradesh': 'andhra pradesh',

    'west bengal': 'west bengal',
    'westbengal': 'west bengal',
    'west bangal': 'west bengal',
    'west bengli':'west bengal',

    'uttaranchal': 'uttarakhand',

    'pondicherry': 'puducherry',

    'daman & diu': 'daman and diu',
    'daman and diu': 'daman and diu',

    'dadra & nagar haveli': 'dadra and nagar haveli and daman and diu',
    'dadra and nagar haveli': 'dadra and nagar haveli and daman and diu',
    'daman and diu':'dadra and nagar haveli and daman and diu',
    'daman & diu':'dadra and nagar haveli and daman and diu',
    'the dadra and nagar haveli and daman and diu':'dadra and nagar haveli and daman and diu',
    

    'andaman & nicobar islands': 'andaman and nicobar islands',

    'darbhanga': 'bihar',
    'puttenahalli': 'karnataka',

    'chhatisgarh': 'chhattisgarh', 
'jammu  kashmir':'jammu and kashmir' ,
 'west  bengal' : 'west bengal', 
 'daman  diu' : 'dadra and nagar haveli and daman and diu', 
 'dadra  nagar haveli':'dadra and nagar haveli and daman and diu',
 'andaman  nicobar islands':'andaman and nicobar islands',
}
    df['state'] = df['state'].replace(state_mapping)
    
    return df['state']
    


In [12]:
df1['state'] = clean_names_state(df1)
# df1['state'].unique()

In [13]:
df2['state'] = clean_names_state(df2)
# df2['state'].unique()

In [14]:
df3['state'] = clean_names_state(df3)

In [15]:
# df1['district'].value_counts()

In [16]:
def clean_names_district(df):
    df['district'] = df['district'].str.lower().str.strip().str.replace('  ','')
    df['district'] = df['district'].str.replace(r'[^a-z\s]', '', regex=True) # remove special characters

    return df['district']

In [17]:
df1['district'] = clean_names_district(df1)
# df1['district'].unique()
df1['district'].value_counts()

district
pune               3239
north  parganas    2999
barddhaman         2988
thrissur           2933
east godavari      2891
                   ... 
east midnapur         1
west medinipur        1
bijapurkar            1
tiswadi               1
domjur                1
Name: count, Length: 920, dtype: int64

In [18]:
df2['district'] = clean_names_district(df2)
# df2['district'].unique()
df2['district'].value_counts()

district
north  parganas    3380
barddhaman         3078
pune               2966
east godavari      2927
thrissur           2912
                   ... 
baghpat               1
dist  thane           1
south dumdumm         1
balianta              1
th cross              1
Name: count, Length: 931, dtype: int64

In [19]:
df3['district'] = clean_names_district(df3)
# df3['district'].unique()
df3['district'].value_counts()

district
north  parganas    3392
pune               3274
barddhaman         2744
bengaluru          2619
hyderabad          2456
                   ... 
north east            1
tiruvarur             1
hingoli               1
south  pargana        1
ranga reddy           1
Name: count, Length: 949, dtype: int64

In [20]:
def convert_dates(df):
    df['date'] = pd.to_datetime(df['date'] , errors = 'coerce' , format='%d-%m-%Y')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    return df['date']

In [21]:
df1['date'] = convert_dates(df1)
df1.drop(columns='date' , axis=1 , inplace=True)
df1.head()

Unnamed: 0,state,district,pincode,bio_age_5_17,bio_age_17_,year,month,day
0,haryana,mahendragarh,123029,280,577,2025,3,1
1,bihar,madhepura,852121,144,369,2025,3,1
2,jammu and kashmir,punch,185101,643,1091,2025,3,1
3,bihar,bhojpur,802158,256,980,2025,3,1
4,tamil nadu,madurai,625514,271,815,2025,3,1


In [22]:
df2['date'] = convert_dates(df2)
df2.drop(columns='date' , axis=1 , inplace=True)
df2.head()

Unnamed: 0,state,district,pincode,demo_age_5_17,demo_age_17_,year,month,day
0,uttar pradesh,gorakhpur,273213,49,529,2025,3,1
1,andhra pradesh,chittoor,517132,22,375,2025,3,1
2,gujarat,rajkot,360006,65,765,2025,3,1
3,andhra pradesh,srikakulam,532484,24,314,2025,3,1
4,rajasthan,udaipur,313801,45,785,2025,3,1


In [23]:
df3['date'] = convert_dates(df3)
df3.drop(columns='date' , axis=1 , inplace=True)
df3.head()

Unnamed: 0,state,district,pincode,age_0_5,age_5_17,age_18_greater,year,month,day
0,meghalaya,east khasi hills,793121,11,61,37,2025,3,2
1,karnataka,bengaluru urban,560043,14,33,39,2025,3,9
2,uttar pradesh,kanpur nagar,208001,29,82,12,2025,3,9
3,uttar pradesh,aligarh,202133,62,29,15,2025,3,9
4,karnataka,bengaluru urban,560016,14,16,21,2025,3,9


In [24]:
df3 = df3[df3['state']!='']

In [25]:
df3  = df3.dropna(subset=['state'])
# df3['state'].isna().sum()

In [26]:
# df3.head()

In [27]:
# df2.head()

In [28]:
# df1.head()

In [29]:
df1.head()

Unnamed: 0,state,district,pincode,bio_age_5_17,bio_age_17_,year,month,day
0,haryana,mahendragarh,123029,280,577,2025,3,1
1,bihar,madhepura,852121,144,369,2025,3,1
2,jammu and kashmir,punch,185101,643,1091,2025,3,1
3,bihar,bhojpur,802158,256,980,2025,3,1
4,tamil nadu,madurai,625514,271,815,2025,3,1


In [30]:
df2.head()

Unnamed: 0,state,district,pincode,demo_age_5_17,demo_age_17_,year,month,day
0,uttar pradesh,gorakhpur,273213,49,529,2025,3,1
1,andhra pradesh,chittoor,517132,22,375,2025,3,1
2,gujarat,rajkot,360006,65,765,2025,3,1
3,andhra pradesh,srikakulam,532484,24,314,2025,3,1
4,rajasthan,udaipur,313801,45,785,2025,3,1


In [31]:
df3.head()

Unnamed: 0,state,district,pincode,age_0_5,age_5_17,age_18_greater,year,month,day
0,meghalaya,east khasi hills,793121,11,61,37,2025,3,2
1,karnataka,bengaluru urban,560043,14,33,39,2025,3,9
2,uttar pradesh,kanpur nagar,208001,29,82,12,2025,3,9
3,uttar pradesh,aligarh,202133,62,29,15,2025,3,9
4,karnataka,bengaluru urban,560016,14,16,21,2025,3,9


In [32]:
df1 = df1.to_csv('biometric.csv' , index = False)
df2 = df2.to_csv('demographic.csv' , index = False)
df3 = df3.to_csv('enrollment.csv' , index = False)