In [1]:
import pandas as pd

In [2]:
biometric_final_df=pd.read_csv("../Dataset/api_data_aadhar_biometric/aadhar_biometric_final.csv")
demographic_final_df=pd.read_csv("../Dataset/api_data_aadhar_demographic/aadhaar_demographic_final.csv")
enrolment_final_df=pd.read_csv("../Dataset/api_data_aadhar_enrolment/aadhar_enrolment_final.csv")

In [4]:
for df in [biometric_final_df, demographic_final_df, enrolment_final_df]:
    df['join_date'] = pd.to_datetime(
        df['date'],
        format='mixed',
        dayfirst=True,
        errors='coerce'
    )
bio_demo_df = pd.merge(
    biometric_final_df,
    demographic_final_df,
    on=['join_date', 'state', 'district', 'pincode'],
    how='outer',
    suffixes=('_bio', '_demo')
)
final_df = pd.merge(
    bio_demo_df,
    enrolment_final_df,
    on=['join_date', 'state', 'district', 'pincode'],
    how='outer',
    suffixes=('', '_enr')
)
final_df['date_final'] = (
    final_df['date_bio']
    .combine_first(final_df['date_demo'])
    .combine_first(final_df['date'])
)



In [5]:
final_df=final_df.drop(columns=['date_bio','date_final','date','date_demo'],errors='ignore')

In [6]:
final_df.head()

Unnamed: 0,state,district,pincode,bio_age_5_17,bio_age_17_,join_date,demo_age_5_17,demo_age_17_,age_0_5,age_5_17,age_18_greater
0,Andaman & Nicobar Islands,Andamans,744101,16.0,193.0,2025-03-01,,,,,
1,Andaman and Nicobar Islands,Nicobar,744301,101.0,48.0,2025-03-01,16.0,180.0,,,
2,Andaman and Nicobar Islands,Nicobar,744302,15.0,12.0,2025-03-01,,,,,
3,Andaman and Nicobar Islands,Nicobar,744303,46.0,27.0,2025-03-01,,,,,
4,Andaman and Nicobar Islands,Nicobar,744304,16.0,14.0,2025-03-01,,,,,


# State Normalisation


In [7]:
final_df["state"].unique()

array(['Andaman & Nicobar Islands', 'Andaman and Nicobar Islands',
       'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chandigarh', 'Chhattisgarh', 'Dadra & Nagar Haveli',
       'Dadra and Nagar Haveli',
       'Dadra and Nagar Haveli and Daman and Diu', 'Daman & Diu',
       'Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra',
       'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Orissa',
       'Pondicherry', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim',
       'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh',
       'Uttarakhand', 'West Bengal',
       'The Dadra And Nagar Haveli And Daman And Diu',
       'Jammu And Kashmir', 'Jammu & Kashmir', 'ODISHA', 'WEST BENGAL',
       'WESTBENGAL', 'West  Bengal', 'West bengal', 'Westbengal',
       'andhra pradesh', 'odisha', 'west Bengal', '100000', 'We

In [8]:
len(final_df['state'])

2392273

In [9]:
import pandas as pd
import re

# Normalize raw state text
final_df["state"] = (
    final_df["state"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
    .str.replace("&", "and")
)

# Explicit state corrections (canonical mapping)
state_corrections = {

    # Andaman & Nicobar
    "andaman and nicobar islands": "Andaman and Nicobar Islands",

    # Andhra Pradesh
    "andhra pradesh": "Andhra Pradesh",

    # Telangana
    "telangana": "Telangana",
    
    #Tamil Nadu
    "tamil nadu": "Tamil Nadu",
    "tamilnadu": "Tamil Nadu",

    # Bihar
    "bihar": "Bihar",

    # Chhattisgarh
    "chhatisgarh": "Chhattisgarh",

    # Dadra & Nagar Haveli and Daman & Diu (merged UT)
    "dadra and nagar haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "daman and diu": "Dadra and Nagar Haveli and Daman and Diu",
    "the dadra and nagar haveli and daman and diu": "Dadra and Nagar Haveli and Daman and Diu",

    # Delhi
    "delhi": "Delhi",

    # Jammu & Kashmir
    "jammu and kashmir": "Jammu and Kashmir",

    # Odisha
    "odisha": "Odisha",
    "orissa": "Odisha",

    # Puducherry
    "pondicherry": "Puducherry",

    # Uttarakhand
    "uttaranchal": "Uttarakhand",

    # West Bengal variants
    "west bengal": "West Bengal",
    "west bengli": "West Bengal",
    "west bangal": "West Bengal",
    "westbengal": "West Bengal",
}

# Apply corrections
final_df["state"] = final_df["state"].replace(state_corrections)

# Final formatting
final_df["state"] = final_df["state"].str.title()

# Inspect results
unique_states = sorted(final_df["state"].unique())

print("Total number of unique states:", len(unique_states))
for state in unique_states:
    print(state)

Total number of unique states: 42
100000
Andaman And Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Balanagar
Bihar
Chandigarh
Chhattisgarh
Dadra And Nagar Haveli And Daman And Diu
Darbhanga
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jaipur
Jammu And Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madanapalle
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
Odisha
Puducherry
Punjab
Puttenahalli
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
West Bengal


In [37]:
(final_df['state']=="100000").sum()

23

In [38]:
NON_STATE_VALUES = {
    "Balanagar",
    "Jaipur",
    "Darbhanga",
    "Madanapalle",
    "Puttenahalli",
}


In [40]:
final_df = final_df[
    final_df["state"].notna() &
    (final_df["state"].str.strip() != "100000")
]


In [41]:
final_df = final_df[~final_df["state"].isin(NON_STATE_VALUES)]
unique_states = sorted(final_df["state"].unique())

print("Total number of cleaned states:", len(unique_states))
for s in unique_states:
    print(s)


Total number of cleaned states: 36
Andaman And Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Bihar
Chandigarh
Chhattisgarh
Dadra And Nagar Haveli And Daman And Diu
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jammu And Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
Odisha
Puducherry
Punjab
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
West Bengal


In [43]:
import json
import pandas as pd

with open("../Dataset/GeoJSON/states_districts.json", "r", encoding="utf-8") as f:
    states_districts = json.load(f)


In [46]:
state_district_map = {}

for item in states_districts["states"]:
    state = item["state"].strip().lower()
    districts = {d.strip().lower() for d in item["districts"]}
    state_district_map[state] = districts

final_df["state_norm"] = (
    final_df["state"]
    .astype(str)
    .str.strip()
    .str.lower()
)

final_df["district_norm"] = (
    final_df["district"]
    .astype(str)
    .str.strip()
    .str.lower()
)



In [52]:
def is_valid_district(row):
    state = row["state_norm"]
    district = row["district_norm"]

    if state not in state_district_map:
        return False

    return district in state_district_map[state]

final_df["is_valid_district"] = final_df.apply(is_valid_district, axis=1)

anomalies_df = final_df[~final_df["is_valid_district"]]

print("Total anomalies found:", len(anomalies_df))
anomalies_df[["state", "district"]].drop_duplicates().head(50)


KeyError: 'state_norm'

In [48]:
final_df = final_df[final_df["is_valid_district"]].copy()


In [50]:
final_df.columns

Index(['state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_',
       'join_date', 'demo_age_5_17', 'demo_age_17_', 'age_0_5', 'age_5_17',
       'age_18_greater', 'state_norm', 'district_norm', 'is_valid_district'],
      dtype='object')

In [55]:
import json

with open("../Dataset/GeoJSON/states_districts.json", "r", encoding="utf-8") as f:
    states_districts = json.load(f)

state_district_map = {
    item["state"].strip().lower(): {
        d.strip().lower() for d in item["districts"]
    }
    for item in states_districts["states"]
}

final_df["state_norm"] = (
    final_df["state"]
    .astype(str)
    .str.strip()
    .str.lower()
)

final_df["district_norm"] = (
    final_df["district"]
    .astype(str)
    .str.strip()
    .str.lower()
)

def is_valid_district(row):
    state = row["state_norm"]
    district = row["district_norm"]

    if state not in state_district_map:
        return False

    return district in state_district_map[state]


final_df["is_valid_district"] = final_df.apply(is_valid_district, axis=1)

anomalies_df = final_df[~final_df["is_valid_district"]]

print("Total anomaly rows:", len(anomalies_df))
print("Unique anomaly state–district pairs:")
print(
    anomalies_df[["state", "district"]]
    .drop_duplicates()
    .sort_values(["state", "district"])
    .head(50)
)


Total anomaly rows: 0
Unique anomaly state–district pairs:
Empty DataFrame
Columns: [state, district]
Index: []


In [56]:
total_districts = final_df["district"].nunique()

print("Total number of districts:", total_districts)


Total number of districts: 728


In [60]:
duplicate_rows = final_df[
    final_df.duplicated(subset=["state", "district"], keep=False)
]

print(duplicate_rows.head(2))


                         state district  pincode  bio_age_5_17  bio_age_17_  \
1  Andaman And Nicobar Islands  Nicobar   744301         101.0         48.0   
2  Andaman And Nicobar Islands  Nicobar   744302          15.0         12.0   

   join_date  demo_age_5_17  demo_age_17_  age_0_5  age_5_17  age_18_greater  \
1 2025-03-01           16.0         180.0      NaN       NaN             NaN   
2 2025-03-01            NaN           NaN      NaN       NaN             NaN   

                    state_norm district_norm  is_valid_district  
1  andaman and nicobar islands       nicobar               True  
2  andaman and nicobar islands       nicobar               True  


In [61]:
final_df.columns

Index(['state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_',
       'join_date', 'demo_age_5_17', 'demo_age_17_', 'age_0_5', 'age_5_17',
       'age_18_greater', 'state_norm', 'district_norm', 'is_valid_district'],
      dtype='object')

In [62]:
final_df = final_df.drop(
    columns=["state_norm", "district_norm", "is_valid_district"],
    errors="ignore"
)
print(final_df.columns)


Index(['state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_',
       'join_date', 'demo_age_5_17', 'demo_age_17_', 'age_0_5', 'age_5_17',
       'age_18_greater'],
      dtype='object')


In [64]:
final_df = final_df.rename(columns={'join_date': 'date','age_0_5':'enr_age_0_5','age_5_17':'enr_age_5_17','age_18_greater':'enr_age_18_greater','demo_age_17_':'demo_age_17_greater','bio_age_17_':'bio_age_17_greater'})


In [65]:
final_df.head(5)

Unnamed: 0,state,district,pincode,bio_age_5_17,bio_age_17_greater,date,demo_age_5_17,demo_age_17_greater,enr_age_0_5,enr_age_5_17,enr_age_18_greater
1,Andaman And Nicobar Islands,Nicobar,744301,101.0,48.0,2025-03-01,16.0,180.0,,,
2,Andaman And Nicobar Islands,Nicobar,744302,15.0,12.0,2025-03-01,,,,,
3,Andaman And Nicobar Islands,Nicobar,744303,46.0,27.0,2025-03-01,,,,,
4,Andaman And Nicobar Islands,Nicobar,744304,16.0,14.0,2025-03-01,,,,,
5,Andaman And Nicobar Islands,North And Middle Andaman,744201,41.0,40.0,2025-03-01,,,,,
