In [1]:
import pandas as pd

df_bio = pd.read_csv('Raw Datasets/raw_biometric.csv')

print("--- BIOMETRIC DATA LOADED ---")
print(f"Shape: {df_bio.shape}")
print("\nColumns found:")
print(df_bio.columns.tolist())

--- BIOMETRIC DATA LOADED ---
Shape: (1861108, 6)

Columns found:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [2]:
bio_states = df_bio['state'].value_counts()

print("\n--- BIOMETRIC STATE AUDIT ---")
print(f"Unique entries in 'state' column: {len(bio_states)}")
print("-" * 30)
print(bio_states.to_string())


--- BIOMETRIC STATE AUDIT ---
Unique entries in 'state' column: 57
------------------------------
state
Tamil Nadu                                  184568
Andhra Pradesh                              172034
Uttar Pradesh                               155242
Maharashtra                                 151104
Karnataka                                   141227
West Bengal                                 130735
Kerala                                       98511
Gujarat                                      89531
Odisha                                       86476
Bihar                                        83398
Telangana                                    82579
Rajasthan                                    79724
Madhya Pradesh                               70080
Punjab                                       48108
Assam                                        47643
Jharkhand                                    36625
Chhattisgarh                                 31992
Himachal Pradesh            

In [3]:


# 2. Standardize formatting
df_bio['state'] = df_bio['state'].str.strip()

# 3. The Master Biometric State Map
bio_state_map = {
    # West Bengal cleanup
    'West Bangal': 'West Bengal', 'WEST BENGAL': 'West Bengal', 
    'West  Bengal': 'West Bengal', 'West bengal': 'West Bengal', 
    'Westbengal': 'West Bengal', 'WESTBENGAL': 'West Bengal', 
    'west Bengal': 'West Bengal',
    
    # Odisha and others
    'Orissa': 'Odisha', 'ODISHA': 'Odisha', 'odisha': 'Odisha',
    'andhra pradesh': 'Andhra Pradesh', 'Chhatisgarh': 'Chhattisgarh',
    'Uttaranchal': 'Uttarakhand', 'Tamilnadu': 'Tamil Nadu',
    'Pondicherry': 'Puducherry',
    
    # Union Territory Mergers
    'Andaman & Nicobar Islands': 'Andaman and Nicobar Islands',
    'Jammu & Kashmir': 'Jammu and Kashmir',
    'Daman and Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman & Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'Dadra and Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Dadra & Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu'
}

# 4. Apply the map
df_bio['state'] = df_bio['state'].replace(bio_state_map)

print("Biometric states cleaned. Unique count should now be around 36.")

Biometric states cleaned. Unique count should now be around 36.


In [4]:
# Check unique states and their counts
bio_states = df_bio['state'].value_counts()

print("\n--- BIOMETRIC STATE AUDIT ---")
print(f"Unique entries in 'state' column: {len(bio_states)}")
print("-" * 30)
print(bio_states.to_string())


--- BIOMETRIC STATE AUDIT ---
Unique entries in 'state' column: 36
------------------------------
state
Tamil Nadu                                  184569
Andhra Pradesh                              172065
Uttar Pradesh                               155242
Maharashtra                                 151104
Karnataka                                   141227
West Bengal                                 130895
Odisha                                       99674
Kerala                                       98511
Gujarat                                      89531
Bihar                                        83398
Telangana                                    82579
Rajasthan                                    79724
Madhya Pradesh                               70080
Punjab                                       48108
Assam                                        47643
Jharkhand                                    36625
Chhattisgarh                                 31997
Himachal Pradesh            

In [5]:
df_bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB


In [6]:
# Convert date to datetime format
df_bio['date'] = pd.to_datetime(df_bio['date'], errors='coerce')

# Check for any failures
print(f"Invalid dates found: {df_bio['date'].isna().sum()}")

# Sort by date
df_bio = df_bio.sort_values(by='date').reset_index(drop=True)
print("Date conversion and sorting complete.")

Invalid dates found: 0
Date conversion and sorting complete.


In [7]:
df_bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   state         object        
 2   district      object        
 3   pincode       int64         
 4   bio_age_5_17  int64         
 5   bio_age_17_   int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 85.2+ MB


In [8]:
df_bio.head(20)

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,2025-03-01,Haryana,Mahendragarh,123029,280,577
1,2025-03-01,Uttar Pradesh,Ghazipur,233232,168,47
2,2025-03-01,West Bengal,West Midnapore,721133,47,153
3,2025-03-01,Gujarat,Anand,388335,24,70
4,2025-03-01,Rajasthan,Rajsamand,313322,79,76
5,2025-03-01,Rajasthan,Churu,331301,96,157
6,2025-03-01,Madhya Pradesh,Anuppur,484336,100,119
7,2025-03-01,Odisha,Kalahandi,766027,266,221
8,2025-03-01,Odisha,Dhenkanal,759023,149,204
9,2025-03-01,Gujarat,Vadodara,391125,25,142


In [9]:
from fuzzywuzzy import process

# Get unique states
states = df_bio['state'].unique()

print("Scanning Biometric Districts for duplicates...")
print("-" * 60)

for state in states:
    districts = df_bio[df_bio['state'] == state]['district'].unique().tolist()
    
    if len(districts) < 2:
        continue
        
    for dist in districts:
        # Get the top match
        matches = process.extract(dist, [d for d in districts if d != dist], limit=1)
        
        for match, score in matches:
            if score >= 88:
                # Rule out the legitimate directional districts
                directions = ['East', 'West', 'North', 'South', 'Central', 'Urban', 'Rural']
                is_legit = any(kw in dist for kw in directions) and any(kw in match for kw in directions)
                
                if not is_legit:
                    print(f"[{state}] Duplicate Found: {dist} <---> {match} (Score: {score})")
                
                # Pop the match to avoid checking it again
                if match in districts:
                    districts.remove(match)

print("-" * 60)
print("Audit complete.")

Scanning Biometric Districts for duplicates...
------------------------------------------------------------
[Haryana] Duplicate Found: Yamuna Nagar <---> Yamunanagar (Score: 96)
[Uttar Pradesh] Duplicate Found: Balrampur <---> Rampur (Score: 90)
[Uttar Pradesh] Duplicate Found: Chandauli <---> Chandauli * (Score: 100)
[Uttar Pradesh] Duplicate Found: Bulandshahr <---> Bulandshahar (Score: 96)
[Uttar Pradesh] Duplicate Found: Agra <---> Prayagraj (Score: 90)
[Uttar Pradesh] Duplicate Found: Gautam Buddha Nagar <---> Gautam Buddha Nagar * (Score: 100)
[Uttar Pradesh] Duplicate Found: Bara Banki <---> Barabanki (Score: 95)
[Uttar Pradesh] Duplicate Found: Rae Bareli <---> Raebareli (Score: 95)
[Uttar Pradesh] Duplicate Found: Baghpat <---> Bagpat (Score: 92)
[Uttar Pradesh] Duplicate Found: Mahoba <---> Mahoba * (Score: 100)
[Uttar Pradesh] Duplicate Found: Auraiya <---> Auraiya * (Score: 100)
[Uttar Pradesh] Duplicate Found: Maharajganj <---> Mahrajganj (Score: 95)
[Uttar Pradesh] Duplic



[Jammu and Kashmir] Duplicate Found: Udhampur <---> udhampur (Score: 100)
[Jammu and Kashmir] Duplicate Found: Leh <---> Leh (ladakh) (Score: 90)
[Karnataka] Duplicate Found: Bagalkot <---> Bagalkot * (Score: 100)
[Karnataka] Duplicate Found: Bengaluru <---> Bengaluru South (Score: 90)
[Karnataka] Duplicate Found: Bangalore Rural <---> Bangalore (Score: 90)
[Karnataka] Duplicate Found: Hassan <---> Hasan (Score: 91)
[Karnataka] Duplicate Found: Udupi <---> Udupi * (Score: 100)
[Karnataka] Duplicate Found: Chikkamagaluru <---> Chikmagalur (Score: 88)
[Karnataka] Duplicate Found: Bijapur <---> Bijapur(KAR) (Score: 90)
[Karnataka] Duplicate Found: Davangere <---> Davanagere (Score: 95)
[Karnataka] Duplicate Found: Yadgir <---> yadgir (Score: 100)
[Karnataka] Duplicate Found: Chamrajanagar <---> Chamrajnagar (Score: 96)
[Karnataka] Duplicate Found: Haveri <---> Haveri * (Score: 100)
[Karnataka] Duplicate Found: Gadag <---> Gadag * (Score: 100)
[Karnataka] Duplicate Found: Chamarajanagar <-

In [10]:
import re

# 1. Basic cleaning: strip spaces and remove the '*' character globally
df_bio['district'] = df_bio['district'].str.replace(r'[*]', '', regex=True).str.strip()

# 2. Master mapping for Biometric districts
bio_dist_map = {
    'Uttar Pradesh': {
        'Bulandshahar': 'Bulandshahr', 'Bara Banki': 'Barabanki', 'Rae Bareli': 'Raebareli',
        'Bagpat': 'Baghpat', 'Mahrajganj': 'Maharajganj', 'Sant Ravidas Nagar Bhadohi': 'Bhadohi'
    },
    'West Bengal': {
        'MALDA': 'Malda', 'Maldah': 'Malda', 'Puruliya': 'Purulia', 'Barddhaman': 'Bardhaman',
        'HOOGHLY': 'Hooghly', 'hooghly': 'Hooghly', 'KOLKATA': 'Kolkata', 'HOWRAH': 'Howrah',
        'NADIA': 'Nadia', 'east midnapore': 'East Midnapore'
    },
    'Andhra Pradesh': {
        'Sri Potti Sriramulu Nellore': 'Nellore', 'chittoor': 'Chittoor', 'Karim Nagar': 'Karimnagar',
        'Ananthapur': 'Anantapur', 'Mahabubnagar': 'Mahbubnagar', 'Mahabub Nagar': 'Mahbubnagar',
        'rangareddi': 'Rangareddy', 'Rangareddi': 'Rangareddy', 'K.v. Rangareddy': 'Rangareddy',
        'K.V.Rangareddy': 'Rangareddy'
    },
    'Maharashtra': {
        'Raigarh(MH)': 'Raigarh', 'Mumbai City': 'Mumbai', 'Mumbai( Sub Urban )': 'Mumbai Suburban',
        'Buldana': 'Buldhana', 'Chatrapati Sambhaji Nagar': 'Chhatrapati Sambhajinagar',
        'Gondiya': 'Gondia'
    },
    'Odisha': {
        'ANGUL': 'Angul', 'ANUGUL': 'Angul', 'Anugul': 'Angul', 'Khorda': 'Khordha', 
        'jajpur': 'Jajpur', 'Jajapur': 'Jajpur', 'JAJPUR': 'Jajpur', 'NAYAGARH': 'Nayagarh',
        'BALANAGAR': 'Balangir', 'BALANGIR': 'Balangir', 'Baleswar': 'Baleshwar',
        'Jagatsinghapur': 'Jagatsinghpur', 'Sundergarh': 'Sundargarh', 'NUAPADA': 'Nuapada'
    },
    'Gujarat': {
        'Ahmadabad': 'Ahmedabad', 'Banas Kantha': 'Banaskantha', 'Surendra Nagar': 'Surendranagar',
        'Sabar Kantha': 'Sabarkantha', 'Panch Mahals': 'Panchmahals'
    },
    'Karnataka': {
        'Bangalore': 'Bengaluru Urban', 'Bangalore Rural': 'Bengaluru Rural', 'Hasan': 'Hassan',
        'Chikmagalur': 'Chikkamagaluru', 'Bijapur(KAR)': 'Vijayapura', 'Bijapur': 'Vijayapura',
        'Davanagere': 'Davangere', 'yadgir': 'Yadgir', 'Chamrajnagar': 'Chamarajanagar',
        'Chamrajanagar': 'Chamarajanagar'
    },
    'Bihar': {
        'Kaimur (Bhabua)': 'Kaimur', 'Bhabua': 'Kaimur', 'Aurangabad(bh)': 'Aurangabad',
        'Aurangabad(BH)': 'Aurangabad', 'Samstipur': 'Samastipur', 'Sheikpura': 'Sheikhpura'
    },
    'Chhattisgarh': {
        'Uttar Bastar Kanker': 'Kanker', 'Janjgir-champa': 'Janjgir Champa', 'Janjgir - Champa': 'Janjgir Champa',
        'Mohla-Manpur-Ambagarh Chouki': 'Mohalla-Manpur-Ambagarh Chowki',
        'ManendragarhChirmiriBharatpur': 'Manendragarh-Chirmiri-Bharatpur'
    },
    'Telangana': {
        'K.v. Rangareddy': 'Rangareddy'
    },
    'Jharkhand': {
        'Hazaribag': 'Hazaribagh', 'Palamau': 'Palamu', 'Pakaur': 'Pakur',
        'Seraikela-kharsawan': 'Seraikela-Kharsawan', 'Sahibganj': 'Sahebganj'
    },
    'Punjab': {
        'Sri Muktsar Sahib': 'Muktsar', 'S.A.S Nagar(Mohali)': 'Sas Nagar (Mohali)',
        'SAS Nagar (Mohali)': 'Sas Nagar (Mohali)'
    },
    'Others': {
        'udhampur': 'Udhampur', 'Leh (ladakh)': 'Leh', 'Hardwar': 'Haridwar',
        'Lahul & Spiti': 'Lahaul and Spiti', 'Lahul and Spiti': 'Lahaul and Spiti'
    }
}

# 3. Apply the surgical mapping
for state, mapping in bio_dist_map.items():
    for wrong, right in mapping.items():
        if state == 'Others':
            df_bio.loc[df_bio['district'] == wrong, 'district'] = right
        else:
            mask = (df_bio['state'] == state) & (df_bio['district'] == wrong)
            df_bio.loc[mask, 'district'] = right

# 4. Handle the specific Telangana encoding mess (Medchal)
df_bio['district'] = df_bio['district'].str.replace(r'Medchal.*malkajgiri', 'Medchal-Malkajgiri', case=False, regex=True)

# 5. Final normalization: Remove double spaces and non-breaking spaces (\xa0)
df_bio['district'] = df_bio['district'].str.replace(r'\s+', ' ', regex=True).str.strip()

print("Biometric district cleanup applied.")

Biometric district cleanup applied.


In [11]:
# 1. Final two targeted fixes
df_bio.loc[(df_bio['state'] == 'Mizoram') & (df_bio['district'] == 'Mammit'), 'district'] = 'Mamit'
df_bio.loc[(df_bio['state'] == 'Andaman and Nicobar Islands') & (df_bio['district'] == 'Nicobars'), 'district'] = 'Nicobar'

# 2. Final normalization sweep to ensure no trailing dots or double spaces survived
df_bio['district'] = df_bio['district'].str.replace(r'\s+', ' ', regex=True).str.strip()

print("Mizoram and Andaman fixes applied. Cleanup complete.")

Mizoram and Andaman fixes applied. Cleanup complete.


In [12]:
from fuzzywuzzy import process

# Get unique states from the cleaned df_bio
states = df_bio['state'].unique()

print("FINAL BIOMETRIC CHECK: Scanning for any remaining issues...")
print("-" * 60)

final_survivors = False
for state in states:
    districts = df_bio[df_bio['state'] == state]['district'].unique().tolist()
    
    if len(districts) < 2:
        continue
        
    for dist in districts:
        # Check for similarity
        matches = process.extract(dist, [d for d in districts if d != dist], limit=1)
        
        for match, score in matches:
            # We use 88 as the threshold for potential duplicates
            if score >= 88:
                # Rule out the legitimate directional districts
                directions = ['East', 'West', 'North', 'South', 'Central', 'Urban', 'Rural']
                is_legit = any(kw in dist for kw in directions) and any(kw in match for kw in directions)
                
                if not is_legit:
                    print(f"[{state}] Survivor Found: {dist} <---> {match} (Score: {score})")
                    final_survivors = True
                
                # Remove from list to avoid reverse pairs
                if match in districts:
                    districts.remove(match)

if not final_survivors:
    print("Verification Result: The biometric data is 100% clean.")
else:
    print("A few survivors are still listed above.")
print("-" * 60)

FINAL BIOMETRIC CHECK: Scanning for any remaining issues...
------------------------------------------------------------
[Haryana] Survivor Found: Yamuna Nagar <---> Yamunanagar (Score: 96)
[Uttar Pradesh] Survivor Found: Balrampur <---> Rampur (Score: 90)
[Uttar Pradesh] Survivor Found: Agra <---> Prayagraj (Score: 90)
[West Bengal] Survivor Found: Purba Medinipur <---> Medinipur (Score: 90)
[West Bengal] Survivor Found: Bardhaman <---> Paschim Bardhaman (Score: 90)
[West Bengal] Survivor Found: Nadia <---> nadia (Score: 100)
[Rajasthan] Survivor Found: Jhunjhunun <---> Jhunjhunu (Score: 95)
[Rajasthan] Survivor Found: Jalor <---> Jalore (Score: 91)
[Kerala] Survivor Found: Kasaragod <---> Kasargod (Score: 94)
[Assam] Survivor Found: Karbi Anglong <---> West Karbi Anglong (Score: 95)
[Assam] Survivor Found: Kamrup Metro <---> Kamrup (Score: 90)
[Assam] Survivor Found: Cachar <---> North Cachar Hills (Score: 90)
[Dadra and Nagar Haveli and Daman and Diu] Survivor Found: Dadra and Nagar



[Karnataka] Survivor Found: Bengaluru <---> Bengaluru Rural (Score: 90)
[Karnataka] Survivor Found: Bengaluru Urban <---> Bengaluru (Score: 90)
[Tamil Nadu] Survivor Found: Kanniyakumari <---> Kanyakumari (Score: 92)
[Tamil Nadu] Survivor Found: Villupuram <---> Viluppuram (Score: 90)
[Tamil Nadu] Survivor Found: Tiruvallur <---> Thiruvallur (Score: 95)
[Tamil Nadu] Survivor Found: Thiruvarur <---> Tiruvarur (Score: 95)
[Tamil Nadu] Survivor Found: Tirupattur <---> Tirupathur (Score: 90)
[Telangana] Survivor Found: Warangal <---> Warangal Urban (Score: 90)
[Telangana] Survivor Found: Rangareddy <---> Sangareddy (Score: 90)
[Telangana] Survivor Found: Warangal Rural <---> Warangal (Score: 90)
[Uttarakhand] Survivor Found: Pauri Garhwal <---> Garhwal (Score: 90)
[Meghalaya] Survivor Found: East Jaintia Hills <---> Jaintia Hills (Score: 95)
[Arunachal Pradesh] Survivor Found: Siang <---> Lower Siang (Score: 90)
[Arunachal Pradesh] Survivor Found: Upper Siang <---> Siang (Score: 90)
[Aruna

In [13]:
# Final surgical mapping for the remaining biometric survivors
final_bio_fixes = {
    'Haryana': {'Yamuna Nagar': 'Yamunanagar'},
    'West Bengal': {'nadia': 'Nadia'},
    'Rajasthan': {'Jhunjhunun': 'Jhunjhunu', 'Jalor': 'Jalore'},
    'Kerala': {'Kasargod': 'Kasaragod'},
    'Dadra and Nagar Haveli and Daman and Diu': {'Dadra & Nagar Haveli': 'Dadra and Nagar Haveli'},
    'Tamil Nadu': {
        'Kanniyakumari': 'Kanyakumari',
        'Viluppuram': 'Villupuram',
        'Thiruvallur': 'Tiruvallur',
        'Tiruvarur': 'Thiruvarur',
        'Tirupathur': 'Tirupattur'
    }
}

# Apply the mapping
for state, mapping in final_bio_fixes.items():
    for wrong, right in mapping.items():
        mask = (df_bio['state'] == state) & (df_bio['district'] == wrong)
        df_bio.loc[mask, 'district'] = right

# Special fix for the Chhattisgarh dash issue (handling the en-dash vs hyphen)
df_bio['district'] = df_bio['district'].str.replace('Manendragarh–Chirmiri–Bharatpur', 'Manendragarh-Chirmiri-Bharatpur')

# One final global strip and space normalization
df_bio['district'] = df_bio['district'].str.replace(r'\s+', ' ', regex=True).str.strip()

print("Final survivors have been unified.")

Final survivors have been unified.


In [14]:
print(df_bio[df_bio['state'] == 'Tamil Nadu']['district'].nunique())

40


In [15]:
from fuzzywuzzy import process

# Get unique states from the cleaned df_bio
states = df_bio['state'].unique()

print("FINAL BIOMETRIC AUDIT: Searching for any remaining duplicates...")
print("-" * 60)

final_check_found = False
for state in states:
    districts = df_bio[df_bio['state'] == state]['district'].unique().tolist()
    
    if len(districts) < 2:
        continue
        
    for dist in districts:
        # Check for similarity with other districts in the same state
        matches = process.extract(dist, [d for d in districts if d != dist], limit=1)
        
        for match, score in matches:
            if score >= 88:
                # Rule out legitimate directional splits
                directions = ['East', 'West', 'North', 'South', 'Central', 'Urban', 'Rural']
                is_legit = any(kw in dist for kw in directions) and any(kw in match for kw in directions)
                
                if not is_legit:
                    print(f"[{state}] Survivor: {dist} <---> {match} (Score: {score})")
                    final_check_found = True
                
                # Prevent duplicate printing of the same pair
                if match in districts:
                    districts.remove(match)

if not final_check_found:
    print("Verification Success: No more duplicates found. The data is clean.")
else:
    print("Check the survivors listed above.")
print("-" * 60)

FINAL BIOMETRIC AUDIT: Searching for any remaining duplicates...
------------------------------------------------------------
[Uttar Pradesh] Survivor: Balrampur <---> Rampur (Score: 90)
[Uttar Pradesh] Survivor: Agra <---> Prayagraj (Score: 90)
[West Bengal] Survivor: Purba Medinipur <---> Medinipur (Score: 90)
[West Bengal] Survivor: Bardhaman <---> Paschim Bardhaman (Score: 90)
[Assam] Survivor: Karbi Anglong <---> West Karbi Anglong (Score: 95)
[Assam] Survivor: Kamrup Metro <---> Kamrup (Score: 90)
[Assam] Survivor: Cachar <---> North Cachar Hills (Score: 90)
[Chhattisgarh] Survivor: Baloda Bazar <---> Balod (Score: 90)
[Chhattisgarh] Survivor: Bastar <---> Dakshin Bastar Dantewada (Score: 90)
[Maharashtra] Survivor: Mumbai <---> Mumbai Suburban (Score: 90)




[Karnataka] Survivor: Bengaluru <---> Bengaluru Rural (Score: 90)
[Karnataka] Survivor: Bengaluru Urban <---> Bengaluru (Score: 90)
[Telangana] Survivor: Warangal <---> Warangal Urban (Score: 90)
[Telangana] Survivor: Rangareddy <---> Sangareddy (Score: 90)
[Telangana] Survivor: Warangal Rural <---> Warangal (Score: 90)
[Uttarakhand] Survivor: Pauri Garhwal <---> Garhwal (Score: 90)
[Meghalaya] Survivor: East Jaintia Hills <---> Jaintia Hills (Score: 95)
[Arunachal Pradesh] Survivor: Siang <---> Lower Siang (Score: 90)
[Arunachal Pradesh] Survivor: Upper Siang <---> Siang (Score: 90)
[Arunachal Pradesh] Survivor: Dibang Valley <---> Lower Dibang Valley (Score: 95)
Check the survivors listed above.
------------------------------------------------------------


In [16]:
# 1. Remove rows where district name is just whitespace or a single symbol
df_bio = df_bio[df_bio['district'].str.len() > 1]

# 2. Specifically remove any rows that might have become empty strings
df_bio = df_bio[df_bio['district'].str.strip() != '']

print("Empty/Symbol districts removed. No more empty string warnings should appear.")



In [17]:
# Create a total updates column just for this view
df_bio['total_biometric'] = df_bio['bio_age_5_17'] + df_bio['bio_age_17_']

# State-wise totals
state_bio_totals = df_bio.groupby('state')['total_biometric'].sum().sort_values(ascending=False)

print("--- FINAL BIOMETRIC TOTALS BY STATE ---")
print(state_bio_totals.to_string())

--- FINAL BIOMETRIC TOTALS BY STATE ---
state
Uttar Pradesh                               9577735
Maharashtra                                 9226139
Madhya Pradesh                              5923771
Bihar                                       4897587
Tamil Nadu                                  4698118
Rajasthan                                   3994955
Andhra Pradesh                              3714633
Gujarat                                     3196514
Chhattisgarh                                2648734
Karnataka                                   2635954
West Bengal                                 2524619
Odisha                                      2464960
Jharkhand                                   2026297
Punjab                                      1739671
Telangana                                   1737654
Haryana                                     1635454
Kerala                                      1609730
Delhi                                       1304362
Assam             

In [18]:
# Create the total column correctly in df_bio
df_bio['total_biometric'] = df_bio['bio_age_5_17'] + df_bio['bio_age_17_']

# Group by state and district for Biometric data
biometric_dist_totals = df_bio.groupby(['state', 'district'])['total_biometric'].sum()

# Print the entire list
print("--- FULL BIOMETRIC LIST BY DISTRICT ---")
print(biometric_dist_totals.to_string())

--- FULL BIOMETRIC LIST BY DISTRICT ---
state                                     district                       
Andaman and Nicobar Islands               Andamans                             2015
                                          Nicobar                              1813
                                          North And Middle Andaman             6383
                                          South Andaman                       10487
Andhra Pradesh                            Adilabad                            52038
                                          Alluri Sitharama Raju               18607
                                          Anakapalli                           6723
                                          Anantapur                          177697
                                          Ananthapuramu                      109602
                                          Annamayya                            9712
                                          Bapa

In [19]:
# Comprehensive mapping for the survivors you found in df_bio
bio_final_cleanup = {
    'Bihar': {'Purnea': 'Purnia'},
    'Gujarat': {'Dohad': 'Dahod'},
    'Jammu and Kashmir': {
        'Bandipur': 'Bandipore',
        'Punch': 'Poonch',
        'Rajauri': 'Rajouri'
    },
    'Jharkhand': {'Kodarma': 'Koderma'},
    'West Bengal': {
        'East Midnapur': 'East Midnapore',
        'East midnapore': 'East Midnapore',
        'Darjiling': 'Darjeeling',
        'Haora': 'Howrah',
        'Hawrah': 'Howrah',
        'Hooghiy': 'Hooghly',
        'Hugli': 'Hooghly',
        'North Twenty Four Parganas': 'North 24 Parganas',
        'South 24 pargana': 'South 24 Parganas',
        'South 24 parganas': 'South 24 Parganas',
        'South 24 Pargana': 'South 24 Parganas'
    },
    'Karnataka': {
        'Bellary': 'Ballari',
        'Chickmagalur': 'Chikkamagaluru',
        'Mysore': 'Mysuru',
        'Shimoga': 'Shivamogga',
        'Tumkur': 'Tumakuru',
        'Bengaluru South': 'Bengaluru Urban' # Merging small fragment into Urban
    },
    'Maharashtra': {
        'Ahmed Nagar': 'Ahmadnagar',
        'Raigad': 'Raigarh' # Standardizing spelling
    },
    'Odisha': {'Anugal': 'Angul'},
    'Puducherry': {'Pondicherry': 'Puducherry'},
    'Punjab': {'Firozpur': 'Ferozepur'},
    'Rajasthan': {
        'Chittaurgarh': 'Chittorgarh',
        'Dhaulpur': 'Dholpur'
    },
    'Tamil Nadu': {'Kanchipuram': 'Kancheepuram'},
    'Telangana': {
        'Jangoan': 'Jangaon',
        'Warangal (urban)': 'Warangal Urban',
        'Warangal': 'Warangal Urban' # Merging generic Warangal into Urban
    }
}

# Apply the mapping specifically to df_bio
for state, mapping in bio_final_cleanup.items():
    for wrong, right in mapping.items():
        mask = (df_bio['state'] == state) & (df_bio['district'] == wrong)
        df_bio.loc[mask, 'district'] = right

# Special Fix for Sikkim Districts (Mapping 'East', 'West' etc. to proper names)
sikkim_mask = df_bio['state'] == 'Sikkim'
df_bio.loc[sikkim_mask & (df_bio['district'] == 'East'), 'district'] = 'East Sikkim'
df_bio.loc[sikkim_mask & (df_bio['district'] == 'West'), 'district'] = 'West Sikkim'
df_bio.loc[sikkim_mask & (df_bio['district'] == 'North'), 'district'] = 'North Sikkim'
df_bio.loc[sikkim_mask & (df_bio['district'] == 'South'), 'district'] = 'South Sikkim'

# Final Space cleanup
df_bio['district'] = df_bio['district'].str.replace(r'\s+', ' ', regex=True).str.strip()

print("Biometric data: All spelling and historical variants unified.")

Biometric data: All spelling and historical variants unified.


In [20]:
# Final unification for West Bengal survivors in df_bio
wb_final_map = {
    'West Medinipur': 'West Midnapore',
    'South Twenty Four Parganas': 'South 24 Parganas'
}

for wrong, right in wb_final_map.items():
    df_bio.loc[(df_bio['state'] == 'West Bengal') & (df_bio['district'] == wrong), 'district'] = right

# One last check for any case-sensitivity issues in West Bengal
df_bio.loc[df_bio['state'] == 'West Bengal', 'district'] = df_bio.loc[df_bio['state'] == 'West Bengal', 'district'].str.title().str.replace('24 Parganas', '24 Parganas')

print("West Bengal districts unified. Biometric data is now consistent.")

West Bengal districts unified. Biometric data is now consistent.


In [21]:
# Final check of the top 20 Biometric districts
df_bio['total_biometric'] = df_bio['bio_age_5_17'] + df_bio['bio_age_17_']
print(df_bio.groupby(['state', 'district'])['total_biometric'].sum().sort_values(ascending=False))

state        district 
Maharashtra  Pune         605762
             Nashik       576606
             Thane        571273
             Mumbai       451930
Gujarat      Ahmedabad    421172
                           ...  
Manipur      Pherzawl          2
Rajasthan    Phalodi           2
Goa          Tiswadi           2
Haryana      Akhera            1
Rajasthan    Salumbar          1
Name: total_biometric, Length: 838, dtype: int64


In [22]:
# 1. Ensure the total column exists in df_bio
df_bio['total_biometric'] = df_bio['bio_age_5_17'] + df_bio['bio_age_17_']

# 2. Group by state and district to get the sums
# Sorting by state then district name for a clean, readable list
biometric_summary = df_bio.groupby(['state', 'district'])['total_biometric'].sum()

# 3. Print the entire list without truncation
print("--- FULL BIOMETRIC LIST BY DISTRICT ---")
print("-" * 60)
print(biometric_summary.to_string())
print("-" * 60)

--- FULL BIOMETRIC LIST BY DISTRICT ---
------------------------------------------------------------
state                                     district                       
Andaman and Nicobar Islands               Andamans                             2015
                                          Nicobar                              1813
                                          North And Middle Andaman             6383
                                          South Andaman                       10487
Andhra Pradesh                            Adilabad                            52038
                                          Alluri Sitharama Raju               18607
                                          Anakapalli                           6723
                                          Anantapur                          177697
                                          Ananthapuramu                      109602
                                          Annamayya                  

In [23]:
# Final consolidation for the Biometric stragglers
final_mopping = {
    'Andhra Pradesh': {
        'Ananthapuramu': 'Anantapur',
        'Cuddapah': 'Y. S. R', # Cuddapah is the old name for YSR District
        'Hyderabad': 'Hyderabad', # Keep as is, but watch for splits with Telangana
    },
    'Bihar': {
        'Monghyr': 'Munger',
        'Pashchim Champaran': 'West Champaran',
        'Purba Champaran': 'East Champaran'
    },
    'Chandigarh': {
        'Mohali': 'Sas Nagar (Mohali)',
        'Rupnagar': 'Rupnagar'
    },
    'Chhattisgarh': {
        'Dakshin Bastar Dantewada': 'Dantewada',
        'Kawardha': 'Kabeerdham'
    },
    'Delhi': {
        'North East Delhi': 'North East',
        'Najafgarh': 'South West Delhi' # Administrative sub-division
    },
    'Karnataka': {
        'Belgaum': 'Belagavi',
        'Bengaluru': 'Bengaluru Urban',
        'Bengaluru South': 'Bengaluru Urban',
        'Gulbarga': 'Kalaburagi'
    },
    'Telangana': {
        'Yadadri.': 'Yadadri Bhuvanagiri',
        'Warangal Rural': 'Warangal',
        'Warangal Urban': 'Warangal'
    },
    'Uttar Pradesh': {
        'Allahabad': 'Prayagraj',
        'Faizabad': 'Ayodhya',
        'Sant Ravidas Nagar': 'Bhadohi',
        'Jyotiba Phule Nagar': 'Amroha'
    },
    'West Bengal': {
        'Burdwan': 'Bardhaman',
        'Bally Jagachha': 'Howrah',
        'Domjur': 'Howrah',
        'Koch Bihar': 'Cooch Behar',
        'North Dinajpur': 'Uttar Dinajpur',
        'South Dinajpur': 'Dakshin Dinajpur',
        'Medinipur': 'Paschim Medinipur'
    }
}

# Apply the mapping
for state, mapping in final_mopping.items():
    for wrong, right in mapping.items():
        mask = (df_bio['state'] == state) & (df_bio['district'] == wrong)
        df_bio.loc[mask, 'district'] = right

# Global fix for the dot in 'Yadadri.' and extra spaces
df_bio['district'] = df_bio['district'].str.replace(r'\.', '', regex=True).str.strip()

print("Deep clean complete.")

Deep clean complete.


In [24]:
# 1. Final total calculation for df_bio
df_bio['total_biometric'] = df_bio['bio_age_5_17'] + df_bio['bio_age_17_']

# 2. Group by state and district
biometric_final_grouped = df_bio.groupby(['state', 'district'])['total_biometric'].sum()

# 3. Print the absolute full list
print("--- FINAL BIOMETRIC GROUPED LIST (ALL STATES/DISTRICTS) ---")
print("-" * 70)
print(biometric_final_grouped.to_string())
print("-" * 70)

# Quick check on total unique districts left
print(f"Total Unique Districts in Biometric: {len(biometric_final_grouped)}")

--- FINAL BIOMETRIC GROUPED LIST (ALL STATES/DISTRICTS) ---
----------------------------------------------------------------------
state                                     district                       
Andaman and Nicobar Islands               Andamans                             2015
                                          Nicobar                              1813
                                          North And Middle Andaman             6383
                                          South Andaman                       10487
Andhra Pradesh                            Adilabad                            52038
                                          Alluri Sitharama Raju               18607
                                          Anakapalli                           6723
                                          Anantapur                          287299
                                          Annamayya                            9712
                                       

In [25]:
df_bio.to_csv('Cleaned Datasets/cleaned_biometric.csv', index=False)

print("Total records secured:", len(df_bio))

Total records secured: 1861107
