# Cleaning Enrolment Data

In [1]:
import zipfile
import pandas as pd
import os

In [4]:
# Define your zip files based on your screenshots
zips = {
    'biometric': 'Zipped Datasets/api_data_aadhar_biometric.zip',
    'demographic': 'Zipped Datasets/api_data_aadhar_demographic.zip',
    'enrolment': 'Zipped Datasets/api_data_aadhar_enrolment.zip'
}

# This dictionary will hold the 3 final merged tables
dataframes = {}

for category, zip_name in zips.items():
    print(f"Combining files for: {category}...")
    temp_list = []
    
    try:
        with zipfile.ZipFile(zip_name, 'r') as z:
            # Get all CSV files inside this specific zip
            csv_files = [f for f in z.namelist() if f.endswith('.csv')]
            
            for csv_file in csv_files:
                with z.open(csv_file) as f:
                    # low_memory=False helps with mixed data types in large files
                    temp_df = pd.read_csv(f, low_memory=False)
                    temp_list.append(temp_df)
        
        # Merge the parts (0_500000, 500000_1000000, etc.) into one table
        dataframes[category] = pd.concat(temp_list, ignore_index=True)
        print(f"Successfully created df_{category} with {len(dataframes[category]):,} rows.")
        
    except FileNotFoundError:
        print(f"Error: Could not find {zip_name}. Make sure it's in the same folder!")

# Create easy-to-use variables for your analysis
df_biometric = dataframes.get('biometric')
df_demographic = dataframes.get('demographic')
df_enrolment = dataframes.get('enrolment')

Combining files for: biometric...
Successfully created df_biometric with 1,861,108 rows.
Combining files for: demographic...
Successfully created df_demographic with 2,071,700 rows.
Combining files for: enrolment...
Successfully created df_enrolment with 1,006,029 rows.


In [None]:
os.makedirs('Raw Datasets', exist_ok=True) 
os.makedirs('Cleaned Datasets', exist_ok=True)

In [5]:
print(df_biometric.head(10))
print(df_biometric.tail(10))

         date              state      district  pincode  bio_age_5_17  \
0  01-03-2025            Haryana  Mahendragarh   123029           280   
1  01-03-2025              Bihar     Madhepura   852121           144   
2  01-03-2025  Jammu and Kashmir         Punch   185101           643   
3  01-03-2025              Bihar       Bhojpur   802158           256   
4  01-03-2025         Tamil Nadu       Madurai   625514           271   
5  01-03-2025        Maharashtra     Ratnagiri   416702           155   
6  01-03-2025            Gujarat         Anand   388130            75   
7  01-03-2025            Gujarat   Gandhinagar   382421           192   
8  01-03-2025             Odisha     Dhenkanal   759025           122   
9  01-03-2025            Gujarat        Valsad   396055            67   

   bio_age_17_  
0          577  
1          369  
2         1091  
3          980  
4          815  
5          529  
6          143  
7          298  
8          214  
9           85  
         

In [6]:
print(df_demographic.head(10))
print(df_demographic.tail(10))

         date           state           district  pincode  demo_age_5_17  \
0  01-03-2025   Uttar Pradesh          Gorakhpur   273213             49   
1  01-03-2025  Andhra Pradesh           Chittoor   517132             22   
2  01-03-2025         Gujarat             Rajkot   360006             65   
3  01-03-2025  Andhra Pradesh         Srikakulam   532484             24   
4  01-03-2025       Rajasthan            Udaipur   313801             45   
5  01-03-2025       Rajasthan              Sikar   332028             28   
6  01-03-2025       Karnataka           Tumakuru   572201             88   
7  01-03-2025   Uttar Pradesh          Gorakhpur   273211             61   
8  01-03-2025  Andhra Pradesh            Kurnool   518313             83   
9  01-03-2025     West Bengal  Paschim Medinipur   721148             13   

   demo_age_17_  
0           529  
1           375  
2           765  
3           314  
4           785  
5           285  
6           332  
7           836  
8

In [7]:
print(df_enrolment.head(10))
print(df_enrolment.tail(10))

         date          state          district  pincode  age_0_5  age_5_17  \
0  02-03-2025      Meghalaya  East Khasi Hills   793121       11        61   
1  09-03-2025      Karnataka   Bengaluru Urban   560043       14        33   
2  09-03-2025  Uttar Pradesh      Kanpur Nagar   208001       29        82   
3  09-03-2025  Uttar Pradesh           Aligarh   202133       62        29   
4  09-03-2025      Karnataka   Bengaluru Urban   560016       14        16   
5  09-03-2025          Bihar         Sitamarhi   843331       20        49   
6  09-03-2025          Bihar         Sitamarhi   843330       23        24   
7  09-03-2025  Uttar Pradesh          Bahraich   271865       26        60   
8  09-03-2025  Uttar Pradesh         Firozabad   283204       28        26   
9  09-03-2025          Bihar   Purbi Champaran   845418       30        48   

   age_18_greater  
0              37  
1              39  
2              12  
3              15  
4              21  
5              12  
6

In [8]:
df_biometric.info()    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB


In [9]:
df_demographic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   date           object
 1   state          object
 2   district       object
 3   pincode        int64 
 4   demo_age_5_17  int64 
 5   demo_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 94.8+ MB


In [10]:
df_enrolment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   date            1006029 non-null  object
 1   state           1006029 non-null  object
 2   district        1006029 non-null  object
 3   pincode         1006029 non-null  int64 
 4   age_0_5         1006029 non-null  int64 
 5   age_5_17        1006029 non-null  int64 
 6   age_18_greater  1006029 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 53.7+ MB


In [11]:
df_biometric.isnull().sum()

date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64

In [12]:
df_demographic.isnull().sum()

date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

In [13]:
df_enrolment.isnull().sum()

date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

In [14]:
# 1. Check for rows that are completely empty strings ""
empty_strings = (df_demographic == "").all(axis=1).sum()

# 2. Check for rows that are just whitespace " " 
# (Common if the CSV was generated poorly)
whitespace_rows = df_demographic.apply(lambda x: x.str.strip().eq('') if x.dtype == "object" else False).all(axis=1).sum()

print(f"Total rows that are completely blank: {empty_strings}")
print(f"Total rows that are just whitespace: {whitespace_rows}")

Total rows that are completely blank: 0
Total rows that are just whitespace: 0


In [15]:
pd.options.display.float_format = '{:.2f}'.format
df_biometric.describe()

Unnamed: 0,pincode,bio_age_5_17,bio_age_17_
count,1861108.0,1861108.0,1861108.0
mean,521761.17,18.39,19.09
std,198162.68,83.7,88.07
min,110001.0,0.0,0.0
25%,391175.0,1.0,1.0
50%,522401.0,3.0,4.0
75%,686636.25,11.0,10.0
max,855456.0,8002.0,7625.0


In [16]:
pd.options.display.float_format = '{:.2f}'.format
df_demographic.describe()

Unnamed: 0,pincode,demo_age_5_17,demo_age_17_
count,2071700.0,2071700.0,2071700.0
mean,527831.78,2.35,21.45
std,197293.32,14.9,125.25
min,100000.0,0.0,0.0
25%,396469.0,0.0,2.0
50%,524322.0,1.0,6.0
75%,695507.0,2.0,15.0
max,855456.0,2690.0,16166.0


In [17]:
pd.options.display.float_format = '{:.2f}'.format
df_enrolment.describe()

Unnamed: 0,pincode,age_0_5,age_5_17,age_18_greater
count,1006029.0,1006029.0,1006029.0,1006029.0
mean,518641.45,3.53,1.71,0.17
std,205635.97,17.54,14.37,3.22
min,100000.0,0.0,0.0,0.0
25%,363641.0,1.0,0.0,0.0
50%,517417.0,2.0,0.0,0.0
75%,700104.0,3.0,1.0,0.0
max,855456.0,2688.0,1812.0,855.0


In [18]:
df_biometric['date'] = pd.to_datetime(df_biometric['date'], dayfirst=True)
df_demographic['date'] = pd.to_datetime(df_demographic['date'], dayfirst=True)
df_enrolment['date'] = pd.to_datetime(df_enrolment['date'], dayfirst=True)

print("Dates converted successfully!")

Dates converted successfully!


In [19]:
df_biometric.info()    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   state         object        
 2   district      object        
 3   pincode       int64         
 4   bio_age_5_17  int64         
 5   bio_age_17_   int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 85.2+ MB


In [20]:
df_demographic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 6 columns):
 #   Column         Dtype         
---  ------         -----         
 0   date           datetime64[ns]
 1   state          object        
 2   district       object        
 3   pincode        int64         
 4   demo_age_5_17  int64         
 5   demo_age_17_   int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 94.8+ MB


In [21]:
df_enrolment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   date            1006029 non-null  datetime64[ns]
 1   state           1006029 non-null  object        
 2   district        1006029 non-null  object        
 3   pincode         1006029 non-null  int64         
 4   age_0_5         1006029 non-null  int64         
 5   age_5_17        1006029 non-null  int64         
 6   age_18_greater  1006029 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 53.7+ MB


In [22]:
df_enrolment['total_enrol'] = df_enrolment['age_0_5'] + df_enrolment['age_5_17'] + df_enrolment['age_18_greater']
state_enrolment = df_enrolment.groupby('state')['total_enrol'].sum().sort_values(ascending=False)    
print(state_enrolment)

state
Uttar Pradesh                                   1018629
Bihar                                            609585
Madhya Pradesh                                   493970
West Bengal                                      375297
Maharashtra                                      369139
Rajasthan                                        348458
Gujarat                                          280549
Assam                                            230197
Karnataka                                        223235
Tamil Nadu                                       220789
Jharkhand                                        157539
Telangana                                        131574
Andhra Pradesh                                   127681
Odisha                                           118837
Meghalaya                                        109771
Chhattisgarh                                     103219
Haryana                                           98252
Delhi                                     

In [23]:
df_enrolment['state'] = df_enrolment['state'].str.strip()

cleanup_map = {
    # West Bengal
    'West  Bengal': 'West Bengal', 'West Bangal': 'West Bengal', 
    'West bengal': 'West Bengal', 'Westbengal': 'West Bengal', 
    'WEST BENGAL': 'West Bengal', 'WESTBENGAL': 'West Bengal',
    
    # Andhra Pradesh
    'andhra pradesh': 'Andhra Pradesh',
    
    # Odisha
    'orissa': 'Odisha', 'ODISHA': 'Odisha', 'Orissa': 'Odisha',
    
    # Jammu and Kashmir
    'Jammu And Kashmir': 'Jammu and Kashmir', 'Jammu & Kashmir': 'Jammu and Kashmir',
    
    # Puducherry
    'Pondicherry': 'Puducherry',
    
    # The Big Union Territory Merge
    'Dadra and Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman and Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'Dadra & Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman & Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'The Dadra And Nagar Haveli And Daman And Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    
    # Andaman and Nicobar
    'Andaman & Nicobar Islands': 'Andaman and Nicobar Islands'
}

# Apply the mapping
df_enrolment['state'] = df_enrolment['state'].replace(cleanup_map)

# Remove the "100000" garbage value
df_enrolment = df_enrolment[df_enrolment['state'] != '100000']

# Final check
print(df_enrolment['state'].value_counts())

state
Uttar Pradesh                               110369
Tamil Nadu                                   92552
Maharashtra                                  77191
West Bengal                                  76561
Karnataka                                    70198
Andhra Pradesh                               65663
Bihar                                        60567
Rajasthan                                    56159
Madhya Pradesh                               50225
Odisha                                       47011
Gujarat                                      46624
Telangana                                    42774
Kerala                                       39145
Assam                                        31827
Jharkhand                                    23218
Punjab                                       20439
Chhattisgarh                                 18550
Haryana                                      15997
Jammu and Kashmir                            11455
Himachal Pradesh         

In [24]:
# 1. Group by District and get a list of unique PIN codes for each
district_check = df_enrolment.groupby('district')['pincode'].unique()

# 2. To find typos, let's look at the "Top" and "Bottom" alphabetically
# Often typos appear right next to the correct spelling (e.g., 'Gurgaon' vs 'Gurugram')

district_check.sort_index().tail(20)

district
West Siang                         [791125, 791001, 791101, 792051]
West Sikkim                                [737121, 737111, 737113]
West Singhbhum    [833201, 833215, 833212, 833214, 833102, 83310...
West Tripura      [799045, 799035, 799002, 799008, 799012, 79911...
Wokha                              [797111, 797100, 797112, 798601]
Y. S. R           [516002, 516003, 516172, 516233, 516289, 51643...
Yadadri.          [508285, 508111, 508112, 508113, 508105, 50811...
Yadgir            [585290, 585202, 585214, 585215, 585220, 58528...
Yamuna Nagar      [133206, 135001, 135101, 135103, 135106, 13320...
Yamunanagar                [135001, 135003, 135101, 135133, 135002]
Yanam                                                      [533464]
Yavatmal          [445204, 445205, 445215, 445106, 445002, 44510...
Zunheboto          [798620, 797109, 798619, 798601, 798627, 797104]
chittoor                                                   [517520]
hooghly                                

In [25]:
district_check.sort_index().head(20)

district
24 Paraganas North    [743263, 700129, 743248, 743287, 700125, 70011...
24 Paraganas South                                             [743347]
ANGUL                                                          [759119]
ANUGUL                                                         [759119]
Adilabad              [504293, 504294, 504302, 504308, 504208, 50429...
Agar Malwa             [465447, 465445, 465230, 465441, 465449, 465550]
Agra                  [282005, 282001, 282010, 283125, 283111, 28320...
Ahilyanagar           [422620, 422608, 413738, 414001, 414201, 42261...
Ahmadabad             [380001, 382445, 382421, 380016, 382350, 38222...
Ahmadnagar            [413201, 413701, 413703, 413705, 414002, 41400...
Ahmed Nagar           [413714, 413738, 414001, 422603, 413706, 41440...
Ahmedabad             [382150, 380001, 380005, 382340, 380018, 38002...
Ahmednagar                                             [414105, 413705]
Aizawl                [796001, 796012, 796111, 796009, 

In [26]:
from difflib import get_close_matches

def find_district_typos(df):
    for state in df['state'].unique():
        districts = df[df['state'] == state]['district'].unique()
        for d in districts:
            # Find names that are very similar but not identical
            matches = get_close_matches(d, districts, n=2, cutoff=0.8)
            if len(matches) > 1:
                print(f"In {state}: Potential duplicates {matches}")

find_district_typos(df_enrolment)

In Meghalaya: Potential duplicates ['East Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['West Khasi Hills', 'East Khasi Hills']
In Meghalaya: Potential duplicates ['West Jaintia Hills', 'East Jaintia Hills']
In Meghalaya: Potential duplicates ['West Garo Hills', 'East Garo Hills']
In Meghalaya: Potential duplicates ['South Garo Hills', 'North Garo Hills']
In Meghalaya: Potential duplicates ['North Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['South West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['East Jaintia Hills', 'West Jaintia Hills']
In Meghalaya: Potential duplicates ['East Garo Hills', 'West Garo Hills']
In Meghalaya: Potential duplicates ['South West Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['Eastern West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['Jaintia Hills', 'West Jaintia Hills']
In Karnataka: Potential duplicates ['Bengaluru Urban', 'Bengaluru 

In [27]:
import pandas as pd

# 1. First, apply a general standard: Title Case and Strip Spaces
# This fixes 'yadgir' -> 'Yadgir' and 'punch' -> 'Punch' automatically
df_enrolment['district'] = df_enrolment['district'].str.strip().str.title()

# 2. Define the Surgical Mapping
# This list ONLY contains actual errors. I have excluded the distinct directional districts.
master_district_corrections = {
    'Karnataka': {
        'Bangalore Rural': 'Bengaluru Rural', 'Ramanagar': 'Ramanagara',
        'Bagalkot *': 'Bagalkot', 'Chamrajanagar': 'Chamarajanagar', 
        'Chamrajnagar': 'Chamarajanagar', 'Chickmagalur': 'Chikkamagaluru',
        'Chikmagalur': 'Chikkamagaluru', 'Davanagere': 'Davangere',
        'Gadag *': 'Gadag', 'Hasan': 'Hassan', 'Haveri *': 'Haveri',
        'Shimoga': 'Shivamogga', 'Tumkur': 'Tumakuru', 'Udupi *': 'Udupi'
    },
    'Uttar Pradesh': {
        'Mahrajganj': 'Maharajganj', 'Bulandshahar': 'Bulandshahr',
        'Bagpat': 'Baghpat', 'Barabanki': 'Bara Banki', 'Shrawasti': 'Shravasti',
        'Siddharthnagar': 'Siddharth Nagar', 'Kushinagar *': 'Kushinagar',
        'Kushi Nagar': 'Kushinagar', 'Raebareli': 'Rae Bareli',
        'Sant Ravidas Nagar Bhadohi': 'Sant Ravidas Nagar'
    },
    'West Bengal': {
        'Coochbehar': 'Cooch Behar', 'Darjiling': 'Darjeeling',
        '24 Paraganas North': 'North 24 Parganas', '24 Paraganas South': 'South 24 Parganas',
        'North Twenty Four Parganas': 'North 24 Parganas', 'South Twenty Four Parganas': 'South 24 Parganas',
        'Barddhaman': 'Bardhaman', 'East Midnapur': 'East Midnapore',
        'Maldah': 'Malda', 'Puruliya': 'Purulia', 'Hawrah': 'Howrah',
        'Hooghiy': 'Hooghly', 'South 24 Pargana': 'South 24 Parganas'
    },
    'Maharashtra': {
        'Ahmadnagar': 'Ahmednagar', 'Ahmed Nagar': 'Ahmednagar',
        'Mumbai( Sub Urban )': 'Mumbai Suburban', 'Buldana': 'Buldhana',
        'Chatrapati Sambhaji Nagar': 'Chhatrapati Sambhajinagar',
        'Gondiya': 'Gondia', 'Gondiya *': 'Gondia', 'Hingoli *': 'Hingoli'
    },
    'Bihar': {
        'Purba Champaran': 'Purbi Champaran', 'Purnea': 'Purnia',
        'Sheikpura': 'Sheikhpura', 'Samstipur': 'Samastipur',
        'Aurangabad(Bh)': 'Aurangabad', 'Aurangabad(Bh)': 'Aurangabad'
    },
    'Odisha': {
        'Anugul': 'Angul', 'Anugal': 'Angul', 'Nabarangapur': 'Nabarangpur',
        'Khorda': 'Khordha', 'Baleshwar': 'Baleswar', 'Baudh': 'Boudh',
        'Jagatsinghapur': 'Jagatsinghpur', 'Jajapur': 'Jajpur', 'Sundergarh': 'Sundargarh'
    },
    'Telangana': {
        'Jangoan': 'Jangaon', 'Medchal-Malkajgiri': 'Medchal-Malkajgiri',
        'Medchal−Malkajgiri': 'Medchal-Malkajgiri', 'Medchal?Malkajgiri': 'Medchal-Malkajgiri',
        'Medchal Malkajgiri': 'Medchal-Malkajgiri'
    },
    'Jammu And Kashmir': {
        'Shupiyan': 'Shopian', 'Badgam': 'Budgam', 'Bandipore': 'Bandipur',
        'Rajauri': 'Rajouri'
    }
}

# 3. Apply the Mapping Surgicaly by State
for state, mapping in master_district_corrections.items():
    for wrong, right in mapping.items():
        # Only fix if BOTH state and district match to avoid cross-state accidents
        mask = (df_enrolment['state'].str.title() == state) & (df_enrolment['district'] == wrong)
        df_enrolment.loc[mask, 'district'] = right

# 4. Final Polish: Remove any remaining asterisks or weird trailing spaces
df_enrolment['district'] = df_enrolment['district'].str.replace(r'\s*[*]\s*', '', regex=True).str.strip()

print("District cleanup finished! Legitimate directional districts were preserved.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_enrolment['district'] = df_enrolment['district'].str.strip().str.title()


District cleanup finished! Legitimate directional districts were preserved.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_enrolment['district'] = df_enrolment['district'].str.replace(r'\s*[*]\s*', '', regex=True).str.strip()


In [28]:
from difflib import get_close_matches

def find_district_typos(df):
    for state in df['state'].unique():
        districts = df[df['state'] == state]['district'].unique()
        for d in districts:
            # Find names that are very similar but not identical
            matches = get_close_matches(d, districts, n=2, cutoff=0.8)
            if len(matches) > 1:
                print(f"In {state}: Potential duplicates {matches}")

find_district_typos(df_enrolment)

In Meghalaya: Potential duplicates ['East Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['West Khasi Hills', 'East Khasi Hills']
In Meghalaya: Potential duplicates ['West Jaintia Hills', 'East Jaintia Hills']
In Meghalaya: Potential duplicates ['West Garo Hills', 'East Garo Hills']
In Meghalaya: Potential duplicates ['South Garo Hills', 'North Garo Hills']
In Meghalaya: Potential duplicates ['North Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['South West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['East Jaintia Hills', 'West Jaintia Hills']
In Meghalaya: Potential duplicates ['East Garo Hills', 'West Garo Hills']
In Meghalaya: Potential duplicates ['South West Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['Eastern West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['Jaintia Hills', 'West Jaintia Hills']
In Karnataka: Potential duplicates ['Bengaluru Urban', 'Bengaluru 

In [29]:
import pandas as pd

# 1. Ensure we are working on a clean, independent copy to avoid warnings
df_enrolment = df_enrolment.copy()

# 2. Comprehensive mapping based on your manual audit
# This preserves distinct directional districts while merging spelling errors.
final_comprehensive_map = {
    'Haryana': {'Yamuna Nagar': 'Yamunanagar'},
    'Rajasthan': {
        'Jalor': 'Jalore', 'Dholpur': 'Dhaulpur', 
        'Chittaurgarh': 'Chittorgarh', 'Jhunjhunun': 'Jhunjhunu'
    },
    'Punjab': {
        'S.A.S Nagar(Mohali)': 'Sas Nagar (Mohali)', 
        'Ferozepur': 'Firozpur'
    },
    'Madhya Pradesh': {
        'Ashok Nagar': 'Ashoknagar', 'Narsimhapur': 'Narsinghpur'
    },
    'Assam': {'Sibsagar': 'Sivasagar'},
    'Uttarakhand': {'Hardwar': 'Haridwar'},
    'Gujarat': {
        'Banas Kantha': 'Banaskantha', 'Sabar Kantha': 'Sabarkantha',
        'Panch Mahals': 'Panchmahals', 'Surendra Nagar': 'Surendranagar',
        'Ahmadabad': 'Ahmedabad'
    },
    'Andhra Pradesh': {
        'Visakhapatanam': 'Visakhapatnam', 'Mahabub Nagar': 'Mahabubnagar',
        'Anantapur': 'Ananthapur', 'Ananthapuramu': 'Ananthapur',
        'Karim Nagar': 'Karimnagar', 'K.V.Rangareddy': 'K.V. Rangareddy'
    },
    'Tamil Nadu': {
        'Kancheepuram': 'Kanchipuram', 'Thiruvallur': 'Tiruvallur',
        'Kanniyakumari': 'Kanyakumari', 'Thiruvarur': 'Tiruvarur',
        'Tirupathur': 'Tirupattur', 'Viluppuram': 'Villupuram'
    },
    'Chhattisgarh': {
        'Gaurella Pendra Marwahi': 'Gaurela-Pendra-Marwahi',
        'Janjgir-Champa': 'Janjgir Champa', 'Janjgir - Champa': 'Janjgir Champa',
        'Mohla-Manpur-Ambagarh Chouki': 'Mohalla-Manpur-Ambagarh Chowki'
    },
    'Jharkhand': {
        'Pakaur': 'Pakur', 'Hazaribag': 'Hazaribagh', 
        'East Singhbum': 'East Singhbhum', 'Palamau': 'Palamu',
        'Sahebganj': 'Sahibganj', 'Kodarma': 'Koderma'
    },
    'Telangana': {
        'K.V. Rangareddy': 'Rangareddy', 'Ranga Reddy': 'Rangareddy',
        'Warangal (Urban)': 'Warangal Urban'
    },
    'Mizoram': {'Mammit': 'Mamit'},
    'Kerala': {'Kasargod': 'Kasaragod'},
    'Dadra And Nagar Haveli And Daman And Diu': {
        'Dadra & Nagar Haveli': 'Dadra And Nagar Haveli'
    },
    'Himachal Pradesh': {
        'Lahul And Spiti': 'Lahaul And Spiti', 'Lahul & Spiti': 'Lahaul And Spiti'
    },
    'Andaman And Nicobar Islands': {'Nicobars': 'Nicobar'}
}

# 3. Surgical execution
for state, mapping in final_comprehensive_map.items():
    for wrong, right in mapping.items():
        # Mask ensures we only fix the district if it's in the correct State
        mask = (df_enrolment['state'] == state) & (df_enrolment['district'] == wrong)
        df_enrolment.loc[mask, 'district'] = right

# 4. Final cleaning of the characters that cause grouping breaks
df_enrolment['district'] = df_enrolment['district'].str.replace(r'\s*[*]\s*', '', regex=True).str.strip()

print("Massive cleanup complete! All manually identified typos are now standardized.")

Massive cleanup complete! All manually identified typos are now standardized.


In [30]:
from difflib import get_close_matches

def find_district_typos(df):
    for state in df['state'].unique():
        districts = df[df['state'] == state]['district'].unique()
        for d in districts:
            # Find names that are very similar but not identical
            matches = get_close_matches(d, districts, n=2, cutoff=0.8)
            if len(matches) > 1:
                print(f"In {state}: Potential duplicates {matches}")

find_district_typos(df_enrolment)

In Meghalaya: Potential duplicates ['East Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['West Khasi Hills', 'East Khasi Hills']
In Meghalaya: Potential duplicates ['West Jaintia Hills', 'East Jaintia Hills']
In Meghalaya: Potential duplicates ['West Garo Hills', 'East Garo Hills']
In Meghalaya: Potential duplicates ['South Garo Hills', 'North Garo Hills']
In Meghalaya: Potential duplicates ['North Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['South West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['East Jaintia Hills', 'West Jaintia Hills']
In Meghalaya: Potential duplicates ['East Garo Hills', 'West Garo Hills']
In Meghalaya: Potential duplicates ['South West Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['Eastern West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['Jaintia Hills', 'West Jaintia Hills']
In Karnataka: Potential duplicates ['Bengaluru Urban', 'Bengaluru 

In [31]:
import pandas as pd

# 1. Ensure we are working on a clean copy
df_enrolment = df_enrolment.copy()

# 2. Final specialized mapping for the last few stubborn cases
final_fixes = {
    'Andhra Pradesh': {
        'Mahbubnagar': 'Mahabubnagar',
        'Mahabub Nagar': 'Mahabubnagar'
    },
    'Dadra And Nagar Haveli And Daman And Diu': {
        'Dadra & Nagar Haveli': 'Dadra And Nagar Haveli',
        'Dadra And Nagar Haveli': 'Dadra And Nagar Haveli' # Ensuring consistency
    },
    'Andaman And Nicobar Islands': {
        'Nicobars': 'Nicobar'
    }
}

# 3. Apply the fixes
for state, mapping in final_fixes.items():
    for wrong, right in mapping.items():
        mask = (df_enrolment['state'] == state) & (df_enrolment['district'] == wrong)
        df_enrolment.loc[mask, 'district'] = right

# 4. Global character cleanup (Handling the hidden '&' and extra spaces)
# This converts any remaining '&' to 'And' to match your standardized state names
df_enrolment['district'] = df_enrolment['district'].str.replace('&', 'And', regex=False)
df_enrolment['district'] = df_enrolment['district'].str.strip()

print("Final cleanup executed. Mahabubnagar, Nicobar, and Dadra variations are now unified!")

Final cleanup executed. Mahabubnagar, Nicobar, and Dadra variations are now unified!


In [32]:
from difflib import get_close_matches

def find_district_typos(df):
    for state in df['state'].unique():
        districts = df[df['state'] == state]['district'].unique()
        for d in districts:
            # Find names that are very similar but not identical
            matches = get_close_matches(d, districts, n=2, cutoff=0.8)
            if len(matches) > 1:
                print(f"In {state}: Potential duplicates {matches}")

find_district_typos(df_enrolment)

In Meghalaya: Potential duplicates ['East Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['West Khasi Hills', 'East Khasi Hills']
In Meghalaya: Potential duplicates ['West Jaintia Hills', 'East Jaintia Hills']
In Meghalaya: Potential duplicates ['West Garo Hills', 'East Garo Hills']
In Meghalaya: Potential duplicates ['South Garo Hills', 'North Garo Hills']
In Meghalaya: Potential duplicates ['North Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['South West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['East Jaintia Hills', 'West Jaintia Hills']
In Meghalaya: Potential duplicates ['East Garo Hills', 'West Garo Hills']
In Meghalaya: Potential duplicates ['South West Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['Eastern West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['Jaintia Hills', 'West Jaintia Hills']
In Karnataka: Potential duplicates ['Bengaluru Urban', 'Bengaluru 

In [33]:
# 1. Force everything to a standard format first
df_enrolment['state'] = df_enrolment['state'].str.strip().str.title()
df_enrolment['district'] = df_enrolment['district'].str.strip().str.title()

# 2. Aggressive fix for Andaman & Nicobar
# We look for the word "Andaman" in the state name to avoid spelling mismatches
mask_an = df_enrolment['state'].str.contains('Andaman', na=False)
df_enrolment.loc[mask_an & (df_enrolment['district'] == 'Nicobars'), 'district'] = 'Nicobar'

# 3. Aggressive fix for Dadra & Nagar Haveli
mask_dadra = df_enrolment['state'].str.contains('Dadra', na=False)
df_enrolment.loc[mask_dadra & (df_enrolment['district'].str.contains('&')), 'district'] = 'Dadra And Nagar Haveli'

# 4. Final verification
print("Checking Andaman Districts:")
print(df_enrolment[df_enrolment['state'].str.contains('Andaman')]['district'].unique())

Checking Andaman Districts:
['Nicobar' 'South Andaman' 'Andamans' 'North And Middle Andaman']


In [34]:
from difflib import get_close_matches

def find_district_typos(df):
    for state in df['state'].unique():
        districts = df[df['state'] == state]['district'].unique()
        for d in districts:
            # Find names that are very similar but not identical
            matches = get_close_matches(d, districts, n=2, cutoff=0.8)
            if len(matches) > 1:
                print(f"In {state}: Potential duplicates {matches}")

find_district_typos(df_enrolment)

In Meghalaya: Potential duplicates ['East Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['West Khasi Hills', 'East Khasi Hills']
In Meghalaya: Potential duplicates ['West Jaintia Hills', 'East Jaintia Hills']
In Meghalaya: Potential duplicates ['West Garo Hills', 'East Garo Hills']
In Meghalaya: Potential duplicates ['South Garo Hills', 'North Garo Hills']
In Meghalaya: Potential duplicates ['North Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['South West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['East Jaintia Hills', 'West Jaintia Hills']
In Meghalaya: Potential duplicates ['East Garo Hills', 'West Garo Hills']
In Meghalaya: Potential duplicates ['South West Garo Hills', 'South Garo Hills']
In Meghalaya: Potential duplicates ['Eastern West Khasi Hills', 'West Khasi Hills']
In Meghalaya: Potential duplicates ['Jaintia Hills', 'West Jaintia Hills']
In Karnataka: Potential duplicates ['Bengaluru Urban', 'Bengaluru 

In [35]:
# 1. Group by District and get a list of unique PIN codes for each
district_check = df_enrolment.groupby('district')['pincode'].unique()

# 2. To find typos, let's look at the "Top" and "Bottom" alphabetically
# Often typos appear right next to the correct spelling (e.g., 'Gurgaon' vs 'Gurugram')

district_check.sort_index().tail(20)

district
West Godavari         [534002, 534004, 534165, 534186, 534195, 53419...
West Jaintia Hills                     [793150, 793161, 793109, 793151]
West Kameng           [790101, 790001, 790116, 790002, 790102, 79011...
West Karbi Anglong    [782486, 782485, 782450, 782410, 782448, 78244...
West Khasi Hills               [793119, 793120, 781129, 793114, 793106]
West Medinipur                                                 [721201]
West Midnapore        [721145, 721451, 721166, 721424, 721102, 72112...
West Nimar            [451113, 451332, 451442, 450661, 451001, 45111...
West Siang                             [791125, 791001, 791101, 792051]
West Sikkim                                    [737121, 737111, 737113]
West Singhbhum        [833201, 833215, 833212, 833214, 833102, 83310...
West Tripura          [799045, 799035, 799002, 799008, 799012, 79911...
Wokha                                  [797111, 797100, 797112, 798601]
Y. S. R               [516002, 516003, 516172, 516233, 

In [36]:
district_check.sort_index().head(20)

district
Adilabad                 [504293, 504294, 504302, 504308, 504208, 50429...
Agar Malwa                [465447, 465445, 465230, 465441, 465449, 465550]
Agra                     [282005, 282001, 282010, 283125, 283111, 28320...
Ahilyanagar              [422620, 422608, 413738, 414001, 414201, 42261...
Ahmedabad                [382150, 380001, 380005, 382340, 380018, 38002...
Ahmednagar               [413201, 414105, 413705, 413701, 413703, 41400...
Aizawl                   [796001, 796012, 796111, 796009, 796008, 79601...
Ajmer                    [305001, 305901, 305003, 305021, 305207, 30540...
Akola                    [444101, 444102, 444111, 444006, 444311, 44400...
Alappuzha                [686534, 688008, 688504, 688522, 689573, 69050...
Aligarh                  [202133, 202001, 202126, 202132, 202122, 20213...
Alipurduar               [735204, 736182, 736206, 735217, 736202, 73521...
Alirajpur                [457990, 457882, 457887, 457893, 457993, 45788...
Allahabad       

In [37]:
# Create a 'Total' column first for easier viewing
df_enrolment['Total_Enrolment'] = df_enrolment['age_0_5'] + df_enrolment['age_5_17'] + df_enrolment['age_18_greater']

# Group by state and sum the total
state_eda = df_enrolment.groupby('state')['Total_Enrolment'].sum().sort_values(ascending=False).reset_index()

# Display the whole list to check for any remaining 'ghost' states
pd.set_option('display.max_rows', None) 
print(state_eda)

                                       state  Total_Enrolment
0                              Uttar Pradesh          1018629
1                                      Bihar           609585
2                             Madhya Pradesh           493970
3                                West Bengal           375340
4                                Maharashtra           369139
5                                  Rajasthan           348458
6                                    Gujarat           280549
7                                      Assam           230197
8                                  Karnataka           223235
9                                 Tamil Nadu           220789
10                                 Jharkhand           157539
11                                 Telangana           131574
12                            Andhra Pradesh           127686
13                                    Odisha           122987
14                                 Meghalaya           109771
15      

In [38]:
# Group by district and sum the age counts
district_summary = df_enrolment.groupby('district')[['age_0_5', 'age_5_17', 'age_18_greater']].sum()

# Display top 20 districts by total enrollment (calculating total on the fly for sorting)
print(district_summary.assign(Total=district_summary.sum(axis=1)).sort_values('Total', ascending=False).head(20))

                   age_0_5  age_5_17  age_18_greater  Total
district                                                   
Thane                29092     13629             967  43688
Sitamarhi            20679     18856            2697  42232
Bahraich             14674     22360            2304  39338
South 24 Parganas    28384      9427             330  38141
North 24 Parganas    25920      8598            1862  36380
Murshidabad          31442      4383              86  35911
Pune                 24088      6536            1139  31763
Jaipur               21436      8976             734  31146
Bengaluru            20441      6732            3807  30980
Banaskantha          15941     12500            2417  30858
Sitapur              16237     13869             748  30854
Hyderabad            23552      6608             670  30830
West Champaran       11585     18070             783  30438
Agra                 16314     12691             905  29910
East Champaran       10147     18371    

In [39]:
# Drop the redundant total_enrol column
df_enrolment.drop(columns=['total_enrol'], inplace=True)

print("Column 'total_enrol' has been removed.")

Column 'total_enrol' has been removed.


In [40]:
df_enrolment.head(20)

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,Total_Enrolment
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,109
1,2025-03-09,Karnataka,Bengaluru Urban,560043,14,33,39,86
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,29,82,12,123
3,2025-03-09,Uttar Pradesh,Aligarh,202133,62,29,15,106
4,2025-03-09,Karnataka,Bengaluru Urban,560016,14,16,21,51
5,2025-03-09,Bihar,Sitamarhi,843331,20,49,12,81
6,2025-03-09,Bihar,Sitamarhi,843330,23,24,42,89
7,2025-03-09,Uttar Pradesh,Bahraich,271865,26,60,14,100
8,2025-03-09,Uttar Pradesh,Firozabad,283204,28,26,10,64
9,2025-03-09,Bihar,Purbi Champaran,845418,30,48,10,88


In [41]:
df_enrolment.to_csv('Cleaned Datasets/cleaned_enrollment.csv', index=False)

In [42]:
# Save Demographic data
df_demographic.to_csv('Raw Datasets/raw_demographic.csv', index=False)

# Save Biometric data
df_biometric.to_csv('Raw Datasets/raw_biometric.csv', index=False)

print("Both files saved: 'raw_demographic.csv' and 'raw_biometric.csv'")

Both files saved: 'raw_demographic.csv' and 'raw_biometric.csv'


In [43]:
# Calculate total for display only and group by state/district
enrolment_summary = df_enrolment.groupby(['state', 'district'])[['age_0_5', 'age_5_17', 'age_18_greater']].sum().sum(axis=1)

print("--- ENROLLMENT DISTRICT AUDIT ---")
print(enrolment_summary.sort_values(ascending=False).to_string())

--- ENROLLMENT DISTRICT AUDIT ---
state                                     district                       
Maharashtra                               Thane                              43688
Bihar                                     Sitamarhi                          42232
Uttar Pradesh                             Bahraich                           39338
West Bengal                               South 24 Parganas                  38141
                                          North 24 Parganas                  36380
                                          Murshidabad                        35911
Maharashtra                               Pune                               31763
Rajasthan                                 Jaipur                             31146
Karnataka                                 Bengaluru                          30980
Gujarat                                   Banaskantha                        30858
Uttar Pradesh                             Sitapur             

In [44]:
# 1. Surgical mapping for the enrollment survivors
enrollment_final_map = {
    'Bihar': {
        'Kaimur (Bhabua)': 'Kaimur'
    },
    'Karnataka': {
        'Bijapur(Kar)': 'Vijayapura'
    },
    'Andhra Pradesh': {
        'K.V. Rangareddy': 'Rangareddy',
        'Rangareddi': 'Rangareddy'
    }
}

# 2. Apply to df_enrollment
for state, mapping in enrollment_final_map.items():
    for wrong, right in mapping.items():
        mask = (df_enrolment['state'] == state) & (df_enrolment['district'] == wrong)
        df_enrolment.loc[mask, 'district'] = right

# 3. Nuke double spaces and standardize
df_enrolment['district'] = df_enrolment['district'].str.replace(r'\s+', ' ', regex=True).str.strip()

# 4. Save the enrollment file (overwriting the previous version)
df_enrolment.to_csv('Cleaned Datasets/cleaned_enrollment.csv', index=False)

print("Final cleanup applied to Enrollment data.")
print("File 'cleaned_enrollment.csv' has been updated and saved.")

Final cleanup applied to Enrollment data.
File 'cleaned_enrollment.csv' has been updated and saved.


In [45]:
# Calculate total for display only and group by state/district
enrolment_summary = df_enrolment.groupby(['state', 'district'])[['age_0_5', 'age_5_17', 'age_18_greater']].sum().sum(axis=1)

print("--- ENROLLMENT DISTRICT AUDIT ---")
print(enrolment_summary.sort_values(ascending=False).to_string())

--- ENROLLMENT DISTRICT AUDIT ---
state                                     district                       
Maharashtra                               Thane                              43688
Bihar                                     Sitamarhi                          42232
Uttar Pradesh                             Bahraich                           39338
West Bengal                               South 24 Parganas                  38141
                                          North 24 Parganas                  36380
                                          Murshidabad                        35911
Maharashtra                               Pune                               31763
Rajasthan                                 Jaipur                             31146
Karnataka                                 Bengaluru                          30980
Gujarat                                   Banaskantha                        30858
Uttar Pradesh                             Sitapur             

In [46]:
# Check if the old names still exist
check_list = ['Kaimur (Bhabua)', 'Bijapur(Kar)', 'K.V. Rangareddy', 'Rangareddi']
remaining = df_enrolment[df_enrolment['district'].isin(check_list)]

if remaining.empty:
    print("Verification: All target typos are gone from Enrollment.")
else:
    print("Still seeing these:")
    print(remaining[['state', 'district']].value_counts())

Verification: All target typos are gone from Enrollment.


In [47]:
# Surgical cleanup for these specific leftovers in df_enrolment
enrolment_mopping = {
    'Telangana': {
        'Yadadri.': 'Yadadri Bhuvanagiri' # Standard name for Yadadri
    },
    'Punjab': {
        'Sas Nagar (Mohali)': 'Sas Nagar' # Cleaning up the parenthesis
    },
    'Maharashtra': {
        'Raigarh(Mh)': 'Raigarh'
    },
    'Jammu And Kashmir': {
        'Leh (Ladakh)': 'Leh'
    }
}

for state, mapping in enrolment_mopping.items():
    for wrong, right in mapping.items():
        mask = (df_enrolment['state'] == state) & (df_enrolment['district'] == wrong)
        df_enrolment.loc[mask, 'district'] = right

# Global fix for the dot in 'Yadadri.' if it's still there
df_enrolment['district'] = df_enrolment['district'].str.replace(r'\.', '', regex=True).str.strip()

print("Targeted fixes applied to Enrollment data.")

Targeted fixes applied to Enrollment data.


In [48]:
# Audit the specific states you just fixed
audit_states = ['Telangana', 'Punjab', 'Maharashtra', 'Jammu And Kashmir']
enrolment_summary = df_enrolment[df_enrolment['state'].isin(audit_states)].groupby(['state', 'district'])[['age_0_5', 'age_5_17', 'age_18_greater']].sum().sum(axis=1)

print(enrolment_summary.sort_values(ascending=False).to_string())

state              district                  
Maharashtra        Thane                         43688
                   Pune                          31763
Telangana          Hyderabad                     25990
Maharashtra        Nashik                        22368
                   Mumbai Suburban               18635
Telangana          Rangareddy                    17993
Punjab             Ludhiana                      17614
Maharashtra        Aurangabad                    17393
                   Mumbai                        14552
                   Jalgaon                       13260
                   Dhule                         12605
                   Ahmednagar                    12544
                   Solapur                       12292
                   Nanded                        11947
                   Nagpur                        11828
                   Palghar                       10699
                   Nandurbar                     10656
                   

In [49]:
df_enrolment.to_csv('Cleaned Datasets/cleaned_enrollment.csv', index=False)
print("File 'cleaned_enrollment.csv' saved successfully.")

File 'cleaned_enrollment.csv' saved successfully.


In [50]:
# Create total column for Enrollment
df_enrolment['total_enrolment'] = df_enrolment['age_0_5'] + df_enrolment['age_5_17'] + df_enrolment['age_18_greater']

# Group and print
enrolment_summary = df_enrolment.groupby(['state', 'district'])['total_enrolment'].sum()

print("--- FULL ENROLLMENT LIST BY DISTRICT ---")
print("-" * 60)
print(enrolment_summary.to_string())
print("-" * 60)
print(f"Total Unique Districts in Enrollment: {len(enrolment_summary)}")

--- FULL ENROLLMENT LIST BY DISTRICT ---
------------------------------------------------------------
state                                     district                       
Andaman And Nicobar Islands               Andamans                              75
                                          Nicobar                               76
                                          North And Middle Andaman             132
                                          South Andaman                        228
Andhra Pradesh                            Adilabad                            1419
                                          Alluri Sitharama Raju               1255
                                          Anakapalli                           543
                                          Ananthapur                          9596
                                          Annamayya                           1021
                                          Bapatla                            

In [51]:
# Final consolidation for the Enrollment survivors
enrolment_final_mopping = {
    'Andhra Pradesh': {
        'Ananthapur': 'Anantapur',
        'Cuddapah': 'Y S R',
        'Spsr Nellore': 'Nellore',
        'Sri Potti Sriramulu Nellore': 'Nellore'
    },
    'Bihar': {
        'Bhabua': 'Kaimur',
        'Monghyr': 'Munger',
        'Pashchim Champaran': 'West Champaran',
        'Purbi Champaran': 'East Champaran'
    },
    'Haryana': {
        'Gurugram': 'Gurgaon',
        'Nuh': 'Mewat'
    },
    'Karnataka': {
        'Bangalore': 'Bengaluru Urban',
        'Belgaum': 'Belagavi',
        'Bellary': 'Ballari',
        'Bengaluru': 'Bengaluru Urban',
        'Bengaluru South': 'Bengaluru Urban',
        'Bijapur': 'Vijayapura',
        'Gulbarga': 'Kalaburagi',
        'Mysore': 'Mysuru'
    },
    'Punjab': {
        'Firozpur': 'Ferozepur',
        'SAS Nagar': 'Sas Nagar (Mohali)',
        'Sas Nagar': 'Sas Nagar (Mohali)',
        'Sri Muktsar Sahib': 'Muktsar'
    },
    'Uttar Pradesh': {
        'Allahabad': 'Prayagraj',
        'Faizabad': 'Ayodhya',
        'Bara Banki': 'Barabanki',
        'Rae Bareli': 'Raebareli',
        'Sant Ravidas Nagar': 'Bhadohi',
        'Siddharth Nagar': 'Siddharthnagar',
        'Shravasti': 'Shrawasti',
        'Jyotiba Phule Nagar': 'Amroha'
    },
    'West Bengal': {
        'Burdwan': 'Bardhaman',
        'Haora': 'Howrah',
        'Hugli': 'Hooghly',
        'Koch Bihar': 'Cooch Behar',
        'Medinipur': 'Paschim Medinipur',
        'Medinipur West': 'Paschim Medinipur',
        'West Medinipur': 'Paschim Medinipur',
        'West Midnapore': 'Paschim Medinipur',
        'Dinajpur Dakshin': 'Dakshin Dinajpur',
        'South Dinajpur': 'Dakshin Dinajpur',
        'Dinajpur Uttar': 'Uttar Dinajpur',
        'North Dinajpur': 'Uttar Dinajpur'
    }
}

# Apply mapping
for state, mapping in enrolment_final_mopping.items():
    for wrong, right in mapping.items():
        df_enrolment.loc[(df_enrolment['state'] == state) & (df_enrolment['district'] == wrong), 'district'] = right

# Special Fix for Sikkim Districts to match Biometric
sikkim_map = {'East': 'East Sikkim', 'West': 'West Sikkim', 'North': 'North Sikkim', 'South': 'South Sikkim'}
for wrong, right in sikkim_map.items():
    df_enrolment.loc[(df_enrolment['state'] == 'Sikkim') & (df_enrolment['district'] == wrong), 'district'] = right

# Clean up whitespace
df_enrolment['district'] = df_enrolment['district'].str.replace(r'\s+', ' ', regex=True).str.strip()

print("Enrollment survivors unified.")

Enrollment survivors unified.


In [52]:
# Save the final cleaned enrollment data
df_enrolment.to_csv('Cleaned Datasets/cleaned_enrollment.csv', index=False)
