# Aadhar Enrollment Analysis 

### Importing Libraries 

In [1]:
import pandas as pd 
import warnings


## Loading Dataset

In [2]:
file_1 = pd.read_csv('api_data_aadhar_enrolment_0_500000.csv')
file_2 = pd.read_csv('api_data_aadhar_enrolment_500000_1000000.csv')
file_3 = pd.read_csv('api_data_aadhar_enrolment_1000000_1006029.csv')

In [3]:

print(file_1.columns.equals(file_2.columns))
print(file_2.columns.equals(file_3.columns))


True
True


In [4]:
# identifying mis-matches
set(file_1.columns) ^ set(file_2.columns)


set()

In [5]:
# concatenate the csv's
aadhar_raw = pd.concat(
    [file_1, file_2, file_3],
    axis=0,
    ignore_index=True
)


In [6]:
aadhar_raw.shape

(1006029, 7)

### Note:
The Aadhaar enrolment dataset was provided in multiple file partitions due to size constraints. All files were loaded using a consistent schema and concatenated vertically to reconstruct the complete dataset for analysis.

## Data Validation & Cleaning

In [9]:
aadhar_raw['state'].unique()

array(['Meghalaya', 'Karnataka', 'Uttar Pradesh', 'Bihar', 'Maharashtra',
       'Haryana', 'Rajasthan', 'Punjab', 'Delhi', 'Madhya Pradesh',
       'West Bengal', 'Assam', 'Uttarakhand', 'Gujarat', 'Andhra Pradesh',
       'Tamil Nadu', 'Chhattisgarh', 'Jharkhand', 'Nagaland', 'Manipur',
       'Telangana', 'Tripura', 'Mizoram', 'Jammu and Kashmir',
       'Chandigarh', 'Sikkim', 'Odisha', 'Kerala',
       'The Dadra And Nagar Haveli And Daman And Diu',
       'Arunachal Pradesh', 'Himachal Pradesh', 'Goa',
       'Jammu And Kashmir', 'Dadra and Nagar Haveli and Daman and Diu',
       'Ladakh', 'Andaman and Nicobar Islands', 'Orissa', 'Pondicherry',
       'Puducherry', 'Lakshadweep', 'Andaman & Nicobar Islands',
       'Dadra & Nagar Haveli', 'Dadra and Nagar Haveli', 'Daman and Diu',
       'WEST BENGAL', 'Jammu & Kashmir', 'West  Bengal', '100000',
       'Daman & Diu', 'West Bangal', 'Westbengal', 'West bengal',
       'andhra pradesh', 'ODISHA', 'WESTBENGAL'], dtype=object)

In [11]:
aadhar_raw.loc[aadhar_raw['state']== '100000']

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
23108,02-09-2025,100000,100000,100000,0,0,3
46946,03-09-2025,100000,100000,100000,0,0,1
97816,08-09-2025,100000,100000,100000,0,0,1
115798,09-09-2025,100000,100000,100000,0,0,1
153156,11-09-2025,100000,100000,100000,0,0,2
160195,12-09-2025,100000,100000,100000,0,0,2
261778,19-09-2025,100000,100000,100000,0,0,1
272731,20-09-2025,100000,100000,100000,0,0,1
470934,24-10-2025,100000,100000,100000,0,1,0
762744,15-11-2025,100000,100000,100000,0,0,3


In [13]:
aadhar_raw = aadhar_raw[aadhar_raw['state'] != '100000']


In [16]:
aadhar_raw.loc[aadhar_raw['state']== '100000']

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater


In [20]:
df_copy = aadhar_raw.copy()

In [21]:
df_copy['state']= (
    df_copy['state']
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace('&', 'and', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
)

In [25]:
df_copy['state'].unique()

array(['meghalaya', 'karnataka', 'uttar pradesh', 'bihar', 'maharashtra',
       'haryana', 'rajasthan', 'punjab', 'delhi', 'madhya pradesh',
       'west bengal', 'assam', 'uttarakhand', 'gujarat', 'andhra pradesh',
       'tamil nadu', 'chhattisgarh', 'jharkhand', 'nagaland', 'manipur',
       'telangana', 'tripura', 'mizoram', 'jammu and kashmir',
       'chandigarh', 'sikkim', 'odisha', 'kerala',
       'the dadra and nagar haveli and daman and diu',
       'arunachal pradesh', 'himachal pradesh', 'goa',
       'dadra and nagar haveli and daman and diu', 'ladakh',
       'andaman and nicobar islands', 'orissa', 'pondicherry',
       'puducherry', 'lakshadweep', 'dadra and nagar haveli',
       'daman and diu', 'west bangal', 'westbengal'], dtype=object)

In [23]:
state_mapping = {
    
    'west bengal': 'West Bengal',
    'west bangal': 'West Bengal',
    'westbengal': 'West Bengal',

    
    'orissa': 'Odisha',
    'odisha': 'Odisha',

    
    'pondicherry': 'Puducherry',
    'puducherry': 'Puducherry',

    
    'jammu and kashmir': 'Jammu and Kashmir',
    'jammu and kashmir ': 'Jammu and Kashmir',
    'jammu and kashmir': 'Jammu and Kashmir',

    
    'dadra and nagar haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'daman and diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'dadra and nagar haveli and daman and diu':
        'Dadra and Nagar Haveli and Daman and Diu',

    
    'andaman and nicobar islands': 'Andaman and Nicobar Islands',

    
    'andhra pradesh': 'Andhra Pradesh'
}


In [29]:
df_copy['state'] = df_copy['state'].map(state_mapping).fillna(df_copy['state'].str.title())
sorted(df_copy['state'].unique())

['Andaman And Nicobar Islands',
 'Andhra Pradesh',
 'Arunachal Pradesh',
 'Assam',
 'Bihar',
 'Chandigarh',
 'Chhattisgarh',
 'Dadra And Nagar Haveli And Daman And Diu',
 'Delhi',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jammu And Kashmir',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Ladakh',
 'Lakshadweep',
 'Madhya Pradesh',
 'Maharashtra',
 'Manipur',
 'Meghalaya',
 'Mizoram',
 'Nagaland',
 'Odisha',
 'Puducherry',
 'Punjab',
 'Rajasthan',
 'Sikkim',
 'Tamil Nadu',
 'Telangana',
 'The Dadra And Nagar Haveli And Daman And Diu',
 'Tripura',
 'Uttar Pradesh',
 'Uttarakhand',
 'West Bengal']

In [33]:
df= df_copy

In [34]:
df['district'].unique()

array(['East Khasi Hills', 'Bengaluru Urban', 'Kanpur Nagar', 'Aligarh',
       'Sitamarhi', 'Bahraich', 'Firozabad', 'Purbi Champaran',
       'Maharajganj', 'Aurangabad', 'Ghaziabad', 'Faridabad', 'Madhubani',
       'Sikar', 'Bhagalpur', 'Amritsar', 'Gurugram',
       'Gautam Buddha Nagar', 'West Delhi', 'Bhind', 'Gwalior', 'Katni',
       'Coochbehar', 'Lucknow', 'Dinajpur Uttar', 'Marigaon', 'Kokrajhar',
       'Agra', 'Haridwar', 'Nagaon', 'Parbhani', 'West Khasi Hills',
       'North West Delhi', 'West Jaintia Hills', 'Unnao', 'Saharanpur',
       'Dibrugarh', 'Udalguri', 'Chirang', 'Kamrup', 'Dhubri', 'Jaunpur',
       'Thane', 'Dhemaji', 'Banas Kantha', 'Tinsukia', 'Spsr Nellore',
       'Barpeta', 'Sonitpur', 'Baksa', 'Ludhiana', 'Patan', 'Kanchipuram',
       'Patna', 'Lakhimpur', 'Bongaigaon', 'Vadodara', 'Nainital',
       'Dehradun', 'Hojai', 'Dohad', 'Morbi', 'Raipur', 'Gorakhpur',
       'Bulandshahr', 'Mathura', 'Pashchim Champaran', 'Bijapur',
       'Deoghar', 'Muzaf

In [35]:
df['district'] = (
    df['district']
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace('&', 'and', regex=False)
    .str.replace(r'\*', '', regex=True)
    .str.replace(r'\(.*?\)', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)


In [36]:
district_mapping = {
    
    'coochbehar': 'Cooch Behar',
    'burdwan': 'Burdwan',
    'south 24 parganas': 'South 24 Parganas',
    'south 24 pargana': 'South 24 Parganas',
    'north 24 parganas': 'North 24 Parganas',
    'dinajpur uttar': 'Uttar Dinajpur',
    'dinajpur dakshin': 'Dakshin Dinajpur',

    
    'purbi champaran': 'East Champaran',
    'pashchim champaran': 'West Champaran',

    
    'banas kantha': 'Banaskantha',
    'dohad': 'Dahod',

    
    'anugal': 'Angul',
    'angul': 'Angul',

    
    'punch': 'Poonch',

    
    'bijapur': 'Vijayapura'
}


In [37]:
df['district'] = (
    df['district']
    .map(district_mapping)
    .fillna(df['district'].str.title())
)


In [38]:
df['district'].nunique()


926

In [39]:
df.groupby('state')['district'].nunique().sort_values(ascending=False).head()


state
Uttar Pradesh     88
Madhya Pradesh    60
Karnataka         48
Maharashtra       47
Andhra Pradesh    47
Name: district, dtype: int64

In [40]:
pd.Series(df['district'].unique()).sort_values().to_csv(
    "unique_districts_cleaned.csv", index=False
)


In [43]:
temp_df = pd.read_csv('unique_districts_cleaned.csv')
temp_df

Unnamed: 0,0
0,24 Paraganas North
1,24 Paraganas South
2,Adilabad
3,Agar Malwa
4,Agra
...,...
921,Yamuna Nagar
922,Yamunanagar
923,Yanam
924,Yavatmal


In [7]:
aadhar_raw.duplicated().sum()

np.int64(22957)

In [8]:
total_rows = len(aadhar_raw)
dup_rows = aadhar_raw.duplicated().sum()

dup_rows, total_rows, dup_rows / total_rows


(np.int64(22957), 1006029, np.float64(0.022819421706531322))

In [9]:
dups = aadhar_raw[aadhar_raw.duplicated(keep=False)]
dups.head(10)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
359389,13-10-2025,Punjab,Jalandhar,144041,2,1,0
359390,13-10-2025,Punjab,Jalandhar,144101,1,0,0
359391,13-10-2025,Punjab,Jalandhar,144102,2,0,0
359392,13-10-2025,Punjab,Jalandhar,144418,1,0,0
359393,13-10-2025,Punjab,Jalandhar,144419,1,0,0
359394,13-10-2025,Punjab,Jalandhar,144702,1,1,0
359395,13-10-2025,Punjab,Jalandhar,144801,0,1,0
359396,13-10-2025,Punjab,Kapurthala,144401,5,1,1
359397,13-10-2025,Punjab,Kapurthala,144601,4,2,2
359398,13-10-2025,Punjab,Kapurthala,144804,2,0,0


In [10]:
dups.groupby(dups.columns.tolist()).size().value_counts().head()


2    22957
Name: count, dtype: int64

In [11]:
# dropping duplicates
aadhar_clean = aadhar_raw.drop_duplicates()
aadhar_clean.shape

(983072, 7)

In [12]:
aadhar_clean.duplicated().sum()

np.int64(0)

### Note:
Post loading, the dataset was validated for schema consistency, data types, and exact duplicate records. Identified duplicates arising from file-level overlaps were removed to prevent artificial inflation of enrolment counts, ensuring analytical accuracy

## Feature Engineering 

In [13]:
df = aadhar_clean
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [15]:
df.dtypes

date              object
state             object
district          object
pincode            int64
age_0_5            int64
age_5_17           int64
age_18_greater     int64
dtype: object

In [18]:
df = df.copy()
df['date'] = pd.to_datetime(df['date'], dayfirst=True)


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 983072 entries, 0 to 1006028
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            983072 non-null  datetime64[ns]
 1   state           983072 non-null  object        
 2   district        983072 non-null  object        
 3   pincode         983072 non-null  int64         
 4   age_0_5         983072 non-null  int64         
 5   age_5_17        983072 non-null  int64         
 6   age_18_greater  983072 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 60.0+ MB


In [20]:
df['month'] = df['date'].dt.to_period('M')


In [22]:
df['total_enrollments'] = (
    df['age_0_5']
    + df['age_5_17']
    + df['age_18_greater']
)

In [25]:
df['share % 0-5'] = (
    df['age_0_5'] / df['total_enrollments']
)*100

In [27]:
df['share % 5-17'] = (
    df['age_5_17'] / df['total_enrollments']
)*100

In [29]:
df['share % 18+'] = (
    df['age_18_greater'] / df['total_enrollments']
)*100

In [37]:
df['highest_share'] = df[
    ['share % 0-5', 'share % 5-17', 'share % 18+']
].idxmax(axis=1)

In [38]:
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,month,total_enrollments,share % 0-5,share % 5-17,share % 18+,highest_share
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,2025-03,109,10.091743,55.963303,33.944954,share % 5-17
1,2025-03-09,Karnataka,Bengaluru Urban,560043,14,33,39,2025-03,86,16.27907,38.372093,45.348837,share % 18+
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,29,82,12,2025-03,123,23.577236,66.666667,9.756098,share % 5-17
3,2025-03-09,Uttar Pradesh,Aligarh,202133,62,29,15,2025-03,106,58.490566,27.358491,14.150943,share % 0-5
4,2025-03-09,Karnataka,Bengaluru Urban,560016,14,16,21,2025-03,51,27.45098,31.372549,41.176471,share % 18+


### Note:
   Feature engineering was performed to transform raw enrolment records into analytically meaningful variables by standardising temporal attributes, deriving demographic composition metrics, and identifying dominant age-group patterns at a granular geographic level.
   
   This phase established a structured, time-aware, and demographic-aware dataset, enabling robust univariate, bivariate, and trivariate analyses without introducing aggregation bias or interpretational ambiguity.



## Data Aggregation & Preparation for Analysis

#### I. District–Month Level Aggregation

In [39]:
district_month = df[
    ['state', 'district', 'month']
]

In [40]:
district_month = (
        df
    .groupby(['state', 'district', 'month'], as_index=False)
    .agg({
        'total_enrollments': 'sum',
        'age_0_5': 'sum',
        'age_5_17': 'sum',
        'age_18_greater': 'sum'
    })
)

In [44]:
district_month

Unnamed: 0,state,district,month,total_enrollments,age_0_5,age_5_17,age_18_greater
0,100000,100000,2025-09,12,0,0,12
1,100000,100000,2025-10,1,0,1,0
2,100000,100000,2025-11,11,0,0,11
3,100000,100000,2025-12,190,0,0,190
4,Andaman & Nicobar Islands,Andamans,2025-09,27,23,4,0
...,...,...,...,...,...,...,...
5057,Westbengal,Hooghly,2025-12,1,1,0,0
5058,andhra pradesh,chittoor,2025-09,1,1,0,0
5059,andhra pradesh,chittoor,2025-10,2,2,0,0
5060,andhra pradesh,chittoor,2025-12,1,1,0,0


### II. State-Month Level Aggregation

In [45]:
state_month = (
        df
    .groupby(['state','month'], as_index=False)
    .agg({
        'total_enrollments': 'sum',
        'age_0_5': 'sum',
        'age_5_17': 'sum',
        'age_18_greater': 'sum'
    })
)

In [46]:
state_month

Unnamed: 0,state,month,total_enrollments,age_0_5,age_5_17,age_18_greater
0,100000,2025-09,12,0,0,12
1,100000,2025-10,1,0,1,0
2,100000,2025-11,11,0,0,11
3,100000,2025-12,190,0,0,190
4,Andaman & Nicobar Islands,2025-09,43,39,4,0
...,...,...,...,...,...,...
318,Westbengal,2025-11,1,0,1,0
319,Westbengal,2025-12,1,1,0,0
320,andhra pradesh,2025-09,2,2,0,0
321,andhra pradesh,2025-10,2,2,0,0


### Note:
The feature-engineered dataset was aggregated into structured analytical tables to support univariate, bivariate, and trivariate analysis. A district–month level table was created as the primary dataset to capture granular spatial and temporal patterns, while a state–month level table was derived as a secondary roll-up for contextual comparison across states.