In [1]:
# ----------------------------
# 1. Import Libraries
# ----------------------------
import pandas as pd
import numpy as np


In [2]:
# ----------------------------
# 2. Load Data
# ----------------------------
# Both raw CSVs are read from absolute paths on one specific Windows machine;
# edit the two path variables below when running the notebook anywhere else.
# NOTE: the two file names spell "Aadhaar" / "Aadhar" differently. They are
# kept exactly as written here, so do not "correct" one without renaming the
# file on disk as well.
demo_csv_path = r"C:\Users\Tanaya\Downloads\aadhaar-enrollment-main\aadhaar-enrollment-main\aadhaar-enrollment\data\raw\Aadhaar Demographic Monthly Update Data.csv"
enrol_csv_path = r"C:\Users\Tanaya\Downloads\aadhaar-enrollment-main\aadhaar-enrollment-main\aadhaar-enrollment\data\raw\Aadhar Monthly Enrollment data.csv"

demo = pd.read_csv(demo_csv_path)
enrol = pd.read_csv(enrol_csv_path)

# Print (rows, columns) of each frame as a quick sanity check on the load.
for caption, frame_shape in (
    ("Enrollment dataset shape:", enrol.shape),
    ("Demographic update dataset shape:", demo.shape),
):
    print(caption, frame_shape)


Enrollment dataset shape: (8037, 7)
Demographic update dataset shape: (13891, 6)


In [3]:
# ----------------------------
# 3. Inspect Missing Values
# ----------------------------
# For each frame, print a heading followed by the per-column count of
# missing (null) entries.
missing_value_reports = (
    ("\nMissing values in Enrollment dataset:", enrol),
    ("\nMissing values in Demographic dataset:", demo),
)
for heading, frame in missing_value_reports:
    print(heading)
    print(frame.isna().sum())



Missing values in Enrollment dataset:
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

Missing values in Demographic dataset:
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17      0
dtype: int64


In [4]:
# ----------------------------
# 4. Handle Missing Values
# ----------------------------

# Strategy: numeric -> 0, categorical -> 'Unknown'


def split_columns_by_kind(df):
    """Return the numeric and the object-typed column labels of ``df``.

    Numeric columns are selected with ``include='number'`` rather than a
    hard-coded ``['int64', 'float64']`` list, so columns stored as int32,
    float32, unsigned integers, etc. are not silently skipped by the fill.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple of (pandas.Index, pandas.Index)
        ``(numeric_columns, categorical_columns)``; the second element holds
        the columns whose dtype is ``object``.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    return numeric_columns, categorical_columns


# NOTE: this runs before the date conversion in section 6, so a date column
# that was read as text is treated as categorical here and a missing date
# becomes the string 'Unknown'.

# Enrollment dataset
numeric_cols_enrol, categorical_cols_enrol = split_columns_by_kind(enrol)
enrol[numeric_cols_enrol] = enrol[numeric_cols_enrol].fillna(0)
enrol[categorical_cols_enrol] = enrol[categorical_cols_enrol].fillna('Unknown')

# Demographic dataset
numeric_cols_demo, categorical_cols_demo = split_columns_by_kind(demo)
demo[numeric_cols_demo] = demo[numeric_cols_demo].fillna(0)
demo[categorical_cols_demo] = demo[categorical_cols_demo].fillna('Unknown')


In [5]:
# ----------------------------
# 5. Remove Duplicates
# ----------------------------
# Drop every row that is an exact copy (all columns equal) of an earlier row.
enrol, demo = enrol.drop_duplicates(), demo.drop_duplicates()

# Show the resulting (rows, columns) so the number of removed rows can be
# read off against the shapes printed right after loading.
print("\nShapes after cleaning duplicates:")
for caption, frame in (
    ("Enrollment dataset:", enrol),
    ("Demographic dataset:", demo),
):
    print(caption, frame.shape)



Shapes after cleaning duplicates:
Enrollment dataset: (8037, 7)
Demographic dataset: (13601, 6)


In [6]:
# ----------------------------
# 6. Convert Data Types
# ----------------------------


def convert_date_columns(df, date_cols, label):
    """Convert ``date_cols`` of ``df`` to datetime, reporting parse failures.

    ``errors='coerce'`` replaces every value that cannot be parsed with NaT
    without raising. To keep that from going unnoticed, the number of values
    that were present before the conversion but are NaT afterwards is
    printed for each affected column. This includes any 'Unknown'
    placeholder written by the missing-value step in section 4.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the listed columns are overwritten in place.
    date_cols : list
        Labels of the columns to convert.
    label : str
        Human-readable dataset name used in the warning message.

    Returns
    -------
    None
    """
    for col in date_cols:
        missing_before = int(df[col].isna().sum())
        # NOTE(review): no explicit format/dayfirst is given, so pandas
        # infers the layout. Confirm the raw files are not day-first
        # (DD-MM-YYYY); if they are, pass dayfirst=True or an explicit format.
        df[col] = pd.to_datetime(df[col], errors='coerce')
        newly_missing = int(df[col].isna().sum()) - missing_before
        if newly_missing > 0:
            print(
                f"WARNING: {newly_missing} value(s) in {label} column "
                f"'{col}' could not be parsed as dates and were set to NaT."
            )


# Convert date columns to datetime (any column whose name contains 'date')
date_cols_enrol = [col for col in enrol.columns if 'date' in col.lower()]
convert_date_columns(enrol, date_cols_enrol, "Enrollment")

date_cols_demo = [col for col in demo.columns if 'date' in col.lower()]
convert_date_columns(demo, date_cols_demo, "Demographic")


In [7]:
# ----------------------------
# 7. Encode Categorical Columns (Optional)
# ----------------------------

# For simple analysis, the categorical columns can stay as plain strings.
# For ML later, use label encoding or one-hot encoding.
# Example: one-hot encoding for the demo dataset (column names are lowercase in these files)
# demo_encoded = pd.get_dummies(demo, columns=['state', 'district'], drop_first=True)


In [8]:
# ----------------------------
# 8. Save Cleaned Data
# ----------------------------
# Each cleaned frame is written to its own CSV without the index column.
# NOTE(review): these absolute paths sit under a different Windows user
# profile (Pragya) than the raw input paths in section 2 (Tanaya) - confirm
# the intended output location before running on another machine.
processed_outputs = [
    (
        enrol,
        "C:/Users/Pragya/Desktop/uidai-hackathon/aadhaar-enrollment/data/processed/enrol_cleaned.csv",
    ),
    (
        demo,
        "C:/Users/Pragya/Desktop/uidai-hackathon/aadhaar-enrollment/data/processed/demo_cleaned.csv",
    ),
]
for frame, destination in processed_outputs:
    frame.to_csv(destination, index=False)

print("\nData cleaning completed. Cleaned files saved to 'processed' folder.")



Data cleaning completed. Cleaned files saved to 'processed' folder.
