# 01. Data Preprocessing
 
 **Objective**: Load, clean, standardize, and prepare Aadhaar enrollment data for analysis
 
 **Steps**:
 1. Load the district CSV files listed in `file_mapping` from the raw data directory (files that are missing are reported and skipped)
 2. Standardize date formats and district names
 3. Validate and clean data
 4. Handle missing values and duplicates
 5. Create aggregated datasets
 6. Save processed data


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

# %%
# Configuration
# Input and output folders, expressed as relative paths (one level above the
# current directory, under data/).
RAW_DATA_DIR = Path('..', 'data', 'raw')
PROCESSED_DATA_DIR = Path('..', 'data', 'processed')

# Ensure the output folder is present: missing parent folders are created and
# an already-existing folder is not treated as an error.
PROCESSED_DATA_DIR.mkdir(exist_ok=True, parents=True)

print(f"Raw data directory: {RAW_DATA_DIR}")
print(f"Processed data directory: {PROCESSED_DATA_DIR}")

Libraries imported successfully!
Raw data directory: ..\data\raw
Processed data directory: ..\data\processed


In [2]:
# Load individual district files
# Each key is the label under which a loaded frame is stored in `dataframes`;
# each value is the CSV file name looked up inside RAW_DATA_DIR.
file_mapping = {
    'Hyderabad': 'Hyderabad_main.csv',
    'Nalgonda': 'Nalgonda.csv',
    'Rangareddy': 'Rangareddy.csv',
    'RangaReddy_merged': 'RangaReddy_merged.csv'
}

# Label -> DataFrame, filled only for files that are actually present.
dataframes = {}

for district, filename in file_mapping.items():
    filepath = RAW_DATA_DIR / filename

    # A missing file is reported and skipped; the remaining files still load.
    if not filepath.exists():
        print(f"✗ File not found: {filename}")
        continue

    df = pd.read_csv(filepath)
    dataframes[district] = df
    print(f"✓ Loaded {filename}: {len(df)} records")

# Summary: number of files successfully read (may be fewer than listed above).
print(f"\nTotal files loaded: {len(dataframes)}")


✗ File not found: Hyderabad_main.csv
✓ Loaded Nalgonda.csv: 2867 records
✓ Loaded Rangareddy.csv: 208 records
✗ File not found: RangaReddy_merged.csv

Total files loaded: 2
