In [4]:
# ============================================================================
# COMPLETE PWD VOTER DATA CLEANING - ALL COLUMNS
# ============================================================================

import pandas as pd
import json
from google.colab import files

# Title banner for the cell output.
print("=" * 70)
print("PWD VOTER DATA CLEANING - COMPLETE VERSION")
print("=" * 70)

# Ask for the voter file through the Colab upload widget.
print("\n📁 Upload your PWD voter Excel/CSV file:")
uploaded = files.upload()

# `uploaded` is keyed by file name and only its first entry is used.
# Stop with a readable message instead of a bare IndexError when it is empty
# (presumably what happens when the upload dialog is dismissed).
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and choose a CSV or Excel file."
    )
voter_file = next(iter(uploaded))

print(f"\n✓ Uploaded: {voter_file}")

# ============================================================================
# LOAD FILE
# ============================================================================

print("\n" + "=" * 70)
print("LOADING DATA")
print("=" * 70)

# Choose the reader from the file extension. The comparison is
# case-insensitive so that a name such as "VOTERS.CSV" is not handed to the
# Excel reader; anything that is not a CSV is treated as a workbook.
if voter_file.lower().endswith('.csv'):
    df = pd.read_csv(voter_file, encoding='utf-8')
else:
    df = pd.read_excel(voter_file)

print(f"✓ Loaded {len(df)} rows")

# Echo the raw headers, numbered from 1, so they can be compared with the
# names produced by the column-cleaning step.
print("\nYour columns:")
for i, col in enumerate(df.columns, 1):
    print(f"  {i}. {col}")

# Show a few rows to confirm the file parsed as expected.
print("\nFirst 3 rows:")
print(df.head(3))

# ============================================================================
# CLEAN COLUMN NAMES
# ============================================================================

print(f"\n{'=' * 70}")
print("CLEANING COLUMN NAMES")
print('=' * 70)

# Normalise the headers: lower-case, no surrounding whitespace.
df.columns = df.columns.str.lower().str.strip()

# Canonical column name -> every header spelling that is renamed to it.
_aliases_by_target = {
    # State
    'state': ('states', 'state', 'state/ut'),

    # PWD counts
    'male': ('males', 'male'),
    'female': ('females', 'female'),
    'transgender': ('transgender', 'trans'),
    'total_pwd': ('total', 'total pwd', 'total pwd electors'),

    # Ratios
    'female_male_ratio': (
        'ratio of female to male',
        'female to male ratio',
        'female/male ratio',
    ),

    # Total electors
    'total_electors': ('total electors', 'total_electors'),

    # PWD percentage
    'pwd_pct': (
        'percentage of pwd electors',
        'pwd percentage',
        '% of pwd electors',
        'percentage of pwd',
    ),

    # Postal ballot
    'postal_ballot': (
        'total number of pwd electors who voted through postal ballot',
        'postal ballot',
        'postal_ballot',
    ),

    # Home voting (critical - several spellings are accepted)
    'home_voting_pct': (
        'percentage of pwd electors who voted from home',
        'pwd electors who voted from home',
        '% voted from home',
        'home voting percentage',
        'home voting %',
        'voted from home',
    ),
}

# Flatten into the {header spelling: canonical name} form that rename() takes.
column_mapping = {
    alias: target
    for target, aliases in _aliases_by_target.items()
    for alias in aliases
}

# Headers that are not listed above keep their normalised name.
df = df.rename(columns=column_mapping)

print("✓ Cleaned column names")
print("\nNew columns:")
for col in df.columns:
    print(f"  - {col}")

# The accessibility categories and the home-voting summary are built from
# 'home_voting_pct'. When no header was mapped to it, warn and list the
# headers that look related so the mapping can be extended.
if 'home_voting_pct' not in df.columns:
    print("\n⚠️  WARNING: 'home_voting_pct' column not found!")
    print("Available columns that might be it:")
    for col in df.columns:
        # str() keeps the substring test safe for non-text column labels.
        if any(hint in str(col) for hint in ('home', 'voting', 'vote')):
            print(f"  - {col}")
    print("\nPlease tell me which column has home voting percentage")

# ============================================================================
# CONVERT DATA TYPES
# ============================================================================

print("\n" + "=" * 70)
print("CONVERTING DATA TYPES")
print("=" * 70)


def _to_number(series):
    """Parse a column of number-like text into numeric values.

    Thousands separators (",") and percent signs are removed before parsing.
    Values that still cannot be parsed are returned as NaN so the caller can
    count them before choosing a fill value.
    """
    cleaned = (
        series.astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('%', '', regex=False)
    )
    return pd.to_numeric(cleaned, errors='coerce')


# Integer columns
int_cols = ['male', 'female', 'transgender', 'total_pwd', 'total_electors', 'postal_ballot']

# Float columns (ratios and percentages)
float_cols = ['female_male_ratio', 'pwd_pct', 'home_voting_pct']

for col in int_cols + float_cols:
    if col not in df.columns:
        continue

    numbers = _to_number(df[col])

    # Blank or unparseable cells are still stored as 0, but report how many
    # there were: a filled-in 0 is indistinguishable from a real 0 in the
    # categories and rankings computed later.
    unparsed = int(numbers.isna().sum())
    if unparsed:
        print(f"  WARNING: {col}: {unparsed} blank or non-numeric value(s) replaced with 0")

    numbers = numbers.fillna(0)
    df[col] = numbers.astype(int) if col in int_cols else numbers

print("✓ Converted to numeric")

# ============================================================================
# ASSIGN REGIONS
# ============================================================================

print("\n" + "=" * 70)
print("ASSIGNING REGIONS")
print("=" * 70)

# Region -> state/UT spellings accepted for it.
# NOTE(review): some states/UTs (e.g. Madhya Pradesh, Chhattisgarh,
# Lakshadweep) appear in none of these lists and therefore map to 'Other';
# confirm whether that is intended.
_REGION_STATES = {
    'North': ['Haryana', 'Punjab', 'Himachal Pradesh', 'Jammu and Kashmir',
              'Ladakh', 'Chandigarh', 'Delhi', 'NCT of Delhi', 'Uttar Pradesh',
              'Uttarakhand', 'Rajasthan'],
    'South': ['Karnataka', 'Tamil Nadu', 'Kerala', 'Andhra Pradesh', 'Telangana',
              'Puducherry', 'Andaman and Nicobar Islands'],
    'East': ['Bihar', 'Jharkhand', 'Odisha', 'West Bengal'],
    'West': ['Gujarat', 'Maharashtra', 'Goa', 'DNHDD', 'Daman & Diu',
             'Dadra & Nagar Haveli', 'Dadra and Nagar Haveli and Daman and Diu'],
    'Northeast': ['Assam', 'Arunachal Pradesh', 'Manipur', 'Meghalaya', 'Mizoram',
                  'Nagaland', 'Sikkim', 'Tripura'],
}


def _normalise_state(state):
    """Return *state* case-folded, with '&' spelled 'and' and single spaces."""
    return ' '.join(str(state).replace('&', ' and ').split()).casefold()


# Normalised name -> region, built once instead of on every call.
_REGION_BY_STATE = {
    _normalise_state(name): region
    for region, names in _REGION_STATES.items()
    for name in names
}


def assign_region(state):
    """Return the region label for a state or union-territory name.

    Matching ignores letter case, surrounding or repeated whitespace, and
    whether "and" is written as "&", so 'kerala', ' Kerala ' and
    'JAMMU & KASHMIR' resolve like their canonical spellings.

    Args:
        state: The state name. Any value is accepted; it is converted with
            str() before matching.

    Returns:
        One of 'North', 'South', 'East', 'West' or 'Northeast', or 'Other'
        when the name is not in the lookup table.
    """
    return _REGION_BY_STATE.get(_normalise_state(state), 'Other')

# Every later step reads 'state'; fail here with a readable message when no
# header was mapped to it.
if 'state' not in df.columns:
    raise KeyError(
        "No 'state' column found - add the file's state header to column_mapping"
    )

df['region'] = df['state'].apply(assign_region)

print("✓ Assigned regions")

# Show which names fell through to 'Other' so gaps in the region lookup are
# visible instead of silently ending up in the exported data.
unmatched = sorted(df.loc[df['region'] == 'Other', 'state'].astype(str).unique())
if unmatched:
    print(f"  Not matched to a region (labelled 'Other'): {', '.join(unmatched)}")

# ============================================================================
# CALCULATE ADDITIONAL METRICS
# ============================================================================

print(f"\n{'=' * 70}")
print("CALCULATING METRICS")
print('=' * 70)


def categorize_access(pct):
    """Bucket a home-voting percentage into an accessibility label.

    Returns 'High' from 15 upwards, 'Moderate' from 5, 'Low' from 2, and
    'Crisis' for every other value.
    """
    # Thresholds are checked from the highest down; the first one met wins.
    for floor, label in ((15, 'High'), (5, 'Moderate'), (2, 'Low')):
        if pct >= floor:
            return label
    return 'Crisis'

# Label every row, but only when the home-voting column is present.
if 'home_voting_pct' in df.columns:
    df['accessibility_category'] = df['home_voting_pct'].map(categorize_access)

# Population size bucket
def categorize_pop(pwd):
    """Bucket a PWD elector count into a population-size label.

    Returns 'Very Large' from 500000 upwards, 'Large' from 200000, 'Medium'
    from 50000, and 'Small' otherwise.
    """
    if pwd >= 500000:
        return 'Very Large'
    if pwd >= 200000:
        return 'Large'
    return 'Medium' if pwd >= 50000 else 'Small'

# Size bucket derived from each row's PWD elector count.
df['population_category'] = df['total_pwd'].apply(categorize_pop)

# Gender parity (1.0 = perfect, <1 = fewer women). Ratios above 1.0 are
# capped at 1.0.
df['gender_parity_score'] = df['female_male_ratio'].clip(upper=1.0)

print("✓ Calculated categories")

# ============================================================================
# SUMMARY STATISTICS
# ============================================================================

print("\n" + "=" * 70)
print("SUMMARY STATISTICS")
print("=" * 70)

# Totals over all rows of the cleaned table.
pwd_sum = df['total_pwd'].sum()
electors_sum = df['total_electors'].sum()

print(f"Total PWD Voters: {pwd_sum:,}")
print(f"Total Electors: {electors_sum:,}")

# Skip the percentage when no electors were recorded (empty file, or a
# column that failed to parse) rather than dividing by zero.
if electors_sum:
    print(f"National PWD %: {(pwd_sum / electors_sum * 100):.2f}%")
else:
    print("National PWD %: n/a (no electors recorded)")

# The home-voting sections are only shown when that column was found.
if 'home_voting_pct' in df.columns:
    print("\n🏆 TOP 5 - Home Voting Accessibility:")
    top5 = df.nlargest(5, 'home_voting_pct')[['state', 'home_voting_pct', 'total_pwd']]
    print(top5.to_string(index=False))

    # Rows with more than 200,000 PWD electors and under 2% home voting.
    print("\n⚠️  CRISIS STATES (>200K PWD, <2% home voting):")
    crisis = df[(df['total_pwd'] > 200000) & (df['home_voting_pct'] < 2)]
    if not crisis.empty:
        print(crisis[['state', 'total_pwd', 'home_voting_pct']].to_string(index=False))
    else:
        print("  None found")

# The five rows with the lowest female-to-male ratio.
print("\n🚺 WORST GENDER GAPS:")
gender = df.nsmallest(5, 'female_male_ratio')[['state', 'female_male_ratio', 'female', 'male']]
print(gender.to_string(index=False))

# ============================================================================
# EXPORT TO JSON
# ============================================================================

print("\n" + "=" * 70)
print("CREATING JSON")
print("=" * 70)

# Whether the optional home-voting fields can be exported; checked once
# rather than for every row.
has_home_voting = 'home_voting_pct' in df.columns

voter_data = []

for _, row in df.iterrows():
    # int()/float() turn the pandas/NumPy scalars into plain Python numbers,
    # which json can serialise.
    state_data = {
        'state': str(row['state']).strip(),
        'pwd_electors': int(row['total_pwd']),
        'male': int(row['male']),
        'female': int(row['female']),
        'transgender': int(row['transgender']),
        'female_male_ratio': float(row['female_male_ratio']),
        'total_electors': int(row['total_electors']),
        'pwd_percentage': float(row['pwd_pct']),
        'postal_ballot': int(row['postal_ballot']),
        'region': row['region'],
        'population_category': row['population_category'],
        'gender_parity_score': float(row['gender_parity_score'])
    }

    # Add home voting if it exists
    if has_home_voting:
        state_data['home_voting_pct'] = float(row['home_voting_pct'])
        state_data['accessibility_category'] = row['accessibility_category']

    voter_data.append(state_data)

# Save JSON
with open('pwd_voter_data.json', 'w', encoding='utf-8') as f:
    json.dump(voter_data, f, indent=2, ensure_ascii=False)

print(f"✓ Created JSON with {len(voter_data)} states")

# Preview the first record with the same escaping as the file on disk.
# An empty table has no first record, so say so instead of raising IndexError.
print("\nJSON preview (first state):")
if voter_data:
    print(json.dumps(voter_data[0], indent=2, ensure_ascii=False))
else:
    print("  (no rows to preview)")

# ============================================================================
# SAVE CLEANED CSV TOO
# ============================================================================

# Write the cleaned table, including the derived columns, without the row index.
df.to_csv('pwd_voter_data_cleaned.csv', index=False)
print("\n✓ Saved cleaned CSV")

# ============================================================================
# DOWNLOAD
# ============================================================================

print("\n" + "=" * 70)
print("DOWNLOADING")
print("=" * 70)

# Send both generated files to the browser.
files.download('pwd_voter_data.json')
files.download('pwd_voter_data_cleaned.csv')

print("\n✅ COMPLETE!")
print("\nFiles downloaded:")
print("  1. pwd_voter_data.json → Upload to GitHub")
print("  2. pwd_voter_data_cleaned.csv → Backup")
print("=" * 70)

PWD VOTER DATA CLEANING - COMPLETE VERSION

📁 Upload your PWD voter Excel/CSV file:


Saving voters with disability - Sheet1.csv to voters with disability - Sheet1.csv

✓ Uploaded: voters with disability - Sheet1.csv

LOADING DATA
✓ Loaded 37 rows

Your columns:
  1. states
  2. males
  3. females
  4. transgender
  5. total
  6. ratio of female to male
  7. total electors
  8. percentage of pwd electors
  9. Total number of PwD electors who voted through postal ballot
  10. Percentage of PwD electors who voted from home

First 3 rows:
                        states     males   females  transgender   total  \
0  Andaman and Nicobar Islands    1422.0    1000.0          0.0    2422   
1               Andhra Pradesh  303008.0  216333.0         24.0  519365   
2            Arunachal Pradesh    2849.0    2747.0          0.0    5596   

   ratio of female to male  total electors percentage of pwd electors  \
0                 0.703235        315148.0                      0.77%   
1                 0.713951      41333702.0                      1.26%   
2                 0.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ COMPLETE!

Files downloaded:
  1. pwd_voter_data.json → Upload to GitHub
  2. pwd_voter_data_cleaned.csv → Backup
