In [2]:

import pandas as pd
import numpy as np
import json
from google.colab import files

print("=" * 80)
print("CENSUS 2011 DISABILITY DASHBOARD - DATA PREPARATION WORKFLOW")
print("=" * 80)

# ============================================================================
# STEP 1: UPLOAD YOUR RAW CENSUS FILES
# ============================================================================
# Prompts for the raw input files through the Colab upload helper and lists
# what was received. The resulting `file_list` is what STEP 2 reads from.

print("\n[STEP 1] Upload your raw Census 2011 files")
print("-" * 80)
print("You need:")
print("  1. C-20 Disability data (state-level)")
print("  2. C-30 Disability by household type (for breakdowns)")
print("  3. A-01 Population totals (for prevalence calculation)")
print("\nUpload your files now:")

uploaded = files.upload()

# Identify which files were uploaded: the keys of the upload result are the
# names that later steps pass to pandas.
file_list = list(uploaded)

# The check mark is written as the real character; it had been stored as
# mis-decoded bytes and printed as garbage.
print(f"\n✓ Uploaded {len(file_list)} files:")
for uploaded_name in file_list:
    print(f"  - {uploaded_name}")

# ============================================================================
# STEP 2: LOAD STATE-LEVEL DISABILITY DATA
# ============================================================================
# Reads the first uploaded file into `df_disability` (CSV via read_csv,
# anything else via read_excel) and prints a short preview so the column
# names used by STEP 3 can be checked by eye.

print("\n[STEP 2] Loading state-level disability data")
print("-" * 80)

# Stop with an actionable message rather than a bare IndexError when the
# upload step returned nothing (e.g. the dialog was cancelled).
if not file_list:
    raise ValueError(
        "No files were uploaded - re-run STEP 1 and select the disability data file."
    )

# Load your C-20 or C-30 state-level file
# Adjust the filename to match what you uploaded
state_disability_file = file_list[0]  # Change index if needed

# Compare the extension case-insensitively so a file named "DATA.CSV" is not
# handed to the Excel reader.
if state_disability_file.lower().endswith('.csv'):
    df_disability = pd.read_csv(state_disability_file, encoding='utf-8', low_memory=False)
else:
    df_disability = pd.read_excel(state_disability_file)

print(f"✓ Loaded: {len(df_disability)} rows")
print(f"\nColumns: {list(df_disability.columns)[:10]}")
print("\nFirst 3 rows:")
print(df_disability.head(3))

# ============================================================================
# STEP 3: CLEAN AND FILTER TO STATE TOTALS
# ============================================================================
# Keeps only the rows that are the "Total" area type, the "Total" household
# type, and whose area name contains "State -"; then derives a clean `state`
# column by removing that prefix.

print("\n[STEP 3] Filtering to state-level totals only")
print("-" * 80)

# Filter based on your file structure
# Adjust these conditions based on your actual data
filter_columns = ['area_type', 'Type of Household', 'area_name']
missing_filter_columns = [c for c in filter_columns if c not in df_disability.columns]
if missing_filter_columns:
    # Same exception type pandas would raise, but naming every missing column
    # and showing what is actually available.
    raise KeyError(
        f"Columns needed for the state filter are missing: {missing_filter_columns}. "
        f"Available columns: {list(df_disability.columns)}"
    )

is_total_area = df_disability['area_type'] == 'Total'
is_total_household = df_disability['Type of Household'] == 'Total'
is_state_row = df_disability['area_name'].str.contains('State -', case=False, na=False)

# .copy() so the column assignments below do not write into a view of
# df_disability.
df_states = df_disability[is_total_area & is_total_household & is_state_row].copy()

print(f"✓ Filtered to {len(df_states)} states/UTs")
if df_states.empty:
    # Every later total would be zero; make the cause visible here.
    print("WARNING: the filter matched no rows - check the 'area_type', "
          "'Type of Household' and 'area_name' values in the preview above.")

# Clean state names
df_states['state'] = df_states['area_name'].str.replace('State - ', '', case=False).str.strip()

# Show what we have
print("\nStates found:")
print(df_states['state'].tolist())

# ============================================================================
# STEP 4: EXTRACT KEY METRICS
# ============================================================================
# Renames the raw Census column headers to short names, coerces them to
# integers, and computes the national totals and national sex ratio
# (disabled females per 1000 disabled males).

print("\n[STEP 4] Extracting national and state metrics")
print("-" * 80)

# Rename columns to standard names
df_states = df_states.rename(columns={
    'Total disabled persons': 'total_pwd',
    'Total disabled males': 'male_pwd',
    'Total disabled females': 'female_pwd',
    'Persons_in seeing': 'seeing',
    'Persons_in hearing': 'hearing',
    'Persons_in speech': 'speech',
    'Persons_in movement': 'movement',
    'Persons_mental retardation': 'mental_retard',
    'Persons_mental illness': 'mental_illness',
    'Persons_multiple disability': 'multiple',
    'Persons_any other': 'other'
})

# Convert to numeric
numeric_cols = ['total_pwd', 'male_pwd', 'female_pwd', 'seeing', 'hearing',
                'speech', 'movement', 'mental_retard', 'mental_illness', 'multiple', 'other']

for col in numeric_cols:
    if col in df_states.columns:
        # Unparseable cells become 0 rather than NaN so the column can be int.
        df_states[col] = pd.to_numeric(df_states[col], errors='coerce').fillna(0).astype(int)

# The conversion loop tolerates absent columns, but the three headline totals
# are read unconditionally below; report all missing ones in a single error.
missing_totals = [c for c in ('total_pwd', 'male_pwd', 'female_pwd') if c not in df_states.columns]
if missing_totals:
    raise KeyError(
        f"Required total columns not found after renaming: {missing_totals}. "
        f"Check the source column names in the rename mapping."
    )

# Calculate national totals
national_total_pwd = df_states['total_pwd'].sum()
national_male_pwd = df_states['male_pwd'].sum()
national_female_pwd = df_states['female_pwd'].sum()

# Round instead of truncating so the national figure follows the same rule as
# the per-state sex ratio computed in STEP 6, and avoid dividing by a zero
# male total (which would otherwise fail on int(inf) / int(nan)).
if national_male_pwd:
    national_sex_ratio = int(round(float(national_female_pwd) / float(national_male_pwd) * 1000))
else:
    national_sex_ratio = 0

print("\n✓ NATIONAL TOTALS:")
print(f"  Total PWDs: {national_total_pwd:,}")
print(f"  Male PWDs: {national_male_pwd:,}")
print(f"  Female PWDs: {national_female_pwd:,}")
print(f"  Sex Ratio: {national_sex_ratio} females per 1000 males")

# ============================================================================
# STEP 5: CALCULATE DISABILITY TYPE PERCENTAGES (NATIONAL)
# ============================================================================
# Sums each disability-type column over all state rows and expresses it as a
# percentage of `national_total_pwd`. The result is a list of
# {'type', 'count', 'percentage'} dicts in `disability_type_data`.

print("\n[STEP 5] Calculating disability type distribution")
print("-" * 80)

# National totals by type
disability_type_data = []

# Display label -> column name (as renamed in STEP 4). Dict order is the
# output order.
type_columns = {
    'Seeing': 'seeing',
    'Movement': 'movement',
    'Hearing': 'hearing',
    'Speech': 'speech',
    'Mental Retardation': 'mental_retard',
    'Mental Illness': 'mental_illness',
    'Multiple': 'multiple',
    'Other': 'other',
}

# Check every column individually. Testing only 'seeing' would still raise a
# KeyError as soon as any of the other seven is absent.
absent_columns = [col for col in type_columns.values() if col not in df_states.columns]
if absent_columns:
    print(f"WARNING: disability type columns not found and skipped: {absent_columns}")

types = {
    label: int(df_states[col].sum())
    for label, col in type_columns.items()
    if col in df_states.columns
}

for name, count in types.items():
    # A zero national total (e.g. an empty filter result) yields 0% instead
    # of a division error / NaN in the JSON.
    pct = (count / national_total_pwd) * 100 if national_total_pwd else 0.0
    disability_type_data.append({
        'type': name,
        'count': count,
        'percentage': round(float(pct), 2)
    })
    print(f"  {name}: {count:,} ({pct:.2f}%)")

# ============================================================================
# STEP 6: MERGE WITH POPULATION DATA FOR PREVALENCE
# ============================================================================
# Attaches a total population to each state by name, then derives
# `pwd_prevalence` (PWDs as a percentage of population) and `sex_ratio`
# (disabled females per 1000 disabled males) per state.

print("\n[STEP 6] Calculating PWD prevalence by state")
print("-" * 80)

# If you have population file, load it
# Otherwise, we'll use estimates

# Example: Using Census 2011 known total populations
# You can replace this with your actual A-01 file
# NOTE(review): some of these values look like provisional rather than final
# Census 2011 totals - confirm against the A-01 table before publishing.

state_populations = {
    'JAMMU & KASHMIR': 12548926,
    'HIMACHAL PRADESH': 6856509,
    'PUNJAB': 27704236,
    'CHANDIGARH': 1054686,
    'UTTARAKHAND': 10116752,
    'HARYANA': 25353081,
    'NCT OF DELHI': 16753235,
    'RAJASTHAN': 68621012,
    'UTTAR PRADESH': 199581477,
    'BIHAR': 103804637,
    'SIKKIM': 607688,
    'ARUNACHAL PRADESH': 1382611,
    'NAGALAND': 1980602,
    'MANIPUR': 2721756,
    'MIZORAM': 1091014,
    'TRIPURA': 3671032,
    'MEGHALAYA': 2964007,
    'ASSAM': 31169272,
    'WEST BENGAL': 91347736,
    'JHARKHAND': 32966238,
    'ODISHA': 41947358,
    'CHHATTISGARH': 25540196,
    'MADHYA PRADESH': 72597565,
    'GUJARAT': 60383628,
    'DAMAN & DIU': 242911,
    'DADRA & NAGAR HAVELI': 342853,
    'MAHARASHTRA': 112372972,
    'ANDHRA PRADESH': 84665533,  # Pre-Telangana split
    'KARNATAKA': 61130704,
    'GOA': 1457723,
    'LAKSHADWEEP': 64429,
    'KERALA': 33387677,
    'TAMIL NADU': 72138958,
    'PUDUCHERRY': 1244464,
    'ANDAMAN & NICOBAR ISLANDS': 379944
}

# Clean state names for matching
df_states['state_upper'] = df_states['state'].str.upper().str.strip()

# Add population (NaN where the name has no entry in the table above)
df_states['total_population'] = df_states['state_upper'].map(state_populations)

# Surface names that failed to match instead of letting them silently end up
# with a blank prevalence.
unmatched_states = df_states.loc[df_states['total_population'].isna(), 'state'].tolist()
if unmatched_states:
    print(f"WARNING: no population figure for {unmatched_states}; "
          f"prevalence is left blank for these rows.")

# Calculate prevalence
df_states['pwd_prevalence'] = (df_states['total_pwd'] / df_states['total_population'] * 100).round(2)

# Calculate sex ratio per state.
# A zero male count is masked to NaN before dividing, and the result is stored
# as nullable Int64, so an undefined ratio becomes <NA> instead of making
# astype(int) fail on inf/NaN.
male_nonzero = df_states['male_pwd'].where(df_states['male_pwd'] != 0)
df_states['sex_ratio'] = (
    (df_states['female_pwd'] / male_nonzero * 1000).round(0).astype('Int64')
)

print(f"✓ Calculated prevalence for {df_states['pwd_prevalence'].notna().sum()} states")

# Show top 10
print("\nTop 10 states by PWD prevalence:")
top10 = df_states.nlargest(10, 'pwd_prevalence')[['state', 'pwd_prevalence', 'total_pwd']]
print(top10.to_string(index=False))

# ============================================================================
# STEP 7: CREATE DASHBOARD DATA FILES
# ============================================================================
# Writes three JSON files to the working directory: the national summary, one
# record per state, and the disability type distribution from STEP 5.

print("\n[STEP 7] Creating visualization-ready data files")
print("-" * 80)

# Census 2011 total pop, used as the denominator for national prevalence
INDIA_POPULATION_2011 = 1210854977

# File 1: National summary
national_summary = {
    'total_pwd': int(national_total_pwd),
    'male_pwd': int(national_male_pwd),
    'female_pwd': int(national_female_pwd),
    'sex_ratio': national_sex_ratio,
    'national_prevalence': round((national_total_pwd / INDIA_POPULATION_2011) * 100, 2),
    'rural_pct': 69.0,  # From Census reports
    'urban_pct': 31.0,
    'homeless_pwd': 68422,  # From your Census document
    'homeless_urban': 43509,
    'homeless_rural': 24913
}

with open('census_national_summary.json', 'w', encoding='utf-8') as out_file:
    json.dump(national_summary, out_file, indent=2)

print("✓ Created: census_national_summary.json")

# File 2: State-level data
state_output = []

for _, row in df_states.iterrows():
    has_population = pd.notna(row['total_population'])
    state_output.append({
        'state': row['state'],
        # Missing prevalence/population are written as 0 so the values stay
        # numeric for the dashboard.
        'pwd_prevalence': float(row['pwd_prevalence']) if pd.notna(row['pwd_prevalence']) else 0,
        'pwd_population': int(row['total_pwd']),
        'total_population': int(row['total_population']) if has_population else 0,
        # Fall back to the computed national ratio rather than a hard-coded
        # copy of it, so the fallback cannot drift from the data.
        'sex_ratio': int(row['sex_ratio']) if pd.notna(row['sex_ratio']) else int(national_sex_ratio),
        'male_pwd': int(row['male_pwd']),
        'female_pwd': int(row['female_pwd'])
    })

with open('census_state_data.json', 'w', encoding='utf-8') as out_file:
    json.dump(state_output, out_file, indent=2)

print("✓ Created: census_state_data.json")

# File 3: Disability types distribution
with open('census_disability_types.json', 'w', encoding='utf-8') as out_file:
    json.dump(disability_type_data, out_file, indent=2)

print("✓ Created: census_disability_types.json")

# ============================================================================
# STEP 8: CREATE HARDCODED DATA FOR DASHBOARD
# ============================================================================
# Prints three JavaScript literals (nationalData, stateData, disabilityTypes)
# to the cell output for copy-pasting into the dashboard source.

print("\n[STEP 8] Creating hardcoded data arrays for React component")
print("-" * 80)

# Generate the exact JavaScript arrays to paste in your component

print("\n// PASTE THIS IN YOUR PC1.HTML or census-dashboard.js:")
print("\nconst nationalData = {")
print(f"  total_pwd: {national_total_pwd},")
print(f"  male_pwd: {national_male_pwd},")
print(f"  female_pwd: {national_female_pwd},")
print(f"  sex_ratio: {national_sex_ratio},")
print(f"  national_prevalence: {national_summary['national_prevalence']}")
print("};")

# Top 15 states: rank by prevalence first. Slicing state_output directly only
# yields the first 15 rows in source-file order.
top_states = sorted(state_output, key=lambda s: s['pwd_prevalence'], reverse=True)[:15]

print("\nconst stateData = [")
for state in top_states:
    print(f"  {{ state: '{state['state']}', prevalence: {state['pwd_prevalence']}, pwd_pop: {state['pwd_population']} }},")
print("];")

# Assign icons and colors. Built once, outside the loop; the icon glyphs are
# the real emoji (they had been stored as mis-decoded bytes, which would have
# been pasted into the dashboard as garbage).
icons = {
    'Seeing': '👁️', 'Movement': '🦽', 'Hearing': '👂', 'Speech': '💬',
    'Mental Retardation': '🧠', 'Mental Illness': '💭', 'Multiple': '🔀', 'Other': '➕'
}
colors = {
    'Seeing': '#9333ea', 'Movement': '#7c3aed', 'Hearing': '#a855f7', 'Speech': '#e9d5ff',
    'Mental Retardation': '#ec4899', 'Mental Illness': '#f9a8d4', 'Multiple': '#d8b4fe', 'Other': '#c084fc'
}

print("\nconst disabilityTypes = [")
for dtype in disability_type_data:
    label = dtype['type']
    # Unknown labels fall back to the 'Other' icon and color.
    icon = icons.get(label, '➕')
    color = colors.get(label, '#c084fc')
    print(f"  {{ name: '{label}', pct: {dtype['percentage']}, count: {dtype['count']}, icon: '{icon}', color: '{color}' }},")
print("];")

# ============================================================================
# STEP 9: DOWNLOAD ALL OUTPUT FILES
# ============================================================================
# Hands the three JSON files written in STEP 7 to the Colab download helper.

print("\n[STEP 9] Downloading processed data files")
print("-" * 80)

json_outputs = (
    'census_national_summary.json',
    'census_state_data.json',
    'census_disability_types.json',
)
for output_name in json_outputs:
    files.download(output_name)

# Check mark written as the real character instead of mis-decoded bytes.
print(f"\n✓ Downloaded {len(json_outputs)} JSON files")

# ============================================================================
# STEP 10: VALIDATION & SUMMARY
# ============================================================================
# Prints a human-readable recap of the national figures, the five states with
# the highest prevalence, and the disability types ordered by share, so the
# numbers can be checked against the raw Census tables.

print("\n[STEP 10] Data Validation Summary")
print("=" * 80)

# Guard the gender split against a zero national total (e.g. when the STEP 3
# filter matched nothing) so the summary still prints.
if national_total_pwd:
    male_share = national_male_pwd / national_total_pwd * 100
    female_share = national_female_pwd / national_total_pwd * 100
else:
    male_share = female_share = 0.0

# Headings use the real emoji; they had been stored as mis-decoded bytes.
print("\n📊 NATIONAL STATISTICS:")
print(f"  Total PWD Population: {national_total_pwd:,}")
print(f"  As % of India population: {national_summary['national_prevalence']}%")
print(f"  Gender Split: {male_share:.1f}% Male, {female_share:.1f}% Female")
print(f"  Sex Ratio: {national_sex_ratio}:1000")

print("\n🏆 TOP 5 STATES (by prevalence):")
top5 = sorted(state_output, key=lambda x: x['pwd_prevalence'], reverse=True)[:5]
for i, state in enumerate(top5, 1):
    print(f"  {i}. {state['state']}: {state['pwd_prevalence']}% ({state['pwd_population']:,} PWDs)")

print("\n📋 DISABILITY TYPES (sorted by %):")
sorted_types = sorted(disability_type_data, key=lambda x: x['percentage'], reverse=True)
for dtype in sorted_types:
    print(f"  {dtype['type']}: {dtype['percentage']}% ({dtype['count']:,} people)")

print("\n" + "=" * 80)
print("✅ DATA PREPARATION COMPLETE!")
print("=" * 80)
print("\nNext steps:")
print("1. Review the JavaScript arrays printed above")
print("2. Copy them into your PC1.html React component")
print("3. Verify numbers match your raw Census data")
print("4. Upload PC1.html to GitHub")
print("\nAll source data and calculations documented in this notebook.")
print("=" * 80)

# ============================================================================
# BONUS: EXPORT CLEANED CSV FOR REFERENCE
# ============================================================================
# Saves the per-state metrics as a flat CSV and offers it for download.

# Export the cleaned state-level data as CSV for your records
export_columns = ['state', 'total_pwd', 'pwd_prevalence', 'sex_ratio', 'male_pwd', 'female_pwd']
df_states[export_columns].to_csv('census_states_cleaned.csv', index=False)

files.download('census_states_cleaned.csv')

# Emoji written as the real character instead of mis-decoded bytes.
print("\n📥 Also downloaded: census_states_cleaned.csv (for your records)")

CENSUS 2011 DISABILITY DASHBOARD - DATA PREPARATION WORKFLOW

[STEP 1] Upload your raw Census 2011 files
--------------------------------------------------------------------------------
You need:
  1. C-20 Disability data (state-level)
  2. C-30 Disability by household type (for breakdowns)
  3. A-01 Population totals (for prevalence calculation)

Upload your files now:


Saving Census pwd data state.csv to Census pwd data state.csv

✓ Uploaded 1 files:
  - Census pwd data state.csv

[STEP 2] Loading state-level disability data
--------------------------------------------------------------------------------
✓ Loaded: 433 rows

Columns: ['table_name', 'state_code', 'district_code', 'area_name', 'area_type', 'Type of Household', 'No. of Households having disabled persons', 'Total disabled persons', 'Total disabled males', 'Total disabled females']

First 3 rows:
  table_name  state_code  district_code area_name area_type  \
0      C0130         0.0            0.0     INDIA     Total   
1      C0130         0.0            0.0     INDIA     Total   
2      C0130         0.0            0.0     INDIA     Total   

         Type of Household  No. of Households having disabled persons  \
0                    Total                                 20780194.0   
1         Normal Household                                 20643273.0   
2  Institutional Household

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✓ Downloaded 3 JSON files

[STEP 10] Data Validation Summary

📊 NATIONAL STATISTICS:
  Total PWD Population: 26,814,994
  As % of India population: 2.21%
  Gender Split: 55.9% Male, 44.1% Female
  Sex Ratio: 789:1000

🏆 TOP 5 STATES (by prevalence):
  1. SIKKIM: 2.99% (18,187 PWDs)
  2. ODISHA: 2.97% (1,244,402 PWDs)
  3. JAMMU & KASHMIR: 2.88% (361,153 PWDs)
  4. ANDHRA PRADESH: 2.68% (2,266,607 PWDs)
  5. MAHARASHTRA: 2.64% (2,963,392 PWDs)

📋 DISABILITY TYPES (sorted by %):
  Movement: 20.28% (5,436,826 people)
  Hearing: 18.92% (5,072,914 people)
  Seeing: 18.77% (5,033,431 people)
  Other: 18.38% (4,927,589 people)
  Multiple: 7.89% (2,116,698 people)
  Speech: 7.45% (1,998,692 people)
  Mental Retardation: 5.62% (1,505,964 people)
  Mental Illness: 2.7% (722,880 people)

✅ DATA PREPARATION COMPLETE!

Next steps:
1. Review the JavaScript arrays printed above
2. Copy them into your PC1.html React component
3. Verify numbers match your raw Census data
4. Upload PC1.html to GitHub

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


📥 Also downloaded: census_states_cleaned.csv (for your records)
