In [None]:
import pandas as pd

df = pd.read_csv('final_df.csv')

print(df.columns.tolist())

['state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_greater', 'date', 'demo_age_5_17', 'demo_age_17_greater', 'enr_age_0_5', 'enr_age_5_17', 'enr_age_18_greater']


state-wise demo % by age group  || Top 5 states — Age 5–17 (Demographic change %) || Top 5 states — Age 17+ (Demographic change %)

In [10]:
import pandas as pd

# group by state
state_summary = df.groupby('state').agg({
    'demo_age_5_17': 'sum',
    'demo_age_17_greater': 'sum',
    'enr_age_5_17': 'sum',
    'enr_age_18_greater': 'sum'
}).reset_index()

# remove zero-enrollment states to avoid infinite %
state_summary = state_summary[
    (state_summary['enr_age_5_17'] > 0) &
    (state_summary['enr_age_18_greater'] > 0)
]

# demo % for age 5–17
state_summary['demo_pct_5_17'] = (
    state_summary['demo_age_5_17'] /
    state_summary['enr_age_5_17']
) * 100

# demo % for age 17+
state_summary['demo_pct_17_plus'] = (
    state_summary['demo_age_17_greater'] /
    state_summary['enr_age_18_greater']
) * 100


top_5_5_17 = (
    state_summary[['state', 'demo_pct_5_17']]
    .sort_values(by='demo_pct_5_17', ascending=False)
    .head(5)
)

print("Top 5 states – Demographic Change % (Age 5–17)")
print(top_5_5_17)


top_5_17_plus = (
    state_summary[['state', 'demo_pct_17_plus']]
    .sort_values(by='demo_pct_17_plus', ascending=False)
    .head(5)
)

print("\nTop 5 states – Demographic Change % (Age 17+)")
print(top_5_17_plus)


state_summary['adult_child_demo_ratio'] = (
    state_summary['demo_pct_17_plus'] /
    state_summary['demo_pct_5_17']
)


Top 5 states – Demographic Change % (Age 5–17)
               state  demo_pct_5_17
5         Chandigarh    3631.221719
26        Puducherry    2004.724409
1     Andhra Pradesh    1834.538056
12  Himachal Pradesh    1785.923754
18       Lakshadweep    1120.000000

Top 5 states – Demographic Change % (Age 17+)
             state  demo_pct_17_plus
5       Chandigarh     101424.390244
25          Odisha      99830.277186
30      Tamil Nadu      89779.401709
1   Andhra Pradesh      80570.964912
31       Telangana      75762.043796


Identity Volatility Index (IVI) || IVI = (Biographical Changes + Demographic Changes) / Enrollments

In [12]:
import pandas as pd

ivi_df = df.groupby(
    ['state', 'district', 'date'],
    as_index=False
).agg({
    'bio_age_5_17': 'sum',
    'bio_age_17_greater': 'sum',
    'demo_age_5_17': 'sum',
    'demo_age_17_greater': 'sum',
    'enr_age_0_5': 'sum',
    'enr_age_5_17': 'sum',
    'enr_age_18_greater': 'sum'
})

ivi_df['ivi_5_17'] = (
    (ivi_df['bio_age_5_17'] + ivi_df['demo_age_5_17']) /
    ivi_df['enr_age_5_17']
)

ivi_df['ivi_17_plus'] = (
    (ivi_df['bio_age_17_greater'] + ivi_df['demo_age_17_greater']) /
    ivi_df['enr_age_18_greater']
)

ivi_df['total_updates'] = (
    ivi_df['bio_age_5_17'] +
    ivi_df['bio_age_17_greater'] +
    ivi_df['demo_age_5_17'] +
    ivi_df['demo_age_17_greater']
)

ivi_df['total_enrollments'] = (
    ivi_df['enr_age_0_5'] +
    ivi_df['enr_age_5_17'] +
    ivi_df['enr_age_18_greater']
)

# avoid divide-by-zero
ivi_df = ivi_df[ivi_df['total_enrollments'] > 0]

ivi_df['ivi_overall'] = (
    ivi_df['total_updates'] /
    ivi_df['total_enrollments']
)

high_ivi_districts = (
    ivi_df.groupby(['state', 'district'])
    .agg({'ivi_overall': 'mean'})
    .reset_index()
    .sort_values(by='ivi_overall', ascending=False)
    .head(10)
)

print(high_ivi_districts)

age_ivi_summary = ivi_df[['ivi_5_17', 'ivi_17_plus']].mean()

print("Average IVI by age group")
print(age_ivi_summary)

ivi_time = (
    ivi_df.groupby('date')
    .agg({'ivi_overall': 'mean'})
    .reset_index()
)

print(ivi_time.head())

ivi_df['migration_ratio'] = (
    (ivi_df['demo_age_5_17'] + ivi_df['demo_age_17_greater']) /
    (ivi_df['bio_age_5_17'] + ivi_df['bio_age_17_greater'] + 1)
)

print(ivi_df.head())


           state     district  ivi_overall
385  Maharashtra       Nashik   165.248223
388  Maharashtra     Parbhani   106.565661
383  Maharashtra       Nanded   103.209408
379  Maharashtra        Latur    80.103649
391  Maharashtra    Ratnagiri    78.522024
395  Maharashtra      Solapur    77.356404
398  Maharashtra       Washim    72.727547
410      Manipur      Thoubal    71.429808
377  Maharashtra        Jalna    71.156678
404      Manipur  Imphal West    69.126000
Average IVI by age group
ivi_5_17       inf
ivi_17_plus    inf
dtype: float64
         date  ivi_overall
0  2025-03-02          0.0
1  2025-03-09          0.0
2  2025-03-15          0.0
3  2025-03-20          0.0
4  2025-03-23          0.0
                          state district        date  bio_age_5_17  bio_age_17_greater  demo_age_5_17  demo_age_17_greater  enr_age_0_5  enr_age_5_17  enr_age_18_greater  ivi_5_17  ivi_17_plus  total_updates  total_enrollments  ivi_overall  migration_ratio
5   Andaman And Nicobar Island

 Migration Signature Detection


In [13]:
import pandas as pd

ms_df = df.groupby(
    ['state', 'district', 'date'],
    as_index=False
).agg({
    'demo_age_5_17': 'sum',
    'demo_age_17_greater': 'sum',
    'bio_age_5_17': 'sum',
    'bio_age_17_greater': 'sum'
})

ms_df['total_demo_changes'] = (
    ms_df['demo_age_5_17'] +
    ms_df['demo_age_17_greater']
)

ms_df['total_bio_changes'] = (
    ms_df['bio_age_5_17'] +
    ms_df['bio_age_17_greater']
)

ms_df = ms_df[ms_df['total_bio_changes'] > 0]

ms_df['migration_signal'] = (
    ms_df['total_demo_changes'] /
    ms_df['total_bio_changes']
)

high_migration_districts = (
    ms_df.groupby(['state', 'district'])
    .agg({'migration_signal': 'mean'})
    .reset_index()
    .sort_values(by='migration_signal', ascending=False)
    .head(10)
)

print(high_migration_districts)

ms_df['migration_signal_5_17'] = (
    ms_df['demo_age_5_17'] /
    (ms_df['bio_age_5_17'] + 1)
)
ms_df['migration_signal_17_plus'] = (
    ms_df['demo_age_17_greater'] /
    (ms_df['bio_age_17_greater'] + 1)
)

migration_age_summary = (
    ms_df[['migration_signal_5_17', 'migration_signal_17_plus']]
    .mean()
)

print(migration_age_summary)


             state        district  migration_signal
89           Bihar  East Champaran         37.153582
183        Gujarat           Surat         30.806934
632  Uttar Pradesh          Bijnor         23.552744
86           Bihar         Bhojpur         22.641048
88           Bihar       Darbhanga         21.940532
615  Uttar Pradesh            Agra         19.783282
624  Uttar Pradesh        Bahraich         18.511224
162        Gujarat       Ahmedabad         18.088395
80           Bihar          Araria         17.353709
644  Uttar Pradesh       Ghaziabad         17.212164
migration_signal_5_17       0.381659
migration_signal_17_plus    3.590225
dtype: float64


In [14]:
seasonal_migration = (
    ms_df.groupby('date')
    .agg({'migration_signal': 'mean'})
    .reset_index()
)

print(seasonal_migration.head())


         date  migration_signal
0  2025-03-01          1.041220
1  2025-04-01          0.089381
2  2025-05-01          0.108638
3  2025-06-01          0.121440
4  2025-07-01          0.135693


Child Identity Correction Burden (CICB) || CICB = bio_age_5_17 / enr_age_5_17


In [15]:
import pandas as pd

cicb_df = df.groupby(
    ['state', 'district', 'date'],
    as_index=False
).agg({
    'bio_age_5_17': 'sum',
    'bio_age_17_greater': 'sum',
    'enr_age_5_17': 'sum',
    'enr_age_18_greater': 'sum'
})


cicb_df = cicb_df[cicb_df['enr_age_5_17'] > 0]

cicb_df['child_correction_burden'] = (
    cicb_df['bio_age_5_17'] /
    cicb_df['enr_age_5_17']
)

cicb_df = cicb_df[cicb_df['enr_age_18_greater'] > 0]

cicb_df['adult_correction_burden'] = (
    cicb_df['bio_age_17_greater'] /
    cicb_df['enr_age_18_greater']
)

high_child_burden_districts = (
    cicb_df.groupby(['state', 'district'])
    .agg({'child_correction_burden': 'mean'})
    .reset_index()
    .sort_values(by='child_correction_burden', ascending=False)
    .head(10)
)

print(high_child_burden_districts)

cicb_df['child_vs_adult_ratio'] = (
    cicb_df['child_correction_burden'] /
    (cicb_df['adult_correction_burden'] + 1e-6)
)

def classify_burden(x):
    if x >= 2:
        return 'Severe Child Capture Issues'
    elif x >= 1:
        return 'Moderate Child Capture Issues'
    else:
        return 'Low Child Capture Issues'

cicb_df['child_burden_category'] = (
    cicb_df['child_correction_burden']
    .apply(classify_burden)
)

child_burden_trend = (
    cicb_df.groupby('date')
    .agg({'child_correction_burden': 'mean'})
    .reset_index()
)

print(child_burden_trend.head())


                state       district  child_correction_burden
109      Chhattisgarh           Durg               286.136098
189  Himachal Pradesh         Kangra               212.700000
488        Tamil Nadu      Cuddalore               203.429044
191  Himachal Pradesh          Mandi               188.166667
178           Haryana    Kurukshetra               183.500000
334       Maharashtra       Bhandara               181.000000
361       Maharashtra         Wardha               179.988889
497        Tamil Nadu        Madurai               176.656488
6      Andhra Pradesh  East Godavari               176.575735
362       Maharashtra         Washim               173.875934
         date  child_correction_burden
0  2025-03-02                      0.0
1  2025-03-09                      0.0
2  2025-03-15                      0.0
3  2025-03-20                      0.0
4  2025-03-23                      0.0


Enrollment-to-Update Lag Proxy (EULP) || 

In [16]:
import pandas as pd

lag_df = df.groupby(
    ['state', 'district', 'date'],
    as_index=False
).agg({
    'enr_age_0_5': 'sum',
    'enr_age_5_17': 'sum',
    'enr_age_18_greater': 'sum',
    'bio_age_5_17': 'sum',
    'bio_age_17_greater': 'sum',
    'demo_age_5_17': 'sum',
    'demo_age_17_greater': 'sum'
})

lag_df['total_enrollments'] = (
    lag_df['enr_age_0_5'] +
    lag_df['enr_age_5_17'] +
    lag_df['enr_age_18_greater']
)

lag_df['total_updates'] = (
    lag_df['bio_age_5_17'] +
    lag_df['bio_age_17_greater'] +
    lag_df['demo_age_5_17'] +
    lag_df['demo_age_17_greater']
)

# ensure date is datetime
lag_df['date'] = pd.to_datetime(lag_df['date'])

lag_df = lag_df.sort_values(['state', 'district', 'date'])

lag_df['enr_spike'] = (
    lag_df['total_enrollments'] >
    lag_df.groupby(['state', 'district'])['total_enrollments']
          .transform(lambda x: x.rolling(3, min_periods=1).mean() +
                              x.rolling(3, min_periods=1).std())
)

lag_df['upd_spike'] = (
    lag_df['total_updates'] >
    lag_df.groupby(['state', 'district'])['total_updates']
          .transform(lambda x: x.rolling(3, min_periods=1).mean() +
                              x.rolling(3, min_periods=1).std())
)

lags = []

for (state, district), g in lag_df.groupby(['state', 'district']):
    enr_dates = g.loc[g['enr_spike'], 'date']
    upd_dates = g.loc[g['upd_spike'], 'date']
    
    for e_date in enr_dates:
        future_updates = upd_dates[upd_dates > e_date]
        if not future_updates.empty:
            lag_days = (future_updates.min() - e_date).days
            lags.append({
                'state': state,
                'district': district,
                'lag_days': lag_days
            })

lag_result = pd.DataFrame(lags)

district_lag = (
    lag_result
    .groupby(['state', 'district'])
    .agg(
        avg_lag_days=('lag_days', 'mean'),
        median_lag_days=('lag_days', 'median'),
        observations=('lag_days', 'count')
    )
    .reset_index()
)

slow_settling = (
    district_lag
    .sort_values(by='avg_lag_days', ascending=False)
    .head(10)
)

print(slow_settling)



             state             district  avg_lag_days  median_lag_days  observations
688    West Bengal              KOLKATA     43.000000             43.0             1
705    West Bengal                nadia     31.000000             31.0             1
154          Delhi     North West Delhi     27.062500             16.5            16
159          Delhi           West Delhi     26.777778              8.0            18
99           Bihar            Madhubani     25.285714              2.0            14
68           Assam               Kamrup     24.750000              7.0            12
261      Karnataka      Bengaluru Rural     24.000000             24.0             2
52           Assam              Barpeta     24.000000              7.0            19
191        Haryana            Faridabad     23.809524              5.0            21
631  Uttar Pradesh  Gautam Buddha Nagar     23.736842              7.0            19


Administrative Friction Index || AFI = (Biographical Changes + Demographic Changes) / Population Proxy


In [17]:
import pandas as pd

afi_df = df.groupby(
    ['state', 'district'],
    as_index=False
).agg({
    'bio_age_5_17': 'sum',
    'bio_age_17_greater': 'sum',
    'demo_age_5_17': 'sum',
    'demo_age_17_greater': 'sum',
    'enr_age_0_5': 'sum',
    'enr_age_5_17': 'sum',
    'enr_age_18_greater': 'sum'
})

afi_df['total_updates'] = (
    afi_df['bio_age_5_17'] +
    afi_df['bio_age_17_greater'] +
    afi_df['demo_age_5_17'] +
    afi_df['demo_age_17_greater']
)

afi_df['population_proxy'] = (
    afi_df['enr_age_0_5'] +
    afi_df['enr_age_5_17'] +
    afi_df['enr_age_18_greater']
)

# remove invalid rows
afi_df = afi_df[afi_df['population_proxy'] > 0]


afi_df['administrative_friction_index'] = (
    afi_df['total_updates'] /
    afi_df['population_proxy']
)

high_friction_districts = (
    afi_df
    .sort_values(by='administrative_friction_index', ascending=False)
    .head(10)
)

print(high_friction_districts[['state', 'district', 'administrative_friction_index']])

state_afi = (
    afi_df.groupby('state')
    .agg({'administrative_friction_index': 'mean'})
    .reset_index()
    .sort_values(by='administrative_friction_index', ascending=False)
)

print(state_afi.head(10))



           state          district  administrative_friction_index
516    Rajasthan            Beawar                     273.000000
512    Rajasthan           Balotra                     273.000000
527    Rajasthan  Didwana-Kuchaman                     187.500000
543    Rajasthan          Salumbar                     125.000000
436      Mizoram          Serchhip                      85.376147
413      Manipur           Thoubal                      82.315425
405      Manipur       Imphal East                      79.738056
525    Rajasthan             Deeg                       77.625000
406      Manipur       Imphal West                      77.410355
399  Maharashtra            Wardha                      70.807601
                                       state  administrative_friction_index
0                Andaman And Nicobar Islands                      44.714853
5                                 Chandigarh                      43.483916
28                                 Rajasthan  

Gender-Neutral, Age-Sensitive Stress Signals (GNASS)

In [18]:
import pandas as pd

stress_df = df.groupby(
    ['state', 'district'],
    as_index=False
).agg({
    'bio_age_17_greater': 'sum',
    'demo_age_5_17': 'sum',
    'enr_age_18_greater': 'sum',
    'enr_age_5_17': 'sum'
})

# avoid divide-by-zero
stress_df = stress_df[
    (stress_df['enr_age_18_greater'] > 0) &
    (stress_df['enr_age_5_17'] > 0)
]

# Adult biographical stress
stress_df['adult_bio_stress'] = (
    stress_df['bio_age_17_greater'] /
    stress_df['enr_age_18_greater']
)

# Child demographic stress
stress_df['child_demo_stress'] = (
    stress_df['demo_age_5_17'] /
    stress_df['enr_age_5_17']
)

top_adult_bio = (
    stress_df[['state', 'district', 'adult_bio_stress']]
    .sort_values(by='adult_bio_stress', ascending=False)
    .head(10)
)

print(top_adult_bio)

top_child_demo = (
    stress_df[['state', 'district', 'child_demo_stress']]
    .sort_values(by='child_demo_stress', ascending=False)
    .head(10)
)

print(top_child_demo)

def classify_stress(row):
    if row['adult_bio_stress'] > 1.5 and row['child_demo_stress'] < 1:
        return 'Adult Identity Correction Stress'
    elif row['adult_bio_stress'] < 1 and row['child_demo_stress'] > 1.5:
        return 'Family Mobility Stress'
    elif row['adult_bio_stress'] > 1.5 and row['child_demo_stress'] > 1.5:
        return 'Multi-stage Identity Stress'
    else:
        return 'Low / Normal Stress'

stress_df['stress_type'] = stress_df.apply(classify_stress, axis=1)


              state         district  adult_bio_stress
130    Chhattisgarh   Janjgir-champa          129643.0
580      Tamil Nadu      Tirunelveli           59050.5
135    Chhattisgarh           Koriya           42160.0
563      Tamil Nadu    Kanniyakumari           42003.0
210         Haryana            Sirsa           28336.0
331  Madhya Pradesh          Dindori           28030.0
482          Odisha        Sambalpur           25725.0
579      Tamil Nadu  Tiruchirappalli           24585.6
578      Tamil Nadu            Theni           24037.0
131    Chhattisgarh          Jashpur           22356.0
                 state        district  child_demo_stress
22      Andhra Pradesh      Srikakulam         107.297872
4       Andhra Pradesh      Anakapalli          86.166667
6       Andhra Pradesh       Annamayya          68.820000
16      Andhra Pradesh         Nandyal          68.512195
226  Jammu And Kashmir          Budgam          67.113208
21      Andhra Pradesh  Sri Sathya Sai         

District Identity Churn Ranking (DICR)

In [20]:
import pandas as pd

churn_df = df.groupby(
    ['state', 'district'],
    as_index=False
).agg({
    'bio_age_5_17': 'sum',
    'bio_age_17_greater': 'sum',
    'demo_age_5_17': 'sum',
    'demo_age_17_greater': 'sum',
    'enr_age_0_5': 'sum',
    'enr_age_5_17': 'sum',
    'enr_age_18_greater': 'sum'
})

# total changes
churn_df['total_bio'] = (
    churn_df['bio_age_5_17'] +
    churn_df['bio_age_17_greater']
)

churn_df['total_demo'] = (
    churn_df['demo_age_5_17'] +
    churn_df['demo_age_17_greater']
)

churn_df['total_enr'] = (
    churn_df['enr_age_0_5'] +
    churn_df['enr_age_5_17'] +
    churn_df['enr_age_18_greater']
)

# population proxy
churn_df['population_proxy'] = churn_df['total_enr']

# remove invalid districts
churn_df = churn_df[churn_df['population_proxy'] > 0]

churn_df['identity_churn_per_1000'] = (
    (churn_df['total_bio'] +
     churn_df['total_demo'] +
     churn_df['total_enr']) /
    churn_df['population_proxy']
) * 1000

top_churn_districts = (
    churn_df
    .sort_values(by='identity_churn_per_1000', ascending=False)
    .head(10)
)

print(top_churn_districts[
    ['state', 'district', 'identity_churn_per_1000']
])



           state          district  identity_churn_per_1000
516    Rajasthan            Beawar            274000.000000
512    Rajasthan           Balotra            274000.000000
527    Rajasthan  Didwana-Kuchaman            188500.000000
543    Rajasthan          Salumbar            126000.000000
436      Mizoram          Serchhip             86376.146789
413      Manipur           Thoubal             83315.424610
405      Manipur       Imphal East             80738.056013
525    Rajasthan             Deeg              78625.000000
406      Manipur       Imphal West             78410.355487
399  Maharashtra            Wardha             71807.600950


Early Enrollment vs Late Correction Tradeoff


In [21]:
import pandas as pd

tradeoff_df = (
    df.groupby(['state', 'district'], as_index=False)
      .agg({
          'enr_age_0_5': 'sum',      # early enrollment
          'enr_age_5_17': 'sum',     # child enrollments
          'bio_age_5_17': 'sum'      # later corrections
      })
)

# keep only districts with child activity
tradeoff_df = tradeoff_df[
    (tradeoff_df['enr_age_0_5'] > 0) &
    (tradeoff_df['enr_age_5_17'] > 0)
]

# early enrollment intensity (proxy for proactive capture)
tradeoff_df['early_enrollment_intensity'] = (
    tradeoff_df['enr_age_0_5'] /
    tradeoff_df['enr_age_5_17']
)

# late correction rate (what we expect to be lower)
tradeoff_df['late_child_correction_rate'] = (
    tradeoff_df['bio_age_5_17'] /
    tradeoff_df['enr_age_5_17']
)

# tradeoff signal: lower is better
tradeoff_df['early_vs_late_tradeoff'] = (
    tradeoff_df['late_child_correction_rate'] /
    (tradeoff_df['early_enrollment_intensity'] + 1e-6)
)

# sort to see best-performing districts
tradeoff_df = tradeoff_df.sort_values(
    by='early_vs_late_tradeoff'
)

print(tradeoff_df.head(10))


                 state         district  enr_age_0_5  enr_age_5_17  bio_age_5_17  early_enrollment_intensity  late_child_correction_rate  early_vs_late_tradeoff
732        West Bengal            nadia          2.0           1.0           0.0                    2.000000                         0.0                     0.0
366        Maharashtra       Ahmednagar        182.0         158.0           0.0                    1.151899                         0.0                     0.0
318     Madhya Pradesh       Ashoknagar       1594.0        1323.0           0.0                    1.204837                         0.0                     0.0
289          Karnataka       Ramanagara        168.0          24.0           0.0                    7.000000                         0.0                     0.0
270          Karnataka  Bengaluru Urban      12111.0        7167.0           0.0                    1.689828                         0.0                     0.0
239  Jammu And Kashmir          Sh

Anomaly & Fraud Risk Flagging (Advanced)


In [22]:
import pandas as pd

# aggregate at district level
risk_df = (
    df.groupby(['state', 'district'], as_index=False)
      .agg({
          'bio_age_5_17': 'sum',
          'bio_age_17_greater': 'sum',
          'demo_age_5_17': 'sum',
          'demo_age_17_greater': 'sum',
          'enr_age_0_5': 'sum',
          'enr_age_5_17': 'sum',
          'enr_age_18_greater': 'sum'
      })
)

# totals
risk_df['total_bio'] = (
    risk_df['bio_age_5_17'] +
    risk_df['bio_age_17_greater']
)

risk_df['total_demo'] = (
    risk_df['demo_age_5_17'] +
    risk_df['demo_age_17_greater']
)

risk_df['total_enr'] = (
    risk_df['enr_age_0_5'] +
    risk_df['enr_age_5_17'] +
    risk_df['enr_age_18_greater']
)

# keep meaningful districts
risk_df = risk_df[risk_df['total_enr'] > 0]

# anomaly metrics
risk_df['bio_to_demo_ratio'] = (
    risk_df['total_bio'] /
    (risk_df['total_demo'] + 1e-6)
)

risk_df['bio_to_enr_ratio'] = (
    risk_df['total_bio'] /
    risk_df['total_enr']
)

# risk flags
risk_df['FLAG_bio_gt_demo'] = risk_df['bio_to_demo_ratio'] > 2
risk_df['FLAG_bio_gt_enr'] = risk_df['bio_to_enr_ratio'] > 1

# combined risk score
risk_df['fraud_risk_score'] = (
    risk_df['FLAG_bio_gt_demo'].astype(int) +
    risk_df['FLAG_bio_gt_enr'].astype(int)
)

# show high-risk districts
risk_df = risk_df.sort_values(
    by=['fraud_risk_score', 'bio_to_enr_ratio'],
    ascending=False
)

print(
    risk_df[
        [
            'state',
            'district',
            'total_bio',
            'total_demo',
            'total_enr',
            'bio_to_demo_ratio',
            'bio_to_enr_ratio',
            'fraud_risk_score'
        ]
    ].head(15)
)


                                        state                  district  total_bio  total_demo  total_enr  bio_to_demo_ratio  bio_to_enr_ratio  fraud_risk_score
436                                   Mizoram                  Serchhip     7604.0      1702.0      109.0           4.467685         69.761468                 2
399                               Maharashtra                    Wardha   128058.0     20992.0     2105.0           6.100324         60.835154                 2
433                                   Mizoram                     Mamit    10715.0      1138.0      180.0           9.415641         59.527778                 2
375                               Maharashtra                Gadchiroli   141024.0     33383.0     2481.0           4.224426         56.841596                 2
371                               Maharashtra                  Bhandara   102460.0     24771.0     1984.0           4.136288         51.643145                 2
393                               

In [23]:
import pandas as pd

# ensure date is datetime
df['date'] = pd.to_datetime(df['date'])

# election windows
election_windows = {
    'Delhi': ('2025-01-01', '2025-02-28'),
    'Bihar': ('2025-10-01', '2025-11-30')
}

results = []

for state, (start, end) in election_windows.items():
    state_df = df[df['state'] == state]

    # election period
    election_df = state_df[
        (state_df['date'] >= start) &
        (state_df['date'] <= end)
    ]

    # non-election baseline (same year, excluding election window)
    baseline_df = state_df[
        ~((state_df['date'] >= start) & (state_df['date'] <= end)) &
        (state_df['date'].dt.year == 2025)
    ]

    def summarize(x):
        return pd.Series({
            'bio': x['bio_age_5_17'].sum() + x['bio_age_17_greater'].sum(),
            'demo': x['demo_age_5_17'].sum() + x['demo_age_17_greater'].sum(),
            'enr': (
                x['enr_age_0_5'].sum() +
                x['enr_age_5_17'].sum() +
                x['enr_age_18_greater'].sum()
            )
        })

    e = summarize(election_df)
    b = summarize(baseline_df)

    results.append({
        'state': state,
        'period': 'election',
        'bio': e['bio'],
        'demo': e['demo'],
        'enr': e['enr'],
        'IVI': (e['bio'] + e['demo']) / (e['enr'] + 1e-6)
    })

    results.append({
        'state': state,
        'period': 'baseline',
        'bio': b['bio'],
        'demo': b['demo'],
        'enr': b['enr'],
        'IVI': (b['bio'] + b['demo']) / (b['enr'] + 1e-6)
    })

analysis_df = pd.DataFrame(results)

print(analysis_df)


   state    period        bio       demo       enr        IVI
0  Delhi  election        0.0        0.0       0.0   0.000000
1  Delhi  baseline  1263035.0   867025.0   95206.0  22.373170
2  Bihar  election   684186.0  1070406.0  221121.0   7.934986
3  Bihar  baseline  4158445.0  2035397.0  371457.0  16.674452
