In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import glob
# Apply matplotlib's "seaborn-v0_8" style sheet to every figure drawn below.
# NOTE(review): this style name presumably needs matplotlib >= 3.6 — confirm
# against the environment the notebook is run in.
plt.style.use("seaborn-v0_8")


In [None]:
# Load every enrolment export matching the filename pattern into one frame.
# sorted() pins the concatenation order, because glob returns its matches in
# arbitrary (filesystem-dependent) order.
enrol_files = sorted(glob.glob("api_data_aadhar_enrolment*.csv"))
if not enrol_files:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list, so stop early with a message that names the missing input.
    raise FileNotFoundError(
        "No files match 'api_data_aadhar_enrolment*.csv' in the working directory"
    )
enroll = pd.concat([pd.read_csv(f) for f in enrol_files], ignore_index=True)
print("Enrolment rows:", enroll.shape)


In [None]:
# Load every demographic-update export matching the filename pattern into one
# frame. sorted() pins the concatenation order, because glob returns its
# matches in arbitrary (filesystem-dependent) order.
demo_files = sorted(glob.glob("api_data_aadhar_demographic*.csv"))
if not demo_files:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list, so stop early with a message that names the missing input.
    raise FileNotFoundError(
        "No files match 'api_data_aadhar_demographic*.csv' in the working directory"
    )
demo = pd.concat([pd.read_csv(f) for f in demo_files], ignore_index=True)
print("Demographic rows:", demo.shape)


In [None]:
# Load every biometric-update export matching the filename pattern into one
# frame. sorted() pins the concatenation order, because glob returns its
# matches in arbitrary (filesystem-dependent) order.
bio_files = sorted(glob.glob("api_data_aadhar_biometric*.csv"))
if not bio_files:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list, so stop early with a message that names the missing input.
    raise FileNotFoundError(
        "No files match 'api_data_aadhar_biometric*.csv' in the working directory"
    )
bio = pd.concat([pd.read_csv(f) for f in bio_files], ignore_index=True)
print("Biometric rows:", bio.shape)


## 1. Data Loading and Cleaning

In [None]:
def clean_columns(df):
    """Normalise the column labels of ``df`` in place and return it.

    Each label has its surrounding whitespace stripped, is lower-cased, and
    then has every remaining space replaced by an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, carrying the new labels.
    """
    labels = df.columns.str.strip()
    labels = labels.str.lower()
    df.columns = labels.str.replace(" ", "_")
    return df

# Run the same label normalisation over all three raw frames.
enroll, demo, bio = (clean_columns(frame) for frame in (enroll, demo, bio))


In [None]:
# Parse the "date" column of each frame into datetimes; dayfirst=True makes
# ambiguous day/month values read as day-first.
for frame in (enroll, demo, bio):
    frame["date"] = pd.to_datetime(frame["date"], dayfirst=True)


In [None]:
# Total enrolments per row: element-wise sum of the three age-band columns.
enroll["enrolments"] = (
    enroll["age_0_5"]
    .add(enroll["age_5_17"])
    .add(enroll["age_18_greater"])
)


In [None]:
# Fraction of each row's enrolments in the 0-5 age band. The denominator is
# enrolments + 1, so the result is slightly below the raw ratio.
# NOTE(review): the +1 presumably guards against division by zero — confirm.
enroll["child_share"] = enroll["age_0_5"].div(enroll["enrolments"].add(1))


In [None]:
# Number of rows in `demo` for each (state, district, date) combination.
# NOTE(review): .size() counts records; it does not sum any per-row update
# counts the file may carry — confirm this is the intended measure.
demo_agg = (
    demo.groupby(["state", "district", "date"])
    .size()
    .rename("demographic_updates")
    .reset_index()
)

# The same row count for the biometric frame.
bio_agg = (
    bio.groupby(["state", "district", "date"])
    .size()
    .rename("biometric_updates")
    .reset_index()
)


In [None]:
# Attach the per-(state, district, date) update counts to every enrolment row.
# Left joins keep all enrolment rows; keys without a match come back as NaN.
master = enroll.merge(demo_agg, how="left", on=["state", "district", "date"])
master = master.merge(bio_agg, how="left", on=["state", "district", "date"])

# Replace every NaN in the frame with 0. This touches all columns, not only
# the two update-count columns added by the merges.
master = master.fillna(0)
master.head(100)


In [None]:
# Updates per enrolment for each row; the denominator is enrolments + 1.
master["update_pressure"] = (
    master["demographic_updates"]
    .add(master["biometric_updates"])
    .div(master["enrolments"].add(1))
)

# Stability index = 1 / (1 + pressure): equals 1 when there are no updates and
# shrinks towards 0 as the (non-negative) pressure grows.
master["aadhaar_stability_index"] = 1 / master["update_pressure"].add(1)


In [None]:
# Per-district summary: total enrolments, mean stability index, and the
# z-score of that mean across all districts.
# Note: `district_stats` and `anomalies` are assigned again in the
# "Anomaly Detection" section further down, which replaces these values.
district_stats = (
    master.groupby(["state", "district"])
    .agg(
        enrolments=("enrolments", "sum"),
        stability=("aadhaar_stability_index", "mean"),
    )
    .reset_index()
    .assign(
        z_score=lambda d: (d["stability"] - d["stability"].mean())
        / d["stability"].std()
    )
)

# Districts whose mean stability lies more than two standard deviations below
# the cross-district mean.
anomalies = district_stats.loc[district_stats["z_score"] < -2]
anomalies.head()


## 2. Enrolment Trend Analysis

In [None]:
# National trend: total enrolments across all rows for each date, returned as
# a two-column frame ("date", "enrolments").
trend = master.groupby("date", as_index=False)["enrolments"].sum()

trend.head()


In [None]:

# Row of `trend` holding the highest national enrolment total.
peak_row = trend.loc[trend["enrolments"].idxmax()]
peak_date, peak_value = peak_row["date"], peak_row["enrolments"]

# Line chart of the national trend, drawn through the Axes API.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(trend["date"], trend["enrolments"], color="#1f4ed8", linewidth=2.5)

# Label the peak: the text sits at 70% of the peak height, with an arrow
# pointing at the peak itself.
ax.annotate(
    "Major enrolment drive",
    xy=(peak_date, peak_value),
    xytext=(peak_date, peak_value * 0.7),
    arrowprops={"arrowstyle": "->", "color": "gray"},
    fontsize=10,
    color="gray",
)

ax.set_title("National Aadhaar Enrolment Trend", fontsize=14)
ax.set_xlabel("Date")
ax.set_ylabel("Total Enrolments")
ax.grid(alpha=0.3)

plt.show()


In [None]:
# Ten states with the largest enrolment totals, largest first.
state_enroll = (
    master.groupby("state")["enrolments"]
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
    .reset_index()
)

# Horizontal bar chart, one bar per state.
fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(data=state_enroll, x="enrolments", y="state", color="#1f4ed8", ax=ax)

# Write the total at the end of every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt="%.0f", padding=3)

ax.set_title("Top 10 States by Aadhaar Enrolment Volume", fontsize=13)
ax.set_xlabel("Total Enrolments")
ax.set_ylabel("State")
ax.grid(axis="x", alpha=0.3)

# Save a 300-dpi copy of the figure next to the notebook, then display it.
fig.tight_layout()
fig.savefig("ppt_2_states.png", dpi=300)
plt.show()


## 3. Age-wise Enrolment Analysis

In [None]:
# Column totals of the three age bands over the whole enrolment frame.
age_totals = enroll[["age_0_5", "age_5_17", "age_18_greater"]].sum(axis=0)

# Display labels, listed in the same order as the columns above.
labels = ["Age 0–5", "Age 5–17", "Age 18+"]

# Pie chart of the age split, with white borders between the wedges.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    age_totals,
    labels=labels,
    autopct="%1.1f%%",
    startangle=90,
    wedgeprops={"edgecolor": "white"},
)
ax.set_title("Age-wise Distribution of Aadhaar Enrolments", fontsize=13)

# Save a 300-dpi copy of the figure next to the notebook, then display it.
fig.tight_layout()
fig.savefig("ppt_3_age.png", dpi=300)
plt.show()



## 4. Aadhaar Stability Index Computation

In [None]:

# Ten states with the lowest mean stability index, lowest first.
state_stability = (
    master.groupby("state")["aadhaar_stability_index"]
    .mean()
    .sort_values()
    .iloc[:10]
    .reset_index()
)

# Horizontal bar chart, one bar per state.
fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(
    data=state_stability,
    x="aadhaar_stability_index",
    y="state",
    color="#e67e22",
    ax=ax,
)

# Write the index value (two decimals) at the end of every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt="%.2f", padding=3)

ax.set_title("States with Lowest Aadhaar Stability Index", fontsize=13)
ax.set_xlabel("Average Aadhaar Stability Index")
ax.set_ylabel("State")
ax.grid(axis="x", alpha=0.3)

# Save a 300-dpi copy of the figure next to the notebook, then display it.
fig.tight_layout()
fig.savefig("ppt_4_stability.png", dpi=300)
plt.show()



## 5. Anomaly Detection

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-district summary: total enrolments and mean stability index.
# This replaces the `district_stats` built earlier in the notebook.
district_stats = (
    master
    .groupby(["state", "district"])
    .agg(
        enrolments=("enrolments", "sum"),
        stability=("aadhaar_stability_index", "mean")
    )
    .reset_index()
)

# Flag the districts at or below the 5th percentile of mean stability.
threshold = district_stats["stability"].quantile(0.05)
is_anomalous = district_stats["stability"] <= threshold

anomalies = district_stats[is_anomalous]
# Everything that was not flagged. Plotting only these rows keeps the
# "Normal districts" series from also containing the anomalous ones, which
# would otherwise be drawn twice and mislabelled in the legend.
normal_districts = district_stats[~is_anomalous]

print("Total districts:", district_stats.shape[0])
print("Anomalous districts (bottom 5%):", anomalies.shape[0])

plt.figure(figsize=(8,5))

# Base layer: non-anomalous districts only.
plt.scatter(
    normal_districts["enrolments"],
    normal_districts["stability"],
    alpha=0.4,
    color="#4c72b0",
    label="Normal districts"
)

# Highlight layer: flagged districts as large red crosses.
plt.scatter(
    anomalies["enrolments"],
    anomalies["stability"],
    color="red",
    s=100,
    marker="X",
    edgecolor="black",
    label="Anomalous districts (Bottom 5%)"
)

plt.title(
    "Anomalous Districts Based on Aadhaar Stability Index",
    fontsize=13
)
plt.xlabel("Total Enrolments")
plt.ylabel("Average Aadhaar Stability Index")
plt.grid(alpha=0.3)
plt.legend()

# Save a 300-dpi copy of the figure next to the notebook, then display it.
plt.tight_layout()
plt.savefig("ppt_5_anomalies.png", dpi=300)
plt.show()


## 6. Aadhaar Stability Index by State

In [None]:
# Re-print the number of districts and the number of flagged districts.
print("Total districts:", len(district_stats))
print("Anomalies found:", len(anomalies))


In [None]:
import os

# Show the entries of the current working directory as the cell output.
os.listdir(".")


In [None]:
import json
import plotly.express as px

# Mean stability index per state: one value per map region.
state_map = (
    master
    .groupby("state")["aadhaar_stability_index"]
    .mean()
    .reset_index()
)

# Title-case the state names; they are matched against the GeoJSON "NAME_1"
# property below.
# NOTE(review): a state whose title-cased name differs from its NAME_1 value
# will presumably be left uncoloured — verify against the GeoJSON.
state_map["state"] = state_map["state"].str.title()

# JSON text is UTF-8 by specification, so pass the encoding explicitly rather
# than relying on the platform's default text encoding.
with open("india_state.geojson.txt", "r", encoding="utf-8") as f:
    india_states = json.load(f)


# Choropleth coloured by the mean stability index, joined on the state name.
fig = px.choropleth(
    state_map,
    geojson=india_states,
    locations="state",
    featureidkey="properties.NAME_1",
    color="aadhaar_stability_index",
    color_continuous_scale="RdYlGn",
    title="India: Aadhaar Stability Index by State"
)

# Zoom to the plotted regions and hide the base-map layers.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
# Export a static PNG copy of the map.
fig.write_image("ppt_6_india_stability.png", width=1200, height=700, scale=2)
