In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Apply the "seaborn-v0_8-darkgrid" matplotlib style sheet to every figure created after this point.
plt.style.use("seaborn-v0_8-darkgrid")

In [4]:
# Environment check: confirm pyarrow is importable and show its version
# (the Parquet export further down notes that it needs pyarrow).
import pyarrow
print(pyarrow.__version__)

22.0.0


In [5]:
# Environment check: show which Python interpreter the kernel is running on.
import sys
print(sys.executable)

/home/codespace/.python/current/bin/python


## Output paths

In [6]:
from pathlib import Path

# Destination folders for saved figures and exported tables (relative to the notebook).
PLOT_PATH, TABLE_PATH = Path("../outputs/plots"), Path("../outputs/tables")

# Create both folders, including missing parents; an already existing folder is fine.
for _output_dir in (PLOT_PATH, TABLE_PATH):
    _output_dir.mkdir(parents=True, exist_ok=True)



In [7]:
# Sanity check: PLOT_PATH should be a pathlib path object, not a plain string.
print(type(PLOT_PATH), PLOT_PATH)

<class 'pathlib.PosixPath'> ../outputs/plots


## Load Data

In [8]:
# Read both processed inputs and discard rows without a "period" value in the
# same expression, so the raw and cleaned frames never exist under one name.
afi_df = (
    pd.read_csv("../datasets/processed/index/aadhaar_friction_index_only.csv")
    .dropna(subset=["period"])
)

signals_df = (
    pd.read_csv("../datasets/processed/signals/friction_signals.csv")
    .dropna(subset=["period"])
)


## AFI Heatmap (District × Time)

- Fixes the cluttered, hard-to-read heatmap
- Uses only the top 25 districts by mean AFI (keeps the labels readable)

In [9]:
# Rank districts by their mean AFI and keep the 25 highest.
# A district is identified by (state, district), the same key the merges in this
# notebook use: district names are not guaranteed to be unique across states, and
# grouping by name alone would blend same-named districts into a single row.
top_districts = (
    afi_df.groupby(["state", "district"])["AFI"]
    .mean()
    .sort_values(ascending=False)
    .head(25)
    .index
)

# Row-wise membership test of each (state, district) pair against the top-25 pairs.
in_top_districts = pd.MultiIndex.from_frame(
    afi_df[["state", "district"]]
).isin(top_districts)

heatmap_df = afi_df[in_top_districts].copy()

# Human-readable row label that stays unique when two states share a district name.
heatmap_df["district_label"] = (
    heatmap_df["district"].astype(str) + ", " + heatmap_df["state"].astype(str)
)

# One row per district, one column per period; duplicate cells are averaged.
pivot_df = heatmap_df.pivot_table(
    index="district_label",
    columns="period",
    values="AFI",
    aggfunc="mean"
)

plt.figure(figsize=(14, 10))
sns.heatmap(
    pivot_df,
    cmap="Reds",
    linewidths=0.3,
    linecolor="gray"
)

plt.title("Aadhaar Friction Index Heatmap (Top 25 Districts)")
plt.xlabel("Time (Month)")
plt.ylabel("District")

plt.tight_layout()
plt.savefig(PLOT_PATH /"afi_heatmap_district_time.png", dpi=300, bbox_inches="tight")
plt.close()


## Lifecycle Flow Imbalance Chart

(Enrolment vs Updates)

In [10]:
# Per-period totals of the three activity counters.
flow_columns = [
    "enrolment_count",
    "demographic_update_count",
    "biometric_update_count",
]
flow_df = signals_df.groupby("period", as_index=False)[flow_columns].sum()

# Demographic and biometric updates combined into one "updates" series.
flow_df["total_updates"] = flow_df["demographic_update_count"].add(
    flow_df["biometric_update_count"]
)

fig, ax = plt.subplots(figsize=(12, 6))
for column, legend_label in (
    ("enrolment_count", "Enrolments"),
    ("total_updates", "Total Updates"),
):
    ax.plot(flow_df["period"], flow_df[column], label=legend_label)

plt.xticks(rotation=45)
ax.set_title("Lifecycle Flow Imbalance: Enrolment vs Updates")
ax.set_xlabel("Period")
ax.set_ylabel("Count")
ax.legend()

fig.tight_layout()
fig.savefig(PLOT_PATH / "lifecycle_flow_imbalance.png", dpi=300)
plt.close(fig)


## District Typology (TABLE Output)

In [11]:
# Mean AFI per (state, district); the grouping keys are kept as ordinary columns.
district_avg = afi_df.groupby(["state", "district"], as_index=False)["AFI"].mean()

def classify_friction(afi, high_threshold=70, medium_threshold=40):
    """Map an AFI score to a friction category label.

    Parameters
    ----------
    afi : float
        The AFI score to classify.
    high_threshold : float, default 70
        Scores at or above this value are "High Friction".
    medium_threshold : float, default 40
        Scores at or above this value (and below ``high_threshold``) are
        "Medium Friction".

    Returns
    -------
    str
        "High Friction", "Medium Friction" or "Low Friction".

    Notes
    -----
    Both comparisons are False for NaN, so a missing score falls through to
    "Low Friction". NOTE(review): confirm that is the intended label for
    districts without an AFI value.
    """
    if afi >= high_threshold:
        return "High Friction"
    if afi >= medium_threshold:
        return "Medium Friction"
    return "Low Friction"

# Label every district with its friction category, then write the typology table.
district_avg["friction_type"] = district_avg["AFI"].map(classify_friction)

typology_csv = TABLE_PATH / "district_friction_typology.csv"
district_avg.to_csv(typology_csv, index=False)


## AFI Trend Over Time (Selected Districts)

In [12]:
# The five districts with the highest mean AFI.
# Districts are keyed by (state, district), the same key the merges in this notebook
# use: district names are not guaranteed to be unique across states, and matching on
# the name alone would draw several districts as one zig-zagging line.
selected_districts = (
    afi_df.groupby(["state", "district"])["AFI"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
    .index
)

# Keep only rows of the selected (state, district) pairs and order them by period
# so each line is drawn in time order rather than in file order.
trend_df = afi_df[
    pd.MultiIndex.from_frame(afi_df[["state", "district"]]).isin(selected_districts)
].sort_values("period")

plt.figure(figsize=(12, 6))

for state, district in selected_districts:
    d = trend_df[(trend_df["state"] == state) & (trend_df["district"] == district)]
    plt.plot(d["period"], d["AFI"], label=f"{district}, {state}")

plt.xticks(rotation=45)
plt.title("AFI Trend for High-Friction Districts")
plt.xlabel("Period")
plt.ylabel("AFI Score")
plt.legend()

plt.tight_layout()
plt.savefig(PLOT_PATH / "afi_trend_selected_districts.png", dpi=300)
plt.close()


## Hidden Risk Scatter Plot

(Low update volume ≠ low friction)

In [13]:
# Attach the AFI score to each signal row; rows present in only one frame are dropped.
risk_df = pd.merge(
    signals_df,
    afi_df,
    how="inner",
    on=["state", "district", "period"],
)

# Combined demographic + biometric update volume per row.
risk_df["update_intensity"] = risk_df["demographic_update_count"].add(
    risk_df["biometric_update_count"]
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(risk_df["update_intensity"], risk_df["AFI"], alpha=0.5)

ax.set_title("Hidden Risk: Low Updates ≠ Low Friction")
ax.set_xlabel("Total Update Volume")
ax.set_ylabel("AFI Score")

fig.tight_layout()
fig.savefig(PLOT_PATH / "hidden_risk_scatter.png", dpi=300)
plt.close(fig)


## AFI Summary by District (CORE TABLE)

Purpose:
→ “Which districts are structurally high friction?”

In [14]:
# Output column -> (source column, aggregation) for the per-district summary.
district_aggregations = {
    "avg_afi": pd.NamedAgg(column="AFI", aggfunc="mean"),
    "max_afi": pd.NamedAgg(column="AFI", aggfunc="max"),
    "min_afi": pd.NamedAgg(column="AFI", aggfunc="min"),
    "months_observed": pd.NamedAgg(column="period", aggfunc="nunique"),
}

# One row per (state, district), highest average AFI first.
district_summary = (
    afi_df
    .groupby(["state", "district"], as_index=False)
    .agg(**district_aggregations)
    .sort_values("avg_afi", ascending=False)
)

district_summary.to_csv(TABLE_PATH / "afi_summary_by_district.csv", index=False)


## AFI Summary by State (Policy View)

Purpose:
→ State-wise prioritization & funding logic

In [15]:
# Aggregate AFI per state: mean, peak, and number of distinct district names.
afi_by_state = afi_df.groupby("state")
state_totals = afi_by_state.agg(
    avg_afi=("AFI", "mean"),
    max_afi=("AFI", "max"),
    districts=("district", "nunique"),
)

# Turn the state index back into a column and rank by average AFI, highest first.
state_summary = state_totals.reset_index().sort_values("avg_afi", ascending=False)

state_csv = TABLE_PATH / "afi_summary_by_state.csv"
state_summary.to_csv(state_csv, index=False)


## Monthly AFI Trends (Time-Series Table)

Purpose:
→ Used directly in Power BI / Excel line charts

In [16]:
# Per-period mean and median AFI plus the number of distinct district names reporting.
monthly_afi = afi_df.groupby("period", as_index=False).agg(
    avg_afi=("AFI", "mean"),
    median_afi=("AFI", "median"),
    districts_reporting=("district", "nunique"),
)

monthly_csv = TABLE_PATH / "monthly_afi_trends.csv"
monthly_afi.to_csv(monthly_csv, index=False)


## Friction Signal Summary (Explainability Table)

Purpose:
→ “WHY is AFI high here?”

In [17]:
# Mean of each signal column per (state, district), written as avg_<signal>,
# followed by the number of distinct periods observed.
signal_aggregations = {
    f"avg_{signal_name}": (signal_name, "mean")
    for signal_name in ("UIS", "RIS", "BSS", "TSD")
}
signal_aggregations["months"] = ("period", "nunique")

signal_summary = (
    signals_df
    .groupby(["state", "district"], as_index=False)
    .agg(**signal_aggregations)
)

signal_summary.to_csv(TABLE_PATH / "friction_signal_summary.csv", index=False)


## Lifecycle Imbalance Table (NOT a plot)

Purpose:
→ Quantifies enrolment vs update pressure

In [18]:
# Per-period totals of enrolments and of both update types.
lifecycle_table = (
    signals_df
    .groupby("period")
    .agg(
        total_enrolments=("enrolment_count", "sum"),
        total_demographic_updates=("demographic_update_count", "sum"),
        total_biometric_updates=("biometric_update_count", "sum")
    )
    .reset_index()
)

lifecycle_table["total_updates"] = (
    lifecycle_table["total_demographic_updates"] +
    lifecycle_table["total_biometric_updates"]
)

# Updates per enrolment. Periods with zero enrolments get NaN instead of a
# division by zero. np.nan (rather than pd.NA) keeps the denominator, and
# therefore the ratio column, a regular float column instead of object dtype.
lifecycle_table["update_to_enrolment_ratio"] = (
    lifecycle_table["total_updates"] /
    lifecycle_table["total_enrolments"].replace(0, np.nan)
)

lifecycle_table.to_csv(
    TABLE_PATH / "lifecycle_imbalance_table.csv",
    index=False
)


## Hidden Risk Table (VERY IMPORTANT)

Purpose:
→ Finds districts with low volume but high friction

In [19]:
# Signal rows joined with their AFI score on (state, district, period).
risk_table = pd.merge(
    signals_df,
    afi_df,
    how="inner",
    on=["state", "district", "period"],
)

risk_table["total_updates"] = risk_table["demographic_update_count"].add(
    risk_table["biometric_update_count"]
)

# "Hidden risk" = update volume below the median AND AFI above the 75th percentile.
median_updates = risk_table["total_updates"].median()
afi_upper_quartile = risk_table["AFI"].quantile(0.75)

is_low_volume = risk_table["total_updates"] < median_updates
is_high_friction = risk_table["AFI"] > afi_upper_quartile

hidden_risk = (
    risk_table
    .loc[
        is_low_volume & is_high_friction,
        ["state", "district", "period", "total_updates", "AFI"],
    ]
    .sort_values("AFI", ascending=False)
)

hidden_risk.to_csv(TABLE_PATH / "hidden_risk_table.csv", index=False)


## Top 100 High Friction Records (Audit Table)

Purpose:
→ Manual inspection / case studies

In [20]:
# Order every record by AFI (highest first) and keep the first 100 rows.
afi_ranked = afi_df.sort_values(by="AFI", ascending=False)
top_100 = afi_ranked.iloc[:100]

top_100.to_csv(TABLE_PATH / "top_100_high_friction_records.csv", index=False)


## AFI DISTRIBUTION & STATISTICS CHART

In [26]:
# 2x2 overview figure: AFI histogram, per-state box plots, friction-class pie,
# and the 15 districts with the highest average AFI.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('AFI Distribution & Statistical Analysis', fontsize=16, fontweight='bold')

# 1.1: Histogram of AFI scores (mean/median computed once and reused for line + label)
afi_mean = afi_df['AFI'].mean()
afi_median = afi_df['AFI'].median()
axes[0, 0].hist(afi_df['AFI'], bins=50, color='#ef4444', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Distribution of AFI Scores', fontweight='bold')
axes[0, 0].set_xlabel('AFI Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(afi_mean, color='blue', linestyle='--', linewidth=2, label=f'Mean: {afi_mean:.2f}')
axes[0, 0].axvline(afi_median, color='green', linestyle='--', linewidth=2, label=f'Median: {afi_median:.2f}')
axes[0, 0].legend()

# 1.2: Box plot by State (Top 10 states by mean AFI, highest first)
top_states = afi_df.groupby('state')['AFI'].mean().nlargest(10).index
afi_top_states = afi_df[afi_df['state'].isin(top_states)]
state_order = afi_top_states.groupby('state')['AFI'].mean().sort_values(ascending=False).index
# seaborn deprecates `palette` without `hue`; assigning the x variable to hue with
# legend=False is the replacement seaborn itself recommends. hue_order mirrors
# `order` so each state keeps the colour it had before.
sns.boxplot(data=afi_top_states, x='state', y='AFI', order=state_order,
            hue='state', hue_order=state_order, palette='Set2', legend=False,
            ax=axes[0, 1])
axes[0, 1].set_title('AFI Distribution by Top 10 States', fontweight='bold')
axes[0, 1].set_xlabel('State')
axes[0, 1].set_ylabel('AFI Score')
axes[0, 1].tick_params(axis='x', rotation=45)

# 1.3: Friction type classification pie chart
friction_counts = district_avg['friction_type'].value_counts()
# value_counts() orders categories by frequency, so colours must be looked up by
# label; a positional list would paint whichever class is most common red.
friction_palette = {
    'High Friction': '#ef4444',
    'Medium Friction': '#f59e0b',
    'Low Friction': '#10b981',
}
colors = [friction_palette.get(label, '#9ca3af') for label in friction_counts.index]
axes[1, 0].pie(friction_counts, labels=friction_counts.index, autopct='%1.1f%%',
               colors=colors, startangle=90)
axes[1, 0].set_title('Districts by Friction Classification', fontweight='bold')

# 1.4: Top 15 highest friction districts (by average AFI)
top_15_districts = district_avg.nlargest(15, 'AFI')
bar_positions = range(len(top_15_districts))
axes[1, 1].barh(bar_positions, top_15_districts['AFI'].values, color='#ef4444')
axes[1, 1].set_yticks(bar_positions)
axes[1, 1].set_yticklabels(
    [f"{district}, {state}"
     for district, state in zip(top_15_districts['district'], top_15_districts['state'])],
    fontsize=9,
)
axes[1, 1].set_title('Top 15 Highest Friction Districts', fontweight='bold')
axes[1, 1].set_xlabel('Average AFI Score')
axes[1, 1].invert_yaxis()  # largest value at the top

plt.tight_layout()
plt.savefig(PLOT_PATH / "01_afi_distribution_statistics.png", dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved: 01_afi_distribution_statistics.png")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=afi_top_states, x='state', y='AFI', order=state_order, ax=axes[0, 1], palette='Set2')


✓ Saved: 01_afi_distribution_statistics.png


## SECTION 2: FRICTION SIGNAL ANALYSIS

In [27]:
# Load signal summary if not already in memory
# Reload the per-district signal averages from the exported CSV (always re-read from disk).
signal_summary = pd.read_csv(TABLE_PATH / "friction_signal_summary.csv")

# 2.1: The 15 districts with the largest avg_UIS
top_signal_districts = signal_summary.nlargest(15, 'avg_UIS')

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
ax_bars, ax_scatter = axes

# Left panel: the four signal averages stacked on top of each other per district
signals = ['avg_UIS', 'avg_RIS', 'avg_BSS', 'avg_TSD']
signal_labels = ['UIS (Unresolved)', 'RIS (Resolution)', 'BSS (Biometric)', 'TSD (Technical Delays)']
colors_signals = ['#ef4444', '#f59e0b', '#3b82f6', '#8b5cf6']
x_pos = range(len(top_signal_districts))

# Running top edge of the stack; each layer starts where the previous one ended.
bottom = np.zeros(len(top_signal_districts))
for signal, label, color in zip(signals, signal_labels, colors_signals):
    layer_heights = top_signal_districts[signal].values
    ax_bars.bar(x_pos, layer_heights, bottom=bottom,
                label=label, color=color, alpha=0.8)
    bottom = bottom + layer_heights

ax_bars.set_xticks(x_pos)
# District names are cut to 15 characters to keep the rotated tick labels short.
ax_bars.set_xticklabels(
    [district_name[:15] for district_name in top_signal_districts['district']],
    rotation=45, ha='right', fontsize=9,
)
ax_bars.set_title('Friction Signals - Top 15 Districts', fontweight='bold')
ax_bars.set_ylabel('Signal Intensity')
ax_bars.legend(loc='upper right', fontsize=8)

# Right panel: avg_UIS against avg_RIS for every district, coloured by avg_BSS
uis_vs_ris = ax_scatter.scatter(
    signal_summary['avg_UIS'], signal_summary['avg_RIS'],
    alpha=0.6, s=100, c=signal_summary['avg_BSS'], cmap='RdYlGn_r',
)
ax_scatter.set_xlabel('UIS (Unresolved Issues Score)', fontweight='bold')
ax_scatter.set_ylabel('RIS (Resolution Issues Score)', fontweight='bold')
ax_scatter.set_title('Signal Relationship: UIS vs RIS', fontweight='bold')
cbar = plt.colorbar(uis_vs_ris, ax=ax_scatter)
cbar.set_label('BSS (Biometric)', fontweight='bold')

plt.tight_layout()
plt.savefig(PLOT_PATH / "02_friction_signal_analysis.png", dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved: 02_friction_signal_analysis.png")



✓ Saved: 02_friction_signal_analysis.png


## HIDDEN RISK DEEP DIVE

In [28]:
# Reload the hidden-risk records (one row per state/district/period) from the exported CSV.
hidden_risk = pd.read_csv(TABLE_PATH / "hidden_risk_table.csv")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Hidden Risk Analysis: Identifying Overlooked Problem Areas', 
             fontsize=16, fontweight='bold')

# 3.1: Scatter - Updates vs AFI (with color coding)
scatter = axes[0, 0].scatter(hidden_risk['total_updates'], hidden_risk['AFI'], 
                            alpha=0.6, s=150, c=hidden_risk['AFI'], 
                            cmap='Reds', edgecolors='black', linewidth=0.5)
axes[0, 0].set_xlabel('Total Updates', fontweight='bold')
axes[0, 0].set_ylabel('AFI Score', fontweight='bold')
axes[0, 0].set_title('Hidden Risk: Updates vs Friction Level', fontweight='bold')
axes[0, 0].grid(True, alpha=0.3)
plt.colorbar(scatter, ax=axes[0, 0], label='AFI')

# 3.2: Top hidden risk districts
# hidden_risk holds one row per district AND period, so taking the 10 largest rows
# directly can list the same district several times under identical labels.
# Keep each district's highest-AFI record only, then take the top 10 districts.
top_hidden = (
    hidden_risk.sort_values('AFI', ascending=False)
    .drop_duplicates(subset=['state', 'district'])
    .head(10)
)
hidden_positions = range(len(top_hidden))
axes[0, 1].barh(hidden_positions, top_hidden['AFI'].values, color='#ec4899')
axes[0, 1].set_yticks(hidden_positions)
axes[0, 1].set_yticklabels(
    [f"{district}, {state}"
     for district, state in zip(top_hidden['district'], top_hidden['state'])],
    fontsize=9,
)
axes[0, 1].set_title('Top 10 Hidden Risk Districts', fontweight='bold')
axes[0, 1].set_xlabel('AFI Score')
axes[0, 1].invert_yaxis()  # highest AFI at the top

# 3.3: Update volume distribution in hidden risk
updates_median = hidden_risk['total_updates'].median()
axes[1, 0].hist(hidden_risk['total_updates'], bins=30, color='#3b82f6', edgecolor='black', alpha=0.7)
axes[1, 0].axvline(updates_median, color='red', linestyle='--', 
                   linewidth=2, label=f'Median: {updates_median:.0f}')
axes[1, 0].set_title('Distribution of Updates in Hidden Risk Cases', fontweight='bold')
axes[1, 0].set_xlabel('Total Updates')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()

# 3.4: Hidden risk by state (bubble chart of the 15 states with the most cases)
hidden_risk_by_state = hidden_risk.groupby('state').agg({
    'AFI': ['mean', 'count'],
    'total_updates': 'mean'
}).reset_index()
# Flatten the two-level column header produced by the dict aggregation.
hidden_risk_by_state.columns = ['state', 'avg_afi', 'count', 'avg_updates']
hidden_risk_by_state = hidden_risk_by_state.nlargest(15, 'count')

scatter2 = axes[1, 1].scatter(hidden_risk_by_state['avg_updates'], 
                             hidden_risk_by_state['avg_afi'],
                             s=hidden_risk_by_state['count']*50,
                             c=hidden_risk_by_state['avg_afi'],
                             cmap='Reds', alpha=0.6, edgecolors='black', linewidth=1)
axes[1, 1].set_xlabel('Average Updates', fontweight='bold')
axes[1, 1].set_ylabel('Average AFI', fontweight='bold')
axes[1, 1].set_title('Hidden Risk by State (bubble size = case count)', fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(PLOT_PATH / "03_hidden_risk_analysis.png", dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved: 03_hidden_risk_analysis.png")


✓ Saved: 03_hidden_risk_analysis.png


## STATE-LEVEL ACTIONABLE INSIGHTS

In [29]:

# Reload the per-state AFI summary from the exported CSV.
state_summary = pd.read_csv(TABLE_PATH / "afi_summary_by_state.csv")

# Layout: one wide panel on top, two panels side by side underneath.
fig = plt.figure(figsize=(16, 8))
gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.3)

# 4.1: Horizontal bars for the 20 states with the highest average AFI
ax1 = fig.add_subplot(gs[0, :])
top_states_data = state_summary.nlargest(20, 'avg_afi')
top_avg_values = top_states_data['avg_afi'].values
bar_rows = range(len(top_states_data))
colors_gradient = plt.cm.Reds(np.linspace(0.4, 0.9, len(top_states_data)))
bars = ax1.barh(bar_rows, top_avg_values, color=colors_gradient)
ax1.set_yticks(bar_rows)
ax1.set_yticklabels(top_states_data['state'].values, fontsize=10)
ax1.set_xlabel('Average AFI Score', fontweight='bold', fontsize=11)
ax1.set_title('Top 20 States by Average Friction Index', fontweight='bold', fontsize=12)
ax1.invert_yaxis()

# Print each bar's value just to the right of the bar end.
for row, val in enumerate(top_avg_values):
    ax1.text(val + 1, row, f'{val:.1f}', va='center', fontsize=9, fontweight='bold')

# 4.2: Peak AFI against average AFI; bubble area scales with the district count
ax2 = fig.add_subplot(gs[1, 0])
scatter = ax2.scatter(
    state_summary['avg_afi'], state_summary['max_afi'],
    s=state_summary['districts']*10, alpha=0.6,
    c=state_summary['avg_afi'], cmap='Reds', edgecolors='black', linewidth=0.5,
)
# Dashed y = x reference line from the origin up to the largest average AFI.
diagonal_end = state_summary['avg_afi'].max()
ax2.plot([0, diagonal_end], [0, diagonal_end], 'k--', alpha=0.3, label='Equal line')
ax2.set_xlabel('Average AFI', fontweight='bold')
ax2.set_ylabel('Maximum AFI', fontweight='bold')
ax2.set_title('Average vs Peak Friction (bubble = # districts)', fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 4.3: District count against average AFI
ax3 = fig.add_subplot(gs[1, 1])
ax3.scatter(
    state_summary['districts'], state_summary['avg_afi'],
    s=100, alpha=0.6, c='#3b82f6', edgecolors='black', linewidth=0.5,
)
ax3.set_xlabel('Number of Districts', fontweight='bold')
ax3.set_ylabel('Average AFI', fontweight='bold')
ax3.set_title('State Size vs Friction Level', fontweight='bold')
ax3.grid(True, alpha=0.3)

fig.savefig(PLOT_PATH / "04_state_level_analysis.png", dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 04_state_level_analysis.png")

✓ Saved: 04_state_level_analysis.png


## Export Paths
- Ensure every summary table is also exported in readable tabular formats (Parquet, Excel, SQLite)

In [21]:
from pathlib import Path
import sqlite3

# Root folder of the exported tables; each export format gets its own sub-folder.
BASE_TABLE_PATH = Path("../outputs/tables")

PARQUET_PATH, EXCEL_PATH, SQLITE_PATH = (
    BASE_TABLE_PATH / subfolder for subfolder in ("parquet", "excel", "sqlite")
)

# Create the sub-folders (and any missing parents); existing folders are left alone.
for _export_dir in (PARQUET_PATH, EXCEL_PATH, SQLITE_PATH):
    _export_dir.mkdir(parents=True, exist_ok=True)


## Load EXISTING CSVs

In [22]:
# Re-read the four summary tables from their CSV exports, in this fixed order.
district_summary, state_summary, signal_summary, hidden_risk = (
    pd.read_csv(TABLE_PATH / csv_name)
    for csv_name in (
        "afi_summary_by_district.csv",
        "afi_summary_by_state.csv",
        "friction_signal_summary.csv",
        "hidden_risk_table.csv",
    )
)

## 1. PARQUET
(Only run if pyarrow is installed)

In [23]:
# Parquet file name -> table to write; every table is written without its index.
parquet_exports = {
    "afi_summary_by_district.parquet": district_summary,
    "afi_summary_by_state.parquet": state_summary,
    "friction_signal_summary.parquet": signal_summary,
    "hidden_risk.parquet": hidden_risk,
}

for parquet_name, table in parquet_exports.items():
    table.to_parquet(PARQUET_PATH / parquet_name, index=False)


## 2. SQLITE

In [24]:
# SQLite table name -> DataFrame to store under that name.
sqlite_exports = {
    "afi_by_district": district_summary,
    "afi_by_state": state_summary,
    "friction_signal_summary": signal_summary,
    "hidden_risk": hidden_risk,
}

conn = sqlite3.connect(
    SQLITE_PATH / "aadhaar_friction_tables.db"
)

# try/finally guarantees the connection is closed even when one of the writes
# raises; previously a failing to_sql left the database handle open.
try:
    for table_name, table in sqlite_exports.items():
        # if_exists="replace" drops and recreates the table, so re-running is safe.
        table.to_sql(
            table_name,
            conn,
            if_exists="replace",
            index=False
        )
finally:
    conn.close()


## 3. EXCEL

In [25]:
# Worksheet name -> table; all four tables go into a single workbook.
excel_sheets = {
    "AFI by District": district_summary,
    "AFI by State": state_summary,
    "Friction Signals": signal_summary,
    "Hidden Risk Districts": hidden_risk,
}

workbook_path = EXCEL_PATH / "afi_analysis_tables.xlsx"

# The context manager saves and closes the workbook when the block exits.
with pd.ExcelWriter(workbook_path, engine="xlsxwriter") as writer:
    for sheet_name, table in excel_sheets.items():
        table.to_excel(writer, sheet_name=sheet_name, index=False)
