# EDA Question 3: Spending vs Mortality Efficiency

How does Mortality Efficiency (Deaths per 100k affected) change after a significant spending increase, and do the highest-investment regions show steeper declines in mortality rates over time?


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Load cleaned typhoon impact data ---
# The project root is taken to be two levels above the working directory:
# '..' is resolved to an absolute path and then its parent is used.
project_root = Path('..').resolve().parent
impact_dir = project_root / 'data' / 'typhoon-impact'
impact_files = sorted(impact_dir.glob('*.csv'))
if not impact_files:
    raise FileNotFoundError(f'No CSVs found in {impact_dir}')

# Stack every impact CSV into one frame with a fresh 0..n-1 index, and keep
# the raw concatenation in `df` untouched by working on a copy.
impact_frames = [pd.read_csv(csv_path) for csv_path in impact_files]
df = pd.concat(impact_frames, ignore_index=True)
df_clean = df.copy()

# Minimal columns needed for EDA
# Counts: anything that cannot be parsed as a number becomes NaN and is then
# treated as zero.
for count_col in ('Deaths', 'Affected'):
    as_number = pd.to_numeric(df_clean[count_col], errors='coerce')
    df_clean[count_col] = as_number.fillna(0)

# Year: unparseable values become missing; the nullable 'Int64' dtype keeps
# the remaining years as integers.
year_as_number = pd.to_numeric(df_clean['Year'], errors='coerce')
df_clean['Year'] = year_as_number.astype('Int64')

# --- Mortality metrics by region and year ---
# Total deaths and affected population for every (Region, Year) pair.
region_year = df_clean.groupby(['Region', 'Year'], as_index=False)
mortality = region_year[['Deaths', 'Affected']].sum()

# A zero 'Affected' total is blanked to NaN so the ratio comes out as NaN
# instead of dividing by zero.
affected_nonzero = mortality['Affected'].mask(mortality['Affected'] == 0)
mortality['Mortality_Rate'] = mortality['Deaths'].div(affected_nonzero)

# Mortality efficiency = deaths per 100,000 affected.
mortality['Mortality_Efficiency'] = mortality['Mortality_Rate'].mul(100_000)

# --- Load infra data and compute cumulative spending ---
infra_csv = project_root / 'data' / 'infra-projects' / 'cleaned_infra_projects.csv'
infra = pd.read_csv(infra_csv)

# A project is attributed to the year it was actually completed; when the
# completion date is missing or unparseable, its funding year is used instead.
infra['ActualCompletionDate'] = pd.to_datetime(
    infra['ActualCompletionDate'], errors='coerce'
)
completion_year = infra['ActualCompletionDate'].dt.year
infra['CompletionYear'] = completion_year.fillna(infra['FundingYear']).astype(int)

# Budget completed per region per year, ordered chronologically within each
# region so the running total below accumulates in year order.
budget_by_region_year = infra.groupby(
    ['Region', 'CompletionYear'], as_index=False
)['Final_Budget'].sum()
infra_yearly = budget_by_region_year.sort_values(['Region', 'CompletionYear'])

running_budget = infra_yearly.groupby('Region')['Final_Budget'].cumsum()
infra_yearly['Cumulative_Spending'] = running_budget

# --- Merge and forward-fill cumulative spending by region ---
# Exact-year join: keeps CompletionYear / Final_Budget for the typhoon years
# in which projects were completed (NaN for all other years).
merged = mortality.merge(
    infra_yearly.drop(columns='Cumulative_Spending'),
    left_on=['Region', 'Year'],
    right_on=['Region', 'CompletionYear'],
    how='left',
)

# Cumulative spending must be looked up "as of" each typhoon year. Joining on
# the exact year and forward-filling afterwards would discard projects
# completed in years that have no typhoon record (including everything
# completed before a region's first typhoon year), so the carried-forward
# total would be too low. merge_asof picks, per region, the latest
# completion year <= Year.
spend_lookup = (
    infra_yearly[['Region', 'CompletionYear', 'Cumulative_Spending']]
    .rename(columns={'CompletionYear': '_AsOfYear'})
    .astype({'_AsOfYear': 'int64'})   # both merge_asof keys must share a dtype
    .sort_values('_AsOfYear')         # merge_asof needs keys sorted ascending
)
row_keys = (
    merged[['Region', 'Year']]
    .astype({'Year': 'int64'})        # Year is nullable Int64 upstream
    .reset_index()                    # remember merged's row labels in 'index'
    .sort_values('Year')
)
as_of = pd.merge_asof(
    row_keys,
    spend_lookup,
    left_on='Year',
    right_on='_AsOfYear',
    by='Region',
    direction='backward',
)

# Assignment aligns on merged's row labels; regions/years with no completed
# project yet get 0.
merged['Cumulative_Spending'] = (
    as_of.set_index('index')['Cumulative_Spending']
)
merged['Cumulative_Spending'] = merged['Cumulative_Spending'].fillna(0)
merged = merged.sort_values(['Region', 'Year'])

# --- Plot: decoupling trails ---
# One trail per region: each marker is a year, connected in chronological
# order, positioned by cumulative spending (x) and mortality rate (y).
fig, ax = plt.subplots(figsize=(10, 6))
for region, grp in merged.groupby('Region'):
    grp = grp.sort_values('Year')
    ax.plot(
        grp['Cumulative_Spending'],
        grp['Mortality_Rate'],
        marker='o',
        linewidth=1,
        alpha=0.7,
        label=region,
    )

ax.set_xlabel('Cumulative Infrastructure Spending (PhP)')
ax.set_ylabel('Mortality Rate (Deaths/Affected)')
ax.set_title('Decoupling Plot: Spending vs Mortality Rate by Region')
ax.grid(alpha=0.3)
# Legend is anchored just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
fig.tight_layout()
plt.show()

# --- Question Part 1: Mortality Efficiency after significant spending jump ---
# Change in cumulative spending between consecutive rows of the same region
# (NaN for each region's first row).
spending_by_region = merged.groupby('Region')['Cumulative_Spending']
merged['Spending_Delta'] = spending_by_region.diff()

# A "jump" is a delta at or above the 75th percentile of all deltas.
jump_threshold = merged['Spending_Delta'].quantile(q=0.75)

def summarize_jump(group, threshold=None):
    """Compare mean Mortality_Efficiency before and after a region's first spending jump.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one region with 'Year', 'Spending_Delta' and
        'Mortality_Efficiency' columns.
    threshold : float, optional
        Minimum Spending_Delta that counts as a jump. Defaults to the
        module-level ``jump_threshold``.

    Returns
    -------
    pandas.Series
        'JumpYear', 'Before_Mortality_Eff', 'After_Mortality_Eff' and 'Change'
        (after minus before). All four are NaN when the region has no jump.
    """
    fields = ['JumpYear', 'Before_Mortality_Eff', 'After_Mortality_Eff', 'Change']
    if threshold is None:
        threshold = jump_threshold

    group = group.sort_values('Year')
    delta = group['Spending_Delta']
    # Require a strictly positive delta: when the threshold itself is 0,
    # a year with no new spending must not count as a jump.
    jump_years = group[(delta >= threshold) & (delta > 0)]
    if jump_years.empty:
        # Same index as the normal result, so groupby.apply always stacks the
        # per-region results into one DataFrame (NaN rows are easy to drop).
        return pd.Series(np.nan, index=fields, dtype=float)

    jump_year = int(jump_years['Year'].iloc[0])
    # The jump year itself is counted in the "after" window.
    before = group[group['Year'] < jump_year]['Mortality_Efficiency'].mean()
    after = group[group['Year'] >= jump_year]['Mortality_Efficiency'].mean()
    return pd.Series({
        'JumpYear': jump_year,
        'Before_Mortality_Eff': before,
        'After_Mortality_Eff': after,
        'Change': after - before,
    })

# One summarize_jump result per region. Building the table from a dict of
# Series (instead of groupby.apply) keeps a fixed column layout even when a
# region's result has no entries or every region lacks a jump.
summary_fields = ['JumpYear', 'Before_Mortality_Eff', 'After_Mortality_Eff', 'Change']
per_region_summary = {
    region_name: summarize_jump(region_rows)
    for region_name, region_rows in merged.groupby('Region')
}
jump_summary = (
    pd.DataFrame(per_region_summary)
    .T
    .reindex(columns=summary_fields)
    .dropna()                      # drop regions with no jump / undefined means
    .rename_axis('Region')
    .reset_index()
)
# Largest mortality-efficiency drops first.
jump_summary = jump_summary.sort_values('Change')
# A bare expression in the middle of a cell is discarded, so print the table
# explicitly to make the Part 1 result visible.
print(jump_summary.to_string(index=False))

# --- Question Part 2: Do top-investment regions show steeper declines? ---
# Highest cumulative spending reached by each region, largest first.
peak_spend_by_region = merged.groupby('Region')['Cumulative_Spending'].max()
max_spend = peak_spend_by_region.sort_values(ascending=False)

# The five regions with the highest peak cumulative spending.
top_regions = max_spend.index[:5]

def rate_slope(group):
    """Return the least-squares slope of Mortality_Rate against Year.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with 'Year' and 'Mortality_Rate' columns; rows missing either
        value are ignored.

    Returns
    -------
    float
        Change in mortality rate per year (negative means declining), or NaN
        when fewer than two distinct years remain.
    """
    group = group.dropna(subset=['Mortality_Rate', 'Year'])
    # Cast explicitly: a nullable 'Int64' Year column can reach NumPy as an
    # object array, which np.polyfit cannot solve.
    years = group['Year'].astype(float).to_numpy()
    rates = group['Mortality_Rate'].astype(float).to_numpy()
    # A line needs at least two distinct x values to have a defined slope.
    if len(np.unique(years)) < 2:
        return np.nan
    slope, _ = np.polyfit(years, rates, 1)
    return slope

# Mortality-rate slope for each of the top-spending regions, alongside the
# region's peak cumulative spending.
top_region_rows = merged.loc[merged['Region'].isin(top_regions)]
slope_by_region = top_region_rows.groupby('Region').apply(rate_slope)
trend = slope_by_region.reset_index(name='MortalityRate_Slope_perYear')

peak_spend = max_spend.rename('Max_Cumulative_Spending')
trend = trend.merge(peak_spend, on='Region')

# Most negative slope (steepest decline) first; as the last expression of the
# cell this is what the notebook displays.
trend.sort_values('MortalityRate_Slope_perYear')