In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# EDA Question 2
# To what extent does a province's typical environmental threat (measured by its median 24-hour rainfall) align with the cumulative flood control infrastructure investment it has received from 2019 to 2025?

df_master = pd.read_csv('./data/merged/typhoon-info-infra-project.csv')

# 1. Numerical Summary: Aggregating by Province
# The median of the recorded maximum 24-hour rainfall stands in for a
# province's 'typical' threat; the largest Cumulative_Budget_To_Date value is
# taken as the total budget 'so far'.
# NOTE(review): 'max' equals the latest cumulative value only if the column
# never decreases over time within a province -- confirm against the merge step.
province_eda = df_master.groupby('Province').agg({
    'Max 24-hour Rainfall (mm)': 'median',
    'Cumulative_Budget_To_Date': 'max',
    'Region': 'first'  # Keeping region for visualization grouping
}).reset_index()

# Pearson's r between the two province-level columns
# (Series.corr uses method='pearson' by default).
correlation = province_eda['Max 24-hour Rainfall (mm)'].corr(
    province_eda['Cumulative_Budget_To_Date'])

# Display the coefficient so the numerical summary is visible next to the plot.
print(
    f"Pearson's r (median rainfall vs. cumulative budget): {correlation:.3f} "
    f"across {len(province_eda)} provinces"
)

# 2. Visualization: Scatter Plot with Regression Line
plt.figure(figsize=(4, 3))

# A scatter plot compares the two continuous variables; regplot overlays a
# fitted linear regression line on the points.
sns.regplot(
    data=province_eda,
    x='Max 24-hour Rainfall (mm)',
    y='Cumulative_Budget_To_Date',
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'red'}
)

plt.title('Province-Level Comparison: Median Rainfall vs. Total Flood Control Budget')
plt.xlabel('Median 24-hour Rainfall (mm)')
plt.ylabel('Total Cumulative Budget (Pesos)')
plt.grid(True, linestyle='--', alpha=0.6)

# Create the output directory first so savefig does not fail on a fresh checkout.
output_path = Path('data/eda-outputs/province_rainfall_budget_analysis.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Justification of Visualization:

# The visualization reveals a weak-to-nonexistent correlation between a province's median 24-hour rainfall and its total cumulative budget for flood control projects. The regression line is nearly horizontal, suggesting that as environmental threat increases, the allocated "mitigation" budget does not necessarily scale in proportion.

# Key Insights:

# Mismatch in Prioritization: We observe several "budget outliers" where provinces with moderate rainfall (100–150 mm) have received funding exceeding 40 billion pesos, while provinces facing extreme typical rainfall (above 250 mm) remain at the lower end of the funding spectrum.

# Conclusion on EDA question 2: The data reveals a significant misalignment between environmental threat and infrastructure investment; the nearly flat regression line in the analysis indicates that median rainfall is not a linear predictor of cumulative funding. This "prioritization mismatch" suggests that resource allocation is likely driven by socio-economic or political factors rather than meteorological risk, leaving high-threat provinces potentially under-protected.