In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter

# EDA Question 1
# How does the 'Variance Ratio' (how much the actual cost varies compared to the
# original budget) of completed projects vary by province, and are there regions
# where cost overruns are consistently higher than the national average?
#
# NOTE(review): the question targets *completed* projects, but no status filter
# is applied below -- presumably the merged CSV only holds completed projects;
# confirm, or filter before aggregating.

VARIANCE_COL = 'Variance_Ratio_To_Date'
EDA_OUTPUT_DIR = Path('data/eda-outputs')

df_master = pd.read_csv("./data/merged/typhoon-info-infra-project.csv")

# Fail early with a readable message if the merged file lacks a needed column.
missing_cols = {'Province', 'Region', VARIANCE_COL} - set(df_master.columns)
assert not missing_cols, f"missing expected columns: {sorted(missing_cols)}"

# 1. Numerical Summary: national average and per-province averages
# The national figure is the mean over all rows; pandas skips NaN by default.
national_avg_variance = df_master[VARIANCE_COL].mean()

# Mean per (Province, Region) pair, ordered from highest to lowest so the
# bar chart below is drawn in that order.
province_variance = (
    df_master
    .groupby(['Province', 'Region'])[VARIANCE_COL]
    .mean()
    .reset_index()
    .sort_values(by=VARIANCE_COL, ascending=False)
)

print(f"National Average Variance Ratio: {national_avg_variance:.2%}")
print("\nTop 5 Provinces with Highest Budget Overruns:")
print(province_variance.head())

# 2. Visualization: Sorted Bar Chart with National Average Line
fig, ax = plt.subplots(figsize=(14, 8))

# One bar per province, coloured by region. dodge=False keeps a single
# full-width bar per x position instead of reserving a slot for every hue level.
# `plot` is the Axes seaborn returns (the same object as `ax`); the name is
# kept in case later cells refer to it.
plot = sns.barplot(
    data=province_variance,
    x='Province',
    y=VARIANCE_COL,
    hue='Region',
    dodge=False,
    ax=ax,
)

# Horizontal reference line at the national average
ax.axhline(
    national_avg_variance,
    color='red',
    linestyle='--',
    label=f'National Avg ({national_avg_variance:.2%})',
)

ax.set_title('Average Variance Ratio per Province (Sorted High to Low)')
ax.set_ylabel('Variance Ratio (Overrun %)')
ax.set_xlabel('Province')
# The values are formatted as fractions elsewhere in this cell (':.2%'), so
# show the y ticks as percentages to match the axis label and the legend entry.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.tick_params(axis='x', labelrotation=90)  # vertical labels so they don't overlap
ax.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='y', linestyle=':', alpha=0.7)

fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
EDA_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
fig.savefig(EDA_OUTPUT_DIR / 'province_variance_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Save Output
# Write the per-province variance table to CSV (without the index column).
question_1_csv = Path('data/eda-outputs/question-1.csv')
# to_csv does not create missing parent directories, so make sure it exists;
# this keeps the cell runnable even if the output folder was removed.
question_1_csv.parent.mkdir(parents=True, exist_ok=True)
province_variance.to_csv(question_1_csv, index=False)