In [None]:
# This notebook cleans the compiled typhoon information dataset.

In [None]:
# Import relevant python modules
import numpy as np
import pandas as pd
import glob

In [None]:
# Disclaimer: Generative AI was used to convert typhoon report screenshots (Source: PAGASA) into CSV format

# A threshold of 150 mm in a 24-hour period was used to identify High-Intensity Rainfall events. This corresponds to the PAGASA Red Rainfall Warning level, which indicates a high potential for severe flooding and necessitates immediate action. By isolating typhoons that reached this threshold at one or more stations, the study focuses on events where infrastructure resilience was most critical.

# 1. Load the compiled typhoon records (a single CSV covering 2019-2025).
#    Any 'Province' / 'Region' columns already present are discarded right
#    away so the join below cannot produce suffixed "_x" / "_y" duplicates
#    (presumably the station lookup supplies those columns - TODO confirm).
filename = './data/typhoon-info/compiled_2019-2025.csv'
df = pd.read_csv(filename).drop(columns=['Province', 'Region'], errors='ignore')

# 2. Load the station lookup table that is joined on 'Location'.
mapping_df = pd.read_csv('data/geospatial-data/station_location_mapping.csv')

# 3. Left join: every typhoon record is kept; a Location that is absent from
#    the lookup ends up with NaN in the columns contributed by mapping_df.
df = df.merge(mapping_df, on='Location', how='left')

In [None]:

# Impute missing rainfall / gust readings from other stations hit by the
# same storm, widening the neighbourhood one tier at a time.

rain_col = "Max 24-hour Rainfall (mm)"
gust_col = "Peak Gust (10 mins sustained) (m/s)"

# A reading of exactly 0 is treated as "not recorded", not as a measured zero.
df[rain_col] = df[rain_col].replace(0, np.nan)
df[gust_col] = df[gust_col].replace(0, np.nan)


def _fill_from_group(frame, col, keys, stat):
    """Return ``frame[col]`` with its NaNs replaced by a per-group statistic.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding ``col`` and every column named in ``keys``.
    col : str
        Column whose missing values are filled.
    keys : list of str
        Columns that define a group (e.g. storm + province).
    stat : str
        Name of the groupby reduction to broadcast, e.g. ``'median'``.

    Returns
    -------
    pandas.Series
        Same index as ``frame``. Values that were already present are
        returned unchanged; only NaN cells can receive the group statistic.

    Notes
    -----
    ``groupby`` excludes rows with a NaN key (default ``dropna=True``) and
    ``transform`` yields NaN for them. Assigning the transform result back
    directly would therefore erase observed readings on rows whose
    Province/Region is NaN (e.g. a Location missing from the mapping after
    the left merge). Routing the statistic through ``fillna`` avoids that:
    such rows simply keep whatever they had.
    """
    return frame[col].fillna(frame.groupby(keys)[col].transform(stat))


storm_keys = ['Typhoon', 'Year']

# 1. RAINFALL: tiered local median (Province -> Region).
#    Rainfall is the main metric, so it is never imputed from outside the
#    storm's own region.
for locality in ('Province', 'Region'):
    df[rain_col] = _fill_from_group(df, rain_col, storm_keys + [locality], 'median')

# 2. GUST: tiered local mean (Province -> Region -> whole storm).
#    Peak gust is more consistent across stations, so a mean and a national
#    fallback are acceptable here.
for locality in ('Province', 'Region'):
    df[gust_col] = _fill_from_group(df, gust_col, storm_keys + [locality], 'mean')
df[gust_col] = _fill_from_group(df, gust_col, storm_keys, 'mean')

# 3. Drop rows that still have no rainfall after imputation.
df = df.dropna(subset=[rain_col])

# 4. Fill remaining gust NaNs with 0 (these are rain-heavy storms).
#    NOTE(review): zeros were treated as missing above, so a 0 written here
#    means "no gust reading could be found or imputed for this row", not a
#    measured calm - downstream code cannot tell the two apart.
df[gust_col] = df[gust_col].fillna(0)

In [None]:
# Write the cleaned table to disk.
# index=False keeps the DataFrame's row index out of the CSV, so no extra
# unnamed numeric column appears when the file is read back.
cleaned_path = 'data/typhoon-info/cleaned_2019-2025.csv'
df.to_csv(cleaned_path, index=False)