In [None]:
# Import relevant python modules
import glob
import os

import numpy as np
import pandas as pd

In [None]:
# Disclaimer: Generative AI was used to convert typhoon report screenshots (Source: PAGASA) into CSV format

# 1. Collect the paths of the raw typhoon CSV files.
#    - The cleaned output of this notebook is saved into this same folder
#      ('cleaned_2019-2025.csv'), so files with that prefix are skipped;
#      otherwise a re-run would load the cleaned rows on top of the raw ones.
#    - The paths are sorted because glob returns them in OS-dependent order,
#      which would make the row order of `df` differ between machines.
path = './data/typhoon-info'
CLEANED_PREFIX = 'cleaned_'
all_files = sorted(
    filename
    for filename in glob.glob(path + "/*.csv")
    if not os.path.basename(filename).startswith(CLEANED_PREFIX)
)

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not all_files:
    raise FileNotFoundError(f"No raw typhoon CSV files found in {path!r}")

# 2. Read every raw file into its own DataFrame
li = [pd.read_csv(filename) for filename in all_files]

# 3. Stack them into one master DataFrame with a fresh 0..n-1 index
df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Export the distinct station names found in the typhoon reports.
# Rows with a missing Location are dropped first; otherwise fmt='%s' would
# write the missing value out as a literal "nan" station name.
unique_locations = df['Location'].dropna().unique()

# fmt='%s' ensures the locations are saved as strings (one per line).
# The encoding is set explicitly because np.savetxt defaults to latin1.
np.savetxt(
    'data/geospatial-data/station_locations.txt',
    unique_locations,
    fmt='%s',
    encoding='utf-8',
)

# The mapping dictionary was made using Generative AI and double checked by the author

In [None]:
# Load the Location -> geography lookup table
mapping_df = pd.read_csv('data/geospatial-data/station_location_mapping.csv')

# Only bring in mapping columns the station data does not have yet.
# This keeps the cell idempotent: re-running it after `df` has already been
# enriched adds nothing instead of producing *_x / *_y duplicate columns.
geo_columns = ['Location'] + [
    col for col in mapping_df.columns
    if col != 'Location' and col not in df.columns
]

# Attach the geography columns to the station data.
# (Expected to include Province and Region; the imputation cell groups by 'Region'.)
# how='left' keeps every station row, even ones missing from the mapping.
# validate='many_to_one' raises if a Location appears more than once in the
# mapping, which would otherwise silently duplicate station rows.
df_weather_geo = pd.merge(
    df,
    mapping_df[geo_columns],
    on='Location',
    how='left',
    validate='many_to_one',
)

# Later cells read from `df`, so point it at the enriched frame
df = df_weather_geo

In [None]:
WIND_COL = 'Peak Gust (10 mins sustained) (m/s)'
RAIN_COL = 'Max 24-hour Rainfall (mm)'


def _fill_from_group_mean(frame, column, keys):
    """Return ``frame[column]`` with its NaNs replaced by the mean of their group.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding ``column`` and the grouping column(s).
    column : str
        Name of the numeric column to fill.
    keys : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pd.Series
        Same index as ``frame``. Values that were present are returned
        unchanged; a NaN stays NaN when its group has no observed value.
    """
    # Compute the per-row group mean first and use it only where the value is
    # missing. Running fillna inside groupby.transform instead can blank out
    # observed values on rows whose group key is itself NaN (e.g. a station
    # with no Region), because pandas drops NaN keys from groups by default.
    group_means = frame.groupby(keys)[column].transform('mean')
    return frame[column].fillna(group_means)


# 1. Ensure your placeholders (0s) are treated as NaNs first
for col in (WIND_COL, RAIN_COL):
    df[col] = df[col].replace(0, np.nan)

# 2. Group by Typhoon AND Region, then fill NaNs with the group mean

# a. First Pass: Regional Average (Localized)
# Fill gaps using the other stations in the same region for the same typhoon
for col in (RAIN_COL, WIND_COL):
    df[col] = _fill_from_group_mean(df, col, ['Typhoon', 'Region'])

# b. Second Pass: National Average (Fallback)
# If a whole region is empty, fill it using the mean of all stations for that typhoon
# (this mean includes the values imputed in the first pass)
for col in (RAIN_COL, WIND_COL):
    df[col] = _fill_from_group_mean(df, col, 'Typhoon')

# 3. Final Pass: Use the overall mean of the entire 'Peak Gust' column
# This fills any remaining NaNs with the average intensity of all typhoons
# NOTE(review): rainfall gets no equivalent global fallback, so it can still
# contain NaNs for a typhoon with no rainfall readings at all. Confirm intended.
overall_wind_mean = df[WIND_COL].mean()
df[WIND_COL] = df[WIND_COL].fillna(overall_wind_mean)

In [None]:
# Write the cleaned table to disk.
# NOTE: this path is inside the folder that the loading cell globs for CSVs.
cleaned_csv_path = 'data/typhoon-info/cleaned_2019-2025.csv'

# index=False leaves the DataFrame's row index out of the file
df.to_csv(cleaned_csv_path, index=False)






