In [1]:
# Import relevant python modules
import numpy as np
import pandas as pd
import glob

## Step 1: Combine yearly CSVs

Merge the typhoon impact files (shared columns) into one dataframe.


In [2]:
# 1. Path to your folder containing the CSVs
path = '../data/typhoon-impact/*.csv'
all_files = glob.glob(path)

# 2. Use a list comprehension to read them all at once
li = [pd.read_csv(filename) for filename in all_files]

# 3. Concatenate them into one master DataFrame
df = pd.concat(li, axis=0, ignore_index=True)
df_clean = df.copy()

## Step 2: Quick data checks

Inspect shape, sample rows, dtypes, and missing values.


In [3]:
df.shape

(531, 13)

In [4]:
df.head()

Unnamed: 0,Cyclone Name,Region,Province,Deaths,Injuries,Affected,Houses destroyed,Houses damaged,Total Houses,Damage to Infrastructure (PhP),Damage to Agriculture + Fisheries (PhP),Year,Category
0,Amang,,Agusan del Sur,0,,10344.0,0.0,0,0,0.0,0.0,2019,Tropical depression
1,Amang,,Surigao del Norte,0,,3458.0,0.0,0,0,0.0,0.0,2019,Tropical depression
2,Amang,,Agusan del Norte,0,,1442.0,0.0,0,0,0.0,0.0,2019,Tropical depression
3,Amang,,Surigao del Sur,0,,853.0,0.0,0,0,0.0,0.0,2019,Tropical depression
4,Amang,,Dinagat Islands,0,,477.0,0.0,0,0,0.0,0.0,2019,Tropical depression


In [5]:
df.sample(10)

Unnamed: 0,Cyclone Name,Region,Province,Deaths,Injuries,Affected,Houses destroyed,Houses damaged,Total Houses,Damage to Infrastructure (PhP),Damage to Agriculture + Fisheries (PhP),Year,Category
324,Paeng,Region 9,,0,0.0,79636.0,148.0,157,305,0.0,2360000.0,2022,Severe Tropical Storm
263,Maring,Region 3,,0,0.0,0.0,28.0,1,29,0.0,42060690.0,2021,Severe Tropical Storm
321,Paeng,Region 6,,34,0.0,1773525.0,2185.0,24992,27177,661675.0,2638512000.0,2022,Severe Tropical Storm
86,Quinta,,Aurora,0,,218.0,0.0,0,0,8000000.0,5561162.0,2020,Very strong typhoon
407,Enteng,Region 7,,0,0.0,74.0,5.0,5,10,0.0,0.0,2024,Severe Tropical Storm
34,Tisoy,,Samar,0,,50413.0,440.0,8053,8493,0.0,50231110.0,2019,Very strong typhoon
289,Florita,Region 1,,1,0.0,127122.0,0.0,24,24,546700000.0,13028200.0,2022,Severe Tropical Storm
350,Egay,CALABARZON,,0,0.0,10520.0,0.0,0,0,0.0,213500.0,2023,Super Typhoon
229,Dante,Region 5,,0,0.0,1780.0,31.0,264,295,0.0,0.0,2021,Tropical Storm
110,Quinta,,Negros Occidental,0,,2222.0,3.0,42,45,0.0,45288070.0,2020,Very strong typhoon


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 13 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Cyclone Name                             531 non-null    object 
 1   Region                                   317 non-null    object 
 2   Province                                 218 non-null    object 
 3   Deaths                                   531 non-null    int64  
 4   Injuries                                 317 non-null    float64
 5   Affected                                 515 non-null    float64
 6   Houses destroyed                         530 non-null    float64
 7   Houses damaged                           531 non-null    int64  
 8   Total Houses                             531 non-null    int64  
 9   Damage to Infrastructure (PhP)           531 non-null    float64
 10  Damage to Agriculture + Fisheries (PhP)  531 non-n

In [7]:
df.isnull().any()

Cyclone Name                               False
Region                                      True
Province                                    True
Deaths                                     False
Injuries                                    True
Affected                                    True
Houses destroyed                            True
Houses damaged                             False
Total Houses                               False
Damage to Infrastructure (PhP)             False
Damage to Agriculture + Fisheries (PhP)    False
Year                                       False
Category                                   False
dtype: bool

## Step 3: Data cleaning (initial questions)

- Why are there missing values in the injuries and region columns?
- Why do categories differ before and after 2020?
- Why are some entries null for injuries, affected, etc.?
- Why is there a single missing value in houses destroyed?


### Check: Houses destroyed missing value


In [8]:
df[df['Houses destroyed'].isna()].head()

Unnamed: 0,Cyclone Name,Region,Province,Deaths,Injuries,Affected,Houses destroyed,Houses damaged,Total Houses,Damage to Infrastructure (PhP),Damage to Agriculture + Fisheries (PhP),Year,Category
293,Florita,CALABARZON,,0,0.0,,,0,0,0.0,0.0,2022,Severe Tropical Storm


Finding: the missing value is a NaN in the source data.


### Step 3.1: Convert impact columns to integers


In [9]:
# Target columns
target_cols = ['Deaths', 'Injuries', 'Affected', 'Houses destroyed', 'Houses damaged', 'Total Houses']

# Fill missing values with 0. Assume no report = no injury
df_clean[target_cols] = df[target_cols].fillna(0)

# Round values
df_clean[target_cols] = df_clean[target_cols].round(0)

# Integer conversion
df_clean[target_cols] = df_clean[target_cols].astype(int)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 13 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Cyclone Name                             531 non-null    object 
 1   Region                                   317 non-null    object 
 2   Province                                 218 non-null    object 
 3   Deaths                                   531 non-null    int64  
 4   Injuries                                 531 non-null    int64  
 5   Affected                                 531 non-null    int64  
 6   Houses destroyed                         531 non-null    int64  
 7   Houses damaged                           531 non-null    int64  
 8   Total Houses                             531 non-null    int64  
 9   Damage to Infrastructure (PhP)           531 non-null    float64
 10  Damage to Agriculture + Fisheries (PhP)  531 non-n

Result: impact and housing columns converted to integers.


In [10]:
df_clean.isnull().any()

Cyclone Name                               False
Region                                      True
Province                                    True
Deaths                                     False
Injuries                                   False
Affected                                   False
Houses destroyed                           False
Houses damaged                             False
Total Houses                               False
Damage to Infrastructure (PhP)             False
Damage to Agriculture + Fisheries (PhP)    False
Year                                       False
Category                                   False
dtype: bool

Result: no remaining nulls in injuries, affected, or housing columns.


### Step 3.2: Normalize typhoon classifications


In [11]:
df['Category'].value_counts()

Category
Very strong typhoon      105
Tropical Storm            65
Combined Effects          64
Super Typhoon             59
Severe Tropical Storm     47
Tropical Depression       44
Typhoon                   38
Strong typhoon            36
Violent typhoon           33
Tropical depression       23
Tropical storm            17
Name: count, dtype: int64

In [12]:
# Defining Mapping Dictionary
classification_map = {
    "Very strong typhoon": "Super Typhoon",
    "Violent typhoon": "Super Typhoon",
    "Super Typhoon": "Super Typhoon",
    "Strong typhoon": "Typhoon",
    "Typhoon": "Typhoon",
    "Severe Tropical Storm": "Severe Tropical Storm",
    "Tropical Storm": "Tropical Storm",
    "Tropical storm": "Tropical Storm",
    "Tropical Depression": "Tropical Depression",
    "Tropical depression": "Tropical Depression",
    "Combined Effects": "Combined Effects"
}

# Apply Mapping
df_clean['Category'] = df_clean['Category'].replace(classification_map)

# Final Check 
df_clean['Category'].value_counts()

Category
Super Typhoon            197
Tropical Storm            82
Typhoon                   74
Tropical Depression       67
Combined Effects          64
Severe Tropical Storm     47
Name: count, dtype: int64

In [13]:
df_clean.sample(5)

Unnamed: 0,Cyclone Name,Region,Province,Deaths,Injuries,Affected,Houses destroyed,Houses damaged,Total Houses,Damage to Infrastructure (PhP),Damage to Agriculture + Fisheries (PhP),Year,Category
242,Fabian,Region 6,,0,0,1202,38,118,156,3824600.0,2598115.0,2021,Typhoon
339,Betty,CAR,,0,1,22017,1,43,44,0.0,25000.0,2023,Super Typhoon
363,Jenny,Region 2,,0,0,61449,0,0,0,0.0,0.0,2023,Tropical Storm
243,Fabian,CAR,,0,0,685,5,52,57,0.0,1486112.0,2021,Typhoon
139,Rolly,,Albay,0,0,474401,17299,58975,76274,5475280000.0,1072067000.0,2020,Super Typhoon


In [14]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 13 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Cyclone Name                             531 non-null    object 
 1   Region                                   317 non-null    object 
 2   Province                                 218 non-null    object 
 3   Deaths                                   531 non-null    int64  
 4   Injuries                                 531 non-null    int64  
 5   Affected                                 531 non-null    int64  
 6   Houses destroyed                         531 non-null    int64  
 7   Houses damaged                           531 non-null    int64  
 8   Total Houses                             531 non-null    int64  
 9   Damage to Infrastructure (PhP)           531 non-null    float64
 10  Damage to Agriculture + Fisheries (PhP)  531 non-n

### Step 3.3: Map provinces to regions

Use the external mapping and apply it to 2019-2020 entries only.


In [15]:
# Note: Code mapping provinces to region was obtained from an external dataset
# Region Mapping

regionNCR = ["Metro Manila"]
regionI = ["Ilocos Norte", "Ilocos Sur", "La Union", "Pangasinan"]
regionII = ["Batanes", "Cagayan", "Isabela", "Nueva Vizcaya", "Quirino"]
regionIII = ["Aurora", "Bataan", "Bulacan", "Nueva Ecija", "Pampanga", "Tarlac", "Zambales"]
regionIV_A = ["Batangas", "Cavite", "Laguna", "Quezon", "Rizal"]
regionMIMAROPA = ["Marinduque", "Occidental Mindoro", "Oriental Mindoro", "Palawan", "Romblon"]
regionV = ["Albay", "Camarines Norte", "Camarines Sur", "Catanduanes", "Masbate", "Sorsogon"]
regionVI = ["Aklan", "Antique", "Capiz", "Guimaras", "Iloilo", "Negros Occidental"]
regionVII = ["Bohol", "Cebu", "Negros Oriental", "Siquijor"]
regionVIII = ["Biliran", "Eastern Samar", "Leyte", "Northern Samar", "Samar", "Southern Leyte"]
regionIX = ["Zamboanga del Norte", "Zamboanga del Sur", "Zamboanga Sibugay"]
regionX = ["Bukidnon", "Camiguin", "Lanao del Norte", "Misamis Occidental", "Misamis Oriental"]
regionXI = ["Davao de Oro", "Davao del Oro", "Davao del Norte", "Davao del Sur", "Davao Occidental", "Davao Oriental"]
regionXII = ["Cotabato", "Sarangani", "South Cotabato", "Sultan Kudarat"]
regionXIII = ["Agusan del Norte", "Agusan del Sur", "Dinagat Islands", "Surigao del Norte", "Surigao del Sur"]
regionCAR = ["Abra", "Apayao", "Benguet", "Ifugao", "Kalinga", "Mountain Province"]
regionBARMM = ["Basilan", "Lanao del Sur", "Maguindanao", "Sulu", "Tawi-Tawi"]

# Mapping to dictionary
region_lists = {
    "NCR": regionNCR, "Region 1": regionI, "Region 2": regionII, 
    "Region 3": regionIII, "Region 4": regionIV_A, "MIMAROPA": regionMIMAROPA,
    "Region 5": regionV, "Region 6": regionVI, "Region 7": regionVII,
    "Region 8": regionVIII, "Region 9": regionIX, "Region 10": regionX,
    "Region 11": regionXI, "Region 12": regionXII, "Region 13": regionXIII,
    "CAR": regionCAR, "BARMM": regionBARMM
}

# Invert the dictionary: Map each Province to its Region
province_to_region = {
    province: region 
    for region, provinces in region_lists.items() 
    for province in provinces
}

# 1. Create a boolean mask for the target years
# This identifies exactly which rows we want to modify
year_mask = df_clean['Year'].isin([2019, 2020])

# 2. Apply the mapping ONLY to those rows
# .loc[rows, column] ensures we are writing to the original dataframe correctly
df_clean.loc[year_mask, 'Region'] = df_clean.loc[year_mask, 'Province'].map(province_to_region)
df_clean['Region'].value_counts()

Region
Region 3      59
Region 5      51
MIMAROPA      47
CAR           47
Region 2      43
Region 6      36
Region 8      34
Region 1      30
Region 4      23
CALABARZON    23
Region 7      21
Region 11     21
NCR           19
CARAGA        15
Region 13     14
Region 9      13
Region 12     13
Region 10     12
BARMM          9
NIR            1
Name: count, dtype: int64

In [16]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 13 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Cyclone Name                             531 non-null    object 
 1   Region                                   531 non-null    object 
 2   Province                                 218 non-null    object 
 3   Deaths                                   531 non-null    int64  
 4   Injuries                                 531 non-null    int64  
 5   Affected                                 531 non-null    int64  
 6   Houses destroyed                         531 non-null    int64  
 7   Houses damaged                           531 non-null    int64  
 8   Total Houses                             531 non-null    int64  
 9   Damage to Infrastructure (PhP)           531 non-null    float64
 10  Damage to Agriculture + Fisheries (PhP)  531 non-n

### Step 3.4: Drop province column

Keep region as the location field going forward.


In [20]:
df_clean.drop(columns=['Province'], inplace = True) # Modifies clean dataframe to drop provinces

In [21]:
df_clean.info() # Last Checking

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 12 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Cyclone Name                             531 non-null    object 
 1   Region                                   531 non-null    object 
 2   Deaths                                   531 non-null    int64  
 3   Injuries                                 531 non-null    int64  
 4   Affected                                 531 non-null    int64  
 5   Houses destroyed                         531 non-null    int64  
 6   Houses damaged                           531 non-null    int64  
 7   Total Houses                             531 non-null    int64  
 8   Damage to Infrastructure (PhP)           531 non-null    float64
 9   Damage to Agriculture + Fisheries (PhP)  531 non-null    float64
 10  Year                                     531 non-n