# CPSC 4820 Project - Fire Intensity Prediction using Machine Learning

In [5]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
# Ignore every warning raised from this point on. Note that this also hides
# pandas warnings (e.g. deprecation / chained-assignment notices) in later cells.
warnings.filterwarnings("ignore")

### Descriptive Analysis

In [14]:
# Load the combined hotspot records; the path is relative to the notebook's
# working directory.
hotspots = pd.read_csv("data/combined_hotspots.csv")

In [15]:
# Preview the first rows to sanity-check the load.
hotspots.head()

Unnamed: 0,lat,lon,rep_date,source,sensor,satellite,agency,temp,rh,ws,...,greenup,elev,sfl,cfl,tfc0,ecozone,sfc0,cbh,uid,fid
0,53.017,-124.689,2019/03/27 21:15:00.000,NASA,MODIS,Aqua,BC,3.761,59,11.673,...,0,1178,,0.8,,,,,,
1,50.591,-119.904,2019/11/17 20:29:00.000,USFS,IBAND,JPSS1,BC,7.732,84,10.635,...,0,761,,2.1,0.14,14.0,0.14,,,
2,53.1846,-124.001,2019/03/27 19:42:00.000,NASA,IBAND,S-NPP,BC,2.09,64,11.087,...,0,1291,,2.7,0.095046,,0.095046,,,
3,54.071,-123.757,2019/11/17 21:18:00.000,USFS,IBAND,N,BC,8.218,61,14.859,...,0,1266,,0.6,0.04,14.0,0.04,,,
4,52.0966,-122.069,2019/03/29 11:00:00.000,NASA,IBAND,S-NPP,BC,11.225,31,7.106,...,0,734,,2.2,0.316152,,0.316152,,,


In [16]:
# (number of rows, number of columns) of the raw frame.
hotspots.shape

(1033890, 37)

In [17]:
# List the column names available in the raw frame.
hotspots.columns

Index(['lat', 'lon', 'rep_date', 'source', 'sensor', 'satellite', 'agency',
       'temp', 'rh', 'ws', 'wd', 'pcp', 'ffmc', 'dmc', 'dc', 'isi', 'bui',
       'fwi', 'fuel', 'ros', 'sfc', 'tfc', 'bfc', 'hfi', 'cfb', 'estarea',
       'pcuring', 'greenup', 'elev', 'sfl', 'cfl', 'tfc0', 'ecozone', 'sfc0',
       'cbh', 'uid', 'fid'],
      dtype='object')

In [18]:
# Inspect the dtype pandas inferred for each column (rep_date is still a string here).
hotspots.dtypes

lat          float64
lon          float64
rep_date      object
source        object
sensor        object
satellite     object
agency        object
temp         float64
rh             int64
ws           float64
wd             int64
pcp          float64
ffmc         float64
dmc          float64
dc           float64
isi          float64
bui          float64
fwi          float64
fuel          object
ros          float64
sfc          float64
tfc          float64
bfc          float64
hfi          float64
cfb          float64
estarea      float64
pcuring      float64
greenup       object
elev           int64
sfl          float64
cfl          float64
tfc0         float64
ecozone      float64
sfc0         float64
cbh          float64
uid          float64
fid          float64
dtype: object

In [19]:
# Count the missing entries per column and report only the columns that have any
null_values = hotspots.isna().sum()
print("Columns with null values:\n", null_values.loc[null_values.gt(0)])

Columns with null values:
 source        696595
satellite      50110
ros              150
sfc              150
tfc              150
bfc           749015
hfi              150
cfb              150
estarea      1019088
pcuring       694831
greenup       694831
sfl             3541
tfc0             170
ecozone          237
sfc0             258
cbh           700136
uid          1021169
fid           337295
dtype: int64


In [20]:
# Share of missing values in each column, expressed in percent
missing_percentage = hotspots.isna().mean().mul(100)

# Tabulate as a two-column frame (column name, percent missing) for readability
missing_percentage_df = (
    missing_percentage
    .rename_axis('Column')
    .reset_index(name='Missing Percentage')
)
missing_percentage_df

Unnamed: 0,Column,Missing Percentage
0,lat,0.0
1,lon,0.0
2,rep_date,0.0
3,source,67.376123
4,sensor,0.0
5,satellite,4.846744
6,agency,0.0
7,temp,0.0
8,rh,0.0
9,ws,0.0


In [21]:
# Fill the missing satellite information with 'unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the in-place form operates on an
# intermediate object and is not guaranteed to update `hotspots` (it does not
# under pandas Copy-on-Write).
hotspots['satellite'] = hotspots['satellite'].fillna('unknown')

In [22]:
# Row labels of the records whose 'sfc' value is missing
sfc_missing_indices = hotspots.loc[hotspots['sfc'].isna()].index

In [23]:
# For the rows that lack 'sfc', test whether the related columns are missing too.
# Note: this only shows that every sfc-missing row is also missing in the other
# columns; it does not check the reverse direction.
columns_to_check = ['tfc', 'hfi', 'cfb', 'tfc0', 'sfc0']
related_missing = hotspots.loc[sfc_missing_indices, columns_to_check].isna()
missing_in_all = related_missing.all(axis=1)

# Report whether every inspected row is missing in all of the related columns
same_rows_msg = "The rows with missing values in 'sfc' are the same as those in 'tfc', 'hfi', 'cfb', 'tfc0', and 'sfc0'."
diff_rows_msg = "The rows with missing values in 'sfc' are NOT the same as those in 'tfc', 'hfi', 'cfb', 'tfc0', and 'sfc0'."
print(same_rows_msg if missing_in_all.all() else diff_rows_msg)

# Show the row labels where at least one related column is populated
mismatched_indices = missing_in_all.loc[~missing_in_all].index
print("Indices with mismatched missing values:\n", mismatched_indices)

The rows with missing values in 'sfc' are the same as those in 'tfc', 'hfi', 'cfb', 'tfc0', and 'sfc0'.
Indices with mismatched missing values:
 Int64Index([], dtype='int64')


In [27]:
# Drop eight columns (each shows a large number of missing values in the null
# counts above), then discard rows lacking any of the four listed fields.
# Rows missing 'sfc'/'tfc'/'hfi'/'cfb' are not listed explicitly: the check above
# found that those rows are also missing 'tfc0' and 'sfc0', so they go here too.
hotspots = (
    hotspots
    .drop(columns=['cbh', 'bfc', 'pcuring', 'greenup', 'source', 'estarea', 'fid', 'uid'])
    .dropna(subset=['sfl', 'tfc0', 'ecozone', 'sfc0'])
)

In [28]:
# Flag fully duplicated rows (the first occurrence is not flagged), count them,
# and keep only the unflagged rows
duplicate_rows = hotspots.duplicated()
num_duplicates = duplicate_rows.sum()
hotspots = hotspots.loc[~duplicate_rows]

In [29]:
# Keep only the rows whose 'hfi' value differs from zero
hotspots = hotspots.loc[hotspots['hfi'].ne(0)]

In [30]:
# Remove 'tfc0', 'sfc0' and 'agency' in a single pass
hotspots = hotspots.drop(columns=['tfc0', 'sfc0', 'agency'])

In [31]:
# Parse the report timestamp strings; values that cannot be parsed become NaT
hotspots = hotspots.assign(
    rep_date=pd.to_datetime(hotspots['rep_date'], errors='coerce')
)

In [32]:
# Derive calendar features from the parsed timestamp.
# NOTE: 'day' holds the day of the year, not the day of the month.
hotspots = hotspots.assign(
    year=hotspots['rep_date'].dt.year,
    month=hotspots['rep_date'].dt.month,
    day=hotspots['rep_date'].dt.dayofyear,
)

In [None]:
# Save the cleaned frame as CSV without the row index. The '.csv' suffix keeps
# the file name consistent with the other data files used by this notebook.
hotspots.to_csv('data/cleaned_hotspots.csv', index=False)

### Sampling

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Number of cleaned records per year, used below to stratify the sample.
print(hotspots['year'].value_counts())

2023    680106
2021    229679
2022     47713
2019     14094
2020      8234
Name: year, dtype: int64


In [36]:
# Restrict the data to the years under study
years_of_interest = [2019, 2020, 2021, 2022, 2023]
in_scope = hotspots['year'].isin(years_of_interest)
hotspots = hotspots.loc[in_scope]

# Stratified split on 'year': the 75% partition is kept as the working sample,
# the remaining 25% is discarded
sampled_df, _ = train_test_split(
    hotspots,
    test_size=0.25,
    stratify=hotspots['year'],
    random_state=1,
)

# Report the size of the sample
print(f"Sampled data shape: {sampled_df.shape}")

# Confirm that every year is still represented after sampling
print("Sampled data year distribution:", sampled_df['year'].value_counts(), sep="\n")

Sampled data shape: (734869, 29)
Sampled data year distribution:
2023    510079
2021    172259
2022     35785
2019     10570
2020      6176
Name: year, dtype: int64


In [37]:
def categorize_intensity(hfi, low_max=10, moderate_max=100):
    """Map a numeric 'hfi' value to an intensity class label.

    Parameters
    ----------
    hfi : float
        The value to classify.
    low_max : float, default 10
        Largest value (inclusive) labelled 'Low'.
    moderate_max : float, default 100
        Largest value (inclusive) labelled 'Moderate'.

    Returns
    -------
    str or float
        'Low', 'Moderate' or 'High'; NaN when `hfi` is missing.
    """
    # A missing value fails both comparisons below and would otherwise be
    # silently labelled 'High'; propagate the missing marker instead.
    if pd.isna(hfi):
        return np.nan
    if hfi <= low_max:
        return 'Low'
    if hfi <= moderate_max:
        return 'Moderate'
    return 'High'

# Label every sampled row with its intensity class in a new 'Intensity' column
sampled_df = sampled_df.assign(
    Intensity=sampled_df['hfi'].apply(categorize_intensity)
)

# Spot-check the labels against the raw 'hfi' values
sampled_df[['hfi', 'Intensity']].head()

Unnamed: 0,hfi,Intensity
86081,6080.0,High
477065,4420.0,High
657036,2728.0,High
548634,46.0,Moderate
553508,1486.0,High


In [38]:
# Discard the rows whose 'hfi' exceeds 60000
sampled_df = sampled_df.loc[sampled_df['hfi'].le(60000)]

# Report how many rows remain after the cut
print(f"Number of rows after filtering 'hfi' > 60000: {len(sampled_df)}")

Number of rows after filtering 'hfi' > 60000: 733476


In [45]:
# Persist the working sample. `sampled_df` is written rather than `hotspots`:
# `hotspots` is the full cleaned frame and carries neither the stratified
# sampling, the 'Intensity' labels, nor the 'hfi' <= 60000 filter.
sampled_df.to_csv('data/sampled_hotspots.csv', index=False)

### EDA

In [43]:
# Confirm the column dtypes of the sample (rep_date is datetime, Intensity is object).
sampled_df.dtypes

lat                 float64
lon                 float64
rep_date     datetime64[ns]
sensor               object
satellite            object
temp                float64
rh                    int64
ws                  float64
wd                    int64
pcp                 float64
ffmc                float64
dmc                 float64
dc                  float64
isi                 float64
bui                 float64
fwi                 float64
fuel                 object
ros                 float64
sfc                 float64
tfc                 float64
hfi                 float64
cfb                 float64
elev                  int64
sfl                 float64
cfl                 float64
ecozone             float64
year                  int64
month                 int64
day                   int64
Intensity            object
dtype: object

In [42]:
# Summary statistics for the numeric columns of the sample.
sampled_df.describe()

Unnamed: 0,lat,lon,temp,rh,ws,wd,pcp,ffmc,dmc,dc,...,tfc,hfi,cfb,elev,sfl,cfl,ecozone,year,month,day
count,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,...,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0,733476.0
mean,54.978613,-122.042067,21.785128,35.484992,9.486457,200.2322,0.207458,89.981015,82.897941,528.341525,...,2.83792,6027.868691,22.538998,927.620762,9.145533,0.792873,10.366762,2022.400072,7.681444,217.745628
std,3.564579,2.6745,5.611399,11.475874,3.217718,91.179982,1.133841,5.718467,36.506501,142.947004,...,1.410308,7638.680768,34.43745,390.406435,7.789797,0.613513,4.575659,0.965248,1.229639,37.817492
min,48.328831,-137.489,-18.736999,11.0,2.323,0.0,0.0,35.380001,0.0,0.0,...,0.01,1.0,0.0,-1.0,-1.0,-1.0,4.0,2019.0,1.0,9.0
25%,51.165001,-123.886002,18.325001,27.0,7.101,136.0,0.0,89.522003,60.166,438.858002,...,2.04,838.0,0.0,609.0,5.139763,0.234353,4.0,2021.0,7.0,193.0
50%,55.4942,-121.423332,22.402,34.0,8.765,203.0,0.0,91.074997,73.481003,547.554505,...,3.08,3284.0,0.0,911.0,7.128627,0.685691,14.0,2023.0,8.0,220.0
75%,58.41753,-120.446121,25.861,41.0,11.157,275.0,0.004,93.008003,99.385252,623.267029,...,3.67,8329.0,48.0,1188.0,11.945951,1.28454,14.0,2023.0,9.0,246.0
max,59.999981,-114.360001,43.881,97.0,31.662001,360.0,651.789001,97.878998,274.265015,1122.06006,...,7.137387,59994.0,100.0,2808.0,37.353542,4.62802,14.0,2023.0,12.0,365.0


In [44]:
# Class balance of the 'Intensity' label in the sample.
sampled_df['Intensity'].value_counts()

High        642198
Moderate     72030
Low          19248
Name: Intensity, dtype: int64