## 🌳🔥 Forest Fire Damage Prediction

Given *data about forest fires*, let's try to predict the **damage** caused by a given fire.

We will use both regression and classification models to make our predictions.

Data source: https://www.kaggle.com/datasets/elikplim/forest-fires-data-set

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier

import warnings
# Ignore every warning raised from here on: no category, module or message
# filter is given, so warnings emitted by later cells will not be displayed.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the dataset from a CSV file located relative to the working directory.
data = pd.read_csv('forestfires.csv')
# Bare expression so the frame is rendered as this cell's output.
data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [3]:
# Summarise the schema: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [4]:
# Total number of missing cells: count per column first, then sum those counts.
data.isna().sum().sum()

0

### Preprocessing

In [49]:
# Preprocess a copy so the raw `data` frame is left as loaded.
df = data.copy()

In [50]:
# Distinct month labels in the data, listed in order of first appearance.
df['month'].unique()

array(['mar', 'oct', 'aug', 'sep', 'apr', 'jun', 'jul', 'feb', 'jan',
       'dec', 'may', 'nov'], dtype=object)

In [51]:
# Distinct day labels in the data, listed in order of first appearance.
df['day'].unique()

array(['fri', 'tue', 'sat', 'sun', 'mon', 'wed', 'thu'], dtype=object)

In [52]:
def ordinal_encode(df, column, ordering):
    """Replace the labels in ``column`` with their integer position in ``ordering``.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the column to encode.
        ordering: Sequence of labels listed from lowest rank to highest rank.
            The label ``ordering[i]`` is encoded as ``i``.

    Returns:
        A copy of ``df`` in which ``column`` holds the integer ranks.

    Raises:
        ValueError: If ``column`` contains a value that is not in ``ordering``.
    """
    df = df.copy()

    # Build the label -> rank lookup once instead of scanning `ordering` for
    # every row. setdefault keeps the first position of a repeated label,
    # which is what list.index would return.
    rank_of = {}
    for rank, label in enumerate(ordering):
        rank_of.setdefault(label, rank)

    # Validate up front so the error names the column and the offending labels.
    unknown = [value for value in df[column].unique() if value not in rank_of]
    if unknown:
        raise ValueError(
            "Column {!r} contains values missing from ordering: {}".format(column, unknown)
        )

    df[column] = df[column].map(rank_of)
    return df

In [53]:
# Ordinal encode the month and day columns
# Ordinal encode the month and day columns.
# Start from the raw `data` frame rather than from `df`, so this cell can be
# re-run safely: feeding the already-encoded `df` back in would fail because
# the integer codes are not members of the label orderings.
MONTH_ORDER = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
DAY_ORDER = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat']

df = ordinal_encode(data, column='month', ordering=MONTH_ORDER)
df = ordinal_encode(df, column='day', ordering=DAY_ORDER)

In [54]:
# Display the frame after encoding to check the month and day columns.
df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,2,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,9,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,9,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,2,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,2,0,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,7,0,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,7,0,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,7,0,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,7,6,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


#### Regression

In [55]:
# Selects which target the following cell builds ('regression' or 'classification').
task = 'regression'

In [56]:
# Build the target vector for the selected task.
if task == 'regression':
    # Predict the raw `area` value directly.
    y = df['area']
elif task == 'classification':
    # Binary label: 1 when `area` is positive, 0 otherwise.
    y = (df['area'] > 0).astype('int64')
else:
    # Fail loudly instead of silently keeping a stale (or undefined) `y`.
    raise ValueError("Unknown task: {!r}".format(task))

In [57]:
# Feature matrix: every column of `df` except the `area` target.
X = df.drop(columns='area')

In [58]:
# Display the feature matrix to confirm the `area` column was removed.
X

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
0,7,5,2,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0
1,7,4,9,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0
2,7,4,9,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0
3,8,6,2,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2
4,8,6,2,0,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,7,0,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0
513,2,4,7,0,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0
514,7,4,7,0,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0
515,1,4,7,6,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0


In [59]:
# Shuffle the rows with a fixed seed and keep 70% of them for training;
# the remainder becomes the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

In [60]:
# (rows, columns) of the training and test feature sets.
X_train.shape, X_test.shape

((361, 12), (156, 12))

In [62]:
# Scale Feature Data (X)
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

# Pass the original row index through: without it the rebuilt frames get a
# fresh 0..n-1 index and no longer align by label with y_train / y_test,
# which still carry the shuffled index produced by the split.
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [63]:
# Fit a linear regression on the training split and score it on the test split.
lin_reg_model = LinearRegression().fit(X_train, y_train)

lin_reg_r2 = lin_reg_model.score(X_test, y_test)
print(f"Linear Regression R^2: {lin_reg_r2:.5f}")

Linear Regression R^2: 0.02156


In [65]:
# Two hidden layers of 16 units each. The seed is fixed so that weight
# initialisation and batch sampling are repeatable; without it the reported
# score changes on every run.
nn_reg_model = MLPRegressor(hidden_layer_sizes=(16, 16), random_state=1)
nn_reg_model.fit(X_train, y_train)

print("Neural Network Regression R^2: {:.5f}".format(nn_reg_model.score(X_test, y_test)))

Neural Network Regression R^2: 0.04905


#### Classification

In [70]:
task = 'classification'

# Binary target: 1 when `area` is positive, 0 otherwise.
# (`task` is set just above, so no regression branch is needed in this cell.)
y = (df['area'] > 0).astype('int64')

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

# Scale Feature Data (X)
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

# Pass the original row index through so the scaled features stay aligned by
# label with y_train / y_test, which keep the shuffled index from the split.
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [71]:
# Count how many training rows fall into each class to check the balance.
y_train.value_counts()

area
1    189
0    172
Name: count, dtype: int64

In [74]:
# Fit a logistic regression on the training split and report test accuracy.
lin_clf_model = LogisticRegression().fit(X_train, y_train)

lin_clf_accuracy = lin_clf_model.score(X_test, y_test)
print(f"Logistic Regression Accuracy: {lin_clf_accuracy * 100:.2f}%")

Logistic Regression Accuracy: 50.64%


In [77]:
# Two hidden layers of 16 units each. The seed is fixed so that weight
# initialisation and batch sampling are repeatable; without it the reported
# accuracy changes on every run.
nn_clf_model = MLPClassifier(hidden_layer_sizes=(16, 16), random_state=1)
nn_clf_model.fit(X_train, y_train)

print("Neural Network Classification Accuracy: {:.2f}%".format(nn_clf_model.score(X_test, y_test)*100))

Neural Network Classification Accuracy: 51.28%
