In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import datetime

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import plot_partial_dependence

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Normalization
from tensorflow.keras import regularizers
import tensorflow.keras as keras
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import regularizers

import plotly.express as px

from utils import run_all_regressors
%load_ext autoreload
%autoreload 2

In [2]:
pd.set_option('display.max_columns', None)

Data Pre-processing and EDA(Exploratory data analysis)
Data Cleaning and filtering data which has firesize <5000 as number of small fires are high.

In [3]:
def akReadDf():
    # Reading the combined CSV files
    df = pd.read_csv('Wildfire.csv')
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1','disc_date_final','cont_date_final','cont_clean_date','putout_time'])
    df['disc_clean_date'] = pd.to_datetime(df['disc_clean_date'], format='%m/%d/%Y')

    #Get rid of outliers - fires of size larger than 5000 acres, and there are large number of small fires and other very less number are having the high 
    # area of fires, because of which the deviation is very high
    df = df.loc[df['fire_size'] < 5000]
    df.columns

    ################
    df['Vegetation'] = df['Vegetation'].astype('category')
    df['Cause'] = df['stat_cause_descr'].astype('category')

    df = pd.get_dummies(df,prefix=['Vegetation'], columns = ['Vegetation'], drop_first=True)
    df = pd.get_dummies(df,prefix=['Cause'], columns = ['stat_cause_descr'], drop_first=True)

    ################
    df_numerics_only = df.select_dtypes(include=np.number)

    corr = df_numerics_only.corr()
    ax = sns.heatmap(
        corr, 
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(220, 20, n=200),
        square=True
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right'
    );
    sns.set(rc={'figure.figsize':(15,15)})

    ################
    #Dealing with missing data
    print(len(df))

    # drop columns where weather_file is missing in the data, as it wont have the weather situation at that time, so its where ever data is 
    #missing we can remove those rows as it wont be useful
    index = df[df['weather_file'] == 'File Not Found'].index
    df.drop(index, inplace = True)
    print(len(df))


    ################
    # Weather data has a lot of 0 and values some of which may be missing values,
    # Mark '0' values in weather columns as Na (to see how many there are) 
    # As 0 wont add any value to the data, we are converting to NA and then removing them which will make data set
    subset0 = ['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont']
    df[subset0] = df[subset0].replace({0:np.nan, '0':np.nan})
    print(len(df))

    # Mark '-1' as missing
    subset_neg1 = ['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont']
    df[subset_neg1] = df[subset_neg1].replace({-1:np.nan})

    # Drop observations where all weather columns are 0
    df = df.dropna(how='all',
                        subset=['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont',])
    print(len(df))
    # This leaves us with 38,689 observations  +/- 3,000  to work with (originally we had 50,000)

    ################
    # fill the 'pre' columns temp wind and humidity with mean values
    subset_fill_mean = ['Temp_pre_30','Temp_pre_15','Temp_pre_7', 'Wind_pre_30','Wind_pre_15','Wind_pre_7', 'Hum_pre_30', 'Hum_pre_15','Hum_pre_7']
    df[subset_fill_mean] = df[subset_fill_mean].fillna(df[subset_fill_mean].mean())

    # Fill NAs in the date of fire containment based on mean values from previous days
    for col in ['Temp','Wind','Hum']:
        df[f'{col}_cont'] = df.apply(
            lambda row: (row[f'{col}_pre_7']+row[f'{col}_pre_15']+row[f'{col}_pre_30'])/3 if np.isnan(row[f'{col}_cont']) else row[f'{col}_cont'],
            axis=1)

## Load DF 

In [4]:
from loadDFRegion import getDF
df,dfRegionList = getDF()

Grabbing region 1 dataframe...
Grabbing region 2 dataframe...
Grabbing region 3 dataframe...
Grabbing region 4 dataframe...
Grabbing region 5 dataframe...
Grabbing region 6 dataframe...
Grabbing region 8 dataframe...
Grabbing region 9 dataframe...
Grabbing region 10 dataframe...


In [5]:
#Sanity Check !!
sum1=0
ct=1
for dftemp in dfRegionList:
    if(ct>=7):
        ct+=1
    #print(ct, dftemp.size)
    sum1+= dftemp.size
print(df.size, sum1)
assert(sum1==df.size)

1445578 1445578


## Setting up Experiments

In [6]:
def experimentCreation_FireSizePrediction(df):
    # Experiment 1 
    """
    - which will select all teh available  features from the dataset
    -Features included - variables related to Vegetation,Temperature, Humidity, Wind, Precipitation, cause of  fire, longitude and latitude
    - we have 34 variables  for x-variables  to which we are gonna target one y-variable which is fire_size
    - selecting features and target variables"""
    X1 = df[['Vegetation','remoteness','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_encoded', 'longitude']]
    #X1 = df[['Vegetation_4','remoteness', 'Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
    # X1 = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]

    y = df['fire_size']

    #train test split
    X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
    df1 = [X1_train, X1_test, y_train, y_test]
    #######################
    #Experiment type 2 
    """-Include only long, lat, vegetation, cause and pre- weather data, without cont
    - which is the data set where I removed the variables  on which the fire is  containining on the day
    - removed 4 variables
    - selecting features and target variables"""
    X2 = df[['Vegetation','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7','stat_cause_encoded', 'longitude']]
    #X2 = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
    y = df['fire_size']

    #train test split
    X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)
    df2 = [X2_train, X2_test, y_train, y_test]
    ########################
    #Experiment 3 
    #- Including only lat, long and weather pre- data
    #When I have done the feature importance, I got to know that the cause and vegetation is not that important, so here we removed the 2 
    #  selecting features and target variables
    X3 = df[['latitude','longitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7']]
    y = df['fire_size']

    #train test split
    X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state=42)
    df3 = [X3_train, X3_test, y_train, y_test]
    ########################
    #Experiment 4 
    #with experiment 1 data with normalization
    # have done the minMax normalization for the experiment 1 data frame.
    df_4 = df[['Vegetation','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_encoded', 'longitude']]
    names = df_4.columns

    # normalizing data
    df_4 = preprocessing.normalize(df_4)
    scaled_df = pd.DataFrame(df_4, columns=names)

    #train test split
    X4_train, X4_test, y_train, y_test = train_test_split(scaled_df, y, test_size=0.2, random_state=42)
    df4 = [X4_train, X4_test, y_train, y_test]
    
    return [df1,df2,df3,df4]

In [7]:
regionCount=1
for dfRegion in dfRegionList:
    if(regionCount>=7):
        regionCount+=1
    print(regionCount)
    
    #if(regionCount==6):
    experimentList = experimentCreation_FireSizePrediction(dfRegion)
    print(f"\n\n____Running all experiments for Region {regionCount}____")
    print("____________________________________________")
    print("____________________________________________")
    for i in range(len(experimentList)):
        experiment=experimentList[i]
        print(f"\n--------------Experiment {i+1}--------------")
        print(experiment[0].size,experiment[1].size,experiment[2].size,experiment[3].size)
        run_all_regressors(experiment[0],experiment[2],experiment[1],experiment[3])
    print("____________________________________________")
    regionCount+=1
    

1


____Running all experiments for Region 1____
____________________________________________
____________________________________________

--------------Experiment 1--------------
17241 4326 821 206
Running RandomForestRegressor
Score on training data: 0.9824635669337284
Score on testing data: 0.8610868227857178
Mean Absolute Error:  0.0399599822265911
R Squared:  0.8610868227857178
Adjusted R Squared:  0.860409040090202

Running GradientBoostingRegressor
Score on training data: 0.9743691746910226
Score on testing data: 0.8582809358720239
Mean Absolute Error:  0.03897651132797762
R Squared:  0.8582809358720239
Adjusted R Squared:  0.8575894627431467

Running DecisionTreeRegressor
Score on training data: 0.9999999760149556
Score on testing data: 0.8280419150104283
Mean Absolute Error:  0.03808253107967063
R Squared:  0.8280419150104283
Adjusted R Squared:  0.8272029001905443

Running ExtraTreesRegressor
Score on training data: 0.9999999760149556
Score on testing data: 0.860849229575170

Score on training data: 0.9823108852715381
Score on testing data: 0.8845288694419537
Mean Absolute Error:  0.034167861018196924
R Squared:  0.8845288694419537
Adjusted R Squared:  0.883933511292747

Running GradientBoostingRegressor
Score on training data: 0.990398266838978
Score on testing data: 0.8860334162053056
Mean Absolute Error:  0.030587456319137757
R Squared:  0.8860334162053056
Adjusted R Squared:  0.8854458153558854

Running DecisionTreeRegressor
Score on training data: 0.9999999999786066
Score on testing data: 0.803489255143234
Mean Absolute Error:  0.03968104739774506
R Squared:  0.803489255143234
Adjusted R Squared:  0.8024760644626565

Running ExtraTreesRegressor
Score on training data: 0.9999999999786066
Score on testing data: 0.9054027108232723
Mean Absolute Error:  0.031559181195813774
R Squared:  0.9054027108232723
Adjusted R Squared:  0.9049149762117547


--------------Experiment 2--------------
12480 3120 780 195
Running RandomForestRegressor
Score on training data

Score on training data: 0.9540695813207151
Score on testing data: 0.9075917998780235
Mean Absolute Error:  0.019382032511284588
R Squared:  0.9075917998780235
Adjusted R Squared:  0.9074116502119856

Running DecisionTreeRegressor
Score on training data: 0.9999998887213692
Score on testing data: 0.7399437589434852
Mean Absolute Error:  0.0274209184929279
R Squared:  0.7399437589434852
Adjusted R Squared:  0.7394367796395318

Running ExtraTreesRegressor
Score on training data: 0.9999998887213692
Score on testing data: 0.8611005335605975
Mean Absolute Error:  0.025338779935503358
R Squared:  0.8611005335605975
Adjusted R Squared:  0.8608297492312967


--------------Experiment 2--------------
32848 8224 2053 514
Running RandomForestRegressor
Score on training data: 0.8664044387756021
Score on testing data: 0.08768303941097466
Mean Absolute Error:  0.08249524202508358
R Squared:  0.08768303941097466
Adjusted R Squared:  0.08590442708376322

Running GradientBoostingRegressor
Score on trainin

Score on training data: 0.9999999593515394
Score on testing data: 0.7516911823229748
Mean Absolute Error:  0.00922225721503713
R Squared:  0.7516911823229748
Adjusted R Squared:  0.7516349141387205

Running ExtraTreesRegressor
Score on training data: 0.9999999593515394
Score on testing data: 0.8808936688375387
Mean Absolute Error:  0.008183742319749264
R Squared:  0.8808936688375387
Adjusted R Squared:  0.8808666786684001


--------------Experiment 2--------------
282480 70624 17655 4414
Running RandomForestRegressor
Score on training data: 0.8719229144051592
Score on testing data: 0.03997563744692478
Mean Absolute Error:  0.020925083703053203
R Squared:  0.03997563744692478
Adjusted R Squared:  0.0397580897561739

Running GradientBoostingRegressor
Score on training data: 0.28823817508317195
Score on testing data: 0.030602792346509755
Mean Absolute Error:  0.020029581563793905
R Squared:  0.030602792346509755
Adjusted R Squared:  0.030383120708818656

Running DecisionTreeRegressor
Scor

Score on training data: 1.0
Score on testing data: 0.8672183698312569
Mean Absolute Error:  0.10091280122917679
R Squared:  0.8672183698312569
Adjusted R Squared:  0.8645059046235297


--------------Experiment 2--------------
3152 800 197 50
Running RandomForestRegressor
Score on training data: 0.8988289793203499
Score on testing data: 0.14879425836820603
Mean Absolute Error:  0.2990592188258127
R Squared:  0.14879425836820603
Adjusted R Squared:  0.1314005267384376

Running GradientBoostingRegressor
Score on training data: 0.9225220625042893
Score on testing data: -0.040284481413579876
Mean Absolute Error:  0.32396127327502655
R Squared:  -0.040284481413579876
Adjusted R Squared:  -0.061541890995466586

Running DecisionTreeRegressor
Score on training data: 1.0
Score on testing data: -0.5831907614581437
Mean Absolute Error:  0.34324397541646445
R Squared:  -0.5831907614581437
Adjusted R Squared:  -0.6155420413857686

Running ExtraTreesRegressor
Score on training data: 1.0
Score on test

In [8]:
print(

SyntaxError: unexpected EOF while parsing (149104261.py, line 1)

### Modelling  the above experiments with different models like  decision tree,gradient bosting, random  forest

Decision Tree - Experiment 1


In [None]:
dectr = DecisionTreeRegressor(random_state=0)
dectr.fit(df1[0], df1[2])

predictions = dectr.predict(df1[1])
print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
print('R Squared:', metrics.r2_score(df1[3], predictions))


Gradient Boosting - Experiment 1



In [None]:
gr_boost = GradientBoostingRegressor()
gr_boost.fit(df1[0], df1[2])

predictions = gr_boost.predict(df1[1])
print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
print('R Squared:', metrics.r2_score(df1[3], predictions))


In [None]:
rf = RandomForestRegressor()
rf.fit(df1[0], df1[2])

predictions = rf.predict(df1[1])
print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
print('R Squared:', metrics.r2_score(df1[3], predictions))


Modelling experirments 


In [None]:
# less than 5000
for model in [DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor()]:
    for idx,df in enumerate([df1, df2, df3, df4]):
        model.fit(df[0], df[2])
        print(f'{model}; Experiment {idx+1}; Mean Absolute Error:', metrics.mean_absolute_error(df[3], model.predict(df[1])))
        print(f'{model}; Experiment {idx+1}; R Squared:', metrics.r2_score(df[3], model.predict(df[1])))
        print('')


The best performing basemodel is the Random Forest algorithm with R2 of 23%. This is the model we will use for further analysis and improvement.




Reevaluate model after deleting least important features


In [None]:
# Random Forest Regressor 
rf_reg = RandomForestRegressor()

# fit the regressor with x and y data
rf_reg.fit(X2_train, y2_train)

predictions = rf_reg.predict(X2_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y2_test, predictions))
print('R Squared:', metrics.r2_score(y2_test, predictions))

Hyper-parameter tuning 

In [None]:
rf_reg = RandomForestRegressor()
search_grid={'n_estimators':[50,100,200],'max_depth':[2,5,8,10]}
search=GridSearchCV(estimator=rf_reg,param_grid=search_grid,scoring='neg_mean_absolute_error',n_jobs=1,cv=5, verbose=1)
search.fit(df1[0], df1[2])
print(search.best_score_)
print(search.best_params_)


In [None]:
# Random Forest Regressor 
rf_reg = RandomForestRegressor(n_estimators = 200, max_depth=10)

# fit the regressor with x and y data
rf_reg.fit(df1[0], df1[2])

predictions = rf_reg.predict(df1[1])
print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
print('R Squared:', metrics.r2_score(df1[3], predictions))

### Neural network model

In [None]:
# selecting features and target variables
X = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
y = df['fire_size']

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(X_train))

In [None]:
X_train.shape

In [None]:
model = Sequential([
    #normalizer,
    Dense(34, activation="relu", kernel_regularizer = regularizers.l2(0.01), kernel_initializer='normal',input_dim = X_train.shape[1]),
    Dense(34, activation="relu", kernel_regularizer = regularizers.l2(0.01)),
    Dense(64, activation="relu", kernel_regularizer = regularizers.l2(0.01)),
    Dense(32, activation="relu", kernel_regularizer = regularizers.l2(0.01)),
    Dense(1, activation='linear')
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy','mae']
)

In [None]:
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_split=0.15,
    callbacks=[tf.keras.callbacks.EarlyStopping(
        monitor='val_mae',
        patience=20,
        mode='min',
        restore_best_weights=True
    )]
)

In [None]:
test_loss, test_acc, test_mae = model.evaluate(x=X_test, y=y_test, verbose=0)
print('Mean Absolute Error: {acc:0.3f}'.format(acc=test_mae))
print('accuracy: {acc:0.3f}'.format(acc=test_acc))
print('loss: {acc:0.3f}'.format(acc=test_loss))

In [None]:
print(f"Average Fire Size: {df.fire_size.mean()}")
print(f"Standard Deviation of Fire Size: {df.fire_size.std()}")


### end here