In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import datetime

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import plot_partial_dependence

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Normalization
from tensorflow.keras import regularizers
import tensorflow.keras as keras
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import regularizers

import plotly.express as px

from utils import run_all_regressors,run_all_classifiers
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pd.set_option('display.max_columns', None)

Data Pre-processing and EDA(Exploratory data analysis)
Data Cleaning and filtering data which has firesize <5000 as number of small fires are high.

In [5]:
def akReadDf():
    # Reading the combined CSV files
    df = pd.read_csv('Wildfire.csv')
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1','disc_date_final','cont_date_final','cont_clean_date','putout_time'])
    df['disc_clean_date'] = pd.to_datetime(df['disc_clean_date'], format='%m/%d/%Y')

    #Get rid of outliers - fires of size larger than 5000 acres, and there are large number of small fires and other very less number are having the high 
    # area of fires, because of which the deviation is very high
    df = df.loc[df['fire_size'] < 5000]
    df.columns

    ################
    df['Vegetation'] = df['Vegetation'].astype('category')
    df['Cause'] = df['stat_cause_descr'].astype('category')

    df = pd.get_dummies(df,prefix=['Vegetation'], columns = ['Vegetation'], drop_first=True)
    df = pd.get_dummies(df,prefix=['Cause'], columns = ['stat_cause_descr'], drop_first=True)

    ################
    df_numerics_only = df.select_dtypes(include=np.number)

    corr = df_numerics_only.corr()
    ax = sns.heatmap(
        corr, 
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(220, 20, n=200),
        square=True
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right'
    );
    sns.set(rc={'figure.figsize':(15,15)})

    ################
    #Dealing with missing data
    print(len(df))

    # drop columns where weather_file is missing in the data, as it wont have the weather situation at that time, so its where ever data is 
    #missing we can remove those rows as it wont be useful
    index = df[df['weather_file'] == 'File Not Found'].index
    df.drop(index, inplace = True)
    print(len(df))


    ################
    # Weather data has a lot of 0 and values some of which may be missing values,
    # Mark '0' values in weather columns as Na (to see how many there are) 
    # As 0 wont add any value to the data, we are converting to NA and then removing them which will make data set
    subset0 = ['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont']
    df[subset0] = df[subset0].replace({0:np.nan, '0':np.nan})
    print(len(df))

    # Mark '-1' as missing
    subset_neg1 = ['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont']
    df[subset_neg1] = df[subset_neg1].replace({-1:np.nan})

    # Drop observations where all weather columns are 0
    df = df.dropna(how='all',
                        subset=['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont',])
    print(len(df))
    # This leaves us with 38,689 observations  +/- 3,000  to work with (originally we had 50,000)

    ################
    # fill the 'pre' columns temp wind and humidity with mean values
    subset_fill_mean = ['Temp_pre_30','Temp_pre_15','Temp_pre_7', 'Wind_pre_30','Wind_pre_15','Wind_pre_7', 'Hum_pre_30', 'Hum_pre_15','Hum_pre_7']
    df[subset_fill_mean] = df[subset_fill_mean].fillna(df[subset_fill_mean].mean())

    # Fill NAs in the date of fire containment based on mean values from previous days
    for col in ['Temp','Wind','Hum']:
        df[f'{col}_cont'] = df.apply(
            lambda row: (row[f'{col}_pre_7']+row[f'{col}_pre_15']+row[f'{col}_pre_30'])/3 if np.isnan(row[f'{col}_cont']) else row[f'{col}_cont'],
            axis=1)

## Load DF 

In [6]:
from loadDFRegion import getDF
df,dfRegionList = getDF()

Grabbing region 1 dataframe...
Grabbing region 2 dataframe...
Grabbing region 3 dataframe...
Grabbing region 4 dataframe...
Grabbing region 5 dataframe...
Grabbing region 6 dataframe...
Grabbing region 8 dataframe...
Grabbing region 9 dataframe...
Grabbing region 10 dataframe...


In [7]:
#Sanity Check !!
sum1=0
ct=1
for dftemp in dfRegionList:
    if(ct>=7):
        ct+=1
    #print(ct, dftemp.size)
    sum1+= dftemp.size
print(df.size, sum1)
assert(sum1==df.size)

1445578 1445578


## Setting up Experiments

In [50]:
def experimentCreation_FireSizePrediction(df,yTarget):
    """Function return all experiments splitting data by yTarget name: 
        regression - fire_size
        classification - fire_cause
    """
    
    # Experiment 1 
    """
    - which will select all teh available  features from the dataset
    -Features included - variables related to Vegetation,Temperature, Humidity, Wind, Precipitation, cause of  fire, longitude and latitude
    - we have 34 variables  for x-variables  to which we are gonna target one y-variable which is fire_size
    - selecting features and target variables"""
    X1 = df[['Vegetation','remoteness','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_desc', 'longitude']]
    #X1 = df[['Vegetation_4','remoteness', 'Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
    # X1 = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]

    y = df[yTarget] 

    #train test split
    X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
    df1 = [X1_train, X1_test, y_train, y_test]
    #######################
    #Experiment type 2 
    """-Include only long, lat, vegetation, cause and pre- weather data, without cont
    - which is the data set where I removed the variables  on which the fire is  containining on the day
    - removed 4 variables
    - selecting features and target variables"""
    X2 = df[['Vegetation','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7','stat_cause_desc', 'longitude']]
    #X2 = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
    y = df[yTarget]

    #train test split
    X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)
    df2 = [X2_train, X2_test, y_train, y_test]
    ########################
    #Experiment 3 
    #- Including only lat, long and weather pre- data
    #When I have done the feature importance, I got to know that the cause and vegetation is not that important, so here we removed the 2 
    #  selecting features and target variables
    X3 = df[['latitude','longitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7']]
    y = df[yTarget]

    #train test split
    X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state=42)
    df3 = [X3_train, X3_test, y_train, y_test]
    
    ########################
    #Experiment 4 
    #with experiment 1 data with normalization
    # have done the minMax normalization for the experiment 1 data frame.
    df_4 = df[['Vegetation','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_desc', 'longitude']]
    names = df_4.columns

    # normalizing data
    df_4 = preprocessing.normalize(df_4)
    scaled_df = pd.DataFrame(df_4, columns=names)

    #train test split
    X4_train, X4_test, y_train, y_test = train_test_split(scaled_df, y, test_size=0.2, random_state=42)
    df4 = [X4_train, X4_test, y_train, y_test]
    
    return [df1,df2,df3,df4]

## Running all regression models on all experiments for Prediction on Fire Size

In [7]:
#regionCount=1
regionExperimentDict = {}
for i in range(len(dfRegionList)):
    dfRegion = dfRegionList[i]
    regionCount =i+1
    if(regionCount>=7):
        regionCount+=1
    
    experimentList = experimentCreation_FireSizePrediction(dfRegion,"fire_size")
    print(f"\n\n____Running all experiments for Region {regionCount}____")
    print("____________________________________________")
    print("____________________________________________")
    experimentListOfDictionaries = []
    for i in range(len(experimentList)):
        if(i not in [1,2,3]):
            experiment=experimentList[i]
            print(f"\n--------------Experiment {i+1}--------------")
            print(experiment[0].size,experiment[1].size,experiment[2].size,experiment[3].size)
            regressorDict = run_all_regressors(experiment[0],experiment[2],experiment[1],experiment[3])
            experimentListOfDictionaries.append(regressorDict)
    print("____________________________________________")
    
    regionExperimentDict[regionCount]= experimentListOfDictionaries
    



____Running all experiments for Region 1____
____________________________________________
____________________________________________

--------------Experiment 1--------------
17241 4326 821 206
Running RandomForestRegressor
Score on training data: 0.979876906952423
Score on testing data: 0.8640124792261732
Mean Absolute Error:  0.03884371850751577
R Squared:  0.8640124792261732

Running GradientBoostingRegressor
Score on training data: 0.9743691746910226
Score on testing data: 0.8535192877341835
Mean Absolute Error:  0.039202388310467746
R Squared:  0.8535192877341835

Running DecisionTreeRegressor
Score on training data: 0.9999999760149556
Score on testing data: 0.8309687605851277
Mean Absolute Error:  0.03889052276712061
R Squared:  0.8309687605851277

Running ExtraTreesRegressor
Score on training data: 0.9999999760149556
Score on testing data: 0.856231584983193
Mean Absolute Error:  0.038611570833678596
R Squared:  0.856231584983193

Running SVR
Score on training data: 0.0236695

Score on training data: 0.9328541607408966
Score on testing data: 0.6974720955271009
Mean Absolute Error:  0.005802843938031488
R Squared:  0.6974720955271009

Running GradientBoostingRegressor
Score on training data: 0.9393002582185409
Score on testing data: 0.7310868107402584
Mean Absolute Error:  0.005517160367437063
R Squared:  0.7310868107402584

Running DecisionTreeRegressor
Score on training data: 0.9999999808006592
Score on testing data: 0.27933961445078037
Mean Absolute Error:  0.006971803275067571
R Squared:  0.27933961445078037

Running ExtraTreesRegressor
Score on training data: 0.9999999808006592
Score on testing data: 0.7956444995310956
Mean Absolute Error:  0.006403074912808451
R Squared:  0.7956444995310956

Running SVR
Score on training data: -1.623087752440541
Score on testing data: -3.2579944863207944
Mean Absolute Error:  0.10057050087007174
R Squared:  -3.2579944863207944

____________________________________________


____Running all experiments for Region 10____


In [8]:
for i in range(9):
    regionCt = i+1
    if(regionCt>=7):
        regionCt+=1
    print(f"Results from Region {regionCt}")
    print(regionExperimentDict[regionCt])

Results from Region 1
[{'RandomForestRegressor': [0.979876906952423, 0.8640124792261732, 0.03884371850751577, 0.8640124792261732], 'GradientBoostingRegressor': [0.9743691746910226, 0.8535192877341835, 0.039202388310467746, 0.8535192877341835], 'DecisionTreeRegressor': [0.9999999760149556, 0.8309687605851277, 0.03889052276712061, 0.8309687605851277], 'ExtraTreesRegressor': [0.9999999760149556, 0.856231584983193, 0.038611570833678596, 0.856231584983193], 'SVR': [0.023669598559045824, 0.006796051146053861, 0.16133878041211616, 0.006796051146053861]}]
Results from Region 2
[{'RandomForestRegressor': [0.9600293863244324, 0.6398509079058468, 0.04676969057230677, 0.6398509079058468], 'GradientBoostingRegressor': [0.9051818852303684, 0.6658555967069187, 0.04532944181105967, 0.6658555967069187], 'DecisionTreeRegressor': [1.0, 0.20731180752986988, 0.05586898039554056, 0.20731180752986988], 'ExtraTreesRegressor': [1.0, 0.8177283027868585, 0.035026137496339316, 0.8177283027868585], 'SVR': [-0.0232

In [10]:
def createRegionDFResults(regressorDict):
    dfRegressionResults = pd.DataFrame(columns=['ModelName','TrainScore','TestScore','MAE','R^2'])
    for key in regressorDict.keys():
        #print(key)
        resultList = regressorDict[key]
        new_row = {'ModelName':key, 'TrainScore':resultList[0], 'TestScore':resultList[1], 'MAE':resultList[2] , 'R^2':resultList[3]}
        
        dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
    return dfRegressionResults

In [11]:
regionRegressionList=[]
for i in range(1,len(regionExperimentDict)+1):
    
    if(i>=7):
        i+=1
    #print(i)
    regionRegressionResults = createRegionDFResults(regionExperimentDict[i][0])
    regionRegressionList.append(regionRegressionResults)

  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(n

In [12]:
for i in range(len(regionRegressionList)):
    dfTemp = regionRegressionList[i]
    regionNum= i+1
    if(regionNum>=7):
        regionNum+=1
    print(f"\nRegion {regionNum}")
    print(dfTemp.head())


Region 1
                   ModelName  TrainScore  TestScore       MAE       R^2
0      RandomForestRegressor    0.979877   0.864012  0.038844  0.864012
1  GradientBoostingRegressor    0.974369   0.853519  0.039202  0.853519
2      DecisionTreeRegressor    1.000000   0.830969  0.038891  0.830969
3        ExtraTreesRegressor    1.000000   0.856232  0.038612  0.856232
4                        SVR    0.023670   0.006796  0.161339  0.006796

Region 2
                   ModelName  TrainScore  TestScore       MAE       R^2
0      RandomForestRegressor    0.960029   0.639851  0.046770  0.639851
1  GradientBoostingRegressor    0.905182   0.665856  0.045329  0.665856
2      DecisionTreeRegressor    1.000000   0.207312  0.055869  0.207312
3        ExtraTreesRegressor    1.000000   0.817728  0.035026  0.817728
4                        SVR   -0.023216  -0.046814  0.130777 -0.046814

Region 3
                   ModelName  TrainScore  TestScore       MAE       R^2
0      RandomForestRegressor    0.

In [16]:
#TODO select best model and put in region table
dfRegionModelResults = pd.DataFrame(columns=['Region','ModelName','MAE','R^2'])
r1 = {'Region':1 ,'ModelName':"RandomForestRegressor", 'MAE':0.038844   , 'R^2':0.864012}
r2 = {'Region':2 ,'ModelName':"ExtraTreesRegressor", 'MAE':0.035026  , 'R^2':0.817728}
r3 = {'Region':3 ,'ModelName':"ExtraTreesRegressor", 'MAE':0.032393   , 'R^2':0.903682}
r4 = {'Region':4 ,'ModelName':"ExtraTreesRegressor", 'MAE':0.075709   , 'R^2':0.811049}
r5 = {'Region':5 ,'ModelName':"GradientBoostingRegressor", 'MAE':0.020157  , 'R^2':0.901066}
r6 = {'Region':6 ,'ModelName':"ExtraTreesRegressor", 'MAE':0.046512   , 'R^2':0.890339}
r8 = {'Region':8 ,'ModelName':"RandomForestRegressor", 'MAE':0.007910     , 'R^2':0.888664}
r9 = {'Region':9 ,'ModelName':"ExtraTreesRegressor ", 'MAE':0.006403    , 'R^2':0.877449}
r10 = {'Region':10 ,'ModelName':"GradientBoostingRegressor", 'MAE':0.093626, 'R^2':0.879954}
rowList = [r1,r2,r3,r4,r5,r6,r8,r9,r10]
for row in rowList: 
    dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)

  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)


In [18]:
dfRegionModelResults.head(10)

Unnamed: 0,Region,ModelName,MAE,R^2
0,1,RandomForestRegressor,0.038844,0.864012
1,2,ExtraTreesRegressor,0.035026,0.817728
2,3,ExtraTreesRegressor,0.032393,0.903682
3,4,ExtraTreesRegressor,0.075709,0.811049
4,5,GradientBoostingRegressor,0.020157,0.901066
5,6,ExtraTreesRegressor,0.046512,0.890339
6,8,RandomForestRegressor,0.00791,0.888664
7,9,ExtraTreesRegressor,0.006403,0.877449
8,10,GradientBoostingRegressor,0.093626,0.879954


## Running all Classification Models for Fire Cause Prediction

In [8]:
#df = df.drop(columns=['fire_name','Unnamed: 0', 'Unnamed: 0.1','disc_date_final','cont_date_final','cont_clean_date','putout_time', 'fire_size_class', 'fire_mag', 'weather_file', 'state'])
df = df.drop(columns=['disc_clean_date', 'discovery_month', 'disc_date_pre', 'disc_pre_year', 'disc_pre_month']) # drop date columns
df = df.drop(columns=['wstation_usaf', 'dstation_m', 'wstation_wban', 'wstation_byear', 'wstation_eyear']) # drop station columns

df = df.loc[df['fire_size'] < 5000]

In [9]:
from utils import reduce_cause_labels
# Group campfire, 
df = reduce_cause_labels(df)

df['stat_cause_descr'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stat_cause_descr'] = df['stat_cause_descr'].apply(lambda x: 'Other' if (x in reduced_labels) else x)


array(['Arson', 'Debris Burning', 'Other', 'Miscellaneous', 'Lightning',
       'Equipment Use'], dtype=object)

In [10]:
cause_encoded_dist = {
    'Missing/Undefined':0, 
    'Arson':1, 
    'Debris Burning':2, 
    'Miscellaneous':3,
    'Campfire':4, 
    'Fireworks':5, 
    'Children':6, 
    'Lightning':7, 
    'Equipment Use':8,
    'Smoking':9, 
    'Railroad':10, 
    'Structure':11, 
    'Powerline':12,
    'Other':13
}

# Encode categorical values to numeric
df['stat_cause_descr'] = df['stat_cause_descr'].apply(lambda x: cause_encoded_dist[x]).astype('int')
df.head()

Unnamed: 0,fire_size,fire_size_class,stat_cause_descr,latitude,longitude,state,Vegetation,fire_mag,weather_file,Temp_pre_30,Temp_pre_15,Temp_pre_7,Temp_cont,Wind_pre_30,Wind_pre_15,Wind_pre_7,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness,day,month,year,stat_cause_desc,Region
1,0.000499,B,1,35.03833,-87.61,TN,15,0.1,723235-13896-2006.gz,7.553433,7.01,0.343529,10.448298,2.709764,2.881707,1.976471,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355,11,12,2006,1,8
2,0.011914,C,1,34.9478,-88.7225,MS,16,1.0,723235-13896-2004.gz,4.97193,5.782766,5.55875,13.6966,3.364499,2.92383,2.695833,3.36905,75.531629,75.868613,76.812834,65.0638,168.8,42.2,18.1,124.5,0.194544,29,2,2004,1,8
3,9.8e-05,B,2,39.6414,-119.3083,NV,0,0.1,724880-23185-2005.gz,16.275967,18.996181,18.142564,17.804904,4.054982,3.398329,3.671282,3.708198,44.778429,37.140811,35.353846,39.091029,10.4,7.2,0.0,0.0,0.487447,6,6,2005,2,4
6,0.000939,B,2,31.316978,-83.393649,GA,12,0.1,747810-13857-2008.gz,14.877341,16.409326,16.610281,15.96565,2.000214,1.727202,3.13624,2.287885,79.896679,73.431818,59.811044,71.046514,26.0,0.0,0.0,0.0,0.148904,10,1,2009,2,8
7,9.8e-05,B,13,30.90472,-93.5575,TX,12,0.1,722820-99999-2005.gz,16.851939,16.997783,20.434783,11.98556,1.331257,1.472949,1.424783,2.148857,72.899478,75.061381,77.924623,70.732911,28.4,27.5,1.2,55.4,0.241894,12,11,2005,4,8


In [33]:
def exp1(df):
    X = df[['stat_cause_desc', 'latitude', 'longitude', 'Vegetation', 'Temp_cont', 'Wind_cont', 'Hum_cont', 'Prec_cont', 'remoteness']]
    Y = df['stat_cause_desc']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=50)
    return X_train,X_test,y_train,y_test

In [34]:
classifier_dict={}
for i in range(len(dfRegionList)):
    dfRegion = dfRegionList[i]
    #print(dfRegion)
    regionCount =i+1
    if(regionCount>=7):
        regionCount+=1
    X_train,X_test,y_train,y_test = exp1(dfRegion)
    #print(X_train.shape)
    #print(X_test.shape)
    #print(y_train.shape)
    #print(y_test.shape)
    print(f"\n\n____Running all experiments for Region {regionCount}____")
    classifier_1=run_all_classifiers(X_train,X_test,y_train,y_test)
    classifier_dict[regionCount]=classifier_1



____Running all experiments for Region 1____
Running RandomForestClassifier
[7 7 1 7 7]
19595    7
52147    7
323      1
52169    7
17256    7
Name: stat_cause_desc, dtype: int32
Accuracy:  0.9419354838709677
Sensitivity:  0.9419354838709677
F1 Score:  0.9293842325050128
Running GradientBoostingClassifier
[7 7 1 7 7]
19595    7
52147    7
323      1
52169    7
17256    7
Name: stat_cause_desc, dtype: int32
Accuracy:  1.0
Sensitivity:  1.0
F1 Score:  1.0
Running DecisionTreeClassifier
[7 7 1 7 7]
19595    7
52147    7
323      1
52169    7
17256    7
Name: stat_cause_desc, dtype: int32
Accuracy:  1.0
Sensitivity:  1.0
F1 Score:  1.0
Running ExtraTreesClassifier
[7 7 1 7 7]
19595    7
52147    7
323      1
52169    7
17256    7
Name: stat_cause_desc, dtype: int32
Accuracy:  0.967741935483871
Sensitivity:  0.967741935483871
F1 Score:  0.9652211833635638
Running SVC
[7 7 1 7 3]
19595    7
52147    7
323      1
52169    7
17256    7
Name: stat_cause_desc, dtype: int32
Accuracy:  0.4516129

  _warn_prf(average, modifier, msg_start, len(result))


[7 0 7 7 7]
36821    7
10471    0
54446    7
12436    7
50131    7
Name: stat_cause_desc, dtype: int32
Accuracy:  1.0
Sensitivity:  1.0
F1 Score:  1.0
Running DecisionTreeClassifier
[7 0 7 7 7]
36821    7
10471    0
54446    7
12436    7
50131    7
Name: stat_cause_desc, dtype: int32
Accuracy:  1.0
Sensitivity:  1.0
F1 Score:  1.0
Running ExtraTreesClassifier
[7 0 7 7 7]
36821    7
10471    0
54446    7
12436    7
50131    7
Name: stat_cause_desc, dtype: int32
Accuracy:  1.0
Sensitivity:  1.0
F1 Score:  1.0
Running SVC
[7 7 7 7 7]
36821    7
10471    0
54446    7
12436    7
50131    7
Name: stat_cause_desc, dtype: int32
Accuracy:  0.7368421052631579
Sensitivity:  0.7368421052631579
F1 Score:  0.6251993620414672


In [35]:
for i in range(9):
    regionCt = i+1
    if(regionCt>=7):
        regionCt+=1
    print(f"Results from Region {regionCt}")
    print(classifier_dict[regionCt])


Results from Region 1
{'RandomForestClassifier': [0.9419354838709677, 0.9419354838709677, 0.9293842325050128], 'GradientBoostingClassifier': [1.0, 1.0, 1.0], 'DecisionTreeClassifier': [1.0, 1.0, 1.0], 'ExtraTreesClassifier': [0.967741935483871, 0.967741935483871, 0.9652211833635638], 'SVC': [0.45161290322580644, 0.45161290322580644, 0.3479642886016886]}
Results from Region 2
{'RandomForestClassifier': [0.9811320754716981, 0.9811320754716981, 0.9809047336476157], 'GradientBoostingClassifier': [1.0, 1.0, 1.0], 'DecisionTreeClassifier': [1.0, 1.0, 1.0], 'ExtraTreesClassifier': [0.9924528301886792, 0.9924528301886792, 0.9922666262106853], 'SVC': [0.4226415094339623, 0.4226415094339623, 0.28194034077422453]}
Results from Region 3
{'RandomForestClassifier': [0.9727891156462585, 0.9727891156462585, 0.9687191115762543], 'GradientBoostingClassifier': [0.9931972789115646, 0.9931972789115646, 0.9898295952044185], 'DecisionTreeClassifier': [1.0, 1.0, 1.0], 'ExtraTreesClassifier': [0.98639455782312

In [36]:
def createRegionDFResults(classifier_dict):
    dfClassificationResults = pd.DataFrame(columns=['ModelName','Accuracy','Sensitivity','F1 Score'])
    for key in classifier_dict.keys():
        #print(key)
        resultList = classifier_dict[key]
        new_row = {'ModelName':key, 'Accuracy':resultList[0], 'Sensitivity':resultList[1], 'F1 Score':resultList[2]}
        
        dfClassificationResults = dfClassificationResults.append(new_row, ignore_index=True)
    return dfClassificationResults

In [53]:
regionClassifierList=[]
for i in range(1,len(classifier_dict)+1):
    if(i>=7):
        i+=1
    print(classifier_dict[1])
    regionClassificationResults = createRegionDFResults(classifier_dict[i])
    regionClassifierList.append(regionClassificationResults)

{'RandomForestClassifier': [0.9419354838709677, 0.9419354838709677, 0.9293842325050128], 'GradientBoostingClassifier': [1.0, 1.0, 1.0], 'DecisionTreeClassifier': [1.0, 1.0, 1.0], 'ExtraTreesClassifier': [0.967741935483871, 0.967741935483871, 0.9652211833635638], 'SVC': [0.45161290322580644, 0.45161290322580644, 0.3479642886016886]}
{'RandomForestClassifier': [0.9419354838709677, 0.9419354838709677, 0.9293842325050128], 'GradientBoostingClassifier': [1.0, 1.0, 1.0], 'DecisionTreeClassifier': [1.0, 1.0, 1.0], 'ExtraTreesClassifier': [0.967741935483871, 0.967741935483871, 0.9652211833635638], 'SVC': [0.45161290322580644, 0.45161290322580644, 0.3479642886016886]}
{'RandomForestClassifier': [0.9419354838709677, 0.9419354838709677, 0.9293842325050128], 'GradientBoostingClassifier': [1.0, 1.0, 1.0], 'DecisionTreeClassifier': [1.0, 1.0, 1.0], 'ExtraTreesClassifier': [0.967741935483871, 0.967741935483871, 0.9652211833635638], 'SVC': [0.45161290322580644, 0.45161290322580644, 0.3479642886016886]

  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(n

In [49]:
for i in range(len(regionClassifierList)):
    dfTemp = regionClassifierList[i]
    regionNum= i+1
    if(regionNum>=7):
        regionNum+=1
    print(f"\nRegion {regionNum}")
    print(dfTemp.head())



Region 1
                    ModelName  Accuracy  Sensitivity  F1 Score
0      RandomForestClassifier  0.941935     0.941935  0.929384
1  GradientBoostingClassifier  1.000000     1.000000  1.000000
2      DecisionTreeClassifier  1.000000     1.000000  1.000000
3        ExtraTreesClassifier  0.967742     0.967742  0.965221
4                         SVC  0.451613     0.451613  0.347964

Region 2
                    ModelName  Accuracy  Sensitivity  F1 Score
0      RandomForestClassifier  0.981132     0.981132  0.980905
1  GradientBoostingClassifier  1.000000     1.000000  1.000000
2      DecisionTreeClassifier  1.000000     1.000000  1.000000
3        ExtraTreesClassifier  0.992453     0.992453  0.992267
4                         SVC  0.422642     0.422642  0.281940

Region 3
                    ModelName  Accuracy  Sensitivity  F1 Score
0      RandomForestClassifier  0.972789     0.972789  0.968719
1  GradientBoostingClassifier  0.993197     0.993197  0.989830
2      DecisionTreeClassi

### Hyper-parameter tuning: TODO Have not done

In [55]:
X1 = df[['Vegetation','remoteness','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_desc', 'longitude']]
y = df["fire_size"] 
#train test split
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
df1 = [X1_train, X1_test, y_train, y_test]
rf_reg = RandomForestRegressor()
search_grid={'n_estimators':[50,100,200],'max_depth':[2,5,8,10]}
search=GridSearchCV(estimator=rf_reg,param_grid=search_grid,scoring='neg_mean_absolute_error',n_jobs=1,cv=5, verbose=1)
search.fit(df1[0], df1[2])
print(search.best_score_)
print(search.best_params_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
-0.015593356087373897
{'max_depth': 10, 'n_estimators': 200}


In [56]:
def exp2(df):
    X1 = df[['Vegetation','remoteness','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_desc', 'longitude']]
    y = df["fire_size"] 
    X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
    df1 = [X1_train, X1_test, y_train, y_test]
    return df1

In [57]:
# Random Forest Regressor 
for i in range(len(dfRegionList)):
    dfRegion = dfRegionList[i]
    regionCount =i+1
    if(regionCount>=7):
        regionCount+=1
    
    print(f"\n\n____Running experiment for Region {regionCount}____")
    rf_reg = RandomForestRegressor(n_estimators = 200, max_depth=10)
    df1=exp2(dfRegion)
    # fit the regressor with x and y data
    rf_reg.fit(df1[0], df1[2])

    predictions = rf_reg.predict(df1[1])
    print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
    print('R Squared:', metrics.r2_score(df1[3], predictions))



____Running experiment for Region 1____
Mean Absolute Error: 0.03928329609061914
R Squared: 0.8639611781926088


____Running experiment for Region 2____
Mean Absolute Error: 0.0448926945558235
R Squared: 0.6397930534890099


____Running experiment for Region 3____
Mean Absolute Error: 0.03356649736897054
R Squared: 0.8886079601290942


____Running experiment for Region 4____
Mean Absolute Error: 0.07595062330613232
R Squared: 0.8090009105910531


____Running experiment for Region 5____
Mean Absolute Error: 0.021513052648958936
R Squared: 0.8869818244457384


____Running experiment for Region 6____
Mean Absolute Error: 0.04334227585785689
R Squared: 0.8909698645176388


____Running experiment for Region 8____
Mean Absolute Error: 0.007296237855203604
R Squared: 0.883978920989881


____Running experiment for Region 9____
Mean Absolute Error: 0.00557219127103187
R Squared: 0.6855779326572252


____Running experiment for Region 10____
Mean Absolute Error: 0.09629941381244259
R Squared: 0

### Neural network model (from other nb) will run later

In [68]:

from tensorflow.keras.models import load_model
import autokeras as ak
loaded_model = load_model("model_autokeras", custom_objects=ak.CUSTOM_OBJECTS)

In [69]:
def exp3(df):
    X1 = df[['Vegetation','remoteness','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_desc', 'longitude']]
    y = df["fire_size"] 

    X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
    df1 = [X1_train, X1_test, y_train, y_test]
    return df1

In [None]:
for i in range(len(dfRegionList)):
    dfRegion=dfRegionList[i]
    X_train,X_test,y_train,y_test = exp3(dfRegion)
    regionCount =i+1
    if(regionCount>=7):
        regionCount+=1
    print(f"\n\n____Running all experiments for Region {regionCount}____")
    history = loaded_model.fit(
        X_train,
        y_train,
        epochs=20,
        validation_split=0.15,
        callbacks=[tf.keras.callbacks.EarlyStopping(
            monitor='val_mae',
            patience=20,
            mode='min',
            restore_best_weights=True
        )]
    )

In [None]:
test_loss, test_acc, test_mae = loaded_model.evaluate(x=X_test, y=y_test, verbose=0)
print('Mean Absolute Error: {acc:0.3f}'.format(acc=test_mae))
print('accuracy: {acc:0.3f}'.format(acc=test_acc))
print('loss: {acc:0.3f}'.format(acc=test_loss))

In [None]:
print(f"Average Fire Size: {df.fire_size.mean()}")
print(f"Standard Deviation of Fire Size: {df.fire_size.std()}")


### end here