In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import datetime

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import plot_partial_dependence

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Normalization
from tensorflow.keras import regularizers
import tensorflow.keras as keras
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import regularizers

import plotly.express as px

from utils import run_all_regressors
%load_ext autoreload
%autoreload 2

In [2]:
pd.set_option('display.max_columns', None)

Data Pre-processing and EDA(Exploratory data analysis)
Data Cleaning and filtering data which has firesize <5000 as number of small fires are high.

In [3]:
def akReadDf():
    # Reading the combined CSV files
    df = pd.read_csv('Wildfire.csv')
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1','disc_date_final','cont_date_final','cont_clean_date','putout_time'])
    df['disc_clean_date'] = pd.to_datetime(df['disc_clean_date'], format='%m/%d/%Y')

    #Get rid of outliers - fires of size larger than 5000 acres, and there are large number of small fires and other very less number are having the high 
    # area of fires, because of which the deviation is very high
    df = df.loc[df['fire_size'] < 5000]
    df.columns

    ################
    df['Vegetation'] = df['Vegetation'].astype('category')
    df['Cause'] = df['stat_cause_descr'].astype('category')

    df = pd.get_dummies(df,prefix=['Vegetation'], columns = ['Vegetation'], drop_first=True)
    df = pd.get_dummies(df,prefix=['Cause'], columns = ['stat_cause_descr'], drop_first=True)

    ################
    df_numerics_only = df.select_dtypes(include=np.number)

    corr = df_numerics_only.corr()
    ax = sns.heatmap(
        corr, 
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(220, 20, n=200),
        square=True
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right'
    );
    sns.set(rc={'figure.figsize':(15,15)})

    ################
    #Dealing with missing data
    print(len(df))

    # drop columns where weather_file is missing in the data, as it wont have the weather situation at that time, so its where ever data is 
    #missing we can remove those rows as it wont be useful
    index = df[df['weather_file'] == 'File Not Found'].index
    df.drop(index, inplace = True)
    print(len(df))


    ################
    # Weather data has a lot of 0 and values some of which may be missing values,
    # Mark '0' values in weather columns as Na (to see how many there are) 
    # As 0 wont add any value to the data, we are converting to NA and then removing them which will make data set
    subset0 = ['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont']
    df[subset0] = df[subset0].replace({0:np.nan, '0':np.nan})
    print(len(df))

    # Mark '-1' as missing
    subset_neg1 = ['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont']
    df[subset_neg1] = df[subset_neg1].replace({-1:np.nan})

    # Drop observations where all weather columns are 0
    df = df.dropna(how='all',
                        subset=['Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont',])
    print(len(df))
    # This leaves us with 38,689 observations  +/- 3,000  to work with (originally we had 50,000)

    ################
    # fill the 'pre' columns temp wind and humidity with mean values
    subset_fill_mean = ['Temp_pre_30','Temp_pre_15','Temp_pre_7', 'Wind_pre_30','Wind_pre_15','Wind_pre_7', 'Hum_pre_30', 'Hum_pre_15','Hum_pre_7']
    df[subset_fill_mean] = df[subset_fill_mean].fillna(df[subset_fill_mean].mean())

    # Fill NAs in the date of fire containment based on mean values from previous days
    for col in ['Temp','Wind','Hum']:
        df[f'{col}_cont'] = df.apply(
            lambda row: (row[f'{col}_pre_7']+row[f'{col}_pre_15']+row[f'{col}_pre_30'])/3 if np.isnan(row[f'{col}_cont']) else row[f'{col}_cont'],
            axis=1)

## Load DF 

In [4]:
from loadDFRegion import getDF
df,dfRegionList = getDF()

Grabbing region 1 dataframe...
Grabbing region 2 dataframe...
Grabbing region 3 dataframe...
Grabbing region 4 dataframe...
Grabbing region 5 dataframe...
Grabbing region 6 dataframe...
Grabbing region 8 dataframe...
Grabbing region 9 dataframe...
Grabbing region 10 dataframe...


In [5]:
#Sanity Check !!
sum1=0
ct=1
for dftemp in dfRegionList:
    if(ct>=7):
        ct+=1
    #print(ct, dftemp.size)
    sum1+= dftemp.size
print(df.size, sum1)
assert(sum1==df.size)

1445578 1445578


## Setting up Experiments

In [6]:
def experimentCreation_FireSizePrediction(df,yTarget):
    """Function return all experiments splitting data by yTarget name: 
        regression - fire_size
        classification - fire_cause
    """
    
    # Experiment 1 
    """
    - which will select all teh available  features from the dataset
    -Features included - variables related to Vegetation,Temperature, Humidity, Wind, Precipitation, cause of  fire, longitude and latitude
    - we have 34 variables  for x-variables  to which we are gonna target one y-variable which is fire_size
    - selecting features and target variables"""
    X1 = df[['Vegetation','remoteness','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_desc', 'longitude']]
    #X1 = df[['Vegetation_4','remoteness', 'Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
    # X1 = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]

    y = df[yTarget] 

    #train test split
    X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
    df1 = [X1_train, X1_test, y_train, y_test]
    #######################
    #Experiment type 2 
    """-Include only long, lat, vegetation, cause and pre- weather data, without cont
    - which is the data set where I removed the variables  on which the fire is  containining on the day
    - removed 4 variables
    - selecting features and target variables"""
    X2 = df[['Vegetation','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7','stat_cause_desc', 'longitude']]
    #X2 = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
    y = df[yTarget]

    #train test split
    X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)
    df2 = [X2_train, X2_test, y_train, y_test]
    ########################
    #Experiment 3 
    #- Including only lat, long and weather pre- data
    #When I have done the feature importance, I got to know that the cause and vegetation is not that important, so here we removed the 2 
    #  selecting features and target variables
    X3 = df[['latitude','longitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Wind_pre_30','Wind_pre_15','Wind_pre_7','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Prec_pre_30','Prec_pre_15','Prec_pre_7']]
    y = df[yTarget]

    #train test split
    X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state=42)
    df3 = [X3_train, X3_test, y_train, y_test]
    
    ########################
    #Experiment 4 
    #with experiment 1 data with normalization
    # have done the minMax normalization for the experiment 1 data frame.
    df_4 = df[['Vegetation','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','stat_cause_desc', 'longitude']]
    names = df_4.columns

    # normalizing data
    df_4 = preprocessing.normalize(df_4)
    scaled_df = pd.DataFrame(df_4, columns=names)

    #train test split
    X4_train, X4_test, y_train, y_test = train_test_split(scaled_df, y, test_size=0.2, random_state=42)
    df4 = [X4_train, X4_test, y_train, y_test]
    
    return [df1,df2,df3,df4]

## Running all regression models on all experiments for Prediction on Fire Size

In [9]:
#regionCount=1
regionExperimentDict = {}
for i in range(len(dfRegionList)):
    dfRegion = dfRegionList[i]
    regionCount =i+1
    if(regionCount>=7):
        regionCount+=1
    
    experimentList = experimentCreation_FireSizePrediction(dfRegion,"fire_size")
    print(f"\n\n____Running all experiments for Region {regionCount}____")
    print("____________________________________________")
    print("____________________________________________")
    experimentListOfDictionaries = []
    for i in range(len(experimentList)):
        #if(i not in [1,2,3]):
        experiment=experimentList[i]
        print(f"\n--------------Experiment {i+1}--------------")
        #print(experiment[0].shape,experiment[1].shape,experiment[2].shape,experiment[3].shape)
        regressorDict = run_all_regressors(experiment[0],experiment[2],experiment[1],experiment[3])
        if(i not in [1,2,3]):
            experimentListOfDictionaries.append(regressorDict)
    print("____________________________________________")
    
    regionExperimentDict[regionCount]= experimentListOfDictionaries
    



____Running all experiments for Region 1____
____________________________________________
____________________________________________

--------------Experiment 1--------------
Running RandomForestRegressor
Score on training data: 0.9802185520772947
Score on testing data: 0.8679309062586006
Mean Absolute Error:  0.038127723630631744
R Squared:  0.8679309062586006

Running GradientBoostingRegressor
Score on training data: 0.9743691746910226
Score on testing data: 0.854812394338564
Mean Absolute Error:  0.038877218327901746
R Squared:  0.854812394338564

Running DecisionTreeRegressor
Score on training data: 0.9999999760149556
Score on testing data: 0.8298083336446657
Mean Absolute Error:  0.038144774059926764
R Squared:  0.8298083336446657

Running ExtraTreesRegressor
Score on training data: 0.9999999760149556
Score on testing data: 0.8599808485903878
Mean Absolute Error:  0.03788734458200143
R Squared:  0.8599808485903878

Running SVR
Score on training data: 0.023669598559045824
Score

Score on training data: 0.9807379068090071
Score on testing data: 0.8875269462244004
Mean Absolute Error:  0.03442766976846553
R Squared:  0.8875269462244004

Running GradientBoostingRegressor
Score on training data: 0.990398266838978
Score on testing data: 0.8702734948362223
Mean Absolute Error:  0.03135386501707144
R Squared:  0.8702734948362223

Running DecisionTreeRegressor
Score on training data: 0.9999999999786066
Score on testing data: 0.8065455868788571
Mean Absolute Error:  0.03869420421057478
R Squared:  0.8065455868788571

Running ExtraTreesRegressor
Score on training data: 0.9999999999786066
Score on testing data: 0.9054129813653979
Mean Absolute Error:  0.0316598159929155
R Squared:  0.9054129813653979

Running SVR
Score on training data: 0.007271681249555018
Score on testing data: -0.015003288967290862
Mean Absolute Error:  0.15552859354597787
R Squared:  -0.015003288967290862


--------------Experiment 2--------------
Running RandomForestRegressor
Score on training data:

Score on training data: 0.9540695813207151
Score on testing data: 0.9044306453058064
Mean Absolute Error:  0.019805321806126437
R Squared:  0.9044306453058064

Running DecisionTreeRegressor
Score on training data: 0.9999998887213692
Score on testing data: 0.7374743493087104
Mean Absolute Error:  0.027559920511661135
R Squared:  0.7374743493087104

Running ExtraTreesRegressor
Score on training data: 0.9999998887213692
Score on testing data: 0.8534402129562957
Mean Absolute Error:  0.02548071138079128
R Squared:  0.8534402129562957

Running SVR
Score on training data: -0.08044109200393224
Score on testing data: -0.09296565901862852
Mean Absolute Error:  0.12595148640027584
R Squared:  -0.09296565901862852


--------------Experiment 2--------------
Running RandomForestRegressor
Score on training data: 0.8704237807488753
Score on testing data: 0.0888957673313987
Mean Absolute Error:  0.08360321088669871
R Squared:  0.0888957673313987

Running GradientBoostingRegressor
Score on training dat

Score on training data: 0.9999999593515394
Score on testing data: 0.7857389973953917
Mean Absolute Error:  0.008971682911362138
R Squared:  0.7857389973953917

Running ExtraTreesRegressor
Score on training data: 0.9999999593515394
Score on testing data: 0.876526794373302
Mean Absolute Error:  0.008199328428659086
R Squared:  0.876526794373302

Running SVR
Score on training data: -1.0017405066900151
Score on testing data: -1.165920730739057
Mean Absolute Error:  0.10287845889107189
R Squared:  -1.165920730739057


--------------Experiment 2--------------
Running RandomForestRegressor
Score on training data: 0.8726509307078445
Score on testing data: 0.0500535508593104
Mean Absolute Error:  0.020713326609058447
R Squared:  0.0500535508593104

Running GradientBoostingRegressor
Score on training data: 0.28823817508317195
Score on testing data: 0.024595106115973486
Mean Absolute Error:  0.020041400989796454
R Squared:  0.024595106115973486

Running DecisionTreeRegressor
Score on training dat

Score on training data: 1.0
Score on testing data: 0.8594937127282372
Mean Absolute Error:  0.10701534449296464
R Squared:  0.8594937127282372

Running SVR
Score on training data: 0.22517469511566346
Score on testing data: 0.11264268632147412
Mean Absolute Error:  0.32332386597499285
R Squared:  0.11264268632147412


--------------Experiment 2--------------
Running RandomForestRegressor
Score on training data: 0.8964035012003645
Score on testing data: 0.18863396938046073
Mean Absolute Error:  0.29272049975739933
R Squared:  0.18863396938046073

Running GradientBoostingRegressor
Score on training data: 0.9225220625042893
Score on testing data: -0.0046979668001838615
Mean Absolute Error:  0.3173759119203613
R Squared:  -0.0046979668001838615

Running DecisionTreeRegressor
Score on training data: 1.0
Score on testing data: -0.545343964976077
Mean Absolute Error:  0.31994703218502346
R Squared:  -0.545343964976077

Running ExtraTreesRegressor
Score on training data: 1.0
Score on testing da

In [10]:
for i in range(9):
    regionCt = i+1
    if(regionCt>=7):
        regionCt+=1
    print(f"Results from Region {regionCt}")
    print(regionExperimentDict[regionCt])

Results from Region 1
[{'RandomForestRegressor': [0.9802185520772947, 0.8679309062586006, 0.038127723630631744, 0.8679309062586006], 'GradientBoostingRegressor': [0.9743691746910226, 0.854812394338564, 0.038877218327901746, 0.854812394338564], 'DecisionTreeRegressor': [0.9999999760149556, 0.8298083336446657, 0.038144774059926764, 0.8298083336446657], 'ExtraTreesRegressor': [0.9999999760149556, 0.8599808485903878, 0.03788734458200143, 0.8599808485903878], 'SVR': [0.023669598559045824, 0.006796051146053861, 0.16133878041211616, 0.006796051146053861]}]
Results from Region 2
[{'RandomForestRegressor': [0.9597396807287967, 0.6404695740972389, 0.04751856221962242, 0.6404695740972389], 'GradientBoostingRegressor': [0.9051818852303684, 0.672108212602953, 0.044908979937677854, 0.672108212602953], 'DecisionTreeRegressor': [1.0, 0.2861918342799159, 0.05205152538207517, 0.2861918342799159], 'ExtraTreesRegressor': [1.0, 0.8148825110492605, 0.03486456474767063, 0.8148825110492605], 'SVR': [-0.023215

In [11]:
def createRegionDFResults(regressorDict):
    dfRegressionResults = pd.DataFrame(columns=['ModelName','TrainScore','TestScore','MAE','R^2'])
    for key in regressorDict.keys():
        #print(key)
        resultList = regressorDict[key]
        new_row = {'ModelName':key, 'TrainScore':resultList[0], 'TestScore':resultList[1], 'MAE':resultList[2] , 'R^2':resultList[3]}
        
        dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
    return dfRegressionResults

In [12]:
regionRegressionList=[]
for i in range(1,len(regionExperimentDict)+1):
    
    if(i>=7):
        i+=1
    #print(i)
    regionRegressionResults = createRegionDFResults(regionExperimentDict[i][0])
    regionRegressionList.append(regionRegressionResults)

  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(n

  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(new_row, ignore_index=True)
  dfRegressionResults = dfRegressionResults.append(n

In [13]:
for i in range(len(regionRegressionList)):
    dfTemp = regionRegressionList[i]
    regionNum= i+1
    if(regionNum>=7):
        regionNum+=1
    print(f"\nRegion {regionNum}")
    print(dfTemp.head())


Region 1
                   ModelName  TrainScore  TestScore       MAE       R^2
0      RandomForestRegressor    0.980219   0.867931  0.038128  0.867931
1  GradientBoostingRegressor    0.974369   0.854812  0.038877  0.854812
2      DecisionTreeRegressor    1.000000   0.829808  0.038145  0.829808
3        ExtraTreesRegressor    1.000000   0.859981  0.037887  0.859981
4                        SVR    0.023670   0.006796  0.161339  0.006796

Region 2
                   ModelName  TrainScore  TestScore       MAE       R^2
0      RandomForestRegressor    0.959740   0.640470  0.047519  0.640470
1  GradientBoostingRegressor    0.905182   0.672108  0.044909  0.672108
2      DecisionTreeRegressor    1.000000   0.286192  0.052052  0.286192
3        ExtraTreesRegressor    1.000000   0.814883  0.034865  0.814883
4                        SVR   -0.023216  -0.046814  0.130777 -0.046814

Region 3
                   ModelName  TrainScore  TestScore       MAE       R^2
0      RandomForestRegressor    0.

In [14]:
"""dfRegion1 size: 1027
dfRegion2 size: 1766
dfRegion3 size: 975
dfRegion4 size: 1482
dfRegion5 size: 2567
dfRegion6 size: 832
dfRegion8 size: 22069
dfRegion9 size: 4293
dfRegion10 size: 247"""
#TODO select best model and put in region table
dfRegionModelResults = pd.DataFrame(columns=['Region','RegionSize','ModelName','MAE','R^2'])
r1 = {'Region':1 ,'ModelName':"RandomForestRegressor", 'RegionSize':1027,'MAE':0.038844   , 'R^2':0.864012}
r2 = {'Region':2 ,'ModelName':"ExtraTreesRegressor", 'RegionSize':1766,'MAE':0.035026  , 'R^2':0.817728}
r3 = {'Region':3 ,'ModelName':"ExtraTreesRegressor", 'RegionSize':975,'MAE':0.032393   , 'R^2':0.903682}
r4 = {'Region':4 ,'ModelName':"ExtraTreesRegressor", 'RegionSize':1482,'MAE':0.075709   , 'R^2':0.811049}
r5 = {'Region':5 ,'ModelName':"GradientBoostingRegressor",'RegionSize':2567, 'MAE':0.020157  , 'R^2':0.901066}
r6 = {'Region':6 ,'ModelName':"ExtraTreesRegressor", 'RegionSize':832,'MAE':0.046512   , 'R^2':0.890339}
r8 = {'Region':8 ,'ModelName':"RandomForestRegressor", 'RegionSize':22069,'MAE':0.007910     , 'R^2':0.888664}
r9 = {'Region':9 ,'ModelName':"ExtraTreesRegressor ", 'RegionSize':247,'MAE':0.006403    , 'R^2':0.877449}
r10 = {'Region':10 ,'ModelName':"GradientBoostingRegressor", 'RegionSize':1027,'MAE':0.093626, 'R^2':0.879954}
rowList = [r1,r2,r3,r4,r5,r6,r8,r9,r10]
for row in rowList: 
    dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)

  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)
  dfRegionModelResults = dfRegionModelResults.append(row, ignore_index=True)


In [15]:
dfRegionModelResults.head(10)

Unnamed: 0,Region,RegionSize,ModelName,MAE,R^2
0,1,1027,RandomForestRegressor,0.038844,0.864012
1,2,1766,ExtraTreesRegressor,0.035026,0.817728
2,3,975,ExtraTreesRegressor,0.032393,0.903682
3,4,1482,ExtraTreesRegressor,0.075709,0.811049
4,5,2567,GradientBoostingRegressor,0.020157,0.901066
5,6,832,ExtraTreesRegressor,0.046512,0.890339
6,8,22069,RandomForestRegressor,0.00791,0.888664
7,9,247,ExtraTreesRegressor,0.006403,0.877449
8,10,1027,GradientBoostingRegressor,0.093626,0.879954


## Running all Classification Models for Fire Cause Prediction

In [None]:
# Encode categorical values to numeric
df['stat_cause_descr'] = df['stat_cause_descr'].apply(lambda x: cause_encoded_dist[x]).astype('int')
df.head()

In [None]:
regionCount=1
for dfRegion in dfRegionList:
    if(regionCount>=7):
        regionCount+=1
    
    experimentList = experimentCreation_FireSizePrediction(dfRegion)
    print(f"\n\n____Running all experiments for Region {regionCount}____")
    print("____________________________________________")
    print("____________________________________________")
    for i in range(len(experimentList)):
        experiment=experimentList[i]
        print(f"\n--------------Experiment {i+1}--------------")
        print(experiment[0].size,experiment[1].size,experiment[2].size,experiment[3].size)
        run_all_regressors(experiment[0],experiment[2],experiment[1],experiment[3])
    print("____________________________________________")
    regionCount+=1

In [None]:
print(

### Hyper-parameter tuning: TODO Have not done

In [None]:
rf_reg = RandomForestRegressor()
search_grid={'n_estimators':[50,100,200],'max_depth':[2,5,8,10]}
search=GridSearchCV(estimator=rf_reg,param_grid=search_grid,scoring='neg_mean_absolute_error',n_jobs=1,cv=5, verbose=1)
search.fit(df1[0], df1[2])
print(search.best_score_)
print(search.best_params_)


In [None]:
# Random Forest Regressor 
rf_reg = RandomForestRegressor(n_estimators = 200, max_depth=10)

# fit the regressor with x and y data
rf_reg.fit(df1[0], df1[2])

predictions = rf_reg.predict(df1[1])
print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
print('R Squared:', metrics.r2_score(df1[3], predictions))

### Neural network model (from other nb) will run later

In [None]:
# selecting features and target variables
X = df[['Vegetation_4','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
y = df['fire_size']

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(X_train))

In [None]:
X_train.shape

In [None]:
model = Sequential([
    #normalizer,
    Dense(34, activation="relu", kernel_regularizer = regularizers.l2(0.01), kernel_initializer='normal',input_dim = X_train.shape[1]),
    Dense(34, activation="relu", kernel_regularizer = regularizers.l2(0.01)),
    Dense(64, activation="relu", kernel_regularizer = regularizers.l2(0.01)),
    Dense(32, activation="relu", kernel_regularizer = regularizers.l2(0.01)),
    Dense(1, activation='linear')
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy','mae']
)

In [None]:
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_split=0.15,
    callbacks=[tf.keras.callbacks.EarlyStopping(
        monitor='val_mae',
        patience=20,
        mode='min',
        restore_best_weights=True
    )]
)

In [None]:
test_loss, test_acc, test_mae = model.evaluate(x=X_test, y=y_test, verbose=0)
print('Mean Absolute Error: {acc:0.3f}'.format(acc=test_mae))
print('accuracy: {acc:0.3f}'.format(acc=test_acc))
print('loss: {acc:0.3f}'.format(acc=test_loss))

In [None]:
print(f"Average Fire Size: {df.fire_size.mean()}")
print(f"Standard Deviation of Fire Size: {df.fire_size.std()}")


### end here