In [1248]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import re


In [1249]:
# Load the wildfire observations and discard every row that has a missing value.
df = pd.read_csv('wildfires.csv').dropna()
df

Unnamed: 0,area,class,evi,lat,lon,lst_day,lst_night,time,day
0,1.373816e+07,no_fire,0.216975,33.408854,-117.076050,307.487500,289.228750,2018-06-26,0
1,1.202089e+07,no_fire,0.242957,36.141667,-118.236654,305.820000,283.662857,2018-06-26,0
2,5.452330e+08,no_fire,0.398504,41.055741,-121.923551,300.717102,291.531937,2018-06-26,0
3,4.121447e+07,no_fire,0.265087,35.721354,-118.660329,310.887083,290.742083,2018-06-26,0
4,8.586347e+07,no_fire,0.202601,37.484083,-120.411426,317.291400,294.068200,2018-06-26,0
...,...,...,...,...,...,...,...,...,...
579,6.010443e+06,fire,0.354783,39.276786,-121.670988,309.122857,292.431429,2019-06-26,365
580,6.010443e+06,fire,0.313343,37.986310,-121.193545,310.996667,288.516000,2019-06-26,365
581,1.545542e+07,fire,0.163647,37.479630,-121.172461,315.414444,289.468889,2019-06-26,365
582,9.444982e+06,fire,0.325075,36.456439,-119.159807,313.969091,293.300000,2019-06-26,365


In [1250]:
# Check how many rows belong to each class before modelling.
df['class'].value_counts()

fire       249
no_fire    235
Name: class, dtype: int64

In [1251]:
# Inspect the raw 'time' column (the dates are stored as strings, dtype object).
df.time

0      2018-06-26
1      2018-06-26
2      2018-06-26
3      2018-06-26
4      2018-06-26
          ...    
579    2019-06-26
580    2019-06-26
581    2019-06-26
582    2019-06-26
583    2019-06-26
Name: time, Length: 484, dtype: object

In [1252]:
def date_to_month(datetime):
    """Return the two digits that follow the first '-' in a date string, as an int.

    For ``YYYY-MM-DD`` input this is the month, e.g. ``"2018-06-26"`` -> ``6``.

    Parameters
    ----------
    datetime : str
        Date string containing a ``-`` followed by two digits.

    Returns
    -------
    int
        The parsed two-digit number, always non-negative.

    Raises
    ------
    ValueError
        If the string contains no ``-`` followed by two digits.
    """
    # Capture only the digits. Including the '-' in the matched text makes
    # int() read "-06" as -6, which is why callers had to wrap this in abs().
    match = re.search(r"-(\d\d)", datetime)
    if match is None:
        raise ValueError(f"no '-MM' component found in {datetime!r}")
    return int(match.group(1))

In [1253]:
# abs() keeps the month positive even if date_to_month reads the '-' separator as a minus sign.
df['month'] = abs(df.time.apply(date_to_month))

In [1254]:
# Number of rows per calendar month.
df.month.value_counts()

6     67
8     50
9     46
7     44
3     44
4     38
10    37
5     36
11    33
12    32
1     30
2     27
Name: month, dtype: int64

In [1255]:
# Encode the label as an integer: 1 for 'fire', 0 for 'no_fire'.
# dtype=int pins the indicator column to integers; pandas >= 2.0 otherwise
# returns booleans from get_dummies, which would turn the labels into True/False.
one_hot = pd.get_dummies(df['class'], dtype=int).fire
df['class'] = one_hot
df

Unnamed: 0,area,class,evi,lat,lon,lst_day,lst_night,time,day,month
0,1.373816e+07,0,0.216975,33.408854,-117.076050,307.487500,289.228750,2018-06-26,0,6
1,1.202089e+07,0,0.242957,36.141667,-118.236654,305.820000,283.662857,2018-06-26,0,6
2,5.452330e+08,0,0.398504,41.055741,-121.923551,300.717102,291.531937,2018-06-26,0,6
3,4.121447e+07,0,0.265087,35.721354,-118.660329,310.887083,290.742083,2018-06-26,0,6
4,8.586347e+07,0,0.202601,37.484083,-120.411426,317.291400,294.068200,2018-06-26,0,6
...,...,...,...,...,...,...,...,...,...,...
579,6.010443e+06,1,0.354783,39.276786,-121.670988,309.122857,292.431429,2019-06-26,365,6
580,6.010443e+06,1,0.313343,37.986310,-121.193545,310.996667,288.516000,2019-06-26,365,6
581,1.545542e+07,1,0.163647,37.479630,-121.172461,315.414444,289.468889,2019-06-26,365,6
582,9.444982e+06,1,0.325075,36.456439,-119.159807,313.969091,293.300000,2019-06-26,365,6


In [1233]:
#Undersampling majority class
# Divide by class
df_class_0 = df[df['class'] == 0]
df_class_1 = df[df['class'] == 1]

# Count each class by its label. Unpacking value_counts() positionally is wrong
# here: it is sorted by frequency, so its first value is the majority count,
# not necessarily the count of class 0.
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# Draw the minority-class size from both classes: the smaller class keeps all
# of its rows (shuffled) and the larger class is cut down to match.
n_minority = min(count_class_0, count_class_1)
df_class_0_under = df_class_0.sample(n_minority)
df_class_1_under = df_class_1.sample(n_minority)
df_test_under = pd.concat([df_class_0_under, df_class_1_under], axis=0)

In [1090]:
#Oversampling minority class
# Resample (with replacement) whichever class is smaller up to the size of the
# larger class; the larger class is kept unchanged. Sizes are measured from the
# per-class frames themselves so the result does not depend on which label
# happens to be the majority.
n_majority = max(len(df_class_0), len(df_class_1))
df_class_0_over = (df_class_0 if len(df_class_0) == n_majority
                   else df_class_0.sample(n_majority, replace=True))
df_class_1_over = (df_class_1 if len(df_class_1) == n_majority
                   else df_class_1.sample(n_majority, replace=True))
df_test_over = pd.concat([df_class_0_over, df_class_1_over], axis=0)

In [None]:
# Optional: uncomment exactly one of the next two lines to continue with a rebalanced frame.
#df = df_test_under.copy()
#df = df_test_over.copy()
#df

In [1259]:
# Target vector: the class label column.
y = df['class']
# Feature matrix: every column except the label and the ones excluded from this experiment.
excluded_cols = ['class','time','month','lat','lon','lst_night','day']
x = df.drop(excluded_cols, axis=1)
# Divide area by one million (vectorised, element-wise).
x['area'] = x['area'] / 1000000

In [1260]:
# Display the feature matrix that will be fed to the model.
x

Unnamed: 0,area,evi,lst_day
0,13.738155,0.216975,307.487500
1,12.020886,0.242957,305.820000
2,545.233030,0.398504,300.717102
3,41.214465,0.265087,310.887083
4,85.863469,0.202601,317.291400
...,...,...,...
579,6.010443,0.354783,309.122857
580,6.010443,0.313343,310.996667
581,15.455424,0.163647,315.414444
582,9.444982,0.325075,313.969091


In [1261]:
#splitting
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and every score computed from it, is the same on each run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 42)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows, so no test-set statistics are used during training.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [1262]:
# Confirm the shape (rows, features) of the held-out test matrix.
test_x.shape

(97, 3)

In [1270]:
def check_params(parameters, top_n=7):
    """Grid-search MLP hyper-parameters on the current training split.

    Fits an ``MLPClassifier`` (lbfgs solver, ``max_iter=10000``) for every
    combination in ``parameters`` using 4-fold cross-validation on the
    notebook-level ``train_x`` / ``train_y``.

    Parameters
    ----------
    parameters : dict
        Parameter grid handed to ``GridSearchCV``.
    top_n : int, default 7
        Keep only the rows whose ``rank_test_score`` is at most this value.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``cv_results_``, left in grid order (not sorted
        by rank). Tied configurations share a rank, so the number of rows
        returned is not necessarily ``top_n``.
    """
    mlp = MLPClassifier(solver='lbfgs', max_iter = 10000)
    clf = GridSearchCV(mlp, parameters, cv = 4)
    clf.fit(train_x, train_y)
    grid_results = pd.DataFrame(clf.cv_results_)
    return grid_results[grid_results.rank_test_score <= top_n]

In [1271]:
# Search grid: regularisation strengths crossed with two-hidden-layer layouts.
alpha_grid = [0.01,0.1,0.8,1,2]
layer_grid = [(8,8),(14,14),(28,14),(28,28),(64,28),(64,64)]
params = {'alpha': alpha_grid, 'hidden_layer_sizes': layer_grid}

results = check_params(params)
results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
18,0.162067,0.032778,0.000499,0.000499,1,"(8, 8)","{'alpha': 1, 'hidden_layer_sizes': (8, 8)}",0.618557,0.742268,0.71134,0.6875,0.689922,0.045605,3
24,0.162316,0.066191,0.000249,0.000432,2,"(8, 8)","{'alpha': 2, 'hidden_layer_sizes': (8, 8)}",0.618557,0.731959,0.71134,0.6875,0.687339,0.042769,5
25,0.420919,0.20945,0.000249,0.000432,2,"(14, 14)","{'alpha': 2, 'hidden_layer_sizes': (14, 14)}",0.608247,0.742268,0.701031,0.697917,0.687339,0.048979,5
26,0.45214,0.205328,0.000739,0.000427,2,"(28, 14)","{'alpha': 2, 'hidden_layer_sizes': (28, 14)}",0.618557,0.752577,0.690722,0.708333,0.692506,0.048354,1
27,0.441646,0.177019,0.000263,0.000264,2,"(28, 28)","{'alpha': 2, 'hidden_layer_sizes': (28, 28)}",0.608247,0.742268,0.701031,0.71875,0.692506,0.050884,1
28,1.114052,0.394536,0.000748,0.000432,2,"(64, 28)","{'alpha': 2, 'hidden_layer_sizes': (64, 28)}",0.608247,0.731959,0.690722,0.708333,0.684755,0.04661,7
29,2.041764,0.750154,0.000499,0.000499,2,"(64, 64)","{'alpha': 2, 'hidden_layer_sizes': (64, 64)}",0.608247,0.742268,0.701031,0.708333,0.689922,0.049737,3


In [1274]:
#MLP Classifier Function
#solver: 'Adam' works better for bigger datasets, 'lbfgs' faster and better for smaller
#regularization parameter: paper did between 0.8 - 4, and compared results
#hiddenlayers: 2 layers, with 28 and 14 units
#Best inputs rn: {'alpha': 2, 'hidden_layer_sizes': (28, 14)}

mlp = MLPClassifier(solver='lbfgs', max_iter = 10000, alpha=2, hidden_layer_sizes = (28,14))
mlp.fit(train_x, train_y)

test_preds = mlp.predict(test_x)
# confusion_matrix is imported by name from sklearn.metrics; there is no
# `metrics` module in scope, so it must be called directly.
conf_mat = confusion_matrix(test_y, test_preds)
print('Accuracy:')
print(accuracy_score(test_y, test_preds))
print('Confusion Matrix:')
print(conf_mat)
print(classification_report(test_y,test_preds))

Accuracy:
0.6082474226804123
Confusion Matrix:
[[27 20]
 [18 32]]
              precision    recall  f1-score   support

           0       0.60      0.57      0.59        47
           1       0.62      0.64      0.63        50

    accuracy                           0.61        97
   macro avg       0.61      0.61      0.61        97
weighted avg       0.61      0.61      0.61        97



In [1266]:
# Count how many test rows were predicted as each class.
pd.DataFrame(test_preds)[0].value_counts()

1    55
0    42
Name: 0, dtype: int64

In [908]:
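#TPR: 35%  (stale: noted from an earlier run, not from the confusion matrix printed above)
#FPR: 11%  (for the matrix printed above: TPR = 32/50 = 64%, FPR = 20/47 ≈ 43%)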

In [1267]:
# Side-by-side table of the true labels and the model's predictions for the test rows.
preds = pd.DataFrame({
    'True': test_y.reset_index()['class'],
    'Predictions': test_preds,
})
preds.head()

Unnamed: 0,True,Predictions
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [1268]:
# Predictions for the rows whose true label is 1 (fire)
preds[preds['True'] == 1]

Unnamed: 0,True,Predictions
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
7,1,1
10,1,1
11,1,1
12,1,0
14,1,1


### Training on days up to 305, to predict the days after 305

In [1282]:
# Chronological split: rows up to the cutoff day train the model, later rows test it.
cutoff_day = 305
non_feature_cols = ['class','time','day','month','lat','lon','area']
early_rows, late_rows = df[df.day <= cutoff_day], df[df.day > cutoff_day]

train_x, train_y = early_rows.drop(non_feature_cols, axis=1), early_rows['class']
test_x, test_y = late_rows.drop(non_feature_cols, axis=1), late_rows['class']

In [1283]:
# Confirm the shape (rows, features) of the chronological test set.
test_x.shape

(82, 3)

In [1284]:
# Standardise the features using statistics learned from the training rows only.
scaler = StandardScaler().fit(train_x)
train_x, test_x = scaler.transform(train_x), scaler.transform(test_x)

In [1285]:
# Search grid for the chronological split: same layouts as before, plus alpha = 4.
alpha_values = [0.01,0.1,0.8,1,2,4]
layer_layouts = [(8,8),(14,14),(28,14),(28,28),(64,28),(64,64)]
parameters = {'alpha': alpha_values, 'hidden_layer_sizes': layer_layouts}

results = check_params(parameters)
results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
13,0.673564,0.230217,0.000997,1.192093e-07,0.8,"(14, 14)","{'alpha': 0.8, 'hidden_layer_sizes': (14, 14)}",0.60396,0.613861,0.633663,0.656566,0.626866,0.020078,4
18,0.120401,0.067527,0.000507,0.0005073848,1.0,"(8, 8)","{'alpha': 1, 'hidden_layer_sizes': (8, 8)}",0.564356,0.643564,0.643564,0.69697,0.636816,0.047256,2
22,4.722639,1.972751,0.000998,3.576279e-07,1.0,"(64, 28)","{'alpha': 1, 'hidden_layer_sizes': (64, 28)}",0.594059,0.594059,0.643564,0.676768,0.626866,0.034987,4
27,0.642679,0.238169,0.000997,2.598106e-07,2.0,"(28, 28)","{'alpha': 2, 'hidden_layer_sizes': (28, 28)}",0.554455,0.633663,0.643564,0.666667,0.624378,0.042225,7
28,1.856196,0.629133,0.000748,0.0004318802,2.0,"(64, 28)","{'alpha': 2, 'hidden_layer_sizes': (64, 28)}",0.574257,0.643564,0.633663,0.69697,0.636816,0.043455,2
29,1.931347,0.794143,0.00075,0.0004327812,2.0,"(64, 64)","{'alpha': 2, 'hidden_layer_sizes': (64, 64)}",0.574257,0.643564,0.643564,0.69697,0.639303,0.043486,1
30,0.110455,0.030921,0.000748,0.0004318457,4.0,"(8, 8)","{'alpha': 4, 'hidden_layer_sizes': (8, 8)}",0.574257,0.693069,0.60396,0.636364,0.626866,0.044165,4


In [1288]:
# Refit with the top-ranked configuration from the grid search on the
# chronological split, then score it on the later days.
mlp = MLPClassifier(solver='lbfgs', max_iter = 10000, alpha=2, hidden_layer_sizes = (64, 64))
#clf = GridSearchCV(mlp, parameters, cv = 2)
mlp.fit(train_x, train_y)

test_preds = mlp.predict(test_x)
# confusion_matrix is imported by name from sklearn.metrics; there is no
# `metrics` module in scope, so it must be called directly.
conf_mat = confusion_matrix(test_y, test_preds)
print('Accuracy:')
print(accuracy_score(test_y, test_preds))
print('Confusion Matrix:')
print(conf_mat)
print(classification_report(test_y,test_preds))

Accuracy:
0.5975609756097561
Confusion Matrix:
[[27 13]
 [20 22]]
              precision    recall  f1-score   support

           0       0.57      0.68      0.62        40
           1       0.63      0.52      0.57        42

    accuracy                           0.60        82
   macro avg       0.60      0.60      0.60        82
weighted avg       0.60      0.60      0.60        82

