In [13]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import re


In [14]:
df = pd.read_csv('wildfires.csv')
df = df.dropna()
df

Unnamed: 0,area,class,evi,lat,lon,lst_day,lst_night,time,day
0,1.373816e+07,no_fire,0.216975,33.408854,-117.076050,307.487500,289.228750,2018-06-26,0
1,1.202089e+07,no_fire,0.242957,36.141667,-118.236654,305.820000,283.662857,2018-06-26,0
2,5.452330e+08,no_fire,0.398504,41.055741,-121.923551,300.717102,291.531937,2018-06-26,0
3,4.121447e+07,no_fire,0.265087,35.721354,-118.660329,310.887083,290.742083,2018-06-26,0
4,8.586347e+07,no_fire,0.202601,37.484083,-120.411426,317.291400,294.068200,2018-06-26,0
...,...,...,...,...,...,...,...,...,...
579,6.010443e+06,fire,0.354783,39.276786,-121.670988,309.122857,292.431429,2019-06-26,365
580,6.010443e+06,fire,0.313343,37.986310,-121.193545,310.996667,288.516000,2019-06-26,365
581,1.545542e+07,fire,0.163647,37.479630,-121.172461,315.414444,289.468889,2019-06-26,365
582,9.444982e+06,fire,0.325075,36.456439,-119.159807,313.969091,293.300000,2019-06-26,365


In [15]:
df['class'].value_counts()

fire       249
no_fire    235
Name: class, dtype: int64

In [1251]:
df.time

0      2018-06-26
1      2018-06-26
2      2018-06-26
3      2018-06-26
4      2018-06-26
          ...    
579    2019-06-26
580    2019-06-26
581    2019-06-26
582    2019-06-26
583    2019-06-26
Name: time, Length: 484, dtype: object

In [1252]:
def date_to_month(datetime):
    datetime = int(re.search("-\d\d",datetime).group(0))
    return datetime

In [1253]:
df['month'] = abs(df.time.apply(date_to_month))

In [1254]:
df.month.value_counts()

6     67
8     50
9     46
7     44
3     44
4     38
10    37
5     36
11    33
12    32
1     30
2     27
Name: month, dtype: int64

In [16]:
one_hot = pd.get_dummies(df['class']).fire
df['class'] = one_hot
df

Unnamed: 0,area,class,evi,lat,lon,lst_day,lst_night,time,day
0,1.373816e+07,0,0.216975,33.408854,-117.076050,307.487500,289.228750,2018-06-26,0
1,1.202089e+07,0,0.242957,36.141667,-118.236654,305.820000,283.662857,2018-06-26,0
2,5.452330e+08,0,0.398504,41.055741,-121.923551,300.717102,291.531937,2018-06-26,0
3,4.121447e+07,0,0.265087,35.721354,-118.660329,310.887083,290.742083,2018-06-26,0
4,8.586347e+07,0,0.202601,37.484083,-120.411426,317.291400,294.068200,2018-06-26,0
...,...,...,...,...,...,...,...,...,...
579,6.010443e+06,1,0.354783,39.276786,-121.670988,309.122857,292.431429,2019-06-26,365
580,6.010443e+06,1,0.313343,37.986310,-121.193545,310.996667,288.516000,2019-06-26,365
581,1.545542e+07,1,0.163647,37.479630,-121.172461,315.414444,289.468889,2019-06-26,365
582,9.444982e+06,1,0.325075,36.456439,-119.159807,313.969091,293.300000,2019-06-26,365


In [1233]:
#Undersampling majority class
count_class_0, count_class_1 = df['class'].value_counts()

# Divide by class
df_class_0 = df[df['class'] == 0]
df_class_1 = df[df['class'] == 1]
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

In [1090]:
#Oversampling minority class
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

In [None]:
#df = df_test_under.copy()
#df = df_test_over.copy()
#df

In [80]:
y = df['class']
x = df.drop(['class','time','lst_night','day'], axis=1)
x['area'] = x['area'].apply(lambda x: x/1000000)

In [52]:
x

Unnamed: 0,area,evi,lat,lon,lst_day
0,13.738155,0.216975,33.408854,-117.076050,307.487500
1,12.020886,0.242957,36.141667,-118.236654,305.820000
2,545.233030,0.398504,41.055741,-121.923551,300.717102
3,41.214465,0.265087,35.721354,-118.660329,310.887083
4,85.863469,0.202601,37.484083,-120.411426,317.291400
...,...,...,...,...,...
579,6.010443,0.354783,39.276786,-121.670988,309.122857
580,6.010443,0.313343,37.986310,-121.193545,310.996667
581,15.455424,0.163647,37.479630,-121.172461,315.414444
582,9.444982,0.325075,36.456439,-119.159807,313.969091


In [53]:
#splitting
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2)
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [22]:
test_x.shape

(97, 3)

In [23]:
def check_params(parameters):

    mlp = MLPClassifier(solver='lbfgs', max_iter = 10000)
    clf = GridSearchCV(mlp, parameters, cv = 4)
    clf.fit(train_x, train_y)
    grid_results = pd.DataFrame(clf.cv_results_)
    return grid_results[grid_results.rank_test_score.isin([1,2,3,4,5,6,7])]

In [26]:
params = {
    'alpha':[0.01,0.1,0.8,1,2,3,4], 
    'hidden_layer_sizes':[(8,8),(14,14),(28,14),(28,28),(64,28),(64,64)]
}
results = check_params(params)
results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
7,1.5475,0.473759,0.00025,0.000433,0.1,"(14, 14)","{'alpha': 0.1, 'hidden_layer_sizes': (14, 14)}",0.663265,0.659794,0.645833,0.645833,0.653747,0.007948,6
13,0.9045,0.340486,0.00025,0.000433,0.8,"(14, 14)","{'alpha': 0.8, 'hidden_layer_sizes': (14, 14)}",0.581633,0.742268,0.645833,0.666667,0.658915,0.057543,1
20,1.3185,0.407328,0.00025,0.000433,1.0,"(28, 14)","{'alpha': 1, 'hidden_layer_sizes': (28, 14)}",0.653061,0.701031,0.614583,0.666667,0.658915,0.030914,1
21,2.0235,0.898994,0.00025,0.000433,1.0,"(28, 28)","{'alpha': 1, 'hidden_layer_sizes': (28, 28)}",0.622449,0.690722,0.635417,0.666667,0.653747,0.026748,6
27,0.682,0.195646,0.00075,0.000433,2.0,"(28, 28)","{'alpha': 2, 'hidden_layer_sizes': (28, 28)}",0.683673,0.628866,0.666667,0.635417,0.653747,0.02251,6
29,1.952,0.56098,0.00025,0.000433,2.0,"(64, 64)","{'alpha': 2, 'hidden_layer_sizes': (64, 64)}",0.683673,0.649485,0.677083,0.625,0.658915,0.023339,1
34,0.71175,0.2434,0.0005,0.0005,3.0,"(64, 28)","{'alpha': 3, 'hidden_layer_sizes': (64, 28)}",0.642857,0.670103,0.666667,0.645833,0.656331,0.012139,4
35,1.069,0.388995,0.00075,0.000433,3.0,"(64, 64)","{'alpha': 3, 'hidden_layer_sizes': (64, 64)}",0.632653,0.670103,0.677083,0.645833,0.656331,0.017991,4
39,0.28975,0.08489,0.0005,0.0005,4.0,"(28, 28)","{'alpha': 4, 'hidden_layer_sizes': (28, 28)}",0.642857,0.639175,0.6875,0.645833,0.653747,0.019529,6
41,1.139749,0.159872,0.0005,0.0005,4.0,"(64, 64)","{'alpha': 4, 'hidden_layer_sizes': (64, 64)}",0.632653,0.659794,0.6875,0.635417,0.653747,0.022086,6


In [55]:
#MLP Classifier Function
#solver: 'Adam' works better for bigger datasets, 'lbfgs' faster and better for smaller
#regularization parameter: paper did between 0.8 - 4, and compared results
#hiddenlayers: 2 layers with 64 units each
#Best inputs rn: {'alpha': 2 'hidden_layer_sizes': (64, 64)}

mlp = MLPClassifier(solver='lbfgs', max_iter = 10000, alpha=2, hidden_layer_sizes = (64,64))
mlp.fit(train_x, train_y)

test_preds = mlp.predict(test_x)
conf_mat = metrics.confusion_matrix(test_y, test_preds)
print('Accuracy:')
print(accuracy_score(test_y, test_preds))
print('Confusion Matrix:')
print(conf_mat)
print(classification_report(test_y,test_preds))

Accuracy:
0.6907216494845361
Confusion Matrix:
[[31 17]
 [13 36]]
              precision    recall  f1-score   support

           0       0.70      0.65      0.67        48
           1       0.68      0.73      0.71        49

    accuracy                           0.69        97
   macro avg       0.69      0.69      0.69        97
weighted avg       0.69      0.69      0.69        97



In [90]:
pd.DataFrame(test_preds)[0].value_counts()

0    42
1    40
Name: 0, dtype: int64

In [1267]:
preds = pd.DataFrame()
preds['True'] = test_y.reset_index()['class']
preds['Predictions'] = test_preds
preds.head()

Unnamed: 0,True,Predictions
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [1268]:
#Predictions
preds[preds['True'] == 1]

Unnamed: 0,True,Predictions
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
7,1,1
10,1,1
11,1,1
12,1,0
14,1,1


### Training with first 305 days, to predict last 60

In [81]:
train_x = df[df.day<=305].drop(['class','time','day','lst_night'], axis=1)
train_y = df[df.day<=305]['class']

test_x = df[df.day>305].drop(['class','time','day','lst_night'], axis=1)
test_y = df[df.day>305]['class']

In [84]:
train_x['area'] = train_x['area'].apply(lambda x: x/1000000)
test_x['area'] = test_x['area'].apply(lambda x: x/1000000)
test_x.shape

(82, 5)

In [85]:
test_x

Unnamed: 0,area,evi,lat,lon,lst_day
195,6.869078,0.382356,39.880208,-121.599115,293.767500
196,11.162251,0.350431,39.333654,-120.872097,292.860000
197,28.334945,0.331824,40.458712,-121.986116,303.275758
198,7.727712,0.361333,34.179167,-117.873876,295.137778
199,6.010443,0.302064,39.669643,-120.926291,290.148571
...,...,...,...,...,...
579,6.010443,0.354783,39.276786,-121.670988,309.122857
580,6.010443,0.313343,37.986310,-121.193545,310.996667
581,15.455424,0.163647,37.479630,-121.172461,315.414444
582,9.444982,0.325075,36.456439,-119.159807,313.969091


In [86]:
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [88]:
parameters = {
    'alpha':[0.01,0.1,0.8,1,2], 
    'hidden_layer_sizes':[(8,8),(14,14),(28,14),(28,28),(64,28),(64,64)]
}
results = check_params(parameters)
results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
14,1.426999,0.371755,0.0005,0.0005002618,0.8,"(28, 14)","{'alpha': 0.8, 'hidden_layer_sizes': (28, 14)}",0.574257,0.693069,0.663366,0.666667,0.649254,0.044946,1
16,4.78275,1.654211,0.0005,0.0004997855,0.8,"(64, 28)","{'alpha': 0.8, 'hidden_layer_sizes': (64, 28)}",0.643564,0.623762,0.643564,0.666667,0.644279,0.015147,2
17,5.653499,2.278253,0.00075,0.0004329816,0.8,"(64, 64)","{'alpha': 0.8, 'hidden_layer_sizes': (64, 64)}",0.633663,0.613861,0.613861,0.656566,0.629353,0.017539,4
20,1.8235,0.486115,0.0005,0.0004999043,1.0,"(28, 14)","{'alpha': 1, 'hidden_layer_sizes': (28, 14)}",0.613861,0.673267,0.564356,0.666667,0.629353,0.044148,4
23,6.714257,1.806432,0.001,7.420718e-07,1.0,"(64, 64)","{'alpha': 1, 'hidden_layer_sizes': (64, 64)}",0.643564,0.613861,0.554455,0.69697,0.626866,0.051383,6
24,0.3525,0.238326,0.0005,0.0005000234,2.0,"(8, 8)","{'alpha': 2, 'hidden_layer_sizes': (8, 8)}",0.633663,0.613861,0.60396,0.656566,0.626866,0.020078,6
27,2.484,0.684024,0.0005,0.0005001426,2.0,"(28, 28)","{'alpha': 2, 'hidden_layer_sizes': (28, 28)}",0.60396,0.613861,0.564356,0.727273,0.626866,0.060322,6
28,3.867,2.320343,0.0005,0.0005003812,2.0,"(64, 28)","{'alpha': 2, 'hidden_layer_sizes': (64, 28)}",0.594059,0.613861,0.613861,0.686869,0.626866,0.035243,6
29,5.3395,1.828693,0.00075,0.0004329471,2.0,"(64, 64)","{'alpha': 2, 'hidden_layer_sizes': (64, 64)}",0.584158,0.643564,0.594059,0.737374,0.639303,0.060427,3


In [89]:
mlp = MLPClassifier(solver='lbfgs', max_iter = 10000, alpha=0.8, hidden_layer_sizes = (28, 14))
#clf = GridSearchCV(mlp, parameters, cv = 2)
mlp.fit(train_x, train_y)

test_preds = mlp.predict(test_x)
conf_mat = metrics.confusion_matrix(test_y, test_preds)
print('Accuracy:')
print(accuracy_score(test_y, test_preds))
print('Confusion Matrix:')
print(conf_mat)
print(classification_report(test_y,test_preds))

Accuracy:
0.6097560975609756
Confusion Matrix:
[[25 15]
 [17 25]]
              precision    recall  f1-score   support

           0       0.60      0.62      0.61        40
           1       0.62      0.60      0.61        42

    accuracy                           0.61        82
   macro avg       0.61      0.61      0.61        82
weighted avg       0.61      0.61      0.61        82

