In [1150]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import re


In [1115]:
df = pd.read_csv('wildfires.csv')
df = df.dropna()
df

Unnamed: 0,area,class,evi,lat,lon,lst_day,lst_night,time
0,1.373816e+07,no_fire,0.216975,33.408854,-117.076050,307.487500,289.228750,2018-06-26
1,1.202089e+07,no_fire,0.242957,36.141667,-118.236654,305.820000,283.662857,2018-06-26
2,5.452330e+08,no_fire,0.398504,41.055741,-121.923551,300.717102,291.531937,2018-06-26
3,4.121447e+07,no_fire,0.265087,35.721354,-118.660329,310.887083,290.742083,2018-06-26
4,8.586347e+07,no_fire,0.202601,37.484083,-120.411426,317.291400,294.068200,2018-06-26
...,...,...,...,...,...,...,...,...
552,5.924579e+07,fire,0.208693,41.848732,-123.568036,300.325507,284.840870,2019-06-18
553,9.530845e+07,fire,0.278667,41.031794,-122.279068,302.684505,291.530991,2019-06-18
554,4.808354e+07,fire,0.217178,38.383631,-119.460196,307.464643,281.528571,2019-06-18
555,4.550764e+07,fire,0.362010,39.284355,-120.772069,301.082264,287.036604,2019-06-18


In [1116]:
df['class'].value_counts()

fire       237
no_fire    235
Name: class, dtype: int64

In [1117]:
df.time

0      2018-06-26
1      2018-06-26
2      2018-06-26
3      2018-06-26
4      2018-06-26
          ...    
552    2019-06-18
553    2019-06-18
554    2019-06-18
555    2019-06-18
556    2019-06-18
Name: time, Length: 472, dtype: object

In [1118]:
def date_to_month(datetime):
    datetime = int(re.search("-\d\d",datetime).group(0))
    return datetime

In [1119]:
df['month'] = abs(df.time.apply(date_to_month))

In [1120]:
df.month.value_counts()

6     53
9     44
5     43
11    41
7     41
8     40
3     38
1     37
10    36
12    34
2     33
4     32
Name: month, dtype: int64

In [1121]:
one_hot = pd.get_dummies(df['class']).fire
df['class'] = one_hot
df

Unnamed: 0,area,class,evi,lat,lon,lst_day,lst_night,time,month
0,1.373816e+07,0,0.216975,33.408854,-117.076050,307.487500,289.228750,2018-06-26,6
1,1.202089e+07,0,0.242957,36.141667,-118.236654,305.820000,283.662857,2018-06-26,6
2,5.452330e+08,0,0.398504,41.055741,-121.923551,300.717102,291.531937,2018-06-26,6
3,4.121447e+07,0,0.265087,35.721354,-118.660329,310.887083,290.742083,2018-06-26,6
4,8.586347e+07,0,0.202601,37.484083,-120.411426,317.291400,294.068200,2018-06-26,6
...,...,...,...,...,...,...,...,...,...
552,5.924579e+07,1,0.208693,41.848732,-123.568036,300.325507,284.840870,2019-06-18,6
553,9.530845e+07,1,0.278667,41.031794,-122.279068,302.684505,291.530991,2019-06-18,6
554,4.808354e+07,1,0.217178,38.383631,-119.460196,307.464643,281.528571,2019-06-18,6
555,4.550764e+07,1,0.362010,39.284355,-120.772069,301.082264,287.036604,2019-06-18,6


In [1089]:
#Undersampling majority class
count_class_0, count_class_1 = df['class'].value_counts()

# Divide by class
df_class_0 = df[df['class'] == 0]
df_class_1 = df[df['class'] == 1]
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

In [1090]:
#Oversampling minority class
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

In [None]:
#df = df_test_under.copy()
#df = df_test_over.copy()
#df

In [1173]:
y = df['class']
x = df.drop(['class','time','month','lat','lon','area'], axis=1)

In [1124]:
x.shape

(472, 3)

In [1174]:
#splitting
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2)
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [1126]:
test_x.shape

(95, 3)

In [1127]:
def sensitivity(conf_mat):
    return conf_mat[1, 1]/np.sum(conf_mat[1, :])
def specificity(conf_mat):
    return conf_mat[0, 0]/np.sum(conf_mat[0, :])

def false_positive_rate(conf_mat):
    return 1 - specificity(conf_mat)

def accuracy(conf_mat):
    return (conf_mat[0, 0] + conf_mat[1, 1])/np.sum(np.ravel(conf_mat))

In [1128]:
def check_params(parameters):

    mlp = MLPClassifier(solver='lbfgs', max_iter = 10000)
    clf = GridSearchCV(mlp, parameters, cv = 4)
    clf.fit(train_x, train_y)
    grid_results = pd.DataFrame(clf.cv_results_)
    return grid_results[grid_results.rank_test_score.isin([1,2,3,4,5,6,7])]

In [1129]:
params = {
    'alpha':[0.01,0.1,0.8,1,2], 
    'hidden_layer_sizes':[(8,8),(14,14),(28,14),(28,28),(64,28),(64,64)]
}
results = check_params(params)
results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
2,1.307919,0.51098,0.000249,0.0004319489,0.01,"(28, 14)","{'alpha': 0.01, 'hidden_layer_sizes': (28, 14)}",0.589474,0.521277,0.606383,0.521277,0.559682,0.038768,1
5,7.521899,3.917925,0.001,4.958304e-06,0.01,"(64, 64)","{'alpha': 0.01, 'hidden_layer_sizes': (64, 64)}",0.568421,0.521277,0.56383,0.585106,0.559682,0.023505,1
7,4.066482,2.036877,0.000499,0.0004986525,0.1,"(14, 14)","{'alpha': 0.1, 'hidden_layer_sizes': (14, 14)}",0.547368,0.521277,0.617021,0.542553,0.557029,0.035942,4
9,3.371799,1.80536,0.000748,0.0004318457,0.1,"(28, 28)","{'alpha': 0.1, 'hidden_layer_sizes': (28, 28)}",0.484211,0.574468,0.595745,0.531915,0.546419,0.042784,7
12,0.655718,0.183135,0.000997,7.974558e-07,0.8,"(8, 8)","{'alpha': 0.8, 'hidden_layer_sizes': (8, 8)}",0.589474,0.510638,0.553191,0.553191,0.551724,0.027947,5
16,4.170951,0.817345,0.000997,4.256623e-07,0.8,"(64, 28)","{'alpha': 0.8, 'hidden_layer_sizes': (64, 28)}",0.536842,0.56383,0.521277,0.574468,0.549072,0.021105,6
20,1.568772,0.283511,0.000749,0.0004326787,1.0,"(28, 14)","{'alpha': 1, 'hidden_layer_sizes': (28, 14)}",0.536842,0.574468,0.56383,0.56383,0.559682,0.013948,1
28,1.475048,0.64741,0.000997,1.032383e-07,2.0,"(64, 28)","{'alpha': 2, 'hidden_layer_sizes': (64, 28)}",0.568421,0.521277,0.574468,0.521277,0.546419,0.025167,7
29,2.113525,0.782043,0.000745,0.0004299408,2.0,"(64, 64)","{'alpha': 2, 'hidden_layer_sizes': (64, 64)}",0.557895,0.521277,0.574468,0.531915,0.546419,0.020962,7


In [1175]:
#MLP Classifier Function
#solver: 'Adam' works better for bigger datasets, 'lbfgs' faster and better for smaller
#regularization parameter: paper did between 0.8 - 4, and compared results
#hiddenlayers: 2 layers with 28 units each
#Best inputs rn: {'alpha': 1 'hidden_layer_sizes': (28, 14)}

accs = []
specs = []
tprs = []
fprs = []

mlp = MLPClassifier(solver='lbfgs', max_iter = 10000, alpha=1, hidden_layer_sizes = (28,14))
mlp.fit(train_x, train_y)

test_preds = mlp.predict(test_x)
conf_mat = metrics.confusion_matrix(test_y, test_preds)
print("Accuracy:", accuracy(conf_mat))
print("Specificity", specificity(conf_mat))
print("TPR: ",sensitivity(conf_mat))
print("FPR: ",false_positive_rate(conf_mat))

Accuracy: 0.631578947368421
Specificity 0.5625
TPR:  0.7021276595744681
FPR:  0.4375


In [1179]:
conf_mat

array([[27, 21],
       [14, 33]], dtype=int64)

In [1177]:
pd.DataFrame(test_preds)[0].value_counts()

1    54
0    41
Name: 0, dtype: int64

In [908]:
#TPR: 35%
#FPR: 11%

In [1134]:
preds = pd.DataFrame()
preds['True'] = test_y.reset_index()['class']
preds['Predictions'] = test_preds
preds.head()

Unnamed: 0,True,Predictions
0,0,1
1,1,1
2,1,0
3,1,0
4,0,0


In [1135]:
#Predictions
preds[preds['True'] == 1]

Unnamed: 0,True,Predictions
1,1,1
2,1,0
3,1,0
5,1,1
8,1,1
9,1,1
13,1,1
16,1,0
20,1,0
21,1,0


### Training with first 300 days, to predict last 25

In [1136]:
train_x = df[df.day<=345].drop(['class','time','day','month','lat','lon','area'], axis=1)
train_y = df[df.day<=345]['class']

test_x = df[df.day>=345].drop(['class','time','day','month','lat','lon','area'], axis=1)
test_y = df[df.day>=345]['class']

AttributeError: 'DataFrame' object has no attribute 'day'

In [1066]:
test_x.shape

(59, 3)

In [1067]:
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [1073]:
parameters = {
    'alpha':[0.01,0.1,0.8,1,2,4], 
    'hidden_layer_sizes':[(8,8),(14,14),(28,14),(28,28),(64,28),(64,64)]
}
results = check_params(parameters)
results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
3,4.296326,1.99797,0.001013,2.806176e-05,0.01,"(28, 28)","{'alpha': 0.01, 'hidden_layer_sizes': (28, 28)}",0.669903,0.854369,0.805825,0.754902,0.77129,0.068347,7
4,5.549833,2.894787,0.001243,0.0004265461,0.01,"(64, 28)","{'alpha': 0.01, 'hidden_layer_sizes': (64, 28)}",0.747573,0.815534,0.805825,0.803922,0.793187,0.026743,5
5,7.598707,2.514039,0.000997,1.032383e-07,0.01,"(64, 64)","{'alpha': 0.01, 'hidden_layer_sizes': (64, 64)}",0.825243,0.786408,0.815534,0.823529,0.812652,0.015614,1
8,2.705929,2.128762,0.000997,5.462856e-07,0.1,"(28, 14)","{'alpha': 0.1, 'hidden_layer_sizes': (28, 14)}",0.786408,0.796117,0.825243,0.784314,0.798054,0.016341,4
9,3.45553,1.938014,0.001245,0.0004290932,0.1,"(28, 28)","{'alpha': 0.1, 'hidden_layer_sizes': (28, 28)}",0.728155,0.825243,0.786408,0.803922,0.785888,0.036114,6
10,7.931502,3.851201,0.001522,0.000471816,0.1,"(64, 28)","{'alpha': 0.1, 'hidden_layer_sizes': (64, 28)}",0.815534,0.786408,0.76699,0.843137,0.80292,0.028864,3
11,8.734597,5.615396,0.000997,4.460403e-07,0.1,"(64, 64)","{'alpha': 0.1, 'hidden_layer_sizes': (64, 64)}",0.805825,0.747573,0.834951,0.833333,0.805353,0.035363,2


In [1180]:
mlp = MLPClassifier(solver='lbfgs', max_iter = 10000, alpha=0.01, hidden_layer_sizes = (64, 64))
#clf = GridSearchCV(mlp, parameters, cv = 2)
mlp.fit(train_x, train_y)

test_preds = mlp.predict(test_x)
conf_mat = metrics.confusion_matrix(test_y, test_preds)
print("Accuracy:", accuracy(conf_mat))
print("Specificity", specificity(conf_mat))
print("TPR: ",sensitivity(conf_mat))
print("FPR: ",false_positive_rate(conf_mat))

Accuracy: 0.6210526315789474
Specificity 0.625
TPR:  0.6170212765957447
FPR:  0.375
