In [1]:
import sqlite3
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC , SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

%matplotlib inline

#### Reading the dataframe pickled in previous notebook after cleaning and EDA

In [4]:
fire_clean_model_df = pd.read_pickle("./fire_clean_model.pkl")

In [5]:
fire_clean_model_df.head()

Unnamed: 0,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,CAUSE_LABEL,AK,AL,AR,AZ,CA,CO,...,VT,WA,WI,WV,WY,DAYS_TO_CONT,fall,spring,summer,winter
1,2004,133,0.25,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,2004,152,0.1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,2004,180,0.1,0,0,0,0,0,1,0,...,0,0,0,0,0,5,0,0,1,0
4,2004,180,0.1,0,0,0,0,0,1,0,...,0,0,0,0,0,5,0,0,1,0
5,2004,182,0.1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [4]:
fire_clean_model_df.columns

Index(['FIRE_YEAR', 'DISCOVERY_DOY', 'FIRE_SIZE', 'CAUSE_LABEL', 'AK', 'AL',
       'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID',
       'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS',
       'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR',
       'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI',
       'WV', 'WY', 'DAYS_TO_CONT', 'fall', 'spring', 'summer', 'winter'],
      dtype='object')

#### Create X & Y subsets from dataframe

In [6]:
X = fire_clean_model_df.drop(['CAUSE_LABEL','FIRE_YEAR'],
                            axis = 1)

In [7]:
Y = fire_clean_model_df['CAUSE_LABEL']

In [8]:
X.columns

Index(['DISCOVERY_DOY', 'FIRE_SIZE', 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT',
       'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA',
       'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH',
       'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD',
       'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DAYS_TO_CONT',
       'fall', 'spring', 'summer', 'winter'],
      dtype='object')

#### Creating Train Test Validate Subsets

In [9]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, Y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25)

#### Initial knn model

In [102]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print("The score for kNN is")
print("Training: {:6.2f}%".format(100*knn.score(X_train, y_train)))
print("Test set: {:6.2f}%".format(100*knn.score(X_val, y_val)))

The score for kNN is
Training:  90.01%
Test set:  87.87%


In [103]:
print("kNN confusion matrix: \n\n", confusion_matrix(y_val, knn.predict(X_val)))

kNN confusion matrix: 

 [[ 36311  19619]
 [ 14089 207921]]


In [None]:
knn_confusion = confusion_matrix(label_test, knn.predict(X_test))
plt.figure(dpi=100)
sns.heatmap(knn_confusion, cmap=plt.cm.Blues, annot=True, square=True,
           xticklabels=fire_clean_model_df.CAUSE_LABEL,
           yticklabels=fire_clean_model_df.CAUSE_LABEL)

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('kNN confusion matrix');

In [104]:
y_predict = knn.predict(X_val)
print("Default threshold:")
print("Precision: {},   Recall: {}".format(precision_score(y_val, y_predict), 
                                                     recall_score(y_val, y_predict)))

Default threshold:
Precision: 0.913777797310363,   Recall: 0.9365388946443853


#### RandomizedSearch for Knn

In [13]:
knn_param = {'n_neighbors' : range(1,5), 
             'algorithm':['auto', 'ball_tree','kd_tree','brute'], 
             'weights':['uniform', 'distance']}
knn = KNeighborsClassifier()
grid_srch = RandomizedSearchCV(knn, knn_param, cv=5, scoring='roc_auc', iid=True)
grid_srch.fit(X_train, y_train)
print(grid_srch.best_score_)
print(grid_srch.best_params_)

KeyboardInterrupt: 

#### Initial Logistic Regression

In [15]:
logit = LogisticRegression(C = 0.1)
logit.fit(X_train, y_train)
print("The score for logistic regression is")
#score returns mean accuracy
print("Training: {:6.2f}%".format(100*logit.score(X_train, y_train)))
print("Test set: {:6.2f}%".format(100*logit.score(X_val, y_val)))



The score for logistic regression is
Training:  88.05%
Test set:  88.05%


In [16]:
y_predict = logit.predict(X_val)
precision_score(y_val, y_predict), recall_score(y_val, y_predict)

(0.909858458656398, 0.9441434549790757)

In [13]:
import seaborn as sns


In [34]:
# plt.figure(figsize=(16,5))
logit.predict_proba(X_val)
# hist of class 1 probas

array([[0.79908972, 0.20091028],
       [0.65850073, 0.34149927],
       [0.47617125, 0.52382875],
       ...,
       [0.02342476, 0.97657524],
       [0.35412059, 0.64587941],
       [0.04366027, 0.95633973]])

In [31]:
logit.classes_

array([0, 1])

In [98]:
print("Logistic confusion matrix: \n\n", confusion_matrix(y_val, logit.predict(X_val)))

Logistic confusion matrix: 

 [[ 34782  21148]
 [ 12076 209934]]


In [101]:
# for item in zip(X.columns, logit.coef_[0]):
#     print(item)

In [99]:
print(logit.coef_[0].shape)

(59,)


#### Initial Naive Bayes

In [107]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb.score(X_val, y_val)

0.8265273080520976

In [108]:
y_predict = nb.predict(X_val)
precision_score(y_val, y_predict), recall_score(y_val, y_predict)

(0.9542306345298395, 0.8222647628485203)

In [124]:
print("GaussianNB confusion matrix: \n\n", confusion_matrix(y_val, nb.predict(X_val)))

GaussianNB confusion matrix: 

 [[ 40158  15772]
 [ 25996 196014]]


In [121]:
nb_B = BernoulliNB()
nb_B.fit(X_train,y_train)
nb_B.score(X_val,y_val)

0.8497229617903145

In [122]:
y_predict = nb_B.predict(X_val)
precision_score(y_val, y_predict), recall_score(y_val, y_predict)

(0.9255285996241489, 0.8829061753975046)

In [126]:
print("BernoulliNB confusion matrix: \n\n", confusion_matrix(y_val, nb_B.predict(X_val)))

BernoulliNB confusion matrix: 

 [[ 40158  15772]
 [ 25996 196014]]


#### Initial SVM

In [10]:
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
svm_model.score(X_val, y_val)



0.8000251852917896

In [11]:
y_predict = svm_model.predict(X_val)
precision_score(y_val, y_predict), recall_score(y_val, y_predict)

(0.8000237471575372, 0.9999910052124794)

In [12]:
print("SVC Linear confusion matrix: \n\n", confusion_matrix(y_val, svm_model.predict(X_val)))

SVC Linear confusion matrix: 

 [[    10  55579]
 [     2 222349]]


In [None]:
svm_rbf = SVC(gamma="auto")
svm_rbf.fit(X_train, y_train)
svm_rbf.score(X_val, y_val)

In [None]:
y_predict = svm_rbf.predict(X_val)
precision_score(y_val, y_predict), recall_score(y_val, y_predict)

In [None]:
print("SVC RBF confusion matrix: \n\n", confusion_matrix(y_val, svm_rbf.predict(X_val)))

In [None]:
svm_poly = SVC(kernel="poly", gamma="auto")
svm_poly.fit(X_train, y_train)
svm_poly.score(X_val, y_val)

y_predict = svm_poly.predict(X_val)
precision_score(y_val, y_predict), recall_score(y_val, y_predict)

print("SVC Poly confusion matrix: \n\n", confusion_matrix(y_val, svm_poly.predict(X_val)))

#### Initial Decision Tree

In [35]:
decisiontree = DecisionTreeClassifier(max_depth=4)


decisiontree.fit(X_train, y_train)
decisiontree.score(X_val, y_val)

y_predict = decisiontree.predict(X_val)

print(precision_score(y_val, y_predict), recall_score(y_val, y_predict))

print("\n Decision confusion matrix: \n\n", confusion_matrix(y_val, decisiontree.predict(X_val)))

0.9036316478026412 0.8920307789227377

 Decision confusion matrix: 

 [[ 34569  21141]
 [ 23994 198236]]


#### Initial RandomForest

In [12]:
randomforest = RandomForestClassifier(n_estimators=100)


randomforest.fit(X_train, y_train)
randomforest.score(X_val, y_val)

y_predict = randomforest.predict(X_val)

print(precision_score(y_val, y_predict), recall_score(y_val, y_predict))

print("\n Random Forest confusion matrix: \n\n", confusion_matrix(y_val, randomforest.predict(X_val)))

0.9222642043694862 0.9380011699590515

 Random Forest confusion matrix: 

 [[ 38140  17570]
 [ 13778 208452]]
