# Stacked Data Baseline ML Tests

Required Data File `./FullStacked_data.csv`


## Basic Data Preparation

In [1]:
import os, sys
import numpy as np
import pandas as pd


In [2]:
# Dataset location (relative to the notebook's working directory)
DATASET = 'FullStacked_data.csv'
assert os.path.exists(DATASET), "Required data file not found: {}".format(DATASET)

# Seed for the row shuffle, so the row order is the same on every run.
# Set SEED = None to get a fresh, unseeded shuffle each time the cell runs.
SEED = 42

# Load, shuffle every row (frac=1 keeps all of them), and renumber the index
dataset = pd.read_csv(DATASET).sample(frac=1, random_state=SEED).reset_index(drop=True)

#### Note: `sample(frac = 1)` shuffles every row of the data set. Unless a fixed `random_state` is passed to it, the row order (and therefore the scores reported below) will vary from run to run.

In [3]:
# Preview the first rows of the shuffled frame as a quick sanity check
dataset.head()

Unnamed: 0.1,Unnamed: 0,OBJECTID,YrMo,mean_B1,stdev_B1,min_B1,max_B1,mean_B2,stdev_B2,min_B2,...,max_B4,mean_B5,stdev_B5,min_B5,max_B5,mean_B6,stdev_B6,min_B6,max_B6,isBurnt
0,26,1921,1998_4,9627.0625,148.36014,9380.0,9835.0,10666.875,174.07541,10366.0,...,14582.0,17694.625,554.2661,16626.0,18582.0,13989.25,372.94925,13346.0,14460.0,2
1,5,2359,1987_4,9020.9375,142.20946,8735.0,9207.0,10164.5,175.25867,9805.0,...,18744.0,16703.5,385.3163,15828.0,17192.0,12714.375,317.25485,12123.0,13289.0,1
2,37,1007,2004_8,9632.125,160.83444,9246.0,9890.0,10515.0,172.83981,10231.0,...,14694.0,16077.5,433.9613,15396.0,16759.0,13572.3125,264.32043,13030.0,13937.0,1
3,37,520,2004_8,10097.1875,108.13092,9854.0,10287.0,11038.4375,130.20854,10799.0,...,14448.0,17772.0,474.47235,16878.0,18519.0,14370.625,289.18573,13827.0,14866.0,1
4,4,1637,1986_3,9573.75,209.6548,9237.0,9917.0,10458.25,339.71234,9847.0,...,15213.0,18303.938,639.81836,17111.0,19374.0,14178.4375,312.28876,13791.0,14823.0,1


In [5]:
# Features: keep columns from position 3 onward, stopping before the last
# column -- i.e. positions {0, 1, 2} and the trailing isBurnt label are dropped.
X = np.array(dataset.iloc[:, 3:-1])

# Labels: shift isBurnt from {1, 2} to {0, 1} for non-burn, burn.
y = np.array(dataset["isBurnt"]) - 1

---

## Test Baseline ML Classifiers

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC


## Baseline a resubstitution Logistic Regression

In [7]:
# Resubstitution baseline: train on the full data set, then score the
# fitted model against the very same samples it was trained on.
model = LogisticRegression().fit(X, y)
LR_RESUB_SCORE = model.score(X, y)
print("Logistic Regression: {0:6.5f}".format(LR_RESUB_SCORE))



Logistic Regression: 0.88643


--- 

## Baseline a resubstitution KNeighborsClassifier

In [8]:
# Resubstitution baseline: train on the full data set, then score the
# fitted model against the very same samples it was trained on.
model = KNeighborsClassifier().fit(X, y)
KNN_RESUB_SCORE = model.score(X, y)
print("KNN : {0:6.5f}".format(KNN_RESUB_SCORE))

KNN : 0.93314


--- 

## Baseline a resubstitution Decision Tree

In [9]:
# Resubstitution baseline: train on the full data set, then score the
# fitted model against the very same samples it was trained on.
model = DecisionTreeClassifier().fit(X, y)
DT_RESUB_SCORE = model.score(X, y)
print("Decision Tree: {0:6.5f}".format(DT_RESUB_SCORE))

Decision Tree: 0.99982


--- 

## Baseline a resubstitution LinearSVC

In [10]:
# Resubstitution baseline: train on the full data set, then score the
# fitted model against the very same samples it was trained on.
model = LinearSVC().fit(X, y)
SVC_RESUB_SCORE = model.score(X, y)
print("Linear SVC Regression: {0:6.5f}".format(SVC_RESUB_SCORE))

Linear SVC Regression: 0.81846




---
## Resubstitution Model Summary

* Logistic Regression: 0.88643
* K(5) Nearest Neighbors: 0.93314
* Decision Tree: 0.99982
* Linear SVC: 0.81846

---


## Cross-Fold Analysis of Classifier Generalizability
We run a 5-fold cross-validation for each model,
then compare each fold's accuracy against that model's resubstitution score to see how much it degrades.

In [16]:
import sklearn.model_selection
# Number of cross-validation folds; passed as `cv=` to each cross_val_score call below
XFOLD = 5

In [17]:
# Logit is noisy: install a global filter that ignores every warning from here on
import warnings
warnings.simplefilter('ignore')

# Fresh, unfitted estimator for the cross-validation run
model = LogisticRegression()

# Resubstitution score computed earlier, printed as the reference point
print("Resub Logistic Regression: {0:6.5f}".format(LR_RESUB_SCORE))

# One score per fold
cv_results = sklearn.model_selection.cross_val_score(model, X, y, cv=XFOLD)

# Per-fold score and its relative change (in percent) versus the resub score
for fold, fold_acc in enumerate(cv_results):
    pct_change = (fold_acc - LR_RESUB_SCORE) / LR_RESUB_SCORE * 100
    print("Fold {}: {:6.5f}, change {:5.2f}%".format(fold, fold_acc, pct_change))

print("Average Logit Acc {:5.2f}%".format(cv_results.mean() * 100))

Resub Logistic Regression: 0.88643
Fold 0: 0.88622, change -0.02%
Fold 1: 0.88655, change  0.01%
Fold 2: 0.88584, change -0.07%
Fold 3: 0.88751, change  0.12%
Fold 4: 0.88584, change -0.07%
Average Logit Acc 88.64%


In [18]:

# Fresh, unfitted estimator for the cross-validation run
model = KNeighborsClassifier()

# Resubstitution score computed earlier, printed as the reference point
print("Resub KNN: {0:6.5f}".format(KNN_RESUB_SCORE))

# One score per fold
cv_results = sklearn.model_selection.cross_val_score(model, X, y, cv=XFOLD)

# Per-fold score and its relative change (in percent) versus the resub score
for fold, fold_acc in enumerate(cv_results):
    pct_change = (fold_acc - KNN_RESUB_SCORE) / KNN_RESUB_SCORE * 100
    print("Fold {}: {:6.5f}, change {:5.2f}%".format(fold, fold_acc, pct_change))

print("Average KNN Acc {:5.2f}%".format(cv_results.mean() * 100))

Resub KNN: 0.93314
Fold 0: 0.90660, change -2.84%
Fold 1: 0.90870, change -2.62%
Fold 2: 0.90878, change -2.61%
Fold 3: 0.90726, change -2.77%
Fold 4: 0.90804, change -2.69%
Average KNN Acc 90.79%


In [19]:
# Fresh, unfitted estimator for the cross-validation run
model = DecisionTreeClassifier()

# Resubstitution score computed earlier, printed as the reference point
print("Resub Decision Tree: {0:6.5f}".format(DT_RESUB_SCORE))

# One score per fold
cv_results = sklearn.model_selection.cross_val_score(model, X, y, cv=XFOLD)

# Per-fold score and its relative change (in percent) versus the resub score
for fold, fold_acc in enumerate(cv_results):
    pct_change = (fold_acc - DT_RESUB_SCORE) / DT_RESUB_SCORE * 100
    print("Fold {}: {:6.5f}, change {:5.2f}%".format(fold, fold_acc, pct_change))

print("Average Decision Tree Acc {:5.2f}%".format(cv_results.mean() * 100))

Resub Decision Tree: 0.99982
Fold 0: 0.87447, change -12.54%
Fold 1: 0.88110, change -11.87%
Fold 2: 0.87565, change -12.42%
Fold 3: 0.87917, change -12.07%
Fold 4: 0.87895, change -12.09%
Average Decision Tree Acc 87.79%


In [20]:
# Fresh, unfitted estimator for the cross-validation run
model = LinearSVC()

# Resubstitution score computed earlier, printed as the reference point
print("Resub SVC: {0:6.5f}".format(SVC_RESUB_SCORE))

# One score per fold
cv_results = sklearn.model_selection.cross_val_score(model, X, y, cv=XFOLD)

# Per-fold score and its relative change (in percent) versus the resub score
for fold, fold_acc in enumerate(cv_results):
    pct_change = (fold_acc - SVC_RESUB_SCORE) / SVC_RESUB_SCORE * 100
    print("Fold {}: {:6.5f}, change {:5.2f}%".format(fold, fold_acc, pct_change))

print("Average Linear SVC Acc {:5.2f}%".format(cv_results.mean() * 100))

Resub SVC: 0.81846
Fold 0: 0.86817, change  6.07%
Fold 1: 0.26266, change -67.91%
Fold 2: 0.86728, change  5.96%
Fold 3: 0.78662, change -3.89%
Fold 4: 0.85839, change  4.88%
Average Linear SVC Acc 72.86%


## Notes 
 * Average Logit Acc 88.64%
 * Average KNN Acc 90.79%
 * Average Decision Tree Acc 87.79%
 * Average Linear SVC Acc 72.86%

### The decision tree, high-performing under resubstitution (0.99982), seems overfit: its cross-validated accuracy drops by about 12%.

### The linear Support Vector Machine is very inconsistent across folds (0.26 to 0.87).

### The best is the KNN with an average accuracy of 90.79%.

---