## This notebook is part of the Georgetown University Data Science Project - Team Ship Happen


## The purpose of this notebook is model selection and evaluation

### Import required libraries

In [1]:
%matplotlib inline


import time

import numpy as np
import matplotlib.cm as cm

# Standard Python libraries
import os                                    # For accessing operating system functionalities
import json                                  # For encoding and decoding JSON data
import pickle                                # For serializing and de-serializing Python objects

# Libraries that can be pip installed
import requests                              # Simple Python library for HTTP
import pandas as pd                          # Library for building dataframes similar to those in R
import seaborn as sns                        # Statistical visualization library based on Matplotlib
import matplotlib.pyplot as plt  
from sklearn.datasets.base import Bunch

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, auc, roc_curve, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import LinearSVC, NuSVC, SVC

from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold




### Create Bunch 


In [8]:

# Absolute path of the `ship-happens` directory, reached through the parent of
# the current working directory.
DATA_DIR = os.path.abspath(os.path.join(".", "..", "ship-happens"))

# Print every entry of the data directory, leaving out hidden (dot-prefixed) ones
for name in os.listdir(DATA_DIR):
    if not name.startswith("."):
        print("- {}".format(name))

- baltimore_knn.txt
- baltimore_randf.txt
- bokeh_accident.txt
- bokeh_dead.txt
- bokeh_injured.txt
- bokeh_injury.xlsx
- bokeh_region.txt
- data
- feature_selection.ipynb
- incident_knn-classifier.pickle
- incident_random-forest-classifier.pickle
- incident_visualization_bokeh.ipynb
- ingetion_wranling.ipynb
- LICENSE
- meta_incident.json
- model evaluation.txt
- model_selection.ipynb
- model_selection_categorical.ipynb
- mvinjury.txt
- mvinjury.zip
- mvinjury_data.txt
- mvinjury_data_final.txt
- predicton comparision.xlsx
- README.md
- results_user_input_data_random-forest-classifier.txt
- result_baltimore_knn.txt
- result_baltimore_randf.txt
- user_input_data.txt
- Vessel_age_dist.png


In [9]:
def load_data(root=DATA_DIR):
    """
    Build the `Bunch` for the Misle incident dataset stored under ``root``.

    Three files are read from ``root``:

    * ``meta_incident.json``      -- JSON with ``target_names`` and ``feature_names``
    * ``README.md``               -- free-text description, stored as ``DESCR``
    * ``mvinjury_data_final.txt`` -- numeric table parsed with ``np.loadtxt``;
      the last column is the target, all preceding columns are features

    Returns a ``Bunch`` with ``data``, ``target``, ``filenames``,
    ``target_names``, ``feature_names`` and ``DESCR``.
    """
    # The README is spelled exactly as it appears on disk (`README.md`); the
    # previous `ReadMe.md` spelling only resolves on case-insensitive filesystems.
    filenames     = {
        'meta': os.path.join(root, 'meta_incident.json'),
        'rdme': os.path.join(root, 'README.md'),
        'data': os.path.join(root, 'mvinjury_data_final.txt')
    }

    # Load the meta data from the meta json
    with open(filenames['meta'], 'r') as f:
        meta = json.load(f)
        target_names  = meta['target_names']
        feature_names = meta['feature_names']

    # Load the description from the README.
    with open(filenames['rdme'], 'r') as f:
        DESCR = f.read()

    # Load the dataset from the text file. A distinct local name is used so the
    # notebook-level `mydataset` variable is not shadowed inside this function.
    raw = np.loadtxt(filenames['data'])

    # Split the table: every column but the last is a feature, the last is the target
    data   = raw[:, 0:-1]
    target = raw[:, -1]

    # Create the bunch object
    return Bunch(
        data=data,
        target=target,
        filenames=filenames,
        target_names=target_names,
        feature_names=feature_names,
        DESCR=DESCR
    )

# Load the incident dataset once; the cells below read it through this variable.
mydataset = load_data()

# Show the shape of the feature matrix, then of the target vector (one per line)
print("{}\n{}".format(mydataset.data.shape, mydataset.target.shape))

(260364, 6)
(260364,)


In [5]:
def fit_and_evaluate(dataset, model, label, **kwargs):
    """
    Cross-validate ``model`` on ``dataset``, print the mean scores, then refit
    on the complete data and pickle the fitted estimator.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` attributes
        ``data`` is indexed by row to build the folds; ``target`` holds the labels.
    model : callable
        Estimator class/factory; a fresh estimator is created with
        ``model(**kwargs)`` for every fold and once more for the final fit.
    label : str
        Name used in the printed report. It also determines the output file:
        lower-cased, spaces replaced by ``-``, with ``.pickle`` appended,
        written relative to the current working directory.
    **kwargs
        Forwarded unchanged to ``model``.

    Notes
    -----
    Folds are shuffled without a fixed seed, so scores vary between runs.
    """
    start  = time.time() # Start the clock!
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

    # Work on the dataset that was passed in. Earlier this function read the
    # notebook-global `mydataset`, silently ignoring the `dataset` argument.
    X, y = dataset.data, dataset.target

    for train, test in KFold(X.shape[0], n_folds=12, shuffle=True):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        estimator = model(**kwargs)
        estimator.fit(X_train, y_train)

        expected  = y_test
        predicted = estimator.predict(X_test)

        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time()-start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())

    # Write official estimator to disk: refit on every row, not just one fold
    estimator = model(**kwargs)
    estimator.fit(X, y)

    outpath = label.lower().replace(" ", "-") + ".pickle"
    with open(outpath, 'wb') as f:
        pickle.dump(estimator, f)

    print("\nFitted model written to:\n{}".format(os.path.abspath(outpath)))

In [None]:
# Perform SVC Classification
#fit_and_evaluate(mydataset, SVC, "Incident_SVM Classifier")

In [6]:
# k-nearest-neighbours classifier with n_neighbors=12: 12-fold cross-validation,
# then a refit on all rows that is pickled as incident_knn-classifier.pickle
fit_and_evaluate(mydataset, KNeighborsClassifier, "Incident_KNN Classifier", n_neighbors=12)

Build and Validation of Incident_KNN Classifier took 23.643 seconds
Validation scores are as follows:

accuracy     0.984913
f1           0.977697
precision    0.972265
recall       0.984913
dtype: float64

Fitted model written to:
C:\project\ship-happens\incident_knn-classifier.pickle


In [7]:
# Random forest classifier with default hyper-parameters: 12-fold cross-validation,
# then a refit on all rows that is pickled as incident_random-forest-classifier.pickle
fit_and_evaluate(mydataset, RandomForestClassifier, "Incident_Random Forest Classifier")

Build and Validation of Incident_Random Forest Classifier took 43.656 seconds
Validation scores are as follows:

accuracy     0.981315
f1           0.976906
precision    0.973188
recall       0.981315
dtype: float64

Fitted model written to:
C:\project\ship-happens\incident_random-forest-classifier.pickle


In [10]:
# Logistic regression with default hyper-parameters (LogisticRegression is
# already imported in the import cell): 12-fold cross-validation, then a refit
# on all rows that is pickled as incident_logistic-regression.pickle
# NOTE(review): the recorded output contains a precision warning -- presumably
# at least one class is never predicted in some folds; confirm.
fit_and_evaluate(mydataset, LogisticRegression, "Incident_Logistic Regression")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Build and Validation of Incident_Logistic Regression took 43.749 seconds
Validation scores are as follows:

accuracy     0.985098
f1           0.977722
precision    0.970456
recall       0.985098
dtype: float64

Fitted model written to:
C:\project\ship-happens\incident_logistic-regression.pickle


In [11]:
# Extra-trees classifier with default hyper-parameters: 12-fold cross-validation,
# then a refit on all rows that is pickled as incident_extratrees-classifier.pickle
fit_and_evaluate(mydataset, ExtraTreesClassifier, "Incident_ExtraTrees Classifier")

Build and Validation of Incident_ExtraTrees Classifier took 25.629 seconds
Validation scores are as follows:

accuracy     0.979444
f1           0.976221
precision    0.973346
recall       0.979444
dtype: float64

Fitted model written to:
C:\project\ship-happens\incident_extratrees-classifier.pickle


In [12]:
# Bagging classifier with default hyper-parameters: 12-fold cross-validation,
# then a refit on all rows that is pickled as incident_bagging-classifier.pickle
fit_and_evaluate(mydataset, BaggingClassifier, "Incident_Bagging Classifier")

Build and Validation of Incident_Bagging Classifier took 89.055 seconds
Validation scores are as follows:

accuracy     0.980358
f1           0.976703
precision    0.973538
recall       0.980358
dtype: float64

Fitted model written to:
C:\project\ship-happens\incident_bagging-classifier.pickle


In [13]:
# Gaussian naive Bayes with default hyper-parameters: 12-fold cross-validation,
# then a refit on all rows that is pickled as incident_gaussian-nb.pickle
fit_and_evaluate(mydataset, GaussianNB, "Incident_Gaussian NB")


Build and Validation of Incident_Gaussian NB took 1.139 seconds
Validation scores are as follows:

accuracy     0.955608
f1           0.963438
precision    0.971800
recall       0.955608
dtype: float64

Fitted model written to:
C:\project\ship-happens\incident_gaussian-nb.pickle


In [14]:
# SGD-trained linear classifier with default hyper-parameters: 12-fold
# cross-validation, then a refit on all rows that is pickled as
# incident_sgd-classifier.pickle
# NOTE(review): the recorded output contains a precision warning -- presumably
# at least one class is never predicted in some folds; confirm.
fit_and_evaluate(mydataset, SGDClassifier, "Incident_SGD Classifier")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Build and Validation of Incident_SGD Classifier took 2.842 seconds
Validation scores are as follows:

accuracy     0.953722
f1           0.958261
precision    0.971249
recall       0.953722
dtype: float64

Fitted model written to:
C:\project\ship-happens\incident_sgd-classifier.pickle


In [5]:
# Take user data from a tab-delimited text file and predict accident (yes/no) using the KNN model
import csv

def load_model(path='incident_knn-classifier.pickle'):
    """Unpickle and return the fitted estimator stored at ``path``."""
    # NOTE: unpickling can execute arbitrary code -- only load pickle files
    # produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)

model = load_model()

# Read the tab-delimited input and write one CSV result line per data row
with open('baltimore_knn.txt', 'r') as fin, open('result_baltimore_knn.txt', 'w') as fout:
    reader = csv.reader(fin, delimiter='\t')
    writer = csv.writer(fout)

    # Go through all the data and run the predictions, writing to the results
    for idx, row in enumerate(reader):
        # A blank line yields an empty row; there is nothing to predict for it
        if not row:
            continue

        # csv hands back strings; convert explicitly instead of relying on the
        # estimator to coerce text into numbers
        features = [float(value) for value in row]

        # predict() returns one label per sample; write the single scalar
        # rather than the array (which would appear as e.g. "[0.]" in the file)
        accident = model.predict([features])[0]
        writer.writerow([idx+1, row[0], accident])

In [7]:
# Take user data from a tab-delimited text file and predict accident (yes/no) using the random forest classifier model
import csv

def load_model(path='incident_random-forest-classifier.pickle'):
    """Unpickle and return the fitted estimator stored at ``path``."""
    # NOTE: unpickling can execute arbitrary code -- only load pickle files
    # produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)

model = load_model()

# Read the tab-delimited input and write one CSV result line per data row
with open('baltimore_randf.txt', 'r') as fin, open('result_baltimore_randf.txt', 'w') as fout:
    reader = csv.reader(fin, delimiter='\t')
    writer = csv.writer(fout)

    # Go through all the data and run the predictions, writing to the results
    for idx, row in enumerate(reader):
        # A blank line yields an empty row; there is nothing to predict for it
        if not row:
            continue

        # csv hands back strings; convert explicitly instead of relying on the
        # estimator to coerce text into numbers
        features = [float(value) for value in row]

        # predict() returns one label per sample; write the single scalar
        # rather than the array (which would appear as e.g. "[0.]" in the file)
        accident = model.predict([features])[0]
        writer.writerow([idx+1, row[0], accident])