# Visualizing Results Interactively
This notebook goes through the process of taking Scania heavy truck failure data and visualizing it in two different ways.  

First, I've reduced the features to 3-D to create an interactive mapping of the positive and negative (1 and 0) classes.  

Afterwards, there is an interactive confusion matrix to demonstrate how changing the prediction threshold affects model costs on the test data.

## Interactive Visualization
Multiple Ideas:  

1) Confusion Matrix with Sliding Threshold:  
* Demonstrates the effect of prioritizing different errors
* Shows how I minimize the model cost

2) 3-D Visualization of Data: an excuse to use PCA
* May not actually mean much since the features are anonymized
* Fun tool to visualize how separable the classes are

In [191]:
#plotly imports 

import plotly as py 
import plotly.graph_objs as go
from plotly import __version__
#use this format for working locally 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, plot_mpl

init_notebook_mode(connected=True)



print('Plotly version: %s' %(__version__))


#Other Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from custom_metrics import scania_score, confusion_matrix
import data_cleaning as dc
import pickle
from ipywidgets import interactive, FloatSlider

Plotly version: 3.3.0


In [122]:
# Bring in data from csv files
# ready_aps_data() lives in the project's data_cleaning module; its four
# return values are unpacked as train/test features and train/test labels.
# NOTE(review): presumably the frames are already cleaned/imputed -- confirm
# in data_cleaning before relying on it.
X_train, X_test, y_train, y_test = dc.ready_aps_data()

In [None]:
# Trim large outliers for visualization reasons

In [124]:
# Peek at the test labels as a NumPy array (0 = negative class, 1 = positive class)
y_test.values

array([0, 0, 0, ..., 0, 0, 0])

In [142]:
# Reduce testing data to 3 principal components
# Standardise first so that no feature dominates the components purely
# because of its scale. Both the scaler and the PCA are fitted on the test
# set itself; in this notebook the projection only feeds the 3-D plot.
ssx = StandardScaler()
scaled_X_test = ssx.fit_transform(X_test)

# Fix the seed: depending on the data size PCA's default 'auto' solver can
# pick a randomized SVD, which would make the projection (and therefore the
# plot) vary from run to run.
pca_3 = PCA(n_components=3, random_state=0)
X_3d = pca_3.fit_transform(scaled_X_test)



In [156]:
def reject_outliers(data, m=5):
    """Flag the rows of ``data`` that contain no extreme values.

    A value is treated as an outlier when it lies ``m`` or more standard
    deviations (population standard deviation, computed per column) away
    from its column mean.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Two-dimensional numeric data; each row is one sample.
    m : float, default 5
        Cut-off, expressed in column standard deviations.

    Returns
    -------
    numpy.ndarray of bool, shape (n_samples,)
        ``True`` for rows whose every column is strictly within ``m``
        standard deviations of that column's mean. A mask is returned rather
        than the filtered data so companion arrays (e.g. labels) can be
        filtered with the same selection.

    Notes
    -----
    The comparison is strict, so a zero-variance column rejects every row.
    """
    deviations = np.abs(data - np.mean(data, axis=0))
    within_limits = np.asarray(deviations < m * np.std(data, axis=0))
    # Vectorised row-wise reduction; avoids one Python-level call per row
    # through np.apply_along_axis(all, 1, ...).
    return within_limits.all(axis=1)

# Keep only the rows that stay within 10 standard deviations on every
# principal component, and apply the same boolean mask to the labels so the
# points and their classes remain aligned.
values_to_retain = reject_outliers(X_3d, m=10)
X_3d_graph = X_3d[values_to_retain]
y_3d_graph = y_test.values[values_to_retain]

In [157]:
# Sanity check: points and labels must have the same number of rows after filtering
X_3d_graph.shape, y_3d_graph.shape

((15970, 3), (15970,))

In [158]:
# Boolean mask for the positive class (label 1); the negative class is selected with ~pos
pos = y_3d_graph == 1

In [None]:
'''
To-do:

Sample a smaller size of the negative class (< 5000)
Export to plotly
Embed
'''

In [186]:
# Graphing 3-D reduced data
# Marker styles: orange for the positive class, grey for the negative class.
marker_pos = dict(size=3, symbol='circle',
                  color='rgb(255, 127, 14)',
                  line=dict(width=1, color='rgba(217, 217, 217, 0.14)'),
                  opacity=0.8)
marker_neg = dict(size=3, symbol='circle',
                  color='rgb(127, 127, 127)',
                  opacity=0.8)

# Split the points once per class instead of re-applying the mask per axis.
X_pos = X_3d_graph[pos]
X_neg = X_3d_graph[~pos]

trace1 = go.Scatter3d(x=X_pos[:, 0], y=X_pos[:, 1], z=X_pos[:, 2],
                      mode='markers',
                      marker=marker_pos,
                      name='APS Failure')

trace2 = go.Scatter3d(x=X_neg[:, 0], y=X_neg[:, 1], z=X_neg[:, 2],
                      mode='markers',
                      marker=marker_neg,
                      name='No Check Needed')
data = [trace1, trace2]

axis_title_font = dict(size=18)
layout = go.Layout(
    # Leave a strip at the top so the figure title is not clipped.
    margin=dict(l=0, r=0, b=0, t=40),
    title='Scania Truck Data Reduced to 3 Components',
    # 3-D axes are configured under `scene`; the top-level xaxis/yaxis only
    # apply to 2-D cartesian plots and are ignored by Scatter3d traces.
    # `title` + `titlefont` is the axis-title form used by the plotly 3.x
    # release this notebook runs on.
    scene=dict(
        xaxis=dict(title='PC1', titlefont=axis_title_font),
        yaxis=dict(title='PC2', titlefont=axis_title_font),
        zaxis=dict(title='PC3', titlefont=axis_title_font),
    ),
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename='simple-3d-scatter')

## Sliding Confusion Matrix

In [189]:
# load trained model
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load model files that come from a trusted source.
# The context manager closes the file handle as soon as the model is read.
with open('vanilla_rfc.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

In [216]:
def make_confusion_matrix(model, threshold=0.5):
    """Draw the test-set confusion matrix at ``threshold`` and print its cost.

    Reads the notebook-level ``X_test`` and ``y_test``; nothing is returned.

    Parameters
    ----------
    model : object exposing ``predict_proba``
        Fitted classifier; column 1 of ``predict_proba`` is used as the
        class-1 probability.
    threshold : float, default 0.5
        A sample is predicted as class 1 when its class-1 probability is
        greater than or equal to this value.
    """
    class_names = ['No Failure', 'Failure']

    # Apply a custom decision threshold to the class-1 probabilities instead
    # of relying on model.predict's built-in cut-off.
    positive_proba = model.predict_proba(X_test)[:, 1]
    y_predict = positive_proba >= threshold
    counts = confusion_matrix(y_test, y_predict)

    plt.figure(dpi=130)
    sns.heatmap(
        counts,
        cmap=plt.cm.Blues,
        annot=True,
        square=True,
        fmt='d',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted')
    plt.ylabel('APS Status')

    # scania_score is the project's cost metric from custom_metrics.
    print("Cost to Scania: $", scania_score(y_test, y_predict))

In [217]:
# Slider-driven confusion matrix. The explicit FloatSlider spells out what the
# (0.0, 0.5, 0.005) shorthand expands to: range 0.0-0.5, step 0.005, starting
# at the midpoint 0.25.
interactive(
    lambda threshold: make_confusion_matrix(best_model, threshold),
    threshold=FloatSlider(value=0.25, min=0.0, max=0.5, step=0.005,
                          description='threshold'),
)

interactive(children=(FloatSlider(value=0.25, description='threshold', max=0.5, step=0.005), Output()), _dom_c…