# Random Forest Model

__Imports__

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC

style.use('fivethirtyeight')
# import jupyterthemes as jt
# context = jt.jtplot.set_context('poster')
# jt.jtplot.set_style(context, 'monokai', ticks=True, spines=True)
# jt.stylefx.style_layout('nb_style', theme='monokai',
#                         toolbar=True, nbname=True, cellwidth='1000')
# jt.stylefx.set_nb_theme('monokai')


## Read in the data and instantiate models

In [2]:
df = pd.read_csv("../data/device_failure_data_scientist.csv")
df.head()

Unnamed: 0,date,device,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,failure
0,15001,S1F01085,215630672,56,0,52,6,407438,0,0,7,0
1,15001,S1F0166B,61370680,0,3,0,6,403174,0,0,0,0
2,15001,S1F01E6Y,173295968,0,0,0,12,237394,0,0,0,0
3,15001,S1F01JE0,79694024,0,0,0,6,410186,0,0,0,0
4,15001,S1F01R2B,135970480,0,0,0,15,313173,0,0,3,0


In [4]:
feat = ['attribute2', 'attribute4', 'attribute7',
        'attribute8']  # list of features to use base off coeffs.
features = df.drop(columns=['date', 'device', 'failure'])
# features = df[feat]
labels = df['failure']

In [5]:
X = features
y = labels

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    shuffle=False)

In [7]:
ss = StandardScaler()

X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [78]:
svc = SVC(C=0.5, kernel='linear', gamma='scale', probability=True, random_state=42, class_weight='balanced', verbose=1)

In [79]:
# lsvc = LinearSVC(C=1, penalty='l1', dual=False, class_weight='balanced', random_state=42, max_iter=2000 )

## Model Evaluation

In [None]:
svc.fit(X_train_rs, y_train )

[LibSVM]

In [70]:
svc.score(X_train_rs, y_train)

0.9343150905001607

In [71]:
svc.score(X_test_rs, y_test)

0.9289615730625883

In [72]:
preds = svc.predict(X_test_rs)

In [73]:
# Generate confusion matrix.
confusion_matrix(
    y_test,  # True values.
    preds)  # Predicted values

array([[28902,  2196],
       [   15,    11]], dtype=int64)

In [74]:
con_mat = confusion_matrix(y_test, preds)

In [75]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [76]:
# Specificity and Sensitivity

spec = tn / (tn + fp)
sens = tp / (tp + fn)

print(f'Specificity: {round(spec, 4)}')
print(f'Sensitivity: {round(sens, 4)}')

Specificity: 0.9294
Sensitivity: 0.4231


### ROC AUC 

In [77]:
pred_proba = [i[1] for i in svc.predict_proba(X_test_rs)]

pred_df = pd.DataFrame({'true_values': y_test, 'pred_probs': pred_proba})

# Create figure.
plt.figure(figsize=(10, 7))

# Create two histograms of observations.
plt.hist(pred_df[pred_df['true_values'] == 0]['pred_probs'],
         bins=25,
         color='b',
         alpha=0.6,
         label='Outcome = 0')
plt.hist(pred_df[pred_df['true_values'] == 1]['pred_probs'],
         bins=25,
         color='orange',
         alpha=0.6,
         label='Outcome = 1')

# Add vertical line at P(Outcome = 1) = 0.5.
plt.vlines(x=0.5, ymin=0, ymax=65, color='r', linestyle='--')

# Label axes.
plt.title('Distribution of P(Outcome = 1)', fontsize=22)
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Predicted Probability that Outcome = 1', fontsize=18)

# Create legend.
plt.legend(fontsize=20)

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [40]:
# Create figure.
plt.figure(figsize=(10, 7))

# Create threshold values.
thresholds = np.linspace(0, 1, 200)

# Define function to calculate sensitivity. (True positive rate.)


def TPR(df, true_col, pred_prob_col, threshold):
    true_positive = df[(df[true_col] == 1)
                       & (df[pred_prob_col] >= threshold)].shape[0]
    false_negative = df[(df[true_col] == 1)
                        & (df[pred_prob_col] < threshold)].shape[0]
    return true_positive / (true_positive + false_negative)


# Define function to calculate 1 - specificity. (False positive rate.)


def FPR(df, true_col, pred_prob_col, threshold):
    true_negative = df[(df[true_col] == 0)
                       & (df[pred_prob_col] <= threshold)].shape[0]
    false_positive = df[(df[true_col] == 0)
                        & (df[pred_prob_col] > threshold)].shape[0]
    return 1 - (true_negative / (true_negative + false_positive))


# Calculate sensitivity & 1-specificity for each threshold between 0 and 1.
tpr_values = [
    TPR(pred_df, 'true_values', 'pred_probs', prob) for prob in thresholds
]
fpr_values = [
    FPR(pred_df, 'true_values', 'pred_probs', prob) for prob in thresholds
]

# Plot ROC curve.
plt.plot(
    fpr_values,  # False Positive Rate on X-axis
    tpr_values,  # True Positive Rate on Y-axis
    label='ROC Curve')

# Plot baseline. (Perfect overlap between the two populations.)
plt.plot(np.linspace(0, 1, 200),
         np.linspace(0, 1, 200),
         label='baseline',
         linestyle='--')

# Label axes.
plt.title('Receiver Operating Characteristic Curve', fontsize=22)
plt.ylabel('Sensitivity', fontsize=18)
plt.xlabel('1 - Specificity', fontsize=18)

# Create legend.
plt.legend(fontsize=16)

NameError: name 'pred_df' is not defined

<Figure size 720x504 with 0 Axes>

In [41]:
roc_auc_score(pred_df['true_values'], pred_df['pred_probs'])

NameError: name 'pred_df' is not defined

## Results

This approach used some hyperparameter tuning: Changing the solver to accomodate `elastic net` and increasing the penalty.  I also considered that the devices that fail run each day up until they fail so the time series could be an important aspect.  The exact date doesn't matter since each there didn't seem to be a direct dependence on the date.

In [42]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96     31098
           1       0.00      0.42      0.01        26

    accuracy                           0.93     31124
   macro avg       0.50      0.68      0.49     31124
weighted avg       1.00      0.93      0.96     31124

