In [1]:
import itertools
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer


# Make pandas treat +inf / -inf as missing ("NA") values in its computations
pd.set_option('mode.use_inf_as_na', True)

In [2]:
# Combine the provided training and test files into one frame so that X and y
# can be re-split into training and testing sets by this notebook itself;
# having everything in one frame is also helpful for later cross-validation.
# skiprows drops the first 20 lines of each file
# (presumably a descriptive preamble before the CSV header - confirm against the data files).
train_df = pd.read_csv('data/aps_failure_training_set.csv', skiprows=range(0, 20))
test_df = pd.read_csv('data/aps_failure_test_set.csv', skiprows=range(0, 20))
frames = [train_df, test_df]
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of both files are repeated, which makes label-based alignment ambiguous
df = pd.concat(frames, ignore_index=True)

In [3]:
# Simple preparation: encode the target as '0' / '1' and make every column numeric.
# astype(str) keeps this cell re-runnable: after the first run 'class' is already
# numeric and would no longer expose the .str accessor.
df['class'] = (
    df['class']
    .astype(str)
    .str.replace('neg', '0')
    .str.replace('pos', '1')
)
cols = df.columns
# errors='coerce' turns every entry that cannot be parsed as a number into NaN
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [4]:
# Separate the feature matrix X (every column except 'class') from the target vector y
cols = df.columns.drop('class')
X = df.loc[:, cols].values
y = df.loc[:, 'class'].values

In [5]:
# Split X and y into training and testing sets.
# test_size=0.25 is train_test_split's default, spelled out here; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
)

Feature selection <br>
http://scikit-learn.org/stable/modules/feature_selection.html

In [6]:
# (n_samples, n_features) of the training split
np.shape(X_train)

(57000, 170)

In [7]:
# Replace missing values with a per-column statistic (here: the median).
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
# strategy = {mean, median, most_frequent}

# Fit the imputer on the training split only. Fitting a second imputer on the test
# split would leak test-set statistics into preprocessing and would apply a different
# transformation to the test data than the one the model was trained on.
imp_train = Imputer(missing_values='NaN', strategy='median', axis=0)
imp_train = imp_train.fit(X_train)
# Fill both splits with the medians learned from the training data
X_train_imp = imp_train.transform(X_train)
X_test_imp = imp_train.transform(X_test)

In [8]:
# Keep the two features with the highest chi-squared score against the target
X_train_new = (
    SelectKBest(score_func=chi2, k=2)
    .fit(X_train_imp, y_train)
    .transform(X_train_imp)
)
X_train_new.shape

(57000, 2)

In [9]:
# Display the reduced training matrix (the columns kept by SelectKBest)
X_train_new

array([[2.334146e+06, 0.000000e+00],
       [1.072840e+05, 6.004000e+03],
       [2.311386e+06, 0.000000e+00],
       ...,
       [5.350240e+07, 0.000000e+00],
       [4.401360e+05, 0.000000e+00],
       [7.860612e+06, 0.000000e+00]])

In [None]:
# SVM
# http://scikit-learn.org/stable/modules/svm.html
# Linear-kernel support vector classifier with C=1.

# The "balanced" mode uses the values of y to automatically adjust weights
# inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))
classifier = svm.SVC(kernel='linear', C=1, class_weight="balanced")
classifier.fit(X_train_new, y_train)

# The model is trained on the SelectKBest-reduced matrix, so the test matrix must be
# reduced to the same columns before predicting; predicting on X_test_imp directly
# fails because its number of features differs from the training input.
# Fitting the same selector on the same training data picks the same columns as X_train_new.
feature_selector = SelectKBest(chi2, k=2).fit(X_train_imp, y_train)
X_test_new = feature_selector.transform(X_test_imp)
y_pred = classifier.predict(X_test_new)

In [None]:
# accuracy_score is not imported by name in this notebook; reach it through the
# imported sklearn.metrics module. Arguments follow the (y_true, y_pred) convention.
acc = metrics.accuracy_score(y_test, y_pred)
print("accuracy_score:", acc)

In [None]:
# Examine the class distribution of the testing set (value_counts is a pandas Series method)
test_labels = pd.Series(y_test)
test_labels.value_counts()

In [None]:
# Null accuracy for a 0/1-coded binary problem: the accuracy obtained by always
# predicting the most frequent class
positive_share = y_test.mean()
max(positive_share, 1 - positive_share)

In [None]:
from __future__ import print_function
# Compare the first 100 true labels with the first 100 predictions
first_hundred = slice(0, 100)
print('True:', y_test[first_hundred])
print('Pred:', y_pred[first_hundred])

## Conclusion:

Classification accuracy is the easiest classification metric to understand<br>
But, it does not tell you the underlying distribution of response values<br>
And, it does not tell you what "types" of errors your classifier is making<br>

In [None]:
# Helper that prints and plots a confusion matrix for the classifier's predictions
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : array-like, square
        Confusion matrix; rows are drawn as true labels, columns as predicted labels.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        If True, every row is divided by its sum before printing and plotting.
        Rows whose sum is zero are shown as all zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used by `plt.imshow`.

    The matrix is drawn on the current matplotlib figure; nothing is returned.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Only divide where a row has samples; empty rows stay at 0 instead of
        # producing NaN and a divide-by-zero warning
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; switch text colour at half the maximum so the number
    # stays readable on both light and dark cells
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run tight_layout last so the axis labels are taken into account
    plt.tight_layout()

class_names = (['Neg', 'Pos'])

# Compute confusion matrix.
# confusion_matrix is not imported by name in this notebook; use the imported
# sklearn.metrics module. labels=[0, 1] pins the row/column order to match class_names.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
np.set_printoptions(precision=2)

# Raw counts
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Row-normalized values
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

+ Every observation in the testing set is represented in exactly one box
+ It's a 2x2 matrix because there are 2 response classes
+ The format shown here is not universal

Basic terminology <br>

+ True Positives (TP): we correctly predicted that there is some failure
+ True Negatives (TN): we correctly predicted that there is no failure
+ False Positives (FP): we incorrectly predicted that there is some failure (a "Type I error")
+ False Negatives (FN): we incorrectly predicted that there is no failure (a "Type II error")

In [None]:
# Keep the confusion matrix under a shorter name and pick out its four cells
# (first index = true class, second index = predicted class)
confusion = cnf_matrix
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

## Metrics computed from a confusion matrix
#### Classification Accuracy: Overall, how often is the classifier correct?

In [None]:
# Accuracy from the confusion-matrix cells, cross-checked against scikit-learn.
# accuracy_score is only reachable through the imported sklearn.metrics module.
print((TP + TN) / float(TP + TN + FP + FN))
print(metrics.accuracy_score(y_test, y_pred))

#### Classification Error: Overall, how often is the classifier incorrect?

Also known as "Misclassification Rate"

In [None]:
# Misclassification rate by hand, then as the complement of scikit-learn's accuracy
n_observations = float(TP + TN + FP + FN)
print((FP + FN) / n_observations)
print(1 - metrics.accuracy_score(y_test, y_pred))

#### Sensitivity: When the actual value is positive, how often is the prediction correct?

How "sensitive" is the classifier to detecting positive instances?
Also known as "True Positive Rate" or "Recall"

In [None]:
# Sensitivity (recall) by hand, then via scikit-learn
actual_positives = float(TP + FN)
print(TP / actual_positives)
print(metrics.recall_score(y_test, y_pred))

#### Specificity: When the actual value is negative, how often is the prediction correct?

+ How "specific" (or "selective") is the classifier in predicting positive instances?

In [None]:
# Specificity: share of actual negatives that were predicted negative
actual_negatives = float(TN + FP)
print(TN / actual_negatives)

#### False Positive Rate: When the actual value is negative, how often is the prediction incorrect?

In [None]:
# False positive rate: share of actual negatives that were predicted positive
actual_negatives = float(TN + FP)
print(FP / actual_negatives)

#### Precision: When a positive value is predicted, how often is the prediction correct?

+ How "precise" is the classifier when predicting positive instances?

In [None]:
# Precision by hand, then via scikit-learn
predicted_positives = float(TP + FP)
print(TP / predicted_positives)
print(metrics.precision_score(y_test, y_pred))

#### Conclusion:

+ Confusion matrix gives you a more complete picture of how your classifier is performing
+ Also allows you to compute various classification metrics, and these metrics can guide your model selection

#### Which metrics should you focus on?

+ Choice of metric depends on your business objective
+ Spam filter (positive class is "spam"): Optimize for precision or specificity because false negatives (spam goes to the inbox) are more acceptable than false positives (non-spam is caught by the spam filter)
+ Fraudulent transaction detector (positive class is "fraud"): Optimize for sensitivity because false positives (normal transactions that are flagged as possible fraud) are more acceptable than false negatives (fraudulent transactions that are not detected)

Cross validation

In [None]:
# 3-fold cross-validation of the SVM defined above on the selected training features.
# The previous version referenced `rfmodel` and `result`, which are never defined in
# this notebook; the estimator evaluated here is `classifier`, and it is scored on
# X_train_new because that is the feature set it was trained on.
# shuffle=True is required for random_state to have any effect in KFold.
kfold = KFold(n_splits=3, shuffle=True, random_state=7)
scores = cross_val_score(classifier, X_train_new, y_train, cv=kfold, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("kfold mean:", scores.mean())