## Novelty and Outlier Detection methods from scikit-learn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

%matplotlib inline
# %matplotlib notebook
style.use('ggplot')

In [3]:
data = pd.read_csv("../data/creditcard.csv")
data.drop(['Time'], axis=1, inplace=True)
labels = data['Class']
labels_svm = labels.copy()
labels_svm[labels == 1] = -1
labels_svm[labels == 0] = 1

In [4]:
from sklearn.preprocessing import StandardScaler

scaled_features = StandardScaler().fit_transform(data.values)
scaled_data = pd.DataFrame(scaled_features,
                        index=data.index,
                        columns=data.columns)
scaled_data.drop(['Class'], axis=1, inplace=True)

In [5]:
from sklearn.model_selection import train_test_split

test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(scaled_data,
                                                    labels_svm,
                                                    test_size=test_size)

In [6]:
non_fraud_X_train = X_train[y_train==1]
non_fraud_y_train = y_train[y_train==1]

## One-Class SVM

In [7]:
from sklearn import svm

Parameters copied from scikit-learn example, need to be fine-tuned

In [8]:
clf = svm.OneClassSVM(nu=0.1, kernel='rbf', gamma=0.1, verbose=True)
clf.fit(non_fraud_X_train)

[LibSVM]

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.1, kernel='rbf',
      max_iter=-1, nu=0.1, random_state=None, shrinking=True, tol=0.001,
      verbose=True)

In [None]:
from sklearn.externals import joblib

In [None]:
joblib.dump(clf, 'oneclass_svm_non_tuned.pkl')

In [None]:
clf = joblib.load('oneclass_svm_non_tuned.pkl')

In [13]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [14]:
y_score_train = clf.decision_function(X_train)
y_score_test = clf.decision_function(X_test)

In [None]:
y_pred_train_bin[y_pred_train_bin == 1] = 0 # not fraud
y_pred_train_bin[y_pred_train_bin == -1] = 1 # fraud
y_pred_test_bin = y_pred_test.copy()

In [19]:
np.unique(y_pred_test)

array([-1,  1])

In [None]:
y_pred_test[y_pred_test == -1]

In [20]:
from sklearn.metrics import (average_precision_score,
                             accuracy_score,
                             classification_report,
                             f1_score)
average_precision = average_precision_score(y_test, y_score_test)
f1 = f1_score(y_test, y_pred_test)
acc = accuracy_score(y_test, y_pred_test)


print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))
print('F1 score: {0:0.2f}'.format(f1))
print('Accuracy score: {0:0.2f}'.format(acc))

ValueError: Target is multiclass but average='binary'. Please choose another average setting.

In [16]:
classification_report(y_test, y_pred_test)

array([[  0.68683533],
       [ 24.18481757],
       [-23.46971097],
       ..., 
       [ 29.93386352],
       [ 42.91741061],
       [  9.27360962]])