## Novelty and Outlier Detection methods from scikit-learn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

%matplotlib inline
# %matplotlib notebook
style.use('ggplot')

In [2]:
data = pd.read_csv("../data/creditcard.csv")
data.drop(['Time'], axis=1, inplace=True)
labels = data['Class']
labels_svm = labels.copy()
labels_svm[labels == 1] = -1
labels_svm[labels == 0] = 1

In [3]:
from sklearn.preprocessing import StandardScaler

scaled_features = StandardScaler().fit_transform(data.values)
scaled_data = pd.DataFrame(scaled_features,
                        index=data.index,
                        columns=data.columns)
scaled_data.drop(['Class'], axis=1, inplace=True)

In [4]:
from sklearn.model_selection import train_test_split

test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(scaled_data,
                                                    labels_svm,
                                                    test_size=test_size)

In [5]:
non_fraud_X_train = X_train[y_train==1]
non_fraud_y_train = y_train[y_train==1]

## One-Class SVM

In [6]:
from sklearn.ensemble import IsolationForest

Parameters copied from scikit-learn example, need to be fine-tuned

In [17]:
clf = IsolationForest(max_samples=100, contamination=0.01, n_jobs=-1)
clf.fit(X_train)

IsolationForest(bootstrap=False, contamination=0.01, max_features=1.0,
        max_samples=100, n_estimators=100, n_jobs=-1, random_state=None,
        verbose=0)

In [18]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [19]:
y_score_train = clf.decision_function(X_train)
y_score_test = clf.decision_function(X_test)

In [22]:
from sklearn.metrics import (average_precision_score,
                             accuracy_score,
                             classification_report,
                             f1_score)
average_precision = average_precision_score(y_test, y_score_test)
f1 = f1_score(y_test, y_pred_test)
acc = accuracy_score(y_test, y_pred_test)


print('Average precision-recall score: {0:0.4f}'.format(
      average_precision))
print('F1 score: {0:0.4f}'.format(f1))
print('Accuracy score: {0:0.4f}'.format(acc))

Average precision-recall score: 0.9999
F1 score: 0.9947
Accuracy score: 0.9894


In [21]:
print(classification_report(y_test, y_pred_test))

             precision    recall  f1-score   support

         -1       0.09      0.62      0.16       136
          1       1.00      0.99      0.99     85307

avg / total       1.00      0.99      0.99     85443



## Optimization

In [20]:
class Objective_IF(object):
    """
    A class to encapsulate the SVC model and our objective function (the error in this case, to minimize)
    """
    
    def __init__(self):
        self.history_f = []
        self.fbest = np.inf
        self.history_f_best = []
        
    def encode(self, nu, gamma):
        """
        Returns a dict with the square value of the arguments C and gamma.
        """
        nu2 = nu ** 2
        self.nu_max = nu2 if nu2 > self.nu_max else self.nu_max
        return {'nu': nu2 / self.nu_max, 'gamma': gamma ** 2}
    
    def __call__(self, pair):
        
        
        
        params = self.encode(*pair)
        nu_in = params['nu']
        gamma_in = params['gamma']
        
        
        clf = IsolationForest(max_samples=100, contamination=0.01, n_jobs=-1)
        clf.fit(non_fraud_X_train)
        
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)
        y_score_train = clf.decision_function(X_train)
        y_score_test = clf.decision_function(X_test)
        
        average_precision = average_precision_score(y_test, y_score_test)
        f1 = f1_score(y_test, y_pred_test)
        acc = accuracy_score(y_test, y_pred_test)
        
        f = 1 - acc # error function
        
        self.history_f.append(f)
        if f < self.fbest:
            self.fbest = f
        self.history_f_best.append(self.fbest)
        return f

In [21]:
import cma

In [None]:
fun_cma = Objective_SVM()
res_cma = cma.fmin(fun_cma, np.array([0.5, 0.5]), 0.25)

(3_w,6)-aCMA-ES (mu_w=2.0,w_1=63%) in dimension 2 (seed=473977, Tue Mar  6 12:21:24 2018)


In [None]:
cma.plot()