# Novelty and outlier detection

Going through all of the `scikit-learn` methods for outlier detection to see what we get!

## Data

Import the data:

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

In [2]:
# Import cleaned data from clean_data.py
df = pd.read_csv('../data/cleaned_Base.csv')

target = 'fraud_bool'
X = df.drop(target, axis = 1)
y = df[target]

In [3]:
# Print all column names and classes of X
print(X.dtypes)

Unnamed: 0                                  int64
income                                    float64
name_email_similarity                     float64
current_address_months_count                int64
customer_age                                int64
days_since_request                        float64
payment_type                               object
zip_count_4w                                int64
velocity_6h                               float64
velocity_24h                              float64
velocity_4w                               float64
bank_branch_count_8w                        int64
date_of_birth_distinct_emails_4w            int64
employment_status                          object
credit_risk_score                           int64
email_is_free                               int64
housing_status                             object
phone_home_valid                            int64
phone_mobile_valid                          int64
has_other_cards                             int64


In [4]:
np.unique(X['payment_type'])

array(['AA', 'AB', 'AC', 'AD', 'AE'], dtype=object)

In [5]:
# Create one-hot encoded version
ohe_cols = ['payment_type', 
            'employment_status', 
            'housing_status', 
            'source',
            'device_os']

X_ohe = pd.get_dummies(X, columns=ohe_cols)

# Train on month 0
X_ohe_train = X_ohe.loc[X_ohe['month'] == 0]
X_ohe_train = X_ohe_train.drop('month', axis = 1)
print(X_ohe_train.shape)
y_train = y[X_ohe['month'] == 0]

# Test on month 1
X_ohe_test = X_ohe.loc[X_ohe['month'] == 1]
X_ohe_test = X_ohe_test.drop('month', axis = 1)
y_test = y[X_ohe['month'] == 1]

(132440, 54)


In [14]:
# Create scaled version
scaler = StandardScaler()
X_ohe_train_scaled = scaler.fit_transform(X_ohe_train)
X_ohe_test_scaled = scaler.fit_transform(X_ohe_test)

In [6]:
# Setup functions
# Create a results dataframe to store and later compare results
results = pd.DataFrame()
results['Metrics'] = ['True Negatives', 'False Negatives', 'False Positives', 'True Positives', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Support']

def get_metrics(labels, pred_labels):
    conf_matrix = np.asarray(confusion_matrix(labels, pred_labels))
    metrics = np.concatenate(([accuracy_score(labels, pred_labels)], precision_recall_fscore_support(labels, pred_labels, average='binary')))
    return np.concatenate((conf_matrix.reshape(-1),metrics))

def fit_model(input_data, model_fn, model_args=None, threshold=None, labels=None):
    return model_fn(input_data, model_args, threshold, labels)

def predict_model(input_data, model, predict_fn, threshold=None):
    return predict_fn(input_data, model, threshold)

## Isolation forest

Uses decision trees: randomly selects a feature and then randomly selects a split value. Theoretically, outliers will need fewer random partitions to be isolated and thus have a shorter path length on the tree.

- Scale: no
- One-hot encode: yes

In [7]:
def fit_isolation_forest(input_data, args=None, threshold=None, labels=None):
    if threshold == None:
        model = IsolationForest()
    else:
        model = IsolationForest(contamination=threshold)
    
    model.fit(input_data)

    return model

def predict_isolation_forest(input_data, model, threshold=None):
    y_pred = model.predict(input_data)
    y_pred = [1 if pred == -1 else 0 for pred in y_pred]

    return y_pred

In [8]:
# Calculate the contamination parameter
contamination_rate = y.sum() / len(y) # going with overall incidence in the data
print("Proportion of fraud in the entire dataset: ", contamination_rate)

Proportion of fraud in the entire dataset:  0.011029


In [9]:
# Run isolation forest on training data
if_model = fit_model(X_ohe_train, fit_isolation_forest, threshold=contamination_rate)

if_pred_test = predict_model(X_ohe_test, if_model, predict_isolation_forest)

print('CONFUSION MATRIX:\n', confusion_matrix(y_test,if_pred_test))
print("Classification Report:\n", classification_report(y_test, if_pred_test))

CONFUSION MATRIX:
 [[121665   4757]
 [  1095    103]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98    126422
           1       0.02      0.09      0.03      1198

    accuracy                           0.95    127620
   macro avg       0.51      0.52      0.51    127620
weighted avg       0.98      0.95      0.97    127620



In [10]:
results['Isolation Forest'] = get_metrics(y_test, if_pred_test)

## Local Outlier Factor (LOF)

LOF computes the local density deviation of a given data point with respect to its neighbors. If a point has substantially lower density than its neighbors, it is identified as an outlier.

The contamination parameter sets the proportion of the most isolated points to be predicted as anomalies, so we want to set that to what we believe the incidence of fraud to be.

- Scale: I think so? Gonna do it
- One-hot encode: yes

In [15]:
# LOF model
lof_model = LocalOutlierFactor(n_neighbors = 20,contamination = contamination_rate, novelty=False)
# Need to just run it on the testing data and identify the outliers there
lof_pred_test = lof_model.fit_predict(X_ohe_test_scaled)

In [16]:
# np.unique(lof_pred_train, return_counts = True)
# Recode the predictions
lof_pred_test[lof_pred_test == 1] = 0
lof_pred_test[lof_pred_test == -1] = 1

print('CLASSIFICATION RESULTS')
print(classification_report(y_test, lof_pred_test))

CLASSIFICATION RESULTS
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    126422
           1       0.02      0.02      0.02      1198

    accuracy                           0.98    127620
   macro avg       0.50      0.50      0.50    127620
weighted avg       0.98      0.98      0.98    127620



In [18]:
results['LOF'] = get_metrics(y_test, lof_pred_test)

Unnamed: 0,Metrics,Isolation Forest,LOF
0,True Negatives,121665.0,125037.0
1,False Negatives,4757.0,1385.0
2,False Positives,1095.0,1175.0
3,True Positives,103.0,23.0
4,Accuracy,0.954145,0.97994
5,Precision,0.021193,0.016335
6,Recall,0.085977,0.019199
7,F1-Score,0.034005,0.017652
8,Support,,


## Gaussian Mixture

Gaussian mixture models assume all data points are generated from a mixture of a finite number of Gaussian distributions with unknown parameters. They try to find different subpopulations within the overall dataset.

- Scale: eh why not
- One-hot encode: yea

In [19]:
def fit_gaussian_mixture(input_data, args=None, threshold=None, labels=None):
    if threshold == None:
        model = GaussianMixture(n_components=2)
    else:
        model = GaussianMixture(n_components=2, reg_covar=threshold)
    
    model.fit(input_data)

    return model

def predict_gaussian_mixture(input_data, model, threshold=None):
    return model.predict(input_data)

In [22]:
gm_model = fit_model(X_ohe_train_scaled, fit_gaussian_mixture, None)

gm_pred_test = predict_model(X_ohe_test_scaled, gm_model, predict_gaussian_mixture)
print('CONFUSION MATRIX:\n', confusion_matrix(y_test,gm_pred_test))
print("Classification Report:\n", classification_report(y_test, gm_pred_test))

CONFUSION MATRIX:
 [[34781 91641]
 [  249   949]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.28      0.43    126422
           1       0.01      0.79      0.02      1198

    accuracy                           0.28    127620
   macro avg       0.50      0.53      0.23    127620
weighted avg       0.98      0.28      0.43    127620



In [24]:
results['Gaussian Mixture'] = get_metrics(y_test, gm_pred_test)
results

Unnamed: 0,Metrics,Isolation Forest,LOF,Gaussian Mixture
0,True Negatives,121665.0,125037.0,34781.0
1,False Negatives,4757.0,1385.0,91641.0
2,False Positives,1095.0,1175.0,249.0
3,True Positives,103.0,23.0,949.0
4,Accuracy,0.954145,0.97994,0.279972
5,Precision,0.021193,0.016335,0.010249
6,Recall,0.085977,0.019199,0.792154
7,F1-Score,0.034005,0.017652,0.020237
8,Support,,,
