In [None]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: Outlier Detection                 ###
### Date: 06/04/2018                           ###
##################################################

# http://scikit-learn.org/stable/auto_examples/covariance/plot_outlier_detection.html

######################################################################
########                     Import Modules                   ########
######################################################################
import py_effo as py_effo

### pandas
# Pandas is for structured data operations and manipulations, extensively used for data preparation
import pandas as pd

### numpy
# NumPy stands for Numerical Python, a library that contains basic linear algebra functions, Fourier transforms and advanced random
# number capabilities
import numpy as np 

### sklearn
# Sklearn contains basic statistical models
from sklearn.datasets import load_boston
from sklearn.covariance import EllipticEnvelope
from sklearn import svm
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# As well as a module to calculate model performance statistics
from sklearn import metrics

### Scipy
from scipy import stats

### Matplotlib
# Matplotlib is a Python based plotting library with complete 2D support and limited 3D support
%matplotlib inline
import matplotlib as mlb
import matplotlib.font_manager
import matplotlib.pyplot as plt

### Seaborn
# Seaborn is a Python visualization library based on Matplotlib, providing a high-level interface for statistical graphing
# Seaborn supports numpy and pandas data structures as well as statistical routines from scipy and statsmodels
# Note: https://seaborn.pydata.org/introduction.html
import seaborn as sns

In [None]:
######################################################################
########                  Outlier Detection - 1               ########
######################################################################

######################################################################
########                    Import Data                       ########
######################################################################

### Boston Data
# Load the feature matrix once and slice both two-column views from it,
# rather than re-reading the dataset for every view.
boston_features = load_boston()['data']
# Two clusters (feature columns 8 and 10)
x1 = boston_features[:, [8, 10]]
# Banana-shaped (feature columns 5 and 12)
x2 = boston_features[:, [5, 12]]

########  Define "classifiers" to be used ########
# All three detectors are configured with the same expected outlier share
# (0.261); OneClassSVM receives it through `nu` instead of `contamination`.
classifiers = {
    "Empirical Covariance": EllipticEnvelope(support_fraction = 1.,
                                             contamination = 0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
    EllipticEnvelope(contamination = 0.261),
    "OCSVM": OneClassSVM(nu = 0.261, gamma = 0.05)}
# One contour color per classifier, matched by position in `classifiers`
colors = ['m', 'g', 'b']
# Filled in by the training loop: classifier name -> contour drawn for x1 / x2
legend1 = {}
legend2 = {}

######## Train frontier model for detection of outliers ########
# Evaluation grids (500 x 500) on which each decision function is sampled
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))

# One entry per figure: (figure number, training data, grid x, grid y,
# dict that collects the contour drawn for each classifier)
frontier_panels = (
    (1, x1, xx1, yy1, legend1),
    (2, x2, xx2, yy2, legend2),
)

# For every classifier, refit it on each data set in turn and overlay the
# zero level of its decision function on the matching figure.
for color_idx, (clf_name, clf) in enumerate(classifiers.items()):
    for fig_num, train_data, grid_x, grid_y, contour_store in frontier_panels:
        plt.figure(fig_num)
        clf.fit(train_data)
        # Score every grid node, then fold the flat scores back to grid shape
        grid_nodes = np.c_[grid_x.ravel(), grid_y.ravel()]
        grid_scores = clf.decision_function(grid_nodes).reshape(grid_x.shape)
        contour_store[clf_name] = plt.contour(
            grid_x, grid_y, grid_scores,
            levels=[0], linewidths=2, colors=colors[color_idx])

######## Plot the results (= shape of the data points cloud) ########
### Two clusters
# Contour sets and their classifier names, in matching (insertion) order
legend1_values_list = list(legend1.values())
legend1_keys_list = list(legend1.keys())

plt.figure(1)
plt.title("Outlier detection on a real data set (boston housing)")
# Plot initial data as a scatter plot: x1[:, 0] on the x-axis, x1[:, 1] on the y-axis
plt.scatter(x1[:, 0], x1[:, 1], color = 'black')
# define arguments for annotation object
bbox_args = dict(boxstyle = "round", fc = "0.8")
arrow_args = dict(arrowstyle = "->")
plt.annotate("several confounded points", xy = (24, 19),
             xycoords = "data", textcoords = "data",
             xytext = (13, 10), bbox = bbox_args, arrowprops = arrow_args)
# Clip the axes to the extent of the evaluation grid
plt.xlim((xx1.min(), xx1.max()))
plt.ylim((yy1.min(), yy1.max()))
# One legend entry per classifier; the first artist of each contour set is
# used as its legend handle.
plt.legend([contour_set.collections[0] for contour_set in legend1_values_list],
           legend1_keys_list,
           loc = "upper center",
           prop = matplotlib.font_manager.FontProperties(size=12))
# x1 holds Boston feature columns [8, 10], i.e. (RAD, PTRATIO), and column 0
# is plotted on the x-axis: x is accessibility to radial highways and y is
# the pupil-teacher ratio.
plt.xlabel("accessibility to radial highways")
plt.ylabel("pupil-teacher ratio by town")

### banana-shaped data
# Contour sets and classifier names gathered for the x2 fits, in matching order
legend2_values_list = list(legend2.values())
legend2_keys_list = list(legend2.keys())

plt.figure(2)
plt.title("Outlier detection on a real data set (boston housing)")
# Raw observations: x2[:, 0] on the x-axis, x2[:, 1] on the y-axis
plt.scatter(x2[:, 0], x2[:, 1], color='black')
# Clip the axes to the extent of the evaluation grid
plt.xlim((xx2.min(), xx2.max()))
plt.ylim((yy2.min(), yy2.max()))

# Legend: the first artist of each of the three contour sets is the handle,
# labelled with the corresponding classifier name.
frontier_handles = tuple(
    legend2_values_list[pos].collections[0] for pos in range(3))
frontier_labels = tuple(legend2_keys_list[pos] for pos in range(3))
plt.legend(frontier_handles,
           frontier_labels,
           loc="upper center",
           prop=matplotlib.font_manager.FontProperties(size=12))
plt.ylabel("% lower status of the population")
plt.xlabel("average number of rooms per dwelling")

# Render both figures
plt.show()

In [None]:
######################################################################
########                  Outlier Detection - 2               ########
######################################################################

######################################################################
########                    Import Data                       ########
######################################################################

# Seeded generator handed to the Isolation Forest below
rng = np.random.RandomState(42)

######## Establish settings for models ########
n_samples = 200               # total number of points per generated data set
outliers_fraction = 0.25      # share of those points generated as outliers
clusters_separation = [0, 1, 2]  # offsets applied to the two inlier clusters

######## Define the outlier detection tools to be compared ########
# The dict is filled entry by entry; insertion order decides the subplot
# position of each detector later on.
one_class_svm_nu = 0.95 * outliers_fraction + 0.05

classifiers = {}
classifiers["One-Class SVM"] = svm.OneClassSVM(
    nu=one_class_svm_nu, kernel="rbf", gamma=0.1)
classifiers["Robust covariance"] = EllipticEnvelope(
    contamination=outliers_fraction)
classifiers["Isolation Forest"] = IsolationForest(
    max_samples=n_samples,
    contamination=outliers_fraction,
    random_state=rng)
classifiers["Local Outlier Factor"] = LocalOutlierFactor(
    n_neighbors=35,
    contamination=outliers_fraction)

######## Compare given classifiers under given settings ########
# 100 x 100 evaluation grid over [-7, 7] on both axes
grid_axis = np.linspace(-7, 7, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)

# Split the sample budget into inliers and outliers
n_outliers = int(outliers_fraction * n_samples)
n_inliers = int((1. - outliers_fraction) * n_samples)

# Labels: +1 everywhere, then -1 for the trailing n_outliers positions
ground_truth = np.full(n_samples, 1, dtype=int)
ground_truth[-n_outliers:] = -1

######## Fit the problem with varying cluster separation ########
# Flattened evaluation grid; it is the same for every offset and classifier,
# so build it once instead of on every inner iteration.
grid_points = np.c_[xx.ravel(), yy.ravel()]

# One figure per cluster offset, one subplot per classifier.
for offset in clusters_separation:
    # Reseed so every offset starts from the same random draws
    np.random.seed(42)
    # Data generation: two Gaussian clouds shifted by -offset and +offset
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers: uniform draws over [-6, 6), appended after the inliers
    X = np.r_[X, np.random.uniform(low = -6, high = 6, size = (n_outliers, 2))]

    # Fit the model
    plt.figure(figsize = (9, 7))
    # The classifier index has its own name so it cannot be confused with
    # (or clobber) any index of the enclosing loop.
    for clf_idx, (clf_name, clf) in enumerate(classifiers.items()):
        # Fit the data, tag outliers and score the evaluation grid
        if clf_name == "Local Outlier Factor":
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
            # decision_function is private for LOF
            # NOTE(review): relies on a private method; confirm it exists in
            # the installed scikit-learn version.
            Z = clf._decision_function(grid_points)
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)
            Z = clf.decision_function(grid_points)
        # Score below which the lowest `outliers_fraction` share of points fall
        threshold = stats.scoreatpercentile(scores_pred,
                                            100 * outliers_fraction)
        n_errors = (y_pred != ground_truth).sum()

        # Plot the level lines and the points
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(2, 2, clf_idx + 1)
        # Blue shading below the threshold, orange above, red line at it
        subplot.contourf(xx, yy, Z, levels = np.linspace(Z.min(), threshold, 7),
                         cmap = plt.cm.Blues_r)
        frontier = subplot.contour(xx, yy, Z, levels = [threshold],
                                   linewidths = 2, colors = 'red')
        subplot.contourf(xx, yy, Z, levels = [threshold, Z.max()],
                         colors='orange')
        # The last n_outliers rows of X are the generated outliers
        inlier_points = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1],
                                        c = 'white', s = 20, edgecolor = 'k')
        outlier_points = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1],
                                         c = 'black', s = 20, edgecolor = 'k')
        subplot.axis('tight')
        subplot.legend(
            [frontier.collections[0], inlier_points, outlier_points],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size = 10),
            loc='lower right')
        subplot.set_xlabel("%d. %s (errors: %d)" % (clf_idx + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
    plt.suptitle("Outlier detection")

plt.show()