# 离群点检测

In [1]:
# -*- coding: utf-8 -*-
"""Example of using kNN for outlier detection
"""

from __future__ import division
from __future__ import print_function


import os
import sys
import pandas as pd

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

from pyod.models.knn import KNN
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from sklearn.model_selection import train_test_split

### 对于单个csv处理的过程

将数据集剔除三个无用的属性，'point.id', 'motherset', 'origin'

In [2]:
df = pd.read_csv("./wine/benchmarks/wine_benchmark_0001.csv")
columns = df.columns
df = df[columns].fillna('nan')

In [3]:
columns

Index(['point.id', 'motherset', 'origin', 'original.label', 'diff.score',
       'ground.truth', 'fixed.acidity', 'volatile.acidity', 'citric.acid',
       'residual.sugar', 'chlorides', 'free.sulfur.dioxide',
       'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [4]:
data = df.drop(columns = ['point.id', 'motherset', 'origin'])

In [5]:
class_mapping = {"anomaly":1, "nominal":0}

In [6]:
data['ground.truth'] = data['ground.truth'].map(class_mapping)

将数据集中除了ground.truth的属性当作X，ground.truth属性当作y

In [7]:
data.head()

Unnamed: 0,original.label,diff.score,ground.truth,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,7,0.050492,0,-1.245962,-0.362411,-0.265853,-0.261304,-0.343495,1.209882,0.747594,-0.899276,0.817846,-0.613338,0.17456
1,5,0.082237,1,0.75954,0.973867,0.215849,-0.53454,0.598458,-0.536656,0.199134,0.968217,0.071518,0.596292,-0.915394
2,6,0.290201,0,-0.088942,-0.969809,-0.403482,-0.870829,-0.429127,-0.592996,-0.791633,-0.699187,-1.110168,1.402712,-0.496181
3,5,0.053559,1,0.219597,0.973867,0.284664,0.138039,0.427194,-0.762016,-0.243173,1.034913,0.817846,1.805921,0.006874
4,7,0.4203,0,0.219597,-0.180191,-0.541112,0.34822,-0.714567,-0.142276,0.446826,-0.242318,-0.36384,-1.016548,0.090717


In [8]:
class_mapping = {"anomaly":1, "nominal":0}

In [9]:
y = data['ground.truth']

In [10]:
x = data.drop('ground.truth',axis=1)

数据集中使用8:2划分训练和测试集

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=28)

使用Knn算法对数据离群点进行判断

In [12]:
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)



On Training Data:
KNN ROC:0.5724, precision @ rank n:0.4167

On Test Data:
KNN ROC:0.5669, precision @ rank n:0.4391


### 使用多个算法对数据集进行处理

- ABOD 
- CBLOF 
- LOF  
- HOBS  
- IForest 
- KNN  
- AKNN 

异常检测问题往往是没有标签的，训练数据中并未标出哪些是异常点，因此必须使用无监督学习,在进行检测时使用标签进行对比

In [16]:
import numpy as np
random_state = np.random.RandomState(42)
outliers_fraction = 0.05
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
outliers_fraction = 0.05
# Define seven outlier detection tools to be compared
classifiers = {
#         'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state),
        'Feature Bagging':FeatureBagging(LOF(n_neighbors=35),contamination=outliers_fraction,check_estimator=False,random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean',contamination=outliers_fraction)
}
for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)



On Training Data:
Cluster-based Local Outlier Factor (CBLOF) ROC:0.611, precision @ rank n:0.4666

On Test Data:
Cluster-based Local Outlier Factor (CBLOF) ROC:0.612, precision @ rank n:0.4649

On Training Data:
Feature Bagging ROC:0.563, precision @ rank n:0.4186

On Test Data:
Feature Bagging ROC:0.5625, precision @ rank n:0.4354

On Training Data:
Histogram-base Outlier Detection (HBOS) ROC:0.426, precision @ rank n:0.3123

On Test Data:
Histogram-base Outlier Detection (HBOS) ROC:0.4008, precision @ rank n:0.2952





On Training Data:
Isolation Forest ROC:0.6409, precision @ rank n:0.4685

On Test Data:
Isolation Forest ROC:0.6328, precision @ rank n:0.476

On Training Data:
K Nearest Neighbors (KNN) ROC:0.5724, precision @ rank n:0.4167

On Test Data:
K Nearest Neighbors (KNN) ROC:0.5669, precision @ rank n:0.4391

On Training Data:
Average KNN ROC:0.5649, precision @ rank n:0.4205

On Test Data:
Average KNN ROC:0.5358, precision @ rank n:0.417


### 对数据集中csv进行遍历并使用多个算法进行检测

在本地进行执行，所以只在后面统计结果

In [None]:
# -*- coding: utf-8 -*-
"""Example of using kNN for outlier detection
"""

from __future__ import division, print_function

import json
import os
import sys

import numpy as np
import pandas as pd
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.utils.data import evaluate_print, generate_data
from pyod.utils.example import visualize
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from pyod.utils import precision_n_scores

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

file_list = []
total_roc = []
total_prn = []
count = 0
for home, dirs, files in os.walk("./skin/benchmarks"):
    for filename in files:
        fullname = os.path.join(home, filename)
        file_list.append(fullname)
for file_csv in file_list:
    
    df = pd.read_csv(file_csv)
    columns = df.columns
    # df = df[columns].fillna('nan')

    data = df.drop(columns = ['point.id', 'motherset', 'origin'])

    class_mapping = {"anomaly":1, "nominal":0}
    data['ground.truth'] = data['ground.truth'].map(class_mapping)
    class_mapping = {"anomaly":1, "nominal":0}

    y = data['ground.truth']

    x = data.drop('ground.truth',axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=28)

    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05
    # Define seven outlier detection tools to be compared
    classifiers = {
            'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
            'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state),
            'Feature Bagging':FeatureBagging(LOF(n_neighbors=35),contamination=outliers_fraction,check_estimator=False,random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
            'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
            'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
            'Average KNN': KNN(method='mean',contamination=outliers_fraction)
    }
    p_prn = []
    p_roc = []
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        try:
            clf.fit(X_train)

            # get the prediction labels and outlier scores of the training data
            y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
            y_train_scores = clf.decision_scores_  # raw outlier scores

            # get the prediction on the test data
            y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
            y_test_scores = clf.decision_function(X_test)  # outlier scores

            # evaluate and print the results
            print(str(count)+"is analysing")
            print("\nOn Training Data:")
       
            evaluate_print(clf_name, y_train, y_train_scores)
            print("\nOn Test Data:")
            evaluate_print(clf_name, y_test, y_test_scores)
            roc=np.round(roc_auc_score(y_train, y_train_scores), decimals=4),
            prn=np.round(precision_n_scores(y_test, y_test_scores), decimals=4)
            p_prn.append(prn)
            p_roc.append(roc)
        except:
            p_prn.append(-1)
            p_roc.append(-1)

    total_prn.append(p_prn)
    total_roc.append(p_roc)    
    count += 1
        
total_prn = json.dumps(total_prn)
total_roc = json.dumps(total_roc)
a = open("skin_prc_list.txt", "w",encoding='UTF-8')
a.write(total_prn)
a.close()
a = open("skin_roc_list.txt", "w",encoding='UTF-8')
a.write(total_roc)
a.close()

In [126]:
import json
b = open("./wine_roc_list.txt", "r",encoding='UTF-8')
out = b.read()
out =  json.loads(out)

In [131]:
len(out)

1210

### 显示示例结果

In [128]:
out[0:3]

[[-1, 0.7024, 0.8416, 0.6902, 0.8085, 0.8245, 0.8677],
 [0.6469, 0.6547, 0.6504, 0.6672, 0.7198, 0.6547, 0.6664],
 [0.8098, 0.7399, 0.7717, 0.8707, 0.8017, 0.7943, 0.8181]]

### 计算wine数据集平均ROC和PRN值

其中-1表示该算法在数据集上运行时出错，在计算平均值时剔除

In [129]:
total_roc = []
for ip in range(0,7): 
    total = 0
    count = 0
    for r in out:
        if r[ip] != -1:
            total += r[ip]
            count += 1
    total_roc.append(total/count)

In [130]:
total_roc

[0.6210063073394493,
 0.6189723801065717,
 0.6460626998223798,
 0.5912920959147424,
 0.687247513321493,
 0.6485466252220252,
 0.6578079040852582]

ABOD = 0.6210063073394493,
CBLOF = 0.6189723801065717,
LOF = 0.6460626998223798,
HOBS = 0.5912920959147424,
IForest = 0.687247513321493,
KNN = 0.6485466252220252,
AKNN = 0.6578079040852582



In [132]:
import json
b = open("./wine_prn_list.txt", "r",encoding='UTF-8')
out = b.read()
out =  json.loads(out)

In [133]:
total_prn = []
for ip in range(0,7): 
    total = 0
    count = 0
    for r in out:
        if r[ip] != -1:
            total += r[ip]
            count += 1
    total_prn.append(total/count)

In [134]:
len(out)

1210

In [135]:
total_prn

[0.17593841743119282,
 0.16324582593250422,
 0.17491678507992875,
 0.12832815275310824,
 0.20096989342806384,
 0.17297602131438716,
 0.17796243339253962]

ABOD = 0.17593841743119282,
CBLOF = 0.16324582593250422,
LOF = 0.17491678507992875,
HOBS = 0.12832815275310824,
IForest = 0.20096989342806384,
KNN = 0.17297602131438716,
AKNN = 0.17796243339253962

通过对比在wine数据集中，使用IForest算法训练和检测的准确率最高

### 计算skin数据集平均ROC和PRN值

其中-1表示该算法在数据集上运行时出错，在计算平均值时剔除

In [136]:
b = open("./skin_roc_list.txt", "r",encoding='UTF-8')
out = b.read()
out =  json.loads(out)

In [137]:
total_roc = []
for ip in range(0,7): 
    total = 0
    count = 0
    for r in out:
        if r[ip] != -1:
            total += r[ip]
            count += 1
    total_roc.append(total/count)

In [138]:
len(out)

1500

In [139]:
total_roc

[0.7711199421965318,
 0.769387327586207,
 0.6818256593014956,
 0.9047651461154681,
 0.9054550962223803,
 0.7885023521026375,
 0.7864801140413406]

In [142]:
b = open("./skin_prn_list.txt", "r",encoding='UTF-8')
out = b.read()
out =  json.loads(out)
total_prn = []
for ip in range(0,7): 
    total = 0
    count = 0
    for r in out:
        if r[ip] != -1:
            total += r[ip]
            count += 1
    total_prn.append(total/count)

In [143]:
total_prn

[0.15812774566473972,
 0.12662767241379322,
 0.11434369208838209,
 0.5318011404133992,
 0.4934133285816108,
 0.1716778332145402,
 0.17896500356379177]

通过对比在skin数据集中，使用HOBS算法训练和检测的准确率最高,另外使用IForest算法的准确了也是相当不错