In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the synthetic points below are repeatable.
np.random.seed(1234)

# Draw order matters for reproducibility: the normal cluster first, then the outliers.
normal_data = np.random.normal(loc=0, scale=1, size=(100, 2))  # inliers: standard normal in 2-D
outliers = np.random.uniform(low=-6, high=6, size=(5, 2))      # outliers: uniform over [-6, 6)

# Ground-truth label column: 0 for the normal cluster, 1 for the injected outliers.
normal_labels = np.zeros((len(normal_data), 1))
outlier_labels = np.ones((len(outliers), 1))

# Append the label as a third column of each group.
normal_data = np.concatenate([normal_data, normal_labels], axis=1)
outliers = np.concatenate([outliers, outlier_labels], axis=1)

# Stack both groups row-wise and wrap them in a DataFrame.
data = np.concatenate([normal_data, outliers], axis=0)
df = pd.DataFrame(data, columns=['A', 'B', 'label'])

# Shuffle all rows (frac=1 samples every row without replacement) and renumber the index.
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,A,B,label
0,0.35402,-0.035513,0.0
1,-0.121728,2.365769,0.0
2,0.015696,-2.242685,0.0
3,-0.334077,0.002118,0.0
4,-1.735349,1.210384,0.0


In [2]:
from sklearn.neighbors import LocalOutlierFactor

# LocalOutlierFactor runs in one of two modes, selected by `novelty`:
#   novelty=False (outlier detection): the training data itself is scored; use fit_predict().
#   novelty=True  (novelty detection): fit() on data regarded as clean, then predict() on
#                                      NEW, unseen samples only.
# contamination = expected proportion of outliers in the data, a float in (0, 0.5] or 'auto'.
#
# The rows scored here are the same rows the model is fitted on, so outlier-detection mode is
# the right one. scikit-learn documents that with novelty=True, predict() must not be called
# on the training samples because that leads to wrong results.
lof = LocalOutlierFactor(n_neighbors=20, novelty=False, contamination=0.05)
predict = lof.fit_predict(df[['A', 'B']])  # 1 = inlier, -1 = outlier
predict

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1])

In [3]:
# Check which ground-truth labels appear among the rows predicted as inliers (prediction != -1)
df.loc[predict != -1, 'label'].unique()

array([0.])

In [10]:
# (rows, columns) of the subset predicted as inliers
df.loc[predict != -1].shape

(100, 3)

In [6]:
# Rows predicted as inliers, renumbered from 0 (the old index is discarded)
df.loc[predict != -1].reset_index(drop=True)

Unnamed: 0,A,B,label
0,0.354020,-0.035513,0.0
1,-0.121728,2.365769,0.0
2,0.015696,-2.242685,0.0
3,-0.334077,0.002118,0.0
4,-1.735349,1.210384,0.0
...,...,...,...
95,0.680656,-1.818499,0.0
96,-1.027851,-0.584718,0.0
97,0.639633,-0.962029,0.0
98,1.104352,-0.431550,0.0
