In [9]:
import pandas as pd
import numpy as np

# Build a small 2-D toy dataset for anomaly detection:
#   * 100 points drawn from a standard normal distribution (label 0.0)
#   * 5 points drawn uniformly from [-6, 6) (label 1.0)
# The global NumPy seed is fixed first so the draws are repeatable.
np.random.seed(1234)
normal_data = np.random.normal(loc=0, scale=1, size=(100, 2))   # inliers
outliers = np.random.uniform(low=-6, high=6, size=(5, 2))       # anomalies

# One label column per group (float arrays, one row per sample).
normal_labels = np.zeros((len(normal_data), 1))
outlier_labels = np.ones((len(outliers), 1))

# Attach the label as a third column to each group.
normal_data = np.concatenate([normal_data, normal_labels], axis=1)
outliers = np.concatenate([outliers, outlier_labels], axis=1)

# Stack inliers on top of outliers: 105 rows x (A, B, label).
data = np.concatenate([normal_data, outliers], axis=0)

# Wrap in a DataFrame, shuffle every row, and renumber the index from 0.
# NOTE(review): no random_state is passed to sample(); the row order
# presumably follows the global NumPy seed set above -- confirm if the
# exact order matters.
df = (
    pd.DataFrame(data, columns=['A', 'B', 'label'])
    .sample(frac=1)
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,A,B,label
0,0.35402,-0.035513,0.0
1,-0.121728,2.365769,0.0
2,0.015696,-2.242685,0.0
3,-0.334077,0.002118,0.0
4,-1.735349,1.210384,0.0


In [10]:
# Count how many rows carry each label value (class balance check).
df.loc[:, 'label'].value_counts()

0.0    100
1.0      5
Name: label, dtype: int64

In [11]:
from sklearn.ensemble import IsolationForest

# IsolationForest settings:
#   n_estimators  - number of trees; not passed here, so the library default
#                   (100) applies
#   contamination - expected proportion of outliers; 0.04 is passed here
#                   (the synthetic data above holds 5 outliers in 105 rows)
#   random_state  - fixed so the fitted forest is repeatable
isof = IsolationForest(contamination=0.04, random_state=1234)

# Fit on the two feature columns only (the label is never shown to the model)
# and get one prediction per row: -1 marks a predicted outlier, 1 an inlier.
features = df[['A', 'B']]
predict = isof.fit_predict(features)
predict



array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1])

In [12]:
# Inspect the rows the model kept as normal (prediction other than -1):
# list the distinct true labels that remain among them.
df.loc[predict != -1, 'label'].unique()

array([0.])

In [13]:
# (rows, columns) of the subset predicted as normal.
df.loc[predict != -1].shape

(100, 3)

In [15]:
# Rows predicted as normal, renumbered from 0; the previous row position is
# kept as an 'index' column (drop=False) so removed rows can be traced.
df.loc[predict != -1].reset_index(drop=False)

Unnamed: 0,index,A,B,label
0,0,0.354020,-0.035513,0.0
1,1,-0.121728,2.365769,0.0
2,2,0.015696,-2.242685,0.0
3,3,-0.334077,0.002118,0.0
4,4,-1.735349,1.210384,0.0
...,...,...,...,...
95,100,0.680656,-1.818499,0.0
96,101,-1.027851,-0.584718,0.0
97,102,0.639633,-0.962029,0.0
98,103,1.104352,-0.431550,0.0
