In [None]:
!pip install pyod

# Simple Autoencoder를 사용한 비지도 이상치 탐지

정상 데이터를 encoder에 학습시키고, reconstruction error가 크면 anomaly로 판단하는 방법이다.

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim
import time
import os
import random
from collections import Counter
import numpy as np
from pyod.models.abod import ABOD

## 재현성을 위한 seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True
    
seed_everything(42)

In [4]:
train_df = pd.read_csv('./train_data.csv')
test_df = pd.read_csv('./test_data.csv')

In [5]:
train_df.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0


In [6]:
test_df.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,2.51,53.28,0.7,32.54,2662.0,69.58,3.48,0
1,2.66,55.24,0.7,34.45,2819.0,71.32,3.57,0
2,1.72,42.74,0.7,22.23,1819.0,60.21,3.01,0
3,2.2,49.15,0.7,28.5,2332.0,65.91,3.3,0
4,2.06,47.28,0.7,26.67,2182.0,64.24,3.21,0


## Dataset 정의

In [7]:
class US_Dataset(Dataset):
    def __init__(self, data):
        self.df = data

    def __getitem__(self, idx):
        x = self.df[idx]
        
        return torch.FloatTensor(x)

    def __len__(self):
        return len(self.df)

## train, eval, get label 함수 정의

In [8]:
def train(model, train_loader, optimizer):
    
    model.train()
    
    running_loss = 0.0
    len_data = len(train_loader.dataset)
    
    for x in train_loader:
        x = x.cuda()
        
        x_hat, _ = model(x)
        
        # loss between original and reconstruction
        loss = loss_func(x, x_hat)
        
        opt.zero_grad()
        loss.backward()
        opt.step()
        
        running_loss += loss.item()
    
    return running_loss/len_data

def eval(model, dataloader):
    scores = []
    model.eval()
    print('Testing...')
    with torch.no_grad():
        for x in dataloader:
            x = x.cuda()
            x_hat, z = model(x)
            score = torch.mean( torch.abs(x - x_hat) , axis=1)
            scores.extend(score.cpu().numpy())
            # print(score.shape)

    return np.array(scores), z

def get_pred_label(model_pred, t):
    # (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred <= t, 0, model_pred)
    model_pred = np.where(model_pred > t, 1, model_pred)
    return model_pred

## Basic Autoencoder 구조 정의
7->4->2 / 2->4->7로 이어지는 autoencoder 사용

feature를 축소시키고 다시 늘리는 bottleneck 방식 사용

In [9]:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
    
        self.fc1 = nn.Linear(7, 4)
        self.fc2 = nn.Linear(4, 2)

        self.defc1 = nn.Linear(2, 4)
        self.defc2 = nn.Linear(4, 7)
    
    def encoder(self, x):
        
        x = self.fc1(x)
        x = self.fc2(x)

        return x
    
    def decoder(self, x):
        
        x = self.defc1(x)
        x = self.defc2(x)

        return x
    
    def forward(self, x):
        
        z = self.encoder(x)
        x_hat = self.decoder(z)
        
        return x_hat, z

## 모델 학습 및 추론

In [10]:
Config = {
    "num_epochs" : 10000,
    "batch_size" : 256,
    "learning_rate" : 0.0001,
}

In [11]:
submit = pd.read_csv('./answer_sample.csv')
submit.head()

Unnamed: 0,type,label
0,0,-1
1,0,-1
2,0,-1
3,0,-1
4,0,-1


In [None]:
# num = 설비 번호의 type
# 총 0~7의 설비가 있기 때문에 설비마다 따로 학습 및 추론 진행

for num in range(8):
    print("attempt", num,"th")

    # 설비 num에 맞는 data 가져오기
    train_data = train_df[train_df['type'] == num]
    test_data = test_df[test_df['type'] == num]
    
    train_data = train_data[train_data['motor_vibe'] < 10]
    
    train_data = train_data.drop(['type'], axis = 1)
    test_data = test_data.drop(['type'], axis = 1)
    
    # scale 맞춰주기
    scale = StandardScaler()
    scale.fit(train_data)
    
    train_data = scale.transform(train_data)
    test_data = scale.transform(test_data)
    
    ###############################################################
    ################### DEEP LEARNING AUTOENCODER #################
    
    train_dataset = US_Dataset(train_data)
    test_dataset = US_Dataset(test_data)
    
    train_loader = DataLoader(dataset=train_dataset, batch_size=Config['batch_size'], shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=Config['batch_size'], shuffle=False)
    
    # 모델 정의
    model = AutoEncoder().cuda()
    
    # L1Loss 사용
    loss_func = nn.L1Loss()
    
    # Adam optimizer 사용
    opt = optim.Adam(model.parameters(), lr=Config['learning_rate'])
    
    start_time = time.time()
    
    for epoch in range(Config['num_epochs']):
        train_loss = train(model, train_loader, opt)
    
    print('train loss: %.6f, time: %.4f min' %(train_loss, (time.time()-start_time)/60))
    print('-'*10)
    
    # scores = reconstruction error 모음
    scores, z = eval(model, train_loader)
    
    # scores_ = test data의 reconstruction error 모음
    scores_, z_ = eval(model, test_loader)
    
    # Train data (정상 데이터)에서 발견할 수 있는 score의 최댓값인 t를 임계치로 설정
    # 정상데이터 관찰할 수 있는 관측치 중 가장 큰 값이므로, 임계치 이하의 값은 
    # 정상 데이터일 것이라는 가정
    # t 이상의 값은 이상치 데이터일 것이다 (reconstruction error is higher)
    t=scores.max()
    
    # t를 사용해 pred label
    test_pred = get_pred_label(scores_, t)
    print(Counter(test_pred))
    
    ###############################################################
    ##################### MACHINE LEARNING ABOD ###################
    
    model = ABOD(n_neighbors=10, contamination=0.0001)
    model.fit(train_data)
    test_pred_2 = model.predict(test_data)
    print(Counter(test_pred_2))
    
    ## Hard Voting
    test_pred_real = []
    for i in range(len(test_pred)):
        if test_pred[i] == test_pred_2[i]:
            test_pred_real.append(test_pred[i])
        else:
            test_pred_real.append(0)
    print(Counter(test_pred_real))
    
    # 설비 num에 맞는 predict 결과 채워주기
    n = 0
    for i in range(len(submit)):
        if submit.iloc[i]['type'] == num:
            submit.iloc[i]['label'] = test_pred_real[n]
            n += 1

attempt 0 th
train loss: 0.000008, time: 0.9612 min
----------
Testing...
Testing...
Counter({0.0: 1153, 1.0: 143})
Counter({0: 1153, 1: 143})
Counter({0.0: 1153, 1.0: 143})
attempt 1 th
train loss: 0.000035, time: 0.8724 min
----------
Testing...
Testing...
Counter({0.0: 1107})
Counter({0: 1107})
Counter({0.0: 1107})
attempt 2 th
train loss: 0.000018, time: 0.9079 min
----------
Testing...
Testing...
Counter({0.0: 1061, 1.0: 37})
Counter({0: 1071, 1: 27})
Counter({0.0: 1071, 1.0: 27})
attempt 3 th
train loss: 0.000024, time: 0.8438 min
----------
Testing...
Testing...
Counter({0.0: 881, 1.0: 37})
Counter({0: 892, 1: 26})
Counter({0.0: 892, 1.0: 26})
attempt 4 th
train loss: 0.000011, time: 0.8791 min
----------
Testing...
Testing...
Counter({0.0: 904, 1.0: 14})
Counter({0: 905, 1: 13})
Counter({0.0: 905, 1.0: 13})
attempt 5 th
train loss: 0.000006, time: 0.5476 min
----------
Testing...
Testing...
Counter({0.0: 712, 1.0: 35})
Counter({0: 699, 1: 48})
Counter({0.0: 717, 1.0: 30})
attem

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

x = submit['label'].unique()
y = submit['label'].value_counts()
colors = sns.color_palette("pastel")

print(y)

fig = plt.figure(figsize=(6, 3))
plt.pie(y, labels=x, autopct='%.1f%%', shadow=True, colors=colors, textprops={'fontsize': 12})
plt.title('Label')
plt.show()

## 제출

In [None]:
submit.to_csv('./submit_13_simpleAE_and_ABOD.csv', index=False)

# 참고자료

https://sonsnotation.blogspot.com/2020/11/13-anomaly-detection.html

https://dacon.io/competitions/official/236036/codeshare/7480?page=1&dtype=recent

#### HANNAH 최고!!!!