In [5]:
import os
# List the pre-processed data files. os.listdir() returns entries in arbitrary
# order, and later cells pair train/test files by position, so the listing is
# sorted to make that pairing deterministic. The directory is listed by path
# rather than by temporarily changing into it.
data_list = sorted(os.listdir("/camin1/inyoung/DATA/data_pre"))
# Work from the project root; the `model.*` / `utils.*` imports in the next
# cell are presumably resolved relative to this directory.
os.chdir("/camin1/inyoung/MTAD-GAT")

In [2]:
import warnings

import numpy as np
import pandas as pd


import torch
from torch import optim
from torch.utils.data import DataLoader

from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from model.loss import JointLoss
from model.mtad_gat import MTAD_GAT
from utils.adjustpred import adjust_predicts
from utils.earlystop import EarlyStop
from utils.evalmethods import pot_threshold, epsilon_threshold, bestf1_threshold
from utils.plot import plot_loss
from utils.preprocess import preprocess
from utils.setseed import set_seed

# Dataset

In [3]:
from torch.utils.data import Dataset


class MyDataset(Dataset):
    """Sliding-window dataset over a time-ordered 2-D array.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is the window
    ``data[i : i + w]`` and ``y`` is the single following row
    ``data[i + w : i + w + 1]`` (kept 2-D so it can be concatenated with ``x``).

    Args:
        data: Sliceable sequence of rows (e.g. a NumPy array of shape
            ``(time, features)``).
        w: Window length in rows.
    """

    def __init__(self, data, w=64):
        self.data = data
        self.w = w

    def __getitem__(self, index):
        """Return ``(window, next_row)`` for position ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
                Without this, slicing silently returns short/empty arrays and
                plain iteration over the dataset never terminates.
        """
        n_windows = len(self)
        position = index + n_windows if index < 0 else index
        if not 0 <= position < n_windows:
            raise IndexError(
                "index {0} is out of range for a dataset with {1} windows".format(index, n_windows))

        x = self.data[position:position + self.w]
        y = self.data[position + self.w:position + self.w + 1]

        return x, y

    def __len__(self):
        # Clamp at zero: a series shorter than the window holds no samples, and
        # a negative value would make len() itself raise.
        return max(len(self.data) - self.w, 0)


# Train

In [4]:
class Exp:
    """One MTAD-GAT training / scoring experiment on a single dataset.

    The constructor builds the data loaders and the model; ``fit`` trains with
    early stopping and ``predict`` writes per-feature and global anomaly scores,
    thresholds and predictions to CSV.

    Args:
        iter: Index of the repetition; used in log lines and output file names.
        name: Dataset name; used in output file names.
        epochs: Maximum number of training epochs.
        batch_size: Batch size for every data loader.
        patience: Patience passed to ``EarlyStop``.
        lr: Learning rate for Adam.
        generate: Accepted for interface compatibility; not used by this class.
        train_x, valid_x, test_x: DataFrames (``.values`` is taken) holding the
            time-ordered rows of each split.
        w: Sliding-window length.
        gamma: Weight of the reconstruction error in the anomaly score.
        base_dir: Project root containing the ``checkpoint/``, ``img/`` and
            ``result/`` directories. The default reproduces the original paths.
    """

    def __init__(self, iter, name, epochs, batch_size, patience, lr, generate, train_x, valid_x, test_x,  w=64, gamma=1,
                 base_dir='/camin1/inyoung/MTAD-GAT'):
        ## hyper-parameter
        self.iter = iter
        self.name = name
        self.epochs = epochs
        self.batch_size = batch_size
        self.patience = patience
        self.w = w
        self.gamma = gamma
        self.lr = lr

        self.base_dir = base_dir.rstrip('/')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): the file name (including ',pkl') is kept exactly as before
        # so that checkpoints already on disk are still found.
        self.check_point = self.base_dir + '/checkpoint/' + name + '_chechkpoint_iter_' + "_" + str(iter) + ',pkl'

        self.train_x = train_x.values
        self.valid_x = valid_x.values
        self.test_x = test_x.values
        # Set once the validation rows have been appended to train_x (see _get_data).
        self._valid_merged = False

        self._get_data()
        self._get_model()

    ################################################ Make Dataset ################################################
    def _get_data(self, train=True):
        """Build the data loaders.

        With ``train=True`` the train/valid/test loaders and the loss history
        are created. With ``train=False`` (scoring) the validation rows are
        appended to the training rows and train/test loaders are rebuilt.

        Only the loader used for optimisation is shuffled: ``_get_score`` aligns
        predictions with ``data[w:]`` by position, so every loader that may be
        scored has to preserve time order.
        """
        if train:
            trainset = MyDataset(self.train_x, w=self.w)
            validset = MyDataset(self.valid_x, w=self.w)
            testset = MyDataset(self.test_x, w=self.w)

            self.trainloader = DataLoader(trainset, batch_size=self.batch_size, shuffle=True)
            self.validloader = DataLoader(validset, batch_size=self.batch_size, shuffle=False)
            self.testloader = DataLoader(testset, batch_size=self.batch_size, shuffle=False)

            self.loss = {'train': {'forecast': [], 'reconstruct': [], 'total': []},
                         'valid': {'forecast': [], 'reconstruct': [], 'total': []}}

            print('train: {0}, valid: {1}, test: {2}'.format(len(trainset), len(validset), len(testset)))

        else:
            # Merge only once, so repeated predict() calls do not keep stacking
            # the validation rows onto the training data.
            if not getattr(self, '_valid_merged', False):
                self.train_x = np.vstack((self.train_x, self.valid_x))
                self._valid_merged = True

            trainset = MyDataset(self.train_x, w=self.w)
            testset = MyDataset(self.test_x, w=self.w)

            self.trainloader = DataLoader(trainset, batch_size=self.batch_size, shuffle=False)
            self.testloader = DataLoader(testset, batch_size=self.batch_size, shuffle=False)

            print('train: {0}, test: {1}'.format(len(trainset), len(testset)))

    ################################################ Load Model ################################################
    def _get_model(self):
        """Create the model, loss, optimiser and early-stopping helper."""
        self.model = MTAD_GAT().to(self.device)
        self.criterion = JointLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=1e-4)
        self.earlystopping = EarlyStop(patience=self.patience)

    ################################ Training Model for each batch ################################
    def _process_one_batch(self, batch_x, batch_y):
        """Run the model on one batch and return ``(forecast_loss, reconstruct_loss, loss)``."""
        batch_x = batch_x.float().to(self.device)
        batch_y = batch_y.float().to(self.device)

        reconstruct, forecast = self.model(batch_x)
        forecast_loss, reconstruct_loss, loss = self.criterion(batch_x, batch_y, reconstruct, forecast)

        return forecast_loss, reconstruct_loss, loss

    @staticmethod
    def _rms(values):
        """Root-mean-square of a list of per-batch loss values."""
        return np.sqrt(np.average(np.array(values) ** 2))

    def _run_epoch(self, dataloader, train=False, progress=False):
        """Make one pass over ``dataloader``; optimise the model when ``train`` is true.

        Returns:
            ``(forecast, reconstruct, total)``: RMS of the per-batch losses.
        """
        if train:
            self.model.train()
        else:
            self.model.eval()

        forecast_losses, reconstruct_losses, total_losses = [], [], []
        batches = tqdm(dataloader) if progress else dataloader

        # Gradients are only tracked when the pass actually optimises.
        with torch.set_grad_enabled(train):
            for (batch_x, batch_y) in batches:
                if train:
                    self.optimizer.zero_grad()
                forecast_loss, reconstruct_loss, loss = self._process_one_batch(batch_x, batch_y)
                forecast_losses.append(forecast_loss.item())
                reconstruct_losses.append(reconstruct_loss.item())
                total_losses.append(loss.item())
                if train:
                    loss.backward()
                    self.optimizer.step()

        return self._rms(forecast_losses), self._rms(reconstruct_losses), self._rms(total_losses)

    ################################ Anomaly score for every time step ################################
    def _get_score(self, data, dataloader):
        """Compute forecast / reconstruction values and anomaly scores.

        Args:
            data: 2-D array ``(time, features)`` that ``dataloader`` was built
                from; rows ``data[w:]`` are the targets.
            dataloader: Iterable of ``(batch_x, batch_y)`` in time order
                (it must not be shuffled, predictions are aligned by position).

        Returns:
            DataFrame with ``For_i``, ``Rec_i``, ``Act_i``, ``Score_i`` for each
            feature ``i`` plus ``Score_Global`` (mean score over features).
        """
        self.model.eval()
        forecasts, reconstructs = [], []

        with torch.no_grad():
            for (batch_x, batch_y) in dataloader:
                batch_x = batch_x.float().to(self.device)
                batch_y = batch_y.float().to(self.device)

                _, forecast = self.model(batch_x)
                # Shift the window by one step so that it ends with the target row,
                # then take the reconstruction of that last row.
                recon_x = torch.cat((batch_x[:, 1:, :], batch_y), dim=1)
                reconstruct, _ = self.model(recon_x)

                forecasts.append(forecast.detach().cpu().numpy())
                reconstructs.append(reconstruct.detach().cpu().numpy()[:, -1, :])

        forecasts = np.concatenate(forecasts, axis=0)
        # Flatten everything but the sample axis. A bare squeeze() would also drop
        # the sample axis for a single window (or the feature axis for one feature).
        forecasts = forecasts.reshape(len(forecasts), -1)
        reconstructs = np.concatenate(reconstructs, axis=0)
        actuals = data[self.w:]

        # Float buffer on purpose: zeros_like(actuals) would truncate the scores
        # when the input data is integer-typed.
        scores = np.zeros(actuals.shape, dtype=np.float64)
        columns = {}

        for i in range(actuals.shape[1]):  # one group of columns per feature
            columns["For_" + str(i)] = forecasts[:, i]  # forecast value
            columns["Rec_" + str(i)] = reconstructs[:, i]  # reconstructed value
            columns["Act_" + str(i)] = actuals[:, i]  # actual value

            score = np.sqrt((forecasts[:, i] - actuals[:, i]) ** 2) + self.gamma * np.sqrt((reconstructs[:, i] - actuals[:, i]) ** 2)
            scores[:, i] = score
            columns["Score_" + str(i)] = score  # per-feature anomaly score

        # Mean over features gives one global score per time step.
        columns['Score_Global'] = np.mean(scores, axis=1)
        return pd.DataFrame(columns)

    def fit(self):
        """Train with early stopping, reload the best checkpoint and plot the losses."""
        ################################ Initial loss before training ################################
        train_forecast_loss, train_reconstruct_loss, train_loss = self._run_epoch(
            self.trainloader, train=False, progress=True)
        valid_forecast_loss, valid_reconstruct_loss, valid_loss = self._run_epoch(
            self.validloader, train=False)

        print(
            "Iter: {0} Init || Total Loss| Train: {1:.6f} Vali: {2:.6f} || Forecast Loss| Train:{3:.6f} Valid"
            ": {4:.6f} || Reconstruct Loss| Train: {5:.6f} Valid: {6:.6f}".format(
                self.iter, train_loss, valid_loss, train_forecast_loss, valid_forecast_loss,
                train_reconstruct_loss, valid_reconstruct_loss))

        ################################ Train ################################
        for e in range(self.epochs):
            train_forecast_loss, train_reconstruct_loss, train_loss = self._run_epoch(
                self.trainloader, train=True, progress=True)
            valid_forecast_loss, valid_reconstruct_loss, valid_loss = self._run_epoch(
                self.validloader, train=False)

            self.loss['train']['forecast'].append(train_forecast_loss)
            self.loss['train']['reconstruct'].append(train_reconstruct_loss)
            self.loss['train']['total'].append(train_loss)
            self.loss['valid']['forecast'].append(valid_forecast_loss)
            self.loss['valid']['reconstruct'].append(valid_reconstruct_loss)
            self.loss['valid']['total'].append(valid_loss)

            print(
                "Iter: {0} Epoch: {1} || Total Loss| Train: {2:.6f} Vali: {3:.6f} || Forecast Loss| Train:{4:.6f} Valid"
                ": {5:.6f} || Reconstruct Loss| Train: {6:.6f} Valid: {7:.6f}".format(
                    self.iter, e + 1, train_loss, valid_loss, train_forecast_loss, valid_forecast_loss,
                    train_reconstruct_loss, valid_reconstruct_loss))

            self.earlystopping(valid_loss, self.model, self.check_point)
            if self.earlystopping.early_stop:
                print("Iter {0} is Early stopping!".format(self.iter))
                break

        # Restore the checkpoint written by EarlyStop (presumably the best epoch).
        self.model.load_state_dict(torch.load(self.check_point, map_location=self.device))

        image_prefix = self.base_dir + '/img/' + '_iter' + self.name + "_" + str(self.iter)
        plot_loss(self.loss["train"]["forecast"], self.loss["train"]["reconstruct"], self.loss["train"]["total"],
                  image_prefix + '_trainloss.png')
        # The iteration number used to be appended twice here; the name now matches
        # the train-loss image.
        plot_loss(self.loss["valid"]["forecast"], self.loss["valid"]["reconstruct"], self.loss["valid"]["total"],
                  image_prefix + '_validloss.png')

    def _result_path(self, split):
        """Path of the result CSV for ``split`` ('train' or 'test')."""
        return self.base_dir + '/result/' + self.name + '_iter' + "_" + str(self.iter) + '_' + split + 'result.csv'

    ################################ Predict for test dataset ################################
    def predict(self, model_load=False, data_load=False):
        """Score the train(+valid) and test data, threshold the scores and save CSVs.

        Args:
            model_load: Reload the model weights from the checkpoint first.
            data_load: Read previously saved result CSVs instead of re-scoring.
                They are read from the same paths this method writes to.
        """
        if model_load:
            self.model.load_state_dict(torch.load(self.check_point, map_location=self.device))
        self._get_data(train=False)

        # actual_label = self.test_y[self.w:]

        if data_load:
            trainresult = pd.read_csv(self._result_path('train'))
            testresult = pd.read_csv(self._result_path('test'))
        else:
            trainresult = self._get_score(self.train_x, self.trainloader)
            testresult = self._get_score(self.test_x, self.testloader)

        # Per-feature thresholds and predictions.
        for i in range(self.test_x.shape[1]):
            train_score = trainresult["Score_" + str(i)].values
            test_score = testresult["Score_" + str(i)].values

            threshold = pot_threshold(train_score, test_score)

            train_pred = (train_score > threshold).astype(np.int64)  # 1 or 0
            test_pred = (test_score > threshold).astype(np.int64)

            trainresult["Pred_" + str(i)] = train_pred
            trainresult["Threshold_" + str(i)] = threshold
            testresult["Pred_" + str(i)] = test_pred
            testresult["Threshold_" + str(i)] = threshold

        # Global threshold and predictions.
        train_score = trainresult["Score_Global"].values
        test_score = testresult["Score_Global"].values

        #threshold = pot_threshold(train_score, test_score)
        # threshold = bestf1_threshold(test_score, actual_label)
        threshold = epsilon_threshold(train_score)

        train_pred = (train_score > threshold).astype(np.int64)
        test_pred = (test_score > threshold).astype(np.int64)

        trainresult["Pred_Global"] = train_pred
        #trainresult["Label_Global"] = 0
        trainresult["Threshold_Global"] = threshold

        testresult["Pred_Global"] = test_pred
        #testresult["Label_Global"] = actual_label
        testresult["Threshold_Global"] = threshold

        trainresult.to_csv(self._result_path('train'), index=False)
        testresult.to_csv(self._result_path('test'), index=False)

        #print("Iter {0} || precision: {2:.6f} recall: {3:.6f} f1: {4:.6f}".format(
        #    self.iter, precision_score(actual_label, test_pred),
        #    recall_score(actual_label, test_pred), f1_score(actual_label, test_pred)))

In [2]:
# Directory holding the pre-processed CSV files. The trailing slash matters:
# file names are appended to this string with plain `+` in the training loop.
path = "/camin1/inyoung/DATA/data_pre/"

In [6]:
# Pair train/test files by the part of the file name after the "train_" /
# "test_" prefix instead of relying on directory-listing order, which is
# arbitrary. Only datasets that have both files are kept, so the three lists
# are always index-aligned.
train_by_suffix = {f[len("train_"):]: f for f in data_list if f.startswith("train_")}
test_by_suffix = {f[len("test_"):]: f for f in data_list if f.startswith("test_")}
paired_suffixes = sorted(train_by_suffix.keys() & test_by_suffix.keys())

train_data_list = [train_by_suffix[s] for s in paired_suffixes]
test_data_list = [test_by_suffix[s] for s in paired_suffixes]
# Dataset name = suffix without its extension; names that themselves contain
# underscores are kept intact.
name_list = [os.path.splitext(s)[0] for s in paired_suffixes]

In [7]:
# Sanity check: the three lists are indexed in parallel, so they should all
# have the same length (printed one per line).
print(len(train_data_list), len(test_data_list), len(name_list), sep="\n")

1010
1010
1010


    Hyper-parameters

In [8]:
# Settings passed to every Exp instance in the training loop.
iters=1  # number of repetitions per dataset
epochs=50  # maximum number of training epochs
batch_size=16  # mini-batch size for the data loaders
patience =3  # forwarded to EarlyStop(patience=...)
lr=0.01  # Adam learning rate
generate = True  # passed to Exp, which does not use it (see Exp.__init__)

In [9]:
import gc

In [None]:
# Train and score one MTAD-GAT experiment per dataset.
for i in range(len(train_data_list)):
    print("###########################################" + name_list[i] + "-th dataset###########################################")

    # The lists are paired by position; fail loudly instead of silently
    # training on one dataset and scoring another.
    train_suffix = train_data_list[i].split("_", 1)[-1]
    test_suffix = test_data_list[i].split("_", 1)[-1]
    if train_suffix != test_suffix:
        raise ValueError(
            "train/test file lists are not aligned at index {0}: {1!r} vs {2!r}".format(
                i, train_data_list[i], test_data_list[i]))

    full_train_x = pd.read_csv(path+train_data_list[i], header=0)
    test_x = pd.read_csv(path+test_data_list[i], header=0)

    # Hold out the last `val_split` fraction of the training rows for validation.
    # Using an explicit split position avoids the `-0` slicing trap: with
    # `iloc[-n:]` / `iloc[:-n]` and n == 0 the training set would come out empty
    # and the validation set would be the whole frame.
    val_split = 0.3
    split_at = len(full_train_x) - int(val_split * len(full_train_x))
    train_x = full_train_x.iloc[:split_at,:]
    valid_x = full_train_x.iloc[split_at:,:]

    for it in range(iters):
        print("iter " + str(it) + ' is start...')
        exp = Exp(it, name_list[i], epochs, batch_size, patience, lr, generate, train_x, valid_x, test_x,  w=64, gamma=1)
        exp.fit()
        exp.predict(model_load=False, data_load=False)
        print("iter " + str(it) + ' is end!')
    ################### empty gpu cache ###################
    gc.collect()
    torch.cuda.empty_cache()