In [1]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/scratch/ys5hd/CPET/code/')

import os
import re
import cv2
import copy
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from model.maxpoolcnn import *
# from model.avgpoolcnn import *

from functools import reduce
from pyts.image import GramianAngularField, MarkovTransitionField

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
random.seed(0)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# DATAPATH
# Unzip wafer.zip and add PATH here
PATH = "/project/GutIntelligenceLab/ys5hd/CPET/wafer 2/" 
selected_cols = ['8', '15', '7', '12', '11', '6']

In [4]:
os.listdir(PATH)

['abnormal', 'README', 'normal', '.DS_Store']

### Read Data Files

In [5]:
def read_wafer(path=PATH):
    
    list_of_df = []
    series_name = list(set([x.split('.')[-1] for x in os.listdir(PATH+'normal')]))
    for file_type in ['abnormal', 'normal']:
        for fname in list(set([x.split('.')[0] for x in os.listdir(PATH+file_type)])):
            dic = {}
            label = ''
            for nm in series_name:
                df_inst = pd.read_csv(PATH+file_type+'/'+fname+'.'+nm, delimiter = "\t", header=None)
                if df_inst.shape[1]>1:
                    #dic[nm] = minmax_scale(df_inst[1])
                    dic[nm] = df_inst[1]
                else:
                    label = df_inst[0].values[0]
            df_inst = pd.DataFrame(dic)
            df_inst['label'] = label
            df_inst['id'] = fname
            df_inst = df_inst.reset_index()
            list_of_df.append(df_inst)

    df_agg = pd.concat(list_of_df)    

    df_agg['target'] = df_agg['label'].apply(lambda x: 0 if x=='#FAULT=normal' else 1)
    label_map = dict(zip(df_agg[['id', 'target']].drop_duplicates().id, df_agg[['id', 'target']].drop_duplicates().target))    
    
    return df_agg, label_map

In [6]:
df_agg, label_map = read_wafer(PATH)

### Image Encoding

In [7]:
def run_gram(x, image_size=104, method='difference'):
    ''' 
    Run Gramian Angular Field, recommended to keep image size equivalent to minimum length of time series
    '''
    gasf = GramianAngularField(image_size=image_size)
    X_gasf = gasf.fit_transform(np.array([x]))
    return X_gasf[0]

def run_mtf(x):
    ''' 
    Run Markov Transition Field, recommended to keep image size equivalent to minimum length of time series
    '''
    mtf = MarkovTransitionField(image_size=104)
    X_mtf = mtf.fit_transform(np.array([x]))
    return X_mtf[0]

### Encode Time Series as Image

In [8]:
df_list = []
for col in selected_cols:    
    df_list.append(df_agg.groupby('id')[col].apply(lambda x: run_mtf(x)).reset_index())
    
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['id'],
                                            how='outer'), df_list)  
df = df_merged.copy()

In [9]:
df.head()

Unnamed: 0,id,8,15,7,12,11,6
0,1549_01,"[[0.8823529411764706, 0.8823529411764706, 0.88...","[[0.6666666666666666, 0.6666666666666666, 0.29...","[[0.8148148148148148, 0.8148148148148148, 0.81...","[[0.9459459459459459, 0.9459459459459459, 0.94...","[[0.9428571428571428, 0.9428571428571428, 0.94...","[[0.8787878787878788, 0.8787878787878788, 0.87..."
1,1549_02,"[[0.8888888888888888, 0.8888888888888888, 0.88...","[[0.7435897435897436, 0.7435897435897436, 0.74...","[[0.7727272727272727, 0.7727272727272727, 0.77...","[[0.9523809523809523, 0.9523809523809523, 0.95...","[[0.9512195121951219, 0.9512195121951219, 0.95...","[[0.9523809523809523, 0.9523809523809523, 0.95..."
2,1549_04,"[[0.8888888888888888, 0.8888888888888888, 0.88...","[[0.6086956521739131, 0.6086956521739131, 0.30...","[[0.8378378378378378, 0.8378378378378378, 0.83...","[[0.9534883720930233, 0.9534883720930233, 0.95...","[[0.9523809523809523, 0.9523809523809523, 0.95...","[[0.8333333333333334, 0.8333333333333334, 0.83..."
3,1549_06,"[[0.9130434782608695, 0.9130434782608695, 0.91...","[[0.6666666666666666, 0.25925925925925924, 0.2...","[[0.8333333333333334, 0.8333333333333334, 0.06...","[[0.9545454545454546, 0.9545454545454546, 0.95...","[[0.9534883720930233, 0.9534883720930233, 0.95...","[[0.875, 0.875, 0.875, 0.875, 0.875, 0.875, 0...."
4,1549_07,"[[0.8888888888888888, 0.8888888888888888, 0.88...","[[0.6153846153846154, 0.6153846153846154, 0.46...","[[0.9183673469387755, 0.9183673469387755, 0.91...","[[0.9523809523809523, 0.9523809523809523, 0.95...","[[0.9512195121951219, 0.9512195121951219, 0.95...","[[0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0...."


### DataLoader

In [10]:
class GAFloader(Dataset):
    def __init__(self, df, label_map, stackwise=True):
        self.df = df
        self.id_list = list(df['id'].unique())
        self.label_map = label_map
        self.stackwise = stackwise
        
    def __len__(self):
        return len(self.id_list)

    def __getitem__(self, idx):
        id_instance = self.id_list[idx]
        if self.stackwise:
            inp = torch.tensor(np.vstack(self.df.loc[idx, selected_cols].values)[None])
        else:
            inp = torch.tensor(np.stack(self.df.loc[idx, selected_cols].values))
        label = self.label_map[id_instance]
        return inp, label

### Stratfied KFold

In [11]:
X = df_agg[['id', 'target']].drop_duplicates().reset_index(drop=True)['id']
y = df_agg[['id', 'target']].drop_duplicates().reset_index(drop=True)['target']

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)

5

In [12]:
list_of_split = []
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    list_of_split.append(pd.DataFrame({'id': X[test_index], 'label': y[test_index], 'split': [i]*len(test_index)}))
    
df_split = pd.concat(list_of_split)    
df = pd.merge(df, df_split, on='id')

### Training Model

In [17]:
stackwise = False
NUM_RUNS = 20
NUM_EPOCHS = 100

vt_list = []
for _ in range(NUM_RUNS):

    attn_df_list = []
    val_acc_tracker = []
    best_epoch = -1

    for k in tqdm(range(5)):    
        val_pat = df.loc[df['split']==k]['id'].tolist()

        train_ds = GAFloader(df.loc[df['split']!=k].reset_index(drop=True), label_map, stackwise)
        train_dl = DataLoader(train_ds, batch_size=1, shuffle=True)

        val_ds = GAFloader(df.loc[df['split']==k].reset_index(drop=True), label_map, stackwise)
        val_dl = DataLoader(val_ds, batch_size=1, shuffle=True)

        dl = {'train': train_dl, 'val': val_dl}
        d_size = {'train': len(train_ds), 'val': len(val_ds)}

        # Net
        if stackwise:
            net = GafStackNet().to(device)
        else:
            net = GafAttnNet(mean=False).to(device)

        # create your optimizer
        # optimizer = optim.SGD(net.parameters(), lr=0.01)    
        optimizer = optim.Adam(net.parameters(), lr=0.0023) #lr=1e-4)

        # Loss
        criterion = nn.CrossEntropyLoss()

        # Training
        best_val_acc = 0
        loss_tracker = []
        for epoch in range(NUM_EPOCHS):
            for phase in ['train', 'val']:
                running_corrects = 0.0
                for i, (im, label) in enumerate(dl[phase]):
                    im = im.float().to(device)
                    label = label.to(device)

                    optimizer.zero_grad()

                    if phase == 'train':
                        net.train()
                    else:
                        net.eval()

                    output, _ = net(im)
                    _, preds = torch.max(output, 1)
                    loss = criterion(output, label)

                    running_corrects += torch.sum(preds == label.data)            
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    loss_tracker.append(loss.item())
#                 print('Phase: {}, Epoch: {}, Loss: {}, Acc: {}'.format(phase, epoch+1,\
#                                                                        np.mean(loss_tracker), \
#                                                                        running_corrects/d_size[phase]))            
                loss_tracker = []
                if phase == 'val' and (running_corrects/d_size[phase]) > best_val_acc:
                    best_val_acc = running_corrects/d_size[phase]
                    best_model_wts = copy.deepcopy(net.state_dict())
                    best_epoch = epoch

        net.load_state_dict(best_model_wts)

        pred = []
        actual = []
        attn_list = []
        with torch.no_grad():
            for i, (im, label) in enumerate(val_dl):
                output, attn_wt = net(im.float().to(device))
                pred += list(torch.max(output, axis=1)[1].cpu().numpy())
                actual += list(label.numpy())
                if not stackwise:
                    attn_list.append(list(attn_wt.detach().cpu().numpy()))            

        if not stackwise:
            temp = pd.DataFrame(attn_list, columns=selected_cols)
            temp['pred'] = pred
            temp['actual'] = actual
            attn_df_list.append(temp)

        print('Length of Validation Data: {}'.format(len(pred)))
        print('Accuracy for fold: {} - {}'.format(k, sum(np.array(pred) == np.array(actual))/len(pred)))
        print('Epoch: {}'.format(best_epoch))
        val_acc_tracker.append(sum(np.array(pred) == np.array(actual)))

    print('Validation Accuracy: {}%'.format((np.sum(val_acc_tracker)/df.shape[0])*100))
    print('Error Rate: {}%'.format(1-(np.sum(val_acc_tracker)/df.shape[0])))
    vt_list.append((np.sum(val_acc_tracker)/df.shape[0])*100)

 20%|██        | 1/5 [02:17<09:10, 137.73s/it]

Length of Validation Data: 239
Accuracy for fold: 0 - 0.9916317991631799
Epoch: 0


 40%|████      | 2/5 [04:36<06:54, 138.16s/it]

Length of Validation Data: 239
Accuracy for fold: 1 - 0.9874476987447699
Epoch: 1


 60%|██████    | 3/5 [07:09<04:44, 142.46s/it]

Length of Validation Data: 239
Accuracy for fold: 2 - 0.9916317991631799
Epoch: 0


 80%|████████  | 4/5 [09:30<02:22, 142.04s/it]

Length of Validation Data: 239
Accuracy for fold: 3 - 0.99581589958159
Epoch: 0


100%|██████████| 5/5 [11:41<00:00, 140.35s/it]

Length of Validation Data: 238
Accuracy for fold: 4 - 0.9957983193277311
Epoch: 0
Validation Accuracy: 99.2462311557789%
Error Rate: 0.007537688442211032%





### Performance Report

In [21]:
print('Mean/Std Accuracy: {} ({})'.format(np.mean(vt_list), np.std(vt_list)))

Mean/Std Accuracy: 99.2462311557789 (0.0)


### Attention Distribution

In [24]:
# Picked up from latest epoch 
# Update code to track for all the epochs
pd.concat(attn_df_list).head()

Unnamed: 0,8,15,7,12,11,6,pred,actual
0,2.2e-05,0.001568,5.263753e-07,0.000383,0.998025,8.182898e-07,0,0
1,1.3e-05,0.059991,4.555024e-07,5e-06,0.939987,2.114229e-06,0,0
2,0.700343,0.000787,0.001129749,0.01855,0.166957,0.1122321,1,1
3,3e-06,0.050836,9.274229e-07,3.3e-05,0.949125,2.960254e-06,0,0
4,2e-06,0.275377,9.859888e-07,1.6e-05,0.724605,6.473377e-08,0,0
