In [1]:
# https://dacon.io/competitions/official/235930/codeshare/5508?page=1&dtype=recent

In [2]:
import random
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
# --- optimisation hyper-parameters ---
epochs, batch_size = 100, 512
lr = 1e-3
momentum = 0.9
weight_decay = 1e-4

# --- reproducibility / data loading ---
seed = 42
NUM_WORKERS = 8

# Path of the best-scoring checkpoint; per-epoch checkpoints are written
# next to it in the same directory.
saved_model = '../saved/ae_nosplit/best_model1.pth'

# Bundle handed to Trainer(**param).
param = dict(
    epochs=epochs,
    lr=lr,
    batch_size=batch_size,
    momentum=momentum,
    weight_decay=weight_decay,
)

In [5]:
# Directory part of the checkpoint path: everything before the last '/'.
saved_model.rpartition('/')[0]

'../saved/ae_nosplit'

In [6]:
def seed_everything(seed) :
    """Seed every random number generator used by the notebook.

    Covers Python's ``random``, NumPy's legacy global RNG, and PyTorch on
    CPU and on all visible CUDA devices, and switches cuDNN into its
    deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per input shape and may pick a
    # different algorithm from run to run, so it must be off here.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any data loading or model construction.
seed_everything(seed)    

In [7]:
# Read the three splits and drop the 'ID' column straight away
# (presumably a row identifier rather than a feature).
train_df = pd.read_csv('../dataset/train.csv').drop(columns=['ID'])
val_df = pd.read_csv('../dataset/val.csv').drop(columns=['ID'])
test_df = pd.read_csv('../dataset/test.csv').drop(columns=['ID'])
print(train_df.shape)

(113842, 30)


In [8]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
# Feature columns: every validation column except the identifier and the target.
col = [name for name in val_df.columns if name not in ('ID', 'Class')]

# Standardise one feature at a time. The statistics come from the training
# split only and are then applied unchanged to all three splits.
for i in col:
    sc = StandardScaler()
    scaler = sc.fit(train_df[i].values.reshape(-1, 1))
    for frame in (train_df, val_df, test_df):
        frame[i] = scaler.transform(frame[i].values.reshape(-1, 1))

In [9]:
class CDataset(Dataset):
    """Row-wise dataset over a feature DataFrame.

    Args:
        df: DataFrame of numeric features. When ``eval_mode`` is True it must
            also contain a 'Class' column, which is split off as the label.
        eval_mode: If True, items are ``(features, label)`` pairs; otherwise
            items are the feature tensor alone.

    Attributes:
        df: 2-D NumPy array of feature values (label column removed).
        labels: NumPy array of 'Class' values in eval mode, else ``None``.
    """

    def __init__(self, df, eval_mode=False):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            # Defined in both modes so the attribute always exists.
            self.labels = None
            self.df = df.values

    def __getitem__(self, index):
        """Return row ``index`` as a float32 tensor (plus its label in eval mode)."""
        # Built locally on purpose: an accessor should not leave per-call
        # state behind on the dataset instance.
        features = torch.tensor(self.df[index], dtype=torch.float32)
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.df)

In [10]:

# Settings shared by all three loaders.
loader_kwargs = dict(batch_size=batch_size, num_workers=NUM_WORKERS)

# Training set: feature-only items, reshuffled every epoch.
train_dataset = CDataset(train_df)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

# Validation set: (features, label) items, kept in file order.
val_dataset = CDataset(val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

# Test set: feature-only items, kept in file order so predictions line up
# with the rows of the test file.
test_dataset = CDataset(test_df, eval_mode=False)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [11]:
# Sanity check: pull one training batch and inspect its shape.
# Training items are feature tensors only (no label).
x = next(iter(train_loader))
x.shape

torch.Size([512, 30])

In [12]:
# Sanity check: validation batches come as (features, labels) pairs.
x, y  = next(iter(val_loader))
x.shape

torch.Size([512, 30])

In [13]:
from torch import Tensor
class BasicBlock(nn.Module):
    """Residual block built from fully connected layers.

    Computes ``act(bn2(lin2(act(bn1(lin1(x))))) + x)``. The second linear
    layer maps back to ``inplanes`` so the skip connection can be added.

    Args:
        inplanes: Width of the block's input and output.
        planes: Width of the intermediate representation.
    """

    def __init__(self, inplanes: int, planes: int) -> None:
        super().__init__()
        self.lin1 = nn.Linear(inplanes, planes)
        self.bn1 = nn.BatchNorm1d(planes)
        self.act = nn.GELU()
        self.lin2 = nn.Linear(planes, inplanes)
        self.bn2 = nn.BatchNorm1d(inplanes)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the residual transformation to a ``(batch, inplanes)`` tensor."""
        hidden = self.act(self.bn1(self.lin1(x)))
        projected = self.bn2(self.lin2(hidden))
        # Skip connection first, activation second.
        return self.act(projected + x)

In [14]:
class AutoEncoder(nn.Module) :
    """Autoencoder that reconstructs a frozen embedding of the input.

    The raw features are first mapped by a frozen (never trained) linear
    layer plus GELU into an embedding. The trainable part encodes that
    embedding down to ``hidden`` units, passes it through two residual
    ``BasicBlock``s and decodes it back to the embedding width.

    Args:
        dim: Number of input features.
        embeding_dim: Width of the frozen embedding (and of the output).
        hidden: Width of the bottleneck and of the residual blocks.
    """

    def __init__(self, dim=30, embeding_dim=512, hidden=128) :
        super().__init__()
        self.dim = dim
        self.embeding_dim = embeding_dim
        self.hidden = hidden
        self.act = nn.GELU()

        self.embeding = nn.Sequential(
            nn.Linear(self.dim, self.embeding_dim),
            self.act,
        )

        # Residual blocks work at the bottleneck width, so they are sized
        # from `hidden` rather than from a separate literal.
        self.block1 = BasicBlock(self.hidden, self.hidden)
        self.block2 = BasicBlock(self.hidden, self.hidden)

        self.encoder = nn.Sequential(
            nn.Linear(self.embeding_dim, self.hidden),
            nn.BatchNorm1d(self.hidden),
            self.act,
        )

        self.decoder = nn.Sequential(
            nn.Linear(self.hidden, self.embeding_dim),
            nn.BatchNorm1d(self.embeding_dim),
            self.act,
        )

        # Start every normalisation layer as an identity affine transform.
        for m in self.modules():
            if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Freeze the embedding layer: it defines the reconstruction target
        # and must not move during training.
        for weight in self.embeding.parameters() :
            weight.requires_grad = False

    def forward(self, x) :
        """Return ``(embedding, reconstruction)`` for a batch of raw features."""
        x_ = self.embeding(x)

        x = self.encoder(x_)
        x = self.block1(x)
        x = self.block2(x)
        x = self.decoder(x)

        return x_, x
        

In [15]:
class Trainer() :
    """Train, validate and run inference for the embedding autoencoder.

    A sample is flagged as anomalous (label 1) when the cosine similarity
    between the model's reconstruction and its embedding is below
    ``self.threshold``.

    Args:
        model: Module whose forward returns ``(embedding, reconstruction)``.
        optimizer: Optimizer over the model's parameters.
        train_loader: Iterable of feature batches.
        val_loader: Iterable of ``(features, labels)`` batches.
        test_loader: Iterable of feature batches.
        scheduler: Optional LR scheduler stepped once per epoch, or ``None``.
        device: Device on which batches and the model are placed.
        **param: Must contain 'batch_size', 'epochs' and 'lr'. May contain
            'saved_model' (path for the best checkpoint); when absent,
            ``fit`` falls back to the module-level ``saved_model``.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, test_loader, scheduler, device, **param) :
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.scheduler = scheduler
        self.device = device
        self.batch_size = param['batch_size']
        self.epochs = param['epochs']
        self.lr = param['lr']
        # Optional explicit checkpoint path (see class docstring).
        self.saved_model = param.get('saved_model')

        self.criterion = nn.L1Loss().to(device)
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        self.threshold = 0.95

    def fit(self,) :
        """Train for ``self.epochs`` epochs, checkpointing as it goes.

        After each epoch the model is scored on the validation loader
        (macro F1), a checkpoint named ``<epoch>.pth`` is written next to
        the best-model path, and the best-model checkpoint is overwritten
        whenever the score improves.
        """
        best_path = self.saved_model if self.saved_model is not None else saved_model
        ckpt_dir = os.path.dirname(best_path)

        self.model.to(self.device)
        best_score = 0
        for epoch in range(self.epochs) :
            self.model.train()
            train_loss = []

            for x in iter(self.train_loader) :
                x = x.to(self.device)
                x_, x = self.model(x)

                loss = self.criterion(x_, x)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            true, pred = self.validation()
            score = f1_score(true, pred, average='macro')
            self.get_confusion(true, pred)

            if self.scheduler is not None :
                self.scheduler.step()

            print(f'epoch :[{epoch}] train loss [{np.mean(train_loss)}] val score [{score}]')

            # os.path.join keeps the per-epoch file in the working directory
            # when the best-model path has no directory component.
            self.save_model(os.path.join(ckpt_dir, str(epoch) + '.pth'))
            if best_score < score :
                best_score = score
                self.save_model(best_path)

    def _flag_anomalies(self, x) :
        """Return a list of 0/1 predictions for one batch of features."""
        x = x.to(self.device)
        x_, x = self.model(x)
        # Compare in float64 so the threshold test matches a plain Python
        # float comparison.
        similarity = self.cos(x, x_).detach().cpu().numpy().astype(np.float64)
        return np.where(similarity < self.threshold, 1, 0).tolist()

    def validation(self) :
        """Score the validation loader.

        Returns:
            ``(true_y, pred_y)`` as two flat Python lists.
        """
        self.model.eval()
        pred_y = []
        true_y = []

        with torch.no_grad():
            for x, y in iter(self.val_loader) :
                pred_y += self._flag_anomalies(x)
                true_y += y.tolist()

        return true_y, pred_y

    def predict(self) :
        """Return a flat list of 0/1 predictions for the test loader."""
        self.model.eval()
        pred_y = []

        with torch.no_grad() :
            for x in iter(self.test_loader) :
                pred_y += self._flag_anomalies(x)

        return pred_y

    def save_model(self, name) :
        """Write the model's state dict to ``name``, creating parent directories."""
        parent = os.path.dirname(name)
        if parent :
            os.makedirs(parent, exist_ok=True)
        torch.save(self.model.state_dict(), name)

    def load_model(self, name) :
        """Load a state dict from ``name`` onto ``self.device``."""
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        self.model.load_state_dict(torch.load(name, map_location=self.device))

    def get_confusion(self, true_y, pred_y) :
        """Print the binary confusion-matrix counts for the given labels."""
        # Fixing labels keeps the matrix 2x2 even when only one class is
        # present, so the four-way unpack cannot fail.
        tn, fp, fn, tp = confusion_matrix(true_y, pred_y, labels=[0, 1]).ravel()
        print('tp : ', tp, ', fp : ', fp, ', tn : ', tn, ', fn : ', fn)


In [16]:
# NOTE(review): this instance is replaced by a fresh AutoEncoder() a couple of
# cells below before training; it looks like a leftover from inspection.
model = AutoEncoder()

In [17]:
# for p in model.parameters():
#     print(p.shape)
#     print(p)

In [18]:
# Fresh model for the actual training run.
model = AutoEncoder()
model.eval()

# Alternative kept for reference: per-group settings with a tiny learning
# rate on the embedding layer.
# optimizer = torch.optim.Adam(
#             [{'params':model.embeding.parameters(), 'lr':1e-7, 'weight_decay':2e-4},
#             {'params':model.encoder.parameters()},
#             {'params':model.decoder.parameters()}]
#             , lr=param['lr'])

# Single Adam optimizer over all parameters; no learning-rate schedule.
optimizer = optim.Adam(model.parameters(), lr=param['lr'])
scheduler = None

In [19]:
# Wire model, optimizer, loaders and hyper-parameters into a Trainer.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    scheduler=scheduler,
    device=device,
    **param,
)

In [20]:
# Train; each epoch prints the validation confusion counts, the mean training
# loss and the macro-F1 score, and writes checkpoints.
trainer.fit()

tp :  27 , fp :  3747 , tn :  24685 , fn :  3
epoch :[0] train loss [0.1282639989270223] val score [0.4718003520390711]
tp :  25 , fp :  568 , tn :  27864 , fn :  5
epoch :[1] train loss [0.04921895836901772] val score [0.5350396913539963]
tp :  25 , fp :  249 , tn :  28183 , fn :  5
epoch :[2] train loss [0.036880100471690096] val score [0.5799938184387142]
tp :  25 , fp :  150 , tn :  28282 , fn :  5
epoch :[3] train loss [0.031213303050652747] val score [0.6205848343502564]
tp :  25 , fp :  88 , tn :  28344 , fn :  5
epoch :[4] train loss [0.02956260232079457] val score [0.6740062389135143]
tp :  25 , fp :  95 , tn :  28337 , fn :  5
epoch :[5] train loss [0.02710270195898721] val score [0.6657859818461502]
tp :  25 , fp :  87 , tn :  28345 , fn :  5
epoch :[6] train loss [0.026165323164898717] val score [0.6752462221463755]
tp :  25 , fp :  45 , tn :  28387 , fn :  5
epoch :[7] train loss [0.02500034759297232] val score [0.7495600450513867]
tp :  25 , fp :  49 , tn :  28383 , fn : 

In [32]:
# Re-score the validation split at an explicit similarity threshold.
trainer.threshold = 0.95
true_y, pred_y = trainer.validation()

score = f1_score(y_true=true_y, y_pred=pred_y, average='macro')
print(score)

# Flatten the 2x2 matrix into its four counts.
cm = confusion_matrix(true_y, pred_y)
tn, fp, fn, tp = cm.ravel()
print('tp : ', tp, ', fp : ', fp, ', tn : ', tn, ', fn : ', fn)


0.9165787375726882
tp :  25 , fp :  5 , tn :  28427 , fn :  5


In [33]:
# Full confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(true_y, pred_y)

array([[28427,     5],
       [    5,    25]])

In [34]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(true_y, pred_y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.83      0.83      0.83        30

    accuracy                           1.00     28462
   macro avg       0.92      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462



In [35]:
# Positions in the validation split where prediction and label disagree.
mismatch = np.asarray(true_y) != np.asarray(pred_y)
answer = np.flatnonzero(mismatch)
answer

array([   71,  1047,  1210,  4039,  7000,  9326, 14221, 15306, 19113,
       28146])

In [36]:
# Reload the untouched validation file and look up the misclassified rows:
# first their row indices, then their true classes.
val = pd.read_csv('../dataset/val.csv')
print(val.index.to_numpy()[answer])
print(val['Class'].to_numpy()[answer])

[   71  1047  1210  4039  7000  9326 14221 15306 19113 28146]
[1 0 0 1 0 1 1 0 0 1]


In [37]:
# Predict on the test split. This rebinds pred_y, which until now held the
# validation predictions.
pred_y = trainer.predict()

In [38]:
# Fill the sample submission's 'Class' column with the test predictions and save it.
submit = pd.read_csv('../dataset/sample_submission.csv').assign(Class=pred_y)
submit.to_csv('./submit_AE_embeding3.csv', index=False)

In [39]:
# Display the best-checkpoint path used by this run.
saved_model

'../saved/ae_nosplit/best_model1.pth'