In [1]:
# https://dacon.io/competitions/official/235930/codeshare/5508?page=1&dtype=recent

In [1]:
import random
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [3]:
# Run-level settings that are not forwarded to the trainer.
seed = 42
NUM_WORKERS = 8
saved_model = '../saved/ae_split5/best_model1.pth'

# Hyper-parameters bundled for the trainer (it is given them as keyword arguments).
param = dict(
    epochs=100,
    lr=1e-3,
    batch_size=512,
    momentum=0.9,
    weight_decay=1e-4,
)

# The same values are also exposed as plain module-level names for the other cells.
epochs = param['epochs']
lr = param['lr']
batch_size = param['batch_size']
momentum = param['momentum']
weight_decay = param['weight_decay']

In [4]:
# Directory part of the checkpoint path: drop the last '/'-separated component (the file name).
'/'.join(saved_model.split('/')[:-1])

'../saved/ae_split5'

In [5]:
def seed_everything(seed) :
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic kernel selection.

    Args:
        seed: integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so exporting it
        here only affects Python processes launched after this call.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes (and may vary) the chosen kernels between runs,
    # so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

# seed_everything(seed)    

In [6]:
# Read the three splits and discard the row identifier, which is not a feature.
train_df = pd.read_csv('../dataset/train.csv').drop(columns=['ID'])
val_df = pd.read_csv('../dataset/val.csv').drop(columns=['ID'])
test_df = pd.read_csv('../dataset/test.csv').drop(columns=['ID'])
print(train_df.shape)

(113842, 30)


In [7]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
# Feature columns: everything in the validation frame except the identifier and the label.
col = [name for name in val_df.columns if name not in ('ID', 'Class')]

# Standardise one column at a time. Each scaler is fitted on the training split
# only and then applied unchanged to the train, validation and test frames.
for i in col:
    scaler = StandardScaler().fit(train_df[i].values.reshape(-1, 1))
    for frame in (train_df, val_df, test_df):
        frame[i] = scaler.transform(frame[i].values.reshape(-1, 1))

In [8]:
class CDataset(Dataset):
    """Row-wise dataset over a pandas DataFrame.

    Args:
        df: frame whose rows are samples. When ``eval_mode`` is True it must
            contain a ``Class`` column, which is split off as the label.
        eval_mode: if True, items are ``(features, label)`` pairs; otherwise
            items are the feature tensor only.

    Attributes:
        df: 2-D NumPy array of feature values (label column removed in eval mode).
        labels: 1-D NumPy array of ``Class`` values; only set in eval mode.
    """

    def __init__(self, df, eval_mode=False):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        """Return the float32 feature tensor at ``index`` (plus its label in eval mode)."""
        # Use locals only: writing the current item onto the instance on every
        # access would be pointless shared state.
        features = torch.tensor(self.df[index], dtype=torch.float32)
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.df)

In [9]:

# Train on a random fifth of the training rows: shuffle all rows, keep the first 20 %.
train_subset = train_df.sample(frac=1).iloc[:len(train_df) // 5]
train_dataset = CDataset(train_subset)
val_dataset = CDataset(val_df, eval_mode=True)
test_dataset = CDataset(test_df, eval_mode=False)

# Options shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, num_workers=NUM_WORKERS)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [10]:
from torch import Tensor
class BasicBlock(nn.Module):
    """Fully connected residual block.

    Computes ``act(bn2(lin2(act(bn1(lin1(x))))) + x)``: the input is projected
    from ``inplanes`` to ``planes`` features and back, so the output has the
    same width as the input and can be added to it.
    """

    def __init__(self, inplanes: int, planes: int) -> None:
        super().__init__()
        norm_layer = nn.BatchNorm1d
        self.lin1 = nn.Linear(inplanes, planes)
        self.bn1 = norm_layer(planes)
        self.act = nn.GELU()
        self.lin2 = nn.Linear(planes, inplanes)
        self.bn2 = norm_layer(inplanes)

    def forward(self, x: Tensor) -> Tensor:
        # Expand, normalise, activate.
        hidden = self.act(self.bn1(self.lin1(x)))
        # Project back to the input width before the skip connection.
        projected = self.bn2(self.lin2(hidden))
        return self.act(projected + x)

In [11]:
class AutoEncoder(nn.Module) :
    """Auto-encoder that reconstructs a frozen embedding of the input.

    The input is first lifted to ``embeding_dim`` features by a linear layer
    followed by GELU whose parameters are frozen at their initial values. The
    trainable part (encoder -> two residual blocks -> decoder) compresses that
    embedding to ``hidden`` features and expands it back to ``embeding_dim``.

    Args:
        dim: number of input features.
        embeding_dim: width of the frozen embedding and of the reconstruction.
        hidden: bottleneck width, also the width of the residual blocks.

    ``forward`` returns ``(embedding, reconstruction)``.
    """
    def __init__(self, dim=30, embeding_dim=512, hidden=128) :
        super().__init__()
        self.dim = dim
        self.embeding_dim = embeding_dim
        self.hidden = hidden
        self.act = nn.GELU()

        self.embeding = nn.Sequential(
            nn.Linear(self.dim, self.embeding_dim),
            self.act,
        )

        # The residual blocks operate on the encoder output, so their width
        # has to follow ``hidden`` rather than a separate literal.
        self.block1 = BasicBlock(self.hidden, self.hidden)
        self.block2 = BasicBlock(self.hidden, self.hidden)

        self.encoder = nn.Sequential(
            nn.Linear(self.embeding_dim, self.hidden),
            nn.BatchNorm1d(self.hidden),
            self.act,
        )

        self.decoder = nn.Sequential(
            nn.Linear(self.hidden, self.embeding_dim),
            nn.BatchNorm1d(self.embeding_dim),
            self.act,
        )

        # Start every normalisation layer as the identity affine transform.
        for m in self.modules():
            if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Freeze the embedding layer so it stays a fixed target for the reconstruction.
        for frozen in self.embeding.parameters() :
            frozen.requires_grad = False

    def forward(self, x) :
        """Return ``(x_, x)``: the frozen embedding and its reconstruction."""
        x_ = self.embeding(x)

        x = self.encoder(x_)
        x = self.block1(x)
        x = self.block2(x)
        x = self.decoder(x)

        return x_, x
        

In [12]:
class Trainer() :
    """Training / evaluation loop for a model whose forward returns ``(target, reconstruction)``.

    A sample is flagged as class 1 when the cosine similarity between the two
    model outputs is below ``self.threshold``; otherwise it is class 0.

    Args:
        model: module returning a ``(x_, x)`` pair from ``forward``.
        optimizer: optimizer stepped once per training batch in ``fit``.
        train_loader: iterable of feature batches.
        val_loader: iterable of ``(features, labels)`` batches.
        test_loader: iterable of feature batches.
        scheduler: optional LR scheduler stepped once per epoch (may be None).
        device: device the model and batches are moved to.
        **param: must contain ``batch_size``, ``epochs`` and ``lr``; may contain
            ``saved_model`` to override the checkpoint path used by ``fit``.
    """
    def __init__(self, model, optimizer, train_loader, val_loader, test_loader, scheduler, device, **param) :
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.scheduler = scheduler
        self.device = device
        self.batch_size = param['batch_size']
        self.epochs = param['epochs']
        self.lr = param['lr']
        # Optional explicit checkpoint path; ``fit`` falls back to the
        # notebook-level ``saved_model`` when this is None.
        self.save_path = param.get('saved_model')

        self.criterion = nn.L1Loss().to(device)
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        self.threshold = 0.5

    def fit(self,) :
        """Train for ``self.epochs`` epochs, checkpointing every epoch and the best macro-F1 epoch."""
        best_path = self.save_path if self.save_path is not None else saved_model
        checkpoint_dir = os.path.dirname(best_path)
        if checkpoint_dir :
            # Create the directory up front so a missing folder does not
            # surface only after a full epoch of training.
            os.makedirs(checkpoint_dir, exist_ok=True)

        self.model.to(self.device)
        best_score = 0
        for epoch in range(self.epochs) :
            self.model.train()
            train_loss = []

            for x in self.train_loader :
                x = x.to(self.device)
                x_, x = self.model(x)

                loss = self.criterion(x_, x)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            true, pred = self.validation()
            score = f1_score(true, pred, average='macro')
            self.get_confusion(true, pred)

            if self.scheduler is not None :
                self.scheduler.step()

            print(f'epoch :[{epoch}] train loss [{np.mean(train_loss)}] val score [{score}]')

            # Per-epoch checkpoint stored next to the best-model file.
            self.save_model(os.path.join(checkpoint_dir, str(epoch) + '.pth'))
            if best_score < score :
                best_score = score
                self.save_model(best_path)

    def _flag_anomalies(self, x) :
        """Return a list of 0/1 predictions for one feature batch (1 = similarity below threshold)."""
        x_, x = self.model(x.to(self.device))
        diff = self.cos(x, x_).cpu().tolist()
        return np.where(np.array(diff) < self.threshold, 1, 0).tolist()

    def validation(self) :
        """Run the model over ``val_loader`` and return ``(true_labels, predicted_labels)`` as lists."""
        self.model.eval()
        pred_y = []
        true_y = []

        with torch.no_grad():
            for x, y in self.val_loader :
                pred_y += self._flag_anomalies(x)
                # Labels are only collected, never used on the device.
                true_y += y.tolist()

        return true_y, pred_y

    def predict(self) :
        """Run the model over ``test_loader`` and return the predicted labels as a list."""
        self.model.eval()
        pred_y = []

        with torch.no_grad() :
            for x in self.test_loader :
                pred_y += self._flag_anomalies(x)

        return pred_y

    def save_model(self, name) :
        """Write the model's state dict to ``name``."""
        torch.save(self.model.state_dict(), name)

    def load_model(self, name) :
        """Load a state dict from ``name`` into the model."""
        # map_location lets a checkpoint written on a GPU be restored on a
        # machine that only has the trainer's device available.
        self.model.load_state_dict(torch.load(name, map_location=self.device))

    def get_confusion(self, true_y, pred_y) :
        """Print the binary confusion-matrix counts for the given labels."""
        # Fixing the label set keeps the matrix 2x2 even when only one class
        # occurs, so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(true_y, pred_y, labels=[0, 1]).ravel()
        print('tp : ', tp, ', fp : ', fp, ', tn : ', tn, ', fn : ', fn)


In [13]:
# Build the network with freshly initialised weights.
model = AutoEncoder()

In [17]:
# for p in model.parameters():
#     print(p.shape)
#     print(p)

In [18]:
# Fresh network in evaluation mode; no optimizer or scheduler is configured in this cell.
model = AutoEncoder()
model.eval()
optimizer = scheduler = None
print('')




In [22]:
# One submission file per split; all seven are combined by voting further down.
submission_paths = (
    './submit_AE_split1.csv',
    './submit_AE_split2.csv',
    './submit_AE_split3.csv',
    './submit_AE_split4.csv',
    './submit_AE_split5.csv',
    './submit_AE_split6.csv',
    './submit_AE_split7.csv',
)
csv1, csv2, csv3, csv4, csv5, csv6, csv7 = map(pd.read_csv, submission_paths)

In [35]:
# IDs are taken from the first submission; the per-split predictions are pulled out as arrays.
id_arr = csv1['ID'].values
arr1, arr2, arr3, arr4, arr5, arr6, arr7 = (
    frame['Class'].values
    for frame in (csv1, csv2, csv3, csv4, csv5, csv6, csv7)
)

In [36]:
# Side-by-side table: one row per ID, one column per split's prediction.
csv = pd.DataFrame({
    'ID': id_arr,
    'Class1': arr1,
    'Class2': arr2,
    'Class3': arr3,
    'Class4': arr4,
    'Class5': arr5,
    'Class6': arr6,
    'Class7': arr7,
})

In [39]:
# Placeholder for the ensembled label. It becomes the last column, which the
# voting cell relies on when it slices each row with x[1:-1].
csv['Class'] = 0

In [65]:
import collections

def set_label(x, min_normal_votes=5) :
    """Combine per-model 0/1 votes into a single label.

    Args:
        x: iterable of votes, where 0 is counted as a "normal" vote.
        min_normal_votes: number of 0 votes required to return 0.

    Returns:
        0 when at least ``min_normal_votes`` entries of ``x`` equal 0, else 1.
    """
    count = collections.Counter(x)
    return 0 if count[0] >= min_normal_votes else 1
# Vote over the per-split columns selected by name, so the result does not
# depend on column order or on a trailing placeholder column being present.
vote_columns = ['Class1', 'Class2', 'Class3', 'Class4', 'Class5', 'Class6', 'Class7']
csv['Class'] = csv[vote_columns].apply(set_label, axis=1)

In [66]:
# Sanity check: (rows, columns) of the combined vote table.
csv.shape

(142503, 9)

In [67]:
# Class frequencies in the validation split. The unpacking assumes exactly two
# classes, with the first (most frequent) count belonging to the normal class.
class_counts = val_df.Class.value_counts()
normal, fraud = class_counts
fraud / (normal + fraud)

0.0010540369615627855

In [68]:
# Number of fraud rows expected in the test set if it shares the validation fraud rate.
len(csv) * (fraud / (normal + fraud))

150.20342913358164

In [69]:
# How many rows the ensemble assigned to each class.
csv['Class'].value_counts()

0    142195
1       308
Name: Class, dtype: int64

In [70]:
# Keep only the ID and the ensembled label in the final submission file.
per_split_columns = ['Class1', 'Class2', 'Class3', 'Class4', 'Class5', 'Class6', 'Class7']
submission = csv.drop(columns=per_split_columns)
submission.to_csv('./submit_AE_split.csv', index=False)