In [1]:
# https://dacon.io/competitions/official/235930/codeshare/5508?page=1&dtype=recent


In [2]:
import random
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
device

device(type='cuda')

In [5]:
# Run configuration: reproducibility seed, loader workers and checkpoint path.
seed = 42
NUM_WORKERS = 8
saved_model = '../saved/ae_embeding3/best_model.pth'

# Optimisation hyper-parameters.
epochs, batch_size = 200, 512
lr, momentum, weight_decay = 1e-3, 0.9, 1e-4

# Hyper-parameters bundled into a single mapping, in a fixed key order.
param = dict(
    epochs=epochs,
    lr=lr,
    batch_size=batch_size,
    momentum=momentum,
    weight_decay=weight_decay,
)

In [6]:
def seed_everything(seed) :
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # The variable must be spelled PYTHONHASHSEED to have any effect. Setting
    # it at runtime only affects interpreters started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Safe no-op without CUDA; covers every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

# Apply the configured seed before any dataset, loader or model is created.
seed_everything(seed)

In [7]:
# Read the three splits and discard the 'ID' column from each of them.
train_df = pd.read_csv('../dataset/train.csv').drop(columns=['ID'])
val_df = pd.read_csv('../dataset/val.csv').drop(columns=['ID'])
test_df = pd.read_csv('../dataset/test.csv').drop(columns=['ID'])
print(train_df.shape)

(113842, 30)


In [8]:
# from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
# col=[col for col in val_df.columns if col not in ['ID','Class']]

# for i in col:
#     sc=RobustScaler()
#     # sc = MinMaxScaler(feature_range=(-1,1))
#     train_df[i]=sc.fit_transform(train_df[i].values.reshape(-1,1))
    
# for i in col:
#     sc=RobustScaler()
#     # sc = MinMaxScaler(feature_range=(-1,1))
#     val_df[i]=sc.fit_transform(val_df[i].values.reshape(-1,1))
    
# for i in col:
#     sc=RobustScaler()
#     # sc = MinMaxScaler(feature_range=(-1,1))
#     test_df[i]=sc.fit_transform(test_df[i].values.reshape(-1,1))    

In [9]:
# class CDataset(Dataset) :
#     def __init__(self, df, eval_mode=False) :
#         self.df = df
#         self.eval_mode = eval_mode
#         if self.eval_mode :
#             self.labels = self.df['Class']
#             self.df = self.df.drop(columns=['Class'])
        
#     def __getitem__(self, idx) :
#         if self.eval_mode :
#             self.x = self.df.iloc[idx, 1:]
#             self.y = self.labels[idx]
#             return torch.tensor(self.x, dtype=torch.float32),torch.tensor(self.y)
        
#         else :
#             self.x = self.df.iloc[idx, 1:]
#             return torch.tensor(self.x, dtype=torch.float32)
        
#     def __len__(self) :
#         return len(self.df)
        

In [10]:
class CDataset(Dataset):
    """Serve rows of a numeric DataFrame as float32 tensors.

    Args:
        df: ``pandas.DataFrame`` of numeric features. When ``eval_mode`` is
            true it must also contain a ``'Class'`` column with the labels.
        eval_mode: If true, indexing returns ``(features, label)``; otherwise
            it returns ``features`` only.

    Attributes:
        df: 2-D NumPy array of feature rows (``'Class'`` removed in eval mode).
        labels: 1-D NumPy array of labels; only set when ``eval_mode`` is true.
    """

    def __init__(self, df, eval_mode=False):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            features = df.drop(columns=['Class']).values
        else:
            features = df.values
        # Stored under the original attribute name so existing readers of
        # ``dataset.df`` keep working; it is assigned exactly once.
        self.df = features

    def __getitem__(self, index):
        """Return the feature row at ``index`` (plus its label in eval mode)."""
        # Work with locals only: a lookup must not leave per-sample state
        # behind on the dataset object. ``torch.tensor`` copies the row and
        # makes the float32 dtype explicit.
        row = torch.tensor(self.df[index], dtype=torch.float32)
        if self.eval_mode:
            return row, self.labels[index]
        return row

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.df)

In [11]:
# Wrap every split; eval_mode=True makes the validation set yield
# (features, label) pairs, the other two yield features only.
train_dataset = CDataset(train_df)
val_dataset = CDataset(val_df, eval_mode=True)
test_dataset = CDataset(test_df, eval_mode=False)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=NUM_WORKERS,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

In [12]:
# Sanity check: pull a single batch from the training loader and show its shape.
x = next(iter(train_loader))
x.shape

torch.Size([512, 30])

In [13]:
# Sanity check: the validation loader yields (features, labels) pairs.
x, y  = next(iter(val_loader))
x.shape

torch.Size([512, 30])

In [14]:
class AutoEncoder(nn.Module) :
    """Auto-encoder that reconstructs a frozen embedding of its input.

    The input (``dim`` = 30 features) is first projected to ``embeding_dim``
    (2048) by a frozen ``Linear + GELU`` block. The encoder compresses that
    embedding to ``hidden`` (512) and the decoder expands it back to
    ``embeding_dim``. ``forward`` returns both the embedding and its
    reconstruction.
    """

    def __init__(self) :
        super().__init__()
        self.dim = 30
        self.embeding_dim = 2048
        self.hidden = 512
        self.act = nn.GELU()

        # Layers are created in the order embeding -> encoder -> decoder so
        # that a given seed always yields the same initial weights.
        self.embeding = nn.Sequential(
            nn.Linear(self.dim, self.embeding_dim),
            # nn.BatchNorm1d(self.embeding_dim),
            nn.GELU(),
        )
        self.encoder = nn.Sequential(
            # nn.BatchNorm1d(self.embeding_dim),
            nn.Linear(self.embeding_dim, self.hidden),
            nn.BatchNorm1d(self.hidden),
            self.act,
        )
        self.decoder = nn.Sequential(
            nn.Linear(self.hidden, self.embeding_dim),
            nn.BatchNorm1d(self.embeding_dim),
            self.act,
        )

        # Weight initialisation: Kaiming for convolutions, identity
        # (scale 1, shift 0) for normalisation layers.
        norm_layers = (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm)
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
                continue
            if isinstance(layer, norm_layers):
                nn.init.ones_(layer.weight)
                nn.init.zeros_(layer.bias)

        # Freeze the embedding block; encoder and decoder stay trainable.
        for frozen in self.embeding.parameters() :
            frozen.requires_grad = False

    def forward(self, x) :
        """Return ``(embedding, reconstruction)`` for the input batch ``x``."""
        embedded = self.embeding(x)
        reconstructed = self.decoder(self.encoder(embedded))
        return embedded, reconstructed
        

In [15]:
class Trainer() :
    """Train an auto-encoder and turn its reconstruction quality into 0/1 labels.

    The model is expected to return ``(embedding, reconstruction)``. Training
    minimises the L1 distance between the two. At inference a sample is
    flagged ``1`` when the cosine similarity between reconstruction and
    embedding is below a threshold, and ``0`` otherwise.

    Args:
        model: Network returning ``(embedding, reconstruction)``.
        optimizer: Optimizer stepping the model parameters.
        train_loader: Iterable of feature batches.
        val_loader: Iterable of ``(features, labels)`` batches.
        test_loader: Iterable of feature batches.
        scheduler: Learning-rate scheduler stepped once per epoch, or ``None``.
        device: Device the model and batches are moved to.
        **param: Must provide ``'batch_size'``, ``'epochs'`` and ``'lr'``.

    Attributes:
        best_score: Highest validation score seen by the latest ``fit`` call.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, test_loader, scheduler, device, **param) :
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.scheduler = scheduler
        self.device = device
        self.batch_size = param['batch_size']
        self.epochs = param['epochs']
        self.lr = param['lr']

        self.criterion = nn.L1Loss().to(device)
        # self.criterion = nn.MSELoss().to(device)
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        self.best_score = 0

    def fit(self, threshold=0.95) :
        """Run the training loop, validating and logging after every epoch.

        Args:
            threshold: Cosine-similarity cut-off used for the per-epoch
                validation score.
        """
        self.model.to(self.device)
        self.best_score = 0
        for epoch in range(self.epochs) :
            self.model.train()
            train_loss = []

            for x in self.train_loader :
                x = x.to(self.device)
                x_, x = self.model(x)

                loss = self.criterion(x_, x)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score, _, _ = self.validation(self.model, threshold)

            if self.scheduler is not None :
                # ReduceLROnPlateau needs the monitored metric; calling its
                # step() without one raises. Other schedulers take no argument.
                if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) :
                    self.scheduler.step(score)
                else :
                    self.scheduler.step()

            print(f'epoch :[{epoch}] train loss [{np.mean(train_loss)}] val score [{score}]')
            for param_group in self.optimizer.param_groups:
                print(param_group['lr'])

            if self.best_score < score :
                self.best_score = score
                # Checkpointing is disabled; enable the line below to keep the best weights.
                # torch.save(self.model.state_dict(), saved_model)

    def _flag_anomalies(self, model, x, threshold) :
        """Return the 0/1 flags of one feature batch as a list of ints."""
        x = x.to(self.device)
        x_, x = model(x)
        diff = self.cos(x, x_).cpu().tolist()
        # 1 where the reconstruction is less similar to the embedding than allowed.
        return np.where(np.array(diff) < threshold, 1, 0).tolist()

    def validation(self, model, threshold) :
        """Score ``model`` on the validation loader.

        Prints the confusion matrix as a side effect.

        Args:
            model: Network to evaluate; switched to eval mode.
            threshold: Cosine-similarity cut-off below which a sample is flagged 1.

        Returns:
            Tuple ``(macro_f1, pred_y, true_y)`` where the last two are lists.
        """
        model.eval()
        pred_y = []
        true_y = []

        with torch.no_grad():
            for x, y in self.val_loader :
                pred_y += self._flag_anomalies(model, x, threshold)
                # Labels are only collected, so they never need to leave the CPU.
                true_y += y.tolist()
            print(confusion_matrix(true_y, pred_y))

        return f1_score(true_y, pred_y, average='macro'), pred_y, true_y

    def predict(self, model, threshold) :
        """Return the list of 0/1 flags for every sample of the test loader.

        Args:
            model: Network to evaluate; switched to eval mode.
            threshold: Cosine-similarity cut-off below which a sample is flagged 1.
        """
        model.eval()
        pred_y = []

        with torch.no_grad() :
            for x in self.test_loader :
                pred_y += self._flag_anomalies(model, x, threshold)

        return pred_y

In [16]:
# Instantiate the auto-encoder with freshly initialised weights.
model = AutoEncoder()

In [17]:
# for p in model.parameters():
#     print(p.shape)
#     print(p)

In [18]:
# Scratch cell: a small random 2x3 tensor for trying out tensor operations.
a = torch.randn(2,3)
a


tensor([[-0.5686, -1.4019,  0.1380],
        [-1.6804,  0.4622, -0.0730]])

In [19]:
# Scratch cell: norm of each row of `a` (reduction over dim=1).
torch.linalg.norm(a,dim=1)

tensor([1.5191, 1.7444])

In [20]:
# Fresh model for the training run. No explicit train()/eval() call is needed
# here: Trainer.fit, Trainer.validation and Trainer.predict each set the mode.
model = AutoEncoder()

# Hand Adam only the trainable parameters (the embedding block is frozen in
# AutoEncoder.__init__) and read weight decay from the shared `param` config
# instead of repeating the literal.
optimizer = torch.optim.Adam(
    params=[p for p in model.parameters() if p.requires_grad],
    lr=param['lr'],
    weight_decay=param['weight_decay'],
)

# optimizer = torch.optim.SGD(model.parameters(), param['lr'],
#                             momentum=param['momentum'],
#                             weight_decay=param['weight_decay'])
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)
# scheduler = StepLR(optimizer, step_size=20, gamma=0.5)
# No learning-rate schedule: the rate stays at param['lr'] for the whole run.
scheduler = None

# scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch : 0.1 **(epoch //30))
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 
#                                                 epochs              = param['epochs'], 
#                                                 steps_per_epoch     = int(len(train_dataset)/param['batch_size'])+1,
#                                                 max_lr              = param['lr'], 
#                                                 pct_start           = 0.05, 
#                                                 div_factor          = 5, 
#                                                 final_div_factor    = 5e+4)   


In [21]:
# Build the trainer; `param` is unpacked into keyword arguments, from which
# Trainer reads 'batch_size', 'epochs' and 'lr'.
trainer = Trainer(model, optimizer, train_loader, val_loader, test_loader, scheduler, device, **param)
# trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)

In [22]:
# Train for the configured number of epochs; each epoch prints the validation
# confusion matrix, the mean training loss, the validation score and the learning rate.
trainer.fit()

[[27681   751]
 [    5    25]]
epoch :[0] train loss [0.08623232572201656] val score [0.5242815630340075]
0.001
[[28322   110]
 [    5    25]]
epoch :[1] train loss [0.04011361230782864] val score [0.650502096316857]
0.001
[[28427     5]
 [    5    25]]
epoch :[2] train loss [0.03246125136188862] val score [0.9165787375726882]
0.001
[[28427     5]
 [    9    21]]
epoch :[3] train loss [0.030825096879973004] val score [0.8748769079271295]
0.001
[[28429     3]
 [   23     7]]
epoch :[4] train loss [0.03181507224592927] val score [0.6747714647352507]
0.001
[[28427     5]
 [   11    19]]
epoch :[5] train loss [0.032181891383010176] val score [0.851711180144449]
0.001
[[28427     5]
 [    8    22]]
epoch :[6] train loss [0.03238923329809856] val score [0.8858506104888013]
0.001
[[28427     5]
 [    7    23]]
epoch :[7] train loss [0.03294803378393565] val score [0.8964462129361583]
0.001
[[28428     4]
 [   11    19]]
epoch :[8] train loss [0.03238526753446446] val score [0.8583586886309732


KeyboardInterrupt



In [None]:
# Bind the score to its own name. Assigning it to `f1_score` would overwrite
# the imported sklearn function that Trainer.validation calls, so every later
# validation run in this kernel would fail with "'float' object is not callable".
val_f1, pred_y, true_y = trainer.validation(trainer.model, 0.93)
val_f1

In [None]:
# Positions where the predicted flag differs from the true label.
answer = np.where(np.array(true_y) != np.array(pred_y))[0]
answer

In [None]:
# Flag the test set with a 0.93 similarity threshold.
# NOTE: this rebinds `pred_y`, which until now held the validation predictions.
pred_y = trainer.predict(trainer.model, 0.93)

In [None]:
# Fill the sample submission's 'Class' column with the predictions and save it.
submit = pd.read_csv('../dataset/sample_submission.csv').assign(Class=pred_y)
submit.to_csv('./submit_AE_embeding3.csv', index=False)