In [2]:
import os
import sys
import tqdm
import random
import math

from dataclasses import dataclass, asdict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

### Settings

In [58]:
@dataclass
class Config:
    FILE_NAME: str = 'Transformer'
    ROOT_DIR: str = './data/huge'
    SEED: int = 42
    
    DEVICE: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    BATCH_SIZE: int = 128
    EPOCHS: int = 50
    LEARNING_RATE: float = 0.00005
    SERIES_LENGTH: int = 50
    
    OUTPUT_DIM: int = 1
    MODEL_DIM: int = 128
    NUM_HEADS: int = 8
    NUM_LAYERS: int = 2
    FF_DIM: int = 256
    DROPOUT: float = 0.3
    
CONFIG = Config()

In [59]:
seed = CONFIG.SEED
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x1c80a385b50>

In [60]:
import mlflow
import mlflow.pytorch

mlflow.set_experiment('SKADA')
def mlflow_run_decorator(run_name=None):
    def decorator(func):
        def wrapper(*args, **kwargs):
            mlflow.start_run(run_name=run_name)
            try:
                result = func(*args, **kwargs)
                mlflow.set_tag("Status", "SUCCEESS")
            except Exception as e:
                mlflow.log_param("Exception", e)
                mlflow.set_tag("Status", "FAIL")
                raise e
            finally:
                mlflow.end_run()
            return result
        return wrapper
    return decorator

### Data

In [61]:
train_df = pd.read_csv(os.path.join(CONFIG.ROOT_DIR, "train_df.csv"), index_col='Serial Number')
train_y = pd.read_csv(os.path.join(CONFIG.ROOT_DIR, "train_y.csv"), index_col='Serial Number')
pred_x = pd.read_csv(os.path.join(CONFIG.ROOT_DIR, "pred_x.csv"), index_col='Serial Number')

In [62]:
train_df['TIMESTAMP'] = pd.to_datetime(train_df['TIMESTAMP'])
train_df['TIMESTAMP'] = train_df['TIMESTAMP'].map(lambda t: t.strftime('%Y-%m-%d %H:%M'))

pred_x['TIMESTAMP'] = pd.to_datetime(pred_x['TIMESTAMP'])
pred_x['TIMESTAMP'] = pred_x['TIMESTAMP'].map(lambda t: t.strftime('%Y-%m-%d %H:%M'))

serial_key = list(train_df.columns)[0]
date_time_key = list(train_df.columns)[1]
feature_keys = list(train_df.columns)[3:-1]
target_key = list(train_df.columns)[-1]

train_x = train_df.drop(columns='Y')

In [63]:
print("Train Data 크기 :", train_x.shape)
print("Pred Data 크기 :", pred_x.shape)

print("Train Data의 Serial Number의 unique 값 :", len(train_x[serial_key].unique()))
print("Pred Data의 Serial Number의 unique 값 :", len(pred_x[serial_key].unique()))

Train Data 크기 : (555456, 19)
Pred Data 크기 : (138880, 19)
Train Data의 Serial Number의 unique 값 : 97860
Pred Data의 Serial Number의 unique 값 : 72562


In [64]:
train_x.head(3)

Unnamed: 0_level_0,TIMESTAMP,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
Serial Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
19,2020-02-09 16:24,EI83N072710203N8H,22,18,22,16,32,99,91219,43095,2132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,2020-02-10 16:39,EI83N072710203N8H,22,18,22,16,32,99,91413,43214,2133,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,2020-02-11 16:54,EI83N072710203N8H,22,18,22,16,32,99,91606,43332,2135,49.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


### PreProcessing

In [65]:
train_x.drop(columns='X1', inplace=True)
pred_x.drop(columns='X1', inplace=True)

In [66]:
scaler = StandardScaler()

train_x[feature_keys] = scaler.fit_transform(train_df[feature_keys])
pred_x[feature_keys] = scaler.transform(pred_x[feature_keys])

In [67]:
train_x_by_serial = [group[1] for group in train_x.groupby(train_x.index)]
pred_x_by_serial = [group[1] for group in pred_x.groupby(pred_x.index)]

train_x_by_serial = [group.sort_values('TIMESTAMP') for group in train_x_by_serial]
pred_x_by_serial = [group.sort_values('TIMESTAMP') for group in pred_x_by_serial]

### Data

In [68]:
def train_val_test_split(Xs, ys, val_ratio=0.2, test_ratio=0.1):
    data_per_label = {}

    for x, y in zip(Xs, ys):
        label = y
        if label not in data_per_label:
            data_per_label[label] = []
        data_per_label[label].append((x, y))

    train, val, test = [], [], []

    for label in data_per_label:
        data = data_per_label[label]
        n_test = int(len(data) * test_ratio)
        n_val = int(len(data) * val_ratio)
        test += data[:n_test]
        val += data[n_test:n_test+n_val]
        train += data[n_test+n_val:]

    X_train, y_train = zip(*train)
    X_val, y_val = zip(*val)
    X_test, y_test = zip(*test)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [69]:
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(train_x_by_serial, train_y['Y'])

X_train = [x.drop(columns='TIMESTAMP') for x in X_train]
X_val = [x.drop(columns='TIMESTAMP') for x in X_val]
X_test = [x.drop(columns='TIMESTAMP') for x in X_test]

print("Train Data의 개수 :", len(X_train))
print("Validation Data의 개수 :", len(X_val))
print("Test Data의 개수 :", len(X_test))

Train Data의 개수 : 5791
Validation Data의 개수 : 1654
Test Data의 개수 : 827


In [70]:
def align_data(data, series_length):
    data_features = [x[feature_keys] for x in data]
    length_aligned_X = []
    
    for x in data_features:
        if len(x) >= series_length:
            length_aligned_X.append(x[-series_length:])
        else:
            length_aligned_X.append(x.append([x.iloc[-1]] * (series_length - len(x))))
    return np.array(length_aligned_X)

In [71]:
X_train = align_data(X_train, CONFIG.SERIES_LENGTH)
X_val = align_data(X_val, CONFIG.SERIES_LENGTH)
X_test = align_data(X_test, CONFIG.SERIES_LENGTH)

In [72]:
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

### Dataset

In [73]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.float32)

In [74]:
train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)
test_dataset = TimeSeriesDataset(X_test, y_test)

In [75]:
train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

### Model

In [76]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [77]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, ff_dim, output_dim, dropout):
        super(TransformerModel, self).__init__()
        self.model_dim = model_dim
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim, dropout)
        encoder_layers = TransformerEncoderLayer(model_dim, num_heads, ff_dim, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, src_mask):
        x = self.embedding(x) * math.sqrt(self.model_dim)
        x = self.pos_encoder(x)
        x = x.permute(1, 0, 2)  # Transformer expects (seq_len, batch_size, model_dim)
        output = self.transformer_encoder(x, src_mask)
        output = output.permute(1, 0, 2)  # Back to (batch_size, seq_len, model_dim)
        output = self.fc(output[:, -1, :])
        return self.sigmoid(output)

def generate_square_subsequent_mask(sz: int):
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [78]:
input_dim = len(feature_keys)
model = TransformerModel(input_dim, CONFIG.MODEL_DIM, CONFIG.NUM_HEADS, CONFIG.NUM_LAYERS, CONFIG.FF_DIM, CONFIG.OUTPUT_DIM, CONFIG.DROPOUT)
model = model.to(CONFIG.DEVICE)

In [79]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=CONFIG.LEARNING_RATE)

### Training

In [80]:
@mlflow_run_decorator(run_name=CONFIG.FILE_NAME)
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    mlflow.log_params(asdict(CONFIG))
    
    device = CONFIG.DEVICE
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            src_mask = generate_square_subsequent_mask(inputs.size(1))
            
            outputs = model(inputs, src_mask)
            loss = criterion(outputs.squeeze(), labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * inputs.size(0)
        
        val_loss = 0.0
        model.eval()
        all_preds = []
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                src_mask = generate_square_subsequent_mask(inputs.size(1))
                
                outputs = model(inputs, src_mask)
                loss = criterion(outputs.squeeze(), labels)
                
                val_loss += loss.item() * inputs.size(0)
                all_preds.extend(outputs.squeeze().cpu().numpy())
        
        train_loss /= len(train_loader.dataset)
        val_loss /= len(val_loader.dataset)
        
        val_preds = (np.array(all_preds) > 0.5).astype(int)
        f1 = f1_score(y_val, val_preds, average='macro')
        
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 Score: {f1:.4f}')
        mlflow.log_metric('Train Loss', train_loss, step=epoch)
        mlflow.log_metric('Val Loss', val_loss, step=epoch)
        mlflow.log_metric('Val F1 Score', f1, step=epoch)
    
    return model

In [81]:
model = train_model(model, train_loader, val_loader, criterion, optimizer, CONFIG.EPOCHS)

Epoch 1/50, Train Loss: 0.4919, Val Loss: 0.2187, Val F1 Score: 0.8796
Epoch 2/50, Train Loss: 0.2174, Val Loss: 0.1595, Val F1 Score: 0.8820
Epoch 3/50, Train Loss: 0.1596, Val Loss: 0.1535, Val F1 Score: 0.8687
Epoch 4/50, Train Loss: 0.1369, Val Loss: 0.1530, Val F1 Score: 0.8689
Epoch 5/50, Train Loss: 0.1286, Val Loss: 0.1575, Val F1 Score: 0.8659
Epoch 6/50, Train Loss: 0.1176, Val Loss: 0.1533, Val F1 Score: 0.8715
Epoch 7/50, Train Loss: 0.1148, Val Loss: 0.1566, Val F1 Score: 0.8724
Epoch 8/50, Train Loss: 0.1105, Val Loss: 0.1527, Val F1 Score: 0.8735
Epoch 9/50, Train Loss: 0.1110, Val Loss: 0.1483, Val F1 Score: 0.8783
Epoch 10/50, Train Loss: 0.1066, Val Loss: 0.1523, Val F1 Score: 0.8771
Epoch 11/50, Train Loss: 0.1025, Val Loss: 0.1505, Val F1 Score: 0.8798
Epoch 12/50, Train Loss: 0.1020, Val Loss: 0.1484, Val F1 Score: 0.8856
Epoch 13/50, Train Loss: 0.0974, Val Loss: 0.1534, Val F1 Score: 0.8815
Epoch 14/50, Train Loss: 0.0952, Val Loss: 0.1368, Val F1 Score: 0.8937
E

In [82]:
model.eval()
test_preds = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(CONFIG.DEVICE)
        src_mask = generate_square_subsequent_mask(inputs.size(1))
        outputs = model(inputs, src_mask)
        
        test_preds.extend(outputs.squeeze().cpu().numpy())

test_preds = (np.array(test_preds) > 0.5).astype(int)
f1 = f1_score(y_test, test_preds, average='macro')
print(f'Test 데이터에 대한 F1 Score: {f1:.8f}')

Test 데이터에 대한 F1 Score: 0.87192562


### Pred

In [None]:
submission = pd.read_csv(os.path.join(CONFIG.ROOT_DIR, "test_y.csv"), index_col='Serial Number')

y_test_pred = clf.predict(X_test)
submission["Y"] = y_test_pred
submission.to_csv("submission.csv", index_label='Serial Number')

In [None]:
submission = pd.read_csv("submission.csv", index_col='Serial Number')
submission

In [None]:
print(submission["Y"].mean())
print(train_y.mean())

Public Score : 0.8130563798219584