## 5-fold cross validation

In [None]:
import os
import torch
import numpy as np
import pandas as pd

import argparse
from dkt.dataloader import Preprocess
from dkt import trainer
from dkt.utils import setSeeds

from dkt.dataloader import get_loaders
from dkt.optimizer import get_optimizer
from dkt.scheduler import get_scheduler
from dkt.criterion import get_criterion
from dkt.metric import get_metric
from dkt.model import LSTM, LSTMATTN, Bert, Saint
from dkt.trainer import get_lr, train, validate, get_model, process_batch, compute_loss, update_params, save_checkpoint
import wandb
import time
import datetime

from sklearn.model_selection import KFold

In [None]:
def parse_args(mode='train'):
    """Return the experiment configuration as an ``argparse.Namespace``.

    Every option is registered with its default, and the parser is then run
    against an explicit empty argument list, so the result always carries the
    defaults declared here and never reads the process's command line.

    Args:
        mode: Accepted for call-site compatibility; not read by this function.

    Returns:
        argparse.Namespace with one attribute per option listed below.
    """
    # (flag, default, type, help) in registration order.
    option_specs = (
        ('--seed', 42, int, 'seed'),
        ('--device', 'cpu', str, 'cpu or gpu'),
        # Paths and file names
        ('--data_dir', '/opt/ml/input/data/train_dataset', str, 'data directory'),
        ('--asset_dir', 'asset/', str, 'data directory'),
        ('--file_name', 'train_data.csv', str, 'train file name'),
        ('--model_dir', 'models/', str, 'model directory'),
        ('--model_name', 'model.pt', str, 'model file name'),
        ('--output_dir', 'output/', str, 'output directory'),
        ('--test_file_name', 'test_data.csv', str, 'test file name'),
        # Data loading
        ('--max_seq_len', 20, int, 'max sequence length'),
        ('--num_workers', 4, int, 'number of workers'),
        # Model
        ('--hidden_dim', 64, int, 'hidden dimension size'),
        ('--n_layers', 2, int, 'number of layers'),
        ('--n_heads', 2, int, 'number of heads'),
        ('--drop_out', 0.2, float, 'drop out rate'),
        # Training
        ('--n_epochs', 100, int, 'number of epochs'),
        ('--batch_size', 64, int, 'batch size'),
        ('--lr', 0.0001, float, 'learning rate'),
        ('--clip_grad', 10, int, 'clip grad'),
        ('--patience', 5, int, 'for early stopping'),
        ('--log_steps', 50, int, 'print log per n steps'),
        # Important: model / optimizer / scheduler selection
        ('--model', 'saint', str, 'model type'),
        ('--optimizer', 'adam', str, 'optimizer type'),
        ('--scheduler', 'plateau', str, 'scheduler type'),
    )

    parser = argparse.ArgumentParser()
    for flag, default, value_type, help_text in option_specs:
        parser.add_argument(flag, default=default, type=value_type, help=help_text)

    return parser.parse_args([])

In [None]:
# Start from the default configuration, then switch it to CUDA when available.
args = parse_args(mode='train')
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
args.device = device
print(device)

In [None]:
preprocess = Preprocess(args)
# Load the training file named in args; per the original note this sets preprocess.train_data.
preprocess.load_train_data(args.file_name)

# Fetch the loaded training data and report how many entries it holds.
tot_train_data = preprocess.get_train_data()
print(len(tot_train_data)) 

In [None]:
# One wandb run is opened here; every fold's per-epoch metrics are logged into it.
wandb.init(
    project='CV',
    config=vars(args),
    tags=[args.model],
    name=f'kfold_{args.model}',
)

In [None]:
# Directory where each fold's best checkpoint is stored; created up front if missing.
MODEL_DIR = 'folds/'
os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
def run(args, train_data, valid_data, fold):
    """Train and validate a single cross-validation fold.

    Trains for up to ``args.n_epochs`` epochs, validating after each one.
    Whenever the validation AUC improves, the model weights are written to
    ``MODEL_DIR/model_fold_{fold}.pt``; training stops early once
    ``args.patience`` consecutive epochs pass without an improvement.
    Per-epoch metrics are sent to the active wandb run.

    Args:
        args: Experiment configuration. ``total_steps`` and ``warmup_steps``
            are written onto it before the scheduler is built.
        train_data: Training split, handed to ``get_loaders``.
        valid_data: Validation split, handed to ``get_loaders``.
        fold: Fold number used in the log banner and the checkpoint name.

    Returns:
        ``(best_acc, best_auc)``: the best validation AUC seen and the
        accuracy measured in that same epoch (accuracy comes first).
    """
    # ``gc`` is not among this file's top-level imports; without this import the
    # cleanup at the end of the function raised NameError after training.
    import gc

    train_loader, valid_loader = get_loaders(args, train_data, valid_data)

    # only when using warmup scheduler
    args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)
    args.warmup_steps = args.total_steps // 10

    model = get_model(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    best_auc = -1
    best_acc = -1  # follows best_auc: accuracy of the epoch with the best AUC
    early_stopping_counter = 0
    best_auc_epoch = 0
    print(f"########## Fold {fold} ##########")
    for epoch in range(args.n_epochs):

        print(f"Start Training: Epoch {epoch + 1}")
        start = time.time()
        ### TRAIN
        train_auc, train_acc, train_loss = train(train_loader, model, optimizer, args)

        ### VALID
        auc, acc, _, _ = validate(valid_loader, model, args)

        # Wall-clock time for the epoch, truncated to whole seconds for display.
        sec = time.time() - start
        times = str(datetime.timedelta(seconds=sec)).split(".")
        times = times[0]
        print(f'<<<<<<<<<<  {epoch + 1} EPOCH spent : {times}  >>>>>>>>>>')

        wandb.log({"epoch": epoch, "train_loss": train_loss, "train_auc": train_auc, "train_acc": train_acc,
                   "valid_auc": auc, "valid_acc": acc, "Learning_rate": get_lr(optimizer)})

        ### Checkpoint on improvement, otherwise count towards early stopping
        if auc > best_auc:
            best_auc = auc
            best_acc = acc
            best_auc_epoch = epoch + 1
            # If the model is wrapped (e.g. by torch.nn.DataParallel), save the underlying module.
            model_to_save = model.module if hasattr(model, 'module') else model
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
                },
                MODEL_DIR, f'model_fold_{fold}.pt',
            )
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= args.patience:
                print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')
                print(f'Best AUC epoch: {best_auc_epoch}')
                break

        # scheduler: the plateau scheduler is stepped with the best AUC so far
        if args.scheduler == 'plateau':
            scheduler.step(best_auc)
        else:
            scheduler.step()

    # Release the model's memory before the next fold builds a new one.
    model.cpu()
    del model
    gc.collect()
    torch.cuda.empty_cache()

    return best_acc, best_auc


## train

In [None]:
# 5-fold splitter; shuffling with a fixed random_state makes the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state = 0)

In [None]:
# kf.split yields index arrays; they select each fold's train/valid subsets
# out of the full training data.
acc_list, auc_list = [], []
fold = 1
for train_index, valid_index in kf.split(tot_train_data):
    train_data = tot_train_data[train_index]
    valid_data = tot_train_data[valid_index]
    acc, auc = run(args, train_data, valid_data, fold)
    acc_list.append(acc)
    auc_list.append(auc)
    # Blank line, the two fold scores, blank line.
    print('', f'Fold {fold} ACC: {acc}', f'Fold {fold} AUC: {auc}', '', sep='\n')
    fold += 1

## k-fold 결과

In [None]:
# Show the per-fold scores, then their arithmetic means.
print(f'acc = {acc_list}')
print(f'auc = {auc_list}')
acc_mean, auc_mean = (sum(scores) / len(scores) for scores in (acc_list, auc_list))
print(f'acc_mean: {acc_mean}')
print(f'auc_mean: {auc_mean}')

## Inference

In [None]:
# Load the test file through the same Preprocess instance used for training, then fetch the result.
preprocess.load_test_data(args.test_file_name)
test_data = preprocess.get_test_data()

In [None]:
# Number of fold checkpoints to run inference with.
fold_n = 5
# Directory the per-fold and ensemble prediction CSVs are written to.
FOLD_OUTPUT_DIR = 'fold_output/'

In [None]:
def load_model(args, fold):
    """Rebuild the model and restore the checkpoint saved for one fold.

    Args:
        args: Configuration passed to ``get_model``; ``args.device`` is also
            used as the ``map_location`` when reading the checkpoint.
        fold: Fold number; selects ``model_fold_{fold}.pt`` under ``MODEL_DIR``.

    Returns:
        The model built by ``get_model`` with the checkpoint's ``state_dict``
        loaded under strict key matching.
    """
    model_path = os.path.join(MODEL_DIR, f'model_fold_{fold}.pt')
    print("Loading Model from:", model_path)
    # Remap tensor storages to the configured device so a checkpoint written on
    # a GPU machine can still be read on a CPU-only one.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    load_state = torch.load(model_path, map_location=args.device)
    model = get_model(args)

    # 1. load model state
    model.load_state_dict(load_state['state_dict'], strict=True)

    print("Loading Model from:", model_path, "...Finished.")
    return model

In [None]:
def inference(args, test_data, fold):
    """Predict on the test data with one fold's model and write a CSV.

    Loads the fold's checkpoint via ``load_model``, runs every batch of the
    test loader through the model, and writes one ``id,prediction`` row per
    collected prediction to ``FOLD_OUTPUT_DIR/output_fold_{fold}.csv``.

    Args:
        args: Configuration forwarded to ``load_model``, ``get_loaders`` and
            ``process_batch``.
        test_data: Test split, passed to ``get_loaders`` as the second dataset.
        fold: Fold number; selects the checkpoint and names the output file.

    Returns:
        None. The predictions are only written to disk.
    """
    model = load_model(args, fold)
    model.eval()
    _, test_loader = get_loaders(args, None, test_data)

    total_preds = []

    # Inference only: with autograd disabled no graph is retained per batch.
    with torch.no_grad():
        for batch in test_loader:
            # Named ``model_input`` so the builtin ``input`` is not shadowed.
            model_input = process_batch(batch, args)
            preds = model(model_input)

            # Keep the last column of the output
            # (presumably the final sequence step; confirm against the model's output shape).
            preds = preds[:, -1]

            # .cpu() leaves CPU tensors as they are, so one path serves every device string.
            total_preds.extend(preds.detach().cpu().numpy())

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(FOLD_OUTPUT_DIR, exist_ok=True)
    write_path = os.path.join(FOLD_OUTPUT_DIR, f"output_fold_{fold}.csv")
    with open(write_path, 'w', encoding='utf8') as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        # ``row_id`` rather than ``id`` to leave the builtin untouched.
        for row_id, p in enumerate(total_preds):
            w.write('{},{}\n'.format(row_id, p))



In [None]:
# Run inference once per fold (1..fold_n); each call writes its own CSV.
for k in range(1, fold_n+1):
    inference(args, test_data, k)

## Ensemble

In [None]:
# Read the `prediction` column of each per-fold output file (folds 1 through 5).
fold_1, fold_2, fold_3, fold_4, fold_5 = (
    pd.read_csv(f'./fold_output/output_fold_{fold_no}.csv').prediction
    for fold_no in range(1, 6)
)

In [None]:
# Place the five prediction Series side by side, one column per fold.
concat_df = pd.concat([fold_1, fold_2, fold_3, fold_4, fold_5], axis=1, join='outer')

In [None]:
# Row-wise mean across the fold columns gives the ensemble prediction.
mean_df = concat_df.mean(axis=1)

In [None]:
# Write the fold-averaged predictions as "id,prediction" rows.
write_path = os.path.join(FOLD_OUTPUT_DIR, "ensemble_output.csv")
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # ``row_id`` instead of ``id``: at module level a loop variable named ``id``
    # would rebind the builtin ``id`` for the rest of the session.
    for row_id, p in enumerate(mean_df):
        w.write('{},{}\n'.format(row_id, p))