# Ensemble

In [1]:
# HuggingFace
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

from load_data import *
import pandas as pd
import torch
import torch.nn as nn
import pickle as pickle
import numpy as np
import argparse
import os

import json

# SKT KoBERT
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

from ipywidgets import FloatProgress

In [2]:
# HuggingFace Inference Functions
def inference_huggingface(model, tokenized_sent, device, batch_size=40):
    """Run a sequence-classification model over a dataset and collect its logits.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask`` and,
            optionally, ``token_type_ids`` as keyword arguments, and whose first
            output element is the logits tensor.
        tokenized_sent: Dataset whose items are dicts of tensors; it is batched
            with a non-shuffling ``DataLoader`` so output order matches input order.
        device: ``torch.device`` the batch tensors are moved to before the forward pass.
        batch_size: Number of samples per forward pass. Defaults to 40, the
            value that used to be hard-coded.

    Returns:
        ``numpy.ndarray`` holding the logits of every sample, concatenated along axis 0.

    Raises:
        ValueError: Propagated from ``np.concatenate`` when the dataset is empty.
    """
    dataloader = DataLoader(tokenized_sent, batch_size=batch_size, shuffle=False)
    model.eval()
    output_logits = []

    # Inference only: no autograd graph is needed for any batch.
    with torch.no_grad():
        for data in dataloader:
            inputs = {
                'input_ids': data['input_ids'].to(device),
                'attention_mask': data['attention_mask'].to(device),
            }
            # Not every tokenizer emits segment ids; forward them only when present.
            if 'token_type_ids' in data.keys():
                inputs['token_type_ids'] = data['token_type_ids'].to(device)
            logits = model(**inputs)[0]
            output_logits.append(logits.detach().cpu().numpy())
    return np.concatenate(output_logits)

In [3]:
def load_test_dataset(dataset_dir, tokenizer):
    """Read the test split and tokenize it.

    Args:
        dataset_dir: Path forwarded to ``load_data``.
        tokenizer: Tokenizer forwarded to ``tokenized_dataset``.

    Returns:
        Tuple ``(tokenized_test, test_label)``: the tokenized inputs and the
        values of the loaded frame's ``label`` column.
    """
    frame = load_data(dataset_dir)
    # Pull the labels before tokenizing, in the same order as before.
    labels = frame['label'].values
    return tokenized_dataset(frame, tokenizer), labels

In [4]:
# Path to the test split TSV; get_logits() reads this as a module-level global.
test_dataset_dir = "/opt/ml/input/data/test/test.tsv"

In [26]:
def get_logits(cfg_file, steps_per_epoch=550):
    """Load the fine-tuned model described by a JSON config and return its test-set logits.

    Args:
        cfg_file: Path to a JSON config providing at least ``model_name``
            (tokenizer identifier), ``output_dir`` (directory holding the
            checkpoints) and ``num_train_epochs``.
        steps_per_epoch: Training steps per epoch, used to locate the final
            ``checkpoint-<step>`` directory. Defaults to 550, the value that
            used to be hard-coded.

    Returns:
        The array returned by ``inference_huggingface`` for the test set found at
        the module-level ``test_dataset_dir``.
    """
    print("CURR cfg_file: {}".format(cfg_file))
    # Explicit encoding so the config parses the same way regardless of locale.
    with open(cfg_file, encoding="utf-8") as f:
        cfg = json.load(f)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = cfg["model_name"]
    # TODO: handle kobert separately
    tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    MODEL_NAME = cfg["output_dir"]  # model dir.
    # int() keeps the directory name free of a ".0" suffix when the epoch count
    # is stored as a float in the JSON config.
    final_step = int(cfg["num_train_epochs"] * steps_per_epoch)
    model = AutoModelForSequenceClassification.from_pretrained(
        os.path.join(MODEL_NAME, "checkpoint-{}".format(final_step)))
    model.to(device)

    # load test dataset (path comes from the module-level test_dataset_dir)
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    return inference_huggingface(model, test_dataset, device)

In [6]:
# Smoke-test get_logits() on a single config; the recorded run shows shape (1000, 42).
b = get_logits('configs/bert-seed-7-epoch-20.json')
b.shape

CURR cfg_file: configs/bert-seed-7-epoch-20.json


  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}


(1000, 42)

In [13]:
# JSON configs; each one is passed to get_logits() by the HuggingFace ensemble loop.
model_path_huggingface = [
    #TODO: list of models to use in the ensemble
    'configs/roberta.json',
    'configs/bert-seed-7-epoch-20.json',
    'configs/koelectra-epoch-20.json',
    'configs/kobert-epoch-20.json'
]

# Checkpoint files; each one is loaded with torch.load() by the KoBERT inference loop.
model_path_kobert = [
    #TODO: list of models to use in the ensemble
    '/opt/ml/model/model_skt-kobert-both-label.pt', # 20
    '/opt/ml/model/model_skt-kobert-both-label-seed-7.pt', # 21
    '/opt/ml/model/model_kobert_0_2_epoch_10.pt', # 18
    
]

In [8]:
# HuggingFace
# Collect one logits array per config listed in model_path_huggingface.
results = []

for path in model_path_huggingface:
    results.append(get_logits(path))

# Bare expression: the cell output displays the whole list of arrays.
results

CURR cfg_file: configs/roberta.json


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…


CURR cfg_file: configs/bert-seed-7-epoch-20.json
CURR cfg_file: configs/koelectra-epoch-20.json


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


CURR cfg_file: configs/kobert-epoch-20.json


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=426.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=77779.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=51.0, style=ProgressStyle(description_w…




[array([[ 2.914875  ,  0.3328355 ,  6.9130335 , ..., -1.0136776 ,
         -1.067852  , -0.56732583],
        [ 0.9859098 , -0.2885439 ,  0.08498568, ..., -0.75621104,
         -0.5908805 , -0.7222718 ],
        [ 1.0534576 ,  2.995008  , -1.5576702 , ..., -0.58633024,
         -0.6264167 , -0.72085637],
        ...,
        [ 1.0475044 , -0.34612644, -0.02586268, ..., -0.6742935 ,
         -0.6050735 , -0.7355977 ],
        [ 7.53278   ,  0.17950976,  0.7187565 , ..., -1.7230488 ,
         -2.1267328 , -1.5935761 ],
        [ 6.6731253 , -0.43922174,  0.38256612, ..., -1.1662935 ,
         -1.8269639 , -1.6029122 ]], dtype=float32),
 array([[ 5.0713735 ,  2.6281214 ,  8.673834  , ..., -2.2970295 ,
         -2.1820421 , -2.2649179 ],
        [ 1.168229  , -1.6565453 ,  0.5646994 , ..., -0.23038615,
         -1.4705226 , -1.7014961 ],
        [ 0.03604685, 10.674938  , -0.21921825, ..., -2.0160847 ,
          0.06821638, -1.0548279 ],
        ...,
        [ 2.4257016 , -1.9017417 ,  0.1

In [9]:
# Sum the logits across models, take the argmax per sample, and show the shape.
np.argmax(np.sum(np.array(results), axis=0), axis=1).shape

(1000,)

In [10]:
# Predicted class index per sample: argmax of the logits summed across models.
np.argmax(np.sum(np.array(results), axis=0), axis=1)

array([ 2, 10,  1,  0,  0,  0,  7,  0,  4, 20,  0,  0,  0,  8,  0,  0,  0,
        0,  0,  0, 15,  0,  0, 27,  4,  4, 10,  0,  0, 21,  0,  4, 10, 21,
        0,  0, 21,  4,  0,  0,  0,  0,  0, 25, 17,  0,  9,  2,  0, 15,  0,
       10,  0,  2,  0, 15,  0,  0,  0, 10,  0, 33,  0, 17,  0,  2, 24,  0,
       10,  0,  0,  0,  0, 10,  0,  0,  2, 15, 14,  0, 15,  0,  0, 10, 15,
        0,  4,  6,  7,  0,  0, 12,  0,  0, 21,  0,  8,  0, 15,  9,  0,  2,
        0, 21,  7,  0,  0,  7,  0,  0,  0,  2,  0,  0,  4,  0,  2,  0,  0,
        0,  0,  0,  6,  0,  4, 20,  7,  0, 10, 15,  0,  0,  0,  0, 10,  0,
       20,  0,  4,  0,  0, 10,  0, 10,  0, 15,  0, 10,  0,  9, 10, 21,  2,
        8,  0,  0, 17,  0,  0,  0,  0,  0,  0, 10,  0,  0,  0, 21,  0,  0,
       10,  0,  0,  0,  0,  0,  0, 27,  2,  0,  2,  0, 39,  2, 22,  0,  7,
       10,  0, 25,  0,  5,  0, 20, 10,  0, 11, 10,  0, 15, 21, 24,  2,  0,
        4,  0,  0, 27,  0,  0, 10,  7, 15,  0,  4,  0,  0,  5, 15,  0,  0,
        0,  0, 10,  8,  0

In [17]:
# Show the installed transformers package metadata (the recorded output reports version 3.0.0).
!pip show transformers

Name: transformers
Version: 3.0.0
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.7/site-packages
Requires: packaging, sacremoses, filelock, regex, sentencepiece, requests, numpy, tokenizers, tqdm
Required-by: kobart


In [11]:
# SKT KoBERT-related classes
class BERTDataset(Dataset):
    """Dataset that runs every row through a gluonnlp ``BERTSentenceTransform`` up front.

    Indexing returns the stored transform output for that row with the row's
    label appended as the last tuple element.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        # Two separate passes over `dataset`: first the sentences, then the labels.
        self.sentences = list(to_features([row[sent_idx]]) for row in dataset)
        self.labels = list(np.int32(row[label_idx]) for row in dataset)

    def __getitem__(self, i):
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label,)

    def __len__(self):
        return len(self.labels)
    
class BERTClassifier(nn.Module):
    """Linear classification head applied to a BERT encoder's pooled output.

    Args:
        bert: Encoder module. Its forward is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return a pair
            whose second element is the pooled representation.
        hidden_size: Width of the pooled representation fed to the linear layer.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output; when falsy
            (``None`` or ``0``) no dropout layer is created or applied.
        params: Unused; kept so existing call sites stay valid.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 42,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones over the first ``valid_length[i]`` positions of row ``i``."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits for a batch.

        Args:
            token_ids: Integer tensor of token ids, one row per sample.
            valid_length: Per-sample count of leading non-padding positions.
            segment_ids: Segment (token type) ids aligned with ``token_ids``.
        """
        # The mask is created with zeros_like(token_ids), so it already lives on
        # token_ids' device and is already float.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Without a dropout rate the pooled output goes straight to the classifier;
        # previously `out` was left unassigned in that case and forward crashed.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [12]:
# SKT KoBERT inferencing code
# Fall back to CPU when CUDA is unavailable, the same way get_logits() picks its device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bertmodel, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Only max_len and batch_size are used in this cell; the remaining values are
# not referenced here.
max_len = 128
batch_size = 32
warmup_ratio = 0.01
num_epochs = 10
max_grad_norm = 1
log_interval = 50
learning_rate = 5e-5

# Dataset setup: prepend both entities to the sentence, then dump the
# sentence/label columns as a TSV that gluonnlp's TSVDataset can read back.
dataset_path = r"/opt/ml/input/data/test/test.tsv"
dataset = load_data(dataset_path)
dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']
dataset[['sentence','label']].to_csv("/opt/ml/input/data/test/test.txt", sep='\t', index=False)

# Dataset Load (num_discard_samples=1 skips the header row written by to_csv above)
dataset_test = nlp.data.TSVDataset("/opt/ml/input/data/test/test.txt", field_indices=[0,1], num_discard_samples=1)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

using cached model
using cached model
using cached model


In [15]:
# Run every KoBERT checkpoint over the test set and save its logits to disk.
# Create the output directory up front so np.save cannot fail on a missing path.
os.makedirs('/opt/ml/logits', exist_ok=True)

for path in model_path_kobert:
    # map_location lets a checkpoint saved on another device load onto `device`.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(path, map_location=device))
    model.eval()
    Predict = []

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            # `label` is not needed for prediction, so it is left where it is.
            out = model(token_ids, valid_length, segment_ids)
            Predict.extend(out.detach().cpu().numpy())
    # File name: checkpoint base name up to its first dot, prefixed with "logit_".
    np.save('/opt/ml/logits/logit_{}.npy'.format(path.split("/")[-1].split(".")[0]), Predict)

In [9]:
# Shape of the per-sample argmax over the logits left in `Predict`.
np.argmax(np.array(Predict), axis=1).shape

(1000,)

CURR cfg_file: /opt/ml/logits/logit_roberta_large-seed-26.npy


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x93 in position 0: invalid start byte

In [None]:
# Ensemble by summing the logits across models and taking the per-sample argmax.
pred_answer = np.argmax(np.sum(np.array(results), axis=0), axis=1)

# Write the predictions as a single-column submission file.
output = pd.DataFrame({'pred': pred_answer})
output.to_csv('./prediction/submission_{}.csv'.format("ensemble_huggingface"), index=False)

In [16]:
import glob

In [18]:
# glob returns matches in arbitrary order; sort so the stacking order of the
# logits files is the same on every run.
paths = sorted(glob.glob('/opt/ml/logits/*.npy'))

In [22]:
# Load every saved logits file listed in `paths`.
results = []
for path in paths:
    results.append(np.load(path))

# Ensemble prediction: argmax of the logits summed across all loaded files.
print(np.argmax(np.sum(np.array(results), axis=0), axis=1))

[ 2 10  1  0  0  0  7  0  4 20  0  0  0  8  0  0  0  7  0 10 15  5  0 27
  4  4 10  0  0 21  0  4 12 21 23  0 21  4  0  0  0  0  0  8 17  0  9  2
  0 15  0 10  0  2  0 15  4  0  0 10  0 33  0 17  4  2 24  0 12  0  0  0
  0 10  0  0  2 15 14  0  3  0  0 10 15  0  4  6  7  2  0 12  0 17 11  0
  8 14 15 15 15  2  0 21  7  0  0  8  0  0  4  2  0  0  4  0  2  0  0  0
  0 15  6  0  4 20  7  0 10 15  0  0  0  0 10  0 20  0  4  0  0 10  0 10
  0 15  0 10 10  9 10 21  2  8  0  0 17  0  0  0  0  0  0 10  0 10  9  8
  0  0 10  0  0 20 17  0  0 27  2  0  2  0 39  2 22  0  9 10  0 25  0  5
 32 20 10  0  8 10  0 15  8 24  2  0  4  0  0 27  0 10 10  7 15  0  4  5
  0  5 15  0  0  0  0 10  8  0  1  2  0  4 27  4  0  0  0  0  0  0 10  0
 10  0  0  0 22  0  5  0  7 21  0  0  5  5  5  8  6 10 17  0  2 17  6  4
  4  0  0  0 15  0 15  0  2  7 22  0  0  8  0  0  4  8  5  2  0 10  0 10
  8  2 15  2  0 10  8  0  0  0 10  0  8  0  4  4  0  0  2  4  0  0  0 10
  0  0  0 15  0 15 10  6  0  0  0  4 22 15  4  0  2

In [24]:
# Write the all-model ensemble prediction (argmax of summed logits) as a submission file.
output = pd.DataFrame({'pred': np.argmax(np.sum(np.array(results), axis=0), axis=1)})
output.to_csv('/opt/ml/submission_{}.csv'.format("ensemble_all"), index=False)

In [29]:
# Hand-picked subset of saved logits files, replacing the glob-derived list.
# NOTE(review): the trailing #NN annotations are not explained in this notebook — TODO confirm what they denote.
paths = [
    '/opt/ml/logits/logit_roberta_leaderboard_large.npy', #32
    '/opt/ml/logits/logit_koelectra-epoch-20.npy', #28
    '/opt/ml/logits/logit_model_skt-kobert-both-label.npy', #20
    '/opt/ml/logits/logit_roberta_large-seed-26.npy',
    '/opt/ml/logits/logit_bert-seed-7-epoch-20.npy', #29
]

In [30]:
# Load the selected logits files.
results = []
for path in paths:
    results.append(np.load(path))

# Sum the logits across models, take the per-sample argmax, then print and save it.
output = pd.DataFrame({'pred': np.argmax(np.sum(np.array(results), axis=0), axis=1)})
print(output['pred'].values)
output.to_csv('/opt/ml/submission_{}.csv'.format("ensemble_top_5"), index=False)

[ 2 10  1  0  0  0  7 28  4 20  0  0  0  8  0  0  0  0  0 10 15  5  0 27
  4  4 10  0  0 21  0  4 10  8 23  0 21  4  0  0  0  0  0 25 17  0  9  2
  0 15  0 10  0  2  0 15 39  0  0 10  0 33  0 17  4  2 24 21 12  0  0  0
  0 10  0  0  2 15 14  0 15  0  0 10 15  0  4  6  7  2  5 12  0 17 21  0
  8 14 15 15 15  2  0 21  7  0  0  7  0  0  0  2  0  0  4  0  2  0  0  0
  0 15  6  0  4 20  7  0 10 15  0  0  0  0 10  0 20  0  4  0  0 10  0 10
  0 15  0 10  0  9 10 21  2  8  0  0 17  0  0  0  0  0  0 10  0 10  9  8
  0  0 10  0  0 20 17  0  0 27  2  0  2  0 39  2 22  0  9 10  0 25  0  5
 32 20 10  0 11 10  0 15 21 24  2  0  4  0  0 27  0 10 10  7 15  0  4  5
  0  5 15  0  0  0  0 10  8  0  0  2  0  4 27  4  0  0  0  0  0  0 10  0
 10  0  0  0 22  0  5  0  7 21  0  0  5  5  5  8  6 10 17  0  2 17  6  4
  4  0  0  0 15  0 15  0  2  7 22 10  0  8  0  0  4  8 25  2  0 10  0 10
  8  2 15  2  0 10 21  0  0  0 10  0  8  0  4  4  7  0  2  4  0  0  0  0
  4  0  0 15  0 15 10  6  0  0  0  4 22 15  4  0  2