In [1]:
# HuggingFace
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

from load_data import *
import pandas as pd
import torch
import torch.nn as nn
import pickle as pickle
import numpy as np
import argparse
import os

import json
from ipywidgets import FloatProgress

In [9]:
# HuggingFace Inference Functions
def inference_huggingface(model, tokenized_sent, device, batch_size=40):
    """Run a model over a dataset batch by batch and collect the logits.

    Args:
        model: Object exposing ``eval()`` that is callable with keyword tensors
            and returns a sequence whose first element is the logits tensor.
        tokenized_sent: Map-style dataset whose items are dicts of tensors with
            at least ``input_ids`` and ``attention_mask``; ``token_type_ids``
            is forwarded to the model only when a batch contains it.
        device: ``torch.device`` the input tensors are moved to. The model
            itself is not moved here, so it must already be on that device.
        batch_size: Number of samples per forward pass. Defaults to 40, the
            value that used to be hard-coded.

    Returns:
        ``numpy.ndarray`` holding the logits of every sample in dataset order
        (the loader does not shuffle), concatenated along the first axis.

    Raises:
        ValueError: If the dataset yields no batches.
        KeyError: If a batch lacks ``input_ids`` or ``attention_mask``.
    """
    dataloader = DataLoader(tokenized_sent, batch_size=batch_size, shuffle=False)
    model.eval()
    output_logits = []

    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        for batch in dataloader:
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            # Not every tokenizer emits segment ids, so forward them only if present.
            if 'token_type_ids' in batch:
                inputs['token_type_ids'] = batch['token_type_ids'].to(device)

            outputs = model(**inputs)
            # The logits are taken from the first element of the model output.
            output_logits.append(outputs[0].detach().cpu().numpy())

    if not output_logits:
        # np.concatenate([]) would raise ValueError too, with a less helpful message.
        raise ValueError("tokenized_sent produced no batches; nothing to run inference on")
    return np.concatenate(output_logits)

In [10]:
def load_test_dataset(dataset_dir, tokenizer):
    """Read the test file and return its tokenized form together with its labels.

    Args:
        dataset_dir: Path handed to ``load_data``.
        tokenizer: Tokenizer handed to ``tokenized_dataset``.

    Returns:
        Tuple ``(tokenized, labels)``: the result of ``tokenized_dataset`` on the
        loaded frame, and the values of that frame's ``label`` column.
    """
    frame = load_data(dataset_dir)
    # Labels are read before tokenization, in the same order as the original code.
    labels = frame['label'].values
    encoded = tokenized_dataset(frame, tokenizer)
    return encoded, labels

In [11]:
# Path of the test TSV. get_logits and get_logits_kobert read this notebook-level
# name directly instead of taking the path as an argument.
test_dataset_dir = "/opt/ml/input/data/test/test.tsv"

In [16]:
def get_logits(cfg_file, steps_per_epoch=550):
    """Load the fine-tuned model described by a JSON config and return its test-set logits.

    The config must provide ``model_name`` (identifier passed to
    ``AutoTokenizer.from_pretrained``), ``output_dir`` (directory containing
    the checkpoints) and ``num_train_epochs``. The checkpoint loaded is
    ``<output_dir>/checkpoint-<num_train_epochs * steps_per_epoch>``.

    Args:
        cfg_file: Path to the JSON config file.
        steps_per_epoch: Multiplier used to derive the checkpoint step from the
            epoch count. Defaults to 550, the value that used to be hard-coded
            (presumably the number of training steps per epoch; confirm against
            the training setup).

    Returns:
        The array returned by ``inference_huggingface`` for the test set read
        from the notebook-level ``test_dataset_dir``.
    """
    print("CURR cfg_file: {}".format(cfg_file))
    with open(cfg_file) as f:
        cfg = json.load(f)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # TODO: KoBERT has to be handled separately (it needs its own tokenizer class).
    tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])

    # int() keeps the directory name integral ("checkpoint-11000") even if the
    # config stores the epoch count as a float.
    final_step = int(cfg["num_train_epochs"] * steps_per_epoch)
    checkpoint_dir = os.path.join(cfg["output_dir"], "checkpoint-{}".format(final_step))
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
    model.to(device)

    # Tokenize the test set and wrap it in the project's dataset class.
    tokenized_test, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(tokenized_test, test_label)

    return inference_huggingface(model, test_dataset, device)

In [6]:
# Config files of the models whose test-set logits are extracted by the loop cell below.
# NOTE(review): the recorded output of that loop also lists
# configs/kobert-epoch-20.json, which is not in this list; presumably the list
# was edited after the run, so a fresh Run All will not reproduce that output.
model_path_huggingface = [
    'configs/roberta.json',
    'configs/bert-seed-7-epoch-20.json',
    'configs/koelectra-epoch-20.json',
]

In [7]:
# Run inference for every listed config and save its logits as
# logit_<config file name up to the first dot>.npy.
for path in model_path_huggingface:
    result = get_logits(path)
    config_stem = path.split('/')[-1].split('.')[0]
    np.save('/opt/ml/logits/logit_{}.npy'.format(config_stem), result)

CURR cfg_file: configs/roberta.json


  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}


CURR cfg_file: configs/bert-seed-7-epoch-20.json
CURR cfg_file: configs/koelectra-epoch-20.json
CURR cfg_file: configs/kobert-epoch-20.json


In [8]:
# Sanity check: reload one of the saved logit files and display it.
test = np.load('/opt/ml/logits/logit_roberta.npy')
test

array([[ 2.914875  ,  0.3328355 ,  6.9130335 , ..., -1.0136776 ,
        -1.067852  , -0.56732583],
       [ 0.9859098 , -0.2885439 ,  0.08498568, ..., -0.75621104,
        -0.5908805 , -0.7222718 ],
       [ 1.0534576 ,  2.995008  , -1.5576702 , ..., -0.58633024,
        -0.6264167 , -0.72085637],
       ...,
       [ 1.0475044 , -0.34612644, -0.02586268, ..., -0.6742935 ,
        -0.6050735 , -0.7355977 ],
       [ 7.53278   ,  0.17950976,  0.7187565 , ..., -1.7230488 ,
        -2.1267328 , -1.5935761 ],
       [ 6.6731253 , -0.43922174,  0.38256612, ..., -1.1662935 ,
        -1.8269639 , -1.6029122 ]], dtype=float32)

In [17]:
# One-off extraction for a config that is not part of model_path_huggingface.
result = get_logits('configs/bert-epoch-20.json')
np.save('/opt/ml/logits/logit_bert-epoch-20.npy', result)

CURR cfg_file: configs/bert-epoch-20.json


In [18]:
# Show the metadata of the installed transformers package (version, location, dependencies).
!pip show transformers

Name: transformers
Version: 3.0.0
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.7/site-packages
Requires: tokenizers, sentencepiece, numpy, packaging, requests, regex, sacremoses, tqdm, filelock
Required-by: kobart


In [2]:
# Extract logits using the KoBERT tokenizer
from tokenization_kobert import KoBertTokenizer

def get_logits_kobert(cfg_file, steps_per_epoch=550):
    """Load a fine-tuned KoBERT model described by a JSON config and return its test-set logits.

    Mirrors ``get_logits`` but builds the tokenizer with ``KoBertTokenizer``
    instead of ``AutoTokenizer``. The config must provide ``model_name``,
    ``output_dir`` and ``num_train_epochs``. The checkpoint loaded is
    ``<output_dir>/checkpoint-<num_train_epochs * steps_per_epoch>``.

    Args:
        cfg_file: Path to the JSON config file.
        steps_per_epoch: Multiplier used to derive the checkpoint step from the
            epoch count. Defaults to 550, the value that used to be hard-coded
            (presumably the number of training steps per epoch; confirm against
            the training setup).

    Returns:
        The array returned by ``inference_huggingface`` for the test set read
        from the notebook-level ``test_dataset_dir``.
    """
    print("CURR cfg_file: {}".format(cfg_file))
    with open(cfg_file) as f:
        cfg = json.load(f)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # KoBERT is loaded through its dedicated tokenizer class.
    tokenizer = KoBertTokenizer.from_pretrained(cfg["model_name"])

    # int() keeps the directory name integral ("checkpoint-11000") even if the
    # config stores the epoch count as a float.
    final_step = int(cfg["num_train_epochs"] * steps_per_epoch)
    checkpoint_dir = os.path.join(cfg["output_dir"], "checkpoint-{}".format(final_step))
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
    model.to(device)

    # Tokenize the test set and wrap it in the project's dataset class.
    tokenized_test, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(tokenized_test, test_label)

    return inference_huggingface(model, test_dataset, device)

In [12]:
# Display the logits stored on disk for the KoBERT run.
np.load('/opt/ml/logits/logit_kobert-epoch-20.npy')

array([[ 1.5063766 , -0.47663715, 12.623493  , ..., -0.34894428,
        -0.38458663,  1.1262532 ],
       [ 1.99646   , -0.42798647,  2.458978  , ..., -1.8876233 ,
        -1.6238039 , -1.2348315 ],
       [-0.30265176, 10.386776  , -0.8077401 , ..., -2.0220146 ,
        -0.3407276 ,  1.3264096 ],
       ...,
       [ 2.3331583 , -0.44706666,  2.611505  , ..., -2.0147176 ,
        -1.6955225 , -1.4178125 ],
       [ 5.540759  ,  1.8700576 ,  0.3828238 , ..., -1.0348088 ,
        -1.6112745 , -0.81912065],
       [ 3.9532156 , -3.3721848 ,  0.86216   , ...,  2.223165  ,
        -2.1250982 , -3.461781  ]], dtype=float32)

In [19]:
import glob

# Ensemble check: sum the logits saved by every model and display the shape of
# the per-sample argmax over the summed logits.
paths = glob.glob('/opt/ml/logits/*.npy')
results = [np.load(logit_file) for logit_file in paths]

summed_logits = np.array(results).sum(axis=0)
summed_logits.argmax(axis=1).shape

(1000,)

In [8]:
# roberta-large (seed 26): run test-set inference from a fixed checkpoint and
# save the logits. This repeats the steps of get_logits, except that the
# checkpoint step is spelled out instead of being derived from the epoch count.
cfg_file = 'configs/roberta-large-seed-26.json'
print("CURR cfg_file: {}".format(cfg_file))
with open(cfg_file) as f:
    cfg = json.load(f)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer named in the config.
TOK_NAME = cfg["model_name"]
tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

# Load the fine-tuned weights from the chosen checkpoint and move them to the device.
MODEL_NAME = cfg["output_dir"]  # model dir.
model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(MODEL_NAME, "checkpoint-2700"))
model.to(device)

# Tokenize the test set; the path comes from the notebook-level test_dataset_dir.
test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
test_dataset = RE_Dataset(test_dataset, test_label)

# Predict and persist the raw logits.
pred_answer = inference_huggingface(model, test_dataset, device)

np.save('/opt/ml/logits/logit_roberta_large-seed-26.npy', pred_answer)

CURR cfg_file: configs/roberta-large-seed-26.json


In [6]:
# BERT with entity-marker tokens: tokenize the test set with the extended
# tokenizer, load a fixed checkpoint, run inference and save the logits.
cfg_file = 'configs/bert-with-token.json'
print("CURR cfg_file: {}".format(cfg_file))
with open(cfg_file) as f:
    cfg = json.load(f)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# load tokenizer
TOK_NAME = cfg["model_name"]
# TODO: handle kobert separately
tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
# NOTE(review): the list below contains ONE string, '[E1], [/E1], [E2], [/E2]',
# so a single special token is registered rather than four separate markers.
# This looks unintended, but the tokenizer here must match the one used during
# training. Confirm against the training script before changing it.
tokenizer.add_special_tokens({'additional_special_tokens': ['[E1], [/E1], [E2], [/E2]']})

# load test dataset
# This rebinds the notebook-level test_dataset_dir (same value as the earlier cell).
test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
test_dataset = load_data_with_entity(test_dataset_dir)
test_label = test_dataset['label'].values
tokenized_test = tokenized_dataset(test_dataset, tokenizer)
# test_dataset is rebound from the loaded frame to the RE_Dataset wrapper.
test_dataset = RE_Dataset(tokenized_test ,test_label)

# load my model
MODEL_NAME = cfg["output_dir"] # model dir.
model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(MODEL_NAME, "checkpoint-11000"))
# NOTE(review): the next line is a bare attribute access with no effect.
model.parameters
model.to(device)
# NOTE(review): presumably a no-op when the checkpoint's embedding matrix already
# has len(tokenizer) rows; if the sizes differ, this call alters the loaded
# embeddings. Verify that the two sizes match.
model.resize_token_embeddings(len(tokenizer))


# predict answer
#print("Start prediction...")
pred_answer = inference_huggingface(model, test_dataset, device)

# Persist the raw logits next to the other models' outputs.
np.save('/opt/ml/logits/logit_bert-with-token.npy', pred_answer)

CURR cfg_file: configs/bert-with-token.json


  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}


In [7]:
# Display the logits held in pred_answer (set by whichever inference cell ran last).
pred_answer

array([[ 8.296289  , -0.04613327,  8.675463  , ..., -1.5508859 ,
        -2.844798  , -2.0943873 ],
       [ 0.8893885 , -0.32703027,  0.23474614, ..., -1.4912081 ,
        -1.1003903 , -0.94410586],
       [-1.0814409 ,  2.5289311 , -0.08361945, ..., -0.7561385 ,
        -0.14784351, -1.8908087 ],
       ...,
       [ 1.0392582 , -0.5103955 ,  0.37354428, ..., -1.7872754 ,
        -1.2873018 , -0.9500302 ],
       [13.0262575 , -0.29220685,  0.28048193, ..., -3.5321977 ,
        -4.244457  , -3.6638148 ],
       [ 6.444935  , -1.9576925 ,  0.70683753, ..., -0.29889292,
        -3.6757355 , -2.274559  ]], dtype=float32)