In [6]:
# !pip install mxnet
# !pip install gluonnlp
# !pip install sentencepiece
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
# !pip install transformers==3.0.2

## Library import

In [1]:
import os
import random
import numpy as np
import pandas as pd
import gluonnlp as nlp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm, tqdm_notebook
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, load_metric, load_from_disk
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, EarlyStoppingCallback, AdamW

# from transformers.optimization import get_cosine_schedule_with_warmup
# from kobert.utils import get_tokenizer
# from kobert.pytorch_kobert import get_pytorch_kobert_model

### Device & Random_seed setting

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = True  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore

seed_everything(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


---

### Kobert for Hugging Face

In [55]:
MODEL = 'skt/kobert-base-v1'
kobert = AutoModel.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.truncation_side = 'left'

In [56]:
# print(model)

In [57]:
MAX_LEN = 512
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")

def tokenized_fn(data):
    outputs = tokenizer(data["article"], padding=True, max_length=MAX_LEN, truncation=True)
    if 'label' in data:
        outputs["labels"] = data["label"]
    return outputs

def metric_fn(p):
    preds, labels = p
    output = _metric.compute(references=lables, predictions=np.argmax(preds, axis=-1))
    return output


train_dataset = load_dataset("csv", data_files="./data/train_report.csv")["train"]
valid_dataset = load_dataset("csv", data_files="./data/valid_report.csv")["train"]

train_dataset = train_dataset.map(tokenized_fn, remove_columns=['filename', 'article', 'length', 'label'])
valid_dataset = valid_dataset.map(tokenized_fn, remove_columns=['filename', 'article', 'length', 'label'])

Using custom data configuration default-e8f1ba7701d4e902
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-e8f1ba7701d4e902/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-21a9e1aedcee44a1
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-21a9e1aedcee44a1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/piai/.cache/huggingface/datasets/csv/default-e8f1ba7701d4e902/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-f24d4418ea6a3ed7.arrow
Loading cached processed dataset at /home/piai/.cache/huggingface/datasets/csv/default-21a9e1aedcee44a1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-37a56cd247a4152b.arrow


In [58]:
class ClassificationHead(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # self.dense1 = torch.nn.Linear(768, 3072)
        self.dropout = torch.nn.Dropout(0.25)
        # self.dense2 = torch.nn.Linear(3072, 768)
        self.output = torch.nn.Linear(768, 2)
    
    def forward(self, features):
        # 보통 분류기에선 start 토큰에 분류 결과를 담음
        x = features[:, 0, :]    # take <s> token (equiv. to [CLS])
        x = x.reshape(-1, x.size(-1))
        x = self.dropout(x)
#         x = self.dense1(x)
#         x = torch.relu(x)
#         x = self.dropout(x)
        x = self.output(x)
        return x

class KobertForAinalyst(torch.nn.Module):
    def __init__(self, model):
        super(KobertForAinalyst, self).__init__()
        self.model = model
        self.classifier = ClassificationHead()
    
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # labels=labels
        )
        self.labels = labels
        logits = self.classifier(outputs["last_hidden_state"])
        # prob = torch.nn.functional.softmax(logits, dim=-1)
        
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return logits, loss
        else:
            return logits

In [59]:
model = KobertForAinalyst(model=kobert)
model.to(device)
model = torch.nn.DataParallel(model)
isParallel = True

In [60]:
batch_size = 16

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    sampler = torch.utils.data.RandomSampler(train_dataset),
    batch_size = batch_size,
    collate_fn = _collator,
)

valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    sampler = torch.utils.data.SequentialSampler(valid_dataset),
    batch_size = batch_size,
    collate_fn = _collator,
)

In [61]:
import gc
def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()
    
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [62]:
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

In [64]:
clear_cache()

epochs = 20
for epoch in range(epochs):
    print(f"==================== Epoch {epoch+1}/{epochs} ====================")
    total_train_loss = 0
    model.train()
    
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training: ", ncols=100)):
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)
        
        model.zero_grad()
        
        logits, loss = model(
            input_ids = batch_input_ids,
            attention_mask = batch_attention_mask,
            labels = batch_labels,
        )

        if isParallel:
            loss = loss.mean()

        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
        if step % 1000 == 0 and not step == 0:
            print("step : {:>5,} of {:>5,} loss: {:.5f}".format(step, len(train_dataloader), loss.item()))
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"    >>> Average training loss: {avg_train_loss:.5f}", end='\n\n')
    
    # Validation
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    
    print("Running Validation...")
    for step, batch in enumerate(valid_dataloader):
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)
    
        with torch.no_grad():
            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )
            
            if isParallel:
                loss = loss.mean()
            
            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = batch_labels.to("cpu").numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(valid_dataloader)
    print(f"    >>> Accuracy: {avg_val_accuracy:.5f}", end='\n\n')



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:11<00:00,  1.94it/s]


    >>> Average training loss: 0.68620

Running Validation...
    >>> Accuracy: 0.55580



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:15<00:00,  1.82it/s]


    >>> Average training loss: 0.67214

Running Validation...
    >>> Accuracy: 0.60268



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:12<00:00,  1.89it/s]


    >>> Average training loss: 0.65747

Running Validation...
    >>> Accuracy: 0.54688



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:11<00:00,  1.93it/s]


    >>> Average training loss: 0.57733

Running Validation...
    >>> Accuracy: 0.78571



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:13<00:00,  1.88it/s]


    >>> Average training loss: 0.44837

Running Validation...
    >>> Accuracy: 0.83482



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:13<00:00,  1.87it/s]


    >>> Average training loss: 0.38694

Running Validation...
    >>> Accuracy: 0.82812



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:14<00:00,  1.86it/s]


    >>> Average training loss: 0.32772

Running Validation...
    >>> Accuracy: 0.86607



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:15<00:00,  1.83it/s]


    >>> Average training loss: 0.30163

Running Validation...
    >>> Accuracy: 0.89062



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:13<00:00,  1.87it/s]


    >>> Average training loss: 0.26507

Running Validation...
    >>> Accuracy: 0.92634



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:14<00:00,  1.86it/s]


    >>> Average training loss: 0.23675

Running Validation...
    >>> Accuracy: 0.86161



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:12<00:00,  1.90it/s]


    >>> Average training loss: 0.18758

Running Validation...
    >>> Accuracy: 0.94866



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:12<00:00,  1.89it/s]


    >>> Average training loss: 0.16031

Running Validation...
    >>> Accuracy: 0.96875



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:11<00:00,  1.93it/s]


    >>> Average training loss: 0.17031

Running Validation...
    >>> Accuracy: 0.95312



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:11<00:00,  1.92it/s]


    >>> Average training loss: 0.11417

Running Validation...
    >>> Accuracy: 0.96429



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:12<00:00,  1.91it/s]


    >>> Average training loss: 0.09854

Running Validation...
    >>> Accuracy: 0.98884



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:12<00:00,  1.90it/s]


    >>> Average training loss: 0.07089

Running Validation...
    >>> Accuracy: 0.97991



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:16<00:00,  1.80it/s]


    >>> Average training loss: 0.08388

Running Validation...
    >>> Accuracy: 0.97768



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:12<00:00,  1.89it/s]


    >>> Average training loss: 0.06955

Running Validation...
    >>> Accuracy: 0.99554



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:11<00:00,  1.93it/s]


    >>> Average training loss: 0.06379

Running Validation...
    >>> Accuracy: 0.99330



Training: 100%|███████████████████████████████████████████████████| 138/138 [01:11<00:00,  1.92it/s]


    >>> Average training loss: 0.06719

Running Validation...
    >>> Accuracy: 0.99107



In [65]:
test_dataset = load_dataset("csv", data_files=f"./data/test_report.csv")["train"]
test_dataset = test_dataset.map(tokenized_fn, remove_columns=["filename", "article","length", "label"])

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    sampler = torch.utils.data.SequentialSampler(test_dataset),
    batch_size = batch_size,
    collate_fn = _collator,
)

Using custom data configuration default-cad80f5e825dbe52


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-cad80f5e825dbe52/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-cad80f5e825dbe52/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/550 [00:00<?, ?ex/s]

In [66]:
print("Run Testing...")

model.eval()
total_test_accuracy = 0
total_test_loss = 0
nb_test_steps = 0

for step, batch in enumerate(test_dataloader):
    batch_input_ids = batch["input_ids"].to(device)
    batch_attention_mask = batch["attention_mask"].to(device)
    batch_labels = batch["labels"].to(device)

    with torch.no_grad():
        logits, loss = model(
            input_ids = batch_input_ids,
            attention_mask = batch_attention_mask,
            labels = batch_labels,
        )

        if isParallel:
            loss = loss.mean()

        total_test_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = batch_labels.to("cpu").numpy()
        total_test_accuracy += flat_accuracy(logits, label_ids)

avg_test_accuracy = total_test_accuracy / len(test_dataloader)
print(f"    >>> Accuracy: {avg_test_accuracy:.5f}")
print()

Run Testing...
    >>> Accuracy: 0.87143



In [68]:
inference_dataset = load_dataset("csv", data_files=f"./data/report_dataset.csv")["train"]
inference_dataset = inference_dataset.map(tokenized_fn, remove_columns=["Unnamed: 0", "company", "title", "article", "opinion", "firm", "date"])

inference_dataloader = torch.utils.data.DataLoader(
    inference_dataset,
    sampler = torch.utils.data.SequentialSampler(inference_dataset),
    batch_size = 1,
    collate_fn = _collator,
)

Using custom data configuration default-d2549ac2fb5ac5b1
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-d2549ac2fb5ac5b1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50083 [00:00<?, ?ex/s]

In [70]:
model.eval()

probabilities = []
predictions = []

for step, batch in enumerate(tqdm(inference_dataloader, desc="Inference ", ncols=100)):
    batch_input_ids = batch["input_ids"].to(device)
    batch_attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(
            input_ids = batch_input_ids,
            attention_mask = batch_attention_mask,
        )
        
        prob = torch.nn.functional.softmax(logits, dim=-1)
        predict = torch.argmax(prob, axis=1)
        
        prob = np.trunc(np.max(prob.detach().cpu().numpy(), axis=1) * 100)
        predict = predict.detach().cpu().numpy()
        
        probabilities.append(prob[0])
        predictions.append(predict[0])

Inference : 100%|█████████████████████████████████████████████| 50083/50083 [22:51<00:00, 36.53it/s]


In [71]:
origin_inference = pd.read_csv("./data/report_dataset.csv")

convert_predictions = list(map(lambda x: "매수" if x == 1 else "매도", predictions))
origin_inference = origin_inference.drop(labels="Unnamed: 0", axis=1)
origin_inference["predictions"] = convert_predictions
origin_inference["pred_rate"] = probabilities
origin_inference.to_csv(f"./data/convert_inference_data_Kobert.csv", index=False)

In [None]:
# args = TrainingArguments(
#     'models/',
#     overwrite_output_dir=True,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=16,
#     gradient_accumulation_steps=4,
#     learning_rate=2e-5,
#     num_train_epochs=30,
#     save_total_limit=5,
#     do_train=True,
#     do_eval=True,
#     # eval_epochs=1,
#     save_strategy="epoch",
#     logging_strategy="epoch",
#     evaluation_strategy="epoch",
#     load_best_model_at_end = True,
#     # metric_for_best_model = 'f1',
# )

# trainer = Trainer(
#         model=model,
#         args=args,
#         data_collator=_collator,
#         train_dataset=train_dataset,
#         eval_dataset=valid_dataset,
#         tokenizer=tokenizer,
#         compute_metrics=metric_fn,
#         callbacks=[EarlyStoppingCallback(early_stopping_patience = 5)]
# )

# do_train()

---
#### Kobert by using kobert library

In [7]:
model, vocab = get_pytorch_kobert_model()

/home/piai/hustar/Hustar_Group_4_TeamP/testMH/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/home/piai/hustar/Hustar_Group_4_TeamP/testMH/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [8]:
train_dataset = nlp.data.TSVDataset('./data/train_report.tsv', field_indices=[1,3], num_discard_samples=1)
valid_dataset = nlp.data.TSVDataset('./data/valid_report.tsv', field_indices=[1,3], num_discard_samples=1)

In [9]:
train_dataset[0]

['하지만 이번 경기 Cycle 에는 이자유예, 양극화 등 여러 요인으로 자산건전성이 경기에 후행할 수도 있다. 2021 년 순이익 추정치를 1 조 5,560 억원으로 6% 상향조정하지만, 여느 때에 비해 실적의 불확실성은 높은 편이다. 장기적인 배당 불확실성 감안하여 목표주가 9,000 원으로 하향조정기업은행은 국책은행이어서 금감원의 배당 축소 권고 대상에 포함되지 않는다. 그럼에도 불구하고 과연 배당성향을 더 높일 수 있을지는 불확실하다. 앞으로도 정책적인 역할을 중시할 가능성이 있기 때문이다. 장기 배당성향 기대치를 35%에서 30%로 낮추면서목표주가를 10,000 원에서 9,000 원으로 하향조정한다. 기존의 ‘중립’ 의견을 유지한다.',
 '0']

In [10]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [11]:
max_len = 512
batch_size = 16
warmup_ratio = 0.1
num_epochs = 30
max_grad_norm = 1
log_interval = 200
learning_rate = 2e-5

In [14]:
# #토큰화
# tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)

# #BERTDataset 클래스 이용, TensorDataset으로 만들어주기
# train_dataset = BERTDataset(train_dataset, 0, 1, tokenizer, max_len, True, False)
# valid_dataset = BERTDataset(valid_dataset, 0, 1, tokenizer, max_len, True, False)