# Setting

라이브러리 다운로드

In [1]:
# Install the packages this notebook imports into the current environment.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
# transformers is pinned to version 3; the other packages are unpinned.
!pip install transformers==3
!pip install torch
# KoBERT is installed directly from the master branch of its GitHub repository.
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting mxnet
  Downloading mxnet-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (46.9 MB)
[K     |████████████████████████████████| 46.9 MB 8.2 MB/s eta 0:00:01     |█████████████████▍              | 25.4 MB 8.9 MB/s eta 0:00:03     |██████████████████████████▉     | 39.4 MB 7.2 MB/s eta 0:00:02     |████████████████████████████▎   | 41.5 MB 7.2 MB/s eta 0:00:01     |███████████████████████████████▎| 45.9 MB 8.2 MB/s eta 0:00:01
Collecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
Successfully installed graphviz-0.8.4 mxnet-1.8.0.post0
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 2.0 MB/s eta 0:00:01
Collecting cython
  Using cached Cython-0.29.23-cp37-cp37m-manylinux1_x86_64.whl (2.0 MB)
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25ldone
[?25h  Created wheel for gluonn

라이브러리 불러오기

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import pandas as pd
import numpy as np
import re
import tarfile
import pickle as pickle
from tqdm import tqdm
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split

GPU 설정

In [3]:
# Use the first CUDA GPU when one is visible; otherwise fall back to the CPU so
# that every later `.to(device)` call still works on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

kobert 불러오기

In [4]:
# Fetch the KoBERT model together with its vocabulary. Both are reused below:
# `vocab` by the tokenizer and `bertmodel` as the encoder inside the classifier.
bertmodel, vocab = get_pytorch_kobert_model()

[██████████████████████████████████████████████████]
using cached model


# Preprocessing

In [5]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Read a tab-separated relation dataset and attach integer labels.

    Args:
        dataset_dir: Path of the header-less, tab-separated file to read.
        label_type_path: Path of the pickled mapping from label name to
            integer id. The default is the location this notebook has
            always read from.

    Returns:
        The DataFrame produced by ``preprocessing_dataset``.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at a label
    # file that comes from a trusted source.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    return preprocessing_dataset(dataset, label_type)

def preprocessing_dataset(dataset, label_type):
    """Reduce the raw frame to the sentence, both entities and a numeric label.

    Column 1 is used as the sentence, columns 2 and 5 as the two entities and
    column 8 as the label name. The name 'blind' is mapped to 100; every other
    name is looked up in ``label_type``.
    """
    numeric_labels = [
        100 if name == 'blind' else label_type[name]
        for name in dataset[8]
    ]
    columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': numeric_labels,
    }
    return pd.DataFrame(columns)

In [6]:
# Location of the training split.
dataset_path = r"/opt/ml/input/data/train/train.tsv"

dataset = load_data(dataset_path)

# Prefix every sentence with both entities, joined by the literal text ' [SEP] ',
# so the entities and the sentence are carried in a single string column.
dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']

In [7]:
# Hold out 20% of the rows for validation; random_state fixes the split.
train, vali = train_test_split(dataset, test_size=0.2, random_state=42)
# Write only the sentence and label columns, tab-separated and without the index.
train[['sentence','label']].to_csv("/opt/ml/input/data/train/train_train.txt", sep='\t', index=False)
vali[['sentence','label']].to_csv("/opt/ml/input/data/train/train_vali.txt", sep='\t', index=False)

In [8]:
# Read the two files back as gluonnlp TSV datasets, keeping fields 0 and 1
# (sentence, label). num_discard_samples=1 drops the first line of each file,
# presumably the header row written by to_csv above (confirm).
dataset_train = nlp.data.TSVDataset("/opt/ml/input/data/train/train_train.txt", field_indices=[0,1], num_discard_samples=1)
dataset_vali = nlp.data.TSVDataset("/opt/ml/input/data/train/train_vali.txt", field_indices=[0,1], num_discard_samples=1)

In [9]:
# Wrap the KoBERT tokenizer resource and the vocabulary loaded earlier in
# gluonnlp's BERTSPTokenizer; lower=False is passed, i.e. lower-casing is off.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [10]:
class BERTDataset(Dataset):
    """Torch dataset that stores transformed sentences and an int32 label per row."""

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        """Transform every row of ``dataset`` up front.

        Args:
            dataset: Iterable of rows; each row is indexed by position.
            sent_idx: Position of the sentence within a row.
            label_idx: Position of the label within a row.
            bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
            max_len: Forwarded as ``max_seq_length``.
            pad: Forwarded as ``pad``.
            pair: Forwarded as ``pair``.
        """
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: run each sentence (wrapped in a one-item list) through the transform.
        self.sentences = []
        for row in dataset:
            self.sentences.append(to_features([row[sent_idx]]))

        # Second pass: convert each label to a numpy int32.
        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended as the last item."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.labels)

In [11]:
# Hyperparameters shared by the dataset, loaders, optimizer and training loop.
max_len = 128  # max_seq_length given to the BERT sentence transform
batch_size = 32  # batch size of every DataLoader
warmup_ratio = 0.01  # fraction of all optimizer steps spent in learning-rate warm-up
num_epochs = 20  # number of passes over the training loader
max_grad_norm = 1  # limit handed to clip_grad_norm_
log_interval = 50  # print training progress every this many batches
learning_rate = 5e-5  # lr given to AdamW

In [12]:
# Positional arguments: sentence in field 0, label in field 1, then the
# tokenizer, the maximum sequence length, pad=True and pair=False.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_vali = BERTDataset(dataset_vali, 0, 1, tok, max_len, True, False)

In [13]:
# Reshuffle the training samples at the start of every epoch so the batches are
# not identical from epoch to epoch; the validation loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=5)
vali_dataloader = torch.utils.data.DataLoader(data_vali, batch_size=batch_size, num_workers=5)

# Classification

In [14]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output."""

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 42,
                 dr_rate=None,
                 params=None):
        """Store the encoder and build the head.

        Args:
            bert: Encoder module. It is called with ``input_ids``,
                ``token_type_ids`` and ``attention_mask`` and must return a
                pair whose second item is the pooled representation.
            hidden_size: Width of the pooled representation.
            num_classes: Number of output logits.
            dr_rate: Dropout probability applied to the pooled output. A falsy
                value (``None`` or ``0``) disables dropout.
            params: Unused; kept so existing callers keep working.
        """
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        elsewhere. The mask is created with ``zeros_like`` and therefore lives
        on the same device as ``token_ids``.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch.

        Args:
            token_ids: Integer token ids, indexed as (row, position).
            valid_length: Per-row count of non-padding tokens.
            segment_ids: Segment ids; cast to long before the encoder call.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Without dropout the pooled output feeds the head directly. Previously
        # `out` was only bound inside the dropout branch, so the default
        # dr_rate=None crashed with UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [15]:
# Wrap the KoBERT encoder in the classifier with a dropout rate of 0.5 and move
# the whole module to `device`.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [16]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        # Every parameter whose name matches none of the no_decay substrings.
        'params': [
            param
            for name, param in model.named_parameters()
            if all(tag not in name for tag in no_decay)
        ],
        'weight_decay': 0.01,
    },
    {
        # Every parameter whose name matches at least one no_decay substring.
        'params': [
            param
            for name, param in model.named_parameters()
            if any(tag in name for tag in no_decay)
        ],
        'weight_decay': 0.0,
    },
]

In [17]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a smoothed target distribution.

    The target class receives ``1 - smoothing``; every other class receives
    ``smoothing / (classes - 1)``.
    """

    def __init__(self, classes=42, smoothing=0.0, dim=-1):
        """Record the class count, the smoothing amount and the class dimension."""
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean over samples of the smoothed cross-entropy.

        Args:
            pred: Unnormalised scores; log-softmax is applied along ``self.dim``.
            target: Class indices; unsqueezed at dim 1 and scattered along dim 1.
        """
        log_probs = pred.log_softmax(dim=self.dim)
        off_value = self.smoothing / (self.cls - 1)
        # The target distribution is a constant; keep it out of the autograd graph.
        with torch.no_grad():
            true_dist = torch.full_like(log_probs, off_value)
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_sample)

In [18]:
# AdamW over the two parameter groups (with and without weight decay).
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# Label-smoothing loss with a smoothing amount of 0.2.
loss_fn = LabelSmoothingLoss(smoothing=0.2)

In [19]:
# Total number of optimizer steps: batches per epoch times the number of epochs.
t_total = len(train_dataloader) * num_epochs
# Number of those steps spent in warm-up, truncated to an integer.
warmup_step = int(t_total * warmup_ratio)

In [20]:
# Learning-rate schedule: warm-up for `warmup_step` steps, then a cosine
# schedule across the `t_total` training steps. It is stepped once per batch.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [21]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``.

    The result is a numpy scalar: the count of matches divided by the number
    of rows.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

In [24]:
# Best validation accuracy seen so far. It is initialised once, outside the
# epoch loop, so a checkpoint is written only when an epoch matches or beats
# every earlier epoch (resetting it per epoch made every epoch overwrite the file).
best_acc = 0.0
for e in range(num_epochs):
    # Running sums of per-batch accuracy for this epoch.
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        # The schedule is defined per optimizer step, so advance it every batch.
        scheduler.step()
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- validation pass ----
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(vali_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    # Mean of the per-batch accuracies over the validation loader.
    epoch_acc = test_acc / (batch_id+1)
    print("epoch {} test acc {}".format(e+1, epoch_acc))
    if epoch_acc >= best_acc:
        best_acc = epoch_acc
        torch.save(model.state_dict(), "/opt/ml/model/model_state_dict.pt")

epoch 1 batch id 1 loss 1.625742793083191 train acc 0.59375
epoch 1 batch id 51 loss 1.461042881011963 train acc 0.6544117647058824
epoch 1 batch id 101 loss 1.064346432685852 train acc 0.6760519801980198
epoch 1 batch id 151 loss 0.9264274835586548 train acc 0.6910182119205298
epoch 1 batch id 201 loss 0.505707323551178 train acc 0.699160447761194
epoch 1 train acc 0.7051388888888889
epoch 1 test acc 0.6831140350877193
epoch 2 batch id 1 loss 1.1351864337921143 train acc 0.6875
epoch 2 batch id 51 loss 1.5454633235931396 train acc 0.75
epoch 2 batch id 101 loss 0.7211730480194092 train acc 0.7543316831683168
epoch 2 batch id 151 loss 0.636289656162262 train acc 0.7613824503311258
epoch 2 batch id 201 loss 0.36386698484420776 train acc 0.7705223880597015
epoch 2 train acc 0.7743055555555556
epoch 2 test acc 0.6913377192982456
epoch 3 batch id 1 loss 0.7790242433547974 train acc 0.78125
epoch 4 batch id 101 loss 0.5585620403289795 train acc 0.8641707920792079
epoch 4 batch id 151 loss 0

# Predict

In [25]:
# Location of the test split. Note that `dataset_path` and `dataset` are
# rebound here, replacing the training values from earlier cells.
dataset_path = r"/opt/ml/input/data/test/test.tsv"

dataset = load_data(dataset_path)

# Same entity prefixing as for the training data: entity_01, entity_02 and the
# sentence joined by the literal text ' [SEP] '.
dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']

# Write only the sentence and label columns, tab-separated and without the index.
dataset[['sentence','label']].to_csv("/opt/ml/input/data/test/test.txt", sep='\t', index=False)

In [26]:
# Read the test file back keeping fields 0 and 1 and dropping its first line.
dataset_test = nlp.data.TSVDataset("/opt/ml/input/data/test/test.txt", field_indices=[0,1], num_discard_samples=1)

# Same transform settings as the training data: pad=True, pair=False.
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

# No shuffling is requested, so predictions come out in file order.
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [27]:
# Restore the checkpoint written during training. map_location places the
# stored tensors on the active device even if they were saved from another one.
model.load_state_dict(torch.load("/opt/ml/model/model_state_dict.pt", map_location=device))

model.eval()

# Predicted class index for every test row, in loader order.
Predict = []

# Inference needs no gradients; disabling them saves memory and time.
with torch.no_grad():
    # The label column of each batch is ignored here; only the inputs are used.
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        _, predict = torch.max(out,1)
        Predict.extend(predict.tolist())

In [29]:
# Collect the predictions in a single 'pred' column and write them as CSV
# without the index.
output = pd.DataFrame(Predict, columns=['pred'])
output.to_csv('/opt/ml/submission.csv', index=False)

In [30]:
# Export this notebook as a plain Python script.
!jupyter nbconvert --to script 'P2_KLUE.ipynb'

[NbConvertApp] Converting notebook P2_KLUE.ipynb to script
[NbConvertApp] Writing 9079 bytes to P2_KLUE.py
