# Setting

라이브러리 다운로드

In [1]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting mxnet
  Downloading mxnet-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (46.9 MB)
[K     |████████████████████████████████| 46.9 MB 18.6 MB/s eta 0:00:01   |██                              | 2.8 MB 2.1 MB/s eta 0:00:21     |████                            | 5.9 MB 2.1 MB/s eta 0:00:20     |████▌                           | 6.6 MB 2.1 MB/s eta 0:00:20     |██████                          | 8.8 MB 2.1 MB/s eta 0:00:18     |██████████▊                     | 15.8 MB 12.2 MB/s eta 0:00:03     |███████████▎                    | 16.6 MB 12.2 MB/s eta 0:00:03     |████████████▍                   | 18.1 MB 12.2 MB/s eta 0:00:03     |█████████████                   | 18.9 MB 12.2 MB/s eta 0:00:03     |█████████████▌                  | 19.7 MB 12.2 MB/s eta 0:00:03     |██████████████▌                 | 21.3 MB 12.2 MB/s eta 0:00:03     |███████████████                 | 22.1 MB 12.2 MB/s eta 0:00:03     |███████████████▋                | 22.9 MB 12.2 MB/s eta 0:00:02     |███████

라이브러리 불러오기

In [75]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import pandas as pd
import numpy as np
import re
import tarfile
import os
import pickle as pickle
from tqdm import tqdm
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split

def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    sets ``PYTHONHASHSEED``, and configures cuDNN for deterministic behaviour.

    Args:
        seed: Seed value applied to all generators.
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # defeats the deterministic flag above; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False  # type: ignore


seed_everything(0)

GPU 설정

In [76]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook does not crash on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

kobert 불러오기

In [77]:
# Fetch the KoBERT PyTorch model and its vocabulary. `vocab` is handed to the
# tokenizer and `bertmodel` is wrapped by BERTClassifier later in the notebook.
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


# Preprocessing

In [103]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Read a headerless tab-separated dataset and convert it to the model's frame.

    Args:
        dataset_dir: Path to the tab-separated data file (read without a header row).
        label_type_path: Path to the pickled label lookup that is passed on to
            ``preprocessing_dataset``. Defaults to the location originally
            hard-coded in this notebook.

    Returns:
        Whatever ``preprocessing_dataset`` returns for the loaded frame.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point label_type_path at a trusted file.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    return preprocessing_dataset(dataset, label_type)

def preprocessing_dataset(dataset, label_type):
    """Build the four-column frame (sentence, entities, label) used downstream.

    Column 8 of ``dataset`` holds the raw label: the value 'blind' becomes 100,
    every other value is looked up in ``label_type``. Columns 1, 2 and 5 are
    copied through as 'sentence', 'entity_01' and 'entity_02'.

    Args:
        dataset: Frame whose columns are addressed by integer position labels.
        label_type: Mapping from raw label value to its encoded label.

    Returns:
        A new DataFrame with columns sentence, entity_01, entity_02, label.
    """
    encoded = [100 if raw == 'blind' else label_type[raw] for raw in dataset[8]]
    columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': encoded,
    }
    return pd.DataFrame(columns)

In [104]:
# Tab-separated training file; load_data() reads it and encodes the labels.
dataset_path = r"/opt/ml/input/data/train/labeled_train_aug_fix.tsv"

dataset = load_data(dataset_path)

# Prefix each sentence with both entities, joined by the literal text ' [SEP] ',
# giving "entity_01 [SEP] entity_02 [SEP] sentence" as the model input string.
# NOTE(review): whether the tokenizer maps the literal '[SEP]' text to the
# special separator token is not shown in this notebook — confirm.
dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']

In [105]:
# Hold out 20% of the rows for validation (fixed random_state for a repeatable
# split), then write sentence/label of each split to a tab-separated file with
# a header row and no index column.
# NOTE(review): the input file name suggests augmented data; a purely random
# row split may put variants of one sentence on both sides — TODO confirm.
train, vali = train_test_split(dataset, test_size=0.2, random_state=42)
train[['sentence','label']].to_csv("/opt/ml/input/data/train/train_train_aug_v3.txt", sep='\t', index=False)
vali[['sentence','label']].to_csv("/opt/ml/input/data/train/train_vali_aug_v3.txt", sep='\t', index=False)

In [106]:
# Re-read the two exported splits with gluonnlp. Fields 0 and 1 are the
# sentence and label columns; num_discard_samples=1 skips the header row that
# to_csv wrote.
dataset_train = nlp.data.TSVDataset("/opt/ml/input/data/train/train_train_aug_v3.txt", field_indices=[0,1], num_discard_samples=1)
dataset_vali = nlp.data.TSVDataset("/opt/ml/input/data/train/train_vali_aug_v3.txt", field_indices=[0,1], num_discard_samples=1)

In [126]:
# Build the SentencePiece-based BERT tokenizer over the KoBERT vocabulary.
# lower=False keeps the input casing unchanged.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model
/opt/ml/kobert/kobert_news_wiki_ko_cased-1087f8699e.spiece


AttributeError: 'BERTSPTokenizer' object has no attribute 'add_special_tokens'

In [113]:
class BERTDataset(Dataset):
    """Torch dataset that pre-tokenizes every sample with a BERT sentence transform.

    All samples are transformed once in ``__init__`` and kept in memory, so
    ``__getitem__`` is a plain lookup.

    Args:
        dataset: Iterable of records indexable by ``sent_idx`` and ``label_idx``.
        sent_idx: Position of the sentence inside each record.
        label_idx: Position of the label inside each record.
        bert_tokenizer: Tokenizer handed to ``nlp.data.BERTSentenceTransform``.
        max_len: Passed as ``max_seq_length`` to the transform.
        pad: Passed as ``pad`` to the transform.
        pair: Passed as ``pair`` to the transform.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        """Return the transformed sample ``i`` with its label appended as the last element."""
        # The debug print that used to live here was removed: it emitted one
        # line per fetched sample and flooded the training output.
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        """Number of samples."""
        return (len(self.labels))

    def get_classes(self) :
        """Return the list of per-sample labels (used to build the class-balancing sampler)."""
        return self.labels

In [114]:
# Maximum sequence length handed to the BERT sentence transform.
max_len = 128
# Mini-batch size for the training and validation DataLoaders.
batch_size = 32
# Fraction of the total optimizer steps used for learning-rate warmup.
warmup_ratio = 0.01
# Upper bound on training epochs (the loop may stop earlier).
num_epochs = 20
# Gradient-norm clipping threshold.
max_grad_norm = 1
# Batch interval for progress logging; only referenced by a commented-out print.
log_interval = 50
# Learning rate passed to AdamW.
learning_rate = 5e-5

In [115]:
from catalyst.data.sampler import BalanceClassSampler

# Tokenize both splits: field 0 is the sentence, field 1 the label; pad=True, pair=False.
# Each sampler is built from the per-sample labels in 'upsampling' mode
# (presumably oversampling minority classes — see the catalyst documentation).
# NOTE(review): the validation split is also sampled with 'upsampling', so the
# reported validation accuracy is measured on a resampled distribution — confirm
# this is intended.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_sampler_train = BalanceClassSampler(data_train.get_classes(), 'upsampling')
data_vali = BERTDataset(dataset_vali, 0, 1, tok, max_len, True, False)
data_sampler_vali = BalanceClassSampler(data_vali.get_classes(), 'upsampling')

In [116]:
# Batch each dataset through its class-balancing sampler, loading with 5 worker processes.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, sampler = data_sampler_train)
vali_dataloader = torch.utils.data.DataLoader(data_vali, batch_size=batch_size, num_workers=5, sampler = data_sampler_vali)

# Classification

In [117]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` and expected to return a 2-tuple whose second
            element is the pooled representation.
        hidden_size: Width of the pooled representation.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output; dropout is
            skipped entirely when this is ``None`` or 0.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 42,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first ``valid_length[i]`` positions of row i, else 0."""
        # Vectorized comparison instead of a per-row Python loop; the mask is
        # created directly on the device that holds token_ids.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).view(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).view(1, -1)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Compute class logits for a batch.

        Args:
            token_ids: Integer tensor of token ids, one row per sample.
            valid_length: Number of real (non-padding) tokens per sample.
            segment_ids: Token type ids, same shape as ``token_ids``.

        Returns:
            Tensor of logits with ``num_classes`` columns.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Previously `out` was only assigned inside the dropout branch, so a
        # model built without dr_rate crashed with UnboundLocalError here.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The target class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other ``classes - 1`` classes.
    """

    # Applying smoothing raises the score: use a value of around 0.1.
    def __init__(self, classes=42, smoothing=0.1, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy of logits ``pred`` against class indices ``target``."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Start from the off-target probability everywhere, then overwrite
            # the target column of each row with the confidence value.
            off_value = self.smoothing / (self.cls - 1)
            soft_target = torch.full_like(log_probs, off_value)
            soft_target.scatter_(1, target.detach().unsqueeze(1), self.confidence)
        per_sample = torch.sum(-soft_target * log_probs, dim=self.dim)
        return per_sample.mean()
    

def calc_accuracy(X,Y):
    """Fraction of rows of ``X`` whose highest-scoring column equals the label in ``Y``.

    Args:
        X: Score tensor with one row per sample and one column per class.
        Y: Tensor of true class indices, one per sample.

    Returns:
        The accuracy as a NumPy floating-point scalar.
    """
    _, predicted = X.max(dim=1)
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

# Classifier on top of KoBERT with dropout probability 0.5, moved to the target device.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
# Parameters whose names contain one of these substrings are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# Label-smoothed cross-entropy with the class defaults (42 classes, smoothing 0.1).
loss_fn = LabelSmoothingLoss()
# The scheduler is stepped once per batch, so the total step count is
# batches-per-epoch times the number of epochs.
t_total = len(train_dataloader) * num_epochs
# Number of warmup steps: warmup_ratio of the total, truncated to an integer.
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [119]:
from tqdm import tqdm

# Retained for the disabled "save only the best checkpoint" logic; the loop
# below never updates them.
best_train_acc = 0.0
best_test_acc = 0.0

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    # tqdm wraps the loader itself (not enumerate) so the bar knows the total.
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        train_acc += calc_accuracy(out, label)
    train_acc = train_acc / (batch_id+1)
    print("epoch {0} train acc {1:.4f}".format(e+1, train_acc))

    # ---- validation pass ----
    model.eval()
    # no_grad: evaluation needs no autograd graph; building one only wastes
    # memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(vali_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    test_acc = test_acc / (batch_id + 1)
    print("epoch {0} test acc {1:.4f}".format(e+1, test_acc))

    # The checkpoint is overwritten after every epoch. (A variant that saved
    # only when both accuracies improved was disabled in the original cell.)
    torch.save(model.state_dict(), "model_token_upsam.pt")

    # Stop once both accuracies clear their thresholds.
    if train_acc > 0.99 and test_acc > 0.95 :
        print('early stop')
        break

4739it [30:56,  2.55it/s]

epoch 1 train acc 0.9991





epoch 1 test acc 0.8878


577it [03:45,  2.56it/s]


KeyboardInterrupt: 

# Predict

In [121]:
# Load the test file and build the same "entity_01 [SEP] entity_02 [SEP] sentence"
# input string that was used for training.
dataset_path = r"input/data/test/labeled_test.tsv"
dataset = load_data(dataset_path)

dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']
dataset[['sentence','label']].to_csv("input/data/test/labeled_test.txt", sep='\t', index=False)

# Re-read the exported file (skipping its header row) and tokenize it. There
# is no sampler here, so predictions keep the order of the file.
dataset_test = nlp.data.TSVDataset("input/data/test/labeled_test.txt", field_indices=[0,1], num_discard_samples=1)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=16, num_workers=5)

model = BERTClassifier(bertmodel, dr_rate = 0.5).to(device)
# map_location lets the checkpoint load on whatever device is active, even if
# it was saved from a different one.
model.load_state_dict(torch.load('model_token_upsam.pt', map_location=device))
model.eval()
predict = []

# Inference needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for token_ids, valid_length, segment_ids, _label in test_dataloader:
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        logits = model(token_ids, valid_length, segment_ids)
        _, predicted = torch.max(logits, 1)
        predict.extend(predicted.tolist())

out = pd.DataFrame(predict, columns = ['pred'])
out.to_csv('submission_tokenized_upsam.csv', index = False)

In [2]:
!df -h
!du -h --max-depth=1 | sort -hr

Filesystem      Size  Used Avail Use% Mounted on
overlay          99G   71G   23G  76% /
tmpfs            64M     0   64M   0% /dev
tmpfs            30G     0   30G   0% /sys/fs/cgroup
shm             1.0G   64K  1.0G   1% /dev/shm
/dev/xvdb1       99G   71G   23G  76% /etc/hosts
tmpfs            30G   12K   30G   1% /proc/driver/nvidia
/dev/xvda1       48G  8.7G   37G  20% /usr/bin/nvidia-smi
udev             30G     0   30G   0% /dev/nvidia0
tmpfs            30G     0   30G   0% /proc/acpi
tmpfs            30G     0   30G   0% /proc/scsi
tmpfs            30G     0   30G   0% /sys/firmware
36G	.
29G	./unlabeled
3.4G	./.cache
1.7G	./labeled
1.3G	./save_best
679M	./kobert
150M	./input
15M	./Dacon
3.3M	./.ipython
900K	./.local
304K	./.ipynb_checkpoints
192K	./logs
52K	./code
24K	./hard_vote
20K	./.jupyter
8.0K	./__pycache__
8.0K	./.nv
8.0K	./.config
8.0K	./.conda
4.0K	./.empty
