In [2]:
!pip install gluonnlp
!pip install mxnet
!pip install git+https://github.com/SKTBrain/KoBERT.git
!pip install transformers

Collecting gluonnlp
[?25l  Downloading https://files.pythonhosted.org/packages/08/42/85f6cf7e13e222b2dc552059de8d37fe5956b9fab37f95dfacfa4e15a124/gluonnlp-0.8.2.tar.gz (237kB)
[K     |█▍                              | 10kB 34.7MB/s eta 0:00:01[K     |██▊                             | 20kB 3.2MB/s eta 0:00:01[K     |████▏                           | 30kB 4.7MB/s eta 0:00:01[K     |█████▌                          | 40kB 3.1MB/s eta 0:00:01[K     |███████                         | 51kB 3.8MB/s eta 0:00:01[K     |████████▎                       | 61kB 4.5MB/s eta 0:00:01[K     |█████████▋                      | 71kB 5.2MB/s eta 0:00:01[K     |███████████                     | 81kB 5.9MB/s eta 0:00:01[K     |████████████▍                   | 92kB 6.6MB/s eta 0:00:01[K     |█████████████▉                  | 102kB 5.0MB/s eta 0:00:01[K     |███████████████▏                | 112kB 5.0MB/s eta 0:00:01[K     |████████████████▌               | 122kB 5.0MB/s eta 0:00:01[

In [1]:
!nvidia-smi

Sat Jan 11 09:08:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [3]:
from datetime import datetime
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
import math
from gluonnlp.data import SentencepieceTokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from kobert.utils import get_tokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
# Dataset
# https://github.com/e9t/nsmc.git

# BERT Model
# https://github.com/SKTBrain/KoBERT

# Optimizer
# https://github.com/huggingface/pytorch-transformers#optimizers-bertadam--openaiadam-are-now-adamw-schedules-are-standard-pytorch-schedules

In [0]:
def train(train_loader, device, model, linear, all_params, optimizer, scheduler,
          dropout_rate, max_grad_norm, log_interval, epoch):
    model.train()
    linear.train()
    for batch_idx, (input_ids, token_type_ids, input_mask, target) \
            in enumerate(train_loader):
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        input_mask = input_mask.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        _, pooled_output = model(input_ids, token_type_ids, input_mask)
        logits = linear(F.dropout(pooled_output, p=dropout_rate))
        output = F.log_softmax(logits, dim=1)

        loss = F.nll_loss(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(all_params, max_grad_norm)
        optimizer.step()
        scheduler.step()

        pred = output.argmax(dim=1, keepdim=True)
        correct = pred.eq(target.view_as(pred)).sum().item()

        if (batch_idx + 1) % log_interval == 0 \
                or batch_idx == len(train_loader) - 1:
            batch_len = len(input_ids)
            lr = ''
            for param_group in optimizer.param_groups:
                if 'lr' in param_group:
                    lr = param_group['lr']
                    break
            print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                  '\tAccuracy: {}/{} ({:.2f}%)\tlr: {:.3e}'.format(
                    datetime.now(),
                    epoch, (batch_idx + 1) * batch_len,
                    len(train_loader.dataset),
                    100. * (batch_idx + 1) / len(train_loader), loss.item(),
                    correct, batch_len, 100. * correct / batch_len,
                    lr))


def test(test_loader, device, model, linear):
    model.eval()
    linear.eval()
    eval_loss = 0.
    correct = 0
    start_t = datetime.now()
    with torch.no_grad():
        for batch_idx, (input_ids, token_type_ids, input_mask, target) \
                in enumerate(test_loader):
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            input_mask = input_mask.to(device)
            target = target.to(device)

            _, pooled_output = model(input_ids, token_type_ids, input_mask)
            logits = linear(pooled_output)
            output = F.log_softmax(logits, dim=1)

            eval_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    eval_loss /= len(test_loader.dataset)
    acc = correct / len(test_loader.dataset)
    print('Elapsed time: {}, Test, Avg. Loss: {:.6f}, '
          'Accuracy: {}/{} ({:.2f}%)\n'.format(datetime.now() - start_t,
                                               eval_loss,
                                               correct,
                                               len(test_loader.dataset),
                                               100. * acc))


class MovieDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return self.examples[index]


def batchify(b):
    x_len = [len(e[0]) for e in b]
    batch_max_len = max(x_len)

    x = list()
    tk_type_ids = list()
    x_mask = list()
    y = list()
    for e in b:
        seq_len = len(e[0])
        e0_mask = [1] * seq_len  # 1: MASK
        while len(e[0]) < batch_max_len:
            e[0].append(0)  # 0: '[PAD]'
            e0_mask.append(0)
        assert len(e[0]) == batch_max_len

        e0_tk_type_ids = [0] * batch_max_len  #
        # e0_tk_type_ids[seq_len - 1] = 1

        x.append(e[0])
        tk_type_ids.append(e0_tk_type_ids)
        x_mask.append(e0_mask)
        y.append(e[1])

    x = torch.tensor(x, dtype=torch.int64)
    tk_type_ids = torch.tensor(tk_type_ids, dtype=torch.int64)
    x_mask = torch.tensor(x_mask, dtype=torch.int64)
    y = torch.tensor(y, dtype=torch.int64)

    return x, tk_type_ids, x_mask, y


def get_data(filepath, vocab, sp):
    data = list()
    max_seq_len = 0
    with open(filepath, 'r', encoding='euc-kr') as f:
        for lidx, l in enumerate(f):
            if 0 == lidx:
                continue
            cols = l[:-1].split('\t')
            # docid = cols[0]
            doc = cols[1]
            label = cols[2]

            token_ids = list()
            token_ids.append(vocab['[CLS]'])
            for t in sp(doc):
                if t in vocab:
                    token_ids.append(vocab[t])
                else:
                    token_ids.append(vocab['[UNK]'])
            token_ids.append(vocab['[SEP]'])

            data.append([token_ids, int(label)])

            if max_seq_len < len(token_ids):
                max_seq_len = len(token_ids)
    print('max_seq_len', max_seq_len)
    return data




In [5]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
os.chdir('/content/drive/My Drive/금융문자/모델링/model_BERT(SKTBrain)/')

In [0]:
ls

In [7]:
model, vocab = get_pytorch_kobert_model(
        ctx='cuda')# if torch.cuda.is_available() else 'cpu')

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


In [8]:
!nvidia-smi

Sat Jan 11 09:10:24 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    30W / 250W |   1117MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [9]:
def main():
#     nsmc_home_dir = '/media/donghyeon/f7c53837-2156-4793-b2b1-4b0578dffef1/nlp/nsmc'
    train_file = "30000train.txt"  # 150K
    test_file = "30000test.txt"  # 50K

    # model, vocab = get_pytorch_kobert_model(
    #     ctx='cuda' if torch.cuda.is_available() else 'cpu')

    lr = 5e-5
    batch_size = 32
    epochs = 5
    dropout_rate = 0.1
    max_grad_norm = 1.0
    num_total_steps = math.ceil(150000 / batch_size) * epochs
    num_warmup_steps = num_total_steps // 10
    log_interval = 100
    seed = 2019
    num_workers = 2
    num_classes = 2
    pooler_out_dim = model.pooler.dense.out_features

    torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('device', device)

    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)

    train_loader = torch.utils.data.DataLoader(
        MovieDataset(get_data(train_file, vocab, sp)),
        shuffle=True,
        batch_size=batch_size,
        num_workers=num_workers,
        collate_fn=batchify,
        pin_memory=True
    )

    test_loader = torch.utils.data.DataLoader(
        MovieDataset(get_data(test_file, vocab, sp)),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=batchify,
        pin_memory=True
    )

    linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)

    all_params = list(model.parameters()) + list(linear.parameters())
    optimizer = AdamW(all_params, lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                     num_training_steps=num_total_steps)

    for epoch in range(epochs):
        train(train_loader, device, model, linear, all_params,
              optimizer, scheduler, dropout_rate, max_grad_norm,
              log_interval, epoch)
        print(datetime.now(), 'Testing...')
        test(test_loader, device, model, linear)


if __name__ == '__main__':
    main()

device cuda
using cached model
max_seq_len 1236
max_seq_len 725


RuntimeError: ignored

In [0]:
torch.save(model, 'kobert_new_30000_front_back_1epochs')

In [0]:
model, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [0]:
# model load

In [0]:
vocab.bos_token

NameError: name 'a' is not defined

In [0]:
model = torch.load('kobert_new_30000_front_back_1epochs')

In [0]:
class MovieDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return self.examples[index]
###########################################

test_file = "/home/ec2-user/workspace/Members/KSA/newdirectory/neew.txt"  

# model, vocab = get_pytorch_kobert_model('cpu')
#     ctx='cuda' if torch.cuda.is_available() else 'cpu')

lr = 5e-5
batch_size = 1
epochs = 5
dropout_rate = 0.1
max_grad_norm = 1.0
num_total_steps = math.ceil(150000 / batch_size) * epochs
num_warmup_steps = num_total_steps // 10
log_interval = 100
seed = 2019
num_workers = 2
num_classes = 2
pooler_out_dim = model.pooler.dense.out_features

torch.manual_seed(seed)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

print('device', device)

tok_path = get_tokenizer()
sp = SentencepieceTokenizer(tok_path)


test_loader = torch.utils.data.DataLoader(
    MovieDataset(get_data(test_file, vocab, sp)),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=batchify,
    pin_memory=True
)


linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)

 

device cpu
using cached model
max_seq_len 757


In [0]:
device_cpu = torch.device("cpu")

In [0]:
import numpy as np

for batch_id, (input_ids, token_type_ids, input_mask,target) in enumerate(test_loader):
#     if batch_id==1:
#         break
    model.eval()
#     print(target)
    input_ids = input_ids.long().to(device_cpu) ##
    input_mask = input_mask.long().to(device_cpu)  ##
    token_type_ids= token_type_ids.to(device_cpu)  ##
#     print(token_type_ids)
    
#     out = model(token_ids, valid_length, segment_ids)
#     logits=out[0]
    _, pooled_output = model(input_ids, token_type_ids, input_mask)
    logits = linear(pooled_output)
#     print(logits)
    output = F.softmax(logits[0])#, dim=1)
#     print("logits.shape",logits.shape)
#     logits = logits.cpu().detach().numpy()
#     print(logits)
#     a=np.exp(logits[0].cpu().detach().numpy()) + np.exp(logits[1].cpu().detach().numpy())
#     b=np.exp(logits[1].cpu().detach().numpy())

    print(output)
#     print (b/a)   



tensor([0.3639, 0.6361], grad_fn=<SoftmaxBackward>)
tensor([0.3678, 0.6322], grad_fn=<SoftmaxBackward>)
tensor([0.4679, 0.5321], grad_fn=<SoftmaxBackward>)
tensor([0.4358, 0.5642], grad_fn=<SoftmaxBackward>)
tensor([0.3705, 0.6295], grad_fn=<SoftmaxBackward>)
tensor([0.3521, 0.6479], grad_fn=<SoftmaxBackward>)
tensor([0.3633, 0.6367], grad_fn=<SoftmaxBackward>)
tensor([0.3438, 0.6562], grad_fn=<SoftmaxBackward>)
tensor([0.4384, 0.5616], grad_fn=<SoftmaxBackward>)
tensor([0.3391, 0.6609], grad_fn=<SoftmaxBackward>)
tensor([0.3825, 0.6175], grad_fn=<SoftmaxBackward>)
tensor([0.3332, 0.6668], grad_fn=<SoftmaxBackward>)
tensor([0.3548, 0.6452], grad_fn=<SoftmaxBackward>)
tensor([0.3480, 0.6520], grad_fn=<SoftmaxBackward>)
tensor([0.3480, 0.6520], grad_fn=<SoftmaxBackward>)
tensor([0.3487, 0.6513], grad_fn=<SoftmaxBackward>)
tensor([0.3678, 0.6322], grad_fn=<SoftmaxBackward>)
tensor([0.3622, 0.6378], grad_fn=<SoftmaxBackward>)
tensor([0.3505, 0.6495], grad_fn=<SoftmaxBackward>)
tensor([0.42

RuntimeError: index out of range: Tried to access index 512 out of table with 511 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:418