In [None]:
!pip install transformers
!pip install kobert-transformers
!pip install sentencepiece
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [2]:
from kobert_transformers import get_tokenizer, get_kobert_model
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup, AdamW
from tqdm import tqdm, tqdm_notebook
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import warnings
import torch
import transformers
import pandas as pd
import numpy as np

# Prefer the first GPU, but fall back to the CPU so every later `.to(device)` call
# still works on a runtime without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE: this silences every warning for the rest of the session, deprecation notices included.
warnings.filterwarnings(action='ignore')

In [155]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the KoBERT tokenizer and the pretrained encoder through kobert_transformers.
# The cell output shows the vocabulary/config/weight files being downloaded when this runs.
tokenizer = get_tokenizer()
bert_model = get_kobert_model()

Downloading:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/76.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


Downloading:   0%|          | 0.00/352M [00:00<?, ?B/s]

In [4]:
! gdown --id 1wcrVx4wbrxKuG0EKofexyCa2QRVPBQgJ
data = pd.read_csv("./total_train_data.csv")

Downloading...
From: https://drive.google.com/uc?id=1wcrVx4wbrxKuG0EKofexyCa2QRVPBQgJ
To: /content/total_train_data.csv
100% 4.74M/4.74M [00:00<00:00, 117MB/s]


In [18]:
# Pull the raw utterances and their intent labels out of the frame as plain Python lists.
queries, intents = (data[column].tolist() for column in ('query', 'intent'))

In [19]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split repeatable.
train_data, valid_data, train_label, valid_label = train_test_split(
    queries,
    intents,
    test_size=0.2,
    shuffle=True,
    random_state=34,
)

In [5]:
# --- Tokenisation -----------------------------------------------------------
MAX_LEN = 15  # fixed sequence length passed to the tokenizer as max_length

# --- Batching ---------------------------------------------------------------
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16

# --- Optimisation -----------------------------------------------------------
EPOCHS = 5
LR = 1e-5
WEIGHT_DECAY = 1e-2  # intended for the optimizer's decayed parameter group
BETA1 = 0.9          # intended Adam beta coefficients
BETA2 = 0.999

# --- Checkpointing ----------------------------------------------------------
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/intent_class.pt"

In [6]:
class BERTDataset:
  """Map-style dataset that tokenizes every text once, at construction time.

  Args:
    texts: sequence of raw input strings.
    target: sequence of integer-like class labels aligned with ``texts``.
      May be ``None`` when ``mode`` is not ``'train'`` (label-free inference).
    bert_tokenizer: callable used to encode each text. It is invoked as
      ``bert_tokenizer(text, padding="max_length", max_length=max_len,
      truncation=True, return_tensors='pt', add_special_tokens=True)``.
    max_len: length every encoded sequence is padded / truncated to.
    pad: unused; accepted so existing call sites keep working.
    pair: unused; accepted so existing call sites keep working.
    mode: ``'train'`` makes ``__getitem__`` return ``(encoding, label)``;
      any other value makes it return the encoding only.

  Raises:
    ValueError: if ``mode == 'train'`` but ``target`` is ``None``.
  """

  def __init__(self, texts, target, bert_tokenizer, max_len, pad, pair, mode = 'train'):
    if mode == 'train' and target is None:
      raise ValueError("target labels are required when mode == 'train'")

    self.texts = texts
    self.target = target
    self.tokenizer = bert_tokenizer
    self.max_len = max_len
    self.mode = mode

    # Encode with the tokenizer that was passed in, not a notebook-level global,
    # so the dataset works with whichever tokenizer the caller supplies.
    self.transform = [
        self.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
            return_tensors='pt',
            add_special_tokens=True,
        )
        for text in self.texts
    ]

    # Remember whether labels were supplied so __len__ keeps its label-based
    # behaviour for labelled data and still works for label-free data.
    self._has_labels = target is not None
    self.labels = [np.int32(i) for i in target] if self._has_labels else []

  def __len__(self):
    """Number of labels when labels were given, otherwise number of encoded texts."""
    if self._has_labels:
      return len(self.labels)
    return len(self.transform)

  def __getitem__(self, i):
    """Return ``(encoding, label)`` in train mode, otherwise just the encoding."""
    if self.mode == 'train':
      return self.transform[i], self.labels[i]
    return self.transform[i]

In [22]:
# Training split. The constructor's first parameter is named `texts`; passing `text=`
# raises a TypeError, so the keyword is spelled to match the signature.
train_dataset = BERTDataset(texts=train_data, target=train_label, bert_tokenizer=tokenizer, max_len=MAX_LEN, pad=True, pair=False, mode='train')
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=2, shuffle=True)

# Validation split. Labels are still needed for scoring, hence mode='train'.
# Shuffling is disabled so the batch composition (and the per-batch accuracy average) is repeatable.
valid_dataset = BERTDataset(texts=valid_data, target=valid_label, bert_tokenizer=tokenizer, max_len=MAX_LEN, pad=True, pair=False, mode='train')
valid_data_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=2, shuffle=False)

In [7]:
class BERTClassifier(nn.Module):
    """Linear classification head stacked on a BERT-style encoder.

    Args:
        bert: encoder module. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keywords, and element ``[1]`` of its result
            is what gets fed to the head.
        hidden_size: input width of the linear head.
        num_classes: number of logits produced per example.
        dr_rate: dropout probability applied before the head; a falsy value
            (``None`` or ``0``) disables dropout entirely.
        params: unused.
    """

    def __init__(self, bert, hidden_size = 768, num_classes=5, dr_rate=None, params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(in_features=hidden_size, out_features=num_classes)
        # The dropout submodule is only created when a non-zero rate was requested.
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, token_ids, segment_ids, attention_mask):
        """Return the class logits for a batch of encoded inputs."""
        encoder_outputs = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids,
            attention_mask=attention_mask,
        )
        features = encoder_outputs[1]
        if self.dr_rate:
            features = self.dropout(features)
        return self.classifier(features)

In [183]:
# Put a dropout-regularised classification head on the pretrained encoder,
# then move the whole module to the training device.
model = BERTClassifier(bert_model, dr_rate=0.2)
model = model.to(device)

In [None]:
# Parameters whose names contain one of these substrings are exempt from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # Decayed group: uses the WEIGHT_DECAY constant from the config cell instead of a
    # second hard-coded copy of the same value.
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': WEIGHT_DECAY},
    # Undecayed group: biases and LayerNorm weights.
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# The Adam betas also come from the config cell so that tuning them there takes effect.
optimizer = AdamW(optimizer_grouped_parameters, lr=LR, betas=(BETA1, BETA2))
loss_fn = nn.CrossEntropyLoss()

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch;
# the first 10% of those steps are linear warm-up.
t_total = len(train_data_loader) * EPOCHS
warmup_step = int(t_total * 0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
def calc_accuracy(X,Y):
    """Fraction of rows in ``X`` whose highest-scoring column index equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per example.
        Y: 1-D tensor of target class indices.

    Returns:
        The accuracy as a NumPy floating-point scalar.
    """
    predicted = torch.max(X, dim=1).indices
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    n_examples = predicted.shape[0]
    return n_correct / n_examples


# Gradient-norm ceiling passed to clip_grad_norm_ in the training loop.
max_grad_norm = 1
# Training progress is printed every `log_interval` batches.
log_interval = 200

In [None]:
def _batch_to_device(batch):
    """Unpack one ``(encoding, labels)`` batch and move its tensors to ``device``.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, labels)`` as long tensors on ``device``.
    """
    encoding, labels = batch[0], batch[1]
    # Each sample is tokenized on its own with return_tensors='pt', so the collated
    # tensors presumably carry a singleton axis at index 1 (TODO confirm shape).
    # squeeze(1) drops only that axis; a bare squeeze() would also drop the batch axis
    # whenever a batch contains a single sample.
    input_ids = encoding['input_ids'].long().squeeze(1).to(device)
    token_type_ids = encoding['token_type_ids'].long().squeeze(1).to(device)
    attention_mask = encoding['attention_mask'].long().squeeze(1).to(device)
    return input_ids, token_type_ids, attention_mask, labels.long().to(device)


# Best mean validation accuracy seen so far; a checkpoint is written whenever it improves.
best_acc = 0
for e in range(EPOCHS):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    # The loop variable is `batch`, not `data`, so the DataFrame named `data` is not overwritten.
    for batch_id, batch in enumerate(tqdm_notebook(train_data_loader)):
        input_ids, token_type_ids, attention_mask, labels = _batch_to_device(batch)
        # Clear gradients from the previous step; without this they accumulate across batches.
        optimizer.zero_grad()
        out = model(input_ids, token_type_ids, attention_mask)
        loss = loss_fn(out, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        train_acc += calc_accuracy(out, labels)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.detach().cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- validation pass ----
    model.eval()
    # No gradients are needed for scoring; skipping autograd saves memory and time.
    with torch.no_grad():
        for batch_id, batch in enumerate(tqdm_notebook(valid_data_loader)):
            input_ids, token_type_ids, attention_mask, labels = _batch_to_device(batch)
            out = model(input_ids, token_type_ids, attention_mask)
            test_acc += calc_accuracy(out, labels)

    # Mean of the per-batch accuracies for this epoch.
    valid_acc = test_acc / (batch_id+1)
    if valid_acc > best_acc:
      best_acc = valid_acc
      torch.save({'epoch':e,
                'state_dict':model.state_dict(),
                'optimizer': optimizer.state_dict(),
        }, MODEL_PATH)
    print("epoch {} validation acc {}".format(e+1, valid_acc))

In [9]:
import os, sys 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
# Example utterance to classify.
query = "오늘 탕수육 주문 가능한가요?"

# Encode it with the same settings BERTDataset applies to the training texts:
# special tokens added, padded / truncated to exactly MAX_LEN tokens, PyTorch tensors returned.
transform = tokenizer(
    query,
    add_special_tokens=True,
    truncation=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors='pt',
)

In [32]:
# Inference runs on the CPU.
device2 = torch.device('cpu')
# NOTE: BERTClassifier keeps a reference to `bert_model`, so this `.to(device2)` also moves
# the encoder shared with the training `model` onto the CPU.
predict_model = BERTClassifier(bert_model).to(device2)

# The checkpoint was written from a model living on `device` (possibly CUDA). map_location
# remaps its tensors onto the CPU so loading also works on a runtime without a GPU.
checkpoint = torch.load(MODEL_PATH, map_location=device2)
predict_model.load_state_dict(checkpoint['state_dict'])
predict_model.eval()

with torch.no_grad():
  output = predict_model(
      transform['input_ids'].to(device2),
      transform['token_type_ids'].to(device2),
      transform['attention_mask'].to(device2),
  )
  # Index of the highest-scoring class for each input row, as a NumPy array.
  predict = torch.argmax(output, 1).cpu().numpy()