<a href="https://colab.research.google.com/github/bcmin1018/chatbot/blob/main/notebooks/sentiment_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the runtime. Several packages are
# pinned to exact versions; KoBERT is installed straight from its GitHub repo.
# NOTE(review): this unpinned `transformers` install is superseded by the
# `transformers==2.1.1` pin a few lines below — confirm whether it is still needed.
!pip install transformers
# NOTE(review): the `-cu101` suffix presumably targets CUDA 10.1 — confirm it
# matches the CUDA version of the runtime.
!pip install mxnet-cu101
!pip install gluonnlp pandas tqdm
!pip install sentencepiece==0.1.85
!pip install transformers==2.1.1
!pip install torch==1.3.1
!pip install -Iv botocore==1.17
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [7]:
from transformers import BertTokenizer, AdamW
from transformers import get_linear_schedule_with_warmup
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook
from torch import nn
from torch.nn import functional as F
from sklearn.model_selection import StratifiedKFold, train_test_split
import transformers
import pandas as pd
import numpy as np
import torch
import random
import warnings
import gluonnlp as nlp

# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs (slowly) on a runtime without an accelerator instead of failing at the
# first `.to(device)` call.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

In [8]:
# Load the pretrained KoBERT PyTorch model together with its vocabulary.
bertmodel, vocab = get_pytorch_kobert_model()
# Tokenizer resource handed to BERTSPTokenizer below (the cell output shows a
# cached `.spiece` file — presumably a SentencePiece model path; confirm).
tokenizer = get_tokenizer()
# gluonnlp tokenizer bound to the KoBERT vocabulary; `lower=False` is passed,
# so no lower-casing is requested.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_v1.zip
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [3]:
# Download the chatbot CSV from the songys/Chatbot_data GitHub repository into
# a DataFrame. The following cells read its 'Q' and 'label' columns.
data = pd.read_csv("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv")

In [4]:
# Inputs come from the 'Q' column and targets from the 'label' column; both
# are materialised as plain Python lists.
question_column = data.loc[:, 'Q']
label_column = data.loc[:, 'label']
features = question_column.tolist()
labels = label_column.tolist()

In [40]:
# Hold out 20% of the examples for validation. The split is shuffled with a
# fixed random_state so that it is repeatable across runs.
train_data, valid_data, train_label, valid_label = train_test_split(
    features,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=34,
)

In [10]:
# config.py
# Hyper-parameters and paths shared by the cells below.
MAX_LEN = 15                 # maximum sequence length used when encoding sentences
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 5
LR = 1e-4                    # learning rate handed to the optimizer
WEIGHT_DECAY = 1e-2
BETA1 = 0.9
BETA2 = 0.999
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model.pt"
# 'bert-base-multilingual-cased' is a *cased* checkpoint, so the tokenizer must
# not lower-case its input: with do_lower_case=True the text is lower-cased and
# accent-stripped before word-piece lookup, which no longer matches the cased
# vocabulary.
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [9]:
# dataset.py
class BERTDataset:
  """Map-style dataset that encodes every sentence once, at construction time.

  Args:
    text: iterable of raw sentences.
    target: iterable of labels, parallel to ``text``; each is cast to ``np.int32``.
    bert_tokenizer: tokenizer handed to ``nlp.data.BERTSentenceTransform``.
    max_len: forwarded as ``max_seq_length`` to the transform.
    pad: forwarded as ``pad`` to the transform.
    pair: forwarded as ``pair`` to the transform.

  Indexing returns the transform's output for sentence ``i`` with the label
  appended as the last element.
  """

  def __init__(self, text, target, bert_tokenizer, max_len, pad, pair):
    self.text = text
    self.target = target
    self.tokenizer = bert_tokenizer
    self.max_len = max_len
    self.transform = nlp.data.BERTSentenceTransform(
        self.tokenizer,
        max_seq_length=self.max_len,
        pad=pad,
        pair=pair,
    )

    # Encode everything up front so __getitem__ is a plain lookup. Each
    # sentence is wrapped in a one-element list before being transformed.
    encoded_sentences = []
    for sentence in text:
      encoded_sentences.append(self.transform([sentence]))
    self.sentences = encoded_sentences
    self.labels = list(map(np.int32, target))

  def __len__(self):
    n_examples = len(self.labels)
    return n_examples

  def __getitem__(self, i):
    encoded = self.sentences[i]
    label_tail = (self.labels[i],)
    return encoded + label_tail

In [43]:
# Encode both splits with the KoBERT tokenizer (single sentences, padded to
# MAX_LEN) and wrap them in DataLoaders.
train_dataset = BERTDataset(text=train_data, target=train_label, bert_tokenizer=tok, max_len=MAX_LEN, pad=True, pair=False)
# Training batches are reshuffled every epoch.
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=2, shuffle=True)

valid_dataset = BERTDataset(text=valid_data, target=valid_label, bert_tokenizer=tok, max_len=MAX_LEN, pad=True, pair=False)
# Validation is read in a fixed order: shuffling adds nothing to evaluation,
# and because accuracy is averaged per batch, a random assignment of samples
# to a short final batch would make the reported number vary between runs.
valid_data_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=2, shuffle=False)

In [30]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            must return a 2-tuple whose second element is the pooled output.
        hidden_size: feature size of the pooled output fed to the linear layer.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output. ``None``
            (the default) or ``0`` disables dropout.
        params: unused; kept so existing callers that pass it keep working.
    """

    def __init__(self, bert, hidden_size = 768, num_classes=3, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        # The dropout layer only exists when a non-zero rate was requested.
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``.

        Row ``i`` holds ones in its first ``valid_length[i]`` positions and
        zeros elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return class logits for a batch of encoded sentences."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without a dropout rate the pooled output goes straight to the
        # classifier. Previously `out` was left unassigned in that case and
        # the call failed with UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [44]:
# Wrap the pretrained KoBERT encoder in the classification head (dropout
# p=0.1, default 3 output classes) and move the whole module to `device`.
model = BERTClassifier(bertmodel,  dr_rate=0.1).to(device)

In [45]:
# Parameters whose name contains one of these substrings are excluded from
# weight decay.
no_decay = ['bias', 'LayerNorm.weight']
# Two parameter groups: everything else is decayed with WEIGHT_DECAY from the
# config cell (previously a hard-coded 0.01, the same value); biases and
# LayerNorm weights are not decayed.
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': WEIGHT_DECAY},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# BETA1/BETA2 from the config cell equal AdamW's defaults (0.9, 0.999); they
# are passed explicitly so that editing the config actually takes effect.
optimizer = AdamW(optimizer_grouped_parameters, lr=LR, betas=(BETA1, BETA2))
loss_fn = nn.CrossEntropyLoss()
# The scheduler is stepped once per training batch, so the schedule length is
# (batches per epoch) * (epochs).
t_total = len(train_data_loader) * EPOCHS
# Fraction of all optimisation steps spent linearly warming up the learning rate.
warmup_ratio = 0.1
warmup_step = int(t_total * warmup_ratio)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose arg-max over dim 1 equals ``Y``.

    ``X`` holds one row of scores per example and ``Y`` the matching target
    indices. The result is a NumPy scalar.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    batch_size = predicted.size()[0]
    return n_correct / batch_size
# Gradient-norm clipping threshold passed to clip_grad_norm_ in the training loop.
max_grad_norm = 1
# The training loop prints running loss/accuracy whenever
# batch_id % log_interval == 0.
log_interval = 200

In [46]:
# Train for EPOCHS epochs. After each epoch, report the mean per-batch
# accuracy on the training data and then on the validation data.
for e in range(EPOCHS):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_data_loader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is only iterated over to build the attention mask, so
        # it is passed through as the loader produced it.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- validation pass ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building the graph and saves memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(valid_data_loader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} validation acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/592 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.0696163177490234 train acc 0.5625
epoch 1 batch id 201 loss 0.5005840063095093 train acc 0.6215796019900498
epoch 1 batch id 401 loss 0.668188750743866 train acc 0.7001246882793017
epoch 1 train acc 0.7302576013513513


  0%|          | 0/148 [00:00<?, ?it/s]

epoch 1 validation acc 0.8278326403326404


  0%|          | 0/592 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.540449857711792 train acc 0.8125
epoch 2 batch id 201 loss 0.14733953773975372 train acc 0.8557213930348259
epoch 2 batch id 401 loss 0.30710193514823914 train acc 0.8559850374064838
epoch 2 train acc 0.8561021959459459


  0%|          | 0/148 [00:00<?, ?it/s]

epoch 2 validation acc 0.8572635135135135


  0%|          | 0/592 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.13726305961608887 train acc 0.9375
epoch 3 batch id 201 loss 0.026994384825229645 train acc 0.9048507462686567
epoch 3 batch id 401 loss 0.493823379278183 train acc 0.9060162094763092
epoch 3 train acc 0.9067778716216216


  0%|          | 0/148 [00:00<?, ?it/s]

epoch 3 validation acc 0.8811395530145529


  0%|          | 0/592 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.29970166087150574 train acc 0.9375
epoch 4 batch id 201 loss 0.02142002247273922 train acc 0.945273631840796
epoch 4 batch id 401 loss 0.6401247382164001 train acc 0.947786783042394
epoch 4 train acc 0.9484797297297297


  0%|          | 0/148 [00:00<?, ?it/s]

epoch 4 validation acc 0.8710044178794178


  0%|          | 0/592 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.0995110496878624 train acc 0.9375
epoch 5 batch id 201 loss 0.007559561170637608 train acc 0.9738805970149254
epoch 5 batch id 401 loss 0.004935435485094786 train acc 0.9745947630922693
epoch 5 train acc 0.9720228040540541


  0%|          | 0/148 [00:00<?, ?it/s]

epoch 5 validation acc 0.873635654885655


In [47]:
# Mount Google Drive at /content/drive so the Drive path used by the save cell
# below is writable.
# NOTE(review): `os` and `sys` are not used in the visible cells — confirm
# whether these imports are still needed.
import os, sys 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [48]:
# NOTE(review): this rebinds MODEL_PATH, replacing the ".../model.pt" value
# assigned in the config cell.
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/Bert_model.h5"
# Passing the module object itself to torch.save serialises the whole model
# rather than just a state_dict.
# NOTE(review): the ".h5" suffix suggests HDF5, but torch.save does not write
# HDF5 — confirm what downstream loaders expect before renaming.
torch.save(model, MODEL_PATH)