<a href="https://colab.research.google.com/github/alsruf36/political-disposition-determiner/blob/master/notebooks/political_disposition_determiner_modeler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Colab 환경 설정

In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch
!pip install kss
!pip install konlpy
!pip install textrankr 
!pip install typing
!pip install dill

In [None]:
# Install KoBERT straight from the SKTBrain GitHub repository (master branch)
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import tqdm
import kss
from pprint import pprint
import emoji
from konlpy.tag import Okt
from hanspell import spell_checker
from soynlp.normalizer import *

In [None]:
#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that moving the model and batches to `device` still works on a runtime
# without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the BERT model and its vocabulary through KoBERT's get_pytorch_kobert_model()
bertmodel, vocab = get_pytorch_kobert_model()

2. 데이터셋 전처리

In [None]:
import requests
import json

URL = "http://mingyeol.com:5000/comment"  # endpoint the comments are requested from

# `count` must always be present; apart from it, include only the filters you need.
params = {
    "count": 100000,            # number of comments to fetch
    #"tend": "pro",             # political leaning (con: conservative | pro: progressive)
    "minlike": 20,              # minimum number of likes on a comment
    "minlength": 20,            # minimum comment length
    "mintimestamp": 1514732400, # earliest comment date (Unix time)
    #"level": 3                 # press-outlet level (0 to 3; every outlet at or below this level is included, smaller = closer to far-left/far-right)
}

response = requests.get(URL, params=params)  # query the API with the requests module
# Stop here on an HTTP error status instead of handing an error body to the
# JSON parser below.
response.raise_for_status()
comments = json.loads(response.text)  # parse the JSON text into Python objects
# Keep only the two fields used downstream: the text and its label.
contents = [[x['normalized'], x['calculate']] for x in comments]

cons = [x for x in contents if x[1] == 0]  # conservative comments (label 0)
pros = [x for x in contents if x[1] == 1]  # progressive comments (label 1)


print("{}개의 데이터를 성공적으로 불러왔습니다. (보수 {}개 | 진보 {}개)".format(len(contents), len(cons), len(pros)))

3. Train data & Test data

In [None]:
#train & test 데이터로 나누기
from sklearn.model_selection import train_test_split
                                                         
# Hold out 25% of the samples for testing; random_state pins the split.
dataset_train, dataset_test = train_test_split(
    contents,
    test_size=0.25,
    random_state=0,
)

# Report the size of each split, train first and test second, one per line.
print(len(dataset_train), len(dataset_test), sep="\n")
 

4. KoBERT 입력 데이터로 만들기

In [32]:
class BERTDataset(Dataset):
    """Dataset that prepares (sentence, label) rows as input for the BERT model.

    Every sentence is run through ``nlp.data.BERTSentenceTransform`` once, at
    construction time. Indexing returns the transformed sentence with the
    row's label appended as the last element.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        """Transform all sentences and collect all labels up front.

        Args:
            dataset: Iterable of indexable rows.
            sent_idx: Position of the sentence inside a row.
            label_idx: Position of the label inside a row.
            bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
            max_len: Forwarded as ``max_seq_length``.
            pad: Forwarded as ``pad``.
            pair: Forwarded as ``pair``.
        """
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass over the rows: transformed sentences.
        transformed = []
        for row in dataset:
            transformed.append(to_bert_input([row[sent_idx]]))
        self.sentences = transformed

        # Second pass over the rows: labels as int32.
        label_values = []
        for row in dataset:
            label_values.append(np.int32(row[label_idx]))
        self.labels = label_values

    def __getitem__(self, i):
        """Return the i-th transformed sentence followed by its label."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        """Return the number of samples (one per stored label)."""
        n_samples = len(self.labels)
        return n_samples

In [33]:
# Training hyperparameters

# -- data --
max_len = 64          # max_len argument given to BERTDataset
batch_size = 64       # samples per DataLoader batch

# -- optimisation --
learning_rate = 5e-5  # learning rate given to AdamW
warmup_ratio = 0.1    # share of all scheduler steps used for warmup
max_grad_norm = 1     # threshold for gradient-norm clipping
num_epochs = 10       # number of passes over the training loader

# -- logging --
log_interval = 200    # print training progress every this many batches

In [None]:
# Tokenization: wrap the tokenizer returned by KoBERT's get_tokenizer() and the
# vocabulary in a gluonnlp BERTSPTokenizer.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
# Build the train and test datasets; in every row the sentence sits at index 0
# and the label at index 1.
data_train = BERTDataset(dataset_train, sent_idx=0, label_idx=1,
                         bert_tokenizer=tok, max_len=max_len,
                         pad=True, pair=False)
data_test = BERTDataset(dataset_test, sent_idx=0, label_idx=1,
                        bert_tokenizer=tok, max_len=max_len,
                        pad=True, pair=False)

# Print the first training sample as a quick sanity check.
print(data_train[0])

In [36]:
# Reshuffle the training samples at the start of every epoch; without
# shuffle=True every epoch iterates the batches in exactly the same order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
# The test loader keeps the dataset order.
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

5. KoBERT 학습모델 만들기

In [37]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    The encoder's pooled output is optionally passed through dropout and then
    through one linear layer that yields ``num_classes`` scores per sample.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,   # adjust to the number of target classes
                 dr_rate=None,
                 params=None):
        """Store the encoder and build the classification layers.

        Args:
            bert: Encoder module. It is called with ``input_ids``,
                ``token_type_ids`` and ``attention_mask`` and must return a
                pair whose second item is the pooled output.
            hidden_size: Input width of the linear classifier.
            num_classes: Number of output classes.
            dr_rate: Dropout probability; dropout is skipped when falsy.
            params: Unused; kept so existing callers keep working.
        """
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``.

        Row ``i`` holds 1 in its first ``valid_length[i]`` positions and 0 in
        the remaining ones.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Run the encoder and classify its pooled output.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of leading positions to attend to.
            segment_ids: Passed to the encoder as ``token_type_ids``.

        Returns:
            The output of the linear classifier.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output feeds the classifier directly.
        # Previously `out` was assigned only inside the dropout branch, so a
        # model built with the default dr_rate=None raised UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [38]:
# Wrap the loaded BERT model in the classifier (dropout p=0.5), then move it
# to the selected device.
model = BERTClassifier(bert=bertmodel, dr_rate=0.5)
model = model.to(device)

In [39]:
# Parameter groups for the optimizer: parameters whose name contains 'bias' or
# 'LayerNorm.weight' get no weight decay, every other parameter uses 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [
            param
            for name, param in model.named_parameters()
            if all(term not in name for term in no_decay)
        ],
        'weight_decay': 0.01,
    },
    {
        'params': [
            param
            for name, param in model.named_parameters()
            if not all(term not in name for term in no_decay)
        ],
        'weight_decay': 0.0,
    },
]

In [40]:
# AdamW over the two parameter groups, using the configured learning rate.
optimizer = AdamW(params=optimizer_grouped_parameters, lr=learning_rate)
# Cross-entropy loss applied to the classifier output and the class labels.
loss_fn = nn.CrossEntropyLoss()

In [41]:
# Total scheduler steps: one per training batch, repeated for every epoch.
t_total = num_epochs * len(train_dataloader)
# How many of those steps are used for warmup.
warmup_step = int(warmup_ratio * t_total)

In [42]:
# Cosine learning-rate schedule with warmup, stepped once per training batch.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [43]:
def calc_accuracy(X,Y):
    """Return the share of rows in ``X`` whose highest-scoring column matches ``Y``.

    Args:
        X: Score tensor; the maximum is taken along dimension 1.
        Y: Label tensor compared element-wise with the arg-max indices.

    Returns:
        Number of matches divided by the size of the first dimension, as a
        NumPy value.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size()[0]
    return n_correct / n_samples

In [None]:
# Bare expression: the notebook displays the DataLoader's repr as the cell
# output, which confirms the loader exists before training starts.
train_dataloader

6. KoBERT 모델 학습시키기

In [None]:
# `import tqdm` on its own does not load the `tqdm.notebook` submodule, so
# import it explicitly rather than relying on another library having done so.
import tqdm.notebook

for e in range(num_epochs):
    # Running sums of per-batch accuracy; divided by the batch count when printed.
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm.notebook.tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed through unchanged; the model only iterates
        # over it to build the attention mask.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # update the learning rate once per batch
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm.notebook.tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
import dill
from google.colab import drive
import datetime as pydatetime

def get_now_timestamp():
    """Return the current local time as a Unix timestamp truncated to an int."""
    now = pydatetime.datetime.now()
    return int(now.timestamp())

import os

# Mount Google Drive at /gdrive; the model file is written below it.
drive.mount('/gdrive', force_remount=True)
path = "/gdrive/MyDrive/Colab Notebooks/pickles"
# Create the target folder when it is missing; otherwise open() below raises
# FileNotFoundError and the trained model is never written.
os.makedirs(path, exist_ok=True)

# File name: model-<total>-<cons>-<pros>-<batch_size>-<num_epochs>-<timestamp>.pkl
model_file = path + '/model-{}-{}-{}-{}-{}-{}.pkl'.format(len(contents), len(cons), len(pros), batch_size, num_epochs, get_now_timestamp())
with open(model_file, 'wb') as f:
    # NOTE(review): this serialises the whole model object with dill; files
    # like this should only ever be loaded from a trusted source.
    dill.dump(model, f)