<a href="https://colab.research.google.com/github/alsruf36/political-disposition-determiner/blob/master/notebooks/political_disposition_determiner_evaluater.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Colab 환경 설정

In [None]:
from google.colab import drive

# Mount Google Drive at /gdrive (force_remount=True is passed so the mount is
# redone on every run of this cell).
drive.mount('/gdrive', force_remount=True)
# Drive folder holding the pickled model; the model-loading cell further down
# opens `path + '/model-...pkl'`.
path = "/gdrive/MyDrive/Colab Notebooks/pickles"

In [None]:
# Runtime dependencies for this notebook. Only transformers is pinned (3.0.2);
# every other package installs whatever version is current at run time.
# NOTE(review): `typing` is part of the Python 3 standard library, so the PyPI
# install below is presumably unnecessary — confirm and consider dropping it.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch
!pip install kss
!pip install konlpy
!pip install anvil-uplink
!pip install textrankr 
!pip install typing
!pip install dill

In [None]:
# Install KoBERT directly from the SKTBrain GitHub repository (master branch).
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import tqdm
import kss
from pprint import pprint
import emoji
from konlpy.tag import Okt

In [5]:
#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [6]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# that `.to(device)` calls do not fail on a runtime without a GPU.
# NOTE(review): the pickled model loaded later may itself need CUDA to
# deserialize if it was saved from a GPU — confirm before relying on CPU mode.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the BERT model and its vocabulary from KoBERT.
# `vocab` is used below to build the tokenizer; `bertmodel` is not referenced
# again in the visible cells (the classifier is loaded from a pickle instead).
bertmodel, vocab = get_pytorch_kobert_model()

In [15]:
class BERTDataset(Dataset):
    """Dataset that converts raw rows into BERT inputs paired with a label.

    Every row of ``dataset`` is indexed twice: ``row[sent_idx]`` is the sentence
    and ``row[label_idx]`` is the label. Sentences are run through
    ``nlp.data.BERTSentenceTransform`` once, at construction time; labels are
    converted with ``np.int32``.

    Args:
        dataset: Iterable of indexable rows (iterated twice).
        sent_idx: Position of the sentence inside each row.
        label_idx: Position of the label inside each row.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence.
        encoded = []
        for record in dataset:
            encoded.append(to_bert_input([record[sent_idx]]))

        # Second pass: collect the labels as 32-bit integers.
        targets = []
        for record in dataset:
            targets.append(np.int32(record[label_idx]))

        self.sentences = encoded
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

In [16]:
# Setting parameters
# max_len is passed to BERTDataset (as its max_len argument) and batch_size to
# the DataLoader inside predict().
max_len = 64
batch_size = 64
# The values below are not referenced in the visible cells; presumably they are
# training hyperparameters carried over from the training notebook — TODO confirm.
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [17]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and must
            return a pair whose second element is the pooled representation.
        hidden_size: Input width of the linear classifier.
        num_classes: Number of output classes; adjust to the task.
        dr_rate: Dropout probability applied to the pooled output. When falsy
            (``None`` or ``0``) no dropout layer is created or applied.
        params: Unused; kept so existing callers' signatures keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Compute class logits for a batch.

        Args:
            token_ids: Token id tensor fed to the encoder as ``input_ids``.
            valid_length: Per-row count of real (non-padding) tokens.
            segment_ids: Segment ids; cast to long before use.

        Returns:
            The output of the linear classifier applied to the (optionally
            dropped-out) pooled encoder output.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier.
        # Previously `out` was only assigned inside `if self.dr_rate:`, so the
        # default dr_rate=None raised UnboundLocalError here.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

6. 학습된 KoBERT 모델 불러오기

In [18]:
import dill

# Deserialize the saved model object from the Drive folder configured in `path`.
# SECURITY: dill.load, like pickle.load, can execute arbitrary code while
# deserializing — only load model files that come from a trusted source.
with open(path + '/model-100000-50000-50000-64-10.pkl', 'rb') as f:
    model = dill.load(f)

7. 새로운 문장 테스트

In [19]:
# Tokenization: build the BERT SentencePiece tokenizer from KoBERT's tokenizer
# and the vocabulary loaded earlier. lower=False is passed, presumably to keep
# the original casing of the input — TODO confirm against the GluonNLP docs.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [20]:
def predict(predict_sentence):
    """Classify a single sentence with the loaded classifier.

    The sentence is wrapped in a one-row dataset, transformed by BERTDataset,
    and passed through the global ``model`` on ``device``.

    Args:
        predict_sentence: Text of the sentence to classify.

    Returns:
        Index of the highest-scoring class as a plain ``int``.

    Side effects:
        Puts ``model`` into eval mode and prints the predicted label.
    """
    # BERTDataset expects [sentence, label] rows; '0' is only a placeholder
    # label because the label is not used for prediction.
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    # num_workers=0: the dataset holds exactly one row, so starting worker
    # processes on every call only adds latency.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    # Inference only: no_grad avoids building the autograd graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            # argmax over each row of logits; int() yields a plain Python int.
            # Unlike the former `if == 0 / elif == 1` chain, this also works
            # for models with more than two classes.
            test_eval = [int(np.argmax(logits.detach().cpu().numpy())) for logits in out]

            print(">> 입력하신 내용에서 " + str(test_eval[0]) + " 느껴집니다.")
            return test_eval[0]

In [None]:
import anvil.server
import requests
from bs4 import BeautifulSoup, Comment
from typing import List
from textrankr import TextRank
import random

# SECURITY: the Anvil uplink key is a credential and must not be stored in the
# notebook. It is read from the ANVIL_UPLINK_KEY environment variable, with an
# interactive prompt as fallback. The key that used to be hard-coded here
# should be treated as leaked and rotated.
import os
from getpass import getpass

ANVIL_UPLINK_KEY = os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil uplink key: ")
# The server URL is not a secret; the original endpoint stays as the default.
ANVIL_UPLINK_URL = os.environ.get("ANVIL_UPLINK_URL", "ws://mingyeol.com:1303/_/uplink")

anvil.server.connect(ANVIL_UPLINK_KEY, url=ANVIL_UPLINK_URL)

class ArticleListCrawler:
    """Pick one random article from the Naver News politics list page.

    After construction, ``self.ids`` is a dict with the ``oid`` and ``aid``
    query parameters of the chosen article URL.

    Args:
        count: The article is drawn at random from the first ``count`` links
            found on the page (fewer if the page has fewer links).

    Raises:
        ValueError: If the page yields no article links, if ``count`` is not
            positive (raised by ``random.randrange``), or if the chosen URL has
            no ``oid``/``aid`` query parameter.
    """

    # Browser-like headers sent with the list-page request.
    _HEADERS = {
        'referer' : 'https://www.naver.com/',
        'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }

    _LIST_URL = "https://news.naver.com/main/list.nhn"

    # Section name -> value of the `sid1` query parameter.
    _SECTION_IDS = {
        "정치" : 100,
        "경제" : 101,
        "사회" : 102,
        "생활" : 103,
        "세계" : 104,
        "과학" : 105
    }

    # Seconds to wait for the list page before giving up.
    _TIMEOUT = 10

    def __init__(self, count):
        list_href = self._load_links("정치")
        if not list_href:
            raise ValueError("no article links found on the section list page")

        # Clamp to the number of links actually found; previously a page with
        # fewer than `count` links could raise IndexError.
        target = random.randrange(0, min(count, len(list_href)))
        self.ids = self._parse_ids(list_href[target])

    def _load_links(self, section):
        """Download the list page for ``section`` and return its article hrefs."""
        req = self.get_request(section)
        soup = BeautifulSoup(req.text, "html.parser")
        return self.get_href(soup)

    @staticmethod
    def _parse_ids(url):
        """Extract ``oid`` and ``aid`` from the URL's query string by name."""
        from urllib.parse import parse_qs, urlparse

        query = parse_qs(urlparse(url).query)
        try:
            return {
                "oid": query["oid"][0],
                "aid": query["aid"][0]
            }
        except KeyError as missing:
            raise ValueError(
                "article URL has no {} parameter: {}".format(missing, url)
            ) from missing

    def get_href(self, soup):
        """Return the href of every ``dt.photo`` link inside the list container.

        Returns an empty list when the container div is absent; ``dt`` entries
        without an ``<a>`` child are skipped.
        """
        div = soup.find("div", class_="list_body newsflash_body")
        if div is None:
            return []

        result = []
        for dt in div.find_all("dt", class_="photo"):
            anchor = dt.find("a")
            if anchor is not None:
                result.append(anchor["href"])

        return result

    def get_request(self, section):
        """GET the list page of ``section`` (a key of the section table).

        Raises:
            KeyError: If ``section`` is not a known section name.
        """
        return requests.get(self._LIST_URL, headers=self._HEADERS,
                            params={"sid1": self._SECTION_IDS[section]},
                            timeout=self._TIMEOUT)

class MyTokenizer:
    """Callable tokenizer that splits text on runs of whitespace."""

    def __call__(self, text: str) -> List[str]:
        """Return the whitespace-separated pieces of ``text``."""
        return text.split()

def summarize(text, line):
    """Summarize ``text`` with TextRank using the whitespace tokenizer.

    Args:
        text: Text to summarize.
        line: Forwarded as the second argument of ``TextRank.summarize``;
            presumably the number of sentences to keep — TODO confirm.

    Returns:
        Whatever ``TextRank.summarize(text, line, verbose=False)`` returns
        (annotated in the original code as a list of strings).
    """
    ranker: TextRank = TextRank(MyTokenizer())
    return ranker.summarize(text, line, verbose=False)

@anvil.server.callable
def getArticle():
    """Fetch a random Naver politics article and return its cleaned text.

    An article is picked from the first 20 links via ArticleListCrawler, its
    page is downloaded, and the body is stripped of scripts, HTML comments,
    spans and divs before being split into sentences.

    Returns:
        dict with keys:
            "title": text of the ``h3#articleTitle`` element.
            "subtitle": text of the first ``<strong>`` in the body, or None.
            "content": list of sentences from ``kss.split_sentences``.
            "summary": result of ``summarize`` over the sentences (5 lines).

    Raises:
        ValueError: If the page lacks the expected title or body element.
    """
    ids = ArticleListCrawler(20).ids
    url = 'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=100&oid={}&aid={}'.format(ids['oid'], ids['aid'])
    # A timeout keeps a stalled connection from blocking this handler forever.
    r = requests.get(url, headers={'User-Agent':'Mozilla/5.0'}, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')

    title_tag = soup.select_one('h3#articleTitle')
    content = soup.select_one('#articleBodyContents')
    if title_tag is None or content is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("unexpected article page layout: {}".format(url))

    title = title_tag.text
    subtitle = content.select_one('strong')
    if subtitle is not None:
        subtitle = subtitle.extract().text

    for x in content.select('script'): x.extract()                             # remove <script>...</script>
    for x in content(text=lambda text: isinstance(text, Comment)): x.extract() # remove <!-- comments -->
    for x in content.select("br"): x.replace_with("\n")                        # replace <br> with \n
    for x in content.find_all('span'): x.decompose()                           # remove remaining spans
    for x in content.find_all('div'): x.decompose()                            # remove remaining divs
    content = "".join([str(x) for x in content.contents])                      # drop the outer tag (innerHTML)
    content = content.strip()                                                  # trim surrounding whitespace
    content = kss.split_sentences(content)                                     # split into sentences
    summary = summarize("\n".join(content), 5)                                 # summarize the sentences

    return {
        "title": title,
        "subtitle": subtitle,
        "content": content,
        "summary": summary
    }

@anvil.server.callable
def predicter(text):
    """Split ``text`` into sentences and classify each one.

    Returns:
        A list of ``[sentence, label]`` pairs in sentence order, where
        ``label`` is the value returned by ``predict`` for that sentence.
    """
    labelled = []
    for sentence in kss.split_sentences(text):
        labelled.append([sentence, predict(sentence)])
    return labelled

# Keep this cell running so the functions registered with
# @anvil.server.callable above stay reachable over the uplink connection.
# As the name suggests, this call is not expected to return — stop it by
# interrupting the kernel.
anvil.server.wait_forever()