In [1]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
# 변경: 최신 버전으로 설치하면 "Input: must be Tensor, not str" 라는 에러 발생
!pip install transformers==3
!pip install torch

Collecting mxnet
  Downloading mxnet-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (46.9 MB)
[K     |████████████████████████████████| 46.9 MB 44 kB/s 
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.8.0.post0
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 4.1 MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp37-cp37m-linux_x86_64.whl size=595729 sha256=7d0ced401da8ab15d72e37fedaf0a72c30d7ec5420828838d5dfcf73e82daa0b
  Stored in directory: /root/.cache/pip/wheels/be/b4/06/7f3fdfaf707e6b5e98b

In [2]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-qb5fg5ug
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-qb5fg5ug
Building wheels for collected packages: kobert
  Building wheel for kobert (setup.py) ... [?25l[?25hdone
  Created wheel for kobert: filename=kobert-0.1.2-py3-none-any.whl size=12770 sha256=5c7730b9942fc62602819425ae4e3317564bc741635ca47c714ce86ef3eebb12
  Stored in directory: /tmp/pip-ephem-wheel-cache-hfo0ciyz/wheels/d3/68/ca/334747dfb038313b49cf71f84832a33372f3470d9ddfd051c0
Successfully built kobert
Installing collected packages: kobert
Successfully installed kobert-0.1.2


In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [4]:
import pandas as pd
import re
import os
import time

In [5]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768, # bert의 hidden layer 크기?
                #  num_classes=2,
                 num_classes=9, # 분류 class 크기
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes) #nn.Linear
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [6]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [59]:
class NewsPredict():

  def __init__(self, device_type='cpu'):
    """
      Args:
          device_type : default는 "cpu", gpu로 돌리고자 할 때 "cuda:0"을 입력합니다.
    """
    self.device = torch.device(device_type) # cpu로 돌리도록 선언 기본값 / "cuda:0"
    self.max_len = 128 # seqeunce 최대 길이
  
  #model and tokenizer 로딩 
  def load_model_n_tokenizer(self, model_path):
    """
      tips: Pytorch Model과 Tokenizer를 반환합니다.
      Args:
          model_path : 모델이 저장된 path.
      Returns:
          tok : nlp.data.BERTSPTokenizer
          model : torch model
    """
    bertmodel, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    
    model = torch.load(model_path, map_location=self.device)

    return tok, model

  #전처리 안된 데이터 전처리
  def preprocess_data(self, data_path, data_colname):
    """
      tips: csv 데이터를 받아 지정된 column의 내용을 preprocess 합니다.
      Args:
          data_path : csv데이터의 path
          data_colname : 지정할 column명
      Returns:
          lucy_data : DataFrame
    """
    lucy_data = pd.read_csv(data_path)

    lucy_data[data_colname] = lucy_data[data_colname].str.replace("\(.*\)|\s-\s.*"," " ,regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("\[.*\]|\s-\s.*"," ",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("\<.*\>|\s-\s.*"," ",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("무단전재 및 재배포 금지"," ",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("무단 전재 및 재배포 금지"," ",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("©"," ",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("ⓒ"," ",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("저작권자"," ",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace(".* 기자", " ", regex=True) #기자 이름에서 오는 유사도 차단
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("사진 = .*", " ", regex=True) #사진 첨부 문구 삭제
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("사진=.*", " ", regex=True) #사진 첨부 문구 삭제
    lucy_data[data_colname] = lucy_data[data_colname].str.replace('\"', "",regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-.]+)", " ", regex=True) #이메일 주소에서 오는 유사도 차단
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("\n"," ")
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("\r"," ")
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("\t"," ")
    lucy_data[data_colname] = lucy_data[data_colname].str.replace( "\’" , "", regex=True)
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]"," ")
    lucy_data[data_colname] = lucy_data[data_colname].str.replace("[ ]{2,}"," ",regex=True)
    
    return lucy_data

  #라벨을 정수로 인코딩
  def category_encoding_n_save(self, lucy_data, save_directory, label_colname):
    """
      tips: 텍스트로 된 라벨을 model에 지정된 정수 label로 인코딩 및 model의 input data를 파일로 저장합니다.
      Args:
          lucy_data : dataframe 형식의 데이터
          save_directory : 저장할 디렉토리(파일명은 제외)
          label_colname : 라벨의 컬럼명
      Returns:
          save_path : string
    """
    lucy_data[label_colname+'_val'] = lucy_data[label_colname] # string 형 컬럼 생성

    label_dict = {'0': 'IT/과학',
      '1': '경제',
      '2': '문화',
      '3': '미용/건강',
      '4': '사회',
      '5': '생활',
      '6': '스포츠',
      '7': '연예',
      '8': '정치'}

    for key, value in label_dict.items():
      print(value)
      lucy_data[label_colname] = lucy_data[label_colname].str.replace(value, key)

    now = int(round(time.time() * 1000))
    filename = 'sample_' + str(now) + '.txt'
    save_path = os.path.join(save_directory, filename)
    lucy_data.to_csv(save_path , sep = '\t' , index = False)
    print("===============Data encoding success! please check this directory : ", save_path)
    return save_path

  #dataset 불러오기

  def load_data(self, save_path, data_colnum, label_colnum):
    """
      tips: 저장된 input dataset을 불러옵니다.
      Args:
          save_path : txt데이터의 path
          data_colnum : data의 컬럼번호
          label_colnum : label의 컬럼번호
      Returns:
          marking_set : dataframe
    """
    predict_set = nlp.data.TSVDataset(save_path, field_indices=[data_colnum,label_colnum], num_discard_samples=1)
    return predict_set
  
  #예측 
  def predict_n_save_result(self, predict_set, tok, marking_set, marking_set_data_colname, save_directory, encoding_type='euc-kr'):
    """
      tips: 저장된 input dataset을 불러옵니다.
      Args:
          predict_set : 예측할 데이터 셋
          tok : tokenizer
          marking_set : 기록할 데이터 셋 (dataframe)
          marking_set_data_colname : 기록할 데이터 셋의 data 컬럼 이름
          save_directory : 저장할 디렉토리
          encoding_type : csv의 default는 euc-kr / 영문은 utf-8-sig 
      Returns:
          predict_set : nlp.data.TSVDataset
    """
    predict_set = BERTDataset(predict_set, 0, 1, tok, self.max_len, True, False)
    predict_input = torch.utils.data.DataLoader(predict_set, batch_size=1, num_workers=9)
    
    #컬럼 초기화
    marking_set['predict'] = -1
    marking_set['predict_tag'] = 'a'

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(predict_input)):
      token_ids = token_ids.long().to(self.device)
      segment_ids = segment_ids.long().to(self.device)
      valid_length= valid_length
      out = model(token_ids, valid_length, segment_ids)
      out_val = torch.argmax(out).cpu().numpy() # tensor > numpy로 변환
      out_tag = ''
      # print(marking_set[marking_set_data_colname][batch_id][1:50],'...') # 50자까지만 미리 출력 (빼도됨)
      
      if(out_val == 0):
        # print('IT/과학')
        out_tag = 'IT/과학'
      elif(out_val == 1):
        # print('경제')
        out_tag = '경제'
      elif(out_val == 2):
        # print('문화')
        out_tag = '문화'
      elif(out_val == 3):
        # print('미용/건강')
        out_tag = '미용/건강'
      elif(out_val == 4):
        # print('사회')
        out_tag = '사회'
      elif(out_val == 5):
        # print('생활')
        out_tag = '생활'
      elif(out_val == 6):
        # print('스포츠')
        out_tag = '스포츠'
      elif(out_val == 7):
        # print('연예')
        out_tag = '연예'
      elif(out_val == 8):
        # print('정치')
        out_tag = '정치'

      marking_set['predict'][batch_id] = out_val
      marking_set['predict_tag'][batch_id] = out_tag

    now = int(round(time.time() * 1000))
    filename = 'news_predict_' + str(now) + '.csv'
    save_path = os.path.join(save_directory, filename)
    marking_set.to_csv(save_path, encoding=encoding_type) #한글이면 euc-kr, utf-8-sig 
    print("===============Thank you for waiting. Predicting your dataset Finally Finish! Please Check this directory : ", save_path)
    
    return marking_set

  #얼마나 타겟의 값을 잘 맞추었는지 평가하는 함수
  def calc_accuracy(self, X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

  #정확도 평가하는 함수
  def get_accuracy(self, model, predict_set, tok):
    """
      tips: predict_set에 대한 정확도를 평가합니다.
      Args:
          predict_set : 예측할 데이터 셋
          tok : tokenizer
          model : Pytorch Model
    """
    predict_set = BERTDataset(predict_set, 0, 1, tok, self.max_len, True, False)
    predict_input = torch.utils.data.DataLoader(predict_set, batch_size=1, num_workers=9)
    
    model.eval() # 평가 모드로 변경
      
    test_acc = 0.0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(predict_input)):
        token_ids = token_ids.long().to(self.device)
        segment_ids = segment_ids.long().to(self.device)
        valid_length= valid_length
        label = label.long().to(self.device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += self.calc_accuracy(out, label)
    print("test accuracy : {}".format(test_acc / (batch_id+1)))

In [60]:
predict = NewsPredict()

In [11]:
tok, model = predict.load_model_n_tokenizer('/content/drive/MyDrive/rsn_nlp_project/20210802model_95.pt')

using cached model
using cached model
using cached model


In [12]:
lucy_data = predict.preprocess_data('/content/drive/MyDrive/rsn_nlp_project/lucy_data0805_완.csv','contents')

In [13]:
save_path = predict.category_encoding_n_save(lucy_data, '/content/drive/MyDrive/rsn_nlp_project/', 'category')

IT/과학
경제
문화
미용/건강
사회
생활
스포츠
연예
정치


In [14]:
predict_set = predict.load_data(save_path, 1,2)

In [18]:
result_set = predict.predict_n_save_result(predict_set, tok, lucy_data, 'contents', '/content/drive/MyDrive/rsn_nlp_project/')

  cpuset_checked))
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





In [61]:
result_set

Unnamed: 0,id,contents,category,category_val,predict,predict_tag
0,DN000000008778200425,공정소득은 코로나 이후 악화되는 자형 양극화에 대비하자는 것 반박 야권의 대선 주...,8,정치,8,정치
1,DN000000008808263362,공동대표 및 정책자문위원 등 임명장 수여 내 삶을 지켜주는 나라 신복지 충남포럼 ...,8,정치,8,정치
2,DN000000008791388703,바르셀로나 참가 인텔 기가테라 커뮤니케이션즈 를 오픈랜 생태계를 통하여 구축 및 ...,0,IT/과학,0,IT/과학
3,DN000000008853835944,더불어민주당 신영대 의원 이 민주당의 혁신과 흥행을 담당할 대선경선기획단에 인선됐...,8,정치,8,정치
4,DN000000008834209192,지난 일 미 멕시코 국경지대인 텍사스주 엘패소와 멕시코 시우대드후아레스 사이에서 ...,4,사회,4,사회
...,...,...,...,...,...,...
295,DN000000008766398883,파주시가 지난해 월 일 건축물관리법 이 시행됨에 따라 건축물을 해체할 경우 공사 전...,1,경제,5,생활
296,DN000000008766398882,블랙핑크 서비스를 출시한다 이날 서초동 입체음향 음악 스튜디오 사운드 에서 차원 ...,2,문화,0,IT/과학
297,DN000000008770346409,일 강제추행 피해 사실을 신고한 뒤 극단적 선택을 한 이모 공군 중사의 영정이 경...,4,사회,4,사회
298,DN000000008770346519,프로그램 표창원의 뉴스하이킥 출연자 최은영 서울대병원 간호사 진행자 월 일 오늘이...,5,생활,3,미용/건강


In [62]:
predict.get_accuracy(model, predict_set, tok)

  cpuset_checked))
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))


test accuracy : 0.83


{0: 'IT/과학',
 1: '경제',
 2: '문화',
 3: '미용/건강',
 4: '사회',
 5: '생활',
 6: '스포츠',
 7: '연예',
 8: '정치'}