In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup 
import time


In [2]:
df = pd.read_csv('crawling.csv')
#CSV 파일 불러오기

In [4]:
import re
#공백, 특수문자 제거
def remove_white_space(text):
    text = re.sub(r'[\t\r\n\f\v]', ' ', str(text))
    return text

def remove_special_char(text):
    text = re.sub('[^ ㄱ-ㅣ가-힣 0-9]+', ' ', str(text))
    return text

df.title = df.title.apply(remove_white_space)
df.title = df.title.apply(remove_special_char)

df.content = df.content.apply(remove_white_space)
df.content = df.content.apply(remove_special_char)

In [6]:
from konlpy.tag import Okt
#토크나이징, 제목은 형태소, 내용은 명사 
okt = Okt()

df['title_token'] = df.title.apply(okt.morphs)
df['content_token'] = df.content.apply(okt.nouns)

In [7]:
df['token_final'] = df.title_token + df.content_token

df['count'] = df['count'].replace({',' : ''}, regex = True).apply(lambda x : int(x))
#정답지 label 을 구성하기 위해 1,786->1786으로 바꾸고 인트형으로 변환
print(df.dtypes)

df['label'] = df['count'].apply(lambda x: 'Yes' if x>=1000 else 'No')

category         object
content          object
count             int64
end              object
start            object
title            object
title_token      object
content_token    object
token_final      object
dtype: object


In [2]:
df_drop = pd.read_csv('df_drop.csv')
df_drop.head()

Unnamed: 0,token_final,label
0,"['서울', '지방', '병무청', '탈의실', '에', '설치', '된', '에'...",No
1,"['주식시장', '활성화', '및', '소액', '개미', '투자자', '보호', ...",No
2,"['교정', '기관', '의', '민낮', '일로', '국민', '청원', '신청'...",No
3,"['미세먼지', '저', '감', '대책', '미세먼지', '심각', '성은', '...",No
4,"['악질', '세', '입자', '방지', '를', '위', '한', '세', '입...",Yes


In [12]:
df_drop.to_csv(r'C:\Users\82108\딥러닝 프로젝트\df_drop.csv', index = False, encoding = 'utf-8-sig')

In [14]:
from gensim.models import Word2Vec

embedding_model = Word2Vec(df_drop['token_final'], 
                           sg = 1, # skip-gram
                           vector_size = 100, 
                           window = 2, 
                           min_count = 1, 
                           workers = 4
                           )

print(embedding_model)

model_result = embedding_model.wv.most_similar("음주운전")
print(model_result)

Word2Vec<vocab=42642, vector_size=100, alpha=0.025>
[('음주', 0.8594802021980286), ('무면허', 0.8299345970153809), ('뺑소니', 0.8180090188980103), ('살인자', 0.7722511291503906), ('살인죄', 0.7697737812995911), ('형량', 0.7556667923927307), ('전과자', 0.7462077140808105), ('운전자', 0.7455174326896667), ('촉법소년', 0.7346683144569397), ('강력범죄', 0.73124760389328)]


In [16]:
from gensim.models import KeyedVectors

embedding_model.wv.save_word2vec_format(r'C:\Users\82108\딥러닝 프로젝트\petitions_tokens_w2v') # 모델 저장
loaded_model = KeyedVectors.load_word2vec_format(r'C:\Users\82108\딥러닝 프로젝트\petitions_tokens_w2v') # 모델 로드

model_result = loaded_model.most_similar("음주운전")
print(model_result)

[('음주', 0.8594802021980286), ('무면허', 0.8299345970153809), ('뺑소니', 0.8180090188980103), ('살인자', 0.7722511291503906), ('살인죄', 0.7697737812995911), ('형량', 0.7556667923927307), ('전과자', 0.7462077140808105), ('운전자', 0.7455174326896667), ('촉법소년', 0.7346683144569397), ('강력범죄', 0.73124760389328)]


In [3]:
#다시 킬때는 여기부터
from numpy.random import RandomState

rng = RandomState()

tr = df_drop.sample(frac=0.8, random_state=rng) #랜덤한 인덱스의 데이터 80%를 train dataset에 저장
val = df_drop.loc[~df_drop.index.isin(tr.index)] #나머지

tr.to_csv(r'C:\Users\82108\딥러닝 프로젝트\train.csv', index=False, encoding='utf-8-sig')
val.to_csv(r'C:\Users\82108\딥러닝 프로젝트\validation.csv', index=False, encoding='utf-8-sig')

In [4]:
import torchtext
from torchtext.data import Field

def tokenizer(text):
    text = re.sub('[\[\]\']', '', str(text))
    text = text.split(', ') #토근들이 하나의 리스트로 묶여있기 때문에 하나씩 분해
    return text

TEXT = Field(tokenize=tokenizer)
LABEL = Field(sequential = False) #라벨은 순서가 없는 Yes NO 형식이기 때문에 Sequential = False

In [31]:
from torchtext.data import TabularDataset
import re

train, validation = TabularDataset.splits(
    path = 'data/',
    train = 'train.csv',
    validation = 'validation.csv',
    format = 'csv',
    fields = [('text', TEXT), ('label', LABEL)],
    skip_header = True #CSV 파일의 첫 행이 칼럼명일 경우 건너뛰기
)

print("Train:", train[0].text,  train[0].label)
print("Validation:", validation[0].text, validation[0].label)

Train: ['마스크', '를', '1', '재난', '기본소득', '예산', '으로', '국가', '가', '사서', '2', '줄', '서지', '말고', '세대', '별로', '배급', '하면', '국민', '의', '안전', '과', '시간', '을', '보호', '할', '수', '있습니다', '저', '부산', '의사', '요즘', '매일', '어디', '서든', '볼', '수', '풍경', '마스크', '약국', '선', '진료', '소', '줄', '동안', '위험', '비', '효율', '때문', '고안', '드라이브', '스루', '방법', '외국', '약국', '줄', '것', '자신', '보호', '위', '마스크', '사기', '위해', '알', '수', '감염', '위험', '노출', '안', '경제', '활동', '비', '생산', '일', '시간', '모함', '어려움', '가중', '것', '최근', '일부', '지방', '자치', '단체', '전격', '재난', '기본소득', '지급', '결정', '국민', '청원', '재난', '기본소득', '지급', '의견', '동의', '취약', '계층', '마스크', '더', '시간', '마음대로', '낼', '수', '경우', '재난', '기본소득', '누구', '배', '분할', '것', '의논', '합의', '도출', '지급', '꽤', '시간', '소요', '것', '재난', '기본소득', '예산', '당장', '마스크', '국가', '기금', '활용', '그', '나머지', '예산', '경제', '위해', '여러가지', '방법', '강', '구해', '볼', '수', '것', '마스크', '배분', '방법', '최소', '행정', '단위', '별로', '배포', '각', '주민', '그', '쪽', '방문', '거나', '노원구', '공무원', '수고', '주시', '방법', '것', '모든', '사람', '마스크', '구', '수', '안심', '누가', '더', '사재기',

In [7]:
import torch
from torchtext.vocab import Vectors
from torchtext.data import BucketIterator

vectors = Vectors(name="data/petitions_tokens_w2v")

TEXT.build_vocab(train, vectors = vectors, min_freq = 1, max_size = None) 
#train 데이터셋의 단어들을 사전 학습된 단어장의 벡터로 초기화
LABEL.build_vocab(train)

vocab = TEXT.vocab

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iter, validation_iter = BucketIterator.splits(
    datasets = (train, validation),
    batch_size = 8,
    device = device,
    sort = False
)

print('임베딩 벡터의 개수와 차원 : {} '.format(TEXT.vocab.vectors.shape))

임베딩 벡터의 개수와 차원 : torch.Size([38755, 100]) 


In [22]:
import torch.nn as nn   
import torch.optim as optim 
import torch.nn.functional as F 

class TextCNN(nn.Module): 
    
    def __init__(self, vocab_built, emb_dim, dim_channel, kernel_wins, num_class):
        
        super(TextCNN, self).__init__()
        
        self.embed = nn.Embedding(len(vocab_built), emb_dim)
        self.embed.weight.data.copy_(vocab_built.vectors)      
    
        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, emb_dim)) for w in kernel_wins])
        self.relu = nn.ReLU()                
        self.dropout = nn.Dropout(0.4)         
        self.fc = nn.Linear(len(kernel_wins)*dim_channel, num_class)     
        
    def forward(self, x):  
      
        emb_x = self.embed(x)  #(배치사이즈, 문장 길이, 임베딩 차원)         
        emb_x = emb_x.unsqueeze(1)  #(배치사이즈, 1, 문장 길이, 임베딩 차원) 
        #CNN모델에 적용하기 위해 입력 형태를 3차원 데이터로 바꾸어줌

        con_x = [self.relu(conv(emb_x)) for conv in self.convs]       
        #self.convs에 있는 module_list를 ebm_x에 각각 적용 후 리스트로 저장 (배치사이즈, 10, 출력길이, 1)짜리 3개

        pool_x = [F.max_pool1d(x.squeeze(-1), x.size()[2]) for x in con_x]    
        #풀링레이어 con_x를 1짜리 차원 줄여준 것을 입력으로, 문장길이를 kernel_size로 한다. 출력 - (배치사이즈, 10, 1) 3개
        fc_x = torch.cat(pool_x, dim=1) #1차원 방향으로 합치기 (배치사이즈, 30, 1) 1개
        fc_x = fc_x.squeeze(-1) #1짜리 마지막 차원 제거해 선형 함수에 넣기 좋은 형태로 만들기
        fc_x = self.dropout(fc_x)         

        logit = self.fc(fc_x)     
        
        return logit

In [23]:
def train(model, device, train_itr, optimizer):
    
    model.train()                               
    corrects, train_loss = 0.0,0        
    
    for batch in train_itr:
        
        text, target = batch.text, batch.label      
        text = torch.transpose(text, 0, 1)          
        target.data.sub_(1)                                 
        text, target = text.to(device), target.to(device)  

        optimizer.zero_grad()                           
        logit = model(text)                         
    
        loss = F.cross_entropy(logit, target)   
        loss.backward()  
        optimizer.step()  
        
        train_loss += loss.item()    
        result = torch.max(logit,1)[1] 
        corrects += (result.view(target.size()).data == target.data).sum()
        
    train_loss /= len(train_itr.dataset)
    accuracy = 100.0 * corrects / len(train_itr.dataset)

    return train_loss, accuracy

In [24]:
def evaluate(model, device, itr):
    
    model.eval()
    corrects, test_loss = 0.0, 0

    for batch in itr:
        
        text = batch.text
        target = batch.label
        text = torch.transpose(text, 0, 1)
        target.data.sub_(1)
        text, target = text.to(device), target.to(device)
        
        logit = model(text)
        loss = F.cross_entropy(logit, target)

        test_loss += loss.item()
        result = torch.max(logit,1)[1]
        corrects += (result.view(target.size()).data == target.data).sum()

    test_loss /= len(itr.dataset) 
    accuracy = 100.0 * corrects / len(itr.dataset)
    
    return test_loss, accuracy

In [25]:
model = TextCNN(vocab, 100, 10, [3, 4, 5], 2).to(device)
print(model)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = optim.Adam(model.parameters(), lr=0.001)

best_test_acc = -1

for epoch in range(1, 3+1):
 
    tr_loss, tr_acc = train(model, device, train_iter, optimizer) 
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))
    
    val_loss, val_acc = evaluate(model, device, validation_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, val_loss, val_acc))
        
    if val_acc > best_test_acc:
        best_test_acc = val_acc
        
        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(model.state_dict(), "TextCNN_Best_Validation")
    
    print('-----------------------------------------------------------------------------')

TextCNN(
  (embed): Embedding(38755, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 10, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 10, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 10, kernel_size=(5, 100), stride=(1, 1))
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=30, out_features=2, bias=True)
)
Train Epoch: 1 	 Loss: 0.08189564037774912 	 Accuracy: 61.80356216430664%
Valid Epoch: 1 	 Loss: 0.0773220794439754 	 Accuracy: 66.6819839477539%
model saves at 66.6819839477539 accuracy
-----------------------------------------------------------------------------
Train Epoch: 2 	 Loss: 0.07546763614564705 	 Accuracy: 67.8690414428711%
Valid Epoch: 2 	 Loss: 0.0762244074139744 	 Accuracy: 67.00367736816406%
model saves at 67.00367736816406 accuracy
-----------------------------------------------------------------------------
Train Epoch: 3 	 Loss: 0.06685245595123493 	 Accuracy: 73.6358413696289%
Valid Epoch: 3 	 Loss: 0.07