In [0]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd

In [0]:
#라벨링한 csv 파일 불러오기
data = pd.read_csv("KoreanAir_label.csv").dropna(axis=0)

In [0]:
data.shape #데이터의 개수 확인

(118473, 5)

In [0]:
data = data.sample(frac =1) # 1개 단위로 섞기

In [0]:
# 60 : 40 의 비율로 트레인셋, 테스트셋 나눔
Train = data.loc[:71000]
Test = data.loc[71000:]

In [0]:
X_train = Train['preprocessing'].str.split(',').values.tolist()
y_train_label = Train['Label'].values.tolist()

X_test = Test['preprocessing'].str.split(',').values.tolist()
y_test_label = Test['Label'].values.tolist()

# CNN

In [0]:
from collections import defaultdict
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random

In [0]:
# 단어에 대한 idx 부여
def convert_token_to_idx(token_ls):
    for tokens in token_ls:
        yield [token2idx[token] for token in tokens]
    return

#Add Padding

In [0]:
token2idx = defaultdict(lambda : len(token2idx)) # token과 index를 매칭시켜주는 딕셔너리
pad = token2idx['<PAD>']  # pytorch Variable로 변환하기 위해, 문장의 길이를 맞춰주기 위한 padding 

x_train = list(convert_token_to_idx(X_train))
x_test = list(convert_token_to_idx(X_test))

idx2token = {val : key for key,val in token2idx.items()}

In [0]:
#Sequence Length를 맞추기 위한 padding
def add_padding(token_ls, max_len):
    for i, tokens in enumerate(token_ls):
        n_token = len(tokens)
        
        # 길이가 짧으면 padding을 추가
        if n_token < max_len:
            token_ls[i] += [pad] * (max_len - n_token) # 부족한 만큼 padding을 추가함
        
        # 길이가 길면, max_len에서 짜름
        elif n_token > max_len:
            token_ls[i] = tokens[:max_len]
    return token_ls

In [0]:
max_len = 200
x_train = add_padding(x_train, max_len)
x_test = add_padding(x_test, max_len)

In [0]:
' '.join([idx2token[x] for x in x_train[0]])

'규현 대한항공 종신 아이디 규용 규디 소울 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

#Pytorch 모델 학습을 위해 Data의 type을 Variable 로 변환

In [0]:
# torch Variable로 변환
def convert_to_long_variable(w2i_ls):
    return Variable(torch.LongTensor(w2i_ls))

In [0]:
x_train = convert_to_long_variable(x_train)
x_test = convert_to_long_variable(x_test)

y_train = convert_to_long_variable(y_train_label)
y_test = convert_to_long_variable(y_test_label)

#CNN with Pytorch

In [0]:
class CNN_text(nn.Module):
    
    def __init__(self, n_words, embed_size, pad_index, hid_size, drop_rate, kernel_size_ls, num_filter, n_class):
        super(CNN_text, self).__init__()
        
        self.pad_index = pad_index              # 단어 embedding 과정에서 제외시킬 padding token
        self.embed_size = embed_size            # 임베딩 차원의 크기
        self.hid_size = hid_size                # 히든 레이어 갯수
        self.drop_rate = drop_rate              # 드롭아웃 비율
        self.num_filter = num_filter            # 필터의 갯수 
        self.kernel_size_ls = kernel_size_ls    # 각기 다른 필터 사이즈가 담긴 리스트
        self.num_kernel = len(kernel_size_ls)   # 필터 사이즈의 종류 수
        self.n_class = n_class                  # 카테고리 갯수
        
        self.embed = nn.Embedding(
            num_embeddings=n_words, 
            embedding_dim=embed_size,
            padding_idx=self.pad_index
        )
        
        
        # kernel size는 (n-gram, embed_size)이다.
        # 커널의 열(column)의 크기는 embed_size와 일치하므로, 단어 임베딩 벡터를 모두 커버한다.
        # 따라서, n의 row 크기를 갖는 커널은 한번에 n개의 단어를 커버하는 n-gram 커널이라고 볼 수 있다.
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filter, (kernel_size, embed_size)) for kernel_size in kernel_size_ls])
        
        self.lin = nn.Sequential(
            nn.Linear(self.num_kernel*num_filter, hid_size), nn.ReLU(), nn.Dropout(drop_rate),
            nn.Linear(hid_size, n_class),
        )
        
    def forward(self, x):
        embed = self.embed(x) # batch_size x max_length x embed_size
        embed.unsqueeze_(1)       # batch_size, 1, max_length, embed_size : convolution을 위해 4D로 차원을 조절
        
        # convolution
        conved = [conv(embed).squeeze(3) for conv in self.convs] # [batch_size, num_filter, max_length -kernel_size +1]
        
        # max_pooling
        pooled = [F.max_pool1d(conv, (conv.size(2))).squeeze(2) for conv in conved] # [batch_size, num_kernel, num_filter]
            
        # dropout
        dropouted = [F.dropout(pool, self.drop_rate) for pool in pooled]
        
        # concatenate
        concated = torch.cat(dropouted, dim = 1) # [batch_size, num_kernel * num_filter]
        logit = self.lin(concated)
        
        return logit

In [0]:
params = {
    'n_words' : len(token2idx),        # 고유한 단어 토큰의 갯수
    'embed_size' : 50,                # 임베딩 차원의 크기
    'pad_index' : token2idx['<PAD>'],  # 패딩 토큰
    'hid_size' : 50,                  # 히든 레이어 갯수
    'drop_rate' : 0.5,                 # 드롭아웃 비율          (원문에서는 0.5를 사용)
    'kernel_size_ls' : [3,4,5],      # 커널 사이즈 리스트        (원문에서는 3,4,5를 사용)
    'num_filter' : 50,                 # 각 사이즈 별 커널 갯수 (원문에서는 100을 사용)
    'n_class' : 3,                  # 카테고리 갯수
}

In [0]:
model = CNN_text(**params)

In [0]:
model

CNN_text(
  (embed): Embedding(2720, 50, padding_idx=0)
  (convs): ModuleList(
    (0): Conv2d(1, 50, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 50, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 50, kernel_size=(5, 50), stride=(1, 1))
  )
  (lin): Sequential(
    (0): Linear(in_features=150, out_features=50, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=50, out_features=3, bias=True)
  )
)

In [0]:
len(list(model.parameters()))

11

In [0]:
import numpy as np
epochs = 1
lr = 0.01
batch_size = 100

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr) # 원문에서는 Adadelta 알고리즘을 사용
criterion = nn.CrossEntropyLoss(reduction='sum')

loss_ls = []

for epoch in range(epochs):
    model.train()
    
    # input 데이터 순서 섞기
    random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    train_loss = 0

    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].long()
        
        scores = model(x_batch)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(scores, y_batch)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch+1, train_loss / batch_size, acc))
    print('=================================================================================================')
    
    loss_ls.append(train_loss)
    
    if (epoch+1) % 1 == 0:
        model.eval()
        scores = model(x_test)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_test.long()).sum().item() / y_test.size(0)
        loss = criterion(scores, y_test.long())
        
        print('*************************************************************************************************')
        print('*************************************************************************************************')
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch+1, loss.item()/y_test.size(0), acc))
        print('*************************************************************************************************')
        print('*************************************************************************************************')

Train epoch : 1,  loss : 675.750672531128,  accuracy :0.590
*************************************************************************************************
*************************************************************************************************
Test Epoch : 1, Test Loss : 0.696 , Test Accuracy : 0.665
*************************************************************************************************
*************************************************************************************************


# 예측값

In [0]:
x_predict_data = X_train[50]

In [0]:
print("랜덤 날짜:", data['Date'][8500])
print("원래 라벨 값 확인하기:", data['Label'][8500])
print("단어 보기:", data['preprocessing'][8500])

'2015-04-17'

In [0]:
# 토큰화 된 1개의 문서를 이중리스트로 만들기
x_predict_data_2 = []
x_predict_data_2.append(x_predict_data)

In [0]:
x_test = list(convert_token_to_idx(x_predict_data_2))
x_test = add_padding(x_test, max_len)
x_test = convert_to_long_variable(x_test)

In [0]:
scores = model(x_test)
predict = F.softmax(scores, dim=1).argmax(dim = 1)

In [0]:
predict # 라벨 예측값

tensor([0])

In [0]:
predict_ls = predict.numpy()
predict_ls[0]

0

In [0]:
F.softmax(scores, dim=1)
# 각각 0, 1, 2 값이 나올 확률

tensor([[0.8676, 0.0010, 0.1314]], grad_fn=<SoftmaxBackward>)