In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from konlpy.tag import Okt
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import time
import torch.optim as optim
import sys

In [2]:
# Clone the e9t/nsmc repository into ./nsmc; the ratings files read below live there.
# When ./nsmc already exists, git prints a "fatal: destination path ... already exists"
# message and leaves the existing directory as it is.
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [3]:
# Load the complete ratings file from the cloned ./nsmc directory.
all_data = pd.read_table('./nsmc/ratings.txt')

In [4]:
# ALL: clean the complete corpus.
# The result of every step is assigned back instead of calling an inplace method
# on a column selection: `all_data['document'].replace(..., inplace=True)` works
# on an intermediate Series and is not guaranteed to update `all_data` (it does
# not under pandas copy-on-write).
all_data = (
    all_data
    .drop_duplicates(subset=['document'])  # keep the first row of each duplicated document
    .assign(document=lambda frame: (
        frame['document']
        .str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)  # delete every character outside the listed Hangul ranges and space
        .str.replace('^ +', "", regex=True)  # strip leading spaces
        .replace('', np.nan)  # documents that became empty turn into missing values
    ))
    .dropna(how='any')  # drop rows that contain any missing value
)
print('전처리 후 테스트용 샘플의 개수 :',len(all_data))

전처리 후 테스트용 샘플의 개수 : 193518


In [5]:
# Display the cleaned frame (the notebook renders a truncated preview of its first and last rows).
all_data

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,디자인을 배우는 학생으로 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업...,1
2,4655635,폴리스스토리 시리즈는 부터 뉴까지 버릴께 하나도 없음 최고,1
3,9251303,와 연기가 진짜 개쩔구나 지루할거라고 생각했는데 몰입해서 봤다 그래 이런게 진짜 영화지,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화,1
...,...,...,...
199995,8963373,포켓 몬스터 짜가 ㅡㅡ,0
199996,3302770,쓰레기,0
199997,5458175,완전 사이코영화 마지막은 더욱더 이 영화의질을 떨어트린다,0
199998,6908648,왜난 재미없었지 ㅠㅠ 라따뚜이 보고나서 스머프 봐서 그런가 ㅋㅋ,0


In [6]:
# Tokens that are filtered out after tokenization, both when the corpus is
# tokenized and inside one_hot_encoding.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [7]:
okt = Okt()
# Tokenize every cleaned document (with stem=True) and keep only the tokens that
# are not listed in `stopwords`; tqdm reports progress over the documents.
X_data = [
    [token for token in okt.morphs(document, stem=True) if token not in stopwords]
    for document in tqdm(all_data['document'])
]

100%|██████████| 193518/193518 [05:37<00:00, 573.44it/s]


In [8]:
def Make_Word2Index(ctxt : list):
    """Build a word-to-index vocabulary from tokenized texts.

    Args:
        ctxt: list of token lists, one list of words per text.

    Returns:
        A tuple (word2index, max_len). `word2index` maps every distinct word to
        a unique integer in range(len(word2index)), numbered in sorted word
        order. `max_len` is the length of the longest token list, or 0 when
        `ctxt` is empty.
    """
    vocabulary = set()
    max_len = 0
    for text in ctxt:
        max_len = max(max_len, len(text))
        vocabulary.update(text)

    # Sort before numbering. Iterating a set of strings directly yields an order
    # that can differ between interpreter runs (string hash randomization), which
    # would silently remap every index after a kernel restart.
    word2index = {word: index for index, word in enumerate(sorted(vocabulary))}

    return word2index, max_len

In [9]:
# Build the vocabulary and the longest token count from the tokenized corpus.
word2index, max_len = Make_Word2Index(X_data)

In [10]:
# Number of distinct words in the vocabulary.
len(word2index)

49645

In [11]:
def one_hot_encoding(text, word2index): # -> input = text, word2index from above function
    """Tokenize `text` and encode every remaining token as a one-hot vector.

    Relies on the notebook-level `okt` tokenizer and `stopwords` list.

    Args:
        text: raw sentence to encode.
        word2index: mapping from word to integer index.

    Returns:
        A list with one vector per non-stopword token, in token order. Each
        vector has len(word2index) + 1 entries; the extra last slot is set for
        words that are missing from `word2index`.
    """
    unknown_index = len(word2index)
    vector_size = unknown_index + 1

    tokenized_sentence = okt.morphs(text, stem=True)  # tokenize
    kept_words = [word for word in tokenized_sentence if word not in stopwords]  # remove stopwords

    one_hot_encode = []
    for word in kept_words:
        one_hot_vector = [0] * vector_size
        one_hot_vector[word2index.get(word, unknown_index)] = 1
        # Append for known and unknown words alike. The append used to sit in the
        # `else` branch of a try/except, so unknown-word vectors were built and
        # then thrown away.
        one_hot_encode.append(one_hot_vector)

    return one_hot_encode

In [12]:
def zero_padding(e : list):
    """Pad an encoded sequence in place with zero vectors up to `max_len`.

    Reads the notebook-level `input_size` (length of one zero vector) and
    `max_len` (target number of vectors).

    Args:
        e: list of vectors; it is extended in place.

    Returns:
        The same list object `e`. Sequences that already hold `max_len` or more
        vectors are returned unchanged. Every appended entry is the same
        zero-vector object.
    """
    zero_vector = [0] * input_size
    missing = max_len - len(e)
    # A non-positive count multiplies to an empty list, so nothing is appended.
    e.extend([zero_vector] * missing)
    return e

In [13]:
""" Make LSTM Cell (class) """

### Use the above process to create a class

class LSTM_Cell(nn.Module):
  """One LSTM time step built from two linear projections.

  `x2h` projects the input and `h2h` projects the previous hidden state; each
  produces 4 * h_dim values that are summed and split into four gates.
  """

  def __init__(self, input_size, h_dim):
    super().__init__()  # required so nn.Module can register the layers below
    self.x2h = nn.Linear(input_size, 4 * h_dim)
    self.h2h = nn.Linear(h_dim, 4 * h_dim)

  def forward(self, x, hidden_state, cell_state):
    """Advance the cell by one step.

    Args:
      x: input for this step; the gates are split along dim 2, so a
        3-dimensional tensor is expected.
      hidden_state: previous hidden state.
      cell_state: previous cell state.

    Returns:
      (new_cell_state, new_hidden_state) -- note the cell state comes FIRST.
    """
    gate_preactivations = self.x2h(x) + self.h2h(hidden_state)
    # Chunk order along dim 2: input, forget, cell candidate, output.
    in_pre, forget_pre, candidate_pre, out_pre = torch.chunk(gate_preactivations, 4, dim=2)

    write = torch.sigmoid(in_pre)
    keep = torch.sigmoid(forget_pre)
    candidate = torch.tanh(candidate_pre)
    expose = torch.sigmoid(out_pre)

    new_cell_state = cell_state * keep + write * candidate
    new_hidden_state = expose * torch.tanh(new_cell_state)

    return new_cell_state, new_hidden_state

In [14]:
""" Make LSTM Cell with LSTM Cell (class) """

class LSTM(nn.Module):
  """Unrolls an LSTM_Cell over a sequence and projects every hidden state.

  `fc` maps each step's hidden state (h_dim values) to `output_size` values.
  """

  def __init__(self, input_size, h_dim, output_size):
    super(LSTM,self).__init__()
    self.h_dim = h_dim
    self.lstm = LSTM_Cell(input_size, h_dim)
    self.fc = nn.Linear(h_dim, output_size)

  def forward(self, x, hidden_state = None, cell_state = None):
    """Run the cell over every step of `x`.

    Args:
      x: input tensor indexed as x[:, step, :] (batch first, then time step).
      hidden_state: optional initial hidden state; zeros of shape
        (batch, 1, h_dim) when omitted.
      cell_state: optional initial cell state; zeros of shape
        (batch, 1, h_dim) when omitted.

    Returns:
      (result, hidden_state, cell_state): `result` is `fc` applied to the
      hidden states of all steps concatenated along dim 1; the two states are
      the ones left after the last step.
    """
    # Create missing initial states on the same device/dtype as the input
    # instead of hard-coding .cuda(), so the module also runs on CPU.
    if hidden_state is None:
      hidden_state = torch.zeros((x.size(0), 1, self.h_dim), device=x.device, dtype=x.dtype)
    if cell_state is None:
      cell_state = torch.zeros((x.size(0), 1, self.h_dim), device=x.device, dtype=x.dtype)

    outputs = list() ## collects the hidden state of every step

    ## Use the for statement to proceed with the process for all sequence
    for seq in range(x.size(1)) :
      # LSTM_Cell.forward returns (new_cell_state, new_hidden_state), cell first.
      # Unpacking it as (hidden, cell) swapped the two states on every step.
      cell_state, hidden_state = self.lstm(x[:, seq, :].unsqueeze(1), hidden_state, cell_state)
      outputs.append(hidden_state)

    outputs = torch.cat(outputs, dim=1)
    result = self.fc(outputs)

    return result, hidden_state, cell_state

In [15]:
### Default Setting
h_dim = 128  # size of the LSTM hidden and cell states
input_size = len(word2index)+1  # width of one one-hot vector: the vocabulary plus one extra slot (one_hot_encoding uses index len(word2index) for unknown words)
output_size = 2  # number of values the final linear layer produces per step

In [16]:
class Mymodel(nn.Module):
    """Sequence classifier: wraps a recurrent network and computes its loss.

    The loss function is read from the notebook-level `criterion`, which must
    be defined before `forward` is called.
    """

    def __init__(self, LSTM, input_size, h_dim, output_size):
        super(Mymodel, self).__init__()

        # `LSTM` is the class (or any callable) that builds the recurrent network.
        self.lstm = LSTM(input_size, h_dim, output_size)

    def forward(self, sample, labels, hidden_state = None, cell_state = None):
        """Score a batch and compute its loss.

        Args:
            sample: input batch handed to the recurrent network.
            labels: targets passed to `criterion` together with the output.
            hidden_state: optional initial hidden state for the network.
            cell_state: optional initial cell state for the network.

        Returns:
            (loss, output), where `output` is the network output at the last
            time step (index -1 along dim 1).
        """
        # Forward the optional initial states; they used to be accepted and then ignored.
        outputs, hidden_state, cell_state = self.lstm(sample, hidden_state, cell_state)
        # NOTE(review): for sequences padded at the end, the last step comes
        # after the padding vectors -- confirm this is the intended read-out.
        output = outputs[:, -1]
        loss = criterion(output, labels)

        return loss, output

In [17]:
# Loss function used inside Mymodel.forward.
criterion = nn.CrossEntropyLoss()
# Build the classifier and move its parameters to the GPU.
model = Mymodel(LSTM, input_size, h_dim, output_size).cuda()
# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
def clean_reviews(df):
    """Return a cleaned copy of a ratings frame; `df` itself is not modified.

    Steps, in order: drop rows whose 'document' duplicates an earlier row,
    delete every character outside the listed Hangul ranges and space, strip
    leading spaces, turn documents that became empty into NaN, and drop every
    row that contains a missing value.

    Results are assigned back rather than produced with inplace methods on a
    column selection: `df['document'].replace(..., inplace=True)` works on an
    intermediate Series and is not guaranteed to update `df` (it does not
    under pandas copy-on-write).
    """
    return (
        df
        .drop_duplicates(subset=['document'])
        .assign(document=lambda frame: (
            frame['document']
            .str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
            .str.replace('^ +', "", regex=True)
            .replace('', np.nan)
        ))
        .dropna(how='any')
    )


# quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary text while parsing.
train_df = pd.read_csv('./nsmc/ratings_train.txt', sep='\t', quoting=3)
test_df = pd.read_csv('./nsmc/ratings_test.txt', sep='\t', quoting=3)

#train_df
train_df = clean_reviews(train_df)
print('전처리 후 테스트용 샘플의 개수 :',len(train_df))

#test_df
test_df = clean_reviews(test_df)
print('전처리 후 테스트용 샘플의 개수 :',len(test_df))

전처리 후 테스트용 샘플의 개수 : 145393
전처리 후 테스트용 샘플의 개수 : 48852


In [19]:
class MyDataset(Dataset):
    """Dataset view over a DataFrame that yields (text, label) pairs.

    Values are read by position: column 1 is returned as the text and column 2
    as the label.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (text, label) pair stored at row position `idx`."""
        frame = self.df
        return frame.iloc[idx, 1], frame.iloc[idx, 2]

In [20]:
def train(epoch):
    """Train the notebook-level `model` for one pass over `train_df`.

    Uses the globals `model`, `optimizer`, `train_df` and `word2index`, and
    moves every batch to the GPU with .cuda(). After each iteration the loss
    and accuracy of the current batch are rewritten on one console line.

    Args:
        epoch: epoch number; only used in the progress line.
    """
    model.train()
    start_time = time.time()

    train_dataset = MyDataset(train_df)
    train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True, num_workers=8)
    num_batches = len(train_loader)

    for itr, (text, label) in enumerate(train_loader, start=1):
        optimizer.zero_grad()

        # Encode each review as a sequence of one-hot vectors padded to a common length.
        encoded_list = [one_hot_encoding(t, word2index) for t in text]
        padded_list = [zero_padding(e) for e in encoded_list]
        # Build the float tensor directly; this avoids an int64 intermediate of the same size.
        sample = torch.tensor(padded_list, dtype=torch.float32).cuda()
        # The DataLoader already collated the labels into a tensor, so it is
        # moved to the GPU as is (torch.tensor(tensor) only copies and warns).
        labels = label.cuda()

        loss, logits = model(sample, labels)
        loss.backward()
        optimizer.step()

        # argmax over the logits picks the same class as argmax over their softmax.
        pred = torch.argmax(logits, dim=1)
        batch_accuracy = pred.eq(labels).sum().item() / len(labels)

        # With the notebook's default nn.CrossEntropyLoss() the loss is already
        # the mean over the batch, so it is reported as is; it used to be
        # divided by the batch size a second time.
        sys.stdout.write('\r epoch: %d, [iter: %d / all %d], Train Loss : %f, Accyracy : %f' \
                          % (epoch, itr, num_batches, loss.item(), batch_accuracy))
        sys.stdout.flush()

    now = time.gmtime(time.time() - start_time)
    print('{} hours {} mins {} secs for training'.format(now.tm_hour, now.tm_min, now.tm_sec))

In [21]:
def test():
    """Evaluate the notebook-level `model` on `test_df`.

    Uses the globals `model`, `test_df` and `word2index`, and moves every batch
    to the GPU with .cuda(). Prints the average per-sample loss, the accuracy
    over the whole test set, and the elapsed time.
    """
    model.eval()
    start_time = time.time()

    total_loss = 0
    total_len = 0
    total_correct = 0

    test_dataset = MyDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=40, shuffle=True, num_workers=8)

    # Evaluation needs no gradients; without no_grad every batch would build
    # an autograd graph that is never used.
    with torch.no_grad():
        for text, label in tqdm(test_loader):

            # Encode each review as a sequence of one-hot vectors padded to a common length.
            encoded_list = [one_hot_encoding(t, word2index) for t in text]
            padded_list = [zero_padding(e) for e in encoded_list]
            # Build the float tensor directly; this avoids an int64 intermediate of the same size.
            sample = torch.tensor(padded_list, dtype=torch.float32).cuda()
            # The DataLoader already collated the labels into a tensor.
            labels = label.cuda()

            loss, logits = model(sample, labels)

            # argmax over the logits picks the same class as argmax over their softmax.
            pred = torch.argmax(logits, dim=1)
            batch_len = len(labels)
            total_correct += pred.eq(labels).sum().item()
            total_len += batch_len
            # With the notebook's default nn.CrossEntropyLoss() the loss is a
            # batch mean; weight it by the batch size so that dividing by the
            # sample count below yields the per-sample average.
            total_loss += loss.item() * batch_len

    print('Test Loss: {:.4f}, Accuracy: {:.3f}'.format(total_loss/total_len, total_correct/total_len))
    now = time.gmtime(time.time() - start_time)
    # This timer measures evaluation, so the message no longer says "training".
    print('{} hours {} mins {} secs for testing'.format(now.tm_hour, now.tm_min, now.tm_sec))

In [22]:
start_epoch = 1  # number of the first epoch run by the training loop
epochs = 10  # epoch limit used by the training loop

In [None]:
# Train for one epoch, then evaluate on the test set, for every epoch number
# from start_epoch through epochs. range() excludes its stop value, so the stop
# is epochs + 1; range(start_epoch, epochs) ended one epoch early.
for epoch in range(start_epoch, epochs + 1):

    train(epoch)
    test()