In [1]:
# Load FrameWork and Layer module

import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import nltk
from tqdm import tqdm
from konlpy.tag import Okt
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import sys
import time

In [2]:
# Load the tab-separated sentence pairs without a header row, so the columns
# are numbered 0, 1, 2 (English sentence, Korean sentence, attribution text).
all_data = pd.read_table('./kor-eng/kor.txt', header=None)

In [3]:
all_data

Unnamed: 0,0,1,2
0,Go.,가.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,안녕.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Run!,뛰어!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
3,Run.,뛰어.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
4,Who?,누구?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
...,...,...,...
3848,Science fiction has undoubtedly been the inspi...,공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감을 주었어.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
3849,I started a new blog. I'll do my best not to b...,난 블로그를 시작했어. 블로그를 초반에만 반짝 많이 하다가 관두는 사람처럼은 되지 ...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
3850,I think it's a shame that some foreign languag...,몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으면서 대학을 나올 수 있었...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
3851,If someone who doesn't know your background sa...,만일 네 사정도 잘 모르는 사람이 원어민 같다고 말한다면 그건 그 사람이 네가 원어...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [4]:
#KOR
# Clean the Korean sentences stored in column 1.
all_data.drop_duplicates(subset = [1], inplace=True) # drop rows whose Korean sentence is a duplicate
all_data[1] = all_data[1].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True) # keep only Hangul characters and spaces
all_data[1] = all_data[1].str.replace('^ +', "", regex=True) # strip leading spaces
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place call acts on a temporary Series and
# does not reach the frame when pandas Copy-on-Write is active, which would
# leave the emptied sentences in place. '' is matched literally here.
all_data[1] = all_data[1].replace('', np.nan) # mark sentences that became empty as missing
all_data = all_data.dropna(how='any') # drop rows that contain a missing value
print('전처리 후 테스트용 샘플의 개수 :',len(all_data))

전처리 후 테스트용 샘플의 개수 : 3763


In [5]:
all_data[1]

0                                                       가
1                                                      안녕
2                                                      뛰어
3                                                      뛰어
4                                                      누구
                              ...                        
3848          공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감을 주었어
3849    난 블로그를 시작했어 블로그를 초반에만 반짝 많이 하다가 관두는 사람처럼은 되지 않...
3850    몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으면서 대학을 나올 수 있었...
3851    만일 네 사정도 잘 모르는 사람이 원어민 같다고 말한다면 그건 그 사람이 네가 원어...
3852    의심의 여지 없이 세상에는 어떤 남자이든 정확히 딱 알맞는 여자와 결혼하거나 그 반...
Name: 1, Length: 3763, dtype: object

In [6]:
#ENG
# Clean the English sentences stored in column 0.
all_data.drop_duplicates(subset = [0], inplace=True) # drop rows whose English sentence is a duplicate
# Keep only ASCII letters and spaces; punctuation (including apostrophes) and digits are removed.
all_data[0] = all_data[0].str.replace("[^a-z A-Z]","", regex=True)
all_data[0] = all_data[0].str.replace('^ +', "", regex=True) # strip leading spaces
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place call acts on a temporary Series and
# does not reach the frame when pandas Copy-on-Write is active, which would
# leave the emptied sentences in place. '' is matched literally here.
all_data[0] = all_data[0].replace('', np.nan) # mark sentences that became empty as missing
all_data = all_data.dropna(how='any') # drop rows that contain a missing value
print('전처리 후 테스트용 샘플의 개수 :',len(all_data))

전처리 후 테스트용 샘플의 개수 : 3470


In [7]:
all_data[0]

0                                                      Go
1                                                      Hi
2                                                     Run
3                                                     Run
4                                                     Who
                              ...                        
3848    Science fiction has undoubtedly been the inspi...
3849    I started a new blog Ill do my best not to be ...
3850    I think its a shame that some foreign language...
3851    If someone who doesnt know your background say...
3852    Doubtless there exists in this world precisely...
Name: 0, Length: 3470, dtype: object

In [8]:
# Tokenize every English sentence (column 0) into a list of word tokens.
X_eng_data = [list(word_tokenize(eng_sentence)) for eng_sentence in tqdm(all_data[0])]

100%|██████████| 3470/3470 [00:00<00:00, 15131.97it/s]


In [9]:
# Shared Okt analyzer; the tokenize() helper defined later reuses this instance.
okt = Okt()
# Split every Korean sentence (column 1) into stemmed morphemes.
X_kor_data = [list(okt.morphs(kor_sentence, stem=True)) for kor_sentence in tqdm(all_data[1])]

100%|██████████| 3470/3470 [00:05<00:00, 649.99it/s] 


In [10]:
def Make_Word2Index(ctxt : list):
    """Build a token -> index vocabulary from tokenized sentences.

    Indices 0 and 1 are reserved for the '<sos>' and '<eos>' markers; every
    distinct token found in ``ctxt`` gets an index starting at 2, numbered in
    order of first appearance. Numbering by first appearance (rather than by
    iterating a ``set``) makes the mapping identical on every run, so indices
    stay valid across kernel restarts.

    Args:
        ctxt: List of sentences, each sentence being a list of tokens.

    Returns:
        Tuple ``(word2index, max_len)`` where ``word2index`` is the dict
        described above and ``max_len`` is the number of tokens in the longest
        sentence (0 when ``ctxt`` is empty).
    """
    max_len = max((len(text) for text in ctxt), default=0)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    vocabulary = dict.fromkeys(word for text in ctxt for word in text)
    word2index = {word : index+2 for index, word in enumerate(vocabulary)}

    # The special markers always own indices 0 and 1, even if the same strings
    # happen to occur as tokens in the corpus.
    word2index['<sos>'] = 0
    word2index['<eos>'] = 1

    return word2index, max_len

In [11]:
# Build one vocabulary shared by both languages: the Korean and English token
# lists are concatenated, so max_len is the longest sentence across either side.
word2index, max_len = Make_Word2Index(X_kor_data+X_eng_data)

In [12]:
def tokenize(text, word2index): # text: raw sentence, word2index: vocabulary from Make_Word2Index
    """Turn a sentence into vocabulary indices via the shared Okt analyzer.

    The sentence is split into stemmed morphemes with ``okt.morphs`` and each
    morpheme is looked up in ``word2index``; a morpheme missing from the
    vocabulary raises ``KeyError``.
    """
    morphemes = okt.morphs(text, stem=True)
    return [word2index[morpheme] for morpheme in morphemes]

In [13]:
def tokenize_eng(text, word2index): # text: raw sentence, word2index: vocabulary from Make_Word2Index
    """Turn a sentence into vocabulary indices using ``word_tokenize``.

    Each token produced by ``word_tokenize`` is looked up in ``word2index``;
    a token missing from the vocabulary raises ``KeyError``.
    """
    words = word_tokenize(text)
    return [word2index[token] for token in words]

In [14]:
""" Make Encoder with LSTM (class) """

class Seq2Seq_Encoder_Attention(nn.Module):
  """Embeds a batch of token indices and encodes it with a batch-first LSTM.

  Args:
    input_size: Number of rows in the embedding table.
    h_dim: LSTM hidden size.
    emb_dim: Embedding dimension (also the LSTM input size).
  """
  def __init__(self, input_size, h_dim, emb_dim):
    super().__init__()

    self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=emb_dim)
    self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=True)
    self.drop = nn.Dropout()

  def forward(self, x):
    """Return the LSTM's final ``(hidden_state, cell_state)`` for ``x``.

    The per-step LSTM outputs are discarded; only the final states are returned.
    """
    embedded = self.embedding(x)
    _, final_states = self.lstm(self.drop(embedded))
    hidden_state, cell_state = final_states

    return hidden_state, cell_state

In [15]:
""" Make Decoder with LSTM (class) """

class Seq2Seq_Decoder_Attention(nn.Module):
  """Single-step LSTM decoder with a linear projection onto the vocabulary.

  Args:
    output_size: Number of rows in the embedding table and width of the logits.
    h_dim: LSTM hidden size.
    emb_dim: Embedding dimension (also the LSTM input size).
  """
  def __init__(self, output_size, h_dim, emb_dim):
    super().__init__()

    self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=emb_dim)
    self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=True)
    self.drop = nn.Dropout()
    self.fc = nn.Linear(in_features=h_dim, out_features=output_size)

  def forward(self, x, hidden_state, cell_state):
    """Advance the decoder by one step.

    ``x`` holds one token index per batch element; a length-1 sequence axis is
    inserted after embedding so the batch-first LSTM sees a single time step.
    Returns the projected LSTM output together with the updated states.
    """
    step_input = self.drop(self.embedding(x)).unsqueeze(1)
    previous_states = (hidden_state, cell_state)

    lstm_out, (hidden_state, cell_state) = self.lstm(step_input, previous_states)
    logits = self.fc(lstm_out)

    return logits, hidden_state, cell_state

In [16]:
""" Make Seq2Seq with Attention(LSTM) class """

class Seq2Seq_Attention(nn.Module):
    """LSTM encoder-decoder with dot-product attention over the encoder outputs.

    At every decoding step the decoder's hidden state is scored against each
    encoder time step, the softmax-weighted sum of encoder outputs (the
    context) is concatenated with the hidden state, and ``self.fc`` maps that
    pair to vocabulary logits.

    Args:
        input_size: Source vocabulary size (encoder embedding rows).
        output_size: Target vocabulary size (decoder embedding rows and logit width).
        h_dim: Hidden size of both LSTMs.
        emb_dim: Embedding dimension of both embedding tables.
        teacher_forcing: Probability, per decoding step, of feeding the
            ground-truth token (instead of the model's own prediction) as the
            next decoder input.
    """
    def __init__(self, input_size, output_size, h_dim, emb_dim, teacher_forcing=0.5):
        super(Seq2Seq_Attention,self).__init__()

        self.output_size = output_size
        self.encoder = Seq2Seq_Encoder_Attention(input_size, h_dim, emb_dim)
        self.decoder = Seq2Seq_Decoder_Attention(output_size, h_dim, emb_dim)
        self.teacher_forcing = teacher_forcing
        # Projects [decoder hidden state ; attention context] to vocabulary logits.
        self.fc = nn.Linear(2*h_dim, output_size)

    def forward(self, src, tgt):
        """Decode ``tgt`` step by step while attending over the encoding of ``src``.

        Args:
            src: Source token indices, shape (batch, src_len).
            tgt: Target token indices, shape (batch, tgt_len); ``tgt[:, 0]`` is
                the first decoder input.

        Returns:
            Logits of shape (batch, tgt_len - 1, output_size). The buffer is
            allocated with ``torch.zeros`` on the default device (as before),
            independent of where the model's parameters live.
        """
        # The encoder's forward() only hands back its final states, but
        # attention needs one vector per source position. Its layers are
        # therefore run here directly so the per-step outputs are kept.
        emb = self.encoder.drop(self.encoder.embedding(src))
        encoder_outputs, (hidden_state, cell_state) = self.encoder.lstm(emb)

        tgt_len = tgt.size(1)
        result = torch.zeros((tgt.size(0), tgt_len-1, self.output_size))
        de_input = tgt[:,0]

        for i in range(1, tgt_len):
            # The decoder's own projection is ignored; logits come from
            # self.fc applied to the hidden state joined with the context.
            _, hidden_state, cell_state = self.decoder(de_input, hidden_state, cell_state)
            attn_score = self.attention_scoring(encoder_outputs, hidden_state)
            weighted_sum = torch.sum(encoder_outputs * attn_score, dim=1)
            hidden_state_concat = torch.cat([hidden_state[-1], weighted_sum], dim=1)
            output = self.fc(hidden_state_concat)
            result[:,i-1,:] = output
            top1 = output.argmax(1)

            # With probability `teacher_forcing` feed the ground-truth token,
            # otherwise the model's own prediction (one index per batch row,
            # already on the model's device).
            if random.random() < self.teacher_forcing:
                de_input = tgt[:,i]
            else:
                de_input = top1
        return result

    def attention_scoring(self, encoder_hidden_state, hidden_state):
        """Dot-product attention weights over the source positions.

        Args:
            encoder_hidden_state: Encoder vectors, shape (batch, src_len, h_dim).
            hidden_state: Decoder LSTM hidden state, shape
                (num_layers, batch, h_dim); the last layer is used as the query.

        Returns:
            Weights of shape (batch, src_len, 1) that sum to 1 along dim 1.
        """
        query = hidden_state[-1].unsqueeze(2)
        attn_score = encoder_hidden_state.matmul(query)
        attn_score = torch.softmax(attn_score, dim =1)
        return attn_score

In [17]:
### Default settings (model hyper-parameters)
h_dim = 128  # LSTM hidden size
input_size = len(word2index)  # source vocabulary size = number of entries in word2index
output_size = len(word2index)  # target vocabulary size; same shared word2index
emb_dim = 64  # embedding dimension

In [18]:
# Pass the target vocabulary size explicitly (it was given as `input_size`
# before, which only worked because both sizes are equal).
seq2seq_attn = Seq2Seq_Attention(input_size,output_size,h_dim,emb_dim)

In [19]:
# Use the GPU when one is available; otherwise keep the model on the CPU
# instead of failing on an unconditional .cuda() call.
model = seq2seq_attn.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [20]:
# Token-level cross entropy; the training loop transposes the model output to
# (batch, vocab, steps) before calling it with integer labels.
criterion = nn.CrossEntropyLoss()

In [21]:
class MyDataset(Dataset):
    """Dataset view over a DataFrame that yields ``(text, label)`` pairs.

    ``text`` is read from positional column 1 and ``label`` from positional
    column 0 of the wrapped frame.
    """

    # Positional column indices used by __getitem__.
    TEXT_COLUMN = 1
    LABEL_COLUMN = 0

    def __init__(self, df):
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at row position ``idx``."""
        rows = self.df.iloc
        source_text = rows[idx, self.TEXT_COLUMN]
        target_text = rows[idx, self.LABEL_COLUMN]
        return source_text, target_text

In [22]:
def train(epoch):
    """Run one training epoch over ``all_data`` and print the running loss.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``all_data`` and ``word2index``. Sentences are fed one pair at a time
    (batch size 1), so no padding is needed. Every 20 iterations the mean loss
    of the last 20 batches is written to stdout, and the elapsed time is
    printed at the end.

    Args:
        epoch: Epoch number; only used in the progress line.
    """
    model.train()
    start_time = time.time()
    # Follow the model's device instead of assuming CUDA is available.
    device = next(model.parameters()).device

    total_loss = 0
    total_len = 0
    itr = 1

    train_dataset = MyDataset(all_data)
    # num_workers=0: the dataset only indexes an in-memory DataFrame, so worker
    # processes add start-up cost every epoch without speeding anything up, and
    # a Dataset class defined inside a notebook cannot be sent to workers on
    # platforms that spawn processes.
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)

    for text, label in train_loader:
        optimizer.zero_grad()

        encoded_src = [tokenize(t, word2index) for t in text]
        encoded_tgt = [tokenize_eng(t_, word2index) for t_ in label]

        # Decoder input is <sos> + tokens + <eos>; the labels are the same
        # sequence shifted left by one (tokens + <eos>).
        target = [e_t + [1] for e_t in encoded_tgt]
        decoder_input = [[0]+e_t_+[1] for e_t_ in encoded_tgt]

        encoded_src = torch.LongTensor(encoded_src).to(device)
        decoder_input = torch.LongTensor(decoder_input).to(device)

        outputs = model(encoded_src, decoder_input)
        # Put the labels wherever the logits ended up, so the loss never mixes
        # devices regardless of where the model allocates its output.
        target = torch.LongTensor(target).to(outputs.device)
        loss = criterion(outputs.transpose(1,2), target)

        total_len += len(target)  # number of sentence pairs in this batch
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        if itr%20 == 0 :
            sys.stdout.write('\r epoch: %d, [iter: %d / all %d], Train Loss : %f' \
                          % (epoch, itr, len(train_loader), total_loss/total_len))
            # Restart the running average for the next 20 iterations.
            total_loss = 0
            total_len = 0
            sys.stdout.flush()

        itr+=1
    now = time.gmtime(time.time() - start_time)
    print('\n')
    print('{} hours {} mins {} secs for training'.format(now.tm_hour, now.tm_min, now.tm_sec))

In [23]:
start_epoch = 1  # number given to the first training epoch
epochs = 10  # epoch limit read by the training loop
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr = 1e-3)

In [None]:
# range() excludes its upper bound, so add 1 to actually run epoch number
# `epochs` as well (start_epoch .. epochs inclusive).
for epoch in range(start_epoch, epochs + 1):
    train(epoch)