In [100]:
import random
import torch
import torch.nn as nn
import torch.optim as optim

In [101]:
torch.manual_seed(0)

<torch._C.Generator at 0x1ed80265490>

In [102]:
#use cuda
device='cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [103]:
#raw data
raw=["I feel hungry.	나는 배가 고프다.",
       "Pytorch is very easy.	파이토치는 매우 쉽다.",
       "Pytorch is a framework for deep learning.	파이토치는 딥러닝을 위한 프레임워크이다.",
       "Pytorch is very clear to use.	파이토치는 사용하기 매우 직관적이다."]

In [104]:
SOS_token=0 #start of sentence
EOS_token=1 #end of sentence

In [105]:
source_max_length=10
target_max_length=12

In [106]:
class Vocab:
    def __init__ (self):
        self.vocab2index={"<SOS>":SOS_token,"<EOS>":EOS_token}
        self.index2vocab={SOS_token:"<SOS>",EOS_token:"<EOS>"}
        self.vocab_count={}
        self.n_vocab=len(self.vocab2index)
        
    def add_vocab(self,sentence):
        for word in sentence.split(" "):
            if word not in self.vocab2index:
                self.vocab2index[word]=self.n_vocab
                self.vocab_count[word]=1
                self.index2vocab[self.n_vocab]=word
                self.n_vocab+=1
            else:
                self.vocab_count[word]+=1

In [107]:
def filter_pair(pair,source_max_length,target_max_length):
    return len(pair[0].split(" "))<source_max_length and len(pair[1].split(" ")) <target_max_length

In [108]:
def preprocess(corpus,source_max_length,target_max_length):
    
    print("Reading corpus...")
    
    pairs=[]
    
    for line in corpus:
        pairs.append([s for s in line.strip().lower().split("\t")])
    print("Read {} sentence pairs".format(len(pairs)))
    
    pairs=[pair for pair in pairs if filter_pair(pair,source_max_length,target_max_length)]
    
    print("Trimmed to {} sentence pairs".format(len(pairs)))
    
    print("Counting words...")
    
    source_vocab=Vocab()
    target_vocab=Vocab()
    
    for pair in pairs:
        source_vocab.add_vocab(pair[0])
        target_vocab.add_vocab(pair[1])
        
    print("source vocab size = {}".format(source_vocab.n_vocab))
    print("target vocab size = {}".format(target_vocab.n_vocab))
    
    return pairs,source_vocab,target_vocab

In [109]:
load_pairs,load_source_vocab,load_target_vocab=preprocess(raw,source_max_length,target_max_length)

Reading corpus...
Read 4 sentence pairs
Trimmed to 4 sentence pairs
Counting words...
source vocab size = 17
target vocab size = 13


In [110]:
class Encoder(nn.Module):
    def __init__(self,input_size,hidden_size):
        super(Encoder,self).__init__()
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding(input_size,hidden_size)
        self.gru=nn.GRU(hidden_size,hidden_size)
        
    def forward(self,x,hidden):
        x=self.embedding(x).view(1,1,-1)
        x,hidden=self.gru(x,hidden)
        return x,hidden

In [111]:
class Decoder(nn.Module):
    def __init__(self,hidden_size,output_size):
        super(Decoder,self).__init__()
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding(output_size,hidden_size)
        self.gru=nn.GRU(hidden_size,hidden_size)
        self.out=nn.Linear(hidden_size,output_size)
        self.softmax=nn.LogSoftmax(dim=1)
        
    def forward(self,x,hidden):
        x=self.embedding(x).view(1,1,-1)
        x,hidden=self.gru(x,hidden)
        x=self.softmax(self.out(x[0]))
        return x,hidden

In [112]:
enc_hidden_size=16
dec_hidden_size=enc_hidden_size

In [113]:
encoder=Encoder(load_source_vocab.n_vocab,enc_hidden_size).to(device)
decoder=Decoder(dec_hidden_size,load_target_vocab.n_vocab).to(device)

In [114]:
def tensorize(vocab,sentence):
    indexes=[vocab.vocab2index[word] for word in sentence.split(" ")]
    indexes.append(vocab.vocab2index["<EOS>"])
    return torch.Tensor(indexes).long().to(device).view(-1,1)

In [115]:
epochs=5000
learning_rate=0.01

In [116]:
def train(pairs,source_vocab,target_vocab,encoder,decoder,epochs,learning_rate):
    
    loss_total=0
    
    encoder_optimizer=optim.SGD(encoder.parameters(),lr=learning_rate)
    decoder_optimizer=optim.SGD(decoder.parameters(),lr=learning_rate)
    
    training_batch=[random.choice(pairs) for _ in range(epochs)]
    training_source=[tensorize(source_vocab,pair[0]) for pair in training_batch]
    training_target=[tensorize(target_vocab,pair[1]) for pair in training_batch]
    
    criterion=nn.NLLLoss()
    
    for epoch in range(1,epochs+1):
        
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        source_tensor=training_source[epoch-1]
        target_tensor=training_target[epoch-1]
        
        source_length=source_tensor.size(0)
        target_length=target_tensor.size(0)
        
        encoder_hidden=torch.zeros([1,1,encoder.hidden_size]).to(device)
        
        loss=0
        
        for ei in range(source_length):
            _,encoder_hidden=encoder(source_tensor[ei],encoder_hidden)
            
        decoder_input=torch.Tensor([[SOS_token]]).long().to(device)
        decoder_hidden=encoder_hidden
        
        for di in range(target_length):
            decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden)
            loss+=criterion(decoder_output,target_tensor[di])
            
        loss.backward()
        
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        loss=loss.item()/target_length
        loss_total+=loss
        
        if epoch % 1000 ==0:
            loss_avg=loss_total/1000
            loss_total=0
            print("[{}-{}%] loss={:.4f}".format(epoch,epoch/epochs*100,loss_avg))

In [117]:
train(load_pairs,load_source_vocab,load_target_vocab,encoder,decoder,epochs,learning_rate)

[1000-20.0%] loss=1.2079
[2000-40.0%] loss=0.2322
[3000-60.0%] loss=0.0687
[4000-80.0%] loss=0.0363
[5000-100.0%] loss=0.0240


In [118]:
def evaluate(pairs,source_vocab,target_vocab,encoder,decoder,target_max_length):
    for pair in pairs:
        print(pair[0],"=",pair[1])
        
        source_tensor=tensorize(source_vocab,pair[0])
        source_length=source_tensor.size()[0]
        encoder_hidden=torch.zeros([1,1,encoder.hidden_size]).to(device)
        
        for ei in range(source_length):
            _,encoder_hidden=encoder(source_tensor[ei],encoder_hidden)
            
        decoder_input=torch.Tensor([[SOS_token]]).long().to(device)
        decoder_hidden=encoder_hidden
        decoded_words=[]
        
        for di in range(target_max_length):
            decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden)
            _,top_index=decoder_output.data.topk(1)
            if top_index.item() == EOS_token:
                decoded_words.append("<EOS>")
                break
            else:
                decoded_words.append(target_vocab.index2vocab[top_index.item()])
                
            decoder_input=top_index.squeeze().detach()
            
        predicted_words=decoded_words
        predicted_sentence=" ".join(predicted_words)
        
        print("->",predicted_sentence)
        print(" ")

In [119]:
evaluate(load_pairs,load_source_vocab,load_target_vocab,encoder,decoder,target_max_length)

i feel hungry. = 나는 배가 고프다.
-> 나는 나는 나는 나는 나는 나는 나는 나는 나는 나는 나는 나는
 
pytorch is very easy. = 파이토치는 매우 쉽다.
-> 파이토치는 매우 매우 파이토치는 매우 배가 고프다. <EOS>
 
pytorch is a framework for deep learning. = 파이토치는 딥러닝을 위한 프레임워크이다.
-> 파이토치는 딥러닝을 위한 위한 프레임워크이다. 프레임워크이다. 직관적이다. 매우 파이토치는 프레임워크이다. 쉽다. 직관적이다.
 
pytorch is very clear to use. = 파이토치는 사용하기 매우 직관적이다.
-> 파이토치는 사용하기 사용하기 사용하기 사용하기 사용하기 사용하기 사용하기 사용하기 사용하기 사용하기 사용하기
 
