In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import random
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
data_path = '../dataset/data.csv'
with open(data_path, 'r', encoding='utf-8-sig') as f:
    lines = f.read().split('\n')

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()    
    
for line in lines:
    input_text, target_text = line.split('\t')
    target_text = target_text 
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [3]:
target_texts[0]

'kongphanthahanpuenyai'

In [4]:
# +4 for padding, unknown, <start>, <end>
data_size, vocab_size = len(input_texts), len(input_characters)+1 
output_vocab_size = len(target_characters)+1 #+3 padding, <start>, <end>
print('There are %d lines and %d unique characters in your input data.' % (data_size, vocab_size))
maxlen_in = len( max(input_texts, key=len)) #max input length
maxlen_out = len( max(target_texts, key=len)) #max output length

There are 648241 lines and 92 unique characters in your input data.


In [5]:
input_characters = sorted(input_characters)
target_characters = sorted(target_characters) 
input_characters.insert(0,"<PAD>")#PADDING for input
input_characters.insert(1,"<UNK>")
input_characters.insert(2,"<start>")
input_characters.insert(3,"<end>")
target_characters.insert(0,"<PAD>")#PADDING for output
target_characters.insert(1,"<start>")
target_characters.insert(2,"<end>")

#Input
char_to_ix = { ch:i for i,ch in enumerate(input_characters) }
ix_to_char = { i:ch for i,ch in enumerate(input_characters) } #reverse dictionary
#Output
target_char_to_ix = { ch:i for i,ch in enumerate(target_characters) }
ix_to_target_char = { i:ch for i,ch in enumerate(target_characters) } #reverse dictionary

In [6]:
m=648241
Tx=maxlen_in
Ty=maxlen_out

In [7]:
maxlen_in

76

In [8]:
def prepare_sequence_in( input_text):
    idxs = []
    for w in input_text:
        if w in char_to_ix:
            idxs.append(char_to_ix[w])
        else:
            idxs.append(char_to_ix["<UNK>"])
    idxs.append(target_char_to_ix["<end>"])
    tensor = torch.tensor(idxs, dtype=torch.long)
    return tensor.to(device)

def prepare_sequence_target( input_text):
    idxs = [target_char_to_ix[w] for w in input_text]
    idxs.append(target_char_to_ix["<end>"])
    tensor = torch.tensor(idxs, dtype=torch.long)
    return tensor.to(device)




In [9]:
n_h = 256 #hidden dimensions for encoder 
n_s = 256 #hidden dimensions for decoder
emb_dim = 256 #character embedding size

In [10]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim, emb_dim):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, self.hidden_dim // 2, bidirectional = True)

    def forward(self, input_seq):
        self.hidden = self.init_hidden()
        embedded = self.char_emb(input_seq)
        output, self.hidden = self.lstm(embedded.view(len(embedded), 1, -1), self.hidden)
        return output, self.hidden

    def init_hidden(self):
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(2, 1, self.hidden_dim // 2,requires_grad=True).to(device),
    torch.zeros(2, 1, self.hidden_dim // 2,requires_grad=True).to(device))

class OneStepDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim, emb_dim):
        super(OneStepDecoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

        
    def forward(self, input_step,hidden,encoder_outputs):
    
        embedded = self.char_emb(input_step).view(1, 1, -1)
        output = embedded    
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, []#this empty list should be replaced with decoder attn score 

    def init_hidden(self):
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim ,requires_grad=True).to(device),
    torch.zeros(1, 1, self.hidden_dim,requires_grad=True).to(device))


In [11]:
#reference:https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#sphx-glr-intermediate-seq2seq-translation-tutorial-py
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=maxlen_out):
  
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    #encoder_outputs = torch.zeros(max_length, encoder.hidden_dim, device=device)

    loss = 0

    encoder_outputs, encoder_hidden = encoder(input_tensor)


    decoder_input = torch.tensor([target_char_to_ix["<start>"]], device=device)
    decoder_hidden = (encoder_hidden[0].reshape(1,1,encoder.hidden_dim),encoder_hidden[1].reshape(1,1,encoder.hidden_dim))

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
#           print(decoder_output.shape,target_tensor[di].shape,target_tensor[di])
            loss += criterion(decoder_output, target_tensor[di].view(1))
            decoder_input = target_tensor[di].view(1)  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden,decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di].view(1))
            if decoder_input.item() == "<end>":
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


In [12]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [13]:
encoder = Encoder(len(char_to_ix), n_h, emb_dim).to(device)
decoder = OneStepDecoder(len(char_to_ix), n_h, emb_dim).to(device)

In [14]:
learning_rate = 0.001
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [15]:
def trainIters(encoder, decoder, epoch_num, learning_rate,encoder_optimizer,decoder_optimizer,criterion,print_every=1000,):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    
    
    for epoch in range(epoch_num):
        count = 1
        for in_text,out_text in zip(input_texts,target_texts):
            input_tensor =  prepare_sequence_in(in_text)
            target_tensor = prepare_sequence_target(out_text)
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            #print(print_loss_total)
            if count % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, epoch+1 / epoch_num),
                                                 epoch, epoch+1 / epoch_num * 100, print_loss_avg))
            count += 1


In [16]:
trainIters(encoder, decoder, 10,learning_rate,encoder_optimizer,decoder_optimizer,criterion,print_every=1000)

0m 10s (- 1m 38s) (0 10%) 2.1072
0m 21s (- 3m 17s) (0 10%) 1.6195
0m 33s (- 4m 57s) (0 10%) 1.4371
0m 44s (- 6m 37s) (0 10%) 1.3479
0m 55s (- 8m 18s) (0 10%) 1.2746
1m 6s (- 9m 59s) (0 10%) 1.1861
1m 17s (- 11m 39s) (0 10%) 1.1398
1m 28s (- 13m 18s) (0 10%) 1.1049
1m 39s (- 14m 57s) (0 10%) 1.0615
1m 50s (- 16m 36s) (0 10%) 1.0075
2m 1s (- 18m 16s) (0 10%) 1.0261
2m 12s (- 19m 56s) (0 10%) 1.0252
2m 24s (- 21m 36s) (0 10%) 0.9867
2m 34s (- 23m 6s) (0 10%) 0.9261
2m 44s (- 24m 44s) (0 10%) 0.9086
2m 56s (- 26m 24s) (0 10%) 0.9072
3m 7s (- 28m 6s) (0 10%) 0.9098
3m 18s (- 29m 46s) (0 10%) 0.8731
3m 29s (- 31m 27s) (0 10%) 0.8605
3m 40s (- 33m 6s) (0 10%) 0.8069
3m 51s (- 34m 46s) (0 10%) 0.8128
4m 3s (- 36m 27s) (0 10%) 0.8162
4m 14s (- 38m 8s) (0 10%) 0.8624
4m 25s (- 39m 48s) (0 10%) 0.7978
4m 36s (- 41m 25s) (0 10%) 0.7746
4m 47s (- 43m 7s) (0 10%) 0.7778
4m 58s (- 44m 48s) (0 10%) 0.7592
5m 9s (- 46m 27s) (0 10%) 0.7813
5m 20s (- 48m 7s) (0 10%) 0.7573
5m 32s (- 49m 48s) (0 10%) 0.73

KeyboardInterrupt: 

In [17]:
def evaluate(input_tensor, target_tensor, encoder, decoder, max_length=maxlen_out):
  

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs, encoder_hidden = encoder(input_tensor)

    decoder_input = torch.tensor([target_char_to_ix["<start>"]], device=device)
    decoder_hidden = (encoder_hidden[0].reshape(1,1,encoder.hidden_dim),encoder_hidden[1].reshape(1,1,encoder.hidden_dim))

    decoded_seq = []

    for di in range(target_length):
        decoder_output, decoder_hidden,decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.data.topk(1)

        if topi.item() == "<end>":
            decoded_seq.append('<end>')
            break
        else:
            decoded_seq.append(ix_to_target_char[topi.item()])

        decoder_input = topi.squeeze().detach()

    return decoded_seq

In [19]:
for in_text,out_text in zip(input_texts[:50],target_texts[:50]):
    input_tensor =  prepare_sequence_in(in_text)
    target_tensor = prepare_sequence_target(out_text)
    decoded_seq=evaluate(input_tensor, target_tensor, encoder, decoder)
    print(decoded_seq,out_text)

['k', 'o', 'n', 'g', 'p', 'h', 'a', 't', 'u', 'u', 'n', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '<end>', '<end>', '<end>'] kongphanthahanpuenyai
['w', 'i', 't', 'h', 'u', 'n', '<end>'] withun
['m', 'e', 't', 'a', 'b', 'o', 'n', 's', 'o', 'm', '<end>', 'm'] metabolisom
['b', 'a', 'n', 'n', 'o', 'n', 'g', 'l', 'a', 'o', '<end>'] bannonglao
['a', 'y', 'u', 't', '<end>'] ayut
['t', 'h', 'a', 'e', 'm', 'p', 'a', '<end>'] thaempa
['p', 'r', 'a', 't', 'h', 'a', 'e', 'k', 'a', 'n', 's', 'a', 'n', 'l', 'a', 'e'] prathetkrinlaen
['p', 'h', 'o', 'k', 'k', 'h', 'r', 'o', 'n', 'g', 'k', 'h', 'r', 'o', 'e'] phakkhongkerot
['k', 'a', 'n', 's', 'u', 'p', '<end>'] kansup
['b', 'a', 'n', 't', 'h', 'a', 'p', 'h', 'a', 'e', 'p', 'h', 'o', 'k', 'h', 'h', 'a'] banthepphayaktai
['m', 'o', 'd', 'u', 'n', 'a', 'n', 'a', 't', 'a', 'n', '<end>', '<end>'] modulatnatat
['l', 'a', 'b', 'o', 'n', 'g', '<end>'] labong
['k', 'a', 'n', 'e', 'p', 'l', 'e', 'n', '<end>'] kalenpen
['b', 'a', 'n', 's', 'a', 'i', 'p', 'h', 

In [20]:
evaluate(input_tensor, target_tensor, encoder, decoder)

['k', 'r', 'a', 'c', 'h', 'o', 'm', '<end>']