In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import random
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
data_path = '../dataset/data.csv'
with open(data_path, 'r', encoding='utf-8-sig') as f:
    lines = f.read().split('\n')

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()    
    
for line in lines:
    input_text, target_text = line.split('\t')
    target_text = target_text 
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [3]:
target_texts[0]

'kongphanthahanpuenyai'

In [4]:
# +4 for padding, unknown, <start>, <end>
data_size, vocab_size = len(input_texts), len(input_characters)+1 
output_vocab_size = len(target_characters)+1 #+3 padding, <start>, <end>
print('There are %d lines and %d unique characters in your input data.' % (data_size, vocab_size))
maxlen_in = len( max(input_texts, key=len)) #max input length
maxlen_out = len( max(target_texts, key=len)) #max output length

There are 648241 lines and 92 unique characters in your input data.


In [5]:
maxlen_out

88

In [6]:
input_characters = sorted(input_characters)
target_characters = sorted(target_characters) 
input_characters.insert(0,"<PAD>")#PADDING for input
input_characters.insert(1,"<UNK>")
input_characters.insert(2,"<start>")
input_characters.insert(3,"<end>")
target_characters.insert(0,"<PAD>")#PADDING for output
target_characters.insert(1,"<start>")
target_characters.insert(2,"<end>")

#Input
char_to_ix = { ch:i for i,ch in enumerate(input_characters) }
ix_to_char = { i:ch for i,ch in enumerate(input_characters) } #reverse dictionary
#Output
target_char_to_ix = { ch:i for i,ch in enumerate(target_characters) }
ix_to_target_char = { i:ch for i,ch in enumerate(target_characters) } #reverse dictionary

In [7]:
m=648241
Tx=maxlen_in
Ty=maxlen_out

In [8]:
maxlen_in

76

In [9]:
def prepare_sequence_in( input_text):
    idxs = []
    for w in input_text:
        if w in char_to_ix:
            idxs.append(char_to_ix[w])
        else:
            idxs.append(char_to_ix["<UNK>"])
    idxs.append(target_char_to_ix["<end>"])
    tensor = torch.tensor(idxs, dtype=torch.long)
    return tensor.to(device)

def prepare_sequence_target( input_text):
    idxs = [target_char_to_ix[w] for w in input_text]
    idxs.append(target_char_to_ix["<end>"])
    tensor = torch.tensor(idxs, dtype=torch.long)
    return tensor.to(device)




In [10]:
n_h = 64 #hidden dimensions for encoder 
n_s = 64 #hidden dimensions for decoder
emb_dim = 64 #character embedding size

In [11]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim, emb_dim):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, self.hidden_dim // 2, bidirectional = True)

    def forward(self, input_seq):
        self.hidden = self.init_hidden()
        embedded = self.char_emb(input_seq)
        output, self.hidden = self.lstm(embedded.view(len(embedded), 1, -1), self.hidden)
        return output, self.hidden

    def init_hidden(self):
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(2, 1, self.hidden_dim // 2,requires_grad=True).to(device),
    torch.zeros(2, 1, self.hidden_dim // 2,requires_grad=True).to(device))

class OneStepDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim, emb_dim):
        super(OneStepDecoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

        
    def forward(self, input_step,hidden,encoder_outputs):
        
        embedded = self.char_emb(input_step).view(1, 1, -1)
        output = embedded    
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, []#this empty list should be replaced with decoder attn score 

    def init_hidden(self):
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim ,requires_grad=True).to(device),
    torch.zeros(1, 1, self.hidden_dim,requires_grad=True).to(device))


In [12]:
#reference:https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#sphx-glr-intermediate-seq2seq-translation-tutorial-py
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=maxlen_out):
  
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    #encoder_outputs = torch.zeros(max_length, encoder.hidden_dim, device=device)

    loss = 0

    encoder_outputs, encoder_hidden = encoder(input_tensor)


    decoder_input = torch.tensor([target_char_to_ix["<start>"]], device=device)
    decoder_hidden = (encoder_hidden[0].reshape(1,1,encoder.hidden_dim),encoder_hidden[1].reshape(1,1,encoder.hidden_dim))

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
#           print(decoder_output.shape,target_tensor[di].shape,target_tensor[di])
            loss += criterion(decoder_output, target_tensor[di].view(1))
            decoder_input = target_tensor[di].view(1)  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden,decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di].view(1))
            if decoder_input.item() == target_char_to_ix["<end>"]:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


In [13]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
encoder = Encoder(len(char_to_ix), n_h, emb_dim).to(device)
decoder = OneStepDecoder(len(target_char_to_ix), n_s, emb_dim).to(device)

In [15]:
learning_rate = 0.001
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [16]:
def trainIters(encoder, decoder, epoch_num, learning_rate,encoder_optimizer,decoder_optimizer,criterion,print_every=1000,):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    
    
    for epoch in range(epoch_num):
        count = 1
        for in_text,out_text in zip(input_texts,target_texts):
            input_tensor =  prepare_sequence_in(in_text)
            target_tensor = prepare_sequence_target(out_text)
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            #print(print_loss_total)
            if count % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, epoch+1 / epoch_num),
                                                 epoch, epoch+1 / epoch_num * 100, print_loss_avg))
            count += 1


In [17]:
trainIters(encoder, decoder, 1,learning_rate,encoder_optimizer,decoder_optimizer,criterion,print_every=1000)

0m 10s (- 0m 0s) (0 100%) 2.4585
0m 21s (- 0m 0s) (0 100%) 2.0156
0m 32s (- 0m 0s) (0 100%) 1.8287
0m 43s (- 0m 0s) (0 100%) 1.7223
0m 55s (- 0m 0s) (0 100%) 1.6345
1m 6s (- 0m 0s) (0 100%) 1.5592
1m 17s (- 0m 0s) (0 100%) 1.4941
1m 28s (- 0m 0s) (0 100%) 1.4569
1m 39s (- 0m 0s) (0 100%) 1.3796
1m 50s (- 0m 0s) (0 100%) 1.3514
2m 1s (- 0m 0s) (0 100%) 1.3547
2m 12s (- 0m 0s) (0 100%) 1.3362
2m 23s (- 0m 0s) (0 100%) 1.3051
2m 33s (- 0m 0s) (0 100%) 1.2651
2m 44s (- 0m 0s) (0 100%) 1.2530
2m 55s (- 0m 0s) (0 100%) 1.2359
3m 7s (- 0m 0s) (0 100%) 1.2530
3m 18s (- 0m 0s) (0 100%) 1.2128
3m 29s (- 0m 0s) (0 100%) 1.2088
3m 40s (- 0m 0s) (0 100%) 1.1427
3m 51s (- 0m 0s) (0 100%) 1.1543
4m 2s (- 0m 0s) (0 100%) 1.1643
4m 13s (- 0m 0s) (0 100%) 1.1705
4m 24s (- 0m 0s) (0 100%) 1.1406
4m 35s (- 0m 0s) (0 100%) 1.1318
4m 46s (- 0m 0s) (0 100%) 1.1102
4m 58s (- 0m 0s) (0 100%) 1.1142
5m 9s (- 0m 0s) (0 100%) 1.1098
5m 20s (- 0m 0s) (0 100%) 1.0900
5m 31s (- 0m 0s) (0 100%) 1.0871
5m 42s (- 0m 0s

KeyboardInterrupt: 

In [44]:
def evaluate(input_tensor, target_tensor, encoder, decoder, max_length=maxlen_out):
  

    input_length = input_tensor.size(0)
    #target_length = target_tensor.size(0)

    encoder_outputs, encoder_hidden = encoder(input_tensor)

    decoder_input = torch.tensor([target_char_to_ix["<start>"]], device=device)
    decoder_hidden = (encoder_hidden[0].reshape(1,1,encoder.hidden_dim),encoder_hidden[1].reshape(1,1,encoder.hidden_dim))

    decoded_seq = []

    for di in range(max_length):
        decoder_output, decoder_hidden,decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.data.topk(1)
        if topi.item() == target_char_to_ix['<end>']:
            decoded_seq.append('<end>')
            break
        else:
            decoded_seq.append(ix_to_target_char[topi.item()])

        decoder_input = topi.squeeze().detach()

    return decoded_seq

In [45]:
target_char_to_ix

{'<PAD>': 0,
 '<start>': 1,
 '<end>': 2,
 ' ': 3,
 '!': 4,
 '"': 5,
 '(': 6,
 ')': 7,
 '-': 8,
 '0': 9,
 '1': 10,
 '2': 11,
 '3': 12,
 '4': 13,
 '5': 14,
 '6': 15,
 '7': 16,
 '8': 17,
 '9': 18,
 'a': 19,
 'b': 20,
 'c': 21,
 'd': 22,
 'e': 23,
 'f': 24,
 'g': 25,
 'h': 26,
 'i': 27,
 'k': 28,
 'l': 29,
 'm': 30,
 'n': 31,
 'o': 32,
 'p': 33,
 'r': 34,
 's': 35,
 't': 36,
 'u': 37,
 'w': 38,
 'y': 39}

In [42]:
for in_text,out_text in zip(input_texts[:50],target_texts[:50]):
    input_tensor =  prepare_sequence_in(in_text)
    target_tensor = prepare_sequence_target(out_text)
    decoded_seq=evaluate(input_tensor, target_tensor, encoder, decoder)
    print(decoded_seq,out_text)

['k', 'o', 'n', 'g', 'p', 'h', 'a', 'n', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'i', 'i'] kongphanthahanpuenyai
2
['w', 'i', 't', 'h', 'a', 'n', '<end>'] withun
2
['m', 'e', 't', 't', 'i', 't', ' ', 's', 'a', 'm', 'o', '<end>'] metabolisom
2
['b', 'a', 'n', 'n', 'o', 'n', 'g', 'n', 'a', 'o', '<end>'] bannonglao
2
['a', 'y', 'u', 't', '<end>'] ayut
['t', 'h', 'a', 'e', 'p', 'h', 'a', 'm'] thaempa
2
['p', 'r', 'a', 't', 'h', 'a', 'e', 't', 't', 'a', 'e', 'k', 'a', 'n', '<end>'] prathetkrinlaen
['p', 'h', 'r', 'a', 'k', 'h', 'r', 'o', 'n', 'g', 'k', 'a', 'k', 'k', 'o'] phakkhongkerot
['k', 'a', 'r', 'u', 's', 'u', 'p'] kansup
['b', 'a', 'n', 't', 'h', 'a', 'p', 'p', 'h', 'p', 'h', 'p', 'h', 'a', 'i', 'k', 'h'] banthepphayaktai
['m', 'u', 't', 't', 'a', 'l', 'a', 't', 't', 'a', 'd', 'a', 't'] modulatnatat
2
['l', 'a', 'b', 'o', 'n', 'g', '<end>'] labong
2
['k', 'a', 'p', 'e', 'n', 'e', 'n', 'n', '<end>'] kalenpen
['b', 'a', 'n', 's', 'a', 'p', 'h', 'a', 'c', 'h', 'e'] 

In [43]:
evaluate(input_tensor, target_tensor, encoder, decoder)

2


['k', 'r', 'a', 'c', 'h', 'o', 'm', '<end>']

In [24]:
torch.save({
            'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict(),
            'char_to_ix': char_to_ix,
            'ix_to_char': ix_to_char,
            'target_char_to_ix': target_char_to_ix,
            'ix_to_target_char':ix_to_target_char
    
#            'optimizerE_state_dict': encoder_optimizer.state_dict(),
#            'optimizerD_state_dict': decoder_optimizer.state_dict(),
            }, "thai2rom-pytorch.tar")

In [25]:
loader = torch.load("thai2rom-pytorch.tar")

In [26]:
loader.keys()

dict_keys(['encoder_state_dict', 'decoder_state_dict', 'char_to_ix', 'ix_to_char', 'target_char_to_ix', 'ix_to_target_char'])

In [36]:
char_to_ix = { ch:i for i,ch in enumerate(input_characters) }
ix_to_char = { i:ch for i,ch in enumerate(input_characters) } #reverse dictionary
#Output
target_char_to_ix = { ch:i for i,ch in enumerate(target_characters) }
ix_to_target_char = { i:ch for i,ch in enumerate(target_characters) } 