In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from torchtext.data import get_tokenizer

import numpy as np
import math


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ids reserved for the sequence markers. The decoder is primed with SOS_token
# as its first input; 129 is also the literal end-of-sequence id appended by
# the dataset's character encoder further down in this notebook.
SOS_token, EOS_token = 128, 129

# Upper bound on sequence length (not referenced by the cells shown here).
MAX_LENGTH = 10000

In [3]:
class EncoderRNN(nn.Module):
    """Token encoder: embedding -> dropout -> single-layer batch-first GRU.

    Args:
        input_size: number of distinct token ids the embedding table holds.
        hidden_size: width of both the embedding vectors and the GRU state.
        dropout_p: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # Submodules are registered in the order embedding, gru, dropout so
        # parameter ordering and state_dict keys stay the same.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: integer tensor of token ids, laid out batch-first.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU.
        """
        token_vectors = self.embedding(input)
        # nn.GRU already returns the (output, hidden) tuple we want to expose.
        return self.gru(self.dropout(token_vectors))

In [4]:
class DecoderRNN(nn.Module):
    """GRU decoder that is unrolled over the target with teacher forcing.

    Args:
        hidden_size: width of the embedding vectors and of the GRU state.
        output_size: number of distinct output token ids (size of the
            embedding table and of the final projection).
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor):
        """Run one decoding step per target position.

        Args:
            encoder_outputs: encoder output tensor; only its first dimension
                (the batch size) and its device are used here.
            encoder_hidden: initial hidden state handed to the GRU.
            target_tensor: ground-truth token ids. Any layout that flattens to
                ``(batch, steps)`` is accepted, e.g. ``(batch, steps)``,
                ``(batch, 1, steps)`` or the ``(1, 1, steps)`` tensor the
                notebook's data loader produces. Any numeric dtype is
                accepted; values are cast to ``torch.long``.

        Returns:
            ``(log_probs, hidden, None)`` where ``log_probs`` has shape
            ``(batch, steps, output_size)`` and ``hidden`` is the final GRU
            state. The trailing ``None`` keeps the three-value return shape
            the training loop unpacks.
        """
        batch_size = encoder_outputs.size(0)
        # Follow the inputs' device instead of a module-level global so the
        # decoder works wherever the encoder ran.
        run_device = encoder_outputs.device

        # Normalise the targets to (batch, steps) once, instead of indexing a
        # layout that is only valid for batch size 1.
        targets = target_tensor.to(device=run_device, dtype=torch.long).reshape(batch_size, -1)

        # The first input of every sequence is the start-of-sequence token.
        decoder_input = torch.full((batch_size, 1), SOS_token, dtype=torch.long, device=run_device)
        decoder_hidden = encoder_hidden
        step_outputs = []

        for step in range(targets.size(1)):
            step_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            step_outputs.append(step_output)

            # Teacher forcing: feed the ground-truth token as the next input.
            decoder_input = targets[:, step].unsqueeze(1)

        decoder_outputs = torch.cat(step_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None  # `None` keeps the return arity the training loop expects

    def forward_step(self, input, hidden):
        """Advance the decoder by a single step.

        Args:
            input: integer token ids for this step, batch-first.
            hidden: GRU hidden state from the previous step.

        Returns:
            ``(logits, hidden)``: unnormalised scores over the output
            vocabulary and the updated hidden state.
        """
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [5]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        dataloader: sized iterable yielding ``(input_tensor, target_tensor)``
            pairs of token ids (any numeric dtype).
        encoder: callable returning ``(encoder_outputs, encoder_hidden)``.
        decoder: callable taking ``(encoder_outputs, encoder_hidden,
            target_tensor)`` and returning a 3-tuple whose first element holds
            per-position log-probabilities with the vocabulary on the last
            dimension.
        encoder_optimizer: optimizer stepping the encoder parameters.
        decoder_optimizer: optimizer stepping the decoder parameters.
        criterion: loss taking ``(log_probs, target_ids)``.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    # Batches are moved to wherever the encoder's weights live; if the encoder
    # has no parameters the batch stays on its own device.
    first_param = next(encoder.parameters(), None)

    total_loss = 0
    for input_tensor, target_tensor in dataloader:
        run_device = first_param.device if first_param is not None else input_tensor.device

        # Token ids go straight to integer tensors on the model's device. The
        # input is flattened to (batch, seq); for the notebook's (1, 1, seq)
        # batches this is the same as the previous squeeze(0).
        input_tensor = input_tensor.to(device=run_device, dtype=torch.long)
        input_tensor = input_tensor.reshape(input_tensor.size(0), -1)
        # The target keeps the layout it arrived in, because the decoder is
        # handed the tensor as-is.
        target_tensor = target_tensor.to(device=run_device, dtype=torch.long)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten to (positions, vocab) vs (positions,) so the loss does not
        # depend on the batch size being 1.
        loss = criterion(
            decoder_outputs.reshape(-1, decoder_outputs.size(-1)),
            target_tensor.reshape(-1),
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [6]:
import time
import math

def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``.

    Both parts go through ``%d``, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: start timestamp, as returned by ``time.time()``.
        percent: fraction of the work completed so far (must be non-zero).

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    # Projected total is elapsed / percent; what is left is that minus elapsed.
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [7]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points, path='foo.png'):
    """Plot ``points`` as a line chart and save it to ``path``.

    Args:
        points: sequence of y-values (here: averaged losses) to plot.
        path: output image file; defaults to ``'foo.png'``.
    """
    # plt.subplots() already creates the figure. The extra plt.figure() call
    # used to leave an empty second figure behind on every invocation.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)
    fig.savefig(path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [8]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    """Train ``encoder`` and ``decoder`` for ``n_epochs`` epochs.

    Each model gets its own Adam optimizer and the loss is ``nn.NLLLoss``.
    Every ``print_every`` epochs a progress line with the averaged loss is
    printed; every ``plot_every`` epochs the averaged loss is recorded, and
    the recorded values are passed to ``showPlot`` once training ends.
    """
    started_at = time.time()
    averaged_losses = []
    running_print_loss = 0  # cleared after each progress line
    running_plot_loss = 0  # cleared after each recorded plot point

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        epoch_loss = train_epoch(train_dataloader, encoder, decoder,
                                 encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += epoch_loss
        running_plot_loss += epoch_loss

        if epoch % print_every == 0:
            mean_print_loss = running_print_loss / print_every
            running_print_loss = 0
            progress = epoch / n_epochs
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                        epoch, progress * 100, mean_print_loss))

        if epoch % plot_every == 0:
            averaged_losses.append(running_plot_loss / plot_every)
            running_plot_loss = 0

    showPlot(averaged_losses)

In [9]:
from torchtext.vocab import GloVe

# Vocabulary size: code points 0..127 plus the two ids 128 and 129.
MAX_UNICODE = 128 + 2
# Starting value for the dataset's max_len computation.
MAX_FILE_LENGTH = 10000

# glove = GloVe(name='840B', dim=300)

# Position i holds chr(i), so a character's index in the list is its token id.
unicode_list = [chr(code_point) for code_point in range(MAX_UNICODE)]

class packageDataset(Dataset):
    """Character-level dataset of paired input/output texts.

    ``df`` is a mapping with two parallel lists of strings under the keys
    ``'inputs'`` and ``'outputs'``. Every text is encoded as one token id per
    character (its position in ``unicode_list``) followed by the
    end-of-sequence id 129. Within each of the two columns, shorter texts are
    right-padded with that same id so the column forms one rectangular
    ``torch.long`` tensor.

    Each item is an ``(input_ids, output_ids)`` pair of shape ``(1, seq)``.
    The leading singleton dimension is kept on purpose: a batch of one item
    then has shape ``(1, 1, seq)``, which is the layout the training code in
    this notebook indexes.
    """

    def __init__(self, df):
        self.df, self.max_len = self.tokenize(df)

    def __len__(self):
        # Number of text pairs (tensor rows), not the number of dict keys.
        return self.df['inputs'].size(0)

    def __getitem__(self, id):
        # Select the requested row and restore the leading singleton dim.
        input_data = self.df['inputs'][id].unsqueeze(0)
        output_data = self.df['outputs'][id].unsqueeze(0)
        return input_data, output_data

    def tokenize(self, df):
        """Encode all texts and compute ``max_len``.

        Returns:
            ``(tensors, max_len)`` where ``tensors`` maps ``'inputs'`` and
            ``'outputs'`` to ``(n_samples, seq)`` long tensors, and
            ``max_len`` is the larger of ``MAX_FILE_LENGTH`` and the longest
            subword-token count over all texts.

        Raises:
            ValueError: if the two lists differ in length, or a text contains
                a character that is not in ``unicode_list``.
        """
        if len(df['inputs']) != len(df['outputs']):
            raise ValueError("'inputs' and 'outputs' must contain the same number of texts")

        eos_id = 129  # same value as the notebook's EOS_token

        tokenizer = get_tokenizer("subword")
        max_len = MAX_FILE_LENGTH
        for text in list(df['inputs']) + list(df['outputs']):
            max_len = max(max_len, len(tokenizer(text)))

        # One dictionary lookup per character instead of a linear list search.
        # setdefault keeps the first position, matching list.index semantics.
        char_to_id = {}
        for position, char in enumerate(unicode_list):
            char_to_id.setdefault(char, position)

        def encode(text):
            """Return the ids of ``text`` followed by the end-of-sequence id."""
            try:
                ids = [char_to_id[char] for char in text]
            except KeyError as err:
                raise ValueError('character %r is not in the vocabulary' % (err.args[0],)) from err
            ids.append(eos_id)
            return torch.tensor(ids, dtype=torch.long)

        def stack(texts):
            """Encode every text into one row of a padded 2-D tensor."""
            rows = [encode(text) for text in texts]
            width = max((len(row) for row in rows), default=0)
            padded = torch.full((len(rows), width), eos_id, dtype=torch.long)
            for row_index, row in enumerate(rows):
                padded[row_index, :len(row)] = row
            return padded

        new_df = {'inputs': stack(df['inputs']), 'outputs': stack(df['outputs'])}
        return new_df, max_len

#Data loader function
def get_dataloader(path_to_input, path_to_output, batch_size=32):
    """Build train/validation loaders from one input file and one output file.

    Args:
        path_to_input: path of the text file holding the source sequence.
        path_to_output: path of the text file holding the target sequence.
        batch_size: batch size used for both loaders.

    Returns:
        ``(train_loader, val_loader, max_len)``. The split is 80/20, but the
        training split always receives at least one sample when one exists,
        so with a single file pair the validation loader is empty.
    """
    with open(path_to_input, 'r') as file:
        input_text = file.read()

    with open(path_to_output, 'r') as file:
        output_text = file.read()

    ds = packageDataset({'inputs': [input_text], 'outputs': [output_text]})

    n_samples = len(ds)
    # int(0.8 * 1) is 0, so guard against an empty training split.
    train_size = min(n_samples, max(1, int(0.8 * n_samples)))
    val_size = n_samples - train_size
    train_subset, val_subset = random_split(ds, [train_size, val_size])
    return DataLoader(train_subset, batch_size), DataLoader(val_subset, batch_size), ds.max_len

In [None]:
# Hyperparameters for this run.
hidden_size = 128
batch_size = 1
# The encoder reads the same 130-id character vocabulary the decoder emits.
input_size = MAX_UNICODE

# Forward batch_size explicitly; it used to be declared here but ignored, so
# the loader silently fell back to its default of 32.
train_set, val, max_len = get_dataloader(
    'input_data/ecal/ecal_data.txt',
    'output_data/ecal/ecal.spec',
    batch_size=batch_size,
)

encoder = EncoderRNN(input_size, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, MAX_UNICODE).to(device)

# Train for 5 epochs, printing and recording the loss after every epoch.
train(train_set, encoder, decoder, 5, print_every=1, plot_every=1)

decoder target
torch.Size([27661, 130])
torch.Size([27661])
0m 41s (- 2m 44s) (1 20%) 4.9094
decoder target
torch.Size([27661, 130])
torch.Size([27661])
1m 23s (- 2m 5s) (2 40%) 4.8133
decoder target
torch.Size([27661, 130])
torch.Size([27661])
