In [220]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from torchtext.data import get_tokenizer

import numpy as np
import math


In [221]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 10000

In [222]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        print(embedded.size())
        output, hidden = self.gru(embedded)
        return output, hidden

In [223]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden.to(torch.float32)
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [224]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data
        input_tensor = input_tensor.to(torch.int)
        print(input_tensor.size())
        print(target_tensor.size())
        input_tensor = input_tensor.squeeze(0)
        target_tensor = target_tensor.squeeze(0)
        print(input_tensor.size())
        print(target_tensor.size())
        input_tensor = input_tensor.squeeze(0)
        target_tensor = target_tensor.squeeze(0)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        encoder_outputs, encoder_hidden = encoder(input_tensor)

        encoder_outputs = encoder_outputs.to(torch.int)
        encoder_hidden = encoder_hidden.to(torch.int)
        target_tensor = target_tensor.to(torch.int)
        
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [225]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [226]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    # showPlot(plot_losses)

In [227]:
from torchtext.vocab import GloVe

unicode_list = []
MAX_UNICODE = 128
MAX_FILE_LENGTH = 10000

glove = GloVe(name='840B', dim=300)

for i in range(0, MAX_UNICODE):
    unicode_list.append(chr(i))

class packageDataset(Dataset):
    def __init__(self, df):
        self.df, self.max_len = self.tokenize(df)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, id):
        input_data = self.df['inputs']
        output_data = self.df['outputs']
        return input_data, output_data

    def tokenize(self, df):
        max_len = MAX_FILE_LENGTH 
        tokenizer = get_tokenizer("subword")
        print(df)
        for input in df['inputs']:
            syn_len = len(tokenizer(input))
            if syn_len > max_len:
                max_len = syn_len

        for output in df['outputs']:
            syn_len = len(tokenizer(output))
            if syn_len > max_len:
                max_len = syn_len

        # for i in range(len(df['synopsis'])):
        def process_input(ex):
            ex = [*ex]
            input = torch.empty((len(ex), 1))
            for i in range(len(ex)):
                input[i,0] = unicode_list.index(ex[i])
            print(input.size())
            return input
        
        def process_output(ex):
            ex = [*ex]
            target = torch.empty((len(ex), 1))
            for i in range(len(ex)):
                target[i,0] = unicode_list.index(ex[i])
            return target
        
        for i in range(len(df['inputs'])):
            new_df = {}

            new_input = process_input(df['inputs'][i])
            new_output = process_output(df['outputs'][i])

            new_df['inputs'] = torch.empty((len(df['inputs']), len(new_input), 1))
            new_df['outputs'] = torch.empty((len(df['outputs']), len(new_output), 1))

            new_df['inputs'][i] = new_input.to(torch.int)
            new_df['outputs'][i] = new_output.to(torch.int)

            print("input output sizes:")
            print(new_df['inputs'].size())
            print(new_df['outputs'].size())
            # print(new_df['inputs'].size())
            # new_df['inputs'] = new_df['inputs'].squeeze(0).squeeze(0)
            # new_df['outputs'] = new_df['outputs'].squeeze(0).squeeze(0)

        return new_df, max_len

#Data loader function
def get_dataloader(path_to_input, path_to_output, batch_size=32):
    df = {'inputs': [], 'outputs': []}

    with open(path_to_input, 'r') as file:
        input = file.read()
    
    with open(path_to_output, 'r') as file:
        output = file.read()

    df['inputs'].append(input)
    df['outputs'].append(output)
    
    ds = packageDataset(df)
    max_len = ds.max_len

    train_size = int(0.8*len(ds))
    val_size = len(ds) - train_size
    train, val = random_split(ds, [train_size, val_size])
    return DataLoader(train, batch_size), DataLoader(val, batch_size), max_len

In [228]:
hidden_size = 64
batch_size = 1
input_size = 128

train_set, val, max_len = get_dataloader('input_data/ecal/ecal_data.txt', 'output_data/ecal/ecal.spec')

encoder = EncoderRNN(input_size, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, input_size).to(device)

train(train_set, encoder, decoder, 80, print_every=5, plot_every=5)

torch.Size([26689, 1])
input output sizes:
torch.Size([1, 26689, 1])
torch.Size([1, 27660, 1])
torch.Size([1, 1, 26689, 1])
torch.Size([1, 1, 27660, 1])
torch.Size([1, 26689, 1])
torch.Size([1, 27660, 1])
torch.Size([26689, 1, 64])


RuntimeError: Expected hidden size (1, 27660, 64), got [1, 26689, 64]