1. Download data from Kaggle

In [None]:
from google.colab import files
!pip install -q kaggle

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Move one directory up, install the Kaggle API token and download the competition data.
%cd ..
# -p keeps the cell re-runnable: plain mkdir fails once the directory already exists.
!mkdir -p root/.kaggle/
!cp content/kaggle.json root/.kaggle/kaggle.json
# The token is a credential; keep it readable by the current user only.
!chmod 600 root/.kaggle/kaggle.json
!kaggle competitions download -c 11-785-s20-hw4p2

/
mkdir: cannot create directory ‘root/.kaggle/’: File exists
train_new.npy.zip: Skipping, found more recently modified local copy (use --force to force download)
dev_new.npy.zip: Skipping, found more recently modified local copy (use --force to force download)
dev_transcripts.npy: Skipping, found more recently modified local copy (use --force to force download)
train_transcripts.npy.zip: Skipping, found more recently modified local copy (use --force to force download)
test_sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
test_new.npy.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!ls
# -n never overwrites files that were already extracted, so re-running the cell
# does not stop at an interactive "replace?" prompt.
!unzip -n \*.zip

bin					   lib	  tensorflow-1.15.2
boot					   lib32  test_new.npy
content					   lib64  test_new.npy.zip
datalab					   media  test_sample_submission.csv
dev					   mnt	  tmp
dev_new.npy				   opt	  tools
dev_new.npy.zip				   proc   train_new.npy
dev_transcripts.npy			   root   train_new.npy.zip
dlib-19.18.0-cp27-cp27mu-linux_x86_64.whl  run	  train_transcripts.npy
dlib-19.18.0-cp36-cp36m-linux_x86_64.whl   sbin   train_transcripts.npy.zip
etc					   srv	  usr
home					   swift  var
kaggle.json				   sys
Archive:  dev_new.npy.zip
replace dev_new.npy? [y]es, [n]o, [A]ll, [N]one, [r]ename: N

Archive:  train_new.npy.zip

Archive:  train_transcripts.npy.zip

Archive:  test_new.npy.zip

4 archives were successfully processed.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import *

from torch import LongTensor
from torch.nn import Embedding, LSTM
from torch.autograd import Variable

import numpy as np
import time

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

cuda


2. Data processing

In [None]:
'''
Loading all the numpy files containing the utterance information and text information
'''
def load_data():
    '''
    Read the five competition arrays from the current working directory.

    NOTE: allow_pickle=True lets numpy unpickle object arrays, which can execute
    arbitrary code; only use this on files from a trusted source.

    :return: (speech_train, speech_valid, speech_test, transcript_train, transcript_valid)
    '''
    def _read(path):
        return np.load(path, allow_pickle=True)

    # Same read order as the return order: utterances first, then transcripts.
    speech_train, speech_valid, speech_test = (
        _read(name) for name in ('train_new.npy', 'dev_new.npy', 'test_new.npy')
    )
    transcript_train, transcript_valid = (
        _read(name) for name in ('./train_transcripts.npy', './dev_transcripts.npy')
    )

    return speech_train, speech_valid, speech_test, transcript_train, transcript_valid

'''
Transforms alphabetical input to numerical input, replace each letter by its corresponding 
index from letter_list
'''
def transform_letter_to_index(transcript, letter_list):
    '''
    Encode every transcript as a list of vocabulary indices.

    Each transcript is a sequence of byte-string words. The words are decoded as
    UTF-8, joined with the index of " ", and wrapped in the '<sos>' / '<eos>' indices.

    :param transcript: (N, ) sequence of transcripts, each a sequence of byte-string words
    :param letter_list: vocabulary; a symbol's index is its position in this list
    :return: list of N index lists, one per transcript
    '''
    letter2index, _ = create_dictionaries(letter_list)

    encoded = []
    for sentence in transcript:
        indices = [letter2index['<sos>']]
        for position, raw_word in enumerate(sentence):
            if position:
                # every word after the first is preceded by a space token
                indices.append(letter2index[" "])
            indices.extend(letter2index[ch] for ch in raw_word.decode("utf-8"))
        indices.append(letter2index['<eos>'])
        encoded.append(indices)

    assert len(transcript) == len(encoded)

    return encoded

'''
Optional, create dictionaries for letter2index and index2letter transformations
'''
def create_dictionaries(letter_list):
    '''
    Build the two lookup tables for a vocabulary.

    :param letter_list: sequence of symbols; a symbol's index is its position
    :return: (letter2index, index2letter) dictionaries
    '''
    letter2index = {}
    index2letter = {}
    for position, symbol in enumerate(letter_list):
        letter2index[symbol] = position
        index2letter[position] = symbol
    return letter2index, index2letter

class Speech2TextDataset(Dataset):
    '''
    Dataset pairing each utterance with its index-encoded transcript.

    :param speech: indexable collection of utterances; each item must support
                   ``.astype(np.float32)`` (e.g. a numpy array of frames)
    :param text: indexable collection of index lists, or None when there are no labels
    :param isTrain: when true, items are (speech, text) pairs; otherwise speech only
    '''
    def __init__(self, speech, text=None, isTrain=True):
        self.speech = speech
        self.isTrain = isTrain
        # Always define the attribute so that reading it never raises AttributeError.
        self.text = text

    def __len__(self):
        # len() works for numpy arrays as well as plain Python lists.
        return len(self.speech)

    def __getitem__(self, index):
        features = torch.tensor(self.speech[index].astype(np.float32))
        if not self.isTrain:
            return features
        if self.text is None:
            raise ValueError("Speech2TextDataset was built with isTrain=True but without text")
        return features, torch.tensor(self.text[index])

def collate_train(batch):
    '''
    Collate (speech, text) pairs into time-major padded tensors.

    :param batch: list of (speech, text) tensor pairs
    :return xx_pad: speech padded along dim 0 (batch_first=False)
    :return yy_pad: text padded along dim 0 (batch_first=False)
    :return x_lens: LongTensor with the length of every utterance
    :return y_lens: LongTensor with len(text) - 1 for every transcript
    '''
    utterances, transcripts = zip(*batch)
    x_lens = torch.LongTensor(list(map(len, utterances)))
    # one less than the stored transcript length
    y_lens = torch.LongTensor([len(indices) - 1 for indices in transcripts])
    return (
        pad_sequence(utterances, batch_first=False),
        pad_sequence(transcripts, batch_first=False),
        x_lens,
        y_lens,
    )

def collate_test(batch):
    '''
    Collate unlabeled utterances into one time-major padded tensor.

    :param batch: list of speech tensors
    :return: (padded speech with batch_first=False, LongTensor of utterance lengths)
    '''
    x_lens = torch.LongTensor([len(utterance) for utterance in batch])
    return pad_sequence(batch, batch_first=False), x_lens

In [None]:
# Load every split and print the array shapes as a sanity check
# (speech and transcript arrays of the same split should have matching lengths).
speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()
print(speech_train.shape, transcript_train.shape)
print(speech_valid.shape, transcript_valid.shape)
print(speech_test.shape)

(24724,) (24724,)
(1106,) (1106,)
(523,)


In [None]:
# Output vocabulary; a symbol's index is its position in the list.
# '<pad>' sits at index 0 (the Decoder embedding uses padding_idx=0) and
# '<sos>' at index 33 (the Decoder hard-codes 33 as its first input token),
# so reordering this list requires updating the Decoder as well.
LETTER_LIST = ['<pad>', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', \
               'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']

# Module-level lookup tables; greedySearch reads index2letter.
letter2index, index2letter = create_dictionaries(LETTER_LIST)

3. Implement the encoder-decoder network described in the paper "Listen, Attend and Spell" (https://arxiv.org/pdf/1508.01211.pdf)

In [None]:
class pBLSTM(nn.Module):
    '''
    Pyramidal bidirectional LSTM layer.

    Utterances can be hundreds to thousands of frames long; the paper reports that a
    plain LSTM encoder converges slowly because the attention step cannot extract
    relevant information from that many input steps. This layer halves the time
    resolution (max-pooling adjacent frame pairs) before running a one-layer BiLSTM.
    '''
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.blstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=False)
        self.pool = nn.MaxPool1d(2, stride=2)

    def forward(self, x, lens):
        '''
        :param x: (S, N, H) time-major padded input
        :param lens: per-sequence lengths of x
        :return outputs: (max(lens // 2), N, 2 * hidden_dim) padded BiLSTM output
        :return lens: lengths of the returned sequences, i.e. lens // 2
        '''
        halved_lens = lens // 2

        # Drop a trailing unpaired frame so the time axis splits into whole pairs.
        n_steps = x.shape[0]
        if n_steps % 2:
            x = x[:n_steps - 1]

        # MaxPool1d pools the last axis: go to (N, H, S), pool, return to (S/2, N, H).
        pooled = self.pool(x.permute(1, 2, 0)).permute(2, 0, 1)

        packed = pack_padded_sequence(pooled, halved_lens, batch_first=False, enforce_sorted=False)
        packed, _ = self.blstm(packed)

        return pad_packed_sequence(packed)

class Encoder(nn.Module):
    '''
    Listener: a BiLSTM followed by three pBLSTM layers and two linear projections.

    The three pBLSTM layers each halve the time axis, so the keys/values have
    roughly one step for every eight input frames.

    :param input_dim: feature size of each input frame
    :param hidden_dim: hidden size per direction of every (p)BLSTM
    :param value_size: output size of the value projection
    :param key_size: output size of the key projection
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128, key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True, batch_first = False)

        # Each pBLSTM consumes the 2 * hidden_dim features of the previous bidirectional layer.
        self.plstm1 = pBLSTM(hidden_dim*2, hidden_dim)
        self.plstm2 = pBLSTM(hidden_dim*2, hidden_dim)
        self.plstm3 = pBLSTM(hidden_dim*2, hidden_dim)

        # Keys project to key_size and values to value_size (the sizes used to be
        # swapped, which only went unnoticed because both default to 128).
        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)

    def forward(self, x, lens):
        '''
        :param x: (T, N, input_dim) time-major padded utterances
        :param lens: per-utterance lengths, passed to pack_padded_sequence
        :return keys: (T', N, key_size)
        :return value: (T', N, value_size)
        '''
        packed = pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        hidden, lens = pad_packed_sequence(packed_out, batch_first = False)

        # Each pBLSTM returns its own (halved) lengths for the next layer.
        hidden, lens = self.plstm1(hidden, lens)
        hidden, lens = self.plstm2(hidden, lens)
        hidden, _ = self.plstm3(hidden, lens)

        keys = self.key_network(hidden)
        value = self.value_network(hidden)

        return keys, value

In [None]:
# Stand-alone demonstration of gumbel_softmax on random (20, 32) logits.
logits = torch.randn(20, 32)
# Sample soft categorical using reparametrization trick:
c1 = torch.nn.functional.gumbel_softmax(logits, tau=1, hard=False)
# Sample hard categorical using "Straight-through" trick:
c2 = torch.nn.functional.gumbel_softmax(logits, tau=1, hard=True)
print(c1, c2)

In [None]:
class Attention(nn.Module):
    '''
    Dot-product attention over the encoder outputs.

        energy    = bmm(key, query)
        attention = softmax(energy with padded positions masked out)
        context   = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, x_lens):
        '''
        :param query: (N, key_size) decoder state for the current step
        :param key: (N, T, key_size) batch-major encoder keys
        :param value: (T, N, value_size) time-major encoder values
        :param x_lens: (N,) number of valid key/value steps per sequence
        :return context: (N, value_size) attention-weighted sum of the values
        :return attention: (N, T) attention weights, which can be plotted
        '''
        # (N, T, key_size) x (N, key_size, 1) -> (N, T, 1) -> (N, T)
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)

        # Build the padding mask on the same device as the energies instead of
        # relying on a module-level device constant.
        # Broadcasting: (1, T) >= (N, 1) -> (N, T), True at padded positions.
        positions = torch.arange(key.size(1), device=energy.device).unsqueeze(0)
        lengths = torch.as_tensor(x_lens, device=energy.device).unsqueeze(1)
        padding_mask = positions >= lengths

        # A large negative logit drives the softmax weight of padding to zero.
        energy = energy.masked_fill(padding_mask, -1e9)
        attention = nn.functional.softmax(energy, dim = 1)

        # (N, 1, T) x (N, T, value_size) -> (N, 1, value_size) -> (N, value_size)
        context = torch.bmm(attention.unsqueeze(1), value.transpose(1,0)).squeeze(1)

        return context, attention

In [None]:
class Decoder(nn.Module):
    '''
    Speller: two stacked LSTMCells that emit one character distribution per step.

    Every step feeds [character embedding, context] through lstm1 and lstm2. The
    output of lstm2 is the attention query, and [query, context] is projected to
    vocabulary logits. The next input character is either the ground truth
    (teacher forcing) or a Gumbel-softmax sample of the previous prediction.

    :param vocab_size: number of output symbols (must exceed 33, the <sos> index)
    :param hidden_dim: embedding size and hidden size of lstm1
    :param value_size: size of the encoder values / context vector
    :param key_size: size of the encoder keys and hidden size of lstm2
    :param isAttended: when true an Attention module produces the context
    '''
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Decoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)

    def forward(self, key, values, x_lens, prob=0.9, text=None, isTrain=True):
        '''
        :param key: (T, N, key_size) output of the encoder key projection
        :param values: (T, N, value_size) output of the encoder value projection
        :param x_lens: (N,) number of valid encoder steps, used to mask attention
        :param prob: teacher-forcing probability; each training step feeds the
                     ground-truth character with this probability and a sampled
                     prediction otherwise. Ignored when isTrain is false.
        :param text: (N, text_len) index-encoded transcripts; required when isTrain
        :param isTrain: train (teacher forcing available) or free-running mode
        :return: (N, steps, vocab_size) logits; steps is text_len - 1 when
                 training and 249 otherwise
        '''
        import random  # local import: kept out of the per-step loop

        sos_index = 33  # position of '<sos>' in LETTER_LIST

        # Follow the module's own device rather than a module-level constant.
        device = self.embedding.weight.device
        batch_size = key.shape[1]

        key = key.to(device).transpose(1, 0)  # (N, T, key_size) for the attention bmm
        values = values.to(device)

        if isTrain:
            if text is None:
                raise ValueError("Decoder.forward needs `text` when isTrain=True")
            text = text.to(device)
            max_len = text.shape[1]
            embeddings = self.embedding(text)  # (N, text_len, hidden_dim)
        else:
            max_len = 250

        predictions = []
        hidden_states = [None, None]
        prediction = None
        context = values[0, :, :]  # initial context: first encoder value

        # Step i consumes character i and predicts character i + 1.
        for i in range(max_len - 1):
            if i == 0:
                # The first input is always <sos>.
                first_token = torch.full((batch_size,), sos_index, dtype=torch.long, device=device)
                char_embed = self.embedding(first_token)
            elif isTrain:
                if random.random() > prob:
                    # Feed the model's own (Gumbel-perturbed) previous prediction.
                    noisy = torch.nn.functional.gumbel_softmax(prediction, tau=1, hard=False)
                    char_embed = self.embedding(noisy.argmax(dim=-1))
                else:
                    # Teacher forcing: feed the ground-truth character.
                    char_embed = embeddings[:, i, :]
            else:
                # Free-running: always feed the previous greedy prediction.
                char_embed = self.embedding(prediction.argmax(dim=-1))

            inp = torch.cat([char_embed, context], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])
            hidden_states[1] = self.lstm2(hidden_states[0][0], hidden_states[1])

            output = hidden_states[1][0]
            if self.isAttended:
                context, _ = self.attention(output, key, values, x_lens)
            else:
                # Without attention fall back to the time-aligned encoder value,
                # clamped to the last step so long transcripts stay in range.
                context = values[min(i, values.size(0) - 1)]

            prediction = self.character_prob(torch.cat([output, context], dim=1))
            predictions.append(prediction.unsqueeze(1))

        if not predictions:
            # Nothing to predict (text_len <= 1): return an empty logits tensor.
            return values.new_zeros(batch_size, 0, self.character_prob.out_features)

        return torch.cat(predictions, dim=1)


In [None]:
class Seq2Seq(nn.Module):
    '''
    End-to-end sequence-to-sequence model: a thin wrapper around Encoder and Decoder.

    NOTE(review): value_size, key_size and isAttended are accepted but not
    forwarded; the sub-modules use their own defaults (128 / 128) and the decoder
    is always built with attention, because Decoder.forward relies on it.
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim)
        self.decoder = Decoder(vocab_size, hidden_dim, isAttended = True)

    def forward(self, speech_input, speech_len, text_input=None, isTrain=True, teacher_forcing_prob=0.9):
        '''
        :param speech_input: (T, N, input_dim) time-major padded utterances
        :param speech_len: (N,) utterance lengths in frames
        :param text_input: (N, text_len) index-encoded transcripts (training only)
        :param isTrain: teacher-forced training pass vs. free-running decoding
        :param teacher_forcing_prob: passed to the decoder as `prob`
        :return: (N, steps, vocab_size) character logits
        '''
        key, value = self.encoder(speech_input, speech_len)

        # The encoder halves the time axis three times (one per pBLSTM), so the
        # attention mask needs the down-sampled lengths, not the frame counts.
        encoded_len = torch.as_tensor(speech_len) // 8

        # `prob` is the decoder's fourth positional parameter; pass it explicitly
        # so the transcripts land in `text` and not in `prob`.
        if isTrain:
            predictions = self.decoder(key, value, encoded_len, teacher_forcing_prob,
                                       text=text_input, isTrain=True)
        else:
            predictions = self.decoder(key, value, encoded_len, teacher_forcing_prob,
                                       text=None, isTrain=False)
        return predictions


4. Create a model object

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import *

#from models import Seq2Seq
#from train_test import train, test
#from dataloader import load_data, collate_train, collate_test, transform_letter_to_index, Speech2TextDataset

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Output vocabulary; a symbol's index is its position in the list.
LETTER_LIST = ['<pad>', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', \
               'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']


num_workers = 8

model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=128)
model.to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Summed loss; train/validate divide by the number of real tokens themselves.
criterion = nn.CrossEntropyLoss(reduction="sum")

nepochs = 3
batch_size = 64 if DEVICE == 'cuda' else 1

speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()
character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST)
character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST)

train_dataset = Speech2TextDataset(speech_train, character_text_train)
val_dataset = Speech2TextDataset(speech_valid, character_text_valid)
#test_dataset = Speech2TextDataset(speech_test, None, False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train, num_workers=num_workers, pin_memory=True)
# The validation loader must wrap the held-out split, not the training set.
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_train, num_workers=num_workers, pin_memory=True)
#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_test)

In [None]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def train(model, train_loader, criterion, optimizer, epoch):
    '''
    Run one teacher-forced training epoch.

    :param model: called as model(x, x_lens, text_input=y, isTrain=True) and
                  expected to return (N, text_len - 1, vocab) logits
    :param train_loader: yields (x, y, x_lens, y_lens) batches as built by collate_train
    :param criterion: summing loss (reduction="sum"); the average is computed here
    :param optimizer: optimizer over the model parameters
    :param epoch: epoch index (currently unused)
    :return: list with one value per batch, exp(average per-token loss)
    '''
    loss_history = []

    model.train()
    model.to(DEVICE)
    criterion.to(DEVICE)

    start = time.time()

    for x, y, x_lens, y_lens in train_loader:

        optimizer.zero_grad()

        x = x.to(DEVICE)
        y = y.to(DEVICE).T  # (T, N) -> (N, T) to match the decoder output layout

        predictions = model(x, x_lens, text_input=y, isTrain=True)

        # True for the y_lens real target positions of every sequence.
        mask = torch.arange(predictions.size(1)).unsqueeze(0) < y_lens.unsqueeze(1)
        mask = mask.to(DEVICE)

        # Targets are the inputs shifted by one: step i predicts character i + 1.
        targets = y[:, 1:]
        loss = criterion(predictions[mask], targets[mask])

        # Back-propagate the summed loss over all real tokens.
        loss.backward()
        optimizer.step()

        # Report exp(average per-token loss) without touching the graph tensor.
        n_tokens = y_lens.sum().item()
        loss_history.append(np.exp(loss.item() / n_tokens))

        if len(loss_history) % 50 == 0:
          print('train loss:', loss_history[-1])

    end = time.time()
    if loss_history:
        print('Final train loss:', loss_history[-1], " time ", end-start)
    else:
        # An empty loader would otherwise fail on loss_history[-1].
        print('No training batches were processed.', " time ", end-start)

    return loss_history

In [None]:
import time
import torch
### Add Your Other Necessary Imports Here! ###

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def validate(model, val_loader, criterion, epoch):
    '''
    Score free-running (isTrain=False) decoding against the reference transcripts.

    The per-token loss uses the same masking as train(): every target position up
    to and including <eos> counts. Target positions beyond the number of decoded
    steps are ignored.

    :param model: called as model(x, x_lens, text_input=None, isTrain=False)
    :param val_loader: yields (x, y, x_lens, y_lens) batches as built by collate_train
    :param criterion: summing loss (reduction="sum"); the average is computed here
    :param epoch: epoch index (currently unused)
    :return: list with one value per batch, exp(average per-token loss)
    '''
    loss_history = []

    model.eval()
    model.to(DEVICE)

    start = time.time()

    # No gradients are needed; this keeps the long decoding loop from building a graph.
    with torch.no_grad():
        for x, y, x_lens, y_lens in val_loader:

            x = x.to(DEVICE)
            y = y.to(DEVICE).T  # (T, N) -> (N, T) to match the decoder output layout

            predictions = model(x, x_lens, text_input=None, isTrain=False)

            # Step i predicts character i + 1; compare only the overlapping steps.
            targets = y[:, 1:]
            steps = min(predictions.size(1), targets.size(1))

            mask = torch.arange(steps).unsqueeze(0) < y_lens.unsqueeze(1)
            mask = mask.to(DEVICE)

            n_tokens = int(mask.sum())
            if n_tokens == 0:
                continue

            loss = criterion(predictions[:, :steps][mask], targets[:, :steps][mask])
            loss_history.append(np.exp(loss.item() / n_tokens))

            if len(loss_history) % 50 == 0:
              print('validate loss:', loss_history[-1])

    end = time.time()
    if loss_history:
        print('Final validate loss:', loss_history[-1], " time ", end-start)
    else:
        # An empty loader would otherwise fail on loss_history[-1].
        print('No validation batches were processed.', " time ", end-start)

    return loss_history

In [None]:
def test(model, test_loader, epoch):
    '''
    Greedily decode every utterance of the test loader.

    :param model: called as model(x, x_lens, isTrain=False)
    :param test_loader: yields (x, x_lens) batches as built by collate_test
    :param epoch: epoch index (currently unused)
    :return: list with one decoded string per utterance, in loader order
    '''
    seq_list = []

    model.eval()
    model.to(DEVICE)

    start = time.time()

    with torch.no_grad():
        for x, x_lens in test_loader:

            x = x.to(DEVICE)
            # x_lens stays on the CPU: pack_padded_sequence requires CPU lengths.

            predictions = model(x, x_lens, isTrain=False)

            # greedySearch decodes one sequence, so hand it one sample at a time.
            for sample in range(predictions.size(0)):
                seq = greedySearch(predictions[sample:sample + 1])
                print(seq)
                # Collect inside the loop so every utterance is kept, not only the last.
                seq_list.append(seq)

    end = time.time()
    print('time', end-start)
    return seq_list

In [None]:
def greedySearch(predictions, index_to_letter=None):
    '''
    Greedily decode the first sequence of a batch of logits.

    :param predictions: (N, T, vocab) logits; only predictions[0] is decoded
    :param index_to_letter: mapping from index to symbol; defaults to the
                            module-level index2letter table
    :return: decoded string; decoding stops after the first "<eos>", which is
             included in the result
    '''
    if index_to_letter is None:
        index_to_letter = index2letter

    # Arg-max per time step of the first sequence. Taking the arg-max of the
    # squeezed (N, vocab) slice would mix up samples whenever N > 1.
    token_ids = predictions[0].argmax(dim=-1).tolist()

    pieces = []
    for token_id in token_ids:
        symbol = index_to_letter[token_id]
        pieces.append(symbol)
        if symbol == "<eos>":
            break

    return "".join(pieces)

5. Experimentation & Hyperparameters Tuning

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import *

#from models import Seq2Seq
#from train_test import train, test
#from dataloader import load_data, collate_train, collate_test, transform_letter_to_index, Speech2TextDataset

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Output vocabulary; a symbol's index is its position in the list.
LETTER_LIST = ['<pad>', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', \
               'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']


num_workers = 8

# Larger model than the first setup cell (hidden_dim 256 instead of 128);
# train() moves it to DEVICE.
model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=256)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Summed loss; train/validate divide by the number of real tokens themselves.
criterion = nn.CrossEntropyLoss(reduction="sum")

nepochs = 5
batch_size = 64 if DEVICE == 'cuda' else 1

speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()
character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST)
character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST)

train_dataset = Speech2TextDataset(speech_train, character_text_train)
val_dataset = Speech2TextDataset(speech_valid, character_text_valid)
#test_dataset = Speech2TextDataset(speech_test, None, False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train, num_workers=num_workers, pin_memory=True)
# The validation loader must wrap the held-out split, not the training set.
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_train, num_workers=num_workers, pin_memory=True)
#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_test)

for epoch in range(nepochs):
    train(model, train_loader, criterion, optimizer, epoch)
    #validate(model, val_loader, criterion, epoch)
    #test(model, test_loader, epoch)



train loss: 17.00709823674495
train loss: 15.394348236232592
train loss: 14.332326030404971
train loss: 15.121068106772192
train loss: 12.640741847185106
train loss: 13.819287402122088
train loss: 12.464684294110953
Final train loss: 13.945602991835639  time  231.45280361175537
train loss: 12.588386553344746
train loss: 13.816480539028127
train loss: 11.616576762669258
train loss: 10.889751865892002
train loss: 12.411844856411065
train loss: 9.92036306082684


In [None]:
# Rebuild the training loader and run another `nepochs` epochs on the existing
# `model` / `optimizer` objects (they are not re-created in this cell).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train, num_workers=num_workers, pin_memory=True)

for epoch in range(nepochs):
    train(model, train_loader, criterion, optimizer, epoch)
    #validate(model, val_loader, criterion, epoch)
    #test(model, test_loader, epoch)

train loss: 1.759333787160323
train loss: 1.6763551579308869
train loss: 1.5657001193814077
train loss: 1.6667683127466184
train loss: 1.605856964612205
train loss: 1.5511158716512314
train loss: 1.6149366511171361
Final train loss: 1.577437406905322  time  223.89031076431274
train loss: 1.3657468433123616
train loss: 1.3949106173972616
train loss: 1.6054574934937846
train loss: 1.5044234193045785
train loss: 1.535917056467221
train loss: 1.390476629617859
train loss: 1.363380909139994
Final train loss: 1.3767778782118323  time  223.50807547569275
train loss: 1.394049063232078
train loss: 1.4014144737452805
train loss: 1.4020095900418819
train loss: 1.3721424469694579
train loss: 1.3920035988654336
train loss: 1.3896072949908211
train loss: 1.3091295486083028
Final train loss: 1.3299465137965367  time  223.92248606681824
train loss: 1.2407753650129008
train loss: 1.3191328889527583
train loss: 1.2682780067313142
train loss: 1.2815157627010112
train loss: 1.301445767971392
train loss: 1

In [None]:
# Decode the validation set one utterance at a time (greedySearch decodes a single sequence).
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, collate_fn=collate_train, num_workers=num_workers, pin_memory=True)

# Inference only: switch to eval mode and skip graph construction for the decoding loop.
model.eval()
with torch.no_grad():
    for x, y, x_lens, y_lens in val_loader:
        x = x.to(DEVICE)
        predictions = model(x, x_lens, text_input = None, isTrain=False)
        ans = greedySearch(predictions)
        print(ans)

WE'K END CROWDS AT THE PRINTS WILL IN COUNTY STORE HER AVERAGENING FIFTY THOUSAND PERIOD<eos>
AND OBVIOUSLY IF THE DOLLAR GOES DOWN AND U. S. DINTEREST RATES GO UP THEY WILL LOSE THEIR SHARES<eos>
BUT DAY AND BOUS WERE THROUGH ANALYST L. CREATE CARVERS SAYS SATHER STRATEGY WAS LEAVE PILLSBARY VILLONORIEVILLER ERRE VOLUMERIER OILERIEVING<eos>
DOUBLE-QUOTE CURRENT F. D. A. REGULATIONS ACTED AS A BARRIER TO THE MARKET FOR THE MARKET FOR THE LITTLEGUISE COMMA DOUBLE-QUOTE SAYS ONE BY OUTTEC COMPANY EXECUTIVE PERIOD<eos>
THAT USE ESPECIALLY WORRIESEMENTLY TO THE MARKETS APPARENT SINCE ATTIVITY TO EVEN'S SMOLL DOWN WORK PRESSURES ON THE CURRENCY<eos>
CLIPSES OF THE SUN AND MOON HAVE LONG MADE A DEEP IN PRESSION ON HUMANKIND<eos>
THE CLIPPESSES OF THE SUN AT MOON HAVE LONG MADE A DEEP IMPRESSION ON HUMANKIND<eos>
THE FUNDRILL PROVIDE FINANCING FOR UNTRIPPENERS TO ESTABLISHED SHOULD BUSINESS PLAN A SEMBLEMANAGEMENT TEAME AND PRODUCE A PRODARTIP PRODUCTIVE PRODUCTIVE PRODUCTIVE PRODUCTIVE PRODU

KeyboardInterrupt: ignored