# **Author** : Adwoa Asantewaa Bremang 
# **Project**: Attention-based End-to-End Speech-to-Text Deep Neural Network

# LAS(Listen, attend and spell) MODEL

---
https://arxiv.org/pdf/1508.01211.pdf

The project involves predicting the text transcript of a spoken sentence, given utterances and their respective transcripts as training data.

**Output**

The trained attention model was used to predict the test dataset, achieving an average Levenshtein distance of about 20 after running for a total of 100 epochs.

The training dataset has 28539 entries; the utterances vary in length, and every time step has 40 frequency features. Each entry has shape **(utterance time steps, frequencies (40))**.

The validation dataset had frame length of 2703.

The test dataset had frame length of 2620.



In [2]:
from google.colab import drive
#drive.mount('/content/gdrive')


In [2]:
!pip install python-Levenshtein



In [6]:
import numpy as np
import torch
from torch.utils.data import Dataset 
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.utils as utils
import torch.nn.functional as F
import pandas as pd
import time
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import Levenshtein
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
#DEVICE = 'cpu'

In [1]:
# Output vocabulary of the speller: index 0 is the padding symbol, followed by
# the 26 lowercase letters, six punctuation/space symbols and finally the
# start-of-sequence and end-of-sequence markers.
LETTER_LIST = (
    ['<pad>']
    + list('abcdefghijklmnopqrstuvwxyz')
    + ['-', "'", '.', '_', '+', ' ']
    + ['<sos>', '<eos>']
)

In [16]:
def load_data(data_dir='.'):
    '''
    Load the speech features and transcripts from ``.npy`` files.

    :param data_dir: Directory containing ``train.npy``, ``dev.npy``,
        ``test.npy``, ``train_transcripts.npy`` and ``dev_transcripts.npy``.
        Defaults to the current directory, which is where the original
        hard-coded paths pointed.
    :return: Tuple ``(speech_train, speech_valid, speech_test,
        transcript_train, transcript_valid)`` of the loaded arrays.
    '''
    def _load(filename):
        # allow_pickle / encoding='bytes' are kept exactly as in the original
        # calls. NOTE(review): allow_pickle=True executes pickled objects, so
        # only load files from a trusted source.
        return np.load(f'{data_dir}/{filename}', allow_pickle=True, encoding='bytes')

    speech_train = _load('train.npy')
    speech_valid = _load('dev.npy')
    speech_test = _load('test.npy')

    transcript_train = _load('train_transcripts.npy')
    transcript_valid = _load('dev_transcripts.npy')

    return speech_train, speech_valid, speech_test, transcript_train, transcript_valid

In [None]:
# Load the train/dev/test utterances and the train/dev transcripts into memory.
speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()

In [9]:
def collate_train(batch_data):
    '''
    Collate (utterance, transcript) pairs into padded batch tensors.

    :param batch_data: Iterable of ``(X, Y)`` pairs of tensors.
    :return: ``((x_out, x_length), (y_out, y_length))`` where ``x_out`` is the
        utterances padded time-major (``batch_first=False``), ``y_out`` is the
        transcripts padded batch-major (``batch_first=True``), and the length
        lists hold the unpadded sizes in batch order.
    '''
    utterances, transcripts = [], []
    for utterance, transcript in batch_data:
        utterances.append(utterance)
        transcripts.append(transcript)

    # Record the true lengths before padding hides them.
    utterance_lengths = [utterance.shape[0] for utterance in utterances]
    transcript_lengths = [len(transcript) for transcript in transcripts]

    padded_utterances = pad_sequence(utterances, batch_first=False)
    padded_transcripts = pad_sequence(transcripts, batch_first=True)

    return (padded_utterances, utterance_lengths), (padded_transcripts, transcript_lengths)

In [10]:
def collate_test(batch_data):
    '''
    Collate unlabeled utterances into one padded batch tensor.

    :param batch_data: Iterable of utterance tensors.
    :return: ``(x_out, x_length)`` where ``x_out`` is padded time-major
        (``batch_first=False``) and ``x_length`` lists the unpadded lengths.
    '''
    utterances = list(batch_data)
    utterance_lengths = [utterance.shape[0] for utterance in utterances]
    padded_utterances = pad_sequence(utterances, batch_first=False)
    return (padded_utterances, utterance_lengths)

In [10]:
# Transforms alphabetical input to numerical input: every letter is replaced
# by its corresponding index in letter_list.
def transform_letter_to_index(transcript, letter_list):
    '''
    Encode each transcript sentence as a 1-D long tensor of vocabulary indices.

    Each sentence is a sequence of words (``bytes`` or ``str``). Every word is
    followed by a space index (including the last word, as in the original
    implementation) and the whole sentence is wrapped in ``<sos>`` / ``<eos>``.

    The input is not modified, so the function can safely be called again on
    the same raw transcripts.

    :param transcript :(N, ) Transcripts are the text input
    :param letter_list: Letter list defined above
    :return letter_to_index_list: Returns a list with one index tensor per
        transcript sentence
    :raises ValueError: if a character is not present in ``letter_list``
    '''
    sos_index = letter_list.index('<sos>')
    eos_index = letter_list.index('<eos>')
    space_index = letter_list.index(' ')

    letter_to_index_list = []
    for sentence in transcript:
        label = [sos_index]
        for word in sentence:
            # Iterating over raw bytes yields integers, which are never found
            # in letter_list, so the characters must come from decoded text.
            text = word.decode("utf-8") if isinstance(word, bytes) else str(word)
            label.extend(letter_list.index(char) for char in text)
            label.append(space_index)
        label.append(eos_index)
        letter_to_index_list.append(torch.tensor(label, dtype=torch.long))
    return letter_to_index_list


In [None]:

def transform_index_to_letters(transcript, letter_list):
    '''
    Decode sequences of vocabulary indices back into strings.

    :param transcript: Iterable of index sequences (one per sentence)
    :param letter_list: Letter list defined above
    :return: List with one decoded string per input sequence
    '''
    sentences = []
    for indices in transcript:
        letters = []
        for position in range(len(indices)):
            letters.append(letter_list[int(indices[position])])
        sentences.append(''.join(letters))
    return sentences

In [None]:
# Optional helper: create dictionaries for the letter2index and index2letter
# transformations.
def create_dictionaries(letter_list):
    '''
    Build forward and reverse lookup tables for the vocabulary.

    :param letter_list: Sequence of vocabulary symbols; the position of a
        symbol is its index.
    :return: ``(letter2index, index2letter)`` dictionaries
    '''
    letter2index = {letter: index for index, letter in enumerate(letter_list)}
    index2letter = dict(enumerate(letter_list))
    return letter2index, index2letter

In [7]:
# NOTE(review): this repeats the load_data() call made in an earlier cell and
# re-reads every .npy file from disk.
speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()

In [None]:
# Encode the training and validation transcripts as vocabulary-index tensors.
character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST)
character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST)

In [None]:
class Speech2TextDataset(Dataset):
    '''
    Dataset pairing speech utterances with (optional) encoded transcripts.

    :param speech: Indexable collection of per-utterance numeric arrays
    :param text: Indexable collection of encoded transcripts, or None for
        unlabeled data
    :param isTrain: When True, ``__getitem__`` returns ``(speech, text)``;
        otherwise only the speech tensor is returned
    '''
    def __init__(self, speech, text=None, isTrain=True):
        self.speech = speech
        self.isTrain = isTrain
        # Always define the attribute so a missing transcript is reported
        # clearly in __getitem__ instead of surfacing as an AttributeError.
        self.text = text

    def __len__(self):
        # len() works for numpy arrays as well as plain Python sequences.
        return len(self.speech)

    def __getitem__(self, index):
        # astype() returns a fresh float32 array, so from_numpy does not alias
        # the stored data.
        speech = torch.from_numpy(np.asarray(self.speech[index]).astype(np.float32))
        if not self.isTrain:
            return speech

        if self.text is None:
            raise ValueError('Speech2TextDataset with isTrain=True requires text')
        label = self.text[index]
        if torch.is_tensor(label):
            # Copy without the "torch.tensor(tensor)" copy-construct warning.
            label = label.clone().detach()
        else:
            label = torch.tensor(label)
        return speech, label

## Encoder --> listener

---
The encoder is the listener of the model. It uses a pyramidal structure of bidirectional LSTM (Long Short-Term Memory) RNNs.

**Problem**
 Because utterances (speech inputs) are very long, the AttendAndSpell (decoder) operation cannot extract the relevant information from such a large number of time steps. Therefore, using a plain LSTM as the encoder leads to slower convergence and inferior results, even after extensive training.


**Solution**

A pyramidal bidirectional LSTM (pBLSTM) model was implemented, in which each successive stacked pBLSTM layer reduces the time resolution by a factor of 2.
This allows the attention model to extract the relevant information from a smaller number of time steps.


**Outputs from the pblstm**

An example input to the encoder is torch.Size([998, 1, 40]): ([sequence length, batch size, frequencies]). The LSTM transforms it with a hidden size of 256.

The first output of pblstm --> torch.Size([569, 1, 256])

The second output of pblstm --> torch.Size([284, 1, 256])

The third output of pblstm --> torch.Size([142, 1, 256])

It is observed that the sequence length is halved at each stage, while the batch size and the hidden size remain the same.




    Pyramidal BiLSTM
    The length of an utterance (speech input) can be hundreds to thousands of frames long.
    The Paper reports that a direct LSTM implementation as Encoder resulted in slow convergence,
    and inferior results even after extensive training.
    The major reason is inability of AttendAndSpell operation to extract relevant information
    from a large number of input steps.

In [None]:
class pBLSTM(nn.Module):
    '''
    One pyramidal bidirectional LSTM stage: runs a BiLSTM over a packed
    sequence and then merges every two consecutive time steps into one,
    halving the time resolution and doubling the feature dimension.
    '''

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim,
                             num_layers=1, bidirectional=True)

    def forward(self, x, hidden=None):
        '''
        :param x: Packed sequence fed to the BiLSTM
        :param hidden: Optional initial (h, c) state for the BiLSTM
        :return: ``(output, hidden, lens)`` where ``output`` is the padded,
            time-major tensor with half the time steps and twice the features
            of the BiLSTM output, ``hidden`` is the final BiLSTM state and
            ``lens`` holds the halved sequence lengths
        '''
        packed_out, hidden = self.blstm(x, hidden)
        padded, lens = torch.nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=False)

        # Work batch-major so neighbouring time steps are adjacent in memory
        # order when reshaping.
        batch_major = padded.permute(1, 0, 2)
        batch, steps, features = batch_major.shape

        # An odd number of time steps cannot be paired up: drop the last frame.
        if steps % 2 == 1:
            steps -= 1
            batch_major = batch_major[:, :-1, :]

        merged = batch_major.reshape(batch, steps // 2, features * 2)
        time_major = merged.permute(1, 0, 2)

        halved_lens = lens // 2

        return time_major, hidden, halved_lens

In [None]:
class Encoder(nn.Module):
    '''
    Encoder takes the utterances as inputs and returns the key and value.
    Key and value are linear projections of the output of the pBLSTM stack.

    :param input_dim: Feature dimension of each input frame
    :param hidden_dim: Hidden size of the first BiLSTM. The pBLSTM stages and
        projections are sized for an even ``hidden_dim``.
    :param value_size: Output dimension of the value projection
    :param key_size: Output dimension of the key projection
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128, key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        # Each pBLSTM receives 2*hidden_dim features and, after merging pairs
        # of time steps, emits 2 * (2 * hidden_dim//2) features again.
        self.pblstm1 = pBLSTM(hidden_dim * 2, hidden_dim // 2)
        self.pblstm2 = pBLSTM(hidden_dim * 2, hidden_dim // 2)
        self.pblstm3 = pBLSTM(hidden_dim * 2, hidden_dim // 2)

        # The key projection uses key_size and the value projection uses
        # value_size; they were previously swapped, which only went unnoticed
        # because both default to 128.
        self.key_network = nn.Linear(hidden_dim * 2, key_size)
        self.value_network = nn.Linear(hidden_dim * 2, value_size)

    def forward(self, x, lens):
        '''
        :param x: Padded, time-major batch of utterances (``batch_first=False``)
        :param lens: Unpadded length of every utterance in the batch
        :return: ``(keys, value, lens)`` where ``keys`` and ``value`` are the
            time-major projections of the last pBLSTM output and ``lens`` are
            the sequence lengths after the three pyramidal reductions
        '''
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.lstm(rnn_inp)

        # Each stage halves the time resolution; its output has to be
        # re-packed with the halved lengths before entering the next stage.
        # The final state of one stage is used as the initial state of the next.
        outputs, hidden, lens = self.pblstm1(outputs)
        outputs = utils.rnn.pack_padded_sequence(outputs.float(), lens, batch_first=False, enforce_sorted=False)
        outputs, hidden, lens = self.pblstm2(outputs, hidden)
        outputs = utils.rnn.pack_padded_sequence(outputs.float(), lens, batch_first=False, enforce_sorted=False)
        outputs, hidden, lens = self.pblstm3(outputs, hidden)

        keys = self.key_network(outputs)
        value = self.value_network(outputs)

        return keys, value, lens

##Attention ->attend

# Understanding specific functions 
**pack_padded_sequence** --> this removes the padding applied to variable-length sequences and packs them into a single packed tensor.
https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html

**pad_packed_sequence** --> this undoes the changes made by pack_padded_sequence. 
https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_packed_sequence.html

**torch.bmm** --> this computes a batch matrix-matrix product.
https://pytorch.org/docs/stable/generated/torch.bmm.html

**embeddings** --> https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

In [None]:
class Attention(nn.Module):
    '''
    Attention is calculated using key, value and query from Encoder and decoder.
    Below are the set of operations performed for computing attention:
        energy = bmm(key, query)
        attention = softmax(energy)
        context = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''
        :param query :(batch_size, hidden_size) Query is the output of LSTMCell from Decoder
        :param keys: (batch_size, max_len, encoder_size) Key Projection from Encoder
        :param values: (batch_size, max_len, encoder_size) Value Projection from Encoder
        :param lens: (batch_size, ) Number of valid (unpadded) encoder steps per batch item
        :return context: (batch_size, encoder_size) Attended Context
        :return attention_mask: (batch_size, max_len) Attention mask that can be plotted
        '''
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)

        # Positions at or beyond an item's true length are padding. They get a
        # very negative energy so the softmax gives them (numerically) zero
        # weight and attention focuses only on real data. The mask is built on
        # the energy's own device, so no global device constant is needed.
        lens = torch.as_tensor(lens, device=energy.device)
        positions = torch.arange(key.size(1), device=energy.device)
        mask = positions.unsqueeze(0) >= lens.unsqueeze(1)
        energy = energy.masked_fill(mask, -1e9)
        attention = nn.functional.softmax(energy, dim=1)

        out = torch.bmm(attention.unsqueeze(1), value).squeeze(1)

        return out, attention

In [None]:

def plot_attn_flow(attn_mask, path):
    '''
    Save an attention map as an image using the 'hot' colour map.

    :param attn_mask: 2-D array-like of attention weights to render
    :param path: Destination file path of the image
    :return: The ``matplotlib.pyplot`` module (as in the original interface)
    '''
    colour_map = 'hot'
    plt.imsave(path, attn_mask, cmap=colour_map)
    return plt

## Decoder --> Speller

---

The Decoder is an attention based RNN

**teacher forcing** --> this involves occasionally passing the ground-truth character to the decoder, to limit the propagation of wrong predictions made at previous steps.

**Gumbel noise** --> this introduces randomness into the prediction made for a particular input state. 
For example, the word "the" can be followed by "boy" but also by "school". Gumbel noise helps the model learn such variations.

In [None]:
class Decoder(nn.Module):
    '''
    Attention-based speller. Each iteration of the decoding loop deals with
    just one time step, thus LSTMCell is used instead of LSTM.
    The output of the second LSTMCell is used as the query for the attention
    module, and the resulting context is fed to the next step and to the
    character classifier.
    Gumbel noise and teacher forcing are used during training.

    :param vocab_size: Number of output symbols
    :param hidden_dim: Embedding size and hidden size of the first LSTMCell
    :param value_size: Dimension of the encoder value projection
    :param key_size: Dimension of the encoder key projection (and of the query)
    :param isAttended: Whether to create the attention module.
        NOTE(review): forward() always uses attention, so it fails when this
        is False - confirm whether a non-attended mode is needed.
    :param sos_index: Vocabulary index of the start-of-sequence symbol
    '''
    # Maximum number of characters generated when no transcript is supplied.
    MAX_DECODE_STEPS = 600

    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True, sos_index=33):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)
        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)
        self.query_network = nn.Linear(hidden_dim, key_size)
        self.sos_index = sos_index

    def forward(self, key, values, lens, tf, text=None, isTrain=True):
        '''
        :param key :(T, N, key_size) Output of the Encoder Key projection layer
        :param values: (T, N, value_size) Output of the Encoder Value projection layer
        :param lens: (N, ) Valid encoder lengths, used to mask the attention
        :param tf: Probability of feeding the ground-truth character
            (teacher forcing) at a given training step
        :param text: (N, text_len) Batch input of text with text_length
        :param isTrain: Train or eval mode
        :return predictions: (N, steps, vocab_size) character scores
        :return attention: (steps, valid encoder length) attention weights of
            the first batch item, for plotting
        '''
        batch_size = key.shape[1]

        if isTrain:
            max_len = text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = self.MAX_DECODE_STEPS

        # The attention module expects batch-major keys/values; the
        # permutation does not depend on the step, so do it once.
        key_ = key.permute(1, 0, 2)
        values_ = values.permute(1, 0, 2)

        # Explicit <sos> tokens for the first step. Taking argmax over a
        # (N, 1) tensor filled with the <sos> index always yields 0, i.e.
        # <pad>, so the start symbol has to be fed directly.
        sos_tokens = torch.full((batch_size,), self.sos_index, dtype=torch.long, device=key.device)

        predictions = []
        hidden_states = [None, None]
        prediction = None
        output_att = None
        att_mask = []

        for i in range(max_len):
            if isTrain and np.random.random() < tf:
                # Teacher forcing: feed the ground-truth character.
                char_embed = embeddings[:, i, :]
            elif i == 0:
                char_embed = self.embedding(sos_tokens)
            elif isTrain:
                # Sample the previous prediction with Gumbel noise.
                char_embed = self.embedding(F.gumbel_softmax(prediction, tau=1).argmax(dim=-1))
            else:
                # Greedy decoding from the previous step's scores.
                char_embed = self.embedding(prediction.argmax(dim=-1))

            if i == 0:
                # No LSTM output exists yet, so the first context is queried
                # from a projection of the first character embedding.
                query = self.query_network(char_embed)
                output_att, _ = self.attention(query, key_, values_, lens)

            inp = torch.cat([char_embed, output_att], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])

            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2, hidden_states[1])

            # The output of the second LSTMCell is the attention query.
            output = hidden_states[1][0]
            output_att, att = self.attention(output, key_, values_, lens)

            prediction = self.character_prob(torch.cat([output, output_att], dim=1))
            predictions.append(prediction.unsqueeze(1))
            att_mask.append(att.detach().cpu().numpy())

        att_mask = np.array(att_mask)
        # Keep only the first batch item, cropped to its valid encoder length.
        res = att_mask[:, 0, :int(lens[0])]

        return torch.cat(predictions, dim=1), res

In [None]:
class Seq2Seq(nn.Module):
    '''
    End-to-end sequence to sequence model comprising of Encoder and Decoder.
    This is simply a wrapper "model" for the encoder and decoder.

    :param input_dim: Feature dimension of the speech frames
    :param vocab_size: Number of output symbols
    :param tf: Teacher-forcing probability handed to the decoder
    :param hidden_dim: Hidden size shared by encoder and decoder
    :param value_size: Dimension of the value projection
    :param key_size: Dimension of the key projection
    :param isAttended: Accepted for interface compatibility. It is not
        forwarded because Decoder.forward always calls its attention module.
    '''
    def __init__(self, input_dim, vocab_size, tf, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Seq2Seq, self).__init__()
        # Forward the projection sizes; previously they were accepted here
        # but silently ignored in favour of the sub-modules' defaults.
        self.encoder = Encoder(input_dim, hidden_dim, value_size=value_size, key_size=key_size)
        self.decoder = Decoder(vocab_size, hidden_dim, value_size=value_size, key_size=key_size)
        self.tf = tf

    def forward(self, speech_input, speech_len, text_input=None, isTrain=False):
        '''
        :param speech_input: Padded, time-major batch of utterances
        :param speech_len: Unpadded length of every utterance
        :param text_input: (N, text_len) ground-truth input characters,
            required when ``isTrain`` is True
        :param isTrain: Train (teacher forcing / Gumbel sampling) or eval mode
        :return: ``(predictions, attention)`` as produced by the decoder
        '''
        key, value, lens = self.encoder(speech_input, speech_len)

        if isTrain:
            predictions, att = self.decoder(key, value, lens, self.tf, text_input)
        else:
            predictions, att = self.decoder(key, value, lens, self.tf, text=None, isTrain=False)

        return predictions, att


In [None]:
def edit_distance_cal(pred, target):
    '''
    Mean Levenshtein distance between paired predictions and targets.

    Previously only the distance of the last compared pair was returned (and
    an empty input raised an UnboundLocalError), so the reported validation
    score reflected a single sentence per batch.

    :param pred: Sequence of predicted strings
    :param target: Sequence of reference strings; extra items on either side
        are ignored
    :return: Average distance over the compared pairs, 0.0 if there are none
    '''
    distances = [Levenshtein.distance(p, t) for p, t in zip(pred, target)]
    if not distances:
        return 0.0
    return sum(distances) / len(distances)

In [None]:
def decode(outputs, eos_token, state='pred'):
    '''
    Cut every sequence of a batch at its first end-of-sequence token.

    :param outputs: For ``state == 'pred'`` a (N, T, vocab) tensor of
        character scores; otherwise a (N, T) tensor of token indices
    :param eos_token: Index of the end-of-sequence symbol
    :param state: 'pred' to take the arg-max over the vocabulary first, any
        other value to use ``outputs`` as indices directly
    :return: List of N 1-D index tensors, each excluding the ``<eos>`` token
        and everything after it
    '''
    if state == 'pred':
        # Softmax is monotonic, so the arg-max of the raw scores is the same.
        preds = torch.argmax(outputs, dim=2)
    else:
        preds = outputs

    pred_list = []
    for row in preds:
        eos_positions = (row == eos_token).nonzero()
        if eos_positions.nelement() == 0:
            # No <eos> was produced: keep the whole sequence rather than
            # silently dropping its last token.
            end = len(row)
        else:
            end = int(eos_positions[0])
        pred_list.append(row[:end])
    return pred_list

In [None]:
def translate(outputs, vocab, state):
    '''
    Turn model outputs (or target indices) into text.

    :param outputs: Batch handed to ``decode``
    :param vocab: Vocabulary list; must contain '<eos>'
    :param state: Passed through to ``decode`` ('pred' for score tensors)
    :return: List with one decoded string per batch item
    '''
    token_rows = decode(outputs, vocab.index('<eos>'), state)
    return transform_index_to_letters(token_rows, vocab)

## Training

In [None]:
def train(model, train_loader, criterion, optimizer, epoch):
    '''
    Run one training epoch with a length-masked loss.

    :param model: Seq2Seq model called as ``model(x, x_len, text, isTrain=True)``
    :param train_loader: Loader yielding ``((x, x_len), (y, y_len))`` batches
    :param criterion: Per-element loss (constructed with ``reduction='none'``)
    :param optimizer: Optimizer updating the model parameters
    :param epoch: Current epoch number (unused, kept for the call signature)
    :return: ``(average normalized loss per batch, attention map of the last
        batch)``; the attention map is None when the loader is empty
    '''
    model.train()
    model.to(DEVICE)
    cumm_loss = 0.0
    num_batches = 0
    att_mask = None

    for inputs, target in train_loader:
        optimizer.zero_grad()

        input_data, input_length = inputs
        input_data = input_data.to(DEVICE)
        # Lengths stay on the CPU, where they were created.
        input_length = torch.LongTensor(input_length).to(torch.int)

        target_data, target_length = target
        target_data = target_data.to(DEVICE)

        # The decoder input drops the last token and the expected output
        # drops the first (<sos>), so every usable length shrinks by one.
        target_in = target_data[:, :-1]
        target_out = target_data[:, 1:]
        out_length = torch.as_tensor(target_length, device=DEVICE) - 1

        # 1 for real characters, 0 for padding positions.
        steps = torch.arange(target_in.size(1), device=DEVICE)
        mask = (steps.unsqueeze(0) < out_length.unsqueeze(1)).float()

        predicted, att_mask = model(input_data, input_length, target_in, isTrain=True)

        # Flatten to (N*T, vocab) / (N*T,) for the per-element criterion.
        predicted = predicted.contiguous().view(-1, predicted.size(-1))
        flat_mask = mask.view(-1)
        train_loss = criterion(predicted, target_out.contiguous().view(-1))
        mask_loss = torch.sum(train_loss * flat_mask)

        mask_loss.backward()
        # clip_grad_norm_ is the in-place replacement of the deprecated
        # clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()

        # Report the loss normalized by the number of real characters.
        cumm_loss += float(mask_loss.item()) / int(torch.sum(flat_mask).item())
        num_batches += 1

    # Average over the number of batches (not the last batch index, which is
    # one too small and zero for a single-batch loader).
    avg_loss = cumm_loss / max(num_batches, 1)
    print('Training loss', avg_loss)
    return avg_loss, att_mask
      

# Validation

In [None]:
def valid(model, train_loader, criterion, optimizer, epoch):
    '''
    Evaluate the model and report the average edit distance per batch.

    :param model: Seq2Seq model called as ``model(x, x_len, isTrain=False)``
    :param train_loader: Loader yielding ``((x, x_len), (y, y_len))`` batches
    :param criterion: Unused, kept for the call signature
    :param optimizer: Unused, kept for the call signature (no gradients are
        computed during validation, so nothing is zeroed or clipped)
    :param epoch: Unused, kept for the call signature
    :return: Sum of the per-batch edit distances divided by the number of
        batches (0 for an empty loader)
    '''
    model.eval()
    model.to(DEVICE)
    dis = 0
    with torch.no_grad():
        for inputs, target in train_loader:
            input_data, input_length = inputs
            input_data = input_data.to(DEVICE)
            input_length = torch.LongTensor(input_length).to(torch.int)

            target_data, _ = target
            # Drop the leading <sos>; the text ends at <eos> when translated.
            target_out = target_data[:, 1:]

            predicted, _ = model(input_data, input_length, isTrain=False)

            target_words = translate(target_out.cpu(), LETTER_LIST, 'tar')
            predicted_words = translate(predicted.cpu(), LETTER_LIST, 'pred')

            dis += edit_distance_cal(predicted_words, target_words)

    dis /= max(len(train_loader), 1)
    print('disloss', dis)
    return dis
      


# Test

In [None]:
def test(model, train_loader):
    '''
    Predict transcripts for unlabeled utterances.

    :param model: Seq2Seq model called as ``model(x, x_len, isTrain=False)``
    :param train_loader: Loader yielding ``(x, x_len)`` batches
    :return: 1-D numpy array with one predicted string per utterance, in
        loader order (empty array for an empty loader)
    '''
    model.to(DEVICE)
    # Inference only: evaluation mode and no autograd bookkeeping. The
    # function no longer touches a global optimizer or anomaly detection.
    model.eval()
    words = []
    with torch.no_grad():
        for inputs in train_loader:
            input_data, input_length = inputs
            input_data = input_data.to(DEVICE)
            input_length = torch.LongTensor(input_length).to(torch.int)

            predicted, _ = model(input_data, input_length, isTrain=False)
            predicted_words = translate(predicted.cpu(), LETTER_LIST, state='pred')
            words.append(predicted_words)

    if not words:
        return np.array([])
    return np.concatenate(words)


## Main

In [None]:
  def init_weights(layer):
      '''
      Weight initialiser intended for ``model.apply``.

      Linear layers get Xavier-uniform weights. Every weight matrix of LSTM
      and LSTMCell layers is drawn from U(-0.1, 0.1), including the
      reverse-direction matrices of bidirectional LSTMs. Biases are left
      untouched.

      :param layer: Module visited by ``nn.Module.apply``
      '''
      if isinstance(layer, nn.Linear):
          torch.nn.init.xavier_uniform_(layer.weight, gain=1.0)
      elif isinstance(layer, (nn.LSTM, nn.LSTMCell)):
          for name, param in layer.named_parameters():
              # Covers weight_ih / weight_hh of cells and weight_*_l0 as well
              # as weight_*_l0_reverse of (bidirectional) LSTMs.
              if name.startswith('weight'):
                  torch.nn.init.uniform_(param, a=-0.1, b=0.1)
           

In [None]:
# Labeled datasets for training/validation and an unlabeled one for testing.
train_dataset = Speech2TextDataset(speech_train, text=character_text_train)
valid_dataset = Speech2TextDataset(speech_valid, text=character_text_valid, isTrain=True)
test_dataset = Speech2TextDataset(speech_test, text=None, isTrain=False)

# Full batches on the GPU, single utterances otherwise.
batch_size = 64 if DEVICE == 'cuda' else 1

# Only the training data is shuffled; the test loader uses the collate
# function without transcripts.
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train,
)
valid_loader = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_train,
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_test,
)


In [None]:
# Teacher-forcing probability handed to the decoder.
tf = 0.25

# 40 input features per frame; one output class per vocabulary symbol.
model = Seq2Seq(
    input_dim=40,
    vocab_size=len(LETTER_LIST),
    tf=tf,
    hidden_dim=128,
)
model.apply(init_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Per-element loss so that padding positions can be masked out afterwards.
criterion = nn.CrossEntropyLoss(reduction='none')

nepochs = 25
attn_mask = []

In [None]:
# Validation distance of the previous epoch (large sentinel before the first).
val = 10000.0
for epoch in range(nepochs):
    loss, attn_mask = train(model, train_loader, criterion, optimizer, epoch)

    # Save the attention map of the last training batch every fifth epoch.
    if (epoch + 1) % 5 == 0:
        path = 'attention' + str(epoch) + '.png'
        plot_attn_flow(attn_mask, path)

    dis = valid(model, valid_loader, criterion, optimizer, epoch)
    # Warn when the validation distance got worse than in the previous epoch.
    # NOTE(review): the comparison is against the previous epoch, not the best
    # one, and no checkpoint is written here although a later cell loads
    # 'savedmodel.pth' - confirm where that file is produced.
    if dis > val:
        print('model overfitting')
    val = dis

In [None]:
# Restore trained weights. map_location remaps the stored tensors to the
# active device, so a checkpoint saved on a GPU also loads on a CPU-only machine.
state_dict = torch.load('savedmodel.pth', map_location=DEVICE)
model.load_state_dict(state_dict)


In [None]:
# Predict a transcript for every utterance of the test loader.
predict = test(model,test_loader)

In [None]:
# Write the predictions to CSV with a running id per utterance.
store_c = np.arange(len(predict))
data = dict(id=store_c, label=predict)
df = pd.DataFrame(data)
df.to_csv("data5.csv", index=False)

In [None]:
# Sanity check: number of predicted transcripts.
print(len(predict))