In [None]:
# Mount Google Drive so that the files under /content/drive/MyDrive used by the
# later cells (data archive, pickles, checkpoint) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
import pickle
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torchaudio
import math
import random
import numpy as np
import io
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

Setup / Configs

In [None]:
# Fail fast: the training cells call .cuda() on the model and on every batch,
# so the notebook cannot run on a CPU-only runtime.
assert torch.cuda.is_available(), 'GPU unavailable'

In [None]:
# Report how many CUDA devices PyTorch can see on this runtime.
print('Number of GPUs available : ', torch.cuda.device_count())

Number of GPUs available :  1


In [None]:
# File that training state is resumed from (torch.load) and periodically written to (torch.save).
MODEL_CHECKPOINT = '/content/drive/MyDrive/Fall2021-CAPSTONE/checkpoint.pt'

In [None]:
# Handle on the zip archive with the data splits; CustomDataset reads members from it via zip.read().
# NOTE(review): the name `zip` shadows Python's built-in zip() for the rest of the notebook.
# Renaming it also requires updating CustomDataset, which looks this global up by name.
zip = zipfile.ZipFile('/content/drive/MyDrive/Fall2021-CAPSTONE/2-DataPreparation/DataSplits_bkp.zip')

In [None]:
# Number of worker processes used by the DataLoader
workers = 2

# Number of examples per mini-batch during training
batch_size = 32

# Number of train_list entries read into memory at a time (one CustomDataset per "load")
load_size = 1280

# Number of training epochs
num_epochs = 20

# Learning rate passed to the Adam optimizer
lr = 0.0006

# Number of GPUs available. Use 0 for CPU mode.
# NOTE(review): not referenced by any cell visible here; the training cells call .cuda() unconditionally.
ngpu = 1

Vocabulary

In [None]:
# Character-level vocabulary: the three special tokens come first
# (<blank> at index 0, <pad> at 1, <unk> at 2), followed by space and a-z.
chars = ' abcdefghijklmnopqrstuvwxyz'
vocab = ['<blank>', '<pad>', '<unk>'] + list(chars)

In [None]:
# Size of the output alphabet; used as the Decoder's default out_features.
VOCAB_LEN = len(vocab)
# Sanity check: show the vocabulary size and the first four entries
# (the three special tokens and the space character).
print('Vocab size: {} (with BLNK, PAD, UNK and SPACE added)'.format(VOCAB_LEN))
print('vocab[0]:', vocab[0])
print('vocab[1]:', vocab[1])
print('vocab[2]:', vocab[2])
print('vocab[3]:', vocab[3])

Vocab size: 30 (with BLNK, PAD, UNK and SPACE added)
vocab[0]: <blank>
vocab[1]: <pad>
vocab[2]: <unk>
vocab[3]:  


In [None]:
# Load the precomputed maximum target length. It sets the width of the padded target
# tensor in encode_targets and is also the Decoder's default n_hidden.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load pickles you created.
with open('/content/drive/MyDrive/Fall2021-CAPSTONE/max_len.pickle', 'rb') as f2:
  MAX_LEN = pickle.load(f2)
print('Max target length : ', MAX_LEN)

Max target length :  105


Encoder

In [None]:
class Encoder(nn.Module):
  """Turns raw audio file contents into a padded batch of mel-spectrogram features.

  Wraps a torchaudio MelSpectrogram transform configured for 16 kHz audio and
  80 mel bins.
  """

  def __init__(self):
    super().__init__()
    self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80)

  def forward(self, input_X):
    """
    Decode each clip, compute its spectrogram and pad the batch to a common length.

    Inputs:
    - input_X: An iterable of bytes objects, each holding the contents of one
      audio file that torchaudio.load can decode.

    Returns:
    - A tensor stacking the per-clip (frames, n_mels) spectrograms via
      pad_sequence, with shorter clips padded with the value 1.0.
    - A list with the unpadded number of frames of each clip.
    """
    spectrograms = []
    for clip_bytes in input_X:
      # The sample rate reported by the file is discarded; the transform was
      # built for 16 kHz. NOTE(review): presumably all clips are 16 kHz -- confirm.
      waveform, _ = torchaudio.load(io.BytesIO(clip_bytes))
      # squeeze(0) only removes a size-1 leading (channel) axis, so this assumes
      # mono audio -- TODO confirm. The transpose puts frames on the first axis.
      spectrograms.append(self.spec(waveform).squeeze(0).transpose(1, 0))

    frame_counts = [features.shape[0] for features in spectrograms]
    padded = pad_sequence(spectrograms, padding_value=1.)
    return padded, frame_counts

Decoder

In [None]:
class OverLastDim(nn.Module):
    """
    Applies a wrapped module over the last dimension of an n-dimensional tensor.

    The input of shape (s_1, ..., s_n-1, s_n) is flattened to
    (s_1*...*s_n-1, s_n), passed through the wrapped module -- which may change
    the size of the final dimension to s_n' -- and the result is reshaped back
    to (s_1, ..., s_n-1, s_n').
    """

    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        # Split the shape into the leading dims (restored afterwards) and the
        # feature dim (left to the wrapped module).
        *leading, _ = x.size()

        # Number of rows once every leading dim is merged; 1 when there are none.
        rows = torch.Size(leading).numel()

        flat_out = self.module(x.view(rows, -1))
        return flat_out.view(*leading, -1)

In [None]:
class RNNWrapper(nn.Module):
    """Bias-free recurrent layer with optional input batch-norm and summed directions."""

    def __init__(self, input_size, hidden_size, rnn_type=nn.GRU,
                 bidirectional=True, batch_norm=True):
        """Builds the recurrent layer without bias parameters.

        When `batch_norm` is true, a BatchNorm1d over the input features is
        applied first, with statistics computed over all time steps. When
        `bidirectional` is true, the forward and backward outputs are summed, so
        the output width is `hidden_size` either way.
        """
        super().__init__()
        # The attribute is created only on request; forward() checks for its presence.
        if batch_norm:
            self.batch_norm = OverLastDim(nn.BatchNorm1d(input_size))
        self.bidirectional = bidirectional
        self.rnn = rnn_type(
            input_size=input_size,
            hidden_size=hidden_size,
            bidirectional=bidirectional,
            bias=False,
        )

    def forward(self, x):
        normaliser = getattr(self, 'batch_norm', None)
        if normaliser is not None:
            x = normaliser(x)

        outputs, _ = self.rnn(x)
        if not self.bidirectional:
            return outputs

        # TxNx(H*2) -> TxNx2xH, then add the two directions -> TxNxH.
        steps, batch, _ = outputs.size()
        summed = outputs.view(steps, batch, 2, -1).sum(dim=2)
        return summed.view(steps, batch, -1)

In [None]:
class Decoder(nn.Module):
  """Acoustic model: 2 conv layers -> stacked bidirectional GRUs -> linear layer, trained with CTC.

  Inputs are (seq_len, batch, in_features) audio features. `forward` returns the
  CTC loss for training and `predict` returns greedily decoded text.
  """

  def __init__(self, in_features=80, n_hidden=MAX_LEN, out_features=VOCAB_LEN, rnn_layers=3, relu_clip=20.):
    """
    Inputs:
    - in_features: Number of features per audio frame (mel bins).
    - n_hidden: Hidden size of every recurrent layer.
    - out_features: Number of output classes (vocabulary size, blank included).
    - rnn_layers: Number of stacked recurrent layers.
    - relu_clip: Upper bound of the clipped-ReLU (Hardtanh) activations.
    """
    super().__init__()

    conv_channels = 32

    # CONVOLUTIONAL layers
    self.conv = nn.Sequential(
        nn.Conv2d(in_channels=1,
                  out_channels=conv_channels,
                  kernel_size=5,
                  stride=1,
                  padding='same'),
        nn.BatchNorm2d(conv_channels),
        nn.Hardtanh(0, relu_clip, inplace=True),
        nn.Conv2d(in_channels=conv_channels,
                  out_channels=conv_channels,
                  kernel_size=5,
                  stride=1,
                  padding='same'),
        nn.BatchNorm2d(conv_channels),
        nn.Hardtanh(0, relu_clip, inplace=True)
    )

    # RECURRENT layers
    # With stride 1 and 'same' padding the convolutions keep the feature axis at
    # in_features, so flattening channels x features gives each frame
    # conv_channels * in_features values (2560 with the defaults). Deriving the
    # size keeps the in_features argument meaningful instead of silently ignored.
    rnn_in_size = conv_channels * in_features
    rnns = OrderedDict()
    for i in range(rnn_layers):
      # The first recurrent layer is fed straight from the batch-normalised conv
      # stack, so only the later layers get their own input batch-norm.
      rnns[str(i)] = RNNWrapper(input_size=rnn_in_size,
                                hidden_size=n_hidden,
                                rnn_type=nn.GRU,
                                bidirectional=True,
                                batch_norm=i > 0)
      rnn_in_size = n_hidden
    self.rnns = nn.Sequential(rnns)

    # FULLY CONNECTED layers
    fully_connected = nn.Sequential(
        nn.BatchNorm1d(n_hidden),
        nn.Linear(n_hidden, out_features, bias=False)
    )
    self.fc = OverLastDim(fully_connected)

  def _log_probs(self, X_in):
    """Shared network pass: (seq_len, batch, in_features) features -> (seq_len, batch, out_features) log-probabilities."""
    x = X_in.permute(1, 2, 0).unsqueeze(1)    # TxNxH -> NxHxT -> Nx1xHxT
    x = self.conv(x)

    N, C, H, T = x.size()
    x = x.view(N, C * H, T).permute(2, 0, 1)  # NxCxHxT -> Nx(C*H)xT -> TxNx(C*H)
    x = self.rnns(x.contiguous())

    return nn.functional.log_softmax(self.fc(x), dim=2)

  # for training
  def forward(self, X_in, X_len, Y_in, Y_len):
    """
    Perform token prediction and compute loss over training set.

    Inputs:
    - X_in: A tensor of shape (seq_len, batch, in_features)
      containing a mini-batch of audio sequence features padded to seq_len.
    - X_len: A sequence of length batch containing the
      actual lengths of the audio sequences (each <= seq_len).
    - Y_in: A tensor of shape (batch, max_seq_len)
      containing a mini-batch of text targets padded to max_seq_len.
      Each element in the target sequence is an index in the vocabulary.
      And the target index cannot be blank (index=0 in vocab).
    - Y_len: A sequence of length batch containing the
      actual lengths of the targets (each <= max_seq_len).

    Returns:
    - loss: A PyTorch scalar containing the CTC loss for the mini-batch.
    """
    logprobs = self._log_probs(X_in)

    # CTCLoss uses class 0 as the blank by default, which is '<blank>' in vocab.
    # zero_infinity=True zeroes infinite losses (inputs too short for their target).
    ctc_loss = nn.CTCLoss(zero_infinity=True)
    return ctc_loss(logprobs, Y_in, X_len, Y_len)

  # for inference
  def predict(self, X_in):
    """
    Perform token prediction over validation/test set.

    Uses greedy CTC decoding: take the most likely class per frame, collapse
    consecutive repeats, then drop '<blank>' and '<pad>' symbols.

    Inputs:
    - X_in: A tensor of shape (seq_len, batch, in_features)
      containing a mini-batch of audio features padded to seq_len.

    Returns:
    - text: A list of length batch_size
      containing text output for the given batch.

    Note: the module's train/eval mode is not changed here; call `eval()` first
    so the batch-norm layers use their running statistics. Padded frames are
    decoded as well because no input lengths are passed in.
    """
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
      logprobs = self._log_probs(X_in)
      _, max_indices = logprobs.float().max(2)

    batch_sentences = []
    # One row per example after the transpose; plain ints are cheaper to walk than tensors.
    for path in max_indices.t().tolist():
      no_dups = [idx for pos, idx in enumerate(path)
                 if pos == 0 or idx != path[pos - 1]]

      symbols = [vocab[s] for s in no_dups]

      no_blanks = [s for s in symbols if (s != vocab[0] and s != vocab[1])]
      batch_sentences.append(''.join(no_blanks))
    return batch_sentences

Training Setup

In [None]:
class CustomDataset(Dataset):
    """In-memory dataset of (audio bytes, text bytes) pairs read from the module-level `zip` archive."""

    def __init__(self, listOfFiles):
      '''
      Takes as input the list of archive member paths holding X-audio and Y-text data.
      Every '.wav' member is read into the member variable X and every '.txt'
      member into the member variable Y, each in the order it appears in the list.
      Audio and text are paired purely by position in those two lists.
      '''
      self.X = [zip.read(member) for member in listOfFiles if member.endswith('.wav')]
      self.Y = [zip.read(member) for member in listOfFiles if member.endswith('.txt')]

    def __len__(self):
      # The dataset size is defined by the number of text entries.
      return len(self.Y)

    def __getitem__(self, index):
      '''
      Returns the (X, Y) pair stored at the given position.
      '''
      return self.X[index], self.Y[index]

In [None]:
def encode_targets(Y_batch):
  """
  Convert a batch of transcripts into a padded tensor of vocabulary indices.

  Inputs:
  - Y_batch: A sequence of transcripts, one per example. Each is a bytes object
    (as read from the archive) or a str. Text is lower-cased before lookup.

  Returns:
  - text: An int64 tensor of shape (len(Y_batch), MAX_LEN). Row i holds the
    vocabulary index of every symbol of transcript i, followed by the '<pad>'
    index. Symbols not in the vocabulary map to the '<unk>' index.
  - sent_lengths: A tuple with the unpadded length of every transcript.

  Raises:
  - IndexError: if a transcript is longer than MAX_LEN.
  """
  pad_idx = vocab.index('<pad>')
  unk_idx = vocab.index('<unk>')

  # One dict lookup per character instead of two linear scans of the vocab list.
  # setdefault keeps the first index of a symbol, matching list.index().
  symbol_to_idx = {}
  for idx, symbol in enumerate(vocab):
    symbol_to_idx.setdefault(symbol, idx)

  text = torch.full((len(Y_batch), MAX_LEN), pad_idx, dtype=torch.int64)
  sent_lengths = []
  for sent_idx, sent in enumerate(Y_batch):
    if len(sent) > MAX_LEN:
      raise IndexError('target {} has length {} which exceeds MAX_LEN={}'.format(
          sent_idx, len(sent), MAX_LEN))
    sent_lengths.append(len(sent))

    # Iterating bytes yields ints (hence chr); iterating str yields characters.
    indices = [
        symbol_to_idx.get(symbol if isinstance(symbol, str) else chr(symbol), unk_idx)
        for symbol in sent.lower()
    ]
    if indices:
      # Write the whole row at once rather than one element at a time.
      text[sent_idx, :len(indices)] = torch.tensor(indices, dtype=torch.int64)
  return text, tuple(sent_lengths)

Training Loop

In [None]:
# Note : Just a feature extractor (no trainable params).
enc = Encoder()

In [None]:
# Build the trainable model with its default hyper-parameters; the bare `dec`
# expression makes the notebook display its layer summary.
dec = Decoder()
dec

Decoder(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=same)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Hardtanh(min_val=0, max_val=20.0, inplace=True)
    (3): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=same)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): Hardtanh(min_val=0, max_val=20.0, inplace=True)
  )
  (rnns): Sequential(
    (0): RNNWrapper(
      (rnn): GRU(2560, 105, bias=False, bidirectional=True)
    )
    (1): RNNWrapper(
      (batch_norm): OverLastDim(
        (module): BatchNorm1d(105, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (rnn): GRU(105, 105, bias=False, bidirectional=True)
    )
    (2): RNNWrapper(
      (batch_norm): OverLastDim(
        (module): BatchNorm1d(105, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (rnn): GRU(105, 105, bias=

In [None]:
# Training state for a fresh run; the checkpoint-restore cell overwrites these
# values when a checkpoint is available.
loss_history = []   # one loss value is appended per completed epoch

epoch = 0
load_index = 0      # which load_size-sized slice of train_list to process next
batch_index = 0     # which batch of the current load to process next
loss = None         # most recent training loss (unset until a batch has run)

In [None]:
# Resume from a previous run when a checkpoint file exists; otherwise leave
# `checkpoint` as None so training starts from scratch.
checkpoint = None
try:
  # Load onto the CPU first; the model is moved to the GPU in a later cell.
  checkpoint = torch.load(MODEL_CHECKPOINT, map_location=torch.device('cpu'))
except FileNotFoundError as e:
  # Expected on the very first run: nothing has been saved yet.
  # Any other failure (corrupt or unreadable file) is deliberately not caught:
  # silently restarting would let the training loop overwrite the checkpoint
  # with a freshly initialised model.
  print(e)

In [None]:
# Restore the training counters from the checkpoint. Any key that is missing
# leaves the value initialised earlier untouched.
if checkpoint:
  epoch = checkpoint.get('epoch', epoch)
  load_index = checkpoint.get('load_index', load_index)
  batch_index = checkpoint.get('batch_index', batch_index)
  loss = checkpoint.get('loss', loss)

  # The history is stored as one comma-separated string; an empty string means
  # no epoch had completed when the checkpoint was written.
  saved_history = checkpoint.get('loss_history', '')
  if len(saved_history) > 0:
    loss_history = [float(value) for value in saved_history.split(',')]

In [None]:
# `checkpoint` is None when no checkpoint could be loaded (e.g. the first run),
# so test for that before probing its keys.
if checkpoint is not None and 'model_state_dict' in checkpoint:
  dec.load_state_dict(checkpoint['model_state_dict'])
# Move the model to the GPU before creating the optimizer, so the optimizer is
# built over (and its restored state matched to) the CUDA parameters.
dec.cuda()

optimizer = optim.Adam(dec.parameters(), lr=lr)
if checkpoint is not None and 'optimizer_state_dict' in checkpoint:
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# Load the list of training file paths. The training loop slices it into chunks of
# load_size entries, and CustomDataset reads each entry from the zip archive.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load pickles you created.
with open('/content/drive/MyDrive/Fall2021-CAPSTONE/train_list.pickle', 'rb') as f3:
  train_list = pickle.load(f3)
print('Total training load : ', len(train_list))

Total training load :  856210


In [None]:
# Resumable training loop. `epoch`, `load_index` and `batch_index` start from the
# values restored from the checkpoint (or 0), so an interrupted run continues
# from the last load that was saved.
print("Starting Training Loop...")
dec.train()
# For each epoch
while epoch < num_epochs:
  # The training list is processed in "loads" of load_size entries so that only
  # one slice of the archive is held in memory at a time.
  while load_index < math.ceil(len(train_list)/load_size):
    # create a dataset object
    dset = CustomDataset(train_list[load_index*load_size:(load_index*load_size)+load_size])
    # wrap it around a dataloader
    # (shuffle is not set, so batches come in a fixed order; the skip below relies on that)
    data_loader = DataLoader(dset, batch_size = batch_size, num_workers = workers)

    for idx, ( X_batch, Y_batch) in enumerate(data_loader):
      # Skip batches that were already processed before an interruption.
      # NOTE(review): checkpoints are only written after batch_index has been reset
      # to 0 below, so with checkpoints produced by this loop nothing is ever skipped.
      if idx < batch_index:
        continue
      # forward propagation
      dec.zero_grad()
      Y_in, Y_len = encode_targets(Y_batch)
      # feature extraction runs on the CPU; tensors are moved to the GPU for the model
      X_in, X_len = enc(X_batch)
      
      loss = dec(X_in.cuda(), X_len, Y_in.cuda(), Y_len)
      # loss = dec(X_in, X_len, Y_in, Y_len)

      # backward propagation
      loss.backward()

      # parameter updates
      optimizer.step()

      batch_index += 1
    
    # Load finished: reset the batch counter, advance to the next slice, checkpoint.
    batch_index = 0
    load_index += 1
    with torch.no_grad():
      # Only the loss of the last batch of the load is reported.
      # NOTE(review): if the load produced no batches and no loss was restored,
      # `loss` is presumably still None here and loss.item() would fail -- confirm.
      print('Train Epoch: {:3} \t Load: {:3} \t Loss: {:F}'.format(epoch, load_index, loss.item()))
      # The model is moved to the CPU before its state_dict is saved and back to the GPU afterwards.
      dec.cpu()
      torch.save({
          'epoch': epoch,
          'model_state_dict': dec.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'load_index': load_index,
          'batch_index': batch_index,
          # NOTE(review): this stores the loss tensor itself rather than loss.item().
          'loss': loss,
          # serialised as one comma-separated string; parsed back when the checkpoint is restored
          'loss_history': ','.join([str(x) for x in loss_history])
          }, MODEL_CHECKPOINT)
      dec.cuda()
  
  # Epoch finished: restart from the first load and record the last batch loss
  # as this epoch's entry in the history.
  load_index = 0
  epoch += 1
  with torch.no_grad():
    loss_history.append(loss.item())

Starting Training Loop...
