# How to run code?
Run the cells in order, from top to bottom.

# Data Loading Scheme
The AudioDataset class sorts the MFCC and transcript file names so that they pair up, then loads them from the directory. Each MFCC is cepstrally normalized by subtracting its per-coefficient mean and dividing by its standard deviation. For each transcript, the [SOS] and [EOS] tokens are removed and the remaining phonemes are converted into integer indices.

# Architectures tried
1. Encoder: 1 Conv1d + 1 pBLSTM \
   Decoder: \
            torch.nn.Dropout(0.2), \
            torch.nn.Linear(embed_size, 1024), \
            torch.nn.GELU(),\
            PermuteBlock(), torch.nn.BatchNorm1d(1024), PermuteBlock(),\
            torch.nn.Dropout(0.2),\
            torch.nn.Linear(1024, 1024),\
            torch.nn.GELU(),\
            PermuteBlock(), torch.nn.BatchNorm1d(1024), PermuteBlock(),\
            torch.nn.Dropout(0.2),\
            torch.nn.Linear(1024, 1024),\
            torch.nn.GELU(),\
            PermuteBlock(), torch.nn.BatchNorm1d(1024), PermuteBlock(),\
            torch.nn.Dropout(0.2),\
            torch.nn.Linear(1024, output_size),

   Beam Width: 2 \
   LR: 2e-3 \
   Epochs: 50 \
   Batch Size: 64 \
   Transforms: torchaudio.transforms.FrequencyMasking(freq_mask_param=10), torchaudio.transforms.TimeMasking(time_mask_param=80), \
   Scheduler: torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.75, patience=1, min_lr=1e-4)

2. Encoder: 2 Conv1d + 2 pBLSTM + LockedDropout\
   Decoder: \
            torch.nn.Linear(embed_size, 2000),\
            torch.nn.GELU(),\
            PermuteBlock(), torch.nn.BatchNorm1d(2000), PermuteBlock(),\
            torch.nn.Dropout(0.2),\
            torch.nn.Linear(2000, 1000),\
            torch.nn.GELU(),\
            PermuteBlock(), torch.nn.BatchNorm1d(1000), PermuteBlock(),\
            torch.nn.Dropout(0.2),\
            torch.nn.Linear(1000, output_size),

   Beam Width: 5 \
   LR: 2e-3 \
   Epochs: 50 \
   Batch Size: 128 \
   Transforms: torchaudio.transforms.FrequencyMasking(freq_mask_param=10), torchaudio.transforms.TimeMasking(time_mask_param=80), \
   Scheduler: torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.75, patience=1, min_lr=1e-4)



# Installs

In [None]:
%pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchtext==0.14.1 torchaudio==0.13.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m575.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.3/24.3 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25h


This may take a while

In [None]:
!pip install wandb --quiet
!pip install python-Levenshtein -q
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget -q
%cd ctcdecode
!pip install . -q
%cd ..

!pip install torchsummaryX -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.5/263.5 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCloning into 'ctcdecode'...
remote: Enumerating objects: 1102, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 1102 (delta 16), reused 32 (delta 14), pack-reused 1063[K
Receiving objects: 100

In [None]:
'''
If torchsummaryX doesn't work, please run this cell. Alternatively, please refer to Piazza post @209 for more assistance:
'''

!pip install torchsummaryx==1.3.0



# Imports

In [None]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torchaudio.transforms as tat

from sklearn.metrics import accuracy_score
import gc

import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime

# imports for decoding and distance calculation
import ctcdecode
import Levenshtein
from ctcdecode import CTCBeamDecoder

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


# Kaggle Setup

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8 -q
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"","key":""}')

!chmod 600 /root/.kaggle/kaggle.json

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone


In [None]:
!kaggle competitions download -c hw3p2asr-s24

Downloading hw3p2asr-s24.zip to /content
100% 3.72G/3.74G [00:25<00:00, 192MB/s]
100% 3.74G/3.74G [00:26<00:00, 149MB/s]


In [None]:
'''
This will take a couple minutes, but you should see at least the following:
11-785-s24-hw3p2  ctcdecode  hw3p2asr-s24.zip  sample_data
'''
!unzip -q hw3p2asr-s24.zip
!ls

11-785-s24-hw3p2  ctcdecode  hw3p2asr-s24.zip  sample_data


# Dataset and Dataloader

In [None]:
# ARPABET PHONEME MAPPING
# DO NOT CHANGE

# Maps each CMUdict phoneme (key) to the short ARPAbet label (value) that is used
# when prediction and target strings are built. Insertion order matters: the
# position of a phoneme in this dict is its integer class index, so "" is index 0.
CMUdict_ARPAbet = {
    "" : " ",
    "[SIL]": "-", "NG": "G", "F" : "f", "M" : "m", "AE": "@",
    "R"    : "r", "UW": "u", "N" : "n", "IY": "i", "AW": "W",
    "V"    : "v", "UH": "U", "OW": "o", "AA": "a", "ER": "R",
    "HH"   : "h", "Z" : "z", "K" : "k", "CH": "C", "W" : "w",
    "EY"   : "e", "ZH": "Z", "T" : "t", "EH": "E", "Y" : "y",
    "AH"   : "A", "B" : "b", "P" : "p", "TH": "T", "DH": "D",
    "AO"   : "c", "G" : "g", "L" : "l", "JH": "j", "OY": "O",
    "SH"   : "S", "D" : "d", "AY": "Y", "S" : "s", "IH": "I",
    "[SOS]": "[SOS]", "[EOS]": "[EOS]"
}

# Parallel lists: CMUdict[i] is the phoneme whose label is ARPAbet[i].
CMUdict = list(CMUdict_ARPAbet.keys())
ARPAbet = list(CMUdict_ARPAbet.values())


# Drop the last two entries ([SOS] and [EOS]) so they are not part of the label set.
PHONEMES = CMUdict[:-2]
LABELS = ARPAbet[:-2]

### Train Data

In [None]:
class AudioDataset(torch.utils.data.Dataset):
    """In-memory dataset of normalized MFCC utterances paired with phoneme-index transcripts.

    Every file of one partition is loaded and pre-processed in ``__init__`` so that
    ``__getitem__`` only has to wrap the cached data in tensors.
    """

    def __init__(self, root='/content/11-785-s24-hw3p2/', phonemes = PHONEMES, partition= "train-clean-100"):
        """Load, normalize and index all utterances of ``partition``.

        Args:
            root: Dataset root directory that contains one sub-directory per partition.
            phonemes: Ordered phoneme vocabulary; the position of a phoneme is its
                integer class index.
            partition: Name of the partition sub-directory; it must contain an
                ``mfcc`` and a ``transcript`` directory.

        Raises:
            ValueError: If the MFCC and transcript file counts differ, or if a
                transcript contains a symbol that is not in ``phonemes``.
        """
        # os.path.join accepts `root` with or without a trailing separator; the final
        # '' keeps both directory attributes ending in a separator, as before.
        self.mfcc_dir = os.path.join(root, partition, 'mfcc', '')
        self.transcript_dir = os.path.join(root, partition, 'transcript', '')

        self.mfccs = []
        self.transcripts = []

        # Use the vocabulary that was passed in rather than the module-level global.
        self.PHONEMES = phonemes

        # First-occurrence index of every phoneme (same result as list.index, without
        # a linear scan of the vocabulary for every transcript symbol).
        phoneme_to_index = {}
        for index, phoneme in enumerate(self.PHONEMES):
            phoneme_to_index.setdefault(phoneme, index)

        # Both listings are sorted so that the i-th MFCC pairs with the i-th transcript.
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))
        transcript_names    = sorted(os.listdir(self.transcript_dir))
        if len(mfcc_names) != len(transcript_names):
            raise ValueError(
                "Found {} MFCC files but {} transcript files in partition '{}'".format(
                    len(mfcc_names), len(transcript_names), partition))

        for mfcc_name, transcript_name in zip(mfcc_names, transcript_names):
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))
            # Cepstral normalization per utterance: statistics are taken along axis 0;
            # the small epsilon guards against division by a zero standard deviation.
            mfcc = (mfcc - np.mean(mfcc, axis=0)) / (np.std(mfcc, axis=0) + 1e-5)

            transcript = np.load(os.path.join(self.transcript_dir, transcript_name))
            # Strip the first and last token ([SOS] / [EOS]).
            transcript = transcript[1:-1]
            try:
                transcript = [phoneme_to_index[p] for p in transcript]
            except KeyError as err:
                raise ValueError(
                    "Unknown phoneme {} in transcript '{}'".format(err, transcript_name)) from err

            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)

        # One dataset item per utterance.
        self.length = len(self.mfccs)

    def __len__(self):
        """Return the number of utterances in the partition."""
        return self.length

    def __getitem__(self, ind):
        """Return ``(mfcc, transcript)`` for utterance ``ind``.

        ``mfcc`` is a float tensor holding the normalized features; ``transcript`` is
        an integer (long) tensor of phoneme indices, since the labels are class ids.
        """
        mfcc = torch.FloatTensor(self.mfccs[ind])
        transcript = torch.LongTensor(self.transcripts[ind])

        return mfcc, transcript


    def collate_fn(self,batch):
        """Pad a list of ``(mfcc, transcript)`` items into batch tensors.

        Returns:
            ``(padded_mfccs, padded_transcripts, mfcc_lengths, transcript_lengths)``.
            Padding is zero and batch-first; the length tensors hold the unpadded
            size of every item so that downstream code can ignore the padding.
        """
        batch_mfcc = [mfcc for mfcc, _ in batch]
        batch_transcript = [transcript for _, transcript in batch]
        lengths_mfcc = [len(mfcc) for mfcc in batch_mfcc]
        lengths_transcript = [len(transcript) for transcript in batch_transcript]

        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True)
        batch_transcript_pad = pad_sequence(batch_transcript, batch_first=True)

        return batch_mfcc_pad, batch_transcript_pad, torch.tensor(lengths_mfcc), torch.tensor(lengths_transcript)



### Test Data

In [None]:
# Test Dataloader
class AudioDatasetTest(torch.utils.data.Dataset):
    """In-memory dataset of normalized MFCC utterances without transcripts (inference only).

    Every MFCC file of one partition is loaded and normalized in ``__init__`` so that
    ``__getitem__`` only has to wrap the cached array in a tensor.
    """

    def __init__(self, root='/content/11-785-s24-hw3p2/', phonemes = PHONEMES, partition= "train-clean-100"):
        """Load and normalize all utterances of ``partition``.

        Args:
            root: Dataset root directory that contains one sub-directory per partition.
            phonemes: Ordered phoneme vocabulary; stored on the instance but not
                needed to load unlabelled data.
            partition: Name of the partition sub-directory; it must contain an
                ``mfcc`` directory.
        """
        # os.path.join accepts `root` with or without a trailing separator; the final
        # '' keeps the directory attribute ending in a separator, as before.
        self.mfcc_dir = os.path.join(root, partition, 'mfcc', '')

        self.mfccs = []

        # Use the vocabulary that was passed in rather than the module-level global.
        self.PHONEMES = phonemes

        # Sorted so that the utterance order is deterministic.
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))

        for mfcc_name in mfcc_names:
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))
            # Cepstral normalization per utterance: statistics are taken along axis 0;
            # the small epsilon guards against division by a zero standard deviation.
            mfcc = (mfcc - np.mean(mfcc, axis=0)) / (np.std(mfcc, axis=0) + 1e-5)

            self.mfccs.append(mfcc)

        # One dataset item per utterance.
        self.length = len(self.mfccs)

    def __len__(self):
        """Return the number of utterances in the partition."""
        return self.length

    def __getitem__(self, ind):
        """Return the normalized MFCC features of utterance ``ind`` as a float tensor."""
        mfcc = torch.FloatTensor(self.mfccs[ind])

        return mfcc


    def collate_fn(self,batch):
        """Pad a list of MFCC tensors into one batch tensor.

        Returns:
            ``(padded_mfccs, mfcc_lengths)``. Padding is zero and batch-first; the
            length tensor holds the unpadded size of every item.
        """
        batch_mfcc = list(batch)
        lengths_mfcc = [len(mfcc) for mfcc in batch_mfcc]

        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True)

        return batch_mfcc_pad, torch.tensor(lengths_mfcc)



### Config - Hyperparameters

In [None]:
# Root directory of the unzipped competition data (one sub-directory per partition).
root = '/content/11-785-s24-hw3p2/'

# Hyperparameters read by the data loaders (batch_size), the optimizer (lr),
# the CTC beam decoder (beam_width) and logged to wandb as the run config.
config = {
    "beam_width" : 5,
    "lr"         : 2e-3,
    "epochs"     : 50,
    "batch_size" : 128  # Increase if your device can handle it
}

# You may pass this as a parameter to the dataset class above
# This will help modularize your implementation
transforms = [] # set of transformations (left empty here)

### Data loaders

In [None]:
# get me RAMMM!!!!
import gc
gc.collect()

0

In [None]:
# Create objects for the dataset class
# Use the `root` defined in the config cell instead of repeating the path literal,
# so the data location only has to be changed in one place.
train_data = AudioDataset(root=root, phonemes=PHONEMES, partition="train-clean-100")
val_data = AudioDataset(root=root, phonemes=PHONEMES, partition="dev-clean")


In [None]:
# Unlabelled test partition, read from the `root` defined in the config cell
# instead of a repeated path literal.
test_data = AudioDatasetTest(root=root, phonemes=PHONEMES, partition="test-clean")

In [None]:
# Test batches keep the dataset order (no shuffling).
test_loader = DataLoader(
    test_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=4,
    collate_fn=test_data.collate_fn,
    pin_memory=True,
)

In [None]:
# Do NOT forget to pass in the collate function as parameter while creating the dataloader
# Training batches are shuffled; validation batches keep the dataset order.
train_loader = DataLoader(
    train_data,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=4,
    collate_fn=train_data.collate_fn,
    pin_memory=True,
)

val_loader = DataLoader(
    val_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=4,
    collate_fn=val_data.collate_fn,
    pin_memory=True,
)

# Report sample and batch counts for each split.
print("Batch size: ", config['batch_size'])
print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size:  128
Train dataset samples = 28539, batches = 223
Val dataset samples = 2703, batches = 22
Test dataset samples = 2620, batches = 21


In [None]:
# sanity check
# Pull a single batch and print the shapes produced by the collate function:
# padded features, padded labels, and the two per-sample length vectors.
# NOTE: x and lx stay in the notebook namespace and are reused by the
# model-summary cell further down, so this cell has to run before it.
for data in train_loader:
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, ly.shape)
    break

torch.Size([128, 1673, 27]) torch.Size([128, 220]) torch.Size([128]) torch.Size([128])


# NETWORK

## ASR Network

### Pyramid Bi-LSTM (pBLSTM)

In [None]:
# Utils for network
torch.cuda.empty_cache()

class PermuteBlock(torch.nn.Module):
    """Swap dimensions 1 and 2 of the input so it can be used inside ``Sequential``."""

    def forward(self, x):
        return torch.transpose(x, 1, 2)

In [None]:
class pBLSTM(torch.nn.Module):

    '''
    Pyramidal BiLSTM layer.

    Each call:
    1. Unpacks the incoming PackedSequence into a padded batch-first tensor.
    2. Halves the time dimension by concatenating every pair of consecutive
       frames along the feature dimension (a trailing odd frame is dropped)
       and halves the sequence lengths accordingly.
    3. Re-packs the shortened sequences.
    4. Runs them through a single bidirectional LSTM layer.

    The output is a PackedSequence whose feature size is 2 * hidden_size.
    '''

    def __init__(self, input_size, hidden_size):
        '''
        input_size:  feature size of the frames BEFORE pairing; the LSTM itself
                     receives 2 * input_size features per step.
        hidden_size: hidden size of each LSTM direction.
        '''
        super(pBLSTM, self).__init__()

        # No `dropout` argument here: nn.LSTM only applies dropout between stacked
        # layers, so with num_layers=1 it had no effect and merely raised a warning.
        self.blstm = nn.LSTM(2*input_size, hidden_size, num_layers=1, bidirectional=True, batch_first=True)

    def forward(self, x_packed): # x_packed is a PackedSequence
        '''Downsample the packed input in time by 2 and return the packed BiLSTM output.'''
        x, x_lens = pad_packed_sequence(x_packed, batch_first=True)
        x, x_lens = self.trunc_reshape(x, x_lens)
        # enforce_sorted=False because the batch is not ordered by length.
        x = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
        # Only the per-step outputs are used; the final hidden/cell states are dropped.
        x, _ = self.blstm(x)

        return x

    def trunc_reshape(self, x, x_lens):
        '''
        Merge consecutive frame pairs of a batch-first tensor.

        x:      tensor of shape (batch, time, features).
        x_lens: per-sample sequence lengths.

        Returns the tensor reshaped to (batch, time // 2, 2 * features) together
        with the floor-halved lengths.
        '''
        # Keep an even number of timesteps; a trailing odd frame is discarded.
        even_steps = (x.shape[1] // 2) * 2
        x = x[:, :even_steps, :]
        # Row-major reshape places frame 2t and frame 2t+1 side by side in the features.
        x = x.reshape(x.shape[0], even_steps // 2, x.shape[2] * 2)
        x_lens = x_lens // 2
        return x, x_lens

### Encoder

In [None]:
class LockedDropout(nn.Module):
    """Dropout for PackedSequence inputs that reuses one mask for every timestep.

    Adapted from:
    https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/lock_dropout.html
    """

    def __init__(self, p=0.4):
        super().__init__()
        # Probability of zeroing a feature.
        self.p = p

    def forward(self, x):
        """Return ``x`` unchanged in eval mode or when ``p`` is 0; otherwise a masked PackedSequence."""
        if self.training and self.p:
            padded, lengths = pad_packed_sequence(x, batch_first=True)
            keep_prob = 1 - self.p
            # One mask value per (sample, feature), broadcast along the time axis;
            # dividing by keep_prob rescales the surviving activations.
            mask = padded.new_empty(padded.size(0), 1, padded.size(2), requires_grad=False)
            mask = mask.bernoulli_(keep_prob).div_(keep_prob)
            padded = padded * mask.expand_as(padded)
            return pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
        return x

In [None]:
class Encoder(torch.nn.Module):
    '''
    Turns a padded batch of utterances into latent feature sequences.

    A two-layer Conv1d embedding (kernel 3, stride 1, padding 1) lifts the input
    features to 256 channels; two pBLSTM layers, each followed by LockedDropout,
    then downsample the time axis.
    '''
    def __init__(self, input_size, encoder_hidden_size):
        '''
        input_size:          number of features per input frame.
        encoder_hidden_size: hidden size of each direction of every pBLSTM.
        '''
        super(Encoder, self).__init__()

        # Conv1d / BatchNorm1d work on (batch, channels, time), so the stack is
        # wrapped in two PermuteBlocks to accept and return (batch, time, channels).
        embedding_layers = [
            PermuteBlock(),
            nn.Conv1d(in_channels=input_size, out_channels=128, kernel_size=3, padding=1, stride=1),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=1),
            nn.BatchNorm1d(256),
            PermuteBlock(),
        ]
        self.embedding = nn.Sequential(*embedding_layers)

        # The first pBLSTM consumes the 256-channel embedding; the second consumes
        # the bidirectional output of the first (2 * encoder_hidden_size features).
        # Locked dropout reference:
        # https://github.com/salesforce/awd-lstm-lm/blob/dfd3cb0235d2caf2847a4d53e1cbd495b781b5d2/locked_dropout.py#L5
        recurrent_layers = [
            pBLSTM(input_size=256, hidden_size=encoder_hidden_size),
            LockedDropout(),
            pBLSTM(input_size=2*encoder_hidden_size, hidden_size=encoder_hidden_size),
            LockedDropout(),
        ]
        self.pBLSTMs = nn.Sequential(*recurrent_layers)

    def forward(self, x, x_lens):
        '''
        x:      padded batch-first features coming from the dataloader.
        x_lens: unpadded length of every sequence in the batch.

        Returns the padded encoder outputs and their (downsampled) lengths.
        '''
        embedded = self.embedding(x)
        # Pack so that the recurrent layers skip the padded frames.
        packed = pack_padded_sequence(embedded, x_lens, batch_first=True, enforce_sorted=False)
        packed = self.pBLSTMs(packed)
        encoder_outputs, encoder_lens = pad_packed_sequence(packed, batch_first=True)

        return encoder_outputs, encoder_lens

### Decoder

In [None]:
class Decoder(torch.nn.Module):
    """Per-frame MLP classifier that maps encoder features to log-probabilities."""

    def __init__(self, embed_size, output_size= 41):
        """
        embed_size:  feature size of the encoder output fed to the MLP.
        output_size: number of output classes per frame.
        """
        super().__init__()

        # BatchNorm1d normalizes over dimension 1, so each one sits between two
        # PermuteBlocks that move the feature axis there and back.
        layers = [PermuteBlock(), nn.BatchNorm1d(embed_size), PermuteBlock()]
        in_features = embed_size
        for width in (2000, 1000):
            layers += [
                nn.Linear(in_features, width),
                nn.GELU(),
                PermuteBlock(), nn.BatchNorm1d(width), PermuteBlock(),
                nn.Dropout(0.2),
            ]
            in_features = width
        layers.append(nn.Linear(in_features, output_size))
        self.mlp = nn.Sequential(*layers)

        # Log-probabilities over the class axis (dim 2 of a batch-first output).
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, encoder_out):
        """Return log-softmax scores for every frame of ``encoder_out``."""
        logits = self.mlp(encoder_out)
        return self.softmax(logits)

In [None]:
import torchaudio

In [None]:
class ASRModel(torch.nn.Module):
    """Full speech-recognition network: train-time masking, Encoder, then Decoder."""

    def __init__(self, input_size, embed_size= 192, output_size= len(PHONEMES)):
        """
        input_size:  number of features per input frame.
        embed_size:  hidden size of each encoder LSTM direction; the decoder
                     receives 2 * embed_size features.
        output_size: number of output classes per frame.
        """
        super().__init__()

        # The masking transforms run between two PermuteBlocks, i.e. on the input
        # with dimensions 1 and 2 swapped; the second permute restores the layout.
        masking = [
            PermuteBlock(),
            tat.FrequencyMasking(freq_mask_param=10),
            tat.TimeMasking(time_mask_param=80),
            PermuteBlock(),
        ]
        self.augmentations  = torch.nn.Sequential(*masking)
        self.encoder        = Encoder(input_size, embed_size)
        self.decoder        = Decoder(2*embed_size, output_size)

    def forward(self, x, lengths_x):
        """Return ``(decoder_out, encoder_lens)`` for a padded batch and its lengths."""
        # Augmentation is applied in training mode only.
        if self.training:
            x = self.augmentations(x)

        encoder_out, encoder_lens = self.encoder(x, lengths_x)
        return self.decoder(encoder_out), encoder_lens

## Initialize ASR Network

In [None]:
# input_size matches the last dimension of the feature batch printed by the
# data-loader sanity check (27).
model = ASRModel(
    input_size  = 27,
    embed_size  = 512,  
    output_size = len(PHONEMES)
).to(device)
print(model)
# x and lx are the batch left in the namespace by the data-loader sanity-check
# cell, so that cell has to be run before this one.
summary(model, x.to(device), lx)

ASRModel(
  (augmentations): Sequential(
    (0): PermuteBlock()
    (1): FrequencyMasking()
    (2): TimeMasking()
    (3): PermuteBlock()
  )
  (encoder): Encoder(
    (embedding): Sequential(
      (0): PermuteBlock()
      (1): Conv1d(27, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): GELU(approximate='none')
      (4): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): PermuteBlock()
    )
    (pBLSTMs): Sequential(
      (0): pBLSTM(
        (blstm): LSTM(512, 512, batch_first=True, dropout=0.2, bidirectional=True)
      )
      (1): LockedDropout()
      (2): pBLSTM(
        (blstm): LSTM(2048, 512, batch_first=True, dropout=0.2, bidirectional=True)
      )
      (3): LockedDropout()
    )
  )
  (decoder): Decoder(
    (mlp): Sequential(
      (0): PermuteBlock()

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_augmentations.PermuteBlock_0,-,"[128, 27, 1673]",,
1_augmentations.FrequencyMasking_1,-,"[128, 27, 1673]",,
2_augmentations.TimeMasking_2,-,"[128, 27, 1673]",,
3_augmentations.PermuteBlock_3,-,"[128, 1673, 27]",,
4_encoder.embedding.PermuteBlock_0,-,"[128, 27, 1673]",,
5_encoder.embedding.Conv1d_1,"[27, 128, 3]","[128, 128, 1673]",10496.0,17345664.0
6_encoder.embedding.BatchNorm1d_2,[128],"[128, 128, 1673]",256.0,128.0
7_encoder.embedding.GELU_3,-,"[128, 128, 1673]",,
8_encoder.embedding.Conv1d_4,"[128, 256, 3]","[128, 256, 1673]",98560.0,164462592.0
9_encoder.embedding.BatchNorm1d_5,[256],"[128, 256, 1673]",512.0,256.0


# Training Config
Initialize Loss Criterion, Optimizer, CTC Beam Decoder, Scheduler, Scaler (Mixed-Precision), etc.

In [None]:
#TODO


# CTC loss with index 0 as the blank symbol; per-batch losses are averaged ('mean').
criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False) # Define CTC loss as the criterion. How would the losses be reduced?
# CTC Loss: https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html
# Refer to the handout for hints

# AdamW over all model parameters; the learning rate comes from the config dict.
optimizer =  torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=1e-3) # What goes in here?

# Declare the decoder. Use the CTC Beam Decoder to decode phonemes
# CTC Beam Decoder Doc: https://github.com/parlance/ctcdecode
# log_probs_input=True because the model ends in a LogSoftmax.
decoder = CTCBeamDecoder(LABELS, beam_width=config['beam_width'], log_probs_input=True) 

# Multiply the learning rate by 0.75 when the monitored value ('min' mode) stops
# improving for more than `patience` epochs, never going below 1e-4.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.75, patience=1, min_lr=1e-4)  

# Mixed Precision, if you need it
scaler = torch.cuda.amp.GradScaler()

# Decode Prediction

In [None]:
def decode_prediction(output, output_lens, decoder, PHONEME_MAP= LABELS):
    """Beam-decode a batch of model outputs into label strings.

    Args:
        output: Model output passed straight to ``decoder.decode``.
        output_lens: Tensor with the valid length of every sequence in ``output``.
        decoder: Object exposing the ctcdecode ``decode(output, seq_lens=...)`` API.
        PHONEME_MAP: Sequence mapping a class index to its label string.

    Returns:
        One string per batch element, built from beam index 0 of that element.
    """
    # Scores and timesteps returned by the decoder are not needed here.
    beam_results, _, _, out_lens = decoder.decode(output, seq_lens= output_lens)

    pred_strings = []
    for batch_idx in range(output_lens.shape[0]):
        # Beam 0 is used; only its first out_lens entries are valid indices.
        best_len = out_lens[batch_idx][0]
        best_beam = beam_results[batch_idx][0][:best_len]
        pred_strings.append(''.join([PHONEME_MAP[n] for n in best_beam]))
    return pred_strings

def calculate_levenshtein(output, label, output_lens, label_lens, decoder, PHONEME_MAP= LABELS): # y - sequence of integers
    """Return the mean Levenshtein distance between decoded predictions and labels.

    Args:
        output: Model output for the batch, forwarded to ``decode_prediction``.
        label: Padded batch of target indices.
        output_lens: Valid length of every sequence in ``output``.
        label_lens: Unpadded length of every row of ``label``.
        decoder: Beam decoder forwarded to ``decode_prediction``.
        PHONEME_MAP: Sequence mapping a class index to its label string.

    Returns:
        The summed edit distance divided by the batch size.
    """
    batch_size = label.shape[0]
    pred_strings = decode_prediction(output, output_lens, decoder, PHONEME_MAP)

    total_distance = 0
    for idx in range(batch_size):
        # Cut the padding off the target before mapping indices to label characters.
        target_ids = label[idx][:label_lens[idx]]
        label_string = ''.join([PHONEME_MAP[int(n)] for n in target_ids])
        total_distance += Levenshtein.distance(pred_strings[idx], label_string)

    # Average over the batch so the value is comparable across batch sizes.
    return total_distance / batch_size

# Test Implementation

In [None]:
# test code to check shapes

# Run one validation batch through the model, the decoder and the loss to
# check that all shapes line up.
model.eval()
for i, data in enumerate(val_loader, 0):
    x, y, lx, ly = data
    x, y = x.to(device), y.to(device)
    h, lh = model(x, lx)
    print(h.shape)
    # Decode with the encoder output lengths (lh), not the input lengths (lx):
    # the encoder shortens the time axis, so lx exceeds the time dimension of h.
    print(calculate_levenshtein(h, y, lh, ly, decoder, LABELS))
    # CTCLoss expects time-major input, hence the (time, batch, classes) permute.
    h = torch.permute(h, (1, 0, 2))
    print(h.shape, y.shape)
    loss = criterion(h, y, lh, ly)
    print(loss)
    break

torch.Size([128, 734, 41])
197.859375
torch.Size([734, 128, 41]) torch.Size([128, 265])
tensor(7.1913, device='cuda:0', grad_fn=<MeanBackward0>)


# WandB

You will need to fetch your api key from wandb.ai

In [None]:
import os
import wandb

# The API key is read from the WANDB_API_KEY environment variable so that it is
# never stored in the notebook; when the variable is unset the key stays empty.
wandb.login(key=os.environ.get("WANDB_API_KEY", ""))

[34m[1mwandb[0m: Currently logged in as: [33makameswa[0m ([33mhickups[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# NOTE(review): id + resume="must" tie this cell to one specific existing wandb
# run; adjust or remove both to start a fresh run.
run = wandb.init(
    name = "high-cutoff", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitializing runs when you re-run this cell
    id = "jyqyah20", ### ID of the previous run that is being resumed
    resume = "must", ### Needed to resume a previous run; the template advises commenting out reinit = True when using this
    project = "hw3p2", ### Project should be created in your wandb account
    entity="tekkotsu",
    config = config ### Wandb Config for your run
)

[34m[1mwandb[0m: Currently logged in as: [33makameswa[0m ([33mtekkotsu[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Train Functions

In [None]:
from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer):
    """Train `model` for one pass over `train_loader` and return the mean batch loss.

    Each batch is unpacked as (inputs, targets, input lengths, target lengths).
    The forward pass and the loss run under CUDA autocast; the backward pass
    and the optimizer step go through the notebook-level gradient `scaler`.
    The notebook-level `device` is where inputs and targets are moved.

    Args:
        model: network called as `model(inputs, input_lengths)`, returning
            `(outputs, output_lengths)`.
        train_loader: iterable of training batches; must support `len()`.
        criterion: loss called as
            `criterion(outputs, targets, output_lengths, target_lengths)`
            on the outputs with their first two axes swapped.
        optimizer: optimizer whose first param group's `lr` is shown in the bar.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    progress = tqdm(
        total=len(train_loader),
        dynamic_ncols=True,
        leave=False,
        position=0,
        desc='Train',
    )

    running_loss = 0

    for step, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()

        feats, targets, feat_lens, target_lens = batch
        feats = feats.to(device)
        targets = targets.to(device)

        with torch.cuda.amp.autocast():
            logits, logit_lens = model(feats, feat_lens)
            # Swap the first two axes before handing the outputs to the loss.
            logits = logits.permute(1, 0, 2)
            batch_loss = criterion(logits, targets, logit_lens, target_lens)

        running_loss += batch_loss.item()

        # Show the running mean loss and the current learning rate.
        mean_so_far = running_loss / step
        current_lr = optimizer.param_groups[0]['lr']
        progress.set_postfix(
            loss=f"{float(mean_so_far):.04f}",
            lr=f"{float(current_lr):.06f}",
        )
        progress.update()

        # Scaled backward pass and optimizer step (mixed-precision training).
        scaler.scale(batch_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Drop references to this batch before releasing cached GPU memory.
        del feats, targets, feat_lens, target_lens, logits, logit_lens, batch_loss
        torch.cuda.empty_cache()

    progress.close()

    return running_loss / len(train_loader)


def validate_model(model, val_loader, decoder, phoneme_map= LABELS):
    """Evaluate `model` on `val_loader`; return (mean loss, mean Levenshtein distance).

    The loss is computed with the notebook-level `criterion`, and batches are
    moved to the notebook-level `device`. Both returned values are averages
    over the number of batches in `val_loader`.

    Args:
        model: network called as `model(inputs, input_lengths)`, returning
            `(outputs, output_lengths)`.
        val_loader: iterable of (inputs, targets, input lengths, target
            lengths) batches; must support `len()`.
        decoder: decoder object forwarded to `calculate_levenshtein`.
        phoneme_map: label map forwarded to `calculate_levenshtein`.

    Returns:
        Tuple `(total_loss, val_dist)`.
    """
    model.eval()
    progress = tqdm(
        total=len(val_loader),
        dynamic_ncols=True,
        position=0,
        leave=False,
        desc='Val',
    )

    running_loss = 0
    running_dist = 0

    for step, batch in enumerate(val_loader, start=1):
        feats, targets, feat_lens, target_lens = batch
        feats = feats.to(device)
        targets = targets.to(device)

        with torch.inference_mode():
            logits, logit_lens = model(feats, feat_lens)
            # The loss receives the outputs with their first two axes swapped.
            swapped = logits.permute(1, 0, 2)
            batch_loss = criterion(swapped, targets, logit_lens, target_lens)

        running_loss += float(batch_loss)
        # The distance is computed on the outputs in the model's own axis order.
        running_dist += calculate_levenshtein(
            logits, targets, logit_lens, target_lens, decoder, phoneme_map
        )

        progress.set_postfix(
            loss=f"{float(running_loss / step):.04f}",
            dist=f"{float(running_dist / step):.04f}",
        )
        progress.update()

        # Drop references to this batch before releasing cached GPU memory.
        del feats, targets, feat_lens, target_lens, logits, swapped, logit_lens, batch_loss
        torch.cuda.empty_cache()

    progress.close()

    n_batches = len(val_loader)
    return running_loss / n_batches, running_dist / n_batches

## Training Setup

In [None]:
def save_model(model, optimizer, scheduler, metric, epoch, path):
    """Write a training checkpoint to `path` with `torch.save`.

    The checkpoint is a dict holding the state dicts of the model, optimizer
    and scheduler, the epoch number, and one metric stored under its own name.

    Args:
        model: object exposing `state_dict()`.
        optimizer: object exposing `state_dict()`.
        scheduler: object exposing `state_dict()`.
        metric: two-item sequence `(name, value)`; `name` becomes the key
            under which `value` is stored.
        epoch: epoch number stored under the `'epoch'` key.
        path: destination passed to `torch.save`. When it is a filesystem
            path, any missing parent directory is created first so that the
            save does not fail on a fresh machine.
    """
    import os  # local import keeps this cell self-contained

    # Only filesystem paths have a parent directory; file-like objects that
    # torch.save also accepts are passed through untouched.
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : scheduler.state_dict(),
         metric[0]                  : metric[1],
         'epoch'                    : epoch},
         path
    )

def load_model(path, model, metric= 'valid_acc', optimizer= None, scheduler= None, map_location= None):
    """Restore a checkpoint written by `save_model`.

    Args:
        path: checkpoint location passed to `torch.load`.
        model: object whose `load_state_dict` receives the saved model state.
        metric: name of the metric key to read back from the checkpoint. It
            must be the name that was used when saving; the default
            `'valid_acc'` is kept for backward compatibility.
        optimizer: optional; when given, its state is restored as well.
        scheduler: optional; when given, its state is restored as well.
        map_location: optional, forwarded to `torch.load` (for example
            `'cpu'`). `None` keeps `torch.load`'s default behaviour.

    Returns:
        List `[model, optimizer, scheduler, epoch, metric_value]`.

    Raises:
        KeyError: if the checkpoint has no entry named `metric`.
    """
    checkpoint = torch.load(path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler is not None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    # Fail with an explicit message when the requested metric was never saved.
    if metric not in checkpoint:
        raise KeyError(
            "metric {!r} not found in checkpoint; available keys: {}".format(
                metric, sorted(str(key) for key in checkpoint)))

    epoch        = checkpoint['epoch']
    metric_value = checkpoint[metric]

    return [model, optimizer, scheduler, epoch, metric_value]

In [None]:
# Checkpoint bookkeeping. Edit these values when training is spread over
# multiple sessions.

# Number of epochs already completed in earlier sessions (0 for a fresh run).
last_epoch_completed = 0
# start = last_epoch_completed
# end = config["epochs"]

# Running checkpoint index. It is set explicitly here so that this cell does
# not depend on a loop variable left behind by an earlier cell; the training
# cell increments it once per epoch and uses it to number checkpoint files.
i = last_epoch_completed

best_lev_dist = float("inf") # if you're restarting from some checkpoint, use what you saw there.
epoch_model_path = f'/content/models/epoch/epoch_model_{i}.pt' # per-epoch checkpoint path for the current index
best_model_path = f'/content/models/best/best_model_{i}.pt' # best-model checkpoint path for the current index

In [None]:
# Free memory left over from earlier cells before the long training run.
# Python garbage is collected first so that tensors kept alive only by
# unreachable objects are released before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

# Main training loop: one train pass and one validation pass per epoch.
# `i` is a notebook-level checkpoint index that must already be defined
# before this cell runs; it is incremented once per epoch and used to number
# the checkpoint files.
for epoch in range(0, config['epochs']):
    i += 1
    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))

    # Learning rate in effect for this epoch, read before the scheduler steps.
    curr_lr = optimizer.param_groups[0]['lr']

    train_loss              = train_model(model, train_loader, criterion, optimizer)
    valid_loss, valid_dist  = validate_model(model, val_loader, decoder, phoneme_map= LABELS)
    # The scheduler is driven by the validation distance.
    scheduler.step(valid_dist)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    # valid_dist is an average Levenshtein distance, not a percentage, so it
    # is printed without a '%' sign.
    print("\tVal Dist {:.04f}\t Val Loss {:.04f}".format(valid_dist, valid_loss))


    wandb.log({
        'train_loss': train_loss,
        'valid_dist': valid_dist,
        'valid_loss': valid_loss,
        'lr'        : curr_lr
    })

    # Checkpoint every epoch; the path is built once and reused.
    epoch_model_path = f'/content/models/epoch/epoch_model_{i}.pt'
    save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, epoch_model_path)
    wandb.save(epoch_model_path)
    print("Saved epoch model")

    # Additionally keep a copy whenever the validation distance matches or
    # beats the best value seen so far.
    if valid_dist <= best_lev_dist:
        best_lev_dist = valid_dist
        best_model_path = f'/content/models/best/best_model_{i}.pt'
        save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, best_model_path)
        wandb.save(best_model_path)
        print("Saved best model")
      # You may find it interesting to explore Wandb Artifacts to version your models
run.finish()


Epoch: 1/50




	Train Loss 0.2773	 Learning Rate 0.0011250
	Val Dist 5.2400%	 Val Loss 0.2704
Saved epoch model
Saved best model

Epoch: 2/50




	Train Loss 0.2606	 Learning Rate 0.0011250
	Val Dist 5.2811%	 Val Loss 0.2730
Saved epoch model

Epoch: 3/50




	Train Loss 0.2623	 Learning Rate 0.0011250
	Val Dist 5.2346%	 Val Loss 0.2695
Saved epoch model
Saved best model

Epoch: 4/50




	Train Loss 0.2537	 Learning Rate 0.0011250
	Val Dist 5.2463%	 Val Loss 0.2722
Saved epoch model

Epoch: 5/50




	Train Loss 0.2648	 Learning Rate 0.0011250
	Val Dist 5.3473%	 Val Loss 0.2706
Saved epoch model

Epoch: 6/50




	Train Loss 0.2515	 Learning Rate 0.0008438
	Val Dist 5.1172%	 Val Loss 0.2656
Saved epoch model
Saved best model

Epoch: 7/50




	Train Loss 0.2457	 Learning Rate 0.0008438
	Val Dist 5.1499%	 Val Loss 0.2671
Saved epoch model

Epoch: 8/50




	Train Loss 0.2450	 Learning Rate 0.0008438
	Val Dist 5.1241%	 Val Loss 0.2680
Saved epoch model

Epoch: 9/50




	Train Loss 0.2389	 Learning Rate 0.0006328
	Val Dist 5.1137%	 Val Loss 0.2748
Saved epoch model
Saved best model

Epoch: 10/50




	Train Loss 0.2247	 Learning Rate 0.0006328
	Val Dist 5.0124%	 Val Loss 0.2701
Saved epoch model
Saved best model

Epoch: 11/50




	Train Loss 0.2309	 Learning Rate 0.0006328
	Val Dist 5.0495%	 Val Loss 0.2669
Saved epoch model

Epoch: 12/50




	Train Loss 0.2348	 Learning Rate 0.0006328
	Val Dist 4.9810%	 Val Loss 0.2691
Saved epoch model
Saved best model

Epoch: 13/50




	Train Loss 0.2293	 Learning Rate 0.0006328
	Val Dist 5.0355%	 Val Loss 0.2632
Saved epoch model

Epoch: 14/50




	Train Loss 0.2172	 Learning Rate 0.0006328
	Val Dist 5.0620%	 Val Loss 0.2667
Saved epoch model

Epoch: 15/50




	Train Loss 0.2189	 Learning Rate 0.0004746
	Val Dist 4.9796%	 Val Loss 0.2664
Saved epoch model
Saved best model

Epoch: 16/50




	Train Loss 0.2074	 Learning Rate 0.0004746
	Val Dist 4.9994%	 Val Loss 0.2713
Saved epoch model

Epoch: 17/50




	Train Loss 0.2268	 Learning Rate 0.0004746
	Val Dist 5.0394%	 Val Loss 0.2689
Saved epoch model

Epoch: 18/50




	Train Loss 0.2175	 Learning Rate 0.0003560
	Val Dist 4.8674%	 Val Loss 0.2652
Saved epoch model
Saved best model

Epoch: 19/50




	Train Loss 0.2205	 Learning Rate 0.0003560
	Val Dist 5.0148%	 Val Loss 0.2681
Saved epoch model

Epoch: 20/50




	Train Loss 0.2198	 Learning Rate 0.0003560
	Val Dist 4.8487%	 Val Loss 0.2721
Saved epoch model
Saved best model

Epoch: 21/50




	Train Loss 0.2045	 Learning Rate 0.0003560
	Val Dist 4.8524%	 Val Loss 0.2694
Saved epoch model

Epoch: 22/50




	Train Loss 0.2112	 Learning Rate 0.0003560
	Val Dist 4.8494%	 Val Loss 0.2667
Saved epoch model

Epoch: 23/50




	Train Loss 0.2084	 Learning Rate 0.0002670
	Val Dist 4.8469%	 Val Loss 0.2688
Saved epoch model
Saved best model

Epoch: 24/50




	Train Loss 0.2057	 Learning Rate 0.0002670
	Val Dist 4.8398%	 Val Loss 0.2712
Saved epoch model
Saved best model

Epoch: 25/50




	Train Loss 0.2044	 Learning Rate 0.0002670
	Val Dist 4.8779%	 Val Loss 0.2714
Saved epoch model

Epoch: 26/50


Train:  77%|███████▋  | 172/223 [08:04<02:34,  3.03s/it, loss=0.1993, lr=0.000267]

KeyboardInterrupt: 

# Generate Predictions and Submit to Kaggle

In [None]:
# Generate predictions for the test set.
#
# Steps:
#   1. Build a dedicated CTCBeamDecoder for testing with its own beam width.
#   2. Run the model on each test batch and decode its outputs into strings.

TEST_BEAM_WIDTH = 30

test_decoder = CTCBeamDecoder(
    LABELS,
    beam_width=TEST_BEAM_WIDTH,
    log_probs_input=True,
)

model.eval()
results = []  # decoded predictions, accumulated in test-loader order

print("Testing")
for data in tqdm(test_loader):
    # Test batches carry only the inputs and their lengths (no labels).
    x, lx = data
    x = x.to(device)

    with torch.no_grad():
        h, lh = model(x, lx)

    # Decode this batch and append its predictions to the running list.
    prediction_string = decode_prediction(h, lh, test_decoder)
    results += prediction_string

    # Drop references to this batch before releasing cached GPU memory.
    del x, lx, h, lh
    torch.cuda.empty_cache()

Testing



  0%|          | 0/21 [00:00<?, ?it/s][A
  5%|▍         | 1/21 [00:03<01:08,  3.44s/it][A
 10%|▉         | 2/21 [00:06<00:57,  3.02s/it][A
 14%|█▍        | 3/21 [00:08<00:47,  2.66s/it][A
 19%|█▉        | 4/21 [00:10<00:40,  2.40s/it][A
 24%|██▍       | 5/21 [00:13<00:41,  2.58s/it][A
 29%|██▊       | 6/21 [00:15<00:34,  2.31s/it][A
 33%|███▎      | 7/21 [00:17<00:31,  2.22s/it][A
 38%|███▊      | 8/21 [00:19<00:31,  2.43s/it][A
 43%|████▎     | 9/21 [00:22<00:30,  2.57s/it][A
 48%|████▊     | 10/21 [00:25<00:26,  2.43s/it][A
 52%|█████▏    | 11/21 [00:27<00:24,  2.47s/it][A
 57%|█████▋    | 12/21 [00:29<00:20,  2.32s/it][A
 62%|██████▏   | 13/21 [00:32<00:19,  2.41s/it][A
 67%|██████▋   | 14/21 [00:33<00:15,  2.14s/it][A
 71%|███████▏  | 15/21 [00:35<00:12,  2.07s/it][A
 76%|███████▌  | 16/21 [00:37<00:10,  2.12s/it][A
 81%|████████  | 17/21 [00:40<00:08,  2.16s/it][A
 86%|████████▌ | 18/21 [00:42<00:07,  2.39s/it][A
 90%|█████████ | 19/21 [00:45<00:04,  2.45s/it]

In [None]:
# Build the submission file: load the template CSV, replace its `label`
# column with the decoded predictions and write `submission.csv`.
data_dir = f"{root}/test-clean/random_submission.csv"
df = pd.read_csv(data_dir)

# Fail early with a clear message if the predictions do not line up with the
# rows of the template.
if len(results) != len(df):
    raise ValueError(
        "Expected {} predictions (one per template row) but got {}".format(len(df), len(results)))

# Use item assignment: attribute assignment (`df.label = ...`) only sets a
# column when that column already exists in the frame.
df["label"] = results
df.to_csv('submission.csv', index = False)

In [None]:
# Submit submission.csv to the hw3p2asr-s24 competition through the kaggle
# command-line tool, with the message given by -m.
# NOTE(review): the recorded output of this cell ends with
# "500 - Internal Server Error" - confirm on Kaggle whether the upload was accepted.
!kaggle competitions submit -c hw3p2asr-s24 -f submission.csv -m "I made it!"

100% 209k/209k [00:00<00:00, 340kB/s]
500 - Internal Server Error
