# Installs

In [1]:
!pip install wandb -q

### Levenshtein

This may take a while

In [2]:
!pip install wandb --quiet
!pip install python-Levenshtein -q
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget -q
%cd ctcdecode
!pip install . -q
%cd ..

!pip install torchsummaryX -q

fatal: destination path 'ctcdecode' already exists and is not an empty directory.
/content/ctcdecode
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ctcdecode (setup.py) ... [?25l[?25hdone
/content


## Imports

In [3]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torchaudio.transforms as tat

from sklearn.metrics import accuracy_score
import gc

import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime

# imports for decoding and distance calculation
import ctcdecode
import Levenshtein
from ctcdecode import CTCBeamDecoder

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


# Kaggle Setup

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"ayh2cfa","key":"924942fc523aefb5cd38a395654d3ca9"}')

!chmod 600 /root/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaggle==1.5.8
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73272 sha256=5b7acae11ed51f118fef260e9a83eadce9fd5426ba620468c33eec7e9408973d
  Stored in directory: /root/.cache/pip/wheels/d4/02/ef/3f8c8d86b8d5388a1d3155876837f1a1a3143ab3fc2ff1ffad
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.13
    Uninstalling kaggle-1.5.13:
      Successfully uninstalled kaggle-1.5.13
Successfully installed kaggle-1.5.8


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!kaggle competitions download -c 11-785-s23-hw3p2
!mkdir '/content/data'
'''
This will take a couple minutes, but you should see at least the following:
11-785-f22-hw3p2.zip  ctcdecode  hw3p2
'''
!unzip -qo 11-785-s23-hw3p2.zip -d '/content/data'
# !ls

Downloading 11-785-s23-hw3p2.zip to /content
100% 15.9G/15.9G [02:19<00:00, 139MB/s]
100% 15.9G/15.9G [02:19<00:00, 123MB/s]


# Google Drive

# Dataset and Dataloader

In [4]:
# ARPABET PHONEME MAPPING
# DO NOT CHANGE
# This overwrites the phonetics.py file.

CMUdict_ARPAbet = {
    "" : " ",
    "[SIL]": "-", "NG": "G", "F" : "f", "M" : "m", "AE": "@", 
    "R"    : "r", "UW": "u", "N" : "n", "IY": "i", "AW": "W", 
    "V"    : "v", "UH": "U", "OW": "o", "AA": "a", "ER": "R", 
    "HH"   : "h", "Z" : "z", "K" : "k", "CH": "C", "W" : "w", 
    "EY"   : "e", "ZH": "Z", "T" : "t", "EH": "E", "Y" : "y", 
    "AH"   : "A", "B" : "b", "P" : "p", "TH": "T", "DH": "D", 
    "AO"   : "c", "G" : "g", "L" : "l", "JH": "j", "OY": "O", 
    "SH"   : "S", "D" : "d", "AY": "Y", "S" : "s", "IH": "I",
    "[SOS]": "[SOS]", "[EOS]": "[EOS]"
}

CMUdict = list(CMUdict_ARPAbet.keys())
ARPAbet = list(CMUdict_ARPAbet.values())


PHONEMES = CMUdict[:-2]
LABELS = ARPAbet[:-2]

In [5]:
# You might want to play around with the mapping as a sanity check here
print(CMUdict)
print(ARPAbet)

['', '[SIL]', 'NG', 'F', 'M', 'AE', 'R', 'UW', 'N', 'IY', 'AW', 'V', 'UH', 'OW', 'AA', 'ER', 'HH', 'Z', 'K', 'CH', 'W', 'EY', 'ZH', 'T', 'EH', 'Y', 'AH', 'B', 'P', 'TH', 'DH', 'AO', 'G', 'L', 'JH', 'OY', 'SH', 'D', 'AY', 'S', 'IH', '[SOS]', '[EOS]']
[' ', '-', 'G', 'f', 'm', '@', 'r', 'u', 'n', 'i', 'W', 'v', 'U', 'o', 'a', 'R', 'h', 'z', 'k', 'C', 'w', 'e', 'Z', 't', 'E', 'y', 'A', 'b', 'p', 'T', 'D', 'c', 'g', 'l', 'j', 'O', 'S', 'd', 'Y', 's', 'I', '[SOS]', '[EOS]']


### Train Data

In [6]:
class AudioDataset(torch.utils.data.Dataset):

    # For this homework, we give you full flexibility to design your data set class.
    # Hint: The data from HW1 is very similar to this HW

    #TODO
    def __init__(self, root, partition, phonemes = PHONEMES): 
        '''
        Initializes the dataset.

        INPUTS: What inputs do you need here?
        '''

        # Load the directory and all files in them

        self.mfcc_dir = root+'/'+partition+'/mfcc'
        self.transcript_dir = root+'/'+partition+'/transcript'

        self.mfcc_files = sorted(os.listdir(self.mfcc_dir))
        self.transcript_files = sorted(os.listdir(self.transcript_dir))
        self.phonemes = phonemes

        #TODO
        # WHAT SHOULD THE LENGTH OF THE DATASET BE?
        self.length = len(self.mfcc_files)
        
        #TODO
        # HOW CAN WE REPRESENT PHONEMES? CAN WE CREATE A MAPPING FOR THEM?
        # HINT: TENSORS CANNOT STORE NON-NUMERICAL VALUES OR STRINGS
        num_mfccs = self.length

        #TODO
        # CREATE AN ARRAY OF ALL FEATUERS AND LABELS
        # WHAT NORMALIZATION TECHNIQUE DID YOU USE IN HW1? CAN WE USE IT HERE?
        '''
        You may decide to do this in __getitem__ if you wish.
        However, doing this here will make the __init__ function take the load of
        loading the data, and shift it away from training.
        '''
        # Iterate through mfccs and transcripts
        self.transcripts = [None]*num_mfccs
        self.mfccs = [None]*num_mfccs
        for i in range(num_mfccs):
        #   Load a single mfcc
            mfcc = np.load(self.mfcc_dir+'/'+self.mfcc_files[i])
        #   Do Cepstral Normalization of mfcc (explained in writeup)
            sigma = np.std(mfcc,axis=0)
            mfcc -= np.sum(mfcc,axis=0)/mfcc.shape[0]
            mfcc /= sigma
            self.mfccs[i] = torch.tensor(mfcc)

        #   Load the corresponding transcript
            transcript  = np.load(self.transcript_dir+'/'+self.transcript_files[i]) 
            transcript = transcript[1:len(transcript)-1] # Remove [SOS] and [EOS] from the transcript

            # Map the phonemes to their corresponding list indexes in self.phonemes
            # Now, if an element in self.transcript is 0, it means that it is 'SIL' (as per the above example)
            transcript = [self.phonemes.index(p) for p in transcript]
            self.transcripts[i] = torch.tensor(transcript)

    def __len__(self):
        
        '''
        TODO: What do we return here?
        '''
        return self.length

    def __getitem__(self, ind):
        '''
        TODO: RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS

        If you didn't do the loading and processing of the data in __init__,
        do that here.

        Once done, return a tuple of features and labels.
        '''

        mfcc = self.mfccs[ind]
        transcript = self.transcripts[ind]
        return mfcc, transcript


    def collate_fn(self,batch):
        '''
        TODO:
        1.  Extract the features and labels from 'batch'
        2.  We will additionally need to pad both features and labels,
            look at pytorch's docs for pad_sequence
        3.  This is a good place to perform transforms, if you so wish. 
            Performing them on batches will speed the process up a bit.
        4.  Return batch of features, labels, lenghts of features, 
            and lengths of labels.
        '''
        # batch of input mfcc coefficients
        batch_mfcc = [b[0] for b in batch]
        # batch of output phonemes
        batch_transcript = [b[1] for b in batch]

        # HINT: CHECK OUT -> pad_sequence (imported above)
        # Also be sure to check the input format (batch_first)
        lengths_mfcc = [len(m) for m in batch_mfcc]
        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True)

        lengths_transcript = [len(t) for t in batch_transcript]
        batch_transcript_pad = pad_sequence(batch_transcript, batch_first=True)

        # You may apply some transformation, Time and Frequency masking, here in the collate function;
        # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
        #                  -> Would we apply transformation on the validation set as well?
        #                  -> Is the order of axes / dimensions as expected for the transform functions?
        
        # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
        return batch_mfcc_pad, batch_transcript_pad, torch.tensor(lengths_mfcc), torch.tensor(lengths_transcript)

       

### Test Data

In [7]:
# Test Dataloader
class AudioTestDataset(torch.utils.data.Dataset):

    # For this homework, we give you full flexibility to design your data set class.
    # Hint: The data from HW1 is very similar to this HW

    #TODO
    def __init__(self, root, partition): 
        '''
        Initializes the dataset.

        INPUTS: What inputs do you need here?
        '''

        # Load the directory and all files in them

        self.mfcc_dir = root+'/'+partition+'/mfcc'

        self.mfcc_files = sorted(os.listdir(self.mfcc_dir))

        self.length = len(self.mfcc_files)

        num_mfccs = self.length

        # Iterate through mfccs
        self.mfccs = [None]*num_mfccs
        for i in range(num_mfccs):
        #   Load a single mfcc
            mfcc = np.load(self.mfcc_dir+'/'+self.mfcc_files[i])
        #   Do Cepstral Normalization of mfcc (explained in writeup)
            sigma = np.std(mfcc,axis=0)
            mfcc -= np.sum(mfcc,axis=0)/mfcc.shape[0]
            mfcc /= sigma
            self.mfccs[i] = torch.tensor(mfcc)

    def __len__(self):
        
        '''
        TODO: What do we return here?
        '''
        return self.length

    def __getitem__(self, ind):
        '''
        TODO: RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS

        If you didn't do the loading and processing of the data in __init__,
        do that here.

        Once done, return a tuple of features and labels.
        '''

        mfcc = self.mfccs[ind]
        return mfcc


    def collate_fn(self,batch):
        # batch of input mfcc coefficients
        batch_mfcc = batch
        lengths_mfcc = [len(m) for m in batch_mfcc]
        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True)
        return batch_mfcc_pad, torch.tensor(lengths_mfcc)

### Data - Hyperparameters

In [95]:
BATCH_SIZE = 64 # Increase if your device can handle it

transforms = [] # set of tranformations
# You may pass this as a parameter to the dataset class above
# This will help modularize your implementation
config = {
    'batch_size': 64,
    'dropout': 0.15,
    'embedding_size': 256,
    'hidden_size': 256,
    'tmask_length': 60,
    'fmask': 3,
    "beam_width" : 2,
    "lr" : 1e-3,
    "epochs" : 50,
    'scheduler': 'reduceOnPlateau',
    'pct_start': 0.1
}
root = 'data/11-785-s23-hw3p2' 

### Data loaders

In [9]:
# get me RAMMM!!!! 
import gc 
gc.collect()

0

In [10]:
# Create objects for the dataset class
train_100 = AudioDataset(root,'train-clean-100') #TODO
train_360 = AudioDataset(root,'train-clean-360') #TODO
train_data = torch.utils.data.ConcatDataset([train_100, train_360])

In [187]:
train100 = AudioDataset(root,'train-clean-100')

In [190]:
train_loader_100 = torch.utils.data.DataLoader(
    dataset     = train100, 
    num_workers = 4,
    batch_size  = config['batch_size'], 
    pin_memory  = True,
    shuffle     = True,
    collate_fn = train100.collate_fn
)

In [None]:
for data in train_loader_100:
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, ly.shape)
    break 

In [11]:
val_data = AudioDataset(root,'dev-clean') #TODO

In [12]:
test_data = AudioTestDataset(root,'test-clean') #TODO

In [14]:
# Do NOT forget to pass in the collate function as parameter while creating the dataloader
train_loader = torch.utils.data.DataLoader(
    dataset     = train_data, 
    num_workers = 4,
    batch_size  = config['batch_size'], 
    pin_memory  = True,
    shuffle     = True,
    collate_fn = train_100.collate_fn
)
val_loader = torch.utils.data.DataLoader(
    dataset     = val_data, 
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False,
    collate_fn = val_data.collate_fn
)


# print("Batch size: ", config['batch_size'])
# print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
# print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
# print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

In [15]:
test_loader = torch.utils.data.DataLoader(
    dataset     = test_data, 
    num_workers = 2, 
    batch_size  = config['batch_size'], 
    pin_memory  = True, 
    shuffle     = False,
    collate_fn = test_data.collate_fn
)

In [142]:
# sanity check
for data in train_loader:
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, ly.shape)
    break 

torch.Size([64, 1702, 27]) torch.Size([64, 191]) torch.Size([64]) torch.Size([64])


In [None]:
for data in test_loader:
    x, lx = data
    print(x.shape, lx.shape)
    break 

# NETWORK

## Basic

This is a basic block for understanding, you can skip this and move to pBLSTM one

In [21]:
# Utils for network
torch.cuda.empty_cache()

class PermuteBlock(torch.nn.Module):
    def forward(self, x):
        return x.transpose(1, 2)

In [112]:
torch.cuda.empty_cache()

class Network(nn.Module):

    def __init__(self, input_size, hidden_size, output_size=len(PHONEMES)):

        super(Network, self).__init__()

        # Adding some sort of embedding layer or feature extractor might help performance.
        self.permute = PermuteBlock()
        self.embedding = nn.Conv1d(in_channels=27, out_channels=input_size, kernel_size=3, padding=1)
        
        # TODO : look up the documentation. You might need to pass some additional parameters.
        self.lstm = nn.LSTM(input_size = input_size, hidden_size=config['hidden_size'], num_layers=1, batch_first=True) 
       
        self.classification = nn.Sequential(
            #TODO: Linear layer with in_features from the lstm module above and out_features = OUT_SIZE
            torch.nn.Linear(hidden_size,output_size)
        )
        
        self.logSoftmax = torch.nn.LogSoftmax(dim=2)

    def forward(self, x, lx):
        #TODO
        # The forward function takes 2 parameter inputs here. Why?
        # Refer to the handout for hints
        E = self.permute(self.embedding(self.permute(x)))
        out, (h_n, c_n) = self.lstm(E)
        out = self.classification(out)
        out = self.logSoftmax(out)
        return out, lx

## Pyramid Bi-LSTM (pBLSTM)

In [79]:
class pBLSTM(torch.nn.Module):

    '''
    Pyramidal BiLSTM
    Read the write up/paper and understand the concepts and then write your implementation here.

    At each step,
    1. Pad your input if it is packed (Unpack it)
    2. Reduce the input length dimension by concatenating feature dimension
        (Tip: Write down the shapes and understand)
        (i) How should  you deal with odd/even length input? 
        (ii) How should you deal with input length array (x_lens) after truncating the input?
    3. Pack your input
    4. Pass it into LSTM layer

    To make our implementation modular, we pass 1 layer at a time.
    '''
    
    def __init__(self, input_size, hidden_size, dropout):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size, hidden_size, bidirectional=True, dropout=dropout, batch_first=True)
        self.input_size = input_size

    def forward(self, x_packed): # x_packed is a PackedSequence

        # TODO: Pad Packed Sequence
        x_padded, x_padded_len = pad_packed_sequence(x_packed, batch_first=True)
        # Call self.trunc_reshape() which downsamples the time steps of x and increases the feature dimensions as mentioned above
        # self.trunc_reshape will return 2 outputs. What are they? Think about what quantites are changing.
        x_pad_trunc, x_pad_trunc_len = self.trunc_reshape(x_padded, x_padded_len)
        # TODO: Pack Padded Sequence. What output(s) would you get?
        x_packed = pack_padded_sequence(x_pad_trunc, x_pad_trunc_len, batch_first=True, enforce_sorted=False)
        # TODO: Pass the sequence through bLSTM
        # What do you return?
        # print(x_packed[0].shape)
        output, (h_n, c_n) = self.blstm(x_packed)
        # print('output shapes: ')
        # print(output[0].shape)
        return output

    def trunc_reshape(self, x, x_lens): 
        # TODO: If you have odd number of timesteps, how can you handle it? (Hint: You can exclude them)
        # TODO: Reshape x. When reshaping x, you have to reduce number of timesteps by a downsampling factor while increasing number of features by the same factor
        # TODO: Reduce lengths by the same downsampling factor
        B, T, F = x.shape
        x = x[:,:(T//2)*2, :]
        x = x.reshape(B,T//2,F*2)
        x_lens = torch.clamp(x_lens, max=x.shape[1])
        return x, x_lens

# Encoder

In [80]:
class Encoder(torch.nn.Module):
    '''
    The Encoder takes utterances as inputs and returns latent feature representations
    '''
    def __init__(self, input_size, encoder_hidden_size):
        super(Encoder, self).__init__()
        self.permute = PermuteBlock()
        self.embedding = nn.Conv1d(in_channels=27, out_channels=input_size, kernel_size=3, padding=1)
        #TODO: You can use CNNs as Embedding layer to extract features. Keep in mind the Input dimensions and expected dimension of Pytorch CNN.
        self.encoder_size = encoder_hidden_size
        self.pBLSTMs = torch.nn.Sequential( # How many pBLSTMs are required?
            # TODO: Fill this up with pBLSTMs - What should the input_size be? 
            # Hint: You are downsampling timesteps by a factor of 2, upsampling features by a factor of 2 and the LSTM is bidirectional)
            # Optional: Dropout/Locked Dropout after each pBLSTM (Not needed for early submission)
            # https://github.com/salesforce/awd-lstm-lm/blob/dfd3cb0235d2caf2847a4d53e1cbd495b781b5d2/locked_dropout.py#L5
            # ...
            # ...
            pBLSTM(input_size*2, encoder_hidden_size, dropout=config['dropout']),
            pBLSTM(encoder_hidden_size*4, encoder_hidden_size, dropout=config['dropout']),
            # pBLSTM(encoder_hidden_size*4, encoder_hidden_size, dropout=config['dropout'])
        )

    def forward(self, x, x_lens):
        # Where are x and x_lens coming from? The dataloader
        # TODO: Call the embedding layer
        # TODO: Pack Padded Sequence
        # TODO: Pass Sequence through the pyramidal Bi-LSTM layer
        # TODO: Pad Packed Sequence
        E = self.permute(self.embedding(self.permute(x)))
        E_packed = pack_padded_sequence(E, x_lens, batch_first=True, enforce_sorted=False)
        # print('encoder size is')
        # print(self.encoder_size)
        pblstms_out = self.pBLSTMs(E_packed.to(device))
        encoder_outputs, encoder_lens = pad_packed_sequence(pblstms_out, batch_first=True)
        # Remember the number of output(s) each function returns
        return encoder_outputs, encoder_lens

# Decoder

In [81]:
class Decoder(torch.nn.Module):

    def __init__(self, embed_size, output_size= 41):
        super().__init__()

        self.mlp = torch.nn.Sequential(
            PermuteBlock(), torch.nn.BatchNorm1d(embed_size*2), PermuteBlock(),
            torch.nn.Linear(embed_size*2, 2048),
            PermuteBlock(), torch.nn.BatchNorm1d(2048), PermuteBlock(),
            torch.nn.Dropout(config['dropout']),
            torch.nn.GELU(),
            torch.nn.Linear(2048,4096),
            PermuteBlock(), torch.nn.BatchNorm1d(4096), PermuteBlock(),
            torch.nn.Dropout(config['dropout']),
            torch.nn.GELU(),
            torch.nn.Linear(4096,output_size)
            #TODO define your MLP arch. Refer HW1P2
            #Use Permute Block before and after BatchNorm1d() to match the size
        )
        
        self.softmax = torch.nn.LogSoftmax(dim=2)

    def forward(self, encoder_out):
        #TODO call your MLP
        #TODO Think what should be the final output of the decoder for the classification 
        out = self.mlp(encoder_out)
        out = self.softmax(out)
        return out

In [82]:
class ASRModel(torch.nn.Module):

    def __init__(self, input_size, embed_size= 192, output_size= len(PHONEMES)):
        super().__init__()

        self.augmentations  = torch.nn.Sequential(
            #TODO Add Time Masking/ Frequency Masking
            #Hint: See how to use PermuteBlock() function defined above
            PermuteBlock(),
            tat.TimeMasking(time_mask_param=config['tmask_length']),
            tat.FrequencyMasking(freq_mask_param=config['fmask']),
            PermuteBlock()
        )
        self.encoder        =Encoder(input_size, config['hidden_size'])
        self.decoder        =Decoder(embed_size)
    
    def forward(self, x, lengths_x):
        if self.training:
            x = self.augmentations(x)
        encoder_out, encoder_lens   = self.encoder(x, lengths_x)
        decoder_out                 = self.decoder(encoder_out)
        return decoder_out, encoder_lens

## INIT
(If trying out the basic Network)

## INIT ASR

In [83]:
torch.cuda.empty_cache()
model = ASRModel(
    input_size  = config['embedding_size'],
    embed_size  = config['hidden_size'],
    output_size = len(PHONEMES)
).to(device)

In [84]:
for data in train_loader:
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, ly.shape)
    break 
summary(model, x.to(device), lx)

torch.Size([64, 1628, 27]) torch.Size([64, 195]) torch.Size([64]) torch.Size([64])
                                    Kernel Shape     Output Shape     Params  \
Layer                                                                          
0_augmentations.PermuteBlock_0                 -   [64, 27, 1628]          -   
1_augmentations.TimeMasking_1                  -   [64, 27, 1628]          -   
2_augmentations.FrequencyMasking_2             -   [64, 27, 1628]          -   
3_augmentations.PermuteBlock_3                 -   [64, 1628, 27]          -   
4_encoder.PermuteBlock_permute                 -   [64, 27, 1628]          -   
5_encoder.Conv1d_embedding          [27, 256, 3]  [64, 256, 1628]    20.992k   
6_encoder.PermuteBlock_permute                 -  [64, 1628, 256]          -   
7_encoder.pBLSTMs.0.LSTM_blstm                 -     [49711, 512]   1.57696M   
8_encoder.pBLSTMs.1.LSTM_blstm                 -     [25863, 512]  2.625536M   
9_decoder.mlp.PermuteBlock_0         

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_augmentations.PermuteBlock_0,-,"[64, 27, 1628]",,
1_augmentations.TimeMasking_1,-,"[64, 27, 1628]",,
2_augmentations.FrequencyMasking_2,-,"[64, 27, 1628]",,
3_augmentations.PermuteBlock_3,-,"[64, 1628, 27]",,
4_encoder.PermuteBlock_permute,-,"[64, 27, 1628]",,
5_encoder.Conv1d_embedding,"[27, 256, 3]","[64, 256, 1628]",20992.0,33758208.0
6_encoder.PermuteBlock_permute,-,"[64, 1628, 256]",,
7_encoder.pBLSTMs.0.LSTM_blstm,-,"[49711, 512]",1576960.0,1572864.0
8_encoder.pBLSTMs.1.LSTM_blstm,-,"[25863, 512]",2625536.0,2621440.0
9_decoder.mlp.PermuteBlock_0,-,"[64, 512, 407]",,


# Training Config

In [113]:
torch.cuda.empty_cache()
model = Network(input_size=64, hidden_size=config['hidden_size']).to(device)
summary(model, x.to(device), lx) # x and lx come from the sanity check above :)

In [96]:
#TODO

criterion = nn.CTCLoss()
# Define CTC loss as the criterion. How would the losses be reduced?
# CTC Loss: https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html
# Refer to the handout for hints

optimizer =  torch.optim.AdamW(model.parameters(), lr=config['lr']) # What goes in here?

# Declare the decoder. Use the CTC Beam Decoder to decode phonemes
# CTC Beam Decoder Doc: https://github.com/parlance/ctcdecode
decoder = CTCBeamDecoder(LABELS, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=0, beam_width=config['beam_width'], num_processes=4, blank_id=0, log_probs_input=True)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,factor=0.2,patience=3, threshold=0.01)
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config['lr'], epochs=config['epochs'], steps_per_epoch = len(train_loader), pct_start = config['pct_start'])

# Mixed Precision, if you need it
scaler = torch.cuda.amp.GradScaler()

## Decode Prediction

In [97]:
def decode_prediction(output, output_lens, decoder, PHONEME_MAP= LABELS):
    
    # TODO: look at docs for CTC.decoder and find out what is returned here. Check the shape of output and expected shape in decode.
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(output, seq_lens=output_lens) #lengths - list of lengths
    # beam_results (B, N, T): each entry is a matrix that contains the output characters from each beam
    # beam_scores (B, N): records the CTC score for each beam
    # timesteps (B, N): The timestep at which the nth output character has peak probability
    # out_lens (B, N): (i, j)-th entry is the length of the jth beam_result, of item i of your batch
    pred_strings                    = []
    for i in range(output_lens.shape[0]):
        #TODO: Create the prediction from the output of decoder.decode. Don't forget to map it using PHONEMES_MAP.
        most_likely_seq = beam_results[i][0]
        most_likely_seq = most_likely_seq[0:out_lens[i][0]] # get rid of the extra useless length at the end
        prediction = [PHONEME_MAP[ind] for ind in most_likely_seq]
        prediction = "".join(prediction)
        pred_strings.append(prediction)

    return pred_strings

def calculate_levenshtein(output, label, output_lens, label_lens, decoder, PHONEME_MAP= LABELS): # y - sequence of integers
    
    dist            = 0
    batch_size      = label.shape[0]
    pred_strings    = decode_prediction(output, output_lens, decoder, PHONEME_MAP)
    
    for i in range(batch_size):
        # TODO: Get predicted string and label string for each element in the batch
        prediction = pred_strings[i]
        label_string = label[i][0:label_lens[i]]
        label_string = [PHONEME_MAP[ind] for ind in label_string]
        label_string = "".join(label_string)
        dist += Levenshtein.distance(prediction, label_string)    

    dist /= batch_size # TODO: Uncomment this, but think about why we are doing this
    # raise NotImplemented
    return dist

In [98]:
# test code to check shapes

model.eval()
for i, data in enumerate(val_loader, 0):
    x, y, lx, ly = data
    x, y = x.to(device), y.to(device)
    h, lh = model(x, lx)
    loss = criterion(torch.permute(h, (1, 0, 2)), y, lh, ly)

    print(calculate_levenshtein(h, y, lh, ly, decoder))

    break

30.15625


## wandb

You will need to fetch your api key from wandb.ai

In [99]:
import wandb
wandb.login(key="17b33e5165b64dc340be46f11e70984b346ab965")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [100]:
run = wandb.init(
    name = str(config), ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # run_id = ### Insert specific run id here if you want to resume a previous run
    # resume = "must" ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw3p2-ablations", ### Project should be created in your wandb account 
    config = config ### Wandb Config for your run
)

# Train Functions

In [104]:
from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer):
    
    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.cuda.amp.autocast():     
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar

        # Another couple things you need for FP16. 
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16
        # scheduler.step()
        del x, y, lx, ly, h, lh, loss 
        torch.cuda.empty_cache()
    batch_bar.close() # You need this to close the tqdm bar
    
    return total_loss / len(train_loader)


def validate_model(model, val_loader, decoder, phoneme_map= LABELS):

    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0
    vdist = 0

    for i, data in enumerate(val_loader):

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.inference_mode():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += float(loss)
        vdist += calculate_levenshtein(torch.permute(h, (1, 0, 2)), y, lh, ly, decoder, phoneme_map)

        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))), dist="{:.04f}".format(float(vdist / (i + 1))))

        batch_bar.update()
    
        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()
    batch_bar.close()
    total_loss = total_loss/len(val_loader)
    val_dist = vdist/len(val_loader)
    return total_loss, val_dist

### Training Setup

In [105]:
def save_model(model, optimizer, scheduler, metric, epoch, path):
    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : scheduler.state_dict(),
         metric[0]                  : metric[1], 
         'epoch'                    : epoch}, 
         path
    )

def load_model(path, model, metric= 'valid_acc', optimizer= None, scheduler= None):

    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer != None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler != None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        
    epoch   = checkpoint['epoch']
    metric  = checkpoint[metric]

    return [model, optimizer, scheduler, epoch, metric]

In [106]:
# This is for checkpointing, if you're doing it over multiple sessions

last_epoch_completed = 0
start = last_epoch_completed
end = config["epochs"]
best_lev_dist = float("inf") # if you're restarting from some checkpoint, use what you saw there.
epoch_model_path = 'hw3p2_current_checkpoint.pth'
best_model_path = 'hw3p2_best_checkpoint.pth'

In [None]:
torch.cuda.empty_cache()
gc.collect()

#TODO: Please complete the training loop

for epoch in range(0, config['epochs']):

    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))
    
    curr_lr = float(optimizer.param_groups[0]['lr'])

    train_loss              = train_model(model, train_loader, criterion, optimizer)
    valid_loss, valid_dist  = validate_model(model, val_loader, decoder)
    scheduler.step(valid_dist)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    print("\tVal Dist {:.04f}%\t Val Loss {:.04f}".format(valid_dist, valid_loss))


    wandb.log({
        'train_loss': train_loss,  
        'valid_dist': valid_dist, 
        'valid_loss': valid_loss, 
        'lr'        : curr_lr
    })
    
    save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, epoch_model_path)
    wandb.save(epoch_model_path)
    print("Saved epoch model")

    if valid_dist <= best_lev_dist:
        best_lev_dist = valid_dist
        save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, best_model_path)
        wandb.save(best_model_path)
        print("Saved best model")




Epoch: 1/50




	Train Loss 0.7428	 Learning Rate 0.0010000
	Val Dist 9.0372%	 Val Loss 0.4133
Saved epoch model
Saved best model

Epoch: 2/50




	Train Loss 0.4281	 Learning Rate 0.0010000
	Val Dist 7.1060%	 Val Loss 0.3298
Saved epoch model
Saved best model

Epoch: 3/50




	Train Loss 0.3504	 Learning Rate 0.0010000
	Val Dist 6.1646%	 Val Loss 0.2897
Saved epoch model
Saved best model

Epoch: 4/50




	Train Loss 0.3110	 Learning Rate 0.0010000
	Val Dist 5.7620%	 Val Loss 0.2765
Saved epoch model
Saved best model

Epoch: 5/50




	Train Loss 0.2819	 Learning Rate 0.0010000
	Val Dist 5.3704%	 Val Loss 0.2638
Saved epoch model
Saved best model

Epoch: 6/50




	Train Loss 0.2594	 Learning Rate 0.0010000
	Val Dist 5.1215%	 Val Loss 0.2605
Saved epoch model
Saved best model

Epoch: 7/50




	Train Loss 0.2436	 Learning Rate 0.0010000
	Val Dist 4.9919%	 Val Loss 0.2623
Saved epoch model
Saved best model

Epoch: 8/50




	Train Loss 0.2305	 Learning Rate 0.0010000
	Val Dist 4.8380%	 Val Loss 0.2616
Saved epoch model
Saved best model

Epoch: 9/50




	Train Loss 0.2172	 Learning Rate 0.0010000
	Val Dist 4.7410%	 Val Loss 0.2590
Saved epoch model
Saved best model

Epoch: 10/50


Train:  53%|█████▎    | 1105/2072 [16:32<13:24,  1.20it/s, loss=0.2078, lr=0.001000]

In [94]:
run.finish()

# Generate Predictions and Submit to Kaggle

In [63]:
best_state_dict = torch.load('hw3p2_best_checkpoint.pth')
best_model = ASRModel(input_size  = config['embedding_size'], embed_size  = config['hidden_size'], output_size = len(PHONEMES)).to(device)
best_model.load_state_dict(best_state_dict['model_state_dict'])

<All keys matched successfully>

In [64]:
#TODO: Make predictions

# Follow the steps below:
# 1. Create a new object for CTCBeamDecoder with larger (why?) number of beams
# 2. Get prediction string by decoding the results of the beam decoder

TEST_BEAM_WIDTH = 40

test_decoder    = CTCBeamDecoder(LABELS, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=0, beam_width=TEST_BEAM_WIDTH, num_processes=4, blank_id=0, log_probs_input=True)
results = []

best_model.eval()
print("Testing")
for data in tqdm(test_loader):

    x, lx   = data
    x       = x.to(device)

    with torch.no_grad():
        h, lh = best_model(x, lx)

    prediction_string = decode_prediction(h, lh, test_decoder)
    #TODO save the output in results array.
    results.append(prediction_string)
    
    del x, lx, h, lh
    torch.cuda.empty_cache()

Testing



  0%|          | 0/41 [00:00<?, ?it/s][A
  2%|▏         | 1/41 [00:00<00:30,  1.32it/s][A
  5%|▍         | 2/41 [00:00<00:17,  2.23it/s][A
  7%|▋         | 3/41 [00:01<00:13,  2.90it/s][A
 10%|▉         | 4/41 [00:01<00:10,  3.61it/s][A
 12%|█▏        | 5/41 [00:01<00:08,  4.07it/s][A
 15%|█▍        | 6/41 [00:01<00:07,  4.78it/s][A
 20%|█▉        | 8/41 [00:02<00:06,  5.44it/s][A
 22%|██▏       | 9/41 [00:02<00:06,  5.22it/s][A
 24%|██▍       | 10/41 [00:02<00:06,  5.07it/s][A
 27%|██▋       | 11/41 [00:02<00:05,  5.83it/s][A
 29%|██▉       | 12/41 [00:02<00:04,  6.07it/s][A
 32%|███▏      | 13/41 [00:02<00:04,  6.59it/s][A
 34%|███▍      | 14/41 [00:03<00:04,  6.06it/s][A
 37%|███▋      | 15/41 [00:03<00:04,  5.62it/s][A
 39%|███▉      | 16/41 [00:03<00:04,  5.28it/s][A
 41%|████▏     | 17/41 [00:03<00:04,  4.99it/s][A
 44%|████▍     | 18/41 [00:03<00:04,  5.10it/s][A
 49%|████▉     | 20/41 [00:04<00:03,  5.73it/s][A
 51%|█████     | 21/41 [00:04<00:03,  5.54it/s]

In [65]:
RESULT = []
for i in range(len(results)):
  for j in range(len(results[i])):
    RESULT.append(results[i][j])

In [66]:
data_dir = "hw3p2_submission.csv"
df = pd.DataFrame(columns=["index", "label"])
df.to_csv("hw3p2_submission.csv", index=False)
df = pd.read_csv(data_dir)
df['label'] = RESULT
df['index'] = np.array(range(2620))
df.to_csv("hw3p2_submission.csv", index=False)

!kaggle competitions submit -c automatic-speech-recognition-asr-slack-kaggle -f hw3p2_submission.csv -m "I made it!"


100% 209k/209k [00:00<00:00, 402kB/s]
Successfully submitted to Automatic Speech Recognition (ASR) Slack Kaggle