# Imports

In [1]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
# from torchsummaryX import summary
from torchinfo import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torchaudio.transforms as tat

from sklearn.metrics import accuracy_score
import gc

# import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime

# imports for decoding and distance calculation 
# `ctcdecode` is a obselete package, it won't go through compilation on any mordern compilers
# import ctcdecode
import Levenshtein
# from ctcdecode import CTCBeamDecoder
from torchaudio.models.decoder import ctc_decoder

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


# Dataset and Dataloader

In [2]:
# ARPABET PHONEME MAPPING
# DO NOT CHANGE

CMUdict_ARPAbet = {
    "" : " ",
    "[SIL]": "-", "NG": "G", "F" : "f", "M" : "m", "AE": "@",
    "R"    : "r", "UW": "u", "N" : "n", "IY": "i", "AW": "W",
    "V"    : "v", "UH": "U", "OW": "o", "AA": "a", "ER": "R",
    "HH"   : "h", "Z" : "z", "K" : "k", "CH": "C", "W" : "w",
    "EY"   : "e", "ZH": "Z", "T" : "t", "EH": "E", "Y" : "y",
    "AH"   : "A", "B" : "b", "P" : "p", "TH": "T", "DH": "D",
    "AO"   : "c", "G" : "g", "L" : "l", "JH": "j", "OY": "O",
    "SH"   : "S", "D" : "d", "AY": "Y", "S" : "s", "IH": "I",
    "[SOS]": "[SOS]", "[EOS]": "[EOS]"
}

CMUdict = list(CMUdict_ARPAbet.keys())
ARPAbet = list(CMUdict_ARPAbet.values())


PHONEMES = CMUdict[:-2]
LABELS = ARPAbet[:-2]

In [3]:

DATA_ROOT = "/mnt/e/Workspace/IDL/Data/hw3/11-785-s24-hw3p2"
MODEL_ROOT = "/mnt/e/Workspace/IDL/Models/hw1/11-785-s24-hw3p2/"

### Train Data

In [19]:
class PermuteBlock(torch.nn.Module):
    def forward(self, x):
        return x.transpose(1, 2)

In [31]:
class AudioDataset(torch.utils.data.Dataset):

    # For this homework, we give you full flexibility to design your data set class.
    # Hint: The data from HW1 is very similar to this HW

    #TODO
    def __init__(self, root=DATA_ROOT, partition="train-clean-100", use_cmn=False, audio_transformation=None):
        '''
        Initializes the dataset.

        INPUTS: What inputs do you need here?
        '''

        # Load the directory and all files in them
        
        self.phonemes = PHONEMES

        self.mfccs, self.transcripts = self._init_data(f"{root}/{partition}", use_cmn=use_cmn)
        
        print(self.length, len(self.mfccs), len(self.transcripts))

        if audio_transformation is not None:
            self.transformation = audio_transformation
        else:
            self.transformation = nn.Sequential()

        #TODO
        # HOW CAN WE REPRESENT PHONEMES? CAN WE CREATE A MAPPING FOR THEM?
        # HINT: TENSORS CANNOT STORE NON-NUMERICAL VALUES OR STRINGS

        #TODO
        # CREATE AN ARRAY OF ALL FEATUERS AND LABELS
        # WHAT NORMALIZATION TECHNIQUE DID YOU USE IN HW1? CAN WE USE IT HERE?
        '''
        You may decide to do this in __getitem__ if you wish.
        However, doing this here will make the __init__ function take the load of
        loading the data, and shift it away from training.
        '''
        
    def _init_data(self, root: str, use_cmn = False):
        self.mfcc_dir       = f"{root}/mfcc"
        self.transcript_dir = f"{root}/transcript"
        mfcc_names          = os.listdir(self.mfcc_dir)
        transcript_names    = os.listdir(self.transcript_dir)
        
        self.length = len(mfcc_names)

        self.mfccs, self.transcripts = [], []
        for i in tqdm(range(len(mfcc_names))):
        #   Load a single mfcc
            mfcc        = np.load(f"{self.mfcc_dir}/{mfcc_names[i]}")
        #   Do Cepstral Normalization of mfcc (explained in writeup)
            if use_cmn:
                mfcc = mfcc - np.mean(mfcc, axis=0)
        #   Load the corresponding transcript
            transcript  = np.load(f"{self.transcript_dir}/{transcript_names[i]}") 
            # Remove [SOS] and [EOS] from the transcript
            assert transcript[0] == '[SOS]' and transcript[-1] == '[EOS]'
            transcript = transcript[1:-1]
            #lookup phoneme index
            transcript = np.vectorize(self.phonemes.index)(transcript)
            # assert len(mfcc) == len(transcript)
            # (Is there an efficient way to do this without traversing through the transcript?)
            # Note that SOS will always be in the starting and EOS at end, as the name suggests.
        #   Append each mfcc to self.mfcc, transcript to self.transcript
            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)
            
        # return np.concatenate(self.mfccs, axis=0), np.concatenate(self.transcripts, axis=0)
        return self.mfccs, self.transcripts

    def __len__(self):

        '''
        TODO: What do we return here?
        '''
        return self.length

    def __getitem__(self, ind):
        '''
        TODO: RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS

        If you didn't do the loading and processing of the data in __init__,
        do that here.

        Once done, return a tuple of features and labels.
        '''

        mfcc = self.mfccs[ind]
        transcript = self.transcripts[ind]
        return torch.FloatTensor(mfcc), torch.tensor(transcript)


    def collate_fn(self,batch):
        '''
        TODO:
        1.  Extract the features and labels from 'batch'
        2.  We will additionally need to pad both features and labels,
            look at pytorch's docs for pad_sequence
        3.  This is a good place to perform transforms, if you so wish.
            Performing them on batches will speed the process up a bit.
        4.  Return batch of features, labels, lenghts of features,
            and lengths of labels.
        '''
        # batch of input mfcc coefficients
        batch_mfcc, batch_transcript, = [], []
        lengths_mfcc, lengths_transcript = [], []
        for (m, t) in batch:
          batch_mfcc.append(m)
          lengths_mfcc.append(len(m))
          batch_transcript.append(t)
          lengths_transcript.append(len(t))
        
       
        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first = True)
        
        batch_mfcc_pad = self.transformation(batch_mfcc_pad)
        
        batch_transcript_pad = pad_sequence(batch_transcript, batch_first = True)
        
        # You may apply some transformation, Time and Frequency masking, here in the collate function;
        # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
        #                  -> Would we apply transformation on the validation set as well?
        #                  -> Is the order of axes / dimensions as expected for the transform functions?

        # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
        return batch_mfcc_pad, batch_transcript_pad, torch.tensor(lengths_mfcc), torch.tensor(lengths_transcript)



### Test Data

In [5]:
# Test Dataloader
# TODO
class AudioDatasetTest(AudioDataset):
      def __init__(self, root=DATA_ROOT, partition="test-clean", use_cmn=True, audio_transformation=None):
            super().__init__(root, partition, use_cmn, audio_transformation=None)
      
      def _init_data(self, root: str, use_cmn):
            self.mfcc_dir = f"{root}/mfcc"

            mfcc_names = os.listdir(self.mfcc_dir)
            
            self.length = len(mfcc_names)

            self.mfccs, self.transcripts = [], []

            for i in tqdm(range(len(mfcc_names))):
                  #   Load a single mfcc
                  mfcc = np.load(f"{self.mfcc_dir}/{mfcc_names[i]}")
                  transcript = np.array([0 for _ in range(len(mfcc))])

                  assert len(mfcc) == len(transcript)

                  self.mfccs.append(mfcc)
                  self.transcripts.append(transcript)

            return np.concatenate(self.mfccs, axis=0), np.concatenate(self.transcripts, axis=0)
      
      def __getitem__(self, ind):
            mfcc = self.mfccs[ind]
            return torch.FloatTensor(mfcc)
      
      def collate_fn(self, batch):
            batch_mfcc = []
            lengths_mfcc = []
        
            for mfcc in batch:
                  batch_mfcc.append(mfcc)
            
                  lengths_mfcc.append(len(mfcc))
            
      
            batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first = True)
            

            # You may apply some transformation, Time and Frequency masking, here in the collate function;
            # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
            #                  -> Would we apply transformation on the validation set as well?
            #                  -> Is the order of axes / dimensions as expected for the transform functions?
            
            # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
            return batch_mfcc_pad, torch.tensor(lengths_mfcc)

### Config - Hyperparameters

In [79]:
root = '/content/11-785-s24-hw3p2/'

# Feel free to add more items here
config = {
    "beam_width" : 5,
    "lr"         : 2e-3,
    "epochs"     : 50,
    "batch_size" : 256,  # Increase if your device can handle it
    "dropout": 0.2
}

# You may pass this as a parameter to the dataset class above
# This will help modularize your implementation
transforms = nn.Sequential(
            PermuteBlock(),
            tat.FrequencyMasking(freq_mask_param=5),
            tat.TimeMasking(time_mask_param=100),
            PermuteBlock()
 ) # set of tranformations

### Data loaders

In [32]:
# get me RAMMM!!!!
import gc
gc.collect()

2443

In [33]:
# Create objects for the dataset class
train_data = AudioDataset(partition="train-clean-100", use_cmn=True, audio_transformation=transforms) 
val_data = AudioDataset(partition="dev-clean", use_cmn=True, audio_transformation=None)
test_data = AudioDatasetTest(partition="test-clean", use_cmn=True, audio_transformation=None)

# Do NOT forget to pass in the collate function as parameter while creating the dataloader


100%|██████████| 28539/28539 [03:07<00:00, 152.07it/s]


28539 28539 28539


100%|██████████| 2703/2703 [00:16<00:00, 159.58it/s]


2703 2703 2703


100%|██████████| 2620/2620 [00:08<00:00, 306.45it/s]

2620 (1934138, 27) (1934138,)





In [34]:
train_loader = torch.utils.data.DataLoader(
    dataset     = train_data, 
    num_workers = 8,
    batch_size  = config["batch_size"], 
    pin_memory  = True,
    shuffle     = True,
    collate_fn = train_data.collate_fn
)


In [35]:
val_loader = torch.utils.data.DataLoader(
    dataset     = val_data, 
    num_workers = 4,
    batch_size  = config["batch_size"], 
    pin_memory  = True,
    shuffle     = False,
    collate_fn = val_data.collate_fn
)
test_loader = torch.utils.data.DataLoader(
    dataset     = test_data, 
    num_workers = 4,
    batch_size  = 1, 
    pin_memory  = True,
    shuffle     = False,
    collate_fn = test_data.collate_fn
)

In [36]:
gc.collect()

print("Batch size: ", config["batch_size"])
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size:  256
Train dataset samples = 28539, batches = 112
Val dataset samples = 2703, batches = 11
Test dataset samples = 2620, batches = 2620


In [37]:
# sanity check
for data in train_loader:
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, ly.shape)
    break

torch.Size([256, 1686, 27]) torch.Size([256, 201]) torch.Size([256]) torch.Size([256])


# NETWORK

## ASR Network

### Pyramid Bi-LSTM (pBLSTM)

In [38]:
# Utils for network
torch.cuda.empty_cache()

In [39]:
class pBLSTM(torch.nn.Module):

    '''
    Pyramidal BiLSTM
    Read the write up/paper and understand the concepts and then write your implementation here.

    At each step,
    1. Pad your input if it is packed (Unpack it)
    2. Reduce the input length dimension by concatenating feature dimension
        (Tip: Write down the shapes and understand)
        (i) How should  you deal with odd/even length input? 
        (ii) How should you deal with input length array (x_lens) after truncating the input?
    3. Pack your input
    4. Pass it into LSTM layer

    To make our implementation modular, we pass 1 layer at a time.
    '''
    
    def __init__(self, input_size, hidden_size):
        super(pBLSTM, self).__init__()

        self.blstm = nn.LSTM(input_size = 2*input_size, hidden_size = hidden_size, num_layers = 1, bidirectional = True, dropout = 0.2, batch_first = True) 

    def forward(self, x_packed): # x_packed is a PackedSequence
        
        x , lengths = pad_packed_sequence(x_packed, batch_first = True)
        
        x, x_lens = self.trunc_reshape(x, lengths)
        
        x = pack_padded_sequence(x, x_lens, batch_first = True, enforce_sorted= False)
        
        x , h= self.blstm(x)
        
        return x

    def trunc_reshape(self, x, x_lens): 
        
        if x.shape[1]%2 != 0:
            x= x[:,:-1,:]

        x = x.reshape(x.shape[0],x.shape[1]//2, x.shape[2]*2)
        x_lens  = x_lens//2
        return x, x_lens

### Encoder

In [71]:
from torchnlp.nn import LockedDropout

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        expand_dims = [128, 256]
                
        self.embed = nn.Sequential(
            PermuteBlock(), 
            nn.Conv1d(in_channels=input_size, out_channels=expand_dims[0], kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(num_features=expand_dims[0]),
            nn.Mish(),
            nn.Conv1d(in_channels=expand_dims[0], out_channels=expand_dims[1], kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(num_features=expand_dims[1]),
            PermuteBlock())

        self.pBLSTMs = nn.Sequential(
            pBLSTM(input_size=expand_dims[1], hidden_size=hidden_size),
            LockedDropout(0.4),
            pBLSTM(input_size=2*hidden_size, hidden_size=hidden_size),
            LockedDropout(0.3)
        )
         
    def forward(self, x, lens):
        x = self.embed(x)      
        lens = lens.clamp(max=x.shape[1]).cpu()
        
        x = pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)
        x = self.pBLSTMs(x)
        outputs, lens = pad_packed_sequence(x, batch_first=True)

        return outputs, lens

### Decoder

In [75]:
class DynamicMlpNet(torch.nn.Module):
    def __init__(self, input_size, output_size, hidden_sizes, dropout_rate):
        super(DynamicMlpNet, self).__init__()
        self.layers = []
        for i, hs in enumerate(hidden_sizes):
            self.layers.extend(self._mlp_layer_provider(input_size, hs, dropout_rate, i))
            input_size = hs
        self.layers.append(nn.Linear(input_size, output_size)) # output
        self.model = nn.Sequential(*self.layers)
    
    def forward(self, x):
        out = self.model(x)
        return out
    
    def _mlp_layer_provider(self, input_size, hidden_size, dropout_rate, idx) -> list[nn.Module]:
        return [
            nn.Linear(input_size, hidden_size),
            nn.LeakyReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(dropout_rate)
        ]
    
    def describe() -> str:
        return "base model, don't use this"

class Decoder(DynamicMlpNet):

    def __init__(self, embed_size, output_size= 41):
        super().__init__(input_size=embed_size, output_size=output_size, dropout_rate=config["dropout"], hidden_sizes=[2048,1024])
                
        self.softmax = torch.nn.LogSoftmax(dim=2)
    
    def _mlp_layer_provider(self, input_size, hidden_size, dropout_rate, idx) -> list[nn.Module]:
        return [
            nn.Linear(input_size, hidden_size),
            nn.Mish(),
            PermuteBlock(), torch.nn.BatchNorm1d(hidden_size), PermuteBlock(),
            nn.Dropout(dropout_rate),
        ] if idx != 0 else [
            PermuteBlock(), torch.nn.BatchNorm1d(input_size), PermuteBlock(),
            nn.Linear(input_size, hidden_size),
            nn.Mish(),
            PermuteBlock(), torch.nn.BatchNorm1d(hidden_size), PermuteBlock(),
            nn.Dropout(dropout_rate),
        ]
    
    def forward(self, encoder_out):
        out = self.model(encoder_out)
        out = self.softmax(out)
        return out 
    
    def describe() -> str:
        return "MLP Decoder model"

In [76]:
class ASRModel(torch.nn.Module):

    def __init__(self, input_size, embed_size= 192, output_size= len(PHONEMES)):
        super().__init__()

        # self.augmentations  = torch.nn.Sequential(
        #     #TODO Add Time Masking/ Frequency Masking
        #     #Hint: See how to use PermuteBlock() function defined above
        #     PermuteBlock(), 
        #     torchaudio.transforms.FrequencyMasking(freq_mask_param=10),
        #     torchaudio.transforms.TimeMasking(time_mask_param=100),
        #     PermuteBlock(),
        # ) # did augmentation in the collate_fn
        self.encoder        = Encoder(input_size, embed_size)# TODO: Initialize Encoder
        self.decoder        = Decoder(embed_size*2, output_size) # TODO: Initialize Decoder 

        
    
    def forward(self, x, lengths_x):
        
        # if self.training:
        #     x = self.augmentations(x)

        encoder_out, encoder_lens   = self.encoder(x, lengths_x)
        decoder_out                 = self.decoder(encoder_out)

        return decoder_out, encoder_lens

## Initialize ASR Network

In [77]:
model = ASRModel(
    input_size  = 27,
    embed_size  = 512,
    output_size = len(PHONEMES)
).to(device)
print(model)
summary(model, input_data=[x.to(device), lx.to(device)], device=device)

ASRModel(
  (encoder): Encoder(
    (embed): Sequential(
      (0): PermuteBlock()
      (1): Conv1d(27, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Mish()
      (4): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): PermuteBlock()
    )
    (pBLSTMs): Sequential(
      (0): pBLSTM(
        (blstm): LSTM(512, 512, batch_first=True, dropout=0.2, bidirectional=True)
      )
      (1): LockedDropout(p=0.4)
      (2): pBLSTM(
        (blstm): LSTM(2048, 512, batch_first=True, dropout=0.2, bidirectional=True)
      )
      (3): LockedDropout(p=0.3)
    )
  )
  (decoder): Decoder(
    (model): Sequential(
      (0): PermuteBlock()
      (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PermuteBlock()
      (3): Linear(in_fea

Layer (type:depth-idx)                   Output Shape              Param #
ASRModel                                 [256, 421, 41]            --
├─Encoder: 1-1                           [256, 421, 1024]          --
│    └─Sequential: 2-1                   [256, 1686, 256]          --
│    │    └─PermuteBlock: 3-1            [256, 27, 1686]           --
│    │    └─Conv1d: 3-2                  [256, 128, 1686]          10,496
│    │    └─BatchNorm1d: 3-3             [256, 128, 1686]          256
│    │    └─Mish: 3-4                    [256, 128, 1686]          --
│    │    └─Conv1d: 3-5                  [256, 256, 1686]          98,560
│    │    └─BatchNorm1d: 3-6             [256, 256, 1686]          512
│    │    └─PermuteBlock: 3-7            [256, 1686, 256]          --
│    └─Sequential: 2-2                   [80940, 1024]             --
│    │    └─pBLSTM: 3-8                  [162003, 1024]            4,202,496
│    │    └─LockedDropout: 3-9           [162003, 1024]            -

# Training Config
Initialize Loss Criterion, Optimizer, CTC Beam Decoder, Scheduler, Scaler (Mixed-Precision), etc.

In [83]:
import torchaudio
torchaudio.__version__

'2.1.0'

In [85]:
from torchaudio.models.decoder import ctc_decoder
from pyctcdecode import build_ctcdecoder


criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False) # Define CTC loss as the criterion. How would the losses be reduced?
# CTC Loss: https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html
# Refer to the handout for hints

optimizer =  torch.optim.AdamW(model.parameters(), lr= config['lr']) # What goes in here?

# Declare the decoder. Use the CTC Beam Decoder to decode phonemes
# CTC Beam Decoder Doc: https://github.com/parlance/ctcdecode

# decoder = CTCBeamDecoder(LABELS, beam_width = config["beam_width"], log_probs_input = True)
decoder = build_ctcdecoder(
    LABELS,
    
)

# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', patience = 3, threshold=1e-2)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = 50, eta_min = 1e-6)
# Mixed Precision, if you need it
scaler = torch.cuda.amp.GradScaler()

ValueError: Unknown entry in dictionary: ''

# Decode Prediction

In [None]:
def decode_prediction(output, output_lens, decoder, PHONEME_MAP= LABELS):
    
    # TODO: look at docs for CTC.decoder and find out what is returned here. Check the shape of output and expected shape in decode.
    beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(output, seq_lens= output_lens) #lengths - list of lengths

    pred_strings  = []
    # print(beam_results.shape)
    # print(beam_results)
    for i in range(output_lens.shape[0]):
        #TODO: Create the prediction from the output of decoder.decode. Don't forget to map it using PHONEMES_MAP.
        pred_strings.append(''.join([PHONEME_MAP[n] for n in beam_results[i][0][:out_seq_len[i][0]]]))
    # print(pred_strings)
    
    return pred_strings

def calculate_levenshtein(output, label, output_lens, label_lens, decoder, PHONEME_MAP= LABELS): # y - sequence of integers
    
    dist            = 0
    batch_size      = label.shape[0]

    pred_strings    = decode_prediction(output, output_lens, decoder, PHONEME_MAP)
    # print(batch_size)
    for i in range(batch_size):
        # TODO: Get predicted string and label string for each element in the batch
        pred_string = pred_strings[i]#TODO
        # print('pred',pred_string)
        label_string = ''.join([PHONEME_MAP[n] for n in label[i][:label_lens[i]]]) #TODO
        # print('label',label_string)
        dist += Levenshtein.distance(pred_string, label_string)

    dist /= batch_size # TODO: Uncomment this, but think about why we are doing this
    
    return dist

# Test Implementation

In [None]:
# test code to check shapes

model.eval()
for i, data in enumerate(val_loader, 0):
    x, y, lx, ly = data
    x, y = x.to(device), y.to(device)
    h, lh = model(x, lx)
    print(h.shape)
    h = torch.permute(h, (1, 0, 2))
    print(h.shape, y.shape)
    loss = criterion(h, y, lh, ly)
    print(loss)

    print(calculate_levenshtein(h, y, lx, ly, decoder, LABELS))

    break

# WandB

You will need to fetch your api key from wandb.ai

In [None]:
import wandb
wandb.login(key="<replace with your API key here>")

In [None]:
run = wandb.init(
    name = "early-submission", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # run_id = ### Insert specific run id here if you want to resume a previous run
    # resume = "must" ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw3p2-ablations", ### Project should be created in your wandb account
    config = config ### Wandb Config for your run
)

# Train Functions

In [None]:
from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer):

    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.cuda.amp.autocast():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar

        # Another couple things you need for FP16.
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16

        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()

    batch_bar.close() # You need this to close the tqdm bar

    return total_loss / len(train_loader)


def validate_model(model, val_loader, decoder, phoneme_map= LABELS):

    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0
    vdist = 0

    for i, data in enumerate(val_loader):

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.inference_mode():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += float(loss)
        vdist += calculate_levenshtein(torch.permute(h, (1, 0, 2)), y, lh, ly, decoder, phoneme_map)

        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))), dist="{:.04f}".format(float(vdist / (i + 1))))

        batch_bar.update()

        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()

    batch_bar.close()
    total_loss = total_loss/len(val_loader)
    val_dist = vdist/len(val_loader)
    return total_loss, val_dist

## Training Setup

In [None]:
def save_model(model, optimizer, scheduler, metric, epoch, path):
    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : scheduler.state_dict(),
         metric[0]                  : metric[1],
         'epoch'                    : epoch},
         path
    )

def load_model(path, model, metric= 'valid_acc', optimizer= None, scheduler= None):

    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer != None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler != None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch   = checkpoint['epoch']
    metric  = checkpoint[metric]

    return [model, optimizer, scheduler, epoch, metric]

In [None]:
# This is for checkpointing, if you're doing it over multiple sessions

last_epoch_completed = 0
start = last_epoch_completed
end = config["epochs"]
best_lev_dist = float("inf") # if you're restarting from some checkpoint, use what you saw there.
epoch_model_path = #TODO set the model path( Optional, you can just store best one. Make sure to make the changes below )
best_model_path = #TODO set best model path

In [None]:
torch.cuda.empty_cache()
gc.collect()

#TODO: Please complete the training loop

for epoch in range(0, config['epochs']):

    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))

    curr_lr = #TODO

    train_loss              = #TODO
    valid_loss, valid_dist  = #TODO
    scheduler.step(valid_dist)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    print("\tVal Dist {:.04f}%\t Val Loss {:.04f}".format(valid_dist, valid_loss))


    wandb.log({
        'train_loss': train_loss,
        'valid_dist': valid_dist,
        'valid_loss': valid_loss,
        'lr'        : curr_lr
    })

    save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, epoch_model_path)
    wandb.save(epoch_model_path)
    print("Saved epoch model")

    if valid_dist <= best_lev_dist:
        best_lev_dist = valid_dist
        save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, best_model_path)
        wandb.save(best_model_path)
        print("Saved best model")
      # You may find it interesting to exlplore Wandb Artifcats to version your models
run.finish()

# Generate Predictions and Submit to Kaggle

In [None]:
#TODO: Make predictions

# Follow the steps below:
# 1. Create a new object for CTCBeamDecoder with larger (why?) number of beams
# 2. Get prediction string by decoding the results of the beam decoder

TEST_BEAM_WIDTH = #TODO

test_decoder    = #TODO
results = []

model.eval()
print("Testing")
for data in tqdm(test_loader):

    x, lx   = data
    x       = x.to(device)

    with torch.no_grad():
        h, lh = model(x, lx)

    prediction_string= # TODO call decode_prediction
    #TODO save the output in results array.

    del x, lx, h, lh
    torch.cuda.empty_cache()

In [None]:
data_dir = f"{root}/test-clean/random_submission.csv"
df = pd.read_csv(data_dir)
df.label = results
df.to_csv('submission.csv', index = False)

In [None]:
!kaggle competitions submit -c hw3p2asr-s24 -f submission.csv -m "I made it!"