# HW6: Frame-Level Speech Recognition

In this homework, you will be working with MFCC data consisting of 28 features at each time step/frame. Your model should be able to recognize the phoneme that occurred in that frame.

# Libraries

In [7]:
!pip install torchsummaryX wandb --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.1/254.1 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
# Select the compute device once: the GPU when CUDA is available, otherwise the CPU.
# The training and evaluation functions below read this module-level name.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


In [1]:
## If you are using colab, you can import google drive to save model checkpoints in a folder
# Mounts Google Drive at /content/drive. The google.colab package is provided by the
# Colab runtime, so skip this cell when running the notebook anywhere else.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
### PHONEME LIST
# The position of each symbol in this list is the integer class label used for it:
# AudioDataset maps every transcript symbol to its index here, and the model's output
# size is the length of this list. Do not reorder it.
# '[SOS]' and '[EOS]' are the markers at the start and end of each transcript;
# AudioDataset drops the first and last transcript entry before building labels.
PHONEMES = [
            '[SIL]',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '[SOS]', '[EOS]']

# Dataset

This section covers the dataset/dataloader class for speech data. You will have to spend time writing code to create this class successfully. We have given you a lot of comments guiding you on what code to write at each stage, from top to bottom of the class. Please try and take your time figuring this out, as it will immensely help in creating dataset/dataloader classes for future homeworks.

Before running the following cells, please take some time to analyse the structure of the data. Try loading a single MFCC and its transcript, then print out their shapes and values. Do the transcripts look like phonemes?

In [3]:
# Download the homework data archive from Google Drive by file id. Per the recorded
# output it is saved as /content/Data.zip (about 4.28 GB).
!gdown 1gNVILsmbYZbYJQk2-NN2q7VG2RKJp-7x

Downloading...
From: https://drive.google.com/uc?id=1gNVILsmbYZbYJQk2-NN2q7VG2RKJp-7x
To: /content/Data.zip
100% 4.28G/4.28G [00:34<00:00, 123MB/s] 


In [15]:
# Extract the archive into /content/data
# (-q: quiet, -o: overwrite existing files without prompting, -d: target directory).
!unzip -qo /content/Data.zip -d '/content/data'

In [9]:
# Dataset class to load train and validation data
class AudioDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset with one phoneme label per frame.

    All utterances of a partition are cepstral-normalised and stored back to back in
    one pre-allocated float32 array of shape (2*context + T, 28), where T is the total
    number of frames and the first and last `context` rows are zero padding. Labels
    are stored in a uint8 array of shape (T,) holding indices into `phonemes`.

    Item `ind` is the flattened window of 2*context + 1 consecutive rows centred on
    frame `ind`, together with that frame's label. Windows are cut from the
    concatenated array, so near an utterance boundary they include frames of the
    neighbouring utterance.
    """

    # Number of MFCC features stored per frame.
    NUM_FEATURES = 28

    def __init__(self, root, phonemes = PHONEMES, context=0, partition= "train-clean-100"): # Feel free to add more arguments
        """Load every MFCC/transcript pair found under `root/partition`.

        Args:
            root: Directory containing the partition folders.
            phonemes: Ordered list of phoneme symbols; a symbol's position is its label.
            context: Number of neighbouring frames taken on each side of a frame.
            partition: Sub-directory of `root` holding `mfcc/` and `transcript/`.

        Raises:
            AssertionError: If the number of MFCC and transcript files differ, or if
                a transcript (after dropping its first and last entry) does not have
                exactly one label per MFCC frame.
            KeyError: If a transcript contains a symbol that is not in `phonemes`.
        """

        self.context    = context
        self.phonemes   = phonemes
        # MFCC and transcript directories of the requested partition
        self.mfcc_dir       = root + '/' + partition + '/mfcc/'
        self.transcript_dir = root + '/' + partition + '/transcript/'

        # Sorted listings so that the i-th MFCC file is paired with the i-th transcript file
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))
        transcript_names    = sorted(os.listdir(self.transcript_dir))

        # Making sure that we have the same no. of mfcc and transcripts
        assert len(mfcc_names) == len(transcript_names)

        # First pass: count frames so both arrays can be allocated once at their final
        # size instead of concatenating a list of per-utterance arrays.
        total_timestamps = 0
        for mfcc in mfcc_names:
            total_timestamps += len(np.load(self.mfcc_dir + mfcc))
        self.length = total_timestamps
        print(total_timestamps)

        # Progress marker between the counting pass and the loading pass.
        print("HERE")
        self.mfccs       = np.zeros((2*context + total_timestamps, self.NUM_FEATURES), dtype=np.float32)
        self.transcripts = np.zeros((total_timestamps), dtype=np.uint8)

        # Symbol -> label lookup. setdefault keeps the first position of a repeated
        # symbol, which is what list.index would return.
        phoneme_to_index = {}
        for index, phoneme in enumerate(self.phonemes):
            phoneme_to_index.setdefault(phoneme, index)

        # Second pass: normalise each utterance and copy it into place. Rows
        # [0, context) and the last `context` rows stay zero and act as padding.
        current_index = context
        for mfcc_name, transcript_name in zip(mfcc_names, transcript_names):
            # Cepstral normalisation is done per utterance
            cepstral_norm = self._cepstral_normalize(np.load(self.mfcc_dir + mfcc_name))

            # The first and last entries are the [SOS]/[EOS] markers; slicing drops
            # them without traversing the transcript.
            transcript = np.load(self.transcript_dir + transcript_name)[1:-1]

            num_frames = len(cepstral_norm)
            # Without this check a length mismatch would silently shift the labels of
            # every following utterance.
            assert len(transcript) == num_frames, \
                "{}: {} labels for {} frames".format(transcript_name, len(transcript), num_frames)

            self.mfccs[current_index: current_index + num_frames] = cepstral_norm
            label_start = current_index - context
            self.transcripts[label_start: label_start + num_frames] = np.fromiter(
                (phoneme_to_index[symbol] for symbol in transcript),
                dtype=np.uint8, count=num_frames)
            current_index += num_frames
        # Equals context + total number of frames once every utterance is placed
        print(current_index)

    @staticmethod
    def _cepstral_normalize(mfcc):
        """Standardise each feature column of one utterance to zero mean and unit deviation."""
        mfcc_mean   = np.mean(mfcc, axis = 0)
        mfcc_stddev = np.std(mfcc, axis = 0)
        # A feature that is constant over the utterance has zero deviation; divide by 1
        # there so it normalises to 0 instead of producing NaN/inf.
        mfcc_stddev = np.where(mfcc_stddev == 0, 1, mfcc_stddev)
        return (mfcc - mfcc_mean) / mfcc_stddev

    def __len__(self):
        """Return the total number of frames (one sample per frame)."""
        return self.length

    def __getitem__(self, ind):
        """Return (flattened context window, label) for frame `ind`.

        Because of the `context` padding rows at the top of self.mfccs, frame `ind`
        sits at row ind + context, so rows [ind, ind + 2*context] are centred on it.
        """
        frames = self.mfccs[ind: ind + 2*self.context+1]
        # The window is (2*context+1) x 28, but the MLP needs 1d data and not 2d.
        frames = frames.flatten()

        frames      = torch.FloatTensor(frames) # Convert to tensors
        phonemes    = torch.tensor(self.transcripts[ind])

        return frames, phonemes

In [10]:
class AudioTestDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset without labels, for producing test predictions.

    MFCC files are read in sorted order, cepstral-normalised per utterance and stored
    back to back in one pre-allocated float32 array of shape (2*context + T, 28),
    where T is the total number of frames and the first and last `context` rows are
    zero padding.

    Imp: the frames are kept in sorted file order; do NOT shuffle the data here or in
    the dataloader.
    """

    # Number of MFCC features stored per frame.
    NUM_FEATURES = 28

    def __init__(self, root, phonemes = PHONEMES, context=0, partition= "train-clean-100"): # Feel free to add more arguments
        """Load every MFCC file found under `root/partition/mfcc/`.

        Args:
            root: Directory containing the partition folders.
            phonemes: Ordered list of phoneme symbols; stored but not used for loading.
            context: Number of neighbouring frames taken on each side of a frame.
            partition: Sub-directory of `root` holding the `mfcc/` folder.
        """

        self.context    = context
        self.phonemes   = phonemes
        # MFCC directory of the requested partition
        self.mfcc_dir       = root + '/' + partition + '/mfcc/'

        # Sorted listing fixes the order in which frames are stored
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))

        # First pass: count frames so the array can be allocated once at its final
        # size instead of concatenating a list of per-utterance arrays.
        total_timestamps = 0
        for mfcc in mfcc_names:
            total_timestamps += len(np.load(self.mfcc_dir + mfcc))
        self.length = total_timestamps
        print(total_timestamps)
        self.mfccs = np.zeros((2*context + total_timestamps, self.NUM_FEATURES), dtype=np.float32)

        # Second pass: normalise each utterance and copy it into place. Rows
        # [0, context) and the last `context` rows stay zero and act as padding.
        current_index = context
        for mfcc_name in mfcc_names:
            cepstral_norm = self._cepstral_normalize(np.load(self.mfcc_dir + mfcc_name))
            num_frames = len(cepstral_norm)
            self.mfccs[current_index: current_index + num_frames] = cepstral_norm
            current_index += num_frames
        # Equals context + total number of frames once every utterance is placed
        print(current_index)

    @staticmethod
    def _cepstral_normalize(mfcc):
        """Standardise each feature column of one utterance to zero mean and unit deviation."""
        mfcc_mean   = np.mean(mfcc, axis = 0)
        mfcc_stddev = np.std(mfcc, axis = 0)
        # A feature that is constant over the utterance has zero deviation; divide by 1
        # there so it normalises to 0 instead of producing NaN/inf.
        mfcc_stddev = np.where(mfcc_stddev == 0, 1, mfcc_stddev)
        return (mfcc - mfcc_mean) / mfcc_stddev

    def __len__(self):
        """Return the total number of frames (one sample per frame)."""
        return self.length

    def __getitem__(self, ind):
        """Return the flattened context window for frame `ind`.

        Because of the `context` padding rows at the top of self.mfccs, frame `ind`
        sits at row ind + context, so rows [ind, ind + 2*context] are centred on it.
        """
        frames = self.mfccs[ind: ind + 2*self.context+1]
        # The window is (2*context+1) x 28, but the MLP needs 1d data and not 2d.
        frames = frames.flatten()

        frames      = torch.FloatTensor(frames) # Convert to tensors

        return frames

# Parameters Configuration

Storing your parameters and hyperparameters in a single configuration dictionary makes it easier to keep track of them during each experiment. It can also be used with weights and biases to log your parameters for each experiment and keep track of them across multiple experiments.

In [11]:
# Single place for the run's hyperparameters; the cells below read them by key.
# Add more as you need them - e.g dropout values, weight decay, scheduler parameters
config = dict(
    epochs       = 5,                  # number of passes over the training data
    batch_size   = 1024,               # samples per batch for every dataloader
    context      = 20,                 # neighbouring frames taken on each side of a frame
    init_lr      = 1e-3,               # learning rate handed to the optimizer
    architecture = 'very-low-cutoff',  # free-form label for this experiment
)

# Create Datasets

In [17]:
# Root folder of the extracted homework data; every partition lives underneath it.
DATA_ROOT = '/content/data/11-785-f23-hw1p2'

# Training data (labelled) - AudioDataset's default partition, spelled out here
train_data = AudioDataset(DATA_ROOT, context=config['context'], partition='train-clean-100')

# Validation data (labelled)
val_data = AudioDataset(DATA_ROOT, context=config['context'], partition='dev-clean')

# Test data (MFCCs only, no transcripts)
test_data = AudioTestDataset(DATA_ROOT, context=config['context'], partition='test-clean')

36091157
HERE
36091177
1928204
HERE
1928224
1934138
1934158


In [18]:
# Dataloaders for the train, val and test datasets.
# Each one yields batches of config['batch_size'] samples per iteration.
# Only the training loader shuffles; validation and test are read in stored order.

train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size  = config['batch_size'],
    shuffle     = True,
    num_workers = 4,
    pin_memory  = True,
)

val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size  = config['batch_size'],
    shuffle     = False,
    num_workers = 2,
    pin_memory  = True,
)

test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size  = config['batch_size'],
    shuffle     = False,
    num_workers = 2,
    pin_memory  = True,
)


# Summary of the setup: each input is a flattened window of 2*context+1 frames x 28 features
print("Batch size     : ", config['batch_size'])
print("Context        : ", config['context'])
print("Input size     : ", (2*config['context']+1)*28)
print("Output symbols : ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size     :  1024
Context        :  20
Input size     :  1148
Output symbols :  42
Train dataset samples = 36091157, batches = 35246
Validation dataset samples = 1928204, batches = 1884
Test dataset samples = 1934138, batches = 1889




In [19]:
# Testing code to check if your data loaders are working
# Pulls a single batch from the training loader and prints its shapes, then stops.
# NOTE: `frames` stays defined after this cell and is reused as the example input
# for summary() in the model-definition cell further down.
for i, data in enumerate(train_loader):
    frames, phoneme = data
    print(frames.shape, phoneme.shape)
    break

torch.Size([1024, 1148]) torch.Size([1024])


# Network Architecture


This section defines your network architecture for the homework. We have given you a sample architecture that can easily clear the very low cutoff for the early submission deadline.

In [20]:
# Baseline architecture: enough to cross the very low cutoff.
# Crossing the medium or high cutoff will take a lot more experimentation.
class Network(torch.nn.Module):
    """MLP with a single 512-unit ReLU hidden layer.

    Maps a batch of `input_size`-dimensional vectors to `output_size` unnormalised
    class scores (logits).
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        hidden_size = 512
        layers = [
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, output_size),
        ]
        # Kept under the attribute name `model` so parameter names stay `model.<i>.*`
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for the batch `x`."""
        return self.model(x)

# Define Model, Loss Function and Optimizer

Here we define the model, loss function, optimizer and optionally a learning rate scheduler.

In [21]:
# Each sample is a flattened window of (2*context + 1) frames with 28 features each.
INPUT_SIZE  = 28 * (1 + 2*config['context'])
# One output score per phoneme symbol
model       = Network(INPUT_SIZE, len(train_data.phonemes))
model       = model.to(device)
# Check number of parameters of your network
# Remember, you are limited to 25 million parameters for HW1 (including ensembles)
# `frames` is the batch left over from the dataloader check cell above.
summary(model, frames.to(device))

                 Kernel Shape Output Shape    Params Mult-Adds
Layer                                                         
0_model.Linear_0  [1148, 512]  [1024, 512]  588.288k  587.776k
1_model.ReLU_1              -  [1024, 512]         -         -
2_model.Linear_2    [512, 42]   [1024, 42]   21.546k   21.504k
---------------------------------------------------------------
                        Totals
Total params          609.834k
Trainable params      609.834k
Non-trainable params       0.0
Mult-Adds              609.28k


  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_model.Linear_0,"[1148, 512]","[1024, 512]",588288.0,587776.0
1_model.ReLU_1,-,"[1024, 512]",,
2_model.Linear_2,"[512, 42]","[1024, 42]",21546.0,21504.0


In [22]:
# Loss function: cross-entropy, because the task is multi-class classification
criterion = torch.nn.CrossEntropyLoss()

# Optimizer: Adam over all model parameters, starting at the configured learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['init_lr'])

# Recommended : Define Scheduler for Learning Rate,
# including but not limited to StepLR, MultiStepLR, CosineAnnealingLR, ReduceLROnPlateau, etc.
# You can refer to Pytorch documentation for more information on how to use them.

# Is your training time very high?
# Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it
# Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

# Training and Validation Functions

This section covers the training, and validation functions for each epoch of running your experiment with a given model architecture. The code has been provided to you, but we recommend going through the comments to understand the workflow to enable you to write these loops for future HWs.

In [23]:
# Release cached GPU memory and run a garbage-collection pass before training.
# gc.collect() is the cell's last expression, so its return value is displayed.
torch.cuda.empty_cache()
gc.collect()

36

In [24]:
def train(model, dataloader, optimizer, criterion):
    """Run one training epoch over `dataloader`.

    Data is moved to the module-level `device` before the forward pass.

    Args:
        model: Network to optimise; it is switched to training mode.
        dataloader: Iterable of (frames, phonemes) batches that supports len().
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss function, called as criterion(logits, phonemes).

    Returns:
        (tloss, tacc): the per-batch loss and the per-batch accuracy (a fraction in
        [0, 1]), each averaged over the number of batches. Both are 0 when the
        dataloader is empty.
    """

    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy
    # Size the progress bar and the averages from the loader that was passed in, so
    # the function is also correct for a loader other than the global train_loader.
    num_batches = len(dataloader)
    batch_bar   = tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        ### Forward Propagation
        logits  = model(frames)

        ### Loss Calculation
        loss    = criterion(logits, phonemes)

        ### Backward Propagation
        loss.backward()

        ### Gradient Descent
        optimizer.step()

        tloss   += loss.item()
        # Fraction of frames in this batch whose highest-scoring class is the label
        tacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))),
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        # Dropping the references is enough for the memory to be reused by the next
        # batch; torch.cuda.empty_cache() is not called per batch because it only
        # hands cached blocks back to the driver and slows the loop down.
        del frames, phonemes, logits

    batch_bar.close()
    if num_batches > 0:
        tloss   /= num_batches
        tacc    /= num_batches

    return tloss, tacc

In [27]:
def eval(model, dataloader):
    """Evaluate `model` on every batch of `dataloader` without updating it.

    Data is moved to the module-level `device`, and the loss is computed with the
    module-level `criterion`. Note that this function shadows Python's builtin
    `eval` in the notebook namespace.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable of (frames, phonemes) batches that supports len().

    Returns:
        (vloss, vacc): the per-batch loss and the per-batch accuracy (a fraction in
        [0, 1]), each averaged over the number of batches. Both are 0 when the
        dataloader is empty.
    """

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy
    # Size the progress bar and the averages from the loader that was passed in, so
    # the function is also correct for a loader other than the global val_loader.
    num_batches = len(dataloader)
    batch_bar   = tqdm(total=num_batches, dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode():
            ### Forward Propagation
            logits  = model(frames)
            ### Loss Calculation
            loss    = criterion(logits, phonemes)
            # Number of frames whose highest-scoring class is the label
            num_correct = torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()

        vloss   += loss.item()
        vacc    += num_correct/logits.shape[0]

        # No loss.backward() / optimizer.step() here: evaluation must not change the model.

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))),
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        # Dropping the references is enough for the memory to be reused by the next
        # batch; torch.cuda.empty_cache() is not called per batch because it only
        # hands cached blocks back to the driver and slows the loop down.
        del frames, phonemes, logits

    batch_bar.close()
    if num_batches > 0:
        vloss   /= num_batches
        vacc    /= num_batches

    return vloss, vacc

# Experiment

Now, it is time to finally run your ablations! Have fun!

In [None]:
# Iterate over number of epochs to train and evaluate your model
# Release cached GPU memory and collect garbage once before the run starts.
torch.cuda.empty_cache()
gc.collect()

# wandb.watch(model, log="all")

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    # Read the learning rate before training so the printed value is the one used this epoch.
    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    # One full pass over the training data, then one over the validation data.
    train_loss, train_acc   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_loader)

    # train() and eval() return accuracy as a fraction, hence the *100 for display.
    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    ### Log metrics at each epoch in your run
    # Optionally, you can log at each batch inside train/eval functions
    # (explore wandb documentation/wandb recitation)
    # wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
    #            'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    ### Highly Recommended: Save checkpoint in drive and/or wandb if accuracy is better than your current best

### Finish your wandb run
#run.finish()


Epoch 1/5


Train:   0%|          | 0/35246 [00:00<?, ?it/s]



Val:   0%|          | 0/1884 [00:00<?, ?it/s]

	Train Acc 69.7034%	Train Loss 0.9805	 Learning Rate 0.0010000
	Val Acc 68.8041%	Val Loss 1.0077

Epoch 2/5


Train:   0%|          | 0/35246 [00:00<?, ?it/s]