# Frame-Level Speech Recognition

In this homework, you will be working with MFCC data consisting of 27 features at each time step/frame. Your model should be able to recognize the phoneme that occurred in that frame.

# Libraries

In [None]:
!pip install torchsummaryX wandb --quiet
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)
### PHONEME LIST
PHONEMES = [
            '[SIL]',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',  
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '[SOS]', '[EOS]']

Device:  cuda


In [None]:
# ### If you are using colab, you can import google drive to save model checkpoints in a folder
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

# Write kaggle.json from environment variables so the API key never lives in
# the notebook (or in its saved outputs / version history).
# NOTE(review): a key was previously committed in this cell; it should be
# revoked and regenerated in the Kaggle account settings.
import json

kaggle_username = os.environ.get("KAGGLE_USERNAME")
kaggle_key = os.environ.get("KAGGLE_KEY")
if not kaggle_username or not kaggle_key:
    raise RuntimeError(
        "Set the KAGGLE_USERNAME and KAGGLE_KEY environment variables "
        "(e.g. with %env or your platform's secrets manager) before running this cell."
    )

with open("/root/.kaggle/kaggle.json", "w") as f:
    # json.dump handles quoting/escaping instead of hand-building the JSON string.
    json.dump({"username": kaggle_username, "key": kaggle_key}, f)

!chmod 600 /root/.kaggle/kaggle.json

# # commands to download data from kaggle

# !kaggle competitions download -c 11-785-s23-hw1p2
# !mkdir '/content/data'

# !unzip -qo '11-785-s23-hw1p2.zip' -d '/content/data'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaggle==1.5.8
  Using cached kaggle-1.5.8-py3-none-any.whl
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.8
    Uninstalling kaggle-1.5.8:
      Successfully uninstalled kaggle-1.5.8
Successfully installed kaggle-1.5.8
mkdir: cannot create directory ‘/root/.kaggle’: File exists


# ONLY RUN THIS ONCE, COMMENT THIS OUT AFTERWARDS

In [None]:
# Writes a checkpoint file holding only {'val_acc': 0}; presumably the initial
# "best validation accuracy so far" record.
# NOTE(review): the training cell further down loads
# '/content/status/best_train.pth', not this file - confirm which filename is
# intended. This call also assumes the /content/status directory already exists.
torch.save({'val_acc': 0}, '/content/status/currentBest.pth')

# Kaggle

This section contains code that helps you install Kaggle's API and create kaggle.json with your username and API key details. Make sure to input those in the given code to ensure you can download data from the competition successfully.

# Dataset

This section covers the dataset/dataloader class for speech data. You will have to spend time writing code to create this class successfully. We have given you a lot of comments guiding you on what code to write at each stage, from top to bottom of the class. Please try and take your time figuring this out, as it will immensely help in creating dataset/dataloader classes for future homeworks.

Before running the following cells, please take some time to analyse the structure of the data. Try loading a single MFCC and its transcript, print out the shapes and print out the values. Do the transcripts look like phonemes?

In [None]:
# Dataset class to load one contiguous slice of the utterances in a labelled partition

class AudioDatasetPart(torch.utils.data.Dataset):
    """Frame-level dataset over a contiguous run of utterances from one partition.

    The sorted list of utterance files is sliced starting at
    ``int(num_files * percentage)`` and spanning ``int(num_files * span)``
    files. Each selected MFCC is cepstral-normalised, and all frames are
    packed into one zero-padded ``(T + 2*context, 27)`` float32 array so that
    a context window can be taken around every frame.

    Args:
        root: Dataset root directory.
        context: Number of context frames taken on each side of a frame.
        partition: Sub-directory of ``root`` containing ``mfcc`` and ``transcript``.
        percentage: Fraction of the file list at which the slice starts.
        phonemes: Phoneme vocabulary; labels are indexes into this list.
        span: Fraction of the file list covered by the slice (default 0.25,
            the value that used to be hard-coded).
    """

    def __init__(self, root, context, partition, percentage, phonemes = PHONEMES, span = 0.25): # Feel free to add more arguments

        self.context    = context
        self.phonemes   = phonemes

        # MFCC / transcript directories of the requested partition
        self.mfcc_dir       = root+'/'+partition+'/mfcc'
        self.transcript_dir = root+'/'+partition+'/transcript'

        # Sorted listings so that the i-th mfcc pairs with the i-th transcript
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))
        transcript_names    = sorted(os.listdir(self.transcript_dir))

        # Making sure that we have the same no. of mfcc and transcripts
        assert len(mfcc_names) == len(transcript_names)

        num_mfccs = len(mfcc_names)

        # Contiguous file slice; the end is clipped so percentage + span > 1
        # cannot index past the file list.
        start   = int(num_mfccs*percentage)
        stop    = min(start + int(span*num_mfccs), num_mfccs)
        ind_arr = range(start, stop)

        # First pass: total frame count, so the buffers can be preallocated
        # once instead of holding a list of arrays plus a concatenated copy.
        T = sum(np.load(self.mfcc_dir+'/'+mfcc_names[i]).shape[0] for i in ind_arr)

        self.mfccs = np.zeros((T+2*context,27),dtype=np.float32)
        self.transcripts = np.zeros((T,),dtype=int)

        # Phoneme -> index table (first occurrence wins, like list.index) so
        # each frame label is an O(1) lookup instead of a list scan.
        phoneme_to_index = {}
        for idx, phoneme in enumerate(phonemes):
            phoneme_to_index.setdefault(phoneme, idx)

        # cx starts at `context`: the first/last `context` rows stay zero padding
        cx,cy = context,0

        # Second pass: normalise and copy every utterance into the buffers
        for i in ind_arr:
            mfcc = self._cepstral_normalize(np.load(self.mfcc_dir+'/'+mfcc_names[i]))

            transcript = np.load(self.transcript_dir+'/'+transcript_names[i])
            transcript = transcript[1:-1] # Remove [SOS] and [EOS] from the transcript

            try:
                transcript = [phoneme_to_index[p] for p in transcript]
            except KeyError as err:
                # Keep the exception type list.index used to raise.
                raise ValueError('unknown phoneme {} in {}'.format(err, transcript_names[i])) from None

            self.mfccs[cx:cx+mfcc.shape[0]] = mfcc
            self.transcripts[cy:cy+len(transcript)] = transcript

            cx,cy = cx+mfcc.shape[0], cy+len(transcript)

        # Length of the dataset is the number of labelled frames
        self.length = len(self.transcripts)

    @staticmethod
    def _cepstral_normalize(mfcc):
        """Zero-mean / unit-variance normalise each feature column in place."""
        sigma = np.std(mfcc,axis=0)
        # A constant column has sigma == 0; dividing would yield NaN/inf.
        sigma[sigma == 0] = 1.0
        mfcc -= np.mean(mfcc,axis=0)
        mfcc /= sigma
        return mfcc

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        """Return (flattened context window, phoneme index) for frame ``ind``."""
        if ind < 0:
            ind = ind + self.length
        if not 0 <= ind < self.length:
            raise IndexError('index {} out of range for dataset of length {}'.format(ind, self.length))

        # Because of the leading padding, rows ind .. ind+2*context are centred on frame `ind`.
        frames = self.mfccs[ind : ind + 2 * self.context + 1, :]
        # The MLP needs 1d input, so flatten the (2*context+1) x 27 window.
        frames = frames.flatten()

        frames      = torch.FloatTensor(frames) # Convert to tensors
        phonemes    = torch.tensor(self.transcripts[ind])

        return frames, phonemes

In [None]:
# Dataset class to load every utterance of a labelled partition (train / validation)

class AudioDataset(torch.utils.data.Dataset):
    """Frame-level dataset over all utterances of one labelled partition.

    Each MFCC is cepstral-normalised and all frames are packed into one
    zero-padded ``(T + 2*context, 27)`` float32 array so that a context
    window can be taken around every frame.

    Args:
        root: Dataset root directory.
        context: Number of context frames taken on each side of a frame.
        partition: Sub-directory of ``root`` containing ``mfcc`` and ``transcript``.
        phonemes: Phoneme vocabulary; labels are indexes into this list.
    """

    def __init__(self, root, context, partition, phonemes = PHONEMES): # Feel free to add more arguments

        self.context    = context
        self.phonemes   = phonemes

        # MFCC / transcript directories of the requested partition
        self.mfcc_dir       = root+'/'+partition+'/mfcc'
        self.transcript_dir = root+'/'+partition+'/transcript'

        # Sorted listings so that the i-th mfcc pairs with the i-th transcript
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))
        transcript_names    = sorted(os.listdir(self.transcript_dir))

        # Making sure that we have the same no. of mfcc and transcripts
        assert len(mfcc_names) == len(transcript_names)

        num_mfccs = len(mfcc_names)

        # First pass: total frame count, so the buffers can be preallocated
        # once instead of holding a list of arrays plus a concatenated copy.
        T = sum(np.load(self.mfcc_dir+'/'+name).shape[0] for name in mfcc_names)

        self.mfccs = np.zeros((T+2*context,27),dtype=np.float32)
        self.transcripts = np.zeros((T,),dtype=int)

        # Phoneme -> index table (first occurrence wins, like list.index) so
        # each frame label is an O(1) lookup instead of a list scan.
        phoneme_to_index = {}
        for idx, phoneme in enumerate(phonemes):
            phoneme_to_index.setdefault(phoneme, idx)

        # cx starts at `context`: the first/last `context` rows stay zero padding
        cx,cy = context,0

        # Second pass: normalise and copy every utterance into the buffers
        for i in range(num_mfccs):
            mfcc = self._cepstral_normalize(np.load(self.mfcc_dir+'/'+mfcc_names[i]))

            transcript = np.load(self.transcript_dir+'/'+transcript_names[i])
            transcript = transcript[1:-1] # Remove [SOS] and [EOS] from the transcript

            try:
                transcript = [phoneme_to_index[p] for p in transcript]
            except KeyError as err:
                # Keep the exception type list.index used to raise.
                raise ValueError('unknown phoneme {} in {}'.format(err, transcript_names[i])) from None

            self.mfccs[cx:cx+mfcc.shape[0]] = mfcc
            self.transcripts[cy:cy+len(transcript)] = transcript

            cx,cy = cx+mfcc.shape[0], cy+len(transcript)

        # Length of the dataset is the number of labelled frames
        self.length = len(self.transcripts)

    @staticmethod
    def _cepstral_normalize(mfcc):
        """Zero-mean / unit-variance normalise each feature column in place."""
        sigma = np.std(mfcc,axis=0)
        # A constant column has sigma == 0; dividing would yield NaN/inf.
        sigma[sigma == 0] = 1.0
        mfcc -= np.mean(mfcc,axis=0)
        mfcc /= sigma
        return mfcc

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        """Return (flattened context window, phoneme index) for frame ``ind``."""
        if ind < 0:
            ind = ind + self.length
        if not 0 <= ind < self.length:
            raise IndexError('index {} out of range for dataset of length {}'.format(ind, self.length))

        # Because of the leading padding, rows ind .. ind+2*context are centred on frame `ind`.
        frames = self.mfccs[ind : ind + 2 * self.context + 1, :]
        # The MLP needs 1d input, so flatten the (2*context+1) x 27 window.
        frames = frames.flatten()

        frames      = torch.FloatTensor(frames) # Convert to tensors
        phonemes    = torch.tensor(self.transcripts[ind])

        return frames, phonemes

# Dataset class to load unlabelled test data (MFCCs only, no transcripts)

class AudioTestDataset(torch.utils.data.Dataset):
    """Frame-level dataset over the MFCCs of an unlabelled partition.

    Each MFCC file is cepstral-normalised, all frames are concatenated in
    sorted file order, and ``context`` rows of zeros are padded on both ends
    so that a context window can be taken around every frame.

    Args:
        root: Dataset root directory.
        context: Number of context frames taken on each side of a frame.
        partition: Sub-directory of ``root`` containing the ``mfcc`` folder.
    """

    def __init__(self, root, context, partition): # Feel free to add more arguments

        self.context    = context

        # MFCC directory of the requested partition
        self.mfcc_dir       = root+'/'+partition+'/mfcc'

        # Sorted listing fixes the order in which frames (and predictions) appear
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))

        self.mfccs = []

        for name in mfcc_names:
            # Load a single mfcc of shape (T_i, features)
            mfcc = np.load(self.mfcc_dir+'/'+name)
            # Cepstral normalisation: zero mean / unit variance per feature column
            sigma = np.std(mfcc,axis=0)
            # A constant column has sigma == 0; dividing would yield NaN/inf.
            sigma[sigma == 0] = 1.0
            mfcc -= np.mean(mfcc,axis=0)
            mfcc /= sigma
            self.mfccs.append(mfcc)

        # Concatenate along time: final shape is T x features (T = T1 + T2 + ...)
        self.mfccs = np.concatenate(self.mfccs, axis=0)

        # Length of the dataset is the number of real (unpadded) frames
        self.length = len(self.mfccs)

        # Zero-pad `context` rows on top and bottom so edge frames get a full window
        self.mfccs = np.pad(self.mfccs, ((self.context,self.context), (0, 0)), 'constant', constant_values=0)

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        """Return the flattened context window centred on frame ``ind``."""
        if ind < 0:
            ind = ind + self.length
        if not 0 <= ind < self.length:
            raise IndexError('index {} out of range for dataset of length {}'.format(ind, self.length))

        # Because of the leading padding, rows ind .. ind+2*context are centred on frame `ind`.
        frames = self.mfccs[ind : ind + 2 * self.context + 1, :]
        # The MLP needs 1d input, so flatten the (2*context+1) x features window.
        frames = frames.flatten()

        frames      = torch.FloatTensor(frames) # Convert to tensors

        return frames

# Dataset class to load a random subset of a labelled partition for ablation runs

class AblationDataset(torch.utils.data.Dataset):
    """Frame-level dataset over a random subset of utterances from one partition.

    ``int(percentage * num_files)`` distinct utterances are drawn at random.
    Each selected MFCC is cepstral-normalised and all frames are packed into
    one zero-padded ``(T + 2*context, 27)`` float32 array so that a context
    window can be taken around every frame.

    Args:
        root: Dataset root directory.
        context: Number of context frames taken on each side of a frame.
        partition: Sub-directory of ``root`` containing ``mfcc`` and ``transcript``.
        percentage: Fraction (0..1) of the utterances to sample.
        phonemes: Phoneme vocabulary; labels are indexes into this list.
        seed: Optional seed for a reproducible subset. When ``None`` the
            global NumPy random state is used.
    """

    def __init__(self, root, context, partition, percentage, phonemes = PHONEMES, seed = None): # Feel free to add more arguments

        self.context    = context
        self.phonemes   = phonemes

        # MFCC / transcript directories of the requested partition
        self.mfcc_dir       = root+'/'+partition+'/mfcc'
        self.transcript_dir = root+'/'+partition+'/transcript'

        # Sorted listings so that the i-th mfcc pairs with the i-th transcript
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))
        transcript_names    = sorted(os.listdir(self.transcript_dir))

        # Making sure that we have the same no. of mfcc and transcripts
        assert len(mfcc_names) == len(transcript_names)

        num_mfccs = len(mfcc_names)
        num_samples = int(percentage*num_mfccs)

        # replace=False: the default (with replacement) would put the same
        # utterance into the subset several times and skip others.
        rng = np.random if seed is None else np.random.RandomState(seed)
        ind_arr = rng.choice(num_mfccs, size=num_samples, replace=False)

        # First pass: total frame count, so the buffers can be preallocated
        # once instead of holding a list of arrays plus a concatenated copy.
        T = sum(np.load(self.mfcc_dir+'/'+mfcc_names[i]).shape[0] for i in ind_arr)

        self.mfccs = np.zeros((T+2*context,27),dtype=np.float32)
        self.transcripts = np.zeros((T,),dtype=int)

        # Phoneme -> index table (first occurrence wins, like list.index) so
        # each frame label is an O(1) lookup instead of a list scan.
        phoneme_to_index = {}
        for idx, phoneme in enumerate(phonemes):
            phoneme_to_index.setdefault(phoneme, idx)

        # cx starts at `context`: the first/last `context` rows stay zero padding
        cx,cy = context,0

        # Second pass: normalise and copy every sampled utterance into the buffers
        for i in ind_arr:
            mfcc = self._cepstral_normalize(np.load(self.mfcc_dir+'/'+mfcc_names[i]))

            transcript = np.load(self.transcript_dir+'/'+transcript_names[i])
            transcript = transcript[1:-1] # Remove [SOS] and [EOS] from the transcript

            try:
                transcript = [phoneme_to_index[p] for p in transcript]
            except KeyError as err:
                # Keep the exception type list.index used to raise.
                raise ValueError('unknown phoneme {} in {}'.format(err, transcript_names[i])) from None

            self.mfccs[cx:cx+mfcc.shape[0]] = mfcc
            self.transcripts[cy:cy+len(transcript)] = transcript

            cx,cy = cx+mfcc.shape[0], cy+len(transcript)

        # Length of the dataset is the number of labelled frames
        self.length = len(self.transcripts)

    @staticmethod
    def _cepstral_normalize(mfcc):
        """Zero-mean / unit-variance normalise each feature column in place."""
        sigma = np.std(mfcc,axis=0)
        # A constant column has sigma == 0; dividing would yield NaN/inf.
        sigma[sigma == 0] = 1.0
        mfcc -= np.mean(mfcc,axis=0)
        mfcc /= sigma
        return mfcc

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        """Return (flattened context window, phoneme index) for frame ``ind``."""
        if ind < 0:
            ind = ind + self.length
        if not 0 <= ind < self.length:
            raise IndexError('index {} out of range for dataset of length {}'.format(ind, self.length))

        # Because of the leading padding, rows ind .. ind+2*context are centred on frame `ind`.
        frames = self.mfccs[ind : ind + 2 * self.context + 1, :]
        # The MLP needs 1d input, so flatten the (2*context+1) x 27 window.
        frames = frames.flatten()

        frames      = torch.FloatTensor(frames) # Convert to tensors
        phonemes    = torch.tensor(self.transcripts[ind])

        return frames, phonemes

# Hyperparameters shared by the datasets, data loaders, model, optimizer/scheduler
# and the wandb run defined in the rest of the notebook.
config = {
    'epochs'        : 24,       # epoch total shown in the per-epoch progress message
    'batch_size'    : 32768,    # frames per batch for every DataLoader
    'context'       : 35,       # context frames on each side; model input is (2*context+1)*27 values
    'init_lr'       : 0.002,    # initial learning rate handed to the optimizer
    'dropout'       : 0.1,      # dropout probability used by every Dropout layer in Network
    'gamma'         : 0.85,     # decay factor passed to StepLR (step_size=1)
    # Record of the layer widths for logging.
    # NOTE(review): Network hard-codes its layer sizes and does not read this key.
    'architecture'  : [2048, 2048, 2048, 1024, 1024, 1024, 1024, 42]
    # # early stopping parameters 
    # 'patience' : 2,
    # 'min_delta' : 0.001,
    # 'min_lr' : 1e-4
}

# Parameters Configuration

Storing your parameters and hyperparameters in a single configuration dictionary makes it easier to keep track of them during each experiment. It can also be used with weights and biases to log your parameters for each experiment and keep track of them across multiple experiments. 

# Create Datasets

In [None]:
# Create a dataset object using the AudioDataset class for the training data 
train_a = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-360', 0)

In [None]:
train_b = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-360', 0.25)

In [None]:
train_c = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-360', 0.5)

In [None]:
train_d = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-360', 0.75)

In [None]:
# Create a dataset object using the AudioDataset class for the training data 
train_e = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-100', 0)

In [None]:
# Create a dataset object using the AudioDataset class for the training data 
train_f = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-100', 0.25)

In [None]:
# Create a dataset object using the AudioDataset class for the training data 
train_g = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-100', 0.5)

In [None]:
# Create a dataset object using the AudioDataset class for the training data 
train_h = AudioDatasetPart('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-100', 0.75)

In [None]:
train_100 = AudioDataset('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-100')
train_360 = AudioDataset('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-360')

In [None]:
# Create a dataset object using the AudioDataset class for the training data 
# train_data = torch.utils.data.ConcatDataset([train_100, train_360])
train_data = torch.utils.data.ConcatDataset([train_100, train_360])

In [None]:
# Create a dataset object using the AudioDataset class for the validation data 
val_data = AudioDataset('/content/data/11-785-s23-hw1p2', config['context'], 'dev-clean')

# Create a dataset object using the AudioTestDataset class for the test data 
test_data = AudioTestDataset('/content/data/11-785-s23-hw1p2', config['context'], 'test-clean')

In [None]:
# Ablation training data
abl_100 = AblationDataset('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-100', 0.2)

In [None]:
# Ablation training data
abl_360 = AblationDataset('/content/data/11-785-s23-hw1p2', config['context'], 'train-clean-360', 0.2)

In [None]:
abl_data = torch.utils.data.ConcatDataset([abl_100,abl_360])
# abl_data = abl_100

In [None]:
# Define dataloaders for train, val and test datasets
# Dataloaders will yield a batch of frames and phonemes of given batch_size at every iteration
# We shuffle train dataloader but not val & test dataloader. Why?

train_loader = torch.utils.data.DataLoader(
    dataset     = train_data, 
    num_workers = 4,
    batch_size  = config['batch_size'], 
    pin_memory  = True,
    shuffle     = True
)

In [None]:
val_loader = torch.utils.data.DataLoader(
    dataset     = val_data, 
    num_workers = 2,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False
)

test_loader = torch.utils.data.DataLoader(
    dataset     = test_data, 
    num_workers = 2, 
    batch_size  = config['batch_size'], 
    pin_memory  = True, 
    shuffle     = False
)

print("Batch size     : ", config['batch_size'])
print("Context        : ", config['context'])
print("Input size     : ", (2*config['context']+1)*27)
print("Output symbols : ", len(PHONEMES))

# print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size     :  65536
Context        :  35
Input size     :  1917
Output symbols :  42
Validation dataset samples = 1928204, batches = 30
Test dataset samples = 1934138, batches = 30


In [None]:
abl_loader = torch.utils.data.DataLoader(
    dataset     = abl_data, 
    num_workers = 4,
    batch_size  = config['batch_size'], 
    pin_memory  = True,
    shuffle     = True
)

In [None]:
train_100_loader = torch.utils.data.DataLoader(
    dataset     = train_100, 
    num_workers = 4,
    batch_size  = config['batch_size'], 
    pin_memory  = True,
    shuffle     = True
)

In [None]:
# Testing code to check if your data loaders are working
for i, data in enumerate(train_loader):
    frames, phonemes = data
    print(frames.shape, phonemes.shape)
    break

torch.Size([65536, 1917]) torch.Size([65536])


# Network Architecture


This section defines your network architecture for the homework. We have given you a sample architecture that can easily clear the very low cutoff for the early submission deadline.

In [None]:
class Network(torch.nn.Module):
    """MLP phoneme classifier: repeated Linear -> BatchNorm1d -> PReLU -> Dropout blocks.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output classes (logits).
        hidden_sizes: Widths of the hidden blocks, in order. Defaults to
            ``DEFAULT_HIDDEN_SIZES``, the stack this notebook was trained with.
        dropout: Dropout probability for every block. Defaults to
            ``config['dropout']``.

    The layers are registered in the same order as the previous hand-written
    stack, so state_dict keys (``model.0.weight`` ...) are unchanged and
    existing checkpoints still load.
    """

    DEFAULT_HIDDEN_SIZES = (2048, 2048, 2048, 1024, 1024, 1024, 1024)

    def __init__(self, input_size, output_size, hidden_sizes=None, dropout=None):

        super(Network, self).__init__()

        if hidden_sizes is None:
            hidden_sizes = self.DEFAULT_HIDDEN_SIZES
        if dropout is None:
            # Fall back to the notebook-level configuration dictionary.
            dropout = config['dropout']

        layers = []
        in_features = input_size
        for width in hidden_sizes:
            layers.extend([
                torch.nn.Linear(in_features, width),
                torch.nn.BatchNorm1d(width,momentum=0.3),
                torch.nn.PReLU(),
                torch.nn.Dropout(p=dropout),
            ])
            in_features = width

        # Final projection to class logits (no activation; CrossEntropyLoss expects raw logits)
        layers.append(torch.nn.Linear(in_features, output_size))

        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the sequential stack for input ``x``."""
        out = self.model(x)

        return out

# Define Model, Loss Function and Optimizer

Here we define the model, loss function, optimizer and optionally a learning rate scheduler. Depending on whether you are training or ablating, run either the `model` cells or the `abl_model` cells.

In [None]:
INPUT_SIZE  = (2*config['context'] + 1) * 27 # Why is this the case?
model       = Network(INPUT_SIZE, 42).to(device)
# summary(model, frames.to(device))
# Check number of parameters of your network
# Remember, you are limited to 20 million parameters for HW1 (including ensembles)

In [None]:
INPUT_SIZE  = (2*config['context'] + 1) * 27 # Why is this the case?
abl_model       = Network(INPUT_SIZE, 42).to(device)
summary(abl_model, frames.to(device))   
# Check number of parameters of your network
# Remember, you are limited to 20 million parameters for HW1 (including ensembles)

In [None]:
criterion = torch.nn.CrossEntropyLoss() # Defining Loss function. 
# We use CE because the task is multi-class classification 

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr= config['init_lr']) #Defining Optimizer
# Recommended : Define Scheduler for Learning Rate, 
# including but not limited to StepLR, MultiStepLR, CosineAnnealingLR, ReduceLROnPlateau, etc. 
# You can refer to Pytorch documentation for more information on how to use them.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,1,config['gamma'])

# Is your training time very high? 
# Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it 
# Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

In [None]:
optimizer = torch.optim.Adamax(abl_model.parameters(), lr= config['init_lr']) #Defining Optimizer
# Recommended : Define Scheduler for Learning Rate, 
# including but not limited to StepLR, MultiStepLR, CosineAnnealingLR, ReduceLROnPlateau, etc. 
# You can refer to Pytorch documentation for more information on how to use them.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,1,config['gamma'])

# Is your training time very high? 
# Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it 
# Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

# Training and Validation Functions

This section covers the training and validation functions for each epoch of running your experiment with a given model architecture. The ablation function is a modified version of train.

In [None]:
torch.cuda.empty_cache()
gc.collect()

2048

In [None]:
scaler = torch.cuda.amp.GradScaler()

def train(model, dataloader, optimizer, criterion):
    """Run one mixed-precision training epoch over ``dataloader``.

    Uses the module-level ``scaler`` (GradScaler) and ``device``.

    Returns:
        (mean loss per batch, mean per-batch accuracy as a fraction in [0, 1]).
    """

    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy
    # Size everything from the loader that was actually passed in, not the
    # global train_loader, so the averages are right for any dataloader.
    num_batches = len(dataloader)
    batch_bar   = tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        ### Forward Propagation
        with torch.cuda.amp.autocast():
          logits  = model(frames)
          ### Loss Calculation
          loss = criterion(logits, phonemes)

        ### Backward Propagation (on the scaled loss)
        scaler.scale(loss).backward()

        ### Gradient Descent
        scaler.step(optimizer)

        scaler.update()

        tloss   += loss.item()
        tacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))),
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    # max(..., 1) avoids a ZeroDivisionError for an empty loader
    tloss   /= max(num_batches, 1)
    tacc    /= max(num_batches, 1)

    return tloss, tacc

In [None]:
def eval(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Uses the module-level ``criterion`` and ``device``. Note that this name
    shadows the Python builtin ``eval`` within the notebook.

    Returns:
        (mean loss per batch, mean per-batch accuracy as a fraction in [0, 1]).
    """

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy
    # Size everything from the loader that was actually passed in, not the
    # global val_loader, so the averages are right for any dataloader.
    num_batches = len(dataloader)
    batch_bar   = tqdm(total=num_batches, dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode():
            ### Forward Propagation
            logits  = model(frames)
            ### Loss Calculation
            loss    = criterion(logits, phonemes)

        vloss   += loss.item()
        vacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        # Do you think we need loss.backward() and optimizer.step() here?

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))),
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    # max(..., 1) avoids a ZeroDivisionError for an empty loader
    vloss   /= max(num_batches, 1)
    vacc    /= max(num_batches, 1)

    return vloss, vacc

In [None]:
scaler = torch.cuda.amp.GradScaler()

def ablation(model, dataloader, optimizer, criterion):
    """Run one mixed-precision training epoch for an ablation run.

    Same procedure as ``train``; uses the module-level ``scaler``
    (GradScaler) and ``device``.

    Returns:
        (mean loss per batch, mean per-batch accuracy as a fraction in [0, 1]).
    """

    model.train()
    abl_loss, abl_acc = 0, 0 # Monitoring loss and accuracy
    # Size everything from the loader that was actually passed in, not the
    # global abl_loader, so the averages are right for any dataloader.
    num_batches = len(dataloader)
    batch_bar   = tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        ### Forward Propagation
        with torch.cuda.amp.autocast():
          logits  = model(frames)
          ### Loss Calculation
          loss = criterion(logits, phonemes)

        ### Backward Propagation (on the scaled loss)
        scaler.scale(loss).backward()

        ### Gradient Descent
        scaler.step(optimizer)

        scaler.update()

        abl_loss   += loss.item()
        abl_acc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(abl_loss / (i + 1))),
                              acc="{:.04f}%".format(float(abl_acc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    # max(..., 1) avoids a ZeroDivisionError for an empty loader
    abl_loss   /= max(num_batches, 1)
    abl_acc    /= max(num_batches, 1)

    return abl_loss, abl_acc

# Weights and Biases Setup

This section enables logging metrics and files with Weights and Biases. Please refer to the wandb documentation and recitation 0, which cover the use of Weights and Biases for logging, hyperparameter tuning and monitoring your runs for your homeworks. Using this tool makes it very easy to show results when submitting your code and models for homeworks, and it is also extremely useful for study groups to organize and run ablations under a single team in wandb.

We have written code for you to make use of it out of the box, so that you start using wandb for all your HWs from the beginning.

In [None]:
# Read the API key from the environment instead of hard-coding it in the notebook.
# With key=None, wandb.login falls back to its own credential lookup / prompt.
# NOTE(review): a key was previously committed in this cell; it should be
# revoked and regenerated at wandb.ai/settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
# Create your wandb run
# NOTE: run_name is also used as a file name for the architecture dump and checkpoints.
run_name = "TRN" + str(config) + "architecture = [2048 * 3 + 1024 * 4 42]"
run = wandb.init(
    name    = run_name,
    reinit  = True, ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw1p2", ### Project should be created in your wandb account 
    config  = config ### Wandb Config for your run
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
# NOTE(review): Drive is already mounted by a cell near the top of the
# notebook; this repeat looks redundant -- confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
### Save your model architecture as a string with str(model) 
model_arch  = str(model)
# model_arch = str(abl_model)

### Save it in a txt file named after the run. The context manager guarantees
### the file is flushed and closed even if the write raises.
with open(run_name+".txt", "w") as arch_file:
    arch_file.write(model_arch)

# Resume training your model, if needed

In [None]:
# Resume training from the latest checkpoint of this run.
#
# The model weights are restored from disk. The optimizer and scheduler are
# created fresh (their saved states are NOT restored), so the learning rate
# starts from the value hard-coded below. To resume them exactly, load
# state_dict['optimizer_state_dict'] / state_dict['scheduler_state_dict'].

# 27 MFCC features per frame; presumably each frame is stacked with `context`
# neighbouring frames on either side, hence (2*context + 1) frames per input.
INPUT_SIZE  = (2*config['context'] + 1) * 27
# map_location makes the checkpoint loadable regardless of the device it was
# saved from.
state_dict = torch.load('/content/status/'+run_name+'.pth', map_location=device)

model = Network(INPUT_SIZE, 42).to(device)  # 42 output classes = len(PHONEMES)
model.load_state_dict(state_dict['model_state_dict'])

optimizer = torch.optim.Adamax(model.parameters(), lr= 0.0002684) #Defining Optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,1,0.9)

### Log the architecture file in your wandb run with wandb.save()
wandb.save(run_name+'.txt')

# Iterate over number of epochs to train and evaluate your model
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")
metric_list = []
violation = 0

# Only the stored accuracy is read from the best checkpoint, so keep its
# tensors on the CPU instead of occupying GPU memory.
best_checkpoint = torch.load('/content/status/best_train.pth', map_location='cpu')
best_val_acc = best_checkpoint['val_acc']

# NOTE(review): the epoch indices are hard-coded for one particular resume;
# with config['epochs'] == 24 the banner for the last one reads "25/24".
for epoch in [20,21,22,23,24]:

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    train_loss, train_acc   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_loader)
    metric_list.append(val_acc)

    # Learning rate that was in effect during this epoch (read before stepping).
    curr_lr = scheduler.get_last_lr()[0]

    ### Log metrics at each epoch in your run 
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss, 
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tValidation Acc {:.04f}%\tValidation Loss {:.04f}".format(val_acc*100, val_loss))
    scheduler.step()

    # Always save the latest state so an interrupted session can be resumed.
    checkpoint = {
        'epoch': config['epochs'],
        'batch_size': config['batch_size'],
        'context': config['context'],
        'init_lr': config['init_lr'],
        'dropout': config['dropout'],
        'gamma' : config['gamma'],
        'curr_epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }
    torch.save(checkpoint, '/content/status/'+run_name+'.pth')

    # Keep a separate copy of the best model seen so far. The threshold must
    # be raised after every save; otherwise a later, worse epoch that merely
    # beats the initial value would overwrite the true best checkpoint.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({**checkpoint, 'train_loss': train_loss, 'val_acc': val_acc},
                   '/content/status/best_train.pth')

# Experiment

Now, it is time to finally run your ablations! Have fun!

# Run training dataset

In [None]:
# Iterate over number of epochs to train and evaluate your model.
# Every epoch writes a "latest" checkpoint; a second "best" checkpoint is
# written only when validation accuracy improves on the best seen in this run.
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")
metric_list = []
violation = 0

# torch.save does not create missing directories.
os.makedirs('/content/status', exist_ok=True)

# Start from scratch: nothing from a previous checkpoint is needed here, so no
# checkpoint file is loaded (loading one would fail on a fresh runtime).
best_val_acc = 0

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    train_loss, train_acc   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_loader)
    metric_list.append(val_acc)

    # Learning rate that was in effect during this epoch (read before stepping).
    curr_lr = scheduler.get_last_lr()[0]

    ### Log metrics at each epoch in your run 
    # Optionally, you can log at each batch inside train/eval functions 
    # (explore wandb documentation/wandb recitation)
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss, 
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tValidation Acc {:.04f}%\tValidation Loss {:.04f}".format(val_acc*100, val_loss))
    scheduler.step()

    # Always save the latest state so an interrupted session can be resumed.
    checkpoint = {
        'epoch': config['epochs'],
        'batch_size': config['batch_size'],
        'context': config['context'],
        'init_lr': config['init_lr'],
        'dropout': config['dropout'],
        'gamma' : config['gamma'],
        'curr_epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }
    torch.save(checkpoint, '/content/status/'+run_name+'.pth')

    # Keep a separate copy of the best model seen so far. The threshold must
    # be raised after every save; otherwise every epoch (all of which beat the
    # initial 0) would overwrite the "best" file with the most recent model.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({**checkpoint, 'train_loss': train_loss, 'val_acc': val_acc},
                   '/content/status/best_train.pth')


Epoch 1/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 77.6760%	Train Loss 0.6904	 Learning Rate 0.0020000
	Validation Acc 83.8634%	Validation Loss 0.4752

Epoch 2/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 84.0649%	Train Loss 0.4706	 Learning Rate 0.0017000
	Validation Acc 85.6423%	Validation Loss 0.4177

Epoch 3/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 85.4633%	Train Loss 0.4236	 Learning Rate 0.0014450
	Validation Acc 86.4237%	Validation Loss 0.3934

Epoch 4/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 86.2343%	Train Loss 0.3976	 Learning Rate 0.0012282
	Validation Acc 86.8266%	Validation Loss 0.3809

Epoch 5/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 86.7390%	Train Loss 0.3806	 Learning Rate 0.0010440
	Validation Acc 87.1262%	Validation Loss 0.3718

Epoch 6/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 87.1058%	Train Loss 0.3682	 Learning Rate 0.0008874
	Validation Acc 87.3479%	Validation Loss 0.3652

Epoch 7/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 87.3903%	Train Loss 0.3586	 Learning Rate 0.0007543
	Validation Acc 87.4968%	Validation Loss 0.3603

Epoch 8/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 87.6188%	Train Loss 0.3509	 Learning Rate 0.0006412
	Validation Acc 87.6270%	Validation Loss 0.3568

Epoch 9/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 87.8067%	Train Loss 0.3446	 Learning Rate 0.0005450
	Validation Acc 87.6895%	Validation Loss 0.3545

Epoch 10/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 87.9663%	Train Loss 0.3392	 Learning Rate 0.0004632
	Validation Acc 87.7713%	Validation Loss 0.3527

Epoch 11/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.1017%	Train Loss 0.3346	 Learning Rate 0.0003937
	Validation Acc 87.8523%	Validation Loss 0.3507

Epoch 12/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.2189%	Train Loss 0.3307	 Learning Rate 0.0003347
	Validation Acc 87.9210%	Validation Loss 0.3483

Epoch 13/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.3166%	Train Loss 0.3274	 Learning Rate 0.0002845
	Validation Acc 87.9536%	Validation Loss 0.3477

Epoch 14/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.3998%	Train Loss 0.3245	 Learning Rate 0.0002418
	Validation Acc 88.0022%	Validation Loss 0.3468

Epoch 15/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.4765%	Train Loss 0.3221	 Learning Rate 0.0002055
	Validation Acc 87.9989%	Validation Loss 0.3467

Epoch 16/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.5379%	Train Loss 0.3200	 Learning Rate 0.0001747
	Validation Acc 88.0172%	Validation Loss 0.3463

Epoch 17/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.5910%	Train Loss 0.3182	 Learning Rate 0.0001485
	Validation Acc 88.0816%	Validation Loss 0.3455

Epoch 18/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.6372%	Train Loss 0.3166	 Learning Rate 0.0001262
	Validation Acc 88.0873%	Validation Loss 0.3456

Epoch 19/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.6782%	Train Loss 0.3153	 Learning Rate 0.0001073
	Validation Acc 88.0776%	Validation Loss 0.3454

Epoch 20/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.7134%	Train Loss 0.3141	 Learning Rate 0.0000912
	Validation Acc 88.0923%	Validation Loss 0.3452

Epoch 21/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

Val:   0%|          | 0/30 [00:00<?, ?it/s]

	Train Acc 88.7415%	Train Loss 0.3132	 Learning Rate 0.0000775
	Validation Acc 88.1127%	Validation Loss 0.3447

Epoch 22/24


Train:   0%|          | 0/5083 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

# Run ablation

In [None]:
# Iterate over number of epochs to train and evaluate the ablation model.
# Epochs are skipped once the scheduler's learning rate drops below 1e-5.
torch.cuda.empty_cache()
gc.collect()
wandb.watch(abl_model, log="all")
metric_list = []
violation = 0

# Only the stored accuracy is read from the reference checkpoint, so keep its
# tensors on the CPU instead of occupying GPU memory.
# (A copy may also live on Drive: '/content/drive/MyDrive/Colab Notebooks/11485/1/currentBest.pth')
best_checkpoint = torch.load('/content/status/currentBest.pth', map_location='cpu')
best_val_acc = best_checkpoint['val_acc']

for epoch in range(10):
    # Learning rate too small to be useful: do nothing for this epoch.
    if scheduler.get_last_lr()[0] < 1e-5:
        continue

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    abl_loss, abl_acc   = ablation(abl_model, abl_loader, optimizer, criterion)
    val_loss, val_acc   = eval(abl_model, val_loader)

    # Count epochs that fail to beat the best accuracy of this run by more
    # than min_delta. The counter is only tracked; early stopping is disabled.
    if len(metric_list) > 0 and val_acc - max(metric_list) <= config['min_delta']:
        violation += 1
    metric_list.append(val_acc)

    # Learning rate that was in effect during this epoch (read before stepping).
    curr_lr = scheduler.get_last_lr()[0]

    ## Highly Recommended: Save checkpoint in drive and/or wandb if accuracy is better than your current best
    # The threshold must be raised after every save; otherwise a later, worse
    # epoch that merely beats the initial value would overwrite the best file.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({'epoch': config['epochs'],
                    'batch_size': config['batch_size'],
                    'context': config['context'],
                    'init_lr': config['init_lr'],
                    'dropout': config['dropout'],
                    'gamma' : config['gamma'],
                    'patience': config['patience'],
                    'min_delta': config['min_delta'],
                    'curr_epoch': epoch,
                    'model_state_dict': abl_model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'train_loss': abl_loss,
                    'val_acc': val_acc}, 
                   '/content/status/ABL_best.pth')

    ### Log metrics at each epoch in your run 
    # Optionally, you can log at each batch inside train/eval functions 
    # (explore wandb documentation/wandb recitation)
    wandb.log({'abl_acc': abl_acc*100, 'abl_loss': abl_loss, 
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    print("\tAblation Acc {:.04f}%\tAblation Loss {:.04f}\t Learning Rate {:.07f}".format(abl_acc*100, abl_loss, curr_lr))
    print("\tValidation Acc {:.04f}%\tValidation Loss {:.04f}".format(val_acc*100, val_loss))
    scheduler.step()

In [None]:
# Unconditionally snapshot the ablation model together with the optimizer and
# scheduler state, whatever its validation score was. `epoch` is whatever value
# the preceding loop left behind.
# NOTE(review): this writes to the same '<run_name>.pth' path used by the
# training cells -- confirm that overwriting that checkpoint is intended.
torch.save(
    {
        'epoch': config['epochs'],
        # Hyperparameters copied verbatim from the run config.
        **{name: config[name]
           for name in ('batch_size', 'context', 'init_lr', 'dropout',
                        'gamma', 'patience', 'min_delta')},
        'curr_epoch': epoch,
        'model_state_dict': abl_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    },
    '/content/status/'+run_name+'.pth',
)

In [None]:
### Finish your wandb run: closes the `run` handle returned by wandb.init().
### The summary printed below this cell is emitted by this call.
run.finish()

0,1
lr,█▇▆▅▅▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train_acc,▁▅▆▆▇▇▇▇▇████████████
train_loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▆▇▇▇▇▇███████████
valid_loss,█▅▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
lr,8e-05
train_acc,88.74153
train_loss,0.31316
val_acc,88.1127
valid_loss,0.34472


# Testing and submission to Kaggle

Before we get to the following code, make sure to look at the submission format given in *sample_submission.csv*. Once you have done so, it is time to fill in the following function to complete your inference on test data. Refer to the eval function from previous cells to get an idea of how to go about completing this function.

In [None]:
def test(model, test_loader):
    """Run inference over `test_loader` and return the predicted phoneme labels.

    Args:
        model: network invoked as ``model(mfccs)`` to produce logits; it is
            switched to evaluation mode before inference.
        test_loader: iterable yielding batches of MFCC tensors (no labels).

    Returns:
        list[str]: entries of ``PHONEMES``, one per row of every batch, in the
        order the loader produced them.
    """
    ### Evaluation mode for inference
    model.eval()

    ### List to store predicted phonemes of test data
    test_predictions = []

    ### inference_mode disables gradient tracking
    with torch.inference_mode():
        for mfccs in tqdm(test_loader):
            mfccs   = mfccs.to(device)

            logits  = model(mfccs)

            ### Most likely class index per row (assumes logits is (batch, num_classes))
            predicted_phonemes = torch.argmax(logits, dim=1)

            ### Convert the whole batch to Python ints in one transfer; indexing
            ### PHONEMES with individual tensor elements would synchronise with
            ### the device once per frame.
            test_predictions.extend(PHONEMES[idx] for idx in predicted_phonemes.tolist())

    return test_predictions

In [None]:
# Run inference on the test set with the trained `model`; `predictions` is the
# list of phoneme labels returned by test(), in loader order.
predictions = test(model, test_loader)

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
### Write the predictions to a CSV file: a header row followed by one
### "<row index>,<label>" line per prediction, indices starting at 0.
with open("./submission.csv", "w+") as f:
    f.write("id,label\n")
    for row_id, label in enumerate(predictions):
        f.write("{},{}\n".format(row_id, label))

In [None]:
### Submit ./submission.csv to the kaggle competition using the kaggle API.
### The command below is active: running this cell makes a submission
### (comment it out to avoid submitting by accident).
!kaggle competitions submit -c 11-785-s23-hw1p2 -f ./submission.csv -m "Test Submission"

### However, it's always safer to download the csv file and then upload it to kaggle

100% 19.3M/19.3M [00:00<00:00, 49.8MB/s]
Successfully submitted to Frame-Level Speech Recognition