# README

In [None]:
'''

To run the entire pipeline : Runtime -> Run all OR Ctrl + F9

Experimentation :

    1) Epochs            : 10       : the model was able to converge within very small number of epochs
    2) Batch Size        : 128      : maximum that fits the hardware
    3) Beam Width        : 3        : increasing beam width also increased the training time
    4) Weight Decay      : 0.05     : value was set to a commonly used value
    5) Scheduler         : Constant : constant learning rate gave satisfactory results
    6) Optimizer         : AdamW    : sgd, adam, and adamw were tried on small networks and adamw gave slightly better results
    
    7) Network           : 1D convolution layers were added to first extract features and then pass to the lstm. this improved the accuracy by a large value
                           [1, 2, 3, 4] layers of lstm were tried and 4 layer lstm gave better results. bidirectional lstm was used
                           finally the lstm outputs were passed through 2 linear layers

                           Following is the network used :
                           
                           Network(
                            (embedding): Sequential(
                              (0): Conv1d(15, 128, kernel_size=(3,), stride=(2,), padding=(1,))
                              (1): GELU(approximate=none)
                              (2): Dropout(p=0.3, inplace=False)
                              (3): Conv1d(128, 256, kernel_size=(3,), stride=(2,), padding=(1,))
                              (4): GELU(approximate=none)
                            )
                            (lstm): LSTM(256, 512, num_layers=4, batch_first=True, bidirectional=True)
                            (classification): Sequential(
                              (0): Linear(in_features=1024, out_features=1024, bias=True)
                              (1): GELU(approximate=none)
                              (2): Linear(in_features=1024, out_features=43, bias=True)
                            )
                            (logSoftmax): LogSoftmax(dim=2)
                          )

    8) WandB Runs        : wandb project is made public and can be found here : https://wandb.ai/ajinkyanande111/hw3p2

'''

# Preliminaries

In [None]:
# Install the Weights & Biases client that the notebook uses for experiment logging.
!pip install wandb -q

In [None]:
# This cell runs before the main import cell, so wandb (and os) are imported
# here; otherwise a fresh kernel stops with NameError on `wandb`.
import os
import wandb

# The API key is taken from the WANDB_API_KEY environment variable rather than
# being typed into the notebook. When the variable is unset, None is passed and
# wandb is left to resolve credentials itself (presumably stored login or an
# interactive prompt — confirm against the wandb version in use).
wandb.login(key=os.environ.get('WANDB_API_KEY') or None)

In [None]:
# Extra dependencies: torchsummaryX (model summary), python-Levenshtein (edit
# distance) and wget from pip; ctcdecode (CTC beam-search decoder) is cloned
# and installed from source inside its own directory.
!pip install torchsummaryX -q
!pip install python-Levenshtein -q
!git clone --recursive https://github.com/parlance/ctcdecode.git -q
!pip install wget -q
%cd ctcdecode
!pip install .  -q
%cd ..

In [5]:
import os
import datetime
import math
import gc
import zipfile

import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torchaudio.transforms as tat
from sklearn.metrics import accuracy_score
import pandas as pd

from tqdm import tqdm
from torchsummaryX import summary
import wandb

import ctcdecode
from ctcdecode import CTCBeamDecoder
import Levenshtein

import warnings
warnings.filterwarnings('ignore')

# Use the GPU whenever torch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

Device:  cpu


In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8 -q
!mkdir /root/.kaggle

with open('/root/.kaggle/kaggle.json', 'w+') as f:
    f.write('{"username":"ajinkyanande","key":"5f60b9bc169fe67552e51c70e754066d"}') 

!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Download the competition archive with the Kaggle CLI.
!kaggle competitions download -c 11-785-f22-hw3p2

In [None]:
# Unpack the downloaded archive quietly, then list the working directory.
!unzip -q 11-785-f22-hw3p2.zip
!ls

# Config

In [6]:
# Hyper-parameters read by the data loaders, optimizer, decoder and training loop.
config = dict(
    beam_width=3,
    lr=0.001,
    epochs=10,
    batch_size=128,
)

# Dataset root directory.
root = '/content/hw3p2'

In [7]:
# (phoneme, single-symbol label) pairs. The position of each pair is the class
# index used everywhere else in the notebook, so the order must not change.
# Index 0 is the empty phoneme, rendered as a space.
_PHONEME_SYMBOL_PAIRS = (
    ("", " "),
    ("[SIL]", "-"), ("NG", "G"), ("F", "f"), ("M", "m"), ("AE", "@"),
    ("R", "r"), ("UW", "u"), ("N", "n"), ("IY", "i"), ("AW", "W"),
    ("V", "v"), ("UH", "U"), ("OW", "o"), ("AA", "a"), ("ER", "R"),
    ("HH", "h"), ("Z", "z"), ("K", "k"), ("CH", "C"), ("W", "w"),
    ("EY", "e"), ("ZH", "Z"), ("T", "t"), ("EH", "E"), ("Y", "y"),
    ("AH", "A"), ("B", "b"), ("P", "p"), ("TH", "T"), ("DH", "D"),
    ("AO", "c"), ("G", "g"), ("L", "l"), ("JH", "j"), ("OY", "O"),
    ("SH", "S"), ("D", "d"), ("AY", "Y"), ("S", "s"), ("IH", "I"),
    ("[SOS]", "[SOS]"), ("[EOS]", "[EOS]"),
)

CMUdict_ARPAbet = dict(_PHONEME_SYMBOL_PAIRS)

# Parallel lists: CMUdict[i] is the phoneme name, ARPAbet[i] its output symbol.
CMUdict = list(CMUdict_ARPAbet)
ARPAbet = [CMUdict_ARPAbet[phoneme] for phoneme in CMUdict]

# Aliases used by the dataset, decoder and scoring code.
phonems = CMUdict
mapping = CMUdict_ARPAbet
labels = ARPAbet

# Dataset

In [None]:
class AudioDataset(torch.utils.data.Dataset):
    """In-memory dataset of (MFCC features, phoneme-index transcript) pairs.

    Every ``.npy`` file in ``mfcc_dir`` is loaded, normalised per utterance
    (zero mean / unit variance along the time axis) and paired with the
    transcript at the same position in ``transcript_dir``. Transcripts are
    stored as lists of indices into the notebook-level ``phonems`` list, with
    the first and last tokens ([SOS]/[EOS]) removed.
    """

    def __init__(self, mfcc_dir, transcript_dir):
        """Load and preprocess every utterance found in the two directories.

        Args:
            mfcc_dir: directory containing one MFCC ``.npy`` file per utterance.
            transcript_dir: directory containing the matching transcript files.

        Raises:
            AssertionError: if the two directories hold a different number of files.
            KeyError: if a transcript contains a token that is not in ``phonems``.
        """

        # os.listdir returns entries in arbitrary order; both listings are
        # sorted so that the i-th MFCC is paired with the i-th transcript.
        # Assumes an utterance's MFCC and transcript files sort identically
        # (e.g. share a file name) — TODO confirm against the dataset layout.
        mfcc_files = sorted(os.listdir(mfcc_dir))
        transcript_files = sorted(os.listdir(transcript_dir))

        # dataset length
        assert len(mfcc_files) == len(transcript_files)
        self.length = len(mfcc_files)

        # phoneme -> class index lookup, built once instead of list.index per token
        phoneme_to_index = {phoneme: idx for idx, phoneme in enumerate(phonems)}

        self.mfccs = []
        self.transcripts = []

        for mfcc_file, transcript_file in tqdm(zip(mfcc_files, transcript_files), total=self.length):

            # os.path.join works whether or not the directory ends with a separator
            mfcc = np.load(os.path.join(mfcc_dir, mfcc_file))

            # cepstral normalization of mfcc
            mfcc = mfcc - np.mean(mfcc, axis=0, keepdims=True)
            mfcc = mfcc / np.std(mfcc, axis=0, keepdims=True)

            transcript = np.load(os.path.join(transcript_dir, transcript_file))

            # remove start of line [SOS] and end of line [EOS]
            transcript = transcript[1: -1]

            self.mfccs.append(mfcc)
            self.transcripts.append([phoneme_to_index[token] for token in transcript])

    def __len__(self):
        """Return the number of utterances."""
        return self.length

    def __getitem__(self, ind):
        """Return ``(mfcc, transcript)`` for utterance ``ind``.

        The MFCC is a float tensor; the transcript is an integer (int64) tensor
        because its entries are class indices, which is the dtype CTC targets
        are documented to use.
        """
        return torch.FloatTensor(self.mfccs[ind]), torch.LongTensor(self.transcripts[ind])

    @staticmethod
    def collate_fn(batch):
        """Pad a list of ``(mfcc, transcript)`` samples into batch tensors.

        Returns:
            ``(padded_mfccs, padded_transcripts, mfcc_lengths, transcript_lengths)``
            with both padded tensors batch-first and zero-padded.
        """

        batch_mfcc = [sample[0] for sample in batch]
        batch_transcript = [sample[1] for sample in batch]

        # true (unpadded) lengths are needed by the model and the CTC loss
        lengths_mfcc = [len(x) for x in batch_mfcc]
        lengths_transcript = [len(y) for y in batch_transcript]

        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True)
        batch_transcript_pad = pad_sequence(batch_transcript, batch_first=True)

        return batch_mfcc_pad, batch_transcript_pad, torch.tensor(lengths_mfcc), torch.tensor(lengths_transcript)

In [None]:
class AudioDatasetTest(torch.utils.data.Dataset):
    """In-memory dataset of normalised MFCC features without transcripts.

    Files are taken from ``mfcc_dir`` in sorted name order; each utterance is
    normalised to zero mean and unit variance along its first axis.
    """

    def __init__(self, mfcc_dir):
        """Load and normalise every file in ``mfcc_dir`` (path must end with a separator)."""

        file_names = sorted(os.listdir(mfcc_dir))
        self.length = len(file_names)
        self.mfccs = []

        for idx in tqdm(range(self.length), total=self.length):

            features = np.load(mfcc_dir + file_names[idx])

            # cepstral normalization: centre first, then scale by the std of the centred features
            centred = features - np.mean(features, axis=0, keepdims=True)
            scaled = centred / np.std(centred, axis=0, keepdims=True)

            self.mfccs.append(scaled)

    def __len__(self):
        """Return the number of utterances."""
        return self.length

    def __getitem__(self, ind):
        """Return the MFCC of utterance ``ind`` as a float tensor."""
        utterance = self.mfccs[ind]
        return torch.FloatTensor(utterance)

    def collate_fn(batch):
        """Zero-pad a list of MFCC tensors; return ``(padded_batch, lengths)``."""

        sequences = list(batch)
        frame_counts = [len(seq) for seq in sequences]
        padded = pad_sequence(sequences, batch_first=True)

        return padded, torch.tensor(frame_counts)

# Dataloader

In [10]:
# Run a garbage-collection pass before the datasets are loaded into memory.
import gc 
gc.collect()

24

In [None]:
# Dataset locations, derived from `root` (set in the config cell) so the data
# directory is configured in exactly one place. The trailing '/' is kept so
# that plain "directory + file name" concatenation stays valid.
train_mfcc_dir       = f'{root}/train-clean-360/mfcc/'
train_transcript_dir = f'{root}/train-clean-360/transcript/raw/'

val_mfcc_dir         = f'{root}/dev-clean/mfcc/'
val_transcript_dir   = f'{root}/dev-clean/transcript/raw/'

test_mfcc_dir        = f'{root}/test-clean/mfcc/'

# Every utterance is loaded and normalised eagerly here.
train_data = AudioDataset(train_mfcc_dir, train_transcript_dir)
val_data   = AudioDataset(val_mfcc_dir, val_transcript_dir)
test_data  = AudioDatasetTest(test_mfcc_dir)


def _make_loader(dataset, collate_fn, shuffle):
    """Build a DataLoader with the batch size and memory settings shared by all splits."""
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=config['batch_size'],
                                       collate_fn=collate_fn,
                                       pin_memory=True,
                                       shuffle=shuffle)


# Only the training split is shuffled; validation and test keep file order so
# that predictions line up with the submission file.
train_loader = _make_loader(train_data, AudioDataset.collate_fn, shuffle=True)
val_loader   = _make_loader(val_data, AudioDataset.collate_fn, shuffle=False)
test_loader  = _make_loader(test_data, AudioDatasetTest.collate_fn, shuffle=False)

print("\nBatch size: ", config['batch_size'])
print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

In [None]:
# SANITY CHECK
# Print the tensor shapes of the first train batch and the first test batch.
# `x` and `lx` of the test batch remain in the notebook namespace afterwards.

for x, y, lx, ly in train_loader:
    print(x.shape, y.shape, lx.shape, ly.shape)
    break

for x, lx in test_loader:
    print(x.shape, lx.shape)
    break

In [11]:
# Number of output classes: one per entry in `labels` (index 0 is used as the CTC blank).
OUT_SIZE = len(labels)
OUT_SIZE

43

# Network

In [12]:
class Network(nn.Module):
    """Conv1d front-end + 4-layer bidirectional LSTM + linear classifier for CTC.

    The two stride-2 convolutions reduce the time resolution by roughly 4x
    before the LSTM; the classifier emits per-frame log-probabilities over
    the phoneme classes.

    Args:
        out_size: number of output classes. Defaults to the notebook-level
            ``OUT_SIZE`` so that ``Network()`` behaves as before.
    """

    def __init__(self, out_size=None):

        super(Network, self).__init__()

        # resolved lazily so the global is only required when no size is given
        if out_size is None:
            out_size = OUT_SIZE

        self.embedding = nn.Sequential(nn.Conv1d(in_channels=15, out_channels=128, kernel_size=3, padding=1, stride=2),
                                       nn.GELU(),
                                       nn.Dropout(p=0.3),
                                       nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2),
                                       nn.GELU())

        self.lstm = nn.LSTM(input_size=256, hidden_size=512, num_layers=4, bidirectional=True, batch_first=True)

        # 1024 = 2 directions x 512 hidden units
        self.classification = nn.Sequential(nn.Linear(1024, 1024),
                                            nn.GELU(),
                                            nn.Linear(1024, out_size))

        self.logSoftmax = nn.LogSoftmax(dim=2)

    def _embedded_lengths(self, lengths):
        """Map input frame counts to frame counts after the conv front-end.

        Applies the Conv1d output-length formula
        ``floor((L + 2*padding - dilation*(kernel-1) - 1) / stride) + 1``
        for every convolution in ``self.embedding``, reading the parameters
        from the layers themselves so the two cannot drift apart.
        """
        for layer in self.embedding:
            if isinstance(layer, nn.Conv1d):
                kernel = layer.kernel_size[0]
                stride = layer.stride[0]
                padding = layer.padding[0]
                dilation = layer.dilation[0]
                lengths = (lengths + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1
        return lengths

    def forward(self, x, lx):
        """Compute per-frame log-probabilities.

        Args:
            x: padded batch of features, batch-first, with 15 features per
                frame (the first convolution expects 15 input channels).
            lx: tensor with the unpadded frame count of every sequence.

        Returns:
            ``(log_probs, lengths)``: ``log_probs`` is batch-first with the
            class dimension last; ``lengths`` holds the valid number of output
            frames per sequence after down-sampling.
        """

        # Conv1d wants (batch, channels, time); switch back to batch-first afterwards
        x_embd = self.embedding(x.transpose(1, 2)).transpose(1, 2)

        # The convolutions use padding=1, so the valid output length is
        # (L - 1) // 2 + 1 per layer. The padding-free formula would drop valid
        # frames and yield non-positive lengths for very short inputs.
        lx = self._embedded_lengths(lx)

        x_packed = pack_padded_sequence(x_embd, lx, batch_first=True, enforce_sorted=False)
        out_packed, _ = self.lstm(x_packed)

        out_unpacked, lens_unpacked = pad_packed_sequence(out_packed, batch_first=True)
        out = self.classification(out_unpacked)

        out = self.logSoftmax(out)

        return out, lens_unpacked

In [13]:
# Release cached GPU memory, then build the model on the selected device.
torch.cuda.empty_cache()

model = Network().to(device)
# `x` and `lx` are the batch left in the namespace by the loader sanity-check
# cell; that cell has to run before this one.
summary(model, x.to(device), lx)

Network(
  (embedding): Sequential(
    (0): Conv1d(15, 128, kernel_size=(3,), stride=(2,), padding=(1,))
    (1): GELU(approximate=none)
    (2): Dropout(p=0.3, inplace=False)
    (3): Conv1d(128, 256, kernel_size=(3,), stride=(2,), padding=(1,))
    (4): GELU(approximate=none)
  )
  (lstm): LSTM(256, 512, num_layers=4, batch_first=True, bidirectional=True)
  (classification): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): GELU(approximate=none)
    (2): Linear(in_features=1024, out_features=43, bias=True)
  )
  (logSoftmax): LogSoftmax(dim=2)
)


# Setup

In [None]:
# CTC loss with class index 0 as the blank label.
criterion = torch.nn.CTCLoss(blank=0)
# NOTE(review): the README cell lists a weight decay of 0.05, but 0.0 is
# passed here — confirm which value is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=0.0)

# Beam-search decoder used for validation scoring. blank_id matches the CTC
# blank above; log_probs_input=True because the network ends in LogSoftmax.
decoder = CTCBeamDecoder(labels=labels,
                         beam_width=config['beam_width'],
                         blank_id=0,
                         log_probs_input=True)

# mixed Precision
# NOTE(review): only the gradient scaler is created; no autocast context is
# visible in the training code — confirm mixed precision is actually in effect.
scaler = torch.cuda.amp.GradScaler()

In [None]:
# Start a Weights & Biases run named 'submission' in project 'hw3p2' and attach
# the hyper-parameter dict to it; `run` is finished after the training loop.
run = wandb.init(name='submission',
                 reinit=True,
                 project='hw3p2',
                 config=config)

In [None]:
def calculate_levenshtein(h, y, lh, ly, decoder, labels):
    """Mean Levenshtein distance between decoded predictions and targets.

    Args:
        h: network output handed to ``decoder.decode``.
        y: padded target indices (cast to int before label lookup).
        lh: per-sequence output lengths, passed to the decoder as ``seq_lens``.
        ly: per-sequence target lengths; must have the same shape as ``lh``.
        decoder: object whose ``decode`` returns
            ``(beam_results, beam_scores, timesteps, out_lens)``.
        labels: list mapping a class index to its symbol string.

    Returns:
        The sum of per-utterance edit distances divided by the batch size.
    """

    beam_results, _scores, _timesteps, out_lens = decoder.decode(h, seq_lens=lh)

    assert lh.shape == ly.shape
    n_utterances = lh.shape[0]

    targets = y.int()
    total = 0

    for idx in range(n_utterances):

        # top beam only, cut to its decoded length
        best_beam = beam_results[idx, 0, :out_lens[idx, 0]]
        reference_ids = targets[idx, :ly[idx]]

        hypothesis = ''.join(labels[sym] for sym in best_beam)
        reference = ''.join(labels[sym] for sym in reference_ids)

        total += Levenshtein.distance(reference, hypothesis)

    return total / n_utterances

In [None]:
# SANITY CHECK
# Push one training batch through the model, the CTC loss and the edit-distance
# computation, printing shapes and values. No gradients are tracked and no
# optimizer step is taken.

with torch.no_grad():

    for i, data in enumerate(train_loader):
      
        x, y, lx, ly = data

        x = x.to(device)
        y = y.to(device)

        out, lout = model(x, lx)
        
        print(x.shape)
        print(lx.shape)
        print(y.shape)
        print(ly.shape)
        print(out.shape)
        print(lout.shape)

        # the model output is batch-first; it is transposed to time-first for the loss
        loss = criterion(out.transpose(1, 0), y, lout, ly)
        print(f'loss: ', loss)

        distance = calculate_levenshtein(out, y, lout, ly, decoder, labels)
        print(f'lev-distance: ', distance)

        # one batch is enough for the check
        break

# Training

In [None]:
def train(dataloader, model, optimizer, criterion, scheduler, scaler):
    """Run one training epoch and return the mean batch loss.

    Args:
        dataloader: yields ``(x, y, lx, ly)`` batches.
        model: network called as ``model(x, lx)`` returning ``(out, lout)``.
        optimizer: optimizer stepped once per batch through ``scaler``.
        criterion: loss called as ``criterion(out.transpose(1, 0), y, lout, ly)``.
        scheduler: accepted for interface compatibility; it is not stepped here.
        scaler: gradient scaler used for the backward pass and optimizer step.

    Returns:
        Mean loss over the batches of ``dataloader`` as a Python float
        (0.0 for an empty loader).
    """

    # train mode
    model.train()

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, leave=True, position=0, desc='train')
    train_loss = 0.0

    for i, (x, y, lx, ly) in enumerate(dataloader):

        x = x.to(device)
        y = y.to(device)

        # init gradient for each batch
        optimizer.zero_grad()

        # forward prop
        out, lout = model(x, lx)

        # loss calc (the loss receives time-first log-probabilities)
        loss = criterion(out.transpose(1, 0), y, lout, ly)

        # .item() detaches the value; summing the tensor itself would keep
        # autograd history alive for the whole epoch
        train_loss += loss.item()

        # back prop through the gradient scaler
        scaler.scale(loss).backward()

        # optimizer step and scaler update
        scaler.step(optimizer)
        scaler.update()

        batch_bar.set_postfix(loss=f"{train_loss / (i + 1):.4f}",
                              lr=f"{optimizer.param_groups[0]['lr']}")

        batch_bar.update()

    batch_bar.close()

    # average over the loader that was actually iterated, not a global one
    return train_loss / max(len(dataloader), 1)

In [None]:
def evaluate(data_loader, model):
    """Compute the mean loss and mean Levenshtein distance over a data loader.

    Uses the notebook-level ``criterion``, ``decoder``, ``labels`` and ``device``.

    Args:
        data_loader: yields ``(x, y, lx, ly)`` batches.
        model: network called as ``model(x, lx)`` returning ``(out, lout)``.

    Returns:
        ``(val_loss, val_dist)`` as Python floats, each averaged over the
        number of batches (0.0 for an empty loader).
    """

    # eval mode
    model.eval()

    val_dist = 0.0
    val_loss = 0.0

    batch_bar = tqdm(total=len(data_loader), dynamic_ncols=True, leave=True, position=0, desc='val')

    for i, (x, y, lx, ly) in enumerate(data_loader):

        x = x.to(device)
        y = y.to(device)

        # forward prop and loss without autograd bookkeeping
        with torch.inference_mode():
            out, lout = model(x, lx)
            loss = criterion(out.transpose(1, 0), y, lout, ly)

        # accumulate plain floats rather than tensors
        val_loss += loss.item()

        # levenshtein distance
        val_dist += calculate_levenshtein(out, y, lout, ly, decoder, labels)

        batch_bar.set_postfix(loss=f"{val_loss / (i + 1):.4f}",
                              dist=f"{val_dist / (i + 1):.4f}")

        batch_bar.update()

    # close the bar so it does not interleave with later output
    batch_bar.close()

    n_batches = max(len(data_loader), 1)

    return val_loss / n_batches, val_dist / n_batches

In [None]:
# Train for config['epochs'] epochs, checkpointing whenever the validation
# edit distance improves, and log the metrics of every epoch to wandb.
best_val_dist = math.inf

for epoch in range(config['epochs']):

    # clear cache
    gc.collect()
    torch.cuda.empty_cache()

    train_loss = train(train_loader, model, optimizer, criterion, None, scaler)
    val_loss, val_dist = evaluate(val_loader, model)

    # The metrics may come back as 0-d tensors; plain floats keep the
    # checkpoint and the wandb log free of tensor objects.
    train_loss, val_loss, val_dist = float(train_loss), float(val_loss), float(val_dist)

    # print updates
    print('epoch {}/{}'.format(epoch+1, config['epochs']))
    print('learning Rate: {:.6f}'.format(optimizer.param_groups[0]['lr']))
    print('train loss: {:.4f}'.format(train_loss))
    print('val loss: {:.4f}'.format(val_loss))
    # val_dist is a mean edit distance per utterance, not a percentage
    print('val dist: {:.2f}'.format(val_dist))

    if val_dist < best_val_dist:

        print('saving model')
        torch.save({'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'val_dist': val_dist,
                    'epoch': epoch}, 'checkpoint.pth')

        best_val_dist = val_dist
        wandb.save('checkpoint.pth')

    wandb.log({'train_loss': train_loss,
               'val_loss': val_loss,
               'val_dist': val_dist})

run.finish()

In [None]:
print('loading best model')
# map_location lets a checkpoint written on a GPU runtime be restored when the
# notebook is running on a different device (e.g. CPU only).
checkpoint = torch.load('checkpoint.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Decoder for test-time inference with a wider beam (20) than the one used
# during validation (config['beam_width']).
decoder_test = CTCBeamDecoder(labels=labels,
                              beam_width=20,
                              blank_id=0,
                              log_probs_input=True)

def make_output(h, lh, decoder, labels):
    """Decode a batch of network outputs into label strings.

    Args:
        h: network output handed to ``decoder.decode``.
        lh: per-sequence output lengths, passed to the decoder as ``seq_lens``.
        decoder: object whose ``decode`` returns
            ``(beam_results, beam_scores, timesteps, out_seq_len)``.
        labels: list mapping a class index to its symbol string.

    Returns:
        One string per sequence, built from the top beam cut to its decoded length.
    """

    beam_results, _scores, _timesteps, out_seq_len = decoder.decode(h, seq_lens=lh)

    return [
        ''.join(labels[n] for n in beam_results[row, 0, :out_seq_len[row, 0]])
        for row in range(lh.shape[0])
    ]

In [None]:
def predict(data_loader, model, decoder, labels):
    """Decode every batch of an unlabeled data loader into label strings.

    Args:
        data_loader: yields ``(x, lx)`` batches.
        model: network called as ``model(x, lx)`` returning ``(out, lout)``.
        decoder: beam-search decoder forwarded to ``make_output``.
        labels: list mapping a class index to its symbol string.

    Returns:
        A flat list with one predicted string per utterance, in loader order.
    """

    # eval mode
    model.eval()

    batch_bar = tqdm(total=len(data_loader), dynamic_ncols=True, leave=False, position=0, desc='test')

    preds = []

    with torch.inference_mode():

        for x, lx in data_loader:

            x = x.to(device)

            # forward prop
            out, lout = model(x, lx)
            preds += make_output(out, lout, decoder, labels)

            # advance the bar once per batch
            batch_bar.update()

    batch_bar.close()

    return preds

torch.cuda.empty_cache()
predictions = predict(test_loader, model, decoder, labels)

import pandas as pd

df = pd.read_csv('/content/hw3p2/test-clean/transcript/random_submission.csv')
df.label = predictions

df.to_csv('submission.csv', index = False)
!kaggle competitions submit -c 11-785-f22-hw3p2 -f submission.csv -m "I made it!"