In [7]:

import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as f
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import transformers
from transformers import AutoTokenizer,AutoModel

from typing import Union,List,Optional,Dict 

In [8]:
# LibriSpeech splits, stored under the current working directory (download=True fetches them if needed).
# NOTE(review): the 'dev-clean' split is used as training data here - confirm this is intentional.
train_data = torchaudio.datasets.LIBRISPEECH(root='./', url='dev-clean', download=True)
val_data = torchaudio.datasets.LIBRISPEECH(root='./', url='test-clean', download=True)

- [x] Each dataset item is a tuple *(waveform: torch.Tensor, sample_rate, transcript, speaker_id, chapter_id, utterance_id)*.
- [x] So we take only the waveform and process it. However, its shape is (1, x), where x differs from waveform to waveform and is in the lakhs (hundreds of thousands of samples).
- [x] Once processed, the waveform has the shape (1, 128, x); here x is < 1000 but still different for every data point.
- [x] We want to batch the data, so every item has to be brought to one common (uniform) size.
- [x] In the method `__getitem__(self, idx)`, `idx` is a single number and `self.data[idx]` returns one element, so the whole method should be written to process a single data point. The batching of these points is handled by ***torch.utils.data.DataLoader***; you don't have to worry about it.

In [41]:
class Config:
    """Settings shared by the data-pipeline cells of this notebook."""

    # --- spectrogram geometry ---
    input_features = 128   # same value as the n_mels=128 passed to MelSpectrogram
    max_length = 124       # time frames kept per clip; picked arbitrarily, change as needed
    padding_value = 0

    # --- batching ---
    train_batch_size = 16
    val_batch_size = 8

    # --- text side ---
    tokenizer_checkpoint = 'bert-base-cased'

    # --- model depth ---
    cnn_layers = 3
    rnn_layers = 5


# Training pipeline: mel spectrogram followed by random frequency and time masking.
# A (channel, samples) waveform becomes (channel, n_mels, time); the leading batch
# axis only appears once torch's DataLoader collates several items.
train_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)

# Validation pipeline: mel spectrogram only, no masking.
# sample_rate=16000 and n_mels=128 are MelSpectrogram's defaults, written out here
# so both pipelines visibly produce the same feature size.
val_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
)


class Dataloader:
    """Map-style dataset that turns (waveform, ..., transcript, ...) items into fixed-size tensors.

    Despite its name this class is a *dataset*: it implements ``__getitem__`` and
    ``__len__`` for a single data point and is meant to be wrapped by
    ``torch.utils.data.DataLoader``, which does the batching.

    Args:
        data_url: indexable dataset whose items carry the waveform at index 0 and
            the transcript at index 2 (the name is historical; it is not a URL).
        tokenizer: object with an ``encode(text, truncation=..., padding=...,
            max_length=..., return_tensors='pt')`` method returning a (1, L) tensor.
        transforms: module applied to the waveform; its output must be
            (channel, feature, time).
        mode: ``'train'`` or ``'val'``; also used as the dictionary key under which
            the spectrogram is returned.
        max_length: number of time frames every spectrogram is cropped/padded to.
            ``None`` (default) means ``Config.max_length`` is looked up on every access.
        label_max_length: length the tokenized transcript is padded/truncated to.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'val'``.
    """

    def __init__(self, data_url, tokenizer: "transformers.PreTrainedTokenizerBase",
                 transforms: nn.Module, mode: str = 'train',
                 max_length: Optional[int] = None, label_max_length: int = 30) -> None:
        if mode not in ('train', 'val'):
            # Previously an unknown mode left self.data unset and failed much later.
            raise ValueError(f"mode must be 'train' or 'val', got {mode!r}")
        self.tokenizer = tokenizer
        self.mode = mode
        self.data = data_url
        self.transforms = transforms
        self.max_length = max_length
        self.label_max_length = label_max_length

    def __getitem__(self, idx):
        """Return ``{mode: spectrogram, 'label': token_ids}`` for one data point."""
        # Index the dataset once; the original indexed it twice per item.
        sample = self.data[idx]
        waveform, transcript = sample[0], sample[2]

        single_audio = self.transforms(waveform)  # (channel, feature, time)
        # To decode: tokenizer.decode(label, skip_special_tokens=True)
        label = self.tokenizer.encode(transcript,
                                      truncation=True,
                                      padding='max_length',
                                      max_length=self.label_max_length,
                                      return_tensors='pt')

        target_frames = Config.max_length if self.max_length is None else self.max_length
        n_frames = single_audio.shape[2]
        if n_frames > target_frames:
            single_audio = single_audio[:, :, :target_frames]
        elif n_frames < target_frames:
            # Zero-pad on the time axis; channel/feature sizes, dtype and device are
            # taken from the spectrogram itself instead of a hard-coded (1, 128, ...).
            pad = single_audio.new_zeros(single_audio.shape[0],
                                         single_audio.shape[1],
                                         target_frames - n_frames)
            single_audio = torch.cat([single_audio, pad], dim=2)

        return {
            self.mode: single_audio,
            'label': label.squeeze(0),
        }

    def __len__(self):
        """Number of items in the wrapped dataset."""
        return len(self.data)
    
# Tokenizer that turns transcripts into label ids.
bert_tokenizer = AutoTokenizer.from_pretrained(Config.tokenizer_checkpoint)

# Training batches: augmented spectrograms, reshuffled every epoch.
Train_Loader = DataLoader(
    Dataloader(train_data, tokenizer=bert_tokenizer, transforms=train_transforms, mode='train'),
    batch_size=Config.train_batch_size,
    shuffle=True,
)

# Validation batches: use the masking-free pipeline (random masking would make the
# validation metric noisy) and keep a fixed order so runs are comparable.
Val_Loader = DataLoader(
    Dataloader(val_data, tokenizer=bert_tokenizer, transforms=val_transforms, mode='val'),
    batch_size=Config.val_batch_size,
    shuffle=False,
)

In [42]:
# Smoke test: pull one training batch and display the shape of its spectrogram tensor
# (stored under the 'train' key because the dataset was built with mode='train').
next(iter(Train_Loader))['train'].shape

torch.Size([16, 1, 128, 124])

In [4]:
def layernorm(x, n_feat=None):
    """Normalise ``x`` over its last axis (no learnable scale/shift).

    Uses the functional form, so no new ``nn.LayerNorm`` module (with freshly
    initialised, never-trained CPU parameters) is created on every call. On CPU the
    values are the same as a freshly constructed ``nn.LayerNorm(n_feat)`` would give.

    Args:
        x: tensor whose last axis has size ``n_feat``.
        n_feat: size of the last axis; defaults to ``x.shape[-1]`` when omitted.

    Returns:
        Tensor of the same shape as ``x``.
    """
    if n_feat is None:
        n_feat = x.shape[-1]
    return f.layer_norm(x, (n_feat,))


class ResidualCNN(nn.Module):
    """Residual block: (LayerNorm -> GELU -> Dropout -> Conv2d) applied twice, plus a skip connection.

    Input and output are (batch, channel, feature, time). The skip connection adds
    the input to the output, so the convolutions must preserve the shape
    (in_channels == out_channels, stride 1, padding == kernel // 2).

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: channels produced by both convolutions.
        kernel: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.
        dropout_probability: dropout rate applied before each convolution.
        input_features: size of the feature axis (dim 2) of the incoming tensor.
    """

    def __init__(self,in_channels,out_channels,kernel,stride,padding,dropout_probability,input_features):
        super().__init__()
        self.input_features=input_features
        self.conv_1=nn.Conv2d(in_channels,out_channels,kernel,stride,padding)
        self.conv_2=nn.Conv2d(out_channels,out_channels,kernel,stride,padding)
        self.drop_1=nn.Dropout(dropout_probability)
        self.drop_2=nn.Dropout(dropout_probability)
        # Registered sub-modules: their scale/shift are trained, saved with the
        # model and moved by .to(device), unlike a LayerNorm built inside forward().
        self.norm_1=nn.LayerNorm(input_features)
        self.norm_2=nn.LayerNorm(input_features)

    @staticmethod
    def _feature_norm(norm,batch):
        """Apply ``norm`` over the feature axis of a (batch, channel, feature, time) tensor."""
        # nn.LayerNorm works on the last axis, so swap feature/time, normalise, swap back.
        return norm(batch.transpose(2,3)).transpose(2,3)

    def forward(self,batch):
        """Return ``block(batch) + batch``; shape is unchanged."""
        residue=batch
        batch=f.gelu(self._feature_norm(self.norm_1,batch))
        batch=self.drop_1(batch)
        batch=self.conv_1(batch)
        batch=f.gelu(self._feature_norm(self.norm_2,batch))
        batch=self.drop_2(batch)
        batch=self.conv_2(batch)
        return batch+residue
    
class BidirectionalGRU(nn.Module):
    """LayerNorm -> GELU -> bidirectional GRU -> Dropout on (batch, time, feature) input."""

    def __init__(self,input_size,hidden_size,num_layers,dropout_probability):
        """
        input_size = number of features of the input
        hidden_size = number of features in the hidden state (per direction)
        num_layers = how many recurrent layers to stack inside this GRU
        dropout_probability = dropout rate applied to the GRU output
        """

        super().__init__()
        # Registered LayerNorm over the feature axis (the last axis of the input).
        self.norm=nn.LayerNorm(input_size)
        self.bi_gru=nn.GRU(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,batch_first=True,bidirectional=True)
        self.drop=nn.Dropout(dropout_probability)

    def forward(self,batch):
        """Return the GRU output sequence, shape (batch, time, 2 * hidden_size)."""
        batch=f.gelu(self.norm(batch))
        # nn.GRU returns (output, h_n); only the output sequence is passed on.
        batch,_=self.bi_gru(batch)
        batch=self.drop(batch)
        return batch
    
    
class Config_cnn:
    """Hyperparameters of one ResidualCNN block."""
    input_channels=32
    output_cahnnels=32  # misspelled name kept because existing cells reference it
    output_channels=output_cahnnels  # correctly spelled alias for new code
    kernel_size=3
    stride=1
    padding=1
    dropout_probability=0.1
    input_features=128

class Config_rnn:
    """Hyperparameters of one BidirectionalGRU layer."""

    # Feature size entering the first recurrent layer; it must equal the number of
    # features of that layer's input. With input_size == hidden_size == x for the
    # first layer, every following layer takes input_size = 2x and hidden_size = x.
    input_size = 512
    hidden_size = 512

    num_layers = 1
    dropout_probability = 0.1
    
class Config_classifier:
    """Hyperparameters of the classification head and the layer counts of the model."""

    # Depth of the two stacks.
    cnn_layers = 3
    rnn_layers = 5

    # Hidden width of the classifier, tied to the recurrent input size.
    input_size = Config_rnn.input_size
    dropout_probability = 0.1

    # NOTE(review): 29 equals the "n_class" of the reference hyperparameters pasted
    # later in this notebook; the labels produced by the dataset come from a BERT
    # tokenizer - confirm which output vocabulary is intended.
    num_classes = 29


    
class audio(nn.Module):
    """Speech model: strided conv stem -> residual CNN stack -> linear projection
    -> bidirectional GRU stack -> per-frame classifier.

    Uses the ``ResidualCNN`` / ``BidirectionalGRU`` constructor signatures defined
    in the same cell as this class, and the ``Config_cnn`` / ``Config_rnn`` /
    ``Config_classifier`` settings.

    Args:
        cnn_layers: number of residual CNN blocks.
        rnn_layers: number of bidirectional GRU layers.
        input_features: size of the feature axis of the input spectrogram.
        max_length: nominal number of input time frames (stored for reference only;
            forward() reads the real size from the tensor).
        batch_size: nominal batch size (stored for reference only, so a smaller
            final batch still works).
        out_channels: channels produced by the stem convolution; must equal
            ``Config_cnn.input_channels`` because the residual blocks consume them.

    Raises:
        ValueError: if ``out_channels`` differs from ``Config_cnn.input_channels``.
    """

    def __init__(self,cnn_layers,rnn_layers,input_features,max_length,batch_size,out_channels):
        super().__init__()
        if out_channels!=Config_cnn.input_channels:
            raise ValueError(
                f"out_channels ({out_channels}) must equal Config_cnn.input_channels "
                f"({Config_cnn.input_channels})")
        self.input_features=input_features
        self.max_length=max_length
        self.batch_size=batch_size
        self.out_channels=out_channels

        # Feature-axis size after the stem (kernel 3, stride 2, padding 1).
        self.cnn_features=(input_features-1)//2+1

        self.cnn=nn.Conv2d(in_channels=1,
                           out_channels=out_channels,
                           kernel_size=3,
                           stride=2,
                           padding=1)

        self.res_cnn=nn.Sequential(*[ResidualCNN(in_channels=Config_cnn.input_channels,
                                                 out_channels=Config_cnn.output_cahnnels,
                                                 kernel=Config_cnn.kernel_size,
                                                 stride=Config_cnn.stride,
                                                 padding=Config_cnn.padding,
                                                 dropout_probability=Config_cnn.dropout_probability,
                                                 input_features=self.cnn_features) for _ in range(cnn_layers)])

        # Projects the flattened (channel * feature) vector of every time step.
        self.linear=nn.Linear(Config_cnn.output_cahnnels*self.cnn_features,Config_rnn.input_size)

        # Every layer after the first receives both directions of the previous one.
        self.rnn_layers=nn.Sequential(*[BidirectionalGRU(input_size=Config_rnn.input_size if i==0 else 2*Config_rnn.hidden_size,
                                                         hidden_size=Config_rnn.hidden_size,
                                                         num_layers=Config_rnn.num_layers,
                                                         dropout_probability=Config_rnn.dropout_probability) for i in range(rnn_layers)])

        rnn_out_features=2*Config_rnn.hidden_size if rnn_layers>0 else Config_rnn.input_size
        self.classifier=nn.Sequential(
                                    nn.Linear(rnn_out_features,Config_classifier.input_size),
                                    nn.GELU(),
                                    nn.Dropout(Config_classifier.dropout_probability),
                                    nn.Linear(Config_classifier.input_size,Config_classifier.num_classes)
                                    )

    def forward(self,batch):
        """Map (batch, 1, feature, time) spectrograms to (batch, time', num_classes) scores.

        ``time'`` is the number of frames left after the stride-2 stem.
        """
        out=self.cnn(batch)       # (batch, channel, feature', time')
        out=self.res_cnn(out)
        n_batch,n_channel,n_feat,n_time=out.shape
        # Put time second and flatten channel*feature so nn.Linear sees one vector per frame.
        out=out.permute(0,3,1,2).reshape(n_batch,n_time,n_channel*n_feat)
        out=self.linear(out)      # (batch, time', Config_rnn.input_size)
        out=self.rnn_layers(out)
        out=self.classifier(out)
        return out
        

In [22]:
# Build a single residual block from Config_cnn as a quick construction check.
dummy_cnn = ResidualCNN(
    in_channels=Config_cnn.input_channels,
    out_channels=Config_cnn.output_cahnnels,
    kernel=Config_cnn.kernel_size,
    stride=Config_cnn.stride,
    padding=Config_cnn.padding,
    dropout_probability=Config_cnn.dropout_probability,
    input_features=Config_cnn.input_features,
)

# dummy_cnn2=nn.Sequential(*[dummy_cnn for i in range(3)])

# gru=BidirectionalGRU(
#     input_size=Config_rnn.input_size,
#     hidden_size=Config_rnn.hidden_size,
#     num_layers=Config_rnn.num_layers,
#     dropout_probability=Config_rnn.dropout_probability
# )

In [None]:
class CNNLayerNorm(nn.Module):
    """LayerNorm over the feature axis of a (batch, channel, feature, time) tensor."""

    def __init__(self, n_feats):
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # nn.LayerNorm normalises the trailing axis, so swap feature and time,
        # normalise, then swap back.
        time_major = x.transpose(2, 3).contiguous()   # (batch, channel, time, feature)
        normed = self.layer_norm(time_major)
        return normed.transpose(2, 3).contiguous()    # (batch, channel, feature, time)


class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
        except with layer norm instead of batch norm

    Note: this redefines ``ResidualCNN`` with a different constructor than the
    earlier cell (no ``padding`` argument, ``n_feats`` instead of
    ``input_features``); whichever cell ran last wins.

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: channels produced by both convolutions.
        kernel: kernel size; padding is ``kernel // 2``.
        stride: convolution stride.
        dropout: dropout rate applied before each convolution.
        n_feats: size of the feature axis (dim 2) of the incoming tensor.
    """
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        # The notebook imports torch.nn.functional as lowercase `f`; `F` is undefined here.
        x = f.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = f.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):
    """LayerNorm -> GELU -> single-layer bidirectional GRU -> Dropout.

    Note: this redefines ``BidirectionalGRU`` with a different constructor than
    the earlier cell; whichever cell ran last wins.

    Args:
        rnn_dim: feature size of the input (last axis).
        hidden_size: hidden size per direction; the output has ``2 * hidden_size`` features.
        dropout: dropout rate applied to the GRU output.
        batch_first: passed to ``nn.GRU``; True means input is (batch, time, feature).
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        # The notebook imports torch.nn.functional as lowercase `f`; `F` is undefined here.
        x = f.gelu(x)
        x, _ = self.BiGRU(x)  # drop the final hidden state, keep the output sequence
        x = self.dropout(x)
        return x






 "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 29,
        "n_feats": 128,
        "stride": 2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    



class SpeechRecognitionModel(nn.Module):
    """Strided conv stem -> residual CNN stack -> linear -> bidirectional GRU stack -> classifier.

    Args:
        n_cnn_layers: number of ResidualCNN blocks.
        n_rnn_layers: number of BidirectionalGRU layers.
        rnn_dim: feature size fed to the first GRU and hidden size of every GRU.
        n_class: number of output classes per time step.
        n_feats: size of the feature axis of the input spectrogram.
        stride: stride of the stem convolution.
        dropout: dropout rate used throughout.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        # Feature-axis size after the stem (kernel 3, padding 1). Equals n_feats // 2
        # for even n_feats with stride 2, and stays correct for odd n_feats or other strides.
        n_feats = (n_feats - 1) // stride + 1
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting hierarchical features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])

        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)

        # Every layer receives (batch, time, feature), so batch_first is True for all
        # of them; with batch_first only on the first layer, the later layers would
        # run their recurrence along the batch axis.
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=True)
            for i in range(n_rnn_layers)
        ])

        self.classifier = nn.Sequential(nn.Linear(rnn_dim*2, rnn_dim),  # birnn returns rnn_dim*2
                                        nn.GELU(),
                                        nn.Dropout(dropout),
                                        nn.Linear(rnn_dim, n_class))

    def forward(self, x):
        """Map (batch, 1, feature, time) input to (batch, time', n_class) scores."""
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2) # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x


In [83]:
# Scratch check: star-unpacking a list comprehension inside a list literal gives a
# flat list - the same *[... for ...] pattern used when building nn.Sequential above.
[*['a' for i in range(3)]]

['a', 'a', 'a']