In [1]:
import os
os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning'
os.environ['OMP_NUM_THREADS'] = '1'


# Task 2: Speech recognition 

In [2]:
import os
import pandas as pd
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from keras.preprocessing.sequence import pad_sequences
import pytorch_lightning as pl
import torch.nn.functional as F


2024-07-10 17:49:34.471180: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 17:49:34.471293: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 17:49:34.600816: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### SpeechDataset class:
- for handling dataset features and labels.

In [3]:
class SpeechDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]



### Model building:
- CNN layer for feature extarction.
- LSTM network sequence modeling.
- Fully connected layer for classification.
- Adam optimizer.
- CTC loss metric.

In [None]:
class SpeechRecognitionModel(pl.LightningModule):
    def __init__(self,num_classes,num_mfcc,max_frames):
        super(SpeechRecognitionModel,self).__init__()
        self.num_classes=num_classes
        self.cnn=nn.Sequential(
        nn.Conv2d(1,32,kernel_size=3,stride=1,padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2,stride=2),
        nn.BatchNorm2d(32),
        
        nn.Conv2d(32,64,kernel_size=3,stride=1,padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2,stride=2),
        nn.BatchNorm2d(64),
        
        nn.Conv2d(64,128,kernel_size=3,stride=1,padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2,stride=2),
        nn.BatchNorm2d(128) )
        self.lstm=nn.LSTM(128*(num_mfcc//8),128,batch_first=True,bidirectional=True,num_layers=2)
        self.fc=nn.Linear(256,num_classes)
        
    def forward(self,x):
        x=x.unsqueeze(1)
        batch_size, _, _,_= x.shape
        x = self.cnn(x)
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, 128 * (num_mfcc // 8))
        x, _ = self.lstm(x)
        x = self.fc(x)
        return F.log_softmax(x, dim=-1)
    
    def training_step(self,batch,batch_idx):
        features, labels = batch
        outputs = self(features)
        outputs = outputs.permute(1, 0, 2)  
        input_lengths = torch.full((outputs.size(1),), outputs.size(0), dtype=torch.long)
        target_lengths = torch.tensor([len(label) for label in labels])
        loss = F.ctc_loss(outputs, labels, input_lengths, target_lengths)
        '''outputs=outputs.view(-1,self.num_classes)
        labels=labels.view(-1)
        loss=nn.CrossEntropyLoss()(outputs,labels)'''
        self.log('train_loss',loss)
        return loss
    
    def validation_step(self,batch,batch_idx):
        features, labels = batch
        outputs = self(features)
        outputs = outputs.permute(1, 0, 2)  
        input_lengths = torch.full((outputs.size(1),), outputs.size(0), dtype=torch.long)
        target_lengths = torch.tensor([len(label) for label in labels])
        loss = F.ctc_loss(outputs, labels, input_lengths, target_lengths)
        self.log('val_loss',loss)
        return loss
    
    def configure_optimizers(self):
        return optim.Adam(self.parameters(),lr=1e-3)
    
    
            

### Data preprocessing and training
- loading and prepairing audio files.
- Extracting MFCC features from audio.
- Padding sequences and normalizing features.
- Creating data loaders for batch processing.
- Initializing the model and training with PyTorch Lightning.

In [6]:
if __name__ == "__main__":
    wave_path = '/kaggle/input/ljspeech-sr16k-dataset/wavs'
    metadata_path = '/kaggle/input/ljspeech-sr16k-dataset/metadata.csv'
    df = pd.read_csv(metadata_path)
    
    audio_paths=[]
    for i in range(len(df)):
        audios=os.path.join(wave_path,df['file_name'][i])
        audio_paths.append(audios)
        
    mfccs=[]
    for path in audio_paths:
        y,sr=librosa.load(path,sr=16000)
        y=librosa.util.normalize(y)
        mfcc=librosa.feature.mfcc(y=y,sr=16000,n_mfcc=13,n_fft=2048,hop_length=512)
        mfcc=np.array(mfcc)
        mfccs.append(mfcc.T)
        
    features=mfccs
    labels=np.array(df['sentence'])
    
    chars = set(''.join(labels))
    char2idx = {char: idx + 1 for idx, char in enumerate(sorted(chars))}
    char2idx['<pad>']=0
    idx2char={idx:char for char,idx in char2idx.items()}
    
    labels=[[char2idx[char] for char in sentence] for sentence in labels]
    max_len=max(len(seq) for seq in labels)
    labels=pad_sequences(labels,maxlen=max_len,padding='post')
    
    max_feature = max(feature.shape[0] for feature in features)
    features=pad_sequences(features,maxlen=max_feature,padding='post',dtype='float32',truncating='post')
    features=np.array(features)
    features=features/np.max(np.abs(features))
    
    features=torch.tensor(features,dtype=torch.float32)
    labels=torch.tensor(labels,dtype=torch.long)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=22)
    train=SpeechDataset(X_train,y_train)
    test=SpeechDataset(X_test,y_test)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True, num_workers=3)
    test_loader = torch.utils.data.DataLoader(test, batch_size=32, shuffle=False, num_workers=3)
    
    num_classes=len(char2idx)
    num_mfcc=features.shape[2]
    max_frames=features.shape[1]

    model=SpeechRecognitionModel(num_classes=num_classes,num_mfcc=num_mfcc,max_frames=max_frames)
    trainer= pl.Trainer(max_epochs=10,devices='auto',accelerator='gpu')
    
    trainer.fit(model,train_loader,test_loader)
    
    print(trainer.callback_metrics)




Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

{'train_loss': tensor(nan), 'val_loss': tensor(nan)}


### Results:
- despite trying different preprcessing and padding techniques as well as different servers for multiprocessing and different IDE's such as jupyter notebook,google colab and kagle the train and val loss are both NaN.
- feedback and Advise would be appretiated.