# Imports

In [1]:
# %pip install torch torchaudio scikit-learn pandas matplotlib numpy seaborn

In [2]:
print("Hello World!")

Hello World!


In [3]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint

import os

2024-07-03 10:13:23.305440: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 10:13:23.305502: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 10:13:23.306957: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Source: https://www.kaggle.com/discussions/getting-started/140636
# !pip install GPUtil

# import torch
# from GPUtil import showUtilization as gpu_usage
# from numba import cuda

# def free_gpu_cache():
#     print("Initial GPU Usage")
#     gpu_usage()                             

#     torch.cuda.empty_cache()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

#     print("GPU Usage after emptying the cache")
#     gpu_usage()

# if torch.cuda.is_available():
#     free_gpu_cache()

In [5]:
print(str(torchaudio.list_audio_backends()))
# %pip install PySoundFile

['ffmpeg', 'soundfile']


In [6]:
print(torch.cuda.is_available())  # Should return True if CUDA is available

True


# Setup

## kaggle Setup

In [7]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [8]:
DATASET_PATH = "/kaggle/input/arabic-asr/"
TRAIN_PATH = DATASET_PATH + "train/"
ADAPT_PATH = DATASET_PATH + "adapt/"
batch_size = 16

# Data

## EDA

In [9]:
# Load the CSV file
data = pd.read_csv(DATASET_PATH + 'train.csv') # nrows=1000
# Display the first few rows of the dataset
data.head()

Unnamed: 0,audio,transcript
0,train_sample_0,على إنها عار في الوقت اللي كانت بتتعامل مع أخو...
1,train_sample_1,فأكيد ربنا عوضهم خير هو الراجل بيبقى ليه إختيا...
2,train_sample_2,زي دول كتيره بنشوفها النهارده في العالم وأصبحت...
3,train_sample_3,يعني مين اللي بيحط شروطها يعني أنا شايفه إني م...
4,train_sample_4,والله هي الموضوع مش كليب خلي بالك ولا أغنيه ال...


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50715 entries, 0 to 50714
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   audio       50715 non-null  object
 1   transcript  50709 non-null  object
dtypes: object(2)
memory usage: 792.5+ KB


In [11]:
data.isnull().sum()

audio         0
transcript    6
dtype: int64

In [12]:
data.dropna(inplace=True)
data.isnull().sum()

audio         0
transcript    0
dtype: int64

In [13]:
data.transcript.str.contains(r'[<.*>|=|÷|??]', na=False).sum()

418

In [14]:
data[data.transcript.str.contains(r'<.*>', na=False)]

Unnamed: 0,audio,transcript
786,train_sample_786,<fil> هو مختارش إن هو يتكلم عن نفسه كأوباما ول...
3690,train_sample_3690,<fil> الانتخابات اللي فاتت مش السابقه دي الأست...
19449,train_sample_19449,<fil> ساعة ونس بنتونس مع بعض يعني <fil> الكلام...
19463,train_sample_19463,<fil> <fil> <fil> <fil> <fil> <fil> <overlap>
19613,train_sample_19613,<fil> مفيش جنيه <fil> اشتغلت مع عمي في المنيا ...
...,...,...
50211,train_sample_50211,و لوحدي ما شاء الله لا لوحدي <fil> طب بابا مكل...
50224,train_sample_50224,هو <fil> ملوش أمانه بتتولى
50254,train_sample_50254,فكرة إن الدول دي تستفيد من <fil> هو يمكن هطلع ...
50271,train_sample_50271,فمنقدرش نربط الشهاده <fil> أكتر من سنه <fil> ت...


In [15]:
data.drop(data[data.transcript.str.contains(r'<.*>', na=False)].index, inplace=True)
data.transcript.str.contains(r'<.*>', na=False).sum()

0

In [16]:
data['audio'] = TRAIN_PATH + data['audio'] + ".wav"
data.reset_index(drop=True, inplace=True)

In [17]:
data.iloc[784:787]

Unnamed: 0,audio,transcript
784,/kaggle/input/arabic-asr/train/train_sample_78...,لواء أركان حرب أيمن حب الدين نكمل إن شاء الله
785,/kaggle/input/arabic-asr/train/train_sample_78...,والأم هنيجي لرساله للأب والأم هنا إرحموا ولادك...
786,/kaggle/input/arabic-asr/train/train_sample_78...,و خاصة الدول الناميه ومنها مصر بتعاني النهارده...


## Define Speech Dataset Class

In [18]:
class SpeechDataset(Dataset):
    def __init__(self, data: str | pd.DataFrame, n_mels=40, sample_rate=16000):
        if isinstance(data, str):
            self.data = pd.read_csv(data)
        else:
            self.data = data
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.vocab = self._build_vocab(self.data['transcript'].tolist())

    def _build_vocab(self, transcripts):
        all_chars = ''.join(transcripts)
        char_counter = Counter(all_chars)
        vocab = {char: idx + 1 for idx, (char, _) in enumerate(char_counter.most_common())}
        vocab['<pad>'] = 0  # Padding token
        return vocab

    """
        * If you are certain that your audio data is already sampled at 16kHz, 
        then you do not need to perform the resampling step.
        * The purpose of the resampling step in the provided code is to ensure that
        the audio waveform is at the desired sample rate (in this case, 16kHz).
    """
    def _load_audio(self, file_path, resample: bool = True):
        """
            Load an audio file and resample it to the desired sample rate.
            file_path: Path to the audio file.
            resample: Whether to resample the audio to the desired sample rate.
            sample_rate: Desired sample rate.
            Returns: Audio waveform tensor.
        """
        # Load audio waveform and sample rate
        waveform, sr = torchaudio.load(file_path)
        if resample and sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sample_rate)  # Create a resampler object
            waveform = resampler(waveform)  # Resample the waveform to the desired sample rate
        return waveform.squeeze()  # Remove extra dimension

    def _extract_features(self, audio):
        transform = torchaudio.transforms.MelSpectrogram(sample_rate=self.sample_rate, n_mels=self.n_mels)
        mel_spectrogram = transform(audio)
        mel_spectrogram = torch.log(mel_spectrogram + 1e-9)  # Log-scale for numerical stability
        return mel_spectrogram.T  # Transpose to have time steps as rows

    def _text_to_sequence(self, text):
        return [self.vocab[char] for char in text]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        try:
            audio_path = self.data['audio'][idx]
            transcript = self.data['transcript'][idx]
        
            audio = self._load_audio(audio_path)
            mel_features = self._extract_features(audio)
            target = self._text_to_sequence(transcript)
            # print(mel_features.shape, len(target))
        except Exception as e:
            print(f"Error processing {idx}")
            return self.__getitem__(idx + 1)
        return mel_features, target


    def collate_fn(self, batch):
        inputs, targets = zip(*batch)
        
        inputs_padded = pad_sequence([torch.tensor(inp, dtype=torch.float32) for inp in inputs], batch_first=True)
        targets_padded = pad_sequence([torch.tensor(tgt, dtype=torch.long) for tgt in targets], batch_first=True)
        return inputs_padded, targets_padded


In [19]:
# Path to your train.csv
csv_path = DATASET_PATH + 'train.csv'

feature_dim = 60

# Instantiate the dataset
speech_dataset = SpeechDataset(data, n_mels=feature_dim)

# Create DataLoader
train_loader = DataLoader(speech_dataset, batch_size=batch_size, shuffle=False, collate_fn=speech_dataset.collate_fn)

In [20]:
# speech_dataset[0]

# Model

## Listener

In [21]:
# Define the Listener (Encoder) with Pyramidal BLSTM
class Listener(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        """
            Initialize the Listener with a Pyramidal BLSTM.
            input_dim: Dimension of input features (e.g., size of the MFCC or Mel-spectrogram features).
            hidden_dim: Number of hidden units in each LSTM layer.
            num_layers: Number of LSTM layers.
        """
        super(Listener, self).__init__()
        self.layers = nn.ModuleList()  # List to hold the LSTM layers.
        current_dim = input_dim  # Start with the input dimension.
        
        for _ in range(num_layers):
            # Append a bidirectional LSTM layer to the list.
            self.layers.append(nn.LSTM(current_dim, hidden_dim, num_layers=1, batch_first=True, bidirectional=True))
            # Update the current dimension to match the BLSTM output.
            current_dim = hidden_dim * 2  # BLSTM doubles the hidden dimension.

    def forward(self, x):
        """
            Forward pass through the Listener.
            x: Input tensor of shape (batch_size, sequence_length, input_dim).
            Returns: Downsampled sequence of encoded features.
        """
        for lstm in self.layers:
            x, _ = lstm(x)  # Pass the input through the current LSTM layer.

#             # Print shape information for debugging.
#             print(f'Before downsampling: batch_size={x.size(0)}, seq_len={x.size(1)}, feature_dim={x.size(2)}')
            
            # Downsample by concatenating adjacent time steps.
            batch_size, seq_len, feature_dim = x.shape

#             # Print shape information for debugging.
#             print(f'After downsampling: batch_size={x.size(0)}, seq_len={x.size(1)}, feature_dim={x.size(2)}')

            # x.contiguous(): This ensures that the tensor's memory layout is contiguous
            # view(batch_size, seq_len // 2, feature_dim * 2): The view function reshapes the tensor. 
            # x = x.contiguous().view(batch_size, seq_len // 2, feature_dim * 2)
        return x

## Attender

In [22]:
# Define the Attender with Attention Context
class Attender(nn.Module):
    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        """
        Initialize the Attender with an attention mechanism.
        encoder_dim: Dimension of the encoded features from the Listener.
        decoder_dim: Dimension of the hidden state from the Speller.
        attention_dim: Dimension of the attention layer.
        """
        super(Attender, self).__init__()
        # Linear layer to transform the encoder and decoder states into the attention space.
        self.attn = nn.Linear(encoder_dim + decoder_dim, attention_dim)
        # Linear layer to compute the attention energies.
        self.v = nn.Linear(attention_dim, 1, bias=False)

    def forward(self, encoder_outputs, decoder_hidden):
        """
        Forward pass through the Attender.
        encoder_outputs: Encoded features from the Listener (batch_size, sequence_length, encoder_dim).
        decoder_hidden: Hidden state from the Speller (batch_size, decoder_dim).
        Returns: Context vector and attention weights.
        """
        # Calculate attention energies by concatenating encoder outputs with the decoder hidden state.
        seq_len = encoder_outputs.size(1)
        decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, seq_len, 1)
        energy = torch.cat((encoder_outputs, decoder_hidden), dim=2)
        attn_energies = self.v(F.tanh(self.attn(energy))).squeeze(2)
        # Apply softmax to get attention weights.
        attn_weights = F.softmax(attn_energies, dim=1)
        # Compute the context vector as a weighted sum of encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, attn_weights

## Speller

In [23]:
# Define the Speller (Decoder) with Character-Level Model
class Speller(nn.Module):
    def __init__(self, vocab_size, encoder_dim, hidden_dim, num_layers):
        """
            Initialize the Speller.
            vocab_size: Size of the output vocabulary (number of possible output tokens)
            hidden_dim: Number of hidden units in the LSTM layers
            num_layers: Number of LSTM layers
        """
        super(Speller, self).__init__()
        self.lstm = nn.LSTM(encoder_dim + hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.embedding = nn.Embedding(vocab_size, hidden_dim)

    def forward(self, context, hidden, cell, previous_char):
        """
            Forward pass through the Speller.
            context: Context vector from the Attender (batch_size, hidden_dim * 2)
            hidden: Initial hidden state for the LSTM
            cell: Initial cell state for the LSTM
            Returns: Output token probabilities (batch_size, vocab_size), Updated hidden state, Updated cell state
        """

        # Embed the previous character
        embedded = self.embedding(previous_char).unsqueeze(1)
        # Concatenate context and embedded character
        lstm_input = torch.cat((context.unsqueeze(1), embedded), dim=2)
        # Pass through LSTM
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        # Map to vocabulary size
        output = self.fc(output.squeeze(1))
        return output, hidden, cell

## LAS Model

In [24]:
class LASModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, vocab_size, num_layers):
        """
        Initialize the LAS model.
        input_dim: Dimension of input features (e.g., MFCC features).
        hidden_dim: Number of hidden units in each LSTM layer.
        vocab_size: Size of the output vocabulary (number of grapheme characters).
        num_layers: Number of LSTM layers.
        """
        super(LASModel, self).__init__()
        self.listener = Listener(input_dim, hidden_dim, num_layers)
        self.attender = Attender(hidden_dim * 2, hidden_dim, hidden_dim)
        self.speller = Speller(vocab_size, hidden_dim * 2, hidden_dim, num_layers)
        self.vocab_size = vocab_size

    def forward(self, inputs, targets, teacher_forcing_ratio=0.5):
        """
        Forward pass through the LAS model.
        inputs: Input features (batch_size, sequence_length, input_dim).
        targets: Target character sequence (batch_size, target_length).
        teacher_forcing_ratio: Probability of using true target instead of predicted character during training.
        """
        encoder_outputs = self.listener(inputs)
        batch_size = targets.size(0)
        target_length = targets.size(1)
        outputs = torch.zeros(batch_size, target_length, self.vocab_size).to(inputs.device)

        hidden = torch.zeros(self.speller.lstm.num_layers, batch_size, self.speller.lstm.hidden_size).to(inputs.device)
        cell = torch.zeros(self.speller.lstm.num_layers, batch_size, self.speller.lstm.hidden_size).to(inputs.device)
        previous_char = targets[:, 0]

        for t in range(1, target_length):
            context, _ = self.attender(encoder_outputs, hidden[-1])
            output, hidden, cell = self.speller(context, hidden, cell, previous_char)
            outputs[:, t, :] = output

            if torch.rand(1).item() < teacher_forcing_ratio:
                previous_char = targets[:, t]
            else:
                previous_char = output.argmax(1)

        return outputs


# Preprocessing

## Audio Preprocessing

In [25]:
"""
    * If you are certain that your audio data is already sampled at 16kHz, 
    then you do not need to perform the resampling step.
    * The purpose of the resampling step in the provided code is to ensure that
    the audio waveform is at the desired sample rate (in this case, 16kHz).
"""
def load_audio(file_path, resample: bool = True, sample_rate=16000):
    """
        Load an audio file and resample it to the desired sample rate.
        file_path: Path to the audio file.
        resample: Whether to resample the audio to the desired sample rate.
        sample_rate: Desired sample rate.
        Returns: Audio waveform tensor.
    """
    # Load audio waveform and sample rate
    waveform, sr = torchaudio.load(file_path)
    if resample and sr != sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)  # Create a resampler object
        waveform = resampler(waveform)  # Resample the waveform to the desired sample rate
    return waveform.squeeze()  # Remove extra dimension

In [26]:
def extract_features(audio, sample_rate=16000, n_mels=40):
    """
        Extract Mel spectrogram features from an audio waveform.
        audio: Input audio waveform.
        sample_rate: Sample rate of the audio waveform.
        n_mels: Number of Mel filterbanks.
        Returns: Mel spectrogram tensor.
    """
    transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)
    mel_spectrogram = transform(audio)
    mel_spectrogram = torch.log(mel_spectrogram + 1e-9)  # Log-scale for numerical stability
    return mel_spectrogram.T  # Transpose to have time steps as rows

In [27]:
# # Example usage
# audio_path = data['audio'][0]
# audio = load_audio(audio_path)
# mel_features = extract_features(audio)
# print(mel_features.shape)

## Text Preprocessing

Tokenization, vectorization, and sequencing

In [28]:
from collections import Counter

def build_vocab(transcripts):
    all_chars = ''.join(transcripts)
    char_counter = Counter(all_chars)
    vocab = {char: idx + 1 for idx, (char, _) in enumerate(char_counter.most_common())}
    vocab['<pad>'] = 0  # Add padding token
    return vocab

# Build vocabulary
vocab = build_vocab(data['transcript'])
print(vocab)

def text_to_sequence(text, vocab):
    return [vocab[char] for char in text]

# Example usage
transcript = data['transcript'][0]
sequence = text_to_sequence(transcript, vocab)
print(sequence)

{' ': 1, 'ا': 2, 'ل': 3, 'ي': 4, 'ن': 5, 'م': 6, 'ه': 7, 'و': 8, 'ت': 9, 'ب': 10, 'ع': 11, 'ر': 12, 'د': 13, 'ك': 14, 'ف': 15, 'أ': 16, 'س': 17, 'ح': 18, 'ق': 19, 'ش': 20, 'إ': 21, 'ج': 22, 'ص': 23, 'ط': 24, 'خ': 25, 'ى': 26, 'ز': 27, 'ض': 28, 'ة': 29, 'غ': 30, 'ذ': 31, 'ث': 32, 'ء': 33, 'ظ': 34, 'ئ': 35, 'ؤ': 36, 'آ': 37, 'ً': 38, 'ڨ': 39, '<': 40, '÷': 41, '،': 42, '>': 43, 'ِ': 44, '⁇': 45, 'n': 46, '=': 47, '١': 48, 'ٱ': 49, 'چ': 50, '[': 51, '<pad>': 0}
[11, 3, 26, 1, 21, 5, 7, 2, 1, 11, 2, 12, 1, 15, 4, 1, 2, 3, 8, 19, 9, 1, 2, 3, 3, 4, 1, 14, 2, 5, 9, 1, 10, 9, 9, 11, 2, 6, 3, 1, 6, 11, 1, 16, 25, 8, 4, 2, 1, 2, 3, 8, 3, 13, 1, 2, 3, 8, 18, 4, 13, 1, 6, 11, 2, 6, 3, 7, 1, 25, 2, 23, 7]


# Train

## Initalize Model

In [29]:
# Parameters
input_dim = feature_dim  # Mel-spectrogram dimension
hidden_dim = 128  # Hidden layer size
vocab_size = len(speech_dataset.vocab)  # Vocabulary size
num_layers = 10  # Number of LSTM layers
learning_rate = 0.001  # Learning rate
num_epochs = 30  # Number of epochs

print(vocab_size)

# Instantiate the model
if torch.cuda.is_available():
    model = LASModel(input_dim, hidden_dim, vocab_size, num_layers, ).to('cuda')
else:
    model = LASModel(input_dim, hidden_dim, vocab_size, num_layers).to('cpu')
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index in loss calculation
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

52


In [30]:
# # Parameters
# input_dim = 40  # Example input dimension for MFCC features.
# hidden_dim = 256  # Hidden dimension size.
# vocab_size = 100  # Size of the character vocabulary.
# num_layers = 3  # Number of LSTM layers.
# learning_rate = 0.001  # Learning rate.
# num_epochs = 20  # Number of epochs to train.
# teacher_forcing_ratio = 0.5  # Teacher forcing ratio.

# # Define the model, loss function, and optimizer
# model = LASModel(input_dim, hidden_dim, vocab_size, num_layers).to('cuda')
# criterion = nn.CrossEntropyLoss(ignore_index=0)  # Use an appropriate ignore index for padding.
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# # Example data loader (replace with your own data loader)
# def get_dataloader(batch_size):
#     # Return a PyTorch DataLoader with your training data
#     pass  # Implement your data loading here

# train_loader = get_dataloader(batch_size=32)


### Checkpoints

#### Saving Checkpoints

In [31]:
def save_checkpoint(model, optimizer, epoch, loss, best_loss, checkpoint_dir='checkpoints', filename='best_model.pth'):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
        
    checkpoint_path = os.path.join(checkpoint_dir, filename)
    # Save the model if current loss is the best
    if loss < best_loss:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, checkpoint_path)
        print(f'Checkpoint saved at epoch {epoch + 1} with loss {loss:.4f}')
        best_loss = loss  # Update best loss
    return best_loss

#### Load Checkpoints

In [32]:
def load_checkpoint(checkpoint_path, model, optimizer):
    if not os.path.exists(checkpoint_path):
        print(f"No checkpoint found at '{checkpoint_path}'")
        return model, optimizer, -1, float('inf')

    try:
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        best_loss = checkpoint['loss']
        print(f"Loaded checkpoint from '{checkpoint_path}' (epoch {epoch + 1}, loss {best_loss:.4f})")
    except RuntimeError as e:
#         print(f"Error loading checkpoint: {e}")
#         print("Mismatch in model architecture. Check the dimensions of your model layers.")
#         print("Attempting to load state dict with strict=False.")
        
#         try:
#             # Load model with strict=False to ignore size mismatches
#             model.load_state_dict(checkpoint['model_state_dict'], strict=False)
#             print("Checkpoint loaded with some mismatches in layer sizes.")
#         except Exception as e2:
#             print(f"Error loading with strict=False: {e2}")
#             print("Could not load the model checkpoint. Please ensure the model architecture matches the checkpoint.")
        return model, optimizer, -1, float('inf')
        
    return model, optimizer, checkpoint.get('epoch', 0), checkpoint.get('loss', float('inf'))


### Train Loop

In [34]:
###### free_gpu_cache()

# Path to the checkpoint file
checkpoint_path = 'checkpoints/best_model.pth'
# Load the checkpoint if it exists
model, optimizer, start_epoch, best_loss = load_checkpoint(checkpoint_path, model, optimizer)
print(start_epoch)
for epoch in range(start_epoch + 1, num_epochs):
    model.train()
    epoch_loss = 0
    
    with tqdm(total=len(train_loader), desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for inputs, targets in train_loader:
            if torch.cuda.is_available():
                inputs, targets = inputs.to('cuda'), targets.to('cuda')
            else:
                inputs, targets = inputs.to('cpu'), targets.to('cpu')

#             print(inputs.shape, targets.shape)

            optimizer.zero_grad()
            outputs = model(inputs, targets)

            outputs = outputs[:, 1:].reshape(-1, vocab_size)
            targets = targets[:, 1:].reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            pbar.set_postfix(loss=epoch_loss / (pbar.n + 1))  # Display average loss
            pbar.update(1)

    best_loss = save_checkpoint(model, optimizer, epoch, epoch_loss, best_loss, checkpoint_dir='checkpoints', filename='best_model.pth')
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}")


-1


  inputs_padded = pad_sequence([torch.tensor(inp, dtype=torch.float32) for inp in inputs], batch_first=True)
Epoch 1/30: 100%|██████████| 3144/3144 [52:43<00:00,  1.01s/batch, loss=2.97]


Checkpoint saved at epoch 1 with loss 9325.6330
Epoch [1/30], Loss: 2.9662


Epoch 2/30: 100%|██████████| 3144/3144 [45:16<00:00,  1.16batch/s, loss=2.96]


Checkpoint saved at epoch 2 with loss 9320.7535
Epoch [2/30], Loss: 2.9646


Epoch 3/30: 100%|██████████| 3144/3144 [45:38<00:00,  1.15batch/s, loss=2.96]


Checkpoint saved at epoch 3 with loss 9320.2036
Epoch [3/30], Loss: 2.9644


Epoch 4/30: 100%|██████████| 3144/3144 [45:43<00:00,  1.15batch/s, loss=2.96]


Checkpoint saved at epoch 4 with loss 9319.6337
Epoch [4/30], Loss: 2.9643


Epoch 5/30: 100%|██████████| 3144/3144 [45:33<00:00,  1.15batch/s, loss=2.96]


Checkpoint saved at epoch 5 with loss 9319.3477
Epoch [5/30], Loss: 2.9642


Epoch 6/30: 100%|██████████| 3144/3144 [45:37<00:00,  1.15batch/s, loss=2.96]


Checkpoint saved at epoch 6 with loss 9318.9916
Epoch [6/30], Loss: 2.9641


Epoch 7/30: 100%|██████████| 3144/3144 [45:43<00:00,  1.15batch/s, loss=2.96]


Checkpoint saved at epoch 7 with loss 9318.5504
Epoch [7/30], Loss: 2.9639


Epoch 8/30: 100%|██████████| 3144/3144 [45:40<00:00,  1.15batch/s, loss=2.96]


Checkpoint saved at epoch 8 with loss 9317.9665
Epoch [8/30], Loss: 2.9637


Epoch 9/30: 100%|██████████| 3144/3144 [45:27<00:00,  1.15batch/s, loss=2.96]


Checkpoint saved at epoch 9 with loss 9317.5533
Epoch [9/30], Loss: 2.9636


Epoch 10/30: 100%|██████████| 3144/3144 [44:58<00:00,  1.17batch/s, loss=2.96]


Checkpoint saved at epoch 10 with loss 9317.3327
Epoch [10/30], Loss: 2.9635


Epoch 11/30: 100%|██████████| 3144/3144 [45:02<00:00,  1.16batch/s, loss=2.96]


Checkpoint saved at epoch 11 with loss 9317.2032
Epoch [11/30], Loss: 2.9635


Epoch 12/30: 100%|██████████| 3144/3144 [45:01<00:00,  1.16batch/s, loss=2.96]


Checkpoint saved at epoch 12 with loss 9317.1645
Epoch [12/30], Loss: 2.9635


Epoch 13/30:   2%|▏         | 78/3144 [01:07<44:29,  1.15batch/s, loss=2.97]


KeyboardInterrupt: 

## Save and Load the Model

In [None]:
# # Save the model
# torch.save(model.state_dict(), 'las_model.pth')

# # Load the model
# model.load_state_dict(torch.load('las_model.pth'))
# model.eval()

In [51]:
# Load the model
load_checkpoint('/kaggle/working/checkpoints/best_model.pth', model, optimizer)
model.eval()

Loaded checkpoint from '/kaggle/working/checkpoints/best_model.pth' (epoch 12, loss 9317.1645)


LASModel(
  (listener): Listener(
    (layers): ModuleList(
      (0): LSTM(60, 128, batch_first=True, bidirectional=True)
      (1-9): 9 x LSTM(256, 128, batch_first=True, bidirectional=True)
    )
  )
  (attender): Attender(
    (attn): Linear(in_features=384, out_features=128, bias=True)
    (v): Linear(in_features=128, out_features=1, bias=False)
  )
  (speller): Speller(
    (lstm): LSTM(384, 128, num_layers=10, batch_first=True)
    (fc): Linear(in_features=128, out_features=52, bias=True)
    (embedding): Embedding(52, 128)
  )
)

# Inference (Prediction)

In [36]:
def infer_from_folder(model, folder_path, vocab, output_csv_path, max_length=100):
    """
    Perform inference on all .wav files in a folder and save the transcriptions to a CSV file.

    Parameters:
        model (LASModel): Trained LAS model.
        folder_path (str): Path to the folder containing .wav files.
        vocab (dict): Vocabulary dictionary mapping characters to indices.
        output_csv_path (str): Path to save the output CSV file.
        max_length (int): Maximum length of the output sequence.

    Returns:
        None
    """
    model.eval()
    results = []
    wav_files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]

    # Mapping from indices to characters
    index_to_char = {idx: char for char, idx in vocab.items()}

    with torch.no_grad():
        for wav_file in tqdm(wav_files, desc='Inference', unit='file'):
            audio_id = os.path.splitext(wav_file)[0]  # Remove '.wav' extension to get audio ID
            audio_path = os.path.join(folder_path, wav_file)

            # Load audio
            audio_waveform, sample_rate = torchaudio.load(audio_path)

            # Convert to Mel spectrogram
            mel_transform = torchaudio.transforms.MelSpectrogram(n_mels=feature_dim,sample_rate=sample_rate)
            mel_features = mel_transform(audio_waveform).squeeze(0).transpose(0, 1)

            inputs = torch.tensor(mel_features, dtype=torch.float32).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu')

            # Perform inference
            batch_size = 1
            encoder_outputs = model.listener(inputs)
            outputs = torch.zeros(batch_size, max_length, len(vocab)).to(inputs.device)

            hidden = torch.zeros(model.speller.lstm.num_layers, batch_size, model.speller.lstm.hidden_size).to(inputs.device)
            cell = torch.zeros(model.speller.lstm.num_layers, batch_size, model.speller.lstm.hidden_size).to(inputs.device)
            previous_char = torch.zeros(batch_size, dtype=torch.long).to(inputs.device)  # Start token

            for t in range(1, max_length):
                context, _ = model.attender(encoder_outputs, hidden[-1])
                output, hidden, cell = model.speller(context, hidden, cell, previous_char)
                outputs[:, t, :] = output
                previous_char = output.argmax(1)

            predicted_indices = outputs.argmax(2).cpu().numpy().squeeze()

            # Convert indices to characters
            predicted_transcript = ''.join([index_to_char[idx] for idx in predicted_indices if idx != 0])
            results.append((audio_id, predicted_transcript))

    # Convert the results to a DataFrame and save to CSV
    df = pd.DataFrame(results, columns=['audio', 'transcript'])
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    print(f"Transcriptions saved to {output_csv_path}")

In [48]:
# Example usage
# Assuming you have a trained model and a vocabulary dictionary `vocab`
folder_path = '/kaggle/input/test-data/test/'
infer_from_folder(model, folder_path, speech_dataset.vocab, 'transcriptions.csv')

  inputs = torch.tensor(mel_features, dtype=torch.float32).unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu')
Inference: 100%|██████████| 1726/1726 [04:42<00:00,  6.10file/s]

Transcriptions saved to transcriptions.csv





In [49]:
%ls

[0m[01;34mcheckpoints[0m/  las_model.pth  transcriptions.csv


In [50]:
from IPython.display import FileLink
FileLink(r'transcriptions.csv')

In [47]:
from IPython.display import FileLink
df = pd.read_csv('/kaggle/working/transcriptions.csv')
csv_file = '/kaggle/working/transcriptions.csv'
df.to_csv(csv_file, index=False)
print("CSV file 'df_train_default' exported successfully.")
FileLink(csv_file)

EmptyDataError: No columns to parse from file

In [38]:
def infer(model, audio_path, vocab, max_length=100):
    model.eval()
    with torch.no_grad():
        audio = speech_dataset._load_audio(audio_path)
        mel_features = speech_dataset._extract_features(audio)
        inputs = torch.tensor(mel_features, dtype=torch.float32).unsqueeze(0).to('cuda')

        batch_size = 1
        encoder_outputs = model.listener(inputs)
        outputs = torch.zeros(batch_size, max_length, vocab_size).to(inputs.device)

        hidden = torch.zeros(model.speller.lstm.num_layers, batch_size, model.speller.lstm.hidden_size).to(inputs.device)
        cell = torch.zeros(model.speller.lstm.num_layers, batch_size, model.speller.lstm.hidden_size).to(inputs.device)
        previous_char = torch.zeros(batch_size, dtype=torch.long).to(inputs.device)  # Start token

        for t in range(1, max_length):
            context, _ = model.attender(encoder_outputs, hidden[-1])
            output, hidden, cell = model.speller(context, hidden, cell, previous_char)
            outputs[:, t, :] = output

            previous_char = output.argmax(1)

        predicted_indices = outputs.argmax(2).squeeze().cpu().numpy()
        index_to_char = {idx: char for char, idx in vocab.items()}
        predicted_transcript = ''.join([index_to_char[idx] for idx in predicted_indices if idx != 0])
        return predicted_transcript

In [52]:
# Example usage
# audio_path = '/kaggle/input/arabic-asr/train/train_sample_1.wav'
audio_path = '/kaggle/input/test-data/test/test_sample_1000_clean.wav'
predicted_transcript = infer(model, audio_path, speech_dataset.vocab)
print(predicted_transcript)

ل                                                                                                  


  inputs = torch.tensor(mel_features, dtype=torch.float32).unsqueeze(0).to('cuda')
