In [None]:
import csv
import torchaudio

def is_mp3_valid(file_path):
    """Report whether ``torchaudio.load`` can read ``file_path``.

    Any exception raised while loading is printed together with the path
    and turned into a ``False`` result; a clean load yields ``True``.
    The decoded audio itself is discarded.
    """
    try:
        torchaudio.load(file_path)
    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return False
    else:
        return True

# Source manifest and the filtered copy that keeps only rows whose audio loads.
file_path_eng = '/content/drive/MyDrive/dataset/arabic/ar_en_asr_train_colab.csv'
output_file_path = '/content/drive/MyDrive/dataset/arabic/ar_en_asr_train_colab_corr.csv'

# NOTE(review): the input is parsed as tab-separated while the output is written
# with the csv.writer default delimiter (comma) — confirm the input really is TSV.
with open(file_path_eng, 'r', encoding='utf-8') as tsv_file, open(output_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(tsv_file, delimiter='\t')
    csv_writer = csv.writer(csv_file)

    for row in csv_reader:
        # csv.reader yields an empty list for a blank line; nothing to validate.
        if not row:
            continue
        # The first column is treated as the path of the audio file.
        mp3_path = row[0]
        if not is_mp3_valid(mp3_path):
            continue
        csv_writer.writerow(row)

In [None]:
import csv

input_tsv_file = '/content/drive/MyDrive/dataset/arabic/train.tsv'
output_csv_file = '/content/drive/MyDrive/dataset/arabic/train_colab.csv'
# Directory prefix prepended to the clip file name found in column 1 of the TSV.
clips_dir = '/content/drive/MyDrive/dataset/arabic/clips/'

# Convert the tab-separated manifest into a two-column (audio, transcript) CSV.
with open(input_tsv_file, 'r', encoding='utf-8') as tsv_file, open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
    tsv_reader = csv.reader(tsv_file, delimiter='\t')
    csv_writer = csv.writer(csv_file)

    # Write header to CSV file
    csv_writer.writerow(['audio', 'transcript'])

    # Skip the header in TSV file; the default keeps an empty input file from
    # raising StopIteration (the output then contains only the header row).
    next(tsv_reader, None)

    # Process each row in TSV file and write to CSV file
    for row in tsv_reader:
        # Blank lines are returned as [] and truncated lines lack the transcript
        # column; indexing them would raise IndexError, so they are skipped.
        if len(row) < 3:
            continue
        audio_path = clips_dir + row[1]
        transcript = row[2]
        csv_writer.writerow([audio_path, transcript])

In [None]:
# Mount Google Drive at /content/drive; the dataset and tokenizer paths used in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.4.3-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.4.3-py3-none-any.whl (869 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m869.5/869.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.8 torchmetrics-1.4.3


In [None]:
import os
# from ctcdecode import CTCBeamDecoder
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np
import torchmetrics
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import random
from einops.layers.torch import Rearrange
from torch.nn import MultiheadAttention
import pandas as pd

class InfoNCE(nn.Module):
    """``nn.Module`` front-end for :func:`info_nce`.

    The constructor only stores the three loss settings; ``forward`` passes
    them, together with the tensors it receives, straight to ``info_nce``.

    Args:
        temperature: Divisor applied to the logits inside ``info_nce``.
        reduction: Reduction mode handed to ``F.cross_entropy``.
        negative_mode: ``'unpaired'`` or ``'paired'``; selects how
            ``negative_keys`` is interpreted by ``info_nce``.
    """

    def __init__(self, temperature=0.1, reduction='mean', negative_mode='unpaired'):
        super(InfoNCE, self).__init__()
        self.negative_mode = negative_mode
        self.reduction = reduction
        self.temperature = temperature

    def forward(self, query, positive_key, negative_keys=None):
        """Return ``info_nce(query, positive_key, negative_keys)`` with the stored settings."""
        settings = dict(
            temperature=self.temperature,
            reduction=self.reduction,
            negative_mode=self.negative_mode,
        )
        return info_nce(query, positive_key, negative_keys, **settings)


def info_nce(query, positive_key, negative_keys=None, temperature=0.1, reduction='mean', negative_mode='unpaired'):
    """Compute the InfoNCE loss as a cross-entropy over cosine-similarity logits.

    Args:
        query: 2-D tensor, one embedding per row.
        positive_key: 2-D tensor with the same number of rows and the same
            embedding size as ``query``.
        negative_keys: Optional explicit negatives. Must be 2-D when
            ``negative_mode == 'unpaired'`` and 3-D (first dimension matching
            ``query``) when ``negative_mode == 'paired'``; the last dimension
            must equal the embedding size of ``query``. When omitted, the
            other rows of ``positive_key`` act as negatives.
        temperature: The logits are divided by this value.
        reduction: Forwarded to ``F.cross_entropy``.
        negative_mode: ``'unpaired'`` or ``'paired'``.

    Returns:
        The value returned by ``F.cross_entropy``.

    Raises:
        ValueError: If any of the rank / size checks below fails.
    """
    has_negatives = negative_keys is not None

    # --- rank checks -------------------------------------------------------
    if query.dim() != 2:
        raise ValueError('<query> must have 2 dimensions.')
    if positive_key.dim() != 2:
        raise ValueError('<positive_key> must have 2 dimensions.')
    if has_negatives and negative_mode == 'unpaired' and negative_keys.dim() != 2:
        raise ValueError("<negative_keys> must have 2 dimensions if <negative_mode> == 'unpaired'.")
    if has_negatives and negative_mode == 'paired' and negative_keys.dim() != 3:
        raise ValueError("<negative_keys> must have 3 dimensions if <negative_mode> == 'paired'.")

    # --- sample-count checks -----------------------------------------------
    if len(query) != len(positive_key):
        raise ValueError('<query> and <positive_key> must must have the same number of samples.')
    if has_negatives and negative_mode == 'paired' and len(query) != len(negative_keys):
        raise ValueError("If negative_mode == 'paired', then <negative_keys> must have the same number of samples as <query>.")

    # --- embedding-size checks ---------------------------------------------
    embed_dim = query.shape[-1]
    if embed_dim != positive_key.shape[-1]:
        raise ValueError('Vectors of <query> and <positive_key> should have the same number of components.')
    if has_negatives and embed_dim != negative_keys.shape[-1]:
        raise ValueError('Vectors of <query> and <negative_keys> should have the same number of components.')

    # Unit-length vectors, so every dot product below is a cosine similarity.
    query, positive_key, negative_keys = normalize(query, positive_key, negative_keys)

    if not has_negatives:
        # Implicit negatives: all pairwise similarities, with the matching
        # (positive) pair of each row sitting on the diagonal.
        logits = query @ transpose(positive_key)
        labels = torch.arange(len(query), device=query.device)
        return F.cross_entropy(logits / temperature, labels, reduction=reduction)

    # Explicit negatives: similarity of each query with its own positive key.
    positive_logit = (query * positive_key).sum(dim=1, keepdim=True)

    if negative_mode == 'unpaired':
        # Every query is compared against every negative key.
        negative_logits = query @ transpose(negative_keys)
    elif negative_mode == 'paired':
        # Each query is compared only against its own group of negatives.
        query = query.unsqueeze(1)
        negative_logits = (query @ transpose(negative_keys)).squeeze(1)

    # The positive occupies column 0, hence the all-zero class labels.
    logits = torch.cat([positive_logit, negative_logits], dim=1)
    labels = torch.zeros(len(logits), dtype=torch.long, device=query.device)
    return F.cross_entropy(logits / temperature, labels, reduction=reduction)


def transpose(x):
    """Return ``x`` with its last two dimensions swapped."""
    return torch.transpose(x, -2, -1)


def normalize(*xs):
    """L2-normalise each tensor along its last dimension.

    ``None`` arguments are passed through unchanged. The results are
    returned as a list in the same order as the arguments.
    """
    unit_vectors = []
    for x in xs:
        unit_vectors.append(x if x is None else F.normalize(x, dim=-1))
    return unit_vectors



class LibriSpeechDataset(Dataset):
    """Dataset of (context, future, negative) waveform triples for contrastive training.

    Each item is built from one audio file: a fixed-length crop is taken, a
    ``context`` window and the ``future`` window immediately following it are
    cut from that crop, and a crop from a *different*, randomly chosen file
    serves as the negative sample.

    Args:
        audio_files: Sequence of audio file paths.
        waveform_length: Number of samples every positive crop is cut/padded to.
        context_length: Number of samples in the context window.
        future_length: Number of samples in the future window.
        negative_waveform_length: Number of samples the negative crop is
            cut/padded to.
    """

    def __init__(self, audio_files, waveform_length, context_length, future_length, negative_waveform_length):
        self.audio_files = audio_files
        self.waveform_length = waveform_length
        self.context_length = context_length
        self.future_length = future_length
        self.negative_waveform_length = negative_waveform_length

    def __len__(self):
        return len(self.audio_files)

    def load_waveform(self, audio_path, waveform_length):
        """Load ``audio_path`` and return exactly ``waveform_length`` samples per channel.

        Longer recordings are cropped at a random offset; shorter ones are
        zero-padded on the right.
        """
        waveform, _ = torchaudio.load(audio_path)
        n_samples = waveform.size(1)
        if n_samples > waveform_length:
            start_idx = random.randint(0, n_samples - waveform_length)
            waveform = waveform[:, start_idx: start_idx + waveform_length]
        else:
            pad_length = waveform_length - n_samples
            waveform = torch.nn.functional.pad(waveform, (0, pad_length))
        return waveform

    def __getitem__(self, idx):
        """Return ``(context, future, negative_sample, context_num_samples)``.

        Raises:
            ValueError: If the dataset holds fewer than two files (no distinct
                negative exists) or if the context and future windows do not
                fit inside ``waveform_length``.
        """
        n_files = len(self.audio_files)
        if n_files < 2:
            # The rejection loop below can only terminate when another file
            # exists; without this guard it would spin forever.
            raise ValueError(
                'LibriSpeechDataset needs at least 2 audio files to draw a negative sample, got %d.' % n_files)

        max_start = self.waveform_length - self.context_length - self.future_length
        if max_start < 0:
            raise ValueError(
                'context_length + future_length (%d) exceeds waveform_length (%d).'
                % (self.context_length + self.future_length, self.waveform_length))

        audio_path = self.audio_files[idx]
        waveform = self.load_waveform(audio_path, self.waveform_length)

        # Context window, immediately followed by the future window.
        start_idx = random.randint(0, max_start)
        context_end = start_idx + self.context_length
        context = waveform[:, start_idx: context_end]
        future = waveform[:, context_end: context_end + self.future_length]

        # Negative sample: redraw until the index differs from the positive one.
        negative_idx = random.randint(0, n_files - 1)
        while negative_idx == idx:
            negative_idx = random.randint(0, n_files - 1)

        negative_audio_path = self.audio_files[negative_idx]
        negative_sample = self.load_waveform(negative_audio_path, self.negative_waveform_length)

        # Return context, future, negative sample, and context length in samples
        return context, future, negative_sample, context.size(1)



class preCNNLayerNorm(nn.Module):
    """Layer normalisation over the feature axis of a CNN activation.

    ``nn.LayerNorm`` normalises the last axis, so the feature axis (dim 2) is
    swapped into last position, normalised, and swapped back.
    """

    def __init__(self, n_feats):
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # in:  (batch, channel, feature, time)
        feature_last = x.swapaxes(2, 3).contiguous()  # (batch, channel, time, feature)
        normalised = self.layer_norm(feature_last)
        # out: (batch, channel, feature, time)
        return normalised.swapaxes(2, 3).contiguous()



class preResidualCNN(nn.Module):
    """Self-attention followed by four (norm -> GELU -> dropout -> conv) stages and a skip connection.

    The first three convolutions work with ``out_channels * 2`` channels and
    the last one maps back to ``out_channels``; the block input is then added
    to the result, so input and output must have the same shape.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super().__init__()
        same_pad = kernel // 2
        wide = out_channels * 2

        # NOTE(review): built without batch_first=True, so PyTorch treats dim 0
        # of the attention input (the batch axis in forward) as the sequence
        # axis — confirm this is intended.
        self.self_attention1 = MultiheadAttention(embed_dim=in_channels, num_heads=8)
        self.cnn1 = nn.Conv2d(in_channels, wide, kernel, stride, padding=same_pad)
        self.cnn2 = nn.Conv2d(wide, wide, kernel, stride, padding=same_pad)
        self.cnn3 = nn.Conv2d(wide, wide, kernel, stride, padding=same_pad)
        self.cnn4 = nn.Conv2d(wide, out_channels, kernel, stride, padding=same_pad)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.dropout4 = nn.Dropout(dropout)
        self.layer_norm1 = preCNNLayerNorm(n_feats)
        self.layer_norm2 = preCNNLayerNorm(n_feats)
        self.layer_norm3 = preCNNLayerNorm(n_feats)
        self.layer_norm4 = preCNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        shape = x.size()

        # Flatten (feature, time) into one axis and move channels last so the
        # channel axis is the attention embedding dimension.
        tokens = x.view(shape[0], shape[1], shape[2] * shape[3]).transpose(1, 2)
        attended, _ = self.self_attention1(tokens, tokens, tokens)
        x = attended.transpose(1, 2).view(shape)

        stages = (
            (self.layer_norm1, self.dropout1, self.cnn1),
            (self.layer_norm2, self.dropout2, self.cnn2),
            (self.layer_norm3, self.dropout3, self.cnn3),
            (self.layer_norm4, self.dropout4, self.cnn4),
        )
        for norm, drop, conv in stages:
            x = conv(drop(F.gelu(norm(x))))

        x += residual
        return x  # (batch, channel, feature, time)


class preBidirectionalGRU(nn.Module):
    """LayerNorm -> GELU -> single-layer bidirectional GRU -> dropout.

    The output feature size is ``2 * hidden_size`` because the forward and
    backward GRU outputs are concatenated.
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super().__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        activated = F.gelu(self.layer_norm(x))
        # The final hidden state returned by the GRU is not needed.
        outputs, _ = self.BiGRU(activated)
        return self.dropout(outputs)


class preSpeechRecognitionModel(nn.Module):
    """Conv front-end + residual CNN stack + BiGRU stack that emits one vector per input.

    ``forward`` keeps only the last time step of the classifier output, so the
    result has shape ``(batch, 1, n_class)``.

    Args:
        n_cnn_layers: Number of ``preResidualCNN`` blocks.
        n_rnn_layers: Number of ``preBidirectionalGRU`` layers.
        rnn_dim: Hidden size of each GRU direction.
        n_class: Size of the output vector.
        n_feats: Size of the feature axis of the input.
        stride: Stride of the first convolution.
        dropout: Dropout probability used throughout.
        n_predictions: Stored on the instance; not used by this class itself.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1, n_predictions=5):
        super(preSpeechRecognitionModel, self).__init__()
        # Feature-axis size after the first conv (kernel 3, padding 1). Equals
        # n_feats // 2 for stride 2 with even n_feats, and stays correct for
        # other strides / odd sizes.
        n_feats = (n_feats - 1) // stride + 1
        self.n_predictions = n_predictions
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting hierarchical features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            preResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        # Every layer receives and returns (batch, time, feature), so every GRU
        # must be batch_first. Previously only layer 0 was, which made the
        # deeper layers run their recurrence along the batch axis.
        self.birnn_layers = nn.Sequential(*[
            preBidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim*2,
                                hidden_size=rnn_dim, dropout=dropout, batch_first=True)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim*2),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim*2, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2)  # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        # Keep only the final time step, preserving the time axis (size 1).
        x = x[:, -1:, :]
        return x


class CNNLayerNorm(nn.Module):
    """Layer normalisation over the feature axis of a CNN activation.

    ``nn.LayerNorm`` normalises the last axis, so the feature axis (dim 2) is
    swapped into last position, normalised, and swapped back.
    """

    def __init__(self, n_feats):
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # in:  (batch, channel, feature, time)
        feature_last = x.swapaxes(2, 3).contiguous()  # (batch, channel, time, feature)
        normalised = self.layer_norm(feature_last)
        # out: (batch, channel, feature, time)
        return normalised.swapaxes(2, 3).contiguous()






class ResidualCNN(nn.Module):
    """Self-attention followed by four (norm -> GELU -> dropout -> conv) stages and a skip connection.

    The first three convolutions work with ``out_channels * 2`` channels and
    the last one maps back to ``out_channels``; the block input is then added
    to the result, so input and output must have the same shape.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super().__init__()
        same_pad = kernel // 2
        wide = out_channels * 2

        # NOTE(review): built without batch_first=True, so PyTorch treats dim 0
        # of the attention input (the batch axis in forward) as the sequence
        # axis — confirm this is intended.
        self.self_attention1 = MultiheadAttention(embed_dim=in_channels, num_heads=8)
        self.cnn1 = nn.Conv2d(in_channels, wide, kernel, stride, padding=same_pad)
        self.cnn2 = nn.Conv2d(wide, wide, kernel, stride, padding=same_pad)
        self.cnn3 = nn.Conv2d(wide, wide, kernel, stride, padding=same_pad)
        self.cnn4 = nn.Conv2d(wide, out_channels, kernel, stride, padding=same_pad)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.dropout4 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)
        self.layer_norm3 = CNNLayerNorm(n_feats)
        self.layer_norm4 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        shape = x.size()

        # Flatten (feature, time) into one axis and move channels last so the
        # channel axis is the attention embedding dimension.
        tokens = x.view(shape[0], shape[1], shape[2] * shape[3]).transpose(1, 2)
        attended, _ = self.self_attention1(tokens, tokens, tokens)
        x = attended.transpose(1, 2).view(shape)

        stages = (
            (self.layer_norm1, self.dropout1, self.cnn1),
            (self.layer_norm2, self.dropout2, self.cnn2),
            (self.layer_norm3, self.dropout3, self.cnn3),
            (self.layer_norm4, self.dropout4, self.cnn4),
        )
        for norm, drop, conv in stages:
            x = conv(drop(F.gelu(norm(x))))

        x += residual
        return x  # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):
    """LayerNorm -> GELU -> single-layer bidirectional GRU -> dropout.

    The output feature size is ``2 * hidden_size`` because the forward and
    backward GRU outputs are concatenated.
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super().__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        activated = F.gelu(self.layer_norm(x))
        # The final hidden state returned by the GRU is not needed.
        outputs, _ = self.BiGRU(activated)
        return self.dropout(outputs)


class SpeechRecognitionModel(nn.Module):
    """Conv front-end + residual CNN stack + BiGRU stack + per-frame classifier.

    ``forward`` maps a ``(batch, 1, n_feats, time)`` input to per-frame class
    scores of shape ``(batch, time', n_class)``, where ``time'`` is the time
    axis after the strided first convolution.

    Args:
        n_cnn_layers: Number of ``ResidualCNN`` blocks.
        n_rnn_layers: Number of ``BidirectionalGRU`` layers.
        rnn_dim: Hidden size of each GRU direction.
        n_class: Number of output classes per frame.
        n_feats: Size of the feature axis of the input.
        stride: Stride of the first convolution.
        dropout: Dropout probability used throughout.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        # Feature-axis size after the first conv (kernel 3, padding 1). Equals
        # n_feats // 2 for stride 2 with even n_feats, and stays correct for
        # other strides / odd sizes.
        n_feats = (n_feats - 1) // stride + 1
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting hierarchical features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        # Every layer receives and returns (batch, time, feature), so every GRU
        # must be batch_first. Previously only layer 0 was, which made the
        # deeper layers run their recurrence along the batch axis.
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=True)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim*4),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim*4, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2)  # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x


import sentencepiece as spm

class SentencePieceTransform:
    """Text <-> integer-id conversion backed by a SentencePiece model file."""

    def __init__(self, model_path):
        processor = spm.SentencePieceProcessor()
        processor.Load(model_path)
        self.sp = processor

    def text_to_int(self, text):
        """Lower-case ``text``, split it into SentencePiece pieces and return their ids."""
        pieces = self.sp.EncodeAsPieces(text.lower())
        return list(map(self.sp.PieceToId, pieces))

    def int_to_text(self, labels):
        """Decode a sequence of integer ids back into text."""
        return self.sp.decode(labels)

# Module-level tokenizer instance; data_processing and numtoword look it up by this name.
sentencepiece_transform = SentencePieceTransform("/content/drive/MyDrive/dataset/data/spm_unigram_1023.model")


def get_audio_transforms():
    """Build the training and validation feature pipelines.

    Returns:
        ``(train_audio_transform, valid_audio_transform)``. The training
        pipeline is an ``nn.Sequential`` of a mel spectrogram, one frequency
        mask and ten time masks; the validation pipeline is the mel
        spectrogram alone.
    """
    mel_kwargs = dict(sample_rate=16000, n_mels=80, hop_length=160)

    train_stages = [
        torchaudio.transforms.MelSpectrogram(**mel_kwargs),
        torchaudio.transforms.FrequencyMasking(freq_mask_param=27),
    ]
    train_stages.extend(
        torchaudio.transforms.TimeMasking(time_mask_param=15, p=0.05) for _ in range(10)
    )
    train_audio_transform = nn.Sequential(*train_stages)

    valid_audio_transform = torchaudio.transforms.MelSpectrogram(**mel_kwargs)
    return train_audio_transform, valid_audio_transform

# Build both pipelines once; the collate functions below look these two names up at call time.
train_audio_transforms, valid_audio_transforms = get_audio_transforms()

def data_processing(data, data_type="train"):
    """Collate 6-field samples into padded spectrogram / label batches.

    Args:
        data: Iterable of 6-tuples; field 0 is the waveform and field 2 the
            transcript. The remaining fields are ignored.
        data_type: ``'train'`` (augmenting transform) or ``'valid'``.

    Returns:
        ``(spectrograms, labels, input_lengths, label_lengths)``. Spectrograms
        and labels are zero-padded batches; ``input_lengths`` holds half the
        frame count of each spectrogram and ``label_lengths`` the token count
        of each transcript.

    Raises:
        Exception: If ``data_type`` is neither ``'train'`` nor ``'valid'``.
    """
    spectrograms, labels = [], []
    input_lengths, label_lengths = [], []

    for sample in data:
        waveform, _f1, utterance, _f3, _f4, _f5 = sample

        if data_type == 'train':
            transform = train_audio_transforms
        elif data_type == 'valid':
            transform = valid_audio_transforms
        else:
            raise Exception('data_type should be train or valid')

        # Drop the leading axis and put time first so pad_sequence pads along time.
        spec = transform(waveform).squeeze(0).transpose(0, 1)
        label = torch.Tensor(sentencepiece_transform.text_to_int(utterance))

        spectrograms.append(spec)
        labels.append(label)
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    padded_specs = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
    spectrograms = padded_specs.unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths

def numtoword(beam_results, out_lens, labels, label_lengths,blank_label=0, collapse_repeated=True):
    """Turn decoded id sequences and padded label rows into text.

    Args:
        beam_results: Iterable of id sequences, one per utterance; the
            elements must support ``.item()``.
        out_lens: Unused.
        labels: Padded label batch; row ``i`` is cut to ``label_lengths[i]``.
        label_lengths: True length of each label row.
        blank_label: Id that is dropped from the decoded sequences.
        collapse_repeated: When true, an id equal to the one directly before
            it in the sequence is dropped.

    Returns:
        ``(decodes, targets)``: two lists of strings of equal length.
    """
    decodes, targets = [], []

    for i, args in enumerate(beam_results):
        # Reference text: trim the padding, cast to int and drop id 0.
        reference = [int(v) for v in labels[i][:label_lengths[i]].tolist()]
        reference = [v for v in reference if v != 0]
        targets.append(sentencepiece_transform.int_to_text(reference))

        decode = []
        for j, index in enumerate(args):
            keep = index != blank_label
            if keep and collapse_repeated and j != 0 and index == args[j-1]:
                keep = False
            if keep:
                decode.append(index.item())
        decodes.append(sentencepiece_transform.int_to_text(decode))

    return decodes, targets


def loss_F(parameters):
    """Return the sum of squared norms of all tensors in ``parameters``.

    Yields the integer ``0`` when ``parameters`` is empty.
    """
    total = 0
    for w in parameters:
        total = total + torch.linalg.norm(w) ** 2
    return total




# InfoNCE criterion with its default settings; train() uses it for the contrastive (CPC) term.
loss_fn = InfoNCE()

def train(model, premodel, device, train_loader, train_loader2, criterion, optimizer, preoptimizer, scheduler,
                                     prescheduler, epoch, gam, optimizer1, preoptimizer1):
    """Run one epoch of joint contrastive (CPC) + CTC training.

    The two loaders are consumed in lock-step (``zip``), so the epoch ends when
    the shorter one is exhausted. For every batch pair the InfoNCE loss of
    ``premodel`` (plus an L2 penalty on its parameters) is computed, scaled by
    ``gam`` and added to the CTC loss of ``model``; the sum is back-propagated
    and ``optimizer`` takes a step.

    Args:
        model: Network trained with ``criterion`` on ``train_loader`` batches.
        premodel: Network producing the CPC predictions from the context window.
        device: Device the batches are moved to.
        train_loader: Yields ``(spectrograms, labels, input_lengths, label_lengths)``.
        train_loader2: Yields ``(context, future, negative_samples, lengths)``.
        criterion: Called as ``criterion(log_probs, labels, input_lengths, label_lengths)``.
        optimizer: Optimizer that is stepped every batch.
        preoptimizer: Only its gradients are cleared here; it is never stepped.
        scheduler, prescheduler: Stepped once at the end of the epoch.
        epoch: Epoch number, used for logging only.
        gam: Weight of the CPC term (rounded to 3 decimals).
        optimizer1, preoptimizer1: Unused; kept for interface compatibility.

    Returns:
        ``(train_loss, info_loss)``: the per-batch combined loss averaged over
        ``len(train_loader)`` and the per-batch CPC loss averaged over
        ``len(train_loader2)``.
    """
    model.train()
    premodel.train()

    train_loss = 0
    info_loss = 0

    data_len = len(train_loader.dataset)
    data_len2 = len(train_loader2.dataset)

    # Loop invariants: L2 weight of the CPC regulariser and the rounded CPC weight.
    lamda = .001
    gam = round(gam, 3)

    for batch_idx, (_data, predata) in enumerate(zip(train_loader, train_loader2)):

        ###################### Contrastive (CPC) portion ######################
        context, future, negative_samples, lengths = predata
        context = context.to(device)
        future = future.to(device)
        negative_samples = negative_samples.to(device)

        # Add a channel axis and tile the waveform 80 times along the feature
        # axis so it matches the (batch, 1, feature, time) layout of premodel.
        context = context.unsqueeze(1)
        context = context.repeat(1, 1, 80, 1)

        predictions = premodel(context)
        sizes = predictions.size()

        # Flatten everything to (batch, features) as required by InfoNCE.
        predictions = predictions.view(sizes[0], sizes[1]*sizes[2])
        target = future.view(sizes[0], sizes[1]*sizes[2])
        neg_target = negative_samples.view(sizes[0], sizes[1]*sizes[2])

        reg = loss_F(premodel.parameters())
        loss_cpc = loss_fn(predictions, target, neg_target) + lamda*reg  # gxy

        if batch_idx % 400 == 0 or batch_idx == data_len2:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tCPC_Loss: {:.6f}'.format(
                epoch, batch_idx * len(context), data_len2,
                100. * batch_idx / len(train_loader2), loss_cpc.item()))

        ###################### Supervised training portion ######################
        spectrograms, labels, input_lengths, label_lengths = _data
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        output = model(spectrograms)  # (batch, time, n_class)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1)  # (time, batch, n_class)

        ctc_loss = criterion(output, labels, input_lengths, label_lengths)+gam*loss_cpc

        # Clear gradients left over from the previous batch; without this they
        # accumulate across the whole epoch.
        optimizer.zero_grad()
        preoptimizer.zero_grad()
        ctc_loss.backward()

        # NOTE(review): only `optimizer` is stepped. If it does not hold the
        # premodel parameters, premodel is never updated and the CPC term has
        # no effect on training — confirm which parameters it owns.
        optimizer.step()

        train_loss += ctc_loss.item() / len(train_loader)
        info_loss += loss_cpc.item() / len(train_loader2)

        if batch_idx % 400 == 0 or batch_idx == data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tCTC_Loss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / len(train_loader), ctc_loss.item()))

            print(f'gamma: {gam}')
    print(f'train_loss: {train_loss}')
    scheduler.step()
    prescheduler.step()

    return train_loss, info_loss


def test(model, device, test_loader, criterion, epoch):
    """Evaluate ``model`` on ``test_loader``.

    The average loss is always computed. Beam-search decoding with CER / WER
    scoring is run only when ``epoch % 2000 == 0``.

    Args:
        model: Network to evaluate; called as ``model(spectrograms)``.
        device: Device the batches are moved to.
        test_loader: Yields ``(spectrograms, labels, input_lengths, label_lengths)``.
        criterion: Called as ``criterion(log_probs, labels, input_lengths, label_lengths)``.
        epoch: Current epoch; selects whether decoding is performed.

    Returns:
        ``(test_loss, avg_cer, avg_wer)``. The two error rates are ``0`` when
        decoding was skipped or produced no utterances.
    """
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    n_classes = 1000

    run_decoding = epoch % 2000 == 0
    decoder = None
    if run_decoding:
        # Built once per evaluation instead of once per batch.
        # NOTE(review): CTCBeamDecoder must be in scope; its import at the top
        # of this cell is commented out.
        decoder = CTCBeamDecoder(
            [''] * (n_classes - 1) + [' '],
            model_path=None,
            alpha=0,
            beta=0,
            cutoff_top_n=40,
            cutoff_prob=1.0,
            beam_width=1000,
            num_processes=4,
            blank_id=0,
            log_probs_input=False
        )

    with torch.no_grad():
        for _data in test_loader:
            spectrograms, labels, input_lengths, label_lengths = _data
            spectrograms, labels = spectrograms.to(device), labels.to(device)

            output = model(spectrograms)  # (batch, time, n_class)
            log_probs = F.log_softmax(output, dim=2).transpose(0, 1)  # (time, batch, n_class)
            loss = criterion(log_probs, labels, input_lengths, label_lengths)
            test_loss += loss.item() / len(test_loader)

            if not run_decoding:
                continue

            soft_max = torch.nn.functional.softmax(output, dim=2)
            output_lengths = torch.full((output.size(0),), output.size(1), dtype=torch.int32)
            beam_results, beam_scores, timesteps, out_lens = decoder.decode(soft_max, output_lengths)

            # Best beam of every utterance, trimmed to its decoded length.
            best_beams = [beam_results[k][0][:out_lens[k][0]] for k in range(spectrograms.size(0))]
            decoded_preds, decoded_targets = numtoword(best_beams, out_lens, labels, label_lengths)

            for target_text, pred_text in zip(decoded_targets, decoded_preds):
                # char_error_rate expects (preds, target) in that order.
                test_cer.append(torchmetrics.functional.char_error_rate(pred_text, target_text))
                # Word error rate: edit distance over word sequences, normalised
                # by the reference word count (guarded against empty references).
                target_words = target_text.split()
                pred_words = pred_text.split()
                test_wer.append(
                    torchaudio.functional.edit_distance(target_words, pred_words) / max(len(target_words), 1))

    if not run_decoding:
        print('Test set: Average loss: {:.4f}\n'.format(test_loss))
        return test_loss, 0 , 0

    avg_cer = sum(test_cer)/len(test_cer) if test_cer else 0
    avg_wer = sum(test_wer)/len(test_wer) if test_wer else 0

    print('Test set: Average loss: {:.4f}, Average CER: {:.4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))

    return test_loss, avg_cer, avg_wer


class ASR(Dataset):
    """
    Stores a Pandas DataFrame in __init__, and loads the audio of one example in __getitem__.

    The CSV at ``path`` is expected to have exactly two columns: the audio file
    path and the transcript (each row is unpacked into two values).
    """

    # Sample rate the mel-spectrogram transforms in this notebook are built for.
    SAMPLE_RATE = 16000

    def __init__(self, split, path, augmentation):
        """
        Args:
            split (string): Name of the subset, e.g. 'train', 'dev' or 'test'.
            path (string): CSV manifest to read.
            augmentation (bool): Apply SpecAugment to training data or not.
        """
        # Every split reads its manifest the same way. Previously only 'train'
        # and 'test' set self.df, so 'dev' failed later with AttributeError.
        self.df = pd.read_csv(path)
        self.augmentation = (augmentation and (split.upper() == 'TRAIN'))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """
        Returns:
            x: The waveform returned by ``librosa.load`` for the row's audio file,
               resampled to ``SAMPLE_RATE``.
            y: The transcript stored in the row.
        """
        audio_path, transcript = self.df.iloc[idx]
        # librosa resamples to 22050 Hz by default; request the rate the
        # spectrogram transforms assume instead.
        # NOTE(review): `librosa` is not imported in the visible part of this
        # notebook — confirm it is imported before this runs.
        waveform, _ = librosa.load(audio_path, sr=self.SAMPLE_RATE)

        return waveform, transcript

def data_processing_c(data, data_type="train", time_reduction=2):
    """Collate ``(waveform, transcript)`` pairs into padded spectrogram / label batches.

    Args:
        data: Iterable of ``(waveform, utterance)`` pairs. The waveform may be
            a tensor or a numpy array.
        data_type: ``'train'`` selects the augmenting transform; ``'valid'``,
            ``'test'`` and ``'dev'`` select the plain mel spectrogram.
        time_reduction: Factor by which the model shortens the time axis; the
            reported input lengths are divided by it. The default of 2 mirrors
            ``data_processing`` and the stride-2 first convolution of the
            models in this notebook.

    Returns:
        ``(spectrograms, labels, input_lengths, label_lengths)``.

    Raises:
        ValueError: If ``data_type`` is not one of the supported names.
    """
    # The original test `data_type == 'test' or "valid"` was always true, so
    # unknown names silently fell through to the validation transform.
    if data_type == 'train':
        audio_transform = train_audio_transforms
    elif data_type in ('test', 'valid', 'dev'):
        audio_transform = valid_audio_transforms
    else:
        raise ValueError("data_type should be 'train', 'valid', 'test' or 'dev', got %r" % (data_type,))

    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, utterance) in data:
        # The ASR dataset yields numpy arrays; the torchaudio transforms need
        # tensors. as_tensor leaves an existing tensor untouched.
        waveform = torch.as_tensor(waveform)
        spec = audio_transform(waveform).squeeze(0).transpose(0, 1)  # (time, n_mels)
        spectrograms.append(spec)
        label = torch.Tensor(sentencepiece_transform_c.text_to_int(utterance))
        labels.append(label)
        # CTC input lengths must not exceed the model's output time axis.
        input_lengths.append(spec.shape[0] // time_reduction)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths


def load(split, path, batch_size, workers=0, augmentation=False):
    """
    Build a shuffling DataLoader over the ``ASR`` dataset stored at ``path``.

    Args:
        split (string): Which of the subset of data to take. One of 'train', 'dev' or 'test'.
        path (string): CSV manifest handed to ``ASR``.
        batch_size (integer): Batch size.
        workers (integer): How many subprocesses to use for data loading.
        augmentation (bool): Apply SpecAugment to training data or not.

    Returns:
        loader (DataLoader): Batches are collated by ``data_processing_c`` with
        ``split`` as its ``data_type``.
    """
    assert split in ['train', 'dev', 'test']

    dataset = ASR(split, path, augmentation)
    print ("%s set size:"%split.upper(), len(dataset))

    def collate(batch):
        # The split name doubles as the data_type of the collate function.
        return data_processing_c(batch, split)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate,
        num_workers=workers,
        pin_memory=True,
    )
    return DataLoader(dataset, **loader_options)

def get_audio_files_wav(data_dir):
    """Recursively collect the paths of all files under ``data_dir`` ending in ``.mp3``.

    The extension check is case-insensitive. Despite the function name, the
    filter matches ``.mp3`` files.
    """
    matches = []
    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if file.lower().endswith('.mp3'):
                matches.append(os.path.join(root, file))
    return matches



# Tokenizer instance that data_processing_c looks up by this name (Arabic unigram model file).
sentencepiece_transform_c = SentencePieceTransform("/content/drive/MyDrive/dataset/arabic/arabic_unigram1000_model.model")

def main(learning_rate=5e-4, batch_size=10, epochs=10,
        train_url="train-clean-100", test_url="test-clean"):
    """Build the data loaders, models and optimizers, then run the train/test loop.

    Two models are created: the supervised ``SpeechRecognitionModel`` (trained
    with CTC loss on the CSV manifests) and a ``preSpeechRecognitionModel`` fed
    from raw audio clips through ``LibriSpeechDataset``. Both are handed to
    ``train`` every epoch together with a weight ``gam`` that grows linearly
    from ``gamma_init`` and is capped at ``gamma_max``.

    Args:
        learning_rate (float): Learning rate of ``optimizer`` and
            ``preoptimizer1``, and ``max_lr`` of the main OneCycleLR scheduler.
        batch_size (int): Batch size of the train, test and pre-training loaders.
        epochs (int): Number of train/test rounds; also the epoch count given
            to both OneCycleLR schedulers.
        train_url (str): Unused; kept so existing callers keep working.
        test_url (str): Unused; kept so existing callers keep working.

    Returns:
        tuple: ``(train_loss, test_loss, cer, wer, Info_loss)`` — five lists
        with one entry per epoch, filled from the values returned by
        ``train`` and ``test``.
    """
    hparams = {
        "n_cnn_layers": 3, #### 2,3,4
        "n_rnn_layers": 5, ##### 4,5,6
        "rnn_dim": 512, #512,
        "n_class": 1000,
        "n_feats": 80,
        "stride":2,
        "dropout": 0.05,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    # ------------------------------------------------------------------
    # Supervised (fine-tuning) data
    # ------------------------------------------------------------------
    # NOTE(review): the recorded run of this notebook stopped with a
    # FileNotFoundError for a clip under .../arabic/clips — presumably an
    # entry of one of these manifests; verify that every listed file exists.
    path_train_e = "/content/drive/MyDrive/dataset/arabic/train_colab.csv"
    path_test_e = "/content/drive/MyDrive/dataset/arabic/ar_en_asr_test_colab.csv"

    # Honour the requested batch size instead of a hard-coded 10, so the
    # loaders agree with hparams and with steps_per_epoch used below.
    train_loader = load('train', path_train_e, hparams['batch_size'])
    test_loader = load('test', path_test_e, hparams['batch_size'])

    # ------------------------------------------------------------------
    # Supervised model, loss, optimizers and scheduler
    # ------------------------------------------------------------------
    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
        )

    model.to(device)
    model = nn.DataParallel(model)

    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    optimizer = optim.AdamW(model.parameters(), lr=hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=1, zero_infinity=True).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'],
                                            steps_per_epoch=int(len(train_loader)),
                                            epochs=hparams['epochs'],
                                            anneal_strategy='linear')
    optimizer1 = optim.AdamW(model.parameters(), lr=5e-4) ###. fine-tuning learning rate

    # ------------------------------------------------------------------
    # Pre-training data
    # ------------------------------------------------------------------
    data_dir = "/content/drive/MyDrive/dataset/arabic/clips" # pre-training data path.

    audio_files = get_audio_files_wav(data_dir)

    waveform_length = 32000  # Length of the waveform (can be adjusted as needed)
    context_length = 256  # Length of the context wave
    future_length = 12  # Length of the future samples
    negative_waveform_length = 12

    train_dataset2 = LibriSpeechDataset(audio_files, waveform_length, context_length, future_length, negative_waveform_length)
    train_loader2 = DataLoader(train_dataset2, batch_size=hparams['batch_size'])

    # Sizes of the supervised and pre-training datasets, for a quick sanity check.
    print(len(train_loader.dataset))
    print(len(train_loader2.dataset))

    # ------------------------------------------------------------------
    # Pre-training model, optimizers and scheduler
    # ------------------------------------------------------------------
    prehparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512, #512,
        "n_class": 12,
        "n_feats": 80, # do not change
        "stride":2,
        "dropout": 0.05,
        "n_predictions": 5,
        "epochs": 10  # not read in this function; schedulers use hparams['epochs']
            }

    premodel = preSpeechRecognitionModel(
            prehparams['n_cnn_layers'], prehparams['n_rnn_layers'], prehparams['rnn_dim'],
            prehparams['n_class'], prehparams['n_feats'], prehparams['stride'], prehparams['dropout'], prehparams['n_predictions']
            )

    preoptimizer = optim.AdamW(premodel.parameters(), lr=5e-3) # Pre-training learning rate

    preoptimizer1 = optim.AdamW(premodel.parameters(), lr=hparams['learning_rate'])

    # The one-cycle schedule drives preoptimizer1 (not preoptimizer).
    prescheduler = optim.lr_scheduler.OneCycleLR(preoptimizer1, max_lr=.001,
                                            steps_per_epoch=int(len(train_loader2)),
                                            epochs=hparams['epochs'],
                                            anneal_strategy='linear')

    premodel.to(device)
    premodel = nn.DataParallel(premodel)

    # ------------------------------------------------------------------
    # Linear ramp of the weight `gam` handed to train()
    # ------------------------------------------------------------------
    gamma_max = 1
    gamma_init = 0
    gamma_argmax_step = 500
    if gamma_init > gamma_max:
        gamma_max = gamma_init
        print('Initial gamma is larger than max gamma, proceeding with gamma_max=gamma_init.')
    gam = gamma_init
    # Increment applied once per epoch; gam reaches gamma_max after
    # gamma_argmax_step epochs.
    step_gam = (gamma_max-gamma_init)/gamma_argmax_step

    # Per-epoch history returned to the caller.
    train_loss=[]
    test_loss=[]
    Info_loss = []
    cer=[]
    wer=[]

    for epoch in range(1, epochs + 1):

        tra_loss, infoloss = train(model, premodel, device, train_loader, train_loader2, criterion, optimizer, preoptimizer, scheduler,
                                     prescheduler, epoch, gam, optimizer1, preoptimizer1)

        # Advance the ramp for the next epoch, never exceeding gamma_max.
        gam+= step_gam
        gam = min(gamma_max,gam)

        tes_loss, c, w =  test(model, device, test_loader, criterion, epoch)

        # TODO: no checkpoint is written; save model.state_dict() here when
        # tes_loss improves if the trained weights need to be kept.

        train_loss.append(tra_loss)
        test_loss.append(tes_loss)
        Info_loss.append(infoloss)
        cer.append(c)
        wer.append(w)
    return train_loss, test_loss, cer, wer, Info_loss


# Run configuration for this session.
learning_rate, batch_size, epochs = 5e-4, 10, 100

# LibriSpeech subset names, forwarded to main() as train_url / test_url.
libri_train_set, libri_test_set = "train-clean-100", "test-other"

# Launch training; each returned value is a per-epoch history list.
train_loss, test_loss, cer, wer, Info_loss = main(
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
    train_url=libri_train_set,
    test_url=libri_test_set,
)

TRAIN set size: 2029
TEST set size: 1695
Num Model Parameters 27210504
2029
296


  x, sample_rate = librosa.load(x)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/dataset/arabic/clips/common_voice_ar_19227720.mp3'

In [None]:
# !pip install librosa soundfile

import os

import librosa
import soundfile as sf
import IPython.display as ipd

# Diagnostic cell: try to open and play the clip that the training run
# reported as missing.
file_path = "/content/drive/MyDrive/dataset/arabic/clips/common_voice_ar_19227720.mp3"

if os.path.isfile(file_path):
    # Load the audio file (samples and the sampling rate they were decoded at)
    y, sr = librosa.load(file_path)

    # Play the audio
    ipd.display(ipd.Audio(y, rate=sr))
else:
    # Report the problem instead of leaving the cell with a raw traceback.
    print(f"Audio file not found: {file_path}")

  y, sr = librosa.load(file_path)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/dataset/arabic/clips/common_voice_ar_19227720.mp3'