# Reconocimiento de fonemas usando CTC

In [11]:
import json
import torch
import torch.utils.data as data
import torchaudio
import torch.nn as nn
import torchaudio.transforms as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from collections import OrderedDict
import tqdm
from torch.utils.data import  Dataset, DataLoader
from torchaudio.models.decoder import ctc_decoder # vamos a hacer un decoder greedy, por razones 
                                                  # didácticas, este se usaría si quisiera
                                                  # implementarlo con beam search

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


## Implementación del dataset y el dataloader

### Dataset
Convierte los datos crudos a un formato que pueda ser consumido por `DataLoader`. Para eso se implementa el método `__getitem__()`, en el cual leemos cada dato y devolvemos, por ejemplo, el wav y la transcripción fonética correspondiente.

In [12]:
# Data locations: the phoneme vocabulary and one JSON manifest per split.
vocab_file = "data/label_encoder_new.txt"
train_json = "data/train.json"
test_json = "data/test.json"
valid_json = "data/dev.json"

# Hyperparameters. The dropout, n_mels, cnn_*, rnn_*, dnn_* and output_neurons
# entries are forwarded to the model when it is assembled further down;
# n_fft and bs are not read by the code visible in this notebook section.
par = {
    "n_fft": 400,
    "n_mels": 40,
    "bs": 4,
    "dropout": 0.15,
    "cnn_blocks": 2,
    "cnn_channels": (128, 256),
    "cnn_kernelsize": (3, 3),
    "rnn_blocks": 4,
    "rnn_neurons": 512,
    "dnn_blocks": 2,
    "dnn_neurons": 512,
    "output_neurons": 40,
}

def load_phoneme_vocabulary(filepath: str) -> tuple[dict, dict]:
    """
    Load a phoneme vocabulary from a text file.

    Every non-blank line must have the form ``phoneme=>index``. Whitespace
    around both fields is ignored and single quotes surrounding the phoneme
    are stripped. Blank lines (for example a trailing newline at the end of
    the file) are skipped.

    Args:
        filepath (str): Path to the vocabulary file (read as UTF-8).

    Returns:
        tuple[dict, dict]: A pair ``(phoneme_to_idx, idx_to_phoneme)`` where
            - phoneme_to_idx maps each phoneme string to its integer index.
            - idx_to_phoneme maps each integer index back to its phoneme.

    Raises:
        ValueError: If a non-blank line does not contain exactly one ``=>``
            separator, or if the index field is not an integer.
    """
    phoneme_to_idx = {}
    idx_to_phoneme = {}

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Empty lines carry no entry; tolerate them instead of failing.
                continue

            parts = line.split('=>')
            if len(parts) != 2:
                raise ValueError(f"Error: Línea mal formada (se esperaba 'fonema=>indice'): '{line}'")

            phoneme = parts[0].strip().strip("'")
            idx_str = parts[1].strip()
            try:
                index = int(idx_str)
            except ValueError as err:
                # Keep the original parsing error attached as the cause.
                raise ValueError(f"Error: Índice inválido en la línea: '{line}'") from err

            phoneme_to_idx[phoneme] = index
            idx_to_phoneme[index] = phoneme

    return phoneme_to_idx, idx_to_phoneme


class TimitDataset(Dataset):
    """Dataset that yields ``(waveform, phoneme_indices)`` pairs from a JSON manifest.

    The manifest is a JSON object keyed by sample id; each entry is read for
    its ``'wav'`` field (path to the audio file) and its ``'phn'`` field
    (whitespace-separated phoneme labels). Phoneme labels are mapped to
    integers using the vocabulary loaded from ``vocab_file``.
    """

    def __init__(self, json_file, vocab_file):
        """Read the manifest and the phoneme vocabulary.

        Args:
            json_file: Path to the JSON manifest of the split.
            vocab_file: Path to the ``phoneme=>index`` vocabulary file.

        Raises:
            FileNotFoundError: If ``json_file`` does not exist. The error is
                raised here rather than reported and ignored, because the
                dataset cannot be used without its manifest.
        """
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                self.datos_json = json.load(f)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"Error: El archivo {json_file} no se encuentra.") from err
        # Fix the sample order once so integer indices map to stable keys.
        self.datos_ids = list(self.datos_json.keys())
        # Phoneme vocabulary: label -> index and index -> label.
        self.str2int, self.int2str = load_phoneme_vocabulary(vocab_file)

    def __len__(self):
        """Return the number of samples listed in the manifest."""
        return len(self.datos_ids)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Integer position of the sample in the manifest key order.

        Returns:
            tuple: ``(waveform, labels)`` where ``waveform`` is the loaded
            audio with its first axis squeezed away when that axis has size 1,
            and ``labels`` is a ``torch.long`` tensor of phoneme indices.

        Raises:
            KeyError: If a phoneme label is missing from the vocabulary.
        """
        entry = self.datos_json[self.datos_ids[idx]]
        # Load the audio file; the sample rate is not used here.
        waveform, _sample_rate = torchaudio.load(entry['wav'])
        # Drop the leading axis when it is a singleton.
        # NOTE(review): presumably the channel axis of mono audio; multi-channel
        # files would keep their 2-D shape - confirm the corpus is mono.
        waveform = waveform.squeeze(0)
        # Map every phoneme label to its integer index.
        phn_indices = [self.str2int[phoneme] for phoneme in entry['phn'].strip().split()]

        # Explicit dtype keeps the label tensor integral even when it is empty.
        return waveform, torch.tensor(phn_indices, dtype=torch.long)

# One dataset per split; all three share the same phoneme vocabulary file.
train_ds, test_ds, valid_ds = (
    TimitDataset(manifest, vocab_file)
    for manifest in (train_json, test_json, valid_json)
)



### DataLoader
Convierte los datos recibidos del Dataset en minibatches cuyos elementos tienen igual longitud (rellenando con padding) usando la función `collate_fn`.

In [13]:
def collate_fn(batch):
    """Pad a list of ``(waveform, labels)`` samples into one minibatch.

    Args:
        batch: List of ``(waveform, labels)`` tuples, one per sample.

    Returns:
        tuple: ``((padded_wav, wav_length), (padded_phn, phn_length))``.
        Waveforms are right-padded with 0 and label sequences with -1, both
        batch-first. The length tensors are *relative*: each sample's length
        divided by the longest length in the batch, so the longest item is 1.0.
    """
    # Split the list of pairs into a tuple of waveforms and a tuple of labels.
    waveforms, targets = zip(*batch)

    # Size of each item along its first axis, before any padding is applied.
    wav_sizes = torch.tensor([wav.shape[0] for wav in waveforms], dtype=torch.long).float()
    phn_sizes = torch.tensor([tgt.shape[0] for tgt in targets], dtype=torch.long).float()

    # Normalise by the longest item of the batch.
    wav_length = wav_sizes / wav_sizes.max()
    phn_length = phn_sizes / phn_sizes.max()

    padded_wav = pad_sequence(waveforms, batch_first=True, padding_value=0)
    padded_phn = pad_sequence(targets, batch_first=True, padding_value=-1)

    # This is what the DataLoader yields for every minibatch.
    return (padded_wav, wav_length), (padded_phn, phn_length)



# Training batches are shuffled; the evaluation loaders keep dataset order.
# Each loader wraps its own split (test and validation must not reuse the
# training set, otherwise evaluation is measured on training data).
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(test_ds, batch_size=16, shuffle=False, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=16, shuffle=False, collate_fn=collate_fn)

  


## Implementación del modelo

### Bloques auxiliares
  - `Transpose`: Implementa la traspuesta en forma de módulo para poder usarla en `Sequential`
  - `CNN_block`: Implementa un bloque que realiza:
    - Conv1d (convolución 1D)
    - layer normalization
    - Leaky Relu
    - Pooling
    - drop-out 

In [14]:
class Transpose(nn.Module):
    """Swap two tensor axes as a module, so the operation fits in ``nn.Sequential``."""

    def __init__(self, dim0: int, dim1: int):
        """Remember the pair of axes that ``forward`` will exchange."""
        super().__init__()
        self.dim0, self.dim1 = dim0, dim1

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` with axes ``dim0`` and ``dim1`` exchanged."""
        return torch.transpose(x, self.dim0, self.dim1)
    
class CNN_block(nn.Module):
    """Conv1d -> LayerNorm -> LeakyReLU -> MaxPool1d -> Dropout, with length masking.

    The block works on tensors laid out as ``(batch, channels, time)``: the
    convolution and pooling run along the last axis, and LayerNorm is applied
    over the channel axis by transposing it to the last position and back.
    """

    def __init__(self, n_in: int, n_out: int, kernel_size: int,
                    pool_kernel_size: int = 1, do_prob: float = 0.0):
        """Build the layer stack.

        Args:
            n_in: Number of input channels of the convolution.
            n_out: Number of output channels (also the LayerNorm size).
            kernel_size: Convolution kernel width. Padding is
                ``(kernel_size - 1) // 2`` with replicate mode and stride 1.
            pool_kernel_size: Max-pooling window (stride 1). Each sequence is
                shortened by ``pool_kernel_size - 1`` frames.
            do_prob: Dropout probability.
        """
        super().__init__()
        pad_amount = (kernel_size - 1) // 2
        self.bloque_cnn = nn.Sequential(
            nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=kernel_size, stride=1,
                      padding=pad_amount, padding_mode="replicate"),
            # LayerNorm normalises the last axis, so move channels there and back.
            Transpose(1, 2),
            nn.LayerNorm(n_out),
            Transpose(1, 2),
            nn.LeakyReLU(),
            nn.MaxPool1d(kernel_size=pool_kernel_size, stride=1),
            nn.Dropout(p=do_prob),
        )
        self.pool_kernel_size = pool_kernel_size

    def forward(self, x: torch.Tensor, lengths: torch.Tensor):
        """Apply the block and zero out every frame beyond each valid length.

        Args:
            x: Input of shape ``(batch, n_in, time)``.
            lengths: One value per batch item, compared directly against frame
                positions, i.e. treated as an absolute number of valid frames.

        Returns:
            tuple: ``(x_masked, new_lengths)`` where ``x_masked`` is the block
            output with padded frames set to 0 and ``new_lengths`` is a
            ``torch.long`` tensor (on the device of ``lengths``) with the
            lengths after pooling, clamped to at least 1.
        """
        x = self.bloque_cnn(x)

        # Stride-1 pooling removes (pool_kernel_size - 1) frames; never go below 1.
        new_lengths = (lengths.float() - self.pool_kernel_size + 1).clamp(min=1.0).long()

        # True for valid frames, False for padding. The positions are created on
        # the device of the lengths so the comparison never mixes devices.
        positions = torch.arange(x.size(2), device=new_lengths.device)
        mask = positions.unsqueeze(0) < new_lengths.unsqueeze(1)

        # Add the channel axis for broadcasting and move the mask next to x.
        mask = mask.unsqueeze(1).to(device=x.device, dtype=torch.float32)
        x_masked = x * mask

        return x_masked, new_lengths



### Modelo: `Model_CNN_RNN_DNN`
Modelo que implementa varios bloques de convolución, recurrencia y feed-forward. La transformación de su salida en logits y en log-probabilidades de fonemas se agrega después, al armar el modelo completo.


In [19]:
class Model_CNN_RNN_DNN(nn.Module):
    """Stack of CNN blocks, bidirectional GRU blocks and feed-forward blocks.

    The module returns the activations of the last feed-forward block; it does
    not contain an output (logit) layer.
    """

    def __init__(self, drop_out, n_mels, cnn_blocks, cnn_channels, cnn_kernel_size,
                 rnn_blocks, rnn_neurons, dnn_blocks, dnn_neurons,out_neurons):
        """Build the three stages.

        Args:
            drop_out: Dropout probability used by the CNN and DNN blocks.
            n_mels: Number of input channels of the first block.
            cnn_blocks: Number of ``CNN_block`` modules.
            cnn_channels: Output channels of each CNN block (indexable).
            cnn_kernel_size: Kernel width of each CNN block (indexable).
            rnn_blocks: Number of single-layer bidirectional GRU blocks.
            rnn_neurons: Hidden size per direction of each GRU.
            dnn_blocks: Number of Linear -> LayerNorm -> LeakyReLU -> Dropout blocks.
            dnn_neurons: Width of every feed-forward block.
            out_neurons: Accepted for interface compatibility; not used here.
        """
        super().__init__()
        self.cnn_blocks = cnn_blocks
        self.rnn_blocks = rnn_blocks
        self.dnn_blocks = dnn_blocks

        # n_in tracks the feature size flowing from one block into the next.
        n_in = n_mels

        # CNN stage, built from CNN_block.
        self.cnn_bloques = nn.ModuleDict()
        for i in range(cnn_blocks):
            n_out = cnn_channels[i]
            self.cnn_bloques[f"cnn_block_{i+1}"] = CNN_block(n_in=n_in, n_out=n_out,
                                                        kernel_size=cnn_kernel_size[i],
                                                        do_prob=drop_out)
            n_in = n_out  # the next block consumes this block's output

        # RNN stage, built from bidirectional GRUs.
        self.rnn_bloques = nn.ModuleDict()
        for i in range(rnn_blocks):
            self.rnn_bloques[f"rnn_block_{i+1}"] = nn.GRU(input_size=n_in, hidden_size=rnn_neurons,
                num_layers=1, batch_first=True, bidirectional=True,
                dropout=0)
            n_in = rnn_neurons * 2  # forward and backward states are concatenated

        # DNN stage.
        self.dnn_bloques = nn.ModuleDict()
        for i in range(dnn_blocks):
            n_out = dnn_neurons
            self.dnn_bloques[f"dnn_block_{i+1}"] = nn.Sequential(nn.Linear(n_in, n_out),
                nn.LayerNorm(n_out), nn.LeakyReLU(),nn.Dropout(p=drop_out))
            n_in = n_out

    def forward(self, x: torch.Tensor, lengths: torch.Tensor):
        """Run the CNN, RNN and DNN stages.

        Args:
            x: Input of shape ``(batch, n_mels, time)``.
            lengths: Number of valid frames of each batch item (absolute
                counts, as required for packing the sequences).

        Returns:
            tuple: ``(x, lengths)`` where ``x`` has shape
            ``(batch, time', features)`` and ``lengths`` are the valid lengths
            after the CNN stage.
        """
        # Use the block count stored on the instance, not a module-level name.
        for i in range(self.cnn_blocks):
            au = self.cnn_bloques[f"cnn_block_{i+1}"]
            x, lengths = au(x, lengths)

        # (batch, channels, time) -> (batch, time, channels) for batch_first GRUs.
        x = x.permute(0, 2, 1).contiguous()

        # Pack so the GRUs skip padded frames; packing needs CPU lengths.
        packed_x = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        for i in range(self.rnn_blocks):
            au = self.rnn_bloques[f"rnn_block_{i+1}"]
            packed_x, _ = au(packed_x)
        x, _ = pad_packed_sequence(packed_x, batch_first=True)

        for i in range(self.dnn_blocks):
            au = self.dnn_bloques[f"dnn_block_{i+1}"]
            x = au(x)

        return x, lengths



### Modelo completo
Completamos el modelo CNN_RNN_DNN con la capa lineal que produce los logits y la log-softmax que los convierte en log-probabilidades.

In [26]:
# Full network: the CNN/RNN/DNN trunk, a linear layer that produces the
# logits, and a log-softmax over the last axis.
modelo = nn.ModuleDict()
modelo['cnn_rnn_dnn'] = Model_CNN_RNN_DNN(
    drop_out=par['dropout'], n_mels=par['n_mels'],
    cnn_blocks=par['cnn_blocks'], cnn_channels=par['cnn_channels'],
    cnn_kernel_size=par['cnn_kernelsize'],
    rnn_blocks=par['rnn_blocks'], rnn_neurons=par['rnn_neurons'],
    dnn_blocks=par['dnn_blocks'], dnn_neurons=par['dnn_neurons'],
    out_neurons=par['output_neurons'],
)
modelo['logits'] = nn.Linear(in_features=par['dnn_neurons'],
                             out_features=par['output_neurons'])
modelo['logsoftmax'] = nn.LogSoftmax(dim=-1)

# Report the number of trainable parameters.
n_trainable = sum(p.numel() for p in modelo.parameters() if p.requires_grad)
print(f'Cantidad de parámetros del modelo: {n_trainable}')


Cantidad de parámetros del modelo: 17464488


## Próximo paso

In [None]:
class ASR(nn.Module):
    """Training wrapper for the phoneme recogniser (work in progress).

    Only the bookkeeping counters are set up so far; the training loop in
    ``fit`` is still to be written.
    """

    def __init__(self, modelo, params, opt_class):
        """Move the model to the target device and reset the counters.

        Args:
            modelo: Module that is moved to the module-level ``device``.
            params: Not used yet.
            opt_class: Not used yet.
        """
        super().__init__()
        modelo.to(device)
        # Prepare iterating variables
        self.avg_train_loss = 0.0
        self.step = 0
        self.optimizer_step = 0

    # Train and validation
    def fit(self, train_set, valid_set, n_epochs):
        """Train on ``train_set`` and validate on ``valid_set`` for ``n_epochs``.

        Raises:
            NotImplementedError: Always; the loop has not been written yet.
        """
        # TODO: continue with the fit loop from core.
        raise NotImplementedError("ASR.fit todavía no está implementado")

        