# Estructura de red convolucional + recurrente para LATINO40, con CTC

Este programa está basado en [Building an End-to-End Speech Recognition Model in PyTorch](https://www.assemblyai.com/blog/end-to-end-speech-recognition-pytorch/), por Michael Nguyen. Es una implementación de la red conocida como [Deep Speech 2](https://arxiv.org/abs/1512.02595), con ligeras variantes. 

El objetivo es: 

- Implementar el modelo de la red (convolucional + recurrente)
- Entrenar utilizando la loss CTC, que contempla todos los alineamientos posibles. 

## Dataset y dataloader.

Ahora ya no existe más el problema del alineamiento porque ese alineamiento se hace dentro de la función de costo:

<img src="figs/loss_CTC.png" alt="loss CTC" width="700"/>

Los *datos* tienen la dimensión de las columnas del Mel-espectrograma (la cantidad de frames), que además depende del stride. En cambio, los *objetivos* son los caracteres, por lo que las cantidades de uno y otro no van a coincidir. En el programa anterior resolvimos esto haciendo un estiramiento a mano; la loss CTC lo hace automáticamente.

In [2]:
import json
import torch
import torch.utils.data as data
import torchaudio
from torch.utils.tensorboard import SummaryWriter

# El dataset es el mismo
class Latino40Dataset(data.Dataset):
    """Map-style dataset built from a JSON annotation file of utterances.

    Each value of the JSON object is expected to provide a ``'wav'`` path
    (which may contain the literal placeholder ``{data_root}``) and a
    ``'words'`` transcription.
    """

    def __init__(self, annotations_file, data_root):
        """Load the annotations and remember where the audio lives.

        Args:
            annotations_file: Path to the JSON file; only its values are kept.
            data_root: String substituted for ``{data_root}`` in every wav path.
        """
        self.data_dir = data_root
        with open(annotations_file) as handle:
            entries = json.load(handle)
        # Only the values matter; a list makes them addressable by position.
        self.annotation = [*entries.values()]

    def __len__(self):
        """Return the number of annotated utterances."""
        return len(self.annotation)

    def __getitem__(self, idx):
        """Return ``(waveform, sample_rate, label)`` for the entry at ``idx``."""
        entry = self.annotation[idx]
        wav_path = entry['wav'].replace('{data_root}', self.data_dir)
        waveform, sample_rate = torchaudio.load(wav_path)
        return waveform, sample_rate, entry['words']
    

# Each call below runs Latino40Dataset.__init__ on one split's annotation file.
train_dataset = Latino40Dataset(
    annotations_file="./data/latino40_split/train.json",
    data_root="./data",
)
# NOTE: the "test" dataset is read from the validation split file.
test_dataset = Latino40Dataset(
    annotations_file="./data/latino40_split/valid.json",
    data_root="./data",
)


Mantenemos la misma `collate_fn`, que carga las palabras, las convierte en caracteres y genera un alineamiento de longitud uniforme. Por último, convierte los caracteres en clases (numérico).

In [3]:
# La transformación de los labels es la misma
char_map_str = """  
 <SPACE> 0
 a 1
 b 2
 c 3
 d 4
 e 5
 f 6
 g 7
 h 8
 i 9
 j 10
 k 11
 l 12
 m 13
 n 14
 o 15
 p 16
 q 17
 r 18
 s 19
 t 20
 u 21
 v 22
 w 23
 x 24
 y 25
 z 26
 \u03B5 27
"""  # one string holding the whole mapping; \u03B5 is the epsilon symbol


class TextTransform:
    """Two-way lookup between characters and integer class indices.

    ``char_map`` goes from symbol to index and ``index_map`` from index to
    symbol; both are parsed from the module-level ``char_map_str``.
    """

    def __init__(self):
        """Parse ``char_map_str`` into the two lookup dictionaries."""
        pairs = [entry.split() for entry in char_map_str.strip().split('\n')]
        self.char_map = {symbol: int(code) for symbol, code in pairs}
        self.index_map = {int(code): symbol for symbol, code in pairs}
        # Index 0 is written as '<SPACE>' in the table; decode it as a blank.
        self.index_map[0] = ' '

    def text_to_int(self, text):
        """Encode ``text`` as a list of class indices via ``char_map``.

        A literal space is looked up under the ``'<SPACE>'`` key; any
        character missing from the table raises ``KeyError``.
        """
        return [
            self.char_map['<SPACE>'] if symbol == ' ' else self.char_map[symbol]
            for symbol in text
        ]

    def int_to_text(self, labels):
        """Decode a sequence of class indices back to a string.

        NOTE: ``replace('', ' ')`` inserts a space before, between and after
        every decoded character, so ``[1, 2]`` becomes ``' a b '``.
        """
        joined = ''.join(self.index_map[int(code)] for code in labels)
        return joined.replace('', ' ')


texttransform = TextTransform()  # runs TextTransform.__init__ once for the module


In [4]:
import torch.nn as nn
import torchaudio.transforms as T
import torchaudio.functional as F

# La función de transformación de audio para train y test es la misma
# Training pipeline: Mel spectrogram followed by frequency and time masking.
train_audio_transforms = nn.Sequential(*[
    T.MelSpectrogram(sample_rate=16000, n_mels=128, n_fft=512, win_length=400),
    T.FrequencyMasking(freq_mask_param=15),
    T.TimeMasking(time_mask_param=35),
])

# Validation pipeline: the same Mel spectrogram settings, without masking.
valid_audio_transforms = T.MelSpectrogram(
    sample_rate=16000,
    n_mels=128,
    n_fft=512,
    win_length=400,
)




Ahora no es necesario expandir nada, los alineamientos los resuelve la función de costo (como era originalmente)

In [None]:
def data_processing(data, data_type="train"):
    """Collate raw dataset samples into padded batch tensors for CTC training.

    Args:
        data: Iterable of ``(waveform, sample_rate, utterance)`` triples; the
            sample rate is ignored.
        data_type: ``'train'`` selects ``train_audio_transforms``; any other
            value selects ``valid_audio_transforms``.

    Returns:
        A tuple ``(spectrograms, labels, input_lengths, label_lengths)``:
        ``spectrograms`` is the zero-padded batch with a singleton dimension
        inserted at position 1 and the last two dimensions swapped
        (presumably ``(batch, 1, n_mels, time)`` for single-channel audio —
        TODO confirm); ``labels`` is the zero-padded tensor of encoded
        utterances; ``input_lengths`` holds half the frame count of each
        spectrogram; ``label_lengths`` holds each utterance's label count.
    """
    # Both choices are loop-invariant, so resolve them once per batch rather
    # than once per utterance.
    if data_type == 'train':
        audio_transform = train_audio_transforms
    else:
        audio_transform = valid_audio_transforms
    text_transform = TextTransform()

    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance) in data:
        spec = audio_transform(waveform).squeeze(0).transpose(0, 1)
        spectrograms.append(spec)
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        # Halved frame count; presumably mirrors the stride-2 convolution at
        # the front of the model — verify if the stride changes.
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths



In [None]:
# Smoke-test the collate function on a batch of 3 elements.
batch_size = 3
tmp_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=lambda x: data_processing(x, 'test'),
)

# Keep the iterator around: a later cell pulls a second batch from it.
tmp_it = iter(tmp_loader)
x, lab, x_length, lab_length = next(tmp_it)

print('waveforms.shape {}'.format(x.shape))
print('Etiquetas_batched.shape {}'.format(lab.shape))
print('Largo del batch de specgrams {}'.format(x_length))
print('Largo del batch de etiquetas {}'.format(lab_length))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Use ONE sample for the waveform, the spectrogram and the title. Previously
# the waveform came from test_dataset, the spectrogram from train_dataset and
# the title from a random batch, so the three did not describe the same audio.
sample = train_dataset[0]
waveform, _, _ = sample
print('Waveform shape {}'.format(waveform.shape))
# data_processing defaults to the training transforms (with masking).
spectrogram, sample_labels, input_length, _ = data_processing([sample])
print('Spectrogram shape: {}'.format(spectrogram.shape))
print('Según los parámetros elegidos en la transformación el hop_size = nwin //2, es decir {}'.format(400//2))
# MelSpectrogram defaults hop_length to win_length // 2 and centres its
# frames, so the expected frame count is n_samples // hop + 1.
print('La longitud del espectrograma sería {}'.format(waveform.shape[1]//200 + 1))
spec = spectrogram.squeeze().data.numpy()
[fig, ax] = plt.subplots()

# The small offset keeps log() finite where the masks zeroed the spectrogram.
ax.pcolor(np.log(spec+1e-10))
ax.set_title(label='Espectrograma de la frase: {}'.format(texttransform.int_to_text(sample_labels[0].numpy())))


## Modelo de red


In [None]:
class CNNLayerNorm(nn.Module):
    """Layer normalisation applied along the feature axis of a CNN activation."""

    def __init__(self, n_feats):
        """Create the wrapped ``nn.LayerNorm`` with ``n_feats`` as its shape."""
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        """Normalise ``x``, laid out as (batch, channel, feature, time)."""
        # nn.LayerNorm works on the trailing axis, so bring features last.
        time_major = x.transpose(2, 3).contiguous()  # (batch, channel, time, feature)
        normalised = self.layer_norm(time_major)
        # Restore the original (batch, channel, feature, time) layout.
        return normalised.transpose(2, 3).contiguous()

class ResidualCNN(nn.Module):
    """Pre-activation residual block (cf. https://arxiv.org/pdf/1603.05027.pdf).

    Uses layer normalisation where the reference design uses batch norm.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        """Build two (layer norm, dropout, convolution) stages.

        Both convolutions share ``kernel`` and ``stride`` and are padded with
        ``kernel // 2``; ``n_feats`` is the size handed to ``CNNLayerNorm``.
        """
        super().__init__()
        half_kernel = kernel // 2

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=half_kernel)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=half_kernel)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        """Apply both stages and add the input back; x is (batch, channel, feature, time)."""
        skip = x
        stages = (
            (self.layer_norm1, self.dropout1, self.cnn1),
            (self.layer_norm2, self.dropout2, self.cnn2),
        )
        for norm, drop, conv in stages:
            x = conv(drop(torch.nn.functional.gelu(norm(x))))
        # The in-place sum requires the stages to preserve the input shape.
        x += skip
        return x  # (batch, channel, feature, time)
        
class BidirectionalGRU(nn.Module):
    """Layer norm, GELU, a single bidirectional GRU layer, then dropout."""

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        """Build the recurrent block.

        Args:
            rnn_dim: Size of the input features (also the LayerNorm shape).
            hidden_size: Hidden size per direction of the GRU.
            dropout: Dropout probability applied to the GRU output.
            batch_first: Forwarded unchanged to ``nn.GRU``.
        """
        super().__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the GRU output sequence for ``x`` after dropout."""
        activated = torch.nn.functional.gelu(self.layer_norm(x))
        sequence_out, _ = self.BiGRU(activated)  # the final hidden state is unused
        return self.dropout(sequence_out)



In [None]:

class SpeechRecognitionModel(nn.Module):
    """Speech recognition model inspired by DeepSpeech 2.

    A strided convolution feeds a stack of residual CNN blocks, a linear
    projection, a stack of bidirectional GRU blocks and a two-layer
    classifier that emits one score per class and time step.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        """Build the network.

        Args:
            n_cnn_layers: Number of ``ResidualCNN`` blocks.
            n_rnn_layers: Number of ``BidirectionalGRU`` blocks.
            rnn_dim: Hidden size per direction of every GRU.
            n_class: Number of output classes.
            n_feats: Number of feature bins in the input spectrogram.
            stride: Stride of the first convolution.
            dropout: Dropout probability used throughout.
        """
        super(SpeechRecognitionModel, self).__init__()
        # Feature bins left after the first 3x3 convolution (padding 1):
        # (n - 1) // stride + 1. For stride 2 and an even n_feats this equals
        # the former hard-coded n_feats // 2, but it also holds for other
        # strides and for odd n_feats.
        feat_stride = stride[0] if isinstance(stride, (tuple, list)) else stride
        n_feats = (n_feats - 1) // feat_stride + 1
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting hierarchical features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        # Every GRU block receives (batch, time, feature), so all of them must
        # be batch_first. Previously only the first one was, which made any
        # further layer run its recurrence over the batch axis.
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=True)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        """Map a spectrogram batch to per-frame class scores.

        ``x`` has one input channel: (batch, 1, feature, time). The result is
        (batch, time', n_class), where time' is the length left by the
        strided convolution.
        """
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        # Fold channels and feature bins into a single feature axis.
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2)  # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x

Pruebo con un batch:

In [None]:
# Hyper-parameters for a quick forward-pass check of the model.
hparams = {
    "n_cnn_layers": 2,
    "n_rnn_layers": 1,
    "rnn_dim": 512,
    "n_class": 28,
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.1,
    "batch_size": batch_size,
}

model1 = SpeechRecognitionModel(
    hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
    hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
    )

print(model1)
print('Num Model Parameters', sum(param.nelement() for param in model1.parameters()))

# Pull a fresh batch from the iterator created for the collate smoke test.
x1, lab1, x1_length, lab1_length = next(tmp_it)
# Report the batch actually fed to the model (x1); this used to print the
# shape of the earlier batch `x`.
print('waveforms.shape {}'.format(x1.shape))
print('Etiquetas_batched.shape {}'.format(lab1.shape))
y1 = model1(x1)
print('Output batched.shape {}'.format(y1.shape))


¿Cómo sería la loss ahora?

In [None]:
# Class 27 is the epsilon symbol in the character table, used here as the CTC blank.
criterion = nn.CTCLoss(blank=27)

# The model emits (batch, time, n_class); the loss is fed log-probabilities
# laid out as (time, batch, n_class).
output = model1(x1).log_softmax(dim=2).transpose(0, 1)
print(output.shape)
print(lab1.shape)

loss1 = criterion(output, lab1, x1_length, lab1_length)

print('Loss1 {}'.format(loss1))


In [None]:
# Tag the TensorBoard run with the hyper-parameters in `hparams` at this point.
writer = SummaryWriter(
    comment="_DS2V1CTC_BATCH_{batch_size}_NCNN_{n_cnn_layers}_NRNN_{n_rnn_layers}".format(
        batch_size=hparams['batch_size'],
        n_cnn_layers=hparams['n_cnn_layers'],
        n_rnn_layers=hparams['n_rnn_layers'],
    )
)
# writer.add_graph(model1, x1)


## Decodificador y función de costo:

In [None]:
def GreedyDecoder(output, labels, label_lengths, blank_label=27, collapse_repeated=True):
    """Greedy (best-path) decoding of a batch of per-frame class scores.

    Args:
        output: Scores indexed as (batch, time, n_class); the arg-max is
            taken over the last axis.
        labels: Padded reference labels, one row per batch element.
        label_lengths: Number of valid entries in each row of ``labels``.
        blank_label: Class index dropped from the decoded sequence.
        collapse_repeated: When true, a frame whose class equals that of the
            previous frame is skipped.

    Returns:
        ``(decodes, targets)``: two lists of strings produced by
        ``texttransform.int_to_text``, one entry per batch element.
    """
    arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    targets = []
    for i, args in enumerate(arg_maxes):
        targets.append(texttransform.int_to_text(labels[i][:label_lengths[i]].tolist()))
        decode = []
        previous = None  # no predecessor for the first frame
        # One bulk conversion to Python ints instead of an .item() per frame.
        for index in args.tolist():
            is_repeat = collapse_repeated and index == previous
            if index != blank_label and not is_repeat:
                decode.append(index)
            # Repeats are judged against the raw previous frame, blanks
            # included, so a blank between two equal labels keeps both.
            previous = index
        # A former `blank_label in decode` branch was removed: blanks are
        # never appended above, so that branch could not execute.
        decodes.append(texttransform.int_to_text(decode))
    return decodes, targets

In [None]:
# Decode the untrained model's output for the batch fetched above.
d1, t1 = GreedyDecoder(
    output=y1,
    labels=lab1,
    label_lengths=lab1_length,
    blank_label=27,
    collapse_repeated=True,
)
print("Decodificó: {}\nTarget real: {}".format(d1, t1))

In [None]:
%pip install jiwer

In [None]:
import jiwer 

def cer(pred,ref):
    """Return ``jiwer.wer`` of the prediction ``pred`` against the reference ``ref``.

    NOTE(review): ``jiwer.wer`` is a word-level measure, so this is a
    character error rate only if both strings hold one character per
    whitespace-separated token — confirm against how the callers build them.
    """
    error_rate = jiwer.wer(ref, pred)
    return error_rate

In [None]:
def train(model, device, train_loader, criterion, optimizer, scheduler, epoch, writer):
    """Run one training epoch and log the mean batch loss.

    Args:
        model: Network returning (batch, time, n_class) scores.
        device: Device the spectrograms and labels are moved to.
        train_loader: Iterable of ``(spectrograms, labels, input_lengths,
            label_lengths)`` batches exposing ``__len__`` and ``.dataset``.
        criterion: Loss called as ``criterion(log_probs, labels,
            input_lengths, label_lengths)`` with (time, batch, n_class)
            log-probabilities.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler, also stepped once per batch.
        epoch: Epoch number used in the progress lines and as logging step.
        writer: Object exposing ``add_scalar(tag, value, step)``.
    """
    model.train()
    data_len = len(train_loader.dataset)
    n_batches = len(train_loader)
    train_loss = 0
    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data
        spectrograms, labels = spectrograms.to(device), labels.to(torch.int64).to(device)

        optimizer.zero_grad()

        output = model(spectrograms)  # (batch, time, n_class)
        output = torch.nn.functional.log_softmax(output, dim=2)
        output = output.transpose(0, 1).contiguous()  # (time, batch, n_class)

        loss = criterion(output, labels, input_lengths, label_lengths)
        loss.backward()

        optimizer.step()
        scheduler.step()
        # Report every 50th batch and the final one. The old test compared the
        # batch index with the number of samples, which never matched.
        if batch_idx % 50 == 0 or batch_idx == n_batches - 1:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / n_batches, loss.item()))
        train_loss += loss.item() / n_batches
    writer.add_scalar('Loss/train', train_loss, epoch)

def test(model, device, test_loader, criterion, epoch, writer):
    """Evaluate the model on ``test_loader`` and log loss and error rate.

    Args:
        model: Network returning (batch, time, n_class) scores.
        device: Device the spectrograms and labels are moved to.
        test_loader: Iterable of ``(spectrograms, labels, input_lengths,
            label_lengths)`` batches exposing ``__len__`` and ``.dataset``.
        criterion: Loss called with (time, batch, n_class) log-probabilities.
        epoch: Epoch number used in the progress lines and as logging step.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        The mean of ``cer`` over every decoded utterance, or NaN when the
        loader yields no utterances.
    """
    print('\nevaluating…')
    model.eval()
    data_test_len = len(test_loader.dataset)
    n_batches = len(test_loader)
    test_loss = 0
    test_cer = []
    with torch.no_grad():
        for I, _data in enumerate(test_loader):
            spectrograms, labels, input_lengths, label_lengths = _data
            spectrograms, labels = spectrograms.to(device), labels.to(torch.int64).to(device)

            output = model(spectrograms)  # (batch, time, n_class)
            log_probs = torch.nn.functional.log_softmax(output, dim=2)

            # The loss takes (time, batch, n_class); the decoder takes the
            # batch-major tensor as is.
            loss = criterion(log_probs.transpose(0, 1), labels, input_lengths, label_lengths)
            test_loss += loss.item() / n_batches

            decoded_preds, decoded_targets = GreedyDecoder(log_probs, labels, label_lengths)
            # Report every 50th batch and the final one. The old test compared
            # the batch index with the number of samples, which never matched.
            if I % 50 == 0 or I == n_batches - 1:
                print('Test Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, I * len(spectrograms), data_test_len,
                    100. * I / n_batches, loss.item()))

            test_cer.extend(
                cer(ref=targ, pred=pred)
                for pred, targ in zip(decoded_preds, decoded_targets)
            )

    # An empty loader used to raise ZeroDivisionError here.
    avg_cer = sum(test_cer) / len(test_cer) if test_cer else float('nan')

    print('Test set: Average loss: {:.4f}, Average CER: {:.4f}\n'.format(test_loss, avg_cer))
    writer.add_scalar('Loss/test', test_loss, epoch)
    writer.add_scalar('CER/test', avg_cer, epoch)

    return avg_cer


In [None]:
import torch.optim as optim

# Fix the seed before any model weights are created.
torch.manual_seed(7)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print('Device: {}'.format(device))

# Extra DataLoader options are only passed when a GPU is present.
kwargs = {}
if use_cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}

learning_rate = 1e-5
batch_size = 10
epochs = 500
hparams = dict(
    n_cnn_layers=2,
    n_rnn_layers=1,
    rnn_dim=512,
    n_class=28,
    n_feats=128,
    stride=2,
    dropout=0.1,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
)

train_loader = data.DataLoader(
    dataset=train_dataset,
    batch_size=hparams['batch_size'],
    shuffle=True,
    collate_fn=lambda x: data_processing(x, 'train'),
    **kwargs,
)
test_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=hparams['batch_size'],
    shuffle=False,
    collate_fn=lambda x: data_processing(x, 'valid'),
    **kwargs,
)

model1 = SpeechRecognitionModel(
    n_cnn_layers=hparams['n_cnn_layers'],
    n_rnn_layers=hparams['n_rnn_layers'],
    rnn_dim=hparams['rnn_dim'],
    n_class=hparams['n_class'],
    n_feats=hparams['n_feats'],
    stride=hparams['stride'],
    dropout=hparams['dropout'],
).to(device)



In [None]:
import torch.optim as optim
optimizer = optim.AdamW(model1.parameters(), lr=hparams['learning_rate'])
criterion = nn.CTCLoss(blank=27).to(device)
# The schedule covers every optimizer step of the whole run
# (steps_per_epoch * epochs), matching the per-batch scheduler.step() in train().
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=hparams['learning_rate'],
    steps_per_epoch=len(train_loader),
    epochs=hparams['epochs'],
    anneal_strategy='linear',
)

for epoch in range(1, epochs + 1):
    train(model1, device, train_loader, criterion, optimizer, scheduler, epoch, writer)
    if epoch % 10:
        continue
    # Evaluate on every tenth epoch only.
    test(model1, device, test_loader, criterion, epoch, writer)

In [None]:
# Push the first evaluation batch through the trained model.
data_test_it = iter(test_loader)
spectrograms, labels, input_lengths, label_lengths = next(data_test_it)
spectrograms = spectrograms.to(device)
labels = labels.to(device)
output = model1(spectrograms)
output_norm = output.log_softmax(dim=2).transpose(0, 1)  # (time, batch, n_class)

# Back to batch-major NumPy arrays for plotting.
out = output_norm.transpose(0, 1).detach().cpu().numpy()
lab = labels.cpu().numpy()
print(out[0].shape, lab[0].shape)

In [None]:
[fig, ax] = plt.subplots()
# log-softmax outputs are exponentiated back to probabilities; at most the
# first 400 frames of the first utterance are drawn.
ax.pcolormesh(np.exp(out[0][:400, :]))
# One tick per class, centred on its column. index_map already maps class 27
# to epsilon, so no extra entry is appended: the former 29th tick labelled a
# column that does not exist.
caracteres = np.arange(len(texttransform.index_map))
ctext = [texttransform.index_map[int(i)] for i in caracteres]
ax.set_xticks(caracteres + 0.5, labels=ctext)
ax.set_ylabel('Frame')