In [None]:
!pip install tensorflow==2.10.0



# Imports

In [None]:
import os
import librosa
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import tarfile

ERROR:absl:cannot import name 'builder' from 'google.protobuf.internal' (/usr/local/lib/python3.10/dist-packages/google/protobuf/internal/__init__.py)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/tensorflow_datasets/__init__.py", line 79, in <module>
    from tensorflow_datasets import rlds  # pylint: disable=g-bad-import-order
  File "/usr/local/lib/python3.10/dist-packages/tensorflow_datasets/rlds/__init__.py", line 21, in <module>
    from tensorflow_datasets.rlds import envlogger_reader
  File "/usr/local/lib/python3.10/dist-packages/tensorflow_datasets/rlds/envlogger_reader.py", line 21, in <module>
    from tensorflow_datasets.core.utils.lazy_imports_utils import tree
  File "/usr/local/lib/python3.10/dist-packages/tensorflow_datasets/core/__init__.py", line 21, in <module>
    from tensorflow_datasets.core import community
  File "/usr/local/lib/python3.10/dist-packages/tensorflow_datasets/core/community/__init__.py", line 19, in <module>
  

# Para cargar el dataset desde Google Colab

  (Antes de ejecutar estas celdas de código)

1. Descarga el dataset de LJSpeech de esta URL: https://keithito.com/LJ-Speech-Dataset/
2. El archivo se descargará con la extensión ".tar.bz2". Descomprímelo una sola vez para quedarte con el archivo ".tar".
3. Sube el archivo .tar a tu unidad de Google Drive (necesitarás unos 3 GB libres).

Monta en el sistema de archivos de Colab tu unidad de Google Drive

In [None]:
# Colab-only step: mount Google Drive under /content/drive so the dataset
# archive stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Descomprime el archivo .tar de tu unidad en Google Drive a una carpeta nueva llamada dataset

In [None]:
# Path of the .tar archive on Drive and the local destination folder
nombre_archivo_tar = '/content/drive/MyDrive/LJSpeech-1.1.tar'
carpeta_destino = '/content/dataset'

# Marker file created only after a complete extraction. It makes this cell
# idempotent: re-running it does not repeat the slow extraction, while an
# interrupted run (no marker written) is retried from scratch.
marcador_extraccion = os.path.join(carpeta_destino, '.extraccion_completa')

if os.path.exists(marcador_extraccion):
    print('El dataset ya estaba descomprimido; se omite la extracción.')
else:
    # Open the .tar archive and extract its contents
    with tarfile.open(nombre_archivo_tar, 'r') as archivo_tar:
        archivo_tar.extractall(path=carpeta_destino)

    # Write the marker last so it only exists when extraction finished.
    with open(marcador_extraccion, 'w'):
        pass

    print('Archivo descomprimido correctamente.')


KeyboardInterrupt: 

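Código de preprocesamiento necesario para que el modelo pueda usar el dataset: convierte cada audio en un espectrograma mel y cada transcripción en una secuencia de caracteres tokenizada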

In [None]:
# Configuration: dataset locations and preprocessing parameters
data_dir = '/content/dataset/LJSpeech-1.1'
wav_dir = os.path.join(data_dir, 'wavs')
metadata_file = os.path.join(data_dir, 'metadata.csv')
sample_rate = 22050  # sampling rate passed to librosa when loading each wav
max_length = 500  # fixed length for both spectrogram frames and token sequences; adjust to your needs

# Load an audio file as a waveform
def load_audio(file_path, sample_rate):
    """Load ``file_path`` with librosa at ``sample_rate`` and return the samples.

    librosa returns a ``(samples, rate)`` pair; only the samples are kept.
    """
    waveform = librosa.load(file_path, sr=sample_rate)[0]
    return waveform

# Convert a waveform into a dB-scaled mel spectrogram
def audio_to_spectrogram(audio, sample_rate):
    """Return the 128-band mel spectrogram of ``audio`` in dB, transposed.

    The power spectrogram is converted to decibels relative to its own
    maximum (``ref=np.max``) and then transposed before being returned.
    """
    mel_power = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return np.transpose(mel_db)

# Load transcriptions and audio, producing one fixed-length spectrogram per clip.

# Value used to pad short spectrograms. The spectrograms are in dB relative to
# their own peak (ref=np.max), so 0 dB is the LOUDEST possible value; padding
# with 0 would append maximum-energy frames. -80 dB is the floor produced by
# librosa.power_to_db with its default top_db=80, i.e. silence.
pad_db = -80.0

transcriptions = []
spectrograms = []
# LJSpeech metadata contains non-ASCII characters, so read it explicitly as UTF-8
# instead of relying on the platform default encoding.
with open(metadata_file, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('|')
        # Skip blank or malformed lines instead of failing with IndexError.
        if len(parts) < 3:
            continue
        file_id = parts[0]
        transcription = parts[2]

        # Load and process the audio
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        audio = load_audio(wav_path, sample_rate)
        spectrogram = audio_to_spectrogram(audio, sample_rate)

        # Truncate long clips / pad short ones along the first (frame) axis
        n_frames = spectrogram.shape[0]
        if n_frames > max_length:
            spectrogram = spectrogram[:max_length, :]
        else:
            spectrogram = np.pad(
                spectrogram,
                ((0, max_length - n_frames), (0, 0)),
                mode='constant',
                constant_values=pad_db,
            )

        # Append both together so the two lists always stay aligned.
        transcriptions.append(transcription)
        spectrograms.append(spectrogram)

# Convert the Python lists to NumPy arrays
transcriptions = np.array(transcriptions)
spectrograms = np.array(spectrograms)

# Character-level tokenization of the transcriptions (simple example).
# The tokenizer is fitted on all transcriptions, then every transcription is
# encoded and padded/truncated to max_length entries.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(transcriptions)
sequences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(transcriptions),
    maxlen=max_length,
)

# Split into training and validation sets: the first `train_ratio` share of
# the examples (in file order) is used for training, the rest for validation.
train_ratio = 0.8
num_train = int(train_ratio * len(sequences))

x_train, x_val = sequences[:num_train], sequences[num_train:]
y_train, y_val = spectrograms[:num_train], spectrograms[num_train:]

# Modelo

In [None]:
class TextEncoder(tf.keras.Model):
    """Embeds integer token ids into dense vectors of size ``embed_dim``."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)

    def call(self, x):
        """Return the embedding of the token ids in ``x``."""
        embedded = self.embedding(x)
        return embedded

class SpectrogramDecoder(tf.keras.Model):
    """Two-layer dense head projecting features to ``target_dim`` outputs."""

    def __init__(self, target_dim):
        super().__init__()
        hidden = tf.keras.layers.Dense(128, activation='relu')
        # No activation on the output: the spectrogram targets are not normalized.
        projection = tf.keras.layers.Dense(target_dim, activation=None)
        self.dense = tf.keras.Sequential([hidden, projection])

    def call(self, x):
        """Apply the dense stack to ``x``."""
        decoded = self.dense(x)
        return decoded

class TextToSpectrogram(tf.keras.Model):
    """Chains a ``TextEncoder`` and a ``SpectrogramDecoder``."""

    def __init__(self, vocab_size, embed_dim, target_dim):
        super().__init__()
        self.encoder = TextEncoder(vocab_size, embed_dim)
        self.decoder = SpectrogramDecoder(target_dim)

    def call(self, x):
        """Encode the token ids in ``x`` and decode the result."""
        return self.decoder(self.encoder(x))

class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention block: multi-head attention + feed-forward, each followed
    by dropout, a residual connection and layer normalization.

    Args:
        embed_dim: Size of the feature dimension; also used as the attention
            ``key_dim`` and as the output size of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Standard Keras layer arguments (e.g. ``name``), forwarded to
            the base ``Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim)]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to ``None`` so the layer can be called without it;
        Keras then supplies the mode (dropout active in ``fit``, inactive in
        ``predict``/``evaluate``).
        """
        # Self-attention: the inputs act as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config



def create_transformer_model(vocab_size, embed_dim, num_heads, ff_dim, target_dim, max_length):
    """Build the text-to-spectrogram Keras model.

    Pipeline: token ids -> embedding -> one TransformerBlock -> flatten ->
    Dense(128, relu) -> Dense(target_dim) -> reshape to
    ``(max_length, target_dim // max_length)``.

    Args:
        vocab_size: Number of distinct token ids (embedding input size).
        embed_dim: Embedding / transformer feature size.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden size of the transformer feed-forward network.
        target_dim: Total number of output values per example; must be a
            multiple of ``max_length``.
        max_length: Length of the input token sequence and first dimension of
            the reshaped output.

    Returns:
        An uncompiled ``tf.keras.Model`` named "TextToSpectrogramTransformer".

    Raises:
        ValueError: If ``target_dim`` is not divisible by ``max_length``.
    """
    # Fail early with a clear message instead of a Reshape size error.
    if target_dim % max_length != 0:
        raise ValueError(
            f"target_dim ({target_dim}) must be a multiple of max_length ({max_length})"
        )

    input_text = tf.keras.layers.Input(shape=(max_length,), name="text_input")
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_dim)(input_text)

    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    # Do NOT hard-code `training` here: a fixed training=False would be baked
    # into the graph and disable dropout even during model.fit(). Leaving it
    # unset lets Keras pass the correct mode at run time.
    x = transformer_block(embedding_layer)

    flatten_layer = tf.keras.layers.Flatten()(x)
    dense_layer = tf.keras.layers.Dense(128, activation='relu')(flatten_layer)
    output_layer = tf.keras.layers.Dense(target_dim, activation=None)(dense_layer)

    # Reshape the flat output to (max_length, target_dim // max_length) so it
    # matches the shape of the target spectrograms.
    output_spectrogram = tf.keras.layers.Reshape((max_length, target_dim // max_length))(output_layer)

    model = tf.keras.Model(inputs=input_text, outputs=output_spectrogram, name="TextToSpectrogramTransformer")
    return model


# Entrenamiento

In [None]:
# Build the model.
# Vocabulary size is the number of tokens known to the tokenizer plus one.
vocab_size = 1 + len(tokenizer.word_index)
embed_dim = 128
num_heads = 4
ff_dim = 128
# Total number of values in one target spectrogram (second axis * third axis).
target_dim = spectrograms.shape[1] * spectrograms.shape[2]

model = create_transformer_model(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    target_dim=target_dim,
    max_length=max_length,
)
model.compile(loss='mse', optimizer='adam')
model.summary()




In [None]:
# Train the model; keep the returned History object so the loss curves can be
# inspected or plotted afterwards.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=32)
# Save the weights to Google Drive. Uses the same 'MyDrive' mount path as the
# dataset archive so both cells rely on a single, space-free Drive location.
model.save_weights('/content/drive/MyDrive/pesos_mi_modelo.h5')