# Estructura del modelo

In [1]:
!pip install tensorflow==2.10.0



### Imports

In [2]:
import os
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

## Gestión del Dataset

### Funciones

In [3]:
def audio_to_spectrogram(file_path, sr = 20500, n_fft = 2048, hop_length = 512):
    # Cargar el audio
    y, sr = librosa.load(file_path, sr = sr)

    # Calcular el espectrograma (STFT)
    S = librosa.stft(y, n_fft = n_fft, hop_length = hop_length)

    # Convertir a magnitudes (la magnitud es el espectrograma)
    spectrogram = np.abs(S)

    # Escalar en logaritmo (opcional para mejor visualización y aprendizaje)
    log_spectrogram = librosa.amplitude_to_db(spectrogram, ref = np.max)

    return log_spectrogram

def spectrogram_to_audio_sin_fase(magnitud, sr, n_fft = 2048, hop_length = 512, n_iter = 32):
    # Reconstrucción del audio con Griffin-Lim
    audio_reconstruido = librosa.griffinlim(magnitud, n_iter = n_iter, hop_length = hop_length, n_fft = n_fft)

    return audio_reconstruido

def visualize_spectrogram(spectrogram, title = "Spectrogram"):
    plt.figure(figsize=(10, 4))

    # Mostrar el espectrograma con un mapa de colores (viridis o inferno suelen ser útiles)
    plt.imshow(spectrogram, aspect = 'auto', origin = 'lower', cmap = 'viridis')
    plt.colorbar(label = "Decibels (dB)")
    plt.title(title)
    plt.xlabel("Time (frames)")
    plt.ylabel("Frequency (bins)")
    plt.tight_layout()

    plt.show()

def pad_or_trim(spectrogram, max_length = 94):
    if spectrogram.shape[1] > max_length:  # Recortar
        return spectrogram[:, :max_length]
    else:  # Rellenar
        padding = np.zeros((spectrogram.shape[0], max_length - spectrogram.shape[1]))
        return np.hstack((spectrogram, padding))


### Carga, clasificación y estructuración de los datos

In [11]:
import os

# Define variables
path = "/content/clip"

vocab = []
word_to_index = {}
index_to_word = {}
data = []
flat_data = []
num = 0

max_length = 94
max_height = 1025

# Filtrar para que solo procese la carpeta "universidad"
for word in os.listdir(path):
    #if word != "universidad":
     #   continue  # Ignorar carpetas o archivos no deseados

    print(num)
    vocab.append(word)

    word_path = os.path.join(path, word)
    spectrogram_list = []

    for index, audio in enumerate(os.listdir(word_path)):
        if index > 100:
            break

        print(index)
        print(audio)
        print(word_path)

        try:
            audio_path = os.path.join(word_path, audio)
            spectrogram = audio_to_spectrogram(audio_path)

            # Ajustar dimensiones
            spectrogram = pad_or_trim(spectrogram)

            spectrogram_list.append(spectrogram)
        except FileNotFoundError:
            print(f"Archivo no encontrado: {audio_path}")
        except Exception as e:
            print(f"Error procesando {audio_path}: {e}")

    data.append(spectrogram_list)
    num += 1

# Crear índices de vocabulario
word_to_index = {word: index for index, word in enumerate(vocab)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Convertir datos anidados en una lista plana
for i in range(len(data)):
    for j in range(len(data[i])):
        flat_data.append((i, data[i][j]))


0
0
common_voice_es_20917196.opus
/content/clip/universidad
1
common_voice_es_19654251.opus
/content/clip/universidad
2
common_voice_es_19271626.opus
/content/clip/universidad
3
common_voice_es_19635698.opus
/content/clip/universidad
4
common_voice_es_20413570.opus
/content/clip/universidad
5
common_voice_es_20506618.opus
/content/clip/universidad
6
common_voice_es_20252341.opus
/content/clip/universidad
7
common_voice_es_19703110.opus
/content/clip/universidad
8
common_voice_es_20629610.opus
/content/clip/universidad
9
common_voice_es_20252143.opus
/content/clip/universidad
10
common_voice_es_19790352.opus
/content/clip/universidad
11
common_voice_es_20766832.opus
/content/clip/universidad
12
common_voice_es_20700902.opus
/content/clip/universidad
13
common_voice_es_19629484.opus
/content/clip/universidad
14
common_voice_es_19676160.opus
/content/clip/universidad
15
common_voice_es_19666792.opus
/content/clip/universidad
16
common_voice_es_19877714.opus
/content/clip/universidad
17
co

### Creación del Dataset

In [5]:
batch_size = 32
max_length = 94

labels = [item[0] for item in flat_data]  # Extraer etiquetas
spectrograms = [item[1] for item in flat_data]  # Extraer espectrogramas

# Convertir a tensores
label_tensor = tf.convert_to_tensor(labels, dtype=tf.int32)
spectrogram_tensor = tf.convert_to_tensor(spectrograms, dtype=tf.float32)

# Crear el dataset
dataset = tf.data.Dataset.from_tensor_slices((label_tensor, spectrogram_tensor))

# Barajar y dividir en lotes
dataset = dataset.shuffle(len(labels)).batch(batch_size)

In [6]:
print(vocab)
print(word_to_index)

print(spectrograms)

print(f"Valores mínimos y máximos en labels: {min(labels)}, {max(labels)}")
print(f"Vocab size: {len(vocab)}")

['universidad']
{'universidad': 0}
[array([[-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       ...,
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.]]), array([[-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       ...,
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.]]), array([[-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       ...,
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.],
       [-80., -80., -80., ...,   0.,   0.,   0.]]), array([[-80., -80., -8

# Modelo

### Estructura

In [7]:
class TextEncoder(tf.keras.Model):
    def __init__(self, vocab_size, embed_dim):
        super(TextEncoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)

    def call(self, x):
        return self.embedding(x)

class SpectrogramDecoder(tf.keras.Model):
    def __init__(self, target_dim):
        super(SpectrogramDecoder, self).__init__()
        self.dense = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(target_dim, activation=None)  # No activación para espectrogramas sin normalizar
        ])

    def call(self, x):
        return self.dense(x)

class TextToSpectrogram(tf.keras.Model):
    def __init__(self, vocab_size, embed_dim, target_dim):
        super(TextToSpectrogram, self).__init__()
        self.encoder = TextEncoder(vocab_size, embed_dim)
        self.decoder = SpectrogramDecoder(target_dim)

    def call(self, x):
        x = self.encoder(x)
        return self.decoder(x)

def create_model(vocab_size, embed_dim, target_dim):
    # Definir el encoder y el decoder dentro del modelo funcional
    input_text = tf.keras.Input(shape=(None,), name="text_input")  # Entrada de texto
    embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)(input_text)

    # Decoder
    dense_1 = tf.keras.layers.Dense(256, activation='relu')(embedding)
    output_flat = tf.keras.layers.Dense(target_dim, activation=None, name="output")(dense_1)

    # Ajustar dimensiones al formato (1025, 94)
    output_spectrogram = tf.keras.layers.Reshape((max_height, max_length))(output_flat)

    # Crear modelo
    model = tf.keras.Model(inputs=input_text, outputs=output_spectrogram, name="TextToSpectrogram")
    return model


# Entrenamiento del Modelo

### Configuración

In [8]:
vocab_size = len(vocab) # Tamaño del vocabulario
embed_dim = 128  # Dimensión del embedding
target_dim = max_length * max_height  # Dimensiones del espectrograma

model = create_model(vocab_size, embed_dim, target_dim)
model.summary()


Model: "TextToSpectrogram"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_input (InputLayer)     [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         128       
                                                                 
 dense (Dense)               (None, None, 256)         33024     
                                                                 
 output (Dense)              (None, None, 96350)       24761950  
                                                                 
 reshape (Reshape)           (None, 1025, 94)          0         
                                                                 
Total params: 24,795,102
Trainable params: 24,795,102
Non-trainable params: 0
_________________________________________________________________


### Compilación con Funciones de Pérdida

In [9]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="mse",  # Error cuadrático medio
    metrics=["mae"]  # Error absoluto medio como métrica adicional
)

### Entrenamiento

In [10]:
epochs = 10

history = model.fit(dataset, batch_size = batch_size, epochs = epochs)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
