###  Variational Autoencoder (VAE)
El VAE es un tipo de autoencoder que impone una estructura probabilística en el espacio latente. En lugar de simplemente aprender una representación comprimida (codificación), el VAE aprende una distribución de probabilidad para la codificación

In [66]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Lambda, Input, Dense, Conv1D, MaxPooling1D, UpSampling1D, Flatten, Reshape, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1
from tensorflow.keras import backend as K
from tensorflow.keras.losses import MeanSquaredError

In [2]:
# Hiperparámetros
batch_size = 32
lr = 1e-3
epochs = 10
latent_dim = 128  # Tamaño del espacio latente
kr = 1e-4  # Regularización L1
drop_prec = 0.25  # Dropout

In [3]:
# Cargar y procesar los datos
df_train = pd.read_parquet('../data/processed/df_train_reduced.parquet')
df_onehot = tf.keras.utils.to_categorical(df_train)
train_X, valid_X = train_test_split(df_onehot, test_size=0.2)


In [4]:
# Dimensiones de entrada
feature_size = train_X.shape[1]
inChannel = train_X.shape[2]
input_shape = (feature_size, inChannel)
input_shape

(1000, 3)

In [5]:
# Codificador del VAE
inputs = Input(shape=input_shape)

In [6]:
# Capas convolucionales del encoder
x = Conv1D(32, 5, padding='same', activation='relu', kernel_regularizer=l1(kr))(inputs)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(drop_prec)(x)
x = Conv1D(64, 5, padding='same', activation='relu', kernel_regularizer=l1(kr))(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(drop_prec)(x)
x = Conv1D(128, 5, padding='same', activation='relu', kernel_regularizer=l1(kr))(x)
x = Flatten()(x)

In [7]:
# Codificación: media y desviación estándar
z_mean = Dense(latent_dim)(x)
z_log_var = Dense(latent_dim)(x)

In [8]:
# Función de reparametrización (sampling)
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., stddev=1.)
    return z_mean + K.exp(0.5 * z_log_var) * epsilon


In [9]:
# Espacio latente
z = Lambda(sampling)([z_mean, z_log_var])

In [32]:
# Capa Lambda personalizada para incluir la pérdida KL como parte del modelo
def kl_loss_layer(inputs):
    z_mean, z_log_var = inputs
    beta = 0.001  # Ajusta este valor según tus necesidades
    kl_loss = beta * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return kl_loss

kl_loss_output = Lambda(kl_loss_layer)([z_mean, z_log_var])

In [38]:
# Decodificador
x = Dense(128 * (feature_size // 4))(z)
x = Reshape((feature_size // 4, 128))(x)
x = Conv1D(128, 5, padding='same', activation='relu', kernel_regularizer=l1(kr))(x)
x = UpSampling1D(2)(x)
x = Conv1D(64, 5, padding='same', activation='relu', kernel_regularizer=l1(kr))(x)
x = UpSampling1D(2)(x)
outputs = Conv1D(inChannel, 5, activation='sigmoid', padding='same')(x)



In [39]:
# Definir el modelo con la pérdida KL integrada
vae = Model(inputs, [outputs, kl_loss_output])

In [74]:
# Función de pérdida personalizada
def vae_loss(y_true, y_pred):
    # Asegurarse de que las formas de y_true y y_pred coincidan en las primeras dos dimensiones
    y_true_reshaped = K.reshape(y_true, [-1, feature_size, inChannel])
    y_pred_reshaped = K.reshape(y_pred[0], [-1, feature_size, inChannel])

    # Pérdida de reconstrucción
    reconstruction_loss = tf.keras.losses.categorical_crossentropy(y_true_reshaped, y_pred_reshaped)
    reconstruction_loss = K.sum(reconstruction_loss, axis=-1)

    # Pérdida KL (ya calculada en la salida del modelo)
    kl_loss = K.mean(y_pred[1]) * 0.001  # Reducir el peso de la pérdida KL

    return reconstruction_loss + kl_loss

In [75]:
# Compilar el VAE
vae.compile(optimizer='RMSprop', loss=vae_loss, metrics=[['accuracy'], ['mse']])

vae.summary()

In [52]:

# Definir DataGenerator
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, batch_size, x_dataset, missing_perc=0.1, shuffle=True):
        self.batch_size = batch_size
        self.x = x_dataset
        self.missing_perc = missing_perc
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(self.x.shape[0] / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size:(
            index + 1) * self.batch_size]
        self.x_missing = self.x[indexes].copy()

        for i in range(self.x_missing.shape[0]):
            missing_size = int(self.missing_perc * self.x_missing.shape[1])
            missing_index = np.random.randint(
                self.x_missing.shape[1], size=missing_size)
            self.x_missing[i, missing_index, :] = [1, 0, 0]

        return self.x_missing, self.x[indexes]

    def on_epoch_end(self):
        self.indexes = np.arange(self.x.shape[0])
        if self.shuffle == True:
            np.random.shuffle(self.indexes)



In [53]:
# Crear generadores de datos
train_generator = DataGenerator(batch_size=batch_size, x_dataset=train_X, missing_perc=0.1)
valid_generator = DataGenerator(batch_size=batch_size, x_dataset=valid_X, missing_perc=0.1)


In [60]:
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
# early stopping call back with val_loss monitor
EarlyStopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True)


In [76]:
# Entrenamiento del VAE
vae_train = vae.fit(
    train_generator,  # Pasar el generador directamente sin usar 'x='
    epochs=epochs,
    verbose=1,
    validation_data=valid_generator,  # Validación con el generador de validación
    callbacks=[EarlyStopping]
)


Epoch 1/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 76ms/step - conv1d_11_accuracy: 0.5012 - loss: 693.1523 - val_conv1d_11_accuracy: 0.5026 - val_loss: 693.1522
Epoch 2/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 80ms/step - conv1d_11_accuracy: 0.5076 - loss: 693.1516 - val_conv1d_11_accuracy: 0.5087 - val_loss: 693.1512
Epoch 3/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 84ms/step - conv1d_11_accuracy: 0.5031 - loss: 693.1516 - val_conv1d_11_accuracy: 0.5003 - val_loss: 693.1531
Epoch 4/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 80ms/step - conv1d_11_accuracy: 0.4997 - loss: 693.1517 - val_conv1d_11_accuracy: 0.4995 - val_loss: 693.1511
Epoch 5/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 81ms/step - conv1d_11_accuracy: 0.5002 - loss: 693.1610 - val_conv1d_11_accuracy: 0.5005 - val_loss: 693.1542
Epoch 6/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 