<h1>GAN-based imputation of missing yeast genotypes</h1>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href="#Dataset" data-toc-modified-id="Dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dataset</a></span><ul class="toc-item"><li><span><a href="#Loading-data" data-toc-modified-id="Loading-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Loading data</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Preprocessing</a></span></li></ul></li><li><span><a href="#Method" data-toc-modified-id="Method-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Method</a></span><ul class="toc-item"><li><span><a href="#Build-model" data-toc-modified-id="Build-model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Build model</a></span></li><li><span><a href="#Generate-data" data-toc-modified-id="Generate-data-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Generate data</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Plot" data-toc-modified-id="Plot-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Plot</a></span></li></ul></li><li><span><a href="#Save-model" data-toc-modified-id="Save-model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Save model</a></span></li></ul></div>

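# Introduction

This notebook masks 10% of the entries of a yeast genotype matrix, trains a generative adversarial network (TensorFlow/Keras) on the masked training set, and uses the generator to impute the masked entries of the training and test sets.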



In [29]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers



Note: you may need to restart the kernel to use updated packages.


In [30]:
# Seed every random number generator used in this notebook. NumPy drives the
# synthetic missingness mask and the batch/noise sampling; TensorFlow drives
# the Keras weight initialisation, which np.random.seed does not cover.
SEED = 28213
np.random.seed(seed=SEED)
tf.random.set_seed(SEED)

# Dataset

## Loading data

In [31]:
# Read the tab-separated genotype tables; the first column of each file
# becomes the row index.
input_name = '../data/yeast_genotype_train.txt'
df_train, df_test = (
    pd.read_csv(path, sep='\t', index_col=0)
    for path in (input_name, '../data/yeast_genotype_test.txt')
)


In [32]:
# (rows, columns) of the training and the test table, in that order
tuple(frame.shape for frame in (df_train, df_test))

((3513, 28220), (877, 28220))

## Preprocessing

In [None]:
# Generate synthetic missing data in a data set
def introduce_missingness(data, missing_perc=0.1, missing_value=0):
    """Return a copy of ``data`` with a random fraction of its cells masked.

    The cells are drawn uniformly without replacement from NumPy's global
    random state, so seeding ``np.random`` beforehand makes the mask
    repeatable.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to mask. It is not modified.
    missing_perc : float, default 0.1
        Fraction of all cells to mask, between 0 and 1. The number of masked
        cells is ``floor(n_rows * n_columns * missing_perc)``.
    missing_value : scalar, default 0
        Value written into the masked cells.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``data``.

    Raises
    ------
    ValueError
        If ``missing_perc`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_perc <= 1:
        raise ValueError("missing_perc must be between 0 and 1")

    n_samples, n_features = data.shape
    n_cells = n_samples * n_features
    n_missing = int(np.floor(n_cells * missing_perc))

    # Flat (row-major) positions of the cells to mask.
    missing_indices = np.random.choice(n_cells, n_missing, replace=False)

    # Build the whole mask at once and apply it in a single vectorised call;
    # writing the cells one by one through .iloc is prohibitively slow on a
    # table with millions of cells.
    mask = np.zeros(n_cells, dtype=bool)
    mask[missing_indices] = True
    return data.mask(mask.reshape(n_samples, n_features), missing_value)
# Mask 10% of the cells of the training and of the test set, then replace any
# NaN with -1.
# NOTE(review): introduce_missingness writes 0 (not NaN) into the masked
# cells, so fillna(-1) only touches NaNs that were already present in the
# input files and the masked cells keep the value 0 - confirm which sentinel
# the later imputation step is meant to look for.
df_train_missing, df_test_missing = (
    introduce_missingness(frame, missing_perc=0.1).fillna(-1)
    for frame in (df_train, df_test)
)

# Number of columns; used as the input and output width of the networks.
data_dim = len(df_train_missing.columns)
data_dim

In [None]:
# Generator network
def build_generator():
    """Build the generator as a stack of fully connected layers.

    Three hidden stages of 128, 256 and 512 units, each followed by a
    LeakyReLU and a batch-normalisation layer, feed a final dense layer of
    ``data_dim`` units with a tanh activation, so every output lies in
    (-1, 1). The input width is the module-level ``data_dim`` as well.

    Returns
    -------
    tf.keras.Sequential
        The uncompiled generator.
    """
    hidden_widths = (128, 256, 512)

    model = tf.keras.Sequential()
    for position, width in enumerate(hidden_widths):
        if position == 0:
            # Only the first layer declares the input width.
            model.add(layers.Dense(width, input_dim=data_dim))
        else:
            model.add(layers.Dense(width))
        model.add(layers.LeakyReLU(alpha=0.2))
        model.add(layers.BatchNormalization(momentum=0.8))

    model.add(layers.Dense(data_dim, activation='tanh'))
    return model

# Discriminator network
def build_discriminator():
    """Build the discriminator: two LeakyReLU dense stages and a sigmoid unit.

    The network takes ``data_dim`` inputs (module-level variable) and emits a
    single value in (0, 1).

    Returns
    -------
    tf.keras.Sequential
        The uncompiled discriminator.
    """
    stack = [
        layers.Dense(512, input_dim=data_dim),
        layers.LeakyReLU(alpha=0.2),
        layers.Dense(256),
        layers.LeakyReLU(alpha=0.2),
        layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(stack)

# Compile the discriminator and the stacked GAN
def build_gan(generator, discriminator):
    """Compile ``discriminator`` and chain it behind ``generator``.

    The discriminator is compiled on its own first (binary cross-entropy,
    Adam, accuracy metric) and is flagged non-trainable afterwards, before the
    stacked model is built and compiled - presumably so that training the
    stacked model only updates the generator.

    Parameters
    ----------
    generator : tf.keras.Model
        Maps a ``data_dim``-wide input to a ``data_dim``-wide sample.
    discriminator : tf.keras.Model
        Scores a ``data_dim``-wide sample; compiled in place by this call.

    Returns
    -------
    tf.keras.Model
        The compiled stacked model ``discriminator(generator(input))``.
    """
    discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    discriminator.trainable = False

    stacked_input = tf.keras.Input(shape=(data_dim,))
    verdict = discriminator(generator(stacked_input))

    combined = tf.keras.Model(stacked_input, verdict)
    combined.compile(optimizer='adam', loss='binary_crossentropy')
    return combined


In [None]:
# Instantiate both networks, then wire them into the stacked GAN
generator, discriminator = build_generator(), build_discriminator()
gan = build_gan(generator=generator, discriminator=discriminator)


In [None]:
# Train the GAN
def train_gan(gan, generator, discriminator, data, epochs=10000, batch_size=32, save_interval=1000):
    """Alternate discriminator and generator updates for ``epochs`` iterations.

    Each iteration trains the discriminator on half a batch of rows sampled
    from ``data`` (label 1) and on half a batch of generator output (label 0),
    then trains ``gan`` on a full batch of noise labelled 1.

    Parameters
    ----------
    gan : model exposing ``train_on_batch``
        Stacked generator + discriminator model.
    generator : model exposing ``predict``
        Turns noise into synthetic samples.
    discriminator : model exposing ``train_on_batch``
        Must return ``[loss, accuracy]`` per batch, as the progress line reads
        both entries.
    data : array-like of shape (n_samples, n_features)
        Real samples. The noise fed to the generator has the same width.
    epochs : int, default 10000
        Number of training iterations (one batch each).
    batch_size : int, default 32
        Size of the generator batch; the discriminator sees half of it from
        each source.
    save_interval : int, default 1000
        A progress line is printed every ``save_interval`` iterations.
    """
    data = np.asarray(data)  # accept a DataFrame as well as an ndarray
    # Take the noise width from the data instead of a notebook-level global.
    noise_dim = data.shape[1]
    # Never let the discriminator train on an empty batch.
    half_batch = max(1, int(batch_size // 2))

    for epoch in range(epochs):
        # Real half batch: rows sampled with replacement
        idx = np.random.randint(0, data.shape[0], half_batch)
        real_data = data[idx]
        real_labels = np.ones((half_batch, 1))

        # Synthetic half batch from the generator. verbose=0 keeps predict()
        # from printing a progress bar on every iteration.
        noise = np.random.normal(0, 1, (half_batch, noise_dim))
        fake_data = generator.predict(noise, verbose=0)
        fake_labels = np.zeros((half_batch, 1))

        # Discriminator update; average the [loss, accuracy] pairs
        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Generator update through the stacked model: noise labelled as real
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        valid_y = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, valid_y)

        # Report progress
        if epoch % save_interval == 0:
            print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}%] [G loss: {g_loss}]")

# Train the GAN on the masked training matrix
train_gan(
    gan,
    generator,
    discriminator,
    data=df_train_missing.values,
    epochs=10000,
    batch_size=32,
)




In [None]:
# Impute the missing entries with the generator's output
generated_data = generator.predict(df_train_missing.values)
generated_data_test = generator.predict(df_test_missing.values)

# Cells holding the -1 sentinel take the generator's value at the same
# (row, column) position; every other cell keeps its observed value.
# DataFrame.where is given the full 2-D prediction array: assigning a
# flattened 1-D selection through a boolean DataFrame mask is rejected by
# pandas because the replacement must have the frame's shape.
df_train_imputed = df_train_missing.where(df_train_missing != -1, generated_data)

# Same replacement for the test set
df_test_imputed = df_test_missing.where(df_test_missing != -1, generated_data_test)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

# Score the imputation on the test set against the original values.
# df_test_missing was filled with -1 in place earlier, so it holds no NaN and
# isna() cannot locate the masked cells. They are found by comparing the
# masked frame with the untouched df_test instead; cells that are NaN in
# df_test are left out because they have no ground truth. A masked cell whose
# original value already equals the marker cannot be told apart and is not
# scored.
masked_cells = df_test_missing.ne(df_test) & df_test.notna()

for column in df_test.columns:
    column_mask = masked_cells[column]
    if not column_mask.any():
        # Nothing was masked in this column; the metrics below are undefined
        # for empty inputs.
        continue

    original = df_test[column][column_mask]
    # The generator emits continuous values, which the classification metrics
    # reject, so they are rounded to the nearest integer first.
    # NOTE(review): assumes the genotype codes are integers - confirm.
    imputed = np.rint(df_test_imputed[column][column_mask])

    # Accuracy
    accuracy = accuracy_score(original, imputed)
    print(f"Precisión en la imputación de la columna {column}: {accuracy:.4f}")

    # Confusion matrix
    conf_matrix = confusion_matrix(original, imputed)
    print(f"Matriz de confusión para la columna {column}:")
    print(conf_matrix)

    # Classification report
    print(f"Reporte de clasificación para la columna {column}:")
    print(classification_report(original, imputed))