In [92]:
import random as rn
import time

import joblib
from sklearn.preprocessing import RobustScaler, OneHotEncoder, LabelEncoder
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras

from utils import COLUMNS_NAME
from models import make_encoder_model_vae, make_decoder_model_vae

## Load data

In [93]:
# Location of the training dataset: directory and file name are kept apart
# so either can be swapped independently.
datasets_dir = '../../data/datasets/'
dataset_name = 'train_dataset.csv'
dataset_path = ''.join((datasets_dir, dataset_name))

In [94]:
# Read the full training table from `dataset_path` and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, i.e. the CSV carries
# a saved index column that is loaded here as a regular data column.
train_df = pd.read_csv(dataset_path)
train_df.head()

Unnamed: 0,IDX_STUDENT,COD_PROGRAMA,JORNADA,DURACION,PERIODO_INGRESO,SEXO,ESTADO_CIVIL,ESTRATO,RANGO_EDAD,CONDICION_EXCEPCION,...,RANGO_INGRESOS,RANGO_GASTOS,TIPO_VIVIENDA,PUNTAJE_ICFES,PCN,PLC,PMA,PSC,PIN,GRADUADO_A_TIEMPO
0,STUDENT_1367,2711,1,6,6,1,1,2,2,1,...,2,2,1,252,50,49,51,48,55,0
1,STUDENT_1527,2711,2,7,3,1,1,2,4,1,...,1,1,1,250,49,51,51,51,49,0
2,STUDENT_885,2710,2,7,12,1,1,1,2,1,...,2,2,1,240,46,58,48,38,50,0
3,STUDENT_1879,2710,1,6,2,1,1,2,3,1,...,1,1,2,248,48,55,47,48,51,0
4,STUDENT_1717,2710,1,6,6,1,1,2,4,1,...,4,4,4,241,45,53,51,48,44,0


In [95]:
# (number of rows, number of columns) of the loaded training table
train_df.shape

(1603, 21)

### Paths

In [96]:
# Version tag embedded in both the bootstrap-ids input path and the
# model-output path built below.
version = 'v1'

In [97]:
# ----------------------------------------------------------------------------
# Directory holding the per-bootstrap id CSV files for this version
bootstrap_dir = f'../../data/bootstrap_ids/{version}/'

In [98]:
# Root output directory for the models trained on each bootstrap sample
model_name = 'supervised_vae'
models_dir = f'../../outputs/{model_name}/bootstrap_ids/{version}/'

## Bootstrap

In [99]:
# ----------------------------------------------------------------------------
# Seed every random number generator used below (TensorFlow, NumPy and the
# Python standard library) with the same value.
random_seed = 42
for seed_fn in (tf.random.set_seed, np.random.seed, rn.seed):
    seed_fn(random_seed)

In [100]:
# ----------------------------------------------------------------------------
# Number of bootstrap samples; one model pair is trained and saved per sample
n_bootstrap = 10

In [101]:
# ----------------------------------------------------------------------------
# Hyperparameters shared by every bootstrap iteration
batch_size = 64
h_dim = [100, 100]
z_dim = 20
n_epochs = 150

# Cyclical learning rate: lower/upper bounds and the per-cycle decay factor
# applied to the peak of each cycle.
base_lr = 0.0001
max_lr = 0.005
gamma = 0.98

# For every bootstrap sample: fit the preprocessing objects, train an
# encoder/decoder pair on the normalized features (the decoder is additionally
# conditioned on one-hot age and gender), then persist models and preprocessors.
for i_bootstrap in tqdm(range(n_bootstrap)):
    ids_filename = 'cleaned_bootstrap_{:03d}.csv'.format(i_bootstrap)
    ids_path = bootstrap_dir + ids_filename

    bootstrap_model_dir = models_dir + '{:03d}'.format(i_bootstrap) + '/'

    # ------------------------------------------------------------------------
    # Loading data
    dataset_df = pd.read_csv(ids_path)

    # ------------------------------------------------------------------------
    # Features: scaler is fitted on this bootstrap sample only
    x_data = dataset_df[COLUMNS_NAME].values

    scaler = RobustScaler()
    x_data_normalized = scaler.fit_transform(x_data)

    # ------------------------------------------------------------------------
    # Conditioning labels: one-hot encoded age range and gender, concatenated
    age = dataset_df['RANGO_EDAD'].values[:, np.newaxis].astype('float32')
    enc_age = OneHotEncoder(sparse_output=False)
    one_hot_age = enc_age.fit_transform(age)

    gender = dataset_df['SEXO'].values[:, np.newaxis].astype('float32')
    enc_gender = OneHotEncoder(sparse_output=False)
    one_hot_gender = enc_gender.fit_transform(gender)

    y_data = np.concatenate((one_hot_age, one_hot_gender), axis=1).astype('float32')

    # ------------------------------------------------------------------------
    # Create the dataset iterator (shuffle buffer covers the whole sample)
    n_samples = x_data.shape[0]

    train_dataset = tf.data.Dataset.from_tensor_slices((x_data_normalized, y_data))
    train_dataset = train_dataset.shuffle(buffer_size=n_samples)
    train_dataset = train_dataset.batch(batch_size)

    # ------------------------------------------------------------------------
    # Create models; the decoder input is the latent code plus the labels
    n_features = x_data_normalized.shape[1]
    n_labels = y_data.shape[1]

    encoder = make_encoder_model_vae(n_features, h_dim, z_dim)
    decoder = make_decoder_model_vae(z_dim + n_labels, n_features, h_dim)

    # ------------------------------------------------------------------------
    # Define loss function
    # Only the reconstruction MSE is optimized by train_step below.
    mse = tf.keras.losses.MeanSquaredError()

    # ------------------------------------------------------------------------
    # Define optimizer
    # Half a learning-rate cycle lasts two epochs' worth of batches.
    step_size = 2 * np.ceil(n_samples / batch_size)

    ae_optimizer = tf.keras.optimizers.Adam(learning_rate=base_lr)

    # ------------------------------------------------------------------------
    # Training function
    @tf.function
    def train_step(batch_x, batch_y):
        """Run one optimization step on a batch and return its loss.

        Args:
            batch_x: batch of normalized features fed to the encoder.
            batch_y: batch of one-hot labels concatenated to the encoder
                output before decoding.

        Returns:
            The mean squared error between `batch_x` and the decoder output.
        """
        with tf.GradientTape() as ae_tape:
            encoder_output = encoder(batch_x, training=True)
            decoder_output = decoder(tf.concat([encoder_output, batch_y], axis=1), training=True)

            # Autoencoder loss
            ae_loss = mse(batch_x, decoder_output)

        # Encoder and decoder are updated jointly by the same optimizer.
        trainable_vars = encoder.trainable_variables + decoder.trainable_variables
        ae_grads = ae_tape.gradient(ae_loss, trainable_vars)
        ae_optimizer.apply_gradients(zip(ae_grads, trainable_vars))

        return ae_loss

    # ------------------------------------------------------------------------
    # Training loop
    global_step = 0

    for epoch in range(n_epochs):
        start = time.time()

        # Running mean of the batch losses for this epoch (not reported).
        epoch_ae_loss_avg = tf.metrics.Mean()

        for batch_x, batch_y in train_dataset:
            global_step = global_step + 1

            # Triangular cyclical learning rate: ramps linearly from base_lr
            # to the peak over `step_size` steps and back down over the next
            # `step_size`; the peak amplitude is scaled by gamma ** cycle.
            cycle = np.floor(1 + global_step / (2 * step_size))
            x_lr = np.abs(global_step / step_size - 2 * cycle + 1)
            clr = base_lr + (max_lr - base_lr) * max(0, 1 - x_lr) * (gamma ** cycle)

            # Assign through `learning_rate`, the documented optimizer
            # attribute. `lr` is only a legacy alias: on Keras releases that
            # do not define it, `optimizer.lr = ...` merely creates an unused
            # Python attribute and the schedule silently has no effect.
            ae_optimizer.learning_rate = float(clr)

            ae_loss = train_step(batch_x, batch_y)

            epoch_ae_loss_avg(ae_loss)

        epoch_time = time.time() - start

    # ------------------------------------------------------------------------
    # Save models
    final_model_dir = bootstrap_model_dir + 'model/'

    encoder.save(final_model_dir + 'encoder.h5')
    decoder.save(final_model_dir + 'decoder.h5')

    # Save the fitted preprocessing objects alongside the models
    joblib.dump(scaler, final_model_dir + 'scaler.joblib')

    joblib.dump(enc_age, final_model_dir + 'age_encoder.joblib')
    joblib.dump(enc_gender, final_model_dir + 'gender_encoder.joblib')

  0%|          | 0/10 [00:00<?, ?it/s]



 10%|█         | 1/10 [00:39<05:58, 39.82s/it]



 20%|██        | 2/10 [01:17<05:10, 38.79s/it]



 30%|███       | 3/10 [01:56<04:30, 38.60s/it]



 40%|████      | 4/10 [02:35<03:52, 38.72s/it]



 50%|█████     | 5/10 [03:14<03:14, 38.86s/it]



 60%|██████    | 6/10 [03:50<02:31, 37.96s/it]



 70%|███████   | 7/10 [04:28<01:53, 37.99s/it]



 80%|████████  | 8/10 [05:07<01:16, 38.30s/it]



 90%|█████████ | 9/10 [05:44<00:37, 37.87s/it]



100%|██████████| 10/10 [06:22<00:00, 38.24s/it]
