In [168]:
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm

import os

from utils import COLUMNS_NAME

## Paths

In [169]:
# Version tag; used further down as the sub-directory name under the model's
# 'bootstrap_ids/' output folder.
version = 'v1'

In [170]:
# Directory containing the dataset CSV files (relative path)
datasets_dir = '../../data/datasets/'

In [171]:
    dataset_name = 'test_dataset'
dataset_path = '../../data/datasets/' + dataset_name + '.csv'

In [172]:
# ----------------------------------------------------------------------------

In [173]:
# Name of the model whose outputs are evaluated.
model_name = 'supervised_vae'

# Root folder of the bootstrap runs for this model.
bootstrap_dir = ''.join(['../../outputs/', model_name, '/bootstrap_ids/'])
# Folder of the runs that belong to the selected `version`.
model_dir = ''.join([bootstrap_dir, version, '/'])

## Load dataset

In [174]:
# Read the dataset CSV into memory and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer=dataset_path)
test_df.head(n=5)

Unnamed: 0,IDX_STUDENT,COD_PROGRAMA,JORNADA,DURACION,PERIODO_INGRESO,SEXO,ESTADO_CIVIL,ESTRATO,RANGO_EDAD,CONDICION_EXCEPCION,...,RANGO_INGRESOS,RANGO_GASTOS,TIPO_VIVIENDA,PUNTAJE_ICFES,PCN,PLC,PMA,PSC,PIN,GRADUADO_A_TIEMPO
0,STUDENT_1093,2711,1,6,15,0,1,2,3,1,...,2,2,1,262,55,54,45,58,50,0
1,STUDENT_628,2131,1,6,14,1,1,1,4,1,...,1,1,2,290,66,58,59,57,50,0
2,STUDENT_1267,2131,1,6,15,0,1,2,2,1,...,3,3,1,304,67,55,63,55,64,0
3,STUDENT_288,2721,2,7,13,1,1,2,4,1,...,4,4,1,173,55,49,69,0,0,0
4,STUDENT_437,2711,2,7,9,1,2,2,4,1,...,2,2,1,262,59,55,50,46,51,0


In [175]:
# Sanity check: (rows, columns) of the loaded test frame
test_df.shape

(348, 21)

## Bootstrap iteration

In [176]:
# Number of bootstrap iterations to evaluate; iteration i uses the folder
# named '{:03d}'.format(i) under `model_dir`.
n_bootstrap = 10

In [177]:
# ----------------------------------------------------------------------------
# Seed both NumPy's and TensorFlow's global random generators with one value
random_seed = 42
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [178]:
# ----------------------------------------------------------------------------
# Evaluate every bootstrap model on the test set.
#
# For each bootstrap iteration the encoder/decoder pair and the gender encoder
# are loaded from '<model_dir>/<iii>/model/' and four CSV files are written to
# '<model_dir>/<iii>/outputs/':
#   normalized.csv, reconstruction.csv, encoded.csv, reconstruction_error.csv
def _frame_with_student_ids(student_ids, value_columns, values):
    """Build a frame with an 'IDX_STUDENT' column followed by `value_columns`.

    Parameters
    ----------
    student_ids : pandas.Series
        Student identifiers; the resulting frame takes its index from them.
    value_columns : label or list of labels
        Name(s) of the column(s) that receive `values`.
    values : array-like
        Data assigned to `value_columns` (2-D for a list of labels, 1-D for a
        single label).

    Returns
    -------
    pandas.DataFrame
    """
    labels = value_columns if isinstance(value_columns, list) else [value_columns]
    frame = pd.DataFrame(columns=['IDX_STUDENT'] + labels)
    frame['IDX_STUDENT'] = student_ids
    frame[value_columns] = values
    return frame


# Inputs that do not depend on the bootstrap iteration are extracted once.
student_ids = test_df['IDX_STUDENT']
x_dataset = test_df[COLUMNS_NAME].values
# Column vector (n_samples, 1), the layout passed to `enc_gender.transform`.
gender = test_df['SEXO'].values[:, np.newaxis].astype('float32')
#age = test_df['RANGO_EDAD'].values[:, np.newaxis].astype('float32')

for i_bootstrap in tqdm(range(n_bootstrap)):
    bootstrap_model_dir = model_dir + '{:03d}'.format(i_bootstrap) + '/'
    output_dataset_dir = bootstrap_model_dir + 'outputs/'

    # ----------------------------------------------------------------------------
    # Load the artefacts of this iteration first, so that a missing model
    # folder fails here and no output directory is created for it.
    final_model_dir = bootstrap_model_dir + 'model/'

    encoder = keras.models.load_model(final_model_dir + 'encoder.h5', compile=False)
    decoder = keras.models.load_model(final_model_dir + 'decoder.h5', compile=False)

    #scaler = joblib.load(final_model_dir + 'scaler.joblib')
    #enc_age = joblib.load(final_model_dir + 'age_encoder.joblib')
    # NOTE(review): joblib.load unpickles the file -- only load trusted artefacts.
    enc_gender = joblib.load(final_model_dir + 'gender_encoder.joblib')

    # Idempotent directory creation (no check-then-create race).
    os.makedirs(output_dataset_dir, exist_ok=True)

    # ----------------------------------------------------------------------------
    # The scaler step is disabled, so the features are used unscaled and
    # 'normalized.csv' holds the raw feature values.
    #x_normalized = scaler.transform(x_dataset)
    x_normalized = x_dataset

    normalized_df = _frame_with_student_ids(student_ids, COLUMNS_NAME, x_normalized)
    normalized_df.to_csv(output_dataset_dir + 'normalized.csv', index=False)

    # ----------------------------------------------------------------------------
    # Conditioning input for the decoder: the encoded gender only (the age
    # encoding is disabled).
    #one_hot_age = enc_age.transform(age)
    one_hot_gender = enc_gender.transform(gender)

    #y_data = np.concatenate((one_hot_age, one_hot_gender), axis=1).astype('float32')
    y_data = one_hot_gender.astype('float32')

    # ----------------------------------------------------------------------------
    # Encode the features, then decode the encoding concatenated with `y_data`.
    encoded = encoder(x_normalized, training=False)
    reconstruction = decoder(tf.concat([encoded, y_data], axis=1), training=False)

    reconstruction_df = _frame_with_student_ids(student_ids, COLUMNS_NAME, reconstruction.numpy())
    reconstruction_df.to_csv(output_dataset_dir + 'reconstruction.csv', index=False)

    # The encoding columns are labelled 0..k-1.
    encoded_columns = list(range(encoded.shape[1]))
    encoded_df = _frame_with_student_ids(student_ids, encoded_columns, encoded.numpy())
    encoded_df.to_csv(output_dataset_dir + 'encoded.csv', index=False)

    # ----------------------------------------------------------------------------
    # Per-student mean squared error over the feature columns (axis=1).
    reconstruction_error = np.mean((x_normalized - reconstruction) ** 2, axis=1)

    reconstruction_error_df = _frame_with_student_ids(
        student_ids, 'Reconstruction error', reconstruction_error
    )
    reconstruction_error_df.to_csv(output_dataset_dir + 'reconstruction_error.csv', index=False)

100%|██████████| 10/10 [00:07<00:00,  1.26it/s]
