# Prepare some things
## Load some modules

In [1]:
import tensorflow as tf
from tensorflow import keras

import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../')
from models import get_autoencoder_model

In [3]:
# Disable warnings output (TSNE outputs one every time)
import warnings
warnings.filterwarnings('ignore')

## Load the metabolomic data
(already min-max normalized)

In [30]:
# Read the metabolite expression table; the first CSV column becomes the row index.
metabol_exprs = pd.read_csv(
    "ExposomeChallengeData/datasets/metabol_joint_exprs_minmax.csv",
    index_col=0,
)
metabol_exprs.head()

Unnamed: 0,serum_metab_1,serum_metab_2,serum_metab_3,serum_metab_4,serum_metab_5,serum_metab_6,serum_metab_7,serum_metab_8,serum_metab_9,serum_metab_10,...,urine_metab_35,urine_metab_36,urine_metab_37,urine_metab_38,urine_metab_39,urine_metab_40,urine_metab_41,urine_metab_42,urine_metab_43,urine_metab_44
430,0.224542,0.205024,0.49047,0.653163,0.515652,0.483009,0.249523,0.340524,0.400406,0.574663,...,0.485001,0.628204,0.489431,0.640612,0.391376,0.665946,0.848047,0.360177,0.587031,0.605114
1187,0.402503,0.549489,0.828354,0.323912,0.431621,0.491824,0.376192,0.34833,0.213731,0.308797,...,0.46904,0.816605,0.441328,0.0,0.494475,0.69617,0.413698,0.396026,0.572658,0.619732
940,0.402503,0.556896,0.706195,0.534558,0.544173,0.514375,0.397976,0.856991,0.358113,0.315096,...,0.469738,0.627037,0.519383,0.0,0.326659,0.64506,0.841829,0.0,0.653218,0.66595
936,0.462293,0.578889,0.451112,0.448921,0.49205,0.553183,0.372429,0.544806,0.388943,0.545676,...,0.457602,0.738061,0.290234,0.177238,0.451808,0.487007,0.759121,0.0,0.678092,0.729031
788,0.247137,0.571595,0.568414,0.417697,0.432269,0.553183,0.531642,0.534388,0.536561,0.704749,...,0.452021,0.639165,0.329798,0.354477,0.25021,0.702331,0.778121,0.3455,0.554585,0.661642


In [31]:
# Convert the table to a plain array and shuffle its rows in place.
# The training cells pass `validation_split` to `fit`, which holds out the
# trailing part of the array, so the row order decides which samples are used
# for validation. A fixed seed keeps that split, and therefore the reported
# losses, comparable between runs.
# `copy=True` guarantees a writable array that does not alias the DataFrame's
# own buffer, so the in-place shuffle cannot fail on a read-only view.
# NOTE(review): the row index of the DataFrame is dropped here, so rows can no
# longer be matched by position with other tables — confirm nothing downstream
# relies on that alignment.
SHUFFLE_SEED = 42
metabol_exprs = metabol_exprs.to_numpy(copy=True)
np.random.default_rng(SHUFFLE_SEED).shuffle(metabol_exprs)

In [32]:
# Total number of scalar values in the matrix (product of its dimensions);
# used further down to relate the amount of data to the model size.
num_data_points = np.prod(metabol_exprs.shape)
print(f"Shape of the data set: {metabol_exprs.shape}")
print(f"Number of data points: {num_data_points}")

Shape of the data set: (1152, 221)
Number of data points: 254592


## Load the categorical variables
(subset of phenotype and covariates data, already codified)

In [6]:
# Read both coded tables (first CSV column as row index) and place their
# columns side by side in a single frame.
phenotype_cat = pd.read_csv(
    "ExposomeChallengeData/datasets/phenotype_cat.csv",
    index_col=0,
)
covariates_cat = pd.read_csv(
    "ExposomeChallengeData/datasets/covariates_cat.csv",
    index_col=0,
)
classes = pd.concat([phenotype_cat, covariates_cat], axis="columns")
classes.head()

Unnamed: 0,birth_weight,iq,behaviour,asthma,bmi,cohort,age,sex,education,native,parity
1,3,0,3,0,1,3,1,0,1,2,0
2,3,1,3,0,1,3,2,0,2,2,1
3,3,0,3,1,3,3,1,0,2,2,1
4,1,2,3,0,1,1,4,1,0,2,1
5,3,0,1,0,1,2,4,0,0,2,0


In [7]:
# One class variable per column of the combined table.
print("Number of class variables:", len(classes.columns))

Number of class variables: 11


# Try AE with different hyperparameters

In [9]:
def dec_callback(epoch, logs):
    """Epoch-end hook that periodically refreshes ``p`` and checks for convergence.

    On every ``SCHEDULE``-th epoch the soft assignments of ``metabol_exprs``
    are recomputed with ``model_dec``, the global ``p`` is rebuilt from them
    via ``compute_p``, and the arg-max assignments are compared with the
    previous ones (``c_last``) through ``compute_delta``. Training of
    ``model_dec`` is stopped once that delta falls below ``DELTA_THRESHOLD``.

    NOTE(review): ``SCHEDULE``, ``DELTA_THRESHOLD``, ``model_dec``,
    ``compute_p``, ``compute_delta`` and an initial ``c_last`` are not defined
    in the visible part of this notebook; they must exist before the callback
    fires.

    Args:
        epoch: Epoch index passed in by the callback machinery.
        logs: Metrics mapping passed in by the callback machinery (unused).
    """
    global p
    global c_last

    # Nothing to do unless a multiple of SCHEDULE epochs has been completed
    # (assuming the epoch index starts at 0, hence the +1).
    if (epoch + 1) % SCHEDULE != 0:
        return

    soft_assignments = model_dec.soft_assignment(metabol_exprs)
    p = compute_p(soft_assignments)

    # Hard assignment = index of the largest soft assignment per sample.
    hard_assignments = soft_assignments.numpy().argmax(1)
    delta = compute_delta(hard_assignments, c_last)
    c_last = np.copy(hard_assignments)
    if delta < DELTA_THRESHOLD:
        model_dec.stop_training = True


callback = tf.keras.callbacks.LambdaCallback(on_epoch_end=dec_callback)

In [43]:
# Model dimensions: the input width follows the data, and the number of
# clusters is tied to the latent size.
INPUT_DIM = metabol_exprs.shape[1]
LATENT_DIM = 5
N_CLUSTERS = LATENT_DIM

# Optimizer settings: SGD with momentum.
LEARNING_RATE = 1
MOMENTUM = 0.8
optimizer = keras.optimizers.SGD(
    learning_rate=LEARNING_RATE,
    momentum=MOMENTUM,
)

## Number and dimensions of intermediate layers

Let's try different numbers and dimensions of intermediate layers:

In [43]:
EPOCHS = 50
BATCH_SIZE = 32
N_REPEATS = 5  # independent trainings averaged per architecture
INTERMEDIATE_DIMS = [(512, 512, 2048), # same as in MNIST, for reference
                     (64, 32),
                     (32, 16),
                     (16, 4),
                     (16, 16, 64),
                     (16, 16, 128),
                     (16, 16, 256),
                     (32, 32, 64),
                     (32, 32, 128),
                     (32, 32, 256)
                    ]

# Train every architecture N_REPEATS times from scratch and report the mean
# of the final-epoch training loss, validation loss and their ratio.
for dims in INTERMEDIATE_DIMS:
    loss = []
    val_loss = []
    loss_ratio = []
    for i in range(N_REPEATS):
        model_ae = get_autoencoder_model(INPUT_DIM, LATENT_DIM, dims)
        # Data volume relative to the total number of encoder + decoder weights.
        data_params_ratio = num_data_points / (model_ae.encoder.count_params() +
                                               model_ae.decoder.count_params())
        if i == 0:
            print(f'{dims} [data/params ratio: {data_params_ratio:.1f}]', end=" ")

        # Every model gets its own optimizer, cloned from the shared settings.
        # Reusing one optimizer instance across models carries its internal
        # state (step counter, momentum accumulators) from one run into the
        # next, and recent Keras versions reject an optimizer that was already
        # built for another model's variables.
        run_optimizer = type(optimizer).from_config(optimizer.get_config())
        model_ae.compile(optimizer=run_optimizer, loss="mse")
        history = model_ae.fit(metabol_exprs, metabol_exprs,
                               epochs=EPOCHS,
                               batch_size=BATCH_SIZE,
                               validation_split=0.2,
                               verbose=0)
        # Keep only the values reached at the last epoch of this run.
        loss.append(history.history['loss'][-1])
        val_loss.append(history.history['val_loss'][-1])
        loss_ratio.append(loss[-1] / val_loss[-1])

    print(f"[loss: {np.mean(loss):.4f}, val_loss: {np.mean(val_loss):.4f}, ratio: {np.mean(loss_ratio):.4f}]")

(512, 512, 2048) [data/params ratio: 0.1] [loss: 0.0134, val_loss: 0.0134, ratio: 0.9999]
(64, 32) [data/params ratio: 7.7] [loss: 0.0138, val_loss: 0.0138, ratio: 0.9966]
(32, 16) [data/params ratio: 16.3] [loss: 0.0155, val_loss: 0.0152, ratio: 1.0155]
(16, 4) [data/params ratio: 33.9] [loss: 0.0161, val_loss: 0.0158, ratio: 1.0227]
(16, 16, 64) [data/params ratio: 23.8] [loss: 0.0164, val_loss: 0.0160, ratio: 1.0278]
(16, 16, 128) [data/params ratio: 18.9] [loss: 0.0157, val_loss: 0.0155, ratio: 1.0170]
(16, 16, 256) [data/params ratio: 13.3] [loss: 0.0158, val_loss: 0.0155, ratio: 1.0178]
(32, 32, 64) [data/params ratio: 11.9] [loss: 0.0152, val_loss: 0.0150, ratio: 1.0145]
(32, 32, 128) [data/params ratio: 9.7] [loss: 0.0155, val_loss: 0.0153, ratio: 1.0147]
(32, 32, 256) [data/params ratio: 7.1] [loss: 0.0156, val_loss: 0.0154, ratio: 1.0147]


Leaving aside the MNIST reference architecture, the best loss is achieved with only two intermediate layers (dims: (64, 32)), but its data/params ratio is very low, which can make the model overfit easily.

The model with dimensions (16, 16, 128) gets a slightly higher loss, but with a better data/params ratio.

By training for more epochs, the difference in loss can be reduced:

In [49]:
EPOCHS = 200
N_REPEATS = 5  # independent trainings averaged per architecture
INTERMEDIATE_DIMS = [(64, 32),
                     (16, 16, 128),
                     (32, 32, 64),
                    ]

# Train every shortlisted architecture N_REPEATS times from scratch and report
# the mean of the final-epoch training loss, validation loss and their ratio.
for dims in INTERMEDIATE_DIMS:
    loss = []
    val_loss = []
    loss_ratio = []
    for i in range(N_REPEATS):
        model_ae = get_autoencoder_model(INPUT_DIM, LATENT_DIM, dims)
        # Data volume relative to the total number of encoder + decoder weights.
        data_params_ratio = num_data_points / (model_ae.encoder.count_params() +
                                               model_ae.decoder.count_params())
        if i == 0:
            print(f'{dims} [data/params ratio: {data_params_ratio:.1f}]', end=" ")

        # Every model gets its own optimizer, cloned from the shared settings.
        # Reusing one optimizer instance across models carries its internal
        # state (step counter, momentum accumulators) from one run into the
        # next, and recent Keras versions reject an optimizer that was already
        # built for another model's variables.
        run_optimizer = type(optimizer).from_config(optimizer.get_config())
        model_ae.compile(optimizer=run_optimizer, loss="mse")
        history = model_ae.fit(metabol_exprs, metabol_exprs,
                               epochs=EPOCHS,
                               batch_size=BATCH_SIZE,
                               validation_split=0.2,
                               verbose=0)
        # Keep only the values reached at the last epoch of this run.
        loss.append(history.history['loss'][-1])
        val_loss.append(history.history['val_loss'][-1])
        loss_ratio.append(loss[-1] / val_loss[-1])

    print(f"[loss: {np.mean(loss):.4f}, val_loss: {np.mean(val_loss):.4f}, ratio: {np.mean(loss_ratio):.4f}]")

(64, 32) [data/params ratio: 7.7] [loss: 0.0131, val_loss: 0.0131, ratio: 0.9982]
(16, 16, 128) [data/params ratio: 18.9] [loss: 0.0138, val_loss: 0.0138, ratio: 1.0004]
(32, 32, 64) [data/params ratio: 11.9] [loss: 0.0128, val_loss: 0.0128, ratio: 1.0002]


I'll select the combination (16, 16, 128), since it achieves the best trade-off between data/params ratio and loss.

## Batch size

Let's now try different batch sizes to see if there are any big differences:

In [52]:
EPOCHS = 50
N_REPEATS = 5  # independent trainings averaged per batch size
INTERMEDIATE_DIMS = (16, 16, 128)
BATCH_SIZES = (16, 32, 64, 128)

# Train the selected architecture N_REPEATS times per batch size and report
# the mean of the final-epoch training loss, validation loss and their ratio.
for batch_size in BATCH_SIZES:
    loss = []
    val_loss = []
    loss_ratio = []
    print(f'batch size: {batch_size}', end=" ")
    for i in range(N_REPEATS):
        model_ae = get_autoencoder_model(INPUT_DIM, LATENT_DIM, INTERMEDIATE_DIMS)
        # Every model gets its own optimizer, cloned from the shared settings.
        # Reusing one optimizer instance across models carries its internal
        # state (step counter, momentum accumulators) from one run into the
        # next, and recent Keras versions reject an optimizer that was already
        # built for another model's variables.
        run_optimizer = type(optimizer).from_config(optimizer.get_config())
        model_ae.compile(optimizer=run_optimizer, loss="mse")
        history = model_ae.fit(metabol_exprs, metabol_exprs,
                               epochs=EPOCHS,
                               batch_size=batch_size,
                               validation_split=0.2,
                               verbose=0)
        # Keep only the values reached at the last epoch of this run.
        loss.append(history.history['loss'][-1])
        val_loss.append(history.history['val_loss'][-1])
        loss_ratio.append(loss[-1] / val_loss[-1])

    print(f"[loss: {np.mean(loss):.4f}, val_loss: {np.mean(val_loss):.4f}, ratio: {np.mean(loss_ratio):.4f}]")

batch size: 16 [loss: 0.0151, val_loss: 0.0149, ratio: 1.0099]
batch size: 32 [loss: 0.0160, val_loss: 0.0156, ratio: 1.0195]
batch size: 64 [loss: 0.0179, val_loss: 0.0171, ratio: 1.0464]
batch size: 128 [loss: 0.0192, val_loss: 0.0179, ratio: 1.0686]


Smaller batch sizes achieve a lower loss, but the training process also takes longer.

Both effects can be counteracted by choosing a different number of epochs, so the batch size doesn't seem to be critical.

I'll select a batch size of 32 and vary the number of epochs depending on how long it takes for the loss to converge.

# Selected parameters

Based on the results, I decided to select the following parameters:
- Intermediate dimensions: (16, 16, 128)
- Batch size: 32