# Prepare some things
## Load some modules

In [30]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../')
from models import get_autoencoder_model

In [3]:
# Disable warnings output (TSNE outputs one very time)
import warnings
warnings.filterwarnings('ignore')

## Load the metabolomic data
(already min-max normalized)

In [80]:
# Read the min-max normalized expression table; the first CSV column becomes the row index.
metabol_exprs = pd.read_csv(
    "ExposomeChallengeData/datasets/metabol_joint_exprs_minmax.csv",
    index_col=0,
)
# Preview the first rows.
metabol_exprs.head()

Unnamed: 0,serum_metab_1,serum_metab_2,serum_metab_3,serum_metab_4,serum_metab_5,serum_metab_6,serum_metab_7,serum_metab_8,serum_metab_9,serum_metab_10,...,urine_metab_35,urine_metab_36,urine_metab_37,urine_metab_38,urine_metab_39,urine_metab_40,urine_metab_41,urine_metab_42,urine_metab_43,urine_metab_44
430,0.224542,0.205024,0.49047,0.653163,0.515652,0.483009,0.249523,0.340524,0.400406,0.574663,...,0.485001,0.628204,0.489431,0.640612,0.391376,0.665946,0.848047,0.360177,0.587031,0.605114
1187,0.402503,0.549489,0.828354,0.323912,0.431621,0.491824,0.376192,0.34833,0.213731,0.308797,...,0.46904,0.816605,0.441328,0.0,0.494475,0.69617,0.413698,0.396026,0.572658,0.619732
940,0.402503,0.556896,0.706195,0.534558,0.544173,0.514375,0.397976,0.856991,0.358113,0.315096,...,0.469738,0.627037,0.519383,0.0,0.326659,0.64506,0.841829,0.0,0.653218,0.66595
936,0.462293,0.578889,0.451112,0.448921,0.49205,0.553183,0.372429,0.544806,0.388943,0.545676,...,0.457602,0.738061,0.290234,0.177238,0.451808,0.487007,0.759121,0.0,0.678092,0.729031
788,0.247137,0.571595,0.568414,0.417697,0.432269,0.553183,0.531642,0.534388,0.536561,0.704749,...,0.452021,0.639165,0.329798,0.354477,0.25021,0.702331,0.778121,0.3455,0.554585,0.661642


In [81]:
# Convert the DataFrame to a plain ndarray (row order is kept as loaded; no shuffling).
# The isinstance guard makes the cell safe to re-run: once `metabol_exprs` is
# already an ndarray, calling `.to_numpy()` again would raise AttributeError.
if isinstance(metabol_exprs, pd.DataFrame):
    metabol_exprs = metabol_exprs.to_numpy()

In [6]:
# `.size` is the total number of elements whatever the number of dimensions.
# `np.multiply(*shape)` only works for exactly two dimensions: with a third one
# the extra value is interpreted as the `out` argument and the call fails.
num_data_points = metabol_exprs.size
print("Shape of the data set:", metabol_exprs.shape)
print("Number of data points:", num_data_points)

Shape of the data set: (1152, 221)
Number of data points: 254592


## Load the categorical variables
(subset of phenotype and covariates data, already codified)

In [7]:
# Load both codified tables the same way (first CSV column as the row index).
phenotype_cat, covariates_cat = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "ExposomeChallengeData/datasets/phenotype_cat.csv",
        "ExposomeChallengeData/datasets/covariates_cat.csv",
    )
)
# Put phenotype and covariate columns side by side, aligned on the row index.
classes = pd.concat([phenotype_cat, covariates_cat], axis=1)
classes.head()

Unnamed: 0,birth_weight,iq,behaviour,asthma,bmi,cohort,age,sex,education,native,parity
1,3,0,3,0,1,3,1,0,1,2,0
2,3,1,3,0,1,3,2,0,2,2,1
3,3,0,3,1,3,3,1,0,2,2,1
4,1,2,3,0,1,1,4,1,0,2,1
5,3,0,1,0,1,2,4,0,0,2,0


In [8]:
# One column per class variable.
print("Number of class variables:", len(classes.columns))

Number of class variables: 11


# Construct AE with 1D convolutional layers

## Prepare the input
- pad with zeros to have shape 224 (divisible by 2 five times)
- add a trailing dimension of size 1 (the channel axis expected by `Conv1D`)

In [93]:
# Zero-pad the feature axis (axis 1) up to the next multiple of 2**5, so the
# length can be halved five times without remainder (221 -> 224).
# The pad width is derived from the current shape instead of being hard-coded,
# which makes the cell idempotent: re-running it on already padded data adds 0.
PAD_MULTIPLE = 2 ** 5
n_pad = (-metabol_exprs.shape[1]) % PAD_MULTIPLE
# Build the pad spec for whatever rank the array has; only axis 1 is padded.
pad_spec = [(0, 0)] * metabol_exprs.ndim
pad_spec[1] = (0, n_pad)
metabol_exprs = np.pad(metabol_exprs, pad_spec, 'constant', constant_values=0)
metabol_exprs.shape

(1152, 224)

In [94]:
# Append a trailing axis of length 1 (the encoder input is declared as (input_dim, 1)).
metabol_exprs = metabol_exprs[..., np.newaxis]
metabol_exprs.shape

(1152, 224, 1)

## Construct the convolutional encoder and decoder

In [125]:
def get_encoder(input_dim, latent_dim, filters=[]):
    """Build the 1D-convolutional encoder.

    The model takes inputs of shape ``(input_dim, 1)``. For every entry of
    ``filters`` it stacks a ``Conv1D`` (kernel size 3, 'same' padding, ReLU)
    followed by a ``MaxPool1D`` of size 2 ('same' padding). The result is
    flattened and projected by a ReLU ``Dense`` layer named ``"z"``.

    Args:
        input_dim: Length of the input sequence.
        latent_dim: Number of units of the latent layer ``"z"``.
        filters: Number of convolution filters for each conv/pool stage, in
            order. An empty sequence yields just Flatten + Dense.

    Returns:
        A Keras ``Model`` named ``"Encoder"``.
    """
    inputs = keras.Input(shape=(input_dim, 1), name="encoder_inputs")

    hidden = inputs
    for n_filters in filters:
        # One stage = convolution, then pooling over windows of 2.
        conv = layers.Conv1D(filters=n_filters, kernel_size=3, padding='same', activation="relu")
        pool = layers.MaxPool1D(pool_size=2, padding='same')
        hidden = pool(conv(hidden))

    flattened = layers.Flatten(name="flatten")(hidden)
    latent = layers.Dense(units=latent_dim, activation="relu", name="z")(flattened)
    return models.Model(inputs=inputs, outputs=latent, name="Encoder")
    
def get_decoder(output_dim, latent_dim, filters=[]):
    """Build the 1D-convolutional decoder matching ``get_encoder``.

    A ``Dense`` layer expands the latent vector to a short sequence, which is
    then upsampled by one ``Conv1DTranspose`` (kernel size 3, stride 2, 'same'
    padding) per entry of ``filters``, taken in reverse order. A final sigmoid
    ``Conv1D`` named ``"recon_x"`` maps the result to a single channel.

    Args:
        output_dim: Length of the reconstructed sequence. Must be divisible by
            ``2 ** len(filters)``.
        latent_dim: Size of the latent input vector.
        filters: Filter counts in encoder order (the same sequence passed to
            ``get_encoder``). An empty sequence yields Dense + output Conv1D.

    Returns:
        A Keras ``Model`` named ``"Decoder"`` with output shape
        ``(output_dim, 1)``.

    Raises:
        ValueError: If ``output_dim`` is not divisible by ``2 ** len(filters)``,
            in which case the stride-2 upsampling cannot reach ``output_dim``.
    """
    n_stages = len(filters)
    # Every stride-2 transposed convolution doubles the sequence length, so the
    # starting length is output_dim / 2**n_stages. (Dividing by 2 * n_stages is
    # only equal to this for one or two stages.)
    upsampling = 2 ** n_stages
    if output_dim % upsampling != 0:
        raise ValueError(
            f"output_dim={output_dim} must be divisible by 2**len(filters)={upsampling}"
        )
    start_length = output_dim // upsampling
    # With no conv stages there is no filter count to take; use a single channel.
    start_channels = filters[-1] if n_stages else 1

    # `shape` must be a tuple; `(latent_dim)` without the comma is a bare int.
    decoder_inputs = keras.Input(shape=(latent_dim,), name="decoder_inputs")
    x = layers.Dense(start_length * start_channels) (decoder_inputs)
    x = layers.Reshape((start_length, start_channels)) (x)
    for f in filters[::-1]:
        x = layers.Conv1DTranspose(f, 3, strides=2, padding="same") (x)
    decoder_outputs = layers.Conv1D(1, 3, activation="sigmoid", padding="same", name="recon_x") (x)
    return models.Model(decoder_inputs, decoder_outputs, name="Decoder")

In [126]:
# Hyper-parameters of the convolutional autoencoder.
LATENT_DIM = 5
N_CLUSTERS = LATENT_DIM  # the number of clusters is tied to the latent size
FILTERS = (4, 16)
# Sequence length fed to the encoder, read from axis 1 of the data array.
INPUT_DIM = metabol_exprs.shape[1]

# Build each half and print its layer summary.
encoder = get_encoder(input_dim=INPUT_DIM, latent_dim=LATENT_DIM, filters=FILTERS)
encoder.summary()

decoder = get_decoder(output_dim=INPUT_DIM, latent_dim=LATENT_DIM, filters=FILTERS)
decoder.summary()

Model: "Encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, 224, 1)]         0         
                                                                 
 conv1d_87 (Conv1D)          (None, 224, 4)            16        
                                                                 
 max_pooling1d_61 (MaxPoolin  (None, 112, 4)           0         
 g1D)                                                            
                                                                 
 conv1d_88 (Conv1D)          (None, 112, 16)           208       
                                                                 
 max_pooling1d_62 (MaxPoolin  (None, 56, 16)           0         
 g1D)                                                            
                                                                 
 flatten (Flatten)           (None, 896)               0   

Number of data points / number of parameters ratio:

In [131]:
# Values in the data matrix (rows x columns) divided by the trainable + non-trainable
# parameter count of encoder and decoder together.
# NOTE(review): if the padding cell has already run, the column count includes the
# zero-padded columns - confirm that this is the intended numerator.
np.prod(metabol_exprs.shape[:2]) / sum(part.count_params() for part in (encoder, decoder))

23.293735331287237