# Train a VAE on Cell Painting LINCS Data

In [2]:
import sys
import pathlib
import numpy as np
import pandas as pd

from tensorflow import keras

from pycytominer.cyto_utils import infer_cp_features

sys.path.insert(0, "../scripts")
from utils import load_data
from vae import VAE

Using TensorFlow backend.


In [3]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [4]:
np.random.seed(123)

<IPython.core.display.Javascript object>

In [5]:
data_splits = ["train", "test"]
data_dict = load_data(data_splits)

<IPython.core.display.Javascript object>

In [6]:
# Prepare data for training
meta_features = infer_cp_features(data_dict["train"], metadata=True)
cp_features = infer_cp_features(data_dict["train"])

train_features_df = data_dict["train"].reindex(cp_features, axis="columns")
train_meta_df = data_dict["train"].reindex(meta_features, axis="columns")

test_features_df = data_dict["test"].reindex(cp_features, axis="columns")
test_meta_df = data_dict["test"].reindex(meta_features, axis="columns")

<IPython.core.display.Javascript object>

In [7]:
print(train_features_df.shape)
train_features_df.head(3)

(8164, 588)


Unnamed: 0,Cells_AreaShape_FormFactor,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_5_1,Cells_AreaShape_Zernike_6_2,Cells_AreaShape_Zernike_7_1,Cells_AreaShape_Zernike_7_7,Cells_AreaShape_Zernike_9_3,Cells_Correlation_Correlation_DNA_AGP,...,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumEntropy_AGP_10_0,Nuclei_Texture_SumEntropy_AGP_20_0,Nuclei_Texture_SumEntropy_AGP_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,0.58139,0.58943,0.52275,0.47218,0.59942,0.33731,0.46867,0.86056,0.63262,0.69859,...,0.13771,0.27309,0.25188,0.26658,0.64382,0.54983,0.45665,0.39704,0.4339,0.4152
1,0.61141,0.63239,0.64982,0.51768,0.56649,0.28735,0.48172,0.87469,0.6585,0.64112,...,0.11049,0.17156,0.1529,0.16573,0.56529,0.45087,0.45219,0.31356,0.36162,0.33429
2,0.48886,0.61813,0.55293,0.4513,0.59392,0.37508,0.43175,0.76606,0.64708,0.69571,...,0.20347,0.22407,0.21294,0.22257,0.58874,0.50065,0.4247,0.32341,0.35135,0.33536


<IPython.core.display.Javascript object>

In [8]:
print(test_features_df.shape)
test_features_df.head(3)

(1037, 588)


Unnamed: 0,Cells_AreaShape_FormFactor,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_5_1,Cells_AreaShape_Zernike_6_2,Cells_AreaShape_Zernike_7_1,Cells_AreaShape_Zernike_7_7,Cells_AreaShape_Zernike_9_3,Cells_Correlation_Correlation_DNA_AGP,...,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumEntropy_AGP_10_0,Nuclei_Texture_SumEntropy_AGP_20_0,Nuclei_Texture_SumEntropy_AGP_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,0.5658,0.64874,0.69447,0.46048,0.5178,0.29006,0.45754,0.88344,0.53493,0.36377,...,0.22196,0.21631,0.18994,0.23943,0.50994,0.42373,0.41825,0.24933,0.34295,0.2946
1,0.50517,0.68001,0.66229,0.48497,0.49032,0.23165,0.5165,0.85594,0.53245,0.85154,...,0.91301,0.58116,0.55981,0.61404,0.40951,0.30107,0.49368,0.16326,0.2561,0.20783
2,0.62075,0.60104,0.66595,0.5044,0.53494,0.17595,0.49983,0.91533,0.51284,0.42192,...,0.19964,0.1991,0.18299,0.22215,0.57422,0.49592,0.4044,0.29218,0.38456,0.33795


<IPython.core.display.Javascript object>

In [9]:
encoder_architecture = [100]
decoder_architecture = [100]

<IPython.core.display.Javascript object>

In [10]:
cp_vae = VAE(
    input_dim=train_features_df.shape[1],
    latent_dim=20,
    encoder_architecture=encoder_architecture,
    decoder_architecture=decoder_architecture,
    beta=2,
    verbose=False,
)
cp_vae.compile_vae()

  'be expecting any data to be passed to {0}.'.format(name))


<IPython.core.display.Javascript object>

In [13]:
cp_vae.train(x_train=train_features_df, x_test=test_features_df)

<IPython.core.display.Javascript object>

In [14]:
cp_vae.vae

<keras.engine.training.Model at 0x7f8024689c10>

<IPython.core.display.Javascript object>

In [None]:
# Save training performance
history_df = pd.DataFrame(cp_vae.vae.history.history)
history_df = history_df.assign(
    num_components=latent_dim,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
    kappa=kappa,
    seed=seed,
    depth=depth,
    first_layer=first_layer,
    dataset=dataset
)
history_df.to_csv(output_filename, sep='\t')

In [11]:
cp_vae.vae.history.history

{'val_loss': [412.15114776699613,
  401.303383690182,
  399.8858876242127,
  406.778304685852,
  404.0603034406642,
  402.91872389084347,
  401.6807797173525,
  395.7157830263173,
  392.8384124255571,
  391.2090535619105,
  389.4218528107484,
  387.87752948639593,
  387.5745362269844,
  386.96624361514586,
  386.9917402423796,
  386.8040701738319,
  386.59356245079516,
  386.53352124941614,
  386.46870532720965,
  386.3744880286339,
  386.43877573482325,
  386.3163669920933,
  386.2461667332148,
  386.3137052530493,
  386.2920891845468,
  386.3242901734929,
  386.41841744962744,
  386.2678305645228,
  386.3713796499725,
  386.16381418049735,
  386.44335716784633,
  386.35123298667094,
  386.0293076164812,
  386.33599120740604,
  386.3971436076592,
  386.377160214872,
  386.3051806369881,
  386.3521435405617,
  386.32454005271745,
  386.3564197095174,
  386.3353360359096,
  386.1256328939587,
  385.96794059260486,
  386.3464080015972,
  386.3516514630074,
  385.9194096976329,
  386.2593

<IPython.core.display.Javascript object>