# Apply Cell Health Models to Repurposing Set

**Gregory Way, 2019**

The models are trained to predict cell health phenotypes.
Here, I apply the models to Cell Painting data from the repurposing set.

I will use these predictions to identify compound perturbation signatures of cell health impact.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from joblib import load
import umap

from pycytominer.consensus import modz

sys.path.append("../3.train")
from scripts.ml_utils import load_train_test, load_models

In [2]:
# Render matplotlib figures inline in the notebook.
# NOTE(review): no plotting calls appear in the cells shown here - confirm this magic is still needed.
%matplotlib inline

In [3]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# The UMAP reducers created later in this notebook pass their own `random_state`.
np.random.seed(123)

## 1) Load Models and Training Data

In [4]:
# Fitted models are read from the training module's output folder
model_dir = os.path.join("..", "3.train", "models")

# Models (and their coefficients) loaded with the default arguments
model_dict, model_coef = load_models(model_dir=model_dir)

# Same loader called with shuffle=True
# (presumably the models fit to shuffled data - confirm in scripts/ml_utils.py)
shuffle_model_dict, shuffle_model_coef = load_models(
    model_dir=model_dir,
    shuffle=True,
)

In [5]:
# Training and testing splits live next to the models in the training module.
# They are requested with drop_metadata=True.
data_dir = os.path.join("..", "3.train", "data")
x_train_df, x_test_df, y_train_df, y_test_df = load_train_test(
    data_dir=data_dir,
    drop_metadata=True,
)

## 2) Extract Repurposing Data Files

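**NOTE** - these files are not yet public! The next cell points to a local copy of the profiles, so this section can only be re-run with access to that data.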

In [6]:
# List drug repurposing data
repurposing_project_id = "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad"

# Default location of the per-plate profiles (an absolute path on the
# original author's machine)
default_repurposing_profile_dir = os.path.join(
    "/Users",
    "gway",
    "work",
    "projects",
    repurposing_project_id,
    "workspace",
    "software",
    repurposing_project_id,
    "subsampling",
    "data",
    "profiles",
)

# Allow the location to be changed without editing the notebook: set the
# REPURPOSING_PROFILE_DIR environment variable to point at another copy of
# the profiles. When it is unset, the original path is used unchanged.
repurposing_profile_dir = os.environ.get(
    "REPURPOSING_PROFILE_DIR", default_repurposing_profile_dir
)

In [7]:
# Build a single data frame that holds all profiles
plate_info = {}
all_dfs = []
all_metadata_dfs = []

# os.listdir returns entries in arbitrary order; sort them so the row order of
# the combined profiles is the same on every run and every filesystem
all_plates = sorted(os.listdir(repurposing_profile_dir))
for plate in all_plates:
    plate_dir = os.path.join(repurposing_profile_dir, plate, "n_all")
    norm_file = os.path.join(plate_dir, "{}_subsample_all_normalized.csv".format(plate))

    # Record the expected file for every directory entry, found or not
    plate_info[plate] = norm_file

    # Entries without a normalized profile file are skipped
    if not os.path.exists(norm_file):
        continue

    df = pd.read_csv(norm_file)

    # Put the features in the column order of x_test_df; columns that are
    # absent from this plate, and any missing values, become 0
    feature_df = df.reindex(x_test_df.columns, axis="columns").fillna(0)
    metadata_df = df.loc[:, df.columns.str.startswith("Metadata_")]

    all_dfs.append(feature_df)
    all_metadata_dfs.append(metadata_df)

In [8]:
# Stack the per-plate frames row-wise; features and metadata keep matching row labels
all_df = pd.concat(all_dfs, axis="index")
all_metadata_df = pd.concat(all_metadata_dfs, axis="index")

# Place metadata to the left of the features, then renumber the rows
complete_df = pd.concat([all_metadata_df, all_df], axis=1)
complete_df = complete_df.reset_index(drop=True)

print(complete_df.shape)
complete_df.head()

(51839, 1283)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,C-7161-01-LM6-027,,,,DMSO,0.235021,0.753198,-1.498757,0.0,-0.142916,...,0.677879,-0.005078,0.096397,-0.040741,-0.613751,-0.572386,-0.610598,-0.453982,-0.307237,-0.429322
1,C-7161-01-LM6-027,,,,DMSO,0.219177,0.406902,1.150688,0.0,-0.116611,...,0.324203,-0.055949,0.109314,-0.075406,-0.418682,-0.430327,-0.473034,-0.145102,0.054124,-0.303347
2,C-7161-01-LM6-027,,,,DMSO,0.288078,0.776809,1.805348,0.0,-0.149218,...,0.568386,-0.023348,0.037226,0.048057,-0.357287,-0.420457,-0.390093,-0.193983,-0.023151,-0.29085
3,C-7161-01-LM6-027,,,,DMSO,0.203886,0.965698,-0.93652,0.0,-0.109647,...,0.422796,-0.153516,-0.063628,-0.099642,-0.317077,-0.323075,-0.333247,0.169582,0.333924,0.068789
4,C-7161-01-LM6-027,,,,DMSO,-0.155539,0.58005,0.796402,0.0,0.227168,...,0.71017,0.113111,0.188868,0.098013,-0.160567,-0.229516,-0.161166,-0.009321,0.081673,-0.051178


In [9]:
# Round dose information and remove samples with low dose representation
# Note: Need to revisit this, perhaps there is a better way.
# (Hamdah and I chatted about alternative strategies)

# A rounded dose is kept only if more than this many profiles share it
min_profiles_per_dose = 100

# Missing doses are treated as 0 before rounding to one decimal place
rounded_dose = complete_df.Metadata_mmoles_per_liter.fillna(0).round(1)

dose_counts = rounded_dose.value_counts()
doses = dose_counts[dose_counts > min_profiles_per_dose].index.tolist()

# Build the filtered frame in a single chain instead of assigning columns onto
# the result of a filter, which is the pattern pandas flags with
# SettingWithCopyWarning
complete_df = (
    complete_df
    .assign(Metadata_mmoles_per_liter=rounded_dose)
    .loc[lambda frame: frame.Metadata_mmoles_per_liter.isin(doses)]
    # Also fill in NaN in Metadata_broad_sample as DMSO
    .assign(Metadata_broad_sample=lambda frame: frame.Metadata_broad_sample.fillna("DMSO"))
)

print(complete_df.shape)
complete_df.head()

(51609, 1283)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,C-7161-01-LM6-027,DMSO,,0.0,DMSO,0.235021,0.753198,-1.498757,0.0,-0.142916,...,0.677879,-0.005078,0.096397,-0.040741,-0.613751,-0.572386,-0.610598,-0.453982,-0.307237,-0.429322
1,C-7161-01-LM6-027,DMSO,,0.0,DMSO,0.219177,0.406902,1.150688,0.0,-0.116611,...,0.324203,-0.055949,0.109314,-0.075406,-0.418682,-0.430327,-0.473034,-0.145102,0.054124,-0.303347
2,C-7161-01-LM6-027,DMSO,,0.0,DMSO,0.288078,0.776809,1.805348,0.0,-0.149218,...,0.568386,-0.023348,0.037226,0.048057,-0.357287,-0.420457,-0.390093,-0.193983,-0.023151,-0.29085
3,C-7161-01-LM6-027,DMSO,,0.0,DMSO,0.203886,0.965698,-0.93652,0.0,-0.109647,...,0.422796,-0.153516,-0.063628,-0.099642,-0.317077,-0.323075,-0.333247,0.169582,0.333924,0.068789
4,C-7161-01-LM6-027,DMSO,,0.0,DMSO,-0.155539,0.58005,0.796402,0.0,0.227168,...,0.71017,0.113111,0.188868,0.098013,-0.160567,-0.229516,-0.161166,-0.009321,0.081673,-0.051178


In [10]:
# Create consensus profiles; replicates are the rows that share a compound
# identifier and a (rounded) dose
replicate_cols = ["Metadata_broad_sample", "Metadata_mmoles_per_liter"]

# Turn the index of the modz result back into ordinary columns
complete_consensus_df = modz(
    complete_df,
    features="infer",
    replicate_columns=replicate_cols,
    precision=5,
).reset_index()

print(complete_consensus_df.shape)
complete_consensus_df.head()

(9350, 1280)


Unnamed: 0,Metadata_broad_sample,Metadata_mmoles_per_liter,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,Cells_AreaShape_MeanRadius,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,BRD-A00147595-001-01-5,0.0,0.016267,0.677665,-0.555257,0.0,0.098221,-0.100163,0.16053,0.143685,...,0.663952,0.092953,0.134578,0.14727,0.148237,0.061721,0.235447,-0.986129,-1.072516,-0.824428
1,BRD-A00147595-001-01-5,0.1,0.259686,-0.159293,0.71495,0.0,0.294426,0.125317,0.392188,0.417936,...,1.378316,0.3426,0.259763,0.250226,0.424115,0.337974,0.505876,-0.877329,-0.915628,-0.68917
2,BRD-A00147595-001-01-5,0.4,0.239895,-0.798964,0.452661,0.0,-0.077656,0.21324,0.348779,0.34458,...,0.694396,-0.134685,-0.094541,-0.100322,0.030833,-0.042674,0.065258,-0.789934,-0.786088,-0.737317
3,BRD-A00147595-001-01-5,1.1,0.067556,-0.053281,-0.21052,0.0,0.074717,-0.008091,0.250587,0.190129,...,0.408977,-0.106872,-0.099624,-0.126474,-0.067633,-0.121632,-0.001493,-0.599667,-0.544571,-0.542333
4,BRD-A00147595-001-01-5,3.3,0.534652,0.096251,0.239508,0.0,0.30525,0.361959,0.764359,0.752745,...,1.185853,0.559913,0.621173,0.565626,0.453919,0.326463,0.510245,-0.524872,-0.415882,-0.320797


In [11]:
# Output consensus profiles
output_file = os.path.join("data", "repurposing_modz_consensus.tsv.gz")

# Make sure the destination folder exists so the write does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_file), exist_ok=True)

complete_consensus_df.to_csv(output_file, sep="\t", compression="gzip", index=False)

In [12]:
# Cell profiler features: every training column whose name is not metadata
cp_features = [
    column for column in x_test_df.columns if not column.startswith("Metadata")
]

# Metadata columns present in the consensus profiles
meta_cols = [
    column for column in complete_consensus_df.columns if column.startswith("Metadata")
]

## 3) Apply all real and shuffled Models to all Repurposing Plates

In [13]:
# Arrange the consensus profiles in the column order of the training data
feature_df = complete_consensus_df.reindex(x_test_df.columns, axis="columns")
metadata_df = complete_consensus_df.loc[:, meta_cols]

# Predictions from the real models, keyed by cell health feature
all_scores = {
    cell_health_feature: real_model.predict(feature_df)
    for cell_health_feature, real_model in model_dict.items()
}

# Predictions from the shuffled models, for the same features in the same order
all_shuffle_scores = {
    cell_health_feature: shuffle_model_dict[cell_health_feature].predict(feature_df)
    for cell_health_feature in model_dict
}

## 4) Output Results

In [14]:
output_dir = os.path.join("data", "repurposing_transformed")

# Create the results folder up front; the cells below write their files into it
os.makedirs(output_dir, exist_ok=True)

In [15]:
# Output scores: one column per cell health model, joined to the metadata by row index
all_score_df = pd.DataFrame(all_scores)
full_df = pd.merge(
    metadata_df,
    all_score_df,
    left_index=True,
    right_index=True,
)

output_real_file = os.path.join(output_dir, "repurposing_transformed_real_models.tsv.gz")
full_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

print(full_df.shape)
full_df.head()

(9350, 72)


Unnamed: 0,Metadata_broad_sample,Metadata_mmoles_per_liter,cc_cc_g1_mean,cc_mitosis_ph3_neg_n_spots_mean,cc_edu_pos_high_n_spots_h2ax_mean,cc_polyploid_n_spots_mean,vb_live_cell_roundness,cc_g2_ph3_neg_n_spots_mean,cc_polyploid_high_n_spots_h2ax_mean,vb_ros_mean,...,cc_all_nucleus_area_mean,cc_cc_n_spots_mean,vb_ros_back_mean,cc_g1_n_objects,vb_percent_all_apoptosis,cc_all_high_n_spots_h2ax_mean,cc_all_large_round_polyploid_mean,cc_edu_pos_n_objects,cc_all_n_objects,cc_g1_n_spots_per_nucleus_area_mean
0,BRD-A00147595-001-01-5,0.0,0.120634,0.178382,0.199762,0.29887,0.096741,0.180102,0.172349,-0.312105,...,0.231163,0.197463,-0.236127,0.413997,-0.271553,0.23105,0.218693,0.385205,0.377869,-0.214581
1,BRD-A00147595-001-01-5,0.1,-0.039803,0.194389,0.225415,0.302778,-0.004163,0.181513,0.207422,0.001156,...,0.297992,0.196869,-0.078217,0.182844,-0.065291,0.226201,0.569872,0.187732,0.193206,-0.135152
2,BRD-A00147595-001-01-5,0.4,0.018841,0.207383,0.266563,0.251273,-0.01905,0.171398,0.183546,0.079587,...,0.262925,0.116171,-0.016142,0.225392,-0.01728,0.185605,0.515803,0.226434,0.218854,-0.14477
3,BRD-A00147595-001-01-5,1.1,-0.017618,0.199552,0.289828,0.342072,-0.015506,0.236677,0.250459,0.16786,...,0.25739,0.23253,-0.029746,0.157231,-0.131317,0.234373,0.366206,0.223275,0.214861,-0.008258
4,BRD-A00147595-001-01-5,3.3,-0.117578,0.181581,0.186944,0.293099,-0.205885,0.248263,0.204151,0.022875,...,0.493244,0.175922,-0.067296,-0.060306,-0.148291,0.267392,0.881551,-0.00732,-0.052789,-0.141207


In [16]:
# Output scores from the shuffled models, joined to the metadata by row index
shuff_score_df = pd.DataFrame.from_dict(all_shuffle_scores)
full_shuff_df = (
    metadata_df
    .merge(shuff_score_df,
           left_index=True,
           right_index=True)
)

# Path of the shuffled-model predictions, named for what it holds
output_shuffled_file = os.path.join(output_dir, "repurposing_transformed_shuffled_models.tsv.gz")
full_shuff_df.to_csv(output_shuffled_file, sep="\t", index=False, compression="gzip")

print(full_shuff_df.shape)
full_shuff_df.head()

(9350, 72)


Unnamed: 0,Metadata_broad_sample,Metadata_mmoles_per_liter,cc_cc_g1_mean,cc_mitosis_ph3_neg_n_spots_mean,cc_edu_pos_high_n_spots_h2ax_mean,cc_polyploid_n_spots_mean,vb_live_cell_roundness,cc_g2_ph3_neg_n_spots_mean,cc_polyploid_high_n_spots_h2ax_mean,vb_ros_mean,...,cc_all_nucleus_area_mean,cc_cc_n_spots_mean,vb_ros_back_mean,cc_g1_n_objects,vb_percent_all_apoptosis,cc_all_high_n_spots_h2ax_mean,cc_all_large_round_polyploid_mean,cc_edu_pos_n_objects,cc_all_n_objects,cc_g1_n_spots_per_nucleus_area_mean
0,BRD-A00147595-001-01-5,0.0,0.105258,0.181764,0.393341,0.342494,-0.223329,0.245661,0.277983,0.013462,...,0.197919,0.206127,-0.007581,-0.141384,0.254468,0.167802,0.261533,-0.157798,-0.160217,0.301111
1,BRD-A00147595-001-01-5,0.1,0.107179,0.181764,0.304029,0.24551,-0.206067,0.243041,0.219325,0.017857,...,0.239668,0.181632,-0.007581,-0.147799,0.204822,0.20808,0.261533,-0.157798,-0.160217,0.301111
2,BRD-A00147595-001-01-5,0.4,0.115401,0.181764,0.317117,0.292844,-0.213773,0.26543,0.239202,0.021183,...,0.289443,0.324795,-0.007581,-0.140763,0.195656,0.232195,0.261533,-0.157798,-0.160217,0.301111
3,BRD-A00147595-001-01-5,1.1,0.115078,0.181764,0.304916,0.347022,-0.221386,0.286266,0.21324,0.022556,...,0.283776,0.240469,-0.007581,-0.143721,0.200981,0.252175,0.261533,-0.157798,-0.160217,0.301111
4,BRD-A00147595-001-01-5,3.3,0.109163,0.181764,0.233148,0.289459,-0.253284,0.270936,0.21354,0.024142,...,0.030851,0.014094,-0.007581,-0.147085,0.138314,0.26689,0.261533,-0.157798,-0.160217,0.301111


## 5) Apply UMAP

### Part 1: Apply UMAP to Cell Health Transformed Repurposing Hub Features

In [17]:
# Names of the cell health models, in the order they are stored in model_dict
cell_health_features = list(model_dict)

In [18]:
# Two-dimensional UMAP with a fixed random_state
reducer = umap.UMAP(random_state=1234, n_components=2)

# Everything in full_df that is not a cell health prediction
metadata_df = full_df.drop(cell_health_features, axis="columns")

# Label the embedding rows with full_df's own index so the index-based merge
# below pairs every coordinate with its metadata row, whatever that index is
real_embedding_df = pd.DataFrame(
    reducer.fit_transform(full_df.loc[:, cell_health_features]),
    columns=["umap_x", "umap_y"],
    index=full_df.index,
)

real_embedding_df = (
    metadata_df
    .merge(real_embedding_df,
           left_index=True,
           right_index=True)
)

output_real_file = os.path.join(output_dir, "repurposing_umap_transformed_real_models.tsv.gz")
real_embedding_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

Compilation is falling back to object mode WITH looplifting enabled because Function "make_euclidean_tree" failed type inference due to: Cannot unify RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none) and RandomProjectionTreeNode(none, bool, array(float32, 1d, C), float64, RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none), RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none)) for '$14.16', defined at /Users/gway/miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py (446)

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py", line 446:
def make_euclidean_tree(data, indices, rng_state, leaf_size=30):
    <source elided>

        left_node = make_euclidean_tree(data, left_indices, rng_state, leaf_size)
        ^

[1] During: resolving callee type: recursive(type(CPUDispatcher(<function make_euclidean_tree at 0x126ce6b70>)))
[2] During: typing of call a

### Part 2: Apply UMAP to All Repurposing Hub Cell Painting Profiles

In [19]:
# Two-dimensional UMAP with a fixed random_state
reducer = umap.UMAP(random_state=1234, n_components=2)

# Everything in the consensus profiles that is not a Cell Painting feature
complete_metadata_df = complete_consensus_df.drop(cp_features, axis="columns")

# Label the embedding rows with the consensus frame's own index so the
# index-based merge below pairs every coordinate with its metadata row
complete_embedding_df = pd.DataFrame(
    reducer.fit_transform(complete_consensus_df.loc[:, cp_features]),
    columns=["umap_x", "umap_y"],
    index=complete_consensus_df.index,
)

complete_embedding_df = (
    complete_metadata_df
    .merge(complete_embedding_df,
           left_index=True,
           right_index=True)
)

# Path of the Cell Painting embedding, named for what it holds
output_cell_painting_file = os.path.join(output_dir, "repurposing_umap_transformed_cell_painting.tsv.gz")
complete_embedding_df.to_csv(output_cell_painting_file, sep="\t", index=False, compression="gzip")

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))
