# Apply Cell Health Models to Repurposing Set

**Gregory Way, 2019**

The models are trained to predict cell health phenotypes.
Here, I apply the models to Cell Painting data from the repurposing set.

I will use these predictions to identify compound perturbation signatures of cell health impact.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy.stats
from joblib import load
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

sys.path.append("../3.train")
from scripts.ml_utils import load_train_test, load_models



In [2]:
np.random.seed(123)

## 1) Load Models and Training Data

In [3]:
consensus = "modz"
output_dir = "data"

In [4]:
model_dir = os.path.join("..", "3.train", "models")

model_dict, model_coef = load_models(
    model_dir=model_dir,
    consensus=consensus
)

In [5]:
data_dir = os.path.join("..", "3.train", "data")

x_train_df, x_test_df, y_train_df, y_test_df = load_train_test(
    data_dir=data_dir,
    consensus=consensus,
    drop_metadata=True
)

## 2) Load Cell Painting Repurposing Data Files

These files are available from https://github.com/broadinstitute/lincs-cell-painting

In [6]:
batch = "2016_04_01_a549_48hr_batch1"
commit_hash = "27a2d7dd74067b5754c2c045e9b1a9cfb0581ae4"

# We have noticed particular technical issues with this platemap
# remove it from downstream consideration
# https://github.com/broadinstitute/lincs-cell-painting/issues/43
filter_platemap = "C-7161-01-LM6-011"

In [7]:
# Load data
base_url = "https://media.githubusercontent.com/media/broadinstitute/lincs-cell-painting/"
repurp_url = f"{base_url}/{commit_hash}/consensus/{batch}/{batch}_consensus_{consensus}.csv.gz"

complete_consensus_df = pd.read_csv(repurp_url)

print(complete_consensus_df.shape)
complete_consensus_df.head()

(10752, 1788)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B13,10.0,6,0.590843,-0.291888,0.197,0.02541,-0.314847,...,-0.856111,-1.06781,-0.953925,-1.12625,-0.539351,-0.413102,-0.587832,0.822261,0.738317,0.87822
1,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B14,3.3333,5,0.953997,0.615666,-0.003971,0.362362,-0.29934,...,-0.197888,-0.884445,-0.676961,-0.97964,-0.900894,-0.89717,-0.787205,-0.673075,-0.747765,-0.346591
2,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B15,1.1111,4,1.10022,-1.0063,-0.324214,-0.220141,-0.572329,...,-0.25177,-0.775027,-1.13909,-0.699553,-0.785571,-0.801577,-0.534207,-0.420019,-0.706492,-0.202666
3,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B16,0.37037,3,0.616491,0.523367,0.723404,0.135139,0.079898,...,-0.692809,-0.733101,-0.650335,-0.942234,-0.244557,-0.182373,-0.290681,-0.470494,-0.799699,-0.323079
4,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B17,0.12346,2,1.14541,1.45266,-0.05285,0.1947,-0.196077,...,-0.417982,-0.126697,0.261282,-0.116643,-0.35885,-0.438067,-0.292326,-0.400007,-0.667327,-0.206596


In [8]:
# Apply feature selection to the consensus profiles
feature_ops = [
    "variance_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers"
]

consensus_feature_select_df = feature_select(
    complete_consensus_df,
    operation=feature_ops,
    na_cutoff=0
)

print(consensus_feature_select_df.shape)
consensus_feature_select_df.head()

(10752, 572)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,Cells_AreaShape_FormFactor,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_5_1,...,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumEntropy_AGP_10_0,Nuclei_Texture_SumEntropy_AGP_20_0,Nuclei_Texture_SumEntropy_AGP_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B13,10.0,6,-1.03347,0.389567,-0.754991,-0.083304,-0.14703,...,0.038403,-0.03494,0.004637,-0.050747,-0.743676,-0.696349,-0.23271,-0.877754,-0.844527,-0.856111
1,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B14,3.3333,5,-1.54989,-0.554293,-0.626483,0.754096,-0.405463,...,0.290298,0.109497,0.128128,0.145252,-0.109596,-0.054243,-1.15119,-0.196346,-0.158676,-0.197888
2,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B15,1.1111,4,-1.26105,0.410012,-0.445808,-0.159203,-0.255041,...,0.182766,0.043097,0.114488,0.007156,-0.112551,-0.023289,-0.72162,-0.234444,-0.237228,-0.25177
3,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B16,0.37037,3,-0.654006,-0.003275,-0.223035,0.39684,0.423224,...,-0.252911,-0.496789,-0.429382,-0.49774,-0.557208,-0.452524,-0.61069,-0.612029,-0.596692,-0.692809
4,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B17,0.12346,2,-1.07665,0.113965,-0.670363,0.182955,-0.296822,...,-0.845351,-0.747056,-0.691795,-0.823489,-0.131518,-0.141882,-0.585212,-0.30026,-0.344,-0.417982


In [9]:
# Split metadata and CP Features
cp_features = infer_cp_features(x_test_df)
meta_features = infer_cp_features(complete_consensus_df, metadata=True)

# Realign LINCS data to the same feature ordering as the test dataset
feature_df = complete_consensus_df.reindex(cp_features, axis="columns")
metadata_df = complete_consensus_df.loc[:, meta_features]

print(feature_df.shape)
feature_df.head()

(10752, 949)


Unnamed: 0,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_3,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,0.197,0.02541,-0.314847,-0.098006,0.389567,-0.108497,-0.049925,0.758369,-0.754991,0.486821,...,-0.046644,-0.132545,-0.204489,-0.773132,-0.776795,-0.877754,-0.241699,-0.88773,-0.844527,-0.856111
1,-0.003971,0.362362,-0.29934,-0.790188,-0.554293,-0.616793,0.420721,0.139467,-0.626483,-0.75848,...,0.181064,0.046638,-0.015287,-0.140727,-0.095103,-0.196346,-0.035929,-0.211766,-0.158676,-0.197888
2,-0.324214,-0.220141,-0.572329,-0.592901,0.410012,-0.196159,-0.155111,1.2399,-0.445808,0.243092,...,0.117535,0.122914,-0.103896,-0.214272,-0.043825,-0.234444,-0.076107,-0.211043,-0.237228,-0.25177
3,0.723404,0.135139,0.079898,0.364728,-0.003275,-0.205024,0.087071,-0.160058,-0.223035,-0.70234,...,0.01754,-0.52912,-0.622424,-0.581336,-0.516791,-0.612029,-0.585321,-0.690282,-0.596692,-0.692809
4,-0.05285,0.1947,-0.196077,-0.474338,0.113965,-0.19254,-0.301498,0.94907,-0.670363,-0.095436,...,-0.070211,-0.563626,-0.719886,-0.278656,-0.247769,-0.30026,-0.668371,-0.41476,-0.344,-0.417982


## 3) Apply all Regression Models to all Repurposing Plates

In [10]:
cell_health_features = list(model_dict.keys())

all_scores = {}
for cell_health_feature in cell_health_features:
    model_clf = model_dict[cell_health_feature]
    pred_df = model_clf.predict(feature_df)
    all_scores[cell_health_feature] = pred_df

## 4) Output Results

In [11]:
# Output scores
all_score_df = pd.DataFrame.from_dict(all_scores)
repurp_predict_df = (
    metadata_df
    .merge(
        all_score_df,
        left_index=True,
        right_index=True
    )
    .query("Metadata_Plate_Map_Name != @filter_platemap")
)

output_real_file = os.path.join(
    output_dir,
    "repurposing_transformed_real_models_{}.tsv.gz".format(consensus)
)
repurp_predict_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

print(repurp_predict_df.shape)
repurp_predict_df.head()

(10368, 75)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,cell_health_modz_target_cc_late_mitosis_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_all_nucleus_roundness_mean,cell_health_modz_target_cc_polyploid_n_objects,cell_health_modz_target_cc_g1_n_objects,cell_health_modz_target_cc_mitosis_n_objects,...,cell_health_modz_target_cc_s_intensity_nucleus_area_mean,cell_health_modz_target_cc_g2_high_h2ax,cell_health_modz_target_cc_late_mitosis_n_spots_h2ax_mean,cell_health_modz_target_cc_cc_high_h2ax,cell_health_modz_target_vb_percent_dead_only,cell_health_modz_target_cc_s_high_h2ax,cell_health_modz_target_cc_cc_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_s_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_g1_plus_g2_count,cell_health_modz_target_vb_live_cell_width_length
0,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B13,10.0,6,0.12184,0.287876,0.332128,0.031668,0.293513,...,-0.04159,0.275196,0.231984,0.034737,0.226777,0.237375,0.105485,0.153677,-0.530986,0.144455
1,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B14,3.3333,5,0.135517,-0.016983,0.175805,0.045467,0.271201,...,-0.615482,0.51695,0.242159,0.012577,0.63289,0.399858,0.022915,0.202797,-0.159769,-0.294545
2,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B15,1.1111,4,0.181414,0.029788,0.20529,0.009674,0.302538,...,-0.577316,0.40125,0.268255,0.080886,0.207432,0.337209,0.180617,0.285974,-0.254186,-0.048311
3,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B16,0.37037,3,0.168418,0.31882,0.194175,0.241066,0.359096,...,-0.407993,0.307553,0.248019,0.047578,0.253732,0.344834,0.138011,0.306638,-0.314815,0.03836
4,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B17,0.12346,2,0.195843,-0.316581,0.1307,0.298723,0.185056,...,-0.114774,0.343276,0.234374,0.226134,0.262437,0.247767,0.298132,0.395778,0.1865,-0.051261


## 5) Apply UMAP

### Part 1: Apply UMAP to Cell Health Transformed Repurposing Hub Features

In [12]:
reducer = umap.UMAP(random_state=1234, n_components=2)

predict_embedding_df = pd.DataFrame(
    reducer.fit_transform(repurp_predict_df.loc[:, cell_health_features]),
    columns=["umap_x", "umap_y"]
)

predict_embedding_df = (
    metadata_df
    .merge(
        predict_embedding_df,
        left_index=True,
        right_index=True
    )
    .query("Metadata_Plate_Map_Name != @filter_platemap")
)

output_real_file = os.path.join(
    output_dir,
    "repurposing_umap_transformed_real_models_{}.tsv.gz".format(consensus)
)

predict_embedding_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

print(predict_embedding_df.shape)
predict_embedding_df.head()

Compilation is falling back to object mode WITH looplifting enabled because Function "make_euclidean_tree" failed type inference due to: Cannot unify RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none) and RandomProjectionTreeNode(none, bool, array(float32, 1d, C), float64, RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none), RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none)) for '$14.16', defined at /Users/gway/miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py (446)

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py", line 446:
def make_euclidean_tree(data, indices, rng_state, leaf_size=30):
    <source elided>

        left_node = make_euclidean_tree(data, left_indices, rng_state, leaf_size)
        ^

[1] During: resolving callee type: recursive(type(CPUDispatcher(<function make_euclidean_tree at 0x11c935048>)))
[2] During: typing of call a

(9984, 7)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,umap_x,umap_y
0,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B13,10.0,6,2.767666,-0.822573
1,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B14,3.3333,5,1.135071,-3.082334
2,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B15,1.1111,4,2.384409,-2.227846
3,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B16,0.37037,3,2.249686,-1.413167
4,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B17,0.12346,2,2.428509,-1.821966


### Part 2: Apply UMAP to All Repurposing Hub Cell Painting Profiles

In [13]:
reducer = umap.UMAP(random_state=1234, n_components=2)

repurp_embedding_df = pd.DataFrame(
    reducer.fit_transform(
        consensus_feature_select_df.loc[:, infer_cp_features(consensus_feature_select_df)]
    ),
    columns=["umap_x", "umap_y"]
)

repurp_embedding_df = (
    metadata_df
    .merge(
        repurp_embedding_df,
        left_index=True,
        right_index=True
    )
    .query("Metadata_Plate_Map_Name != @filter_platemap")
)

output_real_file = os.path.join(
    output_dir,
    "repurposing_umap_transformed_cell_painting_{}.tsv.gz".format(consensus)
)
repurp_embedding_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

print(repurp_embedding_df.shape)
repurp_embedding_df.head()

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


(10368, 7)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,umap_x,umap_y
0,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B13,10.0,6,-1.580029,-0.921932
1,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B14,3.3333,5,-1.904766,-2.4361
2,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B15,1.1111,4,-1.899164,-2.591488
3,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B16,0.37037,3,-1.708769,-0.88606
4,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B17,0.12346,2,-1.857128,-2.708612


## Merge Data Together for Shiny App Exploration

In [14]:
# Load MOA file
moa_url = "https://raw.githubusercontent.com/broadinstitute/lincs-cell-painting/"
moa_url = f"{moa_url}/{commit_hash}/metadata/moa/repurposing_info_external_moa_map_resolved.tsv"

moa_df = pd.read_csv(moa_url, sep="\t")

print(moa_df.shape)
moa_df.head(3)

(1570, 10)


Unnamed: 0,broad_sample,broad_id,pert_iname,InChIKey14,moa,target,broad_date,clinical_phase,alternative_moa,alternative_target
0,BRD-A00147595-001-01-5,BRD-A00147595,balaglitazone,IETKPTYAGKZLKY,insulin sensitizer|PPAR receptor partial agonist,PPARG,broad_id_20170327,Phase 3,,
1,BRD-A00218260-001-03-4,BRD-A00218260,flutrimazole,QHMWCHQXCUNUAK,sterol demethylase inhibitor,,broad_id_20170327,Launched,,
2,BRD-A00376169-001-01-6,BRD-A00376169,KBG,FOLRUCXBTYDAQK,neprilysin inhibitor,MME,broad_id_20170327,Phase 2,,


In [15]:
core_id = [
    "{}-{}".format(
        x.split("-")[0],
        x.split("-")[1]
    ) if x != "DMSO"
    else x
    for x in repurp_embedding_df.Metadata_broad_sample
]

repurp_embedding_with_pert_df = (
    repurp_embedding_df
    .assign(Metadata_broad_core_id=core_id)
    .sort_index(axis="columns")
    .merge(
        moa_df,
        left_on="Metadata_broad_core_id",
        right_on="broad_id",
        how="left"
    )
)

print(repurp_embedding_with_pert_df.shape)
repurp_embedding_with_pert_df.head()

(10368, 18)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_core_id,Metadata_broad_sample,Metadata_dose_recode,Metadata_mmoles_per_liter,Metadata_pert_well,umap_x,umap_y,broad_sample,broad_id,pert_iname,InChIKey14,moa,target,broad_date,clinical_phase,alternative_moa,alternative_target
0,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,6,10.0,B13,-1.580029,-0.921932,BRD-A25234499-001-18-3,BRD-A25234499,aminoglutethimide,ROBVIMPUHSLWNV,glucocorticoid receptor antagonist,CYP11A1|CYP19A1,broad_id_20170327,Launched,,
1,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,5,3.3333,B14,-1.904766,-2.4361,BRD-A25234499-001-18-3,BRD-A25234499,aminoglutethimide,ROBVIMPUHSLWNV,glucocorticoid receptor antagonist,CYP11A1|CYP19A1,broad_id_20170327,Launched,,
2,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,4,1.1111,B15,-1.899164,-2.591488,BRD-A25234499-001-18-3,BRD-A25234499,aminoglutethimide,ROBVIMPUHSLWNV,glucocorticoid receptor antagonist,CYP11A1|CYP19A1,broad_id_20170327,Launched,,
3,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,3,0.37037,B16,-1.708769,-0.88606,BRD-A25234499-001-18-3,BRD-A25234499,aminoglutethimide,ROBVIMPUHSLWNV,glucocorticoid receptor antagonist,CYP11A1|CYP19A1,broad_id_20170327,Launched,,
4,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,2,0.12346,B17,-1.857128,-2.708612,BRD-A25234499-001-18-3,BRD-A25234499,aminoglutethimide,ROBVIMPUHSLWNV,glucocorticoid receptor antagonist,CYP11A1|CYP19A1,broad_id_20170327,Launched,,


In [16]:
shiny_merge_cols = [
    "Metadata_Plate_Map_Name",
    "Metadata_broad_sample",
    "Metadata_dose_recode",
    "Metadata_mmoles_per_liter",
    "Metadata_pert_well"
]

shiny_df = (
    repurp_embedding_with_pert_df.merge(
        repurp_predict_df,
        left_on=shiny_merge_cols,
        right_on=shiny_merge_cols,
        how="inner"
    )
    .drop(["broad_sample"], axis="columns")
    .query("Metadata_Plate_Map_Name != @filter_platemap")
)

print(shiny_df.shape)
shiny_df.head()

(10368, 87)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_core_id,Metadata_broad_sample,Metadata_dose_recode,Metadata_mmoles_per_liter,Metadata_pert_well,umap_x,umap_y,broad_id,pert_iname,...,cell_health_modz_target_cc_s_intensity_nucleus_area_mean,cell_health_modz_target_cc_g2_high_h2ax,cell_health_modz_target_cc_late_mitosis_n_spots_h2ax_mean,cell_health_modz_target_cc_cc_high_h2ax,cell_health_modz_target_vb_percent_dead_only,cell_health_modz_target_cc_s_high_h2ax,cell_health_modz_target_cc_cc_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_s_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_g1_plus_g2_count,cell_health_modz_target_vb_live_cell_width_length
0,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,6,10.0,B13,-1.580029,-0.921932,BRD-A25234499,aminoglutethimide,...,-0.04159,0.275196,0.231984,0.034737,0.226777,0.237375,0.105485,0.153677,-0.530986,0.144455
1,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,5,3.3333,B14,-1.904766,-2.4361,BRD-A25234499,aminoglutethimide,...,-0.615482,0.51695,0.242159,0.012577,0.63289,0.399858,0.022915,0.202797,-0.159769,-0.294545
2,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,4,1.1111,B15,-1.899164,-2.591488,BRD-A25234499,aminoglutethimide,...,-0.577316,0.40125,0.268255,0.080886,0.207432,0.337209,0.180617,0.285974,-0.254186,-0.048311
3,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,3,0.37037,B16,-1.708769,-0.88606,BRD-A25234499,aminoglutethimide,...,-0.407993,0.307553,0.248019,0.047578,0.253732,0.344834,0.138011,0.306638,-0.314815,0.03836
4,C-7161-01-LM6-001,BRD-A25234499,BRD-A25234499-001-18-3,2,0.12346,B17,-1.857128,-2.708612,BRD-A25234499,aminoglutethimide,...,-0.114774,0.343276,0.234374,0.226134,0.262437,0.247767,0.298132,0.395778,0.1865,-0.051261


In [17]:
shiny_file = os.path.join(
    "repurposing_cellhealth_shiny",
    "data",
    "moa_cell_health_{}.tsv.gz".format(consensus)
)

shiny_df.to_csv(shiny_file, sep='\t', index=False, compression="gzip")

In [18]:
shiny_combined_df = shiny_df.merge(
    complete_consensus_df,
    on=infer_cp_features(complete_consensus_df, metadata=True),
    how="inner"
)

## Output Correlation Matrix

In [19]:
shiny_features = infer_cp_features(consensus_feature_select_df)
cell_health_features = [x for x in shiny_df if x.startswith("cell_health")]

In [20]:
all_results = []
for cell_health_feature in cell_health_features:
    cell_health = shiny_combined_df.loc[:, cell_health_feature]
    for cp_feature in shiny_features:
        feature = shiny_combined_df.loc[:, cp_feature]
        cor_result, pval = scipy.stats.pearsonr(cell_health, feature)
        all_results.append([cell_health_feature, cp_feature, cor_result, pval])



In [21]:
# Output correlation matrix for cell health predictions and CellProfiler features
cor_results_df = (
    pd.DataFrame(
        np.array(all_results), columns=["cell_health", "cp_feature", "pearson_cor", "pval"]
    )
    .sort_values(by="pearson_cor", ascending=False)
    .reset_index(drop=True)
)

cor_results_df.pearson_cor = cor_results_df.pearson_cor.astype(float)

cor_results_df = (
    cor_results_df
    .pivot_table(columns=["cell_health"], index=["cp_feature"], values="pearson_cor")
)

print(cor_results_df.shape)
cor_results_df.head(3)

(567, 68)


cell_health,cell_health_modz_target_cc_all_high_h2ax,cell_health_modz_target_cc_all_large_notround_polynuclear_mean,cell_health_modz_target_cc_all_large_round_polyploid_mean,cell_health_modz_target_cc_all_n_objects,cell_health_modz_target_cc_all_n_spots_h2ax_mean,cell_health_modz_target_cc_all_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_all_nucleus_area_mean,cell_health_modz_target_cc_all_nucleus_roundness_mean,cell_health_modz_target_cc_cc_early_mitosis,cell_health_modz_target_cc_cc_g1,...,cell_health_modz_target_vb_num_live_cells,cell_health_modz_target_vb_percent_all_apoptosis,cell_health_modz_target_vb_percent_caspase_dead_only,cell_health_modz_target_vb_percent_dead,cell_health_modz_target_vb_percent_dead_only,cell_health_modz_target_vb_percent_early_apoptosis,cell_health_modz_target_vb_percent_late_apoptosis,cell_health_modz_target_vb_percent_live,cell_health_modz_target_vb_ros_back_mean,cell_health_modz_target_vb_ros_mean
cp_feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cells_AreaShape_FormFactor,-0.675377,-0.461193,-0.433846,0.09319,-0.609459,-0.517603,-0.201257,-0.251131,0.17437,0.006384,...,0.160444,0.362358,0.550042,0.303673,0.154904,0.110851,0.36937,-0.303552,0.478296,0.425893
Cells_AreaShape_Orientation,0.109455,0.249087,0.312104,0.27933,0.325335,0.330253,-0.179699,0.309541,-0.276428,-0.270623,...,0.257589,-0.358588,-0.333667,-0.359261,-0.324598,-0.311398,-0.367548,0.35934,-0.365048,-0.35767
Cells_AreaShape_Zernike_2_2,0.032624,0.45994,0.45603,0.686729,0.425207,0.49603,-0.569525,0.588909,-0.558868,-0.603441,...,0.67024,-0.617549,-0.513345,-0.642031,-0.680564,-0.657186,-0.620867,0.642168,-0.536552,-0.571678
