# Apply Cell Health Models to Repurposing Set

**Gregory Way, 2019**

The models are trained to predict cell health phenotypes.
Here, I apply the models to Cell Painting data from the repurposing set.

I will use these predictions to identify compound perturbation signatures of cell health impact.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from joblib import load
import umap

from pycytominer.consensus import modz

sys.path.append("../3.train")
from scripts.ml_utils import load_train_test, load_models

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness repeats across runs
np.random.seed(123)

## 1) Load Models and Training Data

In [3]:
# Consensus method label: passed to the model/data loaders and embedded in output file names
consensus = "modz"
# Directory that receives the output files written by this notebook
output_dir = "data"

In [4]:
# Location of the saved models, relative to this notebook
model_dir = os.path.join("..", "3.train", "models")

# model_dict maps each cell health target to an object with a .predict method
# (used for prediction below); model_coef presumably holds the matching
# coefficients -- TODO confirm against scripts.ml_utils.load_models
model_dict, model_coef = load_models(model_dir=model_dir, consensus=consensus)

In [5]:
# Location of the train/test splits, relative to this notebook
data_dir = os.path.join("..", "3.train", "data")

# Request the splits with metadata columns dropped; x_test_df.columns is used
# later as the list of features the models expect
(x_train_df, x_test_df, y_train_df, y_test_df) = load_train_test(
    data_dir=data_dir, consensus=consensus, drop_metadata=True
)

## 2) Extract Repurposing Data Files

**NOTE** - these files are not yet public!

In [6]:
# List drug repurposing data
repurposing_project_id = "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad"

# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this directory is mounted.
repurposing_profile_dir = os.path.join(
    "/home",
    "ubuntu",
    "efs",
    repurposing_project_id,
    "workspace",
    "software",
    repurposing_project_id,
    "subsampling",
    "full_profile_data"
)

# The plate ID is the part of each file name before the first underscore.
# Several files can share a plate, so de-duplicate with a set; sort the result
# because set iteration order for strings changes between interpreter sessions,
# which would otherwise make the plate (and therefore row) order irreproducible.
all_plates = sorted({x.split("_")[0] for x in os.listdir(repurposing_profile_dir)})

In [7]:
# Build a single data frame that holds all profiles
plate_info = {}
all_dfs = []
all_metadata_dfs = []
for plate in all_plates:
    norm_file = os.path.join(
        repurposing_profile_dir, "{}_subsample_all_normalized.csv".format(plate)
    )
    # The expected path is recorded for every plate, whether or not the file exists
    plate_info[plate] = norm_file

    # Plates without a normalized profile file contribute nothing
    if not os.path.exists(norm_file):
        continue

    df = pd.read_csv(norm_file)

    # Metadata columns are picked out by name
    metadata_df = df.loc[:, df.columns.str.contains("Metadata_")]
    # Align columns to the features the models expect; features absent from
    # this plate, and any missing values, become 0
    feature_df = df.reindex(x_test_df.columns, axis="columns").fillna(0)

    all_dfs.append(feature_df)
    all_metadata_dfs.append(metadata_df)

In [8]:
# Merge feature data and metadata
# Each per-plate frame carries its own 0..n-1 row labels from read_csv, so a
# plain row-wise concat produces repeated labels. ignore_index=True gives both
# stacked frames the same unique 0..N-1 index, letting the column-wise concat
# below align rows on unique labels instead of relying on two identical
# duplicate indexes.
all_df = pd.concat(all_dfs, sort=True, ignore_index=True)
all_metadata_df = pd.concat(all_metadata_dfs, sort=True, ignore_index=True)

complete_df = pd.concat([all_metadata_df, all_df], axis="columns").reset_index(drop=True)

# Fill in NaN in Metadata_broad_sample as DMSO
# (bracket assignment is used so the column is always addressed explicitly)
complete_df["Metadata_broad_sample"] = complete_df["Metadata_broad_sample"].fillna("DMSO")

print(complete_df.shape)
complete_df.head()

(51839, 1605)


Unnamed: 0,Image_Metadata_Plate,Image_Metadata_Well,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_plate_map_name,Metadata_solvent,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014816,A01,DMSO,,,C-7161-01-LM6-023,DMSO,1.345312,-1.723697,-0.958103,...,1.458715,1.518992,1.655247,1.223732,0.019179,0.181705,0.554384,-3.82543,-3.217268,-4.302272
1,SQ00014816,A02,DMSO,,,C-7161-01-LM6-023,DMSO,1.401299,0.97717,1.035763,...,0.68182,0.103865,0.43868,-0.322249,-0.153979,-0.047031,-0.196225,0.026355,0.334541,0.086323
2,SQ00014816,A03,DMSO,,,C-7161-01-LM6-023,DMSO,0.798243,0.629534,0.732348,...,1.32436,1.204122,1.175088,1.187142,0.707714,0.864548,0.333489,-0.142881,-0.329093,-0.101865
3,SQ00014816,A04,DMSO,,,C-7161-01-LM6-023,DMSO,0.078415,1.137618,-1.02312,...,0.202593,-0.361407,-0.2226,-0.791294,-0.435237,-1.125888,-0.471284,0.115125,-0.102395,0.220038
4,SQ00014816,A05,DMSO,,,C-7161-01-LM6-023,DMSO,-0.748588,0.014485,-0.871413,...,0.789568,0.073901,0.299092,0.18554,-0.744029,-0.585606,-0.531748,-0.039343,-0.218322,0.156781


In [9]:
# Confirm that all plates are loaded: the plate IDs present in the merged frame
# must match the plate IDs discovered from the directory listing
loaded_plates = sorted(complete_df.Image_Metadata_Plate.unique())
assert loaded_plates == sorted(all_plates)

## Recode Dose Information

In [10]:
def recode_dose(x, doses, return_level=False):
    """Snap a dose value to the nearest entry of a list of reference doses.

    Parameters
    ----------
    x : float
        Dose to recode. NaN is treated as "no dose".
    doses : sequence of numbers
        Reference doses to snap to. Ties resolve to the earliest entry.
    return_level : bool, default False
        If True, return the 1-based position of the nearest reference dose
        instead of the reference dose itself.

    Returns
    -------
    0 when ``x`` is NaN; otherwise the nearest reference dose, or its 1-based
    position when ``return_level`` is True.
    """
    # Handle the missing-dose case before any distance computation so the
    # result does not depend on how argmin treats NaN (or an empty `doses`).
    if np.isnan(x):
        return 0

    closest_index = np.argmin([np.abs(dose - x) for dose in doses])
    if return_level:
        return closest_index + 1
    return doses[closest_index]

In [11]:
# Reference doses that measured doses are snapped to by recode_dose; the
# position in this list (1-based) becomes the dose level.
# NOTE(review): presumably in the same units as Metadata_mmoles_per_liter -- confirm
primary_dose_mapping = [0.04, 0.12, 0.37, 1.11, 3.33, 10, 20]

In [12]:
# Recode every profile's dose to its nearest reference dose level
# (1-based; 0 marks a missing dose)
dose_levels = complete_df.Metadata_mmoles_per_liter.apply(
    lambda x: recode_dose(x, primary_dose_mapping, return_level=True)
)
complete_df = complete_df.assign(Metadata_dose_recode=dose_levels)

print(complete_df.shape)
complete_df.head()

(51839, 1606)


Unnamed: 0,Image_Metadata_Plate,Image_Metadata_Well,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_plate_map_name,Metadata_solvent,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0,Metadata_dose_recode
0,SQ00014816,A01,DMSO,,,C-7161-01-LM6-023,DMSO,1.345312,-1.723697,-0.958103,...,1.518992,1.655247,1.223732,0.019179,0.181705,0.554384,-3.82543,-3.217268,-4.302272,0
1,SQ00014816,A02,DMSO,,,C-7161-01-LM6-023,DMSO,1.401299,0.97717,1.035763,...,0.103865,0.43868,-0.322249,-0.153979,-0.047031,-0.196225,0.026355,0.334541,0.086323,0
2,SQ00014816,A03,DMSO,,,C-7161-01-LM6-023,DMSO,0.798243,0.629534,0.732348,...,1.204122,1.175088,1.187142,0.707714,0.864548,0.333489,-0.142881,-0.329093,-0.101865,0
3,SQ00014816,A04,DMSO,,,C-7161-01-LM6-023,DMSO,0.078415,1.137618,-1.02312,...,-0.361407,-0.2226,-0.791294,-0.435237,-1.125888,-0.471284,0.115125,-0.102395,0.220038,0
4,SQ00014816,A05,DMSO,,,C-7161-01-LM6-023,DMSO,-0.748588,0.014485,-0.871413,...,0.073901,0.299092,0.18554,-0.744029,-0.585606,-0.531748,-0.039343,-0.218322,0.156781,0


In [13]:
# Tally how many profiles fall into each recoded dose level (0 = dose was NaN)
complete_df.Metadata_dose_recode.value_counts()

1    7634
4    7560
3    7560
2    7560
5    7533
6    7512
7    3240
0    3240
Name: Metadata_dose_recode, dtype: int64

## Create Consensus Profiles

### a) Generate different consensus profiles for DMSO

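DMSO profiles are collapsed per well position: `Image_Metadata_Well` is included as a replicate column, so each well yields its own DMSO consensus profile.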

In [14]:
# Including the well among the grouping columns gives one DMSO consensus
# profile per well position rather than a single collapsed DMSO profile
replicate_cols = ["Metadata_broad_sample", "Metadata_dose_recode", "Image_Metadata_Well"]

dmso_df = complete_df.query("Metadata_broad_sample == 'DMSO'")
dmso_consensus_df = modz(
    dmso_df, features="infer", replicate_columns=replicate_cols, precision=5
).reset_index()

print(dmso_consensus_df.shape)
dmso_consensus_df.head(2)

(24, 1601)


Unnamed: 0,Metadata_broad_sample,Metadata_dose_recode,Image_Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,DMSO,0,A01,0.666739,-0.350003,-0.798849,-0.274376,-0.090479,0.365063,-1.363923,...,1.360451,0.829904,0.694305,0.94365,-0.016141,-0.200708,0.212915,-1.561151,-1.066828,-1.491089
1,DMSO,0,A02,0.925387,-0.272044,-0.483693,-0.342384,-0.163553,0.51958,-0.634225,...,1.254153,0.63873,0.527234,0.750928,0.143006,0.048972,0.23013,-0.485525,0.243315,-0.633394


### b) Generate consensus profiles for all treatments

In [15]:
# Treatments are grouped by compound and dose level only
replicate_cols = ["Metadata_broad_sample", "Metadata_dose_recode"]

treatment_df = complete_df.query("Metadata_broad_sample != 'DMSO'")
complete_consensus_df = (
    modz(treatment_df, features="infer", replicate_columns=replicate_cols, precision=5)
    .reset_index()
    # Placeholder well value so the columns line up with the per-well DMSO profiles
    .assign(Image_Metadata_Well="collapsed")
)

print(complete_consensus_df.shape)
complete_consensus_df.head(2)

(9394, 1601)


Unnamed: 0,Metadata_broad_sample,Metadata_dose_recode,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_MajorAxisLength,...,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0,Image_Metadata_Well
0,BRD-A00147595-001-01-5,1,0.186339,0.924692,-0.348695,-0.846292,-0.941928,0.503658,-0.874197,-0.01521,...,0.140988,0.24812,0.240263,0.801974,0.738027,0.950668,-1.470728,-1.535147,-1.214969,collapsed
1,BRD-A00147595-001-01-5,2,0.879525,-0.195074,1.056588,-1.082775,-1.345161,1.125618,-0.595494,0.516398,...,0.856334,0.634595,0.734347,1.369089,1.444183,1.390633,-1.151322,-1.13197,-1.085988,collapsed


### c) Merge Together

In [16]:
# Partition the consensus columns by whether their name contains "Metadata"
is_metadata_col = complete_consensus_df.columns.str.contains("Metadata")

# Cell Painting feature columns (everything that is not metadata)
repurp_cp_cols = complete_consensus_df.columns[~is_metadata_col].tolist()

# Metadata columns, in their existing order
meta_cols = complete_consensus_df.columns[is_metadata_col].tolist()

In [17]:
# Stack treatment and DMSO consensus profiles into one frame
stacked_consensus_df = pd.concat([complete_consensus_df, dmso_consensus_df], sort=True)

# Renumber the rows, then put metadata columns first followed by the features
complete_consensus_df = (
    stacked_consensus_df
    .reset_index(drop=True)
    .loc[:, meta_cols + repurp_cp_cols]
)

print(complete_consensus_df.shape)
complete_consensus_df.head()

(9418, 1601)


Unnamed: 0,Metadata_broad_sample,Metadata_dose_recode,Image_Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,BRD-A00147595-001-01-5,1,collapsed,0.186339,0.924692,-0.348695,-0.846292,-0.941928,0.503658,-0.874197,...,0.148197,0.140988,0.24812,0.240263,0.801974,0.738027,0.950668,-1.470728,-1.535147,-1.214969
1,BRD-A00147595-001-01-5,2,collapsed,0.879525,-0.195074,1.056588,-1.082775,-1.345161,1.125618,-0.595494,...,1.085211,0.856334,0.634595,0.734347,1.369089,1.444183,1.390633,-1.151322,-1.13197,-1.085988
2,BRD-A00147595-001-01-5,3,collapsed,0.848461,-0.94143,0.764949,-0.497313,-0.70588,0.222394,-0.808033,...,0.526802,0.342139,0.340685,0.353097,0.766674,0.740311,0.735777,-1.106007,-1.091512,-1.21972
3,BRD-A00147595-001-01-5,4,collapsed,0.530306,0.000711,-0.124764,-1.683507,-1.444979,0.784652,-1.049982,...,0.697315,0.976119,0.760866,0.718084,0.755641,0.762213,0.751562,-1.319418,-1.034737,-1.182503
4,BRD-A00147595-001-01-5,5,collapsed,1.859576,0.273028,0.411481,-1.327421,-1.634525,1.293491,-2.62224,...,1.538641,2.577332,2.482127,2.494136,2.445294,2.385832,2.271489,-1.473466,-1.112665,-0.696011


### d) Output Profiles

In [18]:
# Output consensus profiles as a gzip-compressed, tab-separated file
output_file = os.path.join(
    output_dir, "repurposing_{}_consensus.tsv.gz".format(consensus)
)
complete_consensus_df.to_csv(output_file, sep="\t", index=False, compression="gzip")

In [19]:
# Extract the CellProfiler feature names: every model-input column whose name
# does not start with "Metadata"
is_metadata_feature = x_test_df.columns.str.startswith("Metadata")
cp_features = x_test_df.columns[~is_metadata_feature].tolist()

## 3) Apply all Regression Models to all Repurposing Plates

In [20]:
# Model inputs: consensus profiles with columns ordered as in the test split
feature_df = complete_consensus_df.reindex(x_test_df.columns, axis="columns")
metadata_df = complete_consensus_df.loc[:, meta_cols]

# Apply every real (non-shuffled) model to the same feature matrix; each entry
# of all_scores holds one target's predictions for all consensus profiles
all_scores = {}
for cell_health_feature, model_clf in model_dict.items():
    all_scores[cell_health_feature] = model_clf.predict(feature_df)

## 4) Output Results

In [21]:
# Output scores
all_score_df = pd.DataFrame.from_dict(all_scores)

# Pair each profile's metadata with its predictions through the shared row index
full_df = metadata_df.merge(all_score_df, left_index=True, right_index=True)

output_real_file = os.path.join(
    output_dir, "repurposing_transformed_real_models_{}.tsv.gz".format(consensus)
)
full_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

print(full_df.shape)
full_df.head()

(9418, 73)


Unnamed: 0,Metadata_broad_sample,Metadata_dose_recode,Image_Metadata_Well,cell_health_modz_target_cc_all_n_spots_mean,cell_health_modz_target_cc_g1_high_n_spots_h2ax_mean,cell_health_modz_target_cc_g1_n_objects,cell_health_modz_target_vb_infection_percentage,cell_health_modz_target_vb_ros_mean,cell_health_modz_target_vb_ros_back_mean,cell_health_modz_target_cc_edu_pos_alexa647_intensity_nucleus_area_mean,...,cell_health_modz_target_cc_g2_ph3_pos_n_spots_mean,cell_health_modz_target_cc_edu_pos_high_n_spots_h2ax_mean,cell_health_modz_target_cc_edu_pos_n_objects,cell_health_modz_target_cc_g2_ph3_pos_n_spots_per_nucleus_area_mean,cell_health_modz_target_cc_cc_ph3_pos_hoechst_mitosis_mean,cell_health_modz_target_cc_all_large_round_polyploid_mean,cell_health_modz_target_cc_polyploid_n_spots_per_nucleus_area_mean,cell_health_modz_target_cc_g1_n_spots_mean,cell_health_modz_target_cc_g2_ph3_neg_n_spots_per_nucleus_area_mean,cell_health_modz_target_cc_cc_g1_mean
0,BRD-A00147595-001-01-5,1,collapsed,-0.189944,0.047783,0.238814,0.054131,-0.476258,-0.84134,0.175389,...,0.246334,-0.035865,-0.006384,0.172511,-0.167092,-0.072087,0.012556,0.090961,0.047427,0.136618
1,BRD-A00147595-001-01-5,2,collapsed,0.23604,0.203503,-0.280355,0.05228,-0.226435,-0.586786,0.290606,...,0.277289,0.077988,-0.455297,0.350063,-0.293911,1.56758,0.194967,0.200448,0.284758,0.007193
2,BRD-A00147595-001-01-5,3,collapsed,0.260775,0.198122,-0.100472,0.054434,-0.072772,-0.305702,0.15457,...,0.238786,0.172955,-0.135245,0.242095,-0.090953,1.206485,0.119019,0.112033,0.243323,0.074837
3,BRD-A00147595-001-01-5,4,collapsed,0.419343,0.27406,-0.144688,0.05476,-0.117998,-0.25842,0.014714,...,0.295045,0.173966,-0.091213,0.288943,-0.158568,0.621504,0.274162,0.339131,0.391647,0.330281
4,BRD-A00147595-001-01-5,5,collapsed,0.610659,0.609894,-1.329395,0.046039,0.45186,-0.667298,0.125709,...,0.372522,-0.054804,-1.406514,0.418132,-0.72925,1.827549,-0.005998,0.376219,0.615796,0.448314


## 5) Apply UMAP

### Part 1: Apply UMAP to Cell Health Transformed Repurposing Hub Features

In [22]:
# Names of the predicted cell health targets (the keys of model_dict), which
# are also the prediction column names in full_df
cell_health_features = list(model_dict.keys())

In [23]:
# Two-dimensional UMAP of the predicted cell health scores (fixed random_state)
reducer = umap.UMAP(random_state=1234, n_components=2)

metadata_df = full_df.drop(cell_health_features, axis="columns")

umap_coordinates = reducer.fit_transform(full_df.loc[:, cell_health_features])
real_embedding_df = pd.DataFrame(umap_coordinates, columns=["umap_x", "umap_y"])

# Re-attach the metadata to the coordinates through the shared row index
real_embedding_df = metadata_df.merge(
    real_embedding_df, left_index=True, right_index=True
)

output_real_file = os.path.join(
    output_dir, "repurposing_umap_transformed_real_models_{}.tsv.gz".format(consensus)
)
real_embedding_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

Compilation is falling back to object mode WITH looplifting enabled because Function "make_euclidean_tree" failed type inference due to: Cannot unify RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none) and RandomProjectionTreeNode(none, bool, array(float32, 1d, C), float64, RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none), RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none)) for '$46call_function.15', defined at /home/ubuntu/miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py (446)

File "../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py", line 446:
def make_euclidean_tree(data, indices, rng_state, leaf_size=30):
    <source elided>

        left_node = make_euclidean_tree(data, left_indices, rng_state, leaf_size)
        ^

[1] During: resolving callee type: recursive(type(CPUDispatcher(<function make_euclidean_tree at 0x7f258d152488>)))
[2] During: ty

### Part 2: Apply UMAP to All Repurposing Hub Cell Painting Profiles

In [24]:
# Two-dimensional UMAP of the raw Cell Painting consensus features (fixed random_state)
reducer = umap.UMAP(random_state=1234, n_components=2)

complete_metadata_df = complete_consensus_df.drop(cp_features, axis="columns")

cp_umap_coordinates = reducer.fit_transform(complete_consensus_df.loc[:, cp_features])
complete_embedding_df = pd.DataFrame(cp_umap_coordinates, columns=["umap_x", "umap_y"])

# Re-attach the metadata to the coordinates through the shared row index
complete_embedding_df = complete_metadata_df.merge(
    complete_embedding_df, left_index=True, right_index=True
)

output_real_file = os.path.join(
    output_dir, "repurposing_umap_transformed_cell_painting_{}.tsv.gz".format(consensus)
)
complete_embedding_df.to_csv(output_real_file, sep="\t", index=False, compression="gzip")

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))


## Merge Data Together for Shiny App Exploration

In [25]:
# Load perturbation information (tab-separated; one row per perturbation ID)
pert_info_file = os.path.join("data", "pert_info.txt")
pert_info_df = pd.read_csv(pert_info_file, sep="\t")

print(pert_info_df.shape)
pert_info_df.head()

(1571, 4)


Unnamed: 0,pert_id,pert_iname,pert_type,moa
0,DMSO,DMSO,ctl_vehicle,Control vehicle
1,BRD-K25114078,aminoguanidine,trt_cp,Nitric oxide synthase inhibitor
2,BRD-K25204779,pritelivir,trt_cp,Helicase primase inhibitor
3,BRD-K25361343,batimastat,trt_cp,Matrix metalloprotease inhibitor
4,BRD-K25140590,cx516,trt_cp,Glutamate receptor modulator


In [26]:
# Reduce each full sample ID to its first two dash-separated fields
# (e.g. "BRD-A00147595-001-01-5" -> "BRD-A00147595"); DMSO is kept as is
core_id = []
for broad_sample in full_df.Metadata_broad_sample:
    if broad_sample == "DMSO":
        core_id.append(broad_sample)
    else:
        id_fields = broad_sample.split("-")
        core_id.append("{}-{}".format(id_fields[0], id_fields[1]))

# Attach the core ID to the embedding and annotate it with perturbation
# information; the left join keeps profiles that have no annotation
embedding_with_core_df = (
    real_embedding_df
    .assign(Metadata_broad_core_id=core_id)
    .sort_index(axis="columns")
)
pert_df = embedding_with_core_df.merge(
    pert_info_df,
    how="left",
    left_on="Metadata_broad_core_id",
    right_on="pert_id"
)

print(pert_df.shape)
pert_df.head()

(9418, 10)


Unnamed: 0,Image_Metadata_Well,Metadata_broad_core_id,Metadata_broad_sample,Metadata_dose_recode,umap_x,umap_y,pert_id,pert_iname,pert_type,moa
0,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,1,1.453025,-0.649855,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist
1,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,2,-1.114797,-2.016154,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist
2,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,3,-0.653508,-1.56534,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist
3,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,4,0.039578,-2.144439,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist
4,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,5,-3.133264,-2.998826,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist


In [27]:
# Columns that together identify one consensus profile in both frames
shiny_merge_cols = ["Metadata_broad_sample", "Metadata_dose_recode", "Image_Metadata_Well"]

# Combine the annotated embedding with the predicted cell health scores
shiny_df = pert_df.merge(full_df, on=shiny_merge_cols, how="inner")

print(shiny_df.shape)
shiny_df.head()

(9418, 80)


Unnamed: 0,Image_Metadata_Well,Metadata_broad_core_id,Metadata_broad_sample,Metadata_dose_recode,umap_x,umap_y,pert_id,pert_iname,pert_type,moa,...,cell_health_modz_target_cc_g2_ph3_pos_n_spots_mean,cell_health_modz_target_cc_edu_pos_high_n_spots_h2ax_mean,cell_health_modz_target_cc_edu_pos_n_objects,cell_health_modz_target_cc_g2_ph3_pos_n_spots_per_nucleus_area_mean,cell_health_modz_target_cc_cc_ph3_pos_hoechst_mitosis_mean,cell_health_modz_target_cc_all_large_round_polyploid_mean,cell_health_modz_target_cc_polyploid_n_spots_per_nucleus_area_mean,cell_health_modz_target_cc_g1_n_spots_mean,cell_health_modz_target_cc_g2_ph3_neg_n_spots_per_nucleus_area_mean,cell_health_modz_target_cc_cc_g1_mean
0,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,1,1.453025,-0.649855,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist,...,0.246334,-0.035865,-0.006384,0.172511,-0.167092,-0.072087,0.012556,0.090961,0.047427,0.136618
1,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,2,-1.114797,-2.016154,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist,...,0.277289,0.077988,-0.455297,0.350063,-0.293911,1.56758,0.194967,0.200448,0.284758,0.007193
2,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,3,-0.653508,-1.56534,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist,...,0.238786,0.172955,-0.135245,0.242095,-0.090953,1.206485,0.119019,0.112033,0.243323,0.074837
3,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,4,0.039578,-2.144439,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist,...,0.295045,0.173966,-0.091213,0.288943,-0.158568,0.621504,0.274162,0.339131,0.391647,0.330281
4,collapsed,BRD-A00147595,BRD-A00147595-001-01-5,5,-3.133264,-2.998826,BRD-A00147595,balaglitazone,trt_cp,Insulin sensitizer|PPAR receptor partial agonist,...,0.372522,-0.054804,-1.406514,0.418132,-0.72925,1.827549,-0.005998,0.376219,0.615796,0.448314


In [28]:
# Write the merged table into the shiny app's data folder
shiny_file = os.path.join(
    "repurposing_cellhealth_shiny",
    "data",
    "moa_cell_health_{}.tsv.gz".format(consensus)
)
shiny_df.to_csv(shiny_file, sep="\t", index=False, compression="gzip")