# Apply Cell Health Models to Repurposing Set

**Gregory Way, 2019**

The models were trained (see `3.train`) to predict cell health phenotypes.
Here, I apply both the real and the shuffled models to Cell Painting profiles from the Drug Repurposing set.

I will use these predictions to identify compound perturbation signatures of cell health impact.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from joblib import load
import umap

sys.path.append("../3.train")
from scripts.ml_utils import load_train_test, load_models

In [2]:
# Seed NumPy's global random number generator so results that rely on
# np.random are repeatable after a kernel restart.
RANDOM_SEED = 123
np.random.seed(RANDOM_SEED)

## 1) Load Models and Training Data

In [3]:
# Models are read from the sibling training module's "models" folder
model_dir = os.path.join(os.pardir, "3.train", "models")

# load_models returns a pair, unpacked here as (models keyed by name, coefficients)
# NOTE(review): the exact structure of both objects is defined in
# scripts.ml_utils -- confirm there.
model_dict, model_coef = load_models(model_dir=model_dir)

# Same call with shuffle=True; presumably the models fit to shuffled data
# (used below as a negative control) -- confirm in scripts.ml_utils.
shuffle_model_dict, shuffle_model_coef = load_models(
    model_dir=model_dir,
    shuffle=True,
)

In [4]:
# Train/test splits are read from the sibling training module's "data" folder
data_dir = os.path.join(os.pardir, "3.train", "data")

# Only x_test_df is used later in this notebook (its columns define the
# feature set that every repurposing plate is aligned to).
x_train_df, x_test_df, y_train_df, y_test_df = load_train_test(
    data_dir=data_dir,
    drop_metadata=True,
)

## 2) Extract Repurposing Data Files

In [5]:
# Location of the per-plate drug repurposing profiles.
#
# The original analysis read from an absolute path on the author's machine.
# That path is kept as the default so existing behavior is unchanged, but it can
# be overridden by setting the REPURPOSING_PROFILE_DIR environment variable,
# which lets the notebook run on any machine that has a copy of the profiles.
default_repurposing_profile_dir = os.path.join(
    "/Users",
    "gway",
    "work",
    "projects",
    "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad",
    "workspace",
    "software",
    "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad",
    "subsampling",
    "data",
    "profiles"
)

repurposing_profile_dir = os.environ.get(
    "REPURPOSING_PROFILE_DIR", default_repurposing_profile_dir
)

In [6]:
# Map every plate to the path of its normalized profile file.
#
# os.listdir returns entries in arbitrary order and also returns stray files
# (for example ".DS_Store"). Keep only directories and sort them so that the
# order in which plates are processed -- and therefore the row order of every
# downstream output -- is the same on every run and every machine.
all_plates = sorted(
    entry
    for entry in os.listdir(repurposing_profile_dir)
    if os.path.isdir(os.path.join(repurposing_profile_dir, entry))
)

# The file itself may still be missing for a given plate; that is checked
# (and reported) when the plates are processed.
plate_info = {
    plate: os.path.join(
        repurposing_profile_dir,
        plate,
        "n_all",
        "{}_subsample_all_normalized.csv".format(plate),
    )
    for plate in all_plates
}

## 3) Apply All Real and Shuffled Models to All Repurposing Plates

In [7]:
# Directory that receives every file written by this notebook
output_dir = os.path.join("data", "repurposing_transformed")

# DataFrame.to_csv does not create missing parent directories, so make sure the
# folder exists before any result is written (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

In [8]:
def _attach_scores(plate_metadata, scores, plate_name):
    """Join per-model predictions onto the metadata rows of one plate.

    ``scores`` maps a model name to the array returned by that model's
    ``predict``. The predictions become one column per model, are merged with
    ``plate_metadata`` on the row index, and the plate identifier is stored in
    the ``Metadata_plate`` column.
    """
    score_df = pd.DataFrame.from_dict(scores)
    merged_df = plate_metadata.merge(score_df, left_index=True, right_index=True)
    return merged_df.assign(Metadata_plate=plate_name)


real_models = []
shuffled_models = []
all_dfs = []
all_metadata_dfs = []
for plate in all_plates:
    profile_path = plate_info[plate]

    # Plates without a normalized profile file are reported and skipped
    if not os.path.exists(profile_path):
        print(plate)
        continue

    plate_df = pd.read_csv(profile_path)

    # Align the plate to the columns of x_test_df; features absent from the
    # plate (and any other missing values) are set to 0
    plate_features = plate_df.reindex(x_test_df.columns, axis="columns").fillna(0)
    plate_metadata = plate_df.loc[:, plate_df.columns.str.startswith("Metadata_")]

    # Keep the raw features and metadata for the full-profile UMAP later on
    all_dfs.append(plate_features)
    all_metadata_dfs.append(plate_metadata)

    # Score the plate with the real model and its shuffled counterpart,
    # one cell health target at a time
    real_scores = {}
    shuffle_scores = {}
    for target, real_model in model_dict.items():
        real_scores[target] = real_model.predict(plate_features)
        shuffle_scores[target] = shuffle_model_dict[target].predict(plate_features)

    real_models.append(_attach_scores(plate_metadata, real_scores, plate))
    shuffled_models.append(_attach_scores(plate_metadata, shuffle_scores, plate))

SQ00015049
.DS_Store


## 4) Output Results

In [9]:
# Stack the per-plate feature and metadata frames row-wise
all_df = pd.concat(all_dfs, axis="index")
all_metadata_df = pd.concat(all_metadata_dfs, axis="index")

# Put the metadata columns to the left of the features, then renumber the rows
# (each plate contributed its own 0..n-1 index)
complete_df = pd.concat([all_metadata_df, all_df], axis=1)
complete_df = complete_df.reset_index(drop=True)

print(complete_df.shape)
complete_df.head()

(51839, 1286)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,C-7161-01-LM6-027,,,,DMSO,0.235021,0.753198,-1.498757,0.0,-0.142916,...,0.677879,-0.005078,0.096397,-0.040741,-0.613751,-0.572386,-0.610598,-0.453982,-0.307237,-0.429322
1,C-7161-01-LM6-027,,,,DMSO,0.219177,0.406902,1.150688,0.0,-0.116611,...,0.324203,-0.055949,0.109314,-0.075406,-0.418682,-0.430327,-0.473034,-0.145102,0.054124,-0.303347
2,C-7161-01-LM6-027,,,,DMSO,0.288078,0.776809,1.805348,0.0,-0.149218,...,0.568386,-0.023348,0.037226,0.048057,-0.357287,-0.420457,-0.390093,-0.193983,-0.023151,-0.29085
3,C-7161-01-LM6-027,,,,DMSO,0.203886,0.965698,-0.93652,0.0,-0.109647,...,0.422796,-0.153516,-0.063628,-0.099642,-0.317077,-0.323075,-0.333247,0.169582,0.333924,0.068789
4,C-7161-01-LM6-027,,,,DMSO,-0.155539,0.58005,0.796402,0.0,0.227168,...,0.71017,0.113111,0.188868,0.098013,-0.160567,-0.229516,-0.161166,-0.009321,0.081673,-0.051178


In [10]:
# Combine the real-model predictions of every plate
full_real_df = pd.concat(real_models)

# Column order for the output files: move "Metadata_plate" to the front and
# leave every other column where it was. `output_cols` is reused when the
# shuffled-model predictions are written.
output_cols = full_real_df.columns.tolist()
plate_col_position = output_cols.index("Metadata_plate")
output_cols = (
    [output_cols[plate_col_position]]
    + output_cols[:plate_col_position]
    + output_cols[plate_col_position + 1:]
)
full_real_df = full_real_df.loc[:, output_cols].reset_index(drop=True)

# Write the predictions as a gzipped, tab-separated file
output_real_file = os.path.join(
    output_dir, "repurposing_transformed_real_models.tsv.gz"
)
full_real_df.to_csv(
    output_real_file,
    sep="\t",
    index=False,
    compression="gzip",
)

print(full_real_df.shape)
full_real_df.head()

(51839, 76)


Unnamed: 0,Metadata_plate,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,cc_cc_g1_mean,cc_mitosis_ph3_neg_n_spots_mean,cc_edu_pos_high_n_spots_h2ax_mean,cc_polyploid_n_spots_mean,...,cc_all_nucleus_area_mean,cc_cc_n_spots_mean,vb_ros_back_mean,cc_g1_n_objects,vb_percent_all_apoptosis,cc_all_high_n_spots_h2ax_mean,cc_all_large_round_polyploid_mean,cc_edu_pos_n_objects,cc_all_n_objects,cc_g1_n_spots_per_nucleus_area_mean
0,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.045196,0.333193,0.401969,0.277877,...,-0.301533,0.066646,0.143369,0.080987,-0.008741,0.114305,0.237319,0.202579,0.149984,-0.085613
1,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.002666,0.346278,0.467986,0.339312,...,-0.057043,0.185185,-0.045923,0.112019,-0.008865,0.185761,0.226098,0.155987,0.186664,-0.074611
2,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.044806,0.347656,0.484781,0.333901,...,-0.13477,0.203731,-0.155994,0.156995,-0.002609,0.161276,0.197384,0.221609,0.212112,-0.078172
3,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.033342,0.321796,0.401586,0.26867,...,0.096326,0.102463,-0.33507,0.115113,-0.002003,0.231846,0.270123,0.268766,0.202581,-0.044901
4,SQ00015211,C-7161-01-LM6-027,,,,DMSO,-0.070203,0.327251,0.271946,0.216132,...,0.231349,0.172507,-0.384683,0.255505,-0.007266,0.316771,0.290023,0.420747,0.319717,-0.132968


In [11]:
# Combine the shuffled-model predictions of every plate, using the same column
# order (`output_cols`) that was derived for the real-model predictions so the
# two output files line up column for column.
full_shuffled_df = pd.concat(shuffled_models)
full_shuffled_df = full_shuffled_df.loc[:, output_cols].reset_index(drop=True)

# The path was previously stored in a variable named `output_real_file`
# (copy-paste leftover), which wrongly suggested this was the real-model output.
output_shuffled_file = os.path.join(output_dir, "repurposing_transformed_shuffled_models.tsv.gz")
full_shuffled_df.to_csv(output_shuffled_file, sep="\t", index=False, compression="gzip")

print(full_shuffled_df.shape)
full_shuffled_df.head()

(51839, 76)


Unnamed: 0,Metadata_plate,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,cc_cc_g1_mean,cc_mitosis_ph3_neg_n_spots_mean,cc_edu_pos_high_n_spots_h2ax_mean,cc_polyploid_n_spots_mean,...,cc_all_nucleus_area_mean,cc_cc_n_spots_mean,vb_ros_back_mean,cc_g1_n_objects,vb_percent_all_apoptosis,cc_all_high_n_spots_h2ax_mean,cc_all_large_round_polyploid_mean,cc_edu_pos_n_objects,cc_all_n_objects,cc_g1_n_spots_per_nucleus_area_mean
0,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.124625,0.110353,0.271222,0.246106,...,0.159699,0.27475,-0.035693,-0.131892,0.189744,0.331681,0.175522,-0.171306,-0.127878,0.25426
1,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.049898,0.092547,0.120157,0.183863,...,0.196988,0.27475,-0.035693,-0.121031,0.178849,0.26133,0.179047,-0.171306,-0.134952,0.250298
2,SQ00015211,C-7161-01-LM6-027,,,,DMSO,-0.054671,-0.173521,0.0462,0.150682,...,0.191127,0.27475,-0.035693,-0.128056,0.195753,0.23843,0.179332,-0.171306,-0.121302,0.253423
3,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.062335,-0.120644,0.165967,0.139317,...,0.199954,0.27475,-0.035693,-0.128816,0.182992,0.320382,0.180018,-0.171306,-0.138661,0.256353
4,SQ00015211,C-7161-01-LM6-027,,,,DMSO,0.021092,-0.179247,0.145762,0.151148,...,0.1803,0.27475,-0.035693,-0.123704,0.192018,0.260364,0.178779,-0.171306,-0.140636,0.248248


## 5) Apply UMAP

### Part 1: Apply UMAP to Cell Health Transformed Repurposing Hub Features

In [12]:
# Names of the predicted columns: one per loaded model, in model_dict order
cell_health_features = [*model_dict.keys()]

In [13]:
# Two-dimensional UMAP with a fixed random_state
reducer = umap.UMAP(n_components=2, random_state=1234)

# Every column that is not a model prediction is kept as metadata
metadata_df = full_real_df.drop(columns=cell_health_features)

# Fit UMAP on the predicted cell health columns and keep the 2D coordinates
real_umap_coords = reducer.fit_transform(full_real_df.loc[:, cell_health_features])
real_embedding_df = pd.DataFrame(real_umap_coords, columns=["umap_x", "umap_y"])

# Attach the metadata back to the coordinates by row index
real_embedding_df = metadata_df.merge(
    real_embedding_df, left_index=True, right_index=True
)

output_real_file = os.path.join(
    output_dir, "repurposing_umap_transformed_real_models.tsv.gz"
)
real_embedding_df.to_csv(
    output_real_file,
    sep="\t",
    index=False,
    compression="gzip",
)

Compilation is falling back to object mode WITH looplifting enabled because Function "make_euclidean_tree" failed type inference due to: Cannot unify RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none) and RandomProjectionTreeNode(none, bool, array(float32, 1d, C), float64, RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none), RandomProjectionTreeNode(array(int64, 1d, C), bool, none, none, none, none)) for '$14.16', defined at /Users/gway/miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py (446)

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/rp_tree.py", line 446:
def make_euclidean_tree(data, indices, rng_state, leaf_size=30):
    <source elided>

        left_node = make_euclidean_tree(data, left_indices, rng_state, leaf_size)
        ^

[1] During: resolving callee type: recursive(type(CPUDispatcher(<function make_euclidean_tree at 0x120689d08>)))
[2] During: typing of call a

### Part 2: Apply UMAP to All Repurposing Hub Cell Painting Profiles

In [14]:
# Every column of complete_df whose name does not start with "Metadata_"
is_metadata_column = complete_df.columns.str.startswith("Metadata_")
cell_painting_features = complete_df.columns[~is_metadata_column].tolist()

In [15]:
# Fresh two-dimensional UMAP with the same fixed random_state as before
reducer = umap.UMAP(n_components=2, random_state=1234)

# Metadata is whatever remains after removing the Cell Painting feature columns
complete_metadata_df = complete_df.drop(columns=cell_painting_features)

# Fit UMAP on the Cell Painting feature columns and keep the 2D coordinates
complete_umap_coords = reducer.fit_transform(
    complete_df.loc[:, cell_painting_features]
)
complete_embedding_df = pd.DataFrame(
    complete_umap_coords, columns=["umap_x", "umap_y"]
)

# Attach the metadata back to the coordinates by row index
complete_embedding_df = complete_metadata_df.merge(
    complete_embedding_df, left_index=True, right_index=True
)

# NOTE: the variable name is reused from the earlier cells; the file written
# here holds the Cell Painting embedding.
output_real_file = os.path.join(
    output_dir, "repurposing_umap_transformed_cell_painting.tsv.gz"
)
complete_embedding_df.to_csv(
    output_real_file,
    sep="\t",
    index=False,
    compression="gzip",
)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../../../miniconda3/envs/cell-health/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))
