# Apply Cell Health Models to Repurposing Set

**Gregory Way, 2019**

The models (trained in the `3.train` module) predict cell health phenotypes.
Here, I apply the models to Cell Painting data from the repurposing set.

I will use these predictions to identify compound perturbation signatures of cell health impact.

In [1]:
import os
import sys
import pandas as pd
from joblib import load

sys.path.append("../3.train")
from scripts.ml_utils import load_train_test, load_models

## 1) Load Models and Training Data

In [2]:
# Serialized models are read from the sibling training module's output folder
model_dir = os.path.join("..", "3.train", "models")

# Load the regular models first, then the shuffled counterparts.
# Each call yields a (models, coefficients) pair -- the exact container types
# are defined by scripts.ml_utils.load_models (TODO confirm there).
model_dict, model_coef = load_models(model_dir=model_dir)
shuffle_model_dict, shuffle_model_coef = load_models(
    model_dir=model_dir,
    shuffle=True,
)

In [3]:
# Training/testing splits written by the training module.
# x_test_df is reused below: its columns define which features (and in which
# order) are handed to the models when scoring the repurposing plates.
data_dir = os.path.join("..", "3.train", "data")
x_train_df, x_test_df, y_train_df, y_test_df = load_train_test(
    data_dir=data_dir,
    drop_metadata=True,
)

## 2) Extract Repurposing Data Files

In [4]:
# Location of the drug repurposing profiles (one sub-directory per plate).
#
# The default below is the original author's local checkout. To run the
# notebook on another machine, set the REPURPOSING_PROFILE_DIR environment
# variable to the equivalent "profiles" directory instead of editing this cell.
repurposing_profile_dir = os.environ.get(
    "REPURPOSING_PROFILE_DIR",
    os.path.join(
        "/Users",
        "gway",
        "work",
        "projects",
        "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad",
        "workspace",
        "software",
        "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad",
        "subsampling",
        "data",
        "profiles",
    ),
)

In [5]:
# Every entry found in the profile directory is treated as a plate name.
all_plates = os.listdir(repurposing_profile_dir)

# Map each plate name to the path where its normalized "n_all" subsample
# profile is expected. Existence is not checked here; the scoring loop does it.
plate_info = {
    plate_name: os.path.join(
        repurposing_profile_dir,
        plate_name,
        "n_all",
        "{}_subsample_all_normalized.csv".format(plate_name),
    )
    for plate_name in all_plates
}

## 3) Apply All Real and Shuffled Models to All Repurposing Plates

In [6]:
# Directory that receives the transformed repurposing predictions
output_dir = os.path.join("data", "repurposing_transformed")

# DataFrame.to_csv does not create missing parent directories, so make sure the
# target exists before the output cells run (a no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

In [7]:
def _score_plate(models, target_names, feature_df, metadata_df, plate):
    """Predict every target for one plate and attach the plate's metadata.

    Parameters
    ----------
    models : mapping
        Maps a target name to a fitted object exposing ``predict``.
    target_names : iterable of str
        Names to score, in the order the output columns should appear.
    feature_df : pandas.DataFrame
        Feature matrix passed unchanged to each model's ``predict``.
    metadata_df : pandas.DataFrame
        Metadata columns of the same plate, joined to the scores on the index.
    plate : str
        Plate identifier stored in the ``Metadata_plate`` column.

    Returns
    -------
    pandas.DataFrame
        Metadata columns, one prediction column per target, and
        ``Metadata_plate``.
    """
    scores = {name: models[name].predict(feature_df) for name in target_names}
    score_df = pd.DataFrame.from_dict(scores)
    merged_df = metadata_df.merge(score_df, left_index=True, right_index=True)
    return merged_df.assign(Metadata_plate=plate)


real_models = []
shuffled_models = []
for plate in all_plates:
    norm_file = plate_info[plate]

    # Entries without a normalized profile are only reported, not scored
    if not os.path.exists(norm_file):
        print(plate)
        continue

    df = pd.read_csv(norm_file)

    # Align to the features used for the train/test data; features that are
    # absent from this plate become all-zero columns
    feature_df = df.reindex(x_test_df.columns, axis="columns").fillna(0)
    metadata_df = df.loc[:, df.columns.str.startswith("Metadata_")]

    # The real model keys drive both passes so the two outputs share columns
    cell_health_features = list(model_dict.keys())

    # Apply Real Model Classifiers
    real_models.append(
        _score_plate(model_dict, cell_health_features, feature_df, metadata_df, plate)
    )

    # Apply Shuffled Model Classifiers
    shuffled_models.append(
        _score_plate(
            shuffle_model_dict, cell_health_features, feature_df, metadata_df, plate
        )
    )

SQ00015049
.DS_Store


## 4) Output Results

In [8]:
# Stack the per-plate real-model predictions into one table
full_real_df = pd.concat(real_models)

# Write a gzip-compressed, tab-separated file without the row index
output_real_file = os.path.join(
    output_dir,
    "repurposing_transformed_real_models.tsv.gz",
)
full_real_df.to_csv(
    output_real_file,
    sep="\t",
    index=False,
    compression="gzip",
)

# Report the dimensions and preview the first rows
print(full_real_df.shape)
full_real_df.head()

(51839, 76)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,cc_cc_g1_mean,cc_mitosis_ph3_neg_n_spots_mean,cc_edu_pos_high_n_spots_h2ax_mean,cc_polyploid_n_spots_mean,vb_live_cell_roundness,...,cc_cc_n_spots_mean,vb_ros_back_mean,cc_g1_n_objects,vb_percent_all_apoptosis,cc_all_high_n_spots_h2ax_mean,cc_all_large_round_polyploid_mean,cc_edu_pos_n_objects,cc_all_n_objects,cc_g1_n_spots_per_nucleus_area_mean,Metadata_plate
0,C-7161-01-LM6-027,,,,DMSO,0.045196,0.333193,0.401969,0.277877,-0.229005,...,0.066646,0.143369,0.080987,-0.008741,0.114305,0.237319,0.202579,0.149984,-0.085613,SQ00015211
1,C-7161-01-LM6-027,,,,DMSO,0.002666,0.346278,0.467986,0.339312,-0.185377,...,0.185185,-0.045923,0.112019,-0.008865,0.185761,0.226098,0.155987,0.186664,-0.074611,SQ00015211
2,C-7161-01-LM6-027,,,,DMSO,0.044806,0.347656,0.484781,0.333901,-0.136758,...,0.203731,-0.155994,0.156995,-0.002609,0.161276,0.197384,0.221609,0.212112,-0.078172,SQ00015211
3,C-7161-01-LM6-027,,,,DMSO,0.033342,0.321796,0.401586,0.26867,-0.187542,...,0.102463,-0.33507,0.115113,-0.002003,0.231846,0.270123,0.268766,0.202581,-0.044901,SQ00015211
4,C-7161-01-LM6-027,,,,DMSO,-0.070203,0.327251,0.271946,0.216132,0.091225,...,0.172507,-0.384683,0.255505,-0.007266,0.316771,0.290023,0.420747,0.319717,-0.132968,SQ00015211


In [9]:
# Stack the per-plate shuffled-model predictions into one table
shuffled_real_df = pd.concat(shuffled_models)

# Use a dedicated variable for the shuffled output path so that the real-model
# path (output_real_file) is not silently overwritten in the kernel namespace.
output_shuffled_file = os.path.join(
    output_dir, "repurposing_transformed_shuffled_models.tsv.gz"
)
shuffled_real_df.to_csv(
    output_shuffled_file, sep="\t", index=False, compression="gzip"
)

# Report the dimensions and preview the first rows
print(shuffled_real_df.shape)
shuffled_real_df.head()

(51839, 76)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,cc_cc_g1_mean,cc_mitosis_ph3_neg_n_spots_mean,cc_edu_pos_high_n_spots_h2ax_mean,cc_polyploid_n_spots_mean,vb_live_cell_roundness,...,cc_cc_n_spots_mean,vb_ros_back_mean,cc_g1_n_objects,vb_percent_all_apoptosis,cc_all_high_n_spots_h2ax_mean,cc_all_large_round_polyploid_mean,cc_edu_pos_n_objects,cc_all_n_objects,cc_g1_n_spots_per_nucleus_area_mean,Metadata_plate
0,C-7161-01-LM6-027,,,,DMSO,0.124625,0.110353,0.271222,0.246106,-0.175616,...,0.27475,-0.035693,-0.131892,0.189744,0.331681,0.175522,-0.171306,-0.127878,0.25426,SQ00015211
1,C-7161-01-LM6-027,,,,DMSO,0.049898,0.092547,0.120157,0.183863,0.02508,...,0.27475,-0.035693,-0.121031,0.178849,0.26133,0.179047,-0.171306,-0.134952,0.250298,SQ00015211
2,C-7161-01-LM6-027,,,,DMSO,-0.054671,-0.173521,0.0462,0.150682,0.129703,...,0.27475,-0.035693,-0.128056,0.195753,0.23843,0.179332,-0.171306,-0.121302,0.253423,SQ00015211
3,C-7161-01-LM6-027,,,,DMSO,0.062335,-0.120644,0.165967,0.139317,-0.065847,...,0.27475,-0.035693,-0.128816,0.182992,0.320382,0.180018,-0.171306,-0.138661,0.256353,SQ00015211
4,C-7161-01-LM6-027,,,,DMSO,0.021092,-0.179247,0.145762,0.151148,-0.09582,...,0.27475,-0.035693,-0.123704,0.192018,0.260364,0.178779,-0.171306,-0.140636,0.248248,SQ00015211
