## Apply all Cell-Health Models to Training and Testing Sets

**Gregory Way, 2019**

In [1]:
import os
import pandas as pd
from joblib import load

from scripts.ml_utils import load_train_test

In [2]:
%matplotlib inline

In [3]:
# Load all models
model_dir = "models"
model_string = "shuffle_False_transform_raw"

model_dict = {}
for model_file in os.listdir(model_dir):

    if model_string not in model_file:
        continue

    model_file_full = os.path.join(model_dir, model_file)
    cell_health_var = (
        model_file_full
        .split("/")[1]
        .replace("cell_health_target_", "")
        .replace("{}.joblib".format(model_string), "")
    )

    model_dict[cell_health_var] = load(model_file_full)

In [4]:
# Load training and testing matrices
x_train_df, x_test_df, y_train_df, y_test_df = load_train_test(drop_metadata=True)

In [5]:
# Apply the models
all_scores = []
for cell_health_feature in model_dict.keys():
    
    model_clf = model_dict[cell_health_feature]
    pred_train_df = (
        pd.DataFrame(model_clf.predict(x_train_df), columns=["score"])
        .assign(profiles=x_train_df.index,
            data_type="train",
                model=cell_health_feature)
    )
    pred_test_df = (
        pd.DataFrame(model_clf.predict(x_test_df), columns=["score"])
        .assign(profiles=x_test_df.index,
                data_type="test",
                model=cell_health_feature)
    )

    pred_df = pd.concat([pred_train_df, pred_test_df]).reset_index(drop=True)
    all_scores.append(pred_df)

In [6]:
# Concatenate scores
all_scores = pd.concat(all_scores).reset_index(drop=True)

print(all_scores.shape)
all_scores.head(2)

(24990, 4)


Unnamed: 0,score,profiles,data_type,model
0,0.047601,profile_346,train,cc_cc_g1_mean_
1,0.334152,profile_185,train,cc_cc_g1_mean_


In [7]:
# Spread data into wide format
all_scores = (
    all_scores
    .pivot_table(index=["profiles", "data_type"],
                 columns="model",
                 values="score")
    .reset_index()
)

all_scores.head(2)

model,profiles,data_type,cc_all_high_n_spots_h2ax_mean_,cc_all_large_notround_polynuclear_mean_,cc_all_large_round_polyploid_mean_,cc_all_n_objects_,cc_all_n_spots_mean_,cc_all_n_spots_per_nucleus_area_mean_,cc_all_nucleus_area_mean_,cc_all_nucleus_roundness_mean_,...,vb_num_live_cells_,vb_percent_all_apoptosis_,vb_percent_all_early_apoptosis_,vb_percent_all_late_apoptosis_,vb_percent_caspase_dead_only_,vb_percent_dead_,vb_percent_dead_only_,vb_percent_live_,vb_ros_back_mean_,vb_ros_mean_
0,profile_0,train,0.143341,0.446032,0.09461,0.085508,0.289411,0.264312,-0.124119,-0.060475,...,-0.06956,0.062529,0.113555,0.054362,0.070312,0.113467,0.086319,-0.114478,0.412711,0.740138
1,profile_1,train,-0.083619,0.33609,-0.114656,0.307962,-0.096816,-0.113923,-0.263247,-0.228549,...,0.197695,0.006167,0.113555,-0.004837,0.070426,0.060157,0.244093,-0.060248,0.222378,0.522029


In [8]:
# Load Metadata Mapping File
file = os.path.join("data", "profile_id_metadata_mapping.tsv")
metadata_df = pd.read_csv(file, sep='\t')

metadata_df.head()

Unnamed: 0,Metadata_profile_id,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line
0,profile_0,AKT1,AKT1-1,A549
1,profile_1,AKT1,AKT1-2,A549
2,profile_2,ARID1B,ARID1B-1,A549
3,profile_3,ARID1B,ARID1B-2,A549
4,profile_4,ATF4,ATF4-1,A549


In [9]:
# Merge together
all_scores = (
    metadata_df.merge(all_scores,
                      left_on="Metadata_profile_id",
                      right_on="profiles")
    .drop("profiles", axis="columns")
)

all_scores.head()

Unnamed: 0,Metadata_profile_id,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,data_type,cc_all_high_n_spots_h2ax_mean_,cc_all_large_notround_polynuclear_mean_,cc_all_large_round_polyploid_mean_,cc_all_n_objects_,cc_all_n_spots_mean_,...,vb_num_live_cells_,vb_percent_all_apoptosis_,vb_percent_all_early_apoptosis_,vb_percent_all_late_apoptosis_,vb_percent_caspase_dead_only_,vb_percent_dead_,vb_percent_dead_only_,vb_percent_live_,vb_ros_back_mean_,vb_ros_mean_
0,profile_0,AKT1,AKT1-1,A549,train,0.143341,0.446032,0.09461,0.085508,0.289411,...,-0.06956,0.062529,0.113555,0.054362,0.070312,0.113467,0.086319,-0.114478,0.412711,0.740138
1,profile_1,AKT1,AKT1-2,A549,train,-0.083619,0.33609,-0.114656,0.307962,-0.096816,...,0.197695,0.006167,0.113555,-0.004837,0.070426,0.060157,0.244093,-0.060248,0.222378,0.522029
2,profile_2,ARID1B,ARID1B-1,A549,train,0.119751,0.588489,0.066686,0.218016,0.148074,...,0.138779,0.086691,0.113555,0.094341,0.066244,-0.009957,-0.044788,0.009996,-0.083336,-0.191892
3,profile_3,ARID1B,ARID1B-2,A549,train,0.195636,0.534554,0.197059,0.04687,0.16822,...,0.029458,0.204262,0.113555,0.201743,0.090637,0.052019,-0.043981,-0.052339,-0.101968,-0.179666
4,profile_4,ATF4,ATF4-1,A549,train,1.263739,0.124262,2.116508,-1.830519,1.068879,...,-2.380564,0.588663,0.113555,0.575612,0.105472,0.768809,0.728644,-0.768476,-0.882329,-0.06989


In [10]:
# Output file
file = os.path.join("results", "all_model_predictions.tsv")
all_scores.to_csv(file, sep='\t', index=False)