## Apply all Cell-Health Models to Training and Testing Sets

**Gregory Way, 2019**

In [1]:
import os
import pandas as pd
from joblib import load

from scripts.ml_utils import load_train_test, load_models

In [2]:
%matplotlib inline

In [3]:
def apply_model(model, feature, train_x, test_x):
    """
    Score the training and testing matrices with a single fitted model.

    Arguments:
    model - object exposing `predict`; it is called on `train_x` and then on `test_x`
    feature - label stored in the `model` column of every output row
    train_x - training matrix; its index supplies the `profiles` values of the train rows
    test_x - testing matrix; its index supplies the `profiles` values of the test rows

    Output:
    DataFrame with columns `score`, `profiles`, `Metadata_data_type`
    ("train" or "test") and `model`. Train rows come first, followed by the
    test rows, and the row index is renumbered from zero.
    """
    split_predictions = []
    for split_name, split_x in (("train", train_x), ("test", test_x)):
        split_df = pd.DataFrame(model.predict(split_x), columns=["score"])
        split_df = split_df.assign(
            profiles=split_x.index,
            Metadata_data_type=split_name,
            model=feature,
        )
        split_predictions.append(split_df)

    return pd.concat(split_predictions).reset_index(drop=True)

def sample_squared_error(scores, y):
    """
    Compute the squared error of every sample, separately for each model.

    Arguments:
    scores - DataFrame of model scores; columns whose name starts with
             "Metadata_" are ignored, every other column is treated as one model
    y - DataFrame of observed values holding a column for each scored model

    Output:
    DataFrame with one column per scored model and rows ordered like
    `scores.index`. Samples whose observed value is missing are NaN.
    """
    model_scores = scores.drop(
        [col for col in scores.columns if col.startswith("Metadata_")],
        axis="columns",
    )

    def _feature_error(feature):
        # Only samples with an observed value contribute an error
        observed = y.loc[:, feature].dropna()
        predicted = model_scores.loc[:, feature].reindex(observed.index)
        return (observed - predicted) ** 2

    error_by_feature = {
        feature: _feature_error(feature) for feature in model_scores.columns
    }

    # Reindexing restores the full sample order; dropped samples become NaN
    return pd.DataFrame(error_by_feature).reindex(scores.index)

## 1) Load Models and Model Coefficients

Loaded for both the real-data models and the shuffled-data models.

In [4]:
# Models and coefficients for the real data
(model_dict, model_coef) = load_models()

# Same pair for the shuffled variant (selected with shuffle=True)
(shuffle_model_dict, shuffle_model_coef) = load_models(shuffle=True)

In [5]:
# Read the tab-separated file that maps every profile id to its metadata columns
file = os.path.join("data", "profile_id_metadata_mapping.tsv")
metadata_df = pd.read_csv(file, sep="\t")

metadata_df.head()

Unnamed: 0,Metadata_profile_id,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line
0,profile_0,AKT1,AKT1-1,A549
1,profile_1,AKT1,AKT1-2,A549
2,profile_2,ARID1B,ARID1B-1,A549
3,profile_3,ARID1B,ARID1B-2,A549
4,profile_4,ATF4,ATF4-1,A549


## 2) Load Training and Testing Data

In [6]:
# Feature (x) and target (y) matrices for the train and test splits
(
    x_train_df,
    x_test_df,
    y_train_df,
    y_test_df,
) = load_train_test(drop_metadata=True)

## 3) Output Model Coefficients

In [7]:
# Collect the coefficients of every real-data model into one table and write it to file
coef_df = pd.DataFrame(model_coef)

# Label the rows with a renamed *copy* of the feature names. Assigning
# x_test_df.columns directly and then setting coef_df.index.name would rename the
# Index object shared with x_test_df, changing x_test_df.columns.name as a side effect.
coef_df.index = x_test_df.columns.rename("features")

file = os.path.join("results", "all_model_coefficients.tsv")
coef_df.to_csv(file, sep='\t', index=True)

print(coef_df.shape)
coef_df.head(2)

(1281, 70)


Unnamed: 0_level_0,cc_cc_g1_mean,cc_mitosis_ph3_neg_n_spots_mean,cc_edu_pos_high_n_spots_h2ax_mean,cc_polyploid_n_spots_mean,vb_live_cell_roundness,cc_g2_ph3_neg_n_spots_mean,cc_polyploid_high_n_spots_h2ax_mean,vb_ros_mean,cc_cc_n_spots_per_nucleus_area_mean,cc_mitosis_ph3_neg_high_n_spots_h2ax_mean,...,cc_all_nucleus_area_mean,cc_cc_n_spots_mean,vb_ros_back_mean,cc_g1_n_objects,vb_percent_all_apoptosis,cc_all_high_n_spots_h2ax_mean,cc_all_large_round_polyploid_mean,cc_edu_pos_n_objects,cc_all_n_objects,cc_g1_n_spots_per_nucleus_area_mean
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cells_AreaShape_Area,0.0,0.0,0.077788,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0
Cells_AreaShape_Center_X,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,...,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0


In [8]:
# Collect the coefficients of every shuffled model into one table and write it to file
shuffle_coef_df = pd.DataFrame(shuffle_model_coef)

# Label the rows with a renamed *copy* of the feature names. Assigning
# x_test_df.columns directly and then setting shuffle_coef_df.index.name would rename
# the Index object shared with x_test_df, changing x_test_df.columns.name as a side effect.
shuffle_coef_df.index = x_test_df.columns.rename("features")

file = os.path.join("results", "all_model_coefficients_shuffled.tsv")
shuffle_coef_df.to_csv(file, sep='\t', index=True)

print(shuffle_coef_df.shape)
shuffle_coef_df.head(2)

(1281, 70)


Unnamed: 0_level_0,cc_mitosis_ph3_pos_n_spots_mean,vb_percent_caspase_dead_only,vb_infection_percentage,cc_g2_ph3_pos_n_objects,cc_g2_ph3_pos_high_n_spots_h2ax_mean,cc_all_nucleus_roundness_mean,cc_edu_pos_n_spots_mean,vb_percent_all_early_apoptosis,vb_live_cell_width_length,cc_cc_ph3_pos_hoechst_mitosis_mean,...,cc_polynuclear_n_spots_mean,cc_edu_pos_n_objects,cc_g1_n_objects,cc_all_n_objects,cc_cc_g2_ph3_neg_mean,cc_g1_high_n_spots_h2ax_mean,cc_polynuclear_n_spots_per_nucleus_area_mean,cc_polynuclear_n_objects,cc_mitosis_ph3_pos_high_n_spots_h2ax_mean,cc_g2_ph3_neg_n_objects
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cells_AreaShape_Area,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
Cells_AreaShape_Center_X,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,...,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0


## 4) Apply all models

For real and shuffled data.

In [9]:
# Both model families are scored on the same train/test matrices
shared_inputs = dict(train_x=x_train_df, test_x=x_test_df)

all_scores = []
all_shuffle_scores = []
for cell_health_feature, model_clf in model_dict.items():
    # Predictions from the real-data model for this cell health feature
    pred_df = apply_model(
        model=model_clf, feature=cell_health_feature, **shared_inputs
    )
    all_scores.append(pred_df)

    # Predictions from the shuffled model stored under the same feature key
    shuffle_model_clf = shuffle_model_dict[cell_health_feature]
    shuffle_pred_df = apply_model(
        model=shuffle_model_clf, feature=cell_health_feature, **shared_inputs
    )
    all_shuffle_scores.append(shuffle_pred_df)

## 5) Concatenate scores with Metadata

In [10]:
# Stack the per-model predictions of the real models, then pivot so that each
# (profile, data split) pair is one row and each model is one column
all_scores = pd.concat(all_scores).reset_index(drop=True)
all_scores = all_scores.pivot_table(
    index=["profiles", "Metadata_data_type"],
    columns="model",
    values="score",
).reset_index()

# Attach the profile metadata and index the table by profile id
all_scores = (
    metadata_df
    .merge(all_scores, left_on="Metadata_profile_id", right_on="profiles")
    .drop("profiles", axis="columns")
    .set_index("Metadata_profile_id")
)

# Output file
file = os.path.join("results", "all_model_predictions.tsv")
all_scores.to_csv(file, sep="\t", index=True)

print(all_scores.shape)
all_scores.head(2)

(357, 74)


Unnamed: 0_level_0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,Metadata_data_type,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
Metadata_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
profile_0,AKT1,AKT1-1,A549,train,0.143341,0.446032,0.09461,0.085508,0.289411,0.264312,...,-0.06956,0.062529,0.113555,0.054362,0.070312,0.113467,0.086319,-0.114478,0.412711,0.740138
profile_1,AKT1,AKT1-2,A549,train,-0.083619,0.33609,-0.114656,0.307962,-0.096816,-0.113923,...,0.197695,0.006167,0.113555,-0.004837,0.070426,0.060157,0.244093,-0.060248,0.222378,0.522029


In [11]:
# Stack the per-model predictions of the shuffled models, then pivot so that each
# (profile, data split) pair is one row and each model is one column
all_shuffle_scores = pd.concat(all_shuffle_scores).reset_index(drop=True)
all_shuffle_scores = all_shuffle_scores.pivot_table(
    index=["profiles", "Metadata_data_type"],
    columns="model",
    values="score",
).reset_index()

# Attach the profile metadata and index the table by profile id
all_shuffle_scores = (
    metadata_df
    .merge(all_shuffle_scores, left_on="Metadata_profile_id", right_on="profiles")
    .drop("profiles", axis="columns")
    .set_index("Metadata_profile_id")
)

# Output file
file = os.path.join("results", "all_model_predictions_shuffled.tsv")
all_shuffle_scores.to_csv(file, sep="\t", index=True)

print(all_shuffle_scores.shape)
all_shuffle_scores.head(2)

(357, 74)


Unnamed: 0_level_0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,Metadata_data_type,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
Metadata_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
profile_0,AKT1,AKT1-1,A549,train,0.239382,0.301668,0.179754,-0.131365,0.207153,0.250057,...,-0.22752,0.205359,0.113555,0.19424,0.058084,0.141567,0.254305,-0.231011,-0.035693,0.073917
profile_1,AKT1,AKT1-2,A549,train,0.242489,0.242054,0.181637,-0.108902,0.182758,0.217262,...,-0.22752,0.210314,0.113555,0.19424,0.058084,-0.061834,0.136092,-0.231011,-0.035693,0.073917


## 6) Calculate the Squared Error of Individual Samples

Computed per sample, for both the real and the shuffled model predictions.

In [12]:
# Stack the observed train and test targets, then order the rows like the predictions
y_df = pd.concat([y_train_df, y_test_df])
y_df = y_df.reindex(all_scores.index)

print(y_df.shape)
y_df.head(2)

(357, 70)


Unnamed: 0_level_0,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,cc_cc_edu_pos_mean,cc_cc_g1_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
Metadata_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
profile_0,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,0.039147,0.085392,0.040949,...,0.399842,0.0,0.0,0.0,-0.118976,-0.132871,-0.12109,0.132882,0.80697,1.293984
profile_1,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,-0.183445,0.883316,-1.024142,...,0.10167,0.318027,0.132751,0.467027,0.621374,0.100032,0.074036,-0.099917,0.558041,1.151867


In [13]:
# Per-sample squared error of the real model predictions against the observed values
all_score_error = sample_squared_error(scores=all_scores, y=y_df)

# Join on profile id so every error row carries its metadata columns
all_score_error = pd.merge(
    metadata_df,
    all_score_error,
    left_on="Metadata_profile_id",
    right_index=True,
)

# Output file
file = os.path.join("results", "all_model_sample_squared_error.tsv")
all_score_error.to_csv(file, sep="\t", index=False)

print(all_score_error.shape)
all_score_error.head(2)

(357, 74)


Unnamed: 0,Metadata_profile_id,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1,AKT1-1,A549,0.018275,0.020149,0.005744,0.087612,0.012734,0.005873,...,0.220338,0.00391,0.012895,0.002955,0.03583,0.060682,0.043019,0.061187,0.15544,0.306745
1,profile_1,AKT1,AKT1-2,A549,0.01968,0.86218,0.126811,0.06785,0.110304,0.236787,...,0.009221,0.097256,0.000369,0.222656,0.303544,0.00159,0.02892,0.001574,0.11267,0.396697


In [14]:
# Per-sample squared error of the shuffled model predictions against the observed values
all_shuffle_score_error = sample_squared_error(scores=all_shuffle_scores, y=y_df)

# Join on profile id so every error row carries its metadata columns
all_shuffle_score_error = pd.merge(
    metadata_df,
    all_shuffle_score_error,
    left_on="Metadata_profile_id",
    right_index=True,
)

# Output file
file = os.path.join("results", "all_model_sample_squared_error_shuffled.tsv")
all_shuffle_score_error.to_csv(file, sep="\t", index=False)

print(all_shuffle_score_error.shape)
all_shuffle_score_error.head()

(357, 74)


Unnamed: 0,Metadata_profile_id,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1,AKT1-1,A549,0.053465,0.081973,0.0259,0.263032,0.000936,0.003891,...,0.393583,0.042172,0.012895,0.037729,0.03135,0.075316,0.140921,0.132418,0.710081,1.488563
1,profile_1,AKT1,AKT1-2,A549,0.03453,1.045654,0.003578,0.458795,0.002761,0.024156,...,0.108366,0.011602,0.000369,0.074413,0.317296,0.026201,0.003851,0.017186,0.35252,1.161977
2,profile_2,ARID1B,ARID1B-1,A549,0.034627,0.630117,0.001461,0.177479,0.093001,0.131618,...,0.095,0.02136,0.002663,0.00279,0.29161,0.11329,0.015474,0.030657,0.128339,0.000858
3,profile_3,ARID1B,ARID1B-2,A549,0.101952,0.002761,0.075529,0.059973,0.374354,0.235462,...,0.243319,0.009616,0.00063,0.037729,0.098499,0.214912,0.072445,0.086999,0.060368,0.000347
4,profile_4,ATF4,ATF4-1,A549,13.186057,0.535368,9.648393,4.939061,3.853188,6.535966,...,4.479144,0.021234,0.012895,0.037729,0.021794,0.322729,0.347,0.008031,0.354855,0.00106
