## Apply all Cell-Health Models to Training and Testing Sets

**Gregory Way, 2019**

In [1]:
import os
import pandas as pd
from joblib import load

from scripts.ml_utils import load_train_test, load_models

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alpha

In [2]:
%matplotlib inline

In [3]:
def apply_model(model, feature, train_x, test_x):
    """
    Score the training and testing matrices with a single fitted model.

    Arguments:
    model - object exposing a `predict` method that accepts a feature matrix
    feature - label stored in the `model` column of the output
    train_x - training feature matrix; its index fills the `profiles` column
    test_x - testing feature matrix; its index fills the `profiles` column

    Returns:
    A long-format DataFrame (training rows first, then testing rows, with a
    fresh integer index) holding the columns `score`, `profiles`,
    `Metadata_data_type` ("train" or "test") and `model`.
    """
    split_predictions = []
    for split_name, split_x in (("train", train_x), ("test", test_x)):
        split_df = pd.DataFrame(model.predict(split_x), columns=["score"])
        split_df = split_df.assign(
            profiles=split_x.index,
            Metadata_data_type=split_name,
            model=feature,
        )
        split_predictions.append(split_df)

    return pd.concat(split_predictions).reset_index(drop=True)

def sample_squared_error(scores, y):
    """
    Compute the per-sample squared error between predictions and observed values.

    Arguments:
    scores - DataFrame of predictions; columns starting with "Metadata_" are
             ignored and every remaining column is treated as one set of predictions
    y - DataFrame of observed values with a column for every prediction column

    Returns:
    DataFrame aligned to the index of `scores`, one column per prediction column.
    Samples without an observed value are left as NaN.
    """
    metadata_names = [col for col in scores.columns if col.startswith("Metadata_")]
    predictions = scores.drop(columns=metadata_names)

    def _feature_error(feature):
        # Only samples with an observed value contribute an error
        observed = y[feature].dropna()
        predicted = predictions[feature].reindex(observed.index)
        return observed.sub(predicted).pow(2)

    per_feature_error = {
        feature: _feature_error(feature) for feature in predictions.columns
    }
    return pd.DataFrame(per_feature_error).reindex(scores.index)

## 1) Load Models and Model Coefficients

For both the real and the shuffled models.

In [4]:
# Consensus label; passed to the model/data loaders and used in the output file names
consensus = "modz"
# Aggregation method; selects the `{method}_agg` model, data, and results folders
method = "median"
# Alternative setting:
# method = "weighted"

In [5]:
# Both calls read from the same folder and consensus; only `shuffle` differs
shared_model_args = dict(model_dir=f"models/{method}_agg", consensus=consensus)

model_dict, model_coef = load_models(**shared_model_args)
shuffle_model_dict, shuffle_model_coef = load_models(shuffle=True, **shared_model_args)

In [6]:
# Load the metadata mapping file that links each profile ID to its cell line
# and perturbation name
data_dir = os.path.join("..", "1.generate-profiles", "data")
file = os.path.join(data_dir, f"{method}_profile_id_metadata_mapping.tsv")
metadata_df = pd.read_csv(file, sep='\t')

# Preview the first rows
metadata_df.head()

Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name
0,profile_0,A549,AKT1-1
1,profile_1,A549,AKT1-2
2,profile_2,A549,ARID1B-1
3,profile_3,A549,ARID1B-2
4,profile_4,A549,ATF4-1


## 2) Load Training and Testing Data

In [7]:
# Folder passed to `load_train_test` for the selected aggregation method
train_folder = f"data/train_test/{method}_agg/"

In [8]:
# Load the feature (x) and target (y) matrices for the training and testing sets.
# NOTE(review): `drop_metadata=True` presumably strips the metadata columns from
# the returned frames - confirm against scripts.ml_utils.load_train_test
x_train_df, x_test_df, y_train_df, y_test_df = load_train_test(data_dir=train_folder,
                                                               drop_metadata=True, consensus=consensus)

## 3) Output Model Coefficients

In [9]:
# Extract all model coefficients and output to file
coef_df = pd.DataFrame(model_coef)

# Label the rows with the feature names.
# Assumes the coefficients are stored in the same order as the columns of
# `x_test_df` - TODO confirm against scripts.ml_utils.load_models.
# `Index.rename` returns a renamed copy: assigning `x_test_df.columns` directly
# and then setting `coef_df.index.name` can rename the shared Index object in
# place, silently changing `x_test_df.columns.name` as a side effect.
coef_df.index = x_test_df.columns.rename("features")

# Make sure the results folder exists so that writing does not fail on a fresh
# checkout or after switching `method`
output_dir = os.path.join("results", "{}_agg".format(method))
os.makedirs(output_dir, exist_ok=True)

file = os.path.join(output_dir, "all_model_coefficients_{}.tsv".format(consensus))
coef_df.to_csv(file, sep='\t', index=True)

print(coef_df.shape)
coef_df.head(2)

(943, 70)


Unnamed: 0_level_0,cell_health_modz_target_cc_cc_g2,cell_health_modz_target_cc_mitosis_n_objects,cell_health_modz_target_cc_g1_n_objects,cell_health_modz_target_vb_infection_percentage,cell_health_modz_target_cc_late_mitosis_high_h2ax,cell_health_modz_target_cc_s_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_vb_ros_mean,cell_health_modz_target_vb_ros_back_mean,cell_health_modz_target_cc_cc_late_mitosis,cell_health_modz_target_vb_percent_live,...,cell_health_modz_target_cc_cc_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_g1_high_h2ax,cell_health_modz_target_vb_live_cell_area,cell_health_modz_target_cc_polynuclear_n_spots_h2ax_mean,cell_health_modz_target_cc_early_mitosis_high_h2ax,cell_health_modz_target_cc_all_large_round_polyploid_mean,cell_health_modz_target_cc_cc_mitosis,cell_health_modz_target_cc_cc_g1,cell_health_modz_target_cc_cc_n_spots_h2ax_mean,cell_health_modz_target_cc_polyploid_high_h2ax
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cells_AreaShape_Center_Y,0.025784,0.015058,-0.0,-0.0,0.0,-0.0,-0.03125,-0.014778,0.018794,0.002111,...,-0.0,0.0,0.014578,0.0,0.0,0.011547,0.007674,-0.009514,0.0,-0.0
Cells_AreaShape_Compactness,0.0,0.0,-5.625015e-07,0.0,-0.0,0.0,0.0,0.0,0.068026,0.0,...,0.0,-0.0,0.076844,0.0,0.0,0.0,-0.0,0.0,0.0,0.0


In [10]:
# Extract all shuffled model coefficients and output to file
shuffle_coef_df = pd.DataFrame(shuffle_model_coef)

# Label the rows with the feature names.
# Assumes the coefficients are stored in the same order as the columns of
# `x_test_df` - TODO confirm against scripts.ml_utils.load_models.
# `Index.rename` returns a renamed copy: assigning `x_test_df.columns` directly
# and then setting `shuffle_coef_df.index.name` can rename the shared Index
# object in place, silently changing `x_test_df.columns.name` as a side effect.
shuffle_coef_df.index = x_test_df.columns.rename("features")

# Make sure the results folder exists so that writing does not fail on a fresh
# checkout or after switching `method`
output_dir = os.path.join("results", "{}_agg".format(method))
os.makedirs(output_dir, exist_ok=True)

file = os.path.join(output_dir, "all_model_coefficients_shuffled_{}.tsv".format(consensus))
shuffle_coef_df.to_csv(file, sep='\t', index=True)

print(shuffle_coef_df.shape)
shuffle_coef_df.head(2)

(943, 70)


Unnamed: 0_level_0,cell_health_modz_target_cc_mitosis_high_h2ax,cell_health_modz_target_cc_cc_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_cc_cc_early_mitosis,cell_health_modz_target_vb_percent_late_apoptosis,cell_health_modz_target_cc_s_n_objects,cell_health_modz_target_vb_live_cell_area,cell_health_modz_target_cc_g1_high_h2ax,cell_health_modz_target_vb_live_cell_width_length,cell_health_modz_target_vb_live_cell_roundness,cell_health_modz_target_vb_percent_dead_only,...,cell_health_modz_target_vb_percent_early_apoptosis,cell_health_modz_target_cc_all_n_spots_h2ax_mean,cell_health_modz_target_cc_late_mitosis_n_spots_h2ax_per_nucleus_area_mean,cell_health_modz_target_vb_percent_caspase_dead_only,cell_health_modz_target_vb_percent_dead,cell_health_modz_target_cc_g2_n_spots_h2ax_mean,cell_health_modz_target_cc_g2_g1_count,cell_health_modz_target_cc_all_large_notround_polynuclear_mean,cell_health_modz_target_cc_g2_plus_all_m_count,cell_health_modz_target_cc_cc_s
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cells_AreaShape_Center_Y,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.054148,-0.0,0.0,0.0,...,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0
Cells_AreaShape_Compactness,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,...,-0.0,-0.0,-0.0,-0.0,0.0,-0.053503,-0.0,-0.0,-0.0,0.0


## 4) Apply all models

Apply both the real and the shuffled models to the training and testing sets.

In [11]:
all_scores = []
all_shuffle_scores = []
for cell_health_feature in model_dict:
    # Score both splits with the real model first, then with the shuffled model
    # stored under the same name
    for models, collected in (
        (model_dict, all_scores),
        (shuffle_model_dict, all_shuffle_scores),
    ):
        collected.append(
            apply_model(
                model=models[cell_health_feature],
                feature=cell_health_feature,
                train_x=x_train_df,
                test_x=x_test_df,
            )
        )

## 5) Concatenate scores with Metadata

In [12]:
# Reshape the long-format real-model predictions: one row per profile and
# data split, one column per model
scores_wide = (
    pd.concat(all_scores)
    .reset_index(drop=True)
    .pivot_table(
        values="score",
        index=["profiles", "Metadata_data_type"],
        columns="model",
    )
    .reset_index()
)

# Attach the profile metadata and index the result by profile ID
all_scores = (
    metadata_df
    .merge(scores_wide, left_on="Metadata_profile_id", right_on="profiles")
    .drop(columns="profiles")
    .set_index("Metadata_profile_id")
)

# Remove prefix of variable columns
strip_text = "cell_health_{}_target_".format(consensus)
all_scores.columns = [
    column.replace(strip_text, "") for column in all_scores.columns
]

# Output file
file = os.path.join(
    "results", f"{method}_agg", "all_model_predictions_{}.tsv".format(consensus)
)
all_scores.to_csv(file, sep='\t', index=True)

print(all_scores.shape)
all_scores.head(2)

(357, 73)


Unnamed: 0_level_0,Metadata_cell_line,Metadata_pert_name,Metadata_data_type,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
Metadata_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
profile_0,A549,AKT1-1,train,0.228417,0.816116,0.14853,0.331433,0.204878,0.142605,-0.114986,...,0.082109,0.332015,0.428979,0.292545,0.309882,0.152234,0.354071,-0.293712,0.294949,0.52689
profile_1,A549,AKT1-2,train,0.012612,0.598985,0.055341,0.455854,-0.003243,-0.05827,-0.251804,...,0.251683,0.367668,0.448494,0.445084,0.38704,0.18508,0.399842,-0.446246,0.36732,0.627541


In [13]:
# Reshape the long-format shuffled-model predictions: one row per profile and
# data split, one column per model
shuffle_scores_wide = (
    pd.concat(all_shuffle_scores)
    .reset_index(drop=True)
    .pivot_table(
        values="score",
        index=["profiles", "Metadata_data_type"],
        columns="model",
    )
    .reset_index()
)

# Attach the profile metadata and index the result by profile ID
all_shuffle_scores = (
    metadata_df
    .merge(shuffle_scores_wide, left_on="Metadata_profile_id", right_on="profiles")
    .drop(columns="profiles")
    .set_index("Metadata_profile_id")
)

# Remove prefix of variable columns
strip_text = "cell_health_{}_target_".format(consensus)
all_shuffle_scores.columns = [
    column.replace(strip_text, "") for column in all_shuffle_scores.columns
]

# Output file
file = os.path.join(
    "results", f"{method}_agg", "all_model_predictions_shuffled_{}.tsv".format(consensus)
)
all_shuffle_scores.to_csv(file, sep='\t', index=True)

print(all_shuffle_scores.shape)
all_shuffle_scores.head(2)

(357, 73)


Unnamed: 0_level_0,Metadata_cell_line,Metadata_pert_name,Metadata_data_type,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
Metadata_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
profile_0,A549,AKT1-1,train,0.228903,0.235144,0.233312,-0.059212,0.164803,0.076621,0.06709,...,-0.258438,0.235648,0.164073,-0.129406,0.271165,0.139402,0.228445,-0.261712,0.016651,0.063434
profile_1,A549,AKT1-2,train,0.158393,0.235144,0.233312,-0.044404,0.089701,0.039341,0.052826,...,-0.25704,0.197561,0.168769,-0.063346,0.269579,0.139402,0.228445,-0.264186,0.020347,0.05014


## 6) Calculate the Squared Error of Individual Samples

For both the real and the shuffled model predictions.

In [14]:
# Stack the observed training and testing targets, then order the rows to match
# the index of `all_scores`
y_df = pd.concat([y_train_df, y_test_df]).reindex(all_scores.index)

print(y_df.shape)
y_df.head(2)

(357, 70)


Unnamed: 0_level_0,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,cc_cc_early_mitosis,cc_cc_g1,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
Metadata_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
profile_0,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,0.040322,0.375966,0.043915,...,0.438339,0.059414,-0.06505,-0.020236,-0.00797,0.082424,0.0,0.020263,0.408214,0.654575
profile_1,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,-0.182571,0.736046,-1.023968,...,0.067568,0.256141,0.575026,0.225091,0.220461,0.132834,0.386327,-0.224965,0.284962,0.567898


In [16]:
# Per-sample squared error of the real-model predictions, joined onto the
# profile metadata by profile ID
all_score_error = metadata_df.merge(
    sample_squared_error(scores=all_scores, y=y_df),
    left_on="Metadata_profile_id",
    right_index=True,
)

# Output file
file = os.path.join(
    "results", f"{method}_agg", "all_model_sample_squared_error_{}.tsv".format(consensus)
)
all_score_error.to_csv(file, sep='\t', index=False)

print(all_score_error.shape)
all_score_error.head(2)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,A549,AKT1-1,0.054855,0.055585,0.018105,0.002553,0.002936,0.000396,0.002769,...,0.1269,0.074311,0.244065,0.097832,0.10103,0.004873,0.125366,0.09858,0.012829,0.016303
1,profile_1,A549,AKT1-2,0.001411,0.460695,0.03477,0.014779,0.050208,0.180845,0.000689,...,0.033898,0.012438,0.01601,0.048397,0.027749,0.00273,0.000183,0.048965,0.006783,0.003557


In [17]:
# Per-sample squared error of the shuffled-model predictions, joined onto the
# profile metadata by profile ID
all_shuffle_score_error = metadata_df.merge(
    sample_squared_error(scores=all_shuffle_scores, y=y_df),
    left_on="Metadata_profile_id",
    right_index=True,
)

# Output file
file = os.path.join(
    "results", f"{method}_agg", "all_model_sample_squared_error_shuffled_{}.tsv".format(consensus)
)
all_shuffle_score_error.to_csv(file, sep='\t', index=False)

print(all_shuffle_score_error.shape)
all_shuffle_score_error.head()

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,A549,AKT1-1,0.055083,0.119168,0.048109,0.194631,0.000199,0.007377,0.055081,...,0.485499,0.031058,0.052497,0.011918,0.077916,0.003246,0.052187,0.07951,0.153322,0.349447
1,profile_1,A549,AKT1-2,0.011712,1.086985,7.2e-05,0.386668,0.017194,0.107353,0.109475,...,0.10537,0.003432,0.165045,0.083196,0.002413,4.3e-05,0.024927,0.001538,0.070021,0.268073
2,profile_2,A549,ARID1B-1,0.002528,0.92841,0.004647,0.118477,0.078418,0.190376,0.085412,...,0.10226,0.004801,0.037218,0.018486,0.0529,0.012134,0.003003,0.043814,0.14275,0.056966
3,profile_3,A549,ARID1B-2,0.106506,0.006713,0.104066,0.035216,0.397377,0.140361,0.014569,...,0.253177,0.006221,0.045716,0.075955,0.133169,7e-06,0.013593,0.122005,0.032024,0.046444
4,profile_4,A549,ATF4-1,11.715532,0.051837,9.309035,4.0372,4.565127,7.32229,6.369306,...,4.735934,0.010589,0.091343,0.029103,0.035005,0.037206,0.001265,0.015247,0.685666,0.03706
