# Build consensus signatures (1 signature for each CRISPR guide) from bulk profiles for Cell Health prediction pipeline
**Generate consensus signatures with median + moderated-z-score**

Consensus-profiles are generated via:
MODZ (moderated z-score) 

*reference: cell-health/1.generate-profiles/2.build-consensus-signatures*

In [1]:
import os
import glob
import gzip
from pathlib import Path
import pickle
import re

import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer import get_na_columns, aggregate

# from scipy.special import softmax 

from pycytominer import aggregate
from pycytominer.cyto_utils import infer_cp_features
from scripts.utils import calculate_weighted_agg

## Load Cell Painting Data
These are individual df's pf well-level profiles (level 3) that are concatatenated into a single file per aggregation method.

In [2]:
input_folder = 'data/processed/'
output_folder = 'data/profiles/'
method_list = list(set([(x.split("_")[1:][0]).split('.')[0] for x in glob.glob(input_folder+'*.tsv')]))

# since single-cell grit was not calculated for EMPTY wells, we will use median-aggregated well-level profiles 
# for EMPTY perturbations to form the EMPTY consensus profile for the cell health prediction pipeline
empty_list = []
for file in glob.glob(input_folder+'*.tsv'):
    file_cell_line = file.split('/')[-1].split('.')[0].split('_')[0]
    file_method = file.split('/')[-1].split('.')[0].split('_')[1]
    if "EMPTY" in file:
        print(f"adding {file} to list")
        empties_df = (pd.read_csv(file,sep='\t')
                      .assign(Metadata_cellline = file_cell_line, 
                              Metadata_aggmethod = file_method)
                           )
        empty_list.append(empties_df)
#     print(file.split('/')[-1].split('.')[0].split('_')[1])
empty_profiles = pd.concat(empty_list)
empty_profiles = empty_profiles[sorted(empty_profiles, key = lambda x: x not in empty_profiles.filter(like="Metadata").columns)]
print("total shape: ", empty_profiles.shape)
display(empty_profiles.head())


# perform for both well-level aggregation methods (median and grit-informed)
for method in method_list:# ['weighted']: # 
    print(f"for method is: {method}")
    df_list = []
    for file in glob.glob(input_folder+'*.tsv'):
        file_cell_line = file.split('/')[-1].split('.')[0].split('_')[0]
        file_method = file.split('/')[-1].split('.')[0].split('_')[1]
        if method in file and "EMPTY" not in file:
            print(f"adding {file} to {method} df")
            cell_line_df = (pd.read_csv(file,sep='\t')
                            .assign(Metadata_cellline = file_cell_line, Metadata_aggmethod = file_method)
                           )
            df_list.append(cell_line_df)        
    level3profiles = pd.concat(df_list, axis='rows')
    # add in the EMPTY wells
    level3profiles = pd.concat([level3profiles, empty_profiles], axis='rows')
    # reorder the columns
    level3profiles = level3profiles[sorted(level3profiles, key = lambda x: x not in level3profiles.filter(like="Metadata").columns)]
    print(level3profiles.shape)
    display(level3profiles.head())
    print(infer_cp_features(level3profiles, metadata=True))
    
    # Output final merged file (for all cell lines)
    filename = Path(f"{output_folder}cell_health_profiles_{method}_merged.tsv.gz")
    print(f"filename will be: {filename}")
    level3profiles.to_csv(filename, index=False, sep='\t')

adding data/processed/ES2_median_EMPTY.tsv to list
adding data/processed/HCC44_median_EMPTY.tsv to list
adding data/processed/A549_median_EMPTY.tsv to list


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




total shape:  (504, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_agg_method,Metadata_aggmethod,Metadata_broad_sample,Metadata_cell_line,Metadata_cellline,Metadata_gene_name,...,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0,cell_line
0,SQ00014613,A01,1,A,median,median,,ES2,ES2,EMPTY,...,-0.31805,-0.221235,0.06943,-0.050265,0.151535,-0.20523,0.15945,0.131155,0.15593,ES2
1,SQ00014614,A01,1,A,median,median,,ES2,ES2,EMPTY,...,-0.091395,-0.082155,-0.27264,-0.23089,-0.23042,-0.04145,-0.236835,-0.18955,-0.232285,ES2
2,SQ00014615,A01,1,A,median,median,,ES2,ES2,EMPTY,...,-0.18993,-0.16447,-0.32775,-0.30584,-0.29203,-0.094,-0.24588,-0.21698,-0.27028,ES2
3,SQ00014613,A06,6,A,median,median,,ES2,ES2,EMPTY,...,-0.17573,-0.16735,0.00242,-0.05499,0.04654,-0.14455,0.00976,0.0003,0.02373,ES2
4,SQ00014614,A06,6,A,median,median,,ES2,ES2,EMPTY,...,0.02331,0.045275,0.158715,0.073295,0.195675,0.08258,0.144725,0.143755,0.16567,ES2


for method is: median
adding data/processed/ES2_median.tsv to median df
adding data/processed/HCC44_median.tsv to median df
adding data/processed/A549_median.tsv to median df


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_agg_method,Metadata_aggmethod,Metadata_broad_sample,Metadata_cell_line,Metadata_cellline,Metadata_gene_name,...,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0,cell_line
0,SQ00014613,A02,2,A,median,median,,ES2,ES2,MCL1,...,0.0467,0.17243,0.266465,0.091205,0.323115,0.225055,0.2975,0.261055,0.31938,ES2
1,SQ00014614,A02,2,A,median,median,,ES2,ES2,MCL1,...,0.25365,0.28263,0.1222,0.06795,0.15079,0.32381,0.11377,0.13046,0.12595,ES2
2,SQ00014615,A02,2,A,median,median,,ES2,ES2,MCL1,...,-0.18378,-0.17234,-0.36635,-0.32789,-0.34253,-0.11056,-0.28869,-0.23675,-0.31655,ES2
3,SQ00014613,A03,3,A,median,median,BRDN0001054908,ES2,ES2,AKT1,...,0.119705,0.265515,0.570185,0.312945,0.65303,0.25078,0.644895,0.611595,0.64429,ES2
4,SQ00014614,A03,3,A,median,median,BRDN0001054908,ES2,ES2,AKT1,...,0.06954,0.13137,0.12238,0.09632,0.15173,0.14589,0.11943,0.1195,0.12892,ES2


['Metadata_Plate', 'Metadata_Well', 'Metadata_WellCol', 'Metadata_WellRow', 'Metadata_agg_method', 'Metadata_aggmethod', 'Metadata_broad_sample', 'Metadata_cell_line', 'Metadata_cellline', 'Metadata_gene_name', 'Metadata_pert_name', 'Metadata_well_position']
filename will be: data/profiles/cell_health_profiles_median_merged.tsv.gz
for method is: weighted
adding data/processed/A549_weighted.tsv to weighted df
adding data/processed/HCC44_weighted.tsv to weighted df
adding data/processed/ES2_weighted.tsv to weighted df
(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_agg_method,Metadata_aggmethod,Metadata_broad_sample,Metadata_cell_line,Metadata_cellline,Metadata_gene_name,...,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0,cell_line
0,SQ00014610,A02,2,A,weighted,weighted,,A549,A549,MCL1,...,1.284069,1.362209,1.003097,0.676481,1.083978,1.319537,1.112346,1.151328,1.084761,A549
1,SQ00014611,A02,2,A,weighted,weighted,,A549,A549,MCL1,...,0.34457,0.509215,0.296711,0.005066,0.433841,0.511761,0.551234,0.477705,0.518965,A549
2,SQ00014612,A02,2,A,weighted,weighted,,A549,A549,MCL1,...,0.249964,0.409406,0.022137,-0.109076,0.093917,0.45739,0.215358,0.172091,0.164175,A549
3,SQ00014610,A03,3,A,weighted,weighted,BRDN0001054908,A549,A549,AKT1,...,0.370862,0.286531,0.09898,0.056056,0.121362,0.277774,0.137682,0.21814,0.13058,A549
4,SQ00014611,A03,3,A,weighted,weighted,BRDN0001054908,A549,A549,AKT1,...,0.149827,0.280987,0.228706,0.041776,0.307017,0.264183,0.361494,0.295622,0.352904,A549


['Metadata_Plate', 'Metadata_Well', 'Metadata_WellCol', 'Metadata_WellRow', 'Metadata_agg_method', 'Metadata_aggmethod', 'Metadata_broad_sample', 'Metadata_cell_line', 'Metadata_cellline', 'Metadata_gene_name', 'Metadata_pert_name', 'Metadata_well_position']
filename will be: data/profiles/cell_health_profiles_weighted_merged.tsv.gz


## Build Consensus Signatures for aggregation methods
The remainder of this script generates consensus signatures (1 signature for each CRISPR guide perturbation). The remaining cells are
1. ...run once with `method='weighted'` to generate consensus signatures using grit-weighted aggregation of single-cell profiles into well-level profiles
2. ...run again with `method='median'` to generate consensus signatures using standard median aggregation of single-cells profiles into well-level profiles

### Read in well-level profiles

In [23]:
folder = 'data/profiles/'
method='weighted'
# method='median'

In [24]:
x_df = pd.read_csv(Path(f"{folder}cell_health_profiles_{method}_merged.tsv.gz"), sep='\t', low_memory=False)
print(x_df.shape)
display(x_df.head())
x_df.groupby(['Metadata_cell_line']).apply(lambda x: len(get_na_columns(x)))

(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_agg_method,Metadata_aggmethod,Metadata_broad_sample,Metadata_cell_line,Metadata_cellline,Metadata_gene_name,...,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0,cell_line
0,SQ00014610,A02,2,A,weighted,weighted,,A549,A549,MCL1,...,1.284069,1.362209,1.003097,0.676481,1.083978,1.319537,1.112346,1.151328,1.084761,A549
1,SQ00014611,A02,2,A,weighted,weighted,,A549,A549,MCL1,...,0.34457,0.509215,0.296711,0.005066,0.433841,0.511761,0.551234,0.477705,0.518965,A549
2,SQ00014612,A02,2,A,weighted,weighted,,A549,A549,MCL1,...,0.249964,0.409406,0.022137,-0.109076,0.093917,0.45739,0.215358,0.172091,0.164175,A549
3,SQ00014610,A03,3,A,weighted,weighted,BRDN0001054908,A549,A549,AKT1,...,0.370862,0.286531,0.09898,0.056056,0.121362,0.277774,0.137682,0.21814,0.13058,A549
4,SQ00014611,A03,3,A,weighted,weighted,BRDN0001054908,A549,A549,AKT1,...,0.149827,0.280987,0.228706,0.041776,0.307017,0.264183,0.361494,0.295622,0.352904,A549


Metadata_cell_line
A549     14
ES2      15
HCC44    21
dtype: int64

### Load Cell Health labels from cell-health/ project

In [25]:
commit = "8244680d6e6db1a2bc1f709b9dabf7783c4a9670"
base_url = f"https://github.com/broadinstitute/cell-health/raw/{commit}"
url = f"{base_url}/1.generate-profiles/data/labels/normalized_cell_health_labels.tsv"

y_df = pd.read_csv(url, sep='\t').drop(["plate_name", "well_col", "well_row"], axis="columns")

print(y_df.shape)
y_df.head(3)

(2302, 72)


Unnamed: 0,cell_id,guide,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,0.655229,-0.565658,-0.839186,-0.513748,0.3136,0.263062,0.109983,-0.226513,...,0.281397,-0.279051,-0.9203,-0.139875,-0.016549,-0.429141,-0.177258,0.14057,,
1,ES2,AKT1-1,-0.251336,-0.816445,-0.52594,-0.81981,-0.450799,-0.811628,-0.468875,-0.167787,...,0.543716,-0.221588,-1.070176,-0.046783,0.268559,-0.311041,-0.149198,0.040163,-0.29248,0.008339
2,ES2,AKT1-1,0.338568,-0.683965,0.934312,0.29233,0.272986,-0.007936,0.083732,0.05122,...,-0.472052,-0.053067,0.098093,-0.038353,-0.161186,-0.127101,-0.014996,0.038221,,


## Determine how many Cell Painting profiles have Cell Health status labels

In [26]:
x_groupby_cols = ["Metadata_gene_name", "Metadata_pert_name", "Metadata_cell_line"]

x_metacount_df = (
    x_df
    .loc[:, x_groupby_cols]
    .assign(n_measurements=1)
    .groupby(x_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_painting")
    .merge(x_df.loc[:, x_groupby_cols + ["Metadata_Well", "Metadata_Plate"]],
           how="left",
           on=x_groupby_cols)
)

print(x_metacount_df.shape)
x_metacount_df.head(2)

(3456, 7)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements,data_type,Metadata_Well,Metadata_Plate
0,AKT1,AKT1-1,A549,6,cell_painting,A03,SQ00014610
1,AKT1,AKT1-1,A549,6,cell_painting,A03,SQ00014611


In [27]:
# cell health labels
y_groupby_cols = ["guide", "cell_id"]

y_metacount_df = (
    y_df
    .loc[:, y_groupby_cols]
    .assign(n_measurements=1)
    .groupby(y_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_health")
)

print(y_metacount_df.shape)
y_metacount_df.head(2)

(364, 4)


Unnamed: 0,guide,cell_id,n_measurements,data_type
0,AKT1-1,A549,4,cell_health
1,AKT1-1,ES2,4,cell_health


In [28]:
all_measurements_df = (
    x_metacount_df
    .merge(
        y_metacount_df,
        left_on=["Metadata_pert_name", "Metadata_cell_line"],
        right_on=["guide", "cell_id"],
        suffixes=["_paint", "_health"],
        how="inner")
    .sort_values(by=["Metadata_cell_line", "Metadata_pert_name"])
    .reset_index(drop=True)
    .drop(["Metadata_Well", "guide", "cell_id"], axis="columns")
)

file = os.path.join("results", "all_profile_metadata.tsv")
# all_measurements_df.to_csv(file, sep='\t', index=False)

print(all_measurements_df.shape)
all_measurements_df.head()

(3456, 8)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements_paint,data_type_paint,Metadata_Plate,n_measurements_health,data_type_health
0,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
1,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
2,AKT1,AKT1-1,A549,6,cell_painting,SQ00014612,4,cell_health
3,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
4,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health


In [29]:
[len(all_measurements_df[x].unique()) for x in all_measurements_df.columns]

[59, 119, 3, 6, 1, 9, 8, 1]

In [30]:
[len(all_measurements_df[x].unique()) for x in all_measurements_df.columns]

[59, 119, 3, 6, 1, 9, 8, 1]

# apply median consensus aggregation...
since the modz didnt work initially

### 1. to Cell Painting Profiles

In [31]:
x_median_df = aggregate(
    x_df,
    strata=["Metadata_cell_line", "Metadata_pert_name"],
    features="infer",
    operation="median"
)

x_median_df = (
    x_median_df
    .query("Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()")
    .query("Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()")
    .reset_index(drop=True)
    .reset_index()
    .rename({"index": "Metadata_profile_id"}, axis='columns')
)
x_median_df.Metadata_profile_id = ["profile_{}".format(x) for x in x_median_df.Metadata_profile_id]

print(x_median_df.shape)
x_median_df.head()

(357, 946)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,0.242064,0.026447,-0.163592,-0.094232,-0.020648,-0.012759,0.059713,...,0.468986,0.524497,0.832409,0.715932,0.225928,0.909484,0.810709,0.879704,0.856938,0.888772
1,profile_1,A549,AKT1-2,0.034776,-0.066228,-0.020002,0.069651,-0.008441,0.085545,0.113848,...,0.13653,0.265044,0.380289,0.29914,-0.052063,0.446032,0.39273,0.614219,0.575369,0.560667
2,profile_2,A549,ARID1B-1,-0.11742,0.224212,0.14337,-0.254822,0.038828,-0.225436,0.024631,...,-0.265539,-0.674021,-0.726375,-0.743734,-0.665494,-0.82075,-0.661084,-0.627315,-0.633939,-0.701475
3,profile_3,A549,ARID1B-2,0.042407,0.254648,0.124658,-0.300156,0.023645,-0.209193,0.065093,...,0.159614,-0.432893,-0.480478,-0.072464,-0.1252,-0.000501,-0.523288,-0.042473,-0.02265,0.013072
4,profile_4,A549,ATF4-1,-0.003576,-0.071212,-0.045842,0.142506,0.023086,-0.006711,-0.002751,...,-0.042162,0.116233,0.0012,0.088496,0.196083,0.048844,-0.026882,-0.069674,-0.011722,-0.011095


In [32]:
# Output Profile Mapping for Downstream Analysis
profile_id_mapping_df = x_median_df.loc[:, x_median_df.columns.str.startswith("Metadata")]
file = os.path.join("data", "{}_profile_id_metadata_mapping.tsv".format(method))
print(file)
profile_id_mapping_df.to_csv(file, sep='\t', index=False)

print(profile_id_mapping_df.shape)
profile_id_mapping_df.head()

data/weighted_profile_id_metadata_mapping.tsv
(357, 3)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name
0,profile_0,A549,AKT1-1
1,profile_1,A549,AKT1-2
2,profile_2,A549,ARID1B-1
3,profile_3,A549,ARID1B-2
4,profile_4,A549,ATF4-1


### 2. to Cell Health Panel readouts

In [33]:
cell_health_meta_features = ["cell_id", "guide"]
cell_health_features = y_df.drop(cell_health_meta_features, axis="columns").columns.tolist()
y_meta_merge_cols = ["Metadata_profile_id", "Metadata_pert_name", "Metadata_cell_line"]

In [34]:
y_median_df = aggregate(
    y_df,
    strata=cell_health_meta_features,
    features=cell_health_features,
    operation="median"
)

print(y_median_df.shape)
y_median_df.head()

(364, 72)


Unnamed: 0,cell_id,guide,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,A549,AKT1-1,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,0.039147,...,0.399842,0.0,-0.118976,-0.132871,-0.12109,0.0,0.0,0.132882,0.80697,1.293984
1,A549,AKT1-2,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,-0.183445,...,0.10167,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867
2,A549,ARID1B-1,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,-0.149979,...,0.080701,0.3391,0.598093,0.055951,0.042014,0.165161,0.247058,-0.05592,-0.393937,0.103202
3,A549,ARID1B-2,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,-0.300783,...,0.265754,0.098699,0.37193,-0.063935,-0.05516,0.138654,0.0,0.063946,0.210005,0.055291
4,A549,ATF4-1,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,1.243444,...,-2.343919,0.0,-0.089544,0.141535,0.131393,0.0,0.0,-0.141397,-0.63139,0.106477


In [35]:
y_median_df = (
    y_median_df
    .reset_index(drop=True)
    .merge(
        x_median_df.loc[:, y_meta_merge_cols],
        left_on=["guide", "cell_id"],
        right_on=["Metadata_pert_name", "Metadata_cell_line"],
        how="right"
    )
)

# Get columns in correct order
y_columns = (
    y_meta_merge_cols +
    y_median_df
    .loc[:, ~y_median_df.columns.str.startswith("Metadata_")]
    .columns
    .tolist()
)

y_median_df = (
    y_median_df
    .loc[:, y_columns]
    .drop(["guide", "cell_id"], axis="columns")
)

print(y_median_df.shape)
y_median_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,...,0.399842,0.0,-0.118976,-0.132871,-0.12109,0.0,0.0,0.132882,0.80697,1.293984
1,profile_1,AKT1-2,A549,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,...,0.10167,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867
2,profile_2,ARID1B-1,A549,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,...,0.080701,0.3391,0.598093,0.055951,0.042014,0.165161,0.247058,-0.05592,-0.393937,0.103202
3,profile_3,ARID1B-2,A549,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,...,0.265754,0.098699,0.37193,-0.063935,-0.05516,0.138654,0.0,0.063946,0.210005,0.055291
4,profile_4,ATF4-1,A549,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,...,-2.343919,0.0,-0.089544,0.141535,0.131393,0.0,0.0,-0.141397,-0.63139,0.106477


In [36]:
# Confirm that matrices are aligned

pd.testing.assert_series_equal(
    x_median_df.Metadata_profile_id,
    y_median_df.Metadata_profile_id,
    check_names=True
)

# Are the guides aligned?
pd.testing.assert_series_equal(
    x_median_df.Metadata_pert_name,
    y_median_df.Metadata_pert_name,
    check_names=True
)

# Are the cells aligned?
pd.testing.assert_series_equal(
    x_median_df.Metadata_cell_line,
    y_median_df.Metadata_cell_line,
    check_names=True
)

# apply MODZ consensus aggregation

### ...to Cell Painting Profiles

In [37]:
%%time

x_consensus_df = modz(
    x_df,
    replicate_columns=["Metadata_cell_line", "Metadata_pert_name"],
    precision=5
)

x_consensus_df.head()

CPU times: user 19.2 s, sys: 35.2 ms, total: 19.2 s
Wall time: 19.2 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_3,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
Metadata_cell_line,Metadata_pert_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,0.503175,0.110106,-0.045764,-0.263006,-0.070482,-0.062049,0.183216,-0.215126,-0.088522,0.191404,...,0.774055,1.033896,1.433717,1.210472,0.595231,1.444966,1.415588,1.565379,1.401021,1.537235
A549,AKT1-2,0.100797,-0.101499,-0.052097,-0.010985,-0.012921,0.12516,0.162637,-0.06345,0.039013,0.057876,...,0.406455,0.367872,0.598358,0.405894,-0.098476,0.651047,0.598719,0.912346,0.730771,0.847872
A549,ARID1B-1,-0.130649,0.448408,0.292884,-0.412305,0.035924,-0.428059,0.025479,-0.303993,0.125645,0.054173,...,-0.281958,-0.785849,-0.795507,-0.779524,-0.689953,-0.724053,-0.712559,-0.533418,-0.577375,-0.584598
A549,ARID1B-2,0.08415,0.552944,0.326113,-0.610228,0.033123,-0.561141,0.100766,-0.406873,0.058547,0.117841,...,0.093125,-0.46995,-0.650276,-0.306734,-0.340945,-0.228247,-0.75963,-0.263143,-0.294382,-0.193774
A549,ATF4-1,0.010101,-0.079724,-0.053684,0.125938,0.022919,0.019611,0.002694,0.092245,-0.095388,-0.066832,...,-0.044726,0.151566,0.048038,0.092861,0.20462,0.045101,0.019923,-0.063601,-0.008132,-0.01101


In [38]:
x_consensus_df = (
    x_consensus_df
    .reset_index()
    .query("Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()")
    .query("Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()")
    .reset_index(drop=True)
    .reset_index()
    .rename(
        {
            "index": "Metadata_profile_id"
        },
        axis='columns'
    )
)
x_consensus_df.Metadata_profile_id = ["profile_{}".format(x) for x in x_consensus_df.Metadata_profile_id]

print(x_consensus_df.shape)
x_consensus_df.head(5)

(357, 946)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,0.503175,0.110106,-0.045764,-0.263006,-0.070482,-0.062049,0.183216,...,0.774055,1.033896,1.433717,1.210472,0.595231,1.444966,1.415588,1.565379,1.401021,1.537235
1,profile_1,A549,AKT1-2,0.100797,-0.101499,-0.052097,-0.010985,-0.012921,0.12516,0.162637,...,0.406455,0.367872,0.598358,0.405894,-0.098476,0.651047,0.598719,0.912346,0.730771,0.847872
2,profile_2,A549,ARID1B-1,-0.130649,0.448408,0.292884,-0.412305,0.035924,-0.428059,0.025479,...,-0.281958,-0.785849,-0.795507,-0.779524,-0.689953,-0.724053,-0.712559,-0.533418,-0.577375,-0.584598
3,profile_3,A549,ARID1B-2,0.08415,0.552944,0.326113,-0.610228,0.033123,-0.561141,0.100766,...,0.093125,-0.46995,-0.650276,-0.306734,-0.340945,-0.228247,-0.75963,-0.263143,-0.294382,-0.193774
4,profile_4,A549,ATF4-1,0.010101,-0.079724,-0.053684,0.125938,0.022919,0.019611,0.002694,...,-0.044726,0.151566,0.048038,0.092861,0.20462,0.045101,0.019923,-0.063601,-0.008132,-0.01101


### Cell health assays data

In [39]:
%%time 

y_consensus_df = modz(
    y_df,
    features=cell_health_features,
    replicate_columns=cell_health_meta_features,
    precision=5
)

print(y_consensus_df.shape)
y_consensus_df.head()

(364, 70)
CPU times: user 5.52 s, sys: 0 ns, total: 5.52 s
Wall time: 5.52 s


Unnamed: 0_level_0,Unnamed: 1_level_0,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,cc_cc_early_mitosis,cc_cc_g1,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
cell_id,guide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,0.040322,0.375966,0.043915,...,0.438339,0.059414,-0.06505,-0.020236,-0.00797,0.082424,0.0,0.020263,0.408214,0.654575
A549,AKT1-2,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,-0.182571,0.736046,-1.023968,...,0.067568,0.256141,0.575026,0.225091,0.220461,0.132834,0.386327,-0.224965,0.284962,0.567898
A549,ARID1B-1,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,-0.172735,0.148168,-0.606599,...,0.100365,0.299229,0.501941,0.055517,0.047697,0.249557,0.283246,-0.055445,-0.363766,-0.205937
A549,ARID1B-2,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,-0.299718,0.012239,-0.221492,...,0.283802,0.143096,0.493883,-0.084415,-0.085658,0.13679,0.111855,0.084439,0.198285,-0.162976
A549,ATF4-1,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,1.246508,0.049095,0.923187,...,-2.434136,0.315381,0.509217,0.100668,0.104857,0.332291,0.192878,-0.100596,-0.811271,-0.092505


In [40]:
y_consensus_df = (
    y_consensus_df
    .reset_index()
    .reset_index(drop=True)
    .merge(
        x_consensus_df.loc[:, y_meta_merge_cols],
        left_on=["guide", "cell_id"],
        right_on=["Metadata_pert_name", "Metadata_cell_line"],
        how="right"
    )
    .loc[:, y_columns]
    .drop(["guide", "cell_id"], axis="columns")
)

print(y_consensus_df.shape)
y_consensus_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,...,0.438339,0.059414,-0.06505,-0.020236,-0.00797,0.082424,0.0,0.020263,0.408214,0.654575
1,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,...,0.067568,0.256141,0.575026,0.225091,0.220461,0.132834,0.386327,-0.224965,0.284962,0.567898
2,profile_2,ARID1B-1,A549,0.118598,1.198685,0.16514,0.330071,0.417723,0.514065,-0.308749,...,0.100365,0.299229,0.501941,0.055517,0.047697,0.249557,0.283246,-0.055445,-0.363766,-0.205937
3,profile_3,ARID1B-2,A549,-0.072919,0.317079,-0.089281,0.155305,-0.385316,-0.300279,-0.120261,...,0.283802,0.143096,0.493883,-0.084415,-0.085658,0.13679,0.111855,0.084439,0.198285,-0.162976
4,profile_4,ATF4-1,A549,4.286179,0.007467,3.284383,-2.246762,3.015881,3.065773,2.854124,...,-2.434136,0.315381,0.509217,0.100668,0.104857,0.332291,0.192878,-0.100596,-0.811271,-0.092505


In [41]:
# Confirm that matrices are aligned
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_profile_id,
    y_consensus_df.Metadata_profile_id,
    check_names=True
)

# Are the guides aligned?
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_pert_name,
    y_consensus_df.Metadata_pert_name,
    check_names=True
)

# Are the cells aligned?
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_cell_line,
    y_consensus_df.Metadata_cell_line,
    check_names=True
)

In [42]:
%%time
consensus_folder = 'data/consensus/'

file = Path(consensus_folder, "{}_agg_cell_painting_median.tsv.gz".format(method))
x_median_df.to_csv(file, sep="\t", index=False)

file = Path(consensus_folder, "{}_agg_cell_health_median.tsv.gz".format(method))
y_median_df.to_csv(file, sep="\t", index=False)

file = Path(consensus_folder, "{}_agg_cell_painting_modz.tsv.gz".format(method))
x_consensus_df.to_csv(file, sep="\t", index=False)

file = Path(consensus_folder, "{}_agg_cell_health_modz.tsv.gz".format(method))
y_consensus_df.to_csv(file, sep="\t", index=False)

CPU times: user 3.92 s, sys: 36 ms, total: 3.95 s
Wall time: 4.21 s
