# Build consensus signatures (1 signature for each CRISPR guide) from bulk profiles for Cell Health prediction pipeline
**Generate consensus signatures with median + moderated-z-score**

Consensus-profiles are generated via:
MODZ (moderated z-score) 

*reference: cell-health/1.generate-profiles/2.build-consensus-signatures*

In [1]:
import os
import glob
import gzip
from pathlib import Path
import pickle
import re

import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer import get_na_columns, aggregate

# from scipy.special import softmax

from pycytominer import aggregate
from pycytominer.cyto_utils import infer_cp_features
from scripts.utils import calculate_weighted_agg

## Load Cell Painting Data
These are individual df's pf well-level profiles (level 3) that are concatatenated into a single file per aggregation method.

In [2]:
input_folder = "data/processed/"
output_folder = "data/profiles/"
method_list = list(set([(x.split("_")[1:][0]).split(".")[0] for x in glob.glob(input_folder + "*.tsv")]))

# since single-cell grit was not calculated for EMPTY wells, we will use median-aggregated well-level profiles
# for EMPTY perturbations to form the EMPTY consensus profile for the cell health prediction pipeline
empty_list = []
for file in glob.glob(input_folder + "*.tsv"):
    file_cell_line = file.split("/")[-1].split(".")[0].split("_")[0]
    file_method = file.split("/")[-1].split(".")[0].split("_")[1]
    if "EMPTY" in file:
        print(f"adding {file} to list")
        empties_df = pd.read_csv(file, sep="\t").assign(
            Metadata_cellline=file_cell_line, Metadata_aggmethod=file_method
        )
        empty_list.append(empties_df)
#     print(file.split('/')[-1].split('.')[0].split('_')[1])
empty_profiles = pd.concat(empty_list)
empty_profiles = empty_profiles[
    sorted(
        empty_profiles,
        key=lambda x: x not in empty_profiles.filter(like="Metadata").columns,
    )
]
print("total shape: ", empty_profiles.shape)
display(empty_profiles.head())


# perform for both well-level aggregation methods (median and grit-informed)
for method in method_list:  # ['weighted']: #
    print(f"for method is: {method}")
    df_list = []
    for file in glob.glob(input_folder + "*.tsv"):
        file_cell_line = file.split("/")[-1].split(".")[0].split("_")[0]
        file_method = file.split("/")[-1].split(".")[0].split("_")[1]
        if method in file and "EMPTY" not in file:
            print(f"adding {file} to {method} df")
            cell_line_df = pd.read_csv(file, sep="\t").assign(
                Metadata_cellline=file_cell_line, Metadata_aggmethod=file_method
            )
            df_list.append(cell_line_df)
    level3profiles = pd.concat(df_list, axis="rows")
    # add in the EMPTY wells
    level3profiles = pd.concat([level3profiles, empty_profiles], axis="rows")
    # reorder the columns
    level3profiles = level3profiles[
        sorted(
            level3profiles,
            key=lambda x: x not in level3profiles.filter(like="Metadata").columns,
        )
    ]
    print(level3profiles.shape)
    display(level3profiles.head())
    print(infer_cp_features(level3profiles, metadata=True))

    # Output final merged file (for all cell lines)
    filename = Path(f"{output_folder}cell_health_profiles_{method}_merged.tsv.gz")
    print(f"filename will be: {filename}")
    level3profiles.to_csv(filename, index=False, sep="\t")

adding data/processed/ES2_median_EMPTY.tsv to list
adding data/processed/HCC44_median_EMPTY.tsv to list
adding data/processed/A549_median_EMPTY.tsv to list
total shape:  (504, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0,cell_line
0,SQ00014613,A01,median,A,1,A01,EMPTY,EMPTY,,ES2,...,-0.31805,-0.221235,0.06943,-0.050265,0.151535,-0.20523,0.15945,0.131155,0.15593,ES2
1,SQ00014614,A01,median,A,1,A01,EMPTY,EMPTY,,ES2,...,-0.091395,-0.082155,-0.27264,-0.23089,-0.23042,-0.04145,-0.236835,-0.18955,-0.232285,ES2
2,SQ00014615,A01,median,A,1,A01,EMPTY,EMPTY,,ES2,...,-0.18993,-0.16447,-0.32775,-0.30584,-0.29203,-0.094,-0.24588,-0.21698,-0.27028,ES2
3,SQ00014613,A06,median,A,6,A06,EMPTY,EMPTY,,ES2,...,-0.17573,-0.16735,0.00242,-0.05499,0.04654,-0.14455,0.00976,0.0003,0.02373,ES2
4,SQ00014614,A06,median,A,6,A06,EMPTY,EMPTY,,ES2,...,0.02331,0.045275,0.158715,0.073295,0.195675,0.08258,0.144725,0.143755,0.16567,ES2


for method is: weighted
adding data/processed/A549_weighted.tsv to weighted df
adding data/processed/HCC44_weighted.tsv to weighted df
adding data/processed/ES2_weighted.tsv to weighted df
(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Cytoplasm_Correlation_K_Mito_AGP,Cytoplasm_Correlation_Overlap_Mito_AGP,Cells_Correlation_K_AGP_Mito,Cells_Correlation_K_ER_DNA,Cells_Correlation_K_Mito_AGP,Cells_Correlation_Overlap_Mito_AGP,Cytoplasm_Correlation_K_Mito_ER,Cytoplasm_Correlation_Overlap_Mito_ER,Cells_Correlation_K_Mito_ER,Cells_Correlation_Overlap_Mito_ER
0,SQ00014610,A02,weighted,A,2,A02,MCL1,MCL1-5,,A549,...,,,,,,,,,,
1,SQ00014611,A02,weighted,A,2,A02,MCL1,MCL1-5,,A549,...,,,,,,,,,,
2,SQ00014612,A02,weighted,A,2,A02,MCL1,MCL1-5,,A549,...,,,,,,,,,,
3,SQ00014610,A03,weighted,A,3,A03,AKT1,AKT1-1,BRDN0001054908,A549,...,,,,,,,,,,
4,SQ00014611,A03,weighted,A,3,A03,AKT1,AKT1-1,BRDN0001054908,A549,...,,,,,,,,,,


['Metadata_Plate', 'Metadata_Well', 'Metadata_agg_method', 'Metadata_WellRow', 'Metadata_WellCol', 'Metadata_well_position', 'Metadata_gene_name', 'Metadata_pert_name', 'Metadata_broad_sample', 'Metadata_cell_line', 'Metadata_cellline', 'Metadata_aggmethod']
filename will be: data/profiles/cell_health_profiles_weighted_merged.tsv.gz
for method is: median
adding data/processed/ES2_median.tsv to median df
adding data/processed/HCC44_median.tsv to median df
adding data/processed/A549_median.tsv to median df
(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Cells_Correlation_K_Mito_AGP,Cells_Correlation_Overlap_Mito_AGP,Cytoplasm_Correlation_K_Mito_RNA,Cells_Correlation_K_AGP_DNA,Cells_Correlation_K_Mito_RNA,Cells_Correlation_Overlap_DNA_AGP,Cytoplasm_Correlation_K_Mito_ER,Cytoplasm_Correlation_Overlap_Mito_ER,Cells_Correlation_K_Mito_ER,Cells_Correlation_Overlap_Mito_ER
0,SQ00014613,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,,,,,,,,,,
1,SQ00014614,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,,,,,,,,,,
2,SQ00014615,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,,,,,,,,,,
3,SQ00014613,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,,,,,,,,,,
4,SQ00014614,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,,,,,,,,,,


['Metadata_Plate', 'Metadata_Well', 'Metadata_agg_method', 'Metadata_WellRow', 'Metadata_WellCol', 'Metadata_well_position', 'Metadata_gene_name', 'Metadata_pert_name', 'Metadata_broad_sample', 'Metadata_cell_line', 'Metadata_cellline', 'Metadata_aggmethod']
filename will be: data/profiles/cell_health_profiles_median_merged.tsv.gz


## Build Consensus Signatures for aggregation methods
The remainder of this script generates consensus signatures (1 signature for each CRISPR guide perturbation). The remaining cells are
1. ...run once with `method='weighted'` to generate consensus signatures using grit-weighted aggregation of single-cell profiles into well-level profiles
2. ...run again with `method='median'` to generate consensus signatures using standard median aggregation of single-cells profiles into well-level profiles

### Read in well-level profiles

In [18]:
folder = "data/profiles/"
# method='weighted'
method = "median"

In [19]:
x_df = pd.read_csv(
    Path(f"{folder}cell_health_profiles_{method}_merged.tsv.gz"),
    sep="\t",
    low_memory=False,
)
print(x_df.shape)
display(x_df.head())
x_df.groupby(["Metadata_cell_line"]).apply(lambda x: len(get_na_columns(x)))

(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_agg_method,Metadata_WellRow,Metadata_WellCol,Metadata_well_position,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,Metadata_cell_line,...,Cells_Correlation_K_Mito_AGP,Cells_Correlation_Overlap_Mito_AGP,Cytoplasm_Correlation_K_Mito_RNA,Cells_Correlation_K_AGP_DNA,Cells_Correlation_K_Mito_RNA,Cells_Correlation_Overlap_DNA_AGP,Cytoplasm_Correlation_K_Mito_ER,Cytoplasm_Correlation_Overlap_Mito_ER,Cells_Correlation_K_Mito_ER,Cells_Correlation_Overlap_Mito_ER
0,SQ00014613,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,,,,,,,,,,
1,SQ00014614,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,,,,,,,,,,
2,SQ00014615,A02,median,A,2,A02,MCL1,MCL1-5,,ES2,...,,,,,,,,,,
3,SQ00014613,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,,,,,,,,,,
4,SQ00014614,A03,median,A,3,A03,AKT1,AKT1-1,BRDN0001054908,ES2,...,,,,,,,,,,


Metadata_cell_line
A549     14
ES2      15
HCC44    21
dtype: int64

### Load Cell Health labels from cell-health/ project

In [20]:
commit = "8244680d6e6db1a2bc1f709b9dabf7783c4a9670"
base_url = f"https://github.com/broadinstitute/cell-health/raw/{commit}"
url = f"{base_url}/1.generate-profiles/data/labels/normalized_cell_health_labels.tsv"

y_df = pd.read_csv(url, sep="\t").drop(["plate_name", "well_col", "well_row"], axis="columns")

print(y_df.shape)
y_df.head(3)

(2302, 72)


Unnamed: 0,cell_id,guide,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,ES2,AKT1-1,0.655229,-0.565658,-0.839186,-0.513748,0.3136,0.263062,0.109983,-0.226513,...,0.281397,-0.279051,-0.9203,-0.139875,-0.016549,-0.429141,-0.177258,0.14057,,
1,ES2,AKT1-1,-0.251336,-0.816445,-0.52594,-0.81981,-0.450799,-0.811628,-0.468875,-0.167787,...,0.543716,-0.221588,-1.070176,-0.046783,0.268559,-0.311041,-0.149198,0.040163,-0.29248,0.008339
2,ES2,AKT1-1,0.338568,-0.683965,0.934312,0.29233,0.272986,-0.007936,0.083732,0.05122,...,-0.472052,-0.053067,0.098093,-0.038353,-0.161186,-0.127101,-0.014996,0.038221,,


## Determine how many Cell Painting profiles have Cell Health status labels

In [21]:
x_groupby_cols = ["Metadata_gene_name", "Metadata_pert_name", "Metadata_cell_line"]

x_metacount_df = (
    x_df.loc[:, x_groupby_cols]
    .assign(n_measurements=1)
    .groupby(x_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_painting")
    .merge(
        x_df.loc[:, x_groupby_cols + ["Metadata_Well", "Metadata_Plate"]],
        how="left",
        on=x_groupby_cols,
    )
)

print(x_metacount_df.shape)
x_metacount_df.head(2)

(3456, 7)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements,data_type,Metadata_Well,Metadata_Plate
0,AKT1,AKT1-1,A549,6,cell_painting,A03,SQ00014610
1,AKT1,AKT1-1,A549,6,cell_painting,A03,SQ00014611


In [22]:
# cell health labels
y_groupby_cols = ["guide", "cell_id"]

y_metacount_df = (
    y_df.loc[:, y_groupby_cols]
    .assign(n_measurements=1)
    .groupby(y_groupby_cols)
    .count()
    .reset_index()
    .assign(data_type="cell_health")
)

print(y_metacount_df.shape)
y_metacount_df.head(2)

(364, 4)


Unnamed: 0,guide,cell_id,n_measurements,data_type
0,AKT1-1,A549,4,cell_health
1,AKT1-1,ES2,4,cell_health


In [23]:
all_measurements_df = (
    x_metacount_df.merge(
        y_metacount_df,
        left_on=["Metadata_pert_name", "Metadata_cell_line"],
        right_on=["guide", "cell_id"],
        suffixes=["_paint", "_health"],
        how="inner",
    )
    .sort_values(by=["Metadata_cell_line", "Metadata_pert_name"])
    .reset_index(drop=True)
    .drop(["Metadata_Well", "guide", "cell_id"], axis="columns")
)

file = os.path.join("results", "{}_all_profile_metadata.tsv".format(method))
all_measurements_df.to_csv(file, sep="\t", index=False)

print(all_measurements_df.shape)
all_measurements_df.head()

(3456, 8)


Unnamed: 0,Metadata_gene_name,Metadata_pert_name,Metadata_cell_line,n_measurements_paint,data_type_paint,Metadata_Plate,n_measurements_health,data_type_health
0,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
1,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health
2,AKT1,AKT1-1,A549,6,cell_painting,SQ00014612,4,cell_health
3,AKT1,AKT1-1,A549,6,cell_painting,SQ00014610,4,cell_health
4,AKT1,AKT1-1,A549,6,cell_painting,SQ00014611,4,cell_health


In [24]:
[len(all_measurements_df[x].unique()) for x in all_measurements_df.columns]

[59, 119, 3, 6, 1, 9, 8, 1]

# apply median consensus aggregation...
since the modz didnt work initially

### 1. to Cell Painting Profiles

In [25]:
x_median_df = aggregate(
    x_df,
    strata=["Metadata_cell_line", "Metadata_pert_name"],
    features="infer",
    operation="median",
)

x_median_df = (
    x_median_df.query("Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()")
    .query("Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()")
    .reset_index(drop=True)
    .reset_index()
    .rename({"index": "Metadata_profile_id"}, axis="columns")
)
x_median_df.Metadata_profile_id = ["profile_{}".format(x) for x in x_median_df.Metadata_profile_id]

print(x_median_df.shape)
x_median_df.head()

(357, 946)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cytoplasm_AreaShape_Center_X,Cytoplasm_AreaShape_Center_Y,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_Orientation,...,Cells_Correlation_K_Mito_AGP,Cells_Correlation_Overlap_Mito_AGP,Cytoplasm_Correlation_K_Mito_RNA,Cells_Correlation_K_AGP_DNA,Cells_Correlation_K_Mito_RNA,Cells_Correlation_Overlap_DNA_AGP,Cytoplasm_Correlation_K_Mito_ER,Cytoplasm_Correlation_Overlap_Mito_ER,Cells_Correlation_K_Mito_ER,Cells_Correlation_Overlap_Mito_ER
0,profile_0,A549,AKT1-1,-0.024985,0.02162,-0.24073,0.214543,0.04332,-0.007575,0.02495,...,,,0.27828,-0.020295,0.223828,0.18194,,,,
1,profile_1,A549,AKT1-2,0.010285,0.020765,-0.170178,0.194282,-0.08664,-0.146165,-0.018693,...,,,0.230625,-0.01515,0.274195,0.134692,,,,
2,profile_2,A549,ARID1B-1,0.02915,-0.0264,-0.220238,0.229557,-0.03645,0.050093,-0.000877,...,,,-0.04864,0.08006,-0.083322,0.08239,,,,
3,profile_3,A549,ARID1B-2,-0.016415,0.017895,-0.290927,0.189573,0.144155,0.237842,-0.007782,...,,,0.000158,-0.014035,-0.02095,-0.0105,,,,
4,profile_4,A549,ATF4-1,0.025055,-0.01902,-0.562333,0.11509,0.807022,0.85518,0.013275,...,,,-0.570725,-0.301855,-0.563685,0.215773,,,,


In [26]:
# Output Profile Mapping for Downstream Analysis
profile_id_mapping_df = x_median_df.loc[:, x_median_df.columns.str.startswith("Metadata")]
file = os.path.join("data", "{}_profile_id_metadata_mapping.tsv".format(method))
print(file)
profile_id_mapping_df.to_csv(file, sep="\t", index=False)

print(profile_id_mapping_df.shape)
profile_id_mapping_df.head()

data/median_profile_id_metadata_mapping.tsv
(357, 3)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name
0,profile_0,A549,AKT1-1
1,profile_1,A549,AKT1-2
2,profile_2,A549,ARID1B-1
3,profile_3,A549,ARID1B-2
4,profile_4,A549,ATF4-1


### 2. to Cell Health Panel readouts

In [27]:
cell_health_meta_features = ["cell_id", "guide"]
cell_health_features = y_df.drop(cell_health_meta_features, axis="columns").columns.tolist()
y_meta_merge_cols = ["Metadata_profile_id", "Metadata_pert_name", "Metadata_cell_line"]

In [28]:
y_median_df = aggregate(
    y_df,
    strata=cell_health_meta_features,
    features=cell_health_features,
    operation="median",
)

print(y_median_df.shape)
y_median_df.head()

(364, 72)


Unnamed: 0,cell_id,guide,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,cc_all_nucleus_roundness_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,A549,AKT1-1,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,0.039147,...,0.399842,0.0,-0.118976,-0.132871,-0.12109,0.0,0.0,0.132882,0.80697,1.293984
1,A549,AKT1-2,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,-0.183445,...,0.10167,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867
2,A549,ARID1B-1,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,-0.149979,...,0.080701,0.3391,0.598093,0.055951,0.042014,0.165161,0.247058,-0.05592,-0.393937,0.103202
3,A549,ARID1B-2,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,-0.300783,...,0.265754,0.098699,0.37193,-0.063935,-0.05516,0.138654,0.0,0.063946,0.210005,0.055291
4,A549,ATF4-1,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,1.243444,...,-2.343919,0.0,-0.089544,0.141535,0.131393,0.0,0.0,-0.141397,-0.63139,0.106477


In [29]:
y_median_df = y_median_df.reset_index(drop=True).merge(
    x_median_df.loc[:, y_meta_merge_cols],
    left_on=["guide", "cell_id"],
    right_on=["Metadata_pert_name", "Metadata_cell_line"],
    how="right",
)

# Get columns in correct order
y_columns = y_meta_merge_cols + y_median_df.loc[:, ~y_median_df.columns.str.startswith("Metadata_")].columns.tolist()

y_median_df = y_median_df.loc[:, y_columns].drop(["guide", "cell_id"], axis="columns")

print(y_median_df.shape)
y_median_df.head(5)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,...,0.399842,0.0,-0.118976,-0.132871,-0.12109,0.0,0.0,0.132882,0.80697,1.293984
1,profile_1,AKT1-2,A549,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,...,0.10167,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867
2,profile_2,ARID1B-1,A549,0.111163,1.092964,0.151393,0.290203,0.402121,0.4817,-0.27698,...,0.080701,0.3391,0.598093,0.055951,0.042014,0.165161,0.247058,-0.05592,-0.393937,0.103202
3,profile_3,ARID1B-2,A549,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,...,0.265754,0.098699,0.37193,-0.063935,-0.05516,0.138654,0.0,0.063946,0.210005,0.055291
4,profile_4,ATF4-1,A549,3.967818,0.0034,3.268615,-2.246887,2.891737,2.878938,2.853995,...,-2.343919,0.0,-0.089544,0.141535,0.131393,0.0,0.0,-0.141397,-0.63139,0.106477


In [30]:
# Confirm that matrices are aligned

pd.testing.assert_series_equal(x_median_df.Metadata_profile_id, y_median_df.Metadata_profile_id, check_names=True)

# Are the guides aligned?
pd.testing.assert_series_equal(x_median_df.Metadata_pert_name, y_median_df.Metadata_pert_name, check_names=True)

# Are the cells aligned?
pd.testing.assert_series_equal(x_median_df.Metadata_cell_line, y_median_df.Metadata_cell_line, check_names=True)

# apply MODZ consensus aggregation

### ...to Cell Painting Profiles

In [31]:
%%time

x_consensus_df = modz(x_df, replicate_columns=["Metadata_cell_line", "Metadata_pert_name"], precision=5)

x_consensus_df.head()

CPU times: user 3.41 s, sys: 11 ms, total: 3.42 s
Wall time: 3.41 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Metadata_cell_line,Metadata_pert_name,Cytoplasm_AreaShape_Center_X,Cytoplasm_AreaShape_Center_Y,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Solidity,...,Cells_Correlation_K_Mito_AGP,Cells_Correlation_Overlap_Mito_AGP,Cytoplasm_Correlation_K_Mito_RNA,Cells_Correlation_K_AGP_DNA,Cells_Correlation_K_Mito_RNA,Cells_Correlation_Overlap_DNA_AGP,Cytoplasm_Correlation_K_Mito_ER,Cytoplasm_Correlation_Overlap_Mito_ER,Cells_Correlation_K_Mito_ER,Cells_Correlation_Overlap_Mito_ER
Metadata_cell_line,Metadata_pert_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A549,AKT1-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A549,AKT1-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A549,ARID1B-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A549,ARID1B-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A549,ATF4-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
x_consensus_df = (
    x_consensus_df.reset_index()
    .query("Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()")
    .query("Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()")
    .reset_index(drop=True)
    .reset_index()
    .rename({"index": "Metadata_profile_id"}, axis="columns")
)
x_consensus_df.Metadata_profile_id = ["profile_{}".format(x) for x in x_consensus_df.Metadata_profile_id]

print(x_consensus_df.shape)
x_consensus_df.head(5)

ValueError: cannot insert Metadata_pert_name, already exists

### Cell health assays data

In [None]:
%%time

y_consensus_df = modz(
    y_df,
    features=cell_health_features,
    replicate_columns=cell_health_meta_features,
    precision=5,
)

print(y_consensus_df.shape)
y_consensus_df.head()

In [None]:
y_consensus_df = (
    y_consensus_df.reset_index()
    .reset_index(drop=True)
    .merge(
        x_consensus_df.loc[:, y_meta_merge_cols],
        left_on=["guide", "cell_id"],
        right_on=["Metadata_pert_name", "Metadata_cell_line"],
        how="right",
    )
    .loc[:, y_columns]
    .drop(["guide", "cell_id"], axis="columns")
)

print(y_consensus_df.shape)
y_consensus_df.head(5)

In [None]:
# Confirm that matrices are aligned
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_profile_id,
    y_consensus_df.Metadata_profile_id,
    check_names=True,
)

# Are the guides aligned?
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_pert_name,
    y_consensus_df.Metadata_pert_name,
    check_names=True,
)

# Are the cells aligned?
pd.testing.assert_series_equal(
    x_consensus_df.Metadata_cell_line,
    y_consensus_df.Metadata_cell_line,
    check_names=True,
)

In [None]:
%%time
consensus_folder = "data/consensus/"

file = Path(consensus_folder, "{}_agg_cell_painting_median.tsv.gz".format(method))
x_median_df.to_csv(file, sep="\t", index=False)

file = Path(consensus_folder, "{}_agg_cell_health_median.tsv.gz".format(method))
y_median_df.to_csv(file, sep="\t", index=False)

file = Path(consensus_folder, "{}_agg_cell_painting_modz.tsv.gz".format(method))
x_consensus_df.to_csv(file, sep="\t", index=False)

file = Path(consensus_folder, "{}_agg_cell_health_modz.tsv.gz".format(method))
y_consensus_df.to_csv(file, sep="\t", index=False)