## Calculate replicate matching

In [1]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

from cytominer_eval import evaluate
from cytominer_eval.transform import metric_melt
from cytominer_eval.operations.util import assign_replicates

In [2]:
# Directory that receives every TSV written later in this notebook. It is
# created here so the to_csv calls do not fail on a fresh checkout where the
# folder does not exist yet; exist_ok keeps re-runs harmless.
output_dir = "results"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Load the merged profile table (gzip-compressed CSV; pandas infers the
# compression from the file extension).
file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")
cell_health_df = pd.read_csv(file)

print(cell_health_df.shape)
cell_health_df.head()

(3456, 402)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_Zernike_0_0,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0
0,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,-1.515696,0.810956,1.984031,...,-1.365392,-0.351107,-0.163153,2.307568,-2.775524,-3.951667,-1.716353,-2.703082,-3.720976,0.107581
1,SQ00014618,A02,2,A,HCC44,MCL1,MCL1-5,0.246423,0.687241,0.062305,...,-0.070069,1.855687,-0.400335,3.776635,0.947498,-0.541032,0.363568,0.910251,-0.364015,0.165935
2,SQ00014618,A03,3,A,HCC44,AKT1,AKT1-1,0.416772,0.151184,-0.045541,...,-1.491414,0.377266,-0.540338,1.743963,-0.51088,-2.206918,-1.440147,-0.571847,-2.261104,-1.358338
3,SQ00014618,A04,4,A,HCC44,KRAS,KRAS-2B,0.645336,-0.290637,-0.455894,...,-0.956379,0.116267,0.338568,1.467339,-0.749321,-1.318064,-1.466339,-1.060094,-2.056278,-0.975661
4,SQ00014618,A05,5,A,HCC44,AKT1,AKT1-2,0.159822,0.308919,-0.195773,...,-1.200222,0.152902,-0.833538,2.129282,-0.719243,-0.198956,-0.702847,-1.291783,-1.299857,-0.575835


In [3]:
# Settings forwarded to metric_melt: how profiles are compared and which
# evaluation operation the melted table is prepared for.
operation = "replicate_reproducibility"
similarity_metric = "pearson"

# Metadata columns handed to assign_replicates to define a replicate group.
replicate_groups = [
    "Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name",
]

# Gene-name labels used further down to select control wells as comparison
# partners.
control_ids = ["Chr2", "Luc", "LacZ"]

# Column lists inferred from the profile table: metadata columns first, then
# the default (non-metadata) feature columns.
meta_features = infer_cp_features(cell_health_df, metadata=True)
features = infer_cp_features(cell_health_df)

In [4]:
# Melt the input profiles to long format (one row per pair of profiles with
# their similarity) and annotate each pair with replicate flags in one step,
# so the name is never bound to the un-annotated intermediate table.
similarity_melted_df = assign_replicates(
    similarity_melted_df=metric_melt(
        df=cell_health_df,
        features=features,
        metadata_features=meta_features,
        similarity_metric=similarity_metric,
        eval_metric=operation,
    ),
    replicate_groups=replicate_groups,
)

print(similarity_melted_df.shape)
similarity_melted_df.head()

(5970240, 21)


Unnamed: 0,Metadata_Plate_pair_a,Metadata_Well_pair_a,Metadata_WellCol_pair_a,Metadata_WellRow_pair_a,Metadata_cell_line_pair_a,Metadata_gene_name_pair_a,Metadata_pert_name_pair_a,Metadata_Plate_pair_b,Metadata_Well_pair_b,Metadata_WellCol_pair_b,...,Metadata_cell_line_pair_b,Metadata_gene_name_pair_b,Metadata_pert_name_pair_b,pair_a_index,pair_b_index,similarity_metric,Metadata_cell_line_replicate,Metadata_gene_name_replicate,Metadata_pert_name_replicate,group_replicate
0,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,SQ00014618,A02,2,...,HCC44,MCL1,MCL1-5,0,1,0.34136,True,False,False,False
1,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,SQ00014618,A03,3,...,HCC44,AKT1,AKT1-1,0,2,0.603323,True,False,False,False
2,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,SQ00014618,A04,4,...,HCC44,KRAS,KRAS-2B,0,3,0.577072,True,False,False,False
3,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,SQ00014618,A05,5,...,HCC44,AKT1,AKT1-2,0,4,0.473938,True,False,False,False
4,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,SQ00014618,A06,6,...,HCC44,EMPTY,EMPTY,0,5,0.552049,True,True,True,True


In [5]:
# 95th percentile of the similarity among pairs that are NOT flagged as
# replicates, reported per cell line of the first profile in each pair.
# NOTE(review): the filter does not require both profiles to share a cell
# line, so cross-cell-line pairs are counted under pair_a's cell line -
# confirm this is the intended null distribution.
non_replicate_cor_95th = (
    similarity_melted_df.query("not group_replicate")
    .groupby("Metadata_cell_line_pair_a")["similarity_metric"]
    .quantile(0.95)
    # Name the group key before it becomes a column, instead of renaming after.
    .rename_axis("cell_line")
    .reset_index()
)

# Output results
output_file = pathlib.Path(f"{output_dir}/cell_health_nonreplicate_95thpercentile.tsv")
non_replicate_cor_95th.to_csv(output_file, sep="\t", index=False)

non_replicate_cor_95th

Unnamed: 0,cell_line,similarity_metric
0,A549,0.528442
1,ES2,0.502821
2,HCC44,0.4585


In [6]:
# Capture median replicate correlation: keep only pairs flagged as replicates
# and take the median similarity per (cell line, gene, perturbation) of the
# first profile in each pair.
median_replicate_correlation_df = (
    similarity_melted_df.query("group_replicate")
    .groupby(
        [
            "Metadata_cell_line_pair_a",
            "Metadata_gene_name_pair_a",
            "Metadata_pert_name_pair_a",
        ]
    )["similarity_metric"]
    .median()
    # Give the three group levels their output names (same order as the
    # groupby keys) before turning them into columns.
    .rename_axis(["cell_line", "group", "perturbation"])
    .reset_index(name="median_replicate_correlation")
)

print(median_replicate_correlation_df.shape)
median_replicate_correlation_df.head()

(357, 4)


Unnamed: 0,cell_line,group,perturbation,median_replicate_correlation
0,A549,AKT1,AKT1-1,0.022261
1,A549,AKT1,AKT1-2,-0.302957
2,A549,ARID1B,ARID1B-1,0.311278
3,A549,ARID1B,ARID1B-2,0.405716
4,A549,ATF4,ATF4-1,0.821153


In [7]:
# Median similarity of each perturbation against control wells: keep pairs
# whose second profile carries one of the control gene names, then take the
# median per (cell line, gene, perturbation) of the first profile.
# NOTE(review): despite the "empty" in the variable name, the filter uses
# control_ids, and it does not restrict the control well to the same cell
# line as the first profile - confirm both are intended.
median_empty_correlation_df = (
    similarity_melted_df.query("Metadata_gene_name_pair_b in @control_ids")
    .groupby(
        [
            "Metadata_cell_line_pair_a",
            "Metadata_gene_name_pair_a",
            "Metadata_pert_name_pair_a",
        ]
    )["similarity_metric"]
    .median()
    # Give the three group levels their output names (same order as the
    # groupby keys) before turning them into columns.
    .rename_axis(["cell_line", "group", "perturbation"])
    .reset_index(name="median_control_correlation")
)

print(median_empty_correlation_df.shape)
median_empty_correlation_df.head()

(357, 4)


Unnamed: 0,cell_line,group,perturbation,median_control_correlation
0,A549,AKT1,AKT1-1,0.016903
1,A549,AKT1,AKT1-2,0.00239
2,A549,ARID1B,ARID1B-1,0.027581
3,A549,ARID1B,ARID1B-2,0.062314
4,A549,ATF4,ATF4-1,0.081103


In [8]:
# Combine the replicate and control summaries into one table. The inner join
# keeps only perturbations that appear in both inputs.
full_correlation_results_df = pd.merge(
    median_replicate_correlation_df,
    median_empty_correlation_df,
    how="inner",
    on=["cell_line", "group", "perturbation"],
)

print(full_correlation_results_df.shape)
full_correlation_results_df.head()

(357, 5)


Unnamed: 0,cell_line,group,perturbation,median_replicate_correlation,median_control_correlation
0,A549,AKT1,AKT1-1,0.022261,0.016903
1,A549,AKT1,AKT1-2,-0.302957,0.00239
2,A549,ARID1B,ARID1B-1,0.311278,0.027581
3,A549,ARID1B,ARID1B-2,0.405716,0.062314
4,A549,ATF4,ATF4-1,0.821153,0.081103


In [9]:
# Persist the merged replicate/control summary as a tab-separated file,
# without the positional row index.
output_file = pathlib.Path(f"{output_dir}/cell_health_replicate_reproducibility.tsv")

full_correlation_results_df.to_csv(
    path_or_buf=output_file,
    index=False,
    sep="\t",
)