## Calculate grit with different normalization schemes

We compare whole-plate vs. control-based normalization in grit calculations. This notebook calculates grit for the whole-plate normalized profiles.

In [1]:
import pathlib
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features, output
from pycytominer.operations import get_na_columns

from cytominer_eval import evaluate
from cytominer_eval.transform import metric_melt
from cytominer_eval.operations.util import assign_replicates

In [2]:
# Directory (relative to the working directory) that receives the grit results
output_dir = "results"

In [3]:
# Load the whole-plate normalized, feature-selected Cell Health profiles
data_dir = pathlib.Path("../../0.download-data/data/cell-health/profiles")
# Join with the pathlib `/` operator instead of formatting the Path into a string
plate_file = (
    data_dir
    / "cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz"
)

profile_df = pd.read_csv(plate_file, sep="\t")

# Column names inferred by pycytominer: the default call is used for the feature
# columns, and metadata=True for the metadata columns
features = infer_cp_features(profile_df)
meta_features = infer_cp_features(profile_df, metadata=True)

print(profile_df.shape)
profile_df.head()

(3456, 512)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_20_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_RNA_10_0
0,SQ00014614,A01,1,A,ES2,EMPTY,EMPTY,0.193423,0.810565,0.394067,...,1.906243,-0.768007,-1.883674,-0.661164,-2.079387,-0.879608,-0.586046,-1.565373,-2.109142,0.068394
1,SQ00014614,A02,2,A,ES2,MCL1,MCL1-5,0.063269,0.17946,-0.621186,...,0.734653,0.394547,-0.420094,-0.140664,-0.323854,-0.878583,-0.928709,-0.102971,-0.844094,-0.956752
2,SQ00014614,A03,3,A,ES2,AKT1,AKT1-1,0.200543,-0.002573,-0.097555,...,0.868485,0.388507,0.137862,0.158269,-0.055158,-0.989307,-0.640994,-0.27722,-0.695575,-0.928039
3,SQ00014614,A04,4,A,ES2,KRAS,KRAS-2B,-0.200653,0.398271,-0.255667,...,0.953094,-0.361671,-0.598619,-0.790145,-0.885389,-1.388938,-1.007342,-1.696331,-1.208717,-1.132148
4,SQ00014614,A05,5,A,ES2,AKT1,AKT1-2,0.291402,0.460302,0.235751,...,1.693123,-0.268063,-0.28098,-0.534104,-0.39532,-1.298397,-1.026182,-0.859397,-0.900225,-1.211184


In [4]:
# Cell Health constants: the column holding each perturbation's name and the
# column holding the gene used to group perturbations together
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

replicate_group_grit = {"profile_col": barcode_col, "replicate_group_col": gene_col}

# Gene-level labels of the two control sets used as grit references
control_group_cut = ["Chr2", "Luc", "LacZ"]
control_group_pert = ["EMPTY"]

# Rows whose replicate-group (gene) value belongs to each control set
is_cut_control = profile_df[replicate_group_grit["replicate_group_col"]].isin(
    control_group_cut
)
is_pert_control = profile_df[replicate_group_grit["replicate_group_col"]].isin(
    control_group_pert
)

# Unique perturbation names (in order of first appearance) for each control set
cut_control_names = profile_df.loc[
    is_cut_control, replicate_group_grit["profile_col"]
]
control_barcodes_cut = cut_control_names.unique().tolist()

pert_control_names = profile_df.loc[
    is_pert_control, replicate_group_grit["profile_col"]
]
control_barcodes_pert = pert_control_names.unique().tolist()

control_barcodes = {
    "cutting_control": control_barcodes_cut,
    "perturbation_control": control_barcodes_pert,
}

control_barcodes

{'cutting_control': ['Chr2-1',
  'Chr2-4',
  'Chr2-5',
  'Chr2-2',
  'Luc-1',
  'LacZ-3',
  'Luc-2',
  'LacZ-2',
  'Chr2-3',
  'Chr2-6'],
 'perturbation_control': ['EMPTY']}

In [5]:
%%time
grit_results = []
for cell_line in profile_df.Metadata_cell_line.unique():
    for control_barcode in control_barcodes:
        for cor_method in ["pearson", "spearman"]:
            result = evaluate(
                profiles=profile_df.query("Metadata_cell_line == @cell_line"),
                features=features,
                meta_features=[barcode_col, gene_col],
                replicate_groups=replicate_group_grit,
                operation="grit",
                similarity_metric=cor_method,
                grit_control_perts=control_barcodes[control_barcode],
            ).assign(
                cell_line=cell_line,
                barcode_control=control_barcode,
                cor_method=cor_method,
            )

            grit_results.append(result)

grit_results = pd.concat(grit_results).reset_index(drop=True)

print(grit_results.shape)
grit_results.head()

(1428, 6)
CPU times: user 23.7 s, sys: 1.3 s, total: 25 s
Wall time: 26.3 s


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method
0,AKT1-1,AKT1,0.377003,ES2,cutting_control,pearson
1,AKT1-2,AKT1,0.349674,ES2,cutting_control,pearson
2,ARID1B-1,ARID1B,0.28247,ES2,cutting_control,pearson
3,ARID1B-2,ARID1B,0.352138,ES2,cutting_control,pearson
4,ATF4-1,ATF4,0.999923,ES2,cutting_control,pearson


In [6]:
# Write the combined grit results to a tab-separated file
output_dir = "results"
output_file = pathlib.Path(f"{output_dir}/cell_health_grit_wholeplatenormalized.tsv")

# Create the output directory if it is missing; to_csv does not create parent
# directories, so a fresh checkout would otherwise fail here
output_file.parent.mkdir(parents=True, exist_ok=True)

grit_results.to_csv(output_file, sep="\t", index=False)