# Calculate grit for different Cell Profiler feature groups

* Per compartment (Cells, Cytoplasm, Nuclei)
* Per compartment feature group (Cells x AreaShape, Cells x Correlation, Nuclei x Texture, etc.)
* Per channel (DNA, RNA, AGP, ER, Mito)
    * Use all features that include any information from one of these channels

In [1]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

from cytominer_eval import evaluate

In [2]:
compartments = ["Cells", "Cytoplasm", "Nuclei"]

In [3]:
# Load Cell Health data
data_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")

df = pd.read_csv(data_file, sep=",")

print(df.shape)
df.head(2)

(3456, 402)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_Zernike_0_0,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0
0,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,-1.515696,0.810956,1.984031,...,-1.365392,-0.351107,-0.163153,2.307568,-2.775524,-3.951667,-1.716353,-2.703082,-3.720976,0.107581
1,SQ00014618,A02,2,A,HCC44,MCL1,MCL1-5,0.246423,0.687241,0.062305,...,-0.070069,1.855687,-0.400335,3.776635,0.947498,-0.541032,0.363568,0.910251,-0.364015,0.165935


In [4]:
# Define cell health constants
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

replicate_group_grit = {
    "replicate_id": barcode_col,
    "group_id": gene_col
}

control_group_cut = ["Chr2", "Luc", "LacZ"]

control_barcodes = (
    df.loc[
        df[replicate_group_grit["group_id"]].isin(control_group_cut),
        replicate_group_grit["replicate_id"]
    ]
    .unique()
    .tolist()
)

control_barcodes

['Chr2-1',
 'Chr2-4',
 'Chr2-5',
 'Chr2-2',
 'Luc-1',
 'LacZ-3',
 'Luc-2',
 'LacZ-2',
 'Chr2-3',
 'Chr2-6']

In [5]:
all_features = infer_cp_features(df, compartments=compartments)
meta_features = infer_cp_features(df, metadata=True)

meta_features

['Metadata_Plate',
 'Metadata_Well',
 'Metadata_WellCol',
 'Metadata_WellRow',
 'Metadata_cell_line',
 'Metadata_gene_name',
 'Metadata_pert_name']

In [6]:
grit_compartment_results = []
for cell_line in df.Metadata_cell_line.unique():
    for compartment in compartments:
        compartment_features = infer_cp_features(df, compartments=compartment)
        subset_df = df.loc[:, meta_features + compartment_features]

        result = evaluate(
            profiles=subset_df.query("Metadata_cell_line == @cell_line"),
            features=compartment_features,
            meta_features=[barcode_col, gene_col],
            replicate_groups=replicate_group_grit,
            operation="grit",
            similarity_metric="pearson",
            grit_control_perts=control_barcodes
        ).assign(
            cell_line=cell_line,
            barcode_control="cutting_control",
            cor_method="pearson",
            compartment=compartment,
            channel="all",
            feature_group="all",
            num_features=len(compartment_features)
        )
        
        grit_compartment_results.append(result)
        
grit_compartment_results = pd.concat(grit_compartment_results).reset_index(drop=True)

print(grit_compartment_results.shape)
grit_compartment_results.head()

(1071, 10)


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,compartment,channel,feature_group,num_features
0,AKT1-1,AKT1,0.687706,HCC44,cutting_control,pearson,Cells,all,all,100
1,AKT1-2,AKT1,0.666448,HCC44,cutting_control,pearson,Cells,all,all,100
2,ARID1B-1,ARID1B,0.46597,HCC44,cutting_control,pearson,Cells,all,all,100
3,ARID1B-2,ARID1B,0.3229,HCC44,cutting_control,pearson,Cells,all,all,100
4,ATF4-1,ATF4,0.186567,HCC44,cutting_control,pearson,Cells,all,all,100


## Calculate grit for feature groups

In [7]:
feature_group_compartments = list(set(["_".join(x.split("_")[0:2]) for x in all_features]))

grit_subcompartment_results = []
for cell_line in df.Metadata_cell_line.unique():
    for compartment_group in feature_group_compartments:
        compartment_features = df.loc[:, df.columns.str.startswith(compartment_group)].columns.tolist()
        subset_df = df.loc[:, meta_features + compartment_features]
        
        compartment, feature_group = compartment_group.split("_")

        result = evaluate(
            profiles=subset_df.query("Metadata_cell_line == @cell_line"),
            features=compartment_features,
            meta_features=[barcode_col, gene_col],
            replicate_groups=replicate_group_grit,
            operation="grit",
            similarity_metric="pearson",
            grit_control_perts=control_barcodes
        ).assign(
            cell_line=cell_line,
            barcode_control="cutting_control",
            cor_method="pearson",
            compartment=compartment,
            channel="all",
            feature_group=feature_group,
            num_features=len(compartment_features)
        )
        
        grit_subcompartment_results.append(result)
        
grit_subcompartment_results = pd.concat(grit_subcompartment_results).reset_index(drop=True)

print(grit_subcompartment_results.shape)
grit_subcompartment_results.head()

(6783, 10)


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,compartment,channel,feature_group,num_features
0,AKT1-1,AKT1,0.526183,HCC44,cutting_control,pearson,Nuclei,all,Granularity,17
1,AKT1-2,AKT1,0.569175,HCC44,cutting_control,pearson,Nuclei,all,Granularity,17
2,ARID1B-1,ARID1B,0.22027,HCC44,cutting_control,pearson,Nuclei,all,Granularity,17
3,ARID1B-2,ARID1B,0.145174,HCC44,cutting_control,pearson,Nuclei,all,Granularity,17
4,ATF4-1,ATF4,-0.247228,HCC44,cutting_control,pearson,Nuclei,all,Granularity,17


## Calculate grit for channels

In [8]:
channels = ["DNA", "RNA", "Mito", "AGP", "ER"]

grit_channel_results = []
for cell_line in df.Metadata_cell_line.unique():
    for channel in channels:
        channel_features = df.loc[:, df.columns.str.contains(channel)].columns.tolist()

        subset_df = df.loc[:, meta_features + compartment_features]
        
        compartment, feature_group = compartment_group.split("_")

        result = evaluate(
            profiles=subset_df.query("Metadata_cell_line == @cell_line"),
            features=compartment_features,
            meta_features=[barcode_col, gene_col],
            replicate_groups=replicate_group_grit,
            operation="grit",
            similarity_metric="pearson",
            grit_control_perts=control_barcodes
        ).assign(
            cell_line=cell_line,
            barcode_control="cutting_control",
            cor_method="pearson",
            compartment="all",
            channel=channel,
            feature_group="all",
            num_features=len(channel_features)
        )
        
        grit_channel_results.append(result)
        
grit_channel_results = pd.concat(grit_channel_results).reset_index(drop=True)

print(grit_channel_results.shape)
grit_channel_results.head()

(1785, 10)


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,compartment,channel,feature_group,num_features
0,AKT1-1,AKT1,0.432191,HCC44,cutting_control,pearson,all,DNA,all,78
1,AKT1-2,AKT1,0.493862,HCC44,cutting_control,pearson,all,DNA,all,78
2,ARID1B-1,ARID1B,0.943583,HCC44,cutting_control,pearson,all,DNA,all,78
3,ARID1B-2,ARID1B,0.874703,HCC44,cutting_control,pearson,all,DNA,all,78
4,ATF4-1,ATF4,0.396764,HCC44,cutting_control,pearson,all,DNA,all,78


## Concatenate results together

In [9]:
full_grit_results = pd.concat(
    [
        grit_compartment_results,
        grit_subcompartment_results,
        grit_channel_results,
    ],
    axis="rows"
).reset_index(drop=True)

print(full_grit_results.shape)
full_grit_results.head()

(9639, 10)


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,compartment,channel,feature_group,num_features
0,AKT1-1,AKT1,0.687706,HCC44,cutting_control,pearson,Cells,all,all,100
1,AKT1-2,AKT1,0.666448,HCC44,cutting_control,pearson,Cells,all,all,100
2,ARID1B-1,ARID1B,0.46597,HCC44,cutting_control,pearson,Cells,all,all,100
3,ARID1B-2,ARID1B,0.3229,HCC44,cutting_control,pearson,Cells,all,all,100
4,ATF4-1,ATF4,0.186567,HCC44,cutting_control,pearson,Cells,all,all,100


In [10]:
# Output results
output_dir = "results"
output_file = pathlib.Path(f"{output_dir}/cell_health_grit_compartments.tsv.gz")

full_grit_results.to_csv(
    output_file,
    sep="\t",
    compression={"method": "gzip", "mtime": 1},
    index=False
)