## Calculate Grit for Bulk Cell Health profiles

Here, we calculate grit under several combinations of settings:

1. Across the three different cell lines (A549, ES2, HCC44)
2. Using two different kinds of controls (cutting and perturbation)
3. Using two different correlation metrics (Pearson and Spearman)
4. Using two different metrics to summarize control-based z-scored replicate correlations (mean and median)

We also calculate the mp-value for the same perturbations, using the cutting controls and several permutation counts (10, 100, 1000, and 5000).

In [7]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features, output
from pycytominer import feature_select

from cytominer_eval import evaluate
from cytominer_eval.transform import metric_melt
from cytominer_eval.operations.util import assign_replicates

In [8]:
# Read the merged Cell Health profiles directly from GitHub.
# The URL is pinned to a commit hash so every run fetches the same file.
commit = "07e4b40c39dd27084be36fbef4d64c5654b2960f"
base_url = f"https://github.com/broadinstitute/cell-health/raw/{commit}"
url = f"{base_url}/1.generate-profiles/data/processed/cell_health_profiles_merged.tsv.gz"

# The file is tab-separated
df = pd.read_csv(url, sep="\t")

# Report the dimensions, then preview the first two profiles
print(df.shape)
df.head(2)

(3456, 956)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,-0.894997,-1.515696,-1.787667,...,0.107581,-0.659049,-0.676846,-1.229791,-1.336051,-1.125138,-0.97236,-1.393856,-1.244227,-1.308729
1,SQ00014618,A02,2,A,HCC44,MCL1,MCL1-5,-0.479926,0.246423,0.629901,...,0.165935,1.999006,1.204036,0.560228,0.686189,0.601634,1.154001,0.596441,0.680359,0.715469


In [9]:
# Feature selection operations handed to pycytominer's feature_select
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

# NOTE: `df` is replaced by its feature-selected version from this cell onwards
# (na_cutoff=0 presumably drops any column containing a missing value; confirm in pycytominer docs)
df = feature_select(
    profiles=df,
    operation=feature_select_ops,
    na_cutoff=0,
)

# Column names of the surviving metadata and measurement features
meta_features = infer_cp_features(df, metadata=True)
features = infer_cp_features(df)

print(df.shape)
df.head(2)

(3456, 402)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_Zernike_0_0,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0
0,SQ00014618,A01,1,A,HCC44,EMPTY,EMPTY,-1.515696,0.810956,1.984031,...,-1.365392,-0.351107,-0.163153,2.307568,-2.775524,-3.951667,-1.716353,-2.703082,-3.720976,0.107581
1,SQ00014618,A02,2,A,HCC44,MCL1,MCL1-5,0.246423,0.687241,0.062305,...,-0.070069,1.855687,-0.400335,3.776635,0.947498,-0.541032,0.363568,0.910251,-0.364015,0.165935


In [10]:
# Output feature selected file
output_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")

# Create the destination folder first so the write also succeeds on a fresh checkout
output_file.parent.mkdir(parents=True, exist_ok=True)

# Write the profiles as gzip-compressed CSV.
# NOTE(review): the fixed gzip "mtime" presumably keeps the compressed bytes stable across runs; confirm.
output(
    df=df,
    output_filename=output_file,
    sep=",",
    compression_options={"method": "gzip", "mtime": 1},
)

PosixPath('data/cell_health_merged_feature_select.csv.gz')

In [11]:
# Define cell health constants
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

# Column roles passed to the grit calculation
replicate_group_grit = {"profile_col": barcode_col, "replicate_group_col": gene_col}

# Gene names that mark the two kinds of control
control_group_cut = ["Chr2", "Luc", "LacZ"]
control_group_pert = ["EMPTY"]


def _barcodes_for_genes(gene_names):
    """Return the unique barcodes, in order of first appearance, of rows whose gene is in `gene_names`."""
    in_group = df[replicate_group_grit["replicate_group_col"]].isin(gene_names)
    barcodes = df.loc[in_group, replicate_group_grit["profile_col"]]
    return barcodes.unique().tolist()


control_barcodes_cut = _barcodes_for_genes(control_group_cut)
control_barcodes_pert = _barcodes_for_genes(control_group_pert)

# Lookup from control type to the barcodes that belong to it
control_barcodes = {
    "cutting_control": control_barcodes_cut,
    "perturbation_control": control_barcodes_pert,
}

control_barcodes

{'cutting_control': ['Chr2-1',
  'Chr2-4',
  'Chr2-5',
  'Chr2-2',
  'Luc-1',
  'LacZ-3',
  'Luc-2',
  'LacZ-2',
  'Chr2-3',
  'Chr2-6'],
 'perturbation_control': ['EMPTY']}

In [12]:
%%time
grit_results = []
for cell_line in df.Metadata_cell_line.unique():
    for control_barcode in control_barcodes:
        for cor_method in ["pearson", "spearman"]:
            for replicate_summary_method in ["mean", "median"]:

                result = evaluate(
                    profiles=df.query("Metadata_cell_line == @cell_line"),
                    features=features,
                    meta_features=[barcode_col, gene_col],
                    replicate_groups=replicate_group_grit,
                    operation="grit",
                    similarity_metric=cor_method,
                    grit_control_perts=control_barcodes[control_barcode],
                    grit_replicate_summary_method=replicate_summary_method,
                ).assign(
                    cell_line=cell_line,
                    barcode_control=control_barcode,
                    cor_method=cor_method,
                    grit_replicate_summary_method=replicate_summary_method,
                )

                grit_results.append(result)

grit_results = pd.concat(grit_results).reset_index(drop=True)

print(grit_results.shape)
grit_results.head()

(2856, 7)
CPU times: user 40.3 s, sys: 2.62 s, total: 42.9 s
Wall time: 43.1 s


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,grit_replicate_summary_method
0,AKT1-1,AKT1,0.793952,HCC44,cutting_control,pearson,mean
1,AKT1-2,AKT1,0.77584,HCC44,cutting_control,pearson,mean
2,ARID1B-1,ARID1B,0.448876,HCC44,cutting_control,pearson,mean
3,ARID1B-2,ARID1B,0.323462,HCC44,cutting_control,pearson,mean
4,ATF4-1,ATF4,0.214374,HCC44,cutting_control,pearson,mean


In [13]:
# Perturbations that are the only guide for their gene cannot receive a grit score.
# Count the missing scores and preview a few of the affected rows.
missing_grit = grit_results.grit.isna()

print(missing_grit.sum())
grit_results.loc[missing_grit, :].reset_index(drop=True).head(5)

144


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,grit_replicate_summary_method
0,AURKB-2,AURKB,,HCC44,cutting_control,pearson,mean
1,BRAF-2,BRAF,,HCC44,cutting_control,pearson,mean
2,BRAF1-1,BRAF1,,HCC44,cutting_control,pearson,mean
3,EMPTY,EMPTY,,HCC44,cutting_control,pearson,mean
4,SLC2A1-1,SLC2A1,,HCC44,cutting_control,pearson,mean


In [14]:
# Output results
output_dir = "results"
output_file = pathlib.Path(f"{output_dir}/cell_health_grit.tsv")

# Make sure the results folder exists so the write does not fail on a fresh checkout
output_file.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated, without the row index
grit_results.to_csv(output_file, sep="\t", index=False)

## Calculate mp-value

In [15]:
%%time
mp_results = []

for cell_line in df.Metadata_cell_line.unique():
    for num_permutations in [10, 100, 1000, 5000]:

        mp_value_params = {"nb_permutations": num_permutations}

        result = evaluate(
            profiles=df.query("Metadata_cell_line == @cell_line"),
            features=features,
            meta_features=[barcode_col, gene_col],
            replicate_groups="Metadata_pert_name",
            operation="mp_value",
            grit_control_perts=control_barcodes["cutting_control"],
            mp_value_params=mp_value_params,
        ).assign(
            cell_line=cell_line,
            barcode_control="cutting_control",
            num_permutations=num_permutations,
        )

        mp_results.append(result)

mp_results = pd.concat(mp_results).reset_index(drop=True)

print(mp_results.shape)
mp_results.head()

(1428, 5)
CPU times: user 39min 42s, sys: 29min 10s, total: 1h 8min 52s
Wall time: 35min 59s


Unnamed: 0,Metadata_pert_name,mp_value,cell_line,barcode_control,num_permutations
0,AKT1-1,0.1,HCC44,cutting_control,10
1,AKT1-2,0.2,HCC44,cutting_control,10
2,ARID1B-1,0.2,HCC44,cutting_control,10
3,ARID1B-2,0.3,HCC44,cutting_control,10
4,ATF4-1,0.0,HCC44,cutting_control,10


In [16]:
# Output results
output_file = pathlib.Path(f"{output_dir}/cell_health_mpvalue.tsv")

# Make sure the results folder exists so the write does not fail on a fresh checkout
output_file.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated, without the row index
mp_results.to_csv(output_file, sep="\t", index=False)