## Calculate grit in different simulated scenarios

1. Calculate grit on shuffled data (permutation-based null scenarios)
2. Vary the number of control profiles used for the grit calculation

In [1]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

from cytominer_eval import evaluate

In [2]:
# Seed NumPy's global random number generator so the shuffles and samples
# drawn later in this notebook (np.random.permutation and DataFrame.sample
# calls made without an explicit random_state) are repeatable when the
# notebook is run top to bottom.
np.random.seed(123)

In [3]:
# Load Cell Health data normalized two different ways.
# `dfs` maps a normalization label to the profiles of the selected plate.
plate = "SQ00014613"

dfs = {}

# Profiles stored under the "ctrl_based" label (comma-separated file).
data_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")
dfs["ctrl_based"] = (
    pd.read_csv(data_file, sep=",")
    .query("Metadata_Plate == @plate")
    .reset_index(drop=True)
)

# Profiles stored under the "whole_plate" label (tab-separated file).
whole_plate_dir = pathlib.Path("../../0.download-data/data/cell-health/profiles")
data_file = (
    whole_plate_dir
    / "cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz"
)
dfs["whole_plate"] = (
    pd.read_csv(data_file, sep="\t")
    .query("Metadata_Plate == @plate")
    .reset_index(drop=True)
)

# Report the (rows, columns) of each loaded table.
# NOTE: `norm_method` and `df` deliberately keep their names and stay bound
# after the loop (to the last entry, "whole_plate"), because other cells of
# the notebook may read them. A bare `df.head(2)` inside the loop displayed
# nothing (only the last expression of a cell is rendered), so it was removed.
for norm_method in dfs:
    df = dfs[norm_method]
    print(df.shape)

(384, 402)
(384, 437)


## 1. Calculate grit with shuffled permutations

In [4]:
# Compartment names handed to infer_cp_features(compartments=...) when
# selecting the morphology feature columns.
compartments = ["Cells", "Cytoplasm", "Nuclei"]
# Passed to evaluate() as `similarity_metric`.
cor_method = "pearson"
# Number of random repetitions run by the shuffling and control-titration loops.
num_shuffle_permutations = 5

In [5]:
# Define the grit replicate/group columns and the control perturbations
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

# Column roles handed to evaluate() through `replicate_groups`
replicate_group_grit = {
    "replicate_id": barcode_col,
    "group_id": gene_col
}

# Values of the group column that identify each kind of control
control_group_cut = ["Chr2", "Luc", "LacZ"]
control_group_pert = ["EMPTY"]

# Look the control barcodes up in an explicitly named table. The original
# cell read `df`, a loop variable left over from the loading cell, which in a
# top-to-bottom run is this same "whole_plate" table but silently changes if
# cells are re-run out of order.
control_source_df = dfs["whole_plate"]


def _barcodes_for_groups(profiles_df, group_names):
    """Return the unique replicate ids whose group id is in `group_names`.

    Parameters
    ----------
    profiles_df : pandas.DataFrame
        Table holding the columns named in `replicate_group_grit`.
    group_names : list
        Values of the group column to select.

    Returns
    -------
    list
        Unique replicate ids, in order of first appearance in `profiles_df`.
    """
    in_group = profiles_df[replicate_group_grit["group_id"]].isin(group_names)
    return (
        profiles_df.loc[in_group, replicate_group_grit["replicate_id"]]
        .unique()
        .tolist()
    )


control_barcodes_cut = _barcodes_for_groups(control_source_df, control_group_cut)
control_barcodes_pert = _barcodes_for_groups(control_source_df, control_group_pert)

control_barcodes = {
    "cutting_control": control_barcodes_cut,
    "perturbation_control": control_barcodes_pert
}

control_barcodes

{'cutting_control': ['Chr2-1',
  'Chr2-4',
  'Chr2-5',
  'Chr2-2',
  'Luc-1',
  'LacZ-3',
  'Luc-2',
  'LacZ-2',
  'Chr2-3',
  'Chr2-6'],
 'perturbation_control': ['EMPTY']}

In [6]:
%%time
# Metadata columns that are permuted together in the metadata shuffles, so
# each profile receives the (gene, perturbation) pair of another profile.
shuffle_cols = [gene_col, barcode_col]
cutting_controls = control_barcodes["cutting_control"]


def _cutting_control_grit(
    profiles_df, features, cell_line, random_iteration, shuffle_method, norm_method
):
    """Run the grit evaluation against the cutting controls and label the result.

    Parameters
    ----------
    profiles_df : pandas.DataFrame
        Profiles (metadata + feature columns) passed to evaluate().
    features : list
        Feature column names passed to evaluate().
    cell_line, random_iteration, shuffle_method, norm_method
        Values stored in the output columns of the same names.

    Returns
    -------
    pandas.DataFrame
        The evaluate() output with the scenario label columns appended.
    """
    return evaluate(
        profiles=profiles_df,
        features=features,
        meta_features=[barcode_col, gene_col],
        replicate_groups=replicate_group_grit,
        operation="grit",
        similarity_metric=cor_method,
        grit_control_perts=cutting_controls,
    ).assign(
        cell_line=cell_line,
        barcode_control="cutting_control",
        cor_method=cor_method,
        random_iteration=random_iteration,
        shuffle_method=shuffle_method,
        norm_method=norm_method,
    )


grit_results = []
for i in range(num_shuffle_permutations):
    for norm_method, df in dfs.items():
        morph_features = infer_cp_features(df, compartments=compartments)
        meta_features = infer_cp_features(df, metadata=True)

        for cell_line in df.Metadata_cell_line.unique():
            profiles = df.query("Metadata_cell_line == @cell_line").reset_index(drop=True)

            # Scenario 1 - "independent_column": permute every non-metadata
            # column independently while the metadata stays in place.
            meta_df = profiles.loc[:, meta_features]
            feature_df = profiles.drop(meta_features, axis="columns")
            shuffle_df = pd.concat(
                [
                    meta_df,
                    feature_df.apply(
                        lambda x: np.random.permutation(x.values)
                    ).reset_index(drop=True),
                ],
                axis="columns",
            )
            grit_results.append(
                _cutting_control_grit(
                    shuffle_df, morph_features, cell_line, i, "independent_column", norm_method
                )
            )

            # Scenario 2 - "full_metadata_shuffle": permute the gene/perturbation
            # labels across all profiles of this cell line (controls included).
            # The original shuffled `df` (every cell line on the plate) here
            # while still labelling the result with `cell_line`; `profiles`
            # keeps the data and the label consistent.
            shuffle_df = profiles.copy()
            shuffle_df.loc[:, shuffle_cols] = (
                profiles.loc[:, shuffle_cols]
                .sample(n=profiles.shape[0])
                # Drop the sampled index so the assignment, which aligns on
                # index, actually stores the permuted order.
                .reset_index(drop=True)
            )
            grit_results.append(
                _cutting_control_grit(
                    shuffle_df, morph_features, cell_line, i, "full_metadata_shuffle", norm_method
                )
            )

            # Scenario 3 - "metadata_shuffle_ctrl_fixed": permute the labels of
            # the non-control profiles only; control rows keep their labels.
            is_control = profiles[barcode_col].isin(cutting_controls)
            control_df = profiles.loc[is_control].reset_index(drop=True)
            shuffle_df = profiles.loc[~is_control].reset_index(drop=True)

            shuffle_df.loc[:, shuffle_cols] = (
                shuffle_df.loc[:, shuffle_cols]
                .sample(n=shuffle_df.shape[0])
                .reset_index(drop=True)
            )

            shuffle_df = pd.concat([control_df, shuffle_df]).reset_index(drop=True)
            grit_results.append(
                _cutting_control_grit(
                    shuffle_df,
                    morph_features,
                    cell_line,
                    i,
                    "metadata_shuffle_ctrl_fixed",
                    norm_method,
                )
            )


# Calculate grit using the real (unshuffled) data, per cell line.
# The original evaluated the whole plate at once and labelled it with whatever
# `cell_line` was left over from the loop above. Iterating over the cell lines
# keeps the label correct and is identical for a plate with one cell line.
for norm_method, df in dfs.items():
    morph_features = infer_cp_features(df, compartments=compartments)

    for cell_line in df.Metadata_cell_line.unique():
        profiles = df.query("Metadata_cell_line == @cell_line").reset_index(drop=True)
        grit_results.append(
            _cutting_control_grit(
                profiles, morph_features, cell_line, "real", "real", norm_method
            )
        )

grit_results = pd.concat(grit_results).reset_index(drop=True)

print(grit_results.shape)
grit_results.head()

(3808, 9)
CPU times: user 17.1 s, sys: 897 ms, total: 18 s
Wall time: 18 s


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,random_iteration,shuffle_method,norm_method
0,AKT1-1,AKT1,-0.038195,ES2,cutting_control,pearson,0,independent_column,ctrl_based
1,AKT1-2,AKT1,-0.071214,ES2,cutting_control,pearson,0,independent_column,ctrl_based
2,ARID1B-1,ARID1B,0.333085,ES2,cutting_control,pearson,0,independent_column,ctrl_based
3,ARID1B-2,ARID1B,0.560773,ES2,cutting_control,pearson,0,independent_column,ctrl_based
4,ATF4-1,ATF4,0.562125,ES2,cutting_control,pearson,0,independent_column,ctrl_based


In [7]:
# Output results
output_dir = "results"
output_file = pathlib.Path(f"{output_dir}/cell_health_grit_randomshuffled_{plate}.tsv")

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
output_file.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated, without the positional index column.
grit_results.to_csv(output_file, sep="\t", index=False)

## 2. Titrate the number of control perturbations present when calculating grit

In [8]:
# Numbers of control profiles to keep in the titration below: for each value
# n, n rows are drawn with control_df.sample(n) from the perturbation-control
# profiles of a cell line.
# NOTE(review): sample(n) raises if a cell line has fewer than n control
# profiles - confirm 56 does not exceed the controls available per cell line.
control_ns = [56, 50, 40, 30, 20, 15, 10, 7, 5, 4, 3, 2]

In [9]:
%%time
perturbation_controls = control_barcodes["perturbation_control"]

# The feature list and the control/treatment split of each cell line depend
# only on the normalization method, not on the random iteration or on the
# number of controls drawn, so compute them once up front instead of inside
# every (iteration, n) pass. No random numbers are consumed here, so the
# sequence of random draws below is the same as before.
titration_inputs = {}
for norm_method, df in dfs.items():
    morph_features = infer_cp_features(df, compartments=compartments)

    per_cell_line = []
    for cell_line in df.Metadata_cell_line.unique():
        profiles = df.query("Metadata_cell_line == @cell_line").reset_index(drop=True)
        is_control = profiles[barcode_col].isin(perturbation_controls)
        per_cell_line.append(
            (cell_line, profiles.loc[~is_control], profiles.loc[is_control])
        )

    titration_inputs[norm_method] = (morph_features, per_cell_line)

grit_results = []
for i in range(num_shuffle_permutations):
    for norm_method, (morph_features, per_cell_line) in titration_inputs.items():
        for n in control_ns:
            for cell_line, treatment_df, control_df in per_cell_line:
                # Keep every treatment profile but only a random subset of n
                # control profiles (sample raises if fewer than n exist).
                control_dropped_profiles_df = pd.concat(
                    [
                        treatment_df,
                        control_df.sample(n)
                    ], axis="rows"
                )

                result = evaluate(
                    profiles=control_dropped_profiles_df,
                    features=morph_features,
                    meta_features=[barcode_col, gene_col],
                    replicate_groups=replicate_group_grit,
                    operation="grit",
                    similarity_metric=cor_method,
                    grit_control_perts=perturbation_controls
                ).assign(
                    cell_line=cell_line,
                    barcode_control="perturbation_control",
                    cor_method=cor_method,
                    random_iteration=i,
                    num_controls=n,
                    norm_method=norm_method
                )

                grit_results.append(result)

grit_results = pd.concat(grit_results).reset_index(drop=True)

print(grit_results.shape)
grit_results.head()

(14280, 9)
CPU times: user 1min 1s, sys: 2.68 s, total: 1min 4s
Wall time: 1min 4s


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,random_iteration,num_controls,norm_method
0,AKT1-1,AKT1,1.458837,ES2,perturbation_control,pearson,0,56,ctrl_based
1,AKT1-2,AKT1,1.203577,ES2,perturbation_control,pearson,0,56,ctrl_based
2,ARID1B-1,ARID1B,1.33112,ES2,perturbation_control,pearson,0,56,ctrl_based
3,ARID1B-2,ARID1B,1.190342,ES2,perturbation_control,pearson,0,56,ctrl_based
4,ATF4-1,ATF4,2.223579,ES2,perturbation_control,pearson,0,56,ctrl_based


In [10]:
# Output results
output_dir = "results"
output_file = pathlib.Path(f"{output_dir}/cell_health_grit_control_titration_{plate}.tsv")

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
output_file.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated, without the positional index column.
grit_results.to_csv(output_file, sep="\t", index=False)