## Calculate grit in different simulated scenarios

1. Calculate grit on shuffled data (independently shuffled feature columns, or shuffled perturbation labels)
2. Change the number of control profiles used in the grit calculation

In [1]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

from cytominer_eval import evaluate

In [2]:
# Seed NumPy's global random number generator so that a fresh
# "Restart & Run All" reproduces the same shuffles. The notebook draws from
# this global state through np.random.permutation; pandas' DataFrame.sample
# also falls back to it when no random_state is passed.
np.random.seed(123)

In [3]:
# Read the merged Cell Health profiles and keep only the wells of one plate
plate = "SQ00014613"
data_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")

df = (
    pd.read_csv(data_file, sep=",")
    .loc[lambda all_profiles: all_profiles["Metadata_Plate"] == plate]
    .reset_index(drop=True)
)

# Quick sanity check: dimensions first, then a peek at the first two wells
print(df.shape)
df.head(2)

(384, 402)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_Zernike_0_0,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumAverage_Mito_20_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0
0,SQ00014613,A01,1,A,ES2,EMPTY,EMPTY,1.17116,-1.745811,-0.894733,...,-2.792228,0.674249,0.299396,-0.008194,2.866711,1.451779,1.971102,1.742827,1.280318,-0.206361
1,SQ00014613,A02,2,A,ES2,MCL1,MCL1-5,1.826013,-2.236247,-1.863712,...,0.652688,1.713258,0.403442,2.154541,2.51882,0.738909,0.920606,2.030791,0.532702,-0.429812


## 1. Calculate grit with shuffled data

In [4]:
# Compartment names passed to infer_cp_features when selecting the
# morphology feature columns
compartments = ["Cells", "Cytoplasm", "Nuclei"]
# Similarity metric handed to cytominer-eval's `evaluate` for every grit call
cor_method = "pearson"
# Number of random repetitions; reused as the outer loop count in both the
# shuffling analysis (section 1) and the control titration (section 2)
num_shuffle_permutations = 5

In [5]:
# Grit setup: a perturbation name identifies one replicate set, and the gene
# it targets identifies the group that the replicate set belongs to
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

replicate_group_grit = dict(replicate_id=barcode_col, group_id=gene_col)

# Gene-level labels of the two kinds of control
control_group_cut = ["Chr2", "Luc", "LacZ"]
control_group_pert = ["EMPTY"]


def _barcodes_for_genes(gene_names):
    """Return the unique perturbation names in `df` whose gene is in `gene_names`.

    Names are returned in order of first appearance in `df`.
    """
    gene_matches = df[replicate_group_grit["group_id"]].isin(gene_names)
    matching_barcodes = df.loc[gene_matches, replicate_group_grit["replicate_id"]]
    return matching_barcodes.unique().tolist()


control_barcodes_cut = _barcodes_for_genes(control_group_cut)
control_barcodes_pert = _barcodes_for_genes(control_group_pert)

control_barcodes = {
    "cutting_control": control_barcodes_cut,
    "perturbation_control": control_barcodes_pert,
}

control_barcodes

{'cutting_control': ['Chr2-1',
  'Chr2-4',
  'Chr2-5',
  'Chr2-2',
  'Luc-1',
  'LacZ-3',
  'Luc-2',
  'LacZ-2',
  'Chr2-3',
  'Chr2-6'],
 'perturbation_control': ['EMPTY']}

In [6]:
# Feature columns for the configured compartments; these are the features
# every grit calculation below is computed on.
# NOTE(review): presumably the columns prefixed with Cells_/Cytoplasm_/Nuclei_
# -- confirm against the pycytominer `infer_cp_features` documentation.
morph_features = infer_cp_features(df, compartments=compartments)
# Metadata columns (the `Metadata_*` columns, as the displayed output shows);
# used to split profiles into metadata and feature parts before shuffling.
meta_features = infer_cp_features(df, metadata=True)

meta_features

['Metadata_Plate',
 'Metadata_Well',
 'Metadata_WellCol',
 'Metadata_WellRow',
 'Metadata_cell_line',
 'Metadata_gene_name',
 'Metadata_pert_name']

In [7]:
%%time
grit_results = []
for i in range(0, num_shuffle_permutations):
    for cell_line in df.Metadata_cell_line.unique():
    
        profiles = df.query("Metadata_cell_line == @cell_line").reset_index(drop=True)
        meta_df = profiles.loc[:, meta_features]

        feature_df = profiles.drop(meta_features, axis="columns")

        shuffle_df = pd.concat(
            [
                meta_df,
                feature_df.apply(
                    lambda x: np.random.permutation(x.values)
                ).reset_index(drop=True),
            ],
            axis="columns"
        )

        result = evaluate(
            profiles=shuffle_df,
            features=morph_features,
            meta_features=[barcode_col, gene_col],
            replicate_groups=replicate_group_grit,
            operation="grit",
            similarity_metric=cor_method,
            grit_control_perts=control_barcodes["cutting_control"]
        ).assign(
            cell_line=cell_line,
            barcode_control="cutting_control",
            cor_method=cor_method,
            random_iteration=i,
            shuffle_method="independent_column"
        )
        grit_results.append(result)
        

        shuffle_df = df.copy()
        shuffle_df.loc[:, ["Metadata_gene_name", "Metadata_pert_name"]] = (
            df.loc[:, ["Metadata_gene_name", "Metadata_pert_name"]]
            .sample(n=df.shape[0])
            .reset_index(drop=True)
        )
        
        result = evaluate(
            profiles=shuffle_df,
            features=morph_features,
            meta_features=[barcode_col, gene_col],
            replicate_groups=replicate_group_grit,
            operation="grit",
            similarity_metric=cor_method,
            grit_control_perts=control_barcodes["cutting_control"]
        ).assign(
            cell_line=cell_line,
            barcode_control="cutting_control",
            cor_method=cor_method,
            random_iteration=i,
            shuffle_method="full_metadata_shuffle"
        )
        grit_results.append(result)
        
        
        control_df = df.query("Metadata_pert_name in @control_barcodes['cutting_control']").reset_index(drop=True)
        shuffle_df = df.query("Metadata_pert_name not in @control_barcodes['cutting_control']").reset_index(drop=True)

        shuffle_df.loc[:, ["Metadata_gene_name", "Metadata_pert_name"]] = (
            shuffle_df.loc[:, ["Metadata_gene_name", "Metadata_pert_name"]]
            .sample(n=shuffle_df.shape[0])
            .reset_index(drop=True)
        )

        shuffle_df = pd.concat([control_df, shuffle_df]).reset_index(drop=True)

        
        result = evaluate(
            profiles=shuffle_df,
            features=morph_features,
            meta_features=[barcode_col, gene_col],
            replicate_groups=replicate_group_grit,
            operation="grit",
            similarity_metric=cor_method,
            grit_control_perts=control_barcodes["cutting_control"]
        ).assign(
            cell_line=cell_line,
            barcode_control="cutting_control",
            cor_method=cor_method,
            random_iteration=i,
            shuffle_method="metadata_shuffle_ctrl_fixed"
        )
        grit_results.append(result)
        

# Calculate plate wise grit using real data
result = evaluate(
    profiles=df,
    features=morph_features,
    meta_features=[barcode_col, gene_col],
    replicate_groups=replicate_group_grit,
    operation="grit",
    similarity_metric=cor_method,
    grit_control_perts=control_barcodes["cutting_control"]
).assign(
    cell_line=cell_line,
    barcode_control="cutting_control",
    cor_method=cor_method,
    random_iteration="real",
    shuffle_method="real",
)

grit_results.append(result)

grit_results = pd.concat(grit_results).reset_index(drop=True)

print(grit_results.shape)
grit_results.head()

(1904, 8)
CPU times: user 9.17 s, sys: 439 ms, total: 9.61 s
Wall time: 9.64 s


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,random_iteration,shuffle_method
0,AKT1-1,AKT1,-0.038195,ES2,cutting_control,pearson,0,independent_column
1,AKT1-2,AKT1,-0.071214,ES2,cutting_control,pearson,0,independent_column
2,ARID1B-1,ARID1B,0.333085,ES2,cutting_control,pearson,0,independent_column
3,ARID1B-2,ARID1B,0.560773,ES2,cutting_control,pearson,0,independent_column
4,ATF4-1,ATF4,0.562125,ES2,cutting_control,pearson,0,independent_column


In [8]:
# Output results
output_dir = "results"
# Create the output directory if needed so the notebook also runs on a fresh
# checkout; to_csv does not create missing parent directories.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
output_file = pathlib.Path(f"{output_dir}/cell_health_grit_randomshuffled_{plate}.tsv")

# Tab-separated, without the (meaningless) positional index
grit_results.to_csv(output_file, sep="\t", index=False)

## 2. Titrate the number of control profiles (EMPTY wells) present when calculating grit

In [9]:
# Numbers of control profiles to keep in each titration step; each value is
# passed to `control_df.sample(n)` below. Sampling is without replacement by
# default, so no value may exceed the number of control profiles available
# for a cell line.
# NOTE(review): 56 is presumably the total number of EMPTY wells -- confirm.
control_ns = [56, 50, 40, 30, 20, 15, 10, 7, 5, 4, 3, 2]

In [10]:
%%time
grit_results = []
for i in range(0, num_shuffle_permutations):
    for n in control_ns:
        for cell_line in df.Metadata_cell_line.unique():

            profiles = df.query("Metadata_cell_line == @cell_line").reset_index(drop=True)

            control_df = (
                profiles
                .query("Metadata_pert_name in @control_barcodes['perturbation_control']")
            )
            treatment_df = (
                profiles
                .query("Metadata_pert_name not in @control_barcodes['perturbation_control']")
            )

            control_dropped_profiles_df = pd.concat(
                [
                    treatment_df,
                    control_df.sample(n)
                ], axis="rows"
            )
            
            result = evaluate(
                profiles=control_dropped_profiles_df,
                features=morph_features,
                meta_features=[barcode_col, gene_col],
                replicate_groups=replicate_group_grit,
                operation="grit",
                similarity_metric=cor_method,
                grit_control_perts=control_barcodes["perturbation_control"]
            ).assign(
                cell_line=cell_line,
                barcode_control="perturbation_control",
                cor_method=cor_method,
                random_iteration=i,
                num_controls=n
            )

            grit_results.append(result)

grit_results = pd.concat(grit_results).reset_index(drop=True)

print(grit_results.shape)
grit_results.head()

(7140, 8)
CPU times: user 29.2 s, sys: 1.13 s, total: 30.4 s
Wall time: 30.4 s


Unnamed: 0,perturbation,group,grit,cell_line,barcode_control,cor_method,random_iteration,num_controls
0,AKT1-1,AKT1,1.458837,ES2,perturbation_control,pearson,0,56
1,AKT1-2,AKT1,1.203577,ES2,perturbation_control,pearson,0,56
2,ARID1B-1,ARID1B,1.33112,ES2,perturbation_control,pearson,0,56
3,ARID1B-2,ARID1B,1.190342,ES2,perturbation_control,pearson,0,56
4,ATF4-1,ATF4,2.223579,ES2,perturbation_control,pearson,0,56


In [11]:
# Output results
output_dir = "results"
# Create the output directory if needed so the notebook also runs on a fresh
# checkout; to_csv does not create missing parent directories.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
output_file = pathlib.Path(f"{output_dir}/cell_health_grit_control_titration_{plate}.tsv")

# Tab-separated, without the (meaningless) positional index
grit_results.to_csv(output_file, sep="\t", index=False)