## Calculate Precision and Recall of profile clusters

Given correlations, can we retrieve profiles of similar MOAs?

### Part 1 - Calculate pairwise correlations and identify common targets and MOAs

In [1]:
import pathlib
import pandas as pd
from sklearn.metrics import precision_score, recall_score

from pycytominer.cyto_utils import infer_cp_features

from scripts.precision_recall_utils import calc_pairwise_corr, categorize_comparisons

In [2]:
# Load input data
assay = "cell_painting"  # Can also be "L1000"

# Any value other than "cell_painting" falls through to the L1000 file below,
# but the target merge only runs for the exact string "L1000". Reject anything
# else up front so a typo (e.g. "l1000") cannot silently skip that merge.
supported_assays = ("cell_painting", "L1000")
if assay not in supported_assays:
    raise ValueError(f"Unsupported assay {assay!r}; expected one of {supported_assays}")

profile_dir = pathlib.Path("Consensus", assay, "moa_sizes_consensus_datasets")

if assay == "cell_painting":
    profile_file = pathlib.Path(profile_dir, "cell_painting_moa_analytical_set_profiles.tsv.gz")
else:
    profile_file = pathlib.Path(profile_dir, "l1000_moa_analytical_set_profiles.tsv.gz")

profile_df = pd.read_csv(profile_file, sep="\t", low_memory=False)

if assay == "L1000":
    # Load Cell Painting pert id columns to merge target column.
    # A separate variable keeps `profile_file` pointing at the profiles actually loaded above.
    cp_profile_file = pathlib.Path(
        "Consensus",
        "cell_painting",
        "moa_sizes_consensus_datasets",
        "cell_painting_moa_analytical_set_profiles.tsv.gz",
    )
    cp_df = pd.read_csv(
        cp_profile_file, sep="\t", usecols=["pert_iname", "moa", "Metadata_target"]
    ).drop_duplicates()

    # Merge target info to L1000 data
    profile_df = profile_df.merge(cp_df, on=["pert_iname", "moa"], how="left")
    # Rows without a match in the left merge have a missing target; cast so every value is a string
    profile_df["Metadata_target"] = profile_df["Metadata_target"].astype(str)

print(profile_df.shape)
profile_df.head()

(5574, 1032)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_cell_id,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,Metadata_time_point,Metadata_moa,Metadata_target,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0,broad_id,pert_iname,moa
0,C-7161-01-LM6-001,A549,BRD-A26032986-050-02-1,C13,10.0,6,48H,calmodulin antagonist,CALM1,-0.975063,...,0.340203,0.116535,0.525492,1.27825,-0.51845,-0.326173,0.837018,BRD-A26032986,zaldaride,calmodulin antagonist
1,C-7161-01-LM6-001,A549,BRD-A26032986-050-02-1,C14,3.3333,5,48H,calmodulin antagonist,CALM1,0.303898,...,-0.003729,0.261119,0.681637,-0.515889,-0.482979,-1.00399,-0.651142,BRD-A26032986,zaldaride,calmodulin antagonist
2,C-7161-01-LM6-001,A549,BRD-A26032986-050-02-1,C15,1.1111,4,48H,calmodulin antagonist,CALM1,-0.405912,...,-0.522046,-0.350453,-0.616435,-0.461255,-1.12225,-0.665091,0.206879,BRD-A26032986,zaldaride,calmodulin antagonist
3,C-7161-01-LM6-001,A549,BRD-A26032986-050-02-1,C16,0.37037,3,48H,calmodulin antagonist,CALM1,-0.014589,...,-0.711311,0.097249,-0.516527,0.191278,-0.251272,-1.15633,-0.716931,BRD-A26032986,zaldaride,calmodulin antagonist
4,C-7161-01-LM6-001,A549,BRD-A26032986-050-02-1,C17,0.12346,2,48H,calmodulin antagonist,CALM1,-0.418292,...,-0.381192,0.129927,-0.908353,-1.13484,-1.03883,-0.375225,-0.347097,BRD-A26032986,zaldaride,calmodulin antagonist


In [3]:
# Separate the profile (measurement) columns from the metadata columns.
# Only the name of the dose column differs between the two assays.
dose_col = "Metadata_dose_recode" if assay == "cell_painting" else "dose"
meta_features = ["pert_iname", "moa", "Metadata_target", dose_col]

if assay == "cell_painting":
    cp_features = infer_cp_features(profile_df)
else:
    # For the other assay, feature columns are the ones whose name ends in "_at"
    cp_features = profile_df.columns[profile_df.columns.str.endswith("_at")].tolist()

In [4]:
# Compute pairwise correlations separately within each dose group; these feed
# the precision/recall calculations.
corr_dose_df = profile_df.groupby(dose_col).apply(
    lambda dose_df: calc_pairwise_corr(
        profile_df=dose_df, metadata_cols=meta_features, features=cp_features
    )
).reset_index(drop=True)

# Remove pairs whose two sides carry the same perturbation identifier(s)
id_cols = ["pert_iname"]
compare_cols = [f"{col}_compare" for col in id_cols]

# Rename the "*_compare" columns to the plain id names so both frames line up
compare_df = corr_dose_df[compare_cols].rename(columns=dict(zip(compare_cols, id_cols)))
is_replicate = corr_dose_df[id_cols].eq(compare_df).all(axis=1)

corr_dose_df = corr_dose_df[~is_replicate].reset_index(drop=True)

print(corr_dose_df.shape)
corr_dose_df.head()

  numpy.divide(r, std_x[:, numpy.newaxis], out=r)
  numpy.divide(r, std_y[numpy.newaxis, :], out=r)


(5172648, 10)


Unnamed: 0,pert_iname,moa,Metadata_target,Metadata_dose_recode,original_index,correlation,pert_iname_compare,moa_compare,Metadata_target_compare,Metadata_dose_recode_compare
0,zaldaride,calmodulin antagonist,CALM1,1,1,0.126103,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1
1,saquinavir,hiv protease inhibitor,CYP3A4,1,1,0.023967,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1
2,nicardipine,calcium channel blocker,ADORA3,1,1,0.11803,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1
3,pki-179,mtor inhibitor|pi3k inhibitor,MTOR,1,1,0.180723,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1
4,selegiline,monoamine oxidase inhibitor,MAOA|MAOB,1,1,-0.02611,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1


### Categorize comparisons

We need to create columns (`match_moa` and `match_target`) that capture, for each comparison, whether the two profiles share an MOA or a target.
We also need to make sure that no comparison pairs a compound with itself (for example, the same compound measured at a different dose).

In [5]:
# Note, this takes a couple minutes to complete
# Apply categorize_comparisons to every row (one comparison per row)
corr_match_df = corr_dose_df.apply(categorize_comparisons, axis="columns")

# Drop any columns with the same names that a previous run of this cell already
# appended, so re-running the cell does not create duplicate column labels.
corr_dose_df = pd.concat(
    [
        corr_dose_df.drop(columns=corr_match_df.columns, errors="ignore"),
        corr_match_df,
    ],
    axis="columns",
)

print(corr_dose_df.shape)
corr_dose_df.head(10)

(5172648, 12)


Unnamed: 0,pert_iname,moa,Metadata_target,Metadata_dose_recode,original_index,correlation,pert_iname_compare,moa_compare,Metadata_target_compare,Metadata_dose_recode_compare,match_moa,match_target
0,zaldaride,calmodulin antagonist,CALM1,1,1,0.126103,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
1,saquinavir,hiv protease inhibitor,CYP3A4,1,1,0.023967,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
2,nicardipine,calcium channel blocker,ADORA3,1,1,0.11803,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
3,pki-179,mtor inhibitor|pi3k inhibitor,MTOR,1,1,0.180723,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
4,selegiline,monoamine oxidase inhibitor,MAOA|MAOB,1,1,-0.02611,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
5,inosine,neurotrophic agent,PARP1|PNP,1,1,-0.02296,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
6,acebutolol,adrenergic receptor antagonist,ADRB1,1,1,-0.148574,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
7,pca-4248,platelet activating factor receptor antagonist,PTAFR,1,1,0.017728,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
8,canrenoic-acid,mineralocorticoid receptor antagonist,NR3C2,1,1,-0.093515,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,False
9,17-hydroxyprogesterone-caproate,progesterone receptor agonist,AR|ESR1|ESR2|NR1H4|NR1I2|NR3C1|PGR,1,1,0.067429,clobetasol,glucocorticoid receptor agonist,NR3C1|PLA2G1B,1,False,True


In [6]:
# Output data
output_file = pathlib.Path("results", f"dose_corr_matching_moa_target_{assay}.tsv.gz")

# Create the output directory if it is missing so the write does not fail on a fresh checkout
output_file.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated; the ".gz" suffix makes pandas infer gzip compression
corr_dose_df.to_csv(output_file, sep="\t", index=False)