## Calculate Precision and Recall of profile clusters

Given pairwise correlations between profiles, can we retrieve profiles of compounds that share a mechanism of action (MOA) or target?

### Part 1 - Calculate pairwise correlations and identify common targets and MOAs

In [1]:
import pathlib
import pandas as pd
from sklearn.metrics import precision_score, recall_score

from pycytominer.cyto_utils import infer_cp_features

from scripts.precision_recall_utils import calc_pairwise_corr, categorize_comparisons

In [2]:
# Assay to analyze: either "L1000" or "cell_painting"
assay = "L1000"

# Which half of the comparison table this run processes ("part1" or "part2").
# The L1000 table was too large to fit into memory and is therefore handled in
# two sections; Cell Painting worked in one part.
partition = "part2"

In [3]:
# Read the analytical-set consensus profiles of the selected assay
profile_dir = pathlib.Path("Consensus", assay, "moa_sizes_consensus_datasets")

profile_file = profile_dir / (
    "cell_painting_moa_analytical_set_profiles.tsv.gz"
    if assay == "cell_painting"
    else "l1000_moa_analytical_set_profiles.tsv.gz"
)

profile_df = pd.read_csv(profile_file, sep="\t", low_memory=False)

if assay == "L1000":
    # The target annotation is taken from the Cell Painting file: read only the
    # compound name, MOA and target columns and keep one row per combination
    profile_file = pathlib.Path(
        "Consensus",
        "cell_painting",
        "moa_sizes_consensus_datasets",
        "cell_painting_moa_analytical_set_profiles.tsv.gz",
    )
    cp_df = pd.read_csv(
        profile_file,
        sep="\t",
        usecols=["pert_iname", "moa", "Metadata_target"],
    ).drop_duplicates()

    # Left merge keeps every L1000 profile, including those without a match
    profile_df = profile_df.merge(cp_df, on=["pert_iname", "moa"], how="left")
    # Targets are stored as strings (a missing target becomes the text "nan")
    profile_df["Metadata_target"] = profile_df["Metadata_target"].astype(str)

print(profile_df.shape)
profile_df.head()

(5736, 981)


Unnamed: 0,sig_id,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,...,218529_at,211071_s_at,203341_at,205379_at,pert_id,pert_idose,dose,pert_iname,moa,Metadata_target
0,REP.A001_A549_24H:A07,-0.061635,0.408537,0.824534,0.536392,-0.566594,-0.308054,0.189936,0.184868,-0.068203,...,-0.24764,0.416466,-0.676134,-2.665621,BRD-K25114078,10 uM,6,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3
1,REP.A001_A549_24H:A08,-0.506381,0.030745,-0.787902,0.187344,0.039911,-0.547436,0.416978,-0.994681,0.740328,...,0.143608,0.318085,0.363956,-0.592373,BRD-K25114078,3.33 uM,5,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3
2,REP.A001_A549_24H:A09,-0.509867,0.0415,0.263433,-0.613,0.4546,0.286267,1.1984,-0.469433,0.5137,...,-1.577267,-2.051033,0.038333,-0.911967,BRD-K25114078,1.11 uM,4,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3
3,REP.A001_A549_24H:A10,-0.296316,-0.42135,-0.588798,0.15658,0.953095,-0.90707,-0.457481,-0.051327,-0.141768,...,0.150566,0.615505,0.119067,-0.322888,BRD-K25114078,0.37 uM,3,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3
4,REP.A001_A549_24H:A11,-0.48779,0.080023,0.082183,-0.539648,0.423894,-0.206426,0.177557,-0.256125,0.21015,...,-0.469356,-0.396239,0.325295,-1.037028,BRD-K25114078,0.12 uM,2,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3


In [4]:
# Split the columns into measured features and the metadata used for matching
if assay != "cell_painting":
    # L1000: feature columns are the ones whose name ends with "_at"
    cp_features = profile_df.columns[profile_df.columns.str.endswith("_at")].tolist()
    dose_col = "dose"
else:
    # Cell Painting: let pycytominer pick out the feature columns
    cp_features = infer_cp_features(profile_df)
    dose_col = "Metadata_dose_recode"

# Metadata columns are shared between assays except for the dose column name
meta_features = ["pert_iname", "moa", "Metadata_target", dose_col]

In [5]:
# Pairwise profile correlations are the input to the precision/recall analysis
all_corr_df = calc_pairwise_corr(
    profile_df=profile_df,
    metadata_cols=meta_features,
    features=cp_features,
).reset_index(drop=True)

# Remove pairs in which both sides are the same perturbation, i.e. one
# compound compared against itself at another dose
id_cols = ["pert_iname"]

# Bring the "*_compare" columns under the same names as their counterparts so
# the two sides can be compared element-wise
compare_df = all_corr_df[[f"{col}_compare" for col in id_cols]].rename(
    columns={f"{col}_compare": col for col in id_cols}
)
is_replicate = all_corr_df[id_cols].eq(compare_df).all(axis="columns")

all_corr_df = all_corr_df[~is_replicate].reset_index(drop=True)

print(all_corr_df.shape)
all_corr_df.head()

  numpy.divide(r, std_x[:, numpy.newaxis], out=r)
  numpy.divide(r, std_y[numpy.newaxis, :], out=r)


(32867208, 10)


Unnamed: 0,pert_iname,moa,Metadata_target,dose,original_index,correlation,pert_iname_compare,moa_compare,Metadata_target_compare,dose_compare
0,batimastat,matrix metalloprotease inhibitor,ADAM28|ADAMTS5|MMP12|MMP16|MMP2|MMP8,6,1,0.139786,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3,5
1,batimastat,matrix metalloprotease inhibitor,ADAM28|ADAMTS5|MMP12|MMP16|MMP2|MMP8,5,1,0.138883,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3,5
2,batimastat,matrix metalloprotease inhibitor,ADAM28|ADAMTS5|MMP12|MMP16|MMP2|MMP8,4,1,0.27956,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3,5
3,batimastat,matrix metalloprotease inhibitor,ADAM28|ADAMTS5|MMP12|MMP16|MMP2|MMP8,2,1,0.222865,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3,5
4,batimastat,matrix metalloprotease inhibitor,ADAM28|ADAMTS5|MMP12|MMP16|MMP2|MMP8,1,1,0.141339,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3,5


### Categorize comparisons

We need to create columns (`match_moa` and `match_target`) that capture, for each pair of profiles, whether the MOAs/targets are the same or different.
We also need to make sure that no comparison is between the same compound at different doses.

In [6]:
# Note, this takes several minutes to complete
# Only the half of the comparison table selected by `partition` is processed.
# The split point is derived from the table itself instead of a hard-coded row
# count, so both halves stay complete and non-overlapping whenever the number
# of comparisons changes (e.g. for a different assay or input file).
n_comparisons = len(all_corr_df)
midpoint = n_comparisons // 2

if partition == "part1":
    partition_rows = slice(0, midpoint)
elif partition == "part2":
    partition_rows = slice(midpoint, n_comparisons)
else:
    # Fail loudly rather than leaving `all_corr_match_df` undefined or stale
    raise ValueError(f'partition must be "part1" or "part2", got {partition!r}')

partition_corr_df = all_corr_df.iloc[partition_rows, :]

# categorize_comparisons is applied to every row; its result columns are
# appended to the right of the correlation columns
all_corr_match_df = pd.concat(
    [
        partition_corr_df,
        partition_corr_df.apply(categorize_comparisons, axis="columns"),
    ],
    axis="columns",
)

print(all_corr_match_df.shape)
all_corr_match_df.head(10)

(16433604, 12)


Unnamed: 0,pert_iname,moa,Metadata_target,dose,original_index,correlation,pert_iname_compare,moa_compare,Metadata_target_compare,dose_compare,match_moa,match_target
16433604,tacrolimus,calcineurin inhibitor,FKBP1A,6,2868,-0.021903,meglitinide,potassium channel blocker,CCR2,5,False,False
16433605,tacrolimus,calcineurin inhibitor,FKBP1A,5,2868,-0.01792,meglitinide,potassium channel blocker,CCR2,5,False,False
16433606,tacrolimus,calcineurin inhibitor,FKBP1A,4,2868,-0.000732,meglitinide,potassium channel blocker,CCR2,5,False,False
16433607,tacrolimus,calcineurin inhibitor,FKBP1A,3,2868,0.007709,meglitinide,potassium channel blocker,CCR2,5,False,False
16433608,tacrolimus,calcineurin inhibitor,FKBP1A,2,2868,0.008832,meglitinide,potassium channel blocker,CCR2,5,False,False
16433609,tacrolimus,calcineurin inhibitor,FKBP1A,1,2868,-0.006466,meglitinide,potassium channel blocker,CCR2,5,False,False
16433610,pranlukast,leukotriene receptor antagonist,CYSLTR1|CYSLTR2|IL5|MUC2|NFKB1|RNASE3|TNF,6,2868,-0.013899,meglitinide,potassium channel blocker,CCR2,5,False,False
16433611,pranlukast,leukotriene receptor antagonist,CYSLTR1|CYSLTR2|IL5|MUC2|NFKB1|RNASE3|TNF,5,2868,-0.013104,meglitinide,potassium channel blocker,CCR2,5,False,False
16433612,pranlukast,leukotriene receptor antagonist,CYSLTR1|CYSLTR2|IL5|MUC2|NFKB1|RNASE3|TNF,4,2868,0.011878,meglitinide,potassium channel blocker,CCR2,5,False,False
16433613,pranlukast,leukotriene receptor antagonist,CYSLTR1|CYSLTR2|IL5|MUC2|NFKB1|RNASE3|TNF,3,2868,-0.003028,meglitinide,potassium channel blocker,CCR2,5,False,False


In [7]:
# Output data
output_file = pathlib.Path("results", f"dose_corr_matching_moa_target_{assay}_{partition}.tsv.gz")

# Create the output directory if it is missing so the write cannot fail after
# the long-running categorization step
output_file.parent.mkdir(parents=True, exist_ok=True)

# Written as a tab-separated file without the row index
all_corr_match_df.to_csv(output_file, sep="\t", index=False)