## Acquire pairwise Spearman correlations for gene targets

For both L1000 and Cell Painting

In [1]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer.cyto_utils import infer_cp_features

In [2]:
# Load Cell Painting data
cp_file = pathlib.Path(
    "Consensus",
    "cell_painting",
    "moa_sizes_consensus_datasets",
    "cell_painting_moa_analytical_set_profiles.tsv.gz"
)

cp_df = pd.read_csv(cp_file, sep="\t")

cp_features = infer_cp_features(cp_df)

print(cp_df.shape)
cp_df.head(2)

(5574, 1032)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_cell_id,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,Metadata_time_point,Metadata_moa,Metadata_target,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0,broad_id,pert_iname,moa
0,C-7161-01-LM6-001,A549,BRD-A26032986-050-02-1,C13,10.0,6,48H,calmodulin antagonist,CALM1,-0.975063,...,0.340203,0.116535,0.525492,1.27825,-0.51845,-0.326173,0.837018,BRD-A26032986,zaldaride,calmodulin antagonist
1,C-7161-01-LM6-001,A549,BRD-A26032986-050-02-1,C14,3.3333,5,48H,calmodulin antagonist,CALM1,0.303898,...,-0.003729,0.261119,0.681637,-0.515889,-0.482979,-1.00399,-0.651142,BRD-A26032986,zaldaride,calmodulin antagonist


In [3]:
# Match compounds that target the same genes
# Note the compound can also target _other_ genes as well
all_targets = {x: list(set(x.split("|"))) for x in cp_df.Metadata_target.unique().tolist()}

cp_target_comparisons = {}
for target in all_targets:
    target_set = set(all_targets[target])
    for compare_target in all_targets:
        if target == compare_target:
            next
        compare_target_set = set(all_targets[compare_target])
        
        if len(target_set.intersection(compare_target_set)) > 0:
            if target in cp_target_comparisons:
                cp_target_comparisons[target].append(compare_target)
            else:
                cp_target_comparisons[target] = [compare_target]

In [4]:
# Calculate median pairwise correlations for All doses
target_all_dose_cor_df = []
for target in cp_target_comparisons:
    cp_subset = cp_target_comparisons[target]
    
    cp_subset_df = (
        cp_df
        .query("Metadata_target in @cp_subset")
        .reset_index(drop=True)
        .loc[:, cp_features]
        .transpose()
        .astype(float)
        .corr(method="spearman")
    )

    np.fill_diagonal(cp_subset_df.values, np.nan)

    n_compounds = cp_subset_df.shape[0]

    target_median_score = (
        cp_subset_df
        .melt(value_name="pairwise_cor", ignore_index=False)
        .dropna()
        .pairwise_cor
        .median()
    )
    
    target_all_dose_cor_df.append([target, "All", target_median_score, n_compounds])
    
target_all_dose_cor_df = pd.DataFrame(target_all_dose_cor_df)

print(target_all_dose_cor_df.shape)
target_all_dose_cor_df.head()

(545, 4)


Unnamed: 0,0,1,2,3
0,CALM1,All,0.013568,30
1,NR3C1|PLA2G1B,All,0.305219,96
2,CYP3A4,All,0.004833,114
3,ADORA3,All,0.013479,54
4,MTOR,All,0.260616,60


In [5]:
# Calculate median pairwise correlations for each dose individually
target_dose_cor_df = []
for dose in cp_df.Metadata_dose_recode.unique():
    for target in cp_target_comparisons:
        cp_subset = cp_target_comparisons[target]

        cp_subset_df = (
            cp_df
            .query("Metadata_target in @cp_subset")
            .query("Metadata_dose_recode == @dose")
            .reset_index(drop=True)
            .loc[:, cp_features]
            .transpose()
            .astype(float)
            .corr(method="spearman")
        )

        np.fill_diagonal(cp_subset_df.values, np.nan)
        
        n_compounds = cp_subset_df.shape[0]

        target_median_score = (
            cp_subset_df
            .melt(value_name="pairwise_cor", ignore_index=False)
            .dropna()
            .pairwise_cor
            .median()
        )

        target_dose_cor_df.append([target, dose, target_median_score, n_compounds])

target_dose_cor_df = pd.DataFrame(target_dose_cor_df)

print(target_dose_cor_df.shape)
target_dose_cor_df.head()

(3270, 4)


Unnamed: 0,0,1,2,3
0,CALM1,6,0.314849,5
1,NR3C1|PLA2G1B,6,0.314087,16
2,CYP3A4,6,0.020401,19
3,ADORA3,6,0.015786,9
4,MTOR,6,0.304648,10


In [6]:
# Load L1000 data
l1000_file = pathlib.Path(
    "Consensus",
    "L1000",
    "moa_sizes_consensus_datasets",
    "l1000_moa_analytical_set_profiles.tsv.gz"
)

l1000_df = (
    pd.read_csv(l1000_file, sep="\t")
    .merge(
        cp_df.loc[:, ["pert_iname", "Metadata_target"]].drop_duplicates(),
        on=["pert_iname"],
        how="left"
    )
)

l1000_features = l1000_df.columns[l1000_df.columns.str.endswith("_at")]
                       
print(l1000_df.shape)
l1000_df.head(2)

(5736, 981)


Unnamed: 0,sig_id,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,...,218529_at,211071_s_at,203341_at,205379_at,pert_id,pert_idose,dose,pert_iname,moa,Metadata_target
0,REP.A001_A549_24H:A07,-0.061635,0.408537,0.824534,0.536392,-0.566594,-0.308054,0.189936,0.184868,-0.068203,...,-0.24764,0.416466,-0.676134,-2.665621,BRD-K25114078,10 uM,6,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3
1,REP.A001_A549_24H:A08,-0.506381,0.030745,-0.787902,0.187344,0.039911,-0.547436,0.416978,-0.994681,0.740328,...,0.143608,0.318085,0.363956,-0.592373,BRD-K25114078,3.33 uM,5,aminoguanidine,nitric oxide synthase inhibitor,AKR1B1|NOS2|TIMP3


In [7]:
# Match compounds that target the same genes
# Note the compound can also target _other_ genes as well
all_targets = {x: list(set(x.split("|"))) for x in l1000_df.Metadata_target.astype(str).unique().tolist()}

l1000_target_comparisons = {}
for target in all_targets:
    target_set = set(all_targets[target])
    for compare_target in all_targets:
        if target == compare_target:
            next
        compare_target_set = set(all_targets[compare_target])
        
        if len(target_set.intersection(compare_target_set)) > 0:
            if target in l1000_target_comparisons:
                l1000_target_comparisons[target].append(compare_target)
            else:
                l1000_target_comparisons[target] = [compare_target]

In [8]:
# Calculate median pairwise correlations for All doses
target_all_dose_l1000_cor_df = []
for target in l1000_target_comparisons:
    l1000_subset = l1000_target_comparisons[target]
    
    l1000_subset_df = (
        l1000_df
        .query("Metadata_target in @l1000_subset")
        .reset_index(drop=True)
        .loc[:, l1000_features]
        .transpose()
        .astype(float)
        .corr(method="spearman")
    )

    np.fill_diagonal(l1000_subset_df.values, np.nan)

    n_compounds = l1000_subset_df.shape[0]

    target_median_score = (
        l1000_subset_df
        .melt(value_name="pairwise_cor", ignore_index=False)
        .dropna()
        .pairwise_cor
        .median()
    )
    
    target_all_dose_l1000_cor_df.append([target, "All", target_median_score, n_compounds])
    
target_all_dose_l1000_cor_df = pd.DataFrame(target_all_dose_l1000_cor_df)

print(target_all_dose_l1000_cor_df.shape)
target_all_dose_l1000_cor_df.head()

(546, 4)


Unnamed: 0,0,1,2,3
0,AKR1B1|NOS2|TIMP3,All,0.037543,36
1,ADAM28|ADAMTS5|MMP12|MMP16|MMP2|MMP8,All,0.03855,18
2,unknown,All,0.023692,894
3,PTGS1|PTGS2,All,0.027537,192
4,DRD2|HTR2A,All,0.029801,312


In [9]:
# Calculate median pairwise correlations for each dose individually
target_dose_l1000_cor_df = []
for dose in l1000_df.dose.unique():
    for target in l1000_target_comparisons:
        l1000_subset = l1000_target_comparisons[target]

        l1000_subset_df = (
            l1000_df
            .query("Metadata_target in @l1000_subset")
            .query("dose == @dose")
            .reset_index(drop=True)
            .loc[:, l1000_features]
            .transpose()
            .astype(float)
            .corr(method="spearman")
        )

        np.fill_diagonal(l1000_subset_df.values, np.nan)
 
        n_compounds = l1000_subset_df.shape[0]

        target_median_score = (
            l1000_subset_df
            .melt(value_name="pairwise_cor", ignore_index=False)
            .dropna()
            .pairwise_cor
            .median()
        )

        target_dose_l1000_cor_df.append([target, dose, target_median_score, n_compounds])

target_dose_l1000_cor_df = pd.DataFrame(target_dose_l1000_cor_df)

print(target_dose_l1000_cor_df.shape)
target_dose_l1000_cor_df.head()

(3276, 4)


Unnamed: 0,0,1,2,3
0,AKR1B1|NOS2|TIMP3,6,0.01583,6
1,ADAM28|ADAMTS5|MMP12|MMP16|MMP2|MMP8,6,0.103949,3
2,unknown,6,0.025958,149
3,PTGS1|PTGS2,6,0.02785,32
4,DRD2|HTR2A,6,0.033124,52


In [10]:
# Combine and output results
target_results_df = pd.concat(
    [
        pd.concat(
            [
                target_all_dose_cor_df,
                target_dose_cor_df
            ], axis="rows"
        ).assign(assay="Cell Painting"),
        pd.concat(
            [
                target_all_dose_l1000_cor_df,
                target_dose_l1000_cor_df
            ], axis="rows"
        ).assign(assay="L1000")
    ], axis="rows"
)

target_results_df.columns = ["target", "dose", "median_correlation", "n_compounds", "assay"]

target_results_df = (
    target_results_df
    .sort_values(by="median_correlation", ascending=False)
    .reset_index(drop=True)
)

output_file = pathlib.Path("results", "gene_target_median_pairwise_correlations.tsv.gz")
target_results_df.to_csv(output_file, sep="\t", index=False)

print(target_results_df.shape)
target_results_df.head()

(7637, 5)


Unnamed: 0,target,dose,median_correlation,n_compounds,assay
0,RPL3,All,0.823493,6,L1000
1,HSP90AA1,4,0.760377,2,L1000
2,HSP90AA1,5,0.738467,2,L1000
3,HSP90AA1,6,0.737793,2,L1000
4,RPL3,All,0.73523,6,Cell Painting
