## Calculate null distribution from median Cell Painting scores with same sample size as L1000

Code modified from @adeboyeML

In [1]:
import os
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict
from pycytominer import feature_select
from statistics import median
import random
from scipy import stats
import pickle


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Use the stdlib `warnings` module directly: `np.warnings` was never a public
# NumPy API and is no longer available in current NumPy releases.
# VisibleDeprecationWarning lives in `np.exceptions` on newer NumPy, so fall
# back to that location when the top-level alias is missing.
warnings.filterwarnings(
    'ignore',
    category=(
        getattr(np, 'VisibleDeprecationWarning', None)
        or np.exceptions.VisibleDeprecationWarning
    ),
)

In [2]:
# Seed NumPy's global RNG. Note that the replicate sampling further down draws
# from the stdlib `random` module, which is seeded separately inside
# get_null_distribution_replicates().
np.random.seed(42)

In [3]:
# Load the table of common compounds and keep the unique compound names
common_file = pathlib.Path("..", "..", "..", "6.paper_figures", "data").joinpath(
    "significant_compounds_by_threshold_both_assays.tsv.gz"
)
common_df = pd.read_csv(common_file, sep="\t")

common_compounds = common_df["compound"].unique()
print(len(common_compounds))

1327


In [4]:
# Directory the level-4 replicate data is read from; the pickle and CSV outputs
# of this notebook are written to the same directory.
cp_level4_path = "cellpainting_lvl4_cpd_replicate_datasets"

In [5]:
# Replicate-level (level 4) profiles, one row per replicate well
df_level4 = pd.read_csv(
    os.path.join(cp_level4_path, 'cp_level4_cpd_replicates_nonspherized.csv.gz'),
    compression='gzip',
    low_memory=False,
)

print(df_level4.shape)
df_level4.head()

(51833, 756)


Unnamed: 0,Metadata_broad_sample,Metadata_pert_id,Metadata_Plate,Metadata_Well,Metadata_broad_id,Metadata_moa,Metadata_dose_recode,Cells_AreaShape_Area,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_Mito_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,broad_id,pert_iname,moa,replicate_name
0,DMSO,,SQ00015211,A01,,,0,1.3193,1.4653,1.154,...,-0.71713,2.0593,-1.7357,1.1804,-2.3096,-2.7904,DMSO,DMSO,Control vehicle,replicate_0
1,DMSO,,SQ00015211,A02,,,0,1.2554,0.20932,0.23914,...,-1.9266,1.178,-1.2801,1.2156,-1.165,-0.32779,DMSO,DMSO,Control vehicle,replicate_1
2,DMSO,,SQ00015211,A03,,,0,1.5332,0.60472,1.5611,...,-1.159,1.6743,-0.69133,1.0166,-1.3454,-0.8573,DMSO,DMSO,Control vehicle,replicate_2
3,DMSO,,SQ00015211,A04,,,0,1.193,-0.069775,0.71251,...,-0.38376,1.3799,-0.54346,0.73752,0.001202,1.5759,DMSO,DMSO,Control vehicle,replicate_3
4,DMSO,,SQ00015211,A05,,,0,-0.25553,-2.6282,-3.5061,...,0.36826,1.9934,0.37547,1.4357,-0.66247,-0.14288,DMSO,DMSO,Control vehicle,replicate_4


In [6]:
# Load the per-compound median scores and index them by compound name
df_cpd_med_scores = (
    pd.read_csv(os.path.join(cp_level4_path, 'cpd_replicate_median_scores_nonspherized.csv'))
    .set_index('cpd')
    .rename_axis(None, axis=0)
)

# Subset to common compound measurements
df_cpd_med_scores = (
    df_cpd_med_scores
    .loc[df_cpd_med_scores.index.isin(common_compounds), :]
    .rename(columns={"cpd_size": "no_of_replicates"})
)

print(df_cpd_med_scores.shape)
df_cpd_med_scores.head()

(1258, 7)


Unnamed: 0,dose_1,dose_2,dose_3,dose_4,dose_5,dose_6,no_of_replicates
17-hydroxyprogesterone-caproate,0.420117,0.174304,0.002633,0.005545,-0.001344,0.087531,5
2-iminobiotin,0.006743,0.006157,0.001829,0.02953,-0.001344,0.001358,5
3-amino-benzamide,0.002071,0.216798,0.008649,0.001617,0.000475,0.272364,5
3-deazaadenosine,-0.001438,-0.001403,0.000457,0.000966,-0.001108,0.006632,5
abacavir,0.021846,2e-06,0.033281,0.001342,0.001542,0.012932,5


In [7]:
def get_cpds_replicates(df, df_lvl4):
    """
    Collect the replicate names of every compound at every dose.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the compound names to look up.
    df_lvl4 : pandas.DataFrame
        Replicate-level frame with 'Metadata_dose_recode', 'pert_iname'
        and 'replicate_name' columns.

    Returns
    -------
    replicates_in_all : list of list
        One list per dose (ascending dose order) holding the replicate names
        of all compounds in ``df.index`` at that dose.
    cpds_replicates : dict
        Maps each compound to a list with one entry per dose; each entry is
        the list of that compound's replicate names at that dose (empty when
        the compound has no replicate at the dose).
    """
    # Sort explicitly instead of relying on set iteration order, which is not
    # guaranteed. The lowest dose value is skipped and at most six doses kept.
    dose_list = sorted(set(df_lvl4['Metadata_dose_recode'].unique().tolist()))[1:7]
    replicates_in_all = []
    cpds_replicates = {}
    for dose in dose_list:
        df_doses = df_lvl4[df_lvl4['Metadata_dose_recode'] == dose]
        # Group replicate names by compound once per dose rather than
        # re-filtering the whole dose frame for every compound.
        names_by_cpd = {}
        for cpd_name, rep_name in zip(df_doses['pert_iname'].tolist(),
                                      df_doses['replicate_name'].tolist()):
            names_by_cpd.setdefault(cpd_name, []).append(rep_name)

        rep_list = []
        for cpd in df.index:
            replicate_names = list(names_by_cpd.get(cpd, []))
            rep_list += replicate_names
            cpds_replicates.setdefault(cpd, []).append(replicate_names)
        replicates_in_all.append(rep_list)

    return replicates_in_all, cpds_replicates

In [8]:
# replicates_in_all: one list of replicate names per dose;
# cpds_replicates: compound -> per-dose lists of its replicate names
replicates_in_all, cpds_replicates = get_cpds_replicates(df_cpd_med_scores, df_level4)

In [9]:
def get_replicates_classes_per_dose(df, df_lvl4, cpds_replicates):
    """
    Gather the replicate names of each no_of_replicates class for every dose.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by compound, with a 'no_of_replicates' column. A
        'replicate_id' column holding each compound's per-dose replicate
        lists is added to this frame in place.
    df_lvl4 : pandas.DataFrame
        Replicate-level frame; only 'Metadata_dose_recode' is read here.
    cpds_replicates : dict
        compound -> list of per-dose replicate-name lists. It must contain
        every compound in ``df.index`` (KeyError otherwise).

    Returns
    -------
    dict
        no_of_replicates class -> list with one entry per dose, each entry
        being the concatenated replicate names of all compounds in the class.

    Notes
    -----
    Per-dose lists are indexed with ``dose - 1``, so doses are expected to be
    numbered consecutively starting at 1.
    """
    # Align by compound name instead of trusting that the dict's insertion
    # order matches the row order of `df`.
    replicate_ids = [cpds_replicates[cpd] for cpd in df.index]
    # In-place column assignment kept so callers relying on it still work.
    df['replicate_id'] = replicate_ids

    # Sort explicitly; set iteration order is not guaranteed.
    dose_list = sorted(set(df_lvl4['Metadata_dose_recode'].unique().tolist()))[1:7]

    replicate_class_dict = {}
    for size in df['no_of_replicates'].unique():
        # Select the compounds of this class once, not once per inner iteration.
        in_class = (df['no_of_replicates'] == size).tolist()
        class_ids = [ids for ids, keep in zip(replicate_ids, in_class) if keep]
        for dose in dose_list:
            rep_lists = []
            for ids in class_ids:
                rep_lists += ids[dose - 1]
            replicate_class_dict.setdefault(size, []).append(rep_lists)

    return replicate_class_dict

In [10]:
# Group replicate names by replicate-count class and dose.
# Side effect: this call also adds a 'replicate_id' column to df_cpd_med_scores.
cpd_replicate_class_dict = get_replicates_classes_per_dose(df_cpd_med_scores, df_level4, cpds_replicates)

In [11]:
# Distinct no_of_replicates classes present among the compounds
cpd_replicate_class_dict.keys()

dict_keys([5, 4, 2, 10])

In [12]:
def check_similar_replicates(replicates, dose, cpd_dict):
    """
    Return True when at least two entries of ``replicates`` belong to the
    same compound at ``dose`` (looked up as ``cpd_dict[cpd][dose-1]``),
    otherwise False.
    """
    # Fewer than two replicates can never form a pair
    if len(replicates) < 2:
        return False

    for per_dose_reps in cpd_dict.values():
        dose_reps = per_dose_reps[dose-1]
        hits = 0
        for rep in replicates:
            if rep in dose_reps:
                hits += 1
                if hits == 2:
                    return True
    return False

In [13]:
def get_random_replicates(all_replicates, no_of_replicates, dose, replicates_ids, cpd_replicate_dict):
    """
    Draw ``no_of_replicates`` random replicates from ``all_replicates``,
    redrawing until the draw is accepted.

    A draw is rejected only when BOTH of the following hold:
      * at least one sampled replicate is in ``replicates_ids``
        (the replicate list of the current no_of_replicates class), and
      * at least two sampled replicates belong to the same compound at
        ``dose`` (see check_similar_replicates).

    NOTE(review): the original description ("not of the same compounds or
    found in the current cpd's size list") reads as if a draw should be
    rejected when EITHER condition holds. The implemented rule is kept
    unchanged here because switching it would alter the sampled null
    distribution -- confirm which rule is intended.

    Returns the accepted list of replicate names. ``random.sample`` raises
    ValueError when ``no_of_replicates`` exceeds ``len(all_replicates)``.
    """
    while True:
        random_replicates = random.sample(all_replicates, no_of_replicates)
        overlaps_class = any(rep in replicates_ids for rep in random_replicates)
        # `and` short-circuits, so the costly same-compound check only runs
        # when the draw actually overlaps the class list (the bitwise `&`
        # used before evaluated it on every draw). Accepted draws are the same.
        if not (overlaps_class
                and check_similar_replicates(random_replicates, dose, cpd_replicate_dict)):
            return random_replicates

In [14]:
def get_null_distribution_replicates(
    cpd_replicate_class_dict,
    dose_list,
    replicates_lists,
    cpd_replicate_dict,
    rand_num = 1000
):
    
    """
    Build the random replicate combinations used for the null distribution.

    For every dose in ``dose_list`` and every no_of_replicates class in
    ``cpd_replicate_class_dict``, ``rand_num`` distinct random replicate
    combinations are drawn with get_random_replicates.

    Returns a dictionary keyed by no_of_replicates class; each value is a
    list with one entry per dose, and each entry is the list of ``rand_num``
    sampled combinations for that dose.
    """
    # Fixed seed so that the sampled combinations are reproducible
    random.seed(1903)
    null_distribution_reps = {}
    for dose in dose_list:
        for replicate_class, class_ids_per_dose in cpd_replicate_class_dict.items():
            replicates_ids = class_ids_per_dose[dose-1]
            replicate_list = []
            # Keep drawing until rand_num distinct combinations are collected
            while len(replicate_list) < rand_num:
                candidate = get_random_replicates(replicates_lists[dose-1], replicate_class, dose,
                                                  replicates_ids, cpd_replicate_dict)
                if candidate not in replicate_list:
                    replicate_list.append(candidate)
            null_distribution_reps.setdefault(replicate_class, []).append(replicate_list)
    
    return null_distribution_reps

In [15]:
# Number of compounds that have replicate lists
len(cpds_replicates.keys())

1258

In [16]:
# Sort explicitly instead of relying on set iteration order (not guaranteed);
# the lowest dose value is skipped and at most six doses are kept.
dose_list = sorted(set(df_level4['Metadata_dose_recode'].unique().tolist()))[1:7]

null_distribution_replicates = get_null_distribution_replicates(
    cpd_replicate_class_dict, dose_list, replicates_in_all, cpds_replicates
)

In [17]:
def save_to_pickle(null_distribution, path, file_name):
    """
    Pickle ``null_distribution`` to ``<path>/<file_name>``.

    The target directory, including any missing parent directories, is
    created when it does not exist yet. An existing file of the same name is
    overwritten.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the
    # check-then-create race of `os.path.exists` followed by `os.mkdir`.
    os.makedirs(path, exist_ok=True)

    with open(os.path.join(path, file_name), 'wb') as handle:
        pickle.dump(null_distribution, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Save the sampled null-distribution replicate combinations to a pickle file
save_to_pickle(null_distribution_replicates, cp_level4_path, 'null_distribution_nonspherized.pickle')

In [19]:
# Load the null-distribution replicate combinations back from the pickle
# file written by the previous cell
with open(os.path.join(cp_level4_path, 'null_distribution_nonspherized.pickle'), 'rb') as handle:
    null_distribution_replicates = pickle.load(handle)

In [20]:
def assert_null_distribution(null_distribution_reps, dose_list):
    
    """
    Find duplicated replicate combinations in the null distribution.

    For every dose in ``dose_list`` and every no_of_replicates class, the
    list of random replicate combinations
    (``null_distribution_reps[class][dose-1]``) is scanned for combinations
    that occur more than once.

    Despite its name this function does not raise: it returns a dictionary
    keyed by no_of_replicates class. Each value holds one list per dose in
    which duplicates were found, containing every occurrence of a duplicated
    combination. An empty dictionary means all combinations are distinct.
    """
    duplicates_reps = {}
    for dose in dose_list:
        for keys in null_distribution_reps:
            null_dist = null_distribution_reps[keys][dose-1]
            # Collect duplicates across the WHOLE list. The accumulator used
            # to be reset for each combination, so only the last combination
            # was ever reported (and an empty list raised NameError).
            dup_reps = [reps for reps in null_dist if null_dist.count(reps) > 1]
            if dup_reps:
                duplicates_reps.setdefault(keys, []).append(dup_reps)
    return duplicates_reps

In [21]:
# Collect duplicated replicate combinations per class (empty dict = none found)
duplicate_replicates = assert_null_distribution(null_distribution_replicates, dose_list)

In [22]:
# Display the result of the duplicate check
duplicate_replicates ##no duplicates

{}

In [23]:
def calc_null_dist_median_scores(df, dose_num, replicate_lists):
    """
    Median pairwise Spearman correlation of each replicate combination.

    The rows of ``df`` at dose ``dose_num`` are indexed by 'replicate_name'
    and the listed metadata columns are dropped; the remaining columns are
    cast to float64. For every list of replicate names in
    ``replicate_lists`` the pairwise Spearman correlations between the
    selected rows are computed and their median (upper triangle, diagonal
    excluded) is recorded.

    Returns a list with one median correlation per replicate combination.
    """
    metadata_cols = [
        'Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_dose_recode',
        'Metadata_Plate', 'Metadata_Well', 'Metadata_broad_id', 'Metadata_moa',
        'broad_id', 'pert_iname', 'moa',
    ]
    at_dose = df['Metadata_dose_recode'] == dose_num
    features = (
        df[at_dose]
        .set_index('replicate_name')
        .rename_axis(None, axis=0)
        .drop(columns=metadata_cols)
    )

    median_corr_list = []
    for rep_list in replicate_lists:
        # Transpose so that the correlation is computed between replicates
        corr = features.loc[rep_list].astype('float64').T.corr(method='spearman').values
        pairwise = corr[np.triu_indices(len(corr), k=1)]
        median_corr_list.append(median(list(pairwise)))
    return median_corr_list

In [24]:
def get_null_dist_median_scores(null_distribution_cpds, dose_list, df):
    """ 
    Compute the median correlation scores of all random replicate
    combinations, per no_of_replicate class and per dose.

    Returns a dictionary keyed by no_of_replicate class; each value is a
    list with one entry per dose in ``dose_list``, holding the median scores
    produced by calc_null_dist_median_scores for that dose's combinations
    (``null_distribution_cpds[class][dose-1]``).
    """
    return {
        key: [
            calc_null_dist_median_scores(df, dose, combos_per_dose[dose-1])
            for dose in dose_list
        ]
        for key, combos_per_dose in null_distribution_cpds.items()
    }

In [25]:
# Median correlation of every random replicate combination, per class and dose
null_distribution_medians = get_null_dist_median_scores(null_distribution_replicates, dose_list, df_level4)

In [26]:
def compute_dose_median_scores(null_dist_medians, dose_list):
    """
    Pool the null-distribution median scores of all classes per dose.

    Parameters
    ----------
    null_dist_medians : dict
        no_of_replicates class -> list with one list of median scores per
        dose (indexed with ``dose - 1``).
    dose_list : iterable of int
        Doses to collect.

    Returns
    -------
    dict
        dose -> list of the median scores of every class at that dose,
        concatenated in the iteration order of ``null_dist_medians``.
    """
    median_scores_per_dose = {}
    for dose in dose_list:
        median_list = []
        # Read from the argument; the body previously referenced the global
        # `null_distribution_medians` and ignored what the caller passed in.
        for per_dose_medians in null_dist_medians.values():
            median_list += per_dose_medians[dose-1]
        median_scores_per_dose[dose] = median_list
    return median_scores_per_dose

In [27]:
# Pool the null median scores of all replicate classes for each dose
dose_null_medians = compute_dose_median_scores(null_distribution_medians, dose_list)

In [28]:
# Save the per-dose null-distribution median scores to a pickle file
save_to_pickle(dose_null_medians, cp_level4_path, 'null_dist_medians_per_dose_nonspherized.pickle')

In [29]:
def get_p_value(median_scores_list, df, dose_name, cpd_name):
    """
    Empirical p-value of a compound's median score against the null scores.

    Parameters
    ----------
    median_scores_list : sequence of float
        Null-distribution median scores to compare against.
    df : pandas.DataFrame
        Frame indexed by compound holding the observed scores.
    dose_name : str
        Column of ``df`` to read the observed score from.
    cpd_name : str
        Row label (compound) of ``df`` to read the observed score from.

    Returns
    -------
    float
        Fraction of null scores that are greater than or equal to the
        observed score ``df.loc[cpd_name, dose_name]``.
    """
    actual_med = df.loc[cpd_name, dose_name]
    # Convert explicitly: comparing a plain list with a scalar is only
    # element-wise when the scalar happens to be a NumPy type.
    null_scores = np.asarray(median_scores_list)
    p_value = np.sum(null_scores >= actual_med) / len(null_scores)
    return p_value

In [30]:
def get_moa_p_vals(null_dist_median, dose_list, df_med_values):
    """
    Compute the p-values of every compound for each dose in ``dose_list``.

    Compounds are processed per no_of_replicates class so that each one is
    compared against the null median scores of its own class
    (``null_dist_median[class][dose-1]``); the observed score is read from
    the 'dose_<n>' column of ``df_med_values``.

    Returns a dict, sorted by compound name, that maps each compound to the
    list of its p-values (one per dose, in ``dose_list`` order).
    """
    null_p_vals = {}
    for key, medians_per_dose in null_dist_median.items():
        df_replicate_class = df_med_values[df_med_values['no_of_replicates'] == key]
        for cpd in df_replicate_class.index:
            null_p_vals[cpd] = [
                get_p_value(medians_per_dose[num-1], df_replicate_class, 'dose_' + str(num), cpd)
                for num in dose_list
            ]
    return {cpd: null_p_vals[cpd] for cpd in sorted(null_p_vals)}

In [31]:
# compound -> list of p-values, one per dose in dose_list
null_p_vals = get_moa_p_vals(null_distribution_medians, dose_list, df_cpd_med_scores)

In [32]:
# One row per compound, one 'dose_<n>' column per dose
df_null_p_vals = pd.DataFrame.from_dict(null_p_vals, orient='index', columns = ['dose_' + str(x) for x in dose_list])

In [33]:
# Attach each compound's replicate count (pandas aligns the values on the compound index)
df_null_p_vals['no_of_replicates'] = df_cpd_med_scores['no_of_replicates']

In [34]:
# Preview the first ten compounds
df_null_p_vals.head(10)

Unnamed: 0,dose_1,dose_2,dose_3,dose_4,dose_5,dose_6,no_of_replicates
17-hydroxyprogesterone-caproate,0.002,0.101,0.528,0.537,0.496,0.393,5
2-iminobiotin,0.526,0.559,0.533,0.429,0.496,0.677,5
3-amino-benzamide,0.546,0.054,0.5,0.551,0.488,0.077,5
3-deazaadenosine,0.554,0.584,0.542,0.553,0.496,0.663,5
abacavir,0.472,0.581,0.407,0.551,0.483,0.639,5
abiraterone-acetate,0.062,0.0,0.544,0.536,0.481,0.0,5
abt-202,0.552,0.576,0.546,0.544,0.545,0.675,5
abt-239,0.256,0.0,0.0,0.553,0.496,0.679,5
abt-724,0.55,0.576,0.376,0.562,0.488,0.677,5
acarbose,0.386,0.34,0.273,0.501,0.489,0.58,4


In [35]:
def save_to_csv(df, path, file_name):
    """
    Write ``df`` to ``<path>/<file_name>`` as CSV without the index.

    The target directory, including any missing parent directories, is
    created when it does not exist yet.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the
    # check-then-create race of `os.path.exists` followed by `os.mkdir`.
    os.makedirs(path, exist_ok=True)

    df.to_csv(os.path.join(path, file_name), index = False)

In [36]:
# Move the compound names from the index into a 'cpd' column before saving,
# because save_to_csv writes the frame without its index
save_to_csv(df_null_p_vals.reset_index().rename({'index':'cpd'}, axis = 1), cp_level4_path, 
            'cpd_replicate_p_values_nonspherized.csv')

In [37]:
cpd_summary_file = pathlib.Path(cp_level4_path, 'cpd_replicate_p_values_melted_nonspherized.csv')

# Dose column name -> concentration label used in the summary table
dose_recode_info = {
    'dose_1': '0.04 uM', 'dose_2':'0.12 uM', 'dose_3':'0.37 uM',
    'dose_4': '1.11 uM', 'dose_5':'3.33 uM', 'dose_6':'10 uM'
}

# Long format: one row per compound and dose, holding the p value
cpd_score_summary_pval_df = df_null_p_vals.reset_index().rename(columns={"index": "compound"})
cpd_score_summary_pval_df = cpd_score_summary_pval_df.melt(
    id_vars=["compound", "no_of_replicates"],
    value_vars=["dose_1", "dose_2", "dose_3", "dose_4", "dose_5", "dose_6"],
    var_name="dose",
    value_name="p_value",
)
cpd_score_summary_pval_df["dose"] = cpd_score_summary_pval_df["dose"].replace(dose_recode_info)

# Long format: one row per compound and dose, holding the median matching score
cpd_score_summary_df = df_cpd_med_scores.reset_index().rename(columns={"index": "compound"})
cpd_score_summary_df = cpd_score_summary_df.melt(
    id_vars=["compound", "no_of_replicates"],
    value_vars=["dose_1", "dose_2", "dose_3", "dose_4", "dose_5", "dose_6"],
    var_name="dose",
    value_name="matching_score",
)
cpd_score_summary_df["dose"] = cpd_score_summary_df["dose"].replace(dose_recode_info)

# Combine p values and matching scores and tag the rows with their origin
summary_df = cpd_score_summary_pval_df.merge(
    cpd_score_summary_df,
    on=["compound", "no_of_replicates", "dose"],
    how="inner",
).assign(
    assay="Cell Painting",
    normalization="non_spherized",
    category="all_data",
)

# NOTE: the file is written tab-separated even though its name ends in .csv
summary_df.to_csv(cpd_summary_file, sep="\t", index=False)

print(summary_df.shape)
summary_df.head()

(7548, 8)


Unnamed: 0,compound,no_of_replicates,dose,p_value,matching_score,assay,normalization,category
0,17-hydroxyprogesterone-caproate,5,0.04 uM,0.002,0.420117,Cell Painting,non_spherized,all_data
1,2-iminobiotin,5,0.04 uM,0.526,0.006743,Cell Painting,non_spherized,all_data
2,3-amino-benzamide,5,0.04 uM,0.546,0.002071,Cell Painting,non_spherized,all_data
3,3-deazaadenosine,5,0.04 uM,0.554,-0.001438,Cell Painting,non_spherized,all_data
4,abacavir,5,0.04 uM,0.472,0.021846,Cell Painting,non_spherized,all_data
