### Calculating Null Distribution



Null distribution - is generated by getting the median correlation score of randomly combined replicates that do not come from the same compounds.



### The goal here: 

-- is to compute the **p-value** for each compound per dose by evaluating the probability of random combinations of replicates (from different compounds) having greater median correlation score than replicates that come from the same compound.




- In our case, we generated 1000 median correlation scores from randomly combined replicates as the **null distribution** for each no_of_replicates/replicate class per DOSE i.e. for a no_of_replicates class for every DOSE (1-6) - we have 1000 medians scores from randomly combined replicates of different compounds.





**no_of_replicate** is the number of replicates in a specific compound and **no_of_replicate class** is a specific group of compounds that have the same amount of replicates e.g all compounds with 5 replicates in them are in the same no_of_replicates class.

In [1]:
import os
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict
from pycytominer import feature_select
from statistics import median
import random
from scipy import stats
import pickle


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

In [2]:
# Load common compounds
common_file = pathlib.Path(
    "..", "..", "..", "6.paper_figures", "data", "significant_compounds_by_threshold_both_assays.tsv.gz"
)
common_df = pd.read_csv(common_file, sep="\t")

common_compounds = common_df.compound.unique()
print(len(common_compounds))

1327


### - Load in Level 4 Datasets generated from `calculate_median_scores_notebook`

In [3]:
cp_level4_path = "cellpainting_lvl4_cpd_replicate_datasets"

In [4]:
df_level4 = pd.read_csv(os.path.join(cp_level4_path, 'cp_level4_cpd_replicates.csv.gz'), 
                        compression='gzip',low_memory = False)

print(df_level4.shape)
df_level4.head()

(51833, 812)


Unnamed: 0,Metadata_broad_sample,Metadata_pert_id,Metadata_Plate,Metadata_Well,Metadata_broad_id,Metadata_moa,Metadata_dose_recode,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,...,Nuclei_Texture_SumVariance_Mito_5_0,Nuclei_Texture_SumVariance_RNA_20_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,broad_id,pert_iname,moa,replicate_name
0,DMSO,,SQ00015211,A01,,,0,2.366133,-0.032317,0.618638,...,-0.571713,0.757513,-0.63458,1.440005,-0.044817,0.264545,DMSO,DMSO,Control vehicle,replicate_0
1,DMSO,,SQ00015211,A02,,,0,0.376276,-1.135014,0.179721,...,-0.17745,-1.684778,-1.235524,1.958443,0.165417,-0.985233,DMSO,DMSO,Control vehicle,replicate_1
2,DMSO,,SQ00015211,A03,,,0,-0.175386,2.064517,-0.017523,...,-1.04828,1.192945,0.008292,0.194224,-1.680378,-0.697503,DMSO,DMSO,Control vehicle,replicate_2
3,DMSO,,SQ00015211,A04,,,0,-0.756906,0.715242,0.186433,...,0.868495,1.649616,-0.205897,0.205308,-0.927785,0.712298,DMSO,DMSO,Control vehicle,replicate_3
4,DMSO,,SQ00015211,A05,,,0,0.715957,-2.628349,0.839847,...,-2.069335,0.174793,0.756266,1.393177,-0.537319,0.420051,DMSO,DMSO,Control vehicle,replicate_4


In [5]:
df_cpd_med_scores = pd.read_csv(os.path.join(cp_level4_path, 'cpd_replicate_median_scores.csv'))
df_cpd_med_scores = df_cpd_med_scores.set_index('cpd').rename_axis(None, axis=0).copy()

# Subset to common compound measurements
df_cpd_med_scores = df_cpd_med_scores.loc[df_cpd_med_scores.index.isin(common_compounds), :]

print(df_cpd_med_scores.shape)
df_cpd_med_scores.head()

(1327, 7)


Unnamed: 0,dose_1,dose_2,dose_3,dose_4,dose_5,dose_6,no_of_replicates
17-hydroxyprogesterone-caproate,0.060655,0.035172,0.06198,0.078039,0.146046,0.47584,5
2-iminobiotin,0.0427,-0.002962,0.024638,0.001683,0.023984,0.032439,5
3-amino-benzamide,0.104273,0.062608,0.071169,0.053764,0.074714,0.011697,5
3-deazaadenosine,0.010217,0.045891,0.019658,0.011578,0.057315,0.119805,5
ABT-737,0.00447,0.076636,0.143538,0.3162,0.358355,0.668907,5


In [6]:
def get_cpds_replicates(df, df_lvl4):
    """
    This function returns all replicates id/names found in each compound 
    and in all doses(1-6)
    """
    
    dose_list = list(set(df_lvl4['Metadata_dose_recode'].unique().tolist()))[1:7]
    replicates_in_all = []
    cpds_replicates = {}
    for dose in dose_list:
        rep_list = []
        df_doses = df_lvl4[df_lvl4['Metadata_dose_recode'] == dose].copy()
        for cpd in df.index:
            replicate_names = df_doses[df_doses['pert_iname'] == cpd]['replicate_name'].values.tolist()
            rep_list += replicate_names
            if cpd not in cpds_replicates:
                cpds_replicates[cpd] = [replicate_names]
            else:
                cpds_replicates[cpd] += [replicate_names]
        replicates_in_all.append(rep_list)
        
    return replicates_in_all, cpds_replicates

In [7]:
replicates_in_all, cpds_replicates = get_cpds_replicates(df_cpd_med_scores, df_level4)

In [8]:
def get_replicates_classes_per_dose(df, df_lvl4, cpds_replicates):
    
    """
    This function gets all replicates ids for each distinct 
    no_of_replicates (i.e. number of replicates per cpd) class per dose (1-6)
    
    Returns replicate_class_dict dictionary, with no_of_replicate classes as the keys, 
    and all the replicate_ids for each no_of_replicate class as the values
    """
    
    df['replicate_id'] = list(cpds_replicates.values())
    dose_list = list(set(df_lvl4['Metadata_dose_recode'].unique().tolist()))[1:7]
    replicate_class_dict = {}
    for dose in dose_list:
        for size in df['no_of_replicates'].unique():
            rep_lists = []
            for idx in range(df[df['no_of_replicates'] == size].shape[0]):
                rep_ids = df[df['no_of_replicates'] == size]['replicate_id'].values.tolist()[idx][dose-1]
                rep_lists += rep_ids
            if size not in replicate_class_dict:
                replicate_class_dict[size] = [rep_lists]
            else:
                replicate_class_dict[size] += [rep_lists]
                
    return replicate_class_dict

In [9]:
cpd_replicate_class_dict = get_replicates_classes_per_dose(df_cpd_med_scores, df_level4, cpds_replicates)

In [10]:
cpd_replicate_class_dict.keys()

dict_keys([5, 2, 4, 10])

In [11]:
def check_similar_replicates(replicates, dose, cpd_dict):
    """This function checks if two replicates are of the same compounds"""
    
    for x in range(len(replicates)):
        for y in range(x+1, len(replicates)):
            for kys in cpd_dict:
                if all(i in cpd_dict[kys][dose-1] for i in [replicates[x], replicates[y]]):
                    return True
    return False

In [12]:
def get_random_replicates(all_replicates, no_of_replicates, dose, replicates_ids, cpd_replicate_dict):
    """
    This function return a list of random replicates that are not of the same compounds
    or found in the current cpd's size list
    """
    while (True):
        random_replicates = random.sample(all_replicates, no_of_replicates)
        if not (any(rep in replicates_ids for rep in random_replicates) & 
                (check_similar_replicates(random_replicates, dose, cpd_replicate_dict))):
            break
    return random_replicates

In [13]:
def get_null_distribution_replicates(
    cpd_replicate_class_dict,
    dose_list,
    replicates_lists,
    cpd_replicate_dict,
    rand_num = 1000
):
    
    """
    This function returns a null distribution dictionary, with no_of_replicates(replicate class) 
    as the keys and 1000 lists of randomly selected replicate combinations as the values
    for each no_of_replicates class per DOSE(1-6)
    """
    random.seed(1903)
    null_distribution_reps = {}
    for dose in dose_list:
        for replicate_class in cpd_replicate_class_dict:
            replicates_ids = cpd_replicate_class_dict[replicate_class][dose-1]
            replicate_list = []
            for idx in range(rand_num):
                start_again = True
                while (start_again):
                    rand_cpds = get_random_replicates(replicates_lists[dose-1], replicate_class, dose, 
                                                      replicates_ids, cpd_replicate_dict)
                    if rand_cpds not in replicate_list:
                        start_again = False
                replicate_list.append(rand_cpds)
            if replicate_class not in null_distribution_reps:
                null_distribution_reps[replicate_class] = [replicate_list]
            else:
                null_distribution_reps[replicate_class] += [replicate_list]
    
    return null_distribution_reps

In [14]:
len(cpds_replicates.keys())

1327

In [15]:
dose_list = list(set(df_level4['Metadata_dose_recode'].unique().tolist()))[1:7]

null_distribution_replicates = get_null_distribution_replicates(
    cpd_replicate_class_dict, dose_list, replicates_in_all, cpds_replicates
)

In [16]:
def save_to_pickle(null_distribution, path, file_name):
    """This function saves the null distribution replicates ids into a pickle file"""
    
    if not os.path.exists(path):
        os.mkdir(path)
        
    with open(os.path.join(path, file_name), 'wb') as handle:
        pickle.dump(null_distribution, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
#save the null_distribution_moa to pickle
save_to_pickle(null_distribution_replicates, cp_level4_path, 'null_distribution.pickle')

In [18]:
##load the null_distribution_moa from pickle
with open(os.path.join(cp_level4_path, 'null_distribution.pickle'), 'rb') as handle:
    null_distribution_replicates = pickle.load(handle)

In [19]:
def assert_null_distribution(null_distribution_reps, dose_list):
    
    """
    This function assert that each of the list in the 1000 lists of random replicate 
    combination (per dose) for each no_of_replicate class are distinct with no duplicates
    """
    duplicates_reps = {}
    for dose in dose_list:
        for keys in null_distribution_reps:
            null_dist = null_distribution_reps[keys][dose-1]
            for reps in null_dist:
                dup_reps = []
                new_list = list(filter(lambda x: x != reps, null_dist))
                if (len(new_list) != len(null_dist) - 1):
                    dup_reps.append(reps)
            if dup_reps:
                if keys not in duplicates_reps:
                    duplicates_reps[keys] = [dup_reps]
                else:
                    duplicates_reps[keys] += [dup_reps]
    return duplicates_reps

In [20]:
duplicate_replicates = assert_null_distribution(null_distribution_replicates, dose_list)

In [21]:
duplicate_replicates ##no duplicates

{}

In [22]:
def calc_null_dist_median_scores(df, dose_num, replicate_lists):
    """
    This function calculate the median of the correlation 
    values for each list in the 1000 lists of random replicate 
    combination for each no_of_replicate class per dose
    """
    df_dose = df[df['Metadata_dose_recode'] == dose_num].copy()
    df_dose = df_dose.set_index('replicate_name').rename_axis(None, axis=0)
    df_dose.drop(['Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_dose_recode', 
                  'Metadata_Plate', 'Metadata_Well', 'Metadata_broad_id', 'Metadata_moa', 
                  'broad_id', 'pert_iname', 'moa'], 
                 axis = 1, inplace = True)
    median_corr_list = []
    for rep_list in replicate_lists:
        df_reps = df_dose.loc[rep_list].copy()
        reps_corr = df_reps.astype('float64').T.corr(method = 'spearman').values
        median_corr_val = median(list(reps_corr[np.triu_indices(len(reps_corr), k = 1)]))
        median_corr_list.append(median_corr_val)
    return median_corr_list

In [23]:
def get_null_dist_median_scores(null_distribution_cpds, dose_list, df):
    """ 
    This function calculate the median correlation scores for all 
    1000 lists of randomly combined compounds for each no_of_replicate class 
    across all doses (1-6)
    """
    null_distribution_medians = {}
    for key in null_distribution_cpds:
        median_score_list = []
        for dose in dose_list:
            replicate_median_scores = calc_null_dist_median_scores(df, dose, null_distribution_cpds[key][dose-1])
            median_score_list.append(replicate_median_scores)
        null_distribution_medians[key] = median_score_list
    return null_distribution_medians

In [24]:
null_distribution_medians = get_null_dist_median_scores(null_distribution_replicates, dose_list, df_level4)

In [25]:
def compute_dose_median_scores(null_dist_medians, dose_list):
    """
    This function align median scores per dose, and return a dictionary, 
    with keys as dose numbers and values as all median null distribution/non-replicate correlation 
    scores for each dose
    """
    median_scores_per_dose = {}
    for dose in dose_list:
        median_list = []
        for keys in null_distribution_medians:
            dose_median_list = null_distribution_medians[keys][dose-1]
            median_list += dose_median_list
        median_scores_per_dose[dose] = median_list
    return median_scores_per_dose

In [26]:
dose_null_medians = compute_dose_median_scores(null_distribution_medians, dose_list)

In [27]:
#save the null_distribution_medians_per_dose to pickle
save_to_pickle(dose_null_medians, cp_level4_path, 'null_dist_medians_per_dose.pickle')

**A P value can be computed nonparametrically by evaluating the probability of random replicates of different compounds having median similarity value greater than replicates of the same compounds.**

In [28]:
def get_p_value(median_scores_list, df, dose_name, cpd_name):
    """
    This function calculate the p-value from the 
    null_distribution median scores for each compound
    """
    actual_med = df.loc[cpd_name, dose_name]
    p_value = np.sum(median_scores_list >= actual_med) / len(median_scores_list)
    return p_value

In [29]:
def get_moa_p_vals(null_dist_median, dose_list, df_med_values):
    """
    This function returns a dict, with compounds as the keys and the compound's 
    p-values for each dose (1-6) as the values
    """
    null_p_vals = {}
    for key in null_dist_median:
        df_replicate_class = df_med_values[df_med_values['no_of_replicates'] == key]
        for cpd in df_replicate_class.index:
            dose_p_values = []
            for num in dose_list:
                dose_name = 'dose_' + str(num)
                cpd_p_value = get_p_value(null_dist_median[key][num-1], df_replicate_class, dose_name, cpd)
                dose_p_values.append(cpd_p_value)
            null_p_vals[cpd] = dose_p_values
    sorted_null_p_vals = {key:value for key, value in sorted(null_p_vals.items(), key=lambda item: item[0])}
    return sorted_null_p_vals

In [30]:
null_p_vals = get_moa_p_vals(null_distribution_medians, dose_list, df_cpd_med_scores)

In [31]:
df_null_p_vals = pd.DataFrame.from_dict(null_p_vals, orient='index', columns = ['dose_' + str(x) for x in dose_list])

In [32]:
df_null_p_vals['no_of_replicates'] = df_cpd_med_scores['no_of_replicates']

In [33]:
df_null_p_vals.head(10)

Unnamed: 0,dose_1,dose_2,dose_3,dose_4,dose_5,dose_6,no_of_replicates
17-hydroxyprogesterone-caproate,0.002,0.03,0.0,0.0,0.0,0.0,5
2-iminobiotin,0.011,0.667,0.089,0.554,0.125,0.127,5
3-amino-benzamide,0.0,0.001,0.0,0.003,0.001,0.471,5
3-deazaadenosine,0.356,0.006,0.148,0.319,0.004,0.0,5
ABT-737,0.505,0.0,0.0,0.0,0.0,0.0,5
AICA-ribonucleotide,0.221,0.012,0.999,0.215,0.692,0.621,2
AKT-inhibitor-1-2,0.99,0.713,0.036,0.075,0.004,0.0,5
ALX-5407,0.56,0.237,0.013,0.12,0.0,0.009,5
AS-605240,0.002,0.0,0.0,0.0,0.0,0.0,5
AT-7519,0.0,0.0,0.0,0.0,0.0,0.0,5


In [34]:
def save_to_csv(df, path, file_name):
    """saves dataframes to csv"""
    
    if not os.path.exists(path):
        os.mkdir(path)
    
    df.to_csv(os.path.join(path, file_name), index = False)

In [35]:
save_to_csv(df_null_p_vals.reset_index().rename({'index':'cpd'}, axis = 1), cp_level4_path, 
            'cpd_replicate_p_values.csv')