### Calculating Null Distribution

Gregory Way, modified from code written by Adeniyi Adeboye

- The null distribution is generated by computing the median correlation score of randomly combined compounds that do not come from the same MOA.


### The goal here:

- The goal is to compute a p-value for each MOA (in this notebook across all doses, i.e. dose-independent) by evaluating the probability that random combinations of compounds (from different MOAs) have a median correlation score at least as high as that of compounds of the same MOA.
- In our case, we generate 1000 median correlation scores from randomly combined compounds as the **null distribution** for each MOA_SIZE class, ***i.e. for each moa_size class we have 1000 median scores from randomly combined compounds of different MOAs.***
- Moa_size is the number of compounds in a specific MOA, and a moa_size class is the group of MOAs that have the same number of compounds, ***e.g. all MOAs with just 2 compounds in them are in the same moa_size class.*** (In the code below, the class of each MOA is read from the `no_of_replicates` column.)


### Note:

To generate the null distribution for both the modz and the rank level-5 data, you will have to execute this notebook twice: once for each of them.

In [1]:
import os
import pathlib
import requests
import pickle
import argparse
import pandas as pd
import numpy as np
import re
from os import walk
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
import random
import shutil
from statistics import median

In [2]:
np.random.seed(42)
# The random compound lists in this notebook are drawn with the stdlib `random`
# module, which np.random.seed does not affect, so seed it as well.
random.seed(42)

#### - Load in the required datasets
- They were generated by the `L1000_moa_median_scores_calculation.notebook`

In [3]:
# All three inputs live in the same folder: the level-5 profiles, the per-MOA
# matching scores (tab-separated, gzipped) and the per-MOA compound lists.
data_dir = 'moa_sizes_consensus_datasets'
df_lvl5 = pd.read_csv(os.path.join(data_dir, 'modz_level5_data.csv'))
df_moa_vals = pd.read_csv(
    os.path.join(data_dir, 'matching_score_per_MOA_L1000_dose_independent.tsv.gz'),
    sep="\t",
)
df_moa_cpds = pd.read_csv(os.path.join(data_dir, 'L1000_moa_compounds.csv'))

In [4]:
# (rows, columns) of the level-5 profile table
df_lvl5.shape

(7963, 980)

In [5]:
# (rows, columns) of the per-MOA matching-score table
df_moa_vals.shape

(211, 3)

In [6]:
# (rows, columns) of the per-MOA compound table
df_moa_cpds.shape

(211, 8)

In [7]:
def conv_cols_to_list(df_moa_cpds):
    """
    Turn the ';'-delimited strings of every ``moa_cpds_*`` column into lists.

    The dataframe is modified in place and is also returned; columns whose
    name does not start with ``moa_cpds_`` are left untouched.
    """

    def split_on_semicolon(cell):
        return cell.split(';')

    for col in df_moa_cpds.columns.tolist():
        if not col.startswith("moa_cpds_"):
            continue
        df_moa_cpds[col] = df_moa_cpds[col].apply(split_on_semicolon)
    return df_moa_cpds

In [8]:
# Split the ';'-separated compound strings of the moa_cpds_* columns into lists
df_moa_cpds = conv_cols_to_list(df_moa_cpds)

In [9]:
def get_cpd_agg(data_moa, dose_number):
    """
    Average the profiles of each compound at a single dose.

    Rows whose ``dose`` equals ``dose_number`` are selected, the metadata
    columns are removed, and the remaining columns are averaged per
    ``pert_iname``. The result is indexed by compound name (unnamed index)
    with one column per remaining input column. The input frame is not
    modified.
    """
    meta_cols = ['pert_id', 'dose', 'pert_idose', 'moa', 'sig_id']

    dose_rows = data_moa.loc[data_moa['dose'] == dose_number]
    features = dose_rows.drop(columns = meta_cols)

    df_compound_agg = features.groupby(['pert_iname']).agg(['mean'])
    # agg(['mean']) adds a second column level holding just "mean"; remove it
    df_compound_agg.columns = df_compound_agg.columns.droplevel(1)

    return df_compound_agg.rename_axis(None, axis=0)

In [10]:
def cpds_found_in_all_doses(data_moa):
    """
    Return the compounds that have at least one profile at every dose (1 - 6).

    Parameters
    ----------
    data_moa : pandas.DataFrame
        Profile table with a ``dose`` and a ``pert_iname`` column.

    Returns
    -------
    list
        Compound names present at all six doses, sorted so that the order
        (which later feeds the random sampling) is the same on every run.
    """
    cpds_per_dose = [
        # Only the names are needed, so read them directly instead of
        # averaging every feature column per dose first.
        set(data_moa.loc[data_moa['dose'] == dose, 'pert_iname'].dropna())
        for dose in range(1, 7)
    ]
    return sorted(set.intersection(*cpds_per_dose))

In [11]:
# Compounds of the level-5 data that are present at every dose (1 - 6)
cpds_fd_in_all = cpds_found_in_all_doses(df_lvl5)

In [12]:
# Number of compounds found at all doses
len(cpds_fd_in_all)

1327

In [13]:
# Distinct values of the moa column in the level-5 data
all_moa_list = df_lvl5['moa'].unique().tolist()

In [14]:
# Number of distinct MOAs
len(all_moa_list)

583

In [15]:
#moa with their corresponding compounds (restricted to compounds found in all doses)
# A set makes each membership test O(1) instead of scanning the compound list
cpds_in_all_doses = set(cpds_fd_in_all)
all_moa_dict = {moa: [cpd for cpd in df_lvl5['pert_iname'][df_lvl5['moa']== moa].unique().tolist() 
                      if cpd in cpds_in_all_doses] 
                for moa in all_moa_list}
# Drop MOAs that are left without any compound
all_moa_dict = {kys:all_moa_dict[kys] for kys in all_moa_dict if all_moa_dict[kys]}

In [16]:
# Swap the moa_size column for the columns of df_moa_vals, joined on the MOA name
moa_cpds_without_size = df_moa_cpds.drop(columns=["moa_size"])
df_moa_cpds = moa_cpds_without_size.merge(df_moa_vals, on="moa")
print(df_moa_cpds.shape)

(211, 9)


In [17]:
def generate_moa_size_dict(df_moa_cpds):
    """
    Group compounds by the size class of the MOA they belong to.

    For every distinct ``no_of_replicates`` value (in order of first
    appearance) the compound lists stored in all remaining columns of the
    matching rows are pooled and de-duplicated.

    Returns a dict mapping each ``no_of_replicates`` value to a list of the
    unique compounds of the MOAs with that value (list order is arbitrary).
    """
    non_cpd_cols = ['no_of_replicates', 'moa', 'spearman_correlation']
    moa_size_dict = {}
    for size in df_moa_cpds['no_of_replicates'].unique():
        is_size = df_moa_cpds['no_of_replicates'] == size
        cpd_cells = df_moa_cpds.loc[is_size].drop(columns = non_cpd_cols)
        unique_cpds = set()
        # every cell holds a list of compounds; pool them across rows and columns
        for row in cpd_cells.values.tolist():
            for cpd_list in row:
                unique_cpds.update(cpd_list)
        moa_size_dict[size] = list(unique_cpds)
    return moa_size_dict

In [18]:
# Map every no_of_replicates value to the compounds of the MOAs with that value
moa_sizes_dict = generate_moa_size_dict(df_moa_cpds)

In [19]:
# Number of distinct no_of_replicates values
len(df_moa_cpds['no_of_replicates'].unique())

20

In [20]:
# Should equal the number of distinct no_of_replicates values
len(moa_sizes_dict)

20

In [21]:
def get_random_cpds(all_cpds, moa_size, moa_cpds, all_moa_cpds):
    """
    Draw ``moa_size`` compounds at random, none of which is in ``moa_cpds``.

    The sample is taken directly from the eligible compounds rather than by
    redrawing until a clean sample turns up. Every admissible combination is
    still equally likely, but the call always terminates.

    Parameters
    ----------
    all_cpds : list
        Pool of candidate compounds.
    moa_size : int
        Number of compounds to draw.
    moa_cpds : iterable
        Compounds that must not appear in the sample.
    all_moa_cpds : object
        Not used; kept so that existing callers keep working.

    Returns
    -------
    list
        ``moa_size`` compounds sampled without replacement.

    Raises
    ------
    ValueError
        If fewer than ``moa_size`` eligible compounds are available.
    """
    excluded_cpds = set(moa_cpds)
    eligible_cpds = [cpd for cpd in all_cpds if cpd not in excluded_cpds]
    if moa_size > len(eligible_cpds):
        raise ValueError(
            f"cannot draw {moa_size} compounds: only {len(eligible_cpds)} "
            "compounds remain after excluding moa_cpds"
        )
    return random.sample(eligible_cpds, moa_size)

In [22]:
def get_null_distribution_cpds(moa_size_dict, cpds_list, all_moa_dict, rand_num = 1000):
    """
    Build the random compound lists behind the null distribution.

    Parameters
    ----------
    moa_size_dict : dict
        Maps each size class to the compounds that must be excluded from
        the random draws for that class.
    cpds_list : list
        Pool of compounds to sample from.
    all_moa_dict : dict
        Forwarded unchanged to ``get_random_cpds``.
    rand_num : int
        Number of distinct random lists to generate per size class.

    Returns
    -------
    dict
        Maps each size class (not each MOA) to ``rand_num`` lists of
        randomly selected compounds. Within a class no two lists contain
        the same set of compounds, regardless of order.

    Notes
    -----
    The loop keeps drawing until ``rand_num`` distinct combinations exist,
    so it does not terminate if fewer than that many are possible.
    """
    null_distribution_moa = {}
    for size, moa_cpds in moa_size_dict.items():
        print(size)
        moa_cpds_list = []
        # frozensets make the duplicate check order-insensitive and O(1)
        seen_combinations = set()
        while len(moa_cpds_list) < rand_num:
            rand_cpds = get_random_cpds(cpds_list, size, moa_cpds, all_moa_dict)
            combination = frozenset(rand_cpds)
            if combination in seen_combinations:
                continue
            seen_combinations.add(combination)
            moa_cpds_list.append(rand_cpds)
        null_distribution_moa[size] = moa_cpds_list

    return null_distribution_moa

In [23]:
# Draw the random compound lists for every size class (each size is printed as it is processed)
null_distribution_moa = get_null_distribution_cpds(moa_sizes_dict, cpds_fd_in_all, all_moa_dict)

12
18
48
126
24
120
186
36
30
84
66
42
90
162
54
114
78
144
168
72


In [24]:
# Save null_distribution_moa to a pickle file so the same random draws can be
# reused later; this cell only needs to be run once.
with open(os.path.join('moa_sizes_consensus_datasets', 'null_distribution_doseindependent.pickle'), 'wb') as handle:
    pickle.dump(null_distribution_moa, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Load null_distribution_moa back from the pickle file.
# NOTE: unpickling can execute arbitrary code; only load a file you generated yourself.
with open(os.path.join('moa_sizes_consensus_datasets', 'null_distribution_doseindependent.pickle'), 'rb') as handle:
    null_distribution_moa = pickle.load(handle)

In [26]:
# Report how many random compound lists were generated for each size class
print('moa_size', '\tnumber of generated lists of randomly combined compounds')
for moa_size, random_cpd_lists in null_distribution_moa.items():
    print(moa_size, '\t\t', len(random_cpd_lists))

moa_size 	number of generated lists of randomly combined compounds
12 		 1000
18 		 1000
48 		 1000
126 		 1000
24 		 1000
120 		 1000
186 		 1000
36 		 1000
30 		 1000
84 		 1000
66 		 1000
42 		 1000
90 		 1000
162 		 1000
54 		 1000
114 		 1000
78 		 1000
144 		 1000
168 		 1000
72 		 1000


In [27]:
def assert_null_distribution(null_distribution_moa):

    """
    Report random compound lists that occur more than once within a size class.

    Parameters
    ----------
    null_distribution_moa : dict
        Maps each size class to its lists of randomly selected compounds.

    Returns
    -------
    dict
        Maps each size class that contains repeats to the repeated lists
        (one entry per occurrence, in their original order). Two lists are
        treated as equal only if they hold the same compounds in the same
        order. An empty dict means every list is distinct.
    """

    duplicates_moa = {}
    for keys, null_dist in null_distribution_moa.items():
        # lists are not hashable, so count them as tuples
        occurrences = Counter(tuple(cpds_moa) for cpds_moa in null_dist)
        cpds_duplicates = [cpds_moa for cpds_moa in null_dist
                           if occurrences[tuple(cpds_moa)] > 1]
        if cpds_duplicates:
            duplicates_moa[keys] = cpds_duplicates
    return duplicates_moa

In [28]:
# Collect, per size class, any random compound list that is reported as repeated
duplicates_cpds_list = assert_null_distribution(null_distribution_moa)

In [29]:
duplicates_cpds_list  # an empty dict means no repeated compound list was reported

{}

In [30]:
# Preview the first rows of the level-5 data
df_lvl5.head()

Unnamed: 0,sig_id,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,...,212536_at,218529_at,211071_s_at,203341_at,205379_at,pert_id,pert_idose,dose,pert_iname,moa
0,REP.A001_A549_24H:A07,-0.061635,0.408537,0.824534,0.536392,-0.566594,-0.308054,0.189936,0.184868,-0.068203,...,2.125713,-0.24764,0.416466,-0.676134,-2.665621,BRD-K25114078,10 uM,6,aminoguanidine,nitric oxide synthase inhibitor
1,REP.A001_A549_24H:A08,-0.506381,0.030745,-0.787902,0.187344,0.039911,-0.547436,0.416978,-0.994681,0.740328,...,0.360761,0.143608,0.318085,0.363956,-0.592373,BRD-K25114078,3.33 uM,5,aminoguanidine,nitric oxide synthase inhibitor
2,REP.A001_A549_24H:A09,-0.509867,0.0415,0.263433,-0.613,0.4546,0.286267,1.1984,-0.469433,0.5137,...,-0.214733,-1.577267,-2.051033,0.038333,-0.911967,BRD-K25114078,1.11 uM,4,aminoguanidine,nitric oxide synthase inhibitor
3,REP.A001_A549_24H:A10,-0.296316,-0.42135,-0.588798,0.15658,0.953095,-0.90707,-0.457481,-0.051327,-0.141768,...,-0.453476,0.150566,0.615505,0.119067,-0.322888,BRD-K25114078,0.37 uM,3,aminoguanidine,nitric oxide synthase inhibitor
4,REP.A001_A549_24H:A11,-0.48779,0.080023,0.082183,-0.539648,0.423894,-0.206426,0.177557,-0.256125,0.21015,...,0.350906,-0.469356,-0.396239,0.325295,-1.037028,BRD-K25114078,0.12 uM,2,aminoguanidine,nitric oxide synthase inhibitor


In [31]:
def get_cpd_agg_dose_independent(data_moa):
    """
    Average the numeric columns for every (compound, dose) combination.

    Only numeric columns are aggregated. String metadata columns cannot be
    averaged and would make the aggregation fail on recent pandas versions.

    Parameters
    ----------
    data_moa : pandas.DataFrame
        Profile table with a ``pert_iname`` and a ``dose`` column. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (``pert_iname``, ``dose``) pair, indexed by
        ``pert_iname`` only, so a compound appears once per dose. Columns
        are two-level: (original column name, "mean").
    """
    group_cols = ["pert_iname", "dose"]
    feature_cols = [
        col for col in data_moa.select_dtypes(include="number").columns
        if col not in group_cols
    ]

    df_cpd_agg = data_moa.groupby(group_cols)[feature_cols].agg(['mean'])
    # Keep the compound name as the index and discard the dose level
    df_cpd_agg = df_cpd_agg.reset_index(level="dose", drop=True)

    return df_cpd_agg

In [32]:
def calc_null_dist_median_scores(data_moa, moa_cpds_list):
    """
    Compute one median Spearman correlation per random compound list.

    The profiles are first aggregated with ``get_cpd_agg_dose_independent``.
    For every list in ``moa_cpds_list`` the rows of its compounds are
    selected and correlated pairwise; the median is taken over all pairs
    whose two rows belong to different compounds. A list holding a single
    compound gets a median of 1.

    Returns a list with one median per entry of ``moa_cpds_list``.
    """
    profiles = get_cpd_agg_dose_independent(data_moa)
    medians = []
    for sampled_cpds in moa_cpds_list:
        corr_matrix = profiles.loc[sampled_cpds].transpose().corr(method = 'spearman')

        if len(sampled_cpds) == 1:
            medians.append(1)
            continue

        # Long format: one row per (row compound, column compound) pair
        corr_matrix.index.name = "pert_iname_compare"
        long_corr = corr_matrix.reset_index().melt(id_vars = "pert_iname_compare", value_name="spearman_corr")
        # Ignore pairs made of the same compound (including the diagonal)
        is_cross_compound = long_corr.pert_iname_compare != long_corr.pert_iname
        medians.append(long_corr.loc[is_cross_compound, "spearman_corr"].median())
    return medians

In [33]:
def get_null_dist_median_scores(null_distribution_moa, df_moa):
    """
    Compute the null median correlation scores for every size class.

    Each key of ``null_distribution_moa`` is mapped to the list returned by
    ``calc_null_dist_median_scores`` for that key's random compound lists.
    """
    return {
        size_class: calc_null_dist_median_scores(df_moa, random_cpd_lists)
        for size_class, random_cpd_lists in null_distribution_moa.items()
    }

**A p-value can be computed nonparametrically by evaluating the probability that random compounds of different MOAs have a greater median similarity value than compounds of the same MOA.**

In [34]:
# Median correlation of every random compound list, keyed by size class
null_distribution_medians = get_null_dist_median_scores(null_distribution_moa, df_lvl5)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  obj = obj._drop_axis(la

In [35]:
def get_p_value(median_scores_list, df_moa_values, moa_name):
    """
    Compute the empirical p-value of one MOA against its null distribution.

    Parameters
    ----------
    median_scores_list : sequence of float
        Null-distribution median scores.
    df_moa_values : pandas.DataFrame
        Indexed by MOA name, with a ``spearman_correlation`` column.
    moa_name : str
        MOA to evaluate.

    Returns
    -------
    float
        Fraction of null medians that are greater than or equal to the
        MOA's ``spearman_correlation``. NaN null medians never count as
        greater or equal but still count towards the total.
    """
    # Convert explicitly: comparing a plain list against a Python float
    # raises TypeError, so do not rely on the scalar being a NumPy type.
    null_medians = np.asarray(median_scores_list, dtype=float)
    actual_med = df_moa_values.loc[moa_name, "spearman_correlation"]
    p_value = np.sum(null_medians >= actual_med) / len(null_medians)
    return p_value

In [36]:
def get_moa_p_vals(null_dist_median, df_moa_values):
    """
    Compute the p-value of every MOA against the null distribution of its size class.

    ``null_dist_median`` maps each size class to its null median scores. An
    MOA belongs to a class when its ``no_of_replicates`` equals the class key.

    Returns a dict of MOA name -> p-value, ordered by MOA name.
    """
    moa_indexed = df_moa_values.set_index('moa').rename_axis(None, axis=0)
    p_vals_by_moa = {}
    for size_class, null_medians in null_dist_median.items():
        same_size_moas = moa_indexed[moa_indexed['no_of_replicates'] == size_class]
        for moa_name in same_size_moas.index:
            p_vals_by_moa[moa_name] = get_p_value(null_medians, same_size_moas, moa_name)
    return {moa_name: p_vals_by_moa[moa_name] for moa_name in sorted(p_vals_by_moa)}

In [37]:
# p-value of every MOA, as a dict ordered by MOA name
null_p_vals = get_moa_p_vals(null_distribution_medians, df_moa_vals)

In [38]:
# One row per MOA: the dict keys become the `moa` column, the values the p-value column
p_vals_by_moa = pd.DataFrame.from_dict(null_p_vals, orient='index', columns = ["p_value_alldose"])
df_null_p_vals = p_vals_by_moa.reset_index().rename(columns={"index": "moa"})

In [39]:
# Look up each MOA's size by name instead of by row position: df_null_p_vals is
# ordered by MOA name, which is only the same order as df_moa_vals if that table
# happens to be sorted by MOA as well.
df_null_p_vals['moa_size'] = df_null_p_vals['moa'].map(
    df_moa_vals.set_index('moa')['no_of_replicates']
)

In [40]:
# Preview the first ten MOAs with their p-value and size
df_null_p_vals.head(10)

Unnamed: 0,moa,p_value_alldose,moa_size
0,5 alpha reductase inhibitor,0.0,12
1,acat inhibitor,1.0,18
2,acetylcholine receptor agonist,0.0,48
3,acetylcholine receptor antagonist,0.002,126
4,acetylcholine release stimulant,0.008,12
5,acetylcholinesterase inhibitor,0.0,12
6,adenosine receptor agonist,0.09,18
7,adenosine receptor antagonist,0.0,24
8,adrenergic inhibitor,0.997,12
9,adrenergic receptor agonist,0.001,120


In [41]:
def save_to_csv(df, path, file_name):
    """
    Write ``df`` to ``<path>/<file_name>`` as CSV, without the index.

    The target directory is created if needed, including any missing
    parent directories; an existing directory is left as it is.
    """

    os.makedirs(path, exist_ok=True)

    df.to_csv(os.path.join(path, file_name), index = False)

In [42]:
# Write the per-MOA p-values next to the other consensus datasets
save_to_csv(df_null_p_vals, 'moa_sizes_consensus_datasets', 'modz_null_p_values_doseindependent.csv')