### Level 4 - Normalized DMSO Profiles Cell painting data


#### The goal here:

-- is to determine the median score of each compound per dose based on taking the median of the correlation values between replicates of the same compound.

- Level 4 data - are replicate level data i.e. where you have multiple profiles been perturbed by the same compound (pertubagen)

[LINCS Cell painting Level 4 Dataset](https://github.com/broadinstitute/lincs-cell-painting/tree/master/profiles/2016_04_01_a549_48hr_batch1)

In [1]:
import os
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pycytominer import feature_select
from statistics import median
import random
sns.set_style("darkgrid")
from scipy import stats
import pickle

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

In [2]:
commit = "94bfaeeab0d107beac262b4307aa6e9b783625fa"
spherized_profile_link = f"https://github.com/broadinstitute/lincs-cell-painting/blob/{commit}\
/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz?raw=true"

In [3]:
pertinfo_file = '../aligned_moa_CP_L1000.csv'

In [4]:
df_level4 = pd.read_csv(spherized_profile_link, compression='gzip',low_memory = False)

In [5]:
df_level4.shape

(52223, 830)

In [6]:
df_level4.head()

Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Metadata_Plate,Metadata_Well,Metadata_pert_id,Metadata_pert_mfc_id,Metadata_pert_well,...,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_Mito_5_0,Nuclei_Texture_SumVariance_RNA_20_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0
0,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,SQ00015211,A01,,,A01,...,-1.691233,0.641488,0.467653,2.364342,-0.571713,0.757513,-0.63458,1.440005,-0.044817,0.264545
1,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,SQ00015211,A02,,,A02,...,-0.546898,-0.810054,-1.668961,1.532843,-0.17745,-1.684778,-1.235524,1.958443,0.165417,-0.985233
2,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,SQ00015211,A03,,,A03,...,-0.814702,2.058604,-1.91436,2.639759,-1.04828,1.192945,0.008292,0.194224,-1.680378,-0.697503
3,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,SQ00015211,A04,,,A04,...,-0.532522,-0.632071,-0.415327,1.475599,0.868495,1.649616,-0.205897,0.205308,-0.927785,0.712298
4,C-7161-01-LM6-027,DMSO,0.0,0.0,DMSO,SQ00015211,A05,,,A05,...,-0.436136,0.20485,-0.154737,3.705564,-2.069335,0.174793,0.756266,1.393177,-0.537319,0.420051


In [7]:
len(df_level4['Metadata_Plate'].unique())

136

- We have 136 plates * 384 wells; in each plate we have 384 wells

In [8]:
dose_liter = df_level4['Metadata_mmoles_per_liter'].unique().tolist()

In [9]:
dose_liter

[0.0,
 1.0,
 0.33333,
 0.11111,
 0.037037,
 0.012346,
 0.0041152,
 10.0,
 3.3333,
 1.1111,
 0.37037,
 0.12346,
 0.041152,
 20.0,
 19.999,
 11.2,
 3.7333,
 1.2444,
 0.41481,
 0.13827,
 0.04609,
 9.5837,
 3.1946,
 1.0649,
 0.35495,
 0.11832,
 0.039439,
 11.547,
 3.8489,
 1.283,
 0.42766,
 0.14255,
 0.047517,
 6.7937,
 2.2646,
 0.75485,
 0.25162,
 0.083873,
 0.027958,
 10.05,
 3.3499,
 1.1166,
 0.37221,
 0.12407,
 0.041356,
 10.716,
 3.5721,
 1.1907,
 0.3969,
 0.1323,
 0.0441,
 4.5176,
 1.5059,
 0.50195,
 0.16732,
 0.055772,
 0.018591,
 2.0,
 0.66667,
 0.22222,
 0.074074,
 0.024691,
 0.0082305,
 4.1701,
 1.39,
 0.46334,
 0.15445,
 0.051482,
 0.017161,
 4.8003,
 1.6001,
 0.53337,
 0.17779,
 0.059264,
 0.019755,
 5.7741,
 1.9247,
 0.64157,
 0.21386,
 0.071285,
 0.023762,
 10.072,
 3.3574,
 1.1191,
 0.37305,
 0.12435,
 0.04145,
 8.106,
 2.702,
 0.90067,
 0.30022,
 0.10007,
 0.033358]

- We have 93 unique doses across the level 4 dataset, we are going to **recode the doses to 8 distinct doses**, this means we are going to assign this 93 unique doses to the nearest 8 distinct doses.

| Dose | Dose Recode |
| :--: | :---------: |
| 0 (DMSO) | 0 |
| ~0.04 | 1 |
| ~0.12 | 2 |
| ~0.37 | 3 |
| ~1.11 | 4 |
| ~3.33 | 5 |
| ~10 | 6 |
| ~20 | 7 |

In [10]:
def recode_dose(dose_value):
    """This function recode the doses in Level-4 data to 8 distinct dose classes"""
    
    doses = [0.04,0.12,0.37,1.11,3.33,10.0,20.0,25.0]
    for x in range(len(doses)-1):
        if (dose_value > 0.0) & (dose_value <= 0.04):
            dose_value = 0.04
        elif doses[x] <= round(dose_value,2) < doses[x+1]:
            dose_value = doses[x]
    return dose_value

In [11]:
df_level4['Metadata_dose_recode'] = df_level4['Metadata_mmoles_per_liter'].apply(recode_dose)

In [12]:
df_level4['Metadata_dose_recode'].unique()

array([ 0.  ,  0.37,  0.12,  0.04, 10.  ,  3.33,  1.11, 20.  ])

In [13]:
def feature_selection(df_lvl4): 
    """
    Perform feature selection by dropping columns with null values 
    (greater than 384 i.e. equivalent to one plate worth of cell profiles) 
    and highly correlated values from the data.
    """
    metadata_columns = [x for x in df_lvl4.columns if (x.startswith("Metadata_"))]
    df_lvl4_metadata = df_lvl4[metadata_columns].copy()
    df_lvl4_features = df_lvl4.drop(metadata_columns, axis = 1)
    null_cols = [col for col in df_lvl4_features.columns if df_lvl4_features[col].isnull().sum() > 384]
    df_lvl4_features.drop(null_cols, axis = 1, inplace=True)
    ##feature selection was done already..prior to getting the spherized data!!
    ###df_lvl4_features = feature_select(df_lvl4_features, operation=["correlation_threshold", "variance_threshold"])
    
    for col in df_lvl4_features.columns:
        if df_lvl4_features[col].isnull().sum():
            df_lvl4_features[col].fillna(value=df_lvl4_features[col].mean(), inplace = True)
            
    df_meta_info = df_lvl4_metadata[['Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_Plate', 'Metadata_Well',
                                     'Metadata_broad_id', 'Metadata_moa', 'Metadata_dose_recode']].copy()
    df_lvl4_new = pd.concat([df_meta_info, df_lvl4_features], axis=1)
    
    return df_lvl4_new

In [14]:
df_level4_new = feature_selection(df_level4)

In [15]:
df_level4_new.shape

(52223, 808)

In [16]:
def merge_dataframe(df, pertinfo_file):
    """
    This function merge aligned L1000 and Cell painting Metadata information dataframe 
    with the Level-4 data, change the values of the Metadata_dose_recode column 
    and create a new column 'replicate_name' that represents each replicate in the dataset
    """ 
    df_pertinfo = pd.read_csv(pertinfo_file)
    df_lvl4_new = df.merge(df_pertinfo, on='Metadata_broad_sample', how = 'outer')
    no_cpds_df = df_lvl4_new[df_lvl4_new['pert_iname'].isnull()].copy().reset_index(drop = True)
    df_lvl4_new.drop(df_lvl4_new[df_lvl4_new['pert_iname'].isnull()].index, inplace = True)
    df_lvl4_new.reset_index(drop= True, inplace = True)
    df_lvl4_new['Metadata_dose_recode'] = df_lvl4_new['Metadata_dose_recode'].map({0.0:0,0.04:1,0.12:2,0.37:3,1.11:4,
                                                                                   3.33:5,10.0:6,20.0:7})
    df_lvl4_new['replicate_name'] = ['replicate_' + str(x) for x in range(df_lvl4_new.shape[0])]
    
    return df_lvl4_new, no_cpds_df

In [17]:
df_level4_new, df_level4_no_cpds = merge_dataframe(df_level4_new, pertinfo_file)

In [18]:
##list of "Broad samples" WITHOUT Compounds after aligning L1000 and Cell painting MOAs
df_level4_no_cpds['Metadata_broad_sample'].unique().tolist()

['BRD-A20131130-001-01-7',
 'BRD-M98279124-300-01-1',
 'BRD-K87278688-001-01-0',
 'BRD-K21547160-001-01-4',
 'BRD-A44448661-001-04-8',
 'BRD-K41438959-001-01-7',
 'BRD-A37288617-003-02-2',
 'BRD-K52080565-001-09-2',
 'BRD-K73395020-001-02-3',
 'BRD-K01192156-001-02-7',
 'BRD-A84045418-001-03-1',
 'BRD-K60623809-001-02-0',
 'BRD-K51033547-003-02-6']

In [19]:
def get_median_score(cpds_list, df):
    """
    This function calculates the median score for each compound based on its replicates
    """
    
    cpds_median_score = {}
    for cpd in cpds_list:
        cpd_replicates = df[df['pert_iname'] == cpd].copy()
        cpd_replicates.drop(['Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_dose_recode', 'Metadata_Plate',
                             'Metadata_Well', 'Metadata_broad_id', 'Metadata_moa', 'broad_id', 
                             'pert_iname', 'moa', 'replicate_name'], axis = 1, inplace = True)
        cpd_replicates_corr = cpd_replicates.astype('float64').T.corr(method = 'pearson').values
        if len(cpd_replicates_corr) == 1:
            median_val = 1
        else:
            median_val = median(list(cpd_replicates_corr[np.triu_indices(len(cpd_replicates_corr), k = 1)]))
        
        cpds_median_score[cpd] = median_val
        
    return cpds_median_score

In [20]:
def check_compounds(cpd_med_score, df):
    """
    Check if all distinct compounds in the Level-4 dataframe are present 
    in the cpd_med_score dictionary, if not add the compounds as keys to the dictionary 
    and give them a null value.
    """
    cpd_list = df['pert_iname'].unique().tolist()
    cpd_keys = cpd_med_score.keys()
    for cpd in cpd_list:
        if cpd not in cpd_keys:
            cpd_med_score[cpd] = np.nan
            
    return cpd_med_score

In [21]:
def get_cpd_medianscores(df):
    
    """This function computes median scores for all compounds found in the Level-4 dataframe PER DOSE (1-6)"""
    
    dose_list = list(set(df['Metadata_dose_recode'].unique().tolist()))[1:7]
    
    for dose in dose_list:
        df_dose = df[df['Metadata_dose_recode'] == dose].copy()
        cpds_list = df_dose['pert_iname'].unique().tolist()
        cpds_median_score = get_median_score(cpds_list, df_dose)
        cpds_median_score = check_compounds(cpds_median_score, df)
        sorted_med_score = {key:value for key, value in sorted(cpds_median_score.items(), key=lambda item: item[0])}
        if dose == 1:
            df_cpd_med_score = pd.DataFrame.from_dict(sorted_med_score, orient='index', columns = ['dose_1'])
        else:
            df_cpd_med_score['dose_' + str(dose)] = sorted_med_score.values()
            
    return df_cpd_med_score

In [22]:
df_cpd_med_score = get_cpd_medianscores(df_level4_new)

In [23]:
df_cpd_med_score.head(10)

Unnamed: 0,dose_1,dose_2,dose_3,dose_4,dose_5,dose_6
10-DEBC,0.018594,0.035,0.033021,0.007172,0.09714,0.52336
"16,16-dimethylprostaglandin-e2",0.04699,0.017677,0.006421,,,
17-hydroxyprogesterone-caproate,0.054557,0.044132,0.064253,0.0942,0.165742,0.449927
2-iminobiotin,0.053791,0.008659,0.033482,0.0145,0.012886,0.039215
2-methoxyestradiol,0.129437,0.114419,0.246409,0.439804,0.565411,0.663339
"3,3'-diindolylmethane",0.035497,0.036071,0.047993,,,
3-amino-benzamide,0.111933,0.060453,0.070041,0.0599,0.074856,0.019274
3-deazaadenosine,0.015754,0.059329,0.008057,0.01625,0.052921,0.146709
ABT-737,0.016174,0.091927,0.159638,0.351149,0.388167,0.714429
AEE788,0.651048,0.053435,0.119485,0.146616,0.636165,0.662422


In [24]:
def drop_cpds_with_null(df):
    """
    This function drop compounds with median scores of 1 
    or null values in any of the dose points (1-6)
    """
    cpds_with_null = []
    for cpd in df.index:
        if any(df.loc[cpd] == 1) | any(df.loc[cpd].isnull()):
            cpds_with_null.append(cpd)
    df.drop(cpds_with_null, axis = 0, inplace = True)
    
    return df

In [25]:
df_cpd_med_score = drop_cpds_with_null(df_cpd_med_score)

In [26]:
df_cpd_med_score.head(10)

Unnamed: 0,dose_1,dose_2,dose_3,dose_4,dose_5,dose_6
10-DEBC,0.018594,0.035,0.033021,0.007172,0.09714,0.52336
17-hydroxyprogesterone-caproate,0.054557,0.044132,0.064253,0.0942,0.165742,0.449927
2-iminobiotin,0.053791,0.008659,0.033482,0.0145,0.012886,0.039215
2-methoxyestradiol,0.129437,0.114419,0.246409,0.439804,0.565411,0.663339
3-amino-benzamide,0.111933,0.060453,0.070041,0.0599,0.074856,0.019274
3-deazaadenosine,0.015754,0.059329,0.008057,0.01625,0.052921,0.146709
ABT-737,0.016174,0.091927,0.159638,0.351149,0.388167,0.714429
AEE788,0.651048,0.053435,0.119485,0.146616,0.636165,0.662422
AICA-ribonucleotide,0.034447,0.118091,-0.123321,0.006123,0.000301,0.011579
AKT-inhibitor-1-2,-0.036008,-0.003265,0.035792,0.041529,0.045559,0.138988


In [27]:
def no_of_replicates_per_cpd(df, df_lvl4):
    """This function computes the numbers of replicates for each compound"""
    
    dose_list = list(set(df_lvl4['Metadata_dose_recode'].unique().tolist()))[1:7]
    cpds_no_of_reps = {}
    for cpd in df.index:
        num_of_reps = 0
        df_cpd = df_lvl4[df_lvl4['pert_iname'] == cpd].copy()
        for dose in dose_list:
            df_dose = df_cpd[df_cpd['Metadata_dose_recode'] == dose].copy()
            num_of_reps += df_dose.shape[0]
        cpds_no_of_reps[cpd] = num_of_reps // len(dose_list)
    df['no_of_replicates'] = cpds_no_of_reps.values()
    return df

In [28]:
df_cpd_med_score = no_of_replicates_per_cpd(df_cpd_med_score, df_level4_new)

In [29]:
df_cpd_med_score["no_of_replicates"].unique()

array([ 5,  2,  4, 10])

In [30]:
df_cpd_med_score.shape

(1538, 7)

In [31]:
def save_to_csv(df, path, file_name, compress=None):
    """saves dataframes to csv"""
    
    if not os.path.exists(path):
        os.mkdir(path)
    
    df.to_csv(os.path.join(path, file_name), index=False, compression=compress)

In [32]:
save_to_csv(df_cpd_med_score.reset_index().rename({'index':'cpd'}, axis = 1), 
            'cellpainting_lvl4_cpd_replicate_datasets', 'cpd_replicate_median_scores.csv')

In [33]:
save_to_csv(df_level4_new, 'cellpainting_lvl4_cpd_replicate_datasets', 
            'cp_level4_cpd_replicates.csv.gz', compress="gzip")

In [34]:
# Output files for visualization
results_dir = pathlib.Path("../results")
cpd_summary_file = pathlib.Path(f"{results_dir}/median_score_per_compound_CellPainting.tsv.gz")

dose_recode_info = {
    'dose_1': '0.04 uM', 'dose_2':'0.12 uM', 'dose_3':'0.37 uM',
    'dose_4': '1.11 uM', 'dose_5':'3.33 uM', 'dose_6':'10 uM'
}

In [35]:
cpd_score_summary_df = (
    df_cpd_med_score
    .reset_index()
    .rename(columns={"index": "compound"})
    .melt(
        id_vars=["compound", "no_of_replicates"],
        value_vars=["dose_1", "dose_2", "dose_3", "dose_4", "dose_5", "dose_6"],
        var_name="dose",
        value_name="median_replicate_score"
    )
)

cpd_score_summary_df.dose = cpd_score_summary_df.dose.replace(dose_recode_info)

cpd_score_summary_df.to_csv(cpd_summary_file, sep="\t", index=False)
cpd_score_summary_df.head()

Unnamed: 0,compound,no_of_replicates,dose,median_replicate_score
0,10-DEBC,5,0.04 uM,0.018594
1,17-hydroxyprogesterone-caproate,5,0.04 uM,0.054557
2,2-iminobiotin,5,0.04 uM,0.053791
3,2-methoxyestradiol,5,0.04 uM,0.129437
4,3-amino-benzamide,5,0.04 uM,0.111933
