### - Split Compounds into Train & Test data based on the number of MOAs that are attributed to them.

In [1]:
import os
import pathlib
import requests
import pickle
import argparse
import pandas as pd
import numpy as np
import re
from os import walk
from collections import Counter
import random
import shutil
from split_compounds import split_cpds_moas

In [2]:
# Read the table of common compounds (gzip-compressed, tab-separated)
common_file = (
    pathlib.Path("..")
    / ".."
    / "6.paper_figures"
    / "data"
    / "significant_compounds_by_threshold_both_assays.tsv.gz"
)
common_df = pd.read_csv(common_file, sep="\t")

# Distinct compound names present in that table
common_compounds = common_df["compound"].unique()
print(len(common_compounds))
print(common_df.shape)
common_df.head(2)

1327
(7962, 10)


Unnamed: 0,compound,dose,median_replicate_score_cellpainting,median_replicate_score_l1000,pass_cellpainting_thresh,pass_l1000_thresh,pass_both,cell_painting_num_reproducible,l1000_num_reproducible,total_reproducible
0,17-hydroxyprogesterone-caproate,0.04 uM,0.054557,0.07337,True,False,False,5,2,7
1,2-iminobiotin,0.04 uM,0.053791,0.085434,True,False,False,1,2,3


In [3]:
# Cell Painting level-4 compound replicate profiles
data_path = '../../1.Data-exploration/Profiles_level4/cell_painting/cellpainting_lvl4_cpd_replicate_datasets/'
cp_profiles_file = os.path.join(data_path, 'cp_level4_cpd_replicates.csv.gz')
df_level4_cp = pd.read_csv(cp_profiles_file, compression='gzip', low_memory=False)

# L1000 level-4 compound replicate profiles (data_path is re-pointed to the L1000 folder)
data_path = '../../1.Data-exploration/Profiles_level4/L1000/L1000_lvl4_cpd_replicate_datasets/'
l1000_profiles_file = os.path.join(data_path, 'L1000_level4_cpd_replicates.csv.gz')
df_level4_L1 = pd.read_csv(l1000_profiles_file, compression='gzip', low_memory=False)

In [4]:
### Keep only the compounds listed in common_df, in both the Cell Painting and L1000 profiles
# Compound names per assay, captured before any filtering
cp_cpd = df_level4_cp['pert_iname'].unique().tolist()
L1_cpd = df_level4_L1['pert_iname'].unique().tolist()

# Boolean row masks: True where the row's compound is in the common table
keep_cp = df_level4_cp['pert_iname'].isin(common_df['compound'])
keep_L1 = df_level4_L1['pert_iname'].isin(common_df['compound'])

df_level4_cp = df_level4_cp[keep_cp].reset_index(drop=True)
df_level4_L1 = df_level4_L1[keep_L1].reset_index(drop=True)

In [5]:
# Sanity check: every Cell Painting compound must also be present in L1000.
# The L1000 names are collected into a set once, so each membership test is
# O(1) instead of recomputing .unique() and scanning it on every iteration.
l1000_compounds = set(df_level4_L1['pert_iname'].unique())
for cpd in df_level4_cp['pert_iname'].unique():
    if cpd not in l1000_compounds:
        print('Something is Wrong!!')

In [6]:
# Number of distinct compounds left in the Cell Painting profiles
df_level4_cp['pert_iname'].nunique(dropna=False)

1258

In [7]:
# Number of distinct compounds left in the L1000 profiles
df_level4_L1['pert_iname'].nunique(dropna=False)

1258

In [8]:
##Drop every DMSO row from both assays and renumber the rows
df_level4_cp = df_level4_cp.loc[df_level4_cp['pert_iname'].ne('DMSO')].reset_index(drop=True)
df_level4_L1 = df_level4_L1.loc[df_level4_L1['pert_iname'].ne('DMSO')].reset_index(drop=True)

In [9]:
# (rows, columns) of the Cell Painting profiles after filtering and DMSO removal
df_level4_cp.shape

(38273, 812)

In [10]:
# (rows, columns) of the L1000 profiles after filtering and DMSO removal
df_level4_L1.shape

(23263, 988)

In [11]:
# Lower-case the MOA labels in both assays so that differences in
# capitalisation do not produce distinct labels. The vectorised .str accessor
# is used instead of apply(lambda x: x.lower()) because it leaves missing
# values as NaN rather than raising AttributeError on them.
df_level4_cp['moa'] = df_level4_cp['moa'].str.lower()
df_level4_L1['moa'] = df_level4_L1['moa'].str.lower()

In [12]:
#map each compound to its MOA annotation -- df_level4_cp and df_level4_L1 can be used interchangeably here
df_cpds_moas = df_level4_cp[['pert_iname', 'moa']].drop_duplicates()
# if a compound appears with more than one MOA string, the last one wins
cpds_moa = df_cpds_moas.set_index('pert_iname')['moa'].to_dict()

In [13]:
# Number of compounds that received an MOA mapping
len(cpds_moa)

1258

In [14]:
# Build the train/test split table from the compound -> MOA mapping.
# NOTE(review): split_cpds_moas is imported from split_compounds and its logic
# is not visible here -- confirm how compounds are assigned before relying on it.
df_pert_cpds_moas = split_cpds_moas(cpds_moa)

In [15]:
# Display the split table returned by split_cpds_moas
df_pert_cpds_moas

Unnamed: 0,pert_iname,moa,train,test,marked
0,ketoprofen,cyclooxygenase inhibitor,True,False,True
1,valdecoxib,cyclooxygenase inhibitor,False,True,True
2,epirizole,cyclooxygenase inhibitor,True,False,True
3,ketorolac,cyclooxygenase inhibitor,True,False,True
4,balsalazide,cyclooxygenase inhibitor,True,False,True
...,...,...,...,...,...
1566,amiprilose,cd antagonist,True,False,True
1567,indapamide,thiazide diuretic,True,False,True
1568,oseltamivir-carboxylate,neuraminidase inhibitor,True,False,True
1569,ingenol-mebutate,pkc activator,True,False,True


In [16]:
df_pert_cpds_moas.loc[df_pert_cpds_moas['test'], 'moa'].nunique(dropna=False) ##moas in the test data

246

In [17]:
def get_moa_count(df):
    """
    Sum every non-name column of the split table per MOA.

    The ``pert_iname`` column is dropped, the remaining rows are grouped by
    ``moa`` and each other column is summed, so boolean flag columns such as
    train/test become per-MOA compound counts.

    Returns a new dataframe with ``moa`` as a regular column and one row per
    MOA; the input dataframe is not modified.
    """
    flags_only = df.drop(columns=['pert_iname'])
    per_moa_totals = flags_only.groupby('moa').sum()
    return per_moa_totals.reset_index()

In [18]:
def get_test_ratio(df):
    """
    Return the train-to-test count ratio for a single row.

    ``df`` is one row (anything indexable by the keys "train" and "test").
    The result is ``train / test`` when the test count is positive; rows with
    no test entries yield 0 so that no division by zero occurs.
    """
    n_test = df['test']
    return df["train"] / n_test if n_test > 0 else 0

In [19]:
# Per-MOA sums of the non-name columns of the split table
df_moa_count = get_moa_count(df_pert_cpds_moas)

In [20]:
# Row-wise train/test ratio per MOA (get_test_ratio returns 0 when an MOA has no test compounds)
df_moa_count['test_ratio'] = df_moa_count.apply(get_test_ratio, axis=1)

In [21]:
##Every MOA in the test data should also occur in the train data, so this selection is expected to be empty
df_moa_count.loc[df_moa_count['train'].eq(0) & df_moa_count['test'].ge(1)]

Unnamed: 0,moa,train,test,marked,test_ratio


In [22]:
##MOAs shared by more than one compound (> 1) that are present in the train set but absent from the test set
df_moa_count.loc[df_moa_count['train'].gt(1) & df_moa_count['test'].eq(0)]

Unnamed: 0,moa,train,test,marked,test_ratio
405,quorum sensing signaling modulator,2,0,2,0.0
408,reducing agent,4,0,4,0.0
487,unidentified pharmacological activity,2,0,2,0.0
494,vitamin b,2,0,2,0.0


In [23]:
df_pert_cpds_moas.loc[df_pert_cpds_moas['train'], 'pert_iname'].nunique(dropna=False) ##no of compounds in train data

941

In [24]:
df_pert_cpds_moas.loc[df_pert_cpds_moas['test'], 'pert_iname'].nunique(dropna=False) ##no of compounds in test data

317

In [25]:
def save_to_csv(df, path, file_name, compress=None):
    """
    Write a dataframe to ``<path>/<file_name>`` as CSV, without the index.

    Parameters
    ----------
    df : dataframe to write.
    path : output directory; created (including missing parents) if absent.
    file_name : name of the CSV file inside ``path``.
    compress : forwarded to ``DataFrame.to_csv(compression=...)``; the default
        ``None`` writes an uncompressed file.
    """
    # makedirs, unlike mkdir, also creates missing parent directories, and
    # exist_ok=True avoids failing when the directory already exists
    os.makedirs(path, exist_ok=True)

    df.to_csv(os.path.join(path, file_name), index=False, compression=compress)

In [26]:
# Write the split table to data/split_moas_cpds.csv (uncompressed, index not written)
save_to_csv(df_pert_cpds_moas, "data", 'split_moas_cpds.csv')