### - Split the Cell Painting & L1000 data into train/test sets based on their compounds

In [1]:
import os
import requests
import pickle
import argparse
import pandas as pd
import numpy as np
import re
from os import walk
from collections import Counter
import random
import shutil

In [2]:
# Run with both "" and "_subsample" for the two Cell Painting input data types
file_indicator=""

In [3]:
# Directories holding the level-4 compound-replicate profile files for each assay
cp_data_path = '../../1.Data-exploration/Profiles_level4/cell_painting/cellpainting_lvl4_cpd_replicate_datasets/'
l1000_data_path = "../../1.Data-exploration/Profiles_level4/L1000/L1000_lvl4_cpd_replicate_datasets/"

# Directory holding the compound train/test assignment file (split_moas_cpds.csv)
cpd_split_path = '../1.compound_split_train_test/data'

In [4]:
# Cell Painting profiles: file_indicator picks the full ("") or "_subsample" file
cp_profiles_file = f'cp_level4_cpd_replicates{file_indicator}.csv.gz'
df_level4_cp = pd.read_csv(os.path.join(cp_data_path, cp_profiles_file), low_memory=False)

# L1000 profiles: a single file, read with explicit gzip decompression
l1000_profiles_file = 'L1000_level4_cpd_replicates.csv.gz'
df_level4_L1 = pd.read_csv(
    os.path.join(l1000_data_path, l1000_profiles_file), compression='gzip', low_memory=False
)

In [5]:
# Compound/MOA table whose boolean 'train' and 'test' columns assign each compound to a split
df_cpds_moas_lincs = pd.read_csv(os.path.join(cpd_split_path, 'split_moas_cpds.csv'))

In [6]:
df_cpds_moas_lincs.head()

Unnamed: 0,pert_iname,moa,train,test,marked
0,ketoprofen,cyclooxygenase inhibitor,True,False,True
1,valdecoxib,cyclooxygenase inhibitor,False,True,True
2,epirizole,cyclooxygenase inhibitor,True,False,True
3,ketorolac,cyclooxygenase inhibitor,True,False,True
4,balsalazide,cyclooxygenase inhibitor,True,False,True


In [7]:
# Unique compound names listed in the split table; used below to filter both assays
all_cpds = df_cpds_moas_lincs['pert_iname'].unique()

In [8]:
# Keep only the replicate profiles whose compound appears in the split table
cp_in_split = df_level4_cp['pert_iname'].isin(all_cpds)
df_level4_cp = df_level4_cp[cp_in_split].reset_index(drop=True)

l1_in_split = df_level4_L1['pert_iname'].isin(all_cpds)
df_level4_L1 = df_level4_L1[l1_in_split].reset_index(drop=True)

In [9]:
df_level4_cp.shape

(38273, 812)

In [10]:
df_level4_L1.shape

(23263, 988)

In [11]:
# Lower-case the MOA annotations (presumably to match the lower-case MOA names in the split
# table -- confirm). The vectorized .str accessor replaces apply+lambda and lets missing
# values pass through as NaN instead of raising AttributeError.
df_level4_cp['moa'] = df_level4_cp['moa'].str.lower()
df_level4_L1['moa'] = df_level4_L1['moa'].str.lower()

In [12]:
# Independent copy for building the MOA targets, so the split table itself stays untouched
df_cpds_moas = df_cpds_moas_lincs.copy()

In [13]:
len(df_cpds_moas['pert_iname'].unique()) ##no of compounds in the whole data

1258

In [14]:
len(df_cpds_moas['moa'].unique()) ##no of MOA

501

In [15]:
def create_moa_targets(df):
    """Create the binary multi-label MOA targets for each compound.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'pert_iname' (compound) column and a 'moa' column; each row
        pairs a compound with one of its MOAs. Other columns are ignored. The
        frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per unique 'pert_iname' (sorted), with 'pert_iname' as the first
        column followed by one integer column per unique MOA (sorted) holding 1
        if the compound has that MOA and 0 otherwise.
    """
    # Build the indicator on a two-column copy so the caller's frame does not
    # silently gain a helper 'val' column.
    pairs = df[['pert_iname', 'moa']].assign(val=1)
    targets = pd.pivot_table(
        pairs, values='val', index='pert_iname', columns='moa', fill_value=0
    )
    # The default aggregation is a mean, so cast back to integers to guarantee
    # 0/1 integer labels regardless of how the pivot fills missing pairs.
    targets = targets.astype(int)
    # Drop the 'moa' label from the column axis so the output has plain columns.
    return targets.rename_axis(columns=None).reset_index()

In [16]:
# Wide target table: one row per compound, one 0/1 column per MOA
df_moa_targets = create_moa_targets(df_cpds_moas)

In [17]:
df_moa_targets

Unnamed: 0,pert_iname,11-beta hydroxysteroid dehydrogenase inhibitor,11-beta-hsd1 inhibitor,"17,20 lyase inhibitor",3-ketoacyl coa thiolase inhibitor,3beta-hydroxy-delta5-steroid dehydrogenase inhibitor,5 alpha reductase inhibitor,abl kinase inhibitor,acat inhibitor,acetylcholine precursor,...,vasopressin receptor antagonist,vegfr inhibitor,vesicular monoamine transporter inhibitor,vitamin b,vitamin d receptor agonist,vitamin k antagonist,voltage-gated sodium channel blocker,voltage-gated sodium channel modulator,xanthine oxidase inhibitor,xiap inhibitor
0,17-hydroxyprogesterone-caproate,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2-iminobiotin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3-amino-benzamide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3-deazaadenosine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,abacavir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,zk811752,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1254,zofenopril-calcium,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1255,zolpidem,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1256,zonisamide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Attach the per-compound MOA target columns to every replicate profile
# (inner join on the compound name, which is the default join type)
df_level4_cp = pd.merge(df_level4_cp, df_moa_targets, on='pert_iname', how='inner')
df_level4_L1 = pd.merge(df_level4_L1, df_moa_targets, on='pert_iname', how='inner')

In [19]:
df_level4_cp.shape

(38273, 1313)

In [20]:
df_level4_L1.shape

(23263, 1489)

### - Compound split (80/20) based on MOAs, taken from split_moas_cpds.csv

In [21]:
# Compound names flagged for each split by the boolean 'train' / 'test' columns
train_cpds = df_cpds_moas_lincs.loc[df_cpds_moas_lincs['train'], 'pert_iname'].unique()
test_cpds = df_cpds_moas_lincs.loc[df_cpds_moas_lincs['test'], 'pert_iname'].unique()

In [22]:
len(train_cpds)

941

In [23]:
len(test_cpds)

317

In [24]:
def train_test_split(train_cpds, test_cpds, df):
    """Split `df` row-wise into train and test frames by compound name.

    Rows whose 'pert_iname' is in `train_cpds` go to the first returned frame,
    rows whose 'pert_iname' is in `test_cpds` go to the second. Both frames are
    returned with a fresh 0..n-1 index.
    """
    compound = df['pert_iname']
    in_train = compound.isin(train_cpds)
    in_test = compound.isin(test_cpds)

    train_rows = df[in_train].reset_index(drop=True)
    test_rows = df[in_test].reset_index(drop=True)
    return train_rows, test_rows

In [25]:
# Apply the same compound-level split to both assays
df_level4_cp_trn, df_level4_cp_tst = train_test_split(
    train_cpds=train_cpds, test_cpds=test_cpds, df=df_level4_cp
)
df_level4_L1_trn, df_level4_L1_tst = train_test_split(
    train_cpds=train_cpds, test_cpds=test_cpds, df=df_level4_L1
)

In [26]:
df_level4_cp_trn.shape

(27485, 1313)

In [27]:
df_level4_cp_tst.shape

(10788, 1313)

In [28]:
df_level4_L1_trn.shape

(16802, 1489)

In [29]:
df_level4_L1_tst.shape

(6461, 1489)

### - Shuffled train data (a second training set)
#### - Shuffle the target labels in the train data so that replicates of the same compound/MOA receive different (randomly reassigned) MOA labels

In [30]:
def create_shuffle_data(df_trn, target_cols, seed=None):
    """Create shuffled train data by randomly permuting the target rows.

    The block of `target_cols` is permuted row-wise as a whole (each row's set
    of labels stays together) and re-attached to the untouched non-target
    columns. Labels are therefore reassigned at random across rows; a row may
    by chance receive labels identical to its own.

    Parameters
    ----------
    df_trn : pandas.DataFrame
        Training data containing `target_cols`. It is not modified.
    target_cols : sequence of str
        Names of the target columns to shuffle.
    seed : int, optional
        If given, the permutation is drawn from a generator seeded with this
        value, making the result reproducible. If None (default), NumPy's
        global random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The non-target columns of `df_trn` followed by the shuffled target
        columns, on the same index as `df_trn`.
    """
    target_cols = list(target_cols)
    features = df_trn.drop(columns=target_cols)

    rng = np.random if seed is None else np.random.default_rng(seed)
    shuffled = rng.permutation(df_trn[target_cols].to_numpy())

    # Re-use the input index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign (and introduce NaN rows) whenever df_trn does
    # not itself carry a default index.
    shuffled_targets = pd.DataFrame(shuffled, columns=target_cols, index=df_trn.index)
    return pd.concat([features, shuffled_targets], axis=1)

In [31]:
# Every column after the first one ('pert_iname') is an MOA target column
target_cols = df_moa_targets.iloc[:, 1:].columns

In [32]:
# Shuffled-label versions of the train sets for both assays
df_lvl4_cp_trn_shuf = create_shuffle_data(df_trn=df_level4_cp_trn, target_cols=target_cols)
df_lvl4_L1_trn_shuf = create_shuffle_data(df_trn=df_level4_L1_trn, target_cols=target_cols)

In [33]:
df_lvl4_cp_trn_shuf.shape

(27485, 1313)

In [34]:
df_lvl4_L1_trn_shuf.shape

(16802, 1489)

#### - Save to CSV

In [35]:
def save_to_csv(df, path, file_name, compress=None):
    """Save a dataframe to `path/file_name` as CSV, without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write.
    path : str
        Output directory. It is created, including any missing parent
        directories, if it does not exist. An empty string writes to the
        current working directory.
    file_name : str
        Name of the CSV file inside `path`.
    compress : str, optional
        Passed to `DataFrame.to_csv` as `compression` (e.g. "gzip"). The
        default None writes an uncompressed file.
    """
    # makedirs handles nested paths such as "model_data/cp/" (os.mkdir fails
    # when the parent is missing), and exist_ok avoids the check-then-create race.
    if path:
        os.makedirs(path, exist_ok=True)

    df.to_csv(os.path.join(path, file_name), index=False, compression=compress)

In [36]:
# Cell Painting outputs: (dataframe, file name) pairs, all written gzip-compressed
for cp_frame, cp_file in (
    (df_level4_cp_trn, f'train_lvl4_data{file_indicator}.csv.gz'),
    (df_level4_cp_tst, f'test_lvl4_data{file_indicator}.csv.gz'),
    (df_lvl4_cp_trn_shuf, f'train_shuffle_lvl4_data{file_indicator}.csv.gz'),
):
    save_to_csv(cp_frame, "model_data/cp/", cp_file, compress="gzip")

In [37]:
# L1000 outputs: (dataframe, file name) pairs, all written gzip-compressed
for l1_frame, l1_file in (
    (df_level4_L1_trn, 'train_lvl4_data.csv.gz'),
    (df_level4_L1_tst, 'test_lvl4_data.csv.gz'),
    (df_lvl4_L1_trn_shuf, 'train_shuffle_lvl4_data.csv.gz'),
):
    save_to_csv(l1_frame, "model_data/L1/", l1_file, compress="gzip")

In [38]:
# The same target-label table is written (uncompressed) next to each assay's data
for labels_dir, labels_file in (
    ("model_data/cp/", f'target_labels{file_indicator}.csv'),
    ("model_data/L1/", 'target_labels.csv'),
):
    save_to_csv(df_moa_targets, labels_dir, labels_file)