# Split the L1000 Data into Training/Testing/Validation Sets

Split the data into 80% training, 10% testing, and 10% validation sets, balanced (stratified) by cell line (`cell_id`).

In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from pycytominer import feature_select

sys.path.insert(0, "../../scripts")
from utils import transform, infer_L1000_features

In [2]:
# %load_ext nb_black

In [3]:
# Random state passed to both train_test_split calls so the partition is reproducible
seed = 9876
# Fraction of profiles held out of training; the held-out part is later
# divided equally into the test and validation sets
test_split = 0.2

# Directory that receives the output files (created if it does not exist yet)
output_dir = pathlib.Path("data")
output_dir.mkdir(exist_ok=True)

In [4]:
# Read the processed phase 2 L1000 profiles (tab-separated; pandas infers the
# gzip compression from the file extension)
phase2_L1000_file = pathlib.Path("../0B.process-data/data/L1000_phase2.tsv.gz")
phase2_L1000_df = pd.read_csv(phase2_L1000_file, sep="\t")

# Report the dimensions, then preview the first two profiles
print(phase2_L1000_df.shape)
phase2_L1000_df.head(2)

(345976, 981)


Unnamed: 0,cid,cell_id,pert_id,780,7849,6193,23,9552,387,10921,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
0,REP.A001_A375_24H_X1_B22:A03,A375,DMSO,14.8204,0.0,-3.5241,-0.3428,-0.7251,1.9822,0.0,...,1.0894,0.5645,-1.8231,0.387,0.375,-1.6591,1.0853,-0.672,4.2066,-0.8088
1,REP.A001_A375_24H_X1_B22:A04,A375,DMSO,-0.52,4.3134,-0.1895,-0.8369,0.2381,0.5766,0.0,...,-0.3895,-0.9399,-2.0347,-0.0523,0.484,1.0437,1.5125,0.8923,-0.3861,-0.7052


In [5]:
# Measurement features as determined by utils.infer_L1000_features
# (presumably the gene expression columns — TODO confirm against utils)
features = infer_L1000_features(phase2_L1000_df)
# Same helper with metadata=True (presumably returns the non-gene columns
# such as cid, cell_id and pert_id — TODO confirm against utils)
meta_features = infer_L1000_features(phase2_L1000_df, metadata=True)

In [6]:
# Rescale the feature columns with the "-1+1" operation of utils.transform.
# NOTE: this is not a zero-one normalization; the values displayed after this
# step include negatives (presumably a min-max scaling to [-1, 1] — confirm in utils).
phase2_L1000_df = transform(
    phase2_L1000_df,
    features=features,
    meta_features=meta_features,
    operation="-1+1",
)

In [7]:
# Display the transformed profiles (pandas truncates the rendering of large frames)
phase2_L1000_df

Unnamed: 0,cid,cell_id,pert_id,780,7849,6193,23,9552,387,10921,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
0,REP.A001_A375_24H_X1_B22:A03,A375,DMSO,0.223063,-0.437648,-0.239575,0.123498,0.150186,0.322459,0.541439,...,-0.265757,-0.190809,-0.269884,0.362559,-0.059494,-0.699902,0.051617,-0.301789,0.107861,-0.079371
1,REP.A001_A375_24H_X1_B22:A04,A375,DMSO,-0.294278,-0.312419,-0.087194,0.102535,0.188610,0.257436,0.541439,...,-0.306876,-0.236472,-0.277581,0.345204,-0.054869,-0.583246,0.069114,-0.242816,-0.043908,-0.076082
2,REP.A001_A375_24H_X1_B22:A05,A375,DMSO,-0.311709,-0.470298,-0.145467,0.156171,0.196788,0.178790,0.228516,...,-0.314800,-0.185777,-0.203564,0.427487,-0.193507,-0.657406,0.048636,-0.333125,-0.024345,-0.072546
3,REP.A001_A375_24H_X1_B22:A06,A375,DMSO,-0.286875,-0.451206,-0.098427,0.114397,0.161867,0.246871,0.568886,...,-0.291189,-0.245008,-0.256868,0.378318,-0.072282,-0.650517,0.057671,-0.312775,-0.076233,-0.074041
4,REP.A001_A375_24H_X1_B22:A07,A375,BRD-K25114078,-0.262702,-0.444108,-0.093464,0.177384,0.179918,0.192325,0.541439,...,-0.283257,-0.233695,-0.211538,0.321370,-0.158978,-0.630917,0.152374,-0.575464,-0.005681,-0.058052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345971,XPR002_YAPC.311_96H_X3_B22:P19,YAPC.311,BRDN0000986672,-0.149541,-0.442908,-0.069807,0.174041,0.205285,0.414206,0.571022,...,-0.310797,-0.206213,-0.240814,0.351280,-0.179891,-0.579245,0.008273,-0.320473,-0.032794,-0.035053
345972,XPR002_YAPC.311_96H_X3_B22:P21,YAPC.311,BRDN0000990361,-0.279834,-0.455993,-0.085234,0.138042,0.118564,0.234616,0.523930,...,-0.319521,-0.226067,-0.205299,0.358999,-0.109166,-0.690208,-0.012393,-0.269907,-0.018142,-0.075568
345973,XPR002_YAPC.311_96H_X3_B22:P22,YAPC.311,BRDN0000735497,-0.272647,-0.389402,-0.136159,0.225118,0.189743,0.224185,0.514980,...,-0.220126,-0.127477,-0.142926,0.423275,-0.124182,-0.603821,0.067230,-0.360890,-0.053438,-0.045576
345974,XPR002_YAPC.311_96H_X3_B22:P23,YAPC.311,BRDN0000733798,-0.317753,-0.416991,-0.209694,0.138581,0.093660,0.300837,0.545341,...,-0.253546,-0.254879,-0.220363,0.363594,-0.117143,-0.687334,-0.027035,-0.301359,0.474975,-0.043577


In [8]:
# Keep 80% of the profiles for training and hold out the remaining 20%
# (`test_split`); stratifying on cell_id keeps the cell line proportions
# the same in both partitions
train_df, test_df = train_test_split(
    phase2_L1000_df,
    test_size=test_split,
    random_state=seed,
    stratify=phase2_L1000_df["cell_id"],
)

In [9]:
# Divide the held-out 20% evenly: half stays as the test set and half becomes
# the validation set, again stratified on cell_id.
# NOTE: this overwrites `test_df`, so re-running this cell on its own halves
# the test set again; re-run the previous split cell first.
test_df, valid_df = train_test_split(
    test_df,
    test_size=0.5,
    random_state=seed,
    stratify=test_df["cell_id"],
)

In [10]:
# Report (rows, columns) for each partition in the order train, test, validation
for split_df in (train_df, test_df, valid_df):
    print(split_df.shape)

(276780, 981)
(34598, 981)
(34598, 981)


In [11]:
# Output data splits
# Destination files for the three partitions and for the complete normalized matrix
train_file = pathlib.Path(output_dir, "L1000PHASE2level4-1+1_train.tsv.gz")
test_file = pathlib.Path(output_dir, "L1000PHASE2level4-1+1_test.tsv.gz")
valid_file = pathlib.Path(output_dir, "L1000PHASE2level4-1+1_valid.tsv.gz")
complete_file = pathlib.Path(output_dir, "L1000PHASE2level4-1+1_complete.tsv.gz")

# Write every partition together with the complete matrix so that all files on
# disk come from the same run (same seed, same "-1+1" transform). The split
# writes were previously commented out, which meant a fresh run of the notebook
# never regenerated the train/test/validation files.
# float_format="%.5g" limits values to 5 significant digits to keep files small.
train_df.to_csv(train_file, sep="\t", index=False, float_format="%.5g")
test_df.to_csv(test_file, sep="\t", index=False, float_format="%.5g")
valid_df.to_csv(valid_file, sep="\t", index=False, float_format="%.5g")
phase2_L1000_df.to_csv(complete_file, sep="\t", index=False, float_format="%.5g")