# Split the L1000 Data into Training/Testing/Validation Sets

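Split the data into 80% training, 10% testing, and 10% validation sets using a seeded random shuffle. No stratification is currently applied, so the splits are not explicitly balanced by platemap.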

In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

sys.path.insert(0, "../../scripts")
from utils import transform, infer_L1000_features

In [2]:
# %load_ext nb_black

In [3]:
# Seed passed as random_state to both train_test_split calls so the splits are reproducible
seed = 9876
# Fraction of rows held out from training; the holdout is later halved into test and validation
test_split = 0.2

# Directory (relative to the working directory) that receives the split files
output_dir = pathlib.Path("data")
# Create it if missing; exist_ok avoids an error when the notebook is re-run
output_dir.mkdir(exist_ok=True)

In [13]:
# Read the tab-separated L1000 profiles (.tsv.gz) from the ../1.process-data directory
l1000_path = "../1.process-data/data/L1000_adenyi.tsv.gz"
phase2_L1000_df = pd.read_csv(l1000_path, sep="\t")

# Report (rows, columns), then preview the first two rows as the cell output
print(phase2_L1000_df.shape)
phase2_L1000_df.head(2)

(27837, 978)


Unnamed: 0,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,213721_at,...,204662_at,203047_at,203343_at,217995_at,218450_at,212536_at,218529_at,211071_s_at,203341_at,205379_at
0,0.3547,-0.494,-0.1721,-0.0339,-0.4355,1.8263,-0.1316,0.0853,-0.466,-0.3113,...,-0.1345,-0.1423,0.1349,0.3684,-3.1025,-0.6226,-3.7452,-1.3157,1.0145,0.1046
1,-0.213,0.4931,-0.8768,-0.6968,-1.7018,-0.3779,-0.6745,-1.9799,-1.1429,-1.3309,...,0.5892,3.2737,-15.1256,0.5294,-0.6672,3.7304,0.6153,-1.49,0.975,-1.8243


In [6]:
# features = infer_L1000_features(phase2_L1000_df)
# meta_features = infer_L1000_features(phase2_L1000_df, metadata=True)

In [14]:
# Display the column labels of the loaded frame
phase2_L1000_df.columns

Index(['200814_at', '222103_at', '201453_x_at', '204131_s_at', '200059_s_at',
       '205067_at', '213702_x_at', '214435_x_at', '201334_s_at', '213721_at',
       ...
       '204662_at', '203047_at', '203343_at', '217995_at', '218450_at',
       '212536_at', '218529_at', '211071_s_at', '203341_at', '205379_at'],
      dtype='object', length=978)

In [15]:
# Rescale the data with the project's `transform` helper using operation "-1+1".
# NOTE(review): this cell was previously labelled "zero-one" normalization, which
# does not match the "-1+1" argument; presumably values end up in [-1, 1] —
# confirm against utils.transform.
# Every column is passed as a feature and no metadata columns are declared.
# The result overwrites phase2_L1000_df, so the unscaled values are not kept.
feature_columns = phase2_L1000_df.columns.to_list()
phase2_L1000_df = transform(
    phase2_L1000_df,
    features=feature_columns,
    meta_features=[],
    operation="-1+1",
)

In [16]:
# Hold out `test_split` (20%) of the rows; the remaining 80% become the training set.
# This is a plain seeded shuffle: stratifying on cell_id is disabled
# (the frame appears to contain only probe columns, so there would be no
# cell_id column to stratify on — TODO confirm).
train_df, test_df = train_test_split(phase2_L1000_df, random_state=seed, test_size=test_split)

In [17]:
# Halve the holdout: one half stays as the test set, the other half becomes validation.
# NOTE: this overwrites test_df with its own subset, so re-running this cell on its
# own halves the test set again — re-run the preceding split cell first.
# No stratification is applied here either (stratify on cell_id is disabled).
test_df, valid_df = train_test_split(test_df, random_state=seed, test_size=0.5)

In [18]:
# Report (rows, columns) for the train, test and validation frames, in that order
for split_df in (train_df, test_df, valid_df):
    print(split_df.shape)

(22269, 978)
(2784, 978)
(2784, 978)


In [21]:
# Output data splits
train_file = pathlib.Path(output_dir, "L1000adenyi_train.tsv.gz")
test_file = pathlib.Path(output_dir, "L1000adenyi_test.tsv.gz")
valid_file = pathlib.Path(output_dir, "L1000adenyi_valid.tsv.gz")
complete_file = pathlib.Path(output_dir, "L1000adenyi_complete.tsv.gz")

train_df.to_csv(train_file, sep="\t", index=False, float_format="%.5g")
test_df.to_csv(test_file, sep="\t", index=False, float_format="%.5g")
valid_df.to_csv(valid_file, sep="\t", index=False, float_format="%.5g")
phase2_L1000_df.to_csv(complete_file, sep="\t", index=False, float_format="%.5g")