# Split the Cell Painting Data into Training/Testing Sets

Split the data 90% training and 10% testing, stratified by platemap. No separate validation set is created in this notebook.

In [3]:
import sys
import pathlib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

sys.path.insert(0, "../../scripts")
from utils import transform

In [12]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [13]:
# Destination folder for the split files; created on first run
output_dir = pathlib.Path("data")
output_dir.mkdir(exist_ok=True)

# Split settings
seed = 9876  # passed as random_state to the train/test split
test_split = 0.1  # fraction of profiles held out as the test set

# Operations handed to pycytominer's feature_select
feature_select_opts = [
    "variance_threshold",
    "blacklist",
    "drop_na_columns",
    "drop_outliers",
]

<IPython.core.display.Javascript object>

In [14]:
# Which consensus profiles to download from the lincs-cell-painting repository
batch = "2016_04_01_a549_48hr_batch1"
consensus = "modz"
commit_hash = "27a2d7dd74067b5754c2c045e9b1a9cfb0581ae4"  # pins the data version

# This platemap showed particular technical issues, so it is excluded from
# downstream consideration. Details:
# https://github.com/broadinstitute/lincs-cell-painting/issues/43
filter_platemap = "C-7161-01-LM6-011"

<IPython.core.display.Javascript object>

In [15]:
# Load data
# Build the download URL for the consensus profiles at the pinned commit.
base_url = (
    "https://media.githubusercontent.com/media/broadinstitute/lincs-cell-painting/"
)
# base_url already ends with "/", so no separator is added before commit_hash;
# this keeps a doubled "//" out of the URL path.
repurp_url = (
    f"{base_url}{commit_hash}/consensus/{batch}/{batch}_consensus_{consensus}.csv.gz"
)

# Read the profiles and drop every row belonging to the platemap named in
# `filter_platemap`.
complete_consensus_df = pd.read_csv(repurp_url).query(
    "Metadata_Plate_Map_Name != @filter_platemap"
)

# Add a single string key of the form "<broad_sample>_dose_<dose_recode>"
complete_consensus_df = complete_consensus_df.assign(
    Metadata_unique_id=complete_consensus_df.Metadata_broad_sample
    + "_dose_"
    + complete_consensus_df.Metadata_dose_recode.astype(str)
)

print(complete_consensus_df.shape)
complete_consensus_df.head(2)

(10368, 1789)


Unnamed: 0,Metadata_Plate_Map_Name,Metadata_broad_sample,Metadata_pert_well,Metadata_mmoles_per_liter,Metadata_dose_recode,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0,Metadata_unique_id
0,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B13,10.0,6,0.590843,-0.291888,0.197,0.02541,-0.314847,...,-1.06781,-0.953925,-1.12625,-0.539351,-0.413102,-0.587832,0.822261,0.738317,0.87822,BRD-A25234499-001-18-3_dose_6
1,C-7161-01-LM6-001,BRD-A25234499-001-18-3,B14,3.3333,5,0.953997,0.615666,-0.003971,0.362362,-0.29934,...,-0.884445,-0.676961,-0.97964,-0.900894,-0.89717,-0.787205,-0.673075,-0.747765,-0.346591,BRD-A25234499-001-18-3_dose_5


<IPython.core.display.Javascript object>

In [16]:
# Perform feature selection
# The options are gathered in one mapping and unpacked into the call.
# NOTE(review): corr_threshold / corr_method look relevant only to a
# correlation-based operation, and none is listed in feature_select_opts - confirm.
feature_select_kwargs = dict(
    features="infer",
    samples="none",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=0,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
)
complete_consensus_df = feature_select(
    profiles=complete_consensus_df, **feature_select_kwargs
)

# Shape after selection, for comparison with the shape printed at load time
print(complete_consensus_df.shape)

(10368, 594)


<IPython.core.display.Javascript object>

In [17]:
# Zero One Normalize Data
# `transform` is imported from utils (found via ../../scripts); its
# implementation is not visible in this notebook.
# NOTE(review): this runs on the complete dataset before the train/test split.
# If `transform` fits its scaling on the data it receives, test-set statistics
# would influence the training data - confirm this is intended.
complete_consensus_df = transform(complete_consensus_df)

<IPython.core.display.Javascript object>

In [18]:
# Split data
# Stratify on the platemap name so the split is balanced by platemap
platemap_labels = complete_consensus_df.Metadata_Plate_Map_Name

train_df, test_df = train_test_split(
    complete_consensus_df,
    test_size=test_split,
    random_state=seed,
    stratify=platemap_labels,
)

<IPython.core.display.Javascript object>

In [19]:
# Report (rows, columns) for the training split, then the test split
for split_df in (train_df, test_df):
    print(split_df.shape)

(9331, 594)
(1037, 594)


<IPython.core.display.Javascript object>

In [20]:
# Output data splits
train_file = output_dir / "cell_painting_train.tsv.gz"
test_file = output_dir / "cell_painting_test.tsv.gz"
complete_file = output_dir / "cell_painting_complete.tsv.gz"

# Every frame is written the same way: tab-separated, no index column,
# floats formatted with "%.5g".
for split_df, split_file in (
    (train_df, train_file),
    (test_df, test_file),
    (complete_consensus_df, complete_file),
):
    split_df.to_csv(split_file, sep="\t", index=False, float_format="%.5g")

<IPython.core.display.Javascript object>