# Stratify Data into Training and Testing Sets

**Gregory Way, 2019**

Split the input data into training and testing sets balanced by cell line.

We generate training and test sets from median and MODZ consensus profiles.
The split is computed once on the median profiles, and the same training and testing samples (identified by `Metadata_profile_id`) are then reused for the MODZ profiles.

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pycytominer.get_na_columns import get_na_columns

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness in
# this notebook is repeatable. The train/test split below additionally
# passes its own explicit random_state.
np.random.seed(123)

In [3]:
# Fraction of the profiles that is held out as the test set.
test_proportion = 0.15

# Input profiles are read from the "1.generate-profiles" directory, which
# sits next to the working directory (one level up, then down).
data_dir = os.path.join(os.pardir, "1.generate-profiles", "data")

## Load Data

### Median Consensus Profiles

In [4]:
# Cell Painting median consensus profiles (the `x` data).
consensus_dir = os.path.join(data_dir, "consensus")
file = os.path.join(consensus_dir, "cell_painting_median.tsv.gz")
x_median_df = pd.read_csv(file, sep="\t")

# Report (rows, columns), then preview the first two profiles.
print(x_median_df.shape)
x_median_df.head(n=2)

(357, 952)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,0.596128,0.39128,0.463658,-0.221246,1.0115,-0.305663,-0.440232,...,0.647205,0.827639,0.863358,0.929565,0.886017,0.985453,0.892091,1.071022,1.072497,1.120483
1,profile_1,A549,AKT1-2,0.515609,-0.156584,0.092082,0.330569,-0.208782,0.083119,0.506794,...,-0.071948,0.597993,0.455386,0.537999,0.664216,0.527794,0.528653,0.608856,0.747248,0.555587


In [5]:
# Cell Health median consensus profiles (the `y` data).
consensus_dir = os.path.join(data_dir, "consensus")
file = os.path.join(consensus_dir, "cell_health_median.tsv.gz")
y_median_df = pd.read_csv(file, sep="\t")

# Report (rows, columns), then preview the first two profiles.
print(y_median_df.shape)
y_median_df.head(n=2)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,0.008156,0.587977,0.01882,0.381501,0.176564,0.187675,-0.170616,...,0.399842,0.0,-0.118976,-0.132871,-0.12109,0.0,0.0,0.132882,0.80697,1.293984
1,profile_1,AKT1-2,A549,0.056667,1.264627,0.24145,0.568443,0.235304,0.372684,-0.276888,...,0.10167,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867


#### Split into Training and Testing Sets

In [6]:
# Hold out `test_proportion` of the profiles for testing, stratified on cell
# line so both partitions keep the same cell-line mix. The fixed random_state
# makes the partition repeatable.
# NOTE(review): x and y rows are split together by position, so this assumes
# x_median_df and y_median_df list the profiles in the same order - confirm.
median_split = train_test_split(
    x_median_df,
    y_median_df,
    test_size=test_proportion,
    stratify=y_median_df["Metadata_cell_line"],
    random_state=42,
)
x_train_df, x_test_df, y_train_df, y_test_df = median_split

In [7]:
# Report (rows, columns) of the training features, then the testing features.
print(x_train_df.shape, x_test_df.shape, sep="\n")

(303, 952)
(54, 952)


In [8]:
# Record which profile ids fell into each partition (in split order) so the
# same partition can be applied to other profile tables.
training_samples, testing_samples = (
    split_df["Metadata_profile_id"].tolist()
    for split_df in (x_train_df, x_test_df)
)

In [9]:
# Write the four median-profile partitions as tab-separated files under
# data/. The data frame index is not written.
median_outputs = (
    ("x_train_median.tsv.gz", x_train_df),
    ("y_train_median.tsv.gz", y_train_df),
    ("x_test_median.tsv.gz", x_test_df),
    ("y_test_median.tsv.gz", y_test_df),
)
for file_name, split_df in median_outputs:
    file = os.path.join("data", file_name)
    split_df.to_csv(file, sep="\t", index=False)

### MODZ Consensus Profiles

In [10]:
# Cell Painting MODZ consensus profiles (the `x` data).
consensus_dir = os.path.join(data_dir, "consensus")
file = os.path.join(consensus_dir, "cell_painting_modz.tsv.gz")
x_consensus_df = pd.read_csv(file, sep="\t")

# Report (rows, columns), then preview the first two profiles.
print(x_consensus_df.shape)
x_consensus_df.head(n=2)

(357, 952)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,...,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_20_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_20_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_DNA_5_0
0,profile_0,A549,AKT1-1,-0.18016,-0.155631,0.014646,0.188053,1.231056,0.031064,-0.585477,...,0.562585,0.988876,0.87995,0.904785,0.906875,0.923143,0.944998,0.984938,1.122724,0.961945
1,profile_1,A549,AKT1-2,0.370572,-0.247842,-0.030773,0.433778,0.062456,0.26686,0.838679,...,0.018933,0.446225,0.359496,0.557998,0.631931,0.504751,0.407462,0.522251,0.64437,0.519441


In [11]:
# Cell Health MODZ consensus profiles (the `y` data).
consensus_dir = os.path.join(data_dir, "consensus")
file = os.path.join(consensus_dir, "cell_health_modz.tsv.gz")
y_consensus_df = pd.read_csv(file, sep="\t")

# Report (rows, columns), then preview the first two profiles.
print(y_consensus_df.shape)
y_consensus_df.head(n=2)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,...,0.438339,0.059414,-0.06505,-0.020236,-0.00797,0.082424,0.0,0.020263,0.408214,0.654575
1,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,...,0.067568,0.256141,0.575026,0.225091,0.220461,0.132834,0.386327,-0.224965,0.284962,0.567898


#### Split into Training and Testing Sets

In [12]:
def _select_profiles(profile_df, profile_ids, index):
    """Return the rows of `profile_df` for `profile_ids`, in that order.

    Rows are looked up by their `Metadata_profile_id` value rather than by
    integer row label, so the result does not depend on `profile_df` listing
    the profiles in the same row order as the table the split was made on.

    Parameters
    ----------
    profile_df : pandas.DataFrame
        Table with a `Metadata_profile_id` column.
    profile_ids : list
        Profile ids to select; the output rows follow this order.
    index : pandas.Index
        Row labels to assign to the result, one per requested profile id.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the same columns as `profile_df`.

    Raises
    ------
    ValueError
        If `profile_df` contains duplicated profile ids.
    KeyError
        If a requested profile id is not present in `profile_df`.
    """
    # drop=False keeps Metadata_profile_id as a regular column, so the
    # columns written to disk afterwards are unchanged.
    by_profile = profile_df.set_index("Metadata_profile_id", drop=False)

    if not by_profile.index.is_unique:
        raise ValueError("Metadata_profile_id values are not unique")

    # Fail loudly instead of producing all-NaN rows for unknown profiles.
    missing = [pid for pid in profile_ids if pid not in by_profile.index]
    if missing:
        raise KeyError(
            "profile ids missing from table: {}".format(missing)
        )

    selected = by_profile.loc[profile_ids]
    # Restore the row labels of the corresponding median split.
    selected.index = index
    return selected


# Apply the partition defined on the median profiles to the MODZ profiles.
# The median split frames are read here only for their row labels and are
# then replaced by their MODZ counterparts.
x_train_df = _select_profiles(x_consensus_df, training_samples, x_train_df.index)
y_train_df = _select_profiles(y_consensus_df, training_samples, y_train_df.index)

x_test_df = _select_profiles(x_consensus_df, testing_samples, x_test_df.index)
y_test_df = _select_profiles(y_consensus_df, testing_samples, y_test_df.index)

In [13]:
# Report (rows, columns) of the training features, then the testing features.
print(x_train_df.shape, x_test_df.shape, sep="\n")

(303, 952)
(54, 952)


In [14]:
# Write the four MODZ-profile partitions as tab-separated files under data/.
# The data frame index is not written.
modz_outputs = (
    ("x_train_modz.tsv.gz", x_train_df),
    ("y_train_modz.tsv.gz", y_train_df),
    ("x_test_modz.tsv.gz", x_test_df),
    ("y_test_modz.tsv.gz", y_test_df),
)
for file_name, split_df in modz_outputs:
    file = os.path.join("data", file_name)
    split_df.to_csv(file, sep="\t", index=False)