# Stratify Data into Training and Testing Sets

**Gregory Way, 2019**

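Split the consensus Cell Painting profiles (x) and cell health readouts (y) into training and testing sets, stratified by cell line so that both sets keep the same cell line composition.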

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pycytominer.get_na_columns import get_na_columns

In [2]:
# Seed NumPy's global random number generator so that NumPy-based randomness
# later in the notebook is repeatable from run to run.
np.random.seed(seed=123)

In [3]:
# Fraction of profiles held out for the test set (used as `test_size` in the
# train/test split further down).
test_proportion = 0.15

# Location of the profiles written by the "1.generate-profiles" module,
# expressed relative to the current working directory.
data_dir = os.path.join(os.pardir, "1.generate-profiles", "data")

## Load Data

In [4]:
# Load the consensus Cell Painting profiles (tab-separated file)
file = os.path.join(data_dir, "consensus", "cell_painting_modz.tsv.gz")
x_consensus_df = pd.read_csv(file, delimiter="\t")

# Remember the total column count (metadata columns included) so the effect
# of the later feature subsetting can be reported.
num_original_features = len(x_consensus_df.columns)

print(x_consensus_df.shape)
x_consensus_df.head(n=2)

(357, 1284)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,profile_0,A549,AKT1-1,0.07193,0.058673,-0.048603,0.0,0.092972,0.11407,0.117479,...,0.933031,1.271866,1.1438,1.246878,0.649138,0.590802,0.590566,0.342978,0.164454,0.529136
1,profile_1,A549,AKT1-2,-0.138047,0.396241,0.464852,0.0,0.268839,-0.156438,-0.116528,...,0.526123,0.371192,0.194148,0.412839,0.012015,0.055485,0.089602,0.162255,0.147265,0.086007


In [5]:
# Load the consensus cell health readouts (tab-separated file)
file = os.path.join(data_dir, "consensus", "cell_health_modz.tsv.gz")
y_consensus_df = pd.read_csv(file, delimiter="\t")

print(y_consensus_df.shape)
y_consensus_df.head(n=2)

(357, 73)


Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_n_spots_h2ax_mean,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_mean,cc_all_n_spots_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_all_early_apoptosis,vb_percent_all_late_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,-0.005795,0.580351,0.013975,0.381958,0.150696,0.162511,-0.167603,...,0.438339,0.059414,0.082424,0.0,-0.06505,-0.020236,-0.00797,0.020263,0.408214,0.654575
1,profile_1,AKT1-2,A549,0.050169,1.27773,0.241808,0.577422,0.220829,0.366989,-0.278044,...,0.067568,0.256141,0.132834,0.386327,0.575026,0.225091,0.220461,-0.224965,0.284962,0.567898


## Subset Features to Those Also Measured in the Repurposing Data

In [6]:
# Note, these files are not yet public!
# The profile directory below is an absolute path on a specific machine;
# point it at a local copy of the repurposing profiles before running.
repurposing_project_id = "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad"
example_plate = "SQ00015058"

# Path components, in order; the project id appears twice in the layout.
profile_dir_parts = [
    "/Users",
    "gway",
    "work",
    "projects",
    repurposing_project_id,
    "workspace",
    "software",
    repurposing_project_id,
    "subsampling",
    "data",
    "profiles",
]
repurposing_profile_dir = os.path.join(*profile_dir_parts)

# A single example plate is enough here: only its column names are needed to
# know which features the repurposing profiles contain.
plate_dir = os.path.join(repurposing_profile_dir, example_plate, "n_all")
plate_file_name = "{}_subsample_all_normalized.csv".format(example_plate)
example_plate_file = os.path.join(plate_dir, plate_file_name)
repurposing_df = pd.read_csv(example_plate_file)

print(repurposing_df.shape)
repurposing_df.head(n=5)

(384, 1351)


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Image_Metadata_Plate,Image_Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,C-7161-01-LM6-028,,,,DMSO,SQ00015058,A01,0.067943,0.723845,0.627867,...,0.388595,-0.220344,-0.021542,-0.302676,-0.417957,-0.36199,-0.442155,-0.087422,-0.134215,-0.104054
1,C-7161-01-LM6-028,,,,DMSO,SQ00015058,A02,0.210561,1.64894,-0.426731,...,0.556791,-0.160838,-0.004258,-0.203185,-0.158496,-0.149739,-0.219452,-0.066237,-0.224963,-0.011944
2,C-7161-01-LM6-028,,,,DMSO,SQ00015058,A03,-0.155591,0.329824,1.363809,...,0.827955,0.262965,0.309588,0.150367,-0.092583,-0.165437,-0.044953,-0.271777,-0.299115,-0.183484
3,C-7161-01-LM6-028,,,,DMSO,SQ00015058,A04,-0.136381,0.569663,-0.062554,...,0.534781,-0.044292,-0.143393,0.01694,-0.152441,-0.157008,-0.074708,-0.407897,-0.472541,-0.271706
4,C-7161-01-LM6-028,,,,DMSO,SQ00015058,A05,0.549936,1.357707,0.438191,...,-0.085308,-0.15788,-0.278797,-0.26535,-0.375957,-0.369248,-0.370915,-0.322905,-0.16942,-0.282563


In [7]:
# Columns whose names do not start with "Metadata", taken from each table
repurposing_features = repurposing_df.columns[
    ~repurposing_df.columns.str.startswith("Metadata")
]
consensus_features = x_consensus_df.columns[
    ~x_consensus_df.columns.str.startswith("Metadata")
]

# Keep only the features present in BOTH tables, in sorted order so the
# resulting column order is deterministic.
cp_features = sorted(set(repurposing_features) & set(consensus_features))

len(cp_features)

1278

In [8]:
# Metadata columns come first, followed by the shared Cell Painting features
is_metadata = x_consensus_df.columns.str.startswith("Metadata")
meta_cols = list(x_consensus_df.columns[is_metadata])
x_consensus_df = x_consensus_df[meta_cols + cp_features]

# Total column count after subsetting (metadata columns included)
num_subset_features = len(x_consensus_df.columns)

print(x_consensus_df.shape)
x_consensus_df.head(n=5)

(357, 1281)


Unnamed: 0,Metadata_profile_id,Metadata_cell_line,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,profile_0,A549,AKT1-1,0.07193,0.058673,-0.048603,0.0,0.092972,0.11407,0.117479,...,0.933031,1.271866,1.1438,1.246878,0.649138,0.590802,0.590566,0.342978,0.164454,0.529136
1,profile_1,A549,AKT1-2,-0.138047,0.396241,0.464852,0.0,0.268839,-0.156438,-0.116528,...,0.526123,0.371192,0.194148,0.412839,0.012015,0.055485,0.089602,0.162255,0.147265,0.086007
2,profile_2,A549,ARID1B-1,0.178516,0.739506,-0.348868,0.0,-0.481855,0.321717,0.120957,...,-0.521309,-0.019196,0.134715,-0.100963,-0.049586,0.076895,-0.202828,0.331005,0.200163,0.346467
3,profile_3,A549,ARID1B-2,0.486482,-0.019855,0.370251,0.0,-0.373809,0.62825,0.41706,...,-0.018916,-0.361304,-0.291002,-0.287105,-0.12557,-0.115333,0.073608,0.619306,0.429432,0.680737
4,profile_4,A549,ATF4-1,3.42064,-0.124241,-0.071464,0.0,0.46515,3.456872,3.254955,...,0.417267,-0.250491,0.05976,-0.144306,0.012049,-0.202045,0.220401,1.356764,2.36021,0.349848


In [9]:
# Both counts are total column counts of x_consensus_df (before and after the
# subsetting step), so their difference is the number of columns dropped.
print(
    "subsetting by repurposing features removed {} features".format(
        num_original_features - num_subset_features
    )
)

subsetting by repurposing features removed 3 features


## Split into Training and Testing

In [10]:
# train_test_split pairs the rows of x and y purely by position (it only
# requires equal lengths), so verify that both tables list the same profiles
# in the same order; otherwise features and labels would be silently
# mismatched in every downstream model.
if x_consensus_df.Metadata_profile_id.tolist() != y_consensus_df.Metadata_profile_id.tolist():
    raise ValueError(
        "x_consensus_df and y_consensus_df are not row-aligned on Metadata_profile_id"
    )

# Hold out `test_proportion` of the profiles for testing. Stratifying on the
# cell line keeps the cell line composition the same in both splits, and the
# fixed `random_state` makes the split reproducible.
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(
    x_consensus_df,
    y_consensus_df,
    test_size=test_proportion,
    stratify=y_consensus_df.Metadata_cell_line,
    random_state=42,
)

In [11]:
# Report the (rows, columns) of the training and testing feature tables,
# one per line.
print(x_train_df.shape, x_test_df.shape, sep="\n")

(303, 1281)
(54, 1281)


In [12]:
# Make sure the output directory exists: `to_csv` does not create missing
# directories, so the writes below would fail on a fresh checkout.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Each split is written as a tab-separated file without the index column.
# The ".gz" suffix lets pandas infer gzip compression from the file name.
splits = (
    ("x_train", x_train_df),
    ("y_train", y_train_df),
    ("x_test", x_test_df),
    ("y_test", y_test_df),
)

for split_name, split_df in splits:
    file = os.path.join(output_dir, "{}.tsv.gz".format(split_name))
    split_df.to_csv(file, sep="\t", index=False)