# Test train split 

In this notebook, we split our labeled dataset into training, testing, and validation sets. To address the label imbalance introduced by the control group, we compute the average number of labeled cells per class and randomly select that many control cells to include in the training data. The resulting data splits are saved in the `./data/processed/split` directory.

We also use the `stratify` parameter in order to maintain the proportion of labels when splitting.

In [1]:
import json
import pathlib
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

# Register the project root on the import path *before* importing from `src`.
# Appending it after the import has no effect on that import, so on a fresh
# kernel the import only works if `src` is already importable by other means.
sys.path.append("../../")  # noqa

from src import analysis_utils  # noqa: E402

In [2]:
# notebook parameters
seed = 0  # random state reused by every sampling / splitting step below

# setting up paths
# input locations are resolved strictly so a missing folder fails right here
project_root_path = pathlib.Path("../../").resolve(strict=True)
data_dir = pathlib.Path("../../data").resolve(strict=True)
mitocheck_data_path = (data_dir / "raw/mitocheck_data/normalized_data").resolve(
    strict=True
)

# output locations: create them (including missing parent folders) up front so
# the later writes do not fail on a fresh checkout
config_dir = pathlib.Path("../../configs").resolve()
config_dir.mkdir(parents=True, exist_ok=True)

# adding to the processed data folder
data_split_dir = (data_dir / "processed/split/").resolve()
data_split_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the negative controls and tag every row with the "Negative" class label,
# inserted as the first column.
neg_df = pd.read_parquet(mitocheck_data_path / "negative_control_data.parquet")
neg_df.insert(loc=0, column="Mitocheck_Phenotypic_Class", value="Negative")

# Load the labeled training data and discard the "Unnamed: 0" column.
training_df = pd.read_parquet(mitocheck_data_path / "training_data.parquet")
training_df = training_df.drop(columns="Unnamed: 0")

In [4]:
# Separate each dataset into its metadata part and its feature part
# (negative controls first, then the labeled training data).
(neg_meta, neg_feats), (training_meta, training_feats) = (
    analysis_utils.split_mitocheck_features(frame) for frame in (neg_df, training_df)
)

This indicates that both controls have identical feature names. Now let's look at the training data.

In [5]:
# Compare the feature names in both directions: a feature present in only one of
# the two datasets (negative controls or training data) counts as non-intersecting.
# A one-sided difference would miss training features absent from the controls,
# which are exactly the columns used to index the controls further down.
check_feature_names_training = set(neg_feats).symmetric_difference(training_feats)
print(
    "number of non intersecting training features:", len(check_feature_names_training)
)

number of non intersecting training features: 0


By the looks of this, both feature spaces are identical, indicating that there is no need to identify shared features between the controls and the training dataset. Therefore, we are going to concatenate the training data with a random sub-sample of the negative controls.

In [6]:
# Use the average number of cells per phenotypic class in the training data as
# the number of negative controls to sample.
n_samples_for_controls = round(
    training_df["Mitocheck_Phenotypic_Class"].value_counts().mean()
)
sub_n_controls = neg_df.sample(n=n_samples_for_controls, random_state=seed)

# Stack the labeled cells and the sampled controls, keeping only the label column
# and the feature columns. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; otherwise the row labels of the sampled controls may repeat labels
# that are already used by training_df.
label_and_feature_cols = ["Mitocheck_Phenotypic_Class"] + training_feats
completed_training = pd.concat(
    [
        training_df[label_and_feature_cols],
        sub_n_controls[label_and_feature_cols],
    ],
    ignore_index=True,
)

# NOTE(review): `completed_training` is not referenced by any later cell visible in
# this notebook; the label encoding and the train/test/validation split are built
# from `training_df`. Confirm whether the sampled controls are meant to be part of
# the saved splits.

# display
print(completed_training.shape)
completed_training.head()

(3098, 1438)


Unnamed: 0,Mitocheck_Phenotypic_Class,CP__AreaShape_Area,CP__AreaShape_BoundingBoxArea,CP__AreaShape_BoundingBoxMaximum_X,CP__AreaShape_BoundingBoxMaximum_Y,CP__AreaShape_BoundingBoxMinimum_X,CP__AreaShape_BoundingBoxMinimum_Y,CP__AreaShape_Center_X,CP__AreaShape_Center_Y,CP__AreaShape_Compactness,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,2.514724,2.329739,-0.703055,0.394466,-0.70853,0.332127,-0.705193,0.366053,0.438754,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,3.493695,3.958694,-0.787151,0.280066,-0.818922,0.221209,-0.804212,0.24982,0.044005,...,-0.482883,-1.354858,-0.85668,-0.934949,0.725091,2.25545,-0.565433,1.628086,-0.605625,-0.748135
2,Large,2.752992,2.450258,-0.739847,0.630199,-0.745327,0.564362,-0.741842,0.597839,0.748203,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,Large,1.256048,2.636869,0.723946,0.078999,0.687149,0.054831,0.701374,0.068296,2.794339,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.74209,0.36552,0.643759,-1.906097,1.01937
4,Large,2.110705,2.971213,-0.482304,-1.304201,-0.503515,-1.35591,-0.495365,-1.324741,1.569087,...,0.950706,-0.811825,-0.522427,-1.402842,-0.28994,2.66125,0.126978,-0.824945,-0.494285,1.763332


In [7]:
# Give every phenotypic class an integer code, numbered in the order returned by
# `unique()`, and keep both lookup directions.
class_names = training_df["Mitocheck_Phenotypic_Class"].unique()
state_decoder_encoder = {
    "encoder": {name: code for code, name in enumerate(class_names)},
    "decoder": dict(enumerate(class_names)),
}

# Persist both lookup tables. JSON object keys are always strings, so the integer
# keys of the decoder are written as strings in the file.
codes_path = config_dir / "cell_state_codes.json"
with open(codes_path, mode="w") as f:
    json.dump(state_decoder_encoder, f)

In [8]:
# Replace each class name in the label column with its integer code.
# The column is overwritten in place, so this cell is meant to run once per kernel
# session: on a second run the labels are already codes, not encoder keys.
label_encoder = state_decoder_encoder["encoder"]
training_df["Mitocheck_Phenotypic_Class"] = training_df[
    "Mitocheck_Phenotypic_Class"
].map(lambda cell_state: label_encoder[cell_state])
training_df

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,0,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,0,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.856680,-0.934949,0.725091,2.255450,-0.565433,1.628086,-0.605625,-0.748135
2,0,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,0,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.742090,0.365520,0.643759,-1.906097,1.019370
4,0,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.950706,-0.811825,-0.522427,-1.402842,-0.289940,2.661250,0.126978,-0.824945,-0.494285,1.763332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,15,380728fc-28b0-423f-b8a7-07be1af590d9,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,0.549654,8.142944,1.619399,-1.521878,-0.182734,-1.608294,-0.783477,-2.613400,0.442609,1.977761
2912,15,30ed67c7-8de2-4d78-bce9-3fa1aff28565,975,294,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,0.358861,6.294227,1.827482,-0.997080,-0.614779,-1.270435,-1.335869,-0.560155,0.836314,3.473351
2913,15,2960b13e-6090-4592-b2a9-d1c4c1b24b50,898,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,0.570003,10.106912,1.130243,-1.288302,-0.956321,-1.409762,-0.058448,-0.025529,0.628679,1.657651
2914,15,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.023441,11.088221,2.068912,-0.977407,-1.108647,-1.399433,-2.744383,-2.037700,0.667556,2.438798


In [9]:
# Draw 30 random cells and keep only their feature columns as demo data;
# the CSV is written to the project root without the index column.
demo_rows = training_df.sample(n=30, random_state=seed)
demo_split = demo_rows[training_feats]
demo_split.to_csv(project_root_path / "demo_data.csv", index=False)

Next, let's remove the entries that were selected for the demo split from the main `training_df` in order to prevent data leakage.

In [10]:
# Take the demo cells out of training_df so they cannot end up in any split.
demo_row_labels = demo_split.index
training_df = training_df.drop(labels=demo_row_labels, axis="index")

# display
print(training_df.shape)
training_df.head()

(2886, 1450)


Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,0,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,0,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.85668,-0.934949,0.725091,2.25545,-0.565433,1.628086,-0.605625,-0.748135
2,0,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,0,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.74209,0.36552,0.643759,-1.906097,1.01937
4,0,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.950706,-0.811825,-0.522427,-1.402842,-0.28994,2.66125,0.126978,-0.824945,-0.494285,1.763332


## Splitting dataset into training, testing and validation sets 

In [11]:
# Feature matrix and encoded labels for every remaining cell
X = training_df[training_feats]
y = training_df["Mitocheck_Phenotypic_Class"]

# First cut: 80% training, 20% held out, stratified by label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y
)

# Second cut: halve the held-out part into testing and validation sets,
# again stratified by label
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=seed, stratify=y_temp
)

# Report the size of each split
split_report = (
    ("Training set shape:", X_train, y_train),
    ("Testing set shape:", X_test, y_test),
    ("Validation set shape:", X_val, y_val),
)
for title, features, labels in split_report:
    print(title, features.shape, labels.shape)

# Save the data: feature frames first, written to Parquet as they are
for frame, file_name in (
    (X_train, "X_train.parquet"),
    (X_test, "X_test.parquet"),
    (X_val, "X_val.parquet"),
):
    frame.to_parquet(data_split_dir / file_name)

# Label Series are converted to single-column frames before being written
for series, file_name in (
    (y_train, "y_train.parquet"),
    (y_test, "y_test.parquet"),
    (y_val, "y_val.parquet"),
):
    series.to_frame().to_parquet(data_split_dir / file_name)

Training set shape: (2308, 1437) (2308,)
Testing set shape: (289, 1437) (289,)
Validation set shape: (289, 1437) (289,)
