## Determine Data Splits

We use batch 11 data to demonstrate proof of concept that we can identify features that distinguish resistant from sensitive clones.

In this notebook, I create a single dataset, using the following procedure:

1. Load batch 11 normalized (level 4a) data
2. Split the five wildtype (sensitive) and five resistant clones into training/testing sets
3. Keep the wildtype parental and clone A/E held out
4. Perform feature selection using the training data only
5. Also load the batch 3 data and add to the analytical set as an inference set

In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import infer_cp_features, write_gct

sys.path.insert(0, "../2.describe-data/scripts")
from processing_utils import load_data

In [2]:
# Seed NumPy's global random number generator so reruns of this notebook start
# from the same random state
random_seed = 1233
np.random.seed(random_seed)

In [3]:
# Input locations, relative to the directory this notebook runs from
profiles_root = pathlib.Path("..", "0.generate-profiles")
data_dir = profiles_root / "profiles"
cell_count_dir = profiles_root / "cell_counts"

# Directory that receives the analytical set written by this notebook
output_dir = pathlib.Path("data")

# File name suffix passed to `load_data` to pick which profile files to read
profile_suffix = "augmented.csv.gz"

# Operations handed to pycytominer's `feature_select`
feature_select_opts = [
    "variance_threshold", "correlation_threshold", "drop_na_columns",
    "blocklist", "drop_outliers",
]

# Feature selection thresholds
corr_threshold = 0.90
na_cutoff = 0

# Fraction of the train/test pool reserved for testing
test_set_size = 0.25

In [4]:
# Label stored in Metadata_dataset and embedded in each unique sample name
dataset = "bortezomib"

# Batch and plate identifying the profiles to load (the plate is kept as a string)
batch, plate = "2021_02_08_Batch11", "219814"

In [5]:
# Clones that form the train/test pool: BZ001-BZ005 followed by
# "WT clone 01" - "WT clone 05"
clone_ids = range(1, 6)
clones = [f"BZ{clone_id:03d}" for clone_id in clone_ids]
clones += [f"WT clone {clone_id:02d}" for clone_id in clone_ids]

In [6]:
# Load the profiles for the configured batch and plate. The options request a
# single combined data frame with harmonized columns and cell counts attached.
load_kwargs = dict(
    batch=batch,
    plates=plate,
    profile_dir=data_dir,
    suffix=profile_suffix,
    combine_dfs=True,
    harmonize_cols=True,
    add_cell_count=True,
    cell_count_dir=cell_count_dir,
)
df = load_data(**load_kwargs)

# Start every profile as a resistant, training-split sample...
df = df.assign(
    Metadata_dataset=dataset,
    Metadata_batch=batch,
    Metadata_clone_type="resistant",
    Metadata_clone_type_indicator=1,
    Metadata_model_split="training",
)

# ...then relabel every clone whose name contains "WT" as sensitive
is_wildtype = df.Metadata_clone_number.str.contains("WT")
df.loc[is_wildtype, "Metadata_clone_type"] = "sensitive"
df.loc[is_wildtype, "Metadata_clone_type_indicator"] = 0

# Row-position based identifier, used later to define the train/test split
df["Metadata_unique_sample_name"] = [
    f"profile_{row}_{dataset}" for row in range(len(df))
]

df.head()

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03,Metadata_dataset,Metadata_clone_type,Metadata_clone_type_indicator,Metadata_model_split,Metadata_unique_sample_name
0,219814,B02,2021_02_08_Batch11,13361,2.5x10^3 cells/well,1,WT_parental,20210205,219814,48 hr,...,0.029529,0.047538,0.046648,0.048181,0.046588,bortezomib,sensitive,0,training,profile_0_bortezomib
1,219814,B03,2021_02_08_Batch11,8065,2.5x10^3 cells/well,2,CloneA,20210205,219814,48 hr,...,0.008264,0.018875,0.018935,0.019347,0.019223,bortezomib,resistant,1,training,profile_1_bortezomib
2,219814,B04,2021_02_08_Batch11,11036,2.5x10^3 cells/well,3,CloneE,20210205,219814,48 hr,...,0.0,0.005396,0.005324,0.005326,0.005398,bortezomib,resistant,1,training,profile_2_bortezomib
3,219814,B05,2021_02_08_Batch11,14092,2.5x10^3 cells/well,4,WT clone 01,20210205,219814,48 hr,...,0.0,0.002238,0.002246,0.002289,0.002343,bortezomib,sensitive,0,training,profile_3_bortezomib
4,219814,B06,2021_02_08_Batch11,3168,2.5x10^3 cells/well,5,WT clone 02,20210205,219814,48 hr,...,0.17747,0.178579,0.176105,0.179812,0.175457,bortezomib,sensitive,0,training,profile_4_bortezomib


In [7]:
# Normalize with the WT_parental profiles as the reference samples, using the
# "standardize" method; feature and metadata columns are inferred by pycytominer
wt_parental_query = "Metadata_clone_number == 'WT_parental'"
normalize_kwargs = dict(
    features="infer",
    meta_features="infer",
    samples=wt_parental_query,
    method="standardize",
    output_file="none",
)
df = normalize(df, **normalize_kwargs)

In [8]:
# Keep only the profiles whose clone is listed in `clones`; this is the pool that
# is split into training and testing sets
in_clone_pool = df.Metadata_clone_number.isin(clones)
training_df = df.loc[in_clone_pool].reset_index(drop=True)

In [9]:
# Split the unique sample names into train and test sets, stratified by clone so
# each clone is represented in both splits
clone_labels = training_df.Metadata_clone_number.astype(str)
train_samples, test_samples = train_test_split(
    training_df.Metadata_unique_sample_name,
    test_size=test_set_size,
    stratify=clone_labels,
    random_state=9876,
)

# Report the size of each split (train first, then test)
for split_samples in (train_samples, test_samples):
    print(len(split_samples))

30
10


In [10]:
# Apply feature selection using only the training samples, so the test, holdout,
# and inference profiles have no influence on which features are kept
feature_select_training_df = feature_select(
    training_df.query("Metadata_unique_sample_name in @train_samples"),
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    # Pass the configured correlation threshold explicitly. It was defined in the
    # config cell but never forwarded, so changing it there had no effect.
    corr_threshold=corr_threshold,
)

In [11]:
# Build the testing split: take the held-back test samples, align their columns to
# the feature-selected training frame, and mark the split as "testing"
is_test_sample = training_df.Metadata_unique_sample_name.isin(test_samples)
testing_df = (
    training_df.loc[is_test_sample]
    .reindex(columns=feature_select_training_df.columns)
    .assign(Metadata_model_split="testing")
)

In [12]:
# Load inference data (a different hold out) from an earlier batch
inference_batch = "2019_06_25_Batch3"
inference_file = pathlib.Path(f"../3.bulk-signatures/data/{inference_batch}_combined_normalized.csv.gz")

# Annotate with the same metadata columns as the batch 11 profiles, defaulting
# every profile to resistant
inference_df = pd.read_csv(inference_file).assign(
    Metadata_dataset="untreated_mystery_clones",
    Metadata_batch=inference_batch,
    Metadata_clone_type="resistant",
    Metadata_clone_type_indicator=1,
    Metadata_model_split="inference",
)

# Clones whose name contains "WT" are relabeled as sensitive
inference_is_wildtype = inference_df.Metadata_clone_number.str.contains("WT")
inference_df.loc[inference_is_wildtype, "Metadata_clone_type"] = "sensitive"
inference_df.loc[inference_is_wildtype, "Metadata_clone_type_indicator"] = 0

# Row-position based identifier, suffixed so it cannot collide with batch 11 names
inference_df["Metadata_unique_sample_name"] = [
    f"profile_{row}_inference" for row in range(len(inference_df))
]

# Align columns to the feature-selected training frame
inference_df = inference_df.reindex(columns=feature_select_training_df.columns)

inference_df.Metadata_clone_number.value_counts()

WT_parental    6
BZ014          3
BZ013          3
BZ007          3
BZ011          3
WT001          3
BZ012          3
WT009          3
BZ002          3
BZ015          3
WT005          3
BZ001          3
WT007          3
WT015          3
WT004          3
BZ009          3
BZ005          3
BZ016          3
WT014          3
WT010          3
WT008          3
BZ010          3
WT013          3
WT003          3
BZ004          3
BZ017          3
WT002          3
WT011          3
BZ018          3
BZ003          3
BZ006          3
WT006          3
WT012          3
BZ008          3
Name: Metadata_clone_number, dtype: int64

In [13]:
# Combine profiles into a single dataset and output

# Batch 11 profiles whose clone is not in `clones` become the holdout set; align
# their columns to the feature-selected training frame and label the split
heldout_df = (
    df.query("Metadata_clone_number not in @clones")
    .reindex(feature_select_training_df.columns, axis="columns")
    .assign(Metadata_model_split="holdout")
)

# Stack the training, testing, holdout, and inference profiles (in that order)
bortezomib_df = pd.concat(
    [
        feature_select_training_df,
        testing_df,
        heldout_df,
        inference_df,
    ],
    axis="rows",
).reset_index(drop=True)

# Create the output directory if it is missing so a fresh checkout does not fail
# on the write below
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "bortezomib_signature_analytical_set.tsv.gz"
bortezomib_df.to_csv(output_file, sep="\t", index=False)

In [14]:
# Report the (rows, columns) of the combined analytical set, then preview it
n_profiles, n_columns = bortezomib_df.shape
print((n_profiles, n_columns))
bortezomib_df.head()

(165, 272)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,Nuclei_RadialDistribution_MeanFrac_Mito_1of4,Nuclei_RadialDistribution_MeanFrac_Mito_4of4,Nuclei_RadialDistribution_MeanFrac_RNA_1of4,Nuclei_RadialDistribution_MeanFrac_RNA_4of4,Nuclei_RadialDistribution_RadialCV_DNA_1of4,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_ER_10_00,Nuclei_Texture_Correlation_ER_10_01,Nuclei_Texture_Correlation_ER_10_03,Nuclei_Texture_Correlation_Mito_10_02
0,219814,B10,2021_02_08_Batch11,12453,2.5x10^3 cells/well,4.0,WT clone 01,20210205.0,219814,48 hr,...,0.920735,2.29247,2.786923,0.215033,1.801948,2.811151,-0.372571,-0.41286,-0.411285,1.541415
1,219814,B11,2021_02_08_Batch11,2914,2.5x10^3 cells/well,5.0,WT clone 02,20210205.0,219814,48 hr,...,0.497789,-1.757208,1.13621,-4.166181,2.112966,0.127117,0.241969,-0.079047,-0.070812,1.985364
2,219814,C02,2021_02_08_Batch11,6314,2.5x10^3 cells/well,10.0,BZ002,20210205.0,219814,48 hr,...,3.432913,2.349513,2.21048,6.010481,1.421137,3.359439,0.020542,-0.181673,-0.189915,0.86675
3,219814,C03,2021_02_08_Batch11,4275,2.5x10^3 cells/well,9.0,BZ001,20210205.0,219814,48 hr,...,5.828402,-4.883283,3.258141,-5.702246,3.305521,-2.633341,0.354186,-0.355529,-0.378322,1.878571
4,219814,C04,2021_02_08_Batch11,3869,2.5x10^3 cells/well,8.0,WT clone 05,20210205.0,219814,48 hr,...,1.450401,-1.311083,2.835185,-3.429578,2.026013,1.787448,0.213797,0.024429,-0.159858,1.896527


In [15]:
# Sanity check: every profile in the combined set must have a distinct sample name
assert bortezomib_df.Metadata_unique_sample_name.is_unique