# Combine Specific Batches for Downstream Experiments

Here, I combine batches 1 (20X) and 2 because they measure clones A and E.
I also combine batches 5, 6, and 7. These batches all share the same platemap, and each measures 8 different wild-type and mutant clones.

In [1]:
import os
import pandas as pd

from pycytominer import feature_select, write_gct
from pycytominer.cyto_utils import infer_cp_features

from scripts.processing_utils import load_data

In [2]:
# Constants shared by every cell below

# Operations passed to pycytominer's feature_select for each merged dataset
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers",
]

# Only profiles whose file name ends with this suffix are requested from load_data
suffix = "normalized.csv.gz"

# Input folders (written by the 0.generate-profiles step)
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
cell_count_dir = os.path.join("..", "0.generate-profiles", "cell_counts")

# Output folders for the merged profiles and their GCT exports
output_dir = os.path.join("data", "merged")
gct_dir = os.path.join("data", "gct_files")

# Every entry in the profile folder is treated as a batch, except macOS metadata
batches = [entry for entry in os.listdir(profile_dir) if entry != ".DS_Store"]
batches

['2019_02_15_Batch1_40X',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
# Batches merged into dataset A: batch 1 (20X) and batch 2
dataset_a = [
    "2019_02_15_Batch1_20X",
    "2019_03_20_Batch2",
]

# Batches merged into dataset B: batches 5, 6, and 7
dataset_b = [
    "2019_11_19_Batch5",
    "2019_11_20_Batch6",
    "2019_11_22_Batch7",
]

In [4]:
# Load the profiles for each batch that belongs to dataset A or dataset B,
# keyed by batch name. Batches that belong to neither dataset are skipped
# before loading, so their profiles are never read.
dataset_a_dict = {}
dataset_b_dict = {}
for batch in batches:
    if batch not in dataset_a and batch not in dataset_b:
        continue

    # NOTE(review): presumably combine_dfs merges the batch's plates into one
    # frame and add_cell_count attaches counts from cell_count_dir; confirm
    # against scripts.processing_utils.load_data
    df = load_data(
        batch=batch,
        suffix=suffix,
        profile_dir=profile_dir,
        combine_dfs=True,
        add_cell_count=True,
        cell_count_dir=cell_count_dir
    )

    if batch in dataset_a:
        dataset_a_dict[batch] = df
    if batch in dataset_b:
        dataset_b_dict[batch] = df

## Process and Output Dataset A

In [5]:
# Stack the dataset A batches into one frame with a fresh 0..n-1 index
combined_a = pd.concat(dataset_a_dict.values()).reset_index(drop=True)

# Label every profile "resistant", then relabel rows whose cell line contains "WT"
wildtype_rows_a = combined_a["Metadata_CellLine"].str.contains("WT")
dataset_a_df = combined_a.assign(Metadata_clone_type="resistant")
dataset_a_df.loc[wildtype_rows_a, "Metadata_clone_type"] = "wildtype"

# Order the columns: metadata first, then CellProfiler features
meta_cols = infer_cp_features(dataset_a_df, metadata=True)
cp_cols = infer_cp_features(dataset_a_df)
dataset_a_df = dataset_a_df.reindex(columns=meta_cols + cp_cols)

print(dataset_a_df.shape)
dataset_a_df.head()

(648, 3537)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_plate_map_name,Metadata_clone_type,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,WT,0.0,207106_exposure320,1,B02,2019_03_20_Batch2,858,PlateMap_207106_exposure320,wildtype,-0.316712,...,-0.465525,-0.470943,-0.460892,-0.431124,-0.487739,-0.465303,-0.470795,-0.46506,-0.463741,-0.464521
1,WT,0.0,207106_exposure320,2,B02,2019_03_20_Batch2,1000,PlateMap_207106_exposure320,wildtype,-0.464506,...,-0.468731,-0.481636,-0.553875,-0.577115,-0.549136,-0.546084,-0.474566,-0.465935,-0.462315,-0.471233
2,WT,0.0,207106_exposure320,3,B02,2019_03_20_Batch2,892,PlateMap_207106_exposure320,wildtype,-0.096352,...,-0.465051,-0.492141,-0.470427,-0.450707,-0.464878,-0.501691,-0.47756,-0.468926,-0.464237,-0.472244
3,WT,0.0,207106_exposure320,4,B02,2019_03_20_Batch2,997,PlateMap_207106_exposure320,wildtype,-0.264118,...,-0.433786,-0.443056,-0.431051,-0.395937,-0.467196,-0.453414,-0.426949,-0.437194,-0.420482,-0.429004
4,WT,0.0,207106_exposure320,5,B02,2019_03_20_Batch2,803,PlateMap_207106_exposure320,wildtype,-0.225505,...,-0.414115,-0.431964,-0.439145,-0.480739,-0.474449,-0.468048,-0.412061,-0.413095,-0.410372,-0.416613


In [6]:
# Count dataset A profiles per cell line (rows) and dosage (columns)
pd.crosstab(
    index=dataset_a_df["Metadata_CellLine"],
    columns=dataset_a_df["Metadata_Dosage"],
)

Metadata_Dosage,0.0,0.7,7.0,70.0
Metadata_CellLine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CloneA,54,54,54,54
CloneE,54,54,54,54
WT,54,54,54,54


In [7]:
# Base name used to build the dataset A output file names (CSV and GCT)
dataset_a_name = "combined_cloneAcloneE_dataset"

In [8]:
# Create the output folders up front so the writes below do not fail when
# they are missing (for example on a fresh checkout)
os.makedirs(output_dir, exist_ok=True)
os.makedirs(gct_dir, exist_ok=True)

# Write the full merged dataset A profiles
output_file = os.path.join(output_dir, "{}.csv.gz".format(dataset_a_name))
dataset_a_df.to_csv(output_file, index=False, compression="gzip")

# Apply the configured feature selection operations
dataset_a_featureselect_df = feature_select(dataset_a_df, operation=feature_select_ops)

# Write the feature-selected profiles as a gzipped CSV
output_file = os.path.join(output_dir, "{}_feature_select.csv.gz".format(dataset_a_name))
dataset_a_featureselect_df.to_csv(output_file, index=False, compression="gzip")

# Write the same feature-selected profiles as a GCT file
output_gct_file = os.path.join(gct_dir, "{}_feature_select.gct".format(dataset_a_name))
write_gct(profiles=dataset_a_featureselect_df, output_file=output_gct_file)

print(dataset_a_featureselect_df.shape)
dataset_a_featureselect_df.head()

(648, 364)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_plate_map_name,Metadata_clone_type,Cells_AreaShape_Compactness,...,Nuclei_Texture_Correlation_RNA_10_02,Nuclei_Texture_Correlation_RNA_10_03,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Correlation_RNA_20_01,Nuclei_Texture_Correlation_RNA_20_02,Nuclei_Texture_Correlation_RNA_20_03,Nuclei_Texture_Entropy_DNA_20_01,Nuclei_Texture_InfoMeas1_AGP_20_01,Nuclei_Texture_InfoMeas1_AGP_5_02,Nuclei_Texture_InfoMeas1_ER_5_02
0,WT,0.0,207106_exposure320,1,B02,2019_03_20_Batch2,858,PlateMap_207106_exposure320,wildtype,0.562423,...,0.328992,0.690206,0.871334,0.151627,0.914795,0.480204,-0.565972,0.071699,-0.916581,0.127344
1,WT,0.0,207106_exposure320,2,B02,2019_03_20_Batch2,1000,PlateMap_207106_exposure320,wildtype,0.561568,...,0.148875,0.36086,0.871334,0.690241,0.941983,1.309516,-1.320771,0.414996,-0.192904,0.226048
2,WT,0.0,207106_exposure320,3,B02,2019_03_20_Batch2,892,PlateMap_207106_exposure320,wildtype,0.435727,...,-0.014604,0.475724,0.81281,0.208812,0.941983,0.207877,-0.858067,0.128648,-0.191167,0.276941
3,WT,0.0,207106_exposure320,4,B02,2019_03_20_Batch2,997,PlateMap_207106_exposure320,wildtype,0.328713,...,0.296432,-0.018025,0.871334,0.395015,0.941983,0.322972,-0.756164,-0.006232,-0.696263,-0.070625
4,WT,0.0,207106_exposure320,5,B02,2019_03_20_Batch2,803,PlateMap_207106_exposure320,wildtype,0.790237,...,-0.079835,-0.264972,0.871334,1.119686,0.941983,0.520378,-0.552356,0.235831,-0.141804,0.415015


## Process and Output Dataset B

In [9]:
# Stack the dataset B batches into one frame with a fresh 0..n-1 index
combined_b = pd.concat(dataset_b_dict.values()).reset_index(drop=True)

# Label every profile "resistant", then relabel rows whose clone number contains "WT"
wildtype_rows_b = combined_b["Metadata_clone_number"].str.contains("WT")
dataset_b_df = combined_b.assign(Metadata_clone_type="resistant")
dataset_b_df.loc[wildtype_rows_b, "Metadata_clone_type"] = "wildtype"

# Order the columns: metadata first, then CellProfiler features
meta_cols = infer_cp_features(dataset_b_df, metadata=True)
cp_cols = infer_cp_features(dataset_b_df)
dataset_b_df = dataset_b_df.reindex(columns=meta_cols + cp_cols)

print(dataset_b_df.shape)
dataset_b_df.head()

(5098, 3539)


Unnamed: 0,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,217760,1,B02,2019_11_20_Batch6,2057,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,0.046572,0.00754,-0.194067,-0.124377,-0.125827,-0.210232,0.006278,0.002671,-0.022907,-0.019552
1,217760,2,B02,2019_11_20_Batch6,1888,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,0.297381,0.282411,0.195861,0.175852,0.182031,0.179886,0.252371,0.266454,0.316769,0.29082
2,217760,3,B02,2019_11_20_Batch6,2107,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,0.755113,0.732732,0.325665,0.406218,0.349112,0.33893,0.712833,0.730032,0.762556,0.740912
3,217760,4,B02,2019_11_20_Batch6,1765,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,0.877661,0.830078,0.727742,0.439429,0.666144,0.645646,0.757764,0.808093,0.805912,0.848294
4,217760,5,B02,2019_11_20_Batch6,1665,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,0.619238,0.608443,0.602519,0.623694,0.675651,0.744548,0.575179,0.557953,0.643485,0.58482


In [10]:
# Tally how many dataset B profiles carry each clone number
dataset_b_df["Metadata_clone_number"].value_counts()

WT_parental    1020
WT002           510
BZ018           510
WT008           510
BZ008           510
WT009           510
BZ001           510
WT011           509
BZ017           509
Name: Metadata_clone_number, dtype: int64

In [11]:
# Base name used to build the dataset B output file names (CSV and GCT)
# NOTE(review): "four_clone" presumably refers to four wild-type/resistant clone pairs; confirm
dataset_b_name = "combined_four_clone_dataset"

In [12]:
# Create the output folders up front so the writes below do not fail when
# they are missing (for example on a fresh checkout)
os.makedirs(output_dir, exist_ok=True)
os.makedirs(gct_dir, exist_ok=True)

# Write the full merged dataset B profiles
output_file = os.path.join(output_dir, "{}.csv.gz".format(dataset_b_name))
dataset_b_df.to_csv(output_file, index=False, compression="gzip")

# Apply the configured feature selection operations
dataset_b_featureselect_df = feature_select(dataset_b_df, operation=feature_select_ops)

# Write the feature-selected profiles as a gzipped CSV
output_file = os.path.join(output_dir, "{}_feature_select.csv.gz".format(dataset_b_name))
dataset_b_featureselect_df.to_csv(output_file, index=False, compression="gzip")

# Write the same feature-selected profiles as a GCT file
output_gct_file = os.path.join(gct_dir, "{}_feature_select.gct".format(dataset_b_name))
write_gct(profiles=dataset_b_featureselect_df, output_file=output_gct_file)

print(dataset_b_featureselect_df.shape)
dataset_b_featureselect_df.head()

(5098, 374)


Unnamed: 0,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,...,Nuclei_Texture_Correlation_RNA_20_03,Nuclei_Texture_Entropy_AGP_20_03,Nuclei_Texture_InfoMeas1_DNA_10_01,Nuclei_Texture_InfoMeas1_DNA_10_02,Nuclei_Texture_InfoMeas1_ER_20_03,Nuclei_Texture_InfoMeas1_ER_5_00,Nuclei_Texture_InfoMeas1_Mito_10_00,Nuclei_Texture_InfoMeas1_Mito_20_03,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_SumAverage_DNA_20_03
0,217760,1,B02,2019_11_20_Batch6,2057,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,-0.860474,-0.922726,0.516766,0.972085,-0.670301,-0.537814,0.704182,1.353595,-1.581568,-0.426531
1,217760,2,B02,2019_11_20_Batch6,1888,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,-0.240437,0.087708,1.30398,1.604534,-0.327493,-0.784226,1.195328,1.268855,-0.400801,-0.382978
2,217760,3,B02,2019_11_20_Batch6,2107,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,1.152139,-0.27774,0.121702,0.816479,-0.544643,-0.423655,0.952628,1.536982,0.017048,0.512289
3,217760,4,B02,2019_11_20_Batch6,1765,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,-0.129783,0.198506,1.039113,1.485221,-0.350702,-0.182166,0.827373,0.745705,-0.471595,-1.133967
4,217760,5,B02,2019_11_20_Batch6,1665,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,...,-0.41825,0.816742,1.623718,1.56338,-0.413778,-1.563381,0.395075,0.829642,-1.436955,-1.014045
