# Combine Specific Batches for Downstream Experiments

Here, I combine batches 1 (20X) and 2 because they measure clones A and E.
I also combine batches 5, 6, and 7. These platemaps are all the same, and each measures 8 different wild-type and mutant clones.

In [1]:
import os
import pandas as pd

from pycytominer import feature_select, write_gct
from pycytominer.cyto_utils import infer_cp_features

from scripts.processing_utils import load_data

In [2]:
# Feature selection operations handed to pycytominer's feature_select
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers",
]

# Where the per-batch profiles are read from, and where outputs are written
profile_dir = os.path.join("..", "0.generate-profiles", "profiles")
output_dir = os.path.join("data", "merged")
gct_dir = os.path.join("data", "gct_files")

# Filename suffix passed to load_data when reading each batch
suffix = "normalized.csv.gz"

# Every entry of the profile directory is treated as a batch,
# except the macOS ".DS_Store" metadata file
batches = [entry for entry in os.listdir(profile_dir) if entry != ".DS_Store"]
batches

['2019_02_15_Batch1_40X',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3',
 '2019_11_11_Batch4',
 '2019_11_20_Batch6',
 '2019_02_15_Batch1_20X',
 '2019_11_19_Batch5',
 '2019_11_22_Batch7']

In [3]:
# Batches that are merged into dataset A
dataset_a = [
    "2019_02_15_Batch1_20X",
    "2019_03_20_Batch2",
]

# Batches that are merged into dataset B
dataset_b = [
    "2019_11_19_Batch5",
    "2019_11_20_Batch6",
    "2019_11_22_Batch7",
]

In [4]:
# Load the profiles of every available batch into the dataset it belongs to.
# Batches that are in neither dataset_a nor dataset_b are skipped.
load_kwargs = dict(suffix=suffix, profile_dir=profile_dir, combine_dfs=True)

dataset_a_dict, dataset_b_dict = {}, {}
for batch in batches:
    # Check dataset A first, then dataset B, for each batch
    for members, store in (
        (dataset_a, dataset_a_dict),
        (dataset_b, dataset_b_dict),
    ):
        if batch in members:
            store[batch] = load_data(batch=batch, **load_kwargs)

## Process and Output Dataset A

In [5]:
# Stack the per-batch profiles of dataset A into one frame with a fresh index
dataset_a_df = pd.concat(dataset_a_dict.values()).reset_index(drop=True)

# Label every profile as "resistant", then overwrite the rows whose
# cell line name contains "WT" with "wildtype"
wildtype_mask_a = dataset_a_df["Metadata_CellLine"].str.contains("WT")
dataset_a_df["Metadata_clone_type"] = "resistant"
dataset_a_df.loc[wildtype_mask_a, "Metadata_clone_type"] = "wildtype"

# Reorder the columns so that all metadata columns precede the features
meta_cols = infer_cp_features(dataset_a_df, metadata=True)
cp_cols = infer_cp_features(dataset_a_df)
dataset_a_df = dataset_a_df.reindex(columns=meta_cols + cp_cols)

print(dataset_a_df.shape)
dataset_a_df.head()

(72, 3535)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_plate_map_name,Metadata_clone_type,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,WT,0.0,207106_exposure320,B02,2019_03_20_Batch2,PlateMap_207106_exposure320,wildtype,-0.23997,0.774097,0.981307,...,-0.460955,-0.471566,-0.472902,-0.463908,-0.491301,-0.491651,-0.46513,-0.463746,-0.458918,-0.461595
1,WT,0.0,207106_exposure320,B03,2019_03_20_Batch2,PlateMap_207106_exposure320,wildtype,-0.324702,-1.704724,0.793306,...,-0.407693,-0.409836,-0.41875,-0.400586,-0.431729,-0.435196,-0.407739,-0.408274,-0.404893,-0.403287
2,WT,0.0,207106_exposure320,B04,2019_03_20_Batch2,PlateMap_207106_exposure320,wildtype,-0.283277,1.097421,0.998398,...,-0.418464,-0.430647,-0.440995,-0.44215,-0.445753,-0.467471,-0.423247,-0.419649,-0.414115,-0.415869
3,CloneA,0.0,207106_exposure320,B05,2019_03_20_Batch2,PlateMap_207106_exposure320,resistant,-2.162447,0.789493,-0.471426,...,-0.821523,-0.818418,-0.812761,-0.830032,-0.805281,-0.798594,-0.821887,-0.822436,-0.825891,-0.824499
4,CloneA,0.0,207106_exposure320,B06,2019_03_20_Batch2,PlateMap_207106_exposure320,resistant,-1.269935,-0.365237,-0.830337,...,-0.862363,-0.858491,-0.852563,-0.877956,-0.843264,-0.838066,-0.863207,-0.863628,-0.867076,-0.866256


In [6]:
# Count the profiles per cell line (rows) and dosage (columns)
pd.crosstab(
    index=dataset_a_df["Metadata_CellLine"],
    columns=dataset_a_df["Metadata_Dosage"],
)

Metadata_Dosage,0.0,0.7,7.0,70.0
Metadata_CellLine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CloneA,6,6,6,6
CloneE,6,6,6,6
WT,6,6,6,6


In [7]:
# Base name (without extension) shared by the merged CSV and GCT output files of dataset A
dataset_a_name = "combined_cloneAcloneE_dataset"

In [8]:
# Apply the feature selection operations listed in feature_select_ops
dataset_a_featureselect_df = feature_select(dataset_a_df, operation=feature_select_ops)

# Make sure both output directories exist; otherwise the writes below fail
# on a fresh checkout where "data/merged" or "data/gct_files" is missing
os.makedirs(output_dir, exist_ok=True)
os.makedirs(gct_dir, exist_ok=True)

# Write the feature selected profiles as a csv file
output_file = os.path.join(output_dir, "{}.csv".format(dataset_a_name))
dataset_a_featureselect_df.to_csv(output_file, index=False)

# Write the same profiles as a gct file
output_gct_file = os.path.join(gct_dir, "{}.gct".format(dataset_a_name))
write_gct(profiles=dataset_a_featureselect_df, output_file=output_gct_file)

print(dataset_a_featureselect_df.shape)
dataset_a_featureselect_df.head()

(72, 252)


Unnamed: 0,Metadata_CellLine,Metadata_Dosage,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_plate_map_name,Metadata_clone_type,Cells_AreaShape_Compactness,Cells_AreaShape_Orientation,Cells_AreaShape_Solidity,...,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_ER_10_01,Nuclei_Texture_Correlation_ER_20_03,Nuclei_Texture_Correlation_Mito_10_03,Nuclei_Texture_Correlation_Mito_20_00,Nuclei_Texture_Correlation_RNA_10_01,Nuclei_Texture_Correlation_RNA_10_02,Nuclei_Texture_Correlation_RNA_10_03,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_SumVariance_ER_20_01
0,WT,0.0,207106_exposure320,B02,2019_03_20_Batch2,PlateMap_207106_exposure320,wildtype,1.325917,0.350969,-0.701823,...,0.635602,-0.115739,0.654788,-0.243628,-0.03042,-0.039547,0.162129,0.4843,1.172104,-0.643649
1,WT,0.0,207106_exposure320,B03,2019_03_20_Batch2,PlateMap_207106_exposure320,wildtype,1.164187,-0.054254,-0.788705,...,0.988743,-0.196532,0.975687,-0.225656,-0.03042,-0.068849,-0.170842,0.523793,1.172104,-0.634201
2,WT,0.0,207106_exposure320,B04,2019_03_20_Batch2,PlateMap_207106_exposure320,wildtype,1.127666,1.699161,-0.617913,...,0.581176,0.158584,0.626946,-0.214842,-0.03042,-0.163731,-0.087247,0.457444,0.766137,-0.628041
3,CloneA,0.0,207106_exposure320,B05,2019_03_20_Batch2,PlateMap_207106_exposure320,resistant,0.493556,-0.138822,-1.206096,...,1.25756,-1.407506,1.688787,-0.332457,-0.03042,-1.440942,-1.480886,-0.494817,1.172104,-1.165874
4,CloneA,0.0,207106_exposure320,B06,2019_03_20_Batch2,PlateMap_207106_exposure320,resistant,0.498403,-0.706464,-1.140831,...,1.196471,-1.062395,1.369196,-0.334405,-0.03042,-1.480594,-1.303818,-0.425634,0.762304,-1.078278


## Process and Output Dataset B

In [9]:
# Stack the per-batch profiles of dataset B into one frame with a fresh index
dataset_b_df = pd.concat(dataset_b_dict.values()).reset_index(drop=True)

# Label every profile as "resistant", then overwrite the rows whose
# clone number contains "WT" with "wildtype"
wildtype_mask_b = dataset_b_df["Metadata_clone_number"].str.contains("WT")
dataset_b_df["Metadata_clone_type"] = "resistant"
dataset_b_df.loc[wildtype_mask_b, "Metadata_clone_type"] = "wildtype"

# Reorder the columns so that all metadata columns precede the features
meta_cols = infer_cp_features(dataset_b_df, metadata=True)
cp_cols = infer_cp_features(dataset_b_df)
dataset_b_df = dataset_b_df.reindex(columns=meta_cols + cp_cols)

print(dataset_b_df.shape)
dataset_b_df.head()

(300, 3537)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Metadata_clone_type,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,217760,B02,2019_11_20_Batch6,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,resistant,-0.409916,...,1.03352,1.0077,0.920125,0.87017,0.924758,0.886717,1.006241,1.009751,1.015261,1.031261
1,217760,B03,2019_11_20_Batch6,WT002,217760,20191120-20191115-HiDensity,217760,DMSO,wildtype,-0.137631,...,0.142262,0.094713,0.156082,0.038916,0.135246,-0.042784,0.107654,0.125698,0.115468,0.136594
2,217760,B04,2019_11_20_Batch6,WT008,217760,20191120-20191115-HiDensity,217760,DMSO,wildtype,-0.739635,...,-0.138435,-0.159706,-0.178242,-0.134872,-0.183906,-0.125331,-0.121369,-0.134951,-0.127765,-0.129745
3,217760,B05,2019_11_20_Batch6,WT009,217760,20191120-20191115-HiDensity,217760,DMSO,wildtype,0.938743,...,0.676709,0.694346,0.798707,0.716014,0.783981,0.733235,0.640209,0.675864,0.653471,0.677681
4,217760,B06,2019_11_20_Batch6,BZ018,217760,20191120-20191115-HiDensity,217760,DMSO,resistant,-0.858759,...,0.148026,0.157586,0.030911,0.091131,0.037021,0.118609,0.149601,0.152945,0.151491,0.143787


In [10]:
# Number of profiles per clone in dataset B
dataset_b_df["Metadata_clone_number"].value_counts()

WT_parental    60
WT002          30
WT008          30
BZ018          30
BZ008          30
WT011          30
WT009          30
BZ001          30
BZ017          30
Name: Metadata_clone_number, dtype: int64

In [11]:
# Base name (without extension) shared by the merged CSV and GCT output files of dataset B
dataset_b_name = "combined_four_clone_dataset"

In [12]:
# Apply the feature selection operations listed in feature_select_ops
dataset_b_featureselect_df = feature_select(dataset_b_df, operation=feature_select_ops)

# Make sure both output directories exist; otherwise the writes below fail
# on a fresh checkout where "data/merged" or "data/gct_files" is missing
os.makedirs(output_dir, exist_ok=True)
os.makedirs(gct_dir, exist_ok=True)

# Write the feature selected profiles as a csv file
output_file = os.path.join(output_dir, "{}.csv".format(dataset_b_name))
dataset_b_featureselect_df.to_csv(output_file, index=False)

# Write the same profiles as a gct file
output_gct_file = os.path.join(gct_dir, "{}.gct".format(dataset_b_name))
write_gct(profiles=dataset_b_featureselect_df, output_file=output_gct_file)

print(dataset_b_featureselect_df.shape)
dataset_b_featureselect_df.head()

(300, 315)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_plate_map_name,Metadata_treatment,Metadata_clone_type,Cells_AreaShape_Compactness,...,Nuclei_Texture_Correlation_Mito_10_00,Nuclei_Texture_Correlation_Mito_20_02,Nuclei_Texture_Correlation_Mito_20_03,Nuclei_Texture_Correlation_RNA_10_01,Nuclei_Texture_Correlation_RNA_20_02,Nuclei_Texture_Entropy_DNA_20_03,Nuclei_Texture_InfoMeas1_DNA_5_02,Nuclei_Texture_InfoMeas1_ER_20_03,Nuclei_Texture_InfoMeas1_Mito_5_00,Nuclei_Texture_SumAverage_DNA_20_01
0,217760,B02,2019_11_20_Batch6,BZ017,217760,20191120-20191115-HiDensity,217760,DMSO,resistant,0.189274,...,0.111782,0.804833,0.432627,0.66617,0.73386,-1.41833,0.118819,-0.374701,-0.641564,-1.796205
1,217760,B03,2019_11_20_Batch6,WT002,217760,20191120-20191115-HiDensity,217760,DMSO,wildtype,-0.222248,...,-0.991681,0.393211,1.634457,-0.471892,-0.098693,-1.155697,-0.089767,1.499978,1.406381,-1.375335
2,217760,B04,2019_11_20_Batch6,WT008,217760,20191120-20191115-HiDensity,217760,DMSO,wildtype,-1.257285,...,-0.507651,0.040824,0.691947,-0.156254,0.201488,-0.989994,0.40193,0.649963,0.101806,-0.07752
3,217760,B05,2019_11_20_Batch6,WT009,217760,20191120-20191115-HiDensity,217760,DMSO,wildtype,-1.611328,...,0.745114,0.021943,0.097973,1.251972,-0.309386,-0.094936,-1.829349,-0.156229,-0.504957,-1.876129
4,217760,B06,2019_11_20_Batch6,BZ018,217760,20191120-20191115-HiDensity,217760,DMSO,resistant,0.790436,...,0.300582,0.429726,-0.433379,0.677186,0.750527,-0.82852,0.151016,-0.778626,-1.104598,-0.295274
