# Merge Batched Data

**Gregory Way, 2019**

To date, the data have been collected in two batches.
In this notebook, I merge the two batches, keeping only the columns they have in common, and write the result to `.csv` and `.gct` files.

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(curl))

In [2]:
# Source helper code from the cytominer_scripts repository. The file is named
# write_gct.R; presumably it defines the write_gct() function called at the
# end of this notebook.
# NOTE(review): the URL tracks the `master` branch, so the sourced code can
# change between runs -- consider pinning to a specific commit.
con <- curl("https://raw.githubusercontent.com/broadinstitute/cytominer_scripts/master/write_gct.R")

# Close the connection even when sourcing fails, so it is never leaked.
tryCatch(
    source(con),
    finally = close(con)
)

In [3]:
# Column specification shared by both batch CSV reads.
# Every column not listed below is parsed as a double (this includes
# Metadata_Dosage, which is not named here). The listed Metadata_* columns are
# read as character, except Metadata_Batch_Number, which is read as integer.
batch_cols <- readr::cols(
    .default = readr::col_double(),
    Metadata_Plate = readr::col_character(),
    Metadata_Well = readr::col_character(),
    Metadata_Assay_Plate_Barcode = readr::col_character(),
    Metadata_Plate_Map_Name = readr::col_character(),
    Metadata_Batch_Number = readr::col_integer(),
    Metadata_well_position = readr::col_character(),
    Metadata_CellLine = readr::col_character()
)

In [4]:
# Read the batch 1 profiles with the shared column specification
file <- file.path("data", "2019_02_15_Batch1_20X", "HCT116bortezomib_normalized_variable_selected.csv")
batch1_df <- file %>%
    readr::read_csv(col_types = batch_cols)

# Report (rows, columns), then preview the first rows
print(c(nrow(batch1_df), ncol(batch1_df)))
batch1_df %>% head()

[1]  36 129


Metadata_Plate,Metadata_Well,Metadata_Assay_Plate_Barcode,Metadata_Plate_Map_Name,Metadata_Batch_Number,Metadata_well_position,Metadata_CellLine,Metadata_Dosage,Cells_AreaShape_Eccentricity,Cells_AreaShape_EulerNumber,⋯,Nuclei_RadialDistribution_RadialCV_ER_1of4,Nuclei_RadialDistribution_RadialCV_Mito_1of4,Nuclei_RadialDistribution_RadialCV_Mito_2of4,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_Texture_Correlation_AGP_20_00,Nuclei_Texture_Correlation_DNA_10_02,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_ER_20_03,Nuclei_Texture_Correlation_RNA_10_01,Nuclei_Texture_Correlation_RNA_10_03
HCT116bortezomib,B03,HCT116bortezomib,PlateMap_HCT116bortezomib,1,B03,WT,0,-0.5389166,0.09930445,⋯,-0.5634461,-0.36363157,-0.51669779,-1.585785,-0.92986703,-0.26073548,0.7951588,0.76644102,-1.43355903,-1.0812188
HCT116bortezomib,B04,HCT116bortezomib,PlateMap_HCT116bortezomib,1,B04,WT,0,-0.432274,0.1042688,⋯,-0.7727316,-0.22590011,-0.51984516,-1.190355,-0.18336538,-0.12331085,0.8626262,0.38923127,-0.55055292,-0.514341
HCT116bortezomib,B05,HCT116bortezomib,PlateMap_HCT116bortezomib,1,B05,WT,0,-0.3829535,-0.25798361,⋯,-0.6295427,-0.08707354,-0.58749972,-1.269969,-0.53280023,-0.01186864,0.8444659,0.03431637,0.08001385,0.5337525
HCT116bortezomib,B06,HCT116bortezomib,PlateMap_HCT116bortezomib,1,B06,CloneA,0,-0.3280018,-0.05900018,⋯,-0.7019101,2.07612511,0.61859753,-1.640622,0.52326014,-0.47115313,0.7696258,0.44740917,0.56713754,0.2405925
HCT116bortezomib,B07,HCT116bortezomib,PlateMap_HCT116bortezomib,1,B07,CloneA,0,-1.0457478,0.58322895,⋯,-0.4133835,1.3137388,-0.06525608,-1.395745,-0.01205029,-1.15464299,0.6300836,0.18093449,-0.32444885,-0.1624425
HCT116bortezomib,B08,HCT116bortezomib,PlateMap_HCT116bortezomib,1,B08,CloneA,0,-0.326913,-0.11641325,⋯,-0.6355139,2.45811202,0.27192665,-1.670846,0.2163621,0.1327427,1.1058908,0.39362591,1.74532329,1.1468407


In [5]:
# Read the batch 2 profiles with the shared column specification
file <- file.path("data", "2019_03_20_Batch2", "207106_exposure320_normalized_variable_selected.csv")
batch2_df <- file %>%
    readr::read_csv(col_types = batch_cols)

# Report (rows, columns), then preview the first rows
print(c(nrow(batch2_df), ncol(batch2_df)))
batch2_df %>% head()

[1]  36 171


Metadata_Plate,Metadata_Well,Metadata_Assay_Plate_Barcode,Metadata_Plate_Map_Name,Metadata_Batch_Number,Metadata_well_position,Metadata_CellLine,Metadata_Dosage,Cells_AreaShape_Compactness,Cells_AreaShape_Orientation,⋯,Nuclei_Texture_Correlation_DNA_10_01,Nuclei_Texture_Correlation_ER_10_01,Nuclei_Texture_Correlation_Mito_5_02,Nuclei_Texture_Correlation_Mito_5_03,Nuclei_Texture_Correlation_RNA_10_01,Nuclei_Texture_Correlation_RNA_10_03,Nuclei_Texture_Correlation_RNA_20_00,Nuclei_Texture_Entropy_ER_20_01,Nuclei_Texture_SumVariance_ER_20_01,Nuclei_Texture_SumVariance_RNA_20_01
207106_exposure320,B02,207106_exposure320,PlateMap_207106_exposure320,2,B02,WT,0,1.3191326,0.9176338,⋯,0.7871451,0.2753495,0.23614066,0.12067149,0.4023975,0.6558692,0.7843925,-0.3963174,-0.05320719,0.6949548
207106_exposure320,B03,207106_exposure320,PlateMap_207106_exposure320,2,B03,WT,0,1.6230878,0.3895463,⋯,0.7169757,-0.1075322,0.08524138,-0.007435339,-0.1742759,0.5114458,0.5909146,-0.4505974,0.02310763,0.9688375
207106_exposure320,B04,207106_exposure320,PlateMap_207106_exposure320,2,B04,WT,0,1.6363529,1.2591442,⋯,0.7656041,0.384544,0.15507666,0.085089214,0.1596963,0.6254404,0.4115369,-0.3335012,-0.13302498,0.6550726
207106_exposure320,B05,207106_exposure320,PlateMap_207106_exposure320,2,B05,CloneA,0,0.2434214,0.1989603,⋯,1.1657956,-1.0558787,-0.69040151,-0.84533207,-1.666773,-0.3139594,0.7705396,-1.7510916,-0.9613671,-0.6939089
207106_exposure320,B06,207106_exposure320,PlateMap_207106_exposure320,2,B06,CloneA,0,-0.2089785,-0.3516031,⋯,0.9566119,-0.7786303,-0.96949399,-1.048383063,-1.7693063,-0.2434259,0.4786586,-1.5143882,-0.89358488,-0.8336638
207106_exposure320,B07,207106_exposure320,PlateMap_207106_exposure320,2,B07,CloneA,0,-0.2530082,0.294631,⋯,0.9474292,-0.7042756,-0.86614241,-0.964391229,-1.8522586,-0.2413778,0.248797,-1.6065633,-0.93381058,-0.7832081


In [6]:
# Column names present in both batches (order follows batch 1)
common_cols <- batch1_df %>%
    colnames() %>%
    intersect(colnames(batch2_df))

# How many columns are shared
common_cols %>% length()

In [7]:
# Combine dataframes together by common columns.
# `all_of()` marks `common_cols` as an external character vector of column
# names: passing the bare vector to `select()` is ambiguous (a column that
# happened to be called `common_cols` would be selected instead) and is
# deprecated in tidyselect.
batch1_commoncols_df <- batch1_df %>%
    dplyr::select(dplyr::all_of(common_cols))

batch2_commoncols_df <- batch2_df %>%
    dplyr::select(dplyr::all_of(common_cols))

# Stack the two batches row-wise; both now share the same set of columns
all_commoncols_df <- dplyr::bind_rows(batch1_commoncols_df,
                                      batch2_commoncols_df)

dim(all_commoncols_df)

In [8]:
# Write the merged, intersected profiles to a CSV file under data/.
# `file` is read again by the following cell to derive the .gct path.
file <- file.path("data", "merged_intersected_variable_selected.csv")
readr::write_csv(x = all_commoncols_df, file)

In [9]:
# Write the merged profiles as a GCT file: reuse the path currently held in
# `file`, swapping its extension for `.gct`
file <- file %>%
    tools::file_path_sans_ext() %>%
    paste0(".gct")
write_gct(all_commoncols_df, file)