## Generate Morpheus Input Data

**Gregory Way, 2019**

Use this script to concatenate all of the cell painting data into one `.gct` file for input into morpheus.

In [1]:
import os
import pandas as pd

In [2]:
%load_ext rpy2.ipython

## Batch 1

In [3]:
batch_id = "2019_04_16_Batch1"
backend_dir = os.path.join("..", "..", "backend", batch_id)

plate_dirs = [os.path.join(backend_dir, x) for x in os.listdir(backend_dir)]

In [4]:
# Build full cell painting dataset
df_list = []
for plate_dir in plate_dirs:
    plate_files = os.listdir(plate_dir)
    for plate_file in plate_files:
        if "normalized_variable_selected.csv" in plate_file:
            plate_file = os.path.join(plate_dir, plate_file)
            df = pd.read_csv(plate_file)
            print("reading {} with profile count: {}".format(plate_file, df.shape[0]))
            df_list.append(df)

reading ../../backend/2019_04_16_Batch1/BR00101080/BR00101080_normalized_variable_selected.csv with profile count: 60
reading ../../backend/2019_04_16_Batch1/BR00101081/BR00101081_normalized_variable_selected.csv with profile count: 60
reading ../../backend/2019_04_16_Batch1/BR00101075/BR00101075_normalized_variable_selected.csv with profile count: 60
reading ../../backend/2019_04_16_Batch1/BR00101079/BR00101079_normalized_variable_selected.csv with profile count: 60
reading ../../backend/2019_04_16_Batch1/BR00101077/BR00101077_normalized_variable_selected.csv with profile count: 56
reading ../../backend/2019_04_16_Batch1/BR00101083/BR00101083_normalized_variable_selected.csv with profile count: 60
reading ../../backend/2019_04_16_Batch1/BR00101082/BR00101082_normalized_variable_selected.csv with profile count: 60
reading ../../backend/2019_04_16_Batch1/BR00101076/BR00101076_normalized_variable_selected.csv with profile count: 54
reading ../../backend/2019_04_16_Batch1/BR00101078/BR001

In [5]:
# Combine into a single file
cp_df = pd.concat(df_list).reset_index(drop=True)
cp_df.Metadata_diff_day = cp_df.Metadata_diff_day.astype(str)

print(cp_df.shape)
cp_df.head()

(530, 549)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Assay_Plate_Barcode,Metadata_Plate_Map_Name,Metadata_well_position,Metadata_cell_line,Metadata_patient,Metadata_FFA,Metadata_diff_day,Cells_AreaShape_EulerNumber,...,Nuclei_Texture_InfoMeas2_DNA_10_01,Nuclei_Texture_InfoMeas2_Mito_10_03,Nuclei_Texture_InverseDifferenceMoment_DNA_10_00,Nuclei_Texture_InverseDifferenceMoment_ER_10_02,Nuclei_Texture_InverseDifferenceMoment_Mito_10_02,Nuclei_Texture_InverseDifferenceMoment_Mito_20_02,Nuclei_Texture_SumAverage_DNA_20_03,Nuclei_Texture_SumVariance_AGP_20_01,Nuclei_Texture_SumVariance_ER_20_03,Nuclei_Texture_SumVariance_Mito_20_01
0,BR00101080,B02,BR00101080,BR00101080,B02,vc,PAC_261,0,3,-3.840565,...,-1.814082,0.039732,1.401266,-0.336906,-0.388075,2.894933,0.674111,0.915279,0.533406,0.954802
1,BR00101080,B03,BR00101080,BR00101080,B03,sc,PAC_246,0,3,-0.178843,...,-1.32451,-0.330499,1.344428,2.740432,0.738743,0.208088,-0.359262,-0.52642,-0.7073,-0.273817
2,BR00101080,B04,BR00101080,BR00101080,B04,vc,PAC_246,0,3,0.627837,...,-0.281201,0.867523,0.164746,-0.332073,-0.59539,-0.495383,-0.59937,-0.536304,-0.745026,-0.146896
3,BR00101080,B05,BR00101080,BR00101080,B05,sc,PAC_266,0,3,-0.134838,...,-0.480267,0.412462,0.433084,0.885338,-0.114261,-0.603112,-0.771305,-0.499049,-0.822919,-0.105638
4,BR00101080,B06,BR00101080,BR00101080,B06,vc,PAC_266,0,3,-3.069128,...,-2.483936,-1.712249,2.19371,1.653152,1.362144,6.744472,1.021106,0.937149,0.18647,0.28861


In [6]:
# Output combined file
file = os.path.join("data", "combined_normalized_variable_selected.tsv")
cp_df.to_csv(file, index=False, sep='\t')

# Collapse Data for Morpheus Heatmaps

In [7]:
# Extract out day 15 data
cp_15_df = cp_df.query("Metadata_diff_day in ['15', '15+iso']").reset_index(drop=True)
print(cp_15_df.shape)

cp_non15_df = cp_df.query("Metadata_diff_day not in ['15', '15+iso']").reset_index(drop=True)
print(cp_non15_df.shape)

(114, 549)
(416, 549)


In [8]:
replicate_cols = ["Metadata_Plate",
                  "Metadata_cell_line",
                  "Metadata_patient",
                  "Metadata_FFA",
                  "Metadata_diff_day"]

cp_15_collapsed_df = cp_15_df.groupby(replicate_cols).mean().reset_index()
cp_non15_collapsed_df = cp_non15_df.groupby(replicate_cols).mean().reset_index()

## Use `write_gct.R` to build the Moxrpheus Input

In [9]:
%%R -i cp_df -i batch_id -i backend_dir -i cp_15_df -i cp_non15_df -i cp_15_collapsed_df -i cp_non15_collapsed_df

library(dplyr)
library(magrittr)

file <- file.path("..", "cytominer_scripts", "write_gct.R")
source(file)

output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus.gct"))
channels <- NULL
create_row_annotations <- TRUE
feature_regex <- "^Nuclei_|^Cells_|^Cytoplasm_"

# Step 1: Output combined gct file
write_gct(x = cp_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

# Step 2: Output specific plate combinations
# (replicate collapsed and non replicate collapsed)
output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus_day15.gct"))

write_gct(x = cp_15_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus_nonday15.gct"))

write_gct(x = cp_non15_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus_day15_collapsed.gct"))

write_gct(x = cp_15_collapsed_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus_nonday15_collapsed.gct"))

write_gct(x = cp_non15_collapsed_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

# Step 3: Also generate and write individual gct files
plate_cols <- readr::cols(
    .default = readr::col_double(),
    Metadata_Plate = readr::col_character(),
    Metadata_Well = readr::col_character(),
    Metadata_Assay_Plate_Barcode = readr::col_character(),
    Metadata_Plate_Map_Name = readr::col_character(),
    Metadata_well_position = readr::col_character(),
    Metadata_cell_line = readr::col_character(),
    Metadata_patient = readr::col_character(),
    Metadata_FFA = readr::col_character(),
    Metadata_diff_day = readr::col_character()
)

all_plate_dirs <- list.files(backend_dir, full.names = TRUE)
for (plate_dir in all_plate_dirs) {
    plate_file <- list.files(plate_dir, full.names = FALSE, pattern = "normalized_variable_selected")[1]
    full_plate_file <- file.path(plate_dir, plate_file)

    df <- readr::read_csv(full_plate_file, col_types = plate_cols)
    
    output_file <- file.path("results", "morpheus",
                             paste0(tools::file_path_sans_ext(plate_file),
                                    "_", batch_id, "_morpheus.gct"))
    write_gct(x = df,
              path = output_file,
              channels = channels,
              create_row_annotations = create_row_annotations,
              feature_regex = feature_regex)
}

  res = PandasDataFrame.from_items(items)
Attaching package: ‘dplyr’



    filter, lag



    intersect, setdiff, setequal, union




## Batch 2

In [10]:
batch_id = "2019_06_11_Batch2"
file = os.path.join("data", "merged_profiles_{}.tsv.gz".format(batch_id))

cp_df = pd.read_csv(file, sep='\t')
print(cp_df.shape)
cp_df.head(3)

(312, 453)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Assay_Plate_Barcode,Metadata_Plate_Map_Name,Metadata_well_position,Metadata_cell_line,Metadata_condition_O2,Metadata_treatment,Cells_AreaShape_Compactness,Cells_AreaShape_EulerNumber,...,Nuclei_Texture_SumAverage_AGP_20_02,Nuclei_Texture_SumAverage_ER_20_00,Nuclei_Texture_SumAverage_Mito_20_01,Nuclei_Texture_SumEntropy_DNA_5_02,Nuclei_Texture_SumVariance_AGP_10_03,Nuclei_Texture_SumVariance_DNA_10_01,Nuclei_Texture_SumVariance_DNA_20_00,Nuclei_Texture_SumVariance_DNA_20_03,Nuclei_Texture_SumVariance_Mito_20_03,Nuclei_Texture_SumVariance_Mito_5_02
0,BR00101066,B02,BR00101066,BR00101066,B02,SNU761,21,OA,0.85189,-6.002905,...,-0.354477,0.325898,-1.209791,-1.002227,0.304018,-0.441063,-1.493966,-1.256423,-0.875879,-0.896676
1,BR00101066,B03,BR00101066,BR00101066,B03,SNU761,21,PA,-0.526301,-0.21603,...,-0.610291,-0.521134,-0.660242,1.687832,0.867847,1.368041,0.453843,-0.515845,0.042559,0.69548
2,BR00101066,B04,BR00101066,BR00101066,B04,SNU761,21,OA_PA,-0.598634,-0.071529,...,2.984472,1.63434,4.691127,-1.52342,3.588932,-1.171869,0.616111,1.229639,1.907388,0.13326


In [11]:
# Merge replicate cols
replicate_cols = ["Metadata_Plate",
                  "Metadata_cell_line",
                  "Metadata_condition_O2",
                  "Metadata_treatment"]

cp_collapse_df = cp_df.groupby(replicate_cols).mean().reset_index()

## Use `write_gct.R` to build the Morpheus Input

In [12]:
%%R -i cp_df -i batch_id -i cp_collapse_df

library(dplyr)
library(magrittr)

file <- file.path("..", "cytominer_scripts", "write_gct.R")
source(file)

output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus.gct"))
output_collapsed <- file.path("results", "morpheus",
                              paste0("collapsed_", batch_id, "_morpheus.gct"))

channels <- NULL
create_row_annotations <- TRUE
feature_regex <- "^Nuclei_|^Cells_|^Cytoplasm_"

# Step 1: Output combined gct file
write_gct(x = cp_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

# Step 2: Output replicate collapsed gct file
write_gct(x = cp_collapse_df,
          path = output_collapsed,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

## Batch 3

In [13]:
batch_id = "2019_08_06_Batch3"
backend_dir = os.path.join("..", "..", "backend", batch_id)

plate_dirs = [os.path.join(backend_dir, x) for x in os.listdir(backend_dir)]

In [14]:
# Build full cell painting dataset
df_list = []
for plate_dir in plate_dirs:
    plate_files = os.listdir(plate_dir)
    for plate_file in plate_files:
        if "normalized_variable_selected.csv" in plate_file:
            plate_file = os.path.join(plate_dir, plate_file)
            df = pd.read_csv(plate_file)
            print("reading {} with profile count: {}".format(plate_file, df.shape[0]))
            df_list.append(df)

reading ../../backend/2019_08_06_Batch3/D14/D14_normalized_variable_selected.csv with profile count: 80
reading ../../backend/2019_08_06_Batch3/Day3/Day3_normalized_variable_selected.csv with profile count: 80
reading ../../backend/2019_08_06_Batch3/Day0/Day0_normalized_variable_selected.csv with profile count: 80
reading ../../backend/2019_08_06_Batch3/Day8/Day8_normalized_variable_selected.csv with profile count: 80


In [15]:
# Combine into a single file
cp_df = pd.concat(df_list).reset_index(drop=True)
cp_df.Metadata_diff_day = cp_df.Metadata_diff_day.astype(str)

print(cp_df.shape)
cp_df.head()

(320, 381)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Assay_Plate_Barcode,Metadata_Plate_Map_Name,Metadata_well_position,Metadata_cell_line,Metadata_patient,Metadata_FFA,Metadata_diff_day,Cells_AreaShape_Eccentricity,...,Nuclei_Texture_InfoMeas1_ER_5_02,Nuclei_Texture_InfoMeas1_ER_5_03,Nuclei_Texture_InfoMeas1_Mito_5_02,Nuclei_Texture_InfoMeas2_AGP_5_00,Nuclei_Texture_InfoMeas2_DNA_5_02,Nuclei_Texture_InfoMeas2_ER_5_00,Nuclei_Texture_SumAverage_DNA_20_03,Nuclei_Texture_SumEntropy_DNA_20_01,Nuclei_Texture_SumVariance_AGP_5_02,Nuclei_Texture_SumVariance_DNA_5_01
0,D14,A01,D14,D14,A01,sc,252,0,14,0.630662,...,-2.125743,-2.665606,-2.356083,0.682714,-0.725557,1.865769,-2.826216,-3.226772,-0.090367,0.149223
1,D14,A02,D14,D14,A02,vc,252,0,14,0.3143,...,-0.504041,-0.845735,0.011658,0.135625,-0.753222,0.709504,0.677852,1.411267,0.193334,-0.612095
2,D14,A04,D14,D14,A04,sc,214,0,14,1.060115,...,-0.505854,-0.615315,-0.732776,0.090121,0.718639,0.633794,0.975302,1.887109,-0.063542,0.677937
3,D14,A05,D14,D14,A05,vc,214,0,14,0.067539,...,-0.085408,-0.234523,0.018579,-0.547645,-0.538573,-0.179935,-0.275668,0.295252,-0.033855,-0.710885
4,D14,A06,D14,D14,A06,sc,270,0,14,-0.48144,...,-0.069393,-0.234787,-4.93212,-1.852053,1.232599,1.310763,1.594189,0.075149,-0.793935,0.026775


In [16]:
# Output combined file
file = os.path.join("data", "{}_combined_normalized_variable_selected.tsv".format(batch_id))
cp_df.to_csv(file, index=False, sep='\t')

In [17]:
replicate_cols = ["Metadata_Plate",
                  "Metadata_cell_line",
                  "Metadata_patient",
                  "Metadata_FFA",
                  "Metadata_diff_day"]

cp_collapsed_df = cp_df.groupby(replicate_cols).mean().reset_index()

## Use `write_gct.R` to build the Moxrpheus Input

In [18]:
%%R -i cp_df -i batch_id -i cp_df -i cp_collapsed_df

library(dplyr)
library(magrittr)

file <- file.path("..", "cytominer_scripts", "write_gct.R")
source(file)

output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus.gct"))
output_collapsed <- file.path("results", "morpheus",
                              paste0("collapsed_", batch_id, "_morpheus.gct"))

channels <- NULL
create_row_annotations <- TRUE
feature_regex <- "^Nuclei_|^Cells_|^Cytoplasm_"

# Step 1: Output combined gct file
write_gct(x = cp_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

# Step 2: Output replicate collapsed gct file
write_gct(x = cp_collapsed_df,
          path = output_collapsed,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)