## Generate Morpheus Input Data

**Gregory Way, 2019**

Use this script to concatenate all of the cell painting data into one `.gct` file for input into morpheus.

In [1]:
import os
import pandas as pd

In [2]:
%load_ext rpy2.ipython

In [3]:
batch_id = "CRISPR_PILOT_B1"
backend_dir = os.path.join("..", "..", "..", "backend", batch_id)

plate_dirs = [os.path.join(backend_dir, x) for x in os.listdir(backend_dir)]
plate_dirs

['../../../backend/CRISPR_PILOT_B1/SQ00014618',
 '../../../backend/CRISPR_PILOT_B1/SQ00014611',
 '../../../backend/CRISPR_PILOT_B1/SQ00014616',
 '../../../backend/CRISPR_PILOT_B1/SQ00014617',
 '../../../backend/CRISPR_PILOT_B1/SQ00014610',
 '../../../backend/CRISPR_PILOT_B1/SQ00014615',
 '../../../backend/CRISPR_PILOT_B1/SQ00014612',
 '../../../backend/CRISPR_PILOT_B1/SQ00014613',
 '../../../backend/CRISPR_PILOT_B1/SQ00014614']

In [4]:
# Build full cell painting dataset
df_list = []
for plate_dir in plate_dirs:
    plate_files = os.listdir(plate_dir)
    for plate_file in plate_files:
        if "normalized_variable_selected.csv" in plate_file:
            plate_file = os.path.join(plate_dir, plate_file)
            df = pd.read_csv(plate_file)
            print("reading {} with profile count: {}".format(plate_file, df.shape[0]))
            df_list.append(df)

reading ../../../backend/CRISPR_PILOT_B1/SQ00014618/SQ00014618_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1/SQ00014611/SQ00014611_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1/SQ00014616/SQ00014616_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1/SQ00014617/SQ00014617_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1/SQ00014610/SQ00014610_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1/SQ00014615/SQ00014615_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1/SQ00014612/SQ00014612_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1/SQ00014613/SQ00014613_normalized_variable_selected.csv with profile count: 384
reading ../../../backend/CRISPR_PILOT_B1

In [5]:
# Combine into a single file
cp_df = pd.concat(df_list).reset_index(drop=True)

cp_df.Metadata_broad_sample = cp_df.Metadata_broad_sample.astype(str)
cp_df.Metadata_pert_mfc_id = cp_df.Metadata_pert_mfc_id.astype(str)

print(cp_df.shape)
cp_df.head()

(3456, 262)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Assay_Plate_Barcode,Metadata_Plate_Map_Name,Metadata_well_position,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_pert_name,Metadata_broad_sample,...,Nuclei_Texture_InverseDifferenceMoment_AGP_10_0,Nuclei_Texture_InverseDifferenceMoment_AGP_5_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0
0,SQ00014618,A01,SQ00014618,DEPENDENCIES1_HCC44,A01,A,1,EMPTY,EMPTY,,...,-1.42448,-0.972617,-0.490979,-1.365392,2.307568,0.088018,-2.775524,0.605332,-0.180865,-1.082234
1,SQ00014618,A02,SQ00014618,DEPENDENCIES1_HCC44,A02,A,2,MCL1,MCL1-5,,...,-2.071168,-1.361724,0.858007,-0.070069,3.776635,-0.719651,0.947498,1.107159,1.895482,1.142913
2,SQ00014618,A03,SQ00014618,DEPENDENCIES1_HCC44,A03,A,3,AKT1,AKT1-1,BRDN0001054908,...,-0.974627,-0.759857,-0.47334,-1.491414,1.743963,-0.771675,-0.51088,-0.819023,-0.972473,-0.306569
3,SQ00014618,A04,SQ00014618,DEPENDENCIES1_HCC44,A04,A,4,KRAS,KRAS-2B,,...,-0.487037,-0.704191,0.056703,-0.956379,1.467339,-1.773067,-0.749321,-0.236351,-0.466001,-1.229109
4,SQ00014618,A05,SQ00014618,DEPENDENCIES1_HCC44,A05,A,5,AKT1,AKT1-2,BRDN0001055115,...,-1.107815,-0.858286,0.297309,-1.200222,2.129282,0.122742,-0.719243,-0.616396,-0.807424,-0.710445


# Extract Data for the Most Replicable Genes

The genes include: *ITGAV*, *KIF11*, *MYC*, *POLR2D*, and *PSMA1*. (Plus one control LacZ)

In [6]:
# We are interested here in the most replicable CRISPR'd genes
genes = ['ITGAV', 'KIF11', 'MYC', 'POLR2D', 'PSMA1', 'LacZ']
cp_genes_df = cp_df.query("Metadata_gene_name in @genes").reset_index(drop=True)

cp_genes_df = (
    cp_genes_df
    .groupby(
        ['Metadata_cell_line', 'Metadata_gene_name', 'Metadata_pert_name']
    )
    .mean()
    .reset_index()
)

print(cp_genes_df.shape)
cp_genes_df.head(2)

(36, 250)


Unnamed: 0,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Metadata_WellCol,Metadata_pert_id,Metadata_pert_id_vendor,Cells_AreaShape_Solidity,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_4_2,...,Nuclei_Texture_InverseDifferenceMoment_AGP_10_0,Nuclei_Texture_InverseDifferenceMoment_AGP_5_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_ER_5_0,Nuclei_Texture_SumEntropy_RNA_20_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_Mito_20_0
0,A549,ITGAV,ITGAV-1,12.5,,,-0.124933,-1.086303,-3.885984,0.724592,...,-0.271258,0.362721,7.195031,-0.147917,0.824257,-7.019511,5.711034,3.216943,2.837458,3.735676
1,A549,ITGAV,ITGAV-2,12.5,,,-0.068721,-1.423705,-4.905642,0.248028,...,-0.314611,0.271491,6.825672,-0.328807,0.971509,-7.566384,6.603913,2.89616,3.25049,4.348153


## Use `write_gct.R` to build the Morpheus Input

In [7]:
%%R -i cp_df -i batch_id -i backend_dir -i cp_genes_df

library(dplyr)
library(magrittr)

file <- file.path("..", "..", "cytominer_scripts", "write_gct.R")
source(file)

output <- file.path("results", "morpheus",
                    paste0("full_", batch_id, "_morpheus.gct"))
channels <- NULL
create_row_annotations <- TRUE
feature_regex <- "^Nuclei_|^Cells_|^Cytoplasm_"

# Step 1: Output combined gct file
write_gct(x = cp_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

# Step 2: Output specific genes
# (replicate collapsed and non replicate collapsed)
output <- file.path("results", "morpheus", "reproducible_genes.gct")

write_gct(x = cp_genes_df,
          path = output,
          channels = channels,
          create_row_annotations = create_row_annotations,
          feature_regex = feature_regex)

# Step 3: Also generate and write individual gct files
plate_cols = readr::cols(
    .default = readr::col_double(),
    Metadata_Plate = readr::col_character(),
    Metadata_Well = readr::col_character(),
    Metadata_Assay_Plate_Barcode = readr::col_character(),
    Metadata_Plate_Map_Name = readr::col_character(),
    Metadata_well_position = readr::col_character(),
    Metadata_WellRow = readr::col_character(),
    Metadata_WellCol = readr::col_integer(),
    Metadata_gene_name = readr::col_character(),
    Metadata_pert_name = readr::col_character(),
    Metadata_broad_sample = readr::col_character(),
    Metadata_cell_line = readr::col_character(),
    Metadata_pert_id = readr::col_character(),
    Metadata_pert_mfc_id = readr::col_character(),
    Metadata_pert_well = readr::col_character(),
    Metadata_pert_id_vendor = readr::col_character(),
    Metadata_cell_id = readr::col_character(),
    Metadata_broad_sample_type = readr::col_character(),
    Metadata_pert_type = readr::col_character()
)

all_plate_dirs <- list.files(backend_dir, full.names = TRUE)
for (plate_dir in all_plate_dirs) {
    plate_file <- list.files(plate_dir, full.names = FALSE,
                             pattern = "normalized_variable_selected")[1]
    full_plate_file <- file.path(plate_dir, plate_file)

    df <- readr::read_csv(full_plate_file, col_types = plate_cols)
    
    output_file <- file.path("results", "morpheus",
                             paste0(tools::file_path_sans_ext(plate_file),
                                    "_", batch_id, "_morpheus.gct"))
    write_gct(x = df,
              path = output_file,
              channels = channels,
              create_row_annotations = create_row_annotations,
              feature_regex = feature_regex)
}

  res = PandasDataFrame.from_items(items)
Attaching package: ‘dplyr’



    filter, lag



    intersect, setdiff, setequal, union


