# Apply Signature Analysis to Cell Morphology Features

Gregory Way, 2020

Here, I apply [`singscore`](https://bioconductor.org/packages/devel/bioc/vignettes/singscore/inst/doc/singscore.html) ([Foroutan et al. 2018](https://doi.org/10.1186/s12859-018-2435-4)) to our Cell Painting profiles.
This notebook largely follows the [package vignette](https://bioconductor.org/packages/devel/bioc/vignettes/singscore/inst/doc/singscore.html).

To review how I derived these signatures, see `1.derive-bulk-signatures.ipynb`.

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(singscore))

source(file.path("utils", "singscore_utils.R"))

In [2]:
# Analysis constants ----
# `seed` is handed to set.seed(); `num_permutations` is handed to
# singscorePipeline() when each dataset is scored.
seed <- 1234
num_permutations <- 1000

# Dataset names; each is used both to subset profiles and to pick a signature
datasets <- c("cloneAE", "ixazomib", "cb5083")

# Input: bulk profiles ----
data_dir <- "data"
data_file <- file.path(data_dir, "bulk_profiles_analytical_set.csv.gz")

# Input: signature summary and Tukey results ----
input_results_dir <- file.path("results", "signatures")
signature_file <- file.path(
    input_results_dir,
    "signature_summary_full_bulk_signature.tsv"
)
tukey_file <- file.path(
    input_results_dir,
    "tukey_results_full_bulk_signature.tsv.gz"
)

# Output: singscore results ----
output_dir <- file.path("results", "singscore")
output_results_file <- file.path(
    output_dir,
    "full_bulk_signature_singscore_results.tsv.gz"
)

In [3]:
# Seed R's random number generator with the configured `seed`.
# NOTE(review): presumably this is what makes the permutation step inside
# singscorePipeline() reproducible -- confirm that helper draws from R's RNG.
set.seed(seed)

In [4]:
# Load the bulk profiles.
# The metadata columns listed here are parsed as text; every other column
# falls back to the double default.
metadata_columns <- c(
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_batch",
    "Metadata_clone_number",
    "Metadata_plate_map_name",
    "Metadata_treatment",
    "Metadata_dataset",
    "Metadata_clone_type",
    "Metadata_clone_type_indicator",
    "Metadata_model_split",
    "Metadata_cell_density",
    "Metadata_plate_filename",
    "Metadata_treatment_time",
    "Metadata_unique_sample_name",
    "Metadata_time_to_adhere"
)

# Build the column specification programmatically instead of repeating
# `readr::col_character()` once per metadata column.
bulk_col_types <- do.call(
    readr::cols,
    c(
        list(.default = readr::col_double()),
        stats::setNames(
            rep(list(readr::col_character()), length(metadata_columns)),
            metadata_columns
        )
    )
)

bulk_df <- readr::read_csv(file = data_file, col_types = bulk_col_types)

# Report the table dimensions, then preview the first four profiles
print(dim(bulk_df))
head(bulk_df, n = 4)

[1]  612 3547


Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_number,Metadata_plate_map_name,Metadata_treatment,Metadata_dataset,Metadata_clone_type,Metadata_clone_type_indicator,Metadata_model_split,⋯,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
HCT116bortezomib,B03,2019_02_15_Batch1_20X,WT_parental,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,sensitive,0,training,⋯,-0.793648,-0.7971384,-0.8497648,-0.8422521,-0.8371976,-0.8360676,-0.7911125,-0.7921908,-0.7929302,-0.7948077
HCT116bortezomib,B04,2019_02_15_Batch1_20X,WT_parental,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,sensitive,0,training,⋯,-0.6146661,-0.6211401,-0.6681232,-0.6482634,-0.6645153,-0.6630872,-0.6084365,-0.6124887,-0.6099403,-0.6126431
HCT116bortezomib,B05,2019_02_15_Batch1_20X,WT_parental,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,sensitive,0,training,⋯,-0.7211521,-0.726177,-0.7334183,-0.7047708,-0.7273677,-0.7210842,-0.7165588,-0.7199086,-0.7220983,-0.721685
HCT116bortezomib,B06,2019_02_15_Batch1_20X,CloneA,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,resistant,1,training,⋯,-0.9428608,-0.9424811,-0.9696387,-0.9835661,-0.9495513,-0.9579973,-0.9454605,-0.9447918,-0.9505914,-0.9474764


In [5]:
# Load the signature summary table.
# Column types use readr's compact codes: "c" = character, "l" = logical.
sig_col_types <- readr::cols(
    signature_features = "c",
    plate_exclude = "l",
    batch_exclude = "l",
    non_specific_exclude = "l",
    final_signature = "l",
    dataset = "c"
)

signature_df <- readr::read_tsv(file = signature_file, col_types = sig_col_types)

# Report the table dimensions, then preview the first four rows
print(dim(signature_df))
head(signature_df, n = 4)

[1] 186   6


signature_features,plate_exclude,batch_exclude,non_specific_exclude,final_signature,dataset
<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>
Cytoplasm_Correlation_Costes_RNA_Mito,False,True,False,False,cloneAE
Cytoplasm_Correlation_Manders_DNA_ER,False,True,False,False,cloneAE
Nuclei_RadialDistribution_MeanFrac_Mito_4of4,False,False,False,True,cloneAE
Cytoplasm_Granularity_3_Mito,False,True,False,False,cloneAE


In [6]:
# Load the Tukey results; the sign of `estimate` is used later to decide
# whether a signature feature is "up" or "down".
# Column types use readr's compact codes: "c" = character, "d" = double.
tukey_cols <- readr::cols(
    term = "c",
    comparison = "c",
    estimate = "d",
    conf.low = "d",
    conf.high = "d",
    adj.p.value = "d",
    feature = "c",
    neg_log_adj_p = "d",
    dataset = "c"
)

tukey_df <- readr::read_tsv(file = tukey_file, col_types = tukey_cols)

# Report the table dimensions, then preview the first four rows
print(dim(tukey_df))
head(tukey_df, n = 4)

[1] 23288     9


term,comparison,estimate,conf.low,conf.high,adj.p.value,feature,neg_log_adj_p,dataset
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>
Metadata_clone_type_indicator,1-0,-0.07805113,-0.07805113,-0.07805113,0,Cytoplasm_Correlation_Costes_RNA_Mito,inf,cloneAE
Metadata_batch,2019_03_20_Batch2-2019_02_15_Batch1_20X,0.36668143,0.36668143,0.36668143,0,Cytoplasm_Correlation_Costes_RNA_Mito,inf,cloneAE
Metadata_batch,2020_07_02_Batch8-2019_02_15_Batch1_20X,-0.15710637,-0.15710637,-0.15710637,0,Cytoplasm_Correlation_Costes_RNA_Mito,inf,cloneAE
Metadata_batch,2020_07_02_Batch8-2019_03_20_Batch2,-0.5237878,-0.5237878,-0.5237878,0,Cytoplasm_Correlation_Costes_RNA_Mito,inf,cloneAE


## Process signature

Each signature contains features that are "up" and features that are "down".
An "up" feature has higher values in resistant clones, while a "down" feature has lower values in resistant clones.
We determine whether a feature is up or down from the sign of its Tukey estimate for the resistant vs. sensitive comparison (positive means up, negative means down).

In [7]:
# Build, for every dataset, a list with the "up" and "down" features of its
# final signature. The result is a list named by dataset.
signature_features <- lapply(
    stats::setNames(datasets, datasets),
    function(dataset_name) {
        # Features retained in the final signature of this dataset
        signature_subset_df <- signature_df %>%
            dplyr::filter(dataset == !!dataset_name, final_signature)
        final_features <- signature_subset_df[["signature_features"]]

        # Clone-type (resistant vs. sensitive) Tukey rows for those features
        tukey_subset_df <- tukey_df %>%
            dplyr::filter(
                dataset == !!dataset_name,
                term == "Metadata_clone_type_indicator",
                feature %in% !!final_features
            )

        # The comparison must be in a single orientation (resistant vs.
        # sensitive, never the reverse), otherwise the estimate signs below
        # would not be comparable.
        stopifnot(length(table(tukey_subset_df$comparison)) == 1)

        # Direction comes from the sign of the estimate; `which()` drops
        # missing estimates just as a filter would.
        estimates <- tukey_subset_df[["estimate"]]
        features <- tukey_subset_df[["feature"]]

        list(
            "up" = features[which(estimates > 0)],
            "down" = features[which(estimates < 0)]
        )
    }
)

In [8]:
# Report, per dataset, the number of "up" features followed by the number of
# "down" features in its signature
for (dataset in datasets) {
    dataset_signature <- signature_features[[dataset]]

    print(dataset)
    for (direction in c("up", "down")) {
        print(length(dataset_signature[[direction]]))
    }
}

[1] "cloneAE"
[1] 11
[1] 8
[1] "ixazomib"
[1] 12
[1] 17
[1] "cb5083"
[1] 58
[1] 39


## Apply singscore

Apply the algorithm to each dataset using each signature.

This means that I apply signatures to training, testing, and validation sets.
The testing and validation sets were not used to build the signature.
The validation set comes from an entirely different plate.
This also means that I apply the signature derived from one experiment to all other drug-resistant experiments.

In [9]:
# Score every dataset with every signature (all dataset x signature pairs).
#
# `sing_score_results` ends up with one data frame per pair, keyed by
# "<dataset>-<signature>". Each data frame is the "results" element returned
# by singscorePipeline(), annotated with the dataset and signature names and
# with the 5th / 95th percentiles of the pooled permutation scores.
sing_score_results <- list()
for (dataset in datasets) {

    # Profiles belonging to the dataset being scored
    bulk_subset_df <- bulk_df %>% dplyr::filter(Metadata_dataset == !!dataset)

    for (signature in datasets) {
        # list("up" = ..., "down" = ...) of feature names for this signature
        signature_info <- signature_features[[signature]]

        # singscorePipeline() comes from utils/singscore_utils.R; here it is
        # only relied on to return a list with "results" and "permuted".
        singscore_output <- singscorePipeline(
            df = bulk_subset_df,
            sig_feature_list = signature_info,
            num_permutations = num_permutations
        )

        full_results_df <- singscore_output[["results"]]
        permuted <- singscore_output[["permuted"]]

        # 5th and 95th percentiles of all permuted scores pooled together
        # (not the literal minimum / maximum). Flatten the permutation
        # results once and request both quantiles in a single call.
        permuted_bounds <- quantile(
            as.vector(as.matrix(permuted)),
            probs = c(0.05, 0.95),
            names = FALSE
        )
        min_val <- permuted_bounds[1]
        max_val <- permuted_bounds[2]

        # Annotate key metadata. `!!` injects the loop variables so that a
        # column named `dataset` or `signature` in the results can never
        # shadow them through data masking.
        full_results_df <- full_results_df %>%
            dplyr::mutate(
                dataset = !!dataset,
                signature = !!signature,
                min_permuted_value = min_val,
                max_permuted_value = max_val
            )

        # Store results under a "<dataset>-<signature>" key
        sing_score_results[[paste0(dataset, "-", signature)]] <- full_results_df
    }
}

“'tidy.numeric' is deprecated.
“`data_frame()` is deprecated as of tibble 1.1.0.
Please use `tibble()` instead.
“'tidy.numeric' is deprecated.
“'tidy.numeric' is deprecated.
“'tidy.numeric' is deprecated.
“'tidy.numeric' is deprecated.
“'tidy.numeric' is deprecated.
“'tidy.numeric' is deprecated.
“'tidy.numeric' is deprecated.
“'tidy.numeric' is deprecated.
See help("Deprecated")”

In [10]:
# Stack the per-pair result tables into a single data frame
all_singscore_results_df <- sing_score_results %>% dplyr::bind_rows()

# Cross-tabulate row counts: scored dataset (rows) by applied signature (columns)
table(
    all_singscore_results_df[["dataset"]],
    all_singscore_results_df[["signature"]]
)

          
           cb5083 cloneAE ixazomib
  cb5083      240     240      240
  cloneAE     192     192      192
  ixazomib    180     180      180

In [11]:
# Write the combined singscore results to the output file
readr::write_tsv(all_singscore_results_df, output_results_file)

# Report the table dimensions, then preview the first six rows
print(dim(all_singscore_results_df))
head(all_singscore_results_df, n = 6)

[1] 1836   30


Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_clone_number,Metadata_plate_map_name,Metadata_treatment,Metadata_dataset,Metadata_clone_type,Metadata_clone_type_indicator,Metadata_model_split,⋯,TotalDispersion,UpScore,UpDispersion,DownScore,DownDispersion,Metadata_permuted_p_value,dataset,signature,min_permuted_value,max_permuted_value
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
HCT116bortezomib,B03,2019_02_15_Batch1_20X,WT_parental,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,sensitive,0,training,⋯,1982.2362,-0.045842273,1478.1522,-0.03398437,504.084,0.741,cloneAE,cloneAE,-0.2246796,0.2273132
HCT116bortezomib,B04,2019_02_15_Batch1_20X,WT_parental,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,sensitive,0,training,⋯,2151.9939,-0.088311319,1599.7254,-0.01583807,552.2685,0.789,cloneAE,cloneAE,-0.2246796,0.2273132
HCT116bortezomib,B05,2019_02_15_Batch1_20X,WT_parental,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,sensitive,0,training,⋯,2204.6262,-0.001124409,1395.1266,0.04009233,809.4996,0.421,cloneAE,cloneAE,-0.2246796,0.2273132
HCT116bortezomib,B06,2019_02_15_Batch1_20X,CloneA,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,resistant,1,training,⋯,919.9533,0.251053325,441.8148,0.16054687,478.1385,0.003,cloneAE,cloneAE,-0.2246796,0.2273132
HCT116bortezomib,B07,2019_02_15_Batch1_20X,CloneA,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,resistant,1,training,⋯,654.5679,0.257644687,343.9632,0.16491477,310.6047,0.001,cloneAE,cloneAE,-0.2246796,0.2273132
HCT116bortezomib,B08,2019_02_15_Batch1_20X,CloneA,PlateMap_HCT116bortezomib,0.1% DMSO,cloneAE,resistant,1,training,⋯,762.0564,0.323506604,373.6152,0.21793324,388.4412,0.001,cloneAE,cloneAE,-0.2246796,0.2273132
