## Building a resistance signature

Gregory Way, 2021

We use the training profiles as defined in `0.training-test-split.ipynb` to identify a signature of morphology features that are most different between resistant and sensitive clones.

## Models

Our goal is to identify morphology features that purely influence sensitivity status and are not influenced by technical covariates.
The technical covariates we consider to be nuisance factors include:

* Batch
* Treatment time
* Intraclonal type variation
  * E.g., features that differ significantly between individual clones of the same sensitivity status

To isolate features explained by clone status, we identify the contribution of technical covariates to the variance in feature differences.

We apply two models:

1. Analysis of Variance (ANOVA) for categorical variables 
2. Linear model for cell count as a continuous variable

## Procedure

1. Load profiles
2. Subset to training set samples
3. Subset to the features retained by feature selection (see `0.training-test-split.ipynb` for details)
4. Fit the models
5. Select features based on a systematic removal of features that are significantly implicated by technical artifacts
6. Form the signature
  * The signature is all features that are significantly different for the resistance status factor and not removed via the procedure above

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(broom))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggrepel))

source(file.path("utils", "signature_utils.R"))

In [2]:
# Fix the random seed for reproducibility
set.seed(123)

# Dataset to build the signature for, and the input files it is read from
dataset <- "bortezomib"
input_data_dir <- "data"
data_file <- file.path(input_data_dir, "bortezomib_signature_analytical_set.tsv.gz")
feat_file <- file.path(input_data_dir, "dataset_features_selected.tsv")

# Output directories for figures and result tables
output_fig_dir <- file.path("figures", "anova")
output_results_dir <- file.path("results", "signatures")

# Significance level; it is divided by the number of features further below
# to obtain the per-feature significance threshold
alpha <- 0.05

In [3]:
# Load profiles
# Every column is parsed as a double unless stated otherwise: the cell count is
# read as an integer and the metadata columns listed here are kept as strings.
metadata_character_cols <- c(
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_batch",
    "Metadata_clone_number",
    "Metadata_plate_map_name",
    "Metadata_treatment",
    "Metadata_dataset",
    "Metadata_clone_type",
    "Metadata_clone_type_indicator",
    "Metadata_model_split",
    "Metadata_cell_density",
    "Metadata_treatment_time",
    "Metadata_unique_sample_name",
    "Metadata_time_to_adhere"
)

# Assemble the readr column specification from the named pieces above
bulk_col_types <- do.call(
    readr::cols,
    c(
        list(
            .default = readr::col_double(),
            Metadata_cell_count = readr::col_integer()
        ),
        stats::setNames(
            lapply(metadata_character_cols, function(col_name) readr::col_character()),
            metadata_character_cols
        )
    )
)

data_df <- data_file %>%
    readr::read_tsv(col_types = bulk_col_types)

# Report the table dimensions and preview the first four profiles
data_df %>% dim() %>% print()
data_df %>% head(4)

[1]  420 3546


Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,⋯,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
219907,B02,2021_03_03_Batch12,6139,2.5x10^3 cells/well,1,WT_parental,20210205,219814,48 hr,⋯,-0.6430638,-0.6342366,-0.6554599,-0.6673215,-0.6546287,-0.6977682,-0.6320059,-0.6297243,-0.6345844,-0.632946
219907,B03,2021_03_03_Batch12,4567,2.5x10^3 cells/well,2,CloneA,20210205,219814,48 hr,⋯,-0.8308873,-0.8344854,-0.8304672,-0.8178338,-0.8309076,-0.8234369,-0.8347194,-0.8332018,-0.8327623,-0.8296075
219907,B04,2021_03_03_Batch12,5624,2.5x10^3 cells/well,3,CloneE,20210205,219814,48 hr,⋯,-0.9813265,-0.97597,-0.9728225,-0.9617426,-0.9721434,-0.9618017,-0.9802334,-0.9790954,-0.9839611,-0.979349
219907,B05,2021_03_03_Batch12,5894,2.5x10^3 cells/well,4,WT clone 01,20210205,219814,48 hr,⋯,-1.0862537,-1.0826744,-1.0689886,-1.0663278,-1.0579078,-1.061034,-1.0902747,-1.0863724,-1.0886331,-1.0853202


In [4]:
# Read the table of selected features, letting readr guess the column types
all_selected_features_df <- feat_file %>%
    readr::read_tsv(col_types = readr::cols())

# Preview the first three rows
all_selected_features_df %>% head(3)

features,dataset
<chr>,<chr>
Cells_AreaShape_Compactness,bortezomib
Cells_AreaShape_Eccentricity,bortezomib
Cells_AreaShape_Extent,bortezomib


In [5]:
# Features retained for this dataset by the upstream feature selection
# NOTE(review): this comment previously cited `0.compile-bulk-datasets`, while
# the notebook introduction cites `0.training-test-split.ipynb` - confirm which
selected_features <- dplyr::pull(
    dplyr::filter(all_selected_features_df, dataset == !!dataset),
    features
)

# Build the training table in one pass:
#   1. keep profiles from the dataset of interest
#   2. keep profiles assigned to the training split
#   3. keep all metadata columns plus the selected features
#   4. encode the clone type indicator as a factor with levels "0" and "1"
bulk_subset_df <- data_df %>%
    dplyr::filter(Metadata_dataset == !!dataset) %>%
    dplyr::filter(Metadata_model_split == "training") %>%
    dplyr::select(starts_with("Metadata"), all_of(selected_features)) %>%
    dplyr::mutate(
        Metadata_clone_type_indicator = factor(
            Metadata_clone_type_indicator,
            levels = c("0", "1")
        )
    )

# Describe the training set: number of profiles per clone and batch
print(paste("Training dataset:", dataset))
clone_by_batch_counts <- table(
    bulk_subset_df[["Metadata_clone_number"]],
    bulk_subset_df[["Metadata_batch"]]
)
print(clone_by_batch_counts)

[1] "Training dataset: bortezomib"
             
              2021_03_03_Batch12 2021_03_03_Batch13 2021_03_03_Batch15
  BZ001                        4                  3                  4
  BZ002                        3                  3                  3
  BZ003                        4                  4                  2
  BZ004                        3                  4                  4
  BZ005                        3                  4                  3
  WT clone 01                  4                  4                  2
  WT clone 02                  4                  2                  4
  WT clone 03                  2                  4                  4
  WT clone 04                  4                  2                  4
  WT clone 05                  4                  3                  4
             
              2021_03_05_Batch16 2021_03_05_Batch17 2021_03_12_Batch18
  BZ001                        4                  2                  3
  BZ002       

In [6]:
# Right-hand side of the ANOVA formula: clone status plus the categorical
# technical covariates
anova_covariates <- c(
    "Metadata_clone_type_indicator",
    "Metadata_treatment_time",
    "Metadata_batch",
    "Metadata_clone_number"
)
formula_terms <- paste("~", paste(anova_covariates, collapse = " + "))

# Right-hand side of the linear model formula: clone status plus the scaled
# cell count
cell_count_covariates <- c(
    "Metadata_clone_type_indicator",
    "scale(Metadata_cell_count)"
)
cell_count_formula <- paste("~", paste(cell_count_covariates, collapse = " + "))

In [7]:
# Fit the ANOVA on the training profiles to determine sources of variation
# (`perform_anova` is sourced from utils/signature_utils.R)
lm_results <- bulk_subset_df %>%
    perform_anova(formula_terms)

# Sort the full results from most to least significant, then collect the
# distinct feature names in that order
full_results_df <- dplyr::arrange(
    lm_results[["full_results_df"]],
    dplyr::desc(neg_log_p)
)
features <- unique(full_results_df[["feature"]])

# Run the TukeyHSD post hoc test on the fitted ANOVA objects
tukey_results <- process_tukey(
    aov_list = lm_results[["aovs"]],
    features = features
)

# Fit the cell count linear model, sort by significance and tag the dataset
cell_count_results <- bulk_subset_df %>%
    perform_linear_model(cell_count_formula) %>%
    dplyr::arrange(dplyr::desc(neg_log_p)) %>%
    dplyr::mutate(dataset = dataset)

In [8]:
# Tag the ANOVA and Tukey results with the dataset name
anova_results_df <- dplyr::mutate(lm_results[["full_results_df"]], dataset = dataset)
tukey_results_df <- dplyr::mutate(tukey_results, dataset = dataset)

# Distinct features that were tested
features <- unique(anova_results_df[["feature"]])
num_cp_features <- length(features)

# Bonferroni-style threshold on the -log10 scale: alpha divided by the number
# of features tested
signif_line <- -log10(alpha / num_cp_features)

signif_line

In [9]:
# Derive signature by systematically removing features influenced by technical artifacts

# Candidate features: significantly different for the clone type indicator term
signature_features <- tukey_results_df %>%
    dplyr::filter(term == "Metadata_clone_type_indicator", neg_log_adj_p > !!signif_line) %>%
    dplyr::pull(feature)

# Tukey comparisons between individual clones. The same filtered table feeds
# both the WT count and the exclusion list, so the two stay row-aligned.
clone_number_results_df <- tukey_results_df %>%
    dplyr::filter(term == "Metadata_clone_number")

# Determine if the clone number comparison is between like-clones by counting
# occurrences of "WT" in the comparison label.
# NOTE(review): presumably each label names both clones, so a count of 1 means
# a mixed (WT vs. resistant) comparison and 0 or 2 a like-clone comparison -
# confirm against the label format produced by `process_tukey`
wt_clone_count <- stringr::str_count(clone_number_results_df[["comparison"]], "WT")

# Exclude features that differ significantly in more than one comparison
# whose WT count is not 1 (nonspecific variation)
feature_exclude_nonspecific_variation <- clone_number_results_df %>%
    dplyr::mutate(wt_clone_count = !!wt_clone_count) %>%
    dplyr::filter(neg_log_adj_p > !!signif_line, wt_clone_count != 1) %>%
    dplyr::count(feature) %>%
    dplyr::arrange(desc(n)) %>%
    dplyr::filter(n > 1) %>%
    dplyr::pull(feature)

# Exclude features that are significantly different as explained by batch
feature_exclude_batch <- tukey_results_df %>%
    dplyr::filter(term == "Metadata_batch", neg_log_adj_p > !!signif_line) %>%
    dplyr::pull(feature)

# Exclude features that are significantly impacted by cell count
feature_exclude_cell_count <- cell_count_results %>%
    dplyr::filter(term == "scale(Metadata_cell_count)", neg_log_p > !!signif_line) %>%
    dplyr::pull(feature)

# Exclude features that are significantly impacted by treatment time
feature_exclude_time <- tukey_results_df %>%
    dplyr::filter(term == "Metadata_treatment_time", neg_log_adj_p > !!signif_line) %>%
    dplyr::pull(feature)

# Restrict signature: drop every feature flagged by any exclusion rule.
# A single set difference against the union of all exclusion lists gives the
# same result as removing the lists one after another.
excluded_features <- unique(c(
    feature_exclude_nonspecific_variation,
    feature_exclude_batch,
    feature_exclude_cell_count,
    feature_exclude_time
))
final_signature_features <- setdiff(signature_features, excluded_features)

# Create a summary of the signatures: one row per tested feature with a flag
# for each exclusion rule and for membership in the final signature
signature_summary_df <- dplyr::tibble(features = features) %>%
    dplyr::mutate(
        non_status_significant_exclude = !(features %in% !!signature_features),
        batch_exclude = features %in% !!feature_exclude_batch,
        cell_count_exclude = features %in% !!feature_exclude_cell_count,
        non_specific_exclude = features %in% !!feature_exclude_nonspecific_variation,
        treatment_time_exclude = features %in% !!feature_exclude_time,
        final_signature = features %in% !!final_signature_features,
        dataset = !!dataset
    )

print(paste("For the dataset:", dataset))
print(paste("the number of features in the core signature:", sum(signature_summary_df$final_signature)))

[1] "For the dataset: bortezomib"
[1] "the number of features in the core signature: 45"


In [10]:
# Determine feature direction
# Rows of the summary table flagged as part of the final signature
final_signature <- dplyr::filter(signature_summary_df, final_signature)

# Tukey results for the clone type indicator term, restricted to the final
# signature features of this dataset
tukey_subset_results_df <- tukey_results_df %>%
    dplyr::filter(dataset == !!dataset) %>%
    dplyr::filter(term == "Metadata_clone_type_indicator") %>%
    dplyr::filter(feature %in% final_signature$features)

# Split the signature by the sign of the estimate
up_features <- dplyr::pull(
    dplyr::filter(tukey_subset_results_df, estimate > 0),
    feature
)
down_features <- dplyr::pull(
    dplyr::filter(tukey_subset_results_df, estimate < 0),
    feature
)

# Store signature for downstream analyses
signature_features <- list("up" = up_features, "down" = down_features)

signature_features

In [11]:
# Make sure the results directory exists before writing: readr::write_tsv does
# not create missing directories. This is a no-op if it is already present.
dir.create(output_results_dir, recursive = TRUE, showWarnings = FALSE)

# Output file paths, one per result table, named after the dataset
anova_output_file <- file.path(output_results_dir, paste0("anova_results_", dataset, "_signature.tsv.gz"))
tukey_output_file <- file.path(output_results_dir, paste0("tukey_results_", dataset, "_signature.tsv.gz"))
cell_count_output_file <- file.path(output_results_dir, paste0("lm_cell_count_results_", dataset, "_signature.tsv.gz"))
signature_output_file <- file.path(output_results_dir, paste0("signature_summary_", dataset, "_signature.tsv.gz"))

# Write the ANOVA, Tukey, cell count model and signature summary tables
anova_results_df %>% readr::write_tsv(anova_output_file)
tukey_results_df %>% readr::write_tsv(tukey_output_file)
cell_count_results %>% readr::write_tsv(cell_count_output_file)
signature_summary_df %>% readr::write_tsv(signature_output_file)