In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(broom))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggrepel))

# Load shared helper functions from the bulk-signatures module.
# NOTE(review): presumably this is where perform_anova(), process_tukey(), and
# perform_linear_model() come from (they are called below but not defined in
# this file) - confirm against signature_utils.R.
source(file.path("../3.bulk-signatures/scripts", "signature_utils.R"))

In [2]:
# Fix the RNG seed for reproducibility
set.seed(123)

# Dataset to analyze; it also determines the input file name and labels outputs
dataset <- "bortezomib"
input_data_dir <- "data"
data_file <- file.path(
    input_data_dir,
    paste0(dataset, "_signature_analytical_set.tsv.gz")
)

# Output locations for figures and result tables
output_fig_dir <- file.path("figures", "anova")
output_results_dir <- file.path("results", "signatures")

In [3]:
# Load profiles
# Explicit parsers for the metadata columns, written with readr's one-letter
# type abbreviations ("c" = character, "i" = integer). Every column not listed
# here falls back to double ("d").
metadata_col_types <- c(
    Metadata_Plate = "c",
    Metadata_Well = "c",
    Metadata_cell_count = "i",
    Metadata_batch = "c",
    Metadata_clone_number = "c",
    Metadata_plate_map_name = "c",
    Metadata_treatment = "c",
    Metadata_dataset = "c",
    Metadata_clone_type = "c",
    Metadata_clone_type_indicator = "c",
    Metadata_model_split = "c",
    Metadata_cell_density = "c",
    Metadata_treatment_time = "c",
    Metadata_unique_sample_name = "c",
    Metadata_time_to_adhere = "c"
)

bulk_col_types <- do.call(
    readr::cols,
    c(as.list(metadata_col_types), list(.default = "d"))
)

# Read the gzipped, tab-separated analytical set with the spec above
data_df <- readr::read_tsv(file = data_file, col_types = bulk_col_types)

# Report the table dimensions and preview the first four rows
print(dim(data_df))
head(data_df, n = 4)

[1] 165 272


Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,⋯,Nuclei_RadialDistribution_MeanFrac_Mito_1of4,Nuclei_RadialDistribution_MeanFrac_Mito_4of4,Nuclei_RadialDistribution_MeanFrac_RNA_1of4,Nuclei_RadialDistribution_MeanFrac_RNA_4of4,Nuclei_RadialDistribution_RadialCV_DNA_1of4,Nuclei_Texture_Correlation_DNA_10_03,Nuclei_Texture_Correlation_ER_10_00,Nuclei_Texture_Correlation_ER_10_01,Nuclei_Texture_Correlation_ER_10_03,Nuclei_Texture_Correlation_Mito_10_02
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
219814,B10,2021_02_08_Batch11,12453,2.5x10^3 cells/well,4,WT clone 01,20210205,219814,48 hr,⋯,0.920735,2.29247,2.786923,0.2150331,1.801948,2.8111508,-0.372571,-0.41285981,-0.41128528,1.5414147
219814,B11,2021_02_08_Batch11,2914,2.5x10^3 cells/well,5,WT clone 02,20210205,219814,48 hr,⋯,0.4977887,-1.757208,1.13621,-4.1661807,2.112966,0.1271175,0.2419688,-0.07904743,-0.07081247,1.9853635
219814,C02,2021_02_08_Batch11,6314,2.5x10^3 cells/well,10,BZ002,20210205,219814,48 hr,⋯,3.4329133,2.349513,2.21048,6.0104808,1.421137,3.3594385,0.0205416,-0.18167256,-0.18991518,0.8667502
219814,C03,2021_02_08_Batch11,4275,2.5x10^3 cells/well,9,BZ001,20210205,219814,48 hr,⋯,5.8284016,-4.883283,3.258141,-5.7022461,3.305521,-2.6333408,0.3541864,-0.35552921,-0.37832162,1.8785706


In [4]:
# Keep only the training split of the selected dataset, and encode the clone
# type indicator as a two-level factor ("0", "1") for signature building
bulk_subset_df <- data_df %>%
    dplyr::filter(Metadata_dataset == !!dataset) %>%
    dplyr::filter(Metadata_model_split == "training") %>%
    dplyr::mutate(
        Metadata_clone_type_indicator = factor(
            Metadata_clone_type_indicator,
            levels = c("0", "1")
        )
    )

# Describe the training set: number of profiles per clone and batch
print(paste("Training dataset:", dataset))
clone_by_batch <- table(
    bulk_subset_df$Metadata_clone_number,
    bulk_subset_df$Metadata_batch
)
print(clone_by_batch)

[1] "Training dataset: bortezomib"
             
              2021_02_08_Batch11
  BZ001                        3
  BZ002                        3
  BZ003                        3
  BZ004                        3
  BZ005                        3
  WT clone 01                  3
  WT clone 02                  3
  WT clone 03                  3
  WT clone 04                  3
  WT clone 05                  3


In [5]:
# Right-hand side of the ANOVA model: clone type indicator plus clone number
formula_terms <- paste(
    "~",
    paste(
        c("Metadata_clone_type_indicator", "Metadata_clone_number"),
        collapse = " + "
    )
)

# Right-hand side of the cell count model: clone type indicator plus scaled
# cell count
cell_count_formula <- paste(
    "~",
    paste(
        c("Metadata_clone_type_indicator", "scale(Metadata_cell_count)"),
        collapse = " + "
    )
)

In [6]:
# Run the per-feature ANOVA on the training profiles
lm_results <- perform_anova(bulk_subset_df, formula_terms)

# Sort the ANOVA table from most to least significant and collect the distinct
# feature names in that order
full_results_df <- dplyr::arrange(
    lm_results[["full_results_df"]],
    dplyr::desc(neg_log_p)
)
features <- unique(full_results_df$feature)

# TukeyHSD post hoc test on the fitted aov objects
tukey_results <- process_tukey(
    aov_list = lm_results[["aovs"]],
    features = features
)

# Linear model including cell count, sorted by significance and tagged with
# the dataset name
cell_count_fit_df <- perform_linear_model(bulk_subset_df, cell_count_formula)
cell_count_sorted_df <- dplyr::arrange(cell_count_fit_df, dplyr::desc(neg_log_p))
cell_count_results <- dplyr::mutate(cell_count_sorted_df, dataset = dataset)

In [7]:
# Tag the ANOVA and Tukey result tables with the dataset they describe
anova_results_df <- dplyr::mutate(lm_results[["full_results_df"]], dataset = dataset)
tukey_results_df <- dplyr::mutate(tukey_results, dataset = dataset)

# Significance threshold on the -log10 scale: 0.05 divided by the number of
# distinct features
features <- unique(anova_results_df$feature)
num_cp_features <- length(features)
signif_line <- -log10(0.05 / num_cp_features)

# Candidate signature: features whose clone type indicator term passes the
# threshold. The exclusion lists below remove features that appear to be
# driven by other sources of variation.
signature_features <- tukey_results_df %>%
    dplyr::filter(term == "Metadata_clone_type_indicator") %>%
    dplyr::filter(neg_log_adj_p > !!signif_line) %>%
    dplyr::pull(feature)

# Tukey comparisons for the clone number term
clone_number_tukey_df <- tukey_results_df %>%
    dplyr::filter(term == "Metadata_clone_number")

# Count "WT" occurrences in each comparison label. A count other than 1 is
# treated as a comparison between like clones.
# NOTE(review): assumes each clone label contains "WT" at most once - confirm.
wt_clone_count <- stringr::str_count(
    dplyr::pull(clone_number_tukey_df, "comparison"),
    "WT"
)

# Exclude features that differ significantly between clones of the same type
feature_exclude_nonspecific_variation <- clone_number_tukey_df %>%
    dplyr::mutate(wt_clone_count = wt_clone_count) %>%
    dplyr::filter(neg_log_adj_p > !!signif_line, wt_clone_count != 1) %>%
    dplyr::pull(feature) %>%
    unique()

# Exclude features that are significantly associated with cell count
feature_exclude_cell_count <- dplyr::pull(
    dplyr::filter(
        cell_count_results,
        term == "scale(Metadata_cell_count)",
        neg_log_p > !!signif_line
    ),
    feature
)

# Final signature: candidates that appear in neither exclusion list
final_signature_features <- setdiff(
    signature_features,
    union(feature_exclude_cell_count, feature_exclude_nonspecific_variation)
)

# One row per feature, flagging each exclusion reason and final membership
signature_summary_df <- tibble(features = features) %>%
    dplyr::mutate(
        non_status_significant_exclude = !(features %in% signature_features),
        cell_count_exclude = features %in% feature_exclude_cell_count,
        non_specific_exclude = features %in% feature_exclude_nonspecific_variation,
        final_signature = features %in% final_signature_features,
        dataset = dataset
    )

core_signature_size <- sum(signature_summary_df$final_signature)
print(paste("For the dataset:", dataset))
print(paste("the number of features in the core signature:", core_signature_size))

[1] "For the dataset: bortezomib"
[1] "the number of features in the core signature: 8"


In [8]:
# Show only the features that made it into the final signature
dplyr::filter(signature_summary_df, final_signature)

features,non_status_significant_exclude,cell_count_exclude,non_specific_exclude,final_signature,dataset
<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>
Cells_AreaShape_Zernike_3_3,False,False,False,True,bortezomib
Cells_AreaShape_Zernike_4_2,False,False,False,True,bortezomib
Cells_Correlation_Overlap_DNA_ER,False,False,False,True,bortezomib
Cells_Correlation_Overlap_DNA_Mito,False,False,False,True,bortezomib
Cytoplasm_AreaShape_Zernike_5_5,False,False,False,True,bortezomib
Cytoplasm_RadialDistribution_MeanFrac_DNA_1of4,False,False,False,True,bortezomib
Nuclei_Correlation_Correlation_Mito_ER,False,False,False,True,bortezomib
Nuclei_RadialDistribution_FracAtD_ER_4of4,False,False,False,True,bortezomib


In [9]:
# Make sure the results directory exists before writing; on a fresh checkout
# the writes below would otherwise fail because the directory is missing.
dir.create(output_results_dir, recursive = TRUE, showWarnings = FALSE)

# Output paths, one gzipped TSV per result table, named after the dataset
anova_output_file <- file.path(
    output_results_dir,
    paste0("anova_results_", dataset, "_signature.tsv.gz")
)
tukey_output_file <- file.path(
    output_results_dir,
    paste0("tukey_results_", dataset, "_signature.tsv.gz")
)
cell_count_output_file <- file.path(
    output_results_dir,
    paste0("lm_cell_count_results_", dataset, "_signature.tsv.gz")
)
signature_output_file <- file.path(
    output_results_dir,
    paste0("signature_summary_", dataset, "_signature.tsv.gz")
)

# Write the ANOVA, Tukey, cell count model, and signature summary tables
anova_results_df %>% readr::write_tsv(anova_output_file)
tukey_results_df %>% readr::write_tsv(tukey_output_file)
cell_count_results %>% readr::write_tsv(cell_count_output_file)
signature_summary_df %>% readr::write_tsv(signature_output_file)