In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(broom))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggrepel))

source(file.path("../3.bulk-signatures/scripts", "signature_utils.R"))

In [2]:
# Fix the random seed so any stochastic step downstream is reproducible
set.seed(123)

# Dataset to analyse and the location of its analytical set
dataset <- "bortezomib"
input_data_dir <- "data"
data_file <- file.path(input_data_dir, "bortezomib_signature_analytical_set.tsv.gz")

# Output locations for figures and result tables
output_fig_dir <- file.path("figures", "anova")
output_results_dir <- file.path("results", "signatures")

In [3]:
# Load profiles
# Metadata columns that are parsed as text
character_metadata_columns <- c(
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_batch",
    "Metadata_clone_number",
    "Metadata_plate_map_name",
    "Metadata_treatment",
    "Metadata_dataset",
    "Metadata_clone_type",
    "Metadata_clone_type_indicator",
    "Metadata_model_split",
    "Metadata_cell_density",
    "Metadata_treatment_time",
    "Metadata_unique_sample_name",
    "Metadata_time_to_adhere"
)

# One character parser per text column, keyed by column name
column_parsers <- lapply(
    stats::setNames(nm = character_metadata_columns),
    function(column_name) readr::col_character()
)

# Cell count is the only integer column; anything not listed is read as double
column_parsers[["Metadata_cell_count"]] <- readr::col_integer()
column_parsers[[".default"]] <- readr::col_double()

bulk_col_types <- do.call(readr::cols, column_parsers)

data_df <- readr::read_tsv(file = data_file, col_types = bulk_col_types)

# Report the table dimensions and preview the first four rows
print(dim(data_df))
head(data_df, n = 4)

[1]  405 3546


Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,⋯,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
219907,B02,2021_03_03_Batch12,6139,2.5x10^3 cells/well,1,WT_parental,20210205,219814,48 hr,⋯,-0.6430638,-0.6342366,-0.6554599,-0.6673215,-0.6546287,-0.6977682,-0.6320059,-0.6297243,-0.6345844,-0.632946
219907,B03,2021_03_03_Batch12,4567,2.5x10^3 cells/well,2,CloneA,20210205,219814,48 hr,⋯,-0.8308873,-0.8344854,-0.8304672,-0.8178338,-0.8309076,-0.8234369,-0.8347194,-0.8332018,-0.8327623,-0.8296075
219907,B04,2021_03_03_Batch12,5624,2.5x10^3 cells/well,3,CloneE,20210205,219814,48 hr,⋯,-0.9813265,-0.97597,-0.9728225,-0.9617426,-0.9721434,-0.9618017,-0.9802334,-0.9790954,-0.9839611,-0.979349
219907,B05,2021_03_03_Batch12,5894,2.5x10^3 cells/well,4,WT clone 01,20210205,219814,48 hr,⋯,-1.0862537,-1.0826744,-1.0689886,-1.0663278,-1.0579078,-1.061034,-1.0902747,-1.0863724,-1.0886331,-1.0853202


In [4]:
# Keep only the training-split profiles that belong to the selected dataset
bulk_subset_df <- data_df %>%
    dplyr::filter(Metadata_dataset == !!dataset) %>%
    dplyr::filter(Metadata_model_split == "training")

# Encode the clone type indicator as a two-level factor ("0" first, then "1")
bulk_subset_df <- bulk_subset_df %>%
    dplyr::mutate(
        Metadata_clone_type_indicator = factor(
            Metadata_clone_type_indicator,
            levels = c("0", "1")
        )
    )

# Describe the training set: number of profiles per clone and batch
print(paste("Training dataset:", dataset))
print(
    table(
        bulk_subset_df[["Metadata_clone_number"]],
        bulk_subset_df[["Metadata_batch"]]
    )
)

[1] "Training dataset: bortezomib"
             
              2021_03_03_Batch12 2021_03_03_Batch13 2021_03_03_Batch15
  BZ001                        3                  4                  4
  BZ002                        4                  4                  4
  BZ003                        3                  2                  4
  BZ004                        3                  2                  4
  BZ005                        4                  3                  4
  CloneA                       4                  3                  4
  CloneE                       3                  4                  2
  WT clone 01                  3                  4                  2
  WT clone 02                  4                  3                  4
  WT clone 03                  4                  3                  3
  WT clone 04                  3                  3                  4
  WT clone 05                  2                  4                  4
  WT_parental               

In [6]:
# Right-hand side of the ANOVA model: clone status plus batch and clone identity
anova_model_terms <- c(
    "Metadata_clone_type_indicator",
    "Metadata_batch",
    "Metadata_clone_number"
)
formula_terms <- paste("~", paste(anova_model_terms, collapse = " + "))

# Right-hand side of the cell count model: clone status plus scaled cell count
cell_count_model_terms <- c(
    "Metadata_clone_type_indicator",
    "scale(Metadata_cell_count)"
)
cell_count_formula <- paste("~", paste(cell_count_model_terms, collapse = " + "))

In [7]:
# Fit the models described by `formula_terms` on the training profiles
# (perform_anova() is provided by signature_utils.R)
lm_results <- perform_anova(bulk_subset_df, formula_terms)

# Sort the full results from largest to smallest -log p, then collect the
# distinct feature names in that order
full_results_df <- dplyr::arrange(
    lm_results[["full_results_df"]],
    dplyr::desc(neg_log_p)
)
features <- unique(full_results_df$feature)

# Run the TukeyHSD post hoc step on the fitted models for those features
tukey_results <- process_tukey(
    features = features,
    aov_list = lm_results[["aovs"]]
)

# Fit the cell count model, rank its results by -log p, and tag the dataset
cell_count_results <- perform_linear_model(bulk_subset_df, cell_count_formula)
cell_count_results <- dplyr::arrange(cell_count_results, dplyr::desc(neg_log_p))
cell_count_results <- dplyr::mutate(cell_count_results, dataset = dataset)

In [11]:
# Tag the ANOVA and Tukey result tables with the dataset name
anova_results_df <- dplyr::mutate(lm_results[["full_results_df"]], dataset = dataset)
tukey_results_df <- dplyr::mutate(tukey_results, dataset = dataset)

# Distinct features that were tested
features <- unique(anova_results_df$feature)
num_cp_features <- length(features)

# Bonferroni-style threshold on the -log10 scale: alpha of 0.05 divided by
# the number of tested features
signif_line <- -log10(0.05 / num_cp_features)

# Candidate signature: features whose clone type indicator term exceeds the
# significance threshold
status_hits_df <- dplyr::filter(
    tukey_results_df,
    term == "Metadata_clone_type_indicator",
    neg_log_adj_p > !!signif_line
)
signature_features <- dplyr::pull(status_hits_df, feature)

# For every clone-vs-clone comparison, count how often "WT" appears in the
# comparison label; a count of exactly 1 means one WT-labelled clone was
# compared against one clone without the WT label
clone_number_tukey_df <- dplyr::filter(
    tukey_results_df,
    term == "Metadata_clone_number"
)
wt_clone_count <- stringr::str_count(
    dplyr::pull(clone_number_tukey_df, "comparison"),
    "WT"
)

# Features that differ significantly between clones of the same type
# (comparisons whose WT count is not 1) are treated as nonspecific variation
nonspecific_hits_df <- clone_number_tukey_df %>%
    dplyr::mutate(wt_clone_count = wt_clone_count) %>%
    dplyr::filter(neg_log_adj_p > !!signif_line, wt_clone_count != 1)
feature_exclude_nonspecific_variation <- unique(
    dplyr::pull(nonspecific_hits_df, feature)
)

# Features with a significant batch term
# NOTE(review): this set is computed but is not subtracted from the signature
# below - confirm whether batch-associated features should also be removed
batch_hits_df <- dplyr::filter(
    tukey_results_df,
    term == "Metadata_batch",
    neg_log_adj_p > !!signif_line
)
feature_exclude_batch <- dplyr::pull(batch_hits_df, feature)

# Features that are significantly associated with cell count
cell_count_hits_df <- dplyr::filter(
    cell_count_results,
    term == "scale(Metadata_cell_count)",
    neg_log_p > !!signif_line
)
feature_exclude_cell_count <- dplyr::pull(cell_count_hits_df, feature)

# Final signature: candidates minus the cell count and nonspecific exclusions
final_signature_features <- signature_features %>%
    setdiff(unique(feature_exclude_cell_count)) %>%
    setdiff(unique(feature_exclude_nonspecific_variation))

# One row per tested feature, flagging why it was excluded from the signature
# (or that it made the final cut)
signature_summary_df <- tibble(features) %>%
    dplyr::mutate(
        non_status_significant_exclude = !(features %in% signature_features),
        cell_count_exclude = features %in% feature_exclude_cell_count,
        non_specific_exclude = features %in% feature_exclude_nonspecific_variation,
        final_signature = features %in% final_signature_features,
        dataset = dataset
    )

# Report how many features ended up in the final signature
num_final_signature_features <- sum(signature_summary_df[["final_signature"]])

print(paste("For the dataset:", dataset))
print(
    paste(
        "the number of features in the core signature:",
        num_final_signature_features
    )
)

[1] "For the dataset: bortezomib"
[1] "the number of features in the core signature: 34"


In [12]:
# Show only the rows for features that are part of the final signature
dplyr::filter(signature_summary_df, final_signature)

features,non_status_significant_exclude,cell_count_exclude,non_specific_exclude,final_signature,dataset
<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>
Cells_Correlation_K_DNA_AGP,False,False,False,True,bortezomib
Cells_Correlation_RWC_AGP_RNA,False,False,False,True,bortezomib
Cells_Correlation_RWC_ER_Mito,False,False,False,True,bortezomib
Cells_Intensity_MeanIntensityEdge_Mito,False,False,False,True,bortezomib
Cytoplasm_Correlation_K_Mito_DNA,False,False,False,True,bortezomib
Cytoplasm_Correlation_Manders_DNA_AGP,False,False,False,True,bortezomib
Cytoplasm_Correlation_RWC_DNA_Mito,False,False,False,True,bortezomib
Cytoplasm_Granularity_14_ER,False,False,False,True,bortezomib
Cytoplasm_Granularity_15_Mito,False,False,False,True,bortezomib
Cytoplasm_Texture_Entropy_Mito_20_01,False,False,False,True,bortezomib


In [13]:
# Make sure the results directory exists before writing; readr::write_tsv()
# does not create missing parent directories and would fail otherwise
dir.create(output_results_dir, recursive = TRUE, showWarnings = FALSE)

# Build the output paths, one gzip-compressed TSV per result table
anova_output_file <- file.path(output_results_dir, paste0("anova_results_", dataset, "_signature.tsv.gz"))
tukey_output_file <- file.path(output_results_dir, paste0("tukey_results_", dataset, "_signature.tsv.gz"))
cell_count_output_file <- file.path(output_results_dir, paste0("lm_cell_count_results_", dataset, "_signature.tsv.gz"))
signature_output_file <- file.path(output_results_dir, paste0("signature_summary_", dataset, "_signature.tsv.gz"))

# Write the ANOVA, Tukey, cell count model, and signature summary tables
anova_results_df %>% readr::write_tsv(anova_output_file)
tukey_results_df %>% readr::write_tsv(tukey_output_file)
cell_count_results %>% readr::write_tsv(cell_count_output_file)
signature_summary_df %>% readr::write_tsv(signature_output_file)

In [14]:
# Echo the name of the dataset that was processed
dataset

In [15]:
# Echo the path the signature summary was written to
signature_output_file