## Visualize Distribution of Recoded Features

**Gregory Way, 2019**

We tested the ability of classification models to predict high and low cell health features.
Ultimately, we decided that these models were unstable and not reliable.
Therefore, we did not use them in downstream applications.

One potential reason the classification approach performed poorly is that the cell health input data must first be transformed to binary response variables.
This notebook visualizes the binarization scheme used to force the model input into a classification framework.

I also determine the extent of confounding variables in the recoding scheme.
Basically, asking if there are any plate effects or other potential technical artifacts.

In [1]:
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))


“package ‘ggplot2’ was built under R version 3.6.3”
“package ‘dplyr’ was built under R version 3.6.3”


In [2]:
# Settings that select which result set to visualize
method <- "median"
# method <- "weighted"
consensus <- "modz"

# Figures go under figures/<method>_agg/feature_distribution/<consensus>
output_dir <- file.path(
    "figures",
    paste0(method, "_agg"),
    "feature_distribution",
    consensus
)

# Build the whole directory tree; stay quiet when it is already there
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)


In [3]:
# Load Data
# Declare the expected type of every column in the label file up front
y_cols <- readr::cols(
    Metadata_profile_id = readr::col_character(),
    recode_target_value = readr::col_double(),
    target = readr::col_character(),
    data_type = readr::col_character(),
    shuffle = readr::col_character(),
    y_transform = readr::col_character(),
    y_type = readr::col_character()
)

# Path: results/<method>_agg/full_cell_health_y_labels_<consensus>.tsv.gz
y_file <- file.path(
    "results",
    paste0(method, "_agg"),
    sprintf("full_cell_health_y_labels_%s.tsv.gz", consensus)
)
y_df <- readr::read_tsv(file = y_file, col_types = y_cols)

# Unshuffled true labels after the binarize transform
y_binary_df <- dplyr::filter(
    y_df,
    shuffle == "shuffle_false",
    y_transform == "binarize",
    y_type == "y_true"
)

# Unshuffled true labels on the raw scale
y_raw_scores_df <- dplyr::filter(
    y_df,
    shuffle == "shuffle_false",
    y_transform == "raw",
    y_type == "y_true"
)


In [4]:
# Process data for plotting
# Pair each raw score with its binarized label. Columns that are not join keys
# (recode_target_value, y_transform) receive the "_raw" / "_binary" suffixes.
# The data_type labels are capitalized in the same step.
y_plot_df <- dplyr::inner_join(
    x = y_raw_scores_df,
    y = y_binary_df,
    by = c("Metadata_profile_id", "target", "data_type", "shuffle", "y_type"),
    suffix = c("_raw", "_binary")
) %>%
    dplyr::mutate(
        data_type = dplyr::recode(data_type, "train" = "Train", "test" = "Test")
    )

# Preview the first rows of the merged table
head(y_plot_df, n = 3)


Metadata_profile_id,recode_target_value_raw,target,data_type,shuffle,y_transform_raw,y_type,recode_target_value_binary,y_transform_binary
profile_153,9.671827e-05,cc_all_high_h2ax,Train,shuffle_false,raw,y_true,0,binarize
profile_149,0.4110561,cc_all_high_h2ax,Train,shuffle_false,raw,y_true,0,binarize
profile_331,-0.1869515,cc_all_high_h2ax,Train,shuffle_false,raw,y_true,0,binarize


In [5]:
# Generate and save figures
#
# For every target, draw a histogram of the raw scores, filled by the binary
# recoding and faceted by data_type. Each figure is saved as its own PNG and is
# also printed as one page of a single combined PDF.
pdf_file <- file.path(
    output_dir,
    paste0("all_binary_distributions_std_", consensus, ".pdf")
)

# The directory usually exists already, so silence the "already exists"
# warning; recursive = TRUE also covers the case where parent directories
# are missing.
dir.create(dirname(pdf_file), showWarnings = FALSE, recursive = TRUE)

pdf(pdf_file, width = 5, height = 3.5, onefile = TRUE)

# `finally` closes the PDF device even when a plot fails part-way through,
# so a failed run does not leave a dangling open device behind.
tryCatch({
    for (target in unique(y_plot_df$target)) {
        # `!!target` injects the loop value; a bare `target` on the right-hand
        # side would resolve to the column and compare it with itself.
        y_plot_subset_df <- y_plot_df %>%
            dplyr::filter(target == !!target)

        target_gg <-
            ggplot(y_plot_subset_df,
                   aes(x = recode_target_value_raw,
                       fill = as.factor(recode_target_value_binary))) +
                geom_histogram(bins = 50, alpha = 0.6) +
                facet_grid(~ data_type,
                           scales = "free_y") +
                scale_fill_manual(name = "Binary\nRecoding",
                                  labels = c("0" = "0", "1" = "1"),
                                  values = c("0" = "#AEA367", "1" = "#403019")) +
                xlab(target) +
                ylab("Count") +
                theme_bw() +
                ggtitle(target) +
                theme(axis.text = element_text(size = 8),
                      axis.title = element_text(size = 9),
                      strip.text = element_text(size = 7),
                      legend.title = element_text(size = 8),
                      title = element_text(size = 12),
                      strip.background = element_rect(colour = "black",
                                                      fill = "#fdfff4"))

        # One PNG per target
        output_file <- file.path(
            output_dir,
            paste0(target, "_dist_", consensus, ".png")
        )

        ggsave(filename = output_file,
               plot = target_gg,
               width = 5,
               height = 2.75,
               dpi = 400)

        # Printing while the pdf() device is open adds a page to the PDF
        print(target_gg)
    }
}, finally = {
    dev.off()
})


“'figures/median_agg/feature_distribution/modz' already exists”
