# Visualize Model Coefficients

**Gregory Way, 2019**

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(ggrepel))

In [2]:
consensus <- "modz"

## Load Function

In [3]:
coef_plot <- function(df, target_name, compartment_features, coef_theme, top_plot_num = 15) {
    # Compile a series of plots that describe model coefficients
    #
    # Arguments:
    # df - coefficient dataframe with feature metadata and weights
    # target_name - a string of a specific cell health model
    # compartment_features - a vector of which features to plot
    # coef_theme - a ggplot theme object to apply to all plots
    # top_plot_num - the number of top individual features to visualize
    
    # Subset the input dataframe to focus on the specific target
    subset_coef_df <- coef_df %>% dplyr::filter(target == !!target_name)

    # Extract and process specific feature sets
    area_df <- subset_coef_df %>%
        dplyr::filter(feature_group  == "AreaShape") %>%
        dplyr::group_by(shuffle, compartment, feature_group) %>%
        dplyr::top_n(n = 1, wt = abs_weight)

    compartment_df <- subset_coef_df %>%
        dplyr::filter(feature_group %in% !!compartment_features) %>%
        dplyr::group_by(shuffle, compartment, feature_group, channel) %>%
        dplyr::top_n(n = 1, wt = abs_weight)

    # Process individual feature name info
    total_features <- length(unique(subset_coef_df$feature))
    total_non_zero_features <- nrow(
        subset_coef_df %>%
            dplyr::filter(shuffle == "Real") %>%
            dplyr::filter(abs_weight > 0)
        )
    feature_title <- paste0(
        round((total_non_zero_features / total_features) * 100, 2),
        "% Non-Zero"
    )

    top_n_features <- subset_coef_df %>%
        dplyr::filter(shuffle == "Real") %>%
        dplyr::top_n(n = top_plot_num, wt = abs_weight) %>%
        dplyr::pull(feature)

    feature_order <- subset_coef_df %>%
        dplyr::filter(shuffle == "Real") %>%
        dplyr::arrange(weight) %>%
        dplyr::pull(feature)

    subset_coef_features_df <- subset_coef_df %>%
        dplyr::filter(shuffle == "Real") %>%
        dplyr::filter(feature %in% !!top_n_features)

    subset_coef_features_df$feature <- factor(
        subset_coef_features_df$feature, levels = feature_order
    )

    # 1st Plot - Area Features
    area_gg <- ggplot(area_df, aes(x = compartment, y = feature_group)) +
        geom_point(aes(fill = abs_weight), size = 3, pch = 21) +
        facet_wrap(~shuffle) +
        scale_fill_viridis_c(name = "Abs Weight") +
        ylab("") +
        xlab("Compartment") +
        theme(axis.text.y = element_text(angle = 90, hjust = 0.5)) +
        theme_bw() +
        coef_theme

    # 2nd Plot - Other Compartment Features
    compartment_gg <- ggplot(compartment_df, aes(x = channel, y = feature_group)) +
        geom_point(aes(fill = abs_weight), size = 3, pch = 21) +
        facet_grid(compartment~shuffle) +
        scale_fill_viridis_c(name = "Abs Weight") +
        ylab("Feature Group") +
        xlab("Channel") +
        theme_bw() +
        coef_theme +
        theme(axis.text.x = element_text(angle = 90))

    # 3rd Plot - Individual Feature Names
    feature_name_gg <- ggplot(subset_coef_features_df, aes(x = feature, y = weight)) +
        geom_bar(fill = "#8DB495", color = "black", stat = "identity") +
        ggtitle(feature_title) +
        coord_flip() +
        xlab("") +
        ylab("Model Coefficient") +
        theme_bw() +
        coef_theme

    # Get cowplot title
    full_title <- ggdraw() + 
      draw_label(
          target_name,
          fontface = 'bold',
          x = 0,
          hjust = -0.1
      )
    
    # Compile together into single cowplot
    coef_full_gg <- cowplot::plot_grid(
        full_title,
        cowplot::plot_grid(
            feature_name_gg,
            cowplot::plot_grid(
                area_gg,
                compartment_gg,
                nrow = 2,
                rel_heights = c(0.25, 1),
                align = "v",
                axis = "l"
            ),
            ncol = 2
        ),
        nrow = 2,
        rel_heights = c(0.1, 1)
    )
    
    return(coef_full_gg)
}

## Load Data and Process Feature Info

In [4]:
coef_file <- file.path("results", paste0("full_cell_health_coefficients_", consensus, ".tsv.gz"))

coef_df <- readr::read_tsv(
    coef_file,
    col_types = readr::cols(
        feature = readr::col_character(),
        weight = readr::col_double(),
        abs_weight = readr::col_double(),
        target = readr::col_character(),
        y_transform = readr::col_character(),
        shuffle = readr::col_character()
    )
) %>%
    dplyr::filter(y_transform == "raw") %>%
    tidyr::separate(
        feature,
        into = c(
            "compartment",
            "feature_group",
            "measurement",
            "channel", 
            "parameter1", 
            "parameter2"
        ),
        sep = "_",
        remove = FALSE
    )

coef_df$shuffle <- dplyr::recode(coef_df$shuffle,
                                 "shuffle_true" = "Permuted",
                                 "shuffle_false" = "Real")
coef_df$shuffle <- factor(coef_df$shuffle,
                          levels = c("Real", "Permuted"))

print(dim(coef_df))
head(coef_df, 5)

“Expected 6 pieces. Missing pieces filled with `NA` in 141820 rows [2, 3, 5, 6, 8, 10, 11, 16, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 37, 38, ...].”

[1] 223720     12


feature,compartment,feature_group,measurement,channel,parameter1,parameter2,weight,abs_weight,target,y_transform,shuffle
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<fct>
Nuclei_Texture_InfoMeas1_AGP_5_0,Nuclei,Texture,InfoMeas1,AGP,5.0,0.0,-0.04896722,0.04896722,cc_all_high_n_spots_h2ax_mean,raw,Permuted
Nuclei_Granularity_12_Mito,Nuclei,Granularity,12,Mito,,,-0.04010161,0.04010161,cc_all_high_n_spots_h2ax_mean,raw,Permuted
Cytoplasm_Intensity_MaxIntensity_RNA,Cytoplasm,Intensity,MaxIntensity,RNA,,,-0.03304003,0.03304003,cc_all_high_n_spots_h2ax_mean,raw,Permuted
Cells_Texture_Variance_ER_20_0,Cells,Texture,Variance,ER,20.0,0.0,0.03236982,0.03236982,cc_all_high_n_spots_h2ax_mean,raw,Permuted
Cells_Granularity_11_AGP,Cells,Granularity,11,AGP,,,-0.03047637,0.03047637,cc_all_high_n_spots_h2ax_mean,raw,Permuted


## Set Constants

In [5]:
compartment_features <- c("Texture", "Intensity", "RadialDistribution", "Correlation", "Granularity")
top_plot_num <- 15
coef_theme <- theme(
    strip.text = element_text(size = 6,
                              color = "black",
                              margin = margin(1, 1, 1, 1)),
    strip.background = element_rect(colour = "black",
                                    fill = "#fdfff4"),
    axis.text = element_text(size = 5),
    axis.title = element_text(size = 6),
    plot.title = element_text(size = 6),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 5),
    legend.key.width = unit(0.5, "cm"),
    legend.key.size = unit(0.3, "cm")
)

## Generate Plot for every Cell Health Target

In [6]:
pdf_file <- file.path("figures",
                      paste0("all_model_coefficients_", consensus, ".pdf"))
pdf(pdf_file, width = 5, height = 7, onefile = TRUE)

for (target in unique(coef_df$target)) {
    coef_gg <- coef_plot(
        df = coef_df,
        target_name = target,
        compartment_features = compartment_features,
        coef_theme = coef_theme,
        top_plot_num = top_plot_num
    )
    output_file <- file.path(
        "figures",
        "coefficients",
        paste0("model_", consensus, "_", target, ".png")
    )
    cowplot::save_plot(output_file, coef_gg, base_height = 5, base_width = 7)
    
    print(coef_gg)
}

dev.off()