# Generate plots to review filtering

After our filtering of cell type clusters for doublets and low gene expression, we need to generate visualizations to review the accuracy of our filtering process.

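Here, we'll load the tables we generated during filtering to enable review for each cell type: cell metadata with labels, Louvain clusters, flags for filtering, and UMAP coordinates; tables of marker gene expression for each cluster; and tables of the filtering decision made for each cluster.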

In [218]:
# Attach a package exactly as library() would, but without printing the
# package's startup messages. All arguments are forwarded to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(
        library(...)
    )
}

quiet_library(cowplot)
quiet_library(data.table)
quiet_library(dplyr)
quiet_library(ggplot2)
quiet_library(ggrastr)
quiet_library(hise)
quiet_library(purrr)

## Helper functions

In [235]:
# Plot a UMAP of cells colored by a cluster column, with a text label placed
# at the median UMAP position of each cluster.
#
# meta_df:     data.frame of cells with columns umap_1, umap_2 and the column
#              named by cluster_col
# cluster_col: a single string giving the name of the cluster column
# plot_title:  title shown above the plot
#
# Returns a ggplot object.
plot_clusters <- function(meta_df, cluster_col, plot_title) {
    # Fail early with a clear message instead of a tidy-eval lookup error
    stopifnot(
        is.character(cluster_col),
        length(cluster_col) == 1,
        cluster_col %in% names(meta_df)
    )

    # Treat cluster_col strictly as a column name. Unlike parse_expr(), sym()
    # never parses the string as R code, so names that are not syntactic R
    # (spaces, leading digits, operators) still resolve to the column.
    color <- rlang::sym(cluster_col)

    # One row per cluster: where to draw its label
    label_df <- meta_df %>%
      group_by(!!color) %>%
      summarise(umap_1 = median(umap_1),
                umap_2 = median(umap_2))
    
    ggplot() +
      geom_point_rast(
          data = meta_df,
          aes(x = umap_1,
              y = umap_2,
              color = !!color),
          size = 0.2
      ) +
      geom_text(
          data = label_df,
          aes(x = umap_1,
              y = umap_2,
              label = !!color)
      ) +
      scale_color_discrete() +
      scale_fill_discrete() +
      theme_bw() +
      ggtitle(plot_title)
}

In [241]:
# UMAP of cells colored by removal reason, with a filled label for each
# reason drawn at the median UMAP position of its cells.
#
# meta_df:    data.frame of cells with umap_1, umap_2 and remove_reason
# color_df:   data.frame with remove_reason and remove_color columns, used
#             as the breaks and values of the color and fill scales
# plot_title: title shown above the plot
#
# Returns a ggplot object.
plot_removal <- function(meta_df, color_df, plot_title) {

    # Label anchor for every removal reason
    reason_centers <- summarise(
        group_by(meta_df, remove_reason),
        umap_1 = median(umap_1),
        umap_2 = median(umap_2)
    )

    cell_layer <- geom_point_rast(
        data = meta_df,
        mapping = aes(umap_1, umap_2, color = remove_reason),
        size = 0.2
    )

    reason_layer <- geom_label(
        data = reason_centers,
        mapping = aes(umap_1, umap_2,
                      label = remove_reason,
                      fill = remove_reason)
    )

    # Points and label boxes share the same reason-to-color assignment
    point_colors <- scale_color_manual(
        breaks = color_df$remove_reason,
        values = color_df$remove_color
    )
    label_fills <- scale_fill_manual(
        breaks = color_df$remove_reason,
        values = color_df$remove_color
    )

    ggplot() +
      cell_layer +
      reason_layer +
      point_colors +
      label_fills +
      theme_bw() +
      ggtitle(plot_title)
}

In [305]:
# UMAP of cells colored by their AIFI_L3 label. Labels with more than 100
# cells get a text box at their median UMAP position; the legend is hidden.
#
# meta_df:    data.frame of cells with umap_1, umap_2 and AIFI_L3
# plot_title: title shown above the plot
#
# Returns a ggplot object.
plot_l3 <- function(meta_df, plot_title) {

    # Median position and cell count of each AIFI_L3 label
    l3_centers <- summarise(
        group_by(meta_df, AIFI_L3),
        umap_1 = median(umap_1),
        umap_2 = median(umap_2),
        n_cells = n()
    )
    # Labels with 100 cells or fewer are left unlabeled
    l3_centers <- filter(l3_centers, n_cells > 100)

    cell_layer <- geom_point_rast(
        data = meta_df,
        mapping = aes(umap_1, umap_2, color = AIFI_L3),
        size = 0.2
    )

    name_layer <- geom_label(
        data = l3_centers,
        mapping = aes(umap_1, umap_2, label = AIFI_L3)
    )

    ggplot() +
      cell_layer +
      name_layer +
      scale_color_discrete() +
      scale_fill_discrete() +
      theme_bw() +
      theme(legend.position = "none") +
      ggtitle(plot_title)
}

In [237]:
# Dot plot of marker genes (rows) by louvain_2 cluster (columns). Dot size
# is gene_frac and dot fill is log(gene_mean + 1). Clusters listed in
# filter_df get a white tile behind each of their dots, outlined in the
# color of their removal reason.
#
# marker_df: data.frame with louvain_2, gene, gene_frac and gene_mean
# filter_df: data.frame with louvain_2 and remove_reason
# color_df:  data.frame with remove_reason and remove_color columns
#
# Returns a ggplot object.
plot_markers <- function(marker_df, filter_df, color_df) {

    # Every cluster/gene pair present in the marker table, paired with the
    # filter table so each filtered cluster gets one tile per gene
    cluster_genes <- distinct(marker_df, louvain_2, gene)
    filter_tiles <- inner_join(filter_df, cluster_genes, by = "louvain_2")

    # Invisible points drawn first so the axes are set up from the full
    # marker table before the tiles are added
    axis_layer <- geom_point(
        data = marker_df,
        mapping = aes(louvain_2, gene),
        size = 0.1,
        alpha = 0
    )

    tile_layer <- geom_tile(
        data = filter_tiles,
        mapping = aes(louvain_2, gene, color = remove_reason),
        linewidth = 1,
        fill = "white"
    )

    dot_layer <- geom_point(
        data = marker_df,
        mapping = aes(louvain_2, gene,
                      size = gene_frac,
                      fill = log(gene_mean + 1)),
        pch = 21
    )

    expression_ramp <- c("black", "darkred", "red", "orangered", "orange")

    ggplot(marker_df) +
      axis_layer +
      tile_layer +
      dot_layer +
      scale_size_area() +
      scale_fill_gradientn(colors = expression_ramp) +
      scale_color_manual(
          breaks = color_df$remove_reason,
          values = color_df$remove_color
      ) +
      theme_bw() +
      theme(axis.ticks = element_blank())
}

In [238]:
# Violin plot of log10(n_genes) per louvain_2 cluster, filled by removal
# reason, with a point marking each cluster's median gene count.
#
# meta_df:  data.frame of cells with louvain_2, n_genes and remove_reason
# color_df: data.frame with remove_reason and remove_color columns, used as
#           the breaks and values of the fill scale
#
# Returns a ggplot object.
plot_genes <- function(meta_df, color_df) {
    # Medians depend only on louvain_2 and n_genes, so they are computed
    # straight from meta_df; joining the color table first added nothing and
    # produced suffixed columns whenever meta_df already carried remove_color.
    med_df <- meta_df %>%
      group_by(louvain_2) %>%
      summarise(med_genes = median(n_genes))
    
    ggplot() +
      geom_violin(
          data = meta_df,
          aes(x = louvain_2,
              y = log10(n_genes),
              fill = remove_reason)
      ) +
      geom_point(
          data = med_df,
          aes(x = louvain_2,
              y = log10(med_genes))
      ) +
      scale_fill_manual(
          breaks = color_df$remove_reason,
          values = color_df$remove_color
      ) +
      theme_bw()
}

In [292]:
# Draw a small table, as a ggplot, of how many cells fall under each removal
# reason. There is one colored row per reason, a final "Total removed" row
# at the bottom, and a header row at the top.
#
# meta_df:  data.frame of cells with a remove_reason column; cells that were
#           kept are expected to carry the value "Not removed"
# color_df: data.frame with remove_reason and remove_color columns, used as
#           the breaks and values of the row fill scale
#
# Returns a ggplot object.
plot_summary <- function(meta_df, color_df) {

    # One row per reason. ypos counts down from the number of reasons to 1 so
    # the first reason is drawn at the top. rev(seq_len()) is used rather
    # than n():1, which would yield c(0, 1) for an empty table and error.
    summary_df <- meta_df %>%
      group_by(remove_reason) %>%
      summarise(n_reason = n()) %>%
      ungroup() %>%
      mutate(perc_reason = format(round(n_reason / sum(n_reason) * 100,2), nsmall = 2)) %>%
      mutate(ypos = rev(seq_len(n())))

    # Everything that is not explicitly kept counts as removed
    n_removed <- sum(meta_df$remove_reason != "Not removed")

    total_df <- data.frame(
        remove_reason = "Total removed",
        n_reason = n_removed,
        perc_reason = format(round(n_removed / nrow(meta_df) * 100, 2), nsmall = 2),
        ypos = 0
    )

    # Header sits one row above the first reason; the x positions and hjust
    # values match the three text columns drawn below
    header_df <- data.frame(
        label = c("Reason", "N", "%"),
        xpos = c(1, 2.4, 3),
        ypos = max(summary_df$ypos + 1),
        hjust = c(0, 1, 1)
    )

    summary_df <- rbind(summary_df, total_df)
    
    ggplot() +
      geom_rect(
          data = summary_df,
          aes(xmin = 0.9, xmax = 3.1,
              ymin = ypos - 0.5, ymax = ypos + 0.5,
              fill = remove_reason),
          alpha = 0.5
      ) +
      geom_text(
          data = header_df,
          aes(x = xpos, y = ypos,
              label = label, hjust = hjust)
      ) +
      geom_text(
          data = summary_df,
          aes(x = 1,
              y = ypos,
              label = remove_reason),
          hjust = 0
      ) +
      geom_text(
          data = summary_df,
          aes(x = 2.4,
              y = ypos,
              label = n_reason),
          hjust = 1
      ) +
      geom_text(
          data = summary_df,
          aes(x = 3,
              y = ypos,
              label = perc_reason),
          hjust = 1
      ) +
      scale_fill_manual(
          breaks = color_df$remove_reason,
          values = color_df$remove_color
      ) +
      scale_x_continuous("", expand = c(0,0)) +
      theme_bw() +
      theme(legend.position = "none",
            panel.grid = element_blank(),
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            panel.border = element_blank())
}

In [301]:
# Build the full review figure for one group of cells.
#
# Layout: the top row holds three UMAPs (clusters, removal reason, AIFI_L3
# labels). The bottom section is a 2 x 2 grid holding the marker dot plot and
# summary table, then the gene-count violins and an empty filler panel.
#
# filter_df:  cluster-level table with louvain_2 and remove_reason
# marker_df:  marker table with louvain_2, gene, gene_frac and gene_mean
# meta_df:    cell-level table with louvain_2, umap_1, umap_2, AIFI_L3,
#             n_genes and remove_reason
# color_df:   data.frame with remove_reason and remove_color columns
# group_name: string used as the title of the three UMAPs
#
# Returns the combined cowplot grid.
assemble_plots <- function(filter_df, marker_df, meta_df, color_df, group_name) {
    # Use one shared, sorted set of cluster levels so the x axes of the dot
    # plot and the violins line up
    cluster_levels <- unique(meta_df$louvain_2) %>% sort()

    filter_df$louvain_2 <- factor(filter_df$louvain_2, levels = cluster_levels, ordered = TRUE)
    marker_df$louvain_2 <- factor(marker_df$louvain_2, levels = cluster_levels, ordered = TRUE)
    meta_df$louvain_2 <- factor(meta_df$louvain_2, levels = cluster_levels, ordered = TRUE)

    # Reverse-sorted levels put the first gene alphabetically at the top of
    # the y axis
    gene_order <- marker_df$gene %>% unique() %>% sort(decreasing = TRUE)
    marker_df$gene <- factor(marker_df$gene, levels = gene_order, ordered = TRUE)

    # Use the palette passed in as color_df throughout, rather than reaching
    # for a global, so callers can supply their own colors
    filter_df <- filter_df %>% left_join(color_df, by = "remove_reason")
    meta_df <- meta_df %>% left_join(color_df, by = "remove_reason")

    cluster_umap <- plot_clusters(meta_df, "louvain_2", group_name)
    removal_umap <- plot_removal(meta_df, color_df, group_name)
    l3_umap <- plot_l3(meta_df, group_name)
    marker_dotplot <- plot_markers(marker_df, filter_df, color_df)
    gene_violins <- plot_genes(meta_df, color_df)
    summary_table <- plot_summary(meta_df, color_df)
    
    # Empty panel to fill the fourth cell of the bottom grid
    blank_plot <- ggplot() + theme_bw() + theme(panel.border = element_blank())
    
    top_row <- plot_grid(cluster_umap, removal_umap, l3_umap, nrow = 1, ncol = 3)
    bottom_row <- plot_grid(
        marker_dotplot, summary_table,
        gene_violins, blank_plot,
        nrow = 2, ncol = 2,
        rel_heights = c(1, 0.5),
        rel_widths = c(0.8, 0.2)
    )

    plot_grid(
        top_row, 
        bottom_row,
        nrow = 2, ncol = 1
    )
}

## Color values

In [270]:
# Lookup table pairing each removal reason with its display color. It is
# written as reason = color pairs so each assignment reads on one line, then
# unpacked into the two-column data.frame the plotting functions expect.
reason_colors <- local({
    reason_palette <- c(
        "Not removed"         = "#8efaa4",
        "Total removed"       = "#ffffff",
        "B cell doublet"      = "#fcb890",
        "Erythrocyte doublet" = "#fa9c9b",
        "Myeloid doublet"     = "#b997f7",
        "Platelet doublet"    = "#e6db8c",
        "T cell doublet"      = "#99acf2",
        "Low gene count"      = "#8c8c8c"
    )

    data.frame(
        remove_reason = names(reason_palette),
        # unname() so the reasons do not become the row names
        remove_color = unname(reason_palette)
    )
})

## Retrieve review files from HISE

These were stored as a .tar.gz archive to make transfer a bit easier. We'll pull down the .tar.gz then decompress and unbundle with a `system()` call.

In [2]:
# UUID of the review archive in HISE; also used below as the name of the
# cache subfolder the download is read from
review_uuid <- "cc96d018-24e4-4e53-a2db-34806a60eff4"

In [3]:
# Download the archive through hise; the next cell expects to find it under
# cache/<review_uuid>/
cache_res <- cacheFiles(list(review_uuid))

[1] "Initiating file download for diha_qc_AIFI_L2_review_2024-03-15.tar.gz"
[1] "Download successful."


In [4]:
# Full path of whatever was downloaded into this UUID's cache folder
review_cache_dir <- file.path("cache", review_uuid)
review_tar_file <- list.files(review_cache_dir, full.names = TRUE)

In [6]:
# Unpack the downloaded archive into the working directory.

# Exactly one archive is expected in the cache folder; none or several would
# build a malformed command
stopifnot(length(review_tar_file) == 1L)

# shQuote() keeps spaces or shell metacharacters in the path from being
# interpreted by the shell
untar_call <- paste("tar -xzf", shQuote(review_tar_file))

# system() returns the exit status of the command; stop here rather than
# carrying on with a missing or partial output/review folder
untar_status <- system(untar_call)
if (untar_status != 0) {
    stop("tar exited with status ", untar_status, ": ", untar_call, call. = FALSE)
}

Because of how they were stored in the originating instance, these will untar to the `output/review/` subfolder.

In [16]:
# List the review tables of one kind and name each path by its group.
#
# table_tag:  the word that identifies the table kind in the file name
#             ("filter", "marker" or "obs"); files are matched on
#             "<table_tag>_df"
# review_dir: folder holding the unpacked review tables
#
# Returns a character vector of file paths whose names are the text found
# between "diha_qc_" and "_<table_tag>" in each path.
list_review_files <- function(table_tag, review_dir = "output/review") {
    paths <- list.files(
        review_dir,
        pattern = paste0(table_tag, "_df"),
        full.names = TRUE
    )
    group_pattern <- paste0(".+diha_qc_(.+)_", table_tag, ".+")
    setNames(paths, sub(group_pattern, "\\1", paths))
}

filter_files <- list_review_files("filter")

marker_files <- list_review_files("marker")

meta_files <- list_review_files("obs")

# Plots are generated per filter group and look up the other two tables by
# the same name, so make sure each group has all three before starting
stopifnot(
    all(names(filter_files) %in% names(marker_files)),
    all(names(filter_files) %in% names(meta_files))
)

In [17]:
# Groups to plot: the names parsed from the filter table file names
group_names <- names(filter_files)

## Generate plots for every group

In [306]:
# For every group: read its three review tables, build the combined review
# figure, and save it as a PNG under output/.
walk(
    group_names,
    function(group_name) {
        # fread() returns a data.table; the plotting code works on plain
        # data.frames
        read_review_table <- function(path) {
            as.data.frame(fread(path))
        }

        filter_df <- read_review_table(filter_files[group_name])
        marker_df <- read_review_table(marker_files[group_name])
        meta_df <- read_review_table(meta_files[group_name])

        review_figure <- assemble_plots(
            filter_df,
            marker_df,
            meta_df,
            reason_colors,
            group_name
        )

        png_path <- paste0("output/diha_filter-review_", group_name, ".png")

        ggsave(png_path, review_figure, width = 20, height = 12)
    }
)

In [294]:
# Record the R version and package versions used for this run
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] cowplot_1.1.2     ggrastr_1.0.2     purrr_1.0.2       hise_2.16.0      
[5] ggplot2_3.4.4     dplyr_1.1.4       data.table_1.15.0

loaded via a namespace (and not attached):
 [1] gtable_0.3.4      jsonlite_1.8.8    compiler_4.3.2    crayon_1.5.2     
 [5] tidyselect_1.2.0  ggbeeswarm_0.7.2  IRd