# Review plots and statistics for label refinement

In this notebook, we'll use tools in R to plot and compute summary statistics about our cell label refinements.

## Install packages

This package includes helper functions for plot color sets.

In [1]:
devtools::install_github("hypercompetent/colorway", upgrade = "never", quiet = TRUE)

This package includes helpers for plot types, including river/Sankey plots.

In [2]:
devtools::install_github("alleninstitute/scrattch.vis", upgrade = "never", quiet = TRUE)

## Load libraries

`colorway`: ggplot colorsets  
`cowplot`: Plot grid arrangement  
`hise`: The R SDK for HISE  
`data.table`: Efficient implementation of data frames  
`dplyr`: Manipulation of data frames  
`ggplot2`: Plotting using the grammar of graphics  
`ggrastr`: Rasterized plotting to help with large UMAPs  
`purrr`: functional programming tools for R  
`furrr`: parallelization of `purrr` using `future`  
`scrattch.vis`: Plotting helper functions

In [3]:
# Attach a package while silencing its startup messages.
# All arguments are forwarded unchanged to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(
        library(...)
    )
}

quiet_library(colorway)
quiet_library(cowplot)
quiet_library(hise)
quiet_library(data.table)
quiet_library(dplyr)
quiet_library(ggplot2)
quiet_library(ggrastr)
quiet_library(purrr)
quiet_library(furrr)
quiet_library(scrattch.vis)

In [4]:
# Configure the future backend used by the future_map() calls below:
# a multisession plan with 12 workers.
plan(multisession, workers = 12)

In [5]:
# Figures are saved under output/; create the directory on first run.
if (!dir.exists("output")) dir.create("output")

## Helper functions

These functions pull our previously generated results and provide paths to the files after retrieving them.

In [6]:
# Return the path(s) of the cached file(s) for one file UUID, fetching them
# with cacheFiles() when the cache directory does not exist yet.
#
# uuid: a single file UUID, given either as a character string or wrapped in
#       a list (both call styles are used in this notebook).
# Returns: character vector of full paths found under cache/<uuid>.
cache_path_uuid <- function(uuid) {
    # Flatten list-wrapped input so that cacheFiles() always receives a flat
    # list(<uuid string>) rather than a nested list.
    uuid <- as.character(unlist(uuid, use.names = FALSE))
    stopifnot(length(uuid) == 1, !is.na(uuid), nzchar(uuid))

    cache_dir <- paste0('cache/', uuid)
    if (!dir.exists(cache_dir)) {
        # Presumably populates cache/<uuid>; the listing below relies on that.
        cacheFiles(list(uuid))
    }
    list.files(cache_dir, full.names = TRUE)
}

In [7]:
# Fetch (if needed) and unpack the cached tar archive(s) for one file UUID,
# then report which entries newly appeared in output/review.
#
# uuid: a single file UUID, given either as a character string or wrapped in
#       a list (both call styles are used in this notebook).
# Returns: character vector of paths in output/review that were not present
#          before extraction.
# NOTE: entries that already existed in output/review before this call are
#       not reported, so re-running after a successful extraction returns
#       character(0).
cache_tar_path_uuid <- function(uuid) {
    # Flatten list-wrapped input so that cacheFiles() always receives a flat
    # list(<uuid string>) rather than a nested list.
    uuid <- as.character(unlist(uuid, use.names = FALSE))
    stopifnot(length(uuid) == 1, !is.na(uuid), nzchar(uuid))

    cache_dir <- paste0('cache/', uuid)
    if (!dir.exists(cache_dir)) {
        cacheFiles(list(uuid))
    }
    if (!dir.exists('output/review')) {
        # recursive = TRUE so this also works when output/ does not exist yet
        dir.create('output/review', recursive = TRUE)
    }

    tarfiles <- list.files(cache_dir, full.names = TRUE)
    if (length(tarfiles) == 0) {
        stop("No cached archive found in ", cache_dir, call. = FALSE)
    }

    old_rev_files <- list.files('output/review', full.names = TRUE)
    # Extract into the working directory, as `tar -xf` did. untar() takes care
    # of quoting the path and returns the exit status, which is checked so a
    # failed extraction is not silently ignored.
    for (tarfile in tarfiles) {
        status <- utils::untar(tarfile)
        if (!identical(as.integer(status), 0L)) {
            stop("Extracting ", tarfile, " failed with status ", status,
                 call. = FALSE)
        }
    }
    new_rev_files <- list.files('output/review', full.names = TRUE)
    setdiff(new_rev_files, old_rev_files)
}

## Plotting functions

In [8]:
# Format a fraction as a percentage string (without a "%" sign).
#
# x:   numeric fraction(s), e.g. 0.25 for 25 percent.
# dec: number of decimal places to round to and to display.
# Returns: character vector as produced by format().
format_perc <- function(x, dec = 2) {
    # Round to `dec` places (previously hard-coded to 2, which ignored `dec`).
    format(round(x * 100, dec), nsmall = dec)
}

In [36]:
# Rasterized UMAP scatter plot colored by one metadata column.
#
# meta_df:    data frame with umap_1 and umap_2 columns plus the column named
#             by `color_by`.
# color_by:   name of the column (as a string) used to color the points.
# color_name: legend title for the color scale; defaults to `color_by`.
# Returns: a ggplot object.
plot_umap <- function(meta_df, color_by = "AIFI_L2", color_name = NULL) {
    if (is.null(color_name)) {
        color_name <- color_by
    }
    # Turn the column name into an expression so it can be spliced into aes().
    color_expr <- rlang::parse_expr(color_by)

    ggplot() +
      geom_point_rast(
          data = meta_df,
          aes(x = umap_1,
              y = umap_2,
              color = !!color_expr),
          size = 0.1
      ) +
      scale_color_varibow() +
      large_guides() +
      # Apply the requested legend title; previously color_name was computed
      # but never used.
      labs(color = color_name) +
      theme_bw() +
      theme(axis.ticks = element_blank(),
            legend.position = "bottom")
}

In [35]:
# Dot plot of marker genes: one column per gene, one row per label at the
# requested level. Point area encodes gene_frac; fill encodes
# log(gene_mean + 1).
#
# marker_df: data frame with gene, gene_frac, gene_mean and the `level` column.
# level:     name of the label column (as a string) shown on the y axis.
# Returns: a ggplot object.
plot_markers <- function(marker_df, level = "AIFI_L2") {
    level_expr <- rlang::parse_expr(level)

    dot_layer <- geom_point(
        data = marker_df,
        aes(x = gene,
            y = !!level_expr,
            size = gene_frac,
            fill = log(gene_mean + 1)),
        pch = 21
    )
    fill_scale <- scale_fill_gradientn(
        colors = c("black", "darkred", "red", "orangered", "orange")
    )
    marker_theme <- theme(
        axis.ticks = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3)
    )

    ggplot() +
      dot_layer +
      scale_size_area() +
      fill_scale +
      theme_bw() +
      marker_theme
}

In [11]:
# Find the values of one column that occur in at most `collapse_n` rows.
#
# meta:       data frame of per-cell metadata.
# column:     name of the column (as a string) to count.
# collapse_n: values with n_cells <= collapse_n are returned.
# Returns: the rare values, flattened with unlist().
get_low_counts <- function(meta, column, collapse_n) {
    group_col <- rlang::parse_expr(column)

    label_counts <- summarise(
        group_by(meta, !!group_col),
        n_cells = n(),
        .groups = "keep"
    )
    rare_labels <- filter(label_counts, n_cells <= collapse_n)

    unlist(select(rare_labels, !!group_col))
}

In [12]:
# River plot tracing cells from original L2 labels through predicted L2
# labels to final AIFI_L2 labels.
#
# meta:       data frame with original_L2, predicted_AIFI_L2 and AIFI_L2.
# l2_colors:  lookup table with columns L2, id and color.
# collapse_n: original/predicted labels with at most this many cells are
#             relabeled "other"; NULL disables collapsing. Final AIFI_L2
#             labels are never collapsed.
# Returns: a ggplot object.
plot_l2_river <- function(meta, l2_colors, collapse_n = 10) {
    river_meta <- select(meta, original_L2, predicted_AIFI_L2, AIFI_L2)

    if (!is.null(collapse_n)) {
        rare_original <- get_low_counts(river_meta, "original_L2", collapse_n)
        rare_predicted <- get_low_counts(river_meta, "predicted_AIFI_L2", collapse_n)
        river_meta <- mutate(
            river_meta,
            original_L2 = ifelse(original_L2 %in% rare_original, "other", original_L2),
            predicted_AIFI_L2 = ifelse(predicted_AIFI_L2 %in% rare_predicted, "other", predicted_AIFI_L2)
        )
    }

    # One copy of the color table per stage, with columns prefixed by the
    # stage name so the three joins do not collide.
    original_lut <- rename(l2_colors,
                           original_L2 = L2,
                           original_L2_id = id,
                           original_L2_color = color)
    predicted_lut <- rename(l2_colors,
                            predicted_AIFI_L2 = L2,
                            predicted_AIFI_L2_id = id,
                            predicted_AIFI_L2_color = color)
    final_lut <- rename(l2_colors,
                        AIFI_L2 = L2,
                        AIFI_L2_id = id,
                        AIFI_L2_color = color)

    # The label columns are renamed to <stage>_label to sit alongside the
    # <stage>_id and <stage>_color columns for each grouping.
    river_anno <- river_meta %>%
      left_join(original_lut, by = "original_L2") %>%
      left_join(predicted_lut, by = "predicted_AIFI_L2") %>%
      left_join(final_lut, by = "AIFI_L2") %>%
      rename(original_L2_label = original_L2,
             predicted_AIFI_L2_label = predicted_AIFI_L2,
             AIFI_L2_label = AIFI_L2)

    river <- build_river_plot(
        anno = river_anno,
        grouping = c("original_L2", "predicted_AIFI_L2", "AIFI_L2"),
        label_pos = c("left", "center", "right"),
        fill_group = "predicted_AIFI_L2"
    )

    river +
        scale_x_continuous(expand = c(0.3, 0.3)) +
        ggtitle("L2 Original -> Predicted -> Final") +
        theme(plot.background = element_rect(fill = "white"))
}

In [13]:
# River plot tracing cells from original L3 labels through predicted L3
# labels to final AIFI_L3 labels.
#
# meta:       data frame with original_L3, predicted_AIFI_L3 and AIFI_L3.
# l3_colors:  lookup table with columns L3, id and color.
# collapse_n: original/predicted labels with at most this many cells are
#             relabeled "other"; NULL disables collapsing. Final AIFI_L3
#             labels are never collapsed.
# Returns: a ggplot object.
plot_l3_river <- function(meta, l3_colors, collapse_n = 10) {
    river_meta <- select(meta, original_L3, predicted_AIFI_L3, AIFI_L3)

    if (!is.null(collapse_n)) {
        rare_original <- get_low_counts(river_meta, "original_L3", collapse_n)
        rare_predicted <- get_low_counts(river_meta, "predicted_AIFI_L3", collapse_n)
        river_meta <- mutate(
            river_meta,
            original_L3 = ifelse(original_L3 %in% rare_original, "other", original_L3),
            predicted_AIFI_L3 = ifelse(predicted_AIFI_L3 %in% rare_predicted, "other", predicted_AIFI_L3)
        )
    }

    # One copy of the color table per stage, with columns prefixed by the
    # stage name so the three joins do not collide.
    original_lut <- rename(l3_colors,
                           original_L3 = L3,
                           original_L3_id = id,
                           original_L3_color = color)
    predicted_lut <- rename(l3_colors,
                            predicted_AIFI_L3 = L3,
                            predicted_AIFI_L3_id = id,
                            predicted_AIFI_L3_color = color)
    final_lut <- rename(l3_colors,
                        AIFI_L3 = L3,
                        AIFI_L3_id = id,
                        AIFI_L3_color = color)

    # The label columns are renamed to <stage>_label to sit alongside the
    # <stage>_id and <stage>_color columns for each grouping.
    river_anno <- river_meta %>%
      left_join(original_lut, by = "original_L3") %>%
      left_join(predicted_lut, by = "predicted_AIFI_L3") %>%
      left_join(final_lut, by = "AIFI_L3") %>%
      rename(original_L3_label = original_L3,
             predicted_AIFI_L3_label = predicted_AIFI_L3,
             AIFI_L3_label = AIFI_L3)

    river <- build_river_plot(
        anno = river_anno,
        grouping = c("original_L3", "predicted_AIFI_L3", "AIFI_L3"),
        label_pos = c("left", "center", "right"),
        fill_group = "predicted_AIFI_L3"
    )

    river +
        scale_x_continuous(expand = c(0.3, 0.3)) +
        ggtitle("L3 Original -> Predicted -> Final") +
        theme(plot.background = element_rect(fill = "white"))
}

## Previous, non-tracked version of cell labels

We previously performed cell labeling in a way that wasn't tracked in our CertPro system. We'll compare our new results to these original results and make sure they're similar.

In [14]:
# Fetch the previously generated (untracked) label file into cache/<uuid>.
label_uuid <- "3868592c-0087-4ed8-98b2-4bf1b8676111"
res <- cacheFiles(list(label_uuid))
label_parquet <- list.files(paste0("cache/",label_uuid), full.names = TRUE)
# Name the CSV after the parquet file, written to the working directory.
# Only the file extension is swapped: the previous unanchored
# sub("parquet", "csv", ...) rewrote the first "parquet" anywhere in the full
# path, which could leave a file that still ends in ".parquet".
label_csv <- paste0(tools::file_path_sans_ext(basename(label_parquet)), ".csv")

Convert from parquet to csv for easy reading in R because our IDEs are having trouble with the `arrow` package:

In [15]:
# Build a one-line shell command that runs Python to read the parquet file
# with pandas, keep the barcodes and AIFI_L1/L2/L3 columns, and write them to
# label_csv. The Python code is wrapped in single quotes for the shell, so
# the paths are embedded in double quotes inside it.
# NOTE(review): paths containing quote characters would break this command.
system_call <- paste0(
    "python -c \'",
    "import pandas; ",
    "df = pandas.read_parquet(\"",label_parquet,"\");",
    "df = df[[\"barcodes\",\"AIFI_L1\",\"AIFI_L2\",\"AIFI_L3\"]];",
    "df.to_csv(\"",label_csv,"\")",
    "\'"
)

In [16]:
system_call

In [17]:
# Run the conversion and stop if it fails, so that a missing or stale CSV
# from an earlier run is never read by the cells below.
conversion_status <- system(system_call)
if (conversion_status != 0) {
    stop("Parquet-to-CSV conversion failed with exit status ",
         conversion_status, call. = FALSE)
}

Python version information in case we need it for reproducibility:

In [18]:
system("python --version", intern = TRUE)

In [19]:
system("python -c 'import pandas; print(pandas.__version__)'", intern = TRUE)

In [20]:
# Read the converted labels. The CSV was written by to_csv() without
# disabling the index, so it has a leading unnamed index column; that column
# is read as V1 and dropped here.
og_labels <- fread(label_csv, header = TRUE)
og_labels <- og_labels[,V1:=NULL]

In [21]:
# Convert the fread() result to a plain data.frame for the dplyr steps below.
og_labels <- as.data.frame(og_labels)

In [22]:
# Rename AIFI_L1/L2/L3 to original_L1/L2/L3 so these columns do not clash
# with the AIFI_* columns of the refined metadata they are joined to later.
names(og_labels) <- sub("AIFI", "original", names(og_labels))

In [23]:
nrow(og_labels)

In [24]:
head(og_labels)

Unnamed: 0_level_0,barcodes,original_L1,original_L2,original_L3
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,05ea9806794211eb93b836d1cb6129eb,DC,cDC1,cDC1
2,e225c914794011eb9282e2ceeb91ba52,DC,cDC1,cDC1
3,b1379eae795411eb958b0245821e6993,DC,cDC1,cDC1
4,b13d3a8a795411eb958b0245821e6993,DC,cDC1,cDC1
5,b1430d16795411eb958b0245821e6993,DC,cDC1,cDC1
6,7f926876794e11eb80b0ca7613f64db1,DC,cDC1,cDC1


In [25]:
# Split the original labels into one data frame per original_L2 value.
# The grouping argument must be the column itself: passing the string
# "original_L2" is recycled as a constant and yields a single group that
# holds every row.
og_split <- split(og_labels, og_labels$original_L2)

## Generate consistent colors for plotting

In [26]:
# Color lookup for L2 labels: one row per distinct original_L2 value (sorted),
# each with a sequential id and a varibow() color, plus a final black "other"
# row for collapsed labels.
l2_colors <- local({
    l2_labels <- og_labels %>%
      select(original_L2) %>%
      unique() %>%
      arrange(original_L2) %>%
      rename(L2 = original_L2)

    l2_palette <- mutate(l2_labels,
                         id = 1:n(),
                         color = varibow(n()))

    other_entry <- data.frame(L2 = "other", id = max(l2_palette$id) + 1, color = "#000000")
    rbind(l2_palette, other_entry)
})

In [27]:
# Color lookup for L3 labels: one row per distinct original_L3 value (sorted),
# each with a sequential id and a varibow() color, plus a final black "other"
# row for collapsed labels.
l3_colors <- local({
    l3_labels <- og_labels %>%
      select(original_L3) %>%
      unique() %>%
      arrange(original_L3) %>%
      rename(L3 = original_L3)

    l3_palette <- mutate(l3_labels,
                         id = 1:n(),
                         color = varibow(n()))

    other_entry <- data.frame(L3 = "other", id = max(l3_palette$id) + 1, color = "#000000")
    rbind(l3_palette, other_entry)
})

## Small cell sets

These cell types were refined across all samples because they were small enough to be manageable using all data.

In [28]:
small_uuids <- list(
    asdc = list(
        meta = 'cce70b23-5728-4588-963e-8f0afafa3943',
        l2_markers = 'fa63cbdd-2652-4c4e-957b-9570fc9c07d4',
        l3_markers = 'dbc20c14-5167-4e3f-a3bf-d9220dcfbe69'),
    b_memory_cells = list(
        meta = '49cb02da-d9a2-488c-b866-72335e6ec616',
        l2_markers = '7d11f691-561a-458a-90af-1c751ddc501d',
        l3_markers = '7ad29a45-9bbe-418b-8603-bab68042d754'),
    b_naive_cells = list(
        meta = '3a1b1129-65ca-4f17-bc0f-2a53ac2ae963',
        l2_markers = 'd2ed48a3-55b4-4d96-96ec-c18724be1dd0',
        l3_markers = '785537bb-e651-454d-b332-4a971dc7cc87'),
    dc = list(
        meta = '63287a4b-adf0-42fc-8a75-37223d2f97de',
        l2_markers = '7b88180c-c403-4b67-a3a7-8d7577306516',
        l3_markers = '0474e3de-fc64-4b76-af50-f8da731a372d'),
    gdt_cd8aa = list(
        meta = 'cd513dd2-a72f-48e6-85b7-bad4676cb13f',
        l2_markers = '460daa3f-18be-4db3-a758-14d5e7401067',
        l3_markers = '9c513b26-4852-4f13-8cf9-d6f15b05e76a'),
    treg = list(
        meta = 'aab9cd47-29cd-4be4-bb86-2ed511b3d4e2',
        l2_markers = '6471c399-6248-4332-a96f-bb6ffb49cc82',
        l3_markers = 'a5c244f4-108f-4495-82de-8e774b73de60'),
    progen_prolif = list(
        meta = '8226e775-8f4d-452e-b81e-3f3fb4a70d1e',
        l2_markers = 'a6296c2b-ccf5-40b5-b3c9-2a0820532cc6',
        l3_markers = '7fbbd0f0-2162-40b8-8511-7380f8657280'),
    eryth_platelet = list(
        meta = 'e5c881d6-0576-4067-b624-4e984fe97900',
        l2_markers = '754d78fe-5ca5-49c4-aa26-33e06ab76fd1',
        l3_markers = '1f0160f4-8e32-4f98-8960-b7086fcb0cee'),
    plasma_ilc = list(
        meta = 'b3636d73-58c9-4972-8fb9-1cb51439c377',
        l2_markers = 'b5034de4-5650-4c8f-ae25-c8cb0449463d',
        l3_markers = '68d14aa0-a632-4f53-8c8a-5c90441b20fe'),
    dnt_mait = list(
        meta = '4ce56aca-03a9-46b2-9a32-924a35602fce',
        l2_markers = 'bcaf5cc3-9255-4223-bd84-95795853dd2b',
        l3_markers = 'f1dd5a2b-38e2-4787-a2d3-d2a49cbf3353'),
    cd8_naive = list(
        meta = '45668c5c-1a48-4fae-9d59-fa566063263e',
        l2_markers = 'ed30d2e7-5c8e-4477-bd9b-c378d262a531',
        l3_markers = '83046eb4-730a-4e4d-af50-3fc7d03f5942')
)

In [29]:
# Resolve every UUID of every small cell set to its cached file path(s),
# processing the cell sets in parallel.
small_paths <- future_map(
    small_uuids,
    function(ids) map(ids, cache_path_uuid)
)

In [30]:
# Read each cached file of each small cell set into a plain data.frame,
# processing the cell sets in parallel.
small_dfs <- future_map(
    small_paths,
    function(path_set) {
        map(path_set, function(path) as.data.frame(fread(path)))
    }
)

In [31]:
# Attach the original (untracked) labels to each set's cell metadata by
# barcode; the marker tables are left untouched.
small_dfs <- map(
    small_dfs,
    function(set) {
        set$meta <- left_join(set$meta, og_labels, by = "barcodes")
        set
    }
)

In [38]:
# For every small cell set, save two review figures under output/:
#   *_review_umaps_<date>.png : 2 x 3 grid of UMAPs (rows: L2, L3;
#                               columns: original, predicted, final labels)
#   *_review_plots_<date>.png : marker dot plots next to the L2/L3 river plots
iwalk(
    small_dfs,
    function(df_set, set_name) {
        meta <- df_set$meta

        umap_columns <- c("original_L2", "predicted_AIFI_L2", "AIFI_L2",
                          "original_L3", "predicted_AIFI_L3", "AIFI_L3")
        umap_plots <- map(
            umap_columns,
            function(label_col) plot_umap(meta, label_col)
        )

        l2_river <- plot_l2_river(meta, l2_colors)
        l3_river <- plot_l3_river(meta, l3_colors)

        l2_marker <- plot_markers(df_set$l2_markers, "AIFI_L2")
        l3_marker <- plot_markers(df_set$l3_markers, "AIFI_L3")

        umap_grid <- plot_grid(plotlist = umap_plots,
                               ncol = 3, nrow = 2,
                               align = "hv")
        ggsave(
            filename = paste0("output/diha_", set_name, "_review_umaps_",Sys.Date(),".png"),
            plot = umap_grid,
            width = 20, height = 18
        )

        river_grid <- plot_grid(l2_river, l3_river,
                                ncol = 2, nrow = 1)
        # Markers stacked on the left (L3 panel given twice the height of L2),
        # river plots on the right.
        marker_grid <- plot_grid(l2_marker, l3_marker,
                                 ncol = 1, nrow = 2,
                                 rel_heights = c(0.5, 1))
        other_grid <- plot_grid(marker_grid, river_grid,
                                ncol = 2, nrow = 1,
                                rel_widths = c(0.7,1))

        ggsave(
            filename = paste0("output/diha_", set_name, "_review_plots_",Sys.Date(),".png"),
            plot = other_grid,
            width = 24, height = 8
        )
    }
)

[1m[22mJoining with `by = join_by(group)`
[1m[22mJoining with `by = join_by(original_L2_id, group1)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L2_id, group2)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L2_id, group1)`
[1m[22mJoining with `by = join_by(AIFI_L2_id, group2)`
[1m[22mJoining with `by = join_by(group)`
[1m[22mJoining with `by = join_by(original_L3_id, group1)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L3_id, group2)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L3_id, group1)`
[1m[22mJoining with `by = join_by(AIFI_L3_id, group2)`
“[1m[22mRemoved 625 rows containing missing values (`geom_point()`).”
“[1m[22mRemoved 625 rows containing missing values (`geom_point()`).”
“[1m[22mRemoved 1 rows containing missing values (`geom_text()`).”
“[1m[22mRemoved 1 rows containing missing values (`geom_text()`).”
[1m[22mJoining with `by = join_by(group)`
[1m[22mJoining with `by = join_by(original_L2_id, group1)`
[1m[22mJoining with

## Large cell sets

In [40]:
large_uuids <- list(
    nk_cell = list(
        meta = 'c844b878-3f58-4dfc-b065-f1de41ab0a49',
        tarfile = '2e3c5be0-6edd-4a72-aa6c-ccd0e66c0cb7'
    ),
    memory_cd4_t_cell = list(
        meta = '12fe9894-2d59-48e8-9625-24219fd422f2',
        tarfile = '07c8e4e0-d30a-4780-a664-f5ea728630e2'
    ),
    monocyte = list(
        meta = '77b59624-9594-4e9c-ab6d-2d5b7a6a132d',
        tarfile = '8aebf007-0076-437e-9261-3db53f7c1a43'
    ),
    naive_cd4_t_cell = list(
        meta = '39d01e15-cb0a-46e7-be0c-858d53e39969',
        tarfile = 'e79864fc-bf3b-4057-b151-0ff1302ce84c'
    ),
    memory_cd8_t_cell = list(
        meta = '5a2d9fe6-debf-4f20-ac16-f279788453ac',
        tarfile = 'ec1b4e79-c41f-4ddc-ad8b-58d35385ce17'
    )
)

In [45]:
# For every large cell set, resolve the cached metadata file and unpack the
# review tarball, collecting the resulting paths.
large_paths <- map(
    large_uuids,
    function(uuid_list) {
        # Pass bare UUID strings, as the small-set cell does. The cache
        # helpers wrap the UUID in list() themselves before calling
        # cacheFiles(), so wrapping it here as well handed cacheFiles() a
        # nested list.
        meta_path <- cache_path_uuid(uuid_list[["meta"]])
        review_paths <- cache_tar_path_uuid(uuid_list[["tarfile"]])
        list(meta = meta_path,
             review_paths = review_paths)
    }
)

In [60]:
large_paths[[1]]$review_paths

In [81]:
# Group the extracted review files of each large cell set by the label group
# embedded in their file names (the text between "diha_" and "_AIFI"), and
# pick one metadata file and one L2 marker file per group.
large_review_paths <- map(
    large_paths,
    function(path_set) {
        review_paths <- path_set$review_paths

        path_groups <- sub("diha_(.+)_AIFI.+", "\\1", basename(review_paths))
        # Keep only group names containing "BR".
        path_groups <- path_groups[grepl("BR", path_groups)]
        path_groups <- unique(path_groups)

        res <- map(path_groups,
         function(path_group) {
             # The group name is literal text taken from the file name, so
             # match it literally; as a regex, characters such as "+" or "("
             # in a label would fail to match or match the wrong files.
             path_files <- review_paths[grepl(path_group, review_paths, fixed = TRUE)]

             # When several candidates exist, take the one that sorts last
             # (presumably the most recent dated file -- confirm against the
             # file naming scheme).
             meta_files <- path_files[grepl("meta", path_files)]
             meta_files <- sort(meta_files)
             meta_file <- meta_files[length(meta_files)]

             l2_marker_files <- path_files[grepl("L2.+markers", path_files)]
             l2_marker_files <- sort(l2_marker_files)
             l2_marker_file <- l2_marker_files[length(l2_marker_files)]

             list(meta = meta_file,
                  l2_markers = l2_marker_file
                  #l3_markers = path_files[grepl("L3.+markers", path_files)]
             )
         })
        names(res) <- gsub(" ", "_",path_groups)
        res
    }
)
# Flatten to one entry per group and drop the "<cell set>." prefix that
# unlist() adds to the names.
large_review_paths <- unlist(large_review_paths, recursive = FALSE)
names(large_review_paths) <- sub("^.+\\.", "", names(large_review_paths))

In [84]:
# Read the selected metadata and marker files of each large-set group into
# plain data.frames.
large_dfs <- map(
    large_review_paths,
    function(path_set) {
        map(path_set, function(path) as.data.frame(fread(path)))
    }
)

In [86]:
# Attach the original (untracked) labels to each group's cell metadata by
# barcode; the marker tables are left untouched.
large_dfs <- map(
    large_dfs,
    function(set) {
        set$meta <- left_join(set$meta, og_labels, by = "barcodes")
        set
    }
)

In [None]:
# For every large-set group, save two review figures under output/:
#   *_review_umaps_<date>.png : 2 x 3 grid of UMAPs (rows: L2, L3;
#                               columns: original, predicted, final labels)
#   *_review_plots_<date>.png : L2 marker dot plot next to the L2/L3 river plots
# Only an L2 marker table is loaded for these groups, so no L3 marker plot is
# drawn.
iwalk(
    large_dfs,
    function(df_set, set_name) {
        meta <- df_set$meta

        umap_columns <- c("original_L2", "predicted_AIFI_L2", "AIFI_L2",
                          "original_L3", "predicted_AIFI_L3", "AIFI_L3")
        umap_plots <- map(
            umap_columns,
            function(label_col) plot_umap(meta, label_col)
        )

        l2_river <- plot_l2_river(meta, l2_colors)
        l3_river <- plot_l3_river(meta, l3_colors)

        l2_marker <- plot_markers(df_set$l2_markers, "AIFI_L2")

        umap_grid <- plot_grid(plotlist = umap_plots,
                               ncol = 3, nrow = 2,
                               align = "hv")
        ggsave(
            filename = paste0("output/diha_", set_name, "_review_umaps_",Sys.Date(),".png"),
            plot = umap_grid,
            width = 20, height = 18
        )

        river_grid <- plot_grid(l2_river, l3_river,
                                ncol = 2, nrow = 1)
        other_grid <- plot_grid(l2_marker, river_grid,
                                ncol = 2, nrow = 1,
                                rel_widths = c(0.7,1))

        ggsave(
            filename = paste0("output/diha_", set_name, "_review_plots_",Sys.Date(),".png"),
            plot = other_grid,
            width = 24, height = 8
        )
    }
)

[1m[22mJoining with `by = join_by(group)`
[1m[22mJoining with `by = join_by(original_L2_id, group1)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L2_id, group2)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L2_id, group1)`
[1m[22mJoining with `by = join_by(AIFI_L2_id, group2)`
[1m[22mJoining with `by = join_by(group)`
[1m[22mJoining with `by = join_by(original_L3_id, group1)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L3_id, group2)`
[1m[22mJoining with `by = join_by(predicted_AIFI_L3_id, group1)`
[1m[22mJoining with `by = join_by(AIFI_L3_id, group2)`
“[1m[22mRemoved 2463 rows containing missing values (`geom_point()`).”
“[1m[22mRemoved 2463 rows containing missing values (`geom_point()`).”
“[1m[22mRemoved 1 rows containing missing values (`geom_text()`).”
“[1m[22mRemoved 1 rows containing missing values (`geom_text()`).”
[1m[22mJoining with `by = join_by(group)`
[1m[22mJoining with `by = join_by(original_L2_id, group1)`
[1m[22mJoining wi