## Install packages

This package includes helper functions for generating plot color sets.

In [1]:
devtools::install_github("hypercompetent/colorway", upgrade = "never", quiet = TRUE)

This package includes helpers for plot types, including river/sankey plots

In [2]:
devtools::install_github("alleninstitute/scrattch.vis", upgrade = "always", quiet = TRUE)

This package includes helpers for plotting tables

In [42]:
# Install ggpmisc from CRAN. install.packages() has no `upgrade` argument
# (that option belongs to the devtools/remotes installers), so it is omitted
# here rather than being silently swallowed by `...`.
install.packages("ggpmisc", quiet = TRUE)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [43]:
# Attach a package while suppressing the messages it prints on startup.
# Arguments are forwarded unchanged to library().
quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) }

# Packages attached for this notebook.
quiet_library(hise)
quiet_library(ggplot2)
quiet_library(ggrastr)
quiet_library(ggpmisc)
quiet_library(colorway)
quiet_library(cowplot)
quiet_library(scrattch.vis)
quiet_library(data.table)
quiet_library(dplyr)
quiet_library(purrr)

In [104]:
# Create the directory that the review figures are saved into, including any
# missing parent directories, unless it already exists.
if(!dir.exists("output/plots")) {
    dir.create("output/plots", recursive = TRUE)
}

## Helper functions

In [4]:
# Format a proportion as a percentage string.
#
# x:   numeric vector of proportions (e.g. 0.25 for 25%).
# dec: number of decimal places to round to and to display.
#
# Returns a character vector the same length as x. Elements are padded to a
# common width by format().
format_perc <- function(x, dec = 2) {
    # Round to `dec` places (previously hard-coded to 2, which ignored `dec`).
    format(round(x * 100, dec), nsmall = dec)
}

In [5]:
# Draw a rasterized UMAP scatter plot colored by one label column.
#
# meta_df:   per-cell table containing umap_1, umap_2 and the column named
#            by color_by.
# cell_type: text appended to the plot title.
# color_by:  name of the column to color by, given as a string.
# l3_colors: table with columns L3 (label) and color; used as the breaks and
#            values of the manual color scale.
#
# Returns the ggplot object.
plot_l3 <- function(meta_df, cell_type, color_by = "AIFI_L2", l3_colors) {

    color_expr <- rlang::parse_expr(color_by)

    # Randomly permute the rows before plotting (presumably so that no single
    # label is systematically drawn on top of the others).
    shuffled <- meta_df[sample.int(nrow(meta_df)), ]

    umap_points <- geom_point_rast(
        data = shuffled,
        mapping = aes(x = umap_1, y = umap_2, color = !!color_expr),
        size = 0.02
    )
    label_colors <- scale_color_manual(
        breaks = l3_colors$L3,
        values = l3_colors$color
    )
    plot_title <- ggtitle(paste0(color_expr, ": ", cell_type))

    ggplot() +
        umap_points +
        label_colors +
        large_guides() +
        theme_bw() +
        theme(axis.ticks = element_blank(),
              legend.position = "bottom") +
        plot_title
}

In [6]:
# Draw a rasterized UMAP scatter plot colored by whether each cell's original
# L3 label agrees with its final L3 label.
#
# meta_df:   per-cell table containing umap_1, umap_2, original_L3, AIFI_L3.
# cell_type: text appended to the plot title.
#
# Cells are classed as "Previously removed" (original_L3 is NA), "Matched"
# (original_L3 equals AIFI_L3) or "Mismatched" (everything else).
# Returns the ggplot object.
plot_diff <- function(meta_df, cell_type) {

    status_levels <- c("Previously removed", "Matched", "Mismatched")
    status_colors <- c("gray80", "skyblue", "orangered")

    labeled <- mutate(
        meta_df,
        diff = case_when(
            is.na(original_L3) ~ "Previously removed",
            original_L3 == AIFI_L3 ~ "Matched",
            TRUE ~ "Mismatched"
        )
    )

    # Randomly permute the rows before plotting.
    shuffled <- labeled[sample.int(nrow(labeled)), ]

    status_points <- geom_point_rast(
        data = shuffled,
        mapping = aes(x = umap_1, y = umap_2, color = diff),
        size = 0.02
    )
    status_scale <- scale_color_manual(
        breaks = status_levels,
        values = status_colors
    )

    ggplot() +
        status_points +
        status_scale +
        large_guides() +
        theme_bw() +
        theme(axis.ticks = element_blank(),
              legend.position = "bottom") +
        ggtitle(paste0("Matching: ", cell_type))
}

In [None]:
# Build a river (sankey-style) plot from original L3 labels to final L3 labels.
#
# meta:      per-cell table containing original_L3, predicted_AIFI_L3, AIFI_L3.
# l3_colors: table with columns L3, id and color.
#
# For each of the three label columns, the matching id and color are joined
# on from l3_colors, and the label column itself is renamed to <name>_label,
# giving <name>_id / <name>_label / <name>_color triplets for
# build_river_plot(). Returns the ggplot object.
plot_l3_river <- function(meta, l3_colors) {
    level_cols <- c("original_L3", "predicted_AIFI_L3", "AIFI_L3")

    # l3_colors with its columns renamed to <level>, <level>_id, <level>_color.
    colors_for <- function(level) {
        renames <- c("L3", "id", "color")
        names(renames) <- c(level, paste0(level, "_id"), paste0(level, "_color"))
        rename(l3_colors, all_of(renames))
    }

    river_anno <- select(meta, all_of(level_cols))
    for (level in level_cols) {
        river_anno <- left_join(river_anno, colors_for(level), by = level)
    }

    label_renames <- level_cols
    names(label_renames) <- paste0(level_cols, "_label")
    river_anno <- rename(river_anno, all_of(label_renames))

    river <- build_river_plot(
        anno = river_anno,
        grouping = c("original_L3", "AIFI_L3"),
        label_pos = c("left", "right"),
        fill_group = "original_L3"
    )

    river +
        scale_x_continuous(expand = c(0.3, 0.3)) +
        ggtitle("L3 Original -> Final") +
        theme(plot.background = element_rect(fill = "white"))
}

In [101]:
# Render three small summary tables as a single plot panel.
#
# meta:    cells whose final label is the cell type under review; must
#          contain original_L2, AIFI_L2, original_L3, AIFI_L3.
# og_meta: cells that originally carried this label; only its row count is used.
#
# The panel shows, top to bottom: L3 match counts, L2 match counts, and the
# original vs. updated cell counts with the percent change.
# Returns the ggplot object.
plot_summaries <- function(meta, og_meta) {

    # Count and percentage of cells per match status for one label level.
    summarize_matches <- function(original_col, final_col) {
        meta %>%
          mutate(diff = case_when(
              is.na(.data[[original_col]]) ~ "Previously removed",
              .data[[original_col]] == .data[[final_col]] ~ "Matched",
              TRUE ~ "Mismatched"
          )) %>%
          group_by(diff) %>%
          tally() %>%
          mutate(percent = format_perc(n / sum(n)))
    }

    # One-row tibble that places a table at height y for geom_table().
    table_at <- function(y, tb) {
        tibble(x = 0, y = y, tb = list(tb))
    }

    l3_summary <- summarize_matches("original_L3", "AIFI_L3")
    l2_summary <- summarize_matches("original_L2", "AIFI_L2")

    count_summary <- data.frame(
        original_n = nrow(og_meta),
        updated_n = nrow(meta)
    ) %>%
      mutate(perc_change = format_perc((updated_n - original_n) / original_n))

    headings <- data.frame(
        x = 0,
        y = c(0.2, 0.6, 1),
        labels = c("Count Summary", "L2 Summary", "L3 Summary")
    )
    table_mapping <- aes(x = x, y = y, label = tb)

    ggplot() +
        geom_text(data = headings,
                  aes(x = x, y = y, label = labels),
                  size = 5) +
        geom_table(data = table_at(0.95, l3_summary),
                   table_mapping,
                   size = 5) +
        geom_table(data = table_at(0.45, l2_summary),
                   table_mapping,
                   size = 5) +
        geom_table(data = table_at(0, count_summary),
                   table_mapping,
                   size = 5) +
        theme_void() +
        theme(panel.background = element_rect(fill = "white"))
}

## Read cell type hierarchy from HISE

In [8]:
# Fetch the cell type hierarchy file by UUID and read it from the cache
# directory, using the first CSV column as row names.
# NOTE(review): assumes cacheFiles() leaves exactly one file under
# cache/<uuid>/; read.csv() would fail if list.files() returned several — confirm.
hierarchy_uuid <- "1a44252c-8cab-4c8f-92c9-d8f3af633790"
hise_res <- cacheFiles(list(hierarchy_uuid))
hierarchy_file <- list.files(paste0("cache/", hierarchy_uuid), full.names = TRUE)
hierarchy_df <- read.csv(hierarchy_file, row.names = 1)

## Generate consistent colors for plotting

In [9]:
# Fix the RNG seed so the random color assignment done with sample() below
# is reproducible between runs.
set.seed(3030)

In [10]:
# Build a color set sized by the number of rows in the hierarchy table
# (presumably varibow() returns that many colors — confirm).
colorset <- varibow(nrow(hierarchy_df))

# Give every L3 label a sequential id and a color drawn at random from the
# color set without replacement, then rename the label column to L3.
l3_colors <- hierarchy_df %>%
  select(AIFI_L3) %>%
  mutate(id = 1:n(),
         color = sample(colorset, n(), replace = FALSE)) %>%
  rename(L3 = AIFI_L3)
# Append an extra "other" entry in black, using the next free id.
l3_colors <- rbind(l3_colors, data.frame(L3 = "other", id = max(l3_colors$id) + 1, color = "#000000") )

## Previous, non-tracked version of cell labels

We previously performed cell labeling in a way that wasn't tracked in our CertPro system. We'll compare our new results to these original results and make sure they're similar.

In [11]:
# Fetch the previous (untracked) label file by UUID, locate it in the cache
# directory, and derive the file name of the CSV copy. Only the base name is
# kept, so the CSV is addressed relative to the working directory.
label_uuid <- "3868592c-0087-4ed8-98b2-4bf1b8676111"
res <- cacheFiles(list(label_uuid))
label_parquet <- list.files(paste0("cache/",label_uuid), full.names = TRUE)
# Swap only the trailing ".parquet" extension of the file name. An unanchored
# sub("parquet", "csv", ...) would instead rewrite the first "parquet" found
# anywhere in the path.
label_csv <- sub("\\.parquet$", ".csv", basename(label_parquet))

Convert from parquet to csv for easy reading in R because our IDEs are having trouble with the `arrow` package:

In [12]:
# Assemble a one-line `python -c '...'` shell command that reads the parquet
# file with pandas, keeps the barcodes and AIFI_L1/L2/L3 columns, and writes
# them to label_csv. The Python code is wrapped in single quotes for the
# shell, and the two file paths are embedded in double quotes inside it.
# NOTE(review): paths are interpolated without escaping, so a path containing
# a quote character would break the command.
system_call <- paste0(
    "python -c \'",
    "import pandas; ",
    "df = pandas.read_parquet(\"",label_parquet,"\");",
    "df = df[[\"barcodes\",\"AIFI_L1\",\"AIFI_L2\",\"AIFI_L3\"]];",
    "df.to_csv(\"",label_csv,"\")",
    "\'"
)

In [13]:
system_call

In [14]:
# Run the parquet-to-CSV conversion and stop if it fails. system() returns the
# command's exit status; ignoring it would let later cells read a missing or
# stale CSV.
if (system(system_call) != 0) {
    stop("Parquet-to-CSV conversion failed; see the Python error output.",
         call. = FALSE)
}

Python version information in case we need it for reproducibility:

In [15]:
system("python --version", intern = TRUE)

In [16]:
system("python -c 'import pandas; print(pandas.__version__)'", intern = TRUE)

In [17]:
# Read the converted labels, treating the first line as the header.
og_labels <- fread(label_csv, header = TRUE)
# Drop the V1 column (presumably the unnamed row-index column that pandas
# to_csv() writes first — confirm).
og_labels <- og_labels[,V1:=NULL]

In [18]:
og_labels <- as.data.frame(og_labels)

In [19]:
# Replace the "AIFI" prefix with "original" in the column names
# (AIFI_L1 -> original_L1, etc.) so these columns are distinct from the new
# AIFI_* labels when the tables are joined on barcodes.
names(og_labels) <- sub("AIFI", "original", names(og_labels))

In [20]:
nrow(og_labels)

In [21]:
head(og_labels)

Unnamed: 0_level_0,barcodes,original_L1,original_L2,original_L3
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,05ea9806794211eb93b836d1cb6129eb,DC,cDC1,cDC1
2,e225c914794011eb9282e2ceeb91ba52,DC,cDC1,cDC1
3,b1379eae795411eb958b0245821e6993,DC,cDC1,cDC1
4,b13d3a8a795411eb958b0245821e6993,DC,cDC1,cDC1
5,b1430d16795411eb958b0245821e6993,DC,cDC1,cDC1
6,7f926876794e11eb80b0ca7613f64db1,DC,cDC1,cDC1


## Retrieve L3 labels and UMAP coordinates

In [22]:
tar_uuid <- '611ae161-1ecb-4ff0-a460-2a5dcc54a238'

In [23]:
hise_res <- cacheFiles(list(tar_uuid))

In [24]:
tar_path <- list.files(paste0("cache/", tar_uuid), full.names = TRUE)

In [25]:
# Unpack the cached tarball into the working directory. The path is
# shell-quoted so spaces or shell metacharacters cannot break the command,
# and a non-zero exit status stops the run instead of carrying on with
# missing files.
extract_call <- paste("tar -xzf", shQuote(tar_path))
if (system(extract_call) != 0) {
    stop("Failed to extract ", tar_path, call. = FALSE)
}

In [26]:
# List the metadata CSV files in output/.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the name. The combined
# diha_L3_review_all_meta_<date>.csv table that a later step writes into this
# same directory is excluded, so a re-run does not read it back as an input.
meta_files <- list.files("output", pattern = "\\.csv$", full.names = TRUE)
meta_files <- meta_files[!grepl("^diha_L3_review_all_meta_", basename(meta_files))]

In [27]:
meta_list <- map(meta_files, fread)

In [37]:
# Load the combined metadata table, using a per-day CSV as a cache.
# If today's file already exists it is read back; otherwise the individual
# tables are stripped of their V1 and index columns, stacked, joined to the
# original labels on barcodes, and written out.
meta_file <- paste0("output/diha_L3_review_all_meta_", Sys.Date(), ".csv")
if (file.exists(meta_file)) {
    all_meta <- as.data.frame(fread(meta_file))
} else {
    all_meta <- meta_list %>%
      map(function(tbl) select(as.data.frame(tbl), -V1, -index)) %>%
      list_rbind() %>%
      left_join(og_labels, by = "barcodes")
    fwrite(all_meta, meta_file)
}

In [38]:
meta_list <- split(all_meta, all_meta$AIFI_L3)

In [39]:
# Split the same table by the original L3 label, then reorder/subset it so
# its elements line up one-to-one with meta_list for the paired iteration below.
# NOTE(review): a final label with no cells under the same original label
# yields a NULL element here, which the plotting helpers would not handle — confirm
# every name in meta_list is present.
original_list <- split(all_meta, all_meta$original_L3)
original_list <- original_list[names(meta_list)]

In [107]:
# For each final L3 cell type, assemble a two-row review figure and save it
# as a PNG under output/plots/.
#   Top row:    UMAPs colored by original_L3, by predicted_AIFI_L3, and by
#               match status between original and final labels.
#   Bottom row: river plot of the cells that originally had this label,
#               river plot of the cells that finally have this label, and
#               the summary tables.
# meta    - cells whose final AIFI_L3 is this cell type
# og_meta - cells whose original_L3 is this cell type
walk2(
    meta_list, original_list,
    function(meta, og_meta) {
        cell_type <- meta$AIFI_L3[1]
        print(cell_type)

        # The L2/L3 match summaries are computed inside plot_summaries();
        # the unused duplicate computations that used to sit here were removed.

        og_umap <- plot_l3(meta, cell_type, "original_L3", l3_colors = l3_colors)
        final_umap <- plot_l3(meta, cell_type, "predicted_AIFI_L3", l3_colors = l3_colors)
        diff_umap <- plot_diff(meta, cell_type)

        top_row <- plot_grid(og_umap, final_umap, diff_umap,
                             nrow = 1, ncol = 3, align = "h")

        to_new_river_plot <- plot_l3_river(meta, l3_colors)
        from_og_river_plot <- plot_l3_river(og_meta, l3_colors)
        summary_plot <- plot_summaries(meta, og_meta)

        bottom_row <- plot_grid(from_og_river_plot, to_new_river_plot, summary_plot,
                                nrow = 1, ncol = 3)

        all_plots <- plot_grid(top_row, bottom_row,
                               nrow = 2, ncol = 1,
                               rel_heights = c(1, 1))

        # One file per cell type, stamped with today's date.
        ggsave(
            paste0("output/plots/", cell_type, "_", Sys.Date(), ".png"),
            all_plots,
            width = 20, height = 16
        )

    }
)

[1] "Activated memory B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Adaptive NK cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ASDC"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "BaEoMaP cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "C1Q+ CD16 monocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD14+ cDC2"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD27- effector B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD27+ effector B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD4 MAIT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD56bright NK cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD8 MAIT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD8aa"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CD95 memory B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "cDC1"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CLP cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CM CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CM CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "CMP cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Core CD14 monocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Core CD16 monocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Core memory B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Core naive B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Core naive CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Core naive CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "DN T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Early memory B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Erythrocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMB- CD27- EM CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMB- CD27+ EM CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMB+ Vd2 gdT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMK- CD27+ EM CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMK- CD56dim NK cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMK+ CD27+ EM CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMK+ CD56dim NK cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMK+ memory CD4 Treg"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "GZMK+ Vd2 gdT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "HLA-DRhi cDC2"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "IL1B+ CD14 monocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ILC"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Intermediate monocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ CD14 monocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ CD16 monocyte"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ CD56dim NK cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ cDC2"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ MAIT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ memory CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ memory CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ naive B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ naive CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "ISG+ naive CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "KLRB1+ memory CD4 Treg"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "KLRB1+ memory CD8 Treg"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "KLRF1- effector Vd1 gdT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "KLRF1- GZMB+ CD27- EM CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "KLRF1- GZMB+ CD27- memory CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "KLRF1+ effector Vd1 gdT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "KLRF1+ GZMB+ CD27- EM CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Memory CD4 Treg"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Memory CD8 Treg"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Naive CD4 Treg"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Naive Vd1 gdT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "pDC"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Plasma cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Platelet"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Proliferating NK cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Proliferating T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "SOX4+ naive CD4 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "SOX4+ naive CD8 T cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "SOX4+ Vd1 gdT"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Transitional B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”


[1] "Type 2 polarized memory B cell"


“[1m[22mRemoved 1 row containing missing values or values outside the scale range
(`geom_text()`).”
