# Visualising taxa abundance

In [None]:
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used in this notebook (data/..., results/...) is resolved against it, so
# this line must be adjusted when running from a different checkout.
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")

# Attach dependencies with library() rather than require(): library() stops
# with an error when a package is missing, whereas require() only warns and
# returns FALSE, deferring the failure to the first call that needs the package.
library(tidyverse)
library(ggplot2)
library(data.table)
library(foreach)
library(ggpubr)  # was attached twice in this cell; once is sufficient
library(scales)
library(ggsci)
library(ggforce)


Loading required package: tidyverse



### Load non-contaminant list

In [None]:
# Step 0 - after thresholding: every column of the prevalence table except
# the npm_research_id column
prev_df <- fread("results/decontamination/prevalence_RA0.005_read10.csv")
original_list <- colnames(prev_df)[colnames(prev_df) != "npm_research_id"]
length(original_list)

# Step 1 - differential prevalence: non-contaminant and contaminant taxa
diff_prev_nc <- read.csv(
    "results/decontamination/diff_prev_V3/noncontaminants.RA0.005.read_threshold10.max_prev0.25.fold_diff2.txt"
)$taxa
diff_prev_c <- read.csv(
    "results/decontamination/diff_prev_V3/contaminants.RA0.005.read_threshold10.max_prev0.25.fold_diff2.txt"
)$taxa
length(diff_prev_nc)
length(diff_prev_c)

# Step 2 - correlation filter: non-contaminant and contaminant taxa
corr_nc <- read.csv(
    "results/decontamination/correlation_decontamination/nc.diff_prev_V3.RA0.005.read_threshold10.max_prev0.25.fold_diff2.corr_t0.7.within_batch.S.n9999.txt"
)$non_contaminant_taxon
corr_c <- read.csv(
    "results/decontamination/correlation_decontamination/contam.diff_prev_V3.RA0.005.read_threshold10.max_prev0.25.fold_diff2.corr_t0.7.within_batch.S.n9999.txt"
)$contaminant

length(corr_nc)
length(corr_c)

# Step 3 - simple batch filter: taxa that remain
batch_nc <- read.csv(
    "results/decontamination/simple_batch_decontam/nc.corr_t0.7.within_batch.txt"
)$taxa
length(batch_nc)

# Step 4 - after the max count filter: every column of the decontaminated
# read matrix except the npm_research_id column
species_read_zeroed <- fread("results/decontamination/read_matrix_n124.global_decontaminated.zeroed.csv")
final_list <- colnames(species_read_zeroed)[colnames(species_read_zeroed) != "npm_research_id"]
length(final_list)

In [None]:
# Number of taxa in batch_nc (the list read for the simple batch filter)
length(batch_nc)

## Compare to host range list (Shaw et al., 2020)

Original paper: https://onlinelibrary.wiley.com/doi/full/10.1111/mec.15463

Data: https://figshare.com/articles/dataset/The_phylogenetic_range_of_bacterial_and_viral_pathogens_of_vertebrates_dataset_and_supplementary_material/8262779

### Get human infection list

In [None]:
# Host-range table from Shaw et al. (2020); the first CSV column is used as
# row names.
host_df <- read.csv("data/PathogenVsHostDB-2019-05-30.csv", row.names = 1)

# One row per distinct (Species, Human) combination.
human_df <- host_df %>%
    select(Species, Human) %>%
    distinct()

# Annotate every taxon that passed thresholding with the Shaw et al. Human flag.
before_df <- tibble(Species = original_list) %>%
    # Genus = first space-delimited token of the name. extra/fill are set
    # explicitly so that names with more or fewer than three tokens are
    # handled silently; the resulting columns are the same as with the
    # defaults, which emit a warning instead.
    separate(Species, into = c("Genus", NA, NA), sep = " ", remove = FALSE,
             extra = "drop", fill = "right") %>%
    # Names containing "virus" (any case) get no genus.
    mutate(Genus = ifelse(grepl("virus", Species, ignore.case = TRUE), NA, Genus)) %>%
    # Species is the only column shared with human_df; state the key
    # explicitly instead of relying on a natural join.
    left_join(human_df, by = "Species")

# Written out as the starting point for the manual literature review.
fwrite(before_df, "results/decontamination/comparison/shaw_human_list_raw.csv")

### After manual literature review

In [None]:
# Curated human-association annotations (the Shaw et al. list after manual
# literature review). Genus is dropped here; it is re-derived from the species
# name below.
human_df_parsed <- fread("results/decontamination/comparison/shaw_human_list_parsed.csv") %>%
    select(-Genus)

# Build a Species/Genus table for a vector of taxon names and attach the
# columns of `annotation_df` to it.
#
# taxa:          character vector of taxon names.
# annotation_df: data frame with a `Species` column to join on.
# Returns a tibble with one row per matching (taxon, annotation) pair; taxa
# without a match keep NA in the annotation columns.
annotate_taxa <- function(taxa, annotation_df) {
    tibble(Species = taxa) %>%
        # Genus = first space-delimited token. extra/fill silence the
        # per-row warnings for names that do not have exactly three tokens;
        # the resulting columns are unchanged.
        separate(Species, into = c("Genus", NA, NA), sep = " ", remove = FALSE,
                 extra = "drop", fill = "right") %>%
        # Names containing "virus" (any case) get no genus.
        mutate(Genus = ifelse(grepl("virus", Species, ignore.case = TRUE), NA, Genus)) %>%
        # With Genus removed from the annotations, Species is the only column
        # the two tables can share; make the key explicit.
        left_join(annotation_df, by = "Species")
}

# Taxa before decontamination (after thresholding) and after the final filter.
before_df_parsed <- annotate_taxa(original_list, human_df_parsed)
after_df <- annotate_taxa(final_list, human_df_parsed)

In [None]:
# Draw the human-association labels as a pie chart (a stacked bar in polar
# coordinates).
#
# df:    data frame with a `Human` column; NA values are shown as
#        "Unclassified".
# title: plot title.
# Returns a ggplot object.
#
# NOTE: a later cell defines another function with the same name, which
# replaces this one once that cell has been run.
get_shaw_circle <- function(df, title) {
    human_levels <- c("Yes", "No", "Unclassified")

    df %>% 
        mutate(Human = ifelse(is.na(Human), "Unclassified", Human)) %>%
        mutate(Human = factor(Human, levels = human_levels)) %>%
        ggplot(aes(x = 1, fill = Human)) +
            # geom_bar() already counts rows per fill group, so the deprecated
            # `..count..` mapping is not needed.
            geom_bar() +
            coord_polar("y") +
            theme_minimal() +
            labs(title = title) +
            theme(axis.title.x = element_blank(),
                  axis.title.y = element_blank(),
                  panel.border = element_blank(),
                  panel.grid = element_blank(),
                  axis.ticks = element_blank(),
                  axis.text = element_blank()) +
            # Fixed limits keep the label -> colour mapping stable even when a
            # level is absent from `df`.
            scale_fill_manual(limits = human_levels,
                              values = c("palegreen4", "deepskyblue4", "grey"))
}

# Before/after pies side by side, sharing a single legend
ggarrange(
    plotlist = list(
        get_shaw_circle(before_df_parsed, "Before decontamination"),
        get_shaw_circle(after_df, "After decontamination")
    ),
    common.legend = TRUE
)

In [None]:
# Pie chart of the human-association labels ("Yes"/"No") with the row count
# printed on each slice.
#
# df:    data frame with a `Human` column. Values other than "Yes"/"No"
#        (including NA) become NA when converted to a factor and form their
#        own slice.
# title: plot title (NULL for none).
# Returns a ggplot object with the legend hidden.
get_shaw_circle <- function(df, title) {
    pie_radius <- 1
    label_radius <- 0.6 * pie_radius

    # One row per label: its count plus the start/end/mid angle (radians) of
    # the corresponding slice, accumulated in group order.
    slice_df <- df %>%
        mutate(Human = factor(Human, levels = c("Yes", "No"))) %>%
        group_by(Human) %>%
        summarise(cnt = n(), total = nrow(df)) %>%
        mutate(end_angle = 2 * pi * cumsum(cnt) / total,
               start_angle = lag(end_angle, default = 0),
               mid_angle = 0.5 * (start_angle + end_angle))

    # Strip everything but the pie itself.
    bare_theme <- theme(panel.background = element_rect(fill = "white"),
                        plot.margin = grid::unit(c(0, 0, 0, 0), "cm"),
                        axis.text = element_blank(),
                        axis.title = element_blank(),
                        axis.ticks = element_blank(),
                        axis.ticks.length = unit(0, "pt"),
                        legend.position = "none")

    ggplot(slice_df) +
        geom_arc_bar(aes(x0 = 0, y0 = 0, r0 = 0, r = pie_radius,
                         start = start_angle, end = end_angle, fill = Human)) +
        # Count label placed at 60% of the radius along the slice's mid angle
        geom_text(aes(x = label_radius * sin(mid_angle),
                      y = label_radius * cos(mid_angle),
                      label = cnt),
                  hjust = 0.5, vjust = 1, size = 20) +
        coord_fixed() +
        scale_x_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
        scale_y_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
        scale_fill_manual(limits = c("Yes", "No"),
                          values = c("palegreen4", "deepskyblue4")) +
        bare_theme +
        labs(x = NULL, y = NULL, fill = "Human-associated?", title = title)

}

# Before/after comparison with a shared legend on the left.
shaw_comparison <- ggarrange(get_shaw_circle(before_df_parsed, "Before decontamination"),
                             get_shaw_circle(after_df, "After decontamination"),
                             common.legend = TRUE, legend = "left")
shaw_comparison  # display the figure

# Pass the figure explicitly: without `plot =`, ggsave() saves whatever
# last_plot() currently returns.
ggsave("results/decontamination/comparison/shaw_comparison.png",
       plot = shaw_comparison, dpi = 600)

# Get separate plots
before_shaw <- get_shaw_circle(before_df_parsed, NULL)
ggsave("results/decontamination/comparison/shaw_before_nolegend.png", plot = before_shaw, dpi = 600)

after_shaw <- get_shaw_circle(after_df, NULL)
ggsave("results/decontamination/comparison/shaw_after_nolegend.png", plot = after_shaw, dpi = 600)


# Untitled pies side by side without a legend (reuses the plots built above)
ggarrange(before_shaw, after_shaw, legend = "none")

## Compare to contaminant list (Poore et al., 2020)

Data: Supplementary Table 7 of Poore et al. (2020), available from https://www.nature.com/articles/s41586-020-2095-1#MOESM2

In [None]:
# Contaminant categorisation from Poore et al. (2020)
poore_list <- fread("data/poore_et_al/poore_et_al_contaminant_list.csv")

# Natural left joins: the key is every column name the annotated table shares
# with poore_list. Rows without a match keep NA in the poore_list columns.
before_df2 <- left_join(before_df_parsed, poore_list)

after_df2 <- left_join(after_df, poore_list)

In [None]:
# Pie chart of the Poore et al. (2020) contaminant categories, with the row
# count printed on each slice.
#
# df:    data frame with a `category` column holding the raw labels
#        ("LIKELY CONTAMINANT", "POTENTIALLY PATHOGENIC OR COMMENSAL",
#        "MIXED EVIDENCE"); NA is shown as "Not common contaminant". Any
#        other value becomes NA and forms its own slice.
# title: plot title (NULL for none).
# Returns a ggplot object with the legend hidden.
get_poore_circle <- function(df, title) {
    rpie <- 1
    rlabel <-  0.6 * rpie

    # Fixed label -> colour mapping. It is passed to the fill scale as both
    # `limits` and named `values`, so a category keeps its colour even when
    # another category is absent from `df`. With unnamed values and no limits,
    # ggplot2 drops unused levels and hands out the colours by position,
    # which makes the before/after pies disagree.
    category_colours <- c("Likely" = "firebrick2",
                          "Mixed evidence" = "mediumorchid3",
                          "Potential pathogen/commensal" = "dodgerblue3",
                          "Not common contaminant" = "grey90")

    df %>% 
        mutate(category = case_when(is.na(category) ~ "Not common contaminant",
                                    category == "LIKELY CONTAMINANT" ~ "Likely",
                                    category == "POTENTIALLY PATHOGENIC OR COMMENSAL" ~ "Potential pathogen/commensal",
                                    category == "MIXED EVIDENCE" ~ "Mixed evidence")) %>%
        # Factor order fixes the order of the slices
        mutate(category = factor(category, names(category_colours))) %>%
        group_by(category) %>%
        summarise(cnt = n(), total = nrow(df)) %>%
        mutate(end_angle = 2 * pi * cumsum(cnt)/ total,      # ending angle for each pie slice
               start_angle = lag(end_angle, default = 0),   # starting angle for each pie slice
               mid_angle = 0.5 * (start_angle + end_angle)) %>%  # middle of each pie slice, for the text label
        ggplot() +
        geom_arc_bar(aes(x0 = 0, y0 = 0, r0 = 0, r = rpie,
                    start = start_angle, end = end_angle, fill = category)) +
        # Count label at 60% of the radius along the slice's mid angle
        geom_text(aes(x = rlabel * sin(mid_angle), y = rlabel * cos(mid_angle), label = cnt),
                    hjust = 0.5, vjust = 0.5, size = 20) +
        coord_fixed() +
        scale_x_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
        scale_y_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
        scale_fill_manual(limits = names(category_colours),
                          values = category_colours) +
        theme(panel.background = element_rect(fill = "white"),
              plot.margin = grid::unit(c(0, 0, 0, 0), "cm"),
              axis.text = element_blank(),
              axis.title = element_blank(),
              axis.ticks = element_blank(),
              legend.position = "none",
              axis.ticks.length = unit(0, "pt")) +
        labs(x = NULL, y = NULL,
             fill = "Categorisation by Poore et al. (2020)", title = title)
}

# Before/after comparison with a shared legend on the left.
poore_comparison <- ggarrange(get_poore_circle(before_df2, "Before decontamination"),
                              get_poore_circle(after_df2, "After decontamination"),
                              common.legend = TRUE, legend = "left")
poore_comparison  # display the figure

# Pass the figure explicitly: without `plot =`, ggsave() saves whatever
# last_plot() currently returns.
ggsave("results/decontamination/comparison/poore_comparison.png",
       plot = poore_comparison, dpi = 600)

# Get separate plots
before_poore <- get_poore_circle(before_df2, NULL)
ggsave("results/decontamination/comparison/poore_before_nolegend.png", plot = before_poore, dpi = 600)

after_poore <- get_poore_circle(after_df2, NULL)
ggsave("results/decontamination/comparison/poore_after_nolegend.png", plot = after_poore, dpi = 600)

## Compare to hospital blood culture records

In [None]:
# Taxa listed in the hospital blood culture records
blood_list <- read.csv("results/blood_culture_records/blood_culture_taxa_list.txt")$taxa

# Taxa in the curated decontamination stats: de-duplicated, then lower-cased
nc_list <- fread("results/decontamination/curated_n124_global_decontamination_stats.csv")$taxa %>%
    unique() %>%
    tolower()

# Lower-cased names of all taxa in the pre-decontamination table
all_list <- tolower(before_df_parsed$Species)

length(nc_list)


In [None]:
# Taxa from each list that also appear in the blood culture records
all_in_blood <- all_list[is.element(all_list, blood_list)]
nc_in_blood <- nc_list[is.element(nc_list, blood_list)]

# Blood-culture taxa present before decontamination but absent from nc_list
missing_after_decon <- all_in_blood[!is.element(all_in_blood, nc_in_blood)]

In [None]:
# Raw read matrix; column names are lower-cased so that they can be selected
# with the lower-cased taxon names in missing_after_decon
read_df <- fread("results/decontamination/read_matrix.raw.zeroed.csv")
colnames(read_df) <- tolower(colnames(read_df))

# Non-zero read counts of "proteus mirabilis", one row per matrix row
read_df %>%
    select(all_of(missing_after_decon)) %>%
    pivot_longer(everything(), names_to = "taxa", values_to = "read_count") %>%
    filter(taxa == "proteus mirabilis") %>%
    filter(read_count != 0)
#     group_by(taxa) %>%
#     summarise(max = max(read_count)) %>%
#     arrange(max)

In [None]:
# Taxa in nc_list that do not appear in the blood culture records
nc_list[!is.element(nc_list, blood_list)]

In [None]:
# Blood culture taxa list, used as a global by get_blood_circle() below.
# NOTE(review): the same file is already read in an earlier cell; presumably
# re-read here so that this cell can be run on its own.
blood_list <- read.csv("results/blood_culture_records/blood_culture_taxa_list.txt")$taxa

# Pie chart of how many taxa appear in the blood culture records ("Yes") and
# how many do not ("No"), with the row count printed on each slice.
#
# df:    data frame with a `Species` column; names are lower-cased before
#        being looked up in the global `blood_list`.
#        NOTE(review): assumes the entries of blood_list are already
#        lower-case - confirm against the input file.
# title: plot title (NULL for none).
# Returns a ggplot object with the legend hidden.
get_blood_circle <- function(df, title) {
    rpie <- 1
    rlabel <-  0.6 * rpie

    # Fixed label -> colour mapping. It is passed to the fill scale as both
    # `limits` and named `values`, so "Yes" and "No" keep their colours even
    # when only one of them occurs in `df`. With unnamed values and no limits,
    # a lone category would always receive the first colour.
    culture_colours <- c("No" = "coral2", "Yes" = "darkturquoise")

    df %>% 
        mutate(Species = tolower(Species)) %>%
        mutate(category = ifelse(Species %in% blood_list, "Yes", "No")) %>%
        group_by(category) %>%
        summarise(cnt = n(), total = nrow(df)) %>%
        mutate(end_angle = 2 * pi * cumsum(cnt)/ total,      # ending angle for each pie slice
               start_angle = lag(end_angle, default = 0),   # starting angle for each pie slice
               mid_angle = 0.5 * (start_angle + end_angle)) %>%  # middle of each pie slice, for the text label
        ggplot() +
        geom_arc_bar(aes(x0 = 0, y0 = 0, r0 = 0, r = rpie,
                    start = start_angle, end = end_angle, fill = category)) +
        # Count label at 60% of the radius along the slice's mid angle
        geom_text(aes(x = rlabel * sin(mid_angle), y = rlabel * cos(mid_angle), label = cnt),
                    hjust = 0.5, vjust = 0.5, size = 20) +
        coord_fixed() +
        scale_x_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
        scale_y_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
        scale_fill_manual(limits = names(culture_colours),
                          values = culture_colours) +
        theme(panel.background = element_rect(fill = "white"),
              plot.margin = grid::unit(c(0, 0, 0, 0), "cm"),
              axis.text = element_blank(),
              axis.title = element_blank(),
              axis.ticks = element_blank(),
              legend.position = "none",
              axis.ticks.length = unit(0, "pt")) +
        labs(x = NULL, y = NULL,
             fill = "Detected in blood culture?", title = title)
}

# Before/after comparison with a shared legend on the left.
blood_comparison <- ggarrange(get_blood_circle(before_df_parsed, "Before decontamination"),
                              get_blood_circle(after_df, "After decontamination"),
                              common.legend = TRUE, legend = "left")
blood_comparison  # display the figure

# Pass the figure explicitly: without `plot =`, ggsave() saves whatever
# last_plot() currently returns.
ggsave("results/decontamination/comparison/blood_comparison.png",
       plot = blood_comparison, dpi = 600)

# Get separate plots
before_blood <- get_blood_circle(before_df_parsed, NULL)
ggsave("results/decontamination/comparison/blood_before_nolegend.png", plot = before_blood, dpi = 600)

after_blood <- get_blood_circle(after_df, NULL)
ggsave("results/decontamination/comparison/blood_after_nolegend.png", plot = after_blood, dpi = 600)