# Visualising taxa abundance

In [1]:
# NOTE(review): machine-specific absolute path. Every relative path used below
# ("data/...", "results/...") is resolved against this directory.
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")

# library() stops with an error when a package is not installed, whereas
# require() only warns and returns FALSE, postponing the failure to the first
# call that needs the missing package.
library(tidyverse)
library(ggplot2)
library(data.table)
library(foreach)
library(compositions)
library(ggpubr)
library(scales)
library(ggsci)
library(ggforce)
library(see)
library(doParallel)
library(VennDiagram)
library(randomcoloR)
library(vegan)
# registerDoParallel(cores=8)

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.1.0     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Loading required package: data.table


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘p

## Pre-process data

### Load and parse data

In [2]:
# Return the column names of `meta` that match the kit / flow cell /
# instrument id / supplying site pattern, minus an explicit exclusion list.
get_meta_cols <- function(meta) {
    all_cols <- colnames(meta)
    matches_pattern <- grepl("kit|flow_cell|instrument_id|site_supplying", all_cols)
    matched_cols <- all_cols[matches_pattern]

    # This column matches the pattern above but is dropped on purpose
    excluded_cols <- c("hiseq_xtm_flow_cell_v2_5_id")
    is_excluded <- matched_cols %in% excluded_cols

    return(matched_cols[!is_excluded])
}


# Read an abundance table with fread() and split its `sample` column on ".":
# the first piece is discarded (NA in `into`) and the second piece is kept as
# `npm_research_id`.
load_data <- function(file_path) {
    raw_table <- fread(file_path)
    parsed_table <- separate(raw_table, sample,
                             into = c(NA, "npm_research_id"),
                             sep = "\\.")
    return(parsed_table)
}


In [3]:
# Rank code interpolated into the abundance matrix file name below
rank <- "S"

# Sample ids to keep (taken from the npm_research_id column of this file)
to_retain <- fread("data/samples_above_100_microbial_reads.txt")$npm_research_id

# Sample metadata, restricted to the retained samples
meta <- fread("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv") %>%
    filter(npm_research_id %in% to_retain)

meta_cols <- get_meta_cols(meta)

# Abundance matrix for the chosen rank, restricted to the retained samples and
# without the `unclassified` and `Homo sapiens` columns. The file name now uses
# {rank}; previously `rank` was assigned but never used and str_glue() was
# called on a string without any placeholder.
species_df <- load_data(str_glue("data/taxonomic_profiles/07_abundance_matrix/abundance_matrix.{rank}.pipeline2_210322.tsv")) %>%
    filter(npm_research_id %in% to_retain) %>%
    select(-unclassified, -"Homo sapiens")

# Per-sample, per-taxon prevalence table (used below as a presence mask)
prev_df <- fread("results/decontamination/prevalence_RA0.005_read10.csv")

In [4]:
# Number of rows (samples) of species_df with a non-zero `Torque teno virus` value
sum(species_df$`Torque teno virus` != 0)

## Basic stats

## Decontamination

#### Load non-contaminant list

In [5]:
# Each statement reads one results file and keeps a single column as a vector.
# Going by the file names, *_nc holds non-contaminant taxa and *_c contaminants.
# NOTE(review): `$` on a data.frame partial-matches column names; confirm that
# `taxa`, `non_contaminant_taxon` and `contaminant` are the exact headers.

# Differential prevalence
diff_prev_nc <- read.csv("results/decontamination/diff_prev_V3_no_split/noncontaminants.RA0.005.read_threshold10.max_prev0.25.fold_diff2.txt")$taxa
diff_prev_c <- read.csv("results/decontamination/diff_prev_V3_no_split/contaminants.RA0.005.read_threshold10.max_prev0.25.fold_diff2.txt")$taxa

# Correlation filter
corr_nc <- read.csv("results/decontamination/correlation_decontamination_no_split/nc.diff_prev_V3.RA0.005.read_threshold10.max_prev0.25.fold_diff2.corr_t0.7.within_batch.S.n9999.txt")$non_contaminant_taxon
corr_c <- read.csv("results/decontamination/correlation_decontamination_no_split/contam.diff_prev_V3.RA0.005.read_threshold10.max_prev0.25.fold_diff2.corr_t0.7.within_batch.S.n9999.txt")$contaminant

# Simple batch filter
batch_nc <- read.csv("results/decontamination/simple_batch_decontam_no_split/nc.corr_t0.7.within_batch.txt")$taxa

### Set 'absent' taxa to zero

In [6]:
# Read counts restricted to the columns of prev_df (same column order),
# with sample ids moved to the row names
species_zeroed <- species_df %>% 
    select(all_of(colnames(prev_df))) %>%
    column_to_rownames("npm_research_id")

# Presence mask with sample ids as row names
prev_bool_df <- prev_df %>%
    column_to_rownames("npm_research_id")

# The mask is applied by row position, so line it up with the count table by
# sample id first; a different sample order would otherwise zero the wrong cells.
stopifnot(all(rownames(species_zeroed) %in% rownames(prev_bool_df)))
prev_bool_df <- prev_bool_df[rownames(species_zeroed), , drop = FALSE]

# Set the count to zero wherever the mask marks the taxon as absent.
# seq_len() gives an empty sequence for zero columns (seq(0) would give 1, 0).
for (i in seq_len(ncol(species_zeroed))) {
    species_zeroed[!prev_bool_df[, i], i] <- 0
}

# Keep only the taxa listed in batch_nc, with the sample id back as a column
species_filt <- species_zeroed %>% 
    rownames_to_column("npm_research_id") %>%
    select(all_of(c("npm_research_id", batch_nc)))


# fwrite(species_zeroed %>% rownames_to_column("npm_research_id"), "results/decontamination/read_matrix.raw.zeroed.csv")

#### Relative abundance

In [7]:
# Convert a count table to per-sample relative abundances.
#
# df: table with an `npm_research_id` column plus one numeric column per taxon.
# Returns a data.frame with `npm_research_id` followed by the taxon columns
# (names kept verbatim), each row divided by its row sum. A row whose counts
# sum to zero yields NaN.
otu_to_RA <- function(df) {
    row_names <- df$npm_research_id

    # Select taxon columns by name and keep a 2-d matrix even when there is a
    # single taxon; `df[, 2:ncol(df)]` collapsed to a vector in that case.
    taxa_cols <- setdiff(colnames(df), "npm_research_id")
    counts <- as.matrix(as.data.frame(df)[, taxa_cols, drop = FALSE])

    # The row-sum vector is recycled down the columns, so every cell is
    # divided by the total of its own row
    RA_mat <- counts / rowSums(counts)

    return(cbind(data.frame(npm_research_id = row_names), as.data.frame(RA_mat, check.names = FALSE)))
}

# Relative abundances of the filtered taxa. otu_to_RA() divides each row by its
# row sum, so a sample whose counts are all zero gets NaN values here.
species_filt_RA <- otu_to_RA(species_filt)

#### CLR transform

In [8]:
# Apply clr() to the numeric columns of `df` and return the result with the
# `npm_research_id` column put back in front.
RA_to_clr <- function(df) {
    sample_ids <- data.frame(npm_research_id = df$npm_research_id)
    numeric_part <- select(df, where(is.numeric))
    clr_values <- as.data.frame(clr(numeric_part), check.names = FALSE)
    return(cbind(sample_ids, clr_values))
}

# species_clr <- RA_to_clr(species_RA)
# head(species_clr)

### Summary stats of non-contaminants

#### Max read count

In [9]:
# Largest read count observed for each taxon across all samples, sorted from
# the highest maximum down. The id column is kept out of the pivot and then
# dropped by the per-taxon summary.
max_df <- species_filt %>%
    pivot_longer(-npm_research_id, names_to = "taxa", values_to = "read_count") %>%
    group_by(taxa) %>%
    summarise(max_count = max(read_count)) %>%
    arrange(desc(max_count))

#### Overall prevalence

In [10]:
# Column totals of every column after the first, divided by the number of rows:
# the fraction of samples in which each taxon is marked present.
prev_stats <- colSums(prev_df[, 2:ncol(prev_df)]) / nrow(prev_df)
overall_prev <- data.frame(taxa = names(prev_stats),
                           overall_prevalence = unname(prev_stats))

In [11]:
# Per-taxon summary: maximum read count, overall prevalence, number of samples
# (prevalence x number of rows of prev_df) and a binned version of the maximum.
# The join key is stated explicitly instead of being guessed from the shared
# column names, and the table is sorted once at the end (the earlier, identical
# arrange() was redundant).
prev_max <- max_df %>%
    left_join(overall_prev, by = "taxa") %>%
    mutate(n_samples = overall_prevalence * nrow(prev_df),
           max_bin = case_when(max_count < 10 ~ "<10",
                               max_count >= 10 & max_count < 50 ~ "10-50",
                               max_count >= 50 & max_count < 100 ~ "50-100",
                               max_count >= 100 & max_count < 500 ~ "100-500",
                               max_count >= 500 ~ ">=500"),
           # Ordered levels so the bins sort from smallest to largest
           max_bin = factor(max_bin, levels = c("<10", "10-50", "50-100", "100-500", ">=500"))) %>%
    arrange(desc(overall_prevalence))
prev_max

Joining, by = "taxa"


taxa,max_count,overall_prevalence,n_samples,max_bin
<chr>,<dbl>,<dbl>,<dbl>,<fct>
Cutibacterium acnes,22596,0.047462619,419,>=500
Moraxella osloensis,2402,0.019143634,169,>=500
Human mastadenovirus C,17475,0.018577254,164,>=500
Mycolicibacterium aubagnense,1938,0.012233802,108,>=500
Lactobacillus iners,1104,0.012007250,106,>=500
Lactobacillus crispatus,7799,0.010647938,94,>=500
Staphylococcus epidermidis,9140,0.008608971,76,>=500
Sulfuritalea hydrogenivorans,86,0.008608971,76,50-100
Gardnerella vaginalis,2123,0.008495696,75,>=500
Acidovorax sp. KKS102,336,0.007362936,65,100-500


#### Max count filter

In [12]:
# Keep the taxa whose maximum read count is strictly above 100
prev_max_filt <- filter(prev_max, max_count > 100)

# Display the retained taxa, highest maximum first
arrange(prev_max_filt, desc(max_count))

taxa,max_count,overall_prevalence,n_samples,max_bin
<chr>,<dbl>,<dbl>,<dbl>,<fct>
Fusobacterium nucleatum,194199,0.0011327594,10,>=500
Cutibacterium acnes,22596,0.0474626189,419,>=500
Human mastadenovirus C,17475,0.0185772542,164,>=500
Neisseria subflava,15385,0.0015858632,14,>=500
Corynebacterium segmentosum,14476,0.0010194835,9,>=500
Haemophilus parainfluenzae,12183,0.0020389669,18,>=500
Fannyhessea vaginae,10395,0.0023787947,21,>=500
Staphylococcus epidermidis,9140,0.0086089715,76,>=500
Human betaherpesvirus 6A,8770,0.0020389669,18,>=500
Human betaherpesvirus 6B,8476,0.0033982782,30,>=500


### Save decontaminated data

In [13]:
# Number of taxa kept after the max count filter; used in the output file names
n_final <- nrow(prev_max_filt)

# Read counts for the final taxa
species_read_final <- species_filt %>%
    select(all_of(c("npm_research_id", prev_max_filt$taxa)))

# Remove zero rows: ids of samples with at least one non-zero final taxon
non_zero_rows <- species_read_final$npm_research_id[rowSums(species_read_final %>% select(-npm_research_id)) != 0]

species_read_zeroed <- species_read_final %>%
    filter(npm_research_id %in% non_zero_rows)

species_PA_final <- prev_df %>%
    select(all_of(c("npm_research_id", prev_max_filt$taxa))) %>%
    filter(npm_research_id %in% non_zero_rows)

species_RA_final <- otu_to_RA(species_read_zeroed)

# Write the row-filtered read matrix so that the read, PA and RA matrices and
# the id list all describe the same samples. The unfiltered species_read_final
# was written before, while species_read_zeroed was computed but never used.
# NOTE(review): confirm nothing downstream relies on the all-zero samples
# being present in the read matrix file.
fwrite(species_read_zeroed, 
       str_glue("results/decontamination/max_count_no_split/read_matrix_n{n_final}.global_decontaminated.zeroed.csv"))
fwrite(species_PA_final, 
       str_glue("results/decontamination/max_count_no_split/PA_matrix_n{n_final}.global_decontaminated.zeroed.csv"))
fwrite(species_RA_final, 
       str_glue("results/decontamination/max_count_no_split/RA_matrix_n{n_final}.global_decontaminated.zeroed.csv"))
fwrite(species_RA_final %>% select(npm_research_id), 
       str_glue("results/decontamination/max_count_no_split/ids_n{n_final}.global_decontaminated.zeroed.txt"), 
       row.names = FALSE)

fwrite(prev_max_filt, str_glue("results/decontamination/max_count_no_split/global_decontamination_stats_n{n_final}.csv"))

### Print decontamination taxa

In [14]:
# Taxa with a non-zero column total in the raw matrix
raw_counts <- select(species_df, -npm_research_id)
raw_no <- sum(colSums(raw_counts) != 0)

# Taxa columns in the prevalence table (everything except the id column)
prev_no <- ncol(select(prev_df, -npm_research_id))

# Number of taxa left after each successive filter
print(str_glue("Raw no. of taxa: {raw_no}"))
print(str_glue("After zero-filter: {prev_no}"))
print(str_glue("After diff. prev. filter: {length(diff_prev_nc)}"))
print(str_glue("After corr. filter: {length(corr_nc)}"))
print(str_glue("After batch filter: {length(batch_nc)}"))
print(str_glue("After max. count filter: {nrow(prev_max_filt)}"))

Raw no. of taxa: 8841
After zero-filter: 870
After diff. prev. filter: 710
After corr. filter: 526
After batch filter: 183
After max. count filter: 117


In [20]:
# nc_split was used here before the cell that defines it, so running the file
# top to bottom failed with "object 'nc_split' not found". Load it in this cell.
# The count was also printed twice; once is enough.
nc_split <- fread("results/decontamination/curated_n122_global_decontamination_stats.parsed.csv") %>%
    distinct(taxa)
n_distinct(nc_split$taxa)

In [21]:
# Distinct taxa from the curated stats file
nc_split <- distinct(
    fread("results/decontamination/curated_n122_global_decontamination_stats.parsed.csv"),
    taxa
)

# Taxa only in the curated list, then taxa only in the current filtered set
setdiff(nc_split$taxa, prev_max_filt$taxa)
setdiff(prev_max_filt$taxa, nc_split$taxa)


# Number of distinct taxa on each side
n_distinct(nc_split$taxa)
n_distinct(prev_max_filt$taxa)