# Identification of putative contaminant taxa

In [3]:
# Override tempdir() in this session so that it always reports "~/.Rtmp".
tempdir <- function() {
    "~/.Rtmp"
}

In [97]:
# Hard-coded project root; the relative data paths used later resolve against it.
setwd("/home/projects/14001280/PROJECTS/blood_microbiome/")
# library() stops with an error when a package is missing, whereas require()
# only warns and returns FALSE, which lets later cells fail in confusing ways.
library(tidyverse)
library(ggplot2)
library(data.table)
library(egg)
library(ANCOMBC)
library(phyloseq)

### Create Phyloseq object

#### Load abundance matrix

In [98]:
# Read an abundance table from disk and return it as a phyloseq otu_table.
#
# file_path: path to a delimited file readable by fread() that contains a
#            `sample` column.
# Returns an otu_table whose rows are samples (taxa_are_rows = FALSE).
load_data <- function(file_path) {
    # Split `sample` on "_" and keep only the second piece as the sample ID;
    # the first piece is discarded (NA in `into`).
    abundance <- fread(file_path) %>%
        separate(sample, into = c(NA, "npm_research_id"), sep = "_")
    # Column 1 becomes the row names (presumably npm_research_id -- this relies
    # on `sample` being the first column of the file; TODO confirm).
    abundance <- as.matrix(abundance, rownames = 1)
    # FALSE rather than F: F is an ordinary variable that can be reassigned.
    abundance <- otu_table(abundance, taxa_are_rows = FALSE)
    return(abundance)
}

# Only the species-level matrix is loaded; the genus-level call is disabled.
# genus_df <- load_data("data/temp_files/07_abundance_matrix/subset_100.G.tsv")
species_df <- load_data("data/temp_files/07_abundance_matrix/subset_100.S.tsv")
# Preview the first rows of the abundance table.
head(species_df)

Unnamed: 0,Pseudomonas mendocina,Pseudomonas aeruginosa,Pseudomonas pseudoalcaligenes,Pseudomonas furukawaii,Pseudomonas alcaligenes,Pseudomonas citronellolis,Pseudomonas resinovorans,Pseudomonas sp. LPH1,Pseudomonas alcaliphila,Pseudomonas sihuiensis,⋯,Desulfovibrio hydrothermalis,Bifidobacterium asteroides,Carboxydocella thermautotrophica,Capnocytophaga cynodegmi,Capnocytophaga sp. H2931,Bacteroides thetaiotaomicron,Gordonia phage Bantam,Natronolimnobius sp. AArc-Mg,Pseudoalteromonas phenolica,Anoxybacillus flavithermus
WHB5437,37324,1755,769,213,257,137,114,1823,1363,1319,⋯,0,0,0,0,0,0,0,0,0,0
WHB10342,2,13,0,0,2,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB10244,1,0,0,0,0,0,0,0,3,3,⋯,0,0,0,0,0,0,0,0,0,0
WHB10758,4,3,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB5100,48191,2507,985,238,290,171,151,2371,1601,1669,⋯,0,0,0,0,0,0,0,0,0,0
WHB7258,0,0,0,0,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


#### Load metadata

In [99]:
# Read the sample metadata as a matrix, using the first column as row names.
meta <- as.matrix(fread("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv"), rownames = 1)
# Reorder/subset the metadata rows to follow the row order of species_df.
# NOTE(review): a sample missing from the metadata yields an NA index from
# match() -- confirm every sample is present.
meta <- meta[match(rownames(species_df), rownames(meta)), ]
# Drop the "removal_requested_by_supplier" column and convert to a data frame.
meta <- as.data.frame(meta[, colnames(meta) != "removal_requested_by_supplier"])
# Wrap the data frame as a phyloseq sample_data object.
meta <- sample_data(meta)
head(meta)

Unnamed: 0_level_0,multiplex_pool_id,supplier_id,gis_internal_sample_id,site_supplying_sample,year_of_birth,supplied_gender,self_reported_ethnicity,extraction_kit,date_of_dna_extraction,plate_position,⋯,supplied_and_computed_gender_match,supplied_and_computed_ethinicity_match,sop_agreement_for_coverage__14._28_for_15x.30x,sample_life_cycle,current,source_cohort,original_supplier_id,duplicate_info,duplicate,duplicate_pair
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
WHB5437,MUX8443,0618-0040,PRISM_0618_0040_NHC_KJH_1,PRISM,1973,M,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",13/12/2018,E7,⋯,Y,,Y,PRISM VCF,Y,PRISM,0618-0040/NHC/KJH,,PASS,
WHB10342,MUX9870,03221,HELIOS_03221_1,HELIOS,1959,M,Malay,abGENIX Whole Blood Genomic DNA Extraction Kit,12/6/2019,G10,⋯,Y,,Y,HELIOS VCF,Y,HELIOS,03221,,PASS,
WHB10244,MUX9801,03118,HELIOS_03118_1,HELIOS,1980,M,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,12/6/2019,E10,⋯,Y,,Y,HELIOS VCF,Y,HELIOS,03118,,PASS,
WHB10758,MUX10626,17003-2940,MEC_17003_2940_1,MEC,1964,F,Malay,Maxwell RSC Blood DNA Kit (AS1400),1/8/2019,H11,⋯,Y,,Y,MEC VCF,Y,MEC,17003-2940,,PASS,
WHB5100,MUX8333,1117-0045,PRISM_1117_0045_NHC_FSB_1,PRISM,1958,F,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",23/11/2018,E7,⋯,Y,,Y,PRISM VCF,Y,PRISM,1117-0045/NHC/FSB,,PASS,
WHB7258,MUX10198,17003-1054,MEC_17003_1054_1,MEC,1971,M,Malay,Maxwell RSC Blood DNA Kit (AS1400),1/12/2018,H10,⋯,N,,Y,MEC VCF,Y,MEC,17003-1054,,PASS,


#### Check that row names match

In [100]:
# identical() always yields a single TRUE/FALSE: unlike all(a == b) it does not
# return NA when a name is NA and does not recycle when the lengths differ.
identical(rownames(meta), rownames(species_df))

#### Phyloseq object

In [101]:
# Combine the abundance table and the sample metadata into one phyloseq object.
phy <- phyloseq(species_df, meta)

### Run ANCOM-BC

Default parameters

In [145]:
# Candidate batch variables: metadata columns whose names match
# "kit|flow_cell|instrument_id|source", minus two explicitly excluded columns.
to_exclude <- c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id")
meta_cols <- grep("kit|flow_cell|instrument_id|source", colnames(meta), value = TRUE)
meta_cols <- meta_cols[!meta_cols %in% to_exclude]
# meta_cols <- paste(meta_cols, collapse = " + ")

In [146]:
# For every candidate batch variable, print its name followed by the number of
# samples observed at each of its values.
for (col in meta_cols) {
    tmp <- meta %>%
        data.frame() %>%
        tibble() %>%
        group_by(get(col)) %>%
        summarise(n = n())
    print(col)
    print(tmp)
}

[1] "extraction_kit"
[90m# A tibble: 5 x 2[39m
  `get(col)`                                         n
  [3m[90m<chr>[39m[23m                                          [3m[90m<int>[39m[23m
[90m1[39m Chemagic DNA Blood Kit (Perkin Elmer, MA)         41
[90m2[39m Maxwell RSC Blood DNA Kit (AS1400)                36
[90m3[39m QIAamp DNA Blood Mini Kit                         50
[90m4[39m QIAsymphony DSP DNA Mini Kit (192)                14
[90m5[39m abGENIX Whole Blood Genomic DNA Extraction Kit    50
[1] "instrument_id"
[90m# A tibble: 6 x 2[39m
  `get(col)`     n
  [3m[90m<chr>[39m[23m      [3m[90m<int>[39m[23m
[90m1[39m [90m"[39m[90m"[39m            56
[90m2[39m [90m"[39mNG001[90m"[39m       28
[90m3[39m [90m"[39mNG002[90m"[39m       28
[90m4[39m [90m"[39mNG003[90m"[39m       21
[90m5[39m [90m"[39mNG004[90m"[39m       25
[90m6[39m [90m"[39mNG005[90m"[39m       33
[1] "hiseq_xtm_sbs_kit_300_cycles_v2__box_1of_2__lot"
[90m#

In [None]:
# Fit ANCOM-BC once per candidate batch variable, using that variable alone as
# the model formula, and keep the result of every fit keyed by variable name.
result_list <- list()
for (col in meta_cols) {
    print(col)
    out <- ancombc(phyloseq = phy, formula = col,
                  p_adj_method = "holm", zero_cut = 0.95, lib_cut = 0,
                  group = "source_cohort", struc_zero = TRUE, neg_lb = FALSE,
                  tol = 1e-5, max_iter = 100, conserve = TRUE,
                  alpha = 0.05, global = FALSE)

    res <- out$res
    # `res` is itself a list of tables, so it has to be stored with [[ ]];
    # single-bracket assignment would keep only its first element and warn
    # about the replacement length.
    result_list[[col]] <- res
}

In [81]:
# Row-wise sums of res$diff_abn; the names of rows whose sum is 0 are listed.
# NOTE(review): `res` is whatever the most recently run ANCOM-BC fit left in the
# workspace, so this reflects a single metadata variable -- confirm which one.
# NOTE(review): the variable name `diff` is shared with base::diff().
diff <- apply(res$diff_abn, 1, sum)
data.frame(taxa = names(diff[diff == 0]))

taxa
<chr>
Xanthomonas translucens
Rhizobium gallicum
Rhizobium favelukesii
Sinorhizobium sp. RAC02
Mesorhizobium sp. M4B.F.Ca.ET.058.02.1.1
Mesorhizobium sp. M9A.F.Ca.ET.002.03.1.2
Bordetella genomosp. 8
Achromobacter sp. AONIH1
Burkholderia lata
Burkholderia pyrrocinia


### Normalise

In [10]:
# Rescale every row of `df` to percentages of that row's total.
#
# df: numeric table or matrix; each row is rescaled independently.
# Returns a tibble holding x / sum(row) * 100 for every entry x.
normalise <- function(df) {
    to_percent <- function(counts) {
        counts / sum(counts) * 100
    }
    # apply() over rows returns one column per input row, so transpose back.
    row_pct <- apply(df, 1, to_percent)
    as_tibble(t(row_pct))
}

# Replace both abundance tables with their percentage-normalised versions.
# NOTE(review): X_genus and X_species are not created in the visible part of
# this notebook -- confirm where they come from.
X_genus <- normalise(X_genus)
X_species <- normalise(X_species)

In [11]:
# Preview the normalised genus table, then print the genus metadata table.
head(X_genus)
Y_genus

Pseudomonas,Azotobacter,Acinetobacter,Moraxella,Aeromonas,Zobellella,Stenotrophomonas,Xanthomonas,Pseudoxanthomonas,Lysobacter,⋯,Pseudoflavitalea,Isosphaera,Catenovulum,Nodularia,Methanosarcina,Fischerella,Woesvirus,Segniliparus,Candidatus Hamiltonella,Pseudopedobacter
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.3794266,0.02107926,0.0,0.0,0.0,0.0,0.04215852,0.02107926,0.0,0.042158516,⋯,0,0,0,0,0,0,0.02107926,0.02107926,0,0
0.7876231,0.0281294,0.0562587904,0.028129395,0.0,0.0,0.08438819,0.0,0.028129395,0.028129395,⋯,0,0,0,0,0,0,0.0,0.0,0,0
84.4253461,0.03479578,0.0003079273,0.0,0.09176233,0.0009237818,0.04557324,0.02124698,0.007390255,0.008621964,⋯,0,0,0,0,0,0,0.0,0.0,0,0
83.9630223,0.03277978,0.000264353,0.0,0.0996611,0.000264353,0.0563072,0.02061954,0.00951671,0.005815767,⋯,0,0,0,0,0,0,0.0,0.0,0,0
84.6402402,0.03502877,0.0035743647,0.001429746,0.09364835,0.0014297459,0.04718161,0.01286771,0.010723094,0.007863602,⋯,0,0,0,0,0,0,0.0,0.0,0,0
93.8488468,0.05054054,0.0049307841,0.0,0.11094264,0.001232696,0.05916941,0.02095583,0.007396176,0.011094264,⋯,0,0,0,0,0,0,0.0,0.0,0,0


npm_research_id,multiplex_pool_id,supplier_id,gis_internal_sample_id,site_supplying_sample,year_of_birth,supplied_gender,self_reported_ethnicity,extraction_kit,date_of_dna_extraction,⋯,supplied_and_computed_ethinicity_match,sop_agreement_for_coverage_14/28_for_15x/30x,sample_life_cycle,current,source_cohort,originalsupplierid,duplicateinfo,duplicate,duplicatepair,removalrequestedbysupplier
<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,⋯,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
WHB10411,MUX9829,03295,HELIOS_03295_1,HELIOS,1970,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,13/6/2019,⋯,,Y,HELIOS VCF,Y,HELIOS,03295,,PASS,,
WHB10415,MUX9830,03299,HELIOS_03299_1,HELIOS,1974,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,13/6/2019,⋯,,Y,HELIOS VCF,Y,HELIOS,03299,,PASS,,
WHB3378,MUX8073,0116-0060,PRISM_0116_0060_NHC_TGH_1,PRISM,1962,M,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",21/9/2018,⋯,,Y,PRISM VCF,Y,PRISM,0116-0060/NHC/TGH,,PASS,,
WHB3404,MUX8080,0216-0021,PRISM_0216_0021_NHC_SBL_1,PRISM,1961,F,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",21/9/2018,⋯,,Y,PRISM VCF,Y,PRISM,0216-0021/NHC/SBL,,PASS,,
WHB3640,MUX8204,0916-0082,PRISM_0916_0082_NHC_ESC_1,PRISM,1976,F,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",8/10/2018,⋯,,N,PRISM VCF,Y,PRISM,0916-0082/NHC/ESC,,PASS,,
WHB3717,MUX8211,0117-0016,PRISM_0117_0016_NHC_WW9_1,PRISM,1985,F,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",24/10/2018,⋯,,Y,PRISM VCF,Y,PRISM,0117-0016/NHC/WW9,,PASS,,
WHB3744,MUX8220,0217-0044,PRISM_0217_0044_NHC_CYS_1,PRISM,1967,F,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",24/10/2018,⋯,,Y,PRISM VCF,Y,PRISM,0217-0044/NHC/CYS,,PASS,,
WHB3758,MUX8235,0217-0059,PRISM_0217_0059_NHC_TJS_1,PRISM,1993,M,Chinese,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",2/11/2018,⋯,,Y,PRISM VCF,Y,PRISM,0217-0059/NHC/TJS,,PASS,,
WHB3959,MUX9014,010-10013,GUSTO_Kids_010_10013_1,GUSTO_Kids,2009,F,Chinese,QIAsymphony DSP DNA Midi Kit,12/10/2018,⋯,,N,GUSTO_Kids VCF,Y,GUSTO_Kids,010-10013,,PASS,,
WHB4054,MUX9034,010-21460,GUSTO_Kids_010_21460_1,GUSTO_Kids,2010,M,Malay,QIAsymphony DSP DNA Midi Kit,15/10/2018,⋯,,Y,GUSTO_Kids VCF,Y,GUSTO_Kids,010-21460,,PASS,,


### Remove low abundance taxa

Please check that normalised values are on a 0-100% scale and not 0-1. Here we remove taxa whose maximum relative abundance across samples does not exceed the threshold passed to `remove_low_abundance()` (0.5% in the cell below).

In [12]:
# Keep only the columns of `df` whose maximum value is greater than `threshold`.
#
# df:        table with one numeric column per taxon.
# threshold: a column is kept when its maximum is strictly above this value.
# Prints "<kept>/<total> kept at threshold of <threshold>" and returns `df`
# restricted to the kept columns, in the order produced by the grouped summary.
remove_low_abundance <- function(df, threshold) {
    long_df <- pivot_longer(df, everything(), names_to = "rank", values_to = "abundance")
    max_per_rank <- summarise(group_by(long_df, rank), max_abundance = max(abundance))
    passing <- filter(max_per_rank, max_abundance > threshold)

    to_keep <- passing$rank
    print(paste0(length(to_keep), "/", ncol(df), " kept at threshold of ", threshold))
    select(df, all_of(to_keep))
}

# Keep taxa whose maximum value in the normalised tables is above 0.5.
X_genus <- remove_low_abundance(X_genus, 0.5)
X_species <- remove_low_abundance(X_species, 0.5)


[1] "98/658 kept at threshold of 0.5"
[1] "218/1922 kept at threshold of 0.5"


In [None]:
# Summarise test results per taxon and keep taxa whose smallest p-value is
# above a threshold.
#
# result_df:   table with at least the columns `taxon` and `p_val`.
# p_threshold: single number; a taxon is kept when its minimum p-value is
#              strictly greater than this. Defaults to the variable `t`, which
#              is what the body used to read implicitly; pass it explicitly.
# Returns a tibble with columns taxon (a factor ordered by log_p_val),
# smallest_p_val and log_p_val (= -log10 of smallest_p_val).
filter_results <- function(result_df, p_threshold = t) {
    # Without a numeric `t` in scope the default resolves to base::t(), a
    # function; fail here with a clear message instead of inside filter().
    if (!is.numeric(p_threshold) || length(p_threshold) != 1 || is.na(p_threshold)) {
        stop("`p_threshold` must be a single number; pass it explicitly or define a numeric `t` before calling filter_results().",
             call. = FALSE)
    }

    filt_df <- result_df %>%
        group_by(taxon) %>%
        summarise(smallest_p_val = min(p_val)) %>%
        filter(smallest_p_val > p_threshold) %>%
        mutate(log_p_val = -log10(smallest_p_val),
               taxon = fct_reorder(taxon, log_p_val, .desc = FALSE))
    return(filt_df)
}

# Build the per-taxon summaries used for plotting.
# NOTE(review): genus_result and species_result are not created in the visible
# part of this notebook -- confirm where they come from.
plot_genus <- filter_results(genus_result)
plot_species <- filter_results(species_result)

In [None]:
# Number of distinct taxa remaining in each filtered table.
length(unique(plot_genus$taxon))
length(unique(plot_species$taxon))

### Visualise results

In [None]:
# Bar chart of log_p_val per taxon.
#
# filt_df:    table with columns `taxon` and `log_p_val`.
# taxon_name: label for the x axis.
# subset:     NA to plot every row; otherwise only rows 1:subset are plotted
#             after sorting by ascending log_p_val.
# Returns the ggplot object.
plot_results <- function(filt_df, taxon_name, subset) {
    to_plot <- filt_df
    if (!is.na(subset)) {
        to_plot <- slice(arrange(filt_df, log_p_val), 1:subset)
    }

    plt <- ggplot(to_plot, aes(x = taxon, y = log_p_val, fill = log_p_val)) +
        geom_bar(stat = "identity") +
        labs(x = taxon_name, y = "-lg(minimum p-value)") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none")

    plt
}

# Plot every genus; for species, plot only 30 rows (see `subset` in plot_results).
plot_results(plot_genus, "Genus", NA)
plot_results(plot_species, "Species", 30)

## Sanity check

In [None]:
# Attach each taxon's maximum abundance to the filtered test results.
#
# df:      abundance table with one numeric column per taxon.
# plot_df: table with columns `taxon` and `log_p_val`.
# Returns plot_df with `taxon` renamed to `rank`, a `max_abundance` column
# added by a left join on `rank`, sorted by ascending log_p_val.
join_results <- function(df, plot_df) {
    long_df <- pivot_longer(df, everything(), names_to = "rank", values_to = "abundance")
    max_df <- summarise(group_by(long_df, rank), max_abundance = max(abundance))

    renamed <- rename(plot_df, rank = taxon)
    joined <- left_join(renamed, max_df, "rank")
    arrange(joined, log_p_val)
}

# Show the filtered results next to each taxon's maximum abundance.
join_results(X_genus, plot_genus)
join_results(X_species, plot_species)

In [None]:
# Boxplot of the log abundance of one genus, split by source cohort.
# NOTE(review): log() of a zero abundance is -Inf -- confirm how zeros should
# be treated in this plot.
taxon <- "Bacillus"
cbind(X_genus, Y_genus) %>%
    mutate(log_abundance = log(get(taxon))) %>%
    ggplot(aes(x = source_cohort, y = log_abundance)) + 
    geom_boxplot()


### ANCOM analysis

In [None]:
if (!requireNamespace("BiocManager", quietly=TRUE))
    install.packages("BiocManager")
BiocManager::install("ANCOMBC")

In [None]:
library(nlme)
library(tidyverse)
library(ggplot2)
library(compositions)
source("scripts/ancom_v2.1.R")

In [None]:
# Test every taxon in X against every batch-related metadata column in Y with a
# one-way ANOVA on log-transformed abundance.
#
# X: table with one column per taxon; rows are paired with Y by position.
# Y: metadata table; columns whose names match
#    "kit|flow_cell|instrument_id|source" are tested.
# Returns a tibble with columns taxon, meta_name, df, F_stat and p_val, one row
# per tested (taxon, column) pair. Pairs with fewer than two usable levels are
# skipped.
test_batch_effects <- function(X, Y) {
    # Look the batch columns up in Y itself rather than in the global `meta`,
    # so the function depends only on its arguments.
    meta_cols <- colnames(Y)[grep("kit|flow_cell|instrument_id|source", colnames(Y))]
    taxa <- colnames(X)

    # Collect one-row results in a list and bind them once at the end instead
    # of re-copying a growing tibble on every iteration.
    morsels <- list()

    for (col in meta_cols) {
        for (taxon in taxa) {
            x <- X[, taxon]
            y <- Y[, col]
            test_df <- cbind(x, y)

            # Remove samples with unknown or NA metadata
            test_filt <- test_df %>%
                filter(!grepl("unknown", get(col), ignore.case = TRUE) & !is.na(get(col)))

            # Remove levels in which the taxon's maximum abundance is zero
            reads <- test_filt %>%
                group_by(get(col)) %>%
                summarise(max_reads = max(get(taxon))) %>%
                filter(max_reads != 0)

            to_keep <- reads$`get(col)`

            # An ANOVA needs at least two levels; otherwise skip this pair.
            if (length(to_keep) >= 2) {
                test_filt <- test_filt %>%
                    filter(get(col) %in% to_keep)

                # One-way ANOVA
                # NOTE(review): log() of a zero abundance is -Inf -- confirm the
                # retained samples contain no zeros, or add a pseudocount.
                taxon_vec <- log(as_vector(test_filt[, taxon]))
                a <- aov(taxon_vec ~ as_vector(test_filt[, col]))
                # unlist() flattens the ANOVA table; the "...1" entries belong
                # to its first row, i.e. the metadata term.
                summ <- unlist(summary(a))
                deg_f <- summ["Df1"]
                F_stat <- summ["F value1"]
                p_val <- summ["Pr(>F)1"]

                # Save results
                morsels[[length(morsels) + 1]] <- tibble(taxon = taxon, meta_name = col,
                                                         df = deg_f, F_stat = F_stat, p_val = p_val)
            }
        }
    }

    return(bind_rows(morsels))
}

# Run the per-taxon batch-effect tests at genus and species level.
# NOTE(review): Y_species is not created in the visible part of this notebook.
genus_log_result <- test_batch_effects(X_genus, Y_genus)
species_log_result <- test_batch_effects(X_species, Y_species)