# Identification of putative contaminant taxa

In [4]:
# Print the path of this R session's temporary directory.
tempdir()

In [1]:
# Session setup: set the project root and attach the packages used below.
# NOTE(review): the absolute path ties the notebook to one machine; the
# relative data paths further down are resolved against this directory.
setwd("/home/projects/14001280/PROJECTS/blood_microbiome/")
# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the failure surface later in an unrelated call.
library(tidyverse)
library(ggplot2)
library(data.table)
library(egg)

Loading required package: tidyverse

“Your system is mis-configured: ‘/etc/localtime’ is not a symlink”
“It is strongly recommended to set envionment variable TZ to ‘Asia/Singapore’ (or equivalent)”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.4     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: data.table


Attaching package: ‘data.table’


The 

### Load data

In [2]:
# Read one abundance matrix and derive the research ID from the sample name.
#
# file_path: path to a delimited file readable by fread() that contains a
#            `sample` column.
# Returns the table with `sample` replaced by `npm_research_id`, which holds
# the second "_"-delimited piece of the sample name; the first piece is
# discarded.
load_data <- function(file_path) {
    raw_df <- fread(file_path)
    parsed_df <- separate(raw_df, sample, into = c(NA, "npm_research_id"), sep = "_")
    return(parsed_df)
}

# Load the genus-level (G) and species-level (S) abundance matrices.
genus_df <- load_data("data/temp_files/07_abundance_matrix/subset_100.G.tsv")
species_df <- load_data("data/temp_files/07_abundance_matrix/subset_100.S.tsv")

ERROR: Error in fread(file_path): File 'data/temp_files/07_abundance_matrix/subset_100.G.tsv' does not exist or is non-readable. getwd()=='/home/projects/14001280/PROJECTS/blood_microbiome'


In [None]:
# Both abundance tables should describe the same number of samples.
nrow(genus_df) == nrow(species_df)
# Keep only the metadata rows whose research ID occurs in the genus table.
meta <- filter(
    fread("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv"),
    npm_research_id %in% genus_df$npm_research_id
)
nrow(meta)
head(meta)

### Explore metadata

In [None]:
# List the available metadata fields.
colnames(meta)

In [None]:
# Draw one bar chart per batch-related metadata column showing how many
# samples fall into each level. Charts are stored in `plots`, keyed by the
# column name.
plots <- list()
columns <- c("source_cohort", "extraction_kit", "library_prep_kit", "vendor_sequencing_centre",
             "instrument_id", "plate_name", "run_id",
             "hiseq_xtm_sbs_kit_300_cycles_v2_box_1of_2_lot", "hiseq_xtm_sbs_kit_300_cycles_v2_box_2_of_2_lot",
             "hiseq_xtm_pe_cluster_kit_cbottm_v2_box_1_of_2_lot", "hiseq_xtm_pe_cluster_kit_cbottm_v2_box_2_of_2_lot",
             "hiseq_xtm_flow_cell_v25_lot", "originalsupplierid")
for (column in columns) {
    # Tabulate only the column being plotted (previously every metadata column
    # was converted on every iteration). Values are cast to character so that
    # numeric identifiers are treated as discrete levels.
    level_counts <- meta %>%
        select(all_of(column)) %>%
        mutate(across(everything(), as.character)) %>%
        count(across(everything()), name = "n")

    # aes_string() resolves the column name immediately, so each stored plot
    # keeps its own column even though `column` changes on later iterations.
    # A plain aes(.data[[column]]) would be evaluated lazily at print time and
    # pick up the last value of `column`.
    plt <- ggplot(level_counts, aes_string(x = column, y = "n", fill = column)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        geom_text(aes_string(label = "n"))
    plots[[column]] <- plt
}

In [None]:
# Number of charts that were generated.
length(plots)

In [None]:
# Render every stored chart.
plots

### Parse sample identifiers

In [None]:
# Position of the first non-metadata column once the metadata columns are
# placed in front of an abundance table, as retrieve_samples() does below.
ncol(meta) + 1

### Retrieve samples that are present

In [None]:
# Align an abundance table with the sample metadata.
#
# df:       abundance table with an `npm_research_id` column plus one column
#           per taxon.
# metadata: sample metadata containing `npm_research_id`. Defaults to the
#           notebook-level `meta` table, so existing one-argument calls behave
#           as before.
# Returns an unnamed list: [[1]] the taxon columns (X) and [[2]] the metadata
# columns (Y), both as tibbles whose rows are in the same joined order.
retrieve_samples <- function(df, metadata = meta) {
    joined <- metadata %>%
        inner_join(df, by = "npm_research_id")

    # inner_join() puts the columns of its left-hand table first, so the
    # metadata block is the leading ncol(metadata) columns. seq_len() keeps
    # the split correct when `df` carries no taxon columns, where
    # (n + 1):n would count backwards instead of being empty.
    meta_names <- colnames(joined)[seq_len(ncol(metadata))]

    X <- joined %>%
        select(-all_of(meta_names)) %>%
        as_tibble()
    Y <- joined %>%
        select(all_of(meta_names)) %>%
        as_tibble()

    return(list(X, Y))
}
# Split the genus table into abundances (X) and the matching metadata (Y).
dat_genus <- retrieve_samples(genus_df)
X_genus <- dat_genus[[1]]
Y_genus <- dat_genus[[2]]

# Same split for the species table.
dat_species <- retrieve_samples(species_df)
X_species <- dat_species[[1]]
Y_species <- dat_species[[2]]

### Normalise

In [None]:
# Convert per-sample values to relative abundances in percent.
#
# df: table of numeric values, one row per sample and one column per taxon.
# Returns a tibble of the same shape in which every row sums to 100.
# Samples whose total is zero have no defined relative abundance: their row
# is NaN and a warning reports how many such samples were found.
normalise <- function(df) {
    counts <- as.matrix(df)
    totals <- rowSums(counts)

    n_empty <- sum(totals == 0, na.rm = TRUE)
    if (n_empty > 0) {
        warning(n_empty, " sample(s) have a total of zero; ",
                "their relative abundances are NaN", call. = FALSE)
    }

    # A matrix divided by a vector of length nrow() recycles down the columns,
    # so each row is divided by its own total. Unlike t(apply(df, 1, ...)),
    # this keeps the samples-by-taxa shape when there is a single taxon column.
    RA_df <- counts / totals * 100
    return(as_tibble(RA_df))
}

# Replace both abundance tables with their normalised versions.
X_genus <- normalise(X_genus)
X_species <- normalise(X_species)

In [None]:
# Inspect the normalised genus abundances and the matching metadata.
head(X_genus)
Y_genus

### Remove low abundance taxa

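Please check that the normalised values are on a 0–100% scale and not 0–1. Here we remove taxa whose maximum relative abundance across samples does not exceed the threshold passed to `remove_low_abundance()`. Note that this text mentions a 1% cut-off while the calls below pass 0.5; confirm which value is intended.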

In [None]:
# Keep only taxa whose maximum abundance across samples exceeds `threshold`.
#
# df:        table of abundances, one column per taxon.
# threshold: single number on the same scale as `df`; a taxon is kept when
#            its maximum is strictly greater than this value.
# Prints how many taxa were kept and returns `df` restricted to those taxa.
# The returned columns follow the order of the grouped summary rather than
# the input order.
remove_low_abundance <- function(df, threshold) {
    stopifnot(is.numeric(threshold), length(threshold) == 1)

    kept_df <- df %>%
        pivot_longer(everything(), names_to = "rank", values_to = "abundance") %>%
        group_by(rank) %>%
        # Missing values are ignored so that one sample without a defined
        # abundance cannot turn every taxon's maximum into NA and discard it.
        # A taxon with no usable value at all gets NA and is dropped below.
        summarise(max_abundance = if (all(is.na(abundance))) {
            NA_real_
        } else {
            max(abundance, na.rm = TRUE)
        }) %>%
        filter(max_abundance > threshold)

    to_keep <- kept_df$rank
    print(paste0(length(to_keep), "/", ncol(df), " kept at threshold of ", threshold))
    return(df %>% select(all_of(to_keep)))
}

# Apply the abundance filter with a threshold of 0.5 to both tables.
X_genus <- remove_low_abundance(X_genus, 0.5)
X_species <- remove_low_abundance(X_species, 0.5)


### Test for batch effects

In [None]:
# Test every taxon against every batch-related metadata column with a one-way
# ANOVA of abundance on batch level.
#
# X: table of abundances, one column per taxon and one row per sample.
# Y: table of sample metadata whose rows are in the same order as X. Columns
#    whose name matches "kit|flow_cell|instrument_id|source" are tested.
# Returns a tibble with one row per (taxon, metadata column) pair that could
# be tested, with columns `taxon`, `meta_name`, `df` (degrees of freedom of
# the batch term), `F_stat` and `p_val`. Pairs with fewer than two batch
# levels in which the taxon was observed are skipped. `F_stat` and `p_val`
# are NA when the ANOVA table provides no F test.
test_batch_effects <- function(X, Y) {
    # Batch columns are looked up in Y itself rather than in a global table,
    # so the function depends only on its arguments.
    meta_cols <- grep("kit|flow_cell|instrument_id|source", colnames(Y), value = TRUE)
    taxa <- colnames(X)

    # Value of `stat` for the batch term (first row of the ANOVA table), or NA
    # when the table has no such column.
    term_stat <- function(tab, stat) {
        if (nrow(tab) == 0 || !(stat %in% colnames(tab))) {
            return(NA_real_)
        }
        tab[1, stat]
    }

    # Pre-allocate one slot per possible test; appending to a tibble inside
    # the loop would copy the whole result on every iteration.
    results <- vector("list", length(meta_cols) * length(taxa))
    n_done <- 0L

    for (col in meta_cols) {
        batch <- Y[[col]]

        # Remove samples with unknown or NA metadata. This does not depend on
        # the taxon, so it is computed once per metadata column.
        known <- !is.na(batch) & !grepl("unknown", batch, ignore.case = TRUE)
        if (!any(known)) {
            next
        }
        # Batch identifiers are labels. Casting to character (and to a factor
        # below) stops numeric identifiers such as lot numbers from being
        # fitted as a continuous covariate.
        batch_known <- as.character(batch[known])

        for (taxon in taxa) {
            abundance <- X[[taxon]][known]

            # Remove batch levels in which the taxon has no reads at all.
            level_max <- ave(abundance, batch_known, FUN = max)
            observed <- !is.na(level_max) & level_max != 0

            if (length(unique(batch_known[observed])) < 2) {
                next
            }

            # One-way ANOVA
            y_obs <- abundance[observed]
            g_obs <- factor(batch_known[observed])
            anova_tab <- summary(aov(y_obs ~ g_obs))[[1]]

            # Save results
            n_done <- n_done + 1L
            results[[n_done]] <- tibble(taxon = taxon,
                                        meta_name = col,
                                        df = term_stat(anova_tab, "Df"),
                                        F_stat = term_stat(anova_tab, "F value"),
                                        p_val = term_stat(anova_tab, "Pr(>F)"))
        }
    }

    bind_rows(results[seq_len(n_done)])
}

# Run the batch-effect tests at genus and species level.
genus_result <- test_batch_effects(X_genus, Y_genus)
species_result <- test_batch_effects(X_species, Y_species)

In [None]:
# Display both result tables.
genus_result
species_result

### Retain genera whose smallest p-value is greater than t

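Note: the threshold is applied to uncorrected p-values. Because every taxon is tested against several metadata columns, we may want to adjust it for multiple testing.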

In [None]:
# p-value cut-off read by filter_results().
# NOTE(review): this variable shares its name with base::t(); calls such as
# t(x) still resolve to the function, but a more descriptive name would be
# clearer.
t <- 0.05

In [None]:
# Number of distinct taxa with at least one test result.
length(unique(genus_result$taxon))
length(unique(species_result$taxon))

In [None]:
# Keep the taxa whose smallest p-value across all tests exceeds a threshold.
#
# result_df: output of test_batch_effects(), with `taxon` and `p_val` columns.
# threshold: single p-value cut-off. Defaults to the notebook-level `t`, so
#            existing one-argument calls behave as before.
# Returns one row per retained taxon with `smallest_p_val`, `log_p_val`
# (-log10 of the smallest p-value) and `taxon` as a factor ordered by
# `log_p_val`.
# NOTE(review): min() is taken without na.rm, so a taxon with any NA p-value
# gets an NA minimum and is dropped by the filter; confirm this is intended.
filter_results <- function(result_df, threshold = t) {
    stopifnot(is.numeric(threshold), length(threshold) == 1)

    filt_df <- result_df %>%
        group_by(taxon) %>%
        summarise(smallest_p_val = min(p_val)) %>%
        filter(smallest_p_val > threshold) %>%
        mutate(log_p_val = -log10(smallest_p_val),
               taxon = fct_reorder(taxon, log_p_val, .desc = FALSE))
    return(filt_df)
}

# Taxa that pass the p-value filter at each taxonomic level.
plot_genus <- filter_results(genus_result)
plot_species <- filter_results(species_result)

In [None]:
# Number of taxa retained after filtering.
length(unique(plot_genus$taxon))
length(unique(plot_species$taxon))

### Visualise results

In [None]:
# Bar chart of -log10(minimum p-value) per retained taxon.
#
# filt_df:    output of filter_results(), with `taxon` and `log_p_val`.
# taxon_name: label for the x axis.
# subset:     number of taxa to show, taken from the smallest `log_p_val`
#             upwards; NA (the default) or NULL plots every taxon.
# Returns the ggplot object.
plot_results <- function(filt_df, taxon_name, subset = NA) {
    if (is.null(subset) || is.na(subset)) {
        to_plot <- filt_df
    } else {
        # slice_head() returns no rows for n = 0, whereas slice(1:0) would
        # select row 1 because 1:0 counts backwards.
        to_plot <- filt_df %>%
            arrange(log_p_val) %>%
            slice_head(n = subset)
    }

    plt <- to_plot %>%
        ggplot(aes(x = taxon, y = log_p_val, fill = log_p_val)) +
        geom_bar(stat = "identity") +
        labs(x = taxon_name, y = "-lg(minimum p-value)") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none",
              plot.margin = margin(t = 0, r = 0, b = 0, l = 2, unit = "cm"))

    return(plt)
}

# Plot all retained genera, and the 30 retained species with the smallest
# -log10 p-values.
plot_results(plot_genus, "Genus", NA)
plot_results(plot_species, "Species", 30)

## Sanity check

#### Genera flagged as non-contaminants

In [None]:
# Attach each retained taxon's maximum abundance to its filter result.
#
# df:      table of abundances, one column per taxon.
# plot_df: output of filter_results(), with `taxon` and `log_p_val`.
# Returns `plot_df` with `taxon` renamed to `rank`, an added `max_abundance`
# column, and rows sorted by ascending `log_p_val`.
join_results <- function(df, plot_df) {
    long_df <- pivot_longer(df, everything(), names_to = "rank", values_to = "abundance")
    max_df <- summarise(group_by(long_df, rank), max_abundance = max(abundance))

    renamed_df <- rename(plot_df, rank = taxon)
    joined_df <- left_join(renamed_df, max_df, "rank")
    arrange(joined_df, log_p_val)
}

# Show the retained genera and species together with their maximum abundance.
join_results(X_genus, plot_genus)
join_results(X_species, plot_species)

#### Genera flagged as contaminants

In [None]:
# Genus-level test results for the source cohort, smallest p-value first.
arrange(filter(genus_result, meta_name == "source_cohort"), p_val)

#### Plot per-batch RA of contaminants vs. non-contaminants

In [None]:
# Box plot of one genus' abundance per source cohort.
#
# tax: name of a column in the combined X_genus / Y_genus table.
# Prints `tax`, then returns the ggplot object. Reads the notebook-level
# X_genus and Y_genus tables. Stops with an error when `tax` is not a column
# of the combined table.
plot_sanity_plot <- function(tax) {
    print(tax)
    plot_df <- cbind(X_genus, Y_genus)

    # Fail here rather than when the plot is rendered, e.g. for a genus that
    # was removed by the low-abundance filter.
    if (!(tax %in% colnames(plot_df))) {
        stop("Column '", tax, "' not found in the combined abundance/metadata table",
             call. = FALSE)
    }

    # The previously computed log_abundance column was never plotted here and
    # has been removed.
    temp_plt <- plot_df %>%
        ggplot(aes(x = source_cohort, y = .data[[tax]], fill = source_cohort)) +
        geom_point() +
        geom_boxplot(alpha = 0.5) +
        labs(x = "Source cohort", y = paste(tax, "% abundance")) +
        theme(legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1),
          text = element_text(size = 15))
    return(temp_plt)
}

In [None]:
# Per-cohort abundance plots for three genera of interest.
plot_sanity_plot("Pseudomonas")
plot_sanity_plot("Klebsiella")
plot_sanity_plot("Brachybacterium")


In [None]:
# Box plot of one genus' log10-transformed abundance per source cohort.
#
# tax: name of a column in the combined X_genus / Y_genus table.
# Prints `tax`, then returns the ggplot object. Reads the notebook-level
# X_genus and Y_genus tables. A pseudo-count of 0.0001 is added before the
# logarithm is taken.
plot_log_sanity_plot <- function(tax) {
    print(tax)

    combined_df <- cbind(X_genus, Y_genus)
    logged_df <- mutate(combined_df, log_abundance = log(get(tax) + 0.0001, base = 10))

    cohort_aes <- aes(x = source_cohort, y = log_abundance, fill = source_cohort)
    plot_theme <- theme(legend.position = "none",
                        axis.text.x = element_text(angle = 45, hjust = 1),
                        text = element_text(size = 15))

    temp_plt <- ggplot(logged_df, cohort_aes) +
        geom_point() +
        geom_boxplot(alpha = 0.5) +
        labs(x = "Source cohort", y = paste(tax, "lg(% abundance + 0.0001)")) +
        plot_theme
    return(temp_plt)
}

# Log-scale per-cohort plot for Klebsiella.
plot_log_sanity_plot("Klebsiella")