# Identication of putative contaminant taxa

In [112]:
# tempdir <- function() {return("~/.Rtmp")}

In [113]:
setwd("/home/projects/14001280/PROJECTS/blood_microbiome/")
require(tidyverse)
require(ggplot2)
require(data.table)
require(egg)
require(ANCOMBC)
require(phyloseq)

### Create Phyloseq object

#### Load abundance matrix

In [114]:
load_data <- function(file_path) {
    df <- fread(file_path) %>%
        separate(sample, into = c(NA, "npm_research_id"), sep = "_")
    df <- as.matrix(df,rownames=1)
    df <- otu_table(df, taxa_are_rows = F)
    return(df)
}

#### Load metadata

In [115]:
parse_metadata <- function(file_path, df) {
    meta <- as.matrix(fread(file_path, na.strings=c("", NA)), rownames = 1)
    meta <- meta[match(rownames(df), rownames(meta)), ]
    meta <- as.data.frame(meta[, colnames(meta) != "removal_requested_by_supplier"])
    meta <- sample_data(meta)
    return(meta)
}

### Run ANCOM-BC

Default parameters

In [116]:
get_meta_cols <- function(meta) {
    meta_cols <- colnames(meta)[grep("kit|flow_cell|instrument_id|source", colnames(meta))]
    to_exclude <- c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id")
    meta_cols <- meta_cols[!(meta_cols %in% to_exclude)]
    return(meta_cols)
}

In [117]:
print_group_freqs <- function(meta) {
    for (col in meta_cols) {
        tmp <- tibble(data.frame(meta)) %>%
            group_by(get(col)) %>%
            summarise(n = n())
        print(col)
        print(tmp)
    }
}

In [118]:
run_ancom_bc <- function(phy, meta, meta_cols) {
    result_list <- list()
    n_samples <- 10

    for (col in meta_cols) {
        print(col)

        # Remove levels with < 10 samples
        tmp <- tibble(data.frame(meta)) %>%
            group_by(get(col)) %>%
            summarise(n = n())

        # Vector of group levels to keep
        to_keep <- tmp[tmp$n >= n_samples, "get(col)"]$`get(col)`

        # Retrieve groupings
        var_values <- sample_data(phy)[[col]]

        # Prune groups
        phy_subset <- prune_samples(var_values %in% to_keep, phy)
        out <- ancombc(phyloseq = phy_subset, formula = col,
                      p_adj_method = "BH", zero_cut = 1, lib_cut = 0,
                      group = col, struc_zero = TRUE, neg_lb = FALSE,
                      tol = 1e-5, max_iter = 2, conserve = TRUE,
                      alpha = 0.05, global = FALSE)

        col_result <- out$res
        result_list[[col]] <- col_result
        return(result_list)
    }
}

### Combine non-differentially abundant taxa

In [126]:
get_non_contaminants <- function(result_list, meta_cols) {
    taxa_list <- list()
    
    for (col in meta_cols) {
        print(col)
        col_result <- result_list[[col]]
        diff <- apply(col_result$diff_abn, 1, sum)
        taxa <- data.frame(taxa = names(diff[diff <= 2]))$taxa
        taxa_list[[col]] <- taxa
        }
    
    non_contaminants <- Reduce(intersect, taxa_list)
    return(non_contaminants)
}

In [128]:
col_result

In [127]:
# # Load data
# species_df <- load_data("data/temp_files/07_abundance_matrix/subset_100.S.tsv")
# head(species_df)

# # Retain relevant metadata
# meta <- parse_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", species_df)

# # Get metadata columns of interest
# meta_cols <- get_meta_cols(meta)

# # Create phyloseq object
# phy <- phyloseq(species_df, meta)

# # Print batch levels and frequencies
# print_group_freqs(meta)

# # # Run ANCOM-BC
# result_list <- run_ancom_bc(phy, meta, meta_cols)

# Get list of non-contaminants
non_contaminants <- get_non_contaminants(result_list, meta_cols)

[1] "extraction_kit"
[1] "instrument_id"


ERROR: Error in apply(col_result$diff_abn, 1, sum): dim(X) must have a positive length
