# Identification of putative contaminant taxa

In [1]:
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
require(foreach)
require(tidyverse)
require(ggplot2)
require(data.table)
require(doParallel)
registerDoParallel(cores=6)

Loading required package: foreach

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32maccumulate()[39m masks [34mforeach[39m::accumulate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mwhen()[39m       masks [34

### Functions

#### Data preprocessing

In [2]:
# Read an abundance table and derive `npm_research_id` from its `sample` column.
#
# file_path: path handed to fread().
# Returns a data.frame in which `sample` has been split on "."; the first piece
# is discarded (NA in `into`) and the second is stored as `npm_research_id`.
load_data <- function(file_path) {
    abundance <- as.data.frame(fread(file_path))
    abundance <- separate(abundance, sample,
                          into = c(NA, "npm_research_id"), sep = "\\.")
    abundance
}


# Read the sample metadata and restrict it to the samples present in `df`.
#
# file_path: path handed to fread(); empty strings are read as NA.
# df:        table with an `npm_research_id` column; only metadata rows whose
#            id occurs there are kept.
# Returns the filtered metadata without `removal_requested_by_supplier`, with
# every missing value replaced by the string "unknown".
load_metadata <- function(file_path, df) {
    meta <- fread(file_path, na.strings=c("", NA))
    in_data <- filter(meta, npm_research_id %in% df$npm_research_id)
    trimmed <- select(in_data, -removal_requested_by_supplier)
    # Whatever is still missing becomes the literal "unknown"
    replace(trimmed, is.na(trimmed), "unknown")
}


# Randomly cap the number of samples kept per supplying site.
#
# meta:     metadata with `npm_research_id` and `site_supplying_sample` columns.
# n_subset: maximum number of ids retained for any one site.
# Returns `meta` (as a data.frame) restricted to the retained ids. Sites with
# more than `n_subset` ids are down-sampled with sample(), so the result
# depends on the RNG state; smaller sites are kept whole.
subset_metadata <- function(meta, n_subset) {
    meta <- as.data.frame(meta)
    sites <- unique(meta$site_supplying_sample)

    # One vector of retained ids per site, visited in order of first appearance
    ids_by_site <- lapply(sites, function(site) {
        site_ids <- meta$npm_research_id[meta$site_supplying_sample == site]
        if (length(site_ids) > n_subset) {
            sample(site_ids, n_subset)
        } else {
            site_ids
        }
    })
    kept_ids <- unlist(ids_by_site)

    filter(meta, npm_research_id %in% kept_ids)
}


# Keep only the rows of `df` whose `npm_research_id` also occurs in `meta`.
retrieve_rows_from_meta <- function(df, meta) {
    filter(df, npm_research_id %in% meta$npm_research_id)
}


# Drop the columns named in `col_to_exclude` from `df`.
# all_of() is strict: a name that is not a column of `df` raises an error.
remove_cols <- function(df, col_to_exclude) {
    select(df, -all_of(col_to_exclude))
}


# Drop taxa that are detected (value > 0) in too small a fraction of samples.
#
# df:            table with an `npm_research_id` column plus one numeric column
#                per taxon.
# frac_presence: a taxon is kept only if the fraction of rows with a value > 0
#                is strictly greater than this number.
# Returns `df` reduced to `npm_research_id` followed by the retained taxa, and
# prints how many taxa were retained.
#
# Taxon columns are identified by name rather than by position, so the id
# column does not have to be first, and a table with a single taxon column is
# handled (drop = FALSE keeps it two-dimensional).
remove_low_freq_taxa <- function(df, frac_presence) {
    taxa_cols <- setdiff(colnames(df), "npm_research_id")
    n_original <- length(taxa_cols)

    # Logical presence matrix; colMeans() of it is the per-taxon fraction present
    presence <- as.matrix(as.data.frame(df)[, taxa_cols, drop = FALSE]) > 0
    frac_present <- colMeans(presence)

    # which() discards NA comparisons instead of propagating them into names
    to_keep <- c("npm_research_id", taxa_cols[which(frac_present > frac_presence)])
    n_new <- length(to_keep) - 1
    print(str_glue("{n_new} / {n_original} taxa are present in {frac_presence} of samples"))
    return(df %>% select(all_of(to_keep)))
}


# Convert per-sample counts to relative abundances.
#
# df: table with an `npm_research_id` column; every other column is treated as
#     a taxon count.
# Returns a data.frame with `npm_research_id` first, followed by each count
# divided by the row total of its sample.
otu_to_RA <- function(df) {
    counts <- as.matrix(df[, colnames(df) != "npm_research_id"])
    # rowSums() is recycled down the columns, i.e. each row is divided by its own total
    fractions <- as.data.frame(counts / rowSums(counts))
    add_column(fractions, npm_research_id = df$npm_research_id, .before = 1)
}


# Select metadata column names by regular expression, minus an exclusion list.
#
# meta:       object whose colnames() are searched.
# meta_regex: pattern passed to grep().
# to_exclude: column names to drop from the matches.
# Returns the matching column names, in their original order.
get_meta_cols <- function(meta, meta_regex, to_exclude) {
    all_cols <- colnames(meta)
    matched <- all_cols[grep(meta_regex, all_cols)]
    excluded <- matched %in% to_exclude
    matched[!excluded]
}


# Turn a read-count table into a 0/1 presence table.
#
# df:             table with an `npm_research_id` column plus taxon counts.
# read_threshold: values <= threshold become 0, values > threshold become 1.
# Returns a data.frame of 0/1 values with `npm_research_id` moved to row names.
otu_to_PA <- function(df, read_threshold) {
    counts <- column_to_rownames(df, "npm_research_id")

    at_or_below <- counts <= read_threshold
    counts[at_or_below] <- 0
    # Evaluated after the zeroing step, in the same order as the two overwrites
    above <- counts > read_threshold
    counts[above] <- 1

    counts
}


# Turn a relative-abundance table into a 0/1 presence table.
#
# RA_df:        table with an `npm_research_id` column plus relative abundances.
# PA_threshold: values <= threshold become 0, values > threshold become 1.
# Returns a data.frame of 0/1 values with `npm_research_id` moved to row names.
RA_to_PA <- function(RA_df, PA_threshold) {
    abund <- column_to_rownames(RA_df, "npm_research_id")

    at_or_below <- abund <= PA_threshold
    abund[at_or_below] <- 0
    # Evaluated after the zeroing step, in the same order as the two overwrites
    above <- abund > PA_threshold
    abund[above] <- 1

    abund
}


# Apply clr() to the taxon columns and re-attach the sample ids.
#
# df: table with an `npm_research_id` column; all other columns are passed to
#     clr() together.
# Returns a data.frame with `npm_research_id` first, followed by the
# transformed columns (column names are not altered by check.names).
#
# NOTE(review): clr() is not defined or attached anywhere in the visible part
# of this file -- presumably it comes from a package loaded elsewhere; confirm.
RA_to_clr <- function(df) {
    taxa_only <- df[, colnames(df) != "npm_research_id"]
    transformed <- as.data.frame(clr(taxa_only), check.names = FALSE)
    ids <- data.frame(npm_research_id = df$npm_research_id)
    cbind(ids, transformed)
}


# Keep only the taxa whose column total exceeds `presence_t`.
#
# prev_df:    presence table (one column per taxon).
# presence_t: a taxon is retained when the sum of its column is > presence_t.
# Returns `prev_df` restricted to the retained columns.
filter_taxa_by_presence <- function(prev_df, presence_t) {
    n_present <- colSums(prev_df)
    retained <- names(n_present)[n_present > presence_t]
    select(prev_df, all_of(retained))
}


# Join metadata onto `dat` and keep only samples from well-populated levels of
# one metadata column.
#
# dat:     table with an `npm_research_id` column.
# metadat: metadata with `npm_research_id` and the column named by `column`.
# column:  name (string) of the metadata column whose levels are pruned.
# min_n:   minimum number of metadata rows a level needs to be kept. Defaults
#          to the `min_samples` variable visible from where this function is
#          defined, which is what the body previously read directly.
# Returns `dat` left-joined with `metadat`, restricted to rows whose `column`
# value is a retained level. Also prints how many levels survive.
#
# Levels named "unknown" are dropped regardless of letter case, so both a
# literal "Unknown" and the "unknown" placeholder that load_metadata()
# substitutes for missing values are excluded. NA levels are not counted.
filter_batch_levels <- function(dat, metadat, column, min_n = min_samples) {
    # Number of metadata rows per level of `column`
    level_sizes <- table(as.data.frame(metadat)[[column]])

    # Vector of group levels to keep
    to_keep <- names(level_sizes)[level_sizes >= min_n]
    to_keep <- to_keep[tolower(to_keep) != "unknown"]
    n_levels <- length(to_keep)

    if (n_levels < 2) {
        print(str_glue("After pruning, {column} has < 2 levels"))
    } else {
        print(str_glue("After pruning, {column} has {n_levels} levels"))
    }

    # Attach metadata, then drop samples from levels that were pruned.
    # .data[[column]] looks the name up in the joined table only, unlike get().
    dat_meta <- dat %>%
        left_join(metadat, by = "npm_research_id") %>%
        filter(.data[[column]] %in% to_keep)

    return(dat_meta)
}


# Build the presence/absence table used for prevalence-based decontamination.
#
# df:             count table with an `npm_research_id` column.
# meta_filt:      metadata; only samples listed here are kept.
# RA_threshold:   relative-abundance cutoff passed to RA_to_PA().
# read_threshold: read-count cutoff passed to otu_to_PA().
# presence_t:     cutoff passed to filter_taxa_by_presence().
# Returns a data.frame with `npm_research_id` plus one logical column per
# retained taxon; a taxon is present in a sample only if it passes BOTH the
# read-count and the relative-abundance cutoffs.
#
# Reads the variable `human` from the enclosing environment for the name of
# the host column to drop.
preprocess_data <- function(df, meta_filt, RA_threshold, read_threshold, presence_t) {
    # Samples of interest only, without the host and "unclassified" columns
    counts <- df %>%
        retrieve_rows_from_meta(meta_filt) %>%
        remove_cols(c(human, "unclassified"))

    passes_RA <- RA_to_PA(otu_to_RA(counts), RA_threshold)
    passes_reads <- otu_to_PA(counts, read_threshold)

    # Element-wise AND of the two 0/1 tables
    present <- as.data.frame(passes_reads & passes_RA)

    present %>%
        filter_taxa_by_presence(presence_t = presence_t) %>%
        rownames_to_column("npm_research_id")
}


#### Prevalence decontamination

In [3]:
# Look up the taxa flagged for one metadata column in the saved
# differential-prevalence results.
#
# column:      value matched against the `meta_col` column of the results.
# max_prev_t:  rows must have `max_prev` strictly above this.
# fold_diff_t: rows must have `fold_diff` strictly above this.
# Returns the `taxa` column of the matching rows.
#
# The results file name is built from the variables `RA_threshold` and
# `read_threshold`, which are not parameters and are taken from the enclosing
# environment.
get_global_contaminants <- function(column, max_prev_t, fold_diff_t) {
    results_file <- str_glue("results/decontamination/diff_prev_V3/decon_V3_raw.RA{RA_threshold}.read_threshold{read_threshold}.csv")
    flagged <- filter(fread(results_file),
                      meta_col == column,
                      max_prev > max_prev_t,
                      fold_diff > fold_diff_t)
    flagged$taxa
}

# Compute per-level prevalence of each candidate taxon and keep the
# (taxon, level) pairs that are both prevalent and enriched.
#
# level_c:     character vector of taxon names; each must be a column of
#              `prev_df`.
# prev_df:     presence table with an `npm_research_id` column.
# meta_filt:   metadata joined onto `prev_df` by filter_batch_levels().
# column:      name (string) of the metadata column defining the levels.
# max_prev_t:  a pair is kept only if its prevalence is > max_prev_t.
# fold_diff_t: a pair is kept only if prevalence / (minimum prevalence of that
#              taxon across levels) is > fold_diff_t.
# Returns a tibble with columns `taxa`, `min_prev`, the level column,
# `prevalence` and `fold_diff`.
get_level_prevalence <- function(level_c, prev_df, meta_filt, column, max_prev_t, fold_diff_t) {
    dat_meta <- filter_batch_levels(prev_df, meta_filt, column)

    # One small table per taxon: fraction of samples in each level carrying it.
    # .data[[taxon]] only ever resolves to a column, whereas get() could fall
    # through to an unrelated object of the same name.
    morsels <- foreach(taxon = level_c) %dopar% {
        dat_meta %>%
            group_by(across(all_of(column))) %>%
            summarise(prevalence = sum(.data[[taxon]]) / n()) %>%
            add_column(taxa = taxon)
    }

    stopImplicitCluster()

    res <- bind_rows(morsels)

    # Lowest prevalence of each taxon across levels, the fold-difference baseline
    min_df <- res %>%
        group_by(taxa) %>%
        summarise(min_prev = min(prevalence))

    # Explicit key: avoids the "Joining, by" message and accidental extra keys
    level_prevalence <- min_df %>%
        right_join(res, by = "taxa") %>%
        mutate(fold_diff = prevalence / min_prev) %>%
        filter(fold_diff > fold_diff_t,
               prevalence > max_prev_t)
    return(level_prevalence)
}



#### Correlation decontamination

In [4]:
# Correlate every candidate (non-contaminant) taxon with every known
# contaminant taxon.
#
# df:               data.frame with one numeric column per taxon.
# non_contaminants: character vector of column names to test.
# contaminants:     character vector of column names to test against.
# Returns a tibble with one row per (non-contaminant, contaminant) pair and
# columns `non_contaminant_taxon`, `contaminant_taxon`, `rho`.
#
# The coefficient is Spearman's rho. cor.test() computes Pearson's r unless
# `method` is given, which contradicted the naming used here (spearman_test,
# rho). exact = FALSE skips the exact p-value, which is not used and which
# triggers a warning whenever the data contain ties.
corr_decontam <- function(df, non_contaminants, contaminants) {
    nc_list <- foreach (non_contaminant_taxon = non_contaminants) %dopar% {
        # Build all rows for this taxon, then bind once (no growing tibble)
        pair_rows <- lapply(contaminants, function(contaminant_taxon) {
            spearman_test <- cor.test(df[[contaminant_taxon]],
                                      df[[non_contaminant_taxon]],
                                      method = "spearman",
                                      exact = FALSE)
            rho <- unname(spearman_test$estimate)
            print(str_glue("{non_contaminant_taxon} has correlation of {rho} with {contaminant_taxon}"))
            tibble(non_contaminant_taxon = non_contaminant_taxon,
                   contaminant_taxon = contaminant_taxon,
                   rho = rho)
        })

        bind_rows(pair_rows)
    }

    corr_df <- bind_rows(nc_list)
    return(corr_df)
}


# Split tested taxa into contaminants and non-contaminants by correlation.
#
# corr_df: tibble with columns `non_contaminant_taxon`, `contaminant_taxon`
#          and `rho`, as returned by corr_decontam().
# corr_t:  a tested taxon is called a contaminant if its rho with at least one
#          known contaminant is strictly greater than this.
# Returns a list with two one-column tibbles:
#   non_contaminants (column `non_contaminant_taxon`) and
#   contaminants     (column `contaminants`).
#
# The summary is computed from the `corr_df` argument; the body previously
# referred to a variable named `corr_res`, which is not a parameter and was
# only found if the caller happened to define it. Missing correlations are
# ignored when counting, so a taxon with an NA rho is still classified instead
# of vanishing from both lists. An empty `corr_df` yields two empty tibbles.
parse_corr_results <- function(corr_df, corr_t) {
    if (nrow(corr_df) == 0) {
        return(list(non_contaminants = tibble(non_contaminant_taxon = character()),
                    contaminants = tibble(contaminants = character())))
    }

    # Number of known contaminants each tested taxon correlates with
    parsed <- corr_df %>%
        group_by(non_contaminant_taxon) %>%
        summarise(n_corr = sum(rho > corr_t, na.rm = TRUE))

    contaminants <- parsed %>%
        filter(n_corr > 0) %>%
        distinct(non_contaminant_taxon) %>%
        rename(contaminants = non_contaminant_taxon)

    non_contaminants <- parsed %>%
        filter(n_corr == 0) %>%
        distinct(non_contaminant_taxon)

    return(list(non_contaminants = non_contaminants, contaminants = contaminants))
}

### Run decontamination

In [5]:
# ---- Run parameters ----
# Taxonomic rank tag inserted into the abundance matrix file name below
taxa_rank <- "S"
# Per-site sample cap handed to subset_metadata()
n <- 9999
# Name of the host column; read from the enclosing environment by preprocess_data()
human <- "Homo sapiens"
# Relative-abundance and read-count cutoffs: passed to preprocess_data() and
# also interpolated into the decontamination results file name
RA_threshold <- 0.01
read_threshold <- 10
# Prevalence and fold-difference cutoffs passed to get_global_contaminants()
# and get_level_prevalence()
max_prev_t <- 0.25
fold_diff_t <- 2
# Passed to preprocess_data(); taxa are kept when their presence count is > presence_t
presence_t <- 0
# Minimum samples per metadata level; read from the enclosing environment by
# filter_batch_levels()
min_samples <- 100
# Correlation cutoff passed to parse_corr_results()
corr_t <- 0.7

# Load data and remove low microbial read samples
to_retain <- fread("data/samples_above_100_microbial_reads.txt")$npm_research_id

df <- load_data(str_glue("data/temp_files_9999/07_abundance_matrix/abundance_matrix.subset_9999.{taxa_rank}.tsv")) %>% 
    filter(npm_research_id %in% to_retain)

meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df) %>% 
    filter(npm_research_id %in% to_retain)

# Get metadata columns of interest
meta_cols <- get_meta_cols(meta, 
                           meta_regex = "kit|flow_cell|site_supplying", 
                           to_exclude = c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id"))

# Get metadata subset
meta_filt <- subset_metadata(meta, n)

# Load global results
# NOTE(review): decon_raw is not referenced again in the visible cells;
# get_global_contaminants() reads the same file itself.
decon_raw <- fread(str_glue("results/decontamination/diff_prev_V3/decon_V3_raw.RA{RA_threshold}.read_threshold{read_threshold}.csv"))

# Preprocess data
prev_df <- preprocess_data(df, meta_filt, RA_threshold, read_threshold, presence_t)

# Get CLR matrix for correlation decontamination
# Columns are restricted to those of prev_df (sample id + retained taxa), then
# converted to relative abundance and passed through RA_to_clr().
# NOTE(review): rows come from `df`, which is filtered by to_retain only, not
# by meta_filt -- confirm this is intended.
clr_df <- df %>% 
    select(colnames(prev_df))
clr_df <- otu_to_RA(clr_df)
clr_df <- RA_to_clr(clr_df)

### Analyse global contaminant list to get level-specific contaminants

In [6]:
# Metadata column whose levels are analysed in the cells below; it is passed to
# get_global_contaminants() and get_level_prevalence() and used in the output
# file name. The commented-out line is an alternative choice.
column <- "site_supplying_sample"
# column <- "extraction_kit"

#### Get differentially prevalent taxa

In [7]:
# Taxa flagged for `column` in the saved differential-prevalence results
level_c <- get_global_contaminants(
    column,
    max_prev_t = max_prev_t,
    fold_diff_t = fold_diff_t
)

# (taxon, level) pairs among those taxa that pass the prevalence and
# fold-difference cutoffs within the current data
level_prevalence <- get_level_prevalence(
    level_c = level_c,
    prev_df = prev_df,
    meta_filt = meta_filt,
    column = column,
    max_prev_t = max_prev_t,
    fold_diff_t = fold_diff_t
)

After pruning, site_supplying_sample has 6 levels


Joining, by = "taxa"



#### Find taxa that are correlated within each level

In [8]:
# Iterate through all levels in meta column
# For each level of `column` present in level_prevalence, this takes the
# prevalence-based contaminants of that level, correlates every other taxon
# with them using only that level's samples, and produces a one-row summary.
# `morsels` ends up as a list with one tibble per level.
# NOTE(review): the error recorded under this cell mentions an object `contam`,
# which does not occur in this code -- the output may be stale; re-run to confirm.
level_list <- deframe(unique(level_prevalence[, column]))
morsels <- foreach (level = level_list) %do% {
#     level <- "QIAamp DNA Blood Mini Kit"
    # Prevalence-based contaminants for this level
    contams <- level_prevalence %>% 
        filter(get(column) == level)
    contams <- contams$taxa
    
    # Every other taxon column of prev_df is a candidate to test
    noncontam <- colnames(prev_df)
    noncontam <- noncontam[!(noncontam %in% c(contams, "npm_research_id"))]

    # Get samples from level
    meta_fine <- meta_filt %>% 
        filter(get(column) == level)
    
    clr_df_fine <- clr_df %>% 
        filter(npm_research_id %in% meta_fine$npm_research_id)

    # Correlate candidates with the contaminants, then split them using corr_t
    corr_res <- corr_decontam(clr_df_fine, noncontam, contams)
    parsed_res <- parse_corr_results(corr_res, corr_t = corr_t)
    corr_c <- parsed_res[["contaminants"]]
    corr_nc <- parsed_res[["non_contaminants"]]

    # Merge decontamination results into dataframe for convenience
    # (union of correlation-based and prevalence-based contaminants, "|"-separated)
    all_contams <- unique(c(corr_c$contaminants, contams))
    taxa_string <- paste0(all_contams, collapse = "|")
    
    return(tibble(meta_col = column, level = level, 
                  n_samples = nrow(clr_df_fine), n_prev_contams = length(contams),
                  n_all_contams = length(all_contams), contams = taxa_string))
}

ERROR: Error in {: task 1 failed - "object 'contam' not found"


#### Save results

In [None]:
# Stack the per-level summary rows and write them to a CSV named after `column`
final_results <- morsels %>%
    bind_rows()
fwrite(
    x = final_results,
    file = str_glue("results/decontamination/post_hoc_fine_grain/{column}_fine_grain_results.csv")
)