# Identification of putative contaminant taxa

In [1]:
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
require(foreach)
require(tidyverse)
require(ggplot2)
require(data.table)
require(doParallel)
require(compositions)
require(VennDiagram)
registerDoParallel(cores=9)

Loading required package: foreach

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32maccumulate()[39m masks [34mforeach[39m::accumulate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mwhen()[39m       masks [34

### Data processing functions

In [2]:
load_data <- function(file_path) {
    # Read the abundance table at `file_path` as a plain data.frame, then split
    # the `sample` column on "." and keep only the second piece as
    # `npm_research_id` (the first piece is discarded via the NA placeholder).
    abundance_tbl <- as.data.frame(fread(file_path))
    separate(abundance_tbl, sample, into = c(NA, "npm_research_id"), sep = "\\.")
}


load_metadata <- function(file_path, dat) {
    # Read the metadata table (empty strings are treated as missing), keep only
    # the rows whose `npm_research_id` occurs in `dat`, drop the
    # `removal_requested_by_supplier` column and recode every missing value as
    # the string "unknown".
    all_meta <- fread(file_path, na.strings=c("", NA))
    kept_meta <- filter(all_meta, npm_research_id %in% dat$npm_research_id)
    kept_meta <- select(kept_meta, -removal_requested_by_supplier)
    replace(kept_meta, is.na(kept_meta), "unknown")
}


subset_metadata <- function(meta, n_subset) {
    # Down-sample the metadata to at most `n_subset` randomly drawn samples per
    # value of `site_supplying_sample`. Sites with `n_subset` or fewer samples
    # are kept whole. Returns the matching rows of `meta` as a data.frame.
    meta <- as.data.frame(meta)
    picked <- list()

    for (site in unique(meta$site_supplying_sample)) {
        site_ids <- meta$npm_research_id[meta$site_supplying_sample == site]

        # Only over-represented sites are sampled; the rest pass through as-is.
        if (length(site_ids) > n_subset) {
            site_ids <- sample(site_ids, n_subset)
        }

        picked[[length(picked) + 1]] <- site_ids
    }

    filter(meta, npm_research_id %in% unlist(picked))
}


retrieve_rows_from_meta <- function(dat, meta) {
    # Keep the rows of `dat` whose `npm_research_id` also appears in `meta`.
    wanted_ids <- meta$npm_research_id
    filter(dat, npm_research_id %in% wanted_ids)
}


remove_cols <- function(dat, col_to_exclude) {
    # Drop the columns named in `col_to_exclude`; all_of() makes a missing
    # column an error rather than a silent no-op.
    select(dat, -all_of(col_to_exclude))
}


remove_low_freq_taxa <- function(dat, frac_presence) {
    # Keep only the taxa detected (value > 0) in more than `frac_presence` of
    # the samples.
    #
    # dat:           data.frame with an `npm_research_id` column plus one
    #                numeric column per taxon.
    # frac_presence: fraction of samples (rows); a taxon is retained when the
    #                fraction of rows with a value > 0 is strictly greater.
    #
    # Returns `dat` restricted to `npm_research_id` followed by the retained
    # taxa, and prints how many taxa passed the filter.
    #
    # Taxon columns are selected by name, so the ID column does not have to be
    # the first column, and `drop = FALSE` keeps a single taxon column from
    # collapsing into a bare vector.
    taxa_cols <- colnames(dat)[colnames(dat) != "npm_research_id"]
    n_original <- length(taxa_cols)

    is_present <- as.matrix(dat[, taxa_cols, drop = FALSE]) > 0
    frac_present <- colMeans(is_present)

    # which() treats an undefined fraction (no rows, or NA values) as
    # "threshold not exceeded" instead of producing an NA column name.
    to_keep <- c("npm_research_id", taxa_cols[which(frac_present > frac_presence)])
    n_new <- length(to_keep) - 1

    cat(paste0(n_new, " / ", n_original, " taxa are present in ",
               frac_presence, " of samples"), "\n", sep = "")
    return(dat[, to_keep, drop = FALSE])
}


otu_to_RA <- function(dat) {
    # Convert per-sample counts to relative abundances (each row divided by its
    # row total).
    #
    # dat: data.frame with an `npm_research_id` column plus one numeric column
    #      per taxon.
    #
    # Returns a data.frame with `npm_research_id` first, followed by the taxon
    # columns under their original names.
    #
    # NOTE: a sample whose counts sum to zero yields NaN (0 / 0) for every
    # taxon; this function does not filter such samples.
    is_taxon <- colnames(dat) != "npm_research_id"

    # drop = FALSE keeps the column name when there is exactly one taxon.
    mat <- as.matrix(dat[, is_taxon, drop = FALSE])
    RA_df <- as.data.frame(mat / rowSums(mat))

    # cbind() on data frames does not mangle names such as "Homo sapiens".
    RA_df <- cbind(data.frame(npm_research_id = dat$npm_research_id), RA_df)

    return(RA_df)
}


get_meta_cols <- function(meta, meta_regex, to_exclude) {
    # Names of the columns of `meta` that match `meta_regex`, minus any name
    # listed in `to_exclude`. Column order is preserved.
    all_cols <- colnames(meta)
    matches_regex <- grepl(meta_regex, all_cols)
    is_excluded <- all_cols %in% to_exclude
    all_cols[matches_regex & !is_excluded]
}


otu_to_PA <- function(dat, read_threshold) {
    # Turn a count table into presence/absence calls: `npm_research_id` becomes
    # the row names, entries at or below `read_threshold` are set to FALSE and
    # entries above it are set to TRUE.
    counts <- column_to_rownames(dat, "npm_research_id")

    at_or_below <- counts <= read_threshold
    counts[at_or_below] <- FALSE

    # Evaluated after the step above, on the already-updated table.
    above <- counts > read_threshold
    counts[above] <- TRUE

    counts
}


RA_to_PA <- function(RA_df, PA_threshold) {
    # Presence/absence calls from relative abundances: sample IDs move to the
    # row names, values <= `PA_threshold` become FALSE and values above it
    # become TRUE.
    calls <- column_to_rownames(RA_df, "npm_research_id")

    is_absent <- calls <= PA_threshold
    calls[is_absent] <- FALSE

    # The second mask is computed on the table as modified by the first step.
    is_present <- calls > PA_threshold
    calls[is_present] <- TRUE

    calls
}


RA_to_clr <- function(dat) {
    # Apply clr() to every column except `npm_research_id` and return the
    # result with the sample IDs re-attached as the first column.
    is_taxon <- colnames(dat) != "npm_research_id"
    clr_values <- as.data.frame(clr(dat[, is_taxon]), check.names = F)
    sample_ids <- data.frame(npm_research_id = dat$npm_research_id)
    cbind(sample_ids, clr_values)
}


filter_taxa_by_presence <- function(prev_df, presence_t) {
    # Keep the taxa (columns) that are present in more than `presence_t`
    # samples.
    #
    # prev_df:    data.frame of presence/absence calls (logical or 0/1), one
    #             column per taxon and one row per sample.
    # presence_t: a taxon is kept when its column sum is strictly greater.
    #
    # Returns `prev_df` restricted to the retained columns; row names are
    # preserved and the result stays a data.frame even with a single column.
    #
    # colSums() replaces apply(prev_df, 2, sum), which coerced the whole
    # data.frame to a matrix and looped over it in R.
    taxa_counts <- colSums(prev_df)
    to_keep <- names(taxa_counts)[taxa_counts > presence_t]
    return(prev_df[, to_keep, drop = FALSE])
}

### Decontamination functions

In [3]:
filter_batch_levels <- function(dat, metadat, column, min_samples = NULL) {
    # Join `dat` to `metadat` and keep only the samples whose level of the
    # batch variable `column` is represented by enough samples.
    #
    # dat:         per-sample table with an `npm_research_id` column.
    # metadat:     metadata with `npm_research_id` and the column named by
    #              `column`.
    # column:      name (string) of the batch variable.
    # min_samples: minimum number of metadata rows a level needs to be kept.
    #              When NULL, the global `min_samples` is used, which is what
    #              this function read implicitly before the argument existed.
    #
    # Returns `dat` left-joined to `metadat`, restricted to retained levels.
    # Prints the number of levels that survive pruning.
    if (is.null(min_samples)) {
        min_samples <- get("min_samples", envir = globalenv())
    }

    # Number of metadata rows per level of the batch variable
    level_counts <- as.data.frame(metadat) %>%
        group_by(across(all_of(column))) %>%
        summarise(n = n(), .groups = "drop")

    # Vector of group levels to keep
    to_keep <- level_counts[[column]][level_counts$n >= min_samples]

    # load_metadata() recodes missing values as lowercase "unknown", which the
    # previous exact comparison against "Unknown" never matched. Compare
    # case-insensitively so both spellings are excluded.
    # NOTE(review): this drops samples with missing batch information that
    # were previously analysed as their own level - confirm this is intended.
    to_keep <- to_keep[tolower(as.character(to_keep)) != "unknown"]
    n_levels <- length(to_keep)

    if (n_levels < 2) {
        print(str_glue("After pruning, {column} has < 2 levels"))
    } else {
        print(str_glue("After pruning, {column} has {n_levels} levels"))
    }

    # Attach metadata and drop samples belonging to pruned levels
    dat_meta <- dat %>%
        left_join(metadat, by = "npm_research_id") %>%
        filter(.data[[column]] %in% to_keep)

    return(dat_meta)
}


get_taxon_diff_prev <- function(dat_meta, column, taxon) {
    # Prevalence of one taxon within each level of one batch variable, reduced
    # to the most and least prevalent level.
    #
    # dat_meta: presence/absence table joined to metadata (see
    #           filter_batch_levels()).
    # column:   name (string) of the batch variable.
    # taxon:    name (string) of the taxon column.
    #
    # Returns a one-row tibble: taxa, meta_col, max_level, min_level, max_prev,
    # min_prev, fold_diff. `fold_diff` is max_prev / min_prev, so it is Inf
    # when the taxon is absent from at least one level and NaN when it is
    # absent from all of them.
    #
    # `.data[[...]]` is used instead of get(): get() silently falls back to an
    # unrelated object of the same name when the column does not exist.
    prev_stats <- dat_meta %>%
        group_by(across(all_of(column))) %>%
        summarise(prevalence = sum(.data[[taxon]]) / n(), .groups = "drop")

    batch_levels <- prev_stats[[column]]
    max_prev <- max(prev_stats$prevalence)
    min_prev <- min(prev_stats$prevalence)

    # On ties the first level in group order is reported.
    max_level <- batch_levels[which(prev_stats$prevalence == max_prev)][1]
    min_level <- batch_levels[which(prev_stats$prevalence == min_prev)][1]

    tibble(taxa = taxon, meta_col = column,
           max_level = max_level, min_level = min_level,
           max_prev = max_prev, min_prev = min_prev,
           fold_diff = max_prev / min_prev)
}

get_diff_prev <- function(dat, metadat, meta_cols, taxa_vec, prev_threshold, min_samples) {
    # Differential prevalence of every taxon across every batch variable.
    #
    # dat:            presence/absence table with an `npm_research_id` column.
    # metadat:        sample metadata.
    # meta_cols:      names of the batch variables to test.
    # taxa_vec:       names of the taxon columns to test.
    # prev_threshold: unused; kept so existing calls keep working.
    # min_samples:    minimum number of samples per batch level, forwarded to
    #                 filter_batch_levels(). It used to be ignored in favour of
    #                 the global of the same name; if omitted, that global is
    #                 still used.
    #
    # Returns one row per (taxon, batch variable), see get_taxon_diff_prev().
    batch_min <- if (missing(min_samples)) NULL else min_samples

    morsels <- foreach (column = meta_cols) %do% {
        dat_meta <- filter_batch_levels(dat, metadat, column, min_samples = batch_min)

        # Taxa are independent of each other, so they are processed in parallel.
        crumbs <- foreach (taxon = taxa_vec) %dopar% {
            get_taxon_diff_prev(dat_meta, column, taxon)
        }

        stopImplicitCluster()

        bind_rows(crumbs)
    }

    bind_rows(morsels)
}


preprocess_data <- function(dat, meta_filt, RA_threshold, read_threshold, presence_t, human = NULL) {
    # Build the per-sample presence/absence table used for decontamination.
    #
    # dat:            abundance table with an `npm_research_id` column.
    # meta_filt:      metadata; only samples listed here are kept.
    # RA_threshold:   relative-abundance cut-off passed to RA_to_PA().
    # read_threshold: read-count cut-off passed to otu_to_PA().
    # presence_t:     taxa present in `presence_t` or fewer samples are dropped.
    # human:          name(s) of the host column(s) removed together with
    #                 "unclassified". When NULL, the global `human` is used,
    #                 which is what this function read implicitly before the
    #                 argument existed.
    #
    # Returns a data.frame with `npm_research_id` plus one logical column per
    # retained taxon. A taxon is called present in a sample only when it
    # passes both the read-count and the relative-abundance threshold.
    if (is.null(human)) {
        human <- get("human", envir = globalenv())
    }

    # Filter data
    df_filt <- retrieve_rows_from_meta(dat, meta_filt)
    df_filt2 <- remove_cols(df_filt, c(human, "unclassified"))
    RA_df <- otu_to_RA(df_filt2)
    prev_RA <- RA_to_PA(RA_df, RA_threshold)
    prev_read <- otu_to_PA(df_filt2, read_threshold)
    prev_df <- as.data.frame(prev_read & prev_RA)
    # With presence_t = 0 this removes taxa that are not present in any sample
    prev_df <- filter_taxa_by_presence(prev_df, presence_t = presence_t)
    prev_df <- prev_df %>% rownames_to_column("npm_research_id")
    return(prev_df)
}

decontaminate <- function(dat, meta_filt, meta_cols, human, RA_threshold, read_threshold, presence_t, min_samples, taxa_vec = NA) {
    # Run the differential-prevalence analysis end to end.
    #
    # dat, meta_filt, RA_threshold, read_threshold, presence_t:
    #              see preprocess_data().
    # meta_cols:   names of the batch variables to test.
    # human:       host column name(s), forwarded to preprocess_data(). It used
    #              to be ignored in favour of the global of the same name; if
    #              omitted, that global is still used.
    # min_samples: minimum number of samples per batch level.
    # taxa_vec:    taxa to test; NA (default) or NULL means every taxon that
    #              survives preprocessing.
    #
    # Returns the table produced by get_diff_prev().
    host_cols <- if (missing(human)) NULL else human

    # Preprocess df
    prev_df <- preprocess_data(dat, meta_filt, RA_threshold, read_threshold,
                               presence_t, human = host_cols)

    # Get list of taxa. `if (is.na(taxa_vec))` fails as soon as more than one
    # taxon is supplied, so only a single NA (or NULL) counts as "not given".
    taxa_not_given <- is.null(taxa_vec) ||
        (length(taxa_vec) == 1 && is.na(taxa_vec))

    if (taxa_not_given) {
        taxa_vec <- colnames(prev_df)
        taxa_vec <- taxa_vec[taxa_vec != "npm_research_id"]
    }

    # Differential prevalence
    res <- get_diff_prev(prev_df, meta_filt, meta_cols, taxa_vec, min_samples = min_samples)

    return(res)
}

### Run decontamination

In [4]:
# Analysis parameters
human <- "Homo sapiens"   # column removed before relative abundances are computed
RA_threshold <- 0.005     # relative-abundance cut-off for calling a taxon present
read_threshold <- 10      # read-count cut-off for calling a taxon present
presence_t <- 0           # taxa present in this many samples or fewer are dropped
min_samples <- 100        # minimum number of samples per batch level

# Load data and remove low microbial read samples
to_retain <- fread("data/samples_above_100_microbial_reads.txt")$npm_research_id

df <- filter(
    load_data(str_glue("data/taxonomic_profiles/07_abundance_matrix/abundance_matrix.S.pipeline2_210322.tsv")),
    npm_research_id %in% to_retain
)

meta <- filter(
    load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df),
    npm_research_id %in% to_retain
)

# Get metadata columns of interest
meta_cols <- get_meta_cols(
    meta,
    meta_regex = "kit|flow_cell|site_supplying",
    to_exclude = c("hiseq_xtm_flow_cell_v2_5_id")
)

# Get abundance matrix of non-zero taxa
df_zeroed <- preprocess_data(df, meta, RA_threshold, read_threshold, presence_t)
# fwrite(df_zeroed, str_glue("results/decontamination/prevalence_RA{RA_threshold}_read{read_threshold}.csv"))

# Run decontamination
decon_raw <- decontaminate(
    df,
    meta_filt = meta,
    meta_cols = meta_cols,
    human = human,
    RA_threshold = RA_threshold,
    read_threshold = read_threshold,
    presence_t = presence_t,
    min_samples = min_samples
)

After pruning, site_supplying_sample has 6 levels
After pruning, extraction_kit has 6 levels
After pruning, library_prep_kit has 3 levels
After pruning, hiseq_xtm_sbs_kit_300_cycles_v2__box_1of_2__lot has 20 levels
After pruning, hiseq_xtm_sbs_kit_300_cycles_v2__box_2_of_2__lot has 19 levels
After pruning, hiseq_xtm_pe_cluster_kit_cbottm_v2__box_1_of_2__lot has 19 levels
After pruning, hiseq_xtm_pe_cluster_kit_cbottm_v2__box_2_of_2__lot has 23 levels
After pruning, hiseq_xtm_flow_cell_v2_5_lot has 19 levels


In [5]:
# Count putative contaminants for every combination of the two cut-offs
morsels <- foreach(max_prev_t = c(0.25, 0.5, 0.75)) %do% {
    crumbs <- foreach(fold_diff_t = c(2, 5, 10)) %do% {
        # Taxa exceeding both cut-offs for at least one batch variable
        temp_contam <- decon_raw %>%
            filter(max_prev > max_prev_t, fold_diff > fold_diff_t) %>%
            distinct(taxa)

        tibble(max_prev = max_prev_t,
               fold_diff = fold_diff_t,
               n_contam = nrow(temp_contam))
    }

    bind_rows(crumbs)
}

bind_rows(morsels)

max_prev,fold_diff,n_contam
<dbl>,<dbl>,<int>
0.25,2,160
0.25,5,160
0.25,10,160
0.5,2,118
0.5,5,118
0.5,10,118
0.75,2,83
0.75,5,83
0.75,10,83


In [6]:
# Cut-offs chosen for the final contaminant call
max_prev_t <- 0.25
fold_diff_t <- 2

# Contaminants: taxa exceeding both cut-offs for at least one batch variable
contam <- decon_raw %>%
    filter(max_prev > max_prev_t, fold_diff > fold_diff_t) %>%
    distinct(taxa)

# Non-contaminants: every remaining taxon column of the prevalence table
nc <- colnames(df_zeroed)
nc <- tibble(taxa = nc[!(nc %in% c(contam$taxa, "npm_research_id"))])

In [7]:
# Number of contaminant and non-contaminant taxa (one row per taxon)
nrow(contam)
nrow(nc)

#### Determine how many taxa were not analysable

In [8]:
# A taxon is "not analysable" when its highest per-level prevalence never
# exceeds the prevalence cut-off for any batch variable, so it can never be
# called a contaminant. The cut-off is read from `max_prev_t` (set in the cell
# above) instead of repeating the literal 0.25, so both stay in sync.
not_analysable <- decon_raw %>%
    group_by(taxa) %>%
    summarise(max_max_prev = max(max_prev)) %>%
    filter(max_max_prev <= max_prev_t)

# How many of the non-contaminants fall in that category
sum(not_analysable$taxa %in% nc$taxa)

In [9]:
# Raw differential-prevalence table
fwrite(
    decon_raw,
    file = str_glue("results/decontamination/diff_prev_V3_no_split/decon_V3_raw.RA{RA_threshold}.read_threshold{read_threshold}.csv")
)

# Taxa called contaminants at the chosen cut-offs
fwrite(
    contam,
    file = str_glue("results/decontamination/diff_prev_V3_no_split/contaminants.RA{RA_threshold}.read_threshold{read_threshold}.max_prev{max_prev_t}.fold_diff{fold_diff_t}.txt")
)

# Remaining taxa
fwrite(
    nc,
    file = str_glue("results/decontamination/diff_prev_V3_no_split/noncontaminants.RA{RA_threshold}.read_threshold{read_threshold}.max_prev{max_prev_t}.fold_diff{fold_diff_t}.txt")
)


### Sanity check

In [10]:
# Plot the per-batch prevalence of selected taxa for one batch variable.
column <- "extraction_kit"
# column <- "hiseq_xtm_sbs_kit_300_cycles_v2__box_2_of_2__lot"

# NOTE(review): the plots are written to diff_prev_V3/ while the tables in this
# notebook go to diff_prev_V3_no_split/ - confirm the directory is intended.
for (taxon in c("Stenotrophomonas maltophilia", "Alcaligenes faecalis")) {
    # Explicit join key: without `by`, every shared column name (including any
    # taxon that happens to share a name with a metadata field) would be used.
    plot_df <- df_zeroed %>%
        left_join(meta, by = "npm_research_id") %>%
        group_by(across(all_of(column))) %>%
        summarise(prevalence = sum(.data[[taxon]]) / n(), .groups = "drop") %>%
        # Missing metadata is recoded as lowercase "unknown" by
        # load_metadata(), so the comparison is case-insensitive.
        filter(tolower(.data[[column]]) != "unknown")

    batch_plot <- ggplot(plot_df, aes(x = .data[[column]], y = prevalence)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        ylim(0, 1) +
        labs(x = column, y = "Prevalence", title = taxon)

    # Pass the plot explicitly instead of relying on ggplot2's last_plot().
    ggsave(str_glue("results/decontamination/diff_prev_V3/{taxon}.{column}.batch_prevalence.png"),
           plot = batch_plot, dpi = 300)
}

Joining, by = "npm_research_id"
Saving 6.67 x 6.67 in image

Joining, by = "npm_research_id"
Saving 6.67 x 6.67 in image

