# Identification of putative contaminant taxa

In [1]:
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
require(foreach)
require(tidyverse)
require(ggplot2)
require(data.table)

Loading required package: foreach

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.3     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32maccumulate()[39m masks [34mforeach[39m::accumulate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mwhen()[39m       masks [34

In [2]:
# Read an abundance matrix and derive the sample identifier column.
#
# file_path: path handed to fread().
# Returns a data.frame in which the `sample` column has been split on "."
# (the first piece is discarded, the second becomes `npm_research_id`).
load_data <- function(file_path) {
    abundance <- fread(file_path)
    abundance <- as.data.frame(abundance)
    separate(abundance, sample, into = c(NA, "npm_research_id"), sep = "\\.")
}


# Read the sample metadata and restrict it to the samples present in `df`.
#
# file_path: path handed to fread(); empty strings are read as NA.
# df:        table with an `npm_research_id` column; only metadata rows whose
#            id occurs in it are kept.
# Returns the filtered metadata without `removal_requested_by_supplier` and
# with every NA cell replaced by the string "unknown".
load_metadata <- function(file_path, df) {
    raw_meta <- fread(file_path, na.strings=c("", NA))
    matched <- filter(raw_meta, npm_research_id %in% df$npm_research_id)
    trimmed <- select(matched, -removal_requested_by_supplier)
    replace(trimmed, is.na(trimmed), "unknown")
}


# Keep at most `n_subset` samples per supplying site.
#
# meta:     metadata with `npm_research_id` and `site_supplying_sample`.
# n_subset: maximum number of samples retained for each site; sites with more
#           samples than this are randomly down-sampled with sample().
# Returns the metadata rows (as a data.frame) of the retained samples.
subset_metadata <- function(meta, n_subset) {
    meta <- as.data.frame(meta)

    # Ids retained for one site: all of them, or a random draw of n_subset.
    pick_ids <- function(site) {
        site_ids <- meta$npm_research_id[meta$site_supplying_sample == site]
        if (length(site_ids) > n_subset) {
            return(sample(site_ids, n_subset))
        }
        site_ids
    }

    # Sites are visited in order of first appearance, as in a plain loop.
    chosen_ids <- unlist(lapply(unique(meta$site_supplying_sample), pick_ids))

    filter(meta, npm_research_id %in% chosen_ids)
}


# Keep only the rows of `df` whose `npm_research_id` also occurs in `meta`.
retrieve_rows_from_meta <- function(df, meta) {
    filter(df, npm_research_id %in% meta$npm_research_id)
}


# Drop the columns named in `col_to_exclude` from `df`.
# all_of() makes this an error if any of the named columns is missing.
remove_cols <- function(df, col_to_exclude) {
    select(df, -all_of(col_to_exclude))
}


# Drop taxa that are detected in too few samples.
#
# df:            one row per sample; an `npm_research_id` column plus one
#                numeric count column per taxon.
# frac_presence: a taxon is kept only if the fraction of samples with a
#                count > 0 is strictly greater than this value.
# Prints how many taxa pass the filter and returns `df` restricted to
# `npm_research_id` (first) followed by the retained taxa.
remove_low_freq_taxa <- function(df, frac_presence) {
    # Select taxa by name rather than by position, so the id column may sit
    # anywhere and a single-taxon table is not collapsed to a vector.
    taxa_cols <- colnames(df)[colnames(df) != "npm_research_id"]
    n_original <- length(taxa_cols)

    # Fraction of samples in which each taxon has a non-zero count.
    present <- df[, taxa_cols, drop = FALSE] > 0
    frac_present <- colSums(present) / nrow(df)

    # which() ignores taxa whose fraction is NA instead of propagating NA names.
    to_keep <- c("npm_research_id", taxa_cols[which(frac_present > frac_presence)])
    n_new <- length(to_keep) - 1
    print(str_glue("{n_new} / {n_original} taxa are present in {frac_presence} of samples"))
    return(df[, to_keep, drop = FALSE])
}


# Convert a count table to per-sample relative abundances.
#
# df: one row per sample; an `npm_research_id` column plus numeric taxon
#     columns.
# Returns a data.frame with `npm_research_id` as the first column followed by
# the taxon columns, each value divided by its row total. Rows whose total is
# zero yield NaN.
otu_to_RA <- function(df) {
    is_taxon <- colnames(df) != "npm_research_id"

    # drop = FALSE keeps a one-taxon table as a data.frame; dividing by the
    # row totals works for any number of taxon columns.
    counts <- as.data.frame(df)[, is_taxon, drop = FALSE]
    RA_df <- counts / rowSums(counts)

    RA_df <- data.frame(npm_research_id = df[["npm_research_id"]], RA_df,
                        check.names = FALSE, stringsAsFactors = FALSE)

    return(RA_df)
}


# Build one bar chart of level frequencies per metadata column.
#
# meta:      metadata table; it is converted to a character tibble first.
# meta_cols: names of the columns to plot.
# Returns a list of ggplot objects named by column; each shows the number of
# rows per level with the count printed on the bar.
get_metadata_plots <- function(meta, meta_cols) {
    # The character conversion does not depend on the column being plotted.
    meta_chr <- as_tibble(as.matrix(meta), rownames = "sample") %>%
        mutate(across(everything(), as.character))

    plots <- list()

    for (column in meta_cols) {
        level_counts <- meta_chr %>%
            select(all_of(column)) %>%
            group_by_at(column) %>%
            summarise(n = n())

        plots[[column]] <- ggplot(level_counts, aes_string(x = column, y = "n", fill = column)) +
            geom_bar(stat = "identity") +
            theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
            geom_text(aes_string(label = "n"))
    }

    plots
}


# Select the metadata columns used as batch variables.
#
# Returns the column names of `meta` that match "kit", "flow_cell" or
# "instrument_id", minus the two explicitly excluded columns.
get_meta_cols <- function(meta) {
    all_cols <- colnames(meta)
    excluded <- c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id")

    is_batch_col <- grepl("kit|flow_cell|instrument_id", all_cols)
    is_excluded <- all_cols %in% excluded

    all_cols[is_batch_col & !is_excluded]
}


# Print, for each column in `meta_cols`, the column name followed by a table
# of how many metadata rows fall in each of its levels.
print_group_freqs <- function(meta, meta_cols) {
    for (col in meta_cols) {
        meta_tbl <- tibble(data.frame(meta))
        # `get(col)` is kept verbatim: it becomes the header of the printed table.
        by_level <- group_by(meta_tbl, get(col))
        freqs <- summarise(by_level, n = n())

        print(col)
        print(freqs)
    }
}


# Prevalence within a batch: the sum of `x` divided by its length
# (for a 0/1 or logical vector, the fraction of positive entries).
get_batch_prevalence <- function(x) {
    n_total <- length(x)
    n_present <- sum(x)
    n_present / n_total
}


# Keep only the samples whose id is listed in
# "data/samples_above_10_reads.txt" (column `npm_research_id`).
remove_low_read_samples <- function(df) {
    passing_ids <- fread("data/samples_above_10_reads.txt")$npm_research_id
    filter(df, npm_research_id %in% passing_ids)
}


# Drop samples whose taxon counts sum to zero.
#
# df: one row per sample; an `npm_research_id` column plus numeric taxon
#     columns.
# Prints how many samples were removed and returns the remaining rows as a
# data.frame with the original columns.
remove_empty_rows <- function(df) {
    is_taxon <- colnames(df) != "npm_research_id"

    # Vectorised row totals: avoids per-row evaluation and does not need a
    # temporary `total` column that could collide with a taxon of that name.
    totals <- rowSums(df[, is_taxon, drop = FALSE])

    # which() drops rows whose total is NA, as a filter on `total != 0` would.
    df_filt <- as.data.frame(df)[which(totals != 0), , drop = FALSE]
    rownames(df_filt) <- NULL

    n_original <- nrow(df)
    n_removed <- n_original - nrow(df_filt)

    print(str_glue("{n_removed}/{n_original} samples removed due to having no reads of interest"))

    return(df_filt)
}

#### Test for differential prevalence

In [3]:
# Test every taxon for differential prevalence across the levels of each
# batch (metadata) column.
#
# dat:            abundance table, one row per sample, with `npm_research_id`
#                 plus one numeric column per taxon.
# metadat:        sample metadata with `npm_research_id` and the columns
#                 named in `meta_cols`.
# meta_cols:      metadata columns to test.
# read_threshold: a taxon counts as present in a sample when its value is
#                 >= this threshold.
# prev_threshold: a taxon is flagged when max prevalence / min prevalence
#                 across levels exceeds this value.
# min_samples:    levels with fewer metadata rows than this are ignored.
#
# Returns a tibble with one row per (taxon, metadata column): taxa, meta_col,
# max_level, min_level, max_prev, min_prev, fold_diff, diff_prev.
# Metadata columns left with fewer than two usable levels contribute no rows.
run_diff_prev <- function(dat, metadat, meta_cols, read_threshold, prev_threshold, min_samples) {
    morsels <- lapply(meta_cols, function(column) {
        print(column)

        # Number of metadata rows per level of this column.
        level_counts <- as_tibble(as.data.frame(metadat)) %>%
            group_by(across(all_of(column))) %>%
            summarise(n = n(), .groups = "drop")

        # Levels to keep: enough samples, and not the missing-value placeholder.
        # The comparison is case-insensitive so that both "Unknown" and the
        # "unknown" written by load_metadata() are excluded.
        to_keep <- level_counts[[column]][level_counts$n >= min_samples]
        to_keep <- to_keep[!is.na(to_keep) & tolower(to_keep) != "unknown"]
        n_levels <- length(to_keep)

        if (n_levels < 2) {
            # Nothing to compare against: skip instead of emitting
            # meaningless fold differences for this column.
            print(str_glue("After pruning, {column} has < 2 levels"))
            return(NULL)
        }
        print(str_glue("After pruning, {column} has {n_levels} levels"))

        # Restrict metadata and abundance table to the same set of samples.
        metadat_filt <- as.data.frame(metadat) %>% filter(.data[[column]] %in% to_keep)
        dat_filt <- retrieve_rows_from_meta(dat, metadat_filt)
        metadat_filt <- metadat_filt %>% filter(npm_research_id %in% dat_filt$npm_research_id)

        # Convert abundance table to presence absence table
        taxa_vec <- colnames(dat_filt)[colnames(dat_filt) != "npm_research_id"]
        prev_filt <- dat_filt[, taxa_vec, drop = FALSE]
        prev_filt[prev_filt < read_threshold] <- 0
        prev_filt[prev_filt >= read_threshold] <- 1
        prev_filt$npm_research_id <- dat_filt$npm_research_id

        # Join and group once per metadata column rather than once per taxon.
        by_batch <- prev_filt %>%
            left_join(metadat_filt, by = "npm_research_id") %>%
            group_by(across(all_of(column)))

        crumbs <- lapply(taxa_vec, function(taxon) {
            # Batch sizes come from the grouped rows themselves, so they are
            # always aligned with the per-level sums.
            prev_stats <- by_batch %>%
                summarise(n_present = sum(.data[[taxon]]), batch_n = n(), .groups = "drop") %>%
                mutate(prevalence = n_present / batch_n)

            max_prev <- max(prev_stats$prevalence)
            min_prev <- min(prev_stats$prevalence)
            # Inf when the taxon is absent from at least one level but present
            # in another; NaN (0 / 0) when it is absent from every level.
            fold_diff <- max_prev / min_prev

            tibble(taxa = taxon, meta_col = column,
                   max_level = prev_stats[[column]][which.max(prev_stats$prevalence)],
                   min_level = prev_stats[[column]][which.min(prev_stats$prevalence)],
                   max_prev = max_prev, min_prev = min_prev,
                   fold_diff = fold_diff,
                   # NA when fold_diff is NaN.
                   diff_prev = fold_diff > prev_threshold)
        })

        bind_rows(crumbs)
    })

    # bind_rows() ignores the NULL entries of skipped columns.
    result_df <- bind_rows(morsels)
    return(result_df)
}

#### Diff. prevalence decontamination procedure

In [4]:
# Run the differential-prevalence procedure for one taxonomic rank.
#
# rank:           rank code used in the abundance-matrix file name.
# n_subset:       subset size used in the abundance-matrix directory/file name.
# n:              maximum number of samples kept per site (see subset_metadata).
# human:          name of the host taxon column to drop (with "unclassified").
# read_threshold: presence threshold applied to the relative abundances.
# site:           value of `site_supplying_sample` to analyse.
# frac_presence:  taxa must exceed this fraction of samples to be kept.
# prev_threshold: fold difference in prevalence above which a taxon is flagged.
# min_samples:    minimum number of samples for a batch level to be compared.
#
# The defaults of the last four arguments are the values that were previously
# fixed inside the function. Returns the tibble produced by run_diff_prev().
decontaminate <- function(rank, n_subset, n, human, read_threshold,
                          site = "HELIOS", frac_presence = 0.05,
                          prev_threshold = 2, min_samples = 5) {
    # Load data and remove low microbial read samples
    to_retain <- fread("data/samples_above_95_reads.txt")$npm_research_id
    df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.{rank}.tsv")) %>% 
        filter(npm_research_id %in% to_retain)
    # .env$site refers to the function argument even if the metadata happens
    # to contain a column called `site`.
    meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df) %>%
        filter(npm_research_id %in% to_retain, site_supplying_sample == .env$site)
    
    # Get metadata subset
    meta_filt <- subset_metadata(meta, n)

    # Filter data, then convert the remaining counts to relative abundances
    df_filt <- retrieve_rows_from_meta(df, meta_filt)
    df_filt2 <- remove_cols(df_filt, c(human, "unclassified"))
    df_filt3 <- remove_low_freq_taxa(df_filt2, frac_presence = frac_presence)
    df_filt4 <- remove_empty_rows(df_filt3)
    df_filt4 <- otu_to_RA(df_filt4)

    # Get metadata columns of interest
    meta_cols <- get_meta_cols(meta_filt)
    
    # Differential prevalence
    res <- run_diff_prev(df_filt4, meta_filt, meta_cols,
                         read_threshold = read_threshold,
                         prev_threshold = prev_threshold,
                         min_samples = min_samples)
    
    return(res)
}


# Split taxa into contaminants and non-contaminants.
#
# decontam_res: table with a `taxa` column and a logical `diff_prev` column,
#               one row per (taxon, metadata column).
# Returns list(non_contaminants, contaminants): taxa never flagged versus taxa
# flagged for at least one metadata column. sum() is used without na.rm, so a
# taxon with any NA in `diff_prev` ends up in neither vector.
parse_decontamination_results <- function(decontam_res) {
    # Number of metadata columns for which each taxon was flagged.
    flag_counts <- decontam_res %>%
        group_by(taxa) %>%
        summarise(n_diff = sum(diff_prev))

    never_flagged <- which(flag_counts$n_diff == 0)
    flagged <- which(flag_counts$n_diff > 0)

    list(non_contaminants = flag_counts$taxa[never_flagged],
         contaminants = flag_counts$taxa[flagged])
}


In [5]:
# Genus level ("G"): run the differential-prevalence procedure with a presence
# threshold of 0.001, classify the taxa, and save the raw per-taxon results.
full_genus_df <- decontaminate("G", 9999, 9999, "Homo", 0.001)
full_genus_res <- parse_decontamination_results(full_genus_df)
fwrite(full_genus_df, str_glue("results/decontamination/diff_prev_raw/diff_prev_results.9999.G.HELIOS.RA.csv"))

# Species level ("S"): same steps with the species-level host column name.
full_species_df <- decontaminate("S", 9999, 9999, "Homo sapiens", 0.001)
full_species_res <- parse_decontamination_results(full_species_df)
fwrite(full_species_df, str_glue("results/decontamination/diff_prev_raw/diff_prev_results.9999.S.HELIOS.RA.csv"))

# Save the genus-level non-contaminant and contaminant lists.
# NOTE(review): the genus contaminant file name below is the only one of the
# four list files without ".HELIOS" in it - confirm whether that is intended.
fwrite(tibble(non_contaminants = full_genus_res[["non_contaminants"]]), str_glue("results/decontamination/prevalence.noncontam.G.HELIOS.n9999.RA.txt"))
fwrite(tibble(contaminants = full_genus_res[["contaminants"]]), str_glue("results/decontamination/prevalence.contam.G.n9999.RA.txt"))

# Save the species-level non-contaminant and contaminant lists.
fwrite(tibble(non_contaminants = full_species_res[["non_contaminants"]]), str_glue("results/decontamination/prevalence.noncontam.S.HELIOS.n9999.RA.txt"))
fwrite(tibble(contaminants = full_species_res[["contaminants"]]), str_glue("results/decontamination/prevalence.contam.S.HELIOS.n9999.RA.txt"))


600 / 1492 taxa are present in 0.05 of samples
0/2285 samples removed due to having no reads of interest
[1] "extraction_kit"
After pruning, extraction_kit has < 2 levels
[1] "instrument_id"
After pruning, instrument_id has 5 levels
[1] "hiseq_xtm_sbs_kit_300_cycles_v2__box_1of_2__lot"
After pruning, hiseq_xtm_sbs_kit_300_cycles_v2__box_1of_2__lot has 15 levels
[1] "hiseq_xtm_sbs_kit_300_cycles_v2__box_2_of_2__lot"
After pruning, hiseq_xtm_sbs_kit_300_cycles_v2__box_2_of_2__lot has 17 levels
[1] "hiseq_xtm_pe_cluster_kit_cbottm_v2__box_1_of_2__lot"
After pruning, hiseq_xtm_pe_cluster_kit_cbottm_v2__box_1_of_2__lot has 14 levels
[1] "hiseq_xtm_pe_cluster_kit_cbottm_v2__box_2_of_2__lot"
After pruning, hiseq_xtm_pe_cluster_kit_cbottm_v2__box_2_of_2__lot has 13 levels
[1] "hiseq_xtm_flow_cell_v2_5_lot"
After pruning, hiseq_xtm_flow_cell_v2_5_lot has 15 levels
1665 / 5199 taxa are present in 0.05 of samples
0/2285 samples removed due to having no reads of interest
[1] "extraction_kit"
After

### Correlation filter

In [None]:
# Load one abundance matrix and return it as a relative-abundance table.
#
# n_subset: subset size used in the abundance-matrix directory/file name.
# rank:     rank code used in the abundance-matrix file name.
# human:    name of the host taxon column to drop (with "unclassified").
#
# Samples not listed in "data/samples_above_95_reads.txt" are dropped, at most
# 9999 samples are kept per site, taxa present in no more than 5% of samples
# and samples without remaining reads are removed. The number of columns left
# before the conversion is printed. Returns the output of otu_to_RA().
load_parse_RA <- function(n_subset, rank, human) {
    # Load data
    passing_ids <- fread("data/samples_above_95_reads.txt")$npm_research_id
    matrix_path <- str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.{rank}.tsv")
    abundance <- filter(load_data(matrix_path), npm_research_id %in% passing_ids)
    meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", abundance)

    # Get metadata subset
    meta_sub <- subset_metadata(meta, 9999)

    # Filter data step by step
    filtered <- retrieve_rows_from_meta(abundance, meta_sub)
    filtered <- remove_cols(filtered, c(human, "unclassified"))
    filtered <- remove_low_freq_taxa(filtered, frac_presence = 0.05)
    filtered <- remove_empty_rows(filtered)
    print(ncol(filtered))

    otu_to_RA(filtered)
}

In [7]:
# Spearman correlation between every non-contaminant and every contaminant.
#
# RA_df:            relative-abundance table with one numeric column per taxon.
# non_contaminants: names of the non-contaminant taxon columns.
# contaminants:     names of the contaminant taxon columns.
#
# Returns a tibble with one row per pair (non_contaminant_taxon,
# contaminant_taxon, rho), ordered by non-contaminant and, within each, by
# contaminant. With no pairs to compare, a zero-row tibble with the same
# columns is returned.
corr_decontam <- function(RA_df, non_contaminants, contaminants) {
    if (length(non_contaminants) == 0 || length(contaminants) == 0) {
        return(tibble(non_contaminant_taxon = character(0),
                      contaminant_taxon = character(0),
                      rho = numeric(0)))
    }

    # cor.test() defaults to Pearson; the coefficient wanted here is
    # Spearman's rho, so the method is requested explicitly. A single cor()
    # call computes all pairs at once (rows: contaminants, columns:
    # non-contaminants).
    rho_mat <- cor(RA_df[, contaminants, drop = FALSE],
                   RA_df[, non_contaminants, drop = FALSE],
                   method = "spearman",
                   use = "pairwise.complete.obs")

    # as.vector() walks the matrix column by column, i.e. all contaminants for
    # the first non-contaminant, then all for the second, and so on.
    corr_df <- tibble(non_contaminant_taxon = rep(non_contaminants, each = length(contaminants)),
                      contaminant_taxon = rep(contaminants, times = length(non_contaminants)),
                      rho = as.vector(rho_mat))
    return(corr_df)
}

In [None]:
# Relative-abundance tables at species ("S") and genus ("G") rank; the third
# argument is the host taxon column that load_parse_RA() removes.
species_RA <- load_parse_RA(9999, "S", "Homo sapiens")
genus_RA <- load_parse_RA(9999, "G", "Homo")

In [6]:
# Contaminant (c) and non-contaminant (nc) taxon names from the
# differential-prevalence results.
genus_c <- full_genus_res[["contaminants"]]
genus_nc <- full_genus_res[["non_contaminants"]]

species_c <- full_species_res[["contaminants"]]
species_nc <- full_species_res[["non_contaminants"]]

# NOTE(review): the two correlation calls below are commented out, so
# `species_corr` and `genus_corr` are not created here although a later cell
# passes them to get_filtered_taxa() - confirm where they are defined.
# species_corr <- corr_decontam(species_RA, species_nc, species_c)
# genus_corr <- corr_decontam(genus_RA, genus_nc, genus_c)

In [10]:
# Display the species-level non-contaminant names.
species_nc

In [None]:
# Report how many taxa survive differential-prevalence decontamination.
genus_n_nc <- length(genus_nc)
species_n_nc <- length(species_nc)

# The RA tables contain the `npm_research_id` column in addition to the taxon
# columns, so one is subtracted to obtain the number of taxa.
print(str_glue("{genus_n_nc}/{ncol(genus_RA) - 1} genus after differential prevalence decontamination"))
print(str_glue("{species_n_nc}/{ncol(species_RA) - 1} species after differential prevalence decontamination"))

#### Visualise number of taxa retained by correlation threshold

In [None]:
# Number of non-contaminants retained at each correlation cut-off.
#
# corr_df:          table with `non_contaminant_taxon` and `rho` columns.
# non_contaminants: vector of all non-contaminant taxon names.
#
# For each cut-off from 0 to 1 in steps of 0.05, a non-contaminant is counted
# as removed when it has at least one rho above the cut-off. Prints a scatter
# plot of retained taxa against cut-off and returns the underlying tibble
# (columns `rho`, `n_retained`).
get_corr_dist <- function(corr_df, non_contaminants) {
    n_total <- length(non_contaminants)

    per_threshold <- lapply(seq(0, 1, 0.05), function(threshold) {
        above <- corr_df %>%
            filter(rho > threshold)
        n_flagged <- length(unique(above$non_contaminant_taxon))
        tibble(rho = threshold, n_retained = n_total - n_flagged)
    })
    rho_df <- bind_rows(per_threshold)

    rho_plt <- ggplot(rho_df, aes(x = rho, y = n_retained)) +
        geom_point() +
        labs(y = "No. of taxa retained", x = "Rho threshold")
    print(rho_plt)

    return(rho_df)
}

#### Save contaminant and non-contaminant taxa

In [None]:
# Apply the correlation filter to the non-contaminant list.
#
# corr_df: table with `non_contaminant_taxon` and `rho` columns.
# nc:      vector of non-contaminant taxon names.
# corr_t:  non-contaminants with any rho above this value are removed.
#
# Returns list(to_remove, to_keep): a table with a `to_remove` column of the
# distinct removed taxa, and a tibble with a `non_contaminants` column of the
# remaining ones.
get_filtered_taxa <- function(corr_df, nc, corr_t) {
    correlated <- filter(corr_df, rho > corr_t)
    to_remove <- correlated %>%
        select(to_remove = non_contaminant_taxon) %>%
        distinct()

    is_removed <- nc %in% to_remove$to_remove
    to_keep <- tibble(non_contaminants = nc[!is_removed])

    list(to_remove = to_remove, to_keep = to_keep)
}

In [None]:
# Apply the correlation filter with a cut-off of 0.7 at both ranks and
# display the resulting to_remove / to_keep tables.
corr_t <- 0.7
species_corr_res <- get_filtered_taxa(species_corr, species_nc, corr_t)
genus_corr_res <- get_filtered_taxa(genus_corr, genus_nc, corr_t)

species_corr_res
genus_corr_res

In [None]:
# Save the correlation-filter results: retained taxa go to the "noncontam"
# files, removed taxa to the "contam" files (species first, then genus).
fwrite(species_corr_res[["to_keep"]], 
       str_glue("results/decontamination/prevalence.corr.noncontam.S.n9999.RA.txt"))
fwrite(species_corr_res[["to_remove"]], 
       str_glue("results/decontamination/prevalence.corr.contam.S.n9999.RA.txt"))

fwrite(genus_corr_res[["to_keep"]], 
       str_glue("results/decontamination/prevalence.corr.noncontam.G.n9999.RA.txt"))
fwrite(genus_corr_res[["to_remove"]], 
       str_glue("results/decontamination/prevalence.corr.contam.G.n9999.RA.txt"))

In [None]:
# Display the taxa retained after the correlation filter (genus, then species).
genus_corr_res[["to_keep"]]
species_corr_res[["to_keep"]]

### Identify RA thresholds

In [None]:
# Reshape the species table to long format: one row per (sample, taxon) with
# the relative abundance in `rel_a`.
long_RA <- species_RA %>%
    pivot_longer(!npm_research_id, names_to = "taxon", values_to = "rel_a") 

#### RA thresholds ~ percentile

In [None]:
# Quantile of all relative-abundance values at each percentile from 73 to 100
# in steps of 0.5; `perc` is on a 0-100 scale and divided by 100 for quantile().
quantiles <- foreach (perc = seq(73, 100, 0.5)) %do% {
    quant <- quantile(long_RA$rel_a, perc / 100)
    return(tibble(perc = perc, quant = quant))
}

# One row per percentile: columns `perc` and `quant`.
q_df <- bind_rows(quantiles)

In [None]:
# Line plot of the log10 quantile value against the percentile.
q_df %>%
    ggplot(aes(x = perc, y = log(quant, base = 10))) +
    geom_line()

In [None]:
# Number of genus-level non-contaminants.
length(full_genus_res[["non_contaminants"]])

In [None]:
# Re-run the species-level procedure with the presence threshold set to each
# relative-abundance quantile (percentiles 73 to 100 in steps of 0.5), save the
# results, and record the number of non-contaminants per threshold.
# NOTE(review): this loop overwrites the global `full_species_df` and
# `full_species_res` on every iteration.
# NOTE(review): the file names put the threshold value, not the percentile,
# after "perc_"; percentiles that share a quantile value write to the same file.
threshold_df <- tibble()
for (perc in seq(73, 100, 0.5)) {
    read_threshold <- quantile(long_RA$rel_a, perc / 100)
    full_species_df <- decontaminate("S", 9999, 9999, "Homo sapiens", read_threshold = read_threshold)
    full_species_res <- parse_decontamination_results(full_species_df)
    fwrite(full_species_df, str_glue("results/decontamination/RA_thresholds_HELIOS/diff_prev_results.9999.S.RA.perc_{read_threshold}.csv"))
    fwrite(tibble(non_contaminants = full_species_res[["non_contaminants"]]), str_glue("results/decontamination/RA_thresholds_HELIOS/prevalence.noncontam.S.n9999.RA.perc_{read_threshold}.txt"))
    threshold_df <- threshold_df %>% bind_rows(tibble(perc = perc, read_threshold = read_threshold, n_nc = length(full_species_res[["non_contaminants"]])))
}

In [None]:
# Display the number of non-contaminants per percentile-based threshold.
threshold_df

In [11]:
# Re-run the species-level procedure for a fixed list of relative-abundance
# thresholds, save the results, and record the number of non-contaminants.
# This resets `threshold_df` and overwrites `full_species_df` /
# `full_species_res` on every iteration.
# NOTE(review): 0.0001 appears twice in the threshold vector, so that value is
# computed and written twice - possibly one of them was meant to be 0.001;
# confirm.
threshold_df <- tibble()
for (read_threshold in c(0.00001, 0.0001, 0.0001, 0.05, 0.1, 0.15, 0.2, 0.3)) {
    full_species_df <- decontaminate("S", 9999, 9999, "Homo sapiens", read_threshold = read_threshold)
    full_species_res <- parse_decontamination_results(full_species_df)
    fwrite(full_species_df, str_glue("results/decontamination/RA_thresholds_HELIOS/diff_prev_results.9999.S.RA.perc_{read_threshold}.csv"))
    fwrite(tibble(non_contaminants = full_species_res[["non_contaminants"]]), str_glue("results/decontamination/RA_thresholds_HELIOS/prevalence.noncontam.S.n9999.RA.perc_{read_threshold}.txt"))
    threshold_df <- threshold_df %>% bind_rows(tibble(read_threshold = read_threshold, n_nc = length(full_species_res[["non_contaminants"]])))
}

In [None]:
# Display the number of non-contaminants per fixed threshold.
threshold_df