# Identification of putative contaminant taxa

In [1]:
# NOTE(review): hard-coded, machine-specific working directory; every relative
# path used below ("data/...", "results/...") is resolved against it.
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")

# library() stops immediately when a package is missing, whereas require()
# only warns and returns FALSE, deferring the failure to the first call that
# needs the package.
library(foreach)
library(tidyverse)
library(ggplot2)
library(data.table)
library(doParallel)
library(compositions)
library(VennDiagram)

# Register a parallel backend with 6 workers for foreach
registerDoParallel(cores = 6)

Loading required package: foreach

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32maccumulate()[39m masks [34mforeach[39m::accumulate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mwhen()[39m       masks [34

### Load and preprocess data

In [2]:
# Read the sample metadata table and tidy it up.
#
# file_path: path of the metadata file, read with fread(); empty fields are
#            treated as missing.
# df:        table with an `npm_research_id` column; only metadata rows whose
#            `npm_research_id` occurs in it are kept.
#
# Returns the metadata without the `removal_requested_by_supplier` column and
# with every missing value replaced by the string "unknown".
load_metadata <- function(file_path, df) {
    metadata <- fread(file_path, na.strings = c("", NA))

    # Keep only the samples that are present in `df`
    metadata <- filter(metadata, npm_research_id %in% df$npm_research_id)
    metadata <- select(metadata, -removal_requested_by_supplier)

    # Recode missing entries as an explicit "unknown" level
    # NOTE(review): presumably turns affected non-character columns into
    # character columns - verify if numeric metadata is needed downstream.
    replace(metadata, is.na(metadata), "unknown")
}


# Select metadata column names by pattern.
#
# meta:       table whose column names are searched.
# meta_regex: regular expression matched against the column names.
# to_exclude: column names to drop from the matches.
#
# Returns the matching column names, in their original order, minus those
# listed in `to_exclude`.
get_meta_cols <- function(meta, meta_regex, to_exclude) {
    all_names <- colnames(meta)
    matched <- all_names[grepl(meta_regex, all_names)]
    matched[is.na(match(matched, to_exclude))]
}

In [4]:
# IDs of the samples to keep (per the file name: samples with more than 100
# microbial reads)
to_retain <- fread("data/samples_above_100_microbial_reads.txt")$npm_research_id

# Presence/absence table, restricted to the sample ID column plus the taxa
# listed as non-contaminants in the correlation-based decontamination output
prev_df <- fread("results/decontamination/prevalence_RA0.01_read10.csv")
corr_nc <- read.csv("results/decontamination/correlation_decontamination/nc.diff_prev_V3.RA0.01.read_threshold10.max_prev0.25.fold_diff2.corr_t0.7.S.n9999.txt")$non_contaminant_taxon
prev_df <- select(prev_df, all_of(c("npm_research_id", corr_nc)))
prev_df

# Metadata for the samples in prev_df, further limited to the retained IDs
# NOTE(review): prev_df itself is not filtered by to_retain here - confirm that
# the prevalence file already contains only the retained samples.
meta <- filter(
    load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", prev_df),
    npm_research_id %in% to_retain
)

# Batch-related metadata columns to test: names matching the regex, minus the
# two explicitly excluded columns
meta_cols <- get_meta_cols(
    meta,
    meta_regex = "kit|flow_cell|site_supplying",
    to_exclude = c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id")
)


npm_research_id,Achromobacter sp. AONIH1,Achromobacter spanius,Acidaminococcus intestini,Acidovorax sp. KKS102,Acinetobacter baumannii,Acinetobacter guillouiae,Acinetobacter haemolyticus,Acinetobacter junii,Acinetobacter lwoffii,⋯,Variovorax paradoxus,Variovorax sp. HW608,Variovorax sp. PMC12,Veillonella dispar,Veillonella parvula,Wolbachia endosymbiont of Culex quinquefasciatus,Woolly monkey hepatitis B virus,Xanthomonas campestris,Xylanimonas cellulosilytica,Zhihengliuella sp. ISTPL4
<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
WHB669,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB8678,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHH7080,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB4818,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB9470,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB4637,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB3974,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB7065,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB8830,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB7956,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE


In [5]:
# For every batch-related metadata column, count the number of distinct levels
# of that column in which each taxon is detected, then binarise the counts:
# 1 = detected in more than one level, 0 = detected in at most one level.
result_df <- tibble(taxa = corr_nc)

for (column in meta_cols) {
    # Sample ID -> level of the current metadata column
    batch_lookup <- meta %>% select(all_of(c("npm_research_id", column)))

    res <- prev_df %>%
        left_join(batch_lookup, by = "npm_research_id") %>%
        select(-npm_research_id) %>%
        # all_of() makes it explicit that `column` holds a column *name*
        pivot_longer(!all_of(column), names_to = "taxa", values_to = "presence") %>%
        filter(presence) %>%
        group_by(taxa) %>%
        # na.rm = TRUE: samples without a metadata match get NA from the left
        # join and must not be counted as an additional batch level
        summarise(n_batches = n_distinct(.data[[column]], na.rm = TRUE))
    colnames(res)[2] <- column

    result_df <- result_df %>%
        left_join(res, by = "taxa")
}

result_df <- result_df %>% column_to_rownames("taxa")

# Taxa that are never detected have no row in `res` and come out of the join
# as NA; they were seen in zero levels, so score them as 0 rather than NA
result_df[is.na(result_df)] <- 0
result_df[result_df <= 1] <- 0
result_df[result_df > 1] <- 1
result_df

Joining, by = "npm_research_id"

Note: Using an external vector in selections is ambiguous.
[34mℹ[39m Use `all_of(column)` instead of `column` to silence this message.
[34mℹ[39m See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[90mThis message is displayed once per session.[39m

Joining, by = "npm_research_id"

Joining, by = "npm_research_id"

Joining, by = "npm_research_id"

Joining, by = "npm_research_id"

Joining, by = "npm_research_id"

Joining, by = "npm_research_id"



Unnamed: 0_level_0,site_supplying_sample,extraction_kit,hiseq_xtm_sbs_kit_300_cycles_v2__box_1of_2__lot,hiseq_xtm_sbs_kit_300_cycles_v2__box_2_of_2__lot,hiseq_xtm_pe_cluster_kit_cbottm_v2__box_1_of_2__lot,hiseq_xtm_pe_cluster_kit_cbottm_v2__box_2_of_2__lot,hiseq_xtm_flow_cell_v2_5_lot
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Achromobacter sp. AONIH1,1,1,1,1,1,1,1
Achromobacter spanius,0,0,1,1,1,1,1
Acidaminococcus intestini,0,0,0,0,0,0,0
Acidovorax sp. KKS102,1,1,1,1,1,1,1
Acinetobacter baumannii,1,1,1,1,1,1,1
Acinetobacter guillouiae,1,1,1,1,1,1,1
Acinetobacter haemolyticus,1,1,1,1,1,1,1
Acinetobacter junii,1,1,1,1,1,1,1
Acinetobacter lwoffii,0,0,0,0,0,0,0
Acinetobacter nosocomialis,0,0,0,1,1,1,1


#### Parse results

In [6]:
# Per taxon, add up the 0/1 flags across all columns of result_df
row_sums <- result_df %>%
    as.matrix() %>%
    rowSums()

# One row per taxon with the number of flagged columns
parsed_df <- tibble(
    taxa = names(row_sums),
    n_cols = row_sums
)

In [7]:
# Number of samples in which each taxon is present, and the corresponding
# fraction of all samples in prev_df. Taxa columns are selected by name so the
# result does not depend on npm_research_id being the first column.
prev_counts <- colSums(select(prev_df, -npm_research_id))
prev_stats <- prev_counts / nrow(prev_df)
overall_prev <- data.frame(
    taxa = names(prev_stats),
    overall_prevalence = as.vector(prev_stats),
    n_samples = as.vector(prev_counts)
)

# A taxon is kept only if it is flagged in every batch column of result_df;
# derive that number instead of hard-coding it so it follows meta_cols.
n_batch_cols <- ncol(result_df)

# Display the retained taxa, most prevalent first
parsed_df %>%
    left_join(overall_prev, by = "taxa") %>%
    filter(n_cols == n_batch_cols) %>%
    arrange(desc(overall_prevalence))

# Taxa flagged in all batch columns (presumably the non-contaminant list,
# given the output file name - confirm)
nc <- parsed_df %>%
    filter(n_cols == n_batch_cols) %>%
    select(taxa)

fwrite(nc, "results/decontamination/simple_batch_decontam/nc.txt")

Joining, by = "taxa"



taxa,n_cols,overall_prevalence,n_samples
<chr>,<dbl>,<dbl>,<dbl>
Cutibacterium acnes,7,0.033886503,292
Mitsuaria sp. 7,7,0.026807474,231
Microbacterium hominis,7,0.021121040,182
Microbacterium sp. PM5,7,0.019264245,166
Bacillus cereus,7,0.017059301,147
Moraxella osloensis,7,0.016711152,144
Burkholderia contaminans,7,0.015434606,133
Human mastadenovirus C,7,0.014738308,127
Microbacterium sp. Y-01,7,0.013113613,113
Rhodococcus sp. NJ-530,7,0.010908669,94
