# Identication of putative contaminant taxa

In [2]:
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbial_signatures/")
require(foreach)
require(tidyverse)
require(ggplot2)
require(data.table)
require(doParallel)
require(compositions)
require(VennDiagram)
registerDoParallel(cores=6)

Loading required package: foreach

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32maccumulate()[39m masks [34mforeach[39m::accumulate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mwhen()[39m       masks [34

### Load and preprocess data

In [3]:
load_metadata <- function(file_path, df) {
    meta <- fread(file_path, na.strings=c("", NA))
    meta <- meta %>% 
        filter(npm_research_id %in% df$npm_research_id) %>%
        select(-removal_requested_by_supplier) %>%
        replace(is.na(.), "unknown")
    return(meta)
}


get_meta_cols <- function(meta, meta_regex, to_exclude) {
    meta_cols <- colnames(meta)[grep(meta_regex, colnames(meta))]
    meta_cols <- meta_cols[!(meta_cols %in% to_exclude)]
    return(meta_cols)
}

In [5]:
meta_cols

In [4]:
corr_t <- 0.7

# Load data and remove low microbial read samples
to_retain <- fread("data/samples_above_100_microbial_reads.txt")$npm_research_id

prev_df <- fread("results/decontamination/prevalence_RA0.005_read10.csv")
corr_nc_within <- read.csv(str_glue("results/decontamination/correlation_decontamination/nc.diff_prev_V3.RA0.005.read_threshold10.max_prev0.25.fold_diff2.corr_t{corr_t}.within_batch.S.n9999.txt"))$non_contaminant_taxon

prev_df <- prev_df %>% select(all_of(c("npm_research_id", corr_nc_within)))

meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", prev_df) %>% 
    filter(npm_research_id %in% to_retain)

# Get metadata columns of interest
meta_cols <- get_meta_cols(meta, 
                           meta_regex = "kit|flow_cell|site_supplying", 
                           to_exclude = c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id"))
prev_df

npm_research_id,Fusarium graminearum,Buchnera aphidicola,Xanthomonas euvesicatoria,Janthinobacterium svalbardensis,Janthinobacterium sp. LM6,Corynebacterium jeikeium,Nocardioides seonyuensis,Clostridium botulinum,Candidatus Nitrosocosmicus franklandus,⋯,Torque teno virus 2,Rickettsia montanensis,Human alphaherpesvirus 2,Streptococcus sp. FDAARGOS_522,Woolly monkey hepatitis B virus,Streptococcus pasteurianus,Rickettsia endosymbiont of Ixodes pacificus,Capuchin monkey hepatitis B virus,Bacillus sp. TK-2,Lactobacillus phage Lv-1
<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
WHB10753,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB9289,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB7247,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB10124,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB8107,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB9315,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB10904,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB8989,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHH1247,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
WHB9026,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,⋯,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE


In [5]:
# column <- "extraction_kit"
result_df <- tibble(taxa = corr_nc_within)

for(column in meta_cols) {
    res <- prev_df %>% 
        left_join(meta %>% select(all_of(c("npm_research_id", column)))) %>%
        select(-npm_research_id) %>%
        pivot_longer(!column, names_to = "taxa", values_to = "presence") %>%
        filter(presence) %>%
        group_by(taxa) %>%
        summarise(n_batches = n_distinct(get(column)))
    colnames(res)[2] <- column

    result_df <- result_df %>%
        left_join(res, "taxa")
}

result_df <- result_df %>% column_to_rownames("taxa")
result_df[result_df <= 1] <- 0
result_df[result_df > 1] <- 1
result_df

Joining, by = "npm_research_id"
Note: Using an external vector in selections is ambiguous.
[34mℹ[39m Use `all_of(column)` instead of `column` to silence this message.
[34mℹ[39m See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[90mThis message is displayed once per session.[39m
Joining, by = "npm_research_id"
Joining, by = "npm_research_id"
Joining, by = "npm_research_id"
Joining, by = "npm_research_id"
Joining, by = "npm_research_id"
Joining, by = "npm_research_id"


Unnamed: 0_level_0,site_supplying_sample,extraction_kit,hiseq_xtm_sbs_kit_300_cycles_v2__box_1of_2__lot,hiseq_xtm_sbs_kit_300_cycles_v2__box_2_of_2__lot,hiseq_xtm_pe_cluster_kit_cbottm_v2__box_1_of_2__lot,hiseq_xtm_pe_cluster_kit_cbottm_v2__box_2_of_2__lot,hiseq_xtm_flow_cell_v2_5_lot
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Fusarium graminearum,0,0,1,1,1,1,1
Buchnera aphidicola,0,0,0,0,0,0,0
Xanthomonas euvesicatoria,0,0,0,0,0,0,0
Janthinobacterium svalbardensis,0,0,0,0,0,0,0
Janthinobacterium sp. LM6,0,0,0,0,0,0,0
Corynebacterium jeikeium,1,1,1,1,1,1,1
Nocardioides seonyuensis,0,0,0,0,0,0,0
Clostridium botulinum,0,0,0,0,0,0,0
Candidatus Nitrosocosmicus franklandus,1,1,1,1,1,1,1
Torque teno virus 6,1,1,1,1,1,1,1


#### Parse results

In [6]:
row_sums <- rowSums(result_df)
parsed_df <- tibble(taxa = names(row_sums), n_cols = row_sums)

In [7]:
prev_stats <- apply(prev_df[, 2:ncol(prev_df)], 2, sum) / nrow(prev_df)
overall_prev <- data.frame(taxa = names(prev_stats), overall_prevalence = as.vector(prev_stats)) %>%
    mutate(n_samples = overall_prevalence * nrow(prev_df))

parsed_df %>% 
    left_join(overall_prev) %>%
    filter(n_cols == 7) %>%
    arrange(desc(overall_prevalence))

nc <- parsed_df %>%
    filter(n_cols == 7) %>%
    select(taxa)

fwrite(nc, str_glue("results/decontamination/simple_batch_decontam/nc.corr_t{corr_t}.within_batch.txt"))

Joining, by = "taxa"


taxa,n_cols,overall_prevalence,n_samples
<chr>,<dbl>,<dbl>,<dbl>
Cutibacterium acnes,7,0.047458390,422
Moraxella osloensis,7,0.019118309,170
Human mastadenovirus C,7,0.018780927,167
Mycolicibacterium aubagnense,7,0.012145749,108
Lactobacillus iners,7,0.011920828,106
Lactobacillus crispatus,7,0.010571300,94
Phyllobacterium zundukense,7,0.009334233,83
Sulfuritalea hydrogenivorans,7,0.008547009,76
Staphylococcus epidermidis,7,0.008547009,76
Gardnerella vaginalis,7,0.008434548,75


In [8]:
parsed_df %>% 
    filter(taxa == "Proteus mirabilis")

taxa,n_cols
<chr>,<dbl>
Proteus mirabilis,0
