# Identication of putative contaminant taxa

In [1]:
# tempdir <- function() {return("~/.Rtmp")}

In [2]:
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
require(ALDEx2)
require(tidyverse)
require(ggplot2)
require(data.table)
require(phyloseq)

Loading required package: ALDEx2

Loading required package: zCompositions

Loading required package: MASS

Loading required package: NADA

Loading required package: survival


Attaching package: ‘NADA’


The following object is masked from ‘package:stats’:

    cor


Loading required package: truncnorm

Loading required package: Rfast

Loading required package: Rcpp

Loading required package: RcppZiggurat

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.3     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────

### Create Phyloseq object

#### Load abundance matrix

In [3]:
load_data <- function(file_path) {
    df <- fread(file_path) %>%
        separate(sample, into = c(NA, "npm_research_id"), sep = "\\.")
    return(df)
}

df <- load_data(str_glue("data/temp_files_9999/07_abundance_matrix/abundance_matrix.subset_9999.S.tsv"))

#### Load metadata

In [4]:
load_metadata <- function(file_path, df) {
    meta <- fread(file_path, na.strings=c("", NA))
    meta <- meta %>% 
        filter(npm_research_id %in% df$npm_research_id) %>%
        select(-removal_requested_by_supplier) %>%
        replace(is.na(.), "")
    return(meta)
}

meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)

#### Get metadata subsets

In [5]:
subset_metadata <- function(meta, n_subset) {
    meta <- as.data.frame(meta)
    cohorts <- unique(meta$site_supplying_sample)
    subset_vec <- c()

    for (i in cohorts) {
        ids <- meta$npm_research_id[meta$site_supplying_sample == i]
        
        if (length(ids) > n_subset) {
            subset_ids <- sample(ids, n_subset)
            subset_vec <- c(subset_vec, subset_ids)
        } else {
            subset_vec <- c(subset_vec, ids)
        }
    }

    meta_sub <- meta %>%
      filter(npm_research_id %in% subset_vec) 
    return(meta_sub)
}

meta_filt <- subset_metadata(meta, 10)

#### Retrieve rows in abundance matrix

In [6]:
retrieve_rows_from_meta <- function(df, meta) {
    return(df %>% filter(npm_research_id %in% meta$npm_research_id)) 
}

df_filt <- retrieve_rows_from_meta(df, meta_filt)

#### Remove human and unclassified reads

In [7]:
remove_cols <- function(df, col_to_exclude) {
    return(df %>% select(-all_of(col_to_exclude)))
}

df_filt2 <- remove_cols(df_filt, c("Homo sapiens", "unclassified"))

#### Remove taxa present in <~x% of samples

In [8]:
remove_low_freq_taxa <- function(df) {
    n_original <- ncol(df)
    frac_presence <- 0.05
    PA_df <- apply(df[, 2:ncol(df)], 2, function(x) {ifelse(x > 0, T, F)})
    frac_df <- apply(PA_df, 2, function(x) {sum(x) / nrow(PA_df)})
    to_keep <- names(frac_df[frac_df > frac_presence])
    to_keep <- c("npm_research_id", to_keep)
    print(str_glue("{length(to_keep)} / {n_original} taxa are present in {frac_presence} of samples"))
    return(df %>% select(all_of(to_keep)))
}

df_filt3 <- remove_low_freq_taxa(df_filt2)

1700 / 5200 taxa are present in 0.05 of samples


#### Plot metadata barcharts

In [9]:
get_metadata_plots <- function(meta, meta_cols) {
    meta <- as.matrix(meta)
    meta <- as_tibble(meta, rownames = "sample")

    plots <- list()
    
    for (column in meta_cols) {
        plt <- meta %>%
            mutate(across(everything(), as.character)) %>%
            select(all_of(column)) %>%
            group_by_at(column) %>%
            summarise(n = n()) %>%
            ggplot(aes_string(x = column, y = "n", fill = column)) +
                geom_bar(stat = "identity") +
                theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                geom_text(aes_string(label = "n"))
        plots[[column]] <- plt
    }
    
    return(plots)
}


In [10]:
get_meta_cols <- function(meta) {
    meta_cols <- colnames(meta)[grep("kit|flow_cell|instrument_id|site_supplying", colnames(meta))]
    to_exclude <- c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id")
    meta_cols <- meta_cols[!(meta_cols %in% to_exclude)]
    return(meta_cols)
}

meta_cols <- get_meta_cols(meta)

In [11]:
print_group_freqs <- function(meta, meta_cols) {
    for (col in meta_cols) {
        tmp <- tibble(data.frame(meta)) %>%
            group_by(get(col)) %>%
            summarise(n = n())
        print(col)
        print(tmp)
    }
}

print_group_freqs(meta_filt, meta_cols)

[1] "site_supplying_sample"
[90m# A tibble: 7 × 2[39m
  `get(col)`     n
  [3m[90m<chr>[39m[23m      [3m[90m<int>[39m[23m
[90m1[39m GUSTO         10
[90m2[39m HELIOS        10
[90m3[39m MEC           10
[90m4[39m PRISM         10
[90m5[39m SERI          10
[90m6[39m SSMP          10
[90m7[39m TTSH          10
[1] "extraction_kit"
[90m# A tibble: 7 × 2[39m
  `get(col)`                                         n
  [3m[90m<chr>[39m[23m                                          [3m[90m<int>[39m[23m
[90m1[39m abGENIX Whole Blood Genomic DNA Extraction Kit    10
[90m2[39m Chemagic DNA Blood Kit (Perkin Elmer, MA)          8
[90m3[39m Maxwell RSC Blood DNA Kit (AS1400)                 5
[90m4[39m QIAamp DNA Blood Mini Kit                         10
[90m5[39m QIAsymphony DSP DNA Midi Kit                      10
[90m6[39m QIAsymphony DSP DNA Mini Kit (192)                 5
[90m7[39m Unknown                                           22
[1] "instrume

### Run ALDEx2

In [12]:
retrieve_rows_from_meta(df_filt3, meta_filt)

npm_research_id,Paraburkholderia fungorum,Paraburkholderia xenovorans,Paraburkholderia hospita,Ralstonia pickettii,Ralstonia insidiosa,Ralstonia mannitolilytica,Ralstonia solanacearum,Burkholderia cepacia,Burkholderia sp. LA-2-3-30-S1-D2,⋯,Pelobacter acetylenicus,Staphylococcus simulans,Veillonella parvula,Citrobacter sp. CRE-46,Granulicella tundricola,Granulosicoccus antarcticus,Candidatus Desulfovibrio trichonymphae,Corynebacterium riegelii,Halioglobus pacificus,Corynebacterium pseudotuberculosis
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
WHB1034,0,0,0,70,670,15,13,10,1,⋯,0,0,0,0,0,0,0,0,0,0
WHB1098,0,0,0,122,1216,30,28,1,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB4443,0,0,0,0,2,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHH785,1,0,0,14,1,2,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHH693,2,0,0,19,1,1,0,2,0,⋯,0,0,0,0,0,0,0,0,0,0
WHH1154,74,8,2,2173,950,235,358,8,3,⋯,0,0,0,1,0,0,0,0,0,0
WHH4972,0,0,0,1,1,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHH2333,1,2,0,11,5,11,5,4,0,⋯,0,0,0,0,0,0,0,0,1,0
WHB691,28,0,0,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB5168,1,0,1,1,2,5,6,1,0,⋯,1,0,0,0,2,0,0,0,0,1


In [13]:
run_aldex2 <- function(df, meta, meta_cols, n_samples, mc_samples) {
    result_df <- tibble()

    for (column in meta_cols) {
        print(column)
#         column <- "extraction_kit"
        # Remove levels with < x samples
        tmp <- tibble(data.frame(meta)) %>%
            group_by(get(column)) %>%
            summarise(n = n())
        
        print(tmp)

        # Vector of group levels to keep
        to_keep <- tmp[tmp$n >= n_samples, "get(column)"]$`get(column)`
        to_keep <- to_keep[to_keep != "Unknown"]
        
        print(length(to_keep))

        # Remove rows in metadata
        meta_filt <- meta %>% filter(get(column) %in% to_keep)
        
        # Retrieve rows
        df_filt <- retrieve_rows_from_meta(df, meta_filt)
        n_filt <- nrow(df_filt)
        print(str_glue("There are {n_filt} samples after pruning metadata levels"))
        
        # Convert abundance table to taxa x samples
        df_filt <- t(df_filt %>% column_to_rownames("npm_research_id"))
        
        # Convert to CLR
        x <- aldex.clr(df_filt, meta_filt[, column], mc.samples = mc_samples, denom = "all", verbose = T, useMC = T)
        
        # Run Kruskal-Wallis
        morsel <- aldex.kw(x) %>% mutate(meta_col = column)

        result_df <- result_df %>% bind_rows(morsel)
    }
    
    return(result_df %>% rownames_to_column("taxa"))
}

#### Decontamination procedure for each subset

In [None]:
# decontaminate <- function(n) {
    # Load data
#     df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.S.tsv"))
    df <- load_data(str_glue("data/temp_files_9999/07_abundance_matrix/abundance_matrix.subset_9999.S.tsv"))
    meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)
    
    # Get metadata subset
    meta_filt <- subset_metadata(meta, 200)

    # Filter data
    df_filt <- retrieve_rows_from_meta(df, meta_filt)
    df_filt2 <- remove_cols(df_filt, c("Homo sapiens", "unclassified"))
    df_filt3 <- remove_low_freq_taxa(df_filt2)

    # Get metadata columns of interest
    meta_cols <- get_meta_cols(meta_filt)

    # Print batch levels and frequencies
#     print_group_freqs(meta_filt, meta_cols)

    # Run ANCOM-BC
    result_df <- run_aldex2(df_filt3, meta_filt, meta_cols, n_samples = 10, mc_samples = 128)
    result_df
    
#     return(non_contaminants)
#     }


1708 / 5200 taxa are present in 0.05 of samples
[1] "site_supplying_sample"
[90m# A tibble: 7 × 2[39m
  `get(column)`     n
  [3m[90m<chr>[39m[23m         [3m[90m<int>[39m[23m
[90m1[39m GUSTO           200
[90m2[39m HELIOS          200
[90m3[39m MEC             200
[90m4[39m PRISM           200
[90m5[39m SERI            200
[90m6[39m SSMP             63
[90m7[39m TTSH            200
[1] 7
There are 1263 samples after pruning metadata levels


multicore environment is is OK -- using the BiocParallel package

removed rows with sums equal to zero

computing center with all features

data format is OK

dirichlet samples complete

transformation complete

operating in serial mode



In [None]:
result_df %>%
    separate(taxa, into = c("taxa", NA), remove = T, sep = "\\.\\.") %>%
    mutate(diff_abn = ifelse(kw.eBH < 0.05, T, F)) %>%
    pivot_wider(names_from = meta_col, values_from = diff_abn, id_cols = taxa)

hm <- result_df %>%
    separate(taxa, into = c("taxa", NA), remove = T, sep = "\\.\\.") %>%
    mutate(diff_abn = ifelse(kw.eBH < 0.05, T, F)) %>%
    ggplot(aes(x = meta_col, y = taxa, fill = diff_abn)) +
    geom_tile() +
    theme(axis.text.y = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Batch Info", y = "Species", fill = "Contaminant?")
hm

hm2 <- result_df %>%
    separate(taxa, into = c("taxa", NA), remove = T, sep = "\\.\\.") %>%
    mutate(diff_abn = ifelse(kw.ep < 0.05, T, F)) %>%
    ggplot(aes(x = meta_col, y = taxa, fill = diff_abn)) +
    geom_tile() +
    theme(axis.text.y = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Batch Info", y = "Species", fill = "Contaminant?")
hm2

# ggsave("results/decontamination/ANCOMBC_decontamination_heatmap.n9606.100821.png", dpi = 300)

In [None]:
# fwrite(result_df, "results/decontamination/ANCOMBC_decontamination_results.n9706.100821.csv")

In [None]:
dawdawdawdwadwa####################################################

### Explore metadata

In [None]:
n_subset <- 100

# Load data
df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.S.tsv"))

# Retain relevant metadata
meta <- parse_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)

# Get metadata columns of interest
meta_cols <- get_meta_cols(meta)

meta <- as.matrix(meta)
meta <- as_tibble(meta, rownames = "sample")

plots <- list()

for (column in meta_cols) {
    plt <- meta %>%
        mutate(across(everything(), as.character)) %>%
        select(all_of(column)) %>%
        group_by_at(column) %>%
        summarise(n = n()) %>%
        ggplot(aes_string(x = column, y = "n", fill = column)) +
            geom_bar(stat = "identity") +
            theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
            geom_text(aes_string(label = "n"))
    plots[[column]] <- plt
}

plots


### MAIN

In [None]:
# nc_list <- list()

# for (i in c(100)) {
#     nc_list[[str_glue("subset_{i}")]] <- decontaminate(i)
# }

# nc_list

### Sanity check

In [None]:
# Reduce(intersect, nc_list)

In [None]:
plot_sanity_plot <- function(X, Y, tax) {
    print(tax)
    temp_plt <- cbind(X, Y) %>%
        mutate(log_abundance = log(get(tax) + 0.0001)) %>%
        ggplot(aes(x = site_supplying_sample, y = get(tax), fill = site_supplying_sample)) + 
        geom_point() +
        geom_boxplot(alpha = 0.5) +
        labs(x = "Source cohort", y = paste(tax, "% abundance")) +
        theme(legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1),
          text = element_text(size = 15))
    return(temp_plt)
}

In [None]:
# Load data
taxon <- "Cutibacterium acnes"
df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.S.tsv"))

# Remove homo and unclassified
col_to_exclude <- colnames(df)[grepl("Homo|unclassified", colnames(df))]
df <- remove_cols(df, col_to_exclude)

# Retain relevant metadata
meta <- parse_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)

# Normalise to RA
normalise <- function(df) {
    RA_df <- t(apply(df, 1, function(x) {x / sum(x) * 100}))
    return(as_tibble(RA_df))
}

species_df_RA <- normalise(df)

plot_sanity_plot(df, meta, taxon)
print(str_glue("Max no. of reads = {max(df[, taxon])}"))
print(str_glue("Mean no. of reads = {mean(df[, taxon])}"))

In [None]:
l1 <- tibble(l1)

fwrite(l1, str_glue("/home/projects/14001280/PROJECTS/blood_microbiome/results/decontamination/decontaminated_list.n{n_subset}.csv"))