# Identication of putative contaminant taxa

In [1]:
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
require(ALDEx2)
require(tidyverse)
require(ggplot2)
require(data.table)
require(phyloseq)
require(foreach)

Loading required package: ALDEx2

Loading required package: zCompositions

Loading required package: MASS

Loading required package: NADA

Loading required package: survival


Attaching package: ‘NADA’


The following object is masked from ‘package:stats’:

    cor


Loading required package: truncnorm

Loading required package: Rfast

Loading required package: Rcpp

Loading required package: RcppZiggurat

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.3     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22

### Create Phyloseq object

#### Load abundance matrix

In [2]:
load_data <- function(file_path) {
    df <- as.data.frame(fread(file_path)) %>%
        separate(sample, into = c(NA, "npm_research_id"), sep = "\\.")
    return(df)
}

df <- load_data(str_glue("data/temp_files_9999/07_abundance_matrix/abundance_matrix.subset_9999.S.tsv"))

#### Load metadata

In [3]:
load_metadata <- function(file_path, df) {
    meta <- fread(file_path, na.strings=c("", NA))
    meta <- meta %>% 
        filter(npm_research_id %in% df$npm_research_id) %>%
        select(-removal_requested_by_supplier) %>%
        replace(is.na(.), "unknown")
    return(meta)
}

meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)

#### Get metadata subsets

In [4]:
subset_metadata <- function(meta, n_subset) {
    meta <- as.data.frame(meta)
    cohorts <- unique(meta$site_supplying_sample)
    subset_vec <- c()

    for (i in cohorts) {
        ids <- meta$npm_research_id[meta$site_supplying_sample == i]
        
        if (length(ids) > n_subset) {
            subset_ids <- sample(ids, n_subset)
            subset_vec <- c(subset_vec, subset_ids)
        } else {
            subset_vec <- c(subset_vec, ids)
        }
    }

    meta_sub <- meta %>%
      filter(npm_research_id %in% subset_vec) 
    return(meta_sub)
}

meta_filt <- subset_metadata(meta, 10)

#### Retrieve rows in abundance matrix

In [24]:
retrieve_rows_from_meta <- function(df, meta) {
    return(df %>% filter(npm_research_id %in% meta$npm_research_id)) 
}

retrieve_rows_from_meta(df_filt2, meta_filt)

npm_research_id,Paraburkholderia fungorum,Paraburkholderia xenovorans,Paraburkholderia hospita,Ralstonia pickettii,Ralstonia insidiosa,Ralstonia mannitolilytica,Ralstonia solanacearum,Burkholderia cepacia,Burkholderia sp. LA-2-3-30-S1-D2,⋯,Mycobacterium virus Che12,Grapevine Cabernet Sauvignon reovirus,Delftia phage IME-DE1,Fowlpox virus,Bacillus sp. Pc3,Agrotis segetum nucleopolyhedrovirus B,Vibrio phage 11895-B1,Mycobacterium phage Milly,Cacao swollen shoot Togo A virus,Streptomyces phage phiSASD1
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
WHB8245,1,0,3,0,2,1,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
WHH4997,1,0,0,5,1,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB5219,1,0,1,1,2,2,0,7,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB4488,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB910,0,0,1,462,4556,74,95,26,3,⋯,0,0,0,0,0,0,0,0,0,0
WHB9087,0,0,0,2,3,1,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB10468,0,0,0,3,3,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB730,0,0,0,0,1,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB3582,1,11,4,11,2,8,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
WHB9659,0,3,1,27,7,4,4,0,1,⋯,0,0,0,0,0,0,0,0,0,0


#### Remove human and unclassified reads

In [27]:
remove_cols <- function(df, col_to_exclude) {
    return(df %>% select(-all_of(col_to_exclude)))
}

df_filt <- remove_cols(df, c("Homo sapiens", "unclassified"))

#### Remove taxa present in < 5% of samples

In [71]:
remove_low_freq_taxa <- function(df, frac_presence) {
    n_original <- ncol(df)
    PA_df <- apply(df[, 2:ncol(df)], 2, function(x) {ifelse(x > 0, T, F)})
    frac_df <- apply(PA_df, 2, function(x) {sum(x) / nrow(PA_df)})
    to_keep <- names(frac_df[frac_df > frac_presence])
    to_keep <- c("npm_research_id", to_keep)
    print(str_glue("{length(to_keep)} / {n_original} taxa are present in {frac_presence} of samples"))
    return(df %>% select(all_of(to_keep)))
}

df_filt2 <- remove_low_freq_taxa(df_filt, 0.05)

1737 / 5202 taxa are present in 0.05 of samples


#### Convert to relative abundance

In [9]:
otu_to_RA <- function(df) {
    rownames(df) <- df$npm_research_id
    RA_df <- t(apply(df[, 2:ncol(df)], 1, function(x) {x / sum(x)}))
    
    return(as.data.frame(RA_df))
}

RA_df <- otu_to_RA(df_filt2)

#### Plot metadata barcharts

In [10]:
get_metadata_plots <- function(meta, meta_cols) {
    meta <- as.matrix(meta)
    meta <- as_tibble(meta, rownames = "sample")

    plots <- list()
    
    for (column in meta_cols) {
        plt <- meta %>%
            mutate(across(everything(), as.character)) %>%
            select(all_of(column)) %>%
            group_by_at(column) %>%
            summarise(n = n()) %>%
            ggplot(aes_string(x = column, y = "n", fill = column)) +
                geom_bar(stat = "identity") +
                theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                geom_text(aes_string(label = "n"))
        plots[[column]] <- plt
    }
    
    return(plots)
}


In [11]:
get_meta_cols <- function(meta) {
    meta_cols <- colnames(meta)[grep("kit|flow_cell|instrument_id|site_supplying", colnames(meta))]
    to_exclude <- c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id")
    meta_cols <- meta_cols[!(meta_cols %in% to_exclude)]
    return(meta_cols)
}

meta_cols <- get_meta_cols(meta)

In [12]:
print_group_freqs <- function(meta, meta_cols) {
    for (col in meta_cols) {
        tmp <- tibble(data.frame(meta)) %>%
            group_by(get(col)) %>%
            summarise(n = n())
        print(col)
        print(tmp)
    }
}

print_group_freqs(meta_filt, meta_cols)

[1] "site_supplying_sample"
[90m# A tibble: 7 × 2[39m
  `get(col)`     n
  [3m[90m<chr>[39m[23m      [3m[90m<int>[39m[23m
[90m1[39m GUSTO        200
[90m2[39m HELIOS       200
[90m3[39m MEC          200
[90m4[39m PRISM        200
[90m5[39m SERI         200
[90m6[39m SSMP          63
[90m7[39m TTSH         200
[1] "extraction_kit"
[90m# A tibble: 7 × 2[39m
  `get(col)`                                         n
  [3m[90m<chr>[39m[23m                                          [3m[90m<int>[39m[23m
[90m1[39m abGENIX Whole Blood Genomic DNA Extraction Kit   200
[90m2[39m Chemagic DNA Blood Kit (Perkin Elmer, MA)        164
[90m3[39m Maxwell RSC Blood DNA Kit (AS1400)               148
[90m4[39m QIAamp DNA Blood Mini Kit                        200
[90m5[39m QIAsymphony DSP DNA Midi Kit                     200
[90m6[39m QIAsymphony DSP DNA Mini Kit (192)                52
[90m7[39m Unknown                                          299
[1] "instrume

### Run differential prevalence test

In [13]:
batch1<- 0
batch2 <- 0
batch1 == batch2 & batch2 == 0

In [88]:
test_contam <- function(x, prev_threshold) {
    contam <- F

    for (i in seq(length(x) - 1)) {
        for (j in seq(i + 1, length(x))) {
#             print(str_glue("{i}-{j}"))
            batch1 <- x[i]
            batch2 <- x[j]
            
            # True if one batch is zero, false if both are zero
            if (batch1 == batch2 & batch2 == 0) {
                contam <- F
#                 print("both zero")
            } 
            else if (batch1 != batch2 & min(c(batch1, batch2)) == 0) {
                contam <- T
#                 print("one zero")
            } 
            else {
                diff_prev <- max(c(batch1, batch2)) /  min(c(batch1, batch2))
                
                if (diff_prev > prev_threshold) {
#                     print(str_glue("{batch1}, {batch2} diff prevalence in batch {i} and batch {j}, with diff of {diff_prev} fold"))
                    contam <- T
                } else {
#                     print(str_glue("{batch1}, {batch2}, diff_prev is {diff_prev} not diff prevalent"))
                }
            }
        }
    }
    return(contam)
}

#### Convert read count table to prevalence table

In [43]:
get_batch_prevalence <- function(x) {sum(x) / length(x)}

#### Decontamination

In [92]:
run_diff_prev <- function(df, meta, meta_cols, read_threshold, prev_threshold, min_samples) {

    morsels <- foreach (column = meta_cols) %do% {
#         read_threshold <- 0.05
#         prev_threshold <- 2
#         min_samples <- 2
#         df <- df_filt3
#         min_samples <- 10
#         meta <- meta_filt
#         column <- meta_cols[3]
        
        # Remove levels with < x samples
        tmp <- tibble(data.frame(meta)) %>%
            group_by(get(column)) %>%
            summarise(n = n())
        
        print(tmp)

        # Vector of group levels to keep
        to_keep <- tmp[tmp$n >= min_samples, "get(column)"]$`get(column)`
        to_keep <- to_keep[to_keep != "Unknown"]
        n_levels <- length(to_keep)
        
        if (n_levels < 2) {
            print(str_glue("After pruning, {column} has < 2 levels"))
        } else {
            print(str_glue("After pruning, {column} has {n_levels} levels"))
        }
        
        # Remove rows in metadata
        meta_filt <- meta %>% filter(get(column) %in% to_keep)
        
        # Retrieve rows
        df_filt <- retrieve_rows_from_meta(df, meta_filt) %>% column_to_rownames("npm_research_id")
        n_filt <- nrow(df_filt)
        print(str_glue("There are {n_filt} samples after pruning metadata levels"))
        
        # Convert abundance table to presence absence table
        df_filt[df_filt >= read_threshold] <- 1
        df_filt[df_filt < read_threshold] <- 0
        
        batch_prev <- bind_cols(df_filt, meta_filt) %>%
            group_by(get(column)) %>%
            summarise_if(is.numeric, get_batch_prevalence) %>%
            select(-any_of(colnames(meta_filt)))
        
        # Test batch effects
        morsel <- batch_prev %>%
            summarise_if(is.numeric, test_contam, prev_threshold) %>%
            mutate(meta_col = column)

        return(morsel)
    }
    
    result_df <- bind_rows(morsels)
    return(result_df)
}

#### Decontamination procedure for each subset

In [133]:
# decontaminate <- function(n) {
    # Load data
#     df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.S.tsv"))
    df <- load_data(str_glue("data/temp_files_9999/07_abundance_matrix/abundance_matrix.subset_9999.S.tsv"))
    meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)
    
    # Get metadata subset
    meta_filt <- subset_metadata(meta, 9999)

    # Filter data
    df_filt <- retrieve_rows_from_meta(df, meta_filt)
    df_filt2 <- remove_cols(df_filt, c("Homo sapiens", "unclassified"))
    df_filt3 <- remove_low_freq_taxa(df_filt2, frac_presence = 0.05)

    # Get metadata columns of interest
    meta_cols <- get_meta_cols(meta_filt)

    res <- run_diff_prev(df_filt3, meta_filt, meta_cols, read_threshold = 0.05, prev_threshold = 2 , min_samples = 5)

    
#     return(non_contaminants)
#     }


1747 / 5200 taxa are present in 0.05 of samples
[90m# A tibble: 7 × 2[39m
  `get(column)`     n
  [3m[90m<chr>[39m[23m         [3m[90m<int>[39m[23m
[90m1[39m GUSTO           968
[90m2[39m HELIOS         [4m2[24m285
[90m3[39m MEC            [4m2[24m787
[90m4[39m PRISM          [4m1[24m248
[90m5[39m SERI           [4m1[24m436
[90m6[39m SSMP             63
[90m7[39m TTSH            919
After pruning, site_supplying_sample has 7 levels
There are 9706 samples after pruning metadata levels
[90m# A tibble: 7 × 2[39m
  `get(column)`                                      n
  [3m[90m<chr>[39m[23m                                          [3m[90m<int>[39m[23m
[90m1[39m abGENIX Whole Blood Genomic DNA Extraction Kit  [4m2[24m281
[90m2[39m Chemagic DNA Blood Kit (Perkin Elmer, MA)       [4m1[24m040
[90m3[39m Maxwell RSC Blood DNA Kit (AS1400)              [4m2[24m117
[90m4[39m QIAamp DNA Blood Mini Kit                        919
[90m5[39m QIAs

In [147]:
parsed_res <- res %>%
  summarise(across(where(is.logical), ~ sum(.x)))

parsed_res <- as.data.frame(t(parsed_res)) %>%
    rownames_to_column("taxa")
non_contaminants <- parsed_res %>% filter(V1 < 1)
non_contaminants <- non_contaminants$taxa
non_contaminants

contaminants <- parsed_res %>% filter(V1 >= 1)
contaminants <- contaminants$taxa

In [176]:
nc_list <- foreach (non_contaminant_taxon = non_contaminants) %do% {
    morsel <- tibble()
    for (contaminant_taxon in contaminants) {
        spearman_test <- cor.test(RA_df[ ,contaminant_taxon], RA_df[, non_contaminant_taxon])
        rho <- spearman_test$estimate
        p_val <- spearman_test$p.value
        p_adj <- p.adjust(p_val, method = "BH", n = length(length(non_contaminants) * length(contaminants)))
#         print(str_glue("{non_contaminant_taxon} has correlation of {rho} with {contaminant_taxon}, p = {p_adj}"))
        morsel <- morsel %>% bind_rows(tibble(non_contaminant_taxon = non_contaminant_taxon, 
                                       contaminant_taxon = contaminant_taxon,
                                       rho = rho, p_adj = p_adj))
    }
    
    return(morsel)
}

corr_df <- bind_rows(nc_list)
corr_df

non_contaminant_taxon,contaminant_taxon,rho,p_adj
<chr>,<chr>,<dbl>,<dbl>
Bradyrhizobium sp. SK17,Paraburkholderia fungorum,-0.001853085,8.555871e-01
Bradyrhizobium sp. SK17,Paraburkholderia xenovorans,-0.035754965,4.436247e-04
Bradyrhizobium sp. SK17,Paraburkholderia hospita,-0.024092035,1.795919e-02
Bradyrhizobium sp. SK17,Ralstonia pickettii,-0.099585394,1.073470e-22
Bradyrhizobium sp. SK17,Ralstonia insidiosa,-0.059422862,5.183530e-09
Bradyrhizobium sp. SK17,Ralstonia mannitolilytica,-0.087992678,4.772082e-18
Bradyrhizobium sp. SK17,Ralstonia solanacearum,-0.079305597,6.141331e-15
Bradyrhizobium sp. SK17,Burkholderia cepacia,-0.034827650,6.227354e-04
Bradyrhizobium sp. SK17,Burkholderia sp. LA-2-3-30-S1-D2,-0.012663179,2.136027e-01
Bradyrhizobium sp. SK17,Burkholderia metallica,-0.030215298,2.995810e-03


In [187]:
contaminant_taxon <- "Paraburkholderia fungorum"
non_contaminant_taxon <- "Bradyrhizobium sp. SK17"
spearman_test <- cor.test(RA_df[ ,contaminant_taxon], RA_df[, non_contaminant_taxon])
rho <- spearman_test$estimate
p_val <- spearman_test$p.value
p_adj <- p.adjust(p_val, method = "BH", n = length(length(non_contaminants) * length(contaminants)))
spearman_test


	Pearson's product-moment correlation

data:  RA_df[, contaminant_taxon] and RA_df[, non_contaminant_taxon]
t = -0.182, df = 9646, p-value = 0.8556
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.02180671  0.01810201
sample estimates:
         cor 
-0.001853085 


In [185]:
corr_df

non_contaminant_taxon,contaminant_taxon,rho,p_adj
<chr>,<chr>,<dbl>,<dbl>
Bradyrhizobium sp. SK17,Paraburkholderia fungorum,-0.001853085,8.555871e-01
Bradyrhizobium sp. SK17,Paraburkholderia xenovorans,-0.035754965,4.436247e-04
Bradyrhizobium sp. SK17,Paraburkholderia hospita,-0.024092035,1.795919e-02
Bradyrhizobium sp. SK17,Ralstonia pickettii,-0.099585394,1.073470e-22
Bradyrhizobium sp. SK17,Ralstonia insidiosa,-0.059422862,5.183530e-09
Bradyrhizobium sp. SK17,Ralstonia mannitolilytica,-0.087992678,4.772082e-18
Bradyrhizobium sp. SK17,Ralstonia solanacearum,-0.079305597,6.141331e-15
Bradyrhizobium sp. SK17,Burkholderia cepacia,-0.034827650,6.227354e-04
Bradyrhizobium sp. SK17,Burkholderia sp. LA-2-3-30-S1-D2,-0.012663179,2.136027e-01
Bradyrhizobium sp. SK17,Burkholderia metallica,-0.030215298,2.995810e-03


In [210]:
sum(apply(RA_df, 2, function(x) {sum(x > 0, na.rm = T)}) > 0.5* 9706)

In [216]:
sum(apply(RA_df, 2, median, na.rm = T) > 0)

In [192]:
test <- corr_df %>% 
    filter(rho > 0 & p_adj < 0.05)
length(unique(test$non_contaminant_taxon))

In [None]:
result_df %>%
    separate(taxa, into = c("taxa", NA), remove = T, sep = "\\.\\.") %>%
    mutate(diff_abn = ifelse(kw.eBH < 0.05, T, F)) %>%
    pivot_wider(names_from = meta_col, values_from = diff_abn, id_cols = taxa)

hm <- result_df %>%
    separate(taxa, into = c("taxa", NA), remove = T, sep = "\\.\\.") %>%
    mutate(diff_abn = ifelse(kw.eBH < 0.05, T, F)) %>%
    ggplot(aes(x = meta_col, y = taxa, fill = diff_abn)) +
    geom_tile() +
    theme(axis.text.y = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Batch Info", y = "Species", fill = "Contaminant?")
hm

hm2 <- result_df %>%
    separate(taxa, into = c("taxa", NA), remove = T, sep = "\\.\\.") %>%
    mutate(diff_abn = ifelse(kw.ep < 0.05, T, F)) %>%
    ggplot(aes(x = meta_col, y = taxa, fill = diff_abn)) +
    geom_tile() +
    theme(axis.text.y = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Batch Info", y = "Species", fill = "Contaminant?")
hm2

# ggsave("results/decontamination/ANCOMBC_decontamination_heatmap.n9606.100821.png", dpi = 300)

In [None]:
# fwrite(result_df, "results/decontamination/ANCOMBC_decontamination_results.n9706.100821.csv")

In [None]:
dawdawdawdwadwa####################################################

### Explore metadata

In [None]:
n_subset <- 100

# Load data
df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.S.tsv"))

# Retain relevant metadata
meta <- parse_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)

# Get metadata columns of interest
meta_cols <- get_meta_cols(meta)

meta <- as.matrix(meta)
meta <- as_tibble(meta, rownames = "sample")

plots <- list()

for (column in meta_cols) {
    plt <- meta %>%
        mutate(across(everything(), as.character)) %>%
        select(all_of(column)) %>%
        group_by_at(column) %>%
        summarise(n = n()) %>%
        ggplot(aes_string(x = column, y = "n", fill = column)) +
            geom_bar(stat = "identity") +
            theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
            geom_text(aes_string(label = "n"))
    plots[[column]] <- plt
}

plots


### MAIN

In [None]:
# nc_list <- list()

# for (i in c(100)) {
#     nc_list[[str_glue("subset_{i}")]] <- decontaminate(i)
# }

# nc_list

### Sanity check

In [None]:
# Reduce(intersect, nc_list)

In [None]:
plot_sanity_plot <- function(X, Y, tax) {
    print(tax)
    temp_plt <- cbind(X, Y) %>%
        mutate(log_abundance = log(get(tax) + 0.0001)) %>%
        ggplot(aes(x = site_supplying_sample, y = get(tax), fill = site_supplying_sample)) + 
        geom_point() +
        geom_boxplot(alpha = 0.5) +
        labs(x = "Source cohort", y = paste(tax, "% abundance")) +
        theme(legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1),
          text = element_text(size = 15))
    return(temp_plt)
}

In [None]:
# Load data
taxon <- "Cutibacterium acnes"
df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.S.tsv"))

# Remove homo and unclassified
col_to_exclude <- colnames(df)[grepl("Homo|unclassified", colnames(df))]
df <- remove_cols(df, col_to_exclude)

# Retain relevant metadata
meta <- parse_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)

# Normalise to RA
normalise <- function(df) {
    RA_df <- t(apply(df, 1, function(x) {x / sum(x) * 100}))
    return(as_tibble(RA_df))
}

species_df_RA <- normalise(df)

plot_sanity_plot(df, meta, taxon)
print(str_glue("Max no. of reads = {max(df[, taxon])}"))
print(str_glue("Mean no. of reads = {mean(df[, taxon])}"))

In [None]:
l1 <- tibble(l1)

fwrite(l1, str_glue("/home/projects/14001280/PROJECTS/blood_microbiome/results/decontamination/decontaminated_list.n{n_subset}.csv"))