# iRep (bPTR) and coverage breadth analysis

In [1]:
rm(list = ls())
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
require("Rsamtools")
require(Biostrings)
require(tidyverse)
require(data.table)
require(foreach)
require(doParallel)
require(ape)
registerDoParallel(cores = 10)

Loading required package: Rsamtools

Loading required package: GenomeInfoDb

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

L

In [2]:
species_df <- fread("results/decontamination/read_matrix.raw.zeroed.csv")

In [3]:
files <- list.files("results/irep_analysis/raw_output/irep_out", full.names = T)
files <- files[grepl(".tsv", files)]

morsels <- foreach(file = files) %do% {
    temp_df <- read.csv(file, sep = "\t", check.names = F)
    colnames(temp_df)[1] <- "ref_path"
    temp_df %>% 
        mutate(across(c(4:ncol(temp_df)), function(x) (as.numeric(ifelse(x == "n/a", NA, x)))),
               across(c(1:3), as.character)) %>%
        pivot_longer(!all_of(c("ref_path", "ORI", "TER")), names_to = "prefix", values_to = "bPTR") %>%
        separate(ref_path, sep = "/", into = c(rep(NA, 11), "file_name")) %>%
        separate(file_name, sep = "_", into = c("genus", "species", "suffix1", "suffix2"), remove = F) %>%
        separate(prefix, into = c(rep(NA, 6), "id"), sep = "/") %>%
        separate(id, into = c("npm_research_id"), sep = "_") %>%
        mutate(prefix = gsub(".fasta", "", file_name))
}

irep_df <- bind_rows(morsels) 


“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 4 rows [1, 2, 3, 4].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 4 pieces. Additional pieces discarded in 3 rows [1, 2, 3].”
“Expected 1 pieces. A

## Parse coverage breadth

In [4]:
bam_path <- "results/irep_analysis/raw_output/bam_files"
ref_path <- "data/irep_data/genome_references"
ref_files <- list.files(ref_path, ".fasta", full.names = F)
prefixes <- gsub(".fasta", "", ref_files)
prefixes

In [5]:
morsels <- foreach (i = seq(length(prefixes))) %dopar% {
    prefix <- prefixes[i]
    bams <- list.files(str_glue("{bam_path}/{prefix}"))
    bams <- bams[!grepl(".bai", bams)]

    crumbs <- foreach (j = seq(length(bams))) %do% {
        bam <- bams[j]
        id <- str_split(bam, "_")[[1]][1]
        ref <- readDNAStringSet(str_glue("{ref_path}/{prefixes[i]}.fasta"))
        ref_length <- width(ref)
        params <- PileupParam(distinguish_strands = F, distinguish_nucleotides = F)
        pile_df <- pileup(str_glue("{bam_path}/{prefix}/{bam}"), pileupParam = params)
        
        n_covered1 <- pile_df %>%
            filter(count >= 1) %>%
            nrow()
        
        n_covered5 <- pile_df %>%
            filter(count >= 5) %>%
            nrow()

        perc_covered1 <- n_covered1 / ref_length * 100
        perc_covered5 <- n_covered5 / ref_length * 100
        
        taxon <- str_split(prefix, "_")[[1]]
            read_count <- species_df %>% 
                filter(npm_research_id == id) %>%
                select(contains(taxon[1])) %>%
                select(contains(taxon[2])) %>%
                select(!contains("monkey"))

        if (ncol(read_count) > 1) {
            read_count <- read_count %>% 
                select(contains(taxon[3]))
            if (ncol(read_count) > 1) {
            read_count <- read_count %>% 
                select(contains(paste0(" ", taxon[4])))
            }
        }
        
        if (all(dim(read_count) == 1)) {
            read_count <- deframe(read_count)
        } else {
            read_count <- NA
        }
        
        return(tibble(prefix = prefix, 
                      npm_research_id = id,
                      pairs_assigned = read_count,
                      perc_covered1 = perc_covered1,
                      perc_covered5 = perc_covered5))
    }
    
    return(bind_rows(crumbs))
}

cov_df <- bind_rows(morsels)

### Add number of human or microbial reads mapped

In [6]:
microbe_df <- fread("results/irep_analysis/raw_output/microbial_mapped_read_counts.csv") %>%
    rename(microbe_pairs_mapped = pairs_mapped)
human_df <- fread("results/irep_analysis/raw_output/human_mapped_read_counts.csv") %>%
    rename(human_pairs_mapped = pairs_mapped)

### Add microbe to human cell ratio
Estimates from https://bmcresnotes.biomedcentral.com/articles/10.1186/s13104-019-4137-z/tables/2
Leucocyte count from https://pubmed.ncbi.nlm.nih.gov/15782774/

In [7]:
human_length <- 6320012150
human_gc <- 40.89
leuco_per_ul <- 6750

In [8]:
ref_paths <- paste0(ref_path, "/", ref_files)

length_morsels <- foreach(prefix_name = prefixes) %do% {
    microbe_ref_path <- ref_paths[grepl(prefix_name, ref_paths)]
    microbe_ref <- read.FASTA(microbe_ref_path)
    microbe_length <- length(as.character(microbe_ref)[[1]])

    tibble(prefix = prefix_name, 
           microbe_length = microbe_length)
}

length_df <- bind_rows(length_morsels)

In [52]:
merged_df <- cov_df %>%
    left_join(microbe_df) %>%
    left_join(human_df %>% distinct(.keep_all = T)) %>%
    left_join(irep_df) %>%
    left_join(length_df) %>%
    relocate(microbe_pairs_mapped, human_pairs_mapped, .after = 3) %>%
    mutate(microbe_pairs_mapped = microbe_pairs_mapped / 2,
           human_pairs_mapped = human_pairs_mapped / 2) %>%
    mutate(microbe_count = (microbe_pairs_mapped / microbe_length) / (human_pairs_mapped / human_length) * leuco_per_ul) 

merged_df

Joining, by = c("prefix", "npm_research_id")
Joining, by = "npm_research_id"
Joining, by = c("prefix", "npm_research_id")
Joining, by = "prefix"


prefix,npm_research_id,pairs_assigned,microbe_pairs_mapped,human_pairs_mapped,perc_covered1,perc_covered5,file_name,genus,species,suffix1,suffix2,ORI,TER,bPTR,microbe_length,microbe_count
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<dbl>
Achromobacter_xylosoxidans_NZ_CP043820.1,1116-0023,430178,429196,348366222,79.292320359,7.450282e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,8.208410e+03
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3658,385855,438419,364361186,78.389787758,7.366085e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,8.016719e+03
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3674,459561,494858,327626517,78.805094251,7.456206e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,1.006331e+04
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3686,541795,599335,344250628,79.082105806,7.525644e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,1.159937e+04
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3734,688629,769978,370885795,79.583059893,7.609348e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,1.383176e+04
Acinetobacter_baumannii_NZ_CP043953.1,0116-0053,7673,28842,364796907,79.329072139,1.697212e+01,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,1.896385,3972439,8.490606e+02
Acinetobacter_baumannii_NZ_CP043953.1,WHB4240,168,210,365572623,1.316042864,1.963529e-03,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,6.168933e+00
Acinetobacter_baumannii_NZ_CP043953.1,WHB4251,183,233,284142227,1.460462955,4.782956e-04,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,8.806120e+00
Acinetobacter_baumannii_NZ_CP043953.1,WHB4438,379,453,408444066,2.824813672,3.373242e-03,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,1.191050e+01
Acinetobacter_baumannii_NZ_CP043953.1,WHB4828,190,231,371860124,1.642920130,0.000000e+00,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,6.671090e+00


In [53]:
parsed_df <- merged_df %>% 
    left_join(ori_df)
parsed_df
fwrite(merged_df, "results/irep_analysis/coverage_irep_results.raw.csv")

Joining, by = c("prefix", "ORI", "TER")


prefix,npm_research_id,pairs_assigned,microbe_pairs_mapped,human_pairs_mapped,perc_covered1,perc_covered5,file_name,genus,species,suffix1,suffix2,ORI,TER,bPTR,microbe_length,microbe_count
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<dbl>
Achromobacter_xylosoxidans_NZ_CP043820.1,1116-0023,430178,429196,348366222,79.292320359,7.450282e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,8.208410e+03
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3658,385855,438419,364361186,78.389787758,7.366085e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,8.016719e+03
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3674,459561,494858,327626517,78.805094251,7.456206e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,1.006331e+04
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3686,541795,599335,344250628,79.082105806,7.525644e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,1.159937e+04
Achromobacter_xylosoxidans_NZ_CP043820.1,WHB3734,688629,769978,370885795,79.583059893,7.609348e+01,Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,,6402982,1.383176e+04
Acinetobacter_baumannii_NZ_CP043953.1,0116-0053,7673,28842,364796907,79.329072139,1.697212e+01,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,1.896385,3972439,8.490606e+02
Acinetobacter_baumannii_NZ_CP043953.1,WHB4240,168,210,365572623,1.316042864,1.963529e-03,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,6.168933e+00
Acinetobacter_baumannii_NZ_CP043953.1,WHB4251,183,233,284142227,1.460462955,4.782956e-04,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,8.806120e+00
Acinetobacter_baumannii_NZ_CP043953.1,WHB4438,379,453,408444066,2.824813672,3.373242e-03,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,1.191050e+01
Acinetobacter_baumannii_NZ_CP043953.1,WHB4828,190,231,371860124,1.642920130,0.000000e+00,Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,,3972439,6.671090e+00


In [54]:
ori_df <- merged_df %>%
    select(prefix, ORI, TER) %>%
    filter(ORI != "0" & TER != "0") %>%
    distinct()

In [55]:
summarised_df <- merged_df %>%
    group_by(prefix) %>%
    summarise(mean_perc_covered1 = mean(perc_covered1),
              mean_perc_covered5 = mean(perc_covered5),
              min_perc_covered1 = min(perc_covered1),
              max_perc_covered1 = max(perc_covered1),
              max_bPTR = max(bPTR, na.rm = T),
              max_microbe_count = max(microbe_count)) %>%
    mutate(max_bPTR = ifelse(max_bPTR == -Inf, NA, max_bPTR)) %>%
    left_join(ori_df) %>%
    separate(prefix, into = c("genus", "species", "suffix1", "suffix2"), sep = "_", remove = F) %>%
    mutate(taxa = ifelse(grepl("Torque", prefix), 
                         paste(genus, species, suffix1, suffix2), 
                         paste(genus, species, suffix1)), .before = 1) %>%
    mutate(taxa = gsub(" NZ| NC", "", taxa)) %>%
    mutate(taxa = gsub("Microbacterium", "Microbacterium sp.", taxa)) %>%
    mutate(taxa = gsub("Prevotella oral taxon", "Prevotella sp. oral taxon 299", taxa)) %>%
    mutate(taxa = gsub("Rickettsia", "Rickettsia sp.", taxa)) %>%
    select(-genus, -species, -suffix1, -suffix2) %>%
    arrange(desc(mean_perc_covered1))


fwrite(summarised_df, "results/irep_analysis/coverage_irep_results.parsed.csv")

summarised_df

“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
Joining, by = "prefix"
“Expected 4 pieces. Additio

taxa,prefix,mean_perc_covered1,mean_perc_covered5,min_perc_covered1,max_perc_covered1,max_bPTR,max_microbe_count,ORI,TER
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
Human betaherpesvirus 6B,Human_betaherpesvirus_6B_NC_000898.1,98.48291943,95.44136,98.3412907,98.6928951,,6952.852,,
Human betaherpesvirus 6A,Human_betaherpesvirus_6A_NC_001664.4,96.76555108,90.96362,96.63755349,96.982645,1.282505,11117.46,44954.0,132612.0
Achromobacter xylosoxidans,Achromobacter_xylosoxidans_NZ_CP043820.1,79.03047361,74.81513,78.38978776,79.5830599,,13831.76,,
Hepatitis B virus,Hepatitis_B_virus_genB_JX661488.1,76.39191291,44.86262,61.33748056,98.755832,,249634.1,,
Pseudomonas mendocina,Pseudomonas_mendocina_NZ_CP013124.1,71.41336595,67.39544,71.12799127,71.7810391,,19889.41,,
Cutibacterium acnes,Cutibacterium_acnes_NC_021085.1,36.15403798,3.473737,15.33223211,77.4795425,,1763.105,,
Rickettsia sp. Tillamook 23,Rickettsia_Tillamook_23_NZ_CP060138.1,31.85980557,2.757348,0.35879756,63.3608136,1.34666,1197.889,240832.0,888370.0
Alcaligenes faecalis,Alcaligenes_faecalis_NZ_CP013119.1,28.02434056,0.5987024,23.36010861,31.7478145,,363.4384,,
Torque teno virus 6,Torque_teno_virus_6_NC_014094.1,18.83940621,17.05803,16.08636977,21.5924426,,4477.293,,
Neisseria subflava,Neisseria_subflava_NZ_CP039887.1,17.89799782,5.411104,1.20911307,81.7316806,1.506405,1882.095,1883368.0,797590.0
