# iRep (bPTR) and coverage breadth analysis

In [9]:
rm(list = ls())
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
require("Rsamtools")
require(Biostrings)
require(tidyverse)
require(data.table)
require(foreach)
require(doParallel)
registerDoParallel(cores = 10)

In [10]:
files <- list.files("results/irep_analysis/raw_output/irep_out", full.names = T)
files <- files[grepl(".tsv", files)]

morsels <- foreach(file = files) %do% {
    temp_df <- read.csv(file, sep = "\t", check.names = F)
    colnames(temp_df)[1] <- "ref_path"
    temp_df %>% 
        mutate(across(c(4:ncol(temp_df)), function(x) (as.numeric(ifelse(x == "n/a", NA, x)))),
               across(c(1:3), as.character)) %>%
        pivot_longer(!all_of(c("ref_path", "ORI", "TER")), names_to = "prefix", values_to = "bPTR") %>%
        separate(ref_path, sep = "/", into = c(rep(NA, 11), "file_name")) %>%
        separate(file_name, sep = "_", into = c("genus", "species", "suffix1", "suffix2"), remove = F) %>%
        separate(prefix, into = c(rep(NA, 6), "id"), sep = "/") %>%
        separate(id, into = c("npm_research_id"), sep = "_") %>%
        mutate(prefix = gsub(".fasta", "", file_name))
}

irep_df <- bind_rows(morsels) 


“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 1 pieces. Additional pieces discarded in 4 rows [1, 2, 3, 4].”
“Expected 1 pieces. Additional pieces discarded in 5 rows [1, 2, 3, 4, 5].”
“Expected 4 pieces. Additional pieces discarded in 3 rows [1, 2, 3].”
“Expected 1 pieces. Additional pieces discarded in 3 rows [1, 2, 3].”
“Expected 4 pieces. Additio

## Parse coverage breadth

In [11]:
irep_df

file_name,genus,species,suffix1,suffix2,ORI,TER,npm_research_id,bPTR,prefix
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,1116-0023,,Achromobacter_xylosoxidans_NZ_CP043820.1
Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,WHB3658,,Achromobacter_xylosoxidans_NZ_CP043820.1
Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,WHB3674,,Achromobacter_xylosoxidans_NZ_CP043820.1
Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,WHB3686,,Achromobacter_xylosoxidans_NZ_CP043820.1
Achromobacter_xylosoxidans_NZ_CP043820.1.fasta,Achromobacter,xylosoxidans,NZ,CP043820.1.fasta,0,0,WHB3734,,Achromobacter_xylosoxidans_NZ_CP043820.1
Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,0116-0053,1.896385,Acinetobacter_baumannii_NZ_CP043953.1
Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,WHB4240,,Acinetobacter_baumannii_NZ_CP043953.1
Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,WHB4251,,Acinetobacter_baumannii_NZ_CP043953.1
Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,WHB4438,,Acinetobacter_baumannii_NZ_CP043953.1
Acinetobacter_baumannii_NZ_CP043953.1.fasta,Acinetobacter,baumannii,NZ,CP043953.1.fasta,3869401,1967227,WHB4828,,Acinetobacter_baumannii_NZ_CP043953.1


In [12]:
bam_path <- "results/irep_analysis/raw_output/bam_files"
ref_path <- "data/irep_data/genome_references"
prefixes <- list.files(ref_path, ".fasta", full.names = F)
prefixes <- gsub(".fasta", "", prefixes)
prefixes <- prefixes

In [13]:
morsels <- foreach (i = seq(length(prefixes))) %dopar% {
    prefix <- prefixes[i]
    bams <- list.files(str_glue("{bam_path}/{prefix}"))
    bams <- bams[!grepl(".bai", bams)]

    crumbs <- foreach (j = seq(length(bams))) %do% {
        bam <- bams[j]
        id <- str_split(bam, "_")[[1]][1]
        ref <- readDNAStringSet(str_glue("{ref_path}/{prefixes[i]}.fasta"))
        ref_length <- width(ref)
        params <- PileupParam(distinguish_strands = F, distinguish_nucleotides = F)
        pile_df <- pileup(str_glue("{bam_path}/{prefix}/{bam}"), pileupParam = params)
        
        n_covered1 <- pile_df %>%
            filter(count >= 1) %>%
            nrow()
        
        n_covered5 <- pile_df %>%
            filter(count >= 5) %>%
            nrow()

        perc_covered1 <- n_covered1 / ref_length * 100
        perc_covered5 <- n_covered5 / ref_length * 100
        
        return(tibble(prefix = prefix, 
                      npm_research_id = id,
                      perc_covered1 = perc_covered1,
                      perc_covered5 = perc_covered5))
    }
    
    return(bind_rows(crumbs))
}

cov_df <- bind_rows(morsels)

### Merged results

In [14]:
merged_df <- cov_df %>%
    left_join(irep_df) %>%
    arrange(desc(perc_covered1))

merged_df

fwrite(merged_df, "results/irep_analysis/coverage_irep_results.raw.csv")

Joining, by = c("prefix", "npm_research_id")


prefix,npm_research_id,perc_covered1,perc_covered5,file_name,genus,species,suffix1,suffix2,ORI,TER,bPTR
<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
Hepatitis_B_virus_genB_JX661488.1,WHH4833,98.75583,91.10420,Hepatitis_B_virus_genB_JX661488.1.fasta,Hepatitis,B,virus,genB,0,0,
Human_betaherpesvirus_6B_NC_000898.1,WHB4851,98.69290,96.78560,,,,,,,,
Human_betaherpesvirus_6B_NC_000898.1,WHB4247,98.52326,95.91830,,,,,,,,
Human_betaherpesvirus_6B_NC_000898.1,WHB3815,98.42950,95.99418,,,,,,,,
Human_betaherpesvirus_6B_NC_000898.1,WHB5092,98.42765,95.58891,,,,,,,,
Human_betaherpesvirus_6B_NC_000898.1,WHH836,98.34129,92.91980,,,,,,,,
Human_betaherpesvirus_6A_NC_001664.4,WHH1669,96.98265,93.26946,Human_betaherpesvirus_6A_NC_001664.4.fasta,Human,betaherpesvirus,6A,NC,44954,132612,1.106910
Human_betaherpesvirus_6A_NC_001664.4,WHB7185,96.85778,86.94425,Human_betaherpesvirus_6A_NC_001664.4.fasta,Human,betaherpesvirus,6A,NC,44954,132612,1.015876
Human_betaherpesvirus_6A_NC_001664.4,WHH1879,96.68524,93.85863,Human_betaherpesvirus_6A_NC_001664.4.fasta,Human,betaherpesvirus,6A,NC,44954,132612,1.031950
Human_betaherpesvirus_6A_NC_001664.4,WHB4439,96.66453,93.55244,Human_betaherpesvirus_6A_NC_001664.4.fasta,Human,betaherpesvirus,6A,NC,44954,132612,1.068195


In [15]:
ori_df <- merged_df %>%
    select(prefix, ORI, TER) %>%
    filter(ORI != "0" & TER != "0") %>%
    distinct()

In [17]:
summarised_df <- merged_df %>%
    group_by(prefix) %>%
    summarise(mean_perc_covered1 = mean(perc_covered1),
              mean_perc_covered5 = mean(perc_covered5),
              min_perc_covered1 = min(perc_covered1),
              max_perc_covered1 = max(perc_covered1),
              max_bPTR = max(bPTR, na.rm = T)) %>%
    mutate(max_bPTR = ifelse(max_bPTR == -Inf, NA, max_bPTR)) %>%
    left_join(ori_df) %>%
    separate(prefix, into = c("genus", "species", "suffix1", "suffix2"), sep = "_", remove = F) %>%
    mutate(taxa = ifelse(grepl("Torque", prefix), 
                         paste(genus, species, suffix1, suffix2), 
                         paste(genus, species, suffix1)), .before = 1) %>%
    mutate(taxa = gsub(" NZ| NC", "", taxa)) %>%
    mutate(taxa = gsub("Microbacterium", "Microbacterium sp.", taxa)) %>%
    mutate(taxa = gsub("Rickettsia", "Rickettsia sp.", taxa)) %>%
    select(-prefix, -genus, -species, -suffix1, -suffix2) %>%
    arrange(desc(mean_perc_covered1))


fwrite(summarised_df, "results/irep_analysis/coverage_irep_results.parsed.csv")

summarised_df

“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”
Joining, by = "prefix"
“Expected 4 pieces. Additio

taxa,mean_perc_covered1,mean_perc_covered5,min_perc_covered1,max_perc_covered1,max_bPTR,ORI,TER
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
Human betaherpesvirus 6B,98.48291943,95.44136,98.3412907,98.6928951,,,
Human betaherpesvirus 6A,96.76555108,90.96362,96.63755349,96.982645,1.282505,44954.0,132612.0
Microbacterium sp. Nx66,93.03599611,58.78693,90.96880688,94.1362366,,,
Achromobacter xylosoxidans,79.03047361,74.81513,78.38978776,79.5830599,,,
Hepatitis B virus,76.39191291,44.86262,61.33748056,98.755832,,,
Pseudomonas mendocina,71.41336595,67.39544,71.12799127,71.7810391,,,
Cutibacterium acnes,36.15403798,3.473737,15.33223211,77.4795425,,,
Rickettsia sp. Tillamook 23,31.85980557,2.757348,0.35879756,63.3608136,1.34666,240832.0,888370.0
Alcaligenes faecalis,28.02434056,0.5987024,23.36010861,31.7478145,,,
Microbacterium sp. PM5,23.38667772,10.93712,8.76017757,68.7207383,1.111539,2347566.0,744493.0
