# QC for pre-processing steps

In [1]:
# Resolve all relative data paths used below against the project root.
# NOTE(review): hard-coded absolute path; adjust when running elsewhere.
setwd("/home/projects/14001280/PROJECTS/blood_microbiome/")

# library() stops immediately if a package is missing, whereas require() only
# warns and returns FALSE, deferring the failure to a later, unrelated cell.
library(tidyverse)
library(ggplot2)
library(data.table)
library(ShortRead)

Loading required package: tidyverse

“Your system is mis-configured: ‘/etc/localtime’ is not a symlink”
“It is strongly recommended to set envionment variable TZ to ‘Asia/Singapore’ (or equivalent)”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.4     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    mas

In [2]:
# Load the parsed sample metadata table (path is relative to the working
# directory set in the first cell).
meta <- fread("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv")

In [3]:
# Preview the first rows of the metadata to check that it was parsed as expected.
head(meta)

npm_research_id,multiplex_pool_id,supplier_id,gis_internal_sample_id,site_supplying_sample,year_of_birth,supplied_gender,self_reported_ethnicity,extraction_kit,date_of_dna_extraction,⋯,supplied_and_computed_gender_match,sop_agreement_for_coverage__14/_28_for_15x/30x,sample_life_cycle,current,source_cohort,original_supplier_id,duplicate_info,duplicate,duplicate_pair,removal_requested_by_supplier
<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
WHB10000,MUX9693,358,HELIOS_00358_1,HELIOS,1982,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,6/6/2019,⋯,Y,Y,HELIOS VCF,Y,HELIOS,358,Unknown,PASS,Unknown,Unknown
WHB10001,MUX9693,359,HELIOS_00359_1,HELIOS,1979,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,6/6/2019,⋯,Y,Y,HELIOS VCF,Y,HELIOS,359,Unknown,PASS,Unknown,Unknown
WHB10002,MUX9695,360,HELIOS_00360_1,HELIOS,1970,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,6/6/2019,⋯,Y,N,HELIOS VCF,Y,HELIOS,360,Unknown,PASS,Unknown,Unknown
WHB10003,MUX9694,361,HELIOS_00361_1,HELIOS,1969,M,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,6/6/2019,⋯,Y,Y,HELIOS VCF,Y,HELIOS,361,Unknown,PASS,Unknown,Unknown
WHB10005,MUX9694,363,HELIOS_00363_1,HELIOS,1959,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,6/6/2019,⋯,Y,N,HELIOS VCF,Y,HELIOS,363,Unknown,PASS,Unknown,Unknown
WHB10006,MUX9694,364,HELIOS_00364_1,HELIOS,1955,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,6/6/2019,⋯,Y,Y,HELIOS VCF,Y,HELIOS,364,Unknown,PASS,Unknown,Unknown


### Get intermediate file directories for all pre-processing steps

In [4]:
# Enumerate every directory under the intermediate-file root. With these
# settings (the defaults, spelled out) the result contains the root itself and
# all nested sub-directories, each returned as a path prefixed with the root.
steps <- list.dirs(
    path = "data/temp_files_100",
    full.names = TRUE,
    recursive = TRUE
)

### Iterate through the pre-processing step directories and their FASTQ/BAM files

In [None]:
# Build `df`: one row per file found in each pre-processing step directory,
# recording the step, the sample ID parsed from the file name, the file size
# and, for FASTQ files, the number of reads.
#
# Rows are collected in a list and bound once at the end; growing the tibble
# with bind_rows() inside the loop copies the whole table on every iteration.
rows <- list()

for (step in steps) {
    # Only directories whose path contains "_fastq" or "_bam" are processed
    if (!grepl("_fastq|_bam", step)) {
        next
    }

    # Third path component, e.g. "data/temp_files_100/01_fastq" -> "01_fastq"
    step_name <- unlist(strsplit(step, "/"))[3]
    file_names <- list.files(step)
    print(paste("pre-processing step:", step))

    for (file in file_names) {
        file_path <- file.path(step, file)

        # Get file size
        file_size <- file.size(file_path)

        # Second dot-delimited field of the file name
        # NOTE(review): assumes names look like "<prefix>.<npm_research_id>.<...>"
        npm_research_id <- unlist(strsplit(file, "\\."))[2]

        # Get no. of reads (4 lines per FASTQ record).
        # The check is a literal match on the file name: a regex ".fastq" on
        # the full path would also match the "_fastq" directory component and
        # treat every file in a *_fastq directory as FASTQ.
        if (grepl(".fastq", file, fixed = TRUE)) {
            n_reads <- unname(countLines(file_path)) / 4
        } else {
            n_reads <- NA_real_
        }

        rows[[length(rows) + 1]] <- tibble(step_name = step_name,
                                           npm_research_id = npm_research_id,
                                           file_size = file_size,
                                           file_name = file,
                                           n_reads = n_reads)
    }
}

df <- bind_rows(rows)

df

### Files that failed CRAM to fastq extraction
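These files do not have any unmapped reads/mates, so the extraction step leaves nothing in the output. The cell below counts, for each step, the files whose size is non-zero.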

In [None]:
# For each pre-processing step, print how many of its files are non-empty
# (file size other than zero). Rows with a missing step_name never match the
# equality test, so they are excluded as well.
for (i in c("01_bam", "01_fastq", "02_fastq", "03_fastq", "04_fastq")) {
    non_empty <- filter(df, step_name == i, file_size != 0)
    n <- nrow(non_empty)
    print(sprintf("%s: %d", i, n))
}

### Plot read count distributions

In [None]:
# Display labels for the FASTQ-producing steps; any step name not listed here
# maps to NA.
stage_labels <- c("01_fastq" = "01_raw",
                  "02_fastq" = "02_trim",
                  "03_fastq" = "03_filter",
                  "04_fastq" = "04_low_complexity")

# Keep files from steps 01-04 with a non-zero read count, relabel the steps,
# put read counts on a log10 scale, then drop the step-04 label from the plot.
read_counts <- df %>%
    filter(grepl("01|02|03|04", step_name)) %>%
    filter(n_reads != 0) %>%
    mutate(step_name = unname(stage_labels[step_name]),
           n_reads = log(n_reads, base = 10)) %>%
    filter(!grepl("04", step_name))

# Overlaid density of log10 read counts, one curve per step
ggplot(read_counts, aes(x = n_reads, fill = step_name)) +
    geom_density(alpha = 0.4) +
    scale_fill_discrete() +
    labs(x = "lg(Read Count)", y = "Density")
