In [2]:
# Notebook setup.
# The working directory is pinned so that the relative "data/..." paths used
# by the functions below resolve from this directory.
# NOTE(review): this absolute path is machine-specific — adjust it (or remove
# the call) when running the notebook anywhere else.
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")
# Attach the packages used in the cells below.
# NOTE(review): require() only warns and returns FALSE when a package is
# missing; library() would fail fast instead — consider switching.
require(foreach)
require(tidyverse)
require(ggplot2)
require(data.table)

Loading required package: foreach

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.3     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32maccumulate()[39m masks [34mforeach[39m::accumulate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mwhen()[39m       masks [34

In [3]:
# Read an abundance table from `file_path` and derive the sample identifier.
#
# The file is read with fread() and converted to a plain data.frame. Its
# `sample` column is then split on literal dots: the first piece is discarded
# (NA in `into`) and the second piece is stored as `npm_research_id`.
#
# Returns the resulting data.frame.
load_data <- function(file_path) {
    raw_table <- fread(file_path)
    abundance <- as.data.frame(raw_table)
    separate(
        abundance,
        sample,
        into = c(NA, "npm_research_id"),
        sep = "\\."
    )
}


# Read the sample metadata from `file_path` and restrict it to the samples
# present in `df`.
#
# Steps, in order:
#   1. read the file, treating empty strings as missing values;
#   2. keep only rows whose npm_research_id also occurs in df$npm_research_id;
#   3. drop the `removal_requested_by_supplier` column;
#   4. overwrite every remaining missing value with the string "unknown".
#
# Returns the filtered metadata table.
load_metadata <- function(file_path, df) {
    fread(file_path, na.strings=c("", NA)) %>%
        filter(npm_research_id %in% df$npm_research_id) %>%
        select(-removal_requested_by_supplier) %>%
        replace(is.na(.), "unknown")
}


# Downsample the metadata to at most `n_subset` samples per supplying site.
#
# For every distinct value of `site_supplying_sample`, the matching
# npm_research_id values are collected. Sites with more than `n_subset` ids
# contribute a random draw of `n_subset` of them (via sample(), so the result
# depends on the RNG state); smaller sites contribute all of their ids.
#
# Returns the rows of `meta` (as a data.frame) whose npm_research_id was picked.
subset_metadata <- function(meta, n_subset) {
    meta <- as.data.frame(meta)
    subset_vec <- c()

    for (site in unique(meta$site_supplying_sample)) {
        site_ids <- meta$npm_research_id[meta$site_supplying_sample == site]

        # Only sites exceeding the cap are randomly thinned
        picked <- if (length(site_ids) > n_subset) {
            sample(site_ids, n_subset)
        } else {
            site_ids
        }
        subset_vec <- c(subset_vec, picked)
    }

    filter(meta, npm_research_id %in% subset_vec)
}


# Keep only the rows of `df` whose npm_research_id also appears in `meta`.
retrieve_rows_from_meta <- function(df, meta) {
    filter(df, npm_research_id %in% meta$npm_research_id)
}


# Drop the columns named in `col_to_exclude` from `df`.
# all_of() makes the call fail if any requested column is absent.
remove_cols <- function(df, col_to_exclude) {
    select(df, -all_of(col_to_exclude))
}


# Keep only the taxa that are detected in a large enough share of samples.
#
# Arguments:
#   df            data.frame with an `npm_research_id` column plus one numeric
#                 column per taxon.
#   frac_presence threshold; a taxon is kept when the fraction of samples with
#                 a value > 0 is strictly greater than this number.
#
# Prints how many taxa passed the filter and returns `df` restricted to
# `npm_research_id` followed by the retained taxa.
remove_low_freq_taxa <- function(df, frac_presence) {
    # Identify taxon columns by name rather than by position, so the result
    # does not depend on where npm_research_id sits in the table.
    taxa_cols <- colnames(df)[colnames(df) != "npm_research_id"]
    n_original <- length(taxa_cols)

    # drop = FALSE keeps a data.frame even when there is a single taxon column.
    # The comparison yields a logical matrix whose column means are the
    # fraction of samples in which each taxon is present.
    frac_df <- colMeans(df[, taxa_cols, drop = FALSE] > 0)

    # which() discards taxa whose fraction is NA/NaN instead of propagating NA
    to_keep <- names(frac_df)[which(frac_df > frac_presence)]
    to_keep <- c("npm_research_id", to_keep)
    n_new <- length(to_keep) - 1
    print(str_glue("{n_new} / {n_original} taxa are present in {frac_presence} of samples"))
    return(df %>% select(all_of(to_keep)))
}


# Convert a count table to per-sample relative abundances.
#
# Arguments:
#   df  data.frame with an `npm_research_id` column plus one numeric column
#       per taxon.
#
# Returns a data.frame with one row per sample (row names = npm_research_id)
# and one column per taxon, where each value is the count divided by the
# sample's total. A sample whose total is 0 yields NaN in every column.
otu_to_RA <- function(df) {
    is_taxon <- colnames(df) != "npm_research_id"

    # drop = FALSE keeps the table two-dimensional even with a single taxon
    counts <- as.matrix(df[, is_taxon, drop = FALSE])

    # The row totals are recycled down the columns, so every entry [i, j] is
    # divided by the total of row i.
    RA_df <- as.data.frame(counts / rowSums(counts))
    rownames(RA_df) <- df$npm_research_id

    return(RA_df)
}


# List the metadata column names of interest.
#
# A column qualifies when its name matches any of "kit", "flow_cell",
# "instrument_id" or "site_supplying"; "library_prep_kit" and
# "hiseq_xtm_flow_cell_v2_5_id" are then removed from that list.
#
# Returns the surviving column names in their original order.
get_meta_cols <- function(meta) {
    all_cols <- colnames(meta)
    matches_pattern <- grepl("kit|flow_cell|instrument_id|site_supplying", all_cols)
    wanted <- all_cols[matches_pattern]

    to_exclude <- c("library_prep_kit", "hiseq_xtm_flow_cell_v2_5_id")
    wanted[!(wanted %in% to_exclude)]
}


# Keep only the rows of `df` whose npm_research_id is listed in
# "data/samples_above_10_reads.txt" (path relative to the working directory).
remove_low_read_samples <- function(df) {
    retained_table <- fread("data/samples_above_10_reads.txt")
    to_retain <- retained_table$npm_research_id
    filter(df, npm_research_id %in% to_retain)
}


# Drop samples whose counts sum to zero across all taxa.
#
# Arguments:
#   df  table with an `npm_research_id` column plus one numeric column per
#       taxon.
#
# Prints how many samples were removed and returns the remaining rows as a
# plain data.frame with row names renumbered from 1.
remove_empty_rows <- function(df) {
    df <- as.data.frame(df)
    is_taxon <- colnames(df) != "npm_research_id"

    # Row totals are computed outside the table, so no temporary column can
    # clash with (and then delete) a real column that happens to share its name.
    totals <- rowSums(df[, is_taxon, drop = FALSE])

    # which() also discards rows whose total is NA
    df_filt <- df[which(totals != 0), , drop = FALSE]
    rownames(df_filt) <- NULL

    n_original <- nrow(df)
    n_removed <- n_original - nrow(df_filt)

    print(str_glue("{n_removed}/{n_original} samples removed due to having no reads of interest"))

    return(df_filt)
}

In [22]:
# Load one abundance matrix, filter it, convert it to relative abundances and
# attach the sample metadata.
#
# Arguments:
#   n_subset  subset label used to build the input path
#             (data/temp_files_{n_subset}/07_abundance_matrix/...).
#   rank      rank label used in the input file name (e.g. "S", "G").
#   human     name of the column to drop together with "unclassified".
#
# Returns a data.frame with `npm_research_id`, one relative-abundance column
# per retained taxon, and the metadata columns.
load_parse_RA <- function(n_subset, rank, human) {
    # Load data
    # Only samples listed in samples_above_95_reads.txt are considered
    to_retain <- fread("data/samples_above_95_reads.txt")$npm_research_id
    df <- load_data(str_glue("data/temp_files_{n_subset}/07_abundance_matrix/abundance_matrix.subset_{n_subset}.{rank}.tsv")) %>% 
        filter(npm_research_id %in% to_retain)
    meta <- load_metadata("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv", df)
    
    # Filter data
    df_filt <- retrieve_rows_from_meta(df, meta)
    df_filt2 <- remove_cols(df_filt, c(human, "unclassified"))
    df_filt3 <- remove_low_freq_taxa(df_filt2, frac_presence = 0.05)
    df_filt4 <- remove_empty_rows(df_filt3)
    print(ncol(df_filt4))
    RA_df <- otu_to_RA(df_filt4) %>% rownames_to_column("npm_research_id")
    
    # Join on the sample id only: without an explicit key, any taxon column
    # sharing a name with a metadata column would silently become a join key.
    return(RA_df %>% left_join(meta, by = "npm_research_id"))
}

In [23]:
# Build the relative-abundance tables for the 9999 subset at rank "S"
# (stored as `species`) and rank "G" (stored as `genus`). The third argument
# is the column removed before filtering, alongside "unclassified".
species <- load_parse_RA(9999, "S", "Homo sapiens")
genus <- load_parse_RA(9999, "G", "Homo")

1792 / 5199 taxa are present in 0.05 of samples
0/8728 samples removed due to having no reads of interest
[1] 1793


Joining, by = "npm_research_id"



612 / 1492 taxa are present in 0.05 of samples
0/8728 samples removed due to having no reads of interest
[1] 613


Joining, by = "npm_research_id"



In [24]:
# Write both tables to disk; the sample count is embedded in the file names.
# The species and genus tables are expected to contain the same number of
# samples, so stop before writing if they disagree.
n_samples <- nrow(species)
stopifnot(n_samples == nrow(genus))
fwrite(species, str_glue("data/temp_files_9999/07_abundance_matrix/species_RA.n{n_samples}.csv"))
fwrite(genus, str_glue("data/temp_files_9999/07_abundance_matrix/genus_RA.n{n_samples}.csv"))