In [61]:
# Work from the project data directory; the file names used below are
# relative to it.
setwd("/home/projects/14001280/PROJECTS/blood_microbiome/data")
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets a later cell fail with a less obvious message.
library(tidyverse)
library(ggplot2)
library(data.table)

### Load metadata

In [62]:
# Raw metadata: empty fields are read as NA, four columns are dropped, and
# every remaining NA is then replaced by the literal string "Unknown".
meta <- fread("SG10K_Health_metadata.n10714.16March2021.txt", na.strings = c("", NA)) %>%
    select(-"Supplied And Computed Ethinicity Match", -"Cohorts Joint-called Together", -"JVCF Size", -"Cohort JVCF Data Capacity") %>%
    replace(is.na(.), "Unknown")

# List of r5.3 samples: first column (V1) of a file read without a header row
meta2 <- fread("SG10K_Health_r5.3.n9770.sample_ids.txt", header = FALSE)$V1

# Quick visual check of both objects
head(meta)
head(meta2)



NPM Research ID,Multiplex Pool ID,Supplier ID,GIS Internal Sample ID,Site Supplying Sample,Year Of Birth,Supplied Gender,Self Reported Ethnicity,Extraction Kit,Date Of DNA Extraction,⋯,Supplied And Computed Gender Match,SOP Agreement For Coverage <14/<28 For 15x/30x,Sample Life Cycle,Current,Source Cohort,Original_supplier_ID,duplicate_info,duplicate,duplicate_pair,removal-requested-by-supplier
<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
FAILED00365,Unknown,365,HELIOS_00365_1,HELIOS,1954,M,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,6/6/2019,⋯,Unknown,Unknown,HELIOS REG,N,HELIOS,365,Unknown,Unknown,Unknown,Unknown
FAILED01615,Unknown,1615,HELIOS_01615_2,HELIOS,1956,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,20/1/2019,⋯,Unknown,Unknown,HELIOS REG,N,HELIOS,1615,Unknown,Unknown,Unknown,Unknown
FAILED01688,Unknown,1688,HELIOS_01688_2,HELIOS,1952,F,Others,abGENIX Whole Blood Genomic DNA Extraction Kit,20/1/2019,⋯,Unknown,Unknown,HELIOS REG,N,HELIOS,1688,Unknown,Unknown,Unknown,Unknown
FAILED01750,Unknown,1750,HELIOS_01750_1,HELIOS,1957,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,13/2/2019,⋯,Unknown,Unknown,HELIOS REG,N,HELIOS,1750,Unknown,Unknown,Unknown,Unknown
FAILED01799,Unknown,1799,HELIOS_01799_1,HELIOS,1948,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,13/2/2019,⋯,Unknown,Unknown,HELIOS REG,N,HELIOS,1799,Unknown,Unknown,Unknown,Unknown
FAILED02755,Unknown,2755,HELIOS_02755_1,HELIOS,1951,F,Chinese,abGENIX Whole Blood Genomic DNA Extraction Kit,30/5/2019,⋯,Unknown,Unknown,HELIOS REG,N,HELIOS,2755,Unknown,Unknown,Unknown,Unknown


### Parse column names

In [63]:
# Normalise the column names in three steps: every character that is not a
# letter, digit, "/", "'" or space becomes a space; the result is lower-cased;
# each space is then turned into an underscore.
original_names <- colnames(meta)
n <- gsub(" ", "_", tolower(gsub("[^0-9A-Za-z///' ]", " ", original_names)))
colnames(meta) <- n

# Show the cleaned names
colnames(meta)

### Rename cohorts

In [65]:
# Rename three site labels (old name = new name); every other value is passed
# through unchanged, as character.
cohort_aliases <- c(
  "SEED" = "SERI",
  "GUSTO_Kids" = "GUSTO",
  "SE ASIAN" = "SSMP"
)

meta <- meta %>%
  mutate(
    # Looking up a label that is not in the table yields NA, so coalesce()
    # falls back to the original value for all non-renamed sites.
    site_supplying_sample = coalesce(
      unname(cohort_aliases[as.character(site_supplying_sample)]),
      as.character(site_supplying_sample)
    )
  )

### Recode directory names for SE ASIAN samples


In [66]:
# Pattern on npm_research_id -> multiplex_pool_id label, in priority order.
ssmp_pools <- c(
  "SSM00" = "SSMP_001_to_009",
  "SSM01" = "SSMP_010_to_019",
  "SSM02" = "SSMP_020_to_029",
  "SSM03" = "SSMP_030_to_039",
  "SSM04" = "SSMP_040_to_049",
  "SSM05" = "SSMP_050_to_059",
  "SSM06" = "SSMP_060_to_069",
  "SSM07" = "SSMP_070_to_079",
  "SSM08" = "SSMP_080_to_089",
  "SSM09|SSM100" = "SSMP_090_to_100"
)

# Start from the existing pool IDs (as character) and overwrite the rows whose
# research ID matches a pattern. Patterns are applied last-to-first so that,
# should an ID match several, the earliest pattern in the table wins.
pool_ids <- as.character(meta[["multiplex_pool_id"]])
for (pattern in rev(names(ssmp_pools))) {
  pool_ids[grepl(pattern, meta[["npm_research_id"]])] <- ssmp_pools[[pattern]]
}

meta <- meta %>%
  mutate(multiplex_pool_id = pool_ids)
# test %>%
#     select(site_supplying_sample, multiplex_pool_id, npm_research_id) %>%
#     filter(site_supplying_sample == "SSMP")

### Add MUX name for PRISM top-ups

In [67]:
# Rows whose research ID contains a hyphen get the pool ID "PRISM_topup";
# all other rows keep their current pool ID (as character).
meta <- meta %>%
  mutate(
    multiplex_pool_id = if_else(
      grepl("\\-", npm_research_id),
      "PRISM_topup",
      as.character(multiplex_pool_id)
    )
  )

### Filter by r5.3 samples and remove duplicate samples

In [68]:
# Keep only the samples whose ID appears in meta2, blank out any NA so the
# comparison on duplicate_info cannot evaluate to NA, then drop the rows
# flagged as true duplicates.
meta_filt <- meta %>%
  filter(npm_research_id %in% meta2) %>% # Retain only r5.3 samples
  replace(is.na(.), "") %>%
  filter(duplicate_info != "TRUE.DUPLICATE") # Remove 64 duplicates

# Write the parsed table; any NA still present is written as "Unknown".
fwrite(meta_filt, "SG10K_Health_metadata.n10714.16March2021.parsed.csv", row.names = FALSE, sep = ",", na = "Unknown")

In [69]:
# List the distinct instrument IDs among the retained samples
unique(meta_filt[["instrument_id"]])

### Retrieve up to `n_subset` random samples per cohort

In [70]:
# Maximum number of samples to keep per cohort
n_subset <- 9999
cohorts <- unique(meta_filt$site_supplying_sample)

# For each cohort, take all of its sample IDs when there are at most n_subset,
# otherwise draw n_subset of them at random without replacement. The per-cohort
# results are built with lapply() and flattened once, instead of growing a
# vector inside a loop. Cohorts are visited in the same order as before, so
# the sequence of sample() calls is unchanged.
# NOTE(review): no seed is set in this cell, so the draw is not reproducible
# unless a seed is set elsewhere.
subset_vec <- unlist(
  lapply(cohorts, function(cohort) {
    ids <- meta_filt$npm_research_id[meta_filt$site_supplying_sample == cohort]
    if (length(ids) > n_subset) {
      sample(ids, n_subset)
    } else {
      ids
    }
  }),
  use.names = FALSE
)

# Keep the selected samples and only the three columns written to the subset list
meta_sub <- meta_filt %>%
  filter(npm_research_id %in% subset_vec) %>%
  select(site_supplying_sample, multiplex_pool_id, npm_research_id)

In [71]:
# Write the subset as a header-less CSV whose file name carries n_subset.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
fwrite(meta_sub, paste0("subset_list_", n_subset, ".csv"), row.names = FALSE, col.names = FALSE, sep = ",")