In [1]:
# run-DSS.R
# Runs Dispersion Shrinkage Estimation method on methylation data (one chromosome at a time)
library(data.table)
library(tidyverse)
library(argparse)
library(DSS)

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m    

Filter first!!!

In [2]:
# Command-line options for the run.
# parse_known_args() returns a list; its first element holds the parsed
# options, which is why every lookup below goes through args[[1]].
parser <- ArgumentParser()
parser$add_argument("--chr", default = "chr6", help = 'Chromosome to run DSS on')
parser$add_argument("--covariates", default = "NK,CD4T")
parser$add_argument("--num_pcs", default = "2", help = 'number of principal components (precomputed)')
args <- parser$parse_known_args()

# Output directory for this chromosome (called regions, tables, ...).
# The argument is spelled out in full (showWarnings) instead of relying on
# partial argument matching; an already existing directory is not reported.
odir <- paste0("result-", args[[1]]$chr)
dir.create(odir, showWarnings = FALSE)

In [3]:
# Precomputed principal components for the selected chromosome.
# col_types = cols() lets readr guess the column types without printing the
# column specification.
pc.df <- read_csv(
    file = file.path("../../data/prin-comps-array-samples/", paste0(args[[1]]$chr, ".csv")),
    col_types = cols()
)

In [4]:
# Sample-level phenotype table.
ss.df <- read_csv(
    file = "../../data/meta/phenos-cleaned.csv",
    col_types = cols()
)

# Per-chromosome methylation table; later cells read its pos, sample, chr,
# methylated and coverage columns.
DT <- fread(file.path("../../data/cov-meth/", paste0(args[[1]]$chr, ".tsv")))

In [5]:
make_tall_matrix <- function(DT, value.var) {
    # Reshape DT into one row per position and one column per sample.
    #
    # DT:        table with at least the columns pos, sample and the column
    #            named by value.var.
    # value.var: name of the column whose values fill the cells.
    #
    # Returns the dcast() result: a "pos" column followed by one column for
    # every sample.
    dcast(DT, pos ~ sample, value.var = value.var)
}


drop_null_positions <- function(X, p = 0.5) {
    # Remove the rows (positions) of X that contain too many missing values.
    #
    # X: table as produced by `make_tall_matrix` (one position column plus
    #    the sample columns).
    # p: tolerated fraction of missing values per row, between 0 and 1.
    #
    # Returns X restricted to the rows whose NA count does not exceed
    # p * (number of columns - 1).
    stopifnot(p >= 0, p <= 1)

    # The threshold is left as a float; one column is discounted for the
    # position column.
    max_missing <- (ncol(X) - 1) * p
    keep_row <- rowSums(is.na(X)) <= max_missing

    X[keep_row, ]
}




In [None]:
# Position-by-sample tables of methylated counts and of coverage; positions
# with more than the default share of missing values are dropped from each.
M <- drop_null_positions(make_tall_matrix(DT, "methylated"))
Cov <- drop_null_positions(make_tall_matrix(DT, "coverage"))

In [None]:
# Samples present in all three inputs: the methylation table, the phenotype
# table and the principal-component table.
valid_samples <- intersect(intersect(colnames(M), ss.df$sample), pc.df$sample)

# Design table: phenotypes joined with the principal components, restricted
# to the shared samples.
filt.df <- ss.df %>%
    inner_join(pc.df, by = "sample") %>%
    dplyr::filter(sample %in% valid_samples)

# pack years smoking needs to be not NA; missing values are set to 0.
filt.df$pack_years[is.na(filt.df$pack_years)] <- 0

# Remove every sample column that is not shared. "pos" is excluded by name
# rather than by its position in the vector, so the position column is never
# dropped regardless of column order.
drop_cols <- setdiff(colnames(M), c("pos", valid_samples))
if (length(drop_cols) > 0) {
    # := NULL deletes the columns by reference in both tables.
    M[, (drop_cols) := NULL]
    Cov[, (drop_cols) := NULL]
}

length(valid_samples)

In [8]:
# Sample identifiers (as character, to match the column names of M / Cov)
# for the two cohorts. The dplyr verbs are namespace-qualified, consistent
# with the dplyr::filter() call used when filt.df is built, so a filter()
# exported by another attached package cannot be picked up instead.
load.samples <- filt.df %>%
    dplyr::filter(cohort == "AD") %>%
    dplyr::pull(sample) %>%
    as.character()
ctrl.samples <- filt.df %>%
    dplyr::filter(cohort == "CONTROL") %>%
    dplyr::pull(sample) %>%
    as.character()

In [9]:
get_na_mask <- function(X, col.names) {
    # Logical mask of the missing cells that belong to a group of columns.
    #
    # X:         table or matrix with column names.
    # col.names: names of the columns forming the group.
    #
    # Returns a logical matrix with the dimensions of X. A cell is TRUE only
    # when it is NA in X AND its column is listed in col.names; every other
    # cell is FALSE. impute_by_group() computes its means over the col.names
    # columns, so this restricts the fill to that same group's gaps and never
    # touches other columns (e.g. "pos" or another group's samples).
    mask <- is.na(X)

    in_group <- colnames(mask) %in% col.names
    if (length(in_group) != ncol(mask)) {
        # No column names available: no column can belong to the group.
        in_group <- rep(FALSE, ncol(mask))
    }

    # Vectorised over columns; also safe when X has zero columns.
    mask[, !in_group] <- FALSE
    mask
}

impute_by_group <- function(X, col.names, round.mean = TRUE) {
    # Fill missing cells of X, in place, with a per-position group mean.
    #
    # X:          data.table with one row per position; modified by
    #             reference through data.table::set().
    # col.names:  names of the columns (samples) forming the group; the
    #             imputed value for a row is the mean of these columns.
    # round.mean: if TRUE the mean is rounded to the nearest whole number
    #             before being written.
    #
    # The cells that get filled are exactly those flagged TRUE by
    # get_na_mask(X, col.names). A row whose group columns are all NA has a
    # NaN mean, so its flagged cells remain missing.
    #
    # Returns X invisibly; callers may keep ignoring the return value.
    position.means <- rowMeans(X[, col.names, with = FALSE], na.rm = TRUE)

    if (round.mean) {
        position.means <- round(position.means)
    }
    mask <- get_na_mask(X, col.names)

    # Imputation step: one set() call per column that has flagged cells,
    # instead of one call per cell. Iterating over which(...) also makes
    # empty tables a no-op (1:nrow(X) would iterate over c(1, 0)).
    for (j in which(colSums(mask) > 0)) {
        rows <- which(mask[, j])
        set(X, i = rows, j = j, value = position.means[rows])
    }

    invisible(X)
}

In [10]:
# Impute M using the AD samples (load.samples) as the reference group.
# impute_by_group() writes into M by reference through data.table::set(),
# so the result is not assigned.
impute_by_group(M, load.samples)


In [11]:
# Same in-place imputation for M with the CONTROL samples, then for the
# coverage table with each cohort. The call order is kept as-is: every call
# reads the table in the state left by the previous one.
impute_by_group(M, ctrl.samples)
impute_by_group(Cov, load.samples)
impute_by_group(Cov, ctrl.samples)

In [12]:
# Preview the first rows of the methylated-count table after imputation.
head(M)

pos,100,101,104,105,106,107,108,109,110,⋯,178,179,180,181,182,183,184,185,186,188
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
10526255,4,4,10,4,4,4,0,4,0,⋯,4,1,4,4,6,4,4,4,4,7
10526440,5,5,12,5,5,3,1,3,5,⋯,8,0,5,3,13,3,8,5,5,4
10526442,5,5,10,5,5,4,0,4,5,⋯,7,8,5,4,14,4,5,5,5,7
10526456,3,7,12,7,7,5,1,5,3,⋯,12,12,7,5,19,5,9,7,7,8
10526459,2,4,11,4,4,3,1,3,5,⋯,7,10,4,3,13,3,5,4,4,3
10526462,6,6,12,6,6,5,1,5,5,⋯,11,10,6,5,18,5,8,6,6,6


In [13]:
# With p = 0 no missing value is tolerated: any position that still holds an
# NA after imputation is removed.
M <- drop_null_positions(M, p = 0)
Cov <- drop_null_positions(Cov, p = 0)

In [14]:
# Order needs to be correct!!!
valid_pos <- intersect(M$pos, Cov$pos)
M <- M[pos %in% valid_pos , c("pos", filt.df$sample), with = FALSE]
Cov <- Cov[pos %in% valid_pos, c("pos", filt.df$sample), with = FALSE]

In [15]:
# create bs seq object, needs chromosome identiifer, methylated reads, and unmethylated reads
bs <- BSseq(chr = rep(DT$chr[1], nrow(M)), pos = M$pos,
            M = as.matrix(M[ , -c("pos"), with=FALSE]), 
            Cov = as.matrix(Cov[, -c("pos"), with=FALSE]), 
            sampleNames = names(M)[-1])


In [16]:
# Print this check in the future!!!!
all( filt.df$sample == colnames(bs) )

In [None]:
#TODO: formula from input flags
dml.fit <- DMLfit.multiFactor(bs, design = filt.df, smoothing = TRUE, smoothing.span = 200, 
            formula = ~cohort + PC1 + PC2 + Gran + CD8T + CD4T + NK + Bcell + bmi + age + sex)

In [None]:
# Show the column names of the fitted model's X so the coefficient index
# used below can be checked by eye.
colnames(dml.fit$X)
# coef = 2 selects the second coefficient -- presumably the cohort term,
# since cohort is the first term of the model formula; verify against the
# names printed above.
test.cohort <- DMLtest.multiFactor(dml.fit, coef = 2)

In [None]:
# Count how many entries of the stat column are NA (TRUE) versus not (FALSE).
table(is.na(test.cohort$stat))

In [None]:
# Per-position, per-sample methylation proportion: methylated counts divided
# element-wise by coverage.
methylation <- M / Cov
# The division is applied to the pos column as well, so the real positions
# are written back before saving.
methylation$pos <- M$pos
fwrite(methylation, file.path(odir, "methylation.csv"))

In [None]:
# Persist the test results (as an RData file) together with the count and
# coverage tables that went into the model.
save(test.cohort, file = file.path(odir, "test-values.RData"))
fwrite(x = M, file = file.path(odir, "M.csv"))
fwrite(x = Cov, file = file.path(odir, "Cov.csv"))


In [None]:
# Call differentially methylated regions from the cohort test results.
# NOTE(review): p.threshold and minCG are presumably the per-site p-value
# cutoff and the minimum number of CpG sites per region -- confirm against
# the DSS callDMR documentation.
dmrs <- callDMR(test.cohort, p.threshold=0.01, minCG = 5 )

In [None]:
# Display the called regions.
dmrs