### input and libs

In [73]:
source("~/github/common_libs/mylibs.R")
source("~/github/common_libs/mylibs.R")
require(BSgenome.Mmusculus.UCSC.mm10)
require(SummarizedExperiment)
require(GenomicRanges)


countInsertions <- function(query, fragments, by = "RG") {
    # Count single-base insertion sites that fall inside `query` ranges.
    #
    # query:     ranges to count against (used as the query of findOverlaps).
    # fragments: path to a 6-column tag file (chr, start, end, RG, score, strand).
    # by:        metadata column used to group insertions into matrix columns.
    #
    # Returns a list with:
    #   counts - sparse matrix, one row per query range, one column per `by` value
    #   frip   - per-`by` fraction of insertions that overlap `query`
    #   total  - per-`by` total number of insertions
    # Only `by` values with at least one overlapping insertion are reported.
    message("reading tags")
    tic()
    tag_columns <- c("chr", "start", "end", "RG", "score", "strand")
    tags <- fread(fragments, col.names = tag_columns)
    tags <- dplyr::select(tags, -score)
    # Shift start by one (presumably 0-based BED start to 1-based -- confirm).
    tags <- mutate(tags, start = start + 1)
    read_elapsed <- toc()

    message("overlaping")
    tag_ranges <- GRanges(seqnames = tags$chr, ranges = IRanges(start = tags$start, 
        end = tags$end), strand = tags$strand, RG = tags$RG)
    # Keep only the strand-aware first base of every tag.
    cut_sites <- resize(tag_ranges, width = 1, fix = "start")
    hits <- DataFrame(findOverlaps(query, cut_sites, ignore.strand = TRUE, maxgap = -1L, 
        minoverlap = 0L, type = "any"))
    # Column 2 holds the subject (insertion) index of every hit.
    hits$name <- mcols(cut_sites)[hits[, 2], by]
    group_names <- unique(hits$name)
    group_index <- match(hits$name, group_names)

    # Overlap statistics per group
    in_peaks <- table(hits$name)
    all_tags <- table(mcols(cut_sites)[, by])
    all_tags <- all_tags[names(in_peaks)]
    fraction_in_peaks <- in_peaks/all_tags

    # Column 1 holds the query (peak) index; duplicate (i, j) pairs are summed.
    counts <- Matrix::sparseMatrix(i = hits[, 1], j = group_index, x = rep(1, 
        nrow(hits)), dims = c(length(query), length(group_names)))
    colnames(counts) <- group_names

    # Reorder the statistics to follow the matrix column order.
    result <- list(counts = counts, frip = fraction_in_peaks[colnames(counts)], 
        total = all_tags[colnames(counts)])
    toc() - read_elapsed
    result
}


# Build a fixed-width, non-overlapping union peak set from summit files.
#
# df:        data.frame with columns `samples`, `groups` and `summits`
#            (`summits` holds the path of one 5-column summit BED per row).
# BSgenome:  genome object whose seqlengths() define chromosome bounds (required).
# extend:    bases added on each side of a summit; peak width is 2 * extend + 1.
# blacklist: NULL, a BED file path, or a GenomicRanges object of regions to drop.
# nSummits:  keep at most this many top-scoring peaks per summit file
#            (NULL keeps all).
#
# Overlaps are resolved three times, each time keeping the higher-ranked peak:
# within a file (by `score`), within a group and across groups (both by
# `scoreQuantile`, the within-file rank of `score` divided by the peak count).
#
# Returns a sorted GRanges carrying `score` and `scoreQuantile` metadata.
extendedPeakSet <- function(df, BSgenome = NULL, extend = 250, blacklist = NULL, 
    nSummits = 1e+05) {
    # Helper Functions
    # Read a summit file as GRanges, keeping `score` and dropping `name`.
    readSummits <- function(file) {
        summits <- fread(file, col.names = c("chr", "start", "end", "name", "score")) %>% 
            dplyr::select(1, 2, 3, 5)
        GenomicRanges::makeGRangesFromDataFrame(df = summits, keep.extra.columns = TRUE, 
            starts.in.df.are.0based = TRUE)
    }
    # Iteratively pick the best range (ordered by `by`) of every overlap cluster
    # until no unselected range is free of overlap with a selected one.
    nonOverlappingGRanges <- function(gr, by = "score", decreasing = TRUE, verbose = FALSE) {
        stopifnot(by %in% colnames(mcols(gr)))
        # Nothing to resolve; the loop below would otherwise never define its result.
        if (length(gr) == 0) {
            return(sort(sortSeqlevels(gr)))
        }
        # Keep one range per cluster of overlapping ranges: the first after ordering by `by`.
        clusterGRanges <- function(gr, by = "score", decreasing = TRUE) {
            gr <- sort(sortSeqlevels(gr))
            merged <- GenomicRanges::reduce(gr, min.gapwidth = 0L, ignore.strand = TRUE)
            hits <- findOverlaps(gr, merged)
            mcols(gr)$cluster <- subjectHits(hits)
            gr <- gr[order(mcols(gr)[, by], decreasing = decreasing), ]
            gr <- gr[!duplicated(mcols(gr)$cluster), ]
            gr <- sort(sortSeqlevels(gr))
            mcols(gr)$cluster <- NULL
            gr
        }
        if (verbose) {
            message("Converging", appendLF = FALSE)
        }
        i <- 0
        gr_converge <- gr
        while (length(gr_converge) > 0) {
            if (verbose) {
                message(".", appendLF = FALSE)
            }
            i <- i + 1
            gr_selected <- clusterGRanges(gr = gr_converge, by = by, decreasing = decreasing)
            # Drop every remaining range that overlaps a selected one.
            gr_converge <- subsetByOverlaps(gr_converge, gr_selected, invert = TRUE)
            if (i == 1) {
                gr_all <- gr_selected
            } else {
                gr_all <- c(gr_all, gr_selected)
            }
        }
        if (verbose) {
            message("\nSelected ", length(gr_all), " from ", length(gr))
        }
        sort(sortSeqlevels(gr_all))
    }
    # Check-------
    stopifnot(extend > 0)
    stopifnot("samples" %in% colnames(df))
    stopifnot("groups" %in% colnames(df))
    stopifnot("summits" %in% colnames(df))
    stopifnot(!is.null(BSgenome))
    # Look the column up by name so the check does not depend on column order.
    stopifnot(all(file.exists(as.character(df[["summits"]]))))
    #------------
    # Deal with blacklist
    if (is.null(blacklist)) {
        blacklist <- GRanges()
    } else if (is.character(blacklist)) {
        blacklist <- rtracklayer::import.bed(blacklist)
    }
    stopifnot(inherits(blacklist, "GenomicRanges"))
    #------------
    # Time to do stuff
    # One range per standard chromosome, spanning its full length.
    chromSizes <- GRanges(names(seqlengths(BSgenome)), IRanges(1, seqlengths(BSgenome)))
    chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse")
    # Extend, filter, de-overlap and rank the summits of a single file.
    processSummitFile <- function(file) {
        peaks <- readSummits(file)
        peaks <- resize(peaks, width = 2 * extend + 1, fix = "center")
        # Discard peaks that run off a chromosome or touch the blacklist.
        peaks <- subsetByOverlaps(peaks, chromSizes, type = "within")
        peaks <- subsetByOverlaps(peaks, blacklist, invert = TRUE)
        peaks <- nonOverlappingGRanges(peaks, by = "score", decreasing = TRUE)
        peaks <- peaks[order(peaks$score, decreasing = TRUE)]
        if (!is.null(nSummits)) {
            peaks <- head(peaks, nSummits)
        }
        # Within-file rank scaled by the peak count, so files are comparable.
        mcols(peaks)$scoreQuantile <- trunc(rank(mcols(peaks)$score))/length(mcols(peaks)$score)
        peaks
    }
    groups <- unique(df$groups)
    # GRangesList() replaces the defunct GenomicRangesList() constructor.
    groupGRList <- GenomicRanges::GRangesList(lapply(seq_along(groups), function(i) {
        df_group <- df[which(df$groups == groups[i]), ]
        grList <- GenomicRanges::GRangesList(lapply(paste0(df_group$summits), 
            processSummitFile))
        # Non Overlapping
        grNonOverlapping <- nonOverlappingGRanges(unlist(grList), by = "scoreQuantile", 
            decreasing = TRUE)
        # Free Up Memory
        remove(grList)
        gc()
        grNonOverlapping
    }))
    grFinal <- nonOverlappingGRanges(unlist(groupGRList), by = "scoreQuantile", decreasing = TRUE)
    sort(sortSeqlevels(grFinal))
}

# Split one library's tagAlign reads by cluster and append them to per-cluster files.
#
# lib:       library ID; the input is <data_dir><lib>/<lib>.filt.nodup.tn5.tagAlign.gz
# clust_res: table with an `index` column (matched against tagAlign column V4)
#            and a `leiden` column giving each cell's cluster.
# data_dir:  directory holding one sub-directory per library (with trailing slash).
# save_dir:  output directory (with trailing slash); created when missing.
# verbose:   emit one message per cluster file written.
#
# Rows are APPENDED to <save_dir>c<cluster>.tagAlign, so several libraries can
# share the same cluster files; running the same library twice duplicates its rows.
# Returns, invisibly, a list named by cluster (one element per file written).
split_write_per_lib <- function(lib = "JYH_854_1_2", clust_res, data_dir = "~/data/outputs/snATACj/", 
    save_dir = "~/scratch/outputs_snATACj/immune_allCons/rd1_clust_bed/tags/", verbose = FALSE) {
    ### split reads
    split_reads <- function() {
        ### split reads for lib's tagalign
        tag_file <- paste0(data_dir, lib, "/", lib, ".filt.nodup.tn5.tagAlign.gz")
        if (!file.exists(tag_file)) {
            stop("tagAlign file not found: ", tag_file, call. = FALSE)
        }
        tic()
        message("read tagalign file...")
        # Pass the shell command through `cmd =` and quote the path; the tilde is
        # expanded first because the shell does not expand it inside quotes.
        frag_lib <- fread(cmd = paste("zcat", shQuote(path.expand(tag_file))), 
            header = FALSE)
        t1 <- toc()
        message("splitting reads")
        # Keep only reads whose V4 value is a clustered cell index.
        frag_lib <- frag_lib %>% inner_join(clust_res, by = c(V4 = "index")) %>% 
            setDT
        frag_lib_split <- split(frag_lib, by = c("leiden"), keep.by = FALSE)
        toc() - t1
        frag_lib_split
    }
    ### save bed files and peak calling
    save_tag <- function(frag_lib_split) {
        if (!dir.exists(save_dir)) {
            dir.create(save_dir, recursive = TRUE)
        }
        clusters <- names(frag_lib_split)
        invisible(lapply(stats::setNames(nm = clusters), function(i) {
            if (verbose) {
                message("write ", lib, " clust ", i)
            }
            # Write only the original tagAlign columns (V1, V2, ...), not the joined ones.
            fwrite(frag_lib_split[[i]] %>% dplyr::select(starts_with("V")), file = paste0(save_dir, 
                "c", i, ".tagAlign"), append = TRUE, sep = "\t", col.names = FALSE)
        }))
    }
    ## main
    save_tag(split_reads())
}

### split reads

In [4]:
# Load the per-cell clustering table, dropping every column whose name starts with "umap".
clust_res <- fread("./cluster_res.csv") %>%
    dplyr::select(-starts_with("umap"))
# Library ID = cell index with its trailing "_<ATCG letters>" suffix removed.
clust_res$lib <- sub("_[ATCG]+$", "", clust_res$index)

In [None]:
# Split reads for the 7th to 9th distinct libraries only.
for (l in unique(clust_res$lib)[7:9]) {
    message("running ", l)
    split_write_per_lib(clust_res = clust_res, lib = l)
}

running XH_206_1_2
read tagalign file...
"Previous fread() session was not cleaned up properly. Cleaned up ok at the beginning of this fread() call."

elapsed time is 177.008000 seconds 


splitting reads


### call peaks

In [5]:
dir_peaks <- "~/scratch/outputs_snATACj/immune_allCons/rd1_clust_bed/peaks/"
# Create the peak output directory (the original tested `save_dir`, which is
# not defined at top level).
if (!dir.exists(dir_peaks)) dir.create(dir_peaks, recursive = TRUE)

# MACS2 settings: `method` picks the -p or -q cutoff flag; `shift` and `extsize`
# are only passed when both are non-NULL.
method <- "q"
cutoff <- 0.05
shift <- -75
extsize <- 150
genome_size <- 1.87e+09
n_clust <- clust_res %>% pull(leiden) %>% unique %>% length
# Cluster files are addressed as c0 ... c<n_clust - 1>; this assumes the leiden
# labels are exactly 0..n_clust-1 -- TODO confirm.
# NOTE(review): split_write_per_lib() writes to ".../tags/" by default, while the
# tagAlign files are read from `dir_peaks` here -- confirm they are placed there.
mclapply(seq_len(n_clust) - 1, function(j) {
    message(sprintf("%s of %s", j, n_clust))
    cluster_tagj <- paste0(dir_peaks, "c", j, ".tagAlign")
    cmdPeaks <- sprintf("macs2 callpeak -g %s --name %s --treatment %s --outdir %s --format BED --nomodel --call-summits --nolambda --keep-dup all", 
        genome_size, paste0("c", j), cluster_tagj, dir_peaks)
    if (!is.null(shift) && !is.null(extsize)) {
        cmdPeaks <- sprintf("%s --shift %s --extsize %s", cmdPeaks, shift, extsize)
    }
    if (tolower(method) == "p") {
        cmdPeaks <- sprintf("%s -p %s", cmdPeaks, cutoff)
    } else {
        cmdPeaks <- sprintf("%s -q %s", cmdPeaks, cutoff)
    }
    message("Running Macs2...")
    message(cmdPeaks)
    system(cmdPeaks, intern = TRUE)
}, mc.cores = 4)

### Make Non-Overlapping Peak Set

In [None]:
# One row per MACS2 summit file; all files are treated as a single group.
summit_pattern <- "_summits\\.bed$"
summit_files <- list.files(dir_peaks, pattern = summit_pattern, full.names = TRUE)
df <- data.frame(samples = gsub(summit_pattern, "", basename(summit_files)), 
    groups = "scATAC", summits = summit_files)


# Pass the BSgenome object loaded at the top of the file; the bare name `genome`
# is not assigned anywhere in the visible code.
unionPeaks <- extendedPeakSet(df = df, BSgenome = BSgenome.Mmusculus.UCSC.mm10, 
    extend = 250, blacklist = "~/data/GENOME/mm10/mm10.blacklist.bed", nSummits = 2e+05)
# Restrict to chr1-chr19 and chrX, then drop the unused seqlevels.
keep_chr <- paste0("chr", c(1:19, "X"))
unionPeaks <- unionPeaks[seqnames(unionPeaks) %in% keep_chr]
unionPeaks <- keepSeqlevels(unionPeaks, keep_chr)

# Create Counts list
fragmentFiles <- list.files(dir_peaks, pattern = "\\.tagAlign$", full.names = TRUE)
countsPeaksList <- lapply(seq_along(fragmentFiles), function(i) {
    message(sprintf("%s of %s", i, length(fragmentFiles)))
    gc()
    countInsertions(unionPeaks, fragmentFiles[i], by = "RG")
})

# CountsMatrix
# Each list element contributes its own block of columns, so the three
# results below share the same per-file ordering.
mat <- Reduce(cbind, lapply(countsPeaksList, function(x) x[["counts"]]))
frip <- unlist(lapply(countsPeaksList, function(x) x[["frip"]]))
total <- unlist(lapply(countsPeaksList, function(x) x[["total"]]))
stopifnot(ncol(mat) == length(frip), ncol(mat) == length(total))

se <- SummarizedExperiment(assays = SimpleList(counts = mat), rowRanges = unionPeaks)
rownames(se) <- paste(seqnames(se), start(se), end(se), sep = "_")
colData(se)$FRIP <- frip
# `total` counts tags; halved here (presumably two tags per fragment -- confirm).
colData(se)$uniqueFrags <- total/2
saveRDS(se, "~/scratch/outputs_snATACj/immune_allCons/rd1_clust_bed/peak_cell_se.rds")