In [2]:
# ---- Setup: packages, L1 target-site maps, gene ranges, output location ----
library(BSgenome.Hsapiens.UCSC.hg38)
library(data.table)
library(EnsDb.Hsapiens.v86)

# Load the L1 target site annotation file of each of the first 24 sequences of
# Hsapiens. A plain top-level for loop is used on purpose: load() restores the
# objects into the calling environment, so they must be loaded from the top
# level to be visible to the rest of the script.
for (i in names(Hsapiens)[1:24]) {
    load(paste0('../sim-develop/data/root_maps/', i, '.rda'))
}

# Gene ranges from the Ensembl v86 annotation, flattened into a data.table with
# one row per gene record: sequence name, start, end and gene symbol.
# (In the printed head() the chrom values carry no "chr" prefix, e.g. "1".)
gann <- genes(EnsDb.Hsapiens.v86)
gann <- data.table(
    chrom   = as.vector(seqnames(gann)),
    start   = start(gann),
    end     = end(gann),
    geneSym = gann$gene_name
)
head(gann)

# Remove the log of a previous run. unlink() is used instead of shelling out to
# `rm`: it is portable and stays silent when the file does not exist.
# NOTE(review): the progress log appended to later in this script is
# paste0(outPath, '.log'), which is a different file and is NOT cleared here --
# confirm whether that one should be reset as well.
unlink("./count_all_sites_in_genes.log")

# Prefix for every output file: '<outPath>.log' and '<outPath><chrom>.rda'.
outPath <- '~/jackgl/gene_all_counts/out'


Loading required package: BSgenome
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: S4

chrom,start,end,geneSym
1,11869,14409,DDX11L1
1,14404,29570,WASH7P
1,17369,17436,MIR6859-1
1,29554,31109,MIR1302-2
1,34554,36081,FAM138A
1,52473,53312,OR4G4P


In [None]:

# ---- Count L1 target sites inside gene ranges, one output file per chromosome ----
#
# For every chromosome, and for every gene symbol annotated on it, count how
# many target sites of each of the four site categories stored in the
# chromosome's map fall within the gene's start-end range(s). The sense and
# antisense sites are counted separately and summed.
#
# Output per chromosome: '<outPath><chrom>.rda' holding
#   counts - integer matrix, one row per gene, one column per category in the
#            order map[[2]], map[[3]], map[[4]], map[[5]]
#   gnames - one-column character array with the gene symbol of each row
# Progress is appended to '<outPath>.log'.

# Pull the sense (column 1) and antisense (column 2) site positions selected by
# one category index table, dropping the NA index entries.
extract_category_sites <- function(insites, idx) {
    list(
        sense     = insites[idx[!is.na(idx[, 1]), 1], 1],
        antisense = insites[idx[!is.na(idx[, 2]), 2], 2]
    )
}

# Number of sense plus antisense sites lying inside any of the given ranges.
# sum(..., na.rm = TRUE) counts the TRUE entries, like length(which(...)).
count_sites_in_ranges <- function(sites, starts, ends) {
    sum(inrange(sites$sense, starts, ends), na.rm = TRUE) +
        sum(inrange(sites$antisense, starts, ends), na.rm = TRUE)
}

logFile <- paste0(outPath, '.log') # Same log file for the whole run

for (ii in names(Hsapiens)[1:24]) { # Loop over chromosomes

    ptm <- proc.time()
    line <- paste0('######## Chromosome ',ii,' #########\n')
    write(line, file = logFile, append = TRUE)

    map <- get(paste0(ii,'Map')) # Get the L1 target site annotation for the current chromosome
    insites <- map[[1]] # Site positions (column 1 is used for sense, column 2 for antisense)
    ict <- map[[2]]     # Index table of the Closed-Tight category
    icl <- map[[3]]     # Index tables of the remaining three categories
    iot <- map[[4]]     # NOTE(review): presumably closed-loose, open-tight and
    iol <- map[[5]]     # open-loose, judging by the names -- confirm

    # The site positions of each category do not depend on the gene, so they are
    # extracted once per chromosome instead of once per gene.
    categorySites <- lapply(
        list(ict, icl, iot, iol),
        function(idx) extract_category_sites(insites, idx)
    )

    chrno <- strsplit(ii,"chr")[[1]][2] # Get the current chromosome symbol
    tmpann_1 <- gann[gann$chrom==chrno,] # Extract gann entries for current chromosome
    currGenes <- unique(tmpann_1$geneSym) # Get list of genes in current chromosome
    numCurrGenes <- length(currGenes) # Get length of the list
    counts <- array(NA, dim = c(numCurrGenes, 4)) # Allocate matrix for counts
    gnames <- array(currGenes, dim = c(numCurrGenes, 1)) # Row labels of counts

    for (l in seq_along(currGenes)) { # Loop over genes of current chromosome

        jj <- currGenes[l]
        line <- paste0('gene ',l,'/',numCurrGenes,' for ',ii,':\t',jj)
        write(line, file = logFile, append = TRUE)

        # Count sites within the start-end range(s) of the current gene. These
        # are whole-gene spans (from genes()), not exons; a symbol annotated
        # more than once contributes all of its ranges.
        tmpann_2 <- tmpann_1[tmpann_1$geneSym==jj,] # Extract gann entries for current gene

        for (k in seq_along(categorySites)) {
            counts[l, k] <- count_sites_in_ranges(
                categorySites[[k]], tmpann_2$start, tmpann_2$end
            )
        }
    }
    print(proc.time() - ptm)
    save(counts, gnames, file=paste0(outPath,ii,'.rda'))
}