This notebook counts the L1 target sites that fall within the exons of each gene in the human genome (hg38), separately for each target-site category, and saves the per-gene counts of each chromosome to its own `.rda` file.

In [3]:
library(BSgenome.Hsapiens.UCSC.hg38)
library(data.table)
# Load the L1 target site annotation of every chromosome (chr1..chr22, chrX,
# chrY). The counting cell below looks each one up with get(paste0(chrom, 'Map')),
# so every .rda file is expected to define an object named '<chrom>Map'.
for (i in names(Hsapiens)[1:24]){
        cat(paste0('Loading target site annotation: ',i,'\n'))
        load(paste0('../sim-develop/data/root_maps/',i,'.rda'))
}
load('../sim-develop/data/exann.rda') # Load exon annotation table (defines 'exann')

# Prefix shared by all outputs: '<outPath>.log' and '<outPath><chrom>.rda'.
outPath <- './gene_exon_counts/out'

# Make sure the output directory exists; otherwise the first write() fails.
dir.create(dirname(outPath), showWarnings = FALSE, recursive = TRUE)

# Start from a clean log. The counting cell appends to '<outPath>.log', so that
# is the file that has to be reset; the legacy log name is removed as well.
# unlink() is portable and silent when a file does not exist.
unlink(c('./count_gene_exon_sites.log', paste0(outPath, '.log')))


Loading target site annotation: chr1
Loading target site annotation: chr2
Loading target site annotation: chr3
Loading target site annotation: chr4
Loading target site annotation: chr5
Loading target site annotation: chr6
Loading target site annotation: chr7
Loading target site annotation: chr8
Loading target site annotation: chr9
Loading target site annotation: chr10
Loading target site annotation: chr11
Loading target site annotation: chr12
Loading target site annotation: chr13
Loading target site annotation: chr14
Loading target site annotation: chr15
Loading target site annotation: chr16
Loading target site annotation: chr17
Loading target site annotation: chr18
Loading target site annotation: chr19
Loading target site annotation: chr20
Loading target site annotation: chr21
Loading target site annotation: chr22
Loading target site annotation: chrX
Loading target site annotation: chrY


In [5]:

# Extract the target-site positions of one site category.
#   insites: table of site positions; column 1 is read for 'sen' and column 2
#            for 'antisen' (names follow the original code).
#   idx:     two-column table of row indices into 'insites'; NA entries are
#            skipped.
# Returns a list with elements 'sen' and 'antisen'.
getCategorySites <- function(insites, idx) {
    list(sen     = insites[idx[!is.na(idx[,1]),1],1],
         antisen = insites[idx[!is.na(idx[,2]),2],2])
}

# Count the sites of one category that lie inside any of the intervals
# [starts[k], ends[k]] (data.table::inrange), summing both strands.
countSitesInExons <- function(sites, starts, ends) {
    sum(inrange(sites$sen,     starts, ends), na.rm = TRUE) +
    sum(inrange(sites$antisen, starts, ends), na.rm = TRUE)
}

logFile <- paste0(outPath,'.log')

# NOTE(review): index 24 restricts this run to chrY only, although all 24
# annotations are loaded above -- widen to [1:24] to process every chromosome.
for (ii in names(Hsapiens)[24]){ # Loop over chromosomes

    ptm <- proc.time()
    line <- paste0('######## Chromosome ',ii,' #########\n')
    write(line,file=logFile,append=TRUE)

    map <- get(paste0(ii,'Map')) # L1 target site annotation for the current chromosome
    insites <- map[[1]]

    # Site positions per category, computed once per chromosome because they do
    # not depend on the gene. map[[2]] is the Closed-Tight category; map[[3]]
    # to map[[5]] are presumably Closed-Loose, Open-Tight and Open-Loose
    # (from the original names icl/iot/iol) -- TODO confirm.
    catSites <- lapply(2:5, function(k) getCategorySites(insites, map[[k]]))

    chrno <- strsplit(ii,"chr")[[1]][2] # Chromosome symbol without the 'chr' prefix
    tmpann_1 <- exann[exann$chrom==chrno,] # exann entries for current chromosome
    # as.character() mirrors the coercion a for-loop applies when iterating a factor
    currGenes <- as.character(unique(tmpann_1$geneSym)) # Genes in current chromosome
    numCurrGenes <- length(currGenes)
    counts <- array(NA,dim=c(numCurrGenes,length(catSites))) # One row per gene, one column per category
    gnames <- array(NA,dim=c(numCurrGenes,1))

    for (l in seq_along(currGenes)) { # Loop over genes of current chromosome
        jj <- currGenes[l]

        line <- paste0('gene ',l,'/',numCurrGenes,' for ',ii,':\t',jj)
        write(line,file=logFile,append=TRUE)

        # Exon intervals of the current gene
        tmpann_2 <- tmpann_1[tmpann_1$geneSym==jj,]

        # Count the sites of every category that fall inside those exons
        for (k in seq_along(catSites)) {
            counts[l,k] <- countSitesInExons(catSites[[k]],tmpann_2$start,tmpann_2$end)
        }

        gnames[l] <- jj
    }
    print(proc.time() - ptm)
    save(counts, gnames, file=paste0(outPath,ii,'.rda'))
}

   user  system elapsed 
 42.783   0.040  32.758 
