This notebook will count the frequency of L1 target sites in exonic regions of each gene, and save a separate file for each chromosome.

In [2]:
library(data.table)
library(IRanges)
# Load the per-chromosome L1 target site annotation files. Each .rda is
# expected to define an object named `chr<i>Map` in the workspace
# (NOTE(review): inferred from how the maps are looked up later; confirm
# against the files that produce ./data/l1_site_maps/).
for (i in c(1:22,'X','Y')){
        load(paste0('./data/l1_site_maps/chr',i,'.rda'))
}

# Load the gene annotation; expected to define `exons`.
load('./data/exons.rda')

# Create the output directory. showWarnings = FALSE keeps re-runs quiet when
# the directory already exists (dir.create() otherwise warns every time).
dir.create('./data/gene_counts_exon/', showWarnings = FALSE)

# Common prefix for every output file: '<outPath>.log' and '<outPath><chrom>.rda'.
outPath <- './data/gene_counts_exon/out'

“'./data/gene_counts_exon' already exists”


In [3]:
# Count L1 target sites that fall inside the exons of each gene, one
# chromosome at a time.
#
# Reads from the workspace: `outPath`, `exons` (columns chrom, gene_id, start,
# end) and one `chr<ii>Map` object per chromosome.
# Writes: progress lines appended to '<outPath>.log', and one
# '<outPath><ii>.rda' per chromosome holding
#   counts - genes x 4 matrix, one column per site category
#   gnames - gene identifiers, in the same row order as `counts`
#   lens   - total exonic length per gene, overlapping exons merged

logFile <- paste0(outPath,'.log') # Same log file for every chromosome and gene

# Number of positions in `sitePos` lying inside at least one [start, end]
# interval. inrange() returns one logical per position, so a site covered by
# several overlapping exons is still counted once.
countSitesInRanges <- function(sitePos, starts, ends) {
    sum(inrange(sitePos, starts, ends), na.rm = TRUE)
}

for (ii in c(1:22,'X','Y')){ # Loop over chromosomes

    ptm <- proc.time()
    line <- paste0('######## Chromosome ',ii,' #########\n')
    write(line,file=logFile,append=TRUE)

    map<-get(paste0('chr',ii,'Map')) # Get the L1 target site annotation for the current chromosome
    insites<-map[[1]] # Site positions; column 1 / column 2 are used for sense / antisense below
    ict<-    map[[2]] # Row indices into insites for the Closed-Tight category
    icl<-    map[[3]] # NOTE(review): presumably Closed-Loose - confirm
    iot<-    map[[4]] # NOTE(review): presumably Open-Tight - confirm
    iol<-    map[[5]] # NOTE(review): presumably Open-Loose - confirm

    # The site positions of each category depend only on the chromosome, not
    # on the gene, so extract them once here instead of once per gene.
    # Element k of sitePos feeds column k of `counts`.
    sitePos <- lapply(list(ict, icl, iot, iol), function(idx) {
        list(sen     = insites[idx[!is.na(idx[,1]),1],1],
             antisen = insites[idx[!is.na(idx[,2]),2],2])
    })

    tmp_chrom <- exons[exons$chrom==ii,] # Extract exons entries for current chromosome
    currGenes <- unique(tmp_chrom$gene_id) # Get list of genes in current chromosome
    numCurrGenes <- length(currGenes) # Get length of the list
    counts <-  array(NA,dim=c(numCurrGenes,4)) # Allocate matrix for counts
    gnames <- array(NA,dim=c(numCurrGenes,1))
    lens <- array(NA,dim=c(numCurrGenes,1))

    l<-1 # Row of the output arrays filled by the current gene

    for (jj in currGenes) { # Loop over genes of current chromosome

        line <- paste0('gene ',l,'/',numCurrGenes,' for ',ii,':\t',jj)
        write(line,file=logFile,append=TRUE)

        tmp_gene <- tmp_chrom[tmp_chrom$gene_id==jj,] # Extract exons entries for current gene

        # Count sites of every category (sense + antisense) inside the exons of this gene
        for (k in seq_along(sitePos)) {
            counts[l,k] <- countSitesInRanges(sitePos[[k]]$sen,     tmp_gene$start, tmp_gene$end) +
                           countSitesInRanges(sitePos[[k]]$antisen, tmp_gene$start, tmp_gene$end)
        }

        gnames[l] <- jj

        # Exonic length: merge overlapping exons first so shared bases are counted once
        tmp <- reduce(IRanges(tmp_gene$start,tmp_gene$end))
        lens[l] <- sum(width(tmp))

        l <- l+1

    }
    print(proc.time() - ptm)
    save(counts, gnames, lens, file=paste0(outPath,ii,'.rda'))
}

     user    system   elapsed 
10003.430    37.036  1250.855 
    user   system  elapsed 
6689.617   23.283  852.895 
    user   system  elapsed 
5068.858   15.534  630.468 
    user   system  elapsed 
3610.427   12.715  453.050 
    user   system  elapsed 
3901.417    9.660  477.638 
    user   system  elapsed 
4409.360   14.158  534.878 
    user   system  elapsed 
3466.851    9.925  412.728 
    user   system  elapsed 
2477.845    7.065  290.814 
    user   system  elapsed 
2395.299    6.327  268.512 
    user   system  elapsed 
2424.689    7.149  274.762 
    user   system  elapsed 
4300.284   10.922  487.675 
    user   system  elapsed 
3636.109    9.013  409.669 
   user  system elapsed 
994.124   2.933 109.037 
    user   system  elapsed 
1553.384    3.860  165.397 
    user   system  elapsed 
1385.485    4.085  143.881 
    user   system  elapsed 
1730.906    4.113  173.045 
    user   system  elapsed 
2345.702    6.115  236.012 
   user  system elapsed 
685.937   1.633  71.223