This notebook counts the L1 target sites that fall within the exonic regions of each gene and saves the results to a separate file for each chromosome.

In [5]:
library(data.table)
library(IRanges)
# Load the per-chromosome L1 target site annotation files.
# A plain `for` loop at top level is used on purpose: load() restores objects
# into the calling environment, so the loaded objects land in the global
# workspace where the counting loop later looks them up by name.
for (i in c(1:22,'X','Y')){
        load(paste0('./data/large_files/chr',i,'.rda'))
}
load('./data/exons.rda') # Load gene annotation

# Output directory; create it only if missing so that re-running this cell
# does not emit an "already exists" warning.
outDir <- './data/gene_counts_exon'
if (!dir.exists(outDir)) {
        dir.create(outDir)
}
# Common prefix for all output files: '<outPath><chrom>.rda' and '<outPath>.log'
outPath <- file.path(outDir, 'out')

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: S4Vectors
Loading required package: s

In [None]:
# For every chromosome: count, per gene, the L1 target sites of each of four
# categories that fall inside the gene's exons, compute the total (merged)
# exon length of the gene, and save the results to '<outPath><chrom>.rda'.
# Progress is appended to '<outPath>.log'.
#
# Relies on objects created by the setup cell: `outPath`, `exons`
# (columns chrom, gene_id, start, end are used here) and one 'chr<chrom>Map'
# object per chromosome.
for (ii in c(1:22,'X','Y')){ # Loop over chromosomes

    ptm <- proc.time()
    logFile <- paste0(outPath,'.log')
    line <- paste0('######## Chromosome ',ii,' #########\n')
    write(line,file=logFile,append=TRUE)

    map <- get(paste0('chr',ii,'Map')) # Get the L1 target site annotation for the current chromosome
    insites <- map[[1]] # Site positions: column 1 is used for sense, column 2 for antisense

    # Elements 2..5 of the map hold, per category, row indices into `insites`
    # (column 1 = sense, column 2 = antisense; NA entries are skipped).
    # Element 2 is the Closed-Tight category; the other three category names
    # are not visible in this notebook.
    # The resulting position vectors depend only on the chromosome, so they
    # are built once here rather than once per gene.
    categories <- list(map[[2]], map[[3]], map[[4]], map[[5]])
    sitePos <- lapply(categories, function(idx) {
        list(sen     = insites[idx[!is.na(idx[,1]),1],1],
             antisen = insites[idx[!is.na(idx[,2]),2],2])
    })

    tmp_chrom <- exons[exons$chrom==ii,] # Extract exons entries for current chromosome
    currGenes <- unique(tmp_chrom$gene_id) # Get list of genes in current chromosome
    numCurrGenes <- length(currGenes) # Get length of the list
    counts <- array(NA,dim=c(numCurrGenes,4)) # One row per gene, one column per category (map[[2]]..map[[5]])
    gnames <- array(NA,dim=c(numCurrGenes,1)) # Gene identifiers, row-aligned with counts
    lens <- array(NA,dim=c(numCurrGenes,1))   # Merged exon length per gene, row-aligned with counts

    l <- 1 # Row index of the current gene in counts/gnames/lens

    for (jj in currGenes) { # Loop over genes of current chromosome

        line <- paste0('gene ',l,'/',numCurrGenes,' for ',ii,':\t',jj)
        write(line,file=logFile,append=TRUE)

        tmp_gene <- tmp_chrom[tmp_chrom$gene_id==jj,] # Extract exons entries for current gene

        # inrange() flags each site lying within any of the gene's exon
        # intervals, so a site covered by overlapping exons is counted once.
        for (k in seq_along(sitePos)) {
            sen     <- inrange(sitePos[[k]]$sen,    tmp_gene$start,tmp_gene$end)
            antisen <- inrange(sitePos[[k]]$antisen,tmp_gene$start,tmp_gene$end)
            counts[l,k] <- sum(sen,na.rm=TRUE) + sum(antisen,na.rm=TRUE)
        }

        gnames[l] <- jj
        # Merge overlapping exons before summing widths so shared bases are
        # not counted more than once.
        tmp <- reduce(IRanges(tmp_gene$start,tmp_gene$end))
        lens[l] <- sum(width(tmp))

        l <- l+1

    }
    print(proc.time() - ptm) # Elapsed time for this chromosome
    save(counts, gnames, lens, file=paste0(outPath,ii,'.rda'))
}