In [1]:
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)
library(naturalsort)
library(data.table)


Loading required package: BSgenome
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
    do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl,
    intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unlist, unsplit

Loading required package: S4Vectors
Loading required package: stats4
Loading required package: IRanges
Loadi

In [2]:
#--- Extract exon locations (chromosome, start, end) from the hg38 .gff3 file
# Rows are selected by requiring the feature-type column (column 3) to be
# exactly "exon". A plain `grep exon` also keeps every line that merely
# mentions "exon" somewhere (for instance an attribute value such as a gene
# description containing "exonuclease"); such a row would contribute its whole
# span as if it were exonic.
cat("\nFiltering hg38 gff3 file for exon locations... \n")
gff3_path <- "../Data/humangenome/Homo_sapiens.GRCh38.89.gff3"

# Scratch output goes to a session temp file rather than fixed names in the
# working directory, so nothing there is clobbered or left behind.
exon_tsv <- tempfile(pattern = "exons_", fileext = ".tsv")

# awk: tab-separated in and out; print columns 1, 4, 5 of rows whose type is "exon"
awk_program <- 'BEGIN { FS = OFS = "\\t" } $3 == "exon" { print $1, $4, $5 }'
awk_status <- system2(
  "awk",
  args = c(shQuote(awk_program), shQuote(gff3_path)),
  stdout = exon_tsv
)
if (awk_status != 0) {
  # Fail loudly here instead of reading an empty or partial table below
  unlink(exon_tsv)
  stop(
    "awk failed (exit status ", awk_status, ") while filtering ", gff3_path,
    call. = FALSE
  )
}

cat("Reading exon location table...\n")
exann <- read.table(exon_tsv) # Columns V1, V2, V3 = chromName, start, end
unlink(exon_tsv)
cat("Done")


Filtering hg38 gff3 file for exon locations... 
Reading exon location table...
Done

In [3]:
# Turn the raw exon table into a Bioconductor GRanges object and collapse it
# to a set of non-overlapping ranges.
exann <- reduce(
  GRanges(
    seqnames = exann$V1,
    ranges = IRanges(start = exann$V2, end = exann$V3)
  )
)

# Pull the reduced coordinates back out into a plain data.frame with the
# columns chrom / start / end.
tmp <- list(
  chrom = as.vector(seqnames(exann)),
  start = start(exann),
  end = end(exann)
)
exann <- data.frame(tmp)
head(exann)

chrom,start,end
1,11869,12227
1,12613,12721
1,12975,13052
1,13221,14501
1,15005,15038
1,15796,15947


In [5]:
# Count the sites of one S-V category that lie inside any exon interval.
#   idx        two-column index matrix; column k holds row indices into
#              column k of `sites` (NA entries are skipped)
#   sites      two-column matrix of site positions
#   exon_start, exon_end   start/end coordinates of the exon intervals
# Returns the number of selected sites falling within [start, end] of at
# least one interval, summed over both columns.
count_exonic_sites <- function(idx, sites, exon_start, exon_end) {
  hits <- 0L
  for (k in 1:2) {
    rows <- idx[!is.na(idx[, k]), k]
    in_exon <- inrange(sites[rows, k], exon_start, exon_end)
    hits <- hits + sum(in_exon, na.rm = TRUE)
  }
  hits
}

chrom_names <- names(Hsapiens)[1:24]

# Chromosome lengths are read once from the genome's sequence info; this gives
# the same numbers as length(Hsapiens[[chrom]]) without loading each sequence.
chrom_lengths <- seqlengths(Hsapiens)[chrom_names]

# One row per chromosome, nine columns:
#   1-4  sites lying within exons  (closed-tight, closed-loose, open-tight, open-loose)
#   5-8  total sites per category  (same order)
#   9    fraction of the chromosome covered by exons
exon_counts <- array(0, dim = c(length(chrom_names), 9))

#--- Loop through chromosome names
for (j in seq_along(chrom_names)) {
  i <- chrom_names[j]

  cat("\nProccessing ",i,"...")

  # Load the map file for the current chromosome. The file defines an object
  # named "<chromosome>Map"; load() places it in the calling environment,
  # where it remains after this iteration.
  load(paste0("../Data/root_maps/", i, ".rda"))
  map <- get(paste0(i, "Map"))
  insites <- map$insites

  # Index matrices of the S-V categories, in output-column order:
  # closed-tight, closed-loose, open-tight, open-loose
  categories <- list(map$ict, map$icl, map$iot, map$iol)

  # Exonic regions of the current chromosome; the annotation table names
  # chromosomes without the "chr" prefix ("1", ..., "X", "Y").
  chrno <- sub("^chr", "", i)
  exann_i <- exann[exann$chrom == chrno, ]

  for (k in seq_along(categories)) {
    # Columns 1-4: sites of this category that lie within exons
    exon_counts[j, k] <- count_exonic_sites(
      categories[[k]], insites, exann_i$start, exann_i$end
    )
    # Columns 5-8: total number of (non-NA) sites of this category
    exon_counts[j, k + 4] <- sum(!is.na(categories[[k]]))
  }

  # Column 9: exon fraction of the current chromosome
  exon_width <- sum(width(IRanges(exann_i$start, exann_i$end)))
  exon_counts[j, 9] <- exon_width / chrom_lengths[[i]]
}


Proccessing  chr1 ...
Proccessing  chr2 ...
Proccessing  chr3 ...
Proccessing  chr4 ...
Proccessing  chr5 ...
Proccessing  chr6 ...
Proccessing  chr7 ...
Proccessing  chr8 ...
Proccessing  chr9 ...
Proccessing  chr10 ...
Proccessing  chr11 ...
Proccessing  chr12 ...
Proccessing  chr13 ...
Proccessing  chr14 ...
Proccessing  chr15 ...
Proccessing  chr16 ...
Proccessing  chr17 ...
Proccessing  chr18 ...
Proccessing  chr19 ...
Proccessing  chr20 ...
Proccessing  chr21 ...
Proccessing  chr22 ...
Proccessing  chrX ...
Proccessing  chrY ...

In [7]:
# Save the per-chromosome table: one row per chromosome in loop order,
# columns 1-4 = exonic site counts per S-V category, 5-8 = total site counts
# per category, 9 = exon fraction. exon_counts has no dimnames, so the file
# carries no chromosome or category labels of its own.
write.csv(exon_counts,file="~/jackgl/exon_counts.csv")