In [7]:
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)
library(naturalsort)
library(data.table)

In [8]:
#--- Use system (Unix) commands to extract CDS locations from a hg38 .gff3 file
# Result: `anno`, a data.frame with columns V1 (chromosome name), V2 (start), V3 (end),
# i.e. GFF3 columns 1, 4 and 5 of every line containing "ID=CDS".
cat("\nFiltering hg38 gff3 file (Ensembl v86) for CDS locations... \n")
gff_path <- "../Data/humangenome/Homo_sapiens.GRCh38.86.gff3"
if (!file.exists(gff_path)) {
  stop("GFF3 file not found: ", gff_path, call. = FALSE)
}

# Scratch files live in the session temp directory so that nothing in the working
# directory is overwritten and concurrent runs cannot collide.
cds_lines_file <- tempfile("cds_lines_")
cds_coords_file <- tempfile("cds_coords_")

anno <- tryCatch({
  # Keep only the lines containing "ID=CDS"
  status <- system(paste("grep ID=CDS", shQuote(gff_path), ">", shQuote(cds_lines_file)))
  if (status != 0) {
    stop("grep found no 'ID=CDS' lines or failed (exit status ", status, ")", call. = FALSE)
  }

  # Extract columns 1, 4, 5 (chromName, start, end) from the filtered lines
  status <- system(paste("cut -f1,4,5", shQuote(cds_lines_file), ">", shQuote(cds_coords_file)))
  if (status != 0) {
    stop("cut failed (exit status ", status, ")", call. = FALSE)
  }

  cat("Reading CDS location table...\n")
  read.table(cds_coords_file) # Read CDS regions into table (chromName	start	end)
}, finally = {
  # Always remove the scratch files, even when one of the steps above failed
  unlink(c(cds_lines_file, cds_coords_file))
})
cat("Done")


Filtering hg38 gff3 file (Ensembl v86) for CDS locations... 
Reading CDS location table...
Done

In [9]:
# Turn the raw (V1 = chromosome, V2 = start, V3 = end) table into a Bioconductor
# GRanges object and merge overlapping / duplicated CDS intervals in one step.
anno <- reduce(
  GRanges(
    seqnames = anno$V1,
    ranges = IRanges(start = anno$V2, end = anno$V3)
  )
)

# Flatten the reduced ranges back into a plain data.table for fast range queries.
anno <- data.table(
  chrom = as.vector(seqnames(anno)),
  start = start(anno),
  end = end(anno)
)
names(anno) <- c("chrom", "start", "end")

# Preview the first rows
head(anno)

chrom,start,end
1,69091,70008
1,182709,182746
1,183114,183240
1,183922,184158
1,184927,184971
1,184977,185049


In [10]:
# Build `cds_counts`: one row per chromosome (in the order of `chrom_names`), 9 columns:
#   1-4 : number of S-V insertion sites lying within coding regions, for the categories
#         closed-tight (ict), closed-loose (icl), open-tight (iot), open-loose (iol)
#   5-8 : total number of non-NA entries in ict, icl, iot, iol (same order)
#   9   : fraction of the chromosome covered by (reduced) coding regions
chrom_names <- names(Hsapiens)[1:24] # chr1 ... chr22, chrX, chrY
cds_counts <- array(0, dim = c(length(chrom_names), 9))

# Count the sites of one S-V category that fall inside any [starts, ends] interval.
#   idx     : index table of the category; columns 1 and 2 hold row indices into
#             `insites` (NA where there is no site)
#   insites : table of site positions; column k is looked up with the indices from
#             column k of `idx`
#             NOTE(review): the meaning of the two columns is not visible here
#             (presumably the two strands or orientations) - confirm against the
#             code that builds the map
#   starts, ends : inclusive interval bounds of the coding regions
count_cds_sites <- function(idx, insites, starts, ends) {
  n_hits <- 0
  for (k in 1:2) {
    rows <- idx[, k]
    rows <- rows[!is.na(rows)]
    hits <- inrange(insites[rows, k], starts, ends)
    n_hits <- n_hits + sum(hits, na.rm = TRUE)
  }
  n_hits
}

#--- Loop through chromosome names
for (j in seq_along(chrom_names)) {
  i <- chrom_names[j]

  cat("\nProccessing ",i,"...")
  # Load the map file for the current chromosome. This defines an object named
  # "<chromosome>Map" (e.g. chr1Map) in the current environment.
  load(paste0("../Data/root_maps/", i, ".rda"))

  # The map object is labeled with the chromosome name; fetch it under a fixed name
  # so the code below is the same for every chromosome.
  map <- get(paste0(i, "Map"))
  insites <- map$insites
  # Category order defines columns 1-4 and 5-8 of cds_counts
  categories <- list(map$ict, map$icl, map$iot, map$iol)

  # Coding regions of the current chromosome. The annotation uses bare chromosome
  # names ("1", "X"), so strip the UCSC-style "chr" prefix before matching.
  chrno <- sub("^chr", "", i)
  anno_i <- anno[anno$chrom == chrno, ]

  for (k in seq_along(categories)) {
    # Columns 1-4: sites of this category lying within coding regions
    cds_counts[j, k] <- count_cds_sites(categories[[k]], insites, anno_i$start, anno_i$end)
    # Columns 5-8: total number of sites of this category
    cds_counts[j, k + 4] <- sum(!is.na(categories[[k]]))
  }

  # Column 9: coding-region fraction of the current chromosome. `anno` holds reduced
  # (non-overlapping) ranges, so the summed widths count every coding base once.
  cds_widths <- width(IRanges(anno_i$start, anno_i$end))
  cds_counts[j, 9] <- sum(cds_widths) / length(Hsapiens[[i]])
}


Proccessing  chr1 ...
Proccessing  chr2 ...
Proccessing  chr3 ...
Proccessing  chr4 ...
Proccessing  chr5 ...
Proccessing  chr6 ...
Proccessing  chr7 ...
Proccessing  chr8 ...
Proccessing  chr9 ...
Proccessing  chr10 ...
Proccessing  chr11 ...
Proccessing  chr12 ...
Proccessing  chr13 ...
Proccessing  chr14 ...
Proccessing  chr15 ...
Proccessing  chr16 ...
Proccessing  chr17 ...
Proccessing  chr18 ...
Proccessing  chr19 ...
Proccessing  chr20 ...
Proccessing  chr21 ...
Proccessing  chr22 ...
Proccessing  chrX ...
Proccessing  chrY ...

In [11]:
# Save the per-chromosome CDS count table (rows = chromosomes, 9 columns) as CSV
write.csv(x = cds_counts, file = "~/jackgl/cds_counts.csv")