In [1]:
# Attach the hg38 genome package (source of the `Hsapiens` object used below)
# and data.table (source of `inrange()`).
library("BSgenome.Hsapiens.UCSC.hg38")
library("data.table")

# Restore the objects saved in genes.rda into the workspace.
# NOTE(review): the later code reads a `genes` table with `chrom`, `start`
# and `end` columns, presumably provided by this file -- confirm.
load(file = "./data/genes.rda")

Loading required package: BSgenome
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: S4

In [None]:
# Build `gene_counts`: one row per chromosome, nine columns.
#   cols 1-4: number of sites of each S-V category lying within genes
#   cols 5-8: total number of (non-NA) sites of each category
#   col  9  : fraction of the chromosome covered by genes
# Categories are in this order: closed-tight, closed-loose, open-tight,
# open-loose (map elements `ict`, `icl`, `iot`, `iol`).

# Chromosomes to process: the first 24 sequences of the genome object.
chrom_names <- names(Hsapiens)[1:24]

# Sequence lengths looked up once from the genome metadata, so the loop does
# not have to pull each full chromosome sequence into memory just to measure it.
chrom_lengths <- seqlengths(Hsapiens)

# Names of the per-category index tables inside each chromosome map, in the
# column order used by `gene_counts`.
site_categories <- c("ict", "icl", "iot", "iol")

# Allocate the result up front (rows = chromosomes, columns as described above).
gene_counts <- array(0, dim = c(length(chrom_names), 9))

# Count the sites referenced by `site_index` that fall inside any gene.
#
# site_index : two-column table of row indices; column k indexes into column k
#              of `insites`. NA entries are skipped.
# insites    : two-column table of site positions.
# gene_start, gene_end : start/end coordinates of the gene regions.
#
# Returns the number of referenced sites (both columns combined) that
# inrange() reports as lying within at least one [gene_start, gene_end] range.
count_sites_in_genes <- function(site_index, insites, gene_start, gene_end) {
  n_hits <- 0
  for (col in 1:2) {
    rows <- site_index[which(!is.na(site_index[, col])), col]
    in_gene <- inrange(insites[rows, col], gene_start, gene_end)
    n_hits <- n_hits + sum(in_gene, na.rm = TRUE)
  }
  n_hits
}

#--- Loop through chromosome names
for (j in seq_along(chrom_names)) {
  i <- chrom_names[j]
  cat("\nProccessing ", i, "...")

  # Load the map file for the current chromosome into a scratch environment.
  # The environment is replaced on the next iteration, so the large
  # per-chromosome objects do not pile up in the workspace.
  map_env <- new.env()
  load(paste0("./data/large_files/", i, ".rda"), envir = map_env)

  # The map object is stored under a chromosome-specific name ("<chrom>Map");
  # bind it to a fixed name so the code below is the same for every chromosome.
  # inherits = FALSE: only accept the object that was just loaded.
  map <- get(paste0(i, "Map"), envir = map_env, inherits = FALSE)
  insites <- map$insites

  # Extract gene regions for the current chromosome. The `genes` table is
  # matched on the chromosome number (or letter), i.e. the name without "chr".
  chrno <- sub("^chr", "", i)
  genes_i <- genes[genes$chrom == chrno, ]

  for (k in seq_along(site_categories)) {
    site_index <- map[[site_categories[k]]]

    # Columns 1-4: sites of this category lying within genes.
    gene_counts[j, k] <- count_sites_in_genes(
      site_index, insites, genes_i$start, genes_i$end
    )

    # Columns 5-8: total number of sites of this category.
    gene_counts[j, k + 4] <- sum(!is.na(site_index))
  }

  # Column 9: gene fraction of the current chromosome. Overlapping gene
  # regions are merged first so shared bases are only counted once.
  gene_ranges <- reduce(IRanges(genes_i$start, genes_i$end))
  gene_counts[j, 9] <- sum(width(gene_ranges)) / chrom_lengths[[i]]
}

In [None]:
# Save the per-chromosome count table as CSV.
write.csv(
  x = gene_counts,
  file = "./data/gene_counts.csv"
)