In [5]:
library(BSgenome.Hsapiens.UCSC.hg38)
library(EnsDb.Hsapiens.v86)
library(data.table)
for (i in names(Hsapiens)[1:24]){ # load all chromosome map files
        load(paste0('../sim-develop/data/root_maps/',i,'.rda'))
}
chroms <- c(1:22,'X','Y')

gann <- genes(EnsDb.Hsapiens.v86)
# gann <- genes(EnsDb.Hsapiens.v86,filter=GeneBiotypeFilter("protein_coding"))
gann <- data.table(chrom=as.vector(seqnames(gann)),start=start(gann),end=end(gann),geneSym=gann$gene_name)
gann <- gann[gann$chrom %in% chroms]

In [6]:
head(gann)

chrom,start,end,geneSym
1,11869,14409,DDX11L1
1,14404,29570,WASH7P
1,17369,17436,MIR6859-1
1,29554,31109,MIR1302-2
1,34554,36081,FAM138A
1,52473,53312,OR4G4P


In [7]:
# Take the union of overlapping gene ranges, and omit parent gene data
gann <- GRanges(gann$chrom,IRanges(gann$start,gann$end))
gann <- reduce(gann)
gann <- data.table(chrom=as.vector(seqnames(gann)),start=start(gann),end=end(gann))
head(gann)

chrom,start,end
1,11869,31109
1,34554,36081
1,52473,53312
1,62948,63887
1,69091,70008
1,89295,134836


#### In the cell below, we count the number of L1 target sites of each Snap-Velcro type in genes and non-gene regions

In [8]:
counts <-  array(0,dim=c(24,8)) # Allocate matrix for counts
gene_lens <- rep(0,24)

#--- Loop through chromosome names
j<-1 # chromosome counter
for (i in names(Hsapiens)[1:24]){


        # Data objects containing indices of S-V sites for each category are labeled with the chromosome name.
        # Here we copy the data objects to a set of variables with generic names which can be used consistently 
        # in the loop. 
        map<-get(paste0(i,'Map')) # Get the site map data for the current chrom
        ict<-    map[[2]]
        icl<-    map[[3]]
        iot<-    map[[4]]
        iol<-    map[[5]]
        insites<-map[[1]] 
    
        i <- strsplit(i,"chr")[[1]][2] # Remove "chr" from chromosome name

################################################
        # count in genes for current chromosome (columns 1-4)
        ann_i <- gann[gann$chrom == i,] 
    
        sen        <-inrange(insites[ict[!is.na(ict[,1]),1],1],ann_i$start,ann_i$end) # Check if any Closed-Tight category sites are within the start-end range of ann_i
        antisen    <-inrange(insites[ict[!is.na(ict[,2]),2],2],ann_i$start,ann_i$end) 
        counts[j,1]<-length(which(sen)) + length(which(antisen))                      # Fill an element of the counts table with the count
        
        sen        <-inrange(insites[icl[!is.na(icl[,1]),1],1],ann_i$start,ann_i$end) 
        antisen    <-inrange(insites[icl[!is.na(icl[,2]),2],2],ann_i$start,ann_i$end)
        counts[j,2]<-length(which(sen)) + length(which(antisen))
        
        sen        <-inrange(insites[iot[!is.na(iot[,1]),1],1],ann_i$start,ann_i$end) 
        antisen    <-inrange(insites[iot[!is.na(iot[,2]),2],2],ann_i$start,ann_i$end)
        counts[j,3]<-length(which(sen)) + length(which(antisen))
        
        sen        <-inrange(insites[iol[!is.na(iol[,1]),1],1],ann_i$start,ann_i$end) 
        antisen    <-inrange(insites[iol[!is.na(iol[,2]),2],2],ann_i$start,ann_i$end)
        counts[j,4]<-length(which(sen)) + length(which(antisen))  
    
################################################
        # count in non-genes for current chromosome (columns 5-8)
        
        counts[j,5]   <- length(which(!is.na(as.vector(ict)))) - counts[j,1]
        counts[j,6]  <- length(which(!is.na(as.vector(icl)))) - counts[j,2]
        counts[j,7]  <- length(which(!is.na(as.vector(iot)))) - counts[j,3]
        counts[j,8]  <- length(which(!is.na(as.vector(iol)))) - counts[j,4]
    
        gene_lens[j] <- sum(ann_i$end-ann_i$start)
    
        j<-j+1
        rm('sen','antisen')
}

In [9]:
colnames(counts) <- c('ct_gene','cl_gene','ot_gene','ol_gene','ct_nongene','cl_nongene','ot_nongene','ol_nongene')
rownames(counts) <- names(Hsapiens)[1:24]
# counts

Here we account for male and female diploid genomes

#### Male

In [10]:
# Male genome - double the count for chromosomes 1:22
counts_dipl_m <- counts
counts_dipl_m[1:22,]<-counts_dipl_m[1:22,]*2
# counts_dipl_m

#### Female

In [11]:
# Female genome - double the counts for chroms. 1:23; count for chrom. Y is set to 0
counts_dipl_f <- counts
counts_dipl_f[1:23,]<-counts_dipl_f[1:23,]*2
counts_dipl_f[24,]<-counts_dipl_f[24,]*0
# counts_dipl_f

## Calculating probabilities of ENd (endonuclease-dependent) gene vs non-gene insertion

#### Male

In [12]:
counts_ovchr_m <- colSums(counts_dipl_m)
probs_ENd_m <- rep(0,2)
for (i in 1:2){
    probs_ENd_m[i] = 11.55*counts_ovchr_m[(i-1)*4+1] + 
                   7.25*counts_ovchr_m[(i-1)*4+2] + 
                   1.95*counts_ovchr_m[(i-1)*4+3] + 
                   1*counts_ovchr_m[(i-1)*4+4]
}
probs_ENd_m <- probs_ENd_m/sum(probs_ENd_m)
probs_ENd_m

#### Female

In [13]:
counts_ovchr_f <- colSums(counts_dipl_f)
probs_ENd_f <- rep(0,2)
for (i in 1:2){
    probs_ENd_f[i] = 11.55*counts_ovchr_f[(i-1)*4+1] + 
                   7.25*counts_ovchr_f[(i-1)*4+2] + 
                   1.95*counts_ovchr_f[(i-1)*4+3] + 
                   1*counts_ovchr_f[(i-1)*4+4]
}
probs_ENd_f <- probs_ENd_f/sum(probs_ENd_f)
probs_ENd_f

## Calculating probabilities of ENi (endonuclease-independent) gene vs non-gene insertion

In [14]:
chrlens <- rep(0,24)
for (i in 1:24){
    chrlens[i] <- length(Hsapiens[[i]])
}
sum(chrlens)

#### Male

In [15]:
probs_ENi_m <- rep(0,2)
probs_ENi_m[1] <- (sum(gene_lens[1:22]*2)+sum(gene_lens[23:24]))/(sum(chrlens[1:22]*2)+sum(chrlens[23:24]))
probs_ENi_m[2] <- 1-probs_ENi_m[1]
probs_ENi_m

#### Female

In [16]:
probs_ENi_f <- rep(0,2)
probs_ENi_f[1] <- sum(gene_lens[1:23]*2)/sum(chrlens[1:23]*2)
probs_ENi_f[2] <- 1-probs_ENi_f[1]
probs_ENi_f

## Calculating final probabilities of gene vs non-gene insertion

#### Male

In [17]:
pd_gnvsnon_m <- (.9*probs_ENd_m)+(0.1*probs_ENi_m) # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
pd_gnvsnon_m <- pd_gnvsnon_m/sum(pd_gnvsnon_m)
pd_gnvsnon_m

#### Female

In [18]:
pd_gnvsnon_f <- (.9*probs_ENd_f)+(0.1*probs_ENi_f) # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
pd_gnvsnon_f <- pd_gnvsnon_f/sum(pd_gnvsnon_f)
pd_gnvsnon_f

In [19]:
lens <- cbind(gene_lens, chrlens)
lens

gene_lens,chrlens
133088986,248956422
135587963,242193529
119424238,198295559
95612228,190214555
94539903,181538259
84854934,170805979
91509773,159345973
80702568,145138636
60557467,138394717
72469213,133797422


In [20]:
save(lens,counts_dipl_m,counts_dipl_f,pd_gnvsnon_m,pd_gnvsnon_f,file='../sim-develop/data/genevsnon_counts.rda')