In [1]:
library(BSgenome.Hsapiens.UCSC.hg38)
library(data.table)
for (i in names(Hsapiens)[1:24]){ # load all chromosome map files
        load(paste0('./data/large_files/',i,'.rda'))
}
chroms <- c(1:22,'X','Y')
load('./data/genes.rda')
weights <- read.csv('./data/snap_velcro_weights.csv')
weights <- weights[7,2:5]

Loading required package: BSgenome
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: S4

In [2]:
# Take the union of overlapping gene ranges, and omit parent gene data
genes <- GRanges(genes$chrom,IRanges(genes$start,genes$end))
genes <- reduce(genes)
genes <- data.table(chrom=as.vector(seqnames(genes)),start=start(genes),end=end(genes))
head(genes)

chrom,start,end
1,65419,71585
1,450703,451697
1,685679,686673
1,923928,959309
1,960587,965715
1,966497,982093


#### In the cell below, we count the number of L1 target sites of each Snap-Velcro type in genes and non-gene regions

In [3]:
counts <-  array(0,dim=c(24,8)) # Allocate matrix for counts
gene_lens <- rep(0,24)

#--- Loop through chromosome names
j<-1 # chromosome counter
for (i in names(Hsapiens)[1:24]){


        # Data objects containing indices of S-V sites for each category are labeled with the chromosome name.
        # Here we copy the data objects to a set of variables with generic names which can be used consistently 
        # in the loop. 
        map<-get(paste0(i,'Map')) # Get the site map data for the current chrom
        ict<-    map[[2]]
        icl<-    map[[3]]
        iot<-    map[[4]]
        iol<-    map[[5]]
        insites<-map[[1]] 
    
        i <- strsplit(i,"chr")[[1]][2] # Remove "chr" from chromosome name

################################################
        # count in genes for current chromosome (columns 1-4)
        ann_i <- genes[genes$chrom == i,] 
    
        sen        <-inrange(insites[ict[!is.na(ict[,1]),1],1],ann_i$start,ann_i$end) # Check if any Closed-Tight category sites are within the start-end range of ann_i
        antisen    <-inrange(insites[ict[!is.na(ict[,2]),2],2],ann_i$start,ann_i$end) 
        counts[j,1]<-length(which(sen)) + length(which(antisen))                      # Fill an element of the counts table with the count
        
        sen        <-inrange(insites[icl[!is.na(icl[,1]),1],1],ann_i$start,ann_i$end) 
        antisen    <-inrange(insites[icl[!is.na(icl[,2]),2],2],ann_i$start,ann_i$end)
        counts[j,2]<-length(which(sen)) + length(which(antisen))
        
        sen        <-inrange(insites[iot[!is.na(iot[,1]),1],1],ann_i$start,ann_i$end) 
        antisen    <-inrange(insites[iot[!is.na(iot[,2]),2],2],ann_i$start,ann_i$end)
        counts[j,3]<-length(which(sen)) + length(which(antisen))
        
        sen        <-inrange(insites[iol[!is.na(iol[,1]),1],1],ann_i$start,ann_i$end) 
        antisen    <-inrange(insites[iol[!is.na(iol[,2]),2],2],ann_i$start,ann_i$end)
        counts[j,4]<-length(which(sen)) + length(which(antisen))  
    
################################################
        # count in non-genes for current chromosome (columns 5-8)
        
        counts[j,5]   <- length(which(!is.na(as.vector(ict)))) - counts[j,1]
        counts[j,6]  <- length(which(!is.na(as.vector(icl)))) - counts[j,2]
        counts[j,7]  <- length(which(!is.na(as.vector(iot)))) - counts[j,3]
        counts[j,8]  <- length(which(!is.na(as.vector(iol)))) - counts[j,4]
    
        gene_lens[j] <- sum(ann_i$end-ann_i$start)
    
        j<-j+1
        rm('sen','antisen')
}

In [4]:
colnames(counts) <- c('ct_gene','cl_gene','ot_gene','ol_gene','ct_nongene','cl_nongene','ot_nongene','ol_nongene')
rownames(counts) <- names(Hsapiens)[1:24]
# counts

Here we account for male and female diploid genomes

#### Male

In [5]:
# Male genome - double the count for chromosomes 1:22
counts_dipl_m <- counts
counts_dipl_m[1:22,]<-counts_dipl_m[1:22,]*2
# counts_dipl_m

#### Female

In [6]:
# Female genome - double the counts for chroms. 1:23; count for chrom. Y is set to 0
counts_dipl_f <- counts
counts_dipl_f[1:23,]<-counts_dipl_f[1:23,]*2
counts_dipl_f[24,]<-counts_dipl_f[24,]*0
# counts_dipl_f

## Calculating probabilities of ENd (endonuclease-dependent) gene vs non-gene insertion

#### Male

In [7]:
counts_ovchr_m <- colSums(counts_dipl_m)
probs_ENd_m <- rep(0,2)
for (i in 1:2){
    probs_ENd_m[i] = weights$closed_tight*counts_ovchr_m[(i-1)*4+1] + 
                   weights$closed_loose*counts_ovchr_m[(i-1)*4+2] + 
                   weights$open_tight*counts_ovchr_m[(i-1)*4+3] + 
                   weights$open_loose*counts_ovchr_m[(i-1)*4+4]
}
probs_ENd_m <- probs_ENd_m/sum(probs_ENd_m)
probs_ENd_m

#### Female

In [8]:
counts_ovchr_f <- colSums(counts_dipl_f)
probs_ENd_f <- rep(0,2)
for (i in 1:2){
    probs_ENd_f[i] = weights$closed_tight*counts_ovchr_f[(i-1)*4+1] + 
                   weights$closed_loose*counts_ovchr_f[(i-1)*4+2] + 
                   weights$open_tight*counts_ovchr_f[(i-1)*4+3] + 
                   weights$open_loose*counts_ovchr_f[(i-1)*4+4]
}
probs_ENd_f <- probs_ENd_f/sum(probs_ENd_f)
probs_ENd_f

## Calculating probabilities of ENi (endonuclease-independent) gene vs non-gene insertion

In [9]:
chrlens <- rep(0,24)
for (i in 1:24){
    chrlens[i] <- length(Hsapiens[[i]])
}
sum(chrlens)

#### Male

In [10]:
probs_ENi_m <- rep(0,2)
probs_ENi_m[1] <- (sum(gene_lens[1:22]*2)+sum(gene_lens[23:24]))/(sum(chrlens[1:22]*2)+sum(chrlens[23:24]))
probs_ENi_m[2] <- 1-probs_ENi_m[1]
probs_ENi_m

#### Female

In [11]:
probs_ENi_f <- rep(0,2)
probs_ENi_f[1] <- sum(gene_lens[1:23]*2)/sum(chrlens[1:23]*2)
probs_ENi_f[2] <- 1-probs_ENi_f[1]
probs_ENi_f

## Calculating final probabilities of gene vs non-gene insertion

#### Male

In [12]:
pd_gnvsnon_m <- (.9*probs_ENd_m)+(0.1*probs_ENi_m) # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
pd_gnvsnon_m <- pd_gnvsnon_m/sum(pd_gnvsnon_m)
pd_gnvsnon_m

#### Female

In [13]:
pd_gnvsnon_f <- (.9*probs_ENd_f)+(0.1*probs_ENi_f) # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
pd_gnvsnon_f <- pd_gnvsnon_f/sum(pd_gnvsnon_f)
pd_gnvsnon_f

In [14]:
lens <- cbind(gene_lens, chrlens)
lens

gene_lens,chrlens
112034113,248956422
104246043,242193529
98072541,198295559
72773785,190214555
70407233,181538259
71356941,170805979
76562590,159345973
60676872,145138636
51144913,138394717
63228370,133797422


In [15]:
save(lens,counts_dipl_m,counts_dipl_f,pd_gnvsnon_m,pd_gnvsnon_f,file='./data/genevsnon_counts.rda')