In [1]:
library(BSgenome.Hsapiens.UCSC.hg38)
library(data.table)
# Load one site-map .rda file per sequence name, for the first 24 names of Hsapiens
# (reported as chr1..chr22, chrX, chrY in the cell output).
# load() is deliberately called at top level so that whatever objects each file holds
# are restored into the global environment (presumably the '<chrom>Map' objects that
# the counting cell later fetches with get()).
chrom_ids <- names(Hsapiens)[1:24]
for (i in chrom_ids){
        map_file <- paste0('../sim-develop/data/root_maps/',i,'.rda')
        cat('Loading...',i,'\n',sep='')   # progress line, one per chromosome
        load(map_file)
}

# Restore the objects stored in exann.rda (presumably the 'exann' exon annotation table
# previewed in the next cell).
load('../sim-develop/data/exann.rda')

Loading required package: BSgenome
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, cbind, colMeans, colnames,
    colSums, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match,
    mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which, which.max, which.min

Loading required package: S4Vectors
Loading require

Loading...chr1
Loading...chr2
Loading...chr3
Loading...chr4
Loading...chr5
Loading...chr6
Loading...chr7
Loading...chr8
Loading...chr9
Loading...chr10
Loading...chr11
Loading...chr12
Loading...chr13
Loading...chr14
Loading...chr15
Loading...chr16
Loading...chr17
Loading...chr18
Loading...chr19
Loading...chr20
Loading...chr21
Loading...chr22
Loading...chrX
Loading...chrY


In [2]:
# Preview the first six rows of the exon annotation table
head(exann, n = 6L)

chrom,start,end,gene_id,istsg,geneSym
chr1,11874,12227,100287102,0,DDX11L1
chr1,12595,12721,100287102,0,DDX11L1
chr1,13221,16765,100287102,0,DDX11L1
chr1,16854,17055,653635,0,WASH7P
chr1,17233,18061,653635,0,WASH7P
chr1,18268,18379,653635,0,WASH7P


#### In the cell below, we count the insertion sites of each Snap-Velcro type in three types of region: TSG exons, non-TSG exons, and non-exonic regions.

In [3]:
#--- Helper: number of S-V sites of one category that lie inside any annotated range.
#
#   insites : site-position table; column 1 is used for sense sites, column 2 for antisense sites
#   idx     : two-column index table for one site category; column 1 holds row indices into
#             insites[,1], column 2 holds row indices into insites[,2]; NA entries are skipped
#   ann     : annotation rows (with $start and $end) defining the ranges to test against
#
#   Returns the sense + antisense count of sites falling within [start, end] of any row of ann
#   (inrange() treats both bounds as inclusive by default).
count_sites_in_ranges <- function(insites, idx, ann){
        if (nrow(ann) == 0) return(0)   # no ranges on this chromosome -> no site can be inside one
        n_hits <- 0
        for (strand in 1:2){            # 1 = sense, 2 = antisense
                rows   <- idx[!is.na(idx[,strand]),strand]
                n_hits <- n_hits + sum(inrange(insites[rows,strand],ann$start,ann$end), na.rm=TRUE)
        }
        n_hits
}

chrom_names  <- names(Hsapiens)[1:24]
n_categories <- 4   # site categories stored in elements 2..5 of each chromosome map

# Allocate matrix for counts: one row per chromosome; columns are
#   1-4  : sites in TSG exons      (one column per site category)
#   5-8  : sites in non-TSG exons
#   9-12 : sites outside exons
counts <- array(0,dim=c(length(chrom_names),3*n_categories))

#--- Loop through chromosome names
for (j in seq_along(chrom_names)){
        i <- chrom_names[j]

        # Data objects containing indices of S-V sites for each category are labeled with the
        # chromosome name. Fetch the one for the current chromosome and unpack it.
        map      <- get(paste0(i,'Map'))
        insites  <- map[[1]]
        # Category order matches the original variables ict, icl, iot, iol
        # (ict = Closed-Tight; the remaining three are presumably Closed-Loose, Open-Tight,
        # Open-Loose -- TODO confirm).
        site_idx <- list(ict=map[[2]], icl=map[[3]], iot=map[[4]], iol=map[[5]])

        # Exon annotations for the current chromosome, split by TSG flag
        ann_tsg    <- exann[exann$chrom == i & exann$istsg==1,]
        ann_nontsg <- exann[exann$chrom == i & exann$istsg==0,]

        for (k in seq_len(n_categories)){
                idx <- site_idx[[k]]

                # count in TSG exons (columns 1-4) and non-TSG exons (columns 5-8)
                counts[j,k]                <- count_sites_in_ranges(insites,idx,ann_tsg)
                counts[j,k+n_categories]   <- count_sites_in_ranges(insites,idx,ann_nontsg)

                # count in non-exons (columns 9-12): all non-NA sites of this category minus the
                # two exon counts.
                # NOTE(review): a site lying inside both a TSG exon and a non-TSG exon is
                # subtracted twice here; confirm the two exon sets never overlap.
                counts[j,k+2*n_categories] <- sum(!is.na(as.vector(idx))) -
                                              counts[j,k] - counts[j,k+n_categories]
        }
}

In [4]:
# Label the counts matrix: rows are the 24 chromosome names, columns are
# <site category>_<region> in the same order the counting cell filled them.
dimnames(counts) <- list(
    names(Hsapiens)[1:24],
    c('ct_tsg','cl_tsg','ot_tsg','ol_tsg',
      'ct_nontsg','cl_nontsg','ot_nontsg','ol_nontsg',
      'ct_nonexon','cl_nonexon','ot_nonexon','ol_nonexon')
)
counts

Unnamed: 0,ct_tsg,cl_tsg,ot_tsg,ol_tsg,ct_nontsg,cl_nontsg,ot_nontsg,ol_nontsg,ct_nonexon,cl_nonexon,ot_nonexon,ol_nonexon
chr1,2713,5472,11420,39432,46985,105613,228957,812672,2622841,4089980,10248146,27118910
chr2,2352,5160,11039,34620,37507,91020,195883,665516,2747329,4668518,11573976,30198334
chr3,3177,6653,13964,46905,28745,67379,143562,494504,2281438,3924864,9737293,25445085
chr4,1219,3064,6358,21576,24456,58391,127386,405276,2213927,4008126,10072339,25611500
chr5,1232,2828,5849,20207,26986,62472,134077,446793,2062826,3615352,8968452,23484693
chr6,2077,4353,9234,29694,26223,63747,137065,464873,1974473,3396230,8463275,21844592
chr7,1406,3367,7100,24242,25117,57073,123541,424872,1890448,3029610,7589918,19654782
chr8,1416,3058,6415,21647,18621,43580,94079,314392,1624416,2802963,6985742,18325238
chr9,1236,2678,5710,19745,18054,43017,92306,330894,1393508,2252541,5579878,14791315
chr10,1188,2831,5717,19834,20928,50012,106943,365097,1509965,2414919,6007481,15971029


## Calculating ENd (endonuclease-dependent) mutation type probabilities

In [5]:
# Collapse the per-chromosome table into one total per column (12 totals:
# 4 site categories x 3 region types).
counts_ovchr <- colSums(counts)

# Weight applied to each of the four site categories, in column order within a region.
# NOTE(review): the provenance of these values is not shown in this notebook -- confirm.
category_weight <- c(11.55, 7.25, 1.95, 1)

# For each of the 3 region types, take the weighted sum of its four category totals.
probs_ENd <- numeric(3)
for (region in seq_len(3)){
    offset <- (region-1)*4   # columns of this region are offset+1 .. offset+4
    probs_ENd[region] <- category_weight[1]*counts_ovchr[offset+1] +
                         category_weight[2]*counts_ovchr[offset+2] +
                         category_weight[3]*counts_ovchr[offset+3] +
                         category_weight[4]*counts_ovchr[offset+4]
}

# Normalise so the three region probabilities sum to 1
probs_ENd <- probs_ENd/sum(probs_ENd)
probs_ENd

## Calculating ENi (endonuclease-independent) mutation type probabilities

In [6]:
# Total length (bp) of the 24 sequences used throughout this notebook.
# seqlengths() reads the lengths from the genome's sequence metadata, so the full
# chromosome sequences do not have to be loaded just to measure them.
# as.numeric() is required before summing: the lengths are integers and the sum over the
# human chromosomes exceeds R's 32-bit integer range.
genomeLen <- sum(as.numeric(seqlengths(Hsapiens)[names(Hsapiens)[1:24]]))
genomeLen

In [7]:
# ENi probabilities = fraction of the genome occupied by each region type:
#   [1] TSG exons, [2] non-TSG exons, [3] everything else.

# Only exons on the 24 chromosomes that make up genomeLen may contribute,
# otherwise numerator and denominator would describe different sequence sets.
exann_main <- exann[exann$chrom %in% names(Hsapiens)[1:24],]

# Exon width in bp. The site-counting cell tests sites with inrange(), whose default
# treats both start and end as inside the exon, so the matching width is end - start + 1.
# NOTE(review): exons that overlap or are listed more than once are summed as-is;
# confirm exann is non-redundant.
exon_width <- as.numeric(exann_main$end - exann_main$start + 1)

tsg_exon_len    <- sum(exon_width[exann_main$istsg==1])
nontsg_exon_len <- sum(exon_width[exann_main$istsg==0])

probs_ENi <- c(tsg_exon_len,
               nontsg_exon_len,
               genomeLen - tsg_exon_len - nontsg_exon_len)/genomeLen
probs_ENi

In [8]:
# Blend the two region-probability vectors: 0.9 weight on the endonuclease-dependent
# estimate and 0.1 on the endonuclease-independent one, then renormalise to sum to 1.
weight_ENd <- .9
weight_ENi <- 0.1
pdfinal <- (weight_ENd*probs_ENd)+(weight_ENi*probs_ENi)
pdfinal <- pdfinal/sum(pdfinal)
pdfinal

In [9]:
# Persist the final region-probability vector under its own name ('pdfinal')
save(list='pdfinal', file='../sim-develop/data/mutation_type_pd.rda')