In [3]:
# -----------------------------------------------------------------------------
# gen-sim.r

# Entry point for simulating retrotransposition of L1 elements on hg38.
#
# This script loads a file representing a series of 4x24 arrays that's generated 
# by 'get_sv_dist.r'. It uses this to calculate a 'probability of insertion' 
# vector for the chromosomes specified. 
# 
# Insertion site probabilities are based on the 'Snap-Velcro model' described 
# in Clement Monot, et al. (2013) "The Specificity and Flexibility of L1 Reverse 
# Transcription Priming at Imperfect T-Tracts." PLOS Genetics, 9:5.
#
# This script does two things:
#	* Selects the insertion sites.
#	* Prepares the sequences for insertion.
#
# Dependencies: R (>= 2.8.0); packages Biostrings, BSgenome (for default hg38),
# and GenomicRanges.
# -----------------------------------------------------------------------------


In [4]:
#--- Load libraries
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
    do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl,
    intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unlist, unsplit

Loading required package: S4Vectors
Loading required package: stats4
Loading required package: IRanges
Loading required package: XVector
Loadin

In [5]:
# Number of L1 copies to insert in this simulation run.
copyNum <- 20

#--- Retrotransposition normally requires the endonuclease encoded by L1.
# ENifrc is the allowed fraction of endonuclease-independent (ENi) insertions,
# which are currently modeled as occurring at random locations.
# The fraction is tunable; the default is 0.1 (10% of insertions).
ENifrc <- 0.1

# Genome object to simulate on: hg38 human genome.
genome <- Hsapiens

# Echo the run parameters.
cat("Copy number: ", copyNum, "\n")
cat("ENi insertion fraction: ", ENifrc, "\n")

Copy number:  20 
ENi insertion fraction:  0.1 


In [6]:
#--- Accumulators for the simulated insertion site data.
# They start empty and are extended once per chromosome by the generation loop.
sites_loci    <- NULL # loci (integer in range 1..chromosome length)
sites_chrm    <- NULL # chromosome name for each locus above (1,2,..22,X,Y)
sites_strand  <- NULL # insertion strand symbol for each locus: "+" or "-"
sites_classes <- NULL # site type of each insertion (1 of 4 Snap-Velcro categories, or endonuclease-independent)

# Lookup table from strand number to strand symbol: 1 -> "+", 2 -> "-".
# While generating insertion sites we randomly sample from [1,2] and translate
# the draw to a strand symbol through this table.
strdict <- c("1" = "+", "2" = "-")

In [7]:
#--- Choose the chromosomes that receive insertions, weighted by site counts.

# chrmpd.rda must either be provided or be generated by running 'get_sv_dist.r'.
load("./data/chrmpd.rda")

# Loading chrmpd.rda defines chrmcnt, a 24x4 matrix holding, for each
# chromosome (row), the count of each type of insertion site (column).
# Column order: 'closed-tight', 'closed-loose', 'open-tight', 'open-loose'.
# Any of these counts could serve as a probability model of insertion across
# chromosomes; here only the first column (the number of 'closed-tight'
# sites) is used as the sampling weight.

# Draw copyNum chromosome names (with replacement) according to those weights,
# then tabulate the draws so each chosen chromosome maps to its insertion count.
chrmlist <- sample(
        x = names(genome)[seq_len(24)],
        size = copyNum,
        replace = TRUE,
        prob = chrmcnt[, 1]
)
chrmlist <- table(chrmlist)
cat("\nChromosomes: ", names(chrmlist), "\n")

#--- Load the map file of every chosen chromosome.
# Each file lives at ./data/root_maps/<chromosome name>.rda and is loaded into
# the current environment.
for (mapchr in names(chrmlist)) {
        cat("\nLoading map file...")
        load(paste0("./data/root_maps/", mapchr, ".rda"))
}
cat("\n")


Chromosomes:  chr1 chr11 chr12 chr14 chr16 chr2 chr21 chr3 chr4 chr5 chr7 chr8 chr9 

Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...
Loading map file...


In [8]:
#--- Generate genome-wide insertions
#
# For each selected chromosome, draw chrmlist[[chrnm]] insertion sites and
# append them to the genome-wide vectors sites_loci, sites_chrm, sites_strand
# and sites_classes.
#
# Site classes sampled below:
#     1..4 - Snap-Velcro categories taken from ict, icl, iot, iol (in that order)
#     5    - endonuclease-independent (ENi) insertion at a uniformly random locus
for (chrnm in names(chrmlist)) { # loop over the selected chromosomes

        # Print current chromosome
        cat("\nChromosome: ",chrnm)

        # Each chromosome map file contains locations of the 4 SV types of insertion sites
        # Variables:
        #     insites - all locations of S-V type sites
        #     ict     - indices of 'insites' which are 'closed-tight'
        #     icl     - indices of 'insites' which are 'closed-loose'
        #     iot     - indices of 'insites' which are 'open-tight'
        #     iol     - indices of 'insites' which are 'open-loose'
        # The index objects are addressed as [row, strand column] and use NA
        # for unused entries.

        # We use get() to extract the map data for a specific chromosome from the local variables by its name
        map<-get(paste0(chrnm,"Map"))
        ict<-map$ict
        icl<-map$icl
        iot<-map$iot
        iol<-map$iol
        insites<-map$insites

        # Index matrices in class order, so classes 1..4 share a single code path
        class_idx<-list(ict,icl,iot,iol)

        chrcopyNum<-chrmlist[[chrnm]] # get the number of insertions in the current chromosome

        # Create the insertion probability distribution according to the enrichment values from Monot et al. (2013)
        pd <- c(11.55*sum(!is.na(ict)),7.25*sum(!is.na(icl)),1.95*sum(!is.na(iot)),1*sum(!is.na(iol)))
        if (sum(pd)==0) {
                # Without this check the normalisation below yields NaN
                # probabilities and sample() fails with an unhelpful message.
                stop("No Snap-Velcro insertion sites found in the map for ",chrnm)
        }
        pd <- (pd/sum(pd))*(1-ENifrc) # Adjust distribution to account for fraction of endonuc.-independent insertions
        pd <- append(pd,ENifrc)
        cat("\nSite class distribution:\n",pd)

        # Sample (from the distribution defined above) which types of SV sites will be selected
        classes <- sample(x = c(1:5),chrcopyNum,replace=TRUE,prob=pd)

        # Allocate memory for sites and strand of each site
        sites <- numeric(chrcopyNum)
        strand <- character(chrcopyNum) # strand symbols are strings, so preallocate as character

        for (i in seq_len(chrcopyNum)) { # loop chrcopyNum times

                if (classes[i]==5) {
                        # ENi insertion: uniform integer locus in 1..chromosome length.
                        # floor(runif(1, 1, n+1)) keeps the locus integral and makes the
                        # last base reachable, while still consuming one runif() draw.
                        sites[i] <- floor(runif(1,1,length(genome[[chrnm]])+1))
                        strand[i] <- strdict[[sample(c(1,2),1)]]
                } else {
                        # Snap-Velcro insertion: the class number selects the index matrix
                        idx <- class_idx[[classes[i]]]
                        tmp <- sample(c(1,2),1) # Randomly sample strand #FIXME - should be updated

                        # Rows of the chosen strand column that actually hold a site.
                        # Sampling from these rows (instead of 1:count) cannot return
                        # row 0 or an NA entry, wherever the NAs are located.
                        valid <- which(!is.na(idx[,tmp]))
                        if (length(valid)==0) {
                                # This class was drawn with non-zero probability, so it has
                                # sites somewhere; with two strand columns they must be on
                                # the other strand.
                                tmp <- 3-tmp
                                valid <- which(!is.na(idx[,tmp]))
                        }
                        if (length(valid)==0) {
                                stop("No usable sites of class ",classes[i]," on either strand of ",chrnm)
                        }
                        sites[i] <- insites[idx[valid[sample.int(length(valid),1)],tmp]]
                        strand[i] <- strdict[[tmp]]
                }
        }

        cat("\nInsertion sites:\n")
        cat(sites,"\n")
        # Disabled debug output, kept for reference:
        #cat("Chosen "Snap" sections:\n")
        #for (i in 1:copyNum) {
        #       print(chr[(sites[i]-3):sites[i]])
        #}

        # Append the sites selected for the current chromosome to the genome-wide variables
        sites_loci<-append(sites_loci,sites)
        sites_chrm<-append(sites_chrm,rep(chrnm,chrcopyNum))
        sites_strand<-append(sites_strand,strand)
        sites_classes<-append(sites_classes,classes)
}
rm(ict,icl,iot,iol,insites) # clean up


Chromosome:  chr1
Site class distribution:
 0.2531321 0.2497691 0.1677221 0.2293768 0.1
Insertion sites:
32574951 

Chromosome:  chr11
Site class distribution:
 0.2409367 0.2525472 0.1696881 0.236828 0.1
Insertion sites:
110147809 111007138 39523764 

Chromosome:  chr12
Site class distribution:
 0.2525038 0.2506428 0.168869 0.2279844 0.1
Insertion sites:
125117466 

Chromosome:  chr14
Site class distribution:
 0.247229 0.2542373 0.1687052 0.2298284 0.1
Insertion sites:
89500031 

Chromosome:  chr16
Site class distribution:
 0.2830065 0.2360391 0.1589274 0.222027 0.1
Insertion sites:
52680478 

Chromosome:  chr2
Site class distribution:
 0.2402244 0.2577759 0.1714281 0.2305716 0.1
Insertion sites:
147178840 134480063 

Chromosome:  chr21
Site class distribution:
 0.2361277 0.2579334 0.1707289 0.23521 0.1
Insertion sites:
35594105 39387221 

Chromosome:  chr3
Site class distribution:
 0.2381101 0.2583632 0.1719472 0.2315796 0.1
Insertion sites:
70673105 150283482 91080410 

Chromosome: 