## Load libraries and necessary data files, and define global variables

In [40]:
#--- Load libraries and necessary data files, and define global variables
library(data.tree)
library(data.table)
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)
# Alias for the reference genome object attached by BSgenome.Hsapiens.UCSC.hg38
genome <- Hsapiens
# Helper scripts. gen_sim() calls process_L1s() and update_chrom_map() calls
# mapSeq_SV(); presumably these are defined in the two files below -- TODO confirm
source("process_L1s.r")
source("mapsequence.r")
# NOTE(review): gen_sim() reads a global `chrmcnt`; presumably it comes from
# chrmpd.rda -- confirm
load("../data/chrmpd.rda") # load chromosome probability distribution
load("../data/L1RankTable.rda")
load("../data/geneann.rda")
load("../data/exann.rda")
# Keep only entries whose type is 'gene'. Single-index logical subsetting selects
# rows only for data.table-like objects -- assumes geneann is one; TODO confirm
geneann = geneann[geneann$type=='gene']
# Tables handed to process_L1s() by gen_sim() (file names suggest truncation and
# transduction distributions -- confirm)
trpd <- read.table("../data/L1truncpd.csv",sep=",")
tdpd <- read.table("../data/L1transdpd.csv",sep=",")
# NOTE(review): gen_sim() fetches objects named "<chromosome>Map" with get().
# The loading loop below is commented out, so those objects must already exist
# in the session for gen_sim() to run.
# for (i in names(Hsapiens)[1:24]){ # load all chromosome map files
#         load(paste0("../data/root_maps/",i,".rda"))
# }
# Lookup from the sampled strand code (1 or 2) to the strand symbol
strdict<-c("+","-")
names(strdict)<-c(1,2)

## Define Functions

In [41]:
# PURPOSE: To update the insertion site annotation of a chromosome for a clone,
# accounting for the L1 insertions which have already occurred in that clone.
#
# INPUT:
#   chrnm         (string) chromosome name
#   chrMap        (list) chromosome annotation with elements insites, ict, icl,
#                 iot and iol
#   sites_chrm    (character vector) chromosome of each L1 insertion of the clone
#   sites_loci    (numeric vector) position of each insertion (in its chromosome)
#   l1s           (DNAStringSet) inserted L1 sequences, parallel to sites_chrm
#
# OUTPUT: (list) updated chromosome annotation with named elements
#         insites, ict, icl, iot, iol (in that order). chrMap is returned
#         unchanged when the clone has no insertion on chrnm.


update_chrom_map <- function(chrnm,chrMap,sites_chrm,sites_loci,l1s) {

    # Insertions of this clone that landed on the requested chromosome.
    # which() yields an empty index for a NULL/empty sites_chrm as well.
    onchr <- which(sites_chrm == chrnm)
    if (length(onchr) == 0) {
        return(chrMap) # nothing to account for on this chromosome
    }

    chrloci <- sites_loci[onchr] # Sites where insertions occurred in the chromosome
    chrl1s <- l1s[onchr]         # L1 elements which were inserted at those sites

    ict <- chrMap$ict
    icl <- chrMap$icl
    iot <- chrMap$iot
    iol <- chrMap$iol
    insites <- chrMap$insites

    for (i in seq_along(chrloci)) { # Loop over the simulated insertion sites
        insites[which(is.na(insites))] <- -1 # Replace NA with -1 so the comparison below is defined
        indx <- insites > chrloci[i] # Target sites which lie downstream of the insertion point
        insites[indx] <- insites[indx] + width(chrl1s[i]) # Shift them by the length of the L1
        l1_map <- mapSeq_SV(chrl1s[i]) # Map target sites in the L1
        l1_map$insites <- l1_map$insites + chrloci[i] # Convert L1 loci to chromosome loci
        insites <- rbind(insites, l1_map$insites) # Add target sites within L1 to chrom map
        # NOTE(review): the class tables from mapSeq_SV are appended as-is. gen_sim
        # uses their values as indices into insites, so they may need offsetting
        # after the rbind above -- confirm against mapSeq_SV.
        ict <- rbind(ict, l1_map$ict)
        icl <- rbind(icl, l1_map$icl)
        iot <- rbind(iot, l1_map$iot)
        iol <- rbind(iol, l1_map$iol)
    }

    # Elements are named so that callers can read map$insites, map$ict, ...;
    # positional access ([[1]] .. [[5]]) keeps the original order.
    return(list(insites=insites, ict=ict, icl=icl, iot=iot, iol=iol))

}

In [42]:
# PURPOSE: To simulate the transposition of an L1 sequence in a given genome
#
# INPUT:
#   genome        (BSgenome) reference genome
#   node          (data.tree node) input node of tree
#   copyNum       (integer) number of L1 insertions to simulate
#
# OUTPUT: (list) [[1]] inserted sequences, [[2]] chromosome of each insertion,
#         [[3]] insertion position, [[4]] strand ("+" or "-")
#
# Reads the globals chrmcnt, ENifrc, strdict, L1RankTable, trpd, tdpd and one
# "<chromosome>Map" object per sampled chromosome.

gen_sim <- function(genome,node,copyNum) {

        sites_loci<-c() # Initialize arrays for storing simulated ins. site data
        sites_chrm<-c()
        sites_strand<-c()

        #--- Sample chromosomes based on probability ranking. The data file chrmpd.rda 
        #--- must either be provided or generated by running 'get_sv_dist.r'.
        #--- Here the ranking is provided by the number of 'TTTT' patterns.
        chrmlist<-sample(x=names(genome)[1:24],copyNum,replace=TRUE,prob=chrmcnt[,1])
        chrmlist<-table(chrmlist) # number of insertions per sampled chromosome

        # Relative per-site weight of the four site classes (ict, icl, iot, iol)
        classweights<-c(11.55,7.25,1.95,1)

        for (chrnm in names(chrmlist)) {

                map<-get(paste0(chrnm,"Map"))
                # NOTE(review): the nodes built in this file store their insertion
                # history in node$tes; no visible code sets node$sites_chrm,
                # node$sites_loci or node$l1s, so these are presumably NULL and the
                # map comes back unchanged -- confirm which fields are intended.
                map<-update_chrom_map(chrnm,map,node$sites_chrm,node$sites_loci,node$l1s)
                insites<-map$insites
                # Index tables of classes 1..4, in the order used by `classes` below
                classmaps<-list(map$ict,map$icl,map$iot,map$iol)

                chrcopyNum<-chrmlist[[chrnm]]

                #--- Probability of each site class: weighted site counts share
                #--- (1-ENifrc); class 5 (random position) gets ENifrc
                counts <- vapply(classmaps,function(m) as.numeric(length(which(!is.na(m)))),numeric(1))
                pd <- classweights*counts
                pd <- (pd/sum(pd))*(1-ENifrc)
                pd <- append(pd,ENifrc)

                #--- Generates insertion sites
                classes <- sample(x = c(1:5),chrcopyNum,replace=TRUE,prob=pd)
                sites <- rep(0,chrcopyNum)
                strand <- character(chrcopyNum)

                for (i in seq_len(chrcopyNum)) {
                        if (classes[i]==5) {
                                # Endonuclease-independent: uniform position on the chromosome
                                sites[i]<-runif(1,1,length(genome[[chrnm]]))
                                strand[i] <- strdict[[sample(c(1,2),1)]]
                        } else {
                                cmap <- classmaps[[classes[i]]]
                                tmp <- sample(c(1,2),1) # strand column: 1 = "+", 2 = "-"
                                # Rows 1..nsites of the column are taken to be the valid
                                # entries (NAs assumed to be at the bottom of the column)
                                nsites <- length(which(!is.na(cmap[,tmp])))
                                if (nsites==0) {
                                        # 1:0 would silently sample row 1 or row 0 here
                                        stop("gen_sim: no sites of class ",classes[i],
                                             " on strand ",strdict[[tmp]]," of ",chrnm)
                                }
                                sites[i] <- insites[cmap[sample.int(nsites,1),tmp]]
                                strand[i] <- strdict[[tmp]]
                        }
                }

                sites_loci<-append(sites_loci,sites)
                sites_chrm<-append(sites_chrm,rep(chrnm,chrcopyNum))
                sites_strand<-append(sites_strand,strand)

        }

        #--- Creates sequences for insertion (only the sequences are used here)
        l1s <- process_L1s(genome,L1RankTable,trpd,tdpd,copyNum)[[1]]
        return(list(l1s,sites_chrm,sites_loci,sites_strand))
}




In [43]:
# PURPOSE: To check whether the division rate of a new clone should be changed from its parent
#
# INPUT:
#   r             (float) division rate
#   geneann       annotation table with columns chrom, start, end, istsg
#   sites_chrm    (character vector) chromosome of each L1 insertion of the clone
#   sites_loci    (numeric vector) insertion positions (in the respective chromosome)
#
# OUTPUT: (float) possibly updated division rate:
#         0 if any insertion lies inside a non-TSG interval (istsg == 0),
#         otherwise r doubled once per insertion lying inside a TSG interval
#         (istsg == 1). Interval bounds are inclusive.

rank_clone <- function(r, geneann, sites_chrm, sites_loci) {

    # For each locus: does it lie inside at least one [start, end] interval?
    # Each insertion is counted once even if several annotated intervals
    # (e.g. overlapping exons) contain it.
    hits_any <- function(loci, starts, ends) {
        vapply(loci,
               function(p) any(p >= starts & p <= ends, na.rm = TRUE),
               logical(1))
    }

    gene_hits <- 0 # insertions into non-TSG intervals
    tsg_hits <- 0  # insertions into TSG intervals

    for (chrom in unique(sites_chrm)) { # loop over chromosomes inserted into
        chrmann <- geneann[which(geneann$chrom == chrom), ] # annotation rows of this chromosome
        loci <- sites_loci[which(sites_chrm == chrom)]      # insertion loci on this chromosome

        ntsg <- which(chrmann$istsg == 0)
        tsg <- which(chrmann$istsg == 1)

        gene_hits <- gene_hits + sum(hits_any(loci, chrmann$start[ntsg], chrmann$end[ntsg]))
        tsg_hits <- tsg_hits + sum(hits_any(loci, chrmann$start[tsg], chrmann$end[tsg]))
    }

    if (gene_hits > 0) {
        r <- 0
    } else if (tsg_hits > 0) {
        r <- r*2^tsg_hits # each TSG insertion doubles cell division rate
    }

    return(r)
}

In [44]:
# PURPOSE: To update the gene annotation of a clone for its L1 insertions
#
# INPUT:
#   geneann       annotation table with columns chrom, start, end
#   simout        (list) new insertions: [[1]] sequences, [[2]] chromosomes,
#                 [[3]] loci, [[4]] strands (as returned by gen_sim)
#   tes           (list) earlier insertions of the clone, same layout as simout
#
# OUTPUT: annotation table in which, for every insertion (new ones first, then
#         the earlier ones), intervals starting after the insertion locus are
#         shifted by the inserted length and intervals strictly spanning the
#         locus have their end extended by it.

update_geneann <- function(geneann, simout, tes) {

    # Concatenate new and earlier insertions component-wise
    tmp <- mapply(append, simout, tes, SIMPLIFY = FALSE)
    ins_l1s <- tmp[[1]]
    ins_chrm <- tmp[[2]]
    ins_loci <- tmp[[3]]

    for (i in seq_along(ins_loci)) {
        len <- width(ins_l1s[i])
        onchr <- geneann$chrom == ins_chrm[i]
        # Both index sets are taken before any update; they are disjoint, so
        # this matches updating start first and re-testing afterwards.
        downstream <- which(onchr & geneann$start > ins_loci[i])
        spanning <- which(onchr & geneann$start < ins_loci[i] & geneann$end > ins_loci[i])

        geneann$start[downstream] <- geneann$start[downstream] + len
        geneann$end[downstream] <- geneann$end[downstream] + len
        geneann$end[spanning] <- geneann$end[spanning] + len
    }
    return(geneann)
}

In [45]:
# PURPOSE: One time step for one clone: with probability node$tp spawn a child
# clone carrying one new L1 insertion (via gen_sim), then grow the clone.
#
# INPUT:
#   node          (data.tree node) current node of the data tree; reads the
#                 fields r, tp, tes and ncells
#   tnum          (integer) time step number, used as the name of a new child
#
# OUTPUT: void (the node is modified in place)

maybeTranspose <- function(node,tnum) {

    # Clones with division rate 0 neither transpose nor grow
    if (node$r==0){
        return()
    }

    # Bernoulli draw: 1 with probability tp, 0 otherwise
    draw <- sample(x=c(0,1),1,prob=c(1-node$tp, node$tp))
    if (draw == 1) {
        new_ins <- gen_sim(genome,node,1) # a single insertion per event
        # NOTE(review): the updated annotation is computed but not used below;
        # rank_clone is given the global exann instead -- confirm the intent.
        geneann <- update_geneann(exann,new_ins,node$tes)
        child_r <- rank_clone(node$r, exann, new_ins[[2]], new_ins[[3]])
        # The child inherits the parent's insertions plus the new one
        child_tes <- mapply(append, new_ins, node$tes, SIMPLIFY = FALSE)
        node$AddChild(tnum, r=child_r, tp=node$tp, tes=child_tes, ncells=1)
    }

    # Growth: each cell yields r additional cells per time step
    node$ncells <- node$ncells + (node$ncells*node$r)

}



### Define parameters

In [46]:
#--- Simulation parameters

# Fraction of endonuclease-independent (random) insertions
ENifrc <- 0.1
# Initial number of cells in root clone
inPopSize <- 1
# Initial division rate
inDivRate <- 1
# Initial probability of transposition
intp <- 0.5

# Number of time iterations
NT <- 5


### Clone tree creation

In [47]:
# Root clone of the tree, named 1, initialised from the simulation parameters
CellPop <- Node$new(1)
CellPop$ncells <- inPopSize
CellPop$r <- inDivRate
CellPop$tp <- intp
# Insertion history of the root: sequences, chromosomes, loci, strands
CellPop$tes <- list(DNAStringSet("TTATTTA"),c("chr1"),c(1001140),c("+"))
# The root's division rate is adjusted for its initial insertion
CellPop$r <- rank_clone(CellPop$r, exann, CellPop$tes[[2]], CellPop$tes[[3]])
CellPop$r

ptm <- proc.time()
# Time steps 2..NT; seq_len() keeps the loop empty when NT < 2
# (2:NT would count down and run steps 2 and 1)
for (i in seq_len(max(NT - 1, 0)) + 1L) {

    CellPop$Do(maybeTranspose,i)

}
proc.time() - ptm

   user  system elapsed 
  1.412   0.200   1.614 

In [48]:
# Show the clone tree with the ncells and r fields of every node
print(CellPop,'ncells','r')

  levelName ncells r
1 1             16 1
2  ¦--2          1 0
3  ¦--3          4 1
4  ¦   °--4      2 1
5  °--4          2 1
6      °--5      1 1


# Scratch

In [26]:
load('../data/exann.rda')

In [30]:
# Count how many bp are annotated as exons: the size of the union of all
# [start, end] intervals, computed separately for each chromosome so that
# coverage on one chromosome never masks intervals on another.
# Assumes start/end are 1-based inclusive coordinates (as IRanges(start, end)
# treats them) -- TODO confirm for exann.
lens <- 0 # total number of covered bp
for (rows in split(seq_len(nrow(exann)), exann$chrom)) { # Loop over chromosomes
    if (length(rows) == 0) {
        next
    }
    ord <- rows[order(exann$start[rows])] # rows of this chromosome, by start
    s <- exann$start[ord]
    e <- exann$end[ord]
    # Last base already covered by the preceding intervals
    covered <- c(s[1] - 1, cummax(e)[-length(e)])
    # Each interval adds only the bases beyond what is already covered
    lens <- lens + sum(as.numeric(pmax(0, e - pmax(covered, s - 1))))
}
lens

In [31]:
# Exon intervals per chromosome, with overlapping intervals merged by reduce()
# so that no base is counted more than once
tmp <- reduce(GRanges(seqnames = exann$chrom,
                      ranges = IRanges(exann$start, exann$end)))
sum(width(tmp))

In [35]:
# Total length of the first 24 sequences of Hsapiens
chr_lengths <- vapply(names(Hsapiens)[1:24],
                      function(chrnm) as.numeric(length(Hsapiens[[chrnm]])),
                      numeric(1))
l <- sum(chr_lengths)

In [36]:
# Total width of the ranges in tmp as a fraction of the summed sequence length l
sum(width(tmp))/l

In [None]:
# Checks that every row of the GFF has start <= end: prints the first row that
# violates this, and prints nothing when all rows are increasing from start to end
gnan <- geneann
bad <- which(gnan$start > gnan$end) # rows with start after end
if (length(bad) > 0) {
    print(gnan[bad[1], ])
}

In [23]:
# First rows of exann whose istsg flag is 0
head(exann[exann$istsg==0,])

chrom,start,end,istsg
chr1,11874,12227,0
chr1,12595,12721,0
chr1,12613,12721,0
chr1,12646,12697,0
chr1,13221,14409,0
chr1,13403,14409,0


In [38]:
# Test rank_clone with a single chr1 insertion at 11873, one base before the
# start (11874) of the first chr1 row shown by head() above.
# Presumably no other exann row contains this position, so r should stay 1 -- confirm
r <- rank_clone(1, exann, "chr1", 11873)
r