## Load libraries and necessary data files

In [13]:
library(data.tree)
library(data.table)
library(BSgenome.Hsapiens.UCSC.hg38)
# Short alias for the hg38 genome object provided by BSgenome.Hsapiens.UCSC.hg38.
genome <- Hsapiens

# Restore saved workspace objects. The names each .rda file defines are not
# visible here; the code below reads exann, chrmpd, pds and L1RankTable, which
# presumably come from these files -- verify against the data files.
load("../data/exann.rda")
load("../data/chrmpd.rda") # chromosome probability distribution
load("../data/L1RankTable.rda")
load("../data/chromSitePd.rda")

# Comma-separated tables for L1 truncation (trpd) and transduction (tdpd).
trpd <- read.table("../data/L1truncpd.csv", sep = ",")
tdpd <- read.table("../data/L1transdpd.csv", sep = ",")

# Load one map file per sequence name (first 24 names of the genome object).
# gen_sim() later fetches the loaded objects by name as "<chromosome>Map".
for (i in names(Hsapiens)[1:24]) {
    load(paste0("../data/root_maps/", i, ".rda"))
}

# Lookup from strand index (1 or 2) to strand symbol.
strdict <- setNames(c("+", "-"), c(1, 2))

## Define Functions

In [3]:
# PURPOSE: To simulate L1 insertions in hg38
#
# INPUT:
#   copyNum       (integer) number of L1 insertions to simulate
#
# OUTPUT:         (list) unnamed, four elements in this order:
#                   [[1]] chromosome name of each insertion
#                   [[2]] insertion position of each insertion
#                   [[3]] strand of each insertion ("+" or "-")
#                   [[4]] indices (1..40) into L1RankTable of the inserted L1s
#                 Elements 1-3 are grouped by chromosome (in the order of the
#                 chromosome count table); element 4 is sampled independently.
#                 With copyNum == 0 elements 1-3 are NULL.
#
# Reads from the enclosing workspace: genome, chrmpd, pds, L1RankTable,
# strdict, and one "<chromosome>Map" object per sampled chromosome.
#
# Note: See gen-sim-notebook.ipynb for a more highly commented version
gen_sim <- function(copyNum) {

    #--- Sample chromosomes based on probability ranking. The data file chrmpd.rda
    #--- must either be provided or generated by running 'get_sv_dist.r'.
    #--- Here the ranking is provided by the number of 'TTTT' (closed-tight) sites.
    chrmlist <- table(sample(x = names(genome)[1:24], copyNum, replace = TRUE, prob = chrmpd[, 1]))
    chrnms <- names(chrmlist)

    # Per-chromosome results are collected in lists and flattened once at the
    # end instead of growing vectors with append() on every iteration.
    loci_by_chr <- vector("list", length(chrnms))
    chrm_by_chr <- vector("list", length(chrnms))
    strand_by_chr <- vector("list", length(chrnms))

    for (k in seq_along(chrnms)) {

        chrnm <- chrnms[k]
        map <- get(paste0(chrnm, "Map"))
        insites <- map[[1]]
        # Index tables for site classes 1..4 (map elements 2..5, in that order).
        site_tables <- list(map[[2]], map[[3]], map[[4]], map[[5]])

        chrcopyNum <- chrmlist[[chrnm]]

        #--- Generates insertion sites
        classes <- sample(x = c(1:5), chrcopyNum, replace = TRUE, prob = pds[[chrnm]])
        sites <- numeric(chrcopyNum)
        strand <- character(chrcopyNum)

        for (i in seq_len(chrcopyNum)) {
            if (classes[i] == 5) {
                # Class 5: position drawn uniformly along the chromosome,
                # then a strand is drawn (same draw order as before).
                sites[i] <- runif(1, 1, length(genome[[chrnm]]))
                strand[i] <- strdict[[sample(c(1, 2), 1)]]
            } else {
                # Classes 1..4 share one procedure and differ only in the
                # index table used: draw a strand column, draw a row among
                # that column's non-NA entries, and look the entry up in insites.
                strand_idx <- sample(c(1, 2), 1)
                tbl <- site_tables[[classes[i]]]
                # Assumes non-NA entries occupy the leading rows of each
                # column and that every column has at least one -- TODO confirm.
                # NOTE(review): with zero non-NA entries 1:0 yields c(1, 0)
                # and the draw below is not a valid row.
                n_candidates <- length(which(!is.na(tbl[, strand_idx])))
                row <- sample(c(1:n_candidates), 1)
                sites[i] <- insites[tbl[row, strand_idx]]
                strand[i] <- strdict[[strand_idx]]
            }
        }

        loci_by_chr[[k]] <- sites
        chrm_by_chr[[k]] <- rep(chrnm, chrcopyNum)
        strand_by_chr[[k]] <- strand

    }

    sites_loci <- unlist(loci_by_chr)
    sites_chrm <- unlist(chrm_by_chr)
    sites_strand <- unlist(strand_by_chr)

    # Sample copyNum L1s from the list (with replacement), based on activity ranking
    l1indcs <- sample(x = c(1:40), copyNum, replace = TRUE, prob = L1RankTable$score[1:40])

    # Sample copyNum truncation fractions and transduction lengths from their respective
    # probability densities trpd, and tdpd
#         trfrc <- sample(x=trpd[[1]], copyNum, replace=TRUE, prob=trpd[[2]])
#         tdlen <- sample(x=tdpd[[1]], copyNum, replace=TRUE, prob=tdpd[[2]])

    return(list(sites_chrm, sites_loci, sites_strand, l1indcs))
}

In [5]:
# PURPOSE: To count the number of driver and passenger mutations in a new clone and determine
#          the birth rate and the genes affected
#
# INPUT:
#   node          (data.tree node) current node of genotype tree; its B, nd and np fields are read
#   anno          (data frame) Annotation of genes; the columns chrom, start, end,
#                 geneSym and istsg are read
#   sites_chrm    (character vector) chromosomes containing L1 insertions for the clone
#   sites_loci    (numeric vector) insertion positions (in the respective chromosome)
#   sd            (float) driver mutation selection
#   sp            (float) passenger mutation selection
#
# OUTPUT:         (list) unnamed, four elements in this order:
#   B             (float) birth rate; node$B when no gene was hit
#   np            (int) number of passenger (istsg == 0) hits among the given insertions
#   nd            (int) number of driver (istsg == 1) hits among the given insertions
#   genes         (data.table with columns sym and tsg) affected genes, or NULL when none
count_hits <- function(node, anno, sites_chrm, sites_loci, sd, sp) {

    np <- 0 # set counters to zero
    nd <- 0
    genes <- c()

    # Loop over chromosomes inserted into. Iterating the unique values directly
    # makes empty input a no-op (1:length(...) would iterate over c(1, 0)).
    for (chrom in unique(sites_chrm)) {

        chrom_anno <- anno[anno$chrom == chrom, ] # annotation entries for current chrom
        if (nrow(chrom_anno) == 0) {
            next # nothing annotated on this chromosome, so nothing can be hit
        }

        chrom_loci <- sites_loci[sites_chrm == chrom] # insertion loci on current chrom

        # For every locus, find the rows of chrom_anno whose [start, end] contains it
        ins <- unlist(lapply(chrom_loci, function(locus) {
            which(between(locus, chrom_anno$start, chrom_anno$end))
        }))

        # Counters accumulate across chromosomes; a chromosome without hits
        # must leave the totals from earlier chromosomes untouched.
        if (length(ins) > 0) {
            genes <- rbind(genes, data.table(sym = chrom_anno$geneSym[ins], tsg = chrom_anno$istsg[ins]))
            np <- np + length(which(chrom_anno$istsg[ins] == 0)) # count the number of non-tsg insertions
            nd <- nd + length(which(chrom_anno$istsg[ins] == 1)) # count the number of tsg insertions
        }

    }

    if (np > 0 || nd > 0) {
        # Birth rate from the clone's total driver and passenger counts
        # (inherited from node plus the new hits)
        b <- ((1 + sd)^(node$nd + nd)) / ((1 + sp)^(node$np + np))
    } else {
        b <- node$B
    }

    return(list(b, np, nd, genes))
}



In [41]:
# PURPOSE: To advance one clone by one time step: update its cell count and, with
#          per-cell probability mu, spawn new single-cell clones carrying one
#          simulated L1 insertion each (via gen_sim)
#
# INPUT:
#   node        (data.tree node) current node of the data tree
#   sd          driver mutation selection coefficient
#   sp          passenger mutation selection coefficient
#
# OUTPUT: no meaningful return value. Side effects:
#   - appends the clone's new cell count to node$ncells
#   - adds one child node per transposition event
#   - increments the global clone counter l and appends hit genes to the global gn
#
# Reads the globals N, N0, mu and exann.
maybeTranspose <- function(node, sd, sp) {

    if (node$B == 0) { # if the birth rate of the clone is zero, skip the node
        return()
    }

    # increase the number of cells by the existing number * the birth rate
    # decrease by the existing number * the death rate
    # (death rate = latest total population size relative to N0)
    prev_cells <- tail(node$ncells, n = 1)
    births <- round(prev_cells * node$B)
    deaths <- round(prev_cells * (tail(N, n = 1) / N0))
    nc <- max(prev_cells + births - deaths, 0)

    # sample from binomial distribution for number of transpositions
    if (nc < 4.2e9) { # rbinom() fails for large n
        ntrans <- rbinom(1, nc, mu)
    } else {
        # If n is too large, use the expected number of events (mean of
        # distribution), rounded so the event count and the remaining cell
        # count stay whole numbers.
        ntrans <- round(nc * mu)
    }

    if (ntrans > 0) {
        simout <- gen_sim(ntrans)
        nc <- nc - ntrans # each transposing cell leaves this clone and founds a new one
        for (i in seq_len(ntrans)) { #### possible bottleneck
            l <<- l + 1
            # i-th simulated insertion: (chromosome, locus, strand, L1 index)
            ins <- lapply(simout, "[", i)
            # determine birth rate, number of driver/passenger mutations, and affected genes of new clone
            tmp1 <- count_hits(node, exann, ins[[1]], ins[[2]], sd, sp)
            # determine the loci/L1s of insertions in the new clone
            # (new insertion followed by the insertions inherited from node)
            tmp2 <- mapply(append, ins, node$tes, SIMPLIFY = FALSE)
            node$AddChild(l, ncells = 1, B = tmp1[[1]], nd = node$nd + tmp1[[3]], np = node$np + tmp1[[2]], genes = rbind(node$genes, tmp1[[4]]), tes = tmp2)
            gn <<- rbind(gn, tmp1[[4]]) # append hit genes to list
        }
    }
    node$ncells <- append(node$ncells, nc)
}

### Define parameters

In [61]:
#--- Set simulation parameters

# Fraction of endonuclease-independent (random) insertions
ENifrc <- 0.1

# Initial number of cells in root clone
N0 <- 1

# Initial birth rate
B0 <- 1

# Probability of transposition / timestep of a single cell
mu <- 0.2

# Number of time steps
NT <- 40

### Clone tree creation

In [160]:
# Root of the clone tree: a data.tree with a single node named 1
CellPop <- Node$new(1)

# Global bookkeeping updated by maybeTranspose()
l <- 1      # clone counter; also used as the name of each new child node
N <- N0     # total population size, one entry appended per time step
gn <- NULL  # genes hit so far

# Initial state of the root clone
CellPop$ncells <- c(N0)  # cell-count history of the clone
CellPop$B <- B0          # birth rate
CellPop$np <- 0          # number of passenger mutations
CellPop$nd <- 0          # number of driver mutations
CellPop$genes <- NULL
# L1 insertions already present in the root clone, in the element order
# (chromosome, locus, strand, L1 index); empty by default
CellPop$tes <- vector("list", 4)
# CellPop$tes <- list(c("chr1","chr1"),c(11874,11149270),c("+","+"),c(0,0))

# Derive the root clone's attributes from its pre-existing insertions
tmp <- count_hits(
    node = CellPop,
    anno = exann,
    sites_chrm = CellPop$tes[[1]],
    sites_loci = CellPop$tes[[2]],
    sd = 0.1,
    sp = 0.001
)
CellPop$B <- tmp[[1]]
CellPop$np <- tmp[[2]]
CellPop$nd <- tmp[[3]]
CellPop$genes <- tmp[[4]]
gn <- tmp[[4]] # genes hit in the root clone

# print birth rate of root clone
CellPop$B

# Run the simulation for time steps 2..NT and record the total population
# size after every step in N.
ptm <- proc.time()
# seq_len() keeps the loop empty when NT < 2 (2:NT would count downwards).
for (step in seq_len(max(NT - 1, 0))) {

    # Snapshot of the clones that exist at the start of the step; children
    # created during the step are not themselves updated until the next step.
    clones <- Traverse(CellPop, traversal = 'pre-order')#,filterFun=function(x) tail(x$ncells,n=1) > 0)
    # NOTE(review): sd = sp = 0.5 here, while the root clone was initialised
    # with sd = 0.1, sp = 0.001 -- confirm which values are intended.
    for (clone in clones) {
        maybeTranspose(clone, 0.5, 0.5)
    }

    # Total population = sum of the latest cell count of every clone,
    # including clones born during this step. The tree is traversed again
    # because the snapshot above predates those new clones; summing over the
    # node list also works unchanged when the tree has a single node.
    current_clones <- Traverse(CellPop, traversal = 'pre-order')
    N <- append(N, sum(vapply(current_clones, function(x) tail(x$ncells, n = 1), FUN.VALUE = numeric(1))))

}
(proc.time() - ptm)[3] # elapsed wall-clock time of the simulation loop

# print(CellPop,'ncells','B')
# sum(vapply(CellPop$Get('ncells'),tail,n=1L,FUN.VALUE = numeric(1)))
# CellPop$Get('ncells')
N
gn

NULL