## Load libraries and necessary data files, and define global variables

In [1]:
#--- Load libraries and necessary data files, and define global variables
library(data.tree)
library(data.table)
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)
genome <- Hsapiens
source("process_L1s.r")
source("mapsequence.r")
load("./data/chrmpd.rda") # load chromosome probability distribution
load("./data/L1RankTable.rda")
load("./data/geneann.rda")
trpd <- read.table("./data/L1truncpd.csv",sep=",")
tdpd <- read.table("./data/L1transdpd.csv",sep=",")
for (i in names(Hsapiens)[1:24]){ # load all chromosome map files
        load(paste0("./data/root_maps/",i,".rda"))
}
strdict<-c("+","-")
names(strdict)<-c(1,2)

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
    do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl,
    intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unlist, unsplit

Loading required package: S4Vectors
Loading required package: stats4
Loading required package: IRanges

Attaching package: ‘IRanges’

The follo

## Define Functions

In [2]:
update_chrom_map <- function(chrnm,chrMap,sites_chrm,sites_loci,l1s) {  
    
if (length(sites_chrm)==0){
    return(chrMap)
}
    
ict<-chrMap$ict
icl<-chrMap$icl
iot<-chrMap$iot
iol<-chrMap$iol
insites<-chrMap$insites 
    
chrloci = sites_loci[sites_chrm==chrnm] # Get the sites where insertions occurred in the chromosome
chrl1s = l1s[sites_chrm==chrnm] # Get the L1s elements which were inserted    
    
for (i in 1:length(chrloci)) { # Loop over the simulated insertion points
    insites[which(is.na(insites))]<- -1 # Replace NA with -1
    indx <- insites>chrloci[i] # Get indices of target sites which lie downstream of the point
    insites[indx] <- insites[indx] + width(chrl1s[i]) # Shift the target sites by the length of the L1
    l1_map <- mapSeq_SV(chrl1s[i]) # Map target sites in the L1
    l1_map$insites <- l1_map$insites + chrloci[i] # Convert L1 loci to chromosome loci
    insites <- rbind(insites,l1_map$insites) # Add target sites within L1 to chrom map
    ict <- rbind(ict,l1_map$ict)
    icl <- rbind(icl,l1_map$icl)
    iot <- rbind(iot,l1_map$iot)
    iol <- rbind(iol,l1_map$iol)
}

return(list(insites,ict,icl,iot,iol))

}

In [3]:
gen_sim <- function(genome,node,copyNum) {

        sites_loci<-c() # Initialize arrays for storing simulated ins. site data
        sites_chrm<-c()
        sites_strand<-c()
        sites_classes<-c()

        #--- Sample chromosomes based on probability ranking. The data file chrmpd.rda 
        #--- must either be provided or generated by running 'get_sv_dist.r'.
        #--- Here the ranking is provided by the number of 'TTTT' patterns.
        chrmlist<-sample(x=names(genome)[1:24],copyNum,replace=TRUE,prob=chrmcnt[,1])
        chrmlist<-table(chrmlist)

        for (chrnm in names(chrmlist)) {


                #cat("\nChromosome: ",chrnm)
            
                map<-get(paste0(chrnm,"Map"))
                map<-update_chrom_map(chrnm,map,node$sites_chrm,node$sites_loci,node$l1s)
                ict<-map$ict
                icl<-map$icl
                iot<-map$iot
                iol<-map$iol
                insites<-map$insites
            

                chrcopyNum<-chrmlist[[chrnm]]

                pd <- c(11.55*length(which(!is.na(ict))),
                        7.25*length(which(!is.na(icl))),
                        1.95*length(which(!is.na(iot))),
                        1*length(which(!is.na(iol))))
                pd <- (pd/sum(pd))*(1-ENifrc)
                pd <- append(pd,ENifrc)
                #cat("\nSite class distribution:\n",pd)

                #--- Generates insertion sites
                classes <- sample(x = c(1:5),chrcopyNum,replace=TRUE,prob=pd)
                sites <- rep(0,chrcopyNum)
                strand <-rep(0,chrcopyNum)
            
                for (i in 1:chrcopyNum) {
                        if (classes[i]==1) {
                                tmp<-sample(c(1,2),1)
                                sites[i] <- insites[ict[sample(c(1:length(which(!is.na(ict[,tmp])))),1),tmp]]
                                strand[i] <- strdict[[tmp]]
                        } else if (classes[i]==2) {
                                tmp<-sample(c(1,2),1)
                                sites[i] <- insites[icl[sample(c(1:length(which(!is.na(icl[,tmp])))),1),tmp]]
                                strand[i] <- strdict[[tmp]]
                        } else if (classes[i]==3) {
                                tmp<-sample(c(1,2),1)
                                sites[i] <- insites[iot[sample(c(1:length(which(!is.na(iot[,tmp])))),1),tmp]]
                                strand[i] <- strdict[[tmp]]
                        } else if (classes[i]==4) {
                                tmp<-sample(c(1,2),1)
                                sites[i] <- insites[iol[sample(c(1:length(which(!is.na(iol[,tmp])))),1),tmp]]
                                strand[i] <- strdict[[tmp]]
                        } else if (classes[i]==5) {
                                sites[i]<-runif(1,1,length(genome[[chrnm]]))
                                strand[i] <- strdict[[sample(c(1,2),1)]]
                        }
                }

                sites_loci<-append(sites_loci,sites)
                sites_chrm<-append(sites_chrm,rep(chrnm,chrcopyNum))
                sites_strand<-append(sites_strand,strand)
                sites_classes<-append(sites_classes,classes)

        }

        #--- Creates sequences for insertion
        tmp <- process_L1s(genome,L1RankTable,trpd,tdpd,copyNum)
        l1s <- tmp[[1]]
        l1indcs <- tmp[[2]]
        tdlen <- tmp[[3]]
        trlen <- tmp[[4]]

        return(list(l1s,sites_chrm,sites_loci,sites_strand))
}




In [4]:
rank_clone <- function(r, geneann, sites_chrm, sites_loci) {
    
    gene_hits=0; # set counter to zero
    tsg_hits=0;
    for (i in 1:length(sites_chrm)) { # loop over chromosomes inserted into
        tmp=geneann[geneann$chrom==unique(sites_chrm)[i]] # reduce annotation table to entries for current chrom
        chrmann_ntsg=tmp[tmp$istsg==0]
        chrmann_tsg =tmp[tmp$istsg==1]
        tmp = sites_loci[sites_chrm==unique(sites_chrm)[i]] # reduce insertion loci to entries for current chrom
        tmp_hits = between(tmp,chrmann_ntsg$start,chrmann_ntsg$end) # create logical for insertions, whether into non-tsg-gene or not
        gene_hits=gene_hits+length(which(tmp_hits==TRUE)) # count the number of non-tsg-gene insertions
        tmp_hits  = between(tmp,chrmann_tsg$start,chrmann_tsg$end) 
        tsg_hits =tsg_hits+length(which(tmp_hits==TRUE))
    }
    
    if (gene_hits > 0) {
        r=0
    } else if (tsg_hits > 0) {
        r = r+tsg_hits*2; # TSG insertion doubles cell division rate
    }
    
    return(r)
}

In [5]:
maybeTranspose <- function(node,tnum) {
    
    if (sample(x=c(0,1),1,prob=c(1-node$tp, node$tp))) {
        simout <- gen_sim(genome,node, 1)#round(runif(1,1,3)))
        #ncln = ncln+1
        r_tmp <- rank_clone(node$r, geneann, simout$sites_chrm, simout$sites_loci)
        node$AddChild(tnum)
        Set(node$children, r=r_tmp, tp=node$tp, l1s=node$l1s, sites=node$sites, ncells=1)
        
    }
    
    node$ncells <- node$ncells*(2^node$r)
 
}



### Define parameters

In [6]:
#--- Set simulation parameters
ENifrc<- .1       # Fraction of endonuclease-independent (random) insertions
inPopSize <- 10   # Initial number of cells in root clone
inDivRate <- 1    # Initial division rate
intp <- 0.5       # Initial probability of transposition

NT <- 10           # Number of time iterations


### Clone tree creation

In [7]:
CellPop <- Node$new(1)
CellPop$ncells <- inPopSize
CellPop$r <- inDivRate
CellPop$tp <- intp
CellPop$l1s <- DNAStringSet()
CellPop$sites_chrm <- c()
CellPop$sites_loci <- c()

for (i in 2:NT) {
    
    CellPop$Do(maybeTranspose,i)
               
}
    

In [8]:
print(CellPop,'ncells','r')

                levelName ncells r
1  1                        5120 1
2   ¦--2                       8 1
3   ¦   ¦--3                   2 1
4   ¦   ¦   ¦--5               2 1
5   ¦   ¦   ¦   ¦--7           2 1
6   ¦   ¦   ¦   ¦   ¦--9       2 1
7   ¦   ¦   ¦   ¦   °--10      1 1
8   ¦   ¦   ¦   °--10          1 1
9   ¦   ¦   ¦--8               2 1
10  ¦   ¦   ¦   ¦--9           2 1
11  ¦   ¦   ¦   ¦   °--10      1 1
12  ¦   ¦   ¦   °--10          1 1
13  ¦   ¦   °--10              1 1
14  ¦   ¦--7                   2 1
15  ¦   ¦   ¦--8               2 1
16  ¦   ¦   ¦   °--9           2 1
17  ¦   ¦   ¦--9               2 1
18  ¦   ¦   ¦   °--10          1 1
19  ¦   ¦   °--10              1 1
20  ¦   °--10                  1 1
21  ¦--3                       8 1
22  ¦   ¦--4                   4 1
23  ¦   ¦   °--10              1 1
24  ¦   ¦--5                   4 1
25  ¦   ¦   ¦--7               4 1
26  ¦   ¦   ¦   ¦--9           2 1
27  ¦   ¦   ¦   ¦   °--10      1 1
28  ¦   ¦   ¦   °--1

In [9]:
save(CellPop,file="./data/CellPop_out.rda")