In [22]:
library(data.table)
library(GenomicRanges)
library(org.Hs.eg.db)
# Load one .rda file per chromosome (1-22, X, Y) into the current environment.
# NOTE(review): presumably each file defines an object named chr<i>Map, since
# count_in_ranges() later fetches objects with that name via get() -- confirm.
for (i in c(1:22,'X','Y')){ # load all chromosome map files
        load(paste0('./data/large_files/chr',i,'.rda'))
}
# Load the annotation and pre-computed count objects used by the functions below.
# NOTE(review): presumably these provide `exons`, `genes`, and
# `counts_dipl_m` / `counts_dipl_f` / `lens` respectively -- confirm against the .rda contents.
load('./data/exons.rda')
load('./data/genes.rda')
load('./data/exonicvsnon_counts.rda')
# Keep a single row (row 7) and columns 2-5 of the weights table; get_prob_dist()
# reads the columns closed_tight, closed_loose, open_tight and open_loose from it.
weights <- read.csv('./data/snap_velcro_weights.csv')
weights <- weights[7,2:5]

In [23]:
# Read a driver-gene file and return the symbols of the annotated genes it lists.
#
# Arguments:
#   gene_file  name of a header-less CSV file whose first column holds gene IDs
#              (matched against genes$gene_id).
#   gene_dir   directory containing gene_file; defaults to the location that
#              was previously hard-coded.
#
# Side effects: prints the matching rows of the global `genes` table, and
# raises a warning listing any IDs from the file that are not in `genes`
# (previously such IDs were dropped silently).
#
# Returns: the gene_sym values of the matching rows, in `genes` row order.
get_drivers <- function(gene_file, gene_dir = '~/jackgl') {
    ids <- read.csv(file.path(gene_dir, gene_file), header = FALSE)$V1
    ids <- as.character(ids)

    # Report IDs that cannot be mapped; the returned symbols can never reveal
    # them because they are taken from `genes` itself.
    missing_ids <- setdiff(ids, genes$gene_id)
    if (length(missing_ids) > 0) {
        warning('Gene IDs not in annotation: ',
                paste(missing_ids, collapse = ', '), call. = FALSE)
    }

    # Explicit row subsetting (with the comma) works for data.table and data.frame alike
    matched <- genes[genes$gene_id %in% ids, ]
    print(matched)
    matched$gene_sym
}

In [24]:
# Gene symbols for the IDs listed in drivers_sclc.txt (used below as the lung list);
# get_drivers() also prints the matching annotation rows.
geneList_lung <- get_drivers('drivers_sclc.txt')

    chrom     start       end         gene_id gene_sym   gene_biotype    len
 1:    10  87863113  87971930 ENSG00000171862     PTEN protein_coding 108817
 2:    11 118436464 118526832 ENSG00000118058    KMT2A protein_coding  90368
 3:    13  48303726  48599436 ENSG00000139687      RB1 protein_coding 295710
 4:    16   3725054   3880726 ENSG00000005339   CREBBP protein_coding 155672
 5:    17   7661779   7687550 ENSG00000141510     TP53 protein_coding  25771
 6:     2  88556741  88627576 ENSG00000172071  EIF2AK3 protein_coding  70835
 7:    22  41091786  41180079 ENSG00000100393    EP300 protein_coding  88293
 8:     3 142449235 142578826 ENSG00000175054      ATR protein_coding 129591
 9:     4 186587783 186726722 ENSG00000083857     FAT1 protein_coding 138939
10:     7  81699008  81770438 ENSG00000019991      HGF protein_coding  71430
11:     7 152134922 152436005 ENSG00000055609    KMT2C protein_coding 301083
12:     9  16409503  16870843 ENSG00000173068     BNC2 protein_coding 461340

In [25]:
# Gene symbols for the IDs listed in drivers_coread.txt (used below as the colon list);
# get_drivers() also prints the matching annotation rows.
geneList_colon <- get_drivers('drivers_coread.txt')

   chrom     start       end         gene_id gene_sym   gene_biotype    len
1:    10 112950250 113167678 ENSG00000148737   TCF7L2 protein_coding 217428
2:    17   7661779   7687550 ENSG00000141510     TP53 protein_coding  25771
3:    18  47808957  47931146 ENSG00000175387    SMAD2 protein_coding 122189
4:    18  51028394  51085045 ENSG00000141646    SMAD4 protein_coding  56651
5:     4 152320544 152536063 ENSG00000109670    FBXW7 protein_coding 215519
6:     5 112707498 112846239 ENSG00000134982      APC protein_coding 138741
7:     X  64185117  64205744 ENSG00000184675    AMER1 protein_coding  20627


In [26]:
# Gene symbols for the IDs listed in drivers_gbm.txt (used below as the brain list);
# get_drivers() also prints the matching annotation rows.
geneList_brain <- get_drivers('drivers_gbm.txt')

   chrom    start      end         gene_id gene_sym   gene_biotype    len
1:    10 87863113 87971930 ENSG00000171862     PTEN protein_coding 108817
2:    13 48303726 48599436 ENSG00000139687      RB1 protein_coding 295710
3:    17  7661779  7687550 ENSG00000141510     TP53 protein_coding  25771
4:    17 31094927 31382116 ENSG00000196712      NF1 protein_coding 287189
5:     5 68215720 68301821 ENSG00000145675   PIK3R1 protein_coding  86101


In [27]:
# List, per tumor type, any driver symbols that are absent from genes$gene_sym.
# NOTE(review): the three lists were themselves taken from genes$gene_sym inside
# get_drivers(), so these selections are always empty; IDs missing from the
# annotation would have to be detected on the raw IDs read from the driver files.
cat('Genes not in annotation\n\n')
cat('Lung:')
geneList_lung[!(geneList_lung %in% genes$gene_sym)]
cat('Colon:')
geneList_colon[!(geneList_colon %in% genes$gene_sym)]
cat('Brain:')
geneList_brain[!(geneList_brain %in% genes$gene_sym)]

Genes not in annotation

Lung:

Colon:

Brain:

In [28]:
# Count L1 target sites per chromosome, by site category and by location.
#
# Arguments:
#   exon_ranges  table with columns `chrom`, `start`, `end` giving the ranges
#                of interest (assumed to be exonic).
#   counts_dipl  per-chromosome diploid exonic site counts (generated by
#                analyses/count_sites_exonicvsnon.ipynb): 24 rows in the order
#                1-22, X, Y; columns 1-4 are the four site categories.
#                If row 24 (Y) sums to 0 the genome is treated as female.
#
# Requires, for every chromosome, an object named chr<name>Map whose elements
# are: [[1]] the site position table (column 1 sense, column 2 antisense) and
# [[2]]..[[5]] index tables into it for the closed-tight, closed-loose,
# open-tight and open-loose categories (NA = no site).
#
# Returns: a 24 x 12 matrix with rows chr1..chrY. Columns 1-4 are counts inside
# exon_ranges, columns 5-8 counts in the remaining exonic regions, and columns
# 9-12 counts outside of exonic ranges. Autosomes (and X in a female genome)
# are counted twice; X and Y in a male genome once; Y in a female genome is 0.
count_in_ranges <- function(exon_ranges,counts_dipl) {

    chrom_names <- c(1:22,'X','Y')
    counts <- array(0,dim=c(length(chrom_names),12)) # Allocate matrix for counts

    # The genome is male when the supplied diploid counts contain Y sites
    male_genome <- sum(counts_dipl[24,])!=0

    # Number of sites of one category (sense + antisense) lying inside the
    # given ranges. `idx` holds row indices into `insites`, NA where absent.
    sites_in_ranges <- function(insites,idx,starts,ends) {
        sen     <- inrange(insites[idx[!is.na(idx[,1]),1],1],starts,ends)
        antisen <- inrange(insites[idx[!is.na(idx[,2]),2],2],starts,ends)
        length(which(sen)) + length(which(antisen))
    }

    #--- Loop through chromosome names; the row index comes from the position
    #    in chrom_names, so skipping a chromosome cannot shift later rows
    for (j in seq_along(chrom_names)) {
        i <- chrom_names[j]

        # Female genome: leave the Y row at 0
        if (i=='Y' && !male_genome) {next}

        map     <- get(paste0('chr',i,'Map')) # Site map data for the current chrom
        insites <- map[[1]]
        ann_i   <- exon_ranges[exon_ranges$chrom == i,]

        # X and Y are present once in a male genome; everything else twice
        copies <- if (i %in% c('X','Y') && male_genome) 1 else 2

        for (k in 1:4) { # closed-tight, closed-loose, open-tight, open-loose
            idx <- map[[k+1]]

            # Sites inside exon_ranges (columns 1-4)
            counts[j,k]   <- sites_in_ranges(insites,idx,ann_i$start,ann_i$end)*copies

            # Sites in the remaining exonic regions (columns 5-8)
            counts[j,k+4] <- (counts_dipl[j,k])-counts[j,k]

            # Sites outside of exonic ranges (columns 9-12): all sites of the
            # category minus the two exonic counts
            counts[j,k+8] <- length(which(!is.na(as.vector(idx))))*copies - counts[j,k] - counts[j,k+4]
        }
    }
    colnames(counts) <- c('ct_sub','cl_sub','ot_sub','ol_sub','ct_rest','cl_rest','ot_rest','ol_rest','ct_outrange','cl_outrange','ot_outrange','ol_outrange')
    rownames(counts) <- paste0('chr',c(1:22,'X','Y'))
    return(counts)
}

In [29]:
# Probability that a single L1 insertion hits the target ranges, the remaining
# exonic regions, or non-exonic sequence (in that order).
#
# Arguments:
#   counts_matrix  24 x 12 site-count matrix as returned by count_in_ranges();
#                  a Y row (row 24) summing to 0 marks a female genome.
#   exon_ranges    table with columns `chrom`, `start`, `end` for the target ranges.
#
# Uses the globals `weights` (columns closed_tight, closed_loose, open_tight,
# open_loose) and `lens` (rows = chromosomes 1-22, X, Y; column 1 is used as
# exonic size and column 2 as total size).
#
# Returns: numeric vector of length 3 that sums to 1.
get_prob_dist <- function(counts_matrix, exon_ranges) {

    is_female   <- sum(counts_matrix[24,])==0
    site_totals <- colSums(counts_matrix)

    # Site-based component: weighted site totals for each of the three groups
    # of four columns (groups start at columns 1, 5 and 9)
    group_start <- c(1,5,9)
    probs_ENd <- unname(weights$closed_tight*site_totals[group_start] +
                        weights$closed_loose*site_totals[group_start+1] +
                        weights$open_tight*site_totals[group_start+2] +
                        weights$open_loose*site_totals[group_start+3])
    probs_ENd <- probs_ENd/sum(probs_ENd)

    # Size-based component: sequence sizes of target, exons and genome
    range_len <- exon_ranges$end - exon_ranges$start
    if (is_female) {
        # Female genome: no Y, every other chromosome present twice
        target_size <- sum(range_len[exon_ranges$chrom!='Y'])*2
        exons_size  <- sum(lens[1:23,1]*2)
        genome_size <- sum(lens[1:23,2]*2)
    } else {
        # Male genome: chroms. 1-22 present twice, X and Y once
        on_sex_chrom <- exon_ranges$chrom %in% c('X','Y')
        target_size  <- sum(range_len[on_sex_chrom]) + sum(range_len[!on_sex_chrom])*2
        exons_size   <- sum(lens[1:22,1]*2)+sum(lens[23:24,1])
        genome_size  <- sum(lens[1:22,2]*2)+sum(lens[23:24,2])
    }

    probs_ENi <- c(target_size/genome_size,                # Chance of target mutation
                   (exons_size-target_size)/genome_size,   # Chance of passenger mutation
                   (genome_size - exons_size)/genome_size) # Chance of null mutation

    # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
    pd <- (.9*probs_ENd)+(0.1*probs_ENi)
    pd/sum(pd)
}

In [32]:
# Print and return the per-insertion probability distributions for a driver list.
#
# Arguments:
#   driver_list  gene symbols, matched against exons$gene_sym.
#
# Uses the globals `exons`, `counts_dipl_m` and `counts_dipl_f`. Prints the
# unweighted ratio of site counts inside the driver exons to those in the
# remaining exonic regions, then the probability vectors, male first.
#
# Returns: list(pd_m, pd_f), the male and female results of get_prob_dist().
get_stats <- function(driver_list) {

    # Exonic ranges of the listed genes, with overlapping ranges merged
    selected <- exons[exons$gene_sym %in% driver_list,]
    merged <- reduce(GRanges(selected$chrom,IRanges(selected$start,selected$end)))
    driver_exons <- data.table(chrom=as.vector(seqnames(merged)),start=start(merged),end=end(merged))

    # Ratio of counts in columns 1-4 (driver exons) to columns 5-8 (other exons)
    sub_to_rest <- function(cnt) {
        sum(rowSums(cnt[,1:4]))/sum(rowSums(cnt[,5:8]))
    }

    counts_m <- count_in_ranges(driver_exons,counts_dipl_m)
    counts_f <- count_in_ranges(driver_exons,counts_dipl_f)

    cat('Male driver to passenger site count ratio (unweighted)\n')
    print(sub_to_rest(counts_m))
    cat('Female driver to passenger site count ratio (unweighted)\n')
    print(sub_to_rest(counts_f))

    pd_m <- get_prob_dist(counts_m,driver_exons)
    cat('Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)\n')
    print(pd_m)
    pd_f <- get_prob_dist(counts_f,driver_exons)
    cat('Female probabilities of driver, passenger and null L1 insertion\n')
    print(pd_f)

    list(pd_m, pd_f)
}

## Lung cancer

In [34]:
# Show the first few lung driver symbols, then compute the distributions.
# get_stats() returns list(male, female); keep each under its own name.
cat('Sample of associated drivers:')
head(geneList_lung)
out <- get_stats(geneList_lung)
pd_lung_m <- out[[1]]
pd_lung_f <- out[[2]]

Sample of associated drivers:

Male driver to passenger site count ratio (unweighted)
[1] 0.002045646
Female driver to passenger site count ratio (unweighted)
[1] 0.002043483
Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)
[1] 5.017602e-05 2.457912e-02 9.753707e-01
Female probabilities of driver, passenger and null L1 insertion
[1] 4.981933e-05 2.444584e-02 9.755043e-01


## Colon cancer

In [35]:
# Show the first few colon driver symbols, then compute the distributions.
# get_stats() returns list(male, female); keep each under its own name.
cat('Sample of associated drivers:')
head(geneList_colon)
out <- get_stats(geneList_colon)
pd_colon_m <- out[[1]]
pd_colon_f <- out[[2]]

Sample of associated drivers:

Male driver to passenger site count ratio (unweighted)
[1] 0.001202801
Female driver to passenger site count ratio (unweighted)
[1] 0.001215079
Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)
[1] 3.032403e-05 2.459897e-02 9.753707e-01
Female probabilities of driver, passenger and null L1 insertion
[1] 3.040437e-05 2.446525e-02 9.755043e-01


## Brain cancer

In [36]:
# Show the first few brain driver symbols, then compute the distributions.
# get_stats() returns list(male, female); keep each under its own name.
cat('Sample of associated drivers:')
head(geneList_brain)
out <- get_stats(geneList_brain)
pd_brain_m <- out[[1]]
pd_brain_f <- out[[2]]

Sample of associated drivers:

Male driver to passenger site count ratio (unweighted)
[1] 0.0009180281
Female driver to passenger site count ratio (unweighted)
[1] 0.0009022759
Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)
[1] 2.295825e-05 2.460633e-02 9.753707e-01
Female probabilities of driver, passenger and null L1 insertion
[1] 2.244402e-05 2.447321e-02 9.755043e-01


In [37]:
# Persist the six probability vectors (male/female per tumor type) and the
# three driver symbol lists as .rda files under ./data.
save(pd_lung_m,pd_lung_f,pd_colon_m,pd_colon_f,pd_brain_m,pd_brain_f,file='./data/tumor_type_pd_intogen.rda')
save(geneList_lung,geneList_colon,geneList_brain,file='./data/tumor_type_driver_lists_intogen.rda')