In [1]:
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(data.table)
# Load one site-map .rda file per entry of names(Hsapiens)[1:24].
# count_in_ranges() later looks up an object called "<chrom>Map" for each name.
for (i in names(Hsapiens)[1:24]) {
    load(paste0('../sim-develop/data/root_maps/', i, '.rda'))
}

# Gene symbols of frequently mutated genes (TCGA), one vector per cancer type.
# NOTE(review): the original comment describes these as genes mutated in 10%
# of cases -- confirm against the CSV source.
dftmp <- read.csv('~/jackgl/analyses/gene_lists/frequently_mutated_genes_TCGA_lung.csv')
geneList_lung <- as.vector(dftmp$Symbol)   # lung cancer
dftmp <- read.csv('~/jackgl/analyses/gene_lists/frequently_mutated_genes_TCGA_colon.csv')
geneList_colon <- as.vector(dftmp$Symbol)  # colon cancer
dftmp <- read.csv('~/jackgl/analyses/gene_lists/frequently_mutated_genes_TCGA_brain.csv')
geneList_brain <- as.vector(dftmp$Symbol)  # brain cancer

# Exon annotation and exonic/non-exonic site counts.
# NOTE(review): presumably these provide `exann`, `counts_dipl` and `lens`,
# which the code below reads as globals -- confirm.
load('../sim-develop/data/exann.rda')
load('../sim-develop/data/exonicvsnon_counts.rda')

Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: S4Ve

In [2]:
# Count L1 target sites per chromosome and per site category, relative to a
# set of ranges that are assumed to be exonic.
#
# Argument:
#   exon_ranges  table with columns `chrom` (chromosome label without the
#                "chr" prefix), `start` and `end`.
#
# Value: a 24 x 12 numeric matrix, rows named names(Hsapiens)[1:24], with
#   columns 1-4  (*_sub)       sites inside `exon_ranges`,
#   columns 5-8  (*_rest)      sites in the other exonic ranges,
#   columns 9-12 (*_outrange)  sites outside of exonic ranges,
# each group split into the four categories ct, cl, ot, ol (ct is
# "Closed-Tight"; the other three presumably follow the same naming scheme).
#
# Reads from the enclosing environment: `Hsapiens`, one "<chrom>Map" object
# per chromosome, and `counts_dipl` (generated by
# analyses/count_sites_exonicvsnon.ipynb).
count_in_ranges <- function(exon_ranges) {

    # Number of TRUE entries in a logical vector
    n_true <- function(flags) length(which(flags))

    chrom_names <- names(Hsapiens)[1:24]
    counts <- array(0, dim = c(24, 12)) # one row per chromosome

    for (r in seq_along(chrom_names)) {

        # Element 1 of the map holds the site positions (column 1 is used for
        # sense, column 2 for antisense); elements 2-5 hold, per category,
        # the row indices into it, with NA for unused slots.
        site_map <- get(paste0(chrom_names[r], 'Map'))
        site_pos <- site_map[[1]]

        chrom_id <- strsplit(chrom_names[r], "chr")[[1]][2] # drop the "chr" prefix
        is_sex_chrom <- chrom_id %in% c('X', 'Y')
        ann_sub <- exon_ranges[exon_ranges$chrom == chrom_id, ]

        for (k in 1:4) {
            idx <- site_map[[k + 1]]

            # Columns 1-4: sites falling inside any of the supplied ranges
            sense_hit <- inrange(site_pos[idx[!is.na(idx[, 1]), 1], 1], ann_sub$start, ann_sub$end)
            anti_hit <- inrange(site_pos[idx[!is.na(idx[, 2]), 2], 2], ann_sub$start, ann_sub$end)
            counts[r, k] <- n_true(sense_hit) + n_true(anti_hit)

            # Columns 5-8: remaining exonic sites. counts_dipl is halved for
            # every chromosome except X and Y before subtracting.
            if (is_sex_chrom) {
                counts[r, k + 4] <- counts_dipl[r, k] - counts[r, k]
            } else {
                counts[r, k + 4] <- (counts_dipl[r, k] / 2) - counts[r, k]
            }

            # Columns 9-12: all sites of the category minus both exonic groups
            counts[r, k + 8] <- length(which(!is.na(as.vector(idx)))) - counts[r, k] - counts[r, k + 4]
        }
    }

    colnames(counts) <- c('ct_sub','cl_sub','ot_sub','ol_sub','ct_rest','cl_rest','ot_rest','ol_rest','ct_outrange','cl_outrange','ot_outrange','ol_outrange')
    rownames(counts) <- chrom_names
    counts[1:22, ] <- counts[1:22, ] * 2 # Account for diploid genome (rows 1-22 only)
    counts
}

In [3]:
# Probability distribution of an insertion over three regions, in the column
# group order of the counts matrix: (1) the supplied ranges, (2) the other
# exonic ranges, (3) everything outside exonic ranges.
#
# Arguments:
#   counts_matrix  matrix whose first 12 columns are laid out as three groups
#                  of four site categories (as returned by count_in_ranges).
#   exon_ranges    table with `start` and `end` columns for the supplied ranges.
#
# Value: unnamed numeric vector of length 3 that sums to 1.
#
# Reads the global `lens`. NOTE(review): presumably column 1 holds exonic
# lengths and column 2 total lengths per chromosome -- confirm.
get_prob_dist <- function(counts_matrix, exon_ranges) {

    # Column totals reshaped to 4 categories (rows) x 3 regions (columns)
    totals <- matrix(colSums(counts_matrix)[1:12], nrow = 4)

    # ENd: site-driven probabilities, each category weighted by a fixed factor
    probs_ENd <- 11.55 * totals[1, ] +
                 7.25 * totals[2, ] +
                 1.95 * totals[3, ] +
                 1 * totals[4, ]
    probs_ENd <- probs_ENd / sum(probs_ENd)

    # ENi: probabilities proportional to region length.
    # NOTE(review): length is taken as end - start; for 1-based inclusive
    # ranges this is one less than the width per range -- confirm that this
    # matches how `lens` was computed.
    sub_len <- sum(exon_ranges$end - exon_ranges$start)
    exonic_len <- sum(lens[, 1])
    total_len <- sum(lens[, 2])
    probs_ENi <- c(
        sub_len / total_len,
        (exonic_len - sub_len) / total_len,
        (total_len - exonic_len) / total_len
    )

    # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
    mixed <- (.9 * probs_ENd) + (0.1 * probs_ENi)
    mixed / sum(mixed)
}

## Lung cancer

In [4]:
# Preview the first six lung gene symbols and the first six annotation rows
head(geneList_lung, n = 6L)
head(exann, n = 6L)

chrom,start,end,geneSym
1,11869,12227,DDX11L1
1,12010,12057,DDX11L1
1,12179,12227,DDX11L1
1,12613,12721,DDX11L1
1,12613,12697,DDX11L1
1,12975,13052,DDX11L1


In [5]:
# Exonic ranges of the lung gene list, with overlapping ranges merged into
# their union by reduce().
# Alternative using the full list:
# exon_ranges <- exann[exann$geneSym %in% geneList_lung,]
exon_ranges <- exann[exann$geneSym %in% geneList_lung[1:12], ] # For genes mutated in at least 25% of cases

exon_ranges <- reduce(GRanges(
    seqnames = exon_ranges$chrom,
    ranges = IRanges(start = exon_ranges$start, end = exon_ranges$end)
))
exon_ranges <- data.table(
    chrom = as.vector(seqnames(exon_ranges)),
    start = start(exon_ranges),
    end = end(exon_ranges)
)

In [6]:
# Per-chromosome site counts for the lung ranges; show the first six rows
counts <- count_in_ranges(exon_ranges = exon_ranges)
head(counts, n = 6L)

Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,408,1362,3030,12570,151088,322240,707404,2452298,5193582,8078528,20266612,53477160
chr2,812,4354,9032,37772,123900,279350,609862,2031952,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,0,0,0,0,79462,186306,412928,1305738,4399742,7952856,19999238,50770966
chr5,0,0,0,0,87766,198600,433212,1421296,4094322,7162704,17783544,46482090
chr6,558,1436,2880,10958,83200,190378,416088,1378136,3921788,6736846,16800180,43289224


Ratio of sites in geneList_lung genes vs. all other Ensembl genes (driver to passenger ratio)

In [7]:
# Columns 1-4 are the *_sub counts, columns 5-8 the *_rest counts
sum(rowSums(counts[, 1:4])) / sum(rowSums(counts[, 5:8]))

In [8]:
# Keep the lung result; `counts` is reassigned for the next cancer type
counts_lung <- counts

In [9]:
# Region probability distribution for lung cancer
pd_lung <- get_prob_dist(counts_matrix = counts_lung, exon_ranges = exon_ranges)

## Colon cancer

In [10]:
# Preview the first six colon gene symbols and the first six annotation rows
head(geneList_colon, n = 6L)
head(exann, n = 6L)

chrom,start,end,geneSym
1,11869,12227,DDX11L1
1,12010,12057,DDX11L1
1,12179,12227,DDX11L1
1,12613,12721,DDX11L1
1,12613,12697,DDX11L1
1,12975,13052,DDX11L1


In [11]:
# Exonic ranges of the colon gene list, with overlapping ranges merged into
# their union by reduce().
# Alternative using the full list:
# exon_ranges <- exann[exann$geneSym %in% geneList_colon,]
exon_ranges <- exann[exann$geneSym %in% geneList_colon[1:11], ] # For genes mutated in at least 25% of cases

exon_ranges <- reduce(GRanges(
    seqnames = exon_ranges$chrom,
    ranges = IRanges(start = exon_ranges$start, end = exon_ranges$end)
))
exon_ranges <- data.table(
    chrom = as.vector(seqnames(exon_ranges)),
    start = start(exon_ranges),
    end = end(exon_ranges)
)

In [12]:
# Per-chromosome site counts for the colon ranges; show the first six rows
counts <- count_in_ranges(exon_ranges = exon_ranges)
head(counts, n = 6L)

Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,204,620,1376,6430,151292,322982,709058,2458438,5193582,8078528,20266612,53477160
chr2,940,3728,7400,31758,123772,279976,611494,2037966,5449664,9245692,22942902,59727216
chr3,200,530,962,3062,101354,222968,482462,1622020,4525166,7774294,19306214,50347906
chr4,60,446,740,3768,79402,185860,412188,1301970,4399742,7952856,19999238,50770966
chr5,152,438,956,3260,87614,198162,432256,1418036,4094322,7162704,17783544,46482090
chr6,558,1436,2880,10958,83200,190378,416088,1378136,3921788,6736846,16800180,43289224


Ratio of sites in geneList_colon genes vs. all other Ensembl genes (driver to passenger ratio)

In [13]:
# Columns 1-4 are the *_sub counts, columns 5-8 the *_rest counts
sum(rowSums(counts[, 1:4])) / sum(rowSums(counts[, 5:8]))

In [14]:
# Keep the colon result; `counts` is reassigned for the next cancer type
counts_colon <- counts

In [15]:
# Region probability distribution for colon cancer
pd_colon <- get_prob_dist(counts_matrix = counts_colon, exon_ranges = exon_ranges)

## Brain cancer

In [16]:
# Preview the first six brain gene symbols and the first six annotation rows
head(geneList_brain, n = 6L)
head(exann, n = 6L)

chrom,start,end,geneSym
1,11869,12227,DDX11L1
1,12010,12057,DDX11L1
1,12179,12227,DDX11L1
1,12613,12721,DDX11L1
1,12613,12697,DDX11L1
1,12975,13052,DDX11L1


In [17]:
# Exonic ranges of the brain gene list, with overlapping ranges merged into
# their union by reduce().
# Alternative using the full list:
# exon_ranges <- exann[exann$geneSym %in% geneList_brain,]
exon_ranges <- exann[exann$geneSym %in% geneList_brain[1:3], ] # For genes mutated in at least 25% of cases

exon_ranges <- reduce(GRanges(
    seqnames = exon_ranges$chrom,
    ranges = IRanges(start = exon_ranges$start, end = exon_ranges$end)
))
exon_ranges <- data.table(
    chrom = as.vector(seqnames(exon_ranges)),
    start = start(exon_ranges),
    end = end(exon_ranges)
)

In [18]:
# Per-chromosome site counts for the brain ranges; show the first six rows
counts <- count_in_ranges(exon_ranges = exon_ranges)
head(counts, n = 6L)

Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,0,0,0,0,151496,323602,710434,2464868,5193582,8078528,20266612,53477160
chr2,90,168,406,1010,124622,283536,618488,2068714,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,0,0,0,0,79462,186306,412928,1305738,4399742,7952856,19999238,50770966
chr5,0,0,0,0,87766,198600,433212,1421296,4094322,7162704,17783544,46482090
chr6,0,0,0,0,83758,191814,418968,1389094,3921788,6736846,16800180,43289224


Ratio of sites in geneList_brain genes vs. all other Ensembl genes (driver to passenger ratio)

In [19]:
# Columns 1-4 are the *_sub counts, columns 5-8 the *_rest counts
sum(rowSums(counts[, 1:4])) / sum(rowSums(counts[, 5:8]))

In [20]:
# Keep the brain result under its own name
counts_brain <- counts

In [21]:
# Region probability distribution for brain cancer
pd_brain <- get_prob_dist(counts_matrix = counts_brain, exon_ranges = exon_ranges)

In [22]:
# Persist the three per-cancer probability distributions in one .rda file
save(
    list = c('pd_lung', 'pd_colon', 'pd_brain'),
    file = '../sim-develop/data/cancer_type_pd_th25.rda'
)