In [1]:
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(data.table)
# Load one .rda file per chromosome name (first 24 sequence names of Hsapiens).
# Presumably each file defines the '<chromosome>Map' object that count_in_ranges()
# later fetches by name — confirm against the files.
for (i in names(Hsapiens)[1:24]){ # load all chromosome map files
        load(paste0('../sim-develop/data/root_maps/',i,'.rda'))
}

# Presumably provides the exon annotation table used below as `exann` — confirm.
load('../sim-develop/data/exann.rda')
# NOTE(review): counts_dipl_m, counts_dipl_f and lens are used below but never defined in
# this notebook; presumably they come from this file — confirm.
load('../sim-develop/data/exonicvsnon_counts.rda')

Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: S4Ve

In [2]:
#--- get_drivers()
# Return the gene symbols of Cancer Gene Census entries that are annotated as
# tumour suppressor genes (TSG) and whose somatic tumour types mention the
# requested cancer.
#
# Args:
#   cancer_type: single string, one of 'lung', 'colon' or 'brain'.
#   census_file: path to the Cancer Gene Census CSV export. Defaults to the
#                location originally hard-coded in this notebook.
# Returns:
#   Character vector of gene symbols, in the row order of the Census table.
# Raises:
#   An error when cancer_type is not a single supported string (previously an
#   unsupported value failed later with "object 'toMatch' not found").
get_drivers <- function(cancer_type,
                        census_file = '~/jackgl/sim-develop/data/Census_allThu Sep 13 17_30_17 2018.csv'){

    if (!is.character(cancer_type) || length(cancer_type) != 1L) {
        stop("cancer_type must be a single string", call. = FALSE)
    }

    # Substrings searched for in the somatic tumour type annotation
    toMatch <- switch(cancer_type,
        lung  = c('lung'),
        colon = c('colon','colorectal'),
        brain = c('astrocytoma','meningioma','head','oligodendroglio'),
        stop("Unsupported cancer_type '", cancer_type,
             "': expected 'lung', 'colon' or 'brain'", call. = FALSE)
    )

    geneList <- read.csv(census_file) # Cancer Gene Census Data Table

    # NOTE(review): columns are accessed with `$`, which partial-matches names on a
    # data.frame. If read.csv() mangles the Census header into something like
    # 'Tumour.Types.Somatic.' this lookup presumably relies on that — confirm
    # before switching to [[ ]].
    # Get genes affecting chosen cancer type
    affects_type <- grepl(paste(toMatch,collapse='|'),geneList$Tumour.Types.Somatic)
    # Filter for only Tier 1 genes (disabled)
#     geneList<-geneList[geneList$Tier==1,]
    # Filter for TSGs
    is_tsg <- grepl('TSG',geneList$Role.in.Cancer)

    # Gene symbols of the TSGs annotated for the chosen cancer type
    as.vector(geneList$Gene.Symbol[affects_type & is_tsg])
}

In [3]:
# Driver gene symbols for each cancer type, as selected by get_drivers()
geneList_lung <- get_drivers('lung')
geneList_colon <- get_drivers('colon')
geneList_brain <- get_drivers('brain')

In [4]:
# Count L1 target sites per chromosome, split three ways for each of the four
# site categories (ct, cl, ot, ol):
#   columns 1-4  : sites inside the provided ranges (assumed to be exonic)
#   columns 5-8  : sites in all other exonic ranges
#   columns 9-12 : sites outside of exonic ranges
#
# Args:
#   exon_ranges: table with columns chrom (chromosome name without the "chr"
#                prefix), start and end.
#   counts_dipl: per-chromosome exonic site counts, one row per chromosome and
#                one column per category (generated by
#                analyses/count_sites_exonicvsnon.ipynb). An all-zero row 24
#                (chrY) is taken to mean a female genome.
# Returns:
#   24 x 12 numeric matrix, rows named after names(Hsapiens)[1:24].
#
# Relies on the '<chromosome>Map' objects being available in an enclosing
# environment and on inrange() from data.table.
count_in_ranges <- function(exon_ranges,counts_dipl) {

    chrom_names <- names(Hsapiens)[1:24]
    counts <- array(0,dim=c(24,12)) # Allocate matrix for counts

    # Male genome <=> the chrY row of the diploid counts is not all zero
    is_male <- sum(counts_dipl[24,])!=0

    # Number of sites of one category lying within any of the given ranges.
    # Column 1 of the index table is used with column 1 of insites (sense),
    # column 2 with column 2 (antisense); NA index entries are dropped first.
    sites_in_ranges <- function(insites, idx, ranges) {
        sen     <- inrange(insites[idx[!is.na(idx[,1]),1],1],ranges$start,ranges$end)
        antisen <- inrange(insites[idx[!is.na(idx[,2]),2],2],ranges$start,ranges$end)
        length(which(sen)) + length(which(antisen))
    }

    #--- Loop through chromosome names
    for (j in seq_along(chrom_names)) {
        chrom_name <- chrom_names[j]

        # If the chromosome is Y and the supplied diploid genome counts are for a
        # female genome, leave the whole row at 0
        if (chrom_name=='chrY' && !is_male) {next}

        # Data objects containing indices of S-V sites for each category are
        # labeled with the chromosome name.
        map <- get(paste0(chrom_name,'Map')) # Get the site map data for the current chrom
        insites <- map[[1]]
        # Index tables in column order ct, cl, ot, ol
        categories <- list(map[[2]], map[[3]], map[[4]], map[[5]])

        # Remove "chr" from chromosome name. Deliberately NOT called `chrom`: if
        # exon_ranges is a data.table, a local with a column's name would be
        # shadowed by that column in the subset expression below.
        chr_label <- strsplit(chrom_name,"chr")[[1]][2]
        ann_i <- exon_ranges[exon_ranges$chrom == chr_label,]

        # X and Y are present once in a male genome; everything else is counted twice
        haploid <- chr_label %in% c('X','Y') && is_male
        copies <- if (haploid) 1 else 2

        for (k in 1:4) {
            # Sites inside exon_ranges (columns 1-4)
            counts[j,k] <- sites_in_ranges(insites, categories[[k]], ann_i)*copies

            # Remaining exonic sites (columns 5-8), from the precomputed exonic totals
            counts[j,k+4] <- (counts_dipl[j,k])-counts[j,k]

            # Sites outside of exonic ranges (columns 9-12): all sites on the
            # chromosome minus both exonic groups
            counts[j,k+8] <- length(which(!is.na(as.vector(categories[[k]]))))*copies - counts[j,k] - counts[j,k+4]
        }
    }
    colnames(counts) <- c('ct_sub','cl_sub','ot_sub','ol_sub','ct_rest','cl_rest','ot_rest','ol_rest','ct_outrange','cl_outrange','ot_outrange','ol_outrange')
    rownames(counts) <- chrom_names
    return(counts)
}

In [5]:
# Probability that an insertion lands in (1) the target exonic ranges, (2) any
# other exonic range, (3) outside of exonic ranges.
#
# Two distributions are mixed 0.9 / 0.1:
#   ENd: proportional to the site counts, with the four site categories
#        (column order ct, cl, ot, ol) weighted 11.55, 7.25, 1.95 and 1.
#   ENi: proportional to the size in bases of each of the three regions.
#
# Args:
#   counts_matrix: 24 x 12 matrix as returned by count_in_ranges(). An all-zero
#                  row 24 (chrY) is taken to mean a female genome.
#   exon_ranges:   table with columns chrom, start and end describing the
#                  target ranges.
#   chrom_sizes:   optional per-chromosome size table with exonic size in
#                  column 1 and chromosome size in column 2 (rows 1-22
#                  autosomes, 23 = X, 24 = Y). When NULL the object `lens`
#                  from the enclosing environment is used, as before.
# Returns:
#   Unnamed numeric vector of length 3 summing to 1: target, passenger, null.
get_prob_dist <- function(counts_matrix, exon_ranges, chrom_sizes = NULL) {

    if (is.null(chrom_sizes)) {
        chrom_sizes <- lens
    }

    is_female <- sum(counts_matrix[24,])==0

    # Genome-wide totals per column; names are dropped so the result is unnamed
    col_totals <- unname(colSums(counts_matrix))

    # Weighted site count for each of the three regions. Each region occupies
    # four consecutive columns starting at 1, 5 and 9.
    region_start <- c(0,4,8)
    probs_ENd <- 11.55*col_totals[region_start+1] +
                 7.25*col_totals[region_start+2] +
                 1.95*col_totals[region_start+3] +
                 1*col_totals[region_start+4]
    probs_ENd <- probs_ENd/sum(probs_ENd)

    # NOTE(review): range size is taken as end - start. If the ranges are
    # inclusive on both ends this is one base short per range — confirm it is
    # consistent with how the size table was computed.
    range_sizes <- exon_ranges$end - exon_ranges$start

    if (is_female) { # If female genome, double all sizes and ignore Y

        target_size <- sum(range_sizes[exon_ranges$chrom!='Y'])*2
        exons_size <- sum(chrom_sizes[1:23,1]*2)
        genome_size <- sum(chrom_sizes[1:23,2]*2)

    } else { # If male genome, double sizes only for chroms. 1-22

        on_sex_chrom <- exon_ranges$chrom %in% c('X','Y')
        target_size <- sum(range_sizes[on_sex_chrom]) + sum(range_sizes[!on_sex_chrom])*2
        exons_size <-  sum(chrom_sizes[1:22,1]*2)+sum(chrom_sizes[23:24,1])
        genome_size <- sum(chrom_sizes[1:22,2]*2)+sum(chrom_sizes[23:24,2])
    }

    probs_ENi <- c(target_size/genome_size,                # Chance of target mutation
                   (exons_size-target_size)/genome_size,   # Chance of passenger mutation
                   (genome_size - exons_size)/genome_size) # Chance of null mutation

    pd <- (.9*probs_ENd)+(0.1*probs_ENi) # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
    pd/sum(pd)
}

## Lung cancer

In [6]:
# Preview the lung driver list and the exon annotation table
head(geneList_lung)
head(exann)

# Number of lung driver genes, and how many of them appear in exann$geneSym
length(geneList_lung)
length(which(geneList_lung %in% exann$geneSym))

chrom,start,end,geneSym
1,11869,12227,DDX11L1
1,12010,12057,DDX11L1
1,12179,12227,DDX11L1
1,12613,12721,DDX11L1
1,12613,12697,DDX11L1
1,12975,13052,DDX11L1


In [7]:
# Here we find the exonic ranges for the gene list of the current cancer, and take the union of any overlapping exonic ranges
# exon_ranges <- exann[exann$geneSym %in% geneList_lung,]
# Only the first 12 lung drivers are used. head() replaces [1:12] so that a list shorter than 12 is not padded with NA.
# NOTE(review): this subset was labelled "genes mutated in at least 25% of cases", but get_drivers() returns genes
# in Census table order, not ranked by mutation frequency — confirm that the first 12 entries are the intended genes.
exon_ranges <- exann[exann$geneSym %in% head(geneList_lung, 12),]

# Merge overlapping exons, then convert back to a plain chrom/start/end table
exon_ranges <- GRanges(exon_ranges$chrom,IRanges(exon_ranges$start,exon_ranges$end))
exon_ranges <- reduce(exon_ranges)
exon_ranges <- data.table(chrom=as.vector(seqnames(exon_ranges)),start=start(exon_ranges),end=end(exon_ranges))

In [8]:
# Per-chromosome site counts for the current exon_ranges, once per diploid count table
# (the _m / _f suffixes presumably denote the male and female genomes — confirm)
counts_m <- count_in_ranges(exon_ranges,counts_dipl_m)
head(counts_m)
counts_f <- count_in_ranges(exon_ranges,counts_dipl_f)
head(counts_f)

Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,0,0,0,0,151496,323602,710434,2464868,5193582,8078528,20266612,53477160
chr2,200,534,1176,3488,124512,283170,617718,2066236,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,292,830,1786,5394,79170,185476,411142,1300344,4399742,7952856,19999238,50770966
chr5,0,0,0,0,87766,198600,433212,1421296,4094322,7162704,17783544,46482090
chr6,0,0,0,0,83758,191814,418968,1389094,3921788,6736846,16800180,43289224


Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,0,0,0,0,151496,323602,710434,2464868,5193582,8078528,20266612,53477160
chr2,200,534,1176,3488,124512,283170,617718,2066236,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,292,830,1786,5394,79170,185476,411142,1300344,4399742,7952856,19999238,50770966
chr5,0,0,0,0,87766,198600,433212,1421296,4094322,7162704,17783544,46482090
chr6,0,0,0,0,83758,191814,418968,1389094,3921788,6736846,16800180,43289224


Ratio of sites in geneList_lung genes vs. all other Ensembl genes (driver to passenger ratio)

In [9]:
# Driver-to-passenger ratio: total sites in the target ranges (columns 1-4) over
# total sites in all other exonic ranges (columns 5-8)
target_cols <- 1:4
other_exonic_cols <- 5:8
sum(counts_m[,target_cols])/sum(counts_m[,other_exonic_cols])
sum(counts_f[,target_cols])/sum(counts_f[,other_exonic_cols])

In [10]:
# Keep the lung counts under their own names; counts_m / counts_f are reassigned in the colon section
counts_lung_m<-counts_m
counts_lung_f<-counts_f

In [11]:
# Target / passenger / null insertion probabilities for lung; exon_ranges still holds the lung ranges here
pd_lung_m <- get_prob_dist(counts_lung_m,exon_ranges)
pd_lung_m
pd_lung_f <- get_prob_dist(counts_lung_f,exon_ranges)
pd_lung_f

## Colon cancer

In [12]:
# Preview the colon driver list and the exon annotation table
head(geneList_colon)
head(exann)

# Number of colon driver genes, and how many of them appear in exann$geneSym
length(geneList_colon)
length(which(geneList_colon %in% exann$geneSym))

chrom,start,end,geneSym
1,11869,12227,DDX11L1
1,12010,12057,DDX11L1
1,12179,12227,DDX11L1
1,12613,12721,DDX11L1
1,12613,12697,DDX11L1
1,12975,13052,DDX11L1


In [13]:
# exon_ranges <- exann[exann$geneSym %in% geneList_colon,]
# Only the first 11 colon drivers are used. head() replaces [1:11] so that a list shorter than 11 is not padded with NA.
# NOTE(review): this subset was labelled "genes mutated in at least 25% of cases", but get_drivers() returns genes
# in Census table order, not ranked by mutation frequency — confirm that the first 11 entries are the intended genes.
exon_ranges <- exann[exann$geneSym %in% head(geneList_colon, 11),]

# Merge overlapping exons, then convert back to a plain chrom/start/end table
exon_ranges <- GRanges(exon_ranges$chrom,IRanges(exon_ranges$start,exon_ranges$end))
exon_ranges <- reduce(exon_ranges)
exon_ranges <- data.table(chrom=as.vector(seqnames(exon_ranges)),start=start(exon_ranges),end=end(exon_ranges))

In [14]:
# Per-chromosome site counts for the current exon_ranges, once per diploid count table
# (the _m / _f suffixes presumably denote the male and female genomes — confirm)
counts_m <- count_in_ranges(exon_ranges,counts_dipl_m)
head(counts_m)
counts_f <- count_in_ranges(exon_ranges,counts_dipl_f)
head(counts_f)

Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,0,0,0,0,151496,323602,710434,2464868,5193582,8078528,20266612,53477160
chr2,0,0,0,0,124712,283704,618894,2069724,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,0,0,0,0,79462,186306,412928,1305738,4399742,7952856,19999238,50770966
chr5,152,438,956,3260,87614,198162,432256,1418036,4094322,7162704,17783544,46482090
chr6,0,0,0,0,83758,191814,418968,1389094,3921788,6736846,16800180,43289224


Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,0,0,0,0,151496,323602,710434,2464868,5193582,8078528,20266612,53477160
chr2,0,0,0,0,124712,283704,618894,2069724,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,0,0,0,0,79462,186306,412928,1305738,4399742,7952856,19999238,50770966
chr5,152,438,956,3260,87614,198162,432256,1418036,4094322,7162704,17783544,46482090
chr6,0,0,0,0,83758,191814,418968,1389094,3921788,6736846,16800180,43289224


Ratio of sites in geneList_colon genes vs. all other Ensembl genes (driver to passenger ratio)

In [15]:
# Driver-to-passenger ratio: total sites in the target ranges (columns 1-4) over
# total sites in all other exonic ranges (columns 5-8)
target_cols <- 1:4
other_exonic_cols <- 5:8
sum(counts_m[,target_cols])/sum(counts_m[,other_exonic_cols])
sum(counts_f[,target_cols])/sum(counts_f[,other_exonic_cols])

In [16]:
# Keep the colon counts under their own names; counts_m / counts_f are reassigned in the brain section
counts_colon_m<-counts_m
counts_colon_f<-counts_f

In [17]:
# Target / passenger / null insertion probabilities for colon; exon_ranges still holds the colon ranges here
pd_colon_m <- get_prob_dist(counts_colon_m,exon_ranges)
pd_colon_m
pd_colon_f <- get_prob_dist(counts_colon_f,exon_ranges)
pd_colon_f

## Brain cancer

In [18]:
# Preview the brain driver list and the exon annotation table
head(geneList_brain)
head(exann)

# Number of brain driver genes, and how many of them appear in exann$geneSym
length(geneList_brain)
length(which(geneList_brain %in% exann$geneSym))

chrom,start,end,geneSym
1,11869,12227,DDX11L1
1,12010,12057,DDX11L1
1,12179,12227,DDX11L1
1,12613,12721,DDX11L1
1,12613,12697,DDX11L1
1,12975,13052,DDX11L1


In [19]:
# exon_ranges <- exann[exann$geneSym %in% geneList_brain,]
# Only the first 3 brain drivers are used. head() replaces [1:3] so that a list shorter than 3 is not padded with NA.
# NOTE(review): this subset was labelled "genes mutated in at least 25% of cases", but get_drivers() returns genes
# in Census table order, not ranked by mutation frequency — confirm that the first 3 entries are the intended genes.
exon_ranges <- exann[exann$geneSym %in% head(geneList_brain, 3),]

# Merge overlapping exons, then convert back to a plain chrom/start/end table
exon_ranges <- GRanges(exon_ranges$chrom,IRanges(exon_ranges$start,exon_ranges$end))
exon_ranges <- reduce(exon_ranges)
exon_ranges <- data.table(chrom=as.vector(seqnames(exon_ranges)),start=start(exon_ranges),end=end(exon_ranges))

In [20]:
# Per-chromosome site counts for the current exon_ranges, once per diploid count table
# (the _m / _f suffixes presumably denote the male and female genomes — confirm)
counts_m <- count_in_ranges(exon_ranges,counts_dipl_m)
head(counts_m)
counts_f <- count_in_ranges(exon_ranges,counts_dipl_f)
head(counts_f)

Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,0,0,0,0,151496,323602,710434,2464868,5193582,8078528,20266612,53477160
chr2,0,0,0,0,124712,283704,618894,2069724,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,86,378,650,3424,79376,185928,412278,1302314,4399742,7952856,19999238,50770966
chr5,0,0,0,0,87766,198600,433212,1421296,4094322,7162704,17783544,46482090
chr6,0,0,0,0,83758,191814,418968,1389094,3921788,6736846,16800180,43289224


Unnamed: 0,ct_sub,cl_sub,ot_sub,ol_sub,ct_rest,cl_rest,ot_rest,ol_rest,ct_outrange,cl_outrange,ot_outrange,ol_outrange
chr1,0,0,0,0,151496,323602,710434,2464868,5193582,8078528,20266612,53477160
chr2,0,0,0,0,124712,283704,618894,2069724,5449664,9245692,22942902,59727216
chr3,0,0,0,0,101554,223498,483424,1625082,4525166,7774294,19306214,50347906
chr4,86,378,650,3424,79376,185928,412278,1302314,4399742,7952856,19999238,50770966
chr5,0,0,0,0,87766,198600,433212,1421296,4094322,7162704,17783544,46482090
chr6,0,0,0,0,83758,191814,418968,1389094,3921788,6736846,16800180,43289224


Ratio of sites in geneList_brain genes vs. all other Ensembl genes (driver to passenger ratio)

In [21]:
# Driver-to-passenger ratio: total sites in the target ranges (columns 1-4) over
# total sites in all other exonic ranges (columns 5-8)
target_cols <- 1:4
other_exonic_cols <- 5:8
sum(counts_m[,target_cols])/sum(counts_m[,other_exonic_cols])
sum(counts_f[,target_cols])/sum(counts_f[,other_exonic_cols])

In [22]:
# Keep the brain counts under their own names, matching the lung and colon sections
counts_brain_m<-counts_m
counts_brain_f<-counts_f

In [23]:
# Target / passenger / null insertion probabilities for brain; exon_ranges still holds the brain ranges here
pd_brain_m <- get_prob_dist(counts_brain_m,exon_ranges)
pd_brain_m
pd_brain_f <- get_prob_dist(counts_brain_f,exon_ranges)
pd_brain_f

In [24]:
# Write the six probability vectors (lung, colon, brain x m, f) to a single .rda file
save(pd_lung_m,pd_lung_f,pd_colon_m,pd_colon_f,pd_brain_m,pd_brain_f,file='../sim-develop/data/cancer_type_pd_cgc.rda')