In [2]:
library(data.table)
library(GenomicRanges)
library(org.Hs.eg.db)
# Load every per-chromosome map file (chromosomes 1-22, X and Y) into the workspace.
# count_in_ranges() later looks the loaded objects up by the name chr<N>Map.
for (i in c(seq_len(22), 'X', 'Y')) {
    load(paste0('./data/large_files/chr', i, '.rda'))
}

# Annotation tables and pre-computed site counts
# (presumably provide `exons`, `genes`, `lens`, `counts_dipl_m`, `counts_dipl_f` -- TODO confirm)
load('./data/exons.rda')
load('./data/genes.rda')
load('./data/exonicvsnon_counts.rda')

# Keep only row 7, columns 2-5 of the weights table; get_prob_dist() reads the
# closed_tight / closed_loose / open_tight / open_loose entries from it
weights <- read.csv('./data/snap_velcro_weights.csv')[7, 2:5]

In [3]:
#--- get_drivers()
# Return the Cancer Gene Census rows for tumour-suppressor genes (TSGs) associated with a cancer type.
#
# cancer_type: a single string, one of 'lung', 'colon' or 'brain'. Any other value raises an error.
# Returns: the subset of the census data.frame whose somatic tumour-type text contains at least one
#          of the search terms for `cancer_type` and whose role in cancer contains 'TSG'.
get_drivers <- function(cancer_type){

    # Search terms (matched as case-sensitive regular expressions) for each supported cancer type
    # NOTE(review): 'head' also matches "head and neck" tumour types -- confirm this is intended for 'brain'
    search_terms <- list(
        lung  = c('lung'),
        colon = c('colon','colorectal'),
        brain = c('astrocytoma','meningioma','head','oligodendroglio')
    )

    # Fail early with a clear message instead of the opaque "object 'toMatch' not found"
    if (!is.character(cancer_type) || length(cancer_type) != 1 ||
        !(cancer_type %in% names(search_terms))) {
        stop("Invalid cancer_type: ", paste(deparse(cancer_type), collapse = ''),
             "; expected one of ", paste(names(search_terms), collapse = ', '),
             call. = FALSE)
    }
    toMatch <- search_terms[[cancer_type]]

    geneList <- read.csv('./data/Census_allThu Sep 13 17_30_17 2018.csv') # Cancer Gene Census Data Table

    # Use the exact column names (no reliance on `$` partial matching) and check they exist,
    # so that a changed file layout cannot silently yield an empty gene list
    needed_cols <- c('Tumour.Types.Somatic.', 'Role.in.Cancer')
    missing_cols <- setdiff(needed_cols, names(geneList))
    if (length(missing_cols) > 0) {
        stop("Census table is missing column(s): ", paste(missing_cols, collapse = ', '),
             call. = FALSE)
    }

    # Get genes affecting chosen cancer type
    affects_type <- grepl(paste(toMatch, collapse = '|'), geneList[['Tumour.Types.Somatic.']])
    # Filter for only Tier 1 genes
#     geneList<-geneList[geneList$Tier==1,]
    # Filter for TSGs
    is_tsg <- grepl('TSG', geneList[['Role.in.Cancer']])

    geneList[affects_type & is_tsg, ]
}

In [4]:
# Build the TSG driver list for each tumour type of interest
geneList_lung  <- get_drivers(cancer_type = 'lung')
geneList_colon <- get_drivers(cancer_type = 'colon')
geneList_brain <- get_drivers(cancer_type = 'brain')

In [5]:
# Show, per tumour type, the driver genes whose symbol has no match in genes$gene_sym
cat('Genes not in annotation\n\n')
cat('Lung:')
geneList_lung[is.na(match(geneList_lung$Gene.Symbol, genes$gene_sym)), ]
cat('Colon:')
geneList_colon[is.na(match(geneList_colon$Gene.Symbol, genes$gene_sym)), ]
cat('Brain:')
geneList_brain[is.na(match(geneList_brain$Gene.Symbol, genes$gene_sym)), ]

Genes not in annotation

Lung:

Unnamed: 0,Gene.Symbol,Name,Entrez.GeneId,Genome.Location,Tier,Hallmark,Chr.Band,Somatic,Germline,Tumour.Types.Somatic.,Tumour.Types.Germline.,Cancer.Syndrome,Tissue.Type,Molecular.Genetics,Role.in.Cancer,Mutation.Types,Translocation.Partner,Other.Germline.Mut,Other.Syndrome,Synonyms
381,MALAT1,metastasis associated lung adenocarcinoma transcript 1 ( lnc-RNA; non-protein coding),378938,11:65502644-65502817,2,,11q31.1,yes,,"renal cell carcinoma (childhood epithelioid), lung",,,E,Dom,"oncogene, TSG, fusion",T,TFEB,,,"378938,MALAT1,PRO1073,Q9UHZ2"


Colon:

Gene.Symbol,Name,Entrez.GeneId,Genome.Location,Tier,Hallmark,Chr.Band,Somatic,Germline,Tumour.Types.Somatic.,Tumour.Types.Germline.,Cancer.Syndrome,Tissue.Type,Molecular.Genetics,Role.in.Cancer,Mutation.Types,Translocation.Partner,Other.Germline.Mut,Other.Syndrome,Synonyms


Brain:

Gene.Symbol,Name,Entrez.GeneId,Genome.Location,Tier,Hallmark,Chr.Band,Somatic,Germline,Tumour.Types.Somatic.,Tumour.Types.Germline.,Cancer.Syndrome,Tissue.Type,Molecular.Genetics,Role.in.Cancer,Mutation.Types,Translocation.Partner,Other.Germline.Mut,Other.Syndrome,Synonyms


MALAT1 is a long intergenic non-coding RNA, so it is not present in the annotation of protein-coding genes.

In [6]:
# Keep only the lung drivers whose symbol is found in the gene annotation
geneList_lung <- geneList_lung[!is.na(match(geneList_lung$Gene.Symbol, genes$gene_sym)), ]

In [7]:
# This function will count the number of L1 target sites in the provided ranges (assumed to be exonic) (columns 1-4),
# in other exonic ranges (col 5-8), and outside of exonic ranges (col 9-12)
#
# exon_ranges: table with columns `chrom` (chromosome name without the 'chr' prefix), `start` and `end`.
# counts_dipl: 24 x 4 table of exonic site counts per chromosome (rows: chr1-22, X, Y) and site category,
#              generated by analyses/count_sites_exonicvsnon.ipynb. A Y row (row 24) summing to zero is
#              taken to mean the counts describe a female genome.
# Returns:     24 x 12 numeric matrix, rows chr1..chrY; columns <cat>_sub, <cat>_rest, <cat>_outrange for
#              the categories ct, cl, ot, ol.
#
# Relies on the global objects chr<N>Map (one per chromosome) and on data.table::inrange().
count_in_ranges <- function(exon_ranges,counts_dipl) {

    chrom_names <- c(1:22,'X','Y')
    counts <- array(0,dim=c(24,12)) # Allocate matrix for counts

    # A non-zero Y row in the supplied diploid counts marks a male genome
    male_genome <- sum(counts_dipl[24,]) != 0

    #--- Loop through chromosome names (j is the row of the chromosome in `counts` / `counts_dipl`)
    for (j in seq_along(chrom_names)) {
        i <- chrom_names[j]

        # If the chromosome is Y and the supplied diploid genome counts are for female genome, leave the
        # whole row at 0
        if (i == 'Y' && !male_genome) {next}

        # Data objects containing indices of S-V sites for each category are labeled with the chromosome name.
        # Element 1 of the map holds the site positions; elements 2-5 hold, for the categories
        # ct, cl, ot, ol (in that order), NA-padded row indices into the site positions.
        map <- get(paste0('chr',i,'Map')) # Get the site map data for the current chrom
        insites <- map[[1]]

        # Provided ranges on the current chromosome
        ann_i <- exon_ranges[exon_ranges$chrom == i,]

        # Sites are counted twice to account for the diploid genome, except on X and Y of a male genome
        copies <- if (i %in% c('X','Y') && male_genome) 1 else 2

        for (k in 1:4) {
            idx <- map[[k+1]]

            # Column 1 of the index/position tables is used for the sense strand, column 2 for antisense
            # (naming taken from the original variables `sen` / `antisen`)
            sen     <- inrange(insites[idx[!is.na(idx[,1]),1],1],ann_i$start,ann_i$end)
            antisen <- inrange(insites[idx[!is.na(idx[,2]),2],2],ann_i$start,ann_i$end)

            # Sites inside the provided ranges (columns 1-4)
            counts[j,k]   <- (sum(sen, na.rm=TRUE) + sum(antisen, na.rm=TRUE))*copies
            # Sites in the remaining exonic regions (columns 5-8)
            counts[j,k+4] <- counts_dipl[j,k] - counts[j,k]
            # Sites outside of exonic ranges: all sites of the category minus the exonic ones (columns 9-12)
            counts[j,k+8] <- sum(!is.na(idx))*copies - counts[j,k] - counts[j,k+4]
        }
    }
    colnames(counts) <- c('ct_sub','cl_sub','ot_sub','ol_sub','ct_rest','cl_rest','ot_rest','ol_rest','ct_outrange','cl_outrange','ot_outrange','ol_outrange')
    rownames(counts) <- paste0('chr',c(1:22,'X','Y'))
    return(counts)
}

In [8]:
# Probability that a single L1 insertion is a target (driver), passenger or null insertion.
#
# counts_matrix: 24 x 12 site-count matrix as returned by count_in_ranges(); a zero Y row
#                (row 24) is taken to mean a female genome.
# exon_ranges:   table of target ranges with columns `chrom`, `start`, `end`.
# Returns:       unnamed numeric vector of length 3 (target, passenger, null) summing to 1.
#
# Reads the global objects `weights` (closed_tight, closed_loose, open_tight, open_loose) and
# `lens` (column 1 used as exonic size, column 2 as chromosome size, rows chr1-22, X, Y).
get_prob_dist <- function(counts_matrix, exon_ranges) {

    # Sex of the genome is inferred from the Y-chromosome row of the counts
    gend <- if (sum(counts_matrix[24,]) == 0) 'f' else 'm'

    # Genome-wide site totals for each of the 12 columns
    site_totals <- colSums(counts_matrix)

    #--- ENd component: weighted site totals of the three column groups (1-4, 5-8, 9-12)
    probs_ENd <- rep(0, 3)
    for (grp in 1:3) {
        first <- (grp - 1)*4 + 1
        probs_ENd[grp] <- weights$closed_tight*site_totals[first] +
                          weights$closed_loose*site_totals[first + 1] +
                          weights$open_tight*site_totals[first + 2] +
                          weights$open_loose*site_totals[first + 3]
    }
    probs_ENd <- probs_ENd/sum(probs_ENd)

    #--- ENi component: proportional to the size of each region
    range_widths <- exon_ranges$end - exon_ranges$start
    if (gend == 'f') {
        # Female genome: no Y chromosome, every remaining chromosome counted twice
        target_size <- sum(range_widths[exon_ranges$chrom != 'Y'])*2
        exons_size  <- sum(lens[1:23,1]*2)
        genome_size <- sum(lens[1:23,2]*2)
    } else {
        # Male genome: X and Y counted once, chromosomes 1-22 counted twice
        on_sex_chrom <- exon_ranges$chrom %in% c('X','Y')
        target_size <- sum(range_widths[on_sex_chrom])
        target_size <- target_size + sum(range_widths[!on_sex_chrom])*2

        exons_size  <- sum(lens[1:22,1]*2) + sum(lens[23:24,1])
        genome_size <- sum(lens[1:22,2]*2) + sum(lens[23:24,2])
    }

    probs_ENi <- c(
        target_size/genome_size,                # Chance of target mutation
        (exons_size - target_size)/genome_size, # Chance of passenger mutation
        (genome_size - exons_size)/genome_size  # Chance of null mutation
    )

    # Assuming 0.9 probability that insertion will be ENd and 0.1 for ENi
    pd <- (.9*probs_ENd) + (0.1*probs_ENi)
    pd/sum(pd)
}

In [9]:
# Print and return the per-insertion probability distributions for a list of driver genes.
#
# driver_list: table with a `Gene.Symbol` column (e.g. the output of get_drivers()).
# Returns:     list(pd_m, pd_f) -- the male and female vectors produced by get_prob_dist().
#
# Uses the global objects `exons`, `counts_dipl_m` and `counts_dipl_f`.
get_stats <- function(driver_list) {

    # Unweighted ratio of sites inside the driver exons (cols 1-4) to sites in the other exons (cols 5-8)
    site_ratio <- function(cnt) {
        sum(rowSums(cnt[,1:4]))/sum(rowSums(cnt[,5:8]))
    }

    # Exons of the genes in the current list, with overlapping ranges merged into their union
    gene_exons <- exons[exons$gene_sym %in% driver_list$Gene.Symbol,]
    merged <- reduce(GRanges(gene_exons$chrom, IRanges(gene_exons$start, gene_exons$end)))
    driver_exons <- data.table(chrom = as.vector(seqnames(merged)),
                               start = start(merged),
                               end = end(merged))

    # Site counts for the male and the female diploid genome
    counts_m <- count_in_ranges(driver_exons, counts_dipl_m)
    counts_f <- count_in_ranges(driver_exons, counts_dipl_f)

    cat('Male driver to passenger site count ratio (unweighted)\n')
    print(site_ratio(counts_m))
    cat('Female driver to passenger site count ratio (unweighted)\n')
    print(site_ratio(counts_f))

    pd_m <- get_prob_dist(counts_m, driver_exons)
    cat('Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)\n')
    print(pd_m)

    pd_f <- get_prob_dist(counts_f, driver_exons)
    cat('Female probabilities of driver, passenger and null L1 insertion\n')
    print(pd_f)

    list(pd_m, pd_f)
}

## Lung cancer

In [10]:
# Preview the lung driver list, then compute its probability distributions
cat('Sample of associated drivers:')
head(geneList_lung)
out <- get_stats(driver_list = geneList_lung)
pd_lung_m <- out[[1]] # male genome
pd_lung_f <- out[[2]] # female genome

Sample of associated drivers:

Unnamed: 0,Gene.Symbol,Name,Entrez.GeneId,Genome.Location,Tier,Hallmark,Chr.Band,Somatic,Germline,Tumour.Types.Somatic.,Tumour.Types.Germline.,Cancer.Syndrome,Tissue.Type,Molecular.Genetics,Role.in.Cancer,Mutation.Types,Translocation.Partner,Other.Germline.Mut,Other.Syndrome,Synonyms
147,CPEB3,cytoplasmic polyadenylation element binding protein 3,22849,10:92052212-92240350,2,,10q23.32,yes,,lung cancer,,,E,,TSG,D,,,,"22849,CPEB3,ENSG00000107864,Q8NE35"
158,CSMD3,CUB and Sushi multiple domains 3,114788,8:112224771-113436854,2,,8q23.3,yes,,"ovarian cancer, oral SCC, lung cancer",,,E,,TSG,"Mis, N",,,,"114788,CSMD3,ENSG00000164796,Q7Z407"
164,CUL3,cullin 3,8452,2:224474245-224585009,2,,2q36.2,yes,,lung cancer,,,E,,TSG,N,,,,"8452,CUL3,ENSG00000036257,Q13618"
191,EED,embryonic ectoderm development,8726,11:86245230-86278525,2,,11q14.2,yes,,"malignant peripheral nerve sheath tumours, MDS and related, lung adenocarcinoma",,,"E, L",,TSG,"Mis, F",,,,"8726,EED,ENSG00000074266,O75530"
286,GPC5,glypican 5,2262,13:91399047-92866439,2,,13q31.3,yes,,lung adenocarcinoma,,,E,,TSG,D,,,,"2262,ENSG00000179399,GPC5,P78333"
288,GRIN2A,"glutamate receptor, ionotropic, N-methyl D-aspartate 2A",2903,16:9763149-10180411,1,,16p13.2,yes,,"melanoma, colorectal carcinoma, gastric carcinoma, lung carcinoma",,,E,Rec,TSG,"Mis, N, F, O",,yes,Focal epilepsy and speech disorder with or without mental retardation,"2903,ENSG00000183454,GRIN2A,NMDAR2A,Q12879"


Male driver to passenger site count ratio (unweighted)
[1] 0.001959844
Female driver to passenger site count ratio (unweighted)
[1] 0.001933592
Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)
[1] 4.728702e-05 2.458201e-02 9.753707e-01
Female probabilities of driver, passenger and null L1 insertion
[1] 0.0000464248 0.0244492341 0.9755043411


## Colon cancer

In [11]:
# Preview the colon driver list, then compute its probability distributions
cat('Sample of associated drivers:')
head(geneList_colon)
out <- get_stats(driver_list = geneList_colon)
pd_colon_m <- out[[1]] # male genome
pd_colon_f <- out[[2]] # female genome

Sample of associated drivers:

Unnamed: 0,Gene.Symbol,Name,Entrez.GeneId,Genome.Location,Tier,Hallmark,Chr.Band,Somatic,Germline,Tumour.Types.Somatic.,Tumour.Types.Germline.,Cancer.Syndrome,Tissue.Type,Molecular.Genetics,Role.in.Cancer,Mutation.Types,Translocation.Partner,Other.Germline.Mut,Other.Syndrome,Synonyms
21,APC,adenomatous polyposis of the colon gene,324,5:112754891-112844126,1,Yes,5q21,yes,yes,"colorectal, pancreatic, desmoid, hepatoblastoma, glioma, other CNS","colorectal, pancreatic, desmoid, hepatoblastoma, glioma, other CNS",adenomatous polyposis coli; Turcot syndrome,"E, M, O",Rec,TSG,"D, Mis, N, F, S",,,,"324,APC,ENSG00000134982,P25054"
27,ARHGEF10,Rho guanine nucleotide exchange factor 10,9639,8:1876561-1957263,2,,8p23.3,yes,,colon cancer,,,E,,TSG,D,,,,"9639,ARHGEF10,ENSG00000104728,O15013"
44,AXIN1,axin 1,8312,16:288122-347025,1,Yes,16p13.3,yes,,"colorectal, endometrial, prostate, hepatocellular carcinoma, hepatoblastoma, sporadic medulloblastoma",,,"E, O",Rec,TSG,"D, Mis, N, F, S",,,,"8312,AXIN1,ENSG00000103126,O15169"
45,AXIN2,axin 2,8313,17:65529976-65558620,1,Yes,17q24.1,yes,yes,"colorectal carcinoma, stomach carcinoma, hepatocellular carcinoma",colorectal carcinoma,oligodontia-colorectal cancer syndrome,E,Rec,TSG,"Mis, F, N",,,,"8313,AXIN2,ENSG00000168646,Q9Y2T1"
46,B2M,beta-2-microglobulin,567,15:44711547-44716342,1,Yes,15q21.1,yes,,"DLBCL, melanoma, colorectal adenocarcinoma",,,"E, L",Rec,TSG,"Mis, N, F",,yes,Immunodeficiency 43,"567,B2M,ENSG00000166710,P61769"
49,BAX,"BCL2 associated X, apoptosis regulator",581,19:48954929-48961097,1,,19q13.33,yes,,colorectal cancer,,,E,,TSG,"F, Mis",,,,"581,BAX,ENSG00000087088,Q07812"


Male driver to passenger site count ratio (unweighted)
[1] 0.004747007
Female driver to passenger site count ratio (unweighted)
[1] 0.004665248
Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)
[1] 0.0001169808 0.0245123119 0.9753707073
Female probabilities of driver, passenger and null L1 insertion
[1] 0.0001143815 0.0243812774 0.9755043411


## Brain cancer

In [12]:
# Preview the brain driver list, then compute its probability distributions
cat('Sample of associated drivers:')
head(geneList_brain)
out <- get_stats(driver_list = geneList_brain)
pd_brain_m <- out[[1]] # male genome
pd_brain_f <- out[[2]] # female genome

Sample of associated drivers:

Unnamed: 0,Gene.Symbol,Name,Entrez.GeneId,Genome.Location,Tier,Hallmark,Chr.Band,Somatic,Germline,Tumour.Types.Somatic.,Tumour.Types.Germline.,Cancer.Syndrome,Tissue.Type,Molecular.Genetics,Role.in.Cancer,Mutation.Types,Translocation.Partner,Other.Germline.Mut,Other.Syndrome,Synonyms
132,CIC,capicua homolog,23152,19:42284705-42295191,1,Yes,19q13.2,yes,,"oligodendroglioma, soft tissue sarcoma",,,"O, M",Rec,"oncogene, TSG, fusion","Mis, F, S,T","DUX4L1, FOXO4",,,"23152,CIC,ENSG00000079432"
159,CTCF,CCCTC-binding factor,10664,16:67610833-67637872,1,Yes,16q22.1,yes,,"endometrial, breast, head and neck cancer",,,E,Dom,TSG,"Mis, N",,,"Mental retardation, autosomal dominant 21","10664,CTCF,ENSG00000102974,P49711"
238,FAT1,FAT atypical cadherin 1,2195,4:186588592-186709827,1,Yes,4q35.2,yes,yes,"oral squamous cell, chemorefractory CLL, head and neck, pancreatic acinar cell carcinoma",pancreatic,,"E, L",,TSG,"Mis, N, F, S",,,,"2195,ENSG00000083857,FAT,FAT1,Q14517"
240,FAT4,FAT atypical cadherin 4,79633,4:125316412-125491768,1,Yes,4q28.1,yes,,"lymphoma, pancreatic, head and neck, melanoma, hepatocellular carcinoma",,,"E, L",,TSG,"Mis, N",,yes,"Hennekam lymphangiectasia-lymphedema syndrome 2, Van Maldergem syndrome 2","79633,ENSG00000196159,FAT4"
353,KLF4,Kruppel-like factor 4,9314,9:107485751-107489172,1,Yes,9q31,yes,,meningioma,,,O,Dom,"oncogene, TSG",Mis,,,,"9314,ENSG00000136826,KLF4,O43474"
445,NF2,neurofibromatosis type 2 gene,4771,22:29603999-29694802,1,Yes,22q12.2,yes,yes,"meningioma, acoustic neuroma, renal","meningioma, acoustic neuroma",neurofibromatosis type 2,O,Rec,TSG,"D, Mis, N, F, S, O",,,,"4771,ENSG00000186575,NF2,P35240"


Male driver to passenger site count ratio (unweighted)
[1] 0.0006636383
Female driver to passenger site count ratio (unweighted)
[1] 0.000652254
Male probabilities of driver, passenger and null L1 insertion (per L1 insertion)
[1] 1.653611e-05 2.461276e-02 9.753707e-01
Female probabilities of driver, passenger and null L1 insertion
[1] 1.617498e-05 2.447948e-02 9.755043e-01


In [13]:
# Persist the per-tumour-type probability distributions (male and female) ...
save(list = c('pd_lung_m', 'pd_lung_f',
              'pd_colon_m', 'pd_colon_f',
              'pd_brain_m', 'pd_brain_f'),
     file = './data/tumor_type_pd_cgc.rda')
# ... and the driver gene lists they were computed from
save(list = c('geneList_lung', 'geneList_colon', 'geneList_brain'),
     file = './data/tumor_type_driver_lists_cgc.rda')