In [58]:
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(data.table)
library(org.Hs.eg.db)

In [59]:
# Pull every exon range out of the hg38 knownGene TxDb and preview the
# first six records.
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
exann <- exons(txdb)
head(exann, n = 6L)

GRanges object with 6 ranges and 1 metadata column:
      seqnames         ranges strand |   exon_id
         <Rle>      <IRanges>  <Rle> | <integer>
  [1]     chr1 [11874, 12227]      + |         1
  [2]     chr1 [12595, 12721]      + |         2
  [3]     chr1 [12613, 12721]      + |         3
  [4]     chr1 [12646, 12697]      + |         4
  [5]     chr1 [13221, 14409]      + |         5
  [6]     chr1 [13403, 14409]      + |         6
  -------
  seqinfo: 455 sequences (1 circular) from hg38 genome

In [60]:
# Show the last six exons located on chrY (their exon_id values are
# visible in the printed metadata column).
on_chr_y <- seqnames(exann) == "chrY"
tail(exann[on_chr_y], n = 6L)

GRanges object with 6 ranges and 1 metadata column:
      seqnames               ranges strand |   exon_id
         <Rle>            <IRanges>  <Rle> | <integer>
  [1]     chrY [26578095, 26578328]      - |    293785
  [2]     chrY [57094007, 57094084]      - |    293786
  [3]     chrY [57094564, 57094605]      - |    293787
  [4]     chrY [57212178, 57213357]      - |    293788
  [5]     chrY [57213856, 57213964]      - |    293789
  [6]     chrY [57214350, 57214703]      - |    293790
  -------
  seqinfo: 455 sequences (1 circular) from hg38 genome

In [61]:
# Keep only exons on chr1-chr22, chrX and chrY. The original sliced rows
# 1:293790 (the exon_id of the last chrY exon in this TxDb build); selecting
# by chromosome name keeps working when the annotation release, and hence
# the exon count, changes.
# NOTE(review): assumes the exons are ordered chr1..chr22, chrX, chrY ahead of
# all other sequences, so that this matches the old positional slice -- confirm.
primary_chroms <- paste0("chr", c(1:22, "X", "Y"))
exann <- exann[as.vector(seqnames(exann)) %in% primary_chroms]

In [62]:
# Flatten the exon ranges into a plain table with one row per exon:
# chromosome name, start coordinate and end coordinate.
exon_chrom <- as.vector(seqnames(exann))
exon_start <- start(exann)
exon_end <- end(exann)
exann <- data.table(chrom = exon_chrom, start = exon_start, end = exon_end)

In [63]:
# Preview the first six rows of the exon table.
head(exann, n = 6L)

chrom,start,end
chr1,11874,12227
chr1,12595,12721
chr1,12613,12721
chr1,12646,12697
chr1,13221,14409
chr1,13403,14409


In [64]:
# Total exon length relative to a ~3e9 bp genome. Overlapping exons are still
# listed separately at this point, so shared bases are counted more than once.
# The ranges are closed intervals (e.g. [11874, 12227]), so the width of each
# exon is end - start + 1; the original omitted the +1 and undercounted every
# exon by one base. Summing as double avoids integer overflow on large tables.
sum(as.numeric(exann$end - exann$start + 1)) / 3e9

In [65]:
# Rebuild ranges from the table (no strand information is carried over) and
# merge them with reduce().
exon_ranges <- GRanges(
    seqnames = exann$chrom,
    ranges = IRanges(start = exann$start, end = exann$end)
)
exann <- reduce(exon_ranges)

In [66]:
# Convert the reduced ranges back into a table with one row per interval:
# chromosome name, start coordinate and end coordinate.
merged_chrom <- as.vector(seqnames(exann))
merged_start <- start(exann)
merged_end <- end(exann)
exann <- data.table(chrom = merged_chrom, start = merged_start, end = merged_end)

In [67]:
# Total length of the exon intervals in the table relative to a ~3e9 bp genome.
# Start and end are both inclusive, so each interval spans end - start + 1
# bases; the original omitted the +1 and undercounted every interval by one
# base. Summing as double avoids integer overflow on large tables.
sum(as.numeric(exann$end - exann$start + 1)) / 3e9

In [68]:
# Gene-level ranges from the same hg38 knownGene TxDb; each range carries a
# gene_id metadata column that is used later to label exons.
gann <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)

In [69]:
# Preview the first six gene ranges.
head(gann, n = 6L)

GRanges object with 6 ranges and 1 metadata column:
            seqnames               ranges strand |     gene_id
               <Rle>            <IRanges>  <Rle> | <character>
          1    chr19 [58346806, 58362848]      - |           1
         10     chr8 [18391245, 18401213]      + |          10
        100    chr20 [44619522, 44651735]      - |         100
       1000    chr18 [27950966, 28177481]      - |        1000
  100008589    chr21 [ 8213888,  8401980]      + |   100008589
  100009613    chr11 [70072434, 70075348]      - |   100009613
  -------
  seqinfo: 455 sequences (1 circular) from hg38 genome

In [92]:
# Attach to every exon interval the IDs of all genes whose span contains the
# interval's start coordinate. gene_id is a list column because one interval
# can fall inside several genes; intervals inside no gene get the placeholder 0.
exann$gene_id <- vector('list',nrow(exann))

for (chrom_name in unique(exann$chrom)) {

    # Restrict both annotations to the chromosome being processed.
    chrom_exons <- exann[exann$chrom == chrom_name]
    chrom_genes <- gann[seqnames(gann) == chrom_name]
    gene_starts <- start(chrom_genes)
    gene_ends <- end(chrom_genes)

    # For each exon start, collect the IDs of the genes that contain it.
    ids_per_exon <- lapply(chrom_exons$start, function(exon_start) {
        hit_idx <- which(between(exon_start, gene_starts, gene_ends))
        chrom_genes$gene_id[hit_idx]
    })

    # Mark exons that matched no gene with the placeholder 0.
    ids_per_exon[lengths(ids_per_exon) == 0] <- list(0)

    # Write the per-exon ID lists back into the rows of this chromosome.
    exann[exann$chrom == chrom_name]$gene_id <- ids_per_exon

}


In [93]:
# Preview the exon table now that the gene_id column is attached.
head(exann, n = 6L)

chrom,start,end,gene_id
chr1,11874,12227,100287102
chr1,12595,12721,100287102
chr1,13221,16765,100287102
chr1,16854,17055,653635
chr1,17233,18061,653635
chr1,18268,18379,653635


In [94]:
# Load the tumour-suppressor gene (TSG) table. The file is read without
# header = TRUE, so its first row (presumably the header line -- confirm) comes
# in as data and is dropped here; column names are then set explicitly.
tsgs <- read.table("../data/Human_TSGs.txt")
# Negative indexing instead of 2:nrow(tsgs): the latter evaluates to c(2, 1)
# when the table has a single row and would return bogus rows.
tsgs <- tsgs[-1, ]
names(tsgs) <- c("GeneID","GeneSymbol")
head(tsgs)

Unnamed: 0,GeneID,GeneSymbol
2,43,ACHE
3,95,ACY1
4,104,ADARB1
5,141,ADPRH
6,142,PARP1
7,185,AGTR1


In [95]:
# Flag exons that belong to a tumour-suppressor gene (istsg = 1, otherwise 0)
# and count the TSGs that match no exon at all (unfound).
#
# exann$gene_id is a list column whose elements may hold several gene IDs.
# The original compared that list to one ID with `==`, which coerces a
# multi-ID element to a single deparsed string, so exons shared by several
# genes could never be flagged. Membership is now tested per element.
tsg_ids <- as.character(tsgs$GeneID)
exon_gene_ids <- lapply(exann$gene_id, as.character)

# 1 when any gene ID of the exon is a TSG, 0 otherwise.
exann$istsg <- as.numeric(vapply(
    exon_gene_ids,
    function(ids) any(ids %in% tsg_ids),
    logical(1)
))

# TSGs whose ID appears in no exon's gene_id entry.
unfound <- sum(!(tsg_ids %in% unlist(exon_gene_ids)))

In [96]:
# Report how many TSGs were matched to at least one exon. The original printed
# the raw fraction (e.g. 0.93) in front of the '%' sign; scale it to a real
# percentage and round to one decimal place.
n_tsgs <- nrow(tsgs)
n_found <- n_tsgs - unfound
paste0(n_found,'/',n_tsgs, ' TSGs found (', round(100 * n_found / n_tsgs, 1),' %)')

In [99]:
# Preview the first six exons flagged as belonging to a TSG.
is_tsg_exon <- exann$istsg == 1
head(exann[is_tsg_exon], n = 6L)

chrom,start,end,gene_id,istsg
chr1,1013467,1013576,9636,1
chr1,1013984,1014539,9636,1
chr1,1167104,1167198,406984,1
chr1,1167863,1167952,406983,1
chr1,1203508,1203968,8784,1
chr1,1204034,1204236,8784,1


In [100]:
# Gene-ID -> gene-symbol map from org.Hs.eg.db; xf() below subsets it by
# gene ID to look up symbols.
x <- org.Hs.egSYMBOL

In [101]:
# Gene IDs that have a symbol in the map `x`. Computed once here; the original
# called mappedkeys(x) twice for every exon.
mapped_ids <- mappedkeys(x)

# Look up the gene symbols for the gene IDs attached to one exon.
#
# ids: gene IDs of a single exon -- either character IDs or the numeric
#      placeholder 0 used when no gene was found for the exon.
# Returns a list holding the symbols of all IDs present in the map, followed
# by one "NA" string per ID that has no symbol (including the placeholder 0).
#
# The original counted the placeholder 0 twice (once via ids==0 and again as
# "not in mappedkeys"), yielding two "NA" entries for such exons; every
# unmapped ID now contributes exactly one.
xf <- function(ids) {
    ids <- as.character(ids) # the map is keyed by character IDs; 0 becomes "0"
    has_symbol <- ids %in% mapped_ids
    n_missing <- sum(!has_symbol)
    c(as.list(x[ids[has_symbol]]), as.list(rep("NA", n_missing)))
}
test <- lapply(exann$gene_id,xf)

In [102]:
# Store the per-exon symbol lists (one list element per exon, produced by
# lapply over exann$gene_id) as a new list column.
exann$geneSym <- test

In [103]:
# Preview the finished exon table, including the geneSym column.
head(exann, n = 6L)

chrom,start,end,gene_id,istsg,geneSym
chr1,11874,12227,100287102,0,DDX11L1
chr1,12595,12721,100287102,0,DDX11L1
chr1,13221,16765,100287102,0,DDX11L1
chr1,16854,17055,653635,0,WASH7P
chr1,17233,18061,653635,0,WASH7P
chr1,18268,18379,653635,0,WASH7P


In [104]:
# Persist the annotated exon table so later analyses can load() it.
save(list = "exann", file = "../data/exann.rda")