In [1]:
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(data.table)
library(org.Hs.eg.db)

Loading required package: GenomicFeatures
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
    do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl,
    intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unlist, unsplit

Loading required package: S4Vectors
Loading required package: stats4
Loading required package: IRanges

In [2]:
# Extract every exon from the hg38 knownGene TxDb, keeping the exon id and the
# gene id(s) annotated on each exon as metadata columns.
exann <- exons(
    TxDb.Hsapiens.UCSC.hg38.knownGene,
    columns = c("exon_id", "gene_id")
)
head(exann)

GRanges object with 6 ranges and 2 metadata columns:
      seqnames         ranges strand |   exon_id         gene_id
         <Rle>      <IRanges>  <Rle> | <integer> <CharacterList>
  [1]     chr1 [11874, 12227]      + |         1       100287102
  [2]     chr1 [12595, 12721]      + |         2       100287102
  [3]     chr1 [12613, 12721]      + |         3       100287102
  [4]     chr1 [12646, 12697]      + |         4       100287102
  [5]     chr1 [13221, 14409]      + |         5       100287102
  [6]     chr1 [13403, 14409]      + |         6       100287102
  -------
  seqinfo: 455 sequences (1 circular) from hg38 genome

In [3]:
# Inspect the last exons located on chrY.
tail(exann[seqnames(exann) == "chrY"])

GRanges object with 6 ranges and 2 metadata columns:
      seqnames               ranges strand |   exon_id         gene_id
         <Rle>            <IRanges>  <Rle> | <integer> <CharacterList>
  [1]     chrY [26578095, 26578328]      - |    293785                
  [2]     chrY [57094007, 57094084]      - |    293786                
  [3]     chrY [57094564, 57094605]      - |    293787                
  [4]     chrY [57212178, 57213357]      - |    293788          727856
  [5]     chrY [57213856, 57213964]      - |    293789          727856
  [6]     chrY [57214350, 57214703]      - |    293790          727856
  -------
  seqinfo: 455 sequences (1 circular) from hg38 genome

In [4]:
# Keep only the first 293790 ranges. The tail() of the chrY subset printed
# above ends at exon_id 293790, so this presumably drops everything ordered
# after chrY (assumes row index equals exon_id -- TODO confirm).
exann <- exann[seq_len(293790)]

In [5]:
# Exons without an annotated gene have an empty gene_id entry; give them the
# placeholder id 0 so that every exon carries at least one id.
# lengths() returns the per-element sizes as a plain integer vector, which
# avoids building a list with lapply() and then comparing that list to 0.
exann$gene_id[lengths(exann$gene_id) == 0] <- 0

In [6]:
# Flatten the GRanges into a plain table: one row per exon holding its
# chromosome, start/end coordinates and gene id.
exann <- data.table(
    chrom = as.vector(seqnames(exann)),
    start = start(exann),
    end = end(exann),
    gene_id = as.vector(exann$gene_id)
)

In [7]:
# Preview the first rows of the flattened exon table.
head(exann, n = 6L)

chrom,start,end,gene_id
chr1,11874,12227,100287102
chr1,12595,12721,100287102
chr1,12613,12721,100287102
chr1,12646,12697,100287102
chr1,13221,14409,100287102
chr1,13403,14409,100287102


In [8]:
# Summed exon length divided by 3e9 (presumably an approximate genome size in
# bp -- confirm).
# start/end are inclusive coordinates (they come from a GRanges, printed as
# [start, end]), so an exon spans end - start + 1 bases; the plain difference
# end - start undercounts every exon by one base.
# NOTE(review): overlapping exons are each counted in full, so this is not the
# size of the union of exonic sequence -- confirm that is intended.
sum(exann$end - exann$start + 1) / 3e9

In [9]:
# Load the TSG table (presumably tumour-suppressor genes -- confirm).
# The file is read without header handling, so its first row is dropped below
# (presumably the header line -- confirm against the file) and the two columns
# are named explicitly.
tsgs <- read.table("../data/Human_TSGs.txt")
# Negative indexing instead of 2:nrow(tsgs): with a single row, 2:1 counts
# downwards and would select a non-existent row plus the first row instead of
# nothing. Row names of the rows that are kept stay unchanged.
tsgs <- tsgs[-1, , drop = FALSE]
names(tsgs) <- c("GeneID", "GeneSymbol")
head(tsgs)

Unnamed: 0,GeneID,GeneSymbol
2,43,ACHE
3,95,ACY1
4,104,ADARB1
5,141,ADPRH
6,142,PARP1
7,185,AGTR1


In [10]:
# Flag exons that belong to a TSG and count the TSGs with no exon in the table.
# Both questions are set-membership tests, so they are answered with two
# vectorised %in% lookups instead of scanning every exon once per TSG.
# Ids are compared as character strings, the same coercion `==` applies when
# an id column is compared against a single TSG id. The unused counter `k` of
# the loop-based version is dropped.
tsg_ids <- as.character(tsgs$GeneID)
exon_gene_ids <- as.character(exann$gene_id)

# 1 for exons whose gene id is a TSG id, 0 otherwise (kept numeric).
exann$istsg <- as.numeric(exon_gene_ids %in% tsg_ids)

# Number of TSG rows whose id matches no exon.
unfound <- sum(!(tsg_ids %in% exon_gene_ids))

In [11]:
# Report how many TSGs were matched to at least one exon.
# The share is scaled by 100 (and rounded) so that it really is the percentage
# the trailing " %" label claims; the raw ratio is a fraction between 0 and 1.
paste0(
    nrow(tsgs) - unfound, "/", nrow(tsgs), " TSGs found (",
    round(100 * (nrow(tsgs) - unfound) / nrow(tsgs), 1), " %)"
)

In [12]:
# Preview the exon table now that the istsg flag column has been added.
head(exann, n = 6L)

chrom,start,end,gene_id,istsg
chr1,11874,12227,100287102,0
chr1,12595,12721,100287102,0
chr1,12613,12721,100287102,0
chr1,12646,12697,100287102,0
chr1,13221,14409,100287102,0
chr1,13403,14409,100287102,0


In [13]:
# save(exann,file='../data/exann.rda')

In [14]:
# Gene-id -> gene-symbol annotation map from org.Hs.eg.db; xf() reads it
# through this global name.
x <- org.Hs.egSYMBOL

In [15]:
#' Look up gene symbols for the gene ids of one exon.
#'
#' Relies on the global annotation map `x`, which must be defined before this
#' function is called.
#'
#' @param ids Gene ids attached to a single exon; the placeholder id 0 marks
#'   an exon without an annotated gene.
#' @return A list holding the symbol of every id found in `x`, followed by
#'   one "NA" string for each id that could not be mapped. Mapped symbols
#'   come first, so the order of `ids` is not preserved.
xf <- function(ids) {
    # Query the mapped keys once per call instead of twice.
    known_keys <- mappedkeys(x)
    is_mapped <- ids != 0 & is.element(ids, known_keys)

    # Count every unmapped id exactly once. Counting "is 0" and "is not a
    # key" separately emits two "NA" entries for a single placeholder id
    # whenever 0 is not a key of the map (which is the expected situation).
    n_unmapped <- length(which(!is_mapped))

    c(as.list(x[ids[is_mapped]]), rep("NA", n_unmapped))
}
# Translate the gene id(s) of every exon into symbols, one xf() call per exon.
test <- lapply(X = exann$gene_id, FUN = xf)

In [16]:
# Attach the per-exon symbol lookup results as a new geneSym column.
exann$geneSym <- test

In [17]:
# Preview the exon table including the geneSym column.
head(exann, n = 6L)

chrom,start,end,gene_id,istsg,geneSym
chr1,11874,12227,100287102,0,DDX11L1
chr1,12595,12721,100287102,0,DDX11L1
chr1,12613,12721,100287102,0,DDX11L1
chr1,12646,12697,100287102,0,DDX11L1
chr1,13221,14409,100287102,0,DDX11L1
chr1,13403,14409,100287102,0,DDX11L1


In [18]:
# Write the annotated exon table to disk as an .rda file.
save(list = "exann", file = '../data/exann.rda')