#### This notebook creates a data.table object holding the human exon annotations that correspond to the gene annotation in ./data/genes.rda

In [1]:
# Packages: data.table for the tabular result, ensembldb for the annotation DB
library(data.table)
library(ensembldb)

# Gene annotation (restores the objects stored in the .rda file)
load("./data/genes.rda")

# Path to an SQLite database previously generated with ensembldb's
# ensDbFromGFF() function
EnsDbFile <- "./data/large_files/Homo_sapiens.GRCh38.94.sqlite"

# Annotation database object backed by that SQLite file
edb <- EnsDb(EnsDbFile)

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: GenomicRanges
Loading required packag

In [2]:
#--- Query the exon ranges from the annotation database and flatten them into
#--- a data.table with one row per exon
exons <- local({
  # In call position `exons` resolves to the ensembldb accessor, even if a
  # non-function object named `exons` already exists from a previous run
  exon_ranges <- exons(
    edb,
    columns = c("gene_name", "gene_id", "gene_biotype")
  )

  data.table(
    chrom = as.vector(seqnames(exon_ranges)),
    start = start(exon_ranges),
    end = end(exon_ranges),
    gene_id = exon_ranges$gene_id,
    gene_sym = exon_ranges$gene_name,
    gene_biotype = exon_ranges$gene_biotype,
    #--- Exon length field, computed as end minus start.
    # NOTE(review): for inclusive coordinates this is one less than the number
    # of bases covered (end - start + 1) -- confirm what downstream code expects
    len = end(exon_ranges) - start(exon_ranges)
  )
})

In [3]:
#--- Keep only exons located on chromosomes 1-22, X and Y
exons <- exons[exons[["chrom"]] %in% c(as.character(1:22), "X", "Y"), ]
#--- Number of exons remaining after the chromosome filter
nrow(exons)

In [4]:
#--- Keep only exons whose gene is present in the gene annotation
exons <- exons[exons[["gene_id"]] %in% genes$gene_id, ]
#--- Number of distinct genes that still have at least one exon
uniqueN(exons[["gene_id"]])

In [5]:
#--- Preview the first six rows of the exon table
head(exons, n = 6L)

chrom,start,end,gene_id,gene_sym,gene_biotype,len
1,65419,65433,ENSG00000186092,OR4F5,protein_coding,14
1,65520,65573,ENSG00000186092,OR4F5,protein_coding,53
1,69037,71585,ENSG00000186092,OR4F5,protein_coding,2548
1,69055,70108,ENSG00000186092,OR4F5,protein_coding,1053
1,450703,451697,ENSG00000284733,OR4F29,protein_coding,994
1,685679,686673,ENSG00000284662,OR4F16,protein_coding,994


In [6]:
#--- Write the exon table to disk as an .rda file
save(list = "exons", file = "./data/exons.rda")