In [1]:
library(data.table)
library(ensembldb)
# Path to the SQLite annotation database previously generated with the
# ensembldb ensDbFromGFF() function.
EnsDbFile <- "./data/large_files/Homo_sapiens.GRCh38.94.sqlite"
# Open the SQLite file as an annotation database object.
edb <- EnsDb(EnsDbFile)

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: GenomicRanges
Loading required packag

In [2]:
#--- Pull the gene annotation out of the database and flatten it into a data.table
# The ranges object returned by genes(edb) is only needed while the table is
# being built, so it is kept inside local().
genes <- local({
    gene_ranges <- genes(edb)
    data.table(
        chrom = as.vector(seqnames(gene_ranges)),
        start = start(gene_ranges),
        end = end(gene_ranges),
        gene_id = gene_ranges$gene_id,
        gene_sym = gene_ranges$gene_name,
        gene_biotype = gene_ranges$gene_biotype
    )
})
#--- Add a gene length column, computed as end - start
genes$len <- with(genes, end - start)
#--- Show how many genes the annotation contains
nrow(genes)

In [3]:
#--- Keep only genes whose chromosome is one of 1-22, X or Y
genes <- genes[chrom %in% c(as.character(1:22), "X", "Y")]
nrow(genes)
#--- Filter out any LRG genes
# genes <- genes[grep('ENSG',genes$gene_id),]
# nrow(genes)

In [4]:
#--- Count the distinct gene symbols present in the annotation
uniqueN(genes$gene_sym)

In [5]:
#--- Keep only protein_coding genes
# Rows whose biotype is NA are dropped: data.table treats NA in a logical
# row filter as FALSE.
genes <- genes[gene_biotype == "protein_coding"]

In [6]:
#--- Number of genes remaining (one row per gene)
nrow(genes)
#--- Number of distinct gene symbols among them
uniqueN(genes$gene_sym)

In [7]:
# --- Print annotation of genes with duplicate symbols
# genes[genes$gene_sym %in% genes$gene_sym[duplicated(genes$gene_sym)],]

In [8]:
#--- Create a list of Ensembl IDs to remove corresponding to duplicated gene symbols with overlapping ranges amongst duplicates
# Rule: for every gene symbol that occurs more than once, the copies are ranked
# by `len` (longest first). If the two longest copies overlap, every copy except
# the longest is flagged for removal; otherwise all copies are kept.
#
# Each duplicated symbol is visited once. NA symbols are skipped because they
# cannot be matched with `==`.
dup_syms <- unique(genes$gene_sym[duplicated(genes$gene_sym)])
dup_syms <- dup_syms[!is.na(dup_syms)]
todel <- unlist(lapply(dup_syms, function(sym) {
    tmp <- genes[genes$gene_sym == sym,]
    tmp <- tmp[order(-tmp$len),]
    # Ranges can only overlap when they lie on the same chromosome.
    same_chrom <- tmp$chrom[1] == tmp$chrom[2]
    # Two closed intervals share a position iff each one starts no later than
    # the other ends; this avoids building both full position vectors.
    ranges_overlap <- tmp$start[1] <= tmp$end[2] && tmp$start[2] <= tmp$end[1]
    if (same_chrom && ranges_overlap) {
        tmp$gene_id[-1]  # everything except the longest copy
    } else {
        character(0)
    }
}), use.names = FALSE)
# unlist() of an empty list is NULL; keep `todel` a character vector regardless.
if (is.null(todel)) {
    todel <- character(0)
}

In [9]:
#--- Drop every gene whose Ensembl ID was flagged in `todel`
genes <- genes[!(gene_id %in% todel)]

In [10]:
#--- Number of genes left after removing overlapping duplicates
NROW(genes)

In [11]:
#--- Preview the first rows of the filtered annotation
head(genes, n = 6L)

chrom,start,end,gene_id,gene_sym,gene_biotype,len
1,65419,71585,ENSG00000186092,OR4F5,protein_coding,6166
1,450703,451697,ENSG00000284733,OR4F29,protein_coding,994
1,685679,686673,ENSG00000284662,OR4F16,protein_coding,994
1,923928,944581,ENSG00000187634,SAMD11,protein_coding,20653
1,944204,959309,ENSG00000188976,NOC2L,protein_coding,15105
1,960587,965715,ENSG00000187961,KLHL17,protein_coding,5128


In [12]:
#--- Write the filtered gene table to disk under the object name `genes`
save(list = "genes", file = "./data/genes.rda")