Merge pull request BIMSBbioinfo#112 from frenkiboy/master

gffToGRanges update 2
al2na · Aug 14, 2015 · c115dce · c115dce
2 parents 53e94b2 + ca8a702
commit c115dce
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 59 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,6 +21,7 @@ Depends:
     R (>= 3.0.0),grid
 Imports:
 	data.table,
+  GenomeInfoDb,
   GenomicRanges,
 	GenomicAlignments,
   ggplot2,

diff --git a/NAMESPACE b/NAMESPACE
@@ -21,16 +21,19 @@ importMethodsFrom("rtracklayer", import)
 
 #importMethodsFrom("IRanges",nearest,as.data.frame,values,length,elementLengths,width,start,end)
 
-importFrom("readr", read_delim)
 importFrom("data.table",data.table)
-importFrom("plyr",rbind.fill)
-importFrom("reshape2", melt)
-importFrom("parallel", mclapply)
-importFrom("plotrix", dispersion)
-importFrom("plotrix", std.error)
+importFrom("GenomeInfoDb",seqlevels)
 importFrom("matrixStats",colMedians)
 importFrom("matrixStats",colSds)
 importFrom("matrixStats",colQuantiles)
+importFrom("parallel", mclapply)
+importFrom("plotrix", dispersion)
+importFrom("plotrix", std.error)
+importFrom("plyr",rbind.fill)
+importFrom("readr", read_delim)
+importFrom("reshape2", melt)
+
+
 
 exportClasses(AnnotationByFeature)
 exportClasses(AnnotationByGeneParts)

diff --git a/NEWS b/NEWS
@@ -3,7 +3,7 @@ genomation 1.1.12
 
 IMPROVEMENTS AND BUG FIXES
 
-* gffToGRanges parses column 9 of the gff file correctly; added ensembl=TRUE to prepend chr to seqlevels
+* gffToGRanges is now a wrapper for import from rtracklayer
 
 genomation 1.1.11
 --------------

diff --git a/R/readData.R b/R/readData.R
@@ -466,12 +466,6 @@ setMethod("readTranscriptFeatures",
 #'                 The file can end in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip}
 #'                 and/or start with \code{http://} or \code{ftp://}. If the file is not compressed
 #'                 it can also start with \code{https://} or \code{ftps://}.
-#' @param track.line Can be an integer specifying the number of track lines to skip, 
-#'                  "auto" to detect the header lines automatically
-#'                   or FALSE(default) if the bed file doesn't have track lines.
-#'                   "auto" detects both UCSC header lines and lines starting with #
-#' @param split.group boolean, whether to split the 9th column of the file
-#' @param split.char character that is used as a separator of the 9th column. ';' by default
 #' @param filter a character designating which elements to retain from the gff file (e.g. exon, CDS, ...)
 #' @param zero.based \code{boolean} whether the coordinates in the file are 0-based (TRUE) or 1-based (FALSE). FALSE is the default
 #' @param ensembl \code{boolean} if TRUE, add the chr prefix to seqlevels. FALSE by default
@@ -484,50 +478,20 @@ setMethod("readTranscriptFeatures",
 #' 
 #' @docType methods
 #' @export
-gffToGRanges = function(gff.file, track.line=FALSE, split.group=FALSE, split.char=';',filter=NULL, 
-                        zero.based=FALSE, ensembl=FALSE){
+gffToGRanges = function(gff.file, filter=NULL, zero.based=FALSE, ensembl=FALSE){
 
-  gff = readGeneric(gff.file, 
-                    chr=1,
-                    start=4,
-                    end=5, 
-                    strand=7,
-                    meta.cols=list(source=2,
-                                  feature=3,
-                                  score=6,
-                                  frame=8,
-                                  group=9), 
-                    zero.based=zero.based,
-                    skip=track.line)
-
-  if(split.group){
-    message('splitting the group.column...')
-    group = strsplit(gff$group, '\\s+')
-    group = lapply(group, function(x){
-                              vals = x[seq(2,length(x),2)]
-                              vals = sub(split.char, '', vals)
-                              vals = sub('^"', '', vals)
-                              vals = sub('"$', '', vals)
-                              d = data.table(t(vals))
-                              data.table::setnames(d, x[seq(1,length(x),2)])
-                              d
-    })
-    group = data.table::rbindlist(group, fill=TRUE)
-    gff$group = NULL
-    values(gff) = cbind(values(gff), as.data.frame(group))
-  }
-
-  if(!is.null(filter)){        
-    if(filter %in% gff$feature){
-      message(paste("Filtering", filter, "features...\n"))
-      gff = gff[gff$feature == filter,]
-    }else{
-      stop("The given feature is not present in the gff file")
-    }
-  }
+  gff = rtracklayer::import(gff.file)  # file parsing is delegated to rtracklayer
+  if(zero.based)
+    start(gff) = start(gff) + 1  # shift starts via the accessor; `$` would only touch a metadata column
 
   if(ensembl)
     seqlevels(gff) = paste('chr',seqlevels(gff),sep='')
 
+  if(!is.null(filter)){
+    if(!any(gff$type %in% filter))  # %in% never yields NA, unlike ==
+      stop(paste(filter, 'category does not exist in the gff file'))
+    gff = gff[gff$type %in% filter]  # exact match, same test as the check above (grepl also kept partial/regex hits)
+  }
+
   return(gff)
 }
diff --git a/inst/unitTests/test_readData.R b/inst/unitTests/test_readData.R
@@ -122,14 +122,13 @@ test_gffToGRanges = function()
 {
   library(GenomicRanges)
   tab.test = system.file('unitTests/test.gtf', package='genomation')
-  gff1 = gffToGRanges(tab.test, track.line='auto')
+  gff1 = gffToGRanges(tab.test)
   checkIdentical(length(gff1), 3L)
-  checkIdentical(ncol(values(gff1)), 5L)
+
+  gff2 = gffToGRanges(tab.test, filter='exon')
+  checkIdentical(length(gff2), 1L)
 
-  gff2 = gffToGRanges(tab.test, track.line='auto', split.group=TRUE)
-  checkIdentical(ncol(values(gff2)), 13L)
-
-  gff3 = gffToGRanges(tab.test, track.line='auto',ensembl=TRUE)
+  gff3 = gffToGRanges(tab.test, ensembl=TRUE)
   checkIdentical(as.character(seqlevels(gff3)), 'chr1')
 }