All inputs and outputs of exported functions are Ranges objects now.

asrinivasan-oa · Jul 25, 2016 · 94b397f · 94b397f
1 parent 6d9ea5d
commit 94b397f
Show file tree

Hide file tree

Showing 34 changed files with 641 additions and 786 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,7 +8,8 @@ Authors@R: c(
     c("aut", "cre")), person("Open Analytics", role = "cph"))
 Maintainer: Arunkumar Srinivasan <asrinivasan@openanalytics.eu>
 Depends:
-    R (>= 3.3)
+    R (>= 3.3),
+    GenomicRanges
 Imports:
     stats,
     utils,
@@ -20,7 +21,6 @@ Imports:
     Rsamtools,
     rtracklayer,
     IRanges,
-    GenomicRanges,
     GenomicFeatures,
     GenomicAlignments
 Suggests:

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,11 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
-S3method(tidy_cols,bam)
-S3method(tidy_cols,bed)
-S3method(tidy_cols,default)
-S3method(tidy_cols,gff)
-S3method(tidy_cols,gtf)
-export(as_granges)
 export(construct_introns)
 export(extract)
 export(read_bam)
@@ -14,7 +8,6 @@ export(read_format)
 export(read_gff)
 export(read_gtf)
 export(supported_formats)
-export(tidy_cols)
 import(GenomicFeatures)
 import(data.table)
 import(methods)
@@ -27,8 +20,6 @@ importFrom(GenomicAlignments,readGAlignments)
 importFrom(GenomicAlignments,rname)
 importFrom(GenomicAlignments,summarizeOverlaps)
 importFrom(GenomicAlignments,width)
-importFrom(GenomicRanges,GRanges)
-importFrom(GenomicRanges,split)
 importFrom(IRanges,IRanges)
 importFrom(Rsamtools,ScanBamParam)
 importFrom(Rsamtools,scanBamFlag)

diff --git a/R/as-granges.R b/R/as-granges.R
@@ -0,0 +1,42 @@
+#' @title Convert data.table to GRanges object
+#' 
+#' @description Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam} 
+#' or a valid \code{data.table} to a GRanges object.
+#'
+#' @param x An object of class \code{gtf}, \code{gff}, \code{bed} or 
+#' \code{bam} or a valid \code{data.table} object.
+#' @param ignore_strand Logical argument to pass to \code{GRanges} function. 
+#' Indicates whether \code{strand} should be ignored when constructing 
+#' \code{GRanges} object or not. Default is \code{FALSE}.
+#' @return A \code{GRanges} object.
+#' @aliases as_granges
+#' @examples
+#' path <- system.file("tests", package="gread")
+#' gff_file <- file.path(path, "sample.gff")
+#' gtf_file <- file.path(path, "sample.gtf")
+#' bed_file <- file.path(path, "sample.bed")
+#' bam_file <- file.path(path, "sample.bam")
+#' 
+#' gff <- read_format(gff_file)
+#' gtf <- read_format(gtf_file)
+#' bed <- read_format(bed_file)
+#' bam <- read_format(bam_file)
+#' 
+#' as_granges(gff)
+#' as_granges(gtf)
+#' as_granges(bed)
+#' as_granges(bam)
+#' 
+#' as_granges(gff, ignore_strand=FALSE)
+#' as_granges(gtf, ignore_strand=FALSE)
+#' as_granges(bed, ignore_strand=FALSE)
+#' as_granges(bam, ignore_strand=FALSE)
+#' @seealso \code{\link{read_format}} \code{\link{extract}} 
+#' \code{\link{construct_introns}}
+as_granges <- function(x, ignore_strand=FALSE) {
+
+    stopifnot(is.gtf(x)||is.gff(x)||is.bed(x)||is.bam(x)||is.data.table(x))
+    x = shallow(x)
+    if (ignore_strand) x[, "strand" := NULL]
+    as(setDF(x), "GRanges")
+}
diff --git a/R/construct_introns.R → R/construct-introns.R b/R/construct_introns.R → R/construct-introns.R
@@ -1,4 +1,4 @@
-#' @title Construct introns from \code{gtf} or \code{gff} objects
+#' @title Construct introns from gtf/gff objects
 #'
 #' @description This function generates intronic coordinates by extracting 
 #' all the \code{exons} from a \code{gtf} or \code{gff} object.
@@ -9,7 +9,8 @@
 #' and returned invisibily, else just the \code{intron} coordinates are 
 #' returned.
 #' @return An object of class \code{gtf/gff} with updated \code{intron} 
-#' coordinates or just the intron coordinates depending on \code{update}.
+#' coordinates or just the intron coordinates depending on \code{update}. 
+#' They all inherit from \code{GRanges}.
 #' @export
 #' @examples
 #' path <- system.file("tests", package="gread")
@@ -21,20 +22,21 @@
 #' # same as above, but returns only constructed introns
 #' introns <- construct_introns(gtf, update=FALSE)
 #' @seealso \code{\link{supported_formats}} \code{\link{read_format}} 
-#' \code{\link{extract}} \code{\link{tidy_cols}} \code{\link{as_granges}} 
+#' \code{\link{extract}} \code{\link{as_granges}} 
 construct_introns <- function(x, update=TRUE) {
-    stopifnot(inherits(x, "gtf") || inherits(x, "gff"), 
-                "feature" %chin% names(x), 
+    stopifnot(is.gtf(x) || is.gff(x))
+    x = as_data_table(x)
+    stopifnot("feature" %chin% names(x), 
                 "transcript_id" %chin% names(x), 
                 update %in% c(TRUE, FALSE))
 
     # to please R CMD CHECK
-    feature=seqname=transcript_id=NULL
+    feature=seqnames=transcript_id=NULL
     x_class = copy(class(x))
     exons = x[feature == "exon"]
     if (!nrow(exons)) stop("feature == 'exon' returned 0 rows.")
-    setorderv(exons, c("transcript_id", "seqname", "start", "end", "strand"))
-    introns = exons[, .(seqname = seqname[1L], 
+    setorderv(exons, c("transcript_id", "seqnames", "start", "end", "strand"))
+    introns = exons[, .(seqnames = seqnames[1L], 
                         start = end[seq_len(.N-1L)]+1L, 
                         end = start[seq_len(.N-1L)+1L]-1L,
                         feature = "intron",
@@ -45,22 +47,23 @@ construct_introns <- function(x, update=TRUE) {
         stop("Exons for transcript ids [", paste(ids, collaspse=" "), 
             "] have start > end")
     exons[, c(setdiff(names(introns), "transcript_id")) := NULL]
-    exons = unique(exons, by="transcript_id")
+    exons = unique(shallow(exons, reset_class=TRUE), by="transcript_id")
     ecols = names(exons)
     introns[exons, (ecols) := mget(ecols), on="transcript_id"]
     # reset all exon related cols
     introns[, grep("^exon", names(introns), value=TRUE) := NA]
     colorder = names(x)
     if (update) {
         x = rbind(x, introns)
-        setorderv(x, c("seqname", "start", "end"))
+        setorderv(x, c("seqnames", "start", "end"))
         x = x[, .SD, by="transcript_id"]
         setattr(x, 'class', x_class)
         setcolorder(x, colorder)
-        return(x)
+        ans = x
     } else {
         setcolorder(introns, colorder)
-            setattr(introns, 'class', unique(c("intron", x_class)))
-        return(introns[])
+        setattr(introns, 'class', c("intron", "data.table", "data.frame"))
+        ans = introns
     }
+    new(class(ans)[1L], as(setDF(ans), "GRanges"))
 }
diff --git a/R/extract_features.R → R/extract-features.R b/R/extract_features.R → R/extract-features.R
@@ -6,7 +6,8 @@
 #' @details Extract features based on various criteria (usually intended for  
 #' obtaining read counts using \code{gcount} for a given \code{bam} file.
 #' 
-#' @param x Input object of class \code{gtf} or \code{gff}.
+#' @param x Input object of class \code{gtf} or \code{gff} which inherits 
+#' from \code{GRanges}.
 #' @param feature A character vector of (usually related) features to extract 
 #' from. One of \code{"gene_exon"}, \code{"gene"}, \code{"gene_intron"}, 
 #' \code{"exon"}, \code{"intron"}. NB: \code{"exon"} feature must be present 
@@ -51,10 +52,9 @@
 #' @return An object of class \code{"gene"} when \code{feature} is 
 #' \code{"gene"}, \code{"gene_exon"} or \code{"gene_intron"}, and of class 
 #' \code{"exon"} and \code{"intron"} when \code{feature} is \code{"exon"} or 
-#' \code{"intron"} respectively.
-#' @seealso \code{\link{read_format}} \code{\link{tidy_cols}} 
-#' \code{\link{as_granges}} \code{\link{extract}} 
-#' \code{\link{construct_introns}}
+#' \code{"intron"} respectively. They all inherit from \code{GRanges}.
+#' @seealso \code{\link{read_format}} \code{\link{as_granges}} 
+#' \code{\link{extract}} \code{\link{construct_introns}}
 #' @export
 #' @examples
 #' path <- system.file("tests", package="gread")
@@ -74,14 +74,18 @@ extract <- function(x, feature=c("gene_exon", "gene", "gene_intron", "exon",
     splits  = unlist(strsplit(feature, "_", fixed=TRUE))
     feature = splits[1L]
     subfeature = if (is.na(splits[2L])) NULL else splits[2L]
-    stopifnot(is.gtf(x) || is.gff(x), "feature" %in% names(x), 
-        ignore_strand %in% c(FALSE, TRUE), transcript_id %in% names(x), 
-        gene_id %in% names(x), nrow(x)>0L)
-    x = shallow(x) # TODO: use data.table:::shallow when exported
+    stopifnot(is.gtf(x) || is.gff(x))
+    x <- as_data_table(x)
+    stopifnot("feature" %in% names(x), 
+                ignore_strand %in% c(FALSE, TRUE), 
+                transcript_id %in% names(x), 
+                gene_id %in% names(x), nrow(x)>0L)
     setattr(x, 'class', c(match.arg(feature), class(x)))
-    setnames(x, c(transcript_id, gene_id), c("transcript_id", "gene_id"))
-    extract_feature(x, unique(x$feature), subfeature, 
+    setnames(x, c(transcript_id, gene_id), 
+                c("transcript_id", "gene_id"))
+    ans = extract_feature(x, unique(x$feature), subfeature, 
                 match.arg(type), ignore_strand, ...)
+    new(class(ans)[1L], as(setDF(ans), "GRanges"))
 }
 
 # ----- internal functions for extracting specific features ----- #
@@ -99,24 +103,24 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
                         ignore_strand=FALSE, ...) {
 
     # to pleae R CMD CHECK
-    seqname=gene_id=transcript_id=NULL
+    seqnames=gene_id=transcript_id=NULL
     if (is.null(feature)) {
         if (!"exon" %in% uniq_features)
             stop("'exon' must be a feature in input object 'x'.")
         construct_transcripts <- function(x) {
-            x[feature %chin% "exon", .(seqname=seqname[1L], start=min(start), 
+            x[feature %chin% "exon", .(seqnames=seqnames[1L], start=min(start), 
                     end=max(end), strand=strand[1L], gene_id=gene_id[1L]), 
             by="transcript_id"]
         }
         transcripts = construct_transcripts(x)
     } else {
-        cols=c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
+        cols=c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
         .feature = feature
         transcripts = x[feature %chin% .feature, cols, with=FALSE]
     }
     ans = switch (type, 
                     default = {
-                        genes = transcripts[, .(seqname=seqname[1L], 
+                        genes = transcripts[, .(seqnames=seqnames[1L], 
                             start=min(start), end=max(end), strand=strand[1L], 
                         transcript_id=paste(unique(transcript_id), 
                             collapse=";")), by="gene_id"]
@@ -149,7 +153,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
                     }, 
                     intersect = {
                         genes = transcripts[, if (max(start) <= min(end)) 
-                                .(seqname=seqname[1L], start=max(start), 
+                                .(seqnames=seqnames[1L], start=max(start), 
                                 end=min(end), strand=strand[1L], 
                                 transcript_id=paste(unique(transcript_id), 
                                     collapse=";")), by="gene_id"]
@@ -176,7 +180,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
                 collapse=";")), by="queryHits"]
     genes[, "overlaps":=gene_id][olaps$queryHits, "overlaps":=olaps$gene_id]
 
-    colorder = c("seqname", "start", "end", "length", "strand", 
+    colorder = c("seqnames", "start", "end", "length", "strand", 
                     "transcript_id", "gene_id", "overlaps")
     setcolorder(genes, colorder)
     genes[]
@@ -188,7 +192,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type,
 
     # to please R CMD CHECK
     transcript_id=NULL
-    cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
+    cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
     .feature = if (is.null(feature)) "exon" else feature
     exons = x[feature %chin% .feature, cols, with=FALSE]
     ans = switch (type, 
@@ -258,7 +262,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type,
     # ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" := 
     #                       olaps$gene_id]
 
-    colorder = c("seqname", "start", "end", "length", "strand", 
+    colorder = c("seqnames", "start", "end", "length", "strand", 
                     "transcript_id", "gene_id") #, "overlaps")
     setcolorder(exons, colorder)
     exons[]
@@ -269,14 +273,16 @@ extract_feature.intron <- function(x, uniq_features, feature, type,
     stopifnot("exon" %in% uniq_features)
     # to please R CMD CHECK
     transcript_id=NULL
-    cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
+    cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
     .feature = if (is.null(feature)) "intron" else feature
     introns = x[feature %chin% .feature, cols, with=FALSE]
     if (nrow(introns) == 0L) {
         # most likely the object doesn't have introns, so construct them
         warning("No introns found in input object. Attempting to construct 
             introns using construct_introns().")
-        introns = construct_introns(x, update=FALSE)
+        xx = new(class(x)[2L], as_granges(x))
+        introns = as_data_table(construct_introns(xx, update=FALSE))
+        setattr(introns, 'class', data.table::copy(class(x)))
     }
     if (nrow(introns) == 0L)
         stop("construct_introns() resulted in 0 introns as well. 
@@ -348,15 +354,40 @@ extract_feature.intron <- function(x, uniq_features, feature, type,
     # ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" := 
     #                               olaps$gene_id]
 
-    colorder = c("seqname", "start", "end", "length", "strand", 
+    colorder = c("seqnames", "start", "end", "length", "strand", 
                     "transcript_id", "gene_id") #, "overlaps")
     setcolorder(introns, colorder)
     introns[]
 }
 
-# ----- Helper functions for extract_features ----- #
+# ----- Helper functions for extract_features --------------------------------
 
 is.gtf <- function(x) inherits(x, 'gtf')
 is.gff <- function(x) inherits(x, 'gff')
 is.bed <- function(x) inherits(x, 'bed')
 is.bam <- function(x) inherits(x, 'bam')
+
+#' @title Convert gtf/gff/bam/bed S4 objects to data.table and preserve class
+#' 
+#' @description \code{as.data.table} converts the input object to 
+#' \code{data.table}, but strips away all other classes except 
+#' \code{data.table} and \code{data.frame}. 
+#' 
+#' This internal function is a simple wrapper to \code{as.data.table} but 
+#' also retains the extra class information of the input object.
+#' @aliases as_data_table
+#' @return A data.table object with input class preserved.
+#' @examples
+#' \dontrun{
+#' setClass("foo", contains="GRanges")
+#' x = new("foo", GRanges(letters[1:5], IRanges(1:5, 6:10)))
+#' class(x) # [1] "foo"
+#' class(gread:::as_data_table(x)) # [1] "foo" "data.table" "data.frame"
+#' }
+as_data_table <- function(x, reset_class=FALSE) {
+    stopifnot(inherits(x, "GRanges"))
+    cl = if (identical(class(x), "GRanges") || reset_class) NULL else class(x)
+    ans = as.data.table(x)
+    setattr(ans, 'class', c(cl, "data.table", "data.frame"))
+    ans[]
+}