From 94b397f2a4906379f096d621fa449a9d0aefb929 Mon Sep 17 00:00:00 2001
From: asrinivasan-oa <asrinivasan@openanalytics.eu>
Date: Mon, 25 Jul 2016 13:21:36 +0200
Subject: [PATCH] All inputs and outputs of exported functions are Ranges
 objects now.

---
 DESCRIPTION                                   |   4 +-
 NAMESPACE                                     |   9 -
 R/as-granges.R                                |  42 +++
 ...onstruct_introns.R => construct-introns.R} |  29 +-
 R/{extract_features.R => extract-features.R}  |  77 +++--
 R/gread-format.R                              | 137 ++++++++
 R/gread.R                                     |   1 -
 R/{as_granges.R => internal-funs.R}           |  84 ++---
 R/intersect-bed.R                             |  22 +-
 R/{non_overlaps.R => non-overlaps.R}          |  40 ++-
 R/read-format.R                               | 148 +++++++++
 R/read_format.R                               | 311 ------------------
 R/s4-setup.R                                  |   7 +
 ...upported_formats.R => supported-formats.R} |   2 +-
 R/{test_gread.R => test-gread.R}              |   3 +-
 R/{tidy_format.R => tidy-format.R}            |  65 +---
 inst/tests/tests.Rraw                         | 215 ++++++------
 man/as_bam.Rd                                 |   2 +-
 man/as_data_table.Rd                          |  28 ++
 man/as_granges.Rd                             |  11 +-
 man/construct_introns.Rd                      |   9 +-
 man/disjoin_overlaps.Rd                       |   2 +-
 man/extract.Rd                                |  12 +-
 man/find_overlaps.Rd                          |   2 +-
 man/intersect_bed.Rd                          |   5 +-
 man/intersect_overlaps.Rd                     |   2 +-
 man/non_overlaps.Rd                           |   6 +-
 man/read_format.Rd                            |  52 +--
 man/reduce_overlaps.Rd                        |   2 +-
 man/shallow.Rd                                |   2 +-
 man/strictly_nonunique.Rd                     |   2 +-
 man/supported_formats.Rd                      |   4 +-
 man/test_gread.Rd                             |   5 +-
 man/tidy_cols.Rd                              |  85 -----
 34 files changed, 641 insertions(+), 786 deletions(-)
 create mode 100644 R/as-granges.R
 rename R/{construct_introns.R => construct-introns.R} (76%)
 rename R/{extract_features.R => extract-features.R} (86%)
 create mode 100644 R/gread-format.R
 rename R/{as_granges.R => internal-funs.R} (80%)
 rename R/{non_overlaps.R => non-overlaps.R} (85%)
 create mode 100644 R/read-format.R
 delete mode 100644 R/read_format.R
 create mode 100644 R/s4-setup.R
 rename R/{supported_formats.R => supported-formats.R} (87%)
 rename R/{test_gread.R => test-gread.R} (99%)
 rename R/{tidy_format.R => tidy-format.R} (69%)
 create mode 100644 man/as_data_table.Rd
 delete mode 100644 man/tidy_cols.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 647b0f1..9a4d3a6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -8,7 +8,8 @@ Authors@R: c(
     c("aut", "cre")), person("Open Analytics", role = "cph"))
 Maintainer: Arunkumar Srinivasan <asrinivasan@openanalytics.eu>
 Depends:
-    R (>= 3.3)
+    R (>= 3.3),
+    GenomicRanges
 Imports:
     stats,
     utils,
@@ -20,7 +21,6 @@ Imports:
     Rsamtools,
     rtracklayer,
     IRanges,
-    GenomicRanges,
     GenomicFeatures,
     GenomicAlignments
 Suggests:
diff --git a/NAMESPACE b/NAMESPACE
index 58d1f3d..79408ac 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,11 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
-S3method(tidy_cols,bam)
-S3method(tidy_cols,bed)
-S3method(tidy_cols,default)
-S3method(tidy_cols,gff)
-S3method(tidy_cols,gtf)
-export(as_granges)
 export(construct_introns)
 export(extract)
 export(read_bam)
@@ -14,7 +8,6 @@ export(read_format)
 export(read_gff)
 export(read_gtf)
 export(supported_formats)
-export(tidy_cols)
 import(GenomicFeatures)
 import(data.table)
 import(methods)
@@ -27,8 +20,6 @@ importFrom(GenomicAlignments,readGAlignments)
 importFrom(GenomicAlignments,rname)
 importFrom(GenomicAlignments,summarizeOverlaps)
 importFrom(GenomicAlignments,width)
-importFrom(GenomicRanges,GRanges)
-importFrom(GenomicRanges,split)
 importFrom(IRanges,IRanges)
 importFrom(Rsamtools,ScanBamParam)
 importFrom(Rsamtools,scanBamFlag)
diff --git a/R/as-granges.R b/R/as-granges.R
new file mode 100644
index 0000000..525aa77
--- /dev/null
+++ b/R/as-granges.R
@@ -0,0 +1,42 @@
+#' @title Convert data.table to GRanges object
+#' 
+#' @description Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam} 
+#' or a valid \code{data.table} to a GRanges object.
+#'
+#' @param x An object of class \code{gtf}, \code{gff}, \code{bed} or 
+#' \code{bam} or a valid \code{data.table} object.
+#' @param ignore_strand Logical argument to pass to \code{GRanges} function. 
+#' Indicates whether \code{strand} should be ignored when constructing 
+#' \code{GRanges} object or not. Default is \code{FALSE}.
+#' @return A \code{GRanges} object.
+#' @aliases as_granges
+#' @examples
+#' path <- system.file("tests", package="gread")
+#' gff_file <- file.path(path, "sample.gff")
+#' gtf_file <- file.path(path, "sample.gtf")
+#' bed_file <- file.path(path, "sample.bed")
+#' bam_file <- file.path(path, "sample.bam")
+#' 
+#' gff <- read_format(gff_file)
+#' gtf <- read_format(gtf_file)
+#' bed <- read_format(bed_file)
+#' bam <- read_format(bam_file)
+#' 
+#' as_granges(gff)
+#' as_granges(gtf)
+#' as_granges(bed)
+#' as_granges(bam)
+#' 
+#' as_granges(gff, ignore_strand=FALSE)
+#' as_granges(gtf, ignore_strand=FALSE)
+#' as_granges(bed, ignore_strand=FALSE)
+#' as_granges(bam, ignore_strand=FALSE)
+#' @seealso \code{\link{read_format}} \code{\link{extract}} 
+#' \code{\link{construct_introns}}
+as_granges <- function(x, ignore_strand=FALSE) {
+
+    stopifnot(is.gtf(x)||is.gff(x)||is.bed(x)||is.bam(x)||is.data.table(x))
+    x = shallow(x)
+    if (ignore_strand) x[, "strand" := NULL]
+    as(setDF(x), "GRanges")
+}
diff --git a/R/construct_introns.R b/R/construct-introns.R
similarity index 76%
rename from R/construct_introns.R
rename to R/construct-introns.R
index 8f2a27c..e24f981 100644
--- a/R/construct_introns.R
+++ b/R/construct-introns.R
@@ -1,4 +1,4 @@
-#' @title Construct introns from \code{gtf} or \code{gff} objects
+#' @title Construct introns from gtf/gff objects
 #'
 #' @description This function generates intronic coordinates by extracting 
 #' all the \code{exons} from a \code{gtf} or \code{gff} object.
@@ -9,7 +9,8 @@
 #' and returned invisibily, else just the \code{intron} coordinates are 
 #' returned.
 #' @return An object of class \code{gtf/gff} with updated \code{intron} 
-#' coordinates or just the intron coordinates depending on \code{update}.
+#' coordinates or just the intron coordinates depending on \code{update}. 
+#' They all inherit from \code{GRanges}.
 #' @export
 #' @examples
 #' path <- system.file("tests", package="gread")
@@ -21,20 +22,21 @@
 #' # same as above, but returns only constructed introns
 #' introns <- construct_introns(gtf, update=FALSE)
 #' @seealso \code{\link{supported_formats}} \code{\link{read_format}} 
-#' \code{\link{extract}} \code{\link{tidy_cols}} \code{\link{as_granges}} 
+#' \code{\link{extract}} \code{\link{as_granges}} 
 construct_introns <- function(x, update=TRUE) {
-    stopifnot(inherits(x, "gtf") || inherits(x, "gff"), 
-                "feature" %chin% names(x), 
+    stopifnot(is.gtf(x) || is.gff(x))
+    x = as_data_table(x)
+    stopifnot("feature" %chin% names(x), 
                 "transcript_id" %chin% names(x), 
                 update %in% c(TRUE, FALSE))
 
     # to please R CMD CHECK
-    feature=seqname=transcript_id=NULL
+    feature=seqnames=transcript_id=NULL
     x_class = copy(class(x))
     exons = x[feature == "exon"]
     if (!nrow(exons)) stop("feature == 'exon' returned 0 rows.")
-    setorderv(exons, c("transcript_id", "seqname", "start", "end", "strand"))
-    introns = exons[, .(seqname = seqname[1L], 
+    setorderv(exons, c("transcript_id", "seqnames", "start", "end", "strand"))
+    introns = exons[, .(seqnames = seqnames[1L], 
                         start = end[seq_len(.N-1L)]+1L, 
                         end = start[seq_len(.N-1L)+1L]-1L,
                         feature = "intron",
@@ -45,7 +47,7 @@ construct_introns <- function(x, update=TRUE) {
         stop("Exons for transcript ids [", paste(ids, collaspse=" "), 
             "] have start > end")
     exons[, c(setdiff(names(introns), "transcript_id")) := NULL]
-    exons = unique(exons, by="transcript_id")
+    exons = unique(shallow(exons, reset_class=TRUE), by="transcript_id")
     ecols = names(exons)
     introns[exons, (ecols) := mget(ecols), on="transcript_id"]
     # reset all exon related cols
@@ -53,14 +55,15 @@ construct_introns <- function(x, update=TRUE) {
     colorder = names(x)
     if (update) {
         x = rbind(x, introns)
-        setorderv(x, c("seqname", "start", "end"))
+        setorderv(x, c("seqnames", "start", "end"))
         x = x[, .SD, by="transcript_id"]
         setattr(x, 'class', x_class)
         setcolorder(x, colorder)
-        return(x)
+        ans = x
     } else {
         setcolorder(introns, colorder)
-            setattr(introns, 'class', unique(c("intron", x_class)))
-        return(introns[])
+        setattr(introns, 'class', c("intron", "data.table", "data.frame"))
+        ans = introns
     }
+    new(class(ans)[1L], as(setDF(ans), "GRanges"))
 }
diff --git a/R/extract_features.R b/R/extract-features.R
similarity index 86%
rename from R/extract_features.R
rename to R/extract-features.R
index a790818..f7e83ed 100644
--- a/R/extract_features.R
+++ b/R/extract-features.R
@@ -6,7 +6,8 @@
 #' @details Extract features based on various criteria (usually intended for  
 #' obtaining read counts using \code{gcount} for a given \code{bam} file.
 #' 
-#' @param x Input object of class \code{gtf} or \code{gff}.
+#' @param x Input object of class \code{gtf} or \code{gff} which inherits 
+#' from \code{GRanges}.
 #' @param feature A character vector of (usually related) features to extract 
 #' from. One of \code{"gene_exon"}, \code{"gene"}, \code{"gene_intron"}, 
 #' \code{"exon"}, \code{"intron"}. NB: \code{"exon"} feature must be present 
@@ -51,10 +52,9 @@
 #' @return An object of class \code{"gene"} when \code{feature} is 
 #' \code{"gene"}, \code{"gene_exon"} or \code{"gene_intron"}, and of class 
 #' \code{"exon"} and \code{"intron"} when \code{feature} is \code{"exon"} or 
-#' \code{"intron"} respectively.
-#' @seealso \code{\link{read_format}} \code{\link{tidy_cols}} 
-#' \code{\link{as_granges}} \code{\link{extract}} 
-#' \code{\link{construct_introns}}
+#' \code{"intron"} respectively. They all inherit from \code{GRanges}.
+#' @seealso \code{\link{read_format}} \code{\link{as_granges}} 
+#' \code{\link{extract}} \code{\link{construct_introns}}
 #' @export
 #' @examples
 #' path <- system.file("tests", package="gread")
@@ -74,14 +74,18 @@ extract <- function(x, feature=c("gene_exon", "gene", "gene_intron", "exon",
     splits  = unlist(strsplit(feature, "_", fixed=TRUE))
     feature = splits[1L]
     subfeature = if (is.na(splits[2L])) NULL else splits[2L]
-    stopifnot(is.gtf(x) || is.gff(x), "feature" %in% names(x), 
-        ignore_strand %in% c(FALSE, TRUE), transcript_id %in% names(x), 
-        gene_id %in% names(x), nrow(x)>0L)
-    x = shallow(x) # TODO: use data.table:::shallow when exported
+    stopifnot(is.gtf(x) || is.gff(x))
+    x <- as_data_table(x)
+    stopifnot("feature" %in% names(x), 
+                ignore_strand %in% c(FALSE, TRUE), 
+                transcript_id %in% names(x), 
+                gene_id %in% names(x), nrow(x)>0L)
     setattr(x, 'class', c(match.arg(feature), class(x)))
-    setnames(x, c(transcript_id, gene_id), c("transcript_id", "gene_id"))
-    extract_feature(x, unique(x$feature), subfeature, 
+    setnames(x, c(transcript_id, gene_id), 
+                c("transcript_id", "gene_id"))
+    ans = extract_feature(x, unique(x$feature), subfeature, 
                 match.arg(type), ignore_strand, ...)
+    new(class(ans)[1L], as(setDF(ans), "GRanges"))
 }
 
 # ----- internal functions for extracting specific features ----- #
@@ -99,24 +103,24 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
                         ignore_strand=FALSE, ...) {
 
     # to pleae R CMD CHECK
-    seqname=gene_id=transcript_id=NULL
+    seqnames=gene_id=transcript_id=NULL
     if (is.null(feature)) {
         if (!"exon" %in% uniq_features)
             stop("'exon' must be a feature in input object 'x'.")
         construct_transcripts <- function(x) {
-            x[feature %chin% "exon", .(seqname=seqname[1L], start=min(start), 
+            x[feature %chin% "exon", .(seqnames=seqnames[1L], start=min(start), 
                     end=max(end), strand=strand[1L], gene_id=gene_id[1L]), 
             by="transcript_id"]
         }
         transcripts = construct_transcripts(x)
     } else {
-        cols=c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
+        cols=c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
         .feature = feature
         transcripts = x[feature %chin% .feature, cols, with=FALSE]
     }
     ans = switch (type, 
                     default = {
-                        genes = transcripts[, .(seqname=seqname[1L], 
+                        genes = transcripts[, .(seqnames=seqnames[1L], 
                             start=min(start), end=max(end), strand=strand[1L], 
                         transcript_id=paste(unique(transcript_id), 
                             collapse=";")), by="gene_id"]
@@ -149,7 +153,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
                     }, 
                     intersect = {
                         genes = transcripts[, if (max(start) <= min(end)) 
-                                .(seqname=seqname[1L], start=max(start), 
+                                .(seqnames=seqnames[1L], start=max(start), 
                                 end=min(end), strand=strand[1L], 
                                 transcript_id=paste(unique(transcript_id), 
                                     collapse=";")), by="gene_id"]
@@ -176,7 +180,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
                 collapse=";")), by="queryHits"]
     genes[, "overlaps":=gene_id][olaps$queryHits, "overlaps":=olaps$gene_id]
 
-    colorder = c("seqname", "start", "end", "length", "strand", 
+    colorder = c("seqnames", "start", "end", "length", "strand", 
                     "transcript_id", "gene_id", "overlaps")
     setcolorder(genes, colorder)
     genes[]
@@ -188,7 +192,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type,
 
     # to please R CMD CHECK
     transcript_id=NULL
-    cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
+    cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
     .feature = if (is.null(feature)) "exon" else feature
     exons = x[feature %chin% .feature, cols, with=FALSE]
     ans = switch (type, 
@@ -258,7 +262,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type,
     # ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" := 
     #                       olaps$gene_id]
 
-    colorder = c("seqname", "start", "end", "length", "strand", 
+    colorder = c("seqnames", "start", "end", "length", "strand", 
                     "transcript_id", "gene_id") #, "overlaps")
     setcolorder(exons, colorder)
     exons[]
@@ -269,14 +273,16 @@ extract_feature.intron <- function(x, uniq_features, feature, type,
     stopifnot("exon" %in% uniq_features)
     # to please R CMD CHECK
     transcript_id=NULL
-    cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
+    cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
     .feature = if (is.null(feature)) "intron" else feature
     introns = x[feature %chin% .feature, cols, with=FALSE]
     if (nrow(introns) == 0L) {
         # most likely the object doesn't have introns, so construct them
         warning("No introns found in input object. Attempting to construct 
             introns using construct_introns().")
-        introns = construct_introns(x, update=FALSE)
+        xx = new(class(x)[2L], as_granges(x))
+        introns = as_data_table(construct_introns(xx, update=FALSE))
+        setattr(introns, 'class', data.table::copy(class(x)))
     }
     if (nrow(introns) == 0L)
         stop("construct_introns() resulted in 0 introns as well. 
@@ -348,15 +354,40 @@ extract_feature.intron <- function(x, uniq_features, feature, type,
     # ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" := 
     #                               olaps$gene_id]
 
-    colorder = c("seqname", "start", "end", "length", "strand", 
+    colorder = c("seqnames", "start", "end", "length", "strand", 
                     "transcript_id", "gene_id") #, "overlaps")
     setcolorder(introns, colorder)
     introns[]
 }
 
-# ----- Helper functions for extract_features ----- #
+# ----- Helper functions for extract_features --------------------------------
 
 is.gtf <- function(x) inherits(x, 'gtf')
 is.gff <- function(x) inherits(x, 'gff')
 is.bed <- function(x) inherits(x, 'bed')
 is.bam <- function(x) inherits(x, 'bam')
+
+#' @title Convert gtf/gff/bam/bed S4 objects to data.table and preserve class
+#' 
+#' @description \code{as.data.table} converts the input object to 
+#' \code{data.table}, but strips away all other classes except 
+#' \code{data.table} and \code{data.frame}. 
+#' 
+#' This internal function is a simple wrapper to \code{as.data.table} but 
+#' also retains the extra class information of the input object.
+#' @aliases as_data_table
+#' @return A data.table object with input class preserved.
+#' @examples
+#' \dontrun{
+#' setClass("foo", contains="GRanges")
+#' x = new("foo", GRanges(letters[1:5], IRanges(1:5, 6:10)))
+#' class(x) # [1] "foo"
+#' class(gread:::as_data_table(x)) # [1] "foo" "data.table" "data.frame"
+#' }
+as_data_table <- function(x, reset_class=FALSE) {
+    stopifnot(inherits(x, "GRanges"))
+    cl = if (identical(class(x), "GRanges") || reset_class) NULL else class(x)
+    ans = as.data.table(x)
+    setattr(ans, 'class', c(cl, "data.table", "data.frame"))
+    ans[]
+}
diff --git a/R/gread-format.R b/R/gread-format.R
new file mode 100644
index 0000000..59b9fff
--- /dev/null
+++ b/R/gread-format.R
@@ -0,0 +1,137 @@
+gread <- function(token) {
+    UseMethod("gread")
+}
+
+gread.default <- function(token) {
+    stop("No method available for format ", gsub("(.*)_format$", 
+            "", class(token)))
+}
+
+gread.gtf_format <- function(token) {
+    # to please R CMD CHECK
+    score=phase=NULL
+    # rtracklayer::readGFF reads gff v1 'attributes' column as 'group',
+    # but we want it to be always 'attributes'. Also, it tidies up the 
+    # attributes column by default
+    rtracklayer_fun <- function() {
+        ans = setDT(rtracklayer::readGFF(token$file, version = 2L, 
+                columns = rtracklayer::GFFcolnames()))
+        ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][]
+        # remove additional attributes
+        setattr(ans, 'pragmas', NULL)
+        setattr(ans, 'attrcol_fmt', NULL)
+        setattr(ans, 'ncol0', NULL)
+        setattr(ans, 'ntag', NULL)
+        setattr(ans, 'raw_data', NULL)
+        ans
+    }
+    fread_fun <- function() fread(token$file, colClasses = token$types, 
+        showProgress = token$verbose)
+    read_table_fun <- function() read_table(token$file, sep = "\t", 
+        header = FALSE, comment.char = "#", nrows = -1L, colClasses = 
+        token$types, quote = "")
+    ans = tryCatch(rtracklayer_fun(), error = function(o) { 
+                if (token$verbose) cat("rtracklayer::readGFF failed to read", 
+                    " the gtf file. Reverting to data.table::fread.\n", sep="")
+                tryCatch(fread_fun(), error = function(o) {
+                    if (token$verbose) cat("data.table::fread failed to read", 
+                        " as well. Reverting to base::read.table.\n", sep="")
+                    read_table_fun()
+                })
+          })
+    setnames(ans, head(names(ans), 9L), token$names)
+    if (is.character(ans[["score"]])) 
+        ans[, "score" := suppressWarnings(as.numeric(score))]
+    if (is.character(ans[["phase"]])) 
+        ans[, "phase" := suppressWarnings(as.integer(phase))]
+    setattr(ans, 'class', c("gtf", "data.table", "data.frame"))
+}
+
+gread.gff_format <- function(token) {
+    # to please R CMD CHECK
+    score=phase=NULL
+    # rtracklayer::readGFF reads gff v1 'attributes' column as 'group',
+    # but we want it to be always 'attributes'. Also, it tidies up the 
+    # attributes column by default
+    rtracklayer_fun <- function() {
+        ans = setDT(as.data.frame(rtracklayer::readGFF(token$file, columns = 
+                rtracklayer::GFFcolnames())))
+        if (!token$tidy_cols) set(ans, j = tail(names(ans), -9L), value = NULL)
+        ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][]
+        list_cols = which(vapply(ans, is.list, TRUE))
+        if (length(list_cols)) {
+            list_paste <- function(x) sapply(x, paste, collapse=",")
+            # paste all values of list cols together
+            ans[, (list_cols) := lapply(.SD, list_paste), .SDcols=list_cols]
+        }
+        # as.data.frame drops all extra attributes
+        ans
+    }
+    fread_fun <- function() fread(token$file, colClasses = token$types, 
+        showProgress = token$verbose)
+    read_table_fun <- function() read_table(token$file, sep = "\t", 
+        header = FALSE, comment.char = "#", nrows = -1L, colClasses = 
+        token$types, quote = "")
+    ans = tryCatch(rtracklayer_fun(), error = function(o) { 
+                if (token$verbose) cat("rtracklayer::readGFF failed to read",
+                    " the gtf file. Reverting to data.table::fread.\n", sep="")
+                tryCatch(fread_fun(), error = function(o) {
+                    if (token$verbose) cat("data.table::fread failed to read", 
+                        " as well. Reverting to base::read.table.\n", sep="")
+                    read_table_fun()
+                })
+          })
+    setnames(ans, head(names(ans), 9L), token$names)
+    if (is.character(ans[["score"]])) 
+        ans[, "score" := suppressWarnings(as.numeric(score))]
+    if (is.character(ans[["phase"]])) 
+        ans[, "phase" := suppressWarnings(as.integer(phase))]
+    setattr(ans, 'class', c("gff", "data.table", "data.frame"))
+}
+
+gread.bed_format <- function(token) {
+    ans = tryCatch(fread(token$file, colClasses = token$types, sep="\t", 
+                showProgress = token$verbose), error = function(o) {
+                    if (token$verbose) cat("data.table::fread failed to read", 
+                        " the bed file. Reverting to 'read.table'", 
+                        " from base R.\n", sep="")
+                    read_table(token$file, sep = "\t", header = FALSE, 
+                        comment.char = "#", nrows = -1L, colClasses = 
+                        token$types, quote = "")
+                    })
+    # setnames will error if bed file has < 3 cols
+    setnames(ans, head(token$names, max(3L, ncol(ans))))
+    # ans[, start := start+1L] # TODO: should we automatically set start+1L?
+    setattr(ans, 'class', c("bed", "data.table", "data.frame"))
+}
+
+gread.bam_format <- function(token) {
+    if (token$verbose) 
+        cat("Loading bam file: ",token$file,", please wait...\n", sep="")
+    stopifnot(length(token$file)==1L)
+    
+    what  = c("flag")
+    flags = Rsamtools::scanBamFlag(isUnmappedQuery=FALSE)
+    if (!length(token$chromosomes)) {
+        param = Rsamtools::ScanBamParam(what=what, tag=token$tags, flag=flags)
+    } else {
+            hdr = Rsamtools::scanBamHeader(token$file)[[1]]$targets
+            hdr = hdr[names(hdr) %in% token$chromosomes]
+            if (!all(names(hdr) %in% token$chromosomes)) {
+                stop("chromosomes [", paste(setdiff(token$chromosomes, 
+                    names(hdr)), collapse=", "), "] not found in bam file.")
+            }
+            which = as_granges(data.table(seqnames = names(hdr), 
+                        start=1L, end = hdr, strand="*"), ignore_strand=TRUE)
+            param = Rsamtools::ScanBamParam(what = what, which = which, 
+                                tag = token$tags, flag = flags)
+    }
+    # load bam with corresponding param
+    ans = as_bam(GenomicAlignments::readGAlignments(token$file, param=param))
+    setattr(ans, 'class', c("bam", "data.table", "data.frame"))
+}
+
+# Internal helper functions --------------------------------------------------
+read_table <- function(file, ...) {
+    setDT(read.table(file, stringsAsFactors=FALSE, as.is=TRUE, ...))
+}
diff --git a/R/gread.R b/R/gread.R
index 8b02969..415feee 100644
--- a/R/gread.R
+++ b/R/gread.R
@@ -38,7 +38,6 @@
 #' @importFrom utils capture.output head packageVersion read.table tail
 #' @importFrom stats na.omit
 #' @importFrom Rsamtools ScanBamParam scanBamFlag scanBamHeader
-#' @importFrom GenomicRanges GRanges split
 #' @importFrom IRanges IRanges
 #' @importMethodsFrom GenomicRanges disjoin reduce intersect  findOverlaps 
 #' @importMethodsFrom GenomicRanges countOverlaps seqnames start end strand
diff --git a/R/as_granges.R b/R/internal-funs.R
similarity index 80%
rename from R/as_granges.R
rename to R/internal-funs.R
index 7786fa7..bdcb527 100644
--- a/R/as_granges.R
+++ b/R/internal-funs.R
@@ -1,49 +1,4 @@
-## internal functions ---------------------------------
-
-#' @title Convert to GRanges object
-#' 
-#' @description Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam} 
-#' or a valid \code{data.table} to a GRanges object, preserving all 
-#' additional columns.
-#'
-#' @param x An object of class \code{gtf}, \code{gff}, \code{bed} or 
-#' \code{bam} or a valid \code{data.table} object.
-#' @param ignore_strand Logical argument to pass to \code{GRanges} function. 
-#' Indicates whether \code{strand} should be ignored when constructing 
-#' \code{GRanges} object or not. Default is \code{FALSE}.
-#' @return A \code{GRanges} object.
-#' @export
-#' @aliases as_granges
-#' @examples
-#' path <- system.file("tests", package="gread")
-#' gff_file <- file.path(path, "sample.gff")
-#' gtf_file <- file.path(path, "sample.gtf")
-#' bed_file <- file.path(path, "sample.bed")
-#' bam_file <- file.path(path, "sample.bam")
-#' 
-#' gff <- read_format(gff_file)
-#' gtf <- read_format(gtf_file)
-#' bed <- read_format(bed_file)
-#' bam <- read_format(bam_file)
-#' 
-#' as_granges(gff)
-#' as_granges(gtf)
-#' as_granges(bed)
-#' as_granges(bam)
-#' 
-#' as_granges(gff, ignore_strand=FALSE)
-#' as_granges(gtf, ignore_strand=FALSE)
-#' as_granges(bed, ignore_strand=FALSE)
-#' as_granges(bam, ignore_strand=FALSE)
-#' @seealso \code{\link{read_format}} 
-#' \code{\link{tidy_cols}} \code{\link{extract}} \code{\link{construct_introns}}
-as_granges <- function(x, ignore_strand=FALSE) {
-
-    stopifnot(is.gtf(x)||is.gff(x)||is.bed(x)||is.bam(x)||is.data.table(x))
-    x = shallow(x)
-    if (ignore_strand) x[, "strand" := NULL]
-    as(setDF(x), "GRanges")
-}
+## internal helper functions -------------------------------------------------
 
 #' @title Return only those rows where rows per group is > 1.
 #'
@@ -122,7 +77,7 @@ find_overlaps <- function(x, y, ignore_redundant=FALSE,
 #' @return A \code{data.table} with reduced ranges.
 reduce_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) {
     # to please R CMD CHECK
-    group=seqname=i.strand=`..N..`=NULL
+    group=seqnames=i.strand=`..N..`=NULL
     stopifnot(length(by) == 1L, by %in% names(x), is.gtf(x) || is.gff(x) 
         || is.bed(x) || is.bam(x), ignore_strand %in% c(FALSE, TRUE))
     red = as.data.frame(GenomicRanges::reduce(GenomicRanges::split(as_granges(
@@ -130,11 +85,10 @@ reduce_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) {
     setDT(red)[, group := NULL]
     setcolorder(red, c("seqnames", "start", "end", "width", "strand", 
                         "group_name"))
-    setnames(red, c("seqnames", "width", "group_name"), c("seqname", 
-                        "length", by))
-    red[, `:=`(seqname=as.character(seqname), strand=as.character(strand))]
+    setnames(red, c("width", "group_name"), c("length", by))
+    red[, `:=`(seqnames=as.character(seqnames), strand=as.character(strand))]
     # restore original order, nomatch = "errror" would be great to have here!
-    ux = unique(shallow(x, c(by, "strand")), by=c(by))
+    ux = unique(shallow(x, c(by, "strand"), reset_class=TRUE), by=c(by))
     red = red[ux, on=c(by)]
     # if ignore_strand, replace strand
     if (ignore_strand) red[, strand := i.strand]
@@ -162,18 +116,17 @@ disjoin_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) {
         is.bed(x) || is.bam(x), ignore_strand %in% c(FALSE, TRUE))
 
     # to please R CMD CHECK
-    group=seqname=i.strand=NULL
+    group=seqnames=i.strand=NULL
     dj = as.data.frame(GenomicRanges::disjoin(GenomicRanges::split(as_granges(
                         x, ignore_strand), x[[by]])))
     setDT(dj)[, group := NULL]
     setcolorder(dj, c("seqnames", "start", "end", "width", "strand", 
                         "group_name"))
-    setnames(dj, c("seqnames", "width", "group_name"), c("seqname", 
-                        "length", by))
-    dj[, `:=`(seqname=as.character(seqname), strand=as.character(strand))]
+    setnames(dj, c("width", "group_name"), c("length", by))
+    dj[, `:=`(seqnames=as.character(seqnames), strand=as.character(strand))]
     # restore original order
     # TODO: revisit when nomatch = "error" is implemented
-    ux = unique(shallow(x, c(by, "strand")), by=c(by))
+    ux = unique(shallow(x, c(by, "strand"), reset_class=TRUE), by=c(by))
     dj = dj[ux, on=c(by)]
     # if ignore_strand, replace strand
     if (ignore_strand) dj[, strand := i.strand]
@@ -197,7 +150,7 @@ disjoin_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) {
 #' @return A \code{data.table} with intersecting ranges.
 intersect_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) {
     # to please R CMD CHECK
-    group=seqname=i.strand=NULL
+    group=seqnames=i.strand=NULL
     stopifnot(length(by) == 1L, by %in% names(x), is.gtf(x) || is.gff(x) || 
         is.bed(x) || is.bam(x), ignore_strand %in% c(FALSE, TRUE))
     s_gr = GenomicRanges::split(as_granges(x, ignore_strand), x[[by]])
@@ -205,11 +158,10 @@ intersect_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) {
     setDT(isect)[, group := NULL]
     setcolorder(isect, c("seqnames", "start", "end", "width", "strand", 
                         "group_name"))
-    setnames(isect, c("seqnames", "width", "group_name"), c("seqname", 
-                        "length", by))
-    isect[, `:=`(seqname=as.character(seqname), strand=as.character(strand))]
+    setnames(isect, c("width", "group_name"), c("length", by))
+    isect[, `:=`(seqnames=as.character(seqnames), strand=as.character(strand))]
     # restore original order, nomatch = "errror" would be great to have here!
-    ux = unique(shallow(x, c(by, "strand")), by=c(by))
+    ux = unique(shallow(x, c(by, "strand"), reset_class=TRUE), by=c(by))
     isect = isect[ux, on=c(by)]
     # if ignore_strand, replace strand
     if (ignore_strand) isect[, strand := i.strand]
@@ -236,14 +188,16 @@ intersect_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) {
 #' y <- gread:::shallow(x) # only copies column pointers
 #' class(y) # class(x) is retained
 #' }
-shallow <- function(x, cols = names(x)) {
+shallow <- function(x, cols = names(x), reset_class = FALSE) {
     stopifnot(is.data.table(x), all(cols %in% names(x)))
     ans = vector("list", length(cols))
     setattr(ans, 'names', data.table::copy(cols))
     for (col in cols)
         ans[[col]] = x[[col]]
     setDT(ans)
-    setattr(ans, 'class', data.table::copy(class(x)))
+    class = if (!reset_class) data.table::copy(class(x)) 
+            else c("data.table", "data.frame")
+    setattr(ans, 'class', class)
     ans[]
 }
 
@@ -272,7 +226,7 @@ shallow <- function(x, cols = names(x)) {
 as_bam <- function(x) {
     stopifnot(inherits(x, "GAlignments"))
     ans = list(
-            seqname=as.character(seqnames(x)), 
+            seqnames=as.character(seqnames(x)), 
             start=start(x), 
             end=end(x), 
             strand=as.character(strand(x)),
@@ -306,7 +260,7 @@ as_bam <- function(x) {
 # #' @return A \code{TxDb} object.
 # #' @aliases as_txdb
 # #' @seealso \code{\link{as_granges}} \code{\link{read_format}} 
-# #' \code{\link{tidy_cols}} \code{\link{extract}}
+# #' \code{\link{extract}}
 # #' @export
 # #' @examples
 # #' path <- system.file("tests", package="gread")
diff --git a/R/intersect-bed.R b/R/intersect-bed.R
index 6bb9dc9..759bb14 100644
--- a/R/intersect-bed.R
+++ b/R/intersect-bed.R
@@ -7,7 +7,7 @@
 #' 
 #' @param gtf_file Full path to a \code{gtf} file.
 #' @param bed_file Full path to a \code{bed} file. The file must contain at 
-#' least three columns, with them corresponding to \code{seqname} (or 
+#' least three columns, with them corresponding to \code{seqnames} (or 
 #' \code{chr}), \code{start} and \code{end}.
 #' @param select_features A 1-column \code{data.table} or a named list of 
 #' length=1. The name of the list indicates the column in the \code{gtf} file 
@@ -15,16 +15,15 @@
 #' containing the values to \emph{retain}.
 #' @seealso \code{\link{read_format}}, \code{\link{non_overlaps}}, 
 #' \code{\link{construct_introns}}, \code{\link{extract}}
-#' \code{\link{tidy_cols}}
-#' @return A \code{gtf} object containing just those transcripts that overlap 
-#' with the provided bed file.
+#' @return A \code{gtf} object that inherits from \code{GRanges} containing 
+#' just those transcripts that overlap with the provided bed file.
 intersect_bed <- function(gtf_file, bed_file, select_features) {
 
     # to please R CMD CHECK
-    feature=seqname=transcript_id=NULL
-    # load seqname, start and end columns
+    feature=seqnames=transcript_id=NULL
+    # load seqnames, start and end columns
     bed = read_format(bed_file)
-    setnames(bed, names(bed)[1:3], c("seqname", "start", "end"))
+    setnames(bed, names(bed)[1:3], c("seqnames", "start", "end"))
     bed[, "start" := start+1L]
     setkey(bed)
 
@@ -33,8 +32,8 @@ intersect_bed <- function(gtf_file, bed_file, select_features) {
         select_features = unique(as.data.table(select_features))
         gtf = gtf[select_features, on=names(select_features)[1L], nomatch=0L]
     }
-    bed_chrs = unique(bed$seqname)
-    gtf_chrs = unique(gtf$seqname)
+    bed_chrs = unique(bed$seqnames)
+    gtf_chrs = unique(gtf$seqnames)
     if (length(diff_chrs <- setdiff(bed_chrs, gtf_chrs))) {
         if (length(diff_chrs) == length(bed_chrs))
             stop("Chromosomes in bed file are completely different from 
@@ -45,7 +44,7 @@ intersect_bed <- function(gtf_file, bed_file, select_features) {
 
     construct_transcripts <- function(x) {
         x[feature %chin% "exon", 
-            .(seqname = seqname[1L], start = min(start), 
+            .(seqnames = seqnames[1L], start = min(start), 
                 end = max(end), strand = strand[1L]), 
         by = "transcript_id"]
     }
@@ -53,5 +52,6 @@ intersect_bed <- function(gtf_file, bed_file, select_features) {
 
     olaps = foverlaps(transcripts, bed, type="any", which=TRUE, nomatch=0L)
     tids  = transcripts[olaps$xid, unique(transcript_id)]
-    gtf[.(tids), on="transcript_id", nomatch=0L]
+    ans = gtf[.(tids), on="transcript_id", nomatch=0L]
+    new("gtf", as(setDF(ans), "GRanges"))
 }
diff --git a/R/non_overlaps.R b/R/non-overlaps.R
similarity index 85%
rename from R/non_overlaps.R
rename to R/non-overlaps.R
index 46221dd..8aa5b6c 100644
--- a/R/non_overlaps.R
+++ b/R/non-overlaps.R
@@ -14,8 +14,8 @@
 #' features \code{intron} and \code{exon}.
 #' @param transcript_id Column name in \code{x} corresponding to transcript id.
 #' @param gene_id Column name in \code{x} corresponding to gene id.
-#' @return Returns a list of data.table containing \emph{non overlapping 
-#' exons} and \emph{non overlapping introns}.
+#' @return Returns a list of \code{GRanges} objects containing 
+#' \emph{non overlapping exons} and \emph{non overlapping introns}.
 #' @examples
 #' \dontrun{
 #' path <- system.file("tests", package="gread")
@@ -25,13 +25,19 @@
 #' }
 non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){
 
-    # to please R CMD CHECK
-    feature=idx=epos=ecnt=keep=edge=check=V1=seqname=pos=e.gid=i.gid=N=NULL
     # taken from gffutils::non_overlaps    
-    stopifnot(is.gtf(x) || is.gff(x), "feature" %chin% names(x), 
-        transcript_id %chin% names(x), gene_id %chin% names(x))
+    stopifnot(is.gtf(x) || is.gff(x))
+    x = as_data_table(x)
+    stopifnot("feature" %chin% names(x), 
+                transcript_id %chin% names(x), 
+                gene_id %chin% names(x))
+    # to please R CMD CHECK
+    feature=idx=epos=ecnt=keep=edge=check=V1=seqnames=pos=e.gid=i.gid=N=NULL
     uniq_features = unique(x[["feature"]])
-    if (!"intron" %in% uniq_features) x = construct_introns(x)
+    if (!"intron" %in% uniq_features) {
+        xx = new(class(x)[1L], as_granges(x))
+        x = as_data_table(construct_introns(xx))
+    }
     uniq_features = unique(x[["feature"]])
     stopifnot(all(c("intron", "exon") %in% uniq_features))
     i = x[feature == "intron"]
@@ -104,21 +110,21 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){
     # meaning there is a smaller intron at the same position for this gene 
     # that we've already accounted for
     ni = i[!a_minus_w$queryHits]
-    setkey(ni, "seqname", "start", "end")
+    setkey(ni, "seqnames", "start", "end")
     tmp__ = ni[[transcript_id]]
     ni[, c(transcript_id) := as.list(tmp__)]
-    ni = ni[, c("seqname", "start", "end", "strand", 
+    ni = ni[, c("seqnames", "start", "end", "strand", 
                     transcript_id, gene_id), with=FALSE]
-    count = ni[, .N, by = list(seqname, start, end)]
+    count = ni[, .N, by = list(seqnames, start, end)]
     # code here is moved to the end... July 25 (due to overlapping genes 
     # interpretation difference between intron and exon)
 
     # non_overlapping exon
     # edited May 30th 2013, redid the whole thing: July 17 2013
     # data.table ABSOLUTELY ROCKS!
-    ne = e[, list(seqname=seqname[1L], pos=c(start, end)), by=c(gene_id)]
+    ne = e[, list(seqnames=seqnames[1L], pos=c(start, end)), by=c(gene_id)]
     setkey(ne)
-    ne = ne[, list(seqname=seqname[1L], start=pos[1:(.N-1)], end=pos[2:.N]), 
+    ne = ne[, list(seqnames=seqnames[1L], start=pos[1:(.N-1)], end=pos[2:.N]), 
                     by=c(gene_id)]
 
     nolaps = find_overlaps(ne, ni, type = "any")
@@ -128,10 +134,10 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){
 
     ne = ne[!unique(nolaps$queryHits)]
     ne = ne[start != end]
-    ne = ne[, list(seqname=seqname[1L], pos=c(start, end)), by=c(gene_id)]
+    ne = ne[, list(seqnames=seqnames[1L], pos=c(start, end)), by=c(gene_id)]
     setkeyv(ne, c(gene_id, "pos"))
     ne = ne[ne[, .N, by=key(ne)][N == 1]][, N := NULL]
-    ne = ne[, list(seqname=seqname[1L], start=pos[c(TRUE, FALSE)], 
+    ne = ne[, list(seqnames=seqnames[1L], start=pos[c(TRUE, FALSE)], 
                     end=pos[c(FALSE, TRUE)]), by=c(gene_id)]
     setkeyv(ne, c(gene_id))
 
@@ -142,7 +148,7 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){
     setnames(ne.rest, "transcript_id", transcript_id)
     ne = ne[ne.rest]
     setcolorder(ne, names(ni))
-    setkey(ne, seqname, start, end)
+    setkey(ne, seqnames, start, end)
     
     # now get overlapping intron
     ni[count[N > 1], `:=`(strand=paste(unique(strand), sep="", collapse=""), 
@@ -150,11 +156,13 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){
                         gene_id=list(unique(unlist(gene_id)))), 
        by=.EACHI] ## version 1.9.3+
     setnames(ni, c("transcript_id", "gene_id"), c(transcript_id, gene_id))
-    ni = unique(ni)
+    ni = unique(shallow(ni, reset_class=TRUE))
     # calling unique on 'ni' is more efficient than this, since 'ni' is key'd. 
     # changed July 17 
     # .gff[["non_overlapping_intron"]] = ni[count, mult = "first"][, N := NULL]
     setkey(ne, NULL)
     setkey(ni, NULL)
+    ne = as(setDF(ne), "GRanges")
+    ni = as(setDF(ni), "GRanges")
     list(non_overlapping_exon = ne, non_overlapping_intron = ni)
 }
diff --git a/R/read-format.R b/R/read-format.R
new file mode 100644
index 0000000..44c23fb
--- /dev/null
+++ b/R/read-format.R
@@ -0,0 +1,148 @@
+#' @title Quick and easy reading of gtf/gff/bed/bam files
+#'
+#' @description \code{read_gtf}, \code{read_gff}, \code{read_bed} and 
+#' \code{read_bam} are specific functions corresponding to respective file 
+#' formats. 
+#' 
+#' \code{read_format} is a top-level convenience function that detects and 
+#' reads automatically from the extension and is sufficient in most cases.
+#'
+#' @param file Complete path to the input file.
+#' @param format Type of annotation. Currently allowed values are 
+#' \code{"gtf"}, \code{"gff"}, \code{"bed"} and \code{"bam"}. If no input is 
+#' provided, then it will be guessed from input file's extension.
+#' @param filter Filter rows where \code{start/stop} are invalid, and where 
+#' \code{start > end}. Default (\code{FALSE}) is to not filter, i.e., retain 
+#' all invalid rows as well.
+#' @param chromosomes Argument to \code{read_bam}. If specified only reads 
+#' from those chromosomes will be loaded.
+#' @param tags Additional (optional) tags to load from the bam file. By 
+#' default, the tags \code{"NM"} and \code{"MD"} are loaded.
+#' @param verbose If \code{TRUE}, sends useful status messages to the console. 
+#' Default is \code{FALSE}.
+#' @aliases read_format read_gtf read_gff read_bed read_bam
+#' @return An object of class \code{gtf}, \code{gff}, \code{bed} or \code{bam}, 
+#' corresponding to the input file format, that inherits from \code{GRanges}.
+#' @seealso \code{\link{supported_formats}} 
+#' \code{\link{extract}} \code{\link{construct_introns}}
+#' @keywords file
+#' @export
+#' @examples
+#' path <- system.file("tests", package="gread")
+#' gff_file <- file.path(path, "sample.gff")
+#' gtf_file <- file.path(path, "sample.gtf")
+#' bed_file <- file.path(path, "sample.bed")
+#' bam_file <- file.path(path, "sample.bam")
+#' 
+#' read_format(gff_file) # read gff
+#' read_gff(gff_file)    # same as above
+#' read_format(gtf_file) # read gtf
+#' read_gtf(gtf_file)    # same as above
+#' read_format(bed_file) # read bed
+#' read_bed(bed_file)    # same as above
+#' read_format(bam_file) # read bam
+#' read_bam(bam_file)    # same as above
+#' 
+#' gtf_filter_file <- file.path(path, "sample_filter.gtf")
+#' read_format(gtf_filter_file, filter=TRUE) # filter invalid rows
+#' read_gtf(gtf_filter_file, filter=TRUE)    # same as above
+read_format <- function(file, format=detect_format(file), filter=FALSE, 
+            chromosomes=NULL, tags=c("NM", "MD"), verbose=FALSE) {
+    
+    if (!file.exists(file))
+        stop(toupper(format), " file '", file, "' not found.")
+    types = format_types(format=format)
+    names = format_names(format=format)
+    token = tokenise(file, format, filter, chromosomes, tags, verbose, 
+                        types=types, names=names)
+    ans = gread(token)
+    tidy_cols(ans, verbose=verbose)
+    if (token$filter && !identical(format, "bam")) {
+        before = nrow(ans)
+        # TODO: optimise this subset using internal any in data.table
+        ans = ans[start <= end] # will take care of NAs as well
+        after = nrow(ans)
+        if (token$verbose && before-after) 
+            cat(before-after, " invalid rows filtered. These include rows",
+                " where start/end coordinates were NA/NaN and ", 
+                " where start > end.\n", sep="")
+    }
+    # Returning 'GRanges' following Hervé's feedback
+    # See https://github.com/Bioconductor/Contributions/issues/25
+    new(class(ans)[1L], as(setDF(ans), "GRanges"))
+}
+
+#' @rdname read_format
+#' @export
+read_gtf <- function(file, filter=FALSE, verbose=FALSE) {
+    read_format(file, "gtf", filter, verbose)
+}
+
+#' @rdname read_format
+#' @export
+read_gff <- function(file, filter=FALSE, verbose=FALSE) {
+    read_format(file, "gff", filter, verbose)
+}
+
+#' @rdname read_format
+#' @export
+read_bed <- function(file, filter=FALSE, verbose=FALSE) {
+    read_format(file, "bed", filter, verbose)
+}
+
+#' @rdname read_format
+#' @export
+read_bam <- function(file, filter=FALSE, chromosomes=NULL, 
+                        tags=c("NM", "MD"), verbose=FALSE) {
+    read_format(file, "bam", filter, chromosomes, tags, verbose)
+}
+
+# Internal helper functions --------------------------------------------------
+
+detect_format <- function(file) {
+
+    type = substr(tolower(tools::file_ext(file)), 1L, 3L)
+    if (!type %in% c("gtf", "gff", "bam", "bed"))
+        stop("File does not have gtf/gff/bed/bam extension.")
+    type
+}
+
+format_types <- function(format) {
+    chr = "character"
+    int = "integer"
+    num = "numeric"
+    switch(format, gtf=, gff=c(rep(chr, 3L), rep(int, 2L), num, chr, int, chr),
+                   bed=c(chr, rep(int, 2L), rep(chr, 3L), rep(int, 2L), 
+                            chr, int, rep(chr, 2L)),
+                   bam=c())
+}
+
+format_names <- function(format) {
+    switch(format, gtf=c("seqnames", "source", "feature", "start", 
+                    "end", "score", "strand", "frame", "attributes"), 
+                   gff=c("seqnames", "source", "feature", "start", 
+                    "end", "score", "strand", "phase", "attributes"), 
+                   # all 15 columns for bed, only first 3 are compulsory
+                   bed=c("seqnames", "start", "end", "name", "score", 
+                    "strand", "thickStart", "thickEnd", "itemRgb", 
+                    "blockCount", "blockSizes", "blockStarts", "expCount", 
+                    "expIds", "expScores"), 
+                   bam=c("seqnames", "start", "end", "strand", "cigar", 
+                    "qwidth", "width", "njunc", "flag"))
+}
+
+tokenise <- function(file, format, filter, chromosomes, 
+                       tags, verbose, ...) {
+    arglist = as.list(match.call(expand.dots=TRUE))[-1L]
+    argnames = setdiff(names(arglist), "format")
+    if (!length(tags)) tags = character(0) # ScanBamParam needs this
+    tokens = structure(c(list(file, filter, chromosomes, tags, verbose), 
+                list(...)), class=paste(format, "format", sep="_"))
+    data.table::setattr(tokens, 'names', argnames)
+    tokens
+}
+
+`.` <- function(...) {
+    stop(".() is only meant for use in `j` argument of 
+        data.table, and is an alias to list()")
+}
diff --git a/R/read_format.R b/R/read_format.R
deleted file mode 100644
index 07349f4..0000000
--- a/R/read_format.R
+++ /dev/null
@@ -1,311 +0,0 @@
-#' @title Quickly and easily read gtf/gff/bed/bam files
-#'
-#' @description \code{read_gtf}, \code{read_gff}, \code{read_bed} and 
-#' \code{read_bam} are specific functions corresponding to respective file 
-#' formats. 
-#' 
-#' \code{read_format} is a top-level convenience function that detects and 
-#' reads automatically from the extension and is sufficient in most cases.
-#'
-#' @details Note that \code{gff1} uses \code{group} instead of 
-#' \code{attributes} as the column name, but \code{gread} always names it 
-#' as \code{attributes}. Similarly the first three columns of \code{bed} 
-#' format are named \code{seqname, start, end} instead of \code{chrom, 
-#' chromStart, chromEnd} for consistency.
-#' 
-#' The argument \code{tidy_cols} (\code{TRUE} by default) tidies up the 
-#' \code{attributes} column in case of \code{gtf/gff} format files. The 
-#' \code{attributes} column itself is removed since it is tidied up into 
-#' multiple columns. If this is not desirable, use \code{tidy_cols = FALSE} to 
-#' load the file \emph{as-is} and then use the \code{tidy_cols} function with 
-#' \code{remove_cols = NULL}.
-#' 
-#' In case of \code{gff} format, when `tidy_cols=TRUE`, generation of columns 
-#' \code{"transcript_id"} and \code{"gene_id"} will be attempted as these 
-#' columns are essential in most cases for downstream analyses. If possible, 
-#' the columns \code{"transcript_name"} and \code{"gene_name"} will also be 
-#' created.
-#' 
-#' @param file Complete path to the input file.
-#' @param format Type of annotation. Currently allowed values are 
-#' \code{"gtf"}, \code{"gff"}, \code{"bed"} and \code{"bam"}. If no input is 
-#' provided, then it will be guessed from input file's extension.
-#' @param filter Filter rows where \code{start/stop} are invalid, and where 
-#' \code{start > end}. Default is \code{FALSE} (not to filter, but retain all 
-#' invalid rows).
-#' @param tidy_cols If \code{TRUE} (default), returns only essential columns for 
-#' further analysis (for e.g, \code{score}, \code{itemRgb} etc. are removed), 
-#' \code{attributes} column is cleaned up with separate columns for 
-#' \code{gene}, \code{transcript} id etc. Default is \code{TRUE}.
-#' @param chromosomes Argument to \code{read_bam}. If specified only reads 
-#' from those chromosomes will be loaded.
-#' @param tags Additional (optional) tags to load from the bam file. By 
-#' default, the tags \code{"NM"} and \code{"MD"} are loaded.
-#' @param verbose If \code{TRUE}, sends useful status messages to the console. 
-#' Default is \code{FALSE}.
-#' @aliases read_format read_gtf read_gff read_bed read_bam
-#' @return An object of class \code{gtf}, \code{gff}, \code{bed} or \code{bam}, 
-#' corresponding to the input file format, that inherits from \code{data.table}.
-#' @seealso \code{\link{supported_formats}} \code{\link{tidy_cols}} 
-#' \code{\link{extract}} \code{\link{construct_introns}}
-#' @keywords file
-#' @export
-#' @examples
-#' path <- system.file("tests", package="gread")
-#' gff_file <- file.path(path, "sample.gff")
-#' gtf_file <- file.path(path, "sample.gtf")
-#' bed_file <- file.path(path, "sample.bed")
-#' bam_file <- file.path(path, "sample.bam")
-#' 
-#' read_format(gff_file) # read gff
-#' read_gff(gff_file)    # same as above
-#' read_format(gtf_file) # read gtf
-#' read_gtf(gtf_file)    # same as above
-#' read_format(bed_file) # read bed
-#' read_bed(bed_file)    # same as above
-#' read_format(bam_file) # read bam
-#' read_bam(bam_file)    # same as above
-#' 
-#' read_format(gtf_file, tidy_cols=FALSE) # load as is, don't tidy
-#' 
-#' gtf_filter_file <- file.path(path, "sample_filter.gtf")
-#' read_format(gtf_filter_file, filter=TRUE) # filter invalid rows
-#' read_gtf(gtf_filter_file, filter=TRUE)    # same as above
-read_format <- function(file, format=detect_format(file), filter=FALSE, 
-            tidy_cols=TRUE, chromosomes=NULL, tags=c("NM", "MD"), 
-            verbose=FALSE) {
-    
-    if (!file.exists(file))
-        stop(toupper(format), " file '", file, "' not found.")
-    types = format_types(format=format)
-    names = format_names(format=format)
-    token = tokenise(file, format, filter, tidy_cols, chromosomes, tags, verbose, 
-                        types=types, names=names)
-    ans = gread(token)
-    if (tidy_cols) tidy_cols(ans, verbose=verbose)
-    if (token$filter && !identical(format, "bam")) {
-        before = nrow(ans)
-        # TO DO: optimise this subset using internal any in data.table
-        ans = ans[start <= end] # will take care of NAs as well
-        after = nrow(ans)
-        if (token$verbose && before-after) 
-            cat(before-after, " invalid rows filtered. These include rows",
-                " where start/end coordinates were NA/NaN and ", 
-                " where start > end.\n", sep="")
-    }
-    ans
-}
-
-#' @rdname read_format
-#' @export
-read_gtf <- function(file, filter=FALSE, tidy_cols=TRUE, verbose=FALSE) {
-    read_format(file, "gtf", filter, tidy_cols, verbose)
-}
-
-#' @rdname read_format
-#' @export
-read_gff <- function(file, filter=FALSE, tidy_cols=TRUE, verbose=FALSE) {
-    read_format(file, "gff", filter, tidy_cols, verbose)
-}
-
-#' @rdname read_format
-#' @export
-read_bed <- function(file, filter=FALSE, tidy_cols=TRUE, verbose=FALSE) {
-    read_format(file, "bed", filter, tidy_cols, verbose)
-}
-
-#' @rdname read_format
-#' @export
-read_bam <- function(file, filter=FALSE, tidy_cols=TRUE, chromosomes=NULL, 
-                        tags=c("NM", "MD"), verbose=FALSE) {
-    read_format(file, "bam", filter, tidy_cols, chromosomes, tags, verbose)
-}
-
-# Helper/Internal functions for read_format ---------------------
-
-detect_format <- function(file) {
-
-    type = substr(tolower(tools::file_ext(file)), 1L, 3L)
-    if (!type %in% c("gtf", "gff", "bam", "bed"))
-        stop("File does not have gtf/gff/bed/bam extension.")
-    type
-}
-
-tokenise <- function(file, format, filter, tidy_cols, chromosomes, 
-                       tags, verbose, ...) {
-    arglist = as.list(match.call(expand.dots=TRUE))[-1L]
-    argnames = setdiff(names(arglist), "format")
-    if (!length(tags)) tags = character(0) # ScanBamParam needs this
-    tokens = structure(c(list(file, filter, tidy_cols, chromosomes, tags, verbose), 
-                list(...)), class=paste(format, "format", sep="_"))
-    data.table::setattr(tokens, 'names', argnames)
-    tokens
-}
-
-# Helper/Internal functions for read_gtf/read_gff/read_bed -------
-
-format_names <- function(format) {
-    switch(format, gtf=c("seqname", "source", "feature", "start", 
-                    "end", "score", "strand", "frame", "attributes"), 
-                   gff=c("seqname", "source", "feature", "start", 
-                    "end", "score", "strand", "phase", "attributes"), 
-                   # all 15 columns for bed, only first 3 are compulsory
-                   bed=c("seqname", "start", "end", "name", "score", 
-                    "strand", "thickStart", "thickEnd", "itemRgb", 
-                    "blockCount", "blockSizes", "blockStarts", "expCount", 
-                    "expIds", "expScores"), 
-                   bam=c("seqname", "start", "end", "strand", "cigar", 
-                    "qwidth", "width", "njunc", "flag"))
-}
-
-format_types <- function(format) {
-    chr = "character"
-    int = "integer"
-    num = "numeric"
-    switch(format, gtf=, gff=c(rep(chr, 3L), rep(int, 2L), num, chr, int, chr),
-                   bed=c(chr, rep(int, 2L), rep(chr, 3L), rep(int, 2L), 
-                            chr, int, rep(chr, 2L)),
-                   bam=c())
-}
-
-read_table <- function(file, ...) 
-    setDT(read.table(file, stringsAsFactors=FALSE, as.is=TRUE, ...))
-
-gread <- function(token) {
-    UseMethod("gread")
-}
-
-gread.default <- function(token) {
-    stop("No method available for format ", gsub("(.*)_format$", 
-            "", class(token)))
-}
-
-gread.gtf_format <- function(token) {
-    # to please R CMD CHECK
-    score=phase=NULL
-    # rtracklayer::readGFF reads gff v1 'attributes' column as 'group',
-    # but we want it to be always 'attributes'. Also, it tidies up the 
-    # attributes column by default
-    rtracklayer_fun <- function() {
-        ans = setDT(rtracklayer::readGFF(token$file, version = 2L, 
-                columns = rtracklayer::GFFcolnames()))
-        if (!token$tidy_cols) set(ans, j = tail(names(ans), -9L), value = NULL)
-        ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][]
-        # remove additional attributes
-        setattr(ans, 'pragmas', NULL)
-        setattr(ans, 'attrcol_fmt', NULL)
-        setattr(ans, 'ncol0', NULL)
-        setattr(ans, 'ntag', NULL)
-        setattr(ans, 'raw_data', NULL)
-        ans
-    }
-    fread_fun <- function() fread(token$file, colClasses = token$types, 
-        showProgress = token$verbose)
-    read_table_fun <- function() read_table(token$file, sep = "\t", 
-        header = FALSE, comment.char = "#", nrows = -1L, colClasses = 
-        token$types, quote = "")
-    ans = tryCatch(rtracklayer_fun(), error = function(o) { 
-                if (token$verbose) cat("rtracklayer::readGFF failed to read", 
-                    " the gtf file. Reverting to data.table::fread.\n", sep="")
-                tryCatch(fread_fun(), error = function(o) {
-                    if (token$verbose) cat("data.table::fread failed to read", 
-                        " as well. Reverting to base::read.table.\n", sep="")
-                    read_table_fun()
-                })
-          })
-    setnames(ans, head(names(ans), 9L), token$names)
-    if (is.character(ans[["score"]])) 
-        ans[, "score" := suppressWarnings(as.numeric(score))]
-    if (is.character(ans[["phase"]])) 
-        ans[, "phase" := suppressWarnings(as.integer(phase))]
-    setattr(ans, 'class', c("gtf", "data.table", "data.frame"))
-}
-
-gread.gff_format <- function(token) {
-    # to please R CMD CHECK
-    score=phase=NULL
-    # rtracklayer::readGFF reads gff v1 'attributes' column as 'group',
-    # but we want it to be always 'attributes'. Also, it tidies up the 
-    # attributes column by default
-    rtracklayer_fun <- function() {
-        ans = setDT(as.data.frame(rtracklayer::readGFF(token$file, columns = 
-                rtracklayer::GFFcolnames())))
-        if (!token$tidy_cols) set(ans, j = tail(names(ans), -9L), value = NULL)
-        ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][]
-        list_cols = which(vapply(ans, is.list, TRUE))
-        if (length(list_cols)) {
-            list_paste <- function(x) sapply(x, paste, collapse=",")
-            # paste all values of list cols together
-            ans[, (list_cols) := lapply(.SD, list_paste), .SDcols=list_cols]
-        }
-        # as.data.frame drops all extra attributes
-        ans
-    }
-    fread_fun <- function() fread(token$file, colClasses = token$types, 
-        showProgress = token$verbose)
-    read_table_fun <- function() read_table(token$file, sep = "\t", 
-        header = FALSE, comment.char = "#", nrows = -1L, colClasses = 
-        token$types, quote = "")
-    ans = tryCatch(rtracklayer_fun(), error = function(o) { 
-                if (token$verbose) cat("rtracklayer::readGFF failed to read",
-                    " the gtf file. Reverting to data.table::fread.\n", sep="")
-                tryCatch(fread_fun(), error = function(o) {
-                    if (token$verbose) cat("data.table::fread failed to read", 
-                        " as well. Reverting to base::read.table.\n", sep="")
-                    read_table_fun()
-                })
-          })
-    setnames(ans, head(names(ans), 9L), token$names)
-    if (is.character(ans[["score"]])) 
-        ans[, "score" := suppressWarnings(as.numeric(score))]
-    if (is.character(ans[["phase"]])) 
-        ans[, "phase" := suppressWarnings(as.integer(phase))]
-    setattr(ans, 'class', c("gff", "data.table", "data.frame"))
-}
-
-gread.bed_format <- function(token) {
-    ans = tryCatch(fread(token$file, colClasses = token$types, sep="\t", 
-                showProgress = token$verbose), error = function(o) {
-                    if (token$verbose) cat("data.table::fread failed to read", 
-                        " the bed file. Reverting to 'read.table'", 
-                        " from base R.\n", sep="")
-                    read_table(token$file, sep = "\t", header = FALSE, 
-                        comment.char = "#", nrows = -1L, colClasses = 
-                        token$types, quote = "")
-                    })
-    # setnames will error if bed file has < 3 cols
-    setnames(ans, head(token$names, max(3L, ncol(ans))))
-    # ans[, start := start+1L] # TODO: should we automatically set start+1L?
-    setattr(ans, 'class', c("bed", "data.table", "data.frame"))
-}
-
-gread.bam_format <- function(token) {
-    if (token$verbose) 
-        cat("Loading bam file: ",token$file,", please wait...\n", sep="")
-    stopifnot(length(token$file)==1L)
-    
-    what  = c("flag")
-    flags = Rsamtools::scanBamFlag(isUnmappedQuery=FALSE)
-    if (!length(token$chromosomes)) {
-        param = Rsamtools::ScanBamParam(what=what, tag=token$tags, flag=flags)
-    } else {
-            hdr = Rsamtools::scanBamHeader(token$file)[[1]]$targets
-            hdr = hdr[names(hdr) %in% token$chromosomes]
-            if (!all(names(hdr) %in% token$chromosomes)) {
-                stop("chromosomes [", paste(setdiff(token$chromosomes, 
-                    names(hdr)), collapse=", "), "] not found in bam file.")
-            }
-            which = as_granges(data.table(seqname = names(hdr), 
-                        start=1L, end = hdr, strand="*"), ignore_strand=TRUE)
-            param = Rsamtools::ScanBamParam(what = what, which = which, 
-                                tag = token$tags, flag = flags)
-    }
-    # load bam with corresponding param
-    ans = as_bam(GenomicAlignments::readGAlignments(token$file, param=param))
-    setattr(ans, 'class', c("bam", "data.table", "data.frame"))
-}
-
-`.` <- function(...) {
-    stop(".() is only meant for use in `j` argument of 
-        data.table, and is an alias to list()")
-}
diff --git a/R/s4-setup.R b/R/s4-setup.R
new file mode 100644
index 0000000..2a8b6e8
--- /dev/null
+++ b/R/s4-setup.R
@@ -0,0 +1,7 @@
+setClass("gtf", contains="GRanges")
+setClass("gff", contains="GRanges")
+setClass("bed", contains="GRanges")
+setClass("bam", contains="GRanges")
+setClass("gene", contains="GRanges")
+setClass("exon", contains="GRanges")
+setClass("intron", contains="GRanges")
diff --git a/R/supported_formats.R b/R/supported-formats.R
similarity index 87%
rename from R/supported_formats.R
rename to R/supported-formats.R
index 9445c7e..5abcbab 100644
--- a/R/supported_formats.R
+++ b/R/supported-formats.R
@@ -4,7 +4,7 @@
 #'
 #' @return A character vector listing all currently supported formats.
 #' @aliases supported_formats
-#' @seealso \code{\link{read_format}} \code{\link{tidy_cols}}
+#' @seealso \code{\link{read_format}}
 #' @examples
 #' supported_formats()
 #' @export
diff --git a/R/test_gread.R b/R/test-gread.R
similarity index 99%
rename from R/test_gread.R
rename to R/test-gread.R
index ed0a085..866e1e9 100644
--- a/R/test_gread.R
+++ b/R/test-gread.R
@@ -19,7 +19,6 @@
 #' otherwise.
 #' @aliases test_gread
 #' @seealso \code{\link{read_format}} \code{\link{extract}} 
-#' \code{\link{tidy_cols}}
 #' @examples
 #' \dontrun{
 #' gread:::test_gread()
@@ -28,7 +27,7 @@ test_gread <- function(verbose=FALSE, pkg="pkg", silent=FALSE) {
     if (exists("test_gread", .GlobalEnv, inherits=FALSE)) { # package developer
         if ("package:gread" %in% search()) stop("gread package loaded")
         if (.Platform$OS.type == "unix") {
-            d = path.expand("~/Documents/oa-git/gread/inst/tests")
+            d = path.expand("~/Documents/oa-github/gread/inst/tests")
         }
     } else { # user
         d = paste(getNamespaceInfo("gread", "path"), "/tests", sep="")
diff --git a/R/tidy_format.R b/R/tidy-format.R
similarity index 69%
rename from R/tidy_format.R
rename to R/tidy-format.R
index c89a2e1..ccf900c 100644
--- a/R/tidy_format.R
+++ b/R/tidy-format.R
@@ -1,68 +1,11 @@
-#' @title Tidy up gtf/gff/bed/bam objects
-#'
-#' @description Tidy up data depending on the type of input format. 
-#'
-#' Note that this operation is performed \emph{by reference}, to avoid 
-#' unnecessary copies, for efficiency. Therefore there is no need to assign 
-#' the result back to a variable. 
-#'
-#' @details In case of \code{gtf} and \code{gff} formats, the 
-#' \code{attributes} column is extracted into separate columns after which 
-#' it is removed (by default). Set \code{remove_cols} to \code{NULL} if you 
-#' would like to retain the column even after tidying up.
-#' 
-#' In case of \code{bed} and \code{bam} files, no columns are removed by 
-#' default. However, you can use \code{remove_cols} argument if necessary.
-#' 
-#' In case of \code{bam} files, the function \code{GAlignments} from the 
-#' \code{GenomicAlignments} Bioconductor package is used to read in the 
-#' file, with the additional arguments for \code{ScanBamParam()} function 
-#' using \code{NM} and \code{MD} tags by default.
-#'
-#' @seealso \code{\link{supported_formats}} \code{\link{read_format}} 
-#' \code{\link{extract}} \code{\link{construct_introns}} 
-#' \code{\link{as_granges}}
-#' @param x Input object, of class \code{gtf}, \code{gff}, \code{bed} or 
-#' \code{bam}.
-#' @param remove_cols A character vector of column names to be removed.
-#' @param verbose Logical. Default is \code{FALSE}. If \code{TRUE}, helpful 
-#' status messages are printed on to the console. 
-#' @param ... Arguments that are ignored at the moment.
-#' @aliases tidy_cols tidy_cols.gtf tidy_cols.gff tidy_cols.bed tidy_cols.bam
-#' @return A tidied object of class \code{gtf}, \code{gff}, \code{bed} or 
-#' \code{bam}, that inherits from \code{data.table}.
-#' @export
-#' @examples
-#' path <- system.file("tests", package="gread")
-#' gff_file <- file.path(path, "sample.gff")
-#' gtf_file <- file.path(path, "sample.gtf")
-#' bed_file <- file.path(path, "sample.bed")
-#' bam_file <- file.path(path, "sample.bam")
-#' 
-#' gtf <- read_format(gtf_file, tidy_cols=FALSE)
-#' gff <- read_format(gtf_file, tidy_cols=FALSE)
-#' bed <- read_format(bed_file, tidy_cols=FALSE)
-#' bam <- read_format(bam_file, tidy_cols=FALSE)
-#' 
-#' tidy_cols(gtf, remove_cols=NULL)[]          # tidy attr. col, but not remove
-#' tidy_cols(gff, remove_cols=NULL)[]          # same as above, but for gff
-#' tidy_cols(gtf, remove_cols="attributes")[]  # tidy & remove attr. col
-#' tidy_cols(gff, remove_cols="attributes")[]  # same as above, but for gff
-#' 
-#' tidy_cols(bed, remove_cols="name")[]        # remove name column
-#' tidy_cols(bam, remove_cols=c("NM", "MD"))[] # remove additional loaded tags 
 tidy_cols <- function(x, ...) {
     UseMethod("tidy_cols")
 }
 
-#' @rdname tidy_cols
-#' @export
 tidy_cols.default <- function(x, ...) {
     stop("No default method available.")
 }
 
-#' @rdname tidy_cols
-#' @export
 tidy_cols.gtf <- function(x, remove_cols="attributes", verbose=FALSE, ...) {
     stopifnot(identical(head(names(x), 9L), format_names("gtf")))
     meta_fun <- function(vec, attr) {
@@ -101,8 +44,6 @@ tidy_cols.gtf <- function(x, remove_cols="attributes", verbose=FALSE, ...) {
     invisible(x)
 }
 
-#' @rdname tidy_cols
-#' @export
 tidy_cols.gff <- function(x, remove_cols="attributes", verbose=FALSE, ...) {
     stopifnot(identical(head(names(x), 9L), format_names("gff")))
     meta_fun <- function(vec, attr) {
@@ -137,22 +78,18 @@ tidy_cols.gff <- function(x, remove_cols="attributes", verbose=FALSE, ...) {
     invisible(x)
 }
 
-#' @rdname tidy_cols
-#' @export
 tidy_cols.bed <- function(x, remove_cols=NULL, verbose=FALSE, ...) {
     stopifnot(identical(names(x), head(format_names("bed"), length(x))))
     remove_cols(x, remove_cols, verbose) # updates by reference
     invisible(x)
 }
 
-#' @rdname tidy_cols
-#' @export
 tidy_cols.bam <- function(x, remove_cols=NULL, verbose=FALSE, ...) {
     remove_cols(x, remove_cols, verbose) # updates by reference
     invisible(x)
 }
 
-## internal function used in tidy_cols methods -------------------
+## internal helper functions -------------------------------------------------
 
 remove_cols <- function(x, cols, verbose) {
     if (is.null(cols)) {
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 0057560..c610954 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -17,6 +17,7 @@ if (!exists("test_gread", .GlobalEnv, inherits=FALSE)) {
     strictly_nonunique = gread:::strictly_nonunique
     shallow            = gread:::shallow
     non_overlaps       = gread:::non_overlaps
+    tidy_cols          = gread:::tidy_cols
 } else {
     .devtesting=TRUE
 }
@@ -40,7 +41,7 @@ test(0.0, read_format("unavailable.unknown"), error="File does not")
 ## gtf -----------------------------------------------
 ff = c("exon", "stop_codon", "CDS", "start_codon")
 gtf_ans = data.table(
-            seqname="7",
+            seqnames="7",
             source="protein_coding",
              feature=ff[c(1:3,3,1,4)],
              start=INT(27221134,27222415,27222418,27224055,27224055,27224761),
@@ -57,19 +58,16 @@ gtf_ans = data.table(
              transcript_name="HOXA11-001",
              tss_id="TSS161914",
              protein_id=c(NA,NA,"ENSP00000006015","ENSP00000006015",NA,NA))
-setattr(gtf_ans, 'class', c('gtf', class(gtf_ans)))
+gtf_ans = new("gtf", as(setDF(gtf_ans), "GRanges"))
 
 test(1.0, read_format("unavailable.gtf"), error="not found")
 test(1.1, read_format("sample.gtf"), gtf_ans)
 test(1.2, read_gtf("sample.gtf"), gtf_ans)
 test(1.3, read_format("sample_filter.gtf", filter=TRUE), gtf_ans[-4L])
 test(1.4, read_gtf("sample_filter.gtf", filter=TRUE), gtf_ans[-4L])
-gtf_ans2 = gtf_ans[, 1:8, with=FALSE
-             ][, attributes := fread("sample.gtf", select=9L)]
-test(1.5, read_gtf("sample.gtf", tidy_cols=FALSE), gtf_ans2)
 
 ## gff -----------------------------------------------
-gff_ans = data.table(seqname="7",
+gff_ans = data.table(seqnames="7",
                      source="protein_coding",
                      feature=c("gene","RNA","exon", "exon"),
                      start=INT(27221129, 27221134, 27221134, 27224055),
@@ -81,34 +79,29 @@ gff_ans = data.table(seqname="7",
                      ID=c("ENSG00000005073","ENST00000006015",
                         "ENST00000006015.2", "ENST00000006015.1"),
                      Name=c("HOXA11","HOXA11-001",NA,NA),
-                     length=c("3714",NA,NA,NA),
                      Parent=c(NA,"ENSG00000005073",
                         "ENST00000006015", 
                         "ENST00000006015"),
+                     length=c("3714",NA,NA,NA),
                      transcript_id=c(NA,rep("ENST00000006015", 3L)), 
                      gene_id="ENSG00000005073", 
                      transcript_name=c(NA,rep("HOXA11-001", 3L)),
                      gene_name="HOXA11")
-setattr(gff_ans, 'class', c('gff', class(gff_ans)))
-# ignore colorder. v1.9.6 of data.table lacks ignore.col.order feature
+gff_ans = new("gff", as(setDF(gff_ans), "GRanges"))
+
 # so doing it using setcolorder for now.
 test(2.0, read_format("unavailable.gff"), error="not found")
-test(2.1, setcolorder(read_format("sample.gff"), names(gff_ans)), gff_ans)
-test(2.2, setcolorder(read_gff("sample.gff"), names(gff_ans)), gff_ans)
-test(2.3, setcolorder(read_format("sample_filter.gff", filter=TRUE), 
-          names(gff_ans)), gff_ans[-4L])
-test(2.4, setcolorder(read_gff("sample_filter.gff", filter=TRUE), 
-          names(gff_ans)), gff_ans[-4L])
-gff_ans2 = gff_ans[, 1:8, with=FALSE
-             ][, attributes := fread("sample.gff", select=9L)]
-test(2.5, read_gff("sample.gff", tidy_cols=FALSE), gff_ans2)
-cols = c("seqname", "source", "feature", "start", "end", "score", 
-        "strand", "phase", "Alias", "Name", "length", "Parent")
-test(2.6, read_gff("sample_no_id.gff"), gff_ans[, cols, with=FALSE], 
-            warning="gff file")
+test(2.1, read_format("sample.gff"), gff_ans)
+test(2.2, read_gff("sample.gff"), gff_ans)
+test(2.3, read_format("sample_filter.gff", filter=TRUE), gff_ans[-4L])
+test(2.4, read_gff("sample_filter.gff", filter=TRUE), gff_ans[-4L])
+gff_ans2 = gff_ans
+cols = c("transcript_id", "gene_id", "transcript_name", "gene_name", "ID")
+mcols(gff_ans2)[, cols] = NULL
+test(2.5, read_gff("sample_no_id.gff"), gff_ans2, warning="gff file")
 
 ## bam -----------------------------------------------
-bam_ans = data.table(seqname=c("1","5","X"),
+bam_ans = data.table(seqnames=c("1","5","X"),
                      start=c(220276091L,79733612L,12994378L),
                      end=c(220276834L,79733687L,12994453L),
                      strand="-",
@@ -119,23 +112,27 @@ bam_ans = data.table(seqname=c("1","5","X"),
                      flag=16L,
                      NM=0L,
                      MD=c("75","76","76"))
-setattr(bam_ans, 'class', c("bam", class(bam_ans)))
+bam_ans = new("bam", as(setDF(bam_ans), "GRanges"))
 
 test(3.0, read_format("unavailable.bam"), error="not found")
 test(3.1, read_format("sample.bam"), bam_ans)
 test(3.2, read_bam("sample.bam"), bam_ans)
-test(3.3, read_format("sample.bam", tags="NM"), bam_ans[, !"MD", with=FALSE])
+bam_ans2 = bam_ans
+mcols(bam_ans2)[, "MD"] = NULL
+test(3.3, read_format("sample.bam", tags="NM"), bam_ans2)
 test(3.4, read_format("sample.bam", tags="XYZ"), error="invalid class")
-test(3.5, read_format("sample_indexed.bam", chromosomes="1"), bam_ans[1L])
-test(3.6, nrow(read_format("sample_indexed.bam", chromosomes="Y")), 0L)
-test(3.7, read_bam("sample.bam", tags=NULL), 
-                bam_ans[, !c("NM", "MD"), with=FALSE])
+test(3.5, read_format("sample_indexed.bam", chromosomes="1"), 
+            keepSeqlevels(bam_ans, "1"))
+test(3.6, length(read_format("sample_indexed.bam", chromosomes="Y")), 0L)
+bam_ans3 = bam_ans
+mcols(bam_ans3)[, c("NM", "MD")] = NULL
+test(3.7, read_bam("sample.bam", tags=NULL), bam_ans3)
 
 ## bed -----------------------------------------------
 names = c("NS500200:12:H0RBKAGXX:3:11405:20307:5324", 
           "NS500200:10:H0PVJAGXX:3:21608:13122:16773", 
           "NS500200:10:H0PVJAGXX:2:13108:16527:7109")
-bed_ans = data.table(seqname=c("1","5","X"),
+bed_ans = data.table(seqnames=c("1","5","X"),
                      start=c(220276090L,79733611L,12994377L),
                      end=c(220276834L,79733687L,12994453L),
                      name=names,
@@ -147,63 +144,57 @@ bed_ans = data.table(seqname=c("1","5","X"),
                      blockCount=c(2L,1L,1L),
                      blockSizes=c("29,46","76","76"),
                      blockStarts=c("0,698","0","0"))
-setattr(bed_ans, 'class', c("bed", class(bed_ans)))
+bed_ans = new("bed", as(setDF(bed_ans), "GRanges"))
 
 test(4.0, read_format("unavailable.bed"), error="not found")
 test(4.1, read_format("sample.bed"), bed_ans)
 test(4.2, read_bed("sample.bed"), bed_ans)
-test(4.3, read_format("sample.bed", tidy_cols=FALSE), bed_ans)
-test(4.4, read_bed("sample.bed", tidy_cols=FALSE), bed_ans)
 
-## tidy_cols ------------------------------------------------------
+# ## tidy_cols ------------------------------------------------------
 
-gtf <- read_format("sample.gtf", tidy_cols=FALSE)
-test(5.1, names(gtf), format_names("gtf"))
-tidy_cols(gtf, remove_cols=NULL) # tidy, but don't remove attributes col
-test(5.2, sort(names(gtf)), sort(c(format_names("gtf"), 
-    tail(names(gtf_ans), -8L)))) # ignore order since rtracklayer returns 
-                                 # columns in different order
-tidy_cols(gtf, remove_cols="attributes")
-test(5.3, sort(names(gtf)), sort(names(gtf_ans)))
+# gtf <- read_format("sample.gtf", tidy_cols=FALSE)
+# test(5.1, names(gtf), format_names("gtf"))
+# tidy_cols(gtf, remove_cols=NULL) # tidy, but don't remove attributes col
+# test(5.2, sort(names(gtf)), sort(c(format_names("gtf"), 
+#     tail(names(gtf_ans), -8L)))) # ignore order since rtracklayer returns 
+#                                  # columns in different order
+# tidy_cols(gtf, remove_cols="attributes")
+# test(5.3, sort(names(gtf)), sort(names(gtf_ans)))
 
-gff <- read_format("sample.gff", tidy_cols=FALSE)
-test(5.4, names(gff), format_names("gff"))
-tidy_cols(gff, remove_cols=NULL) # tidy, but don't remove attributes col
-test(5.5, sort(names(gff)), sort(c(format_names("gff"), 
-    tail(names(gff_ans), -8L)))) # ignore order since rtracklayer returns 
-                                 # columns in different order
-tidy_cols(gff, remove_cols="attributes")
-test(5.6, sort(names(gff)), sort(names(gff_ans)))
+# gff <- read_format("sample.gff", tidy_cols=FALSE)
+# test(5.4, names(gff), format_names("gff"))
+# tidy_cols(gff, remove_cols=NULL) # tidy, but don't remove attributes col
+# test(5.5, sort(names(gff)), sort(c(format_names("gff"), 
+#     tail(names(gff_ans), -8L)))) # ignore order since rtracklayer returns 
+#                                  # columns in different order
+# tidy_cols(gff, remove_cols="attributes")
+# test(5.6, sort(names(gff)), sort(names(gff_ans)))
 
-bed <- read_format("sample.bed", tidy_cols=FALSE)
-tidy_cols(bed, remove_cols="name") # tidy, but don't remove attributes col
-test(5.7, names(bed), setdiff(names(bed_ans), "name"))
+# bed <- read_format("sample.bed", tidy_cols=FALSE)
+# tidy_cols(bed, remove_cols="name") # tidy, but don't remove attributes col
+# test(5.7, names(bed), setdiff(names(bed_ans), "name"))
 
-bam <- read_format("sample.bam", tidy_cols=FALSE)
-tidy_cols(bam, remove_cols="MD") # tidy, but don't remove attributes col
-test(5.8, names(bam), setdiff(names(bam_ans), "MD"))
-# also test warning when the column to remove is not present
-test(5.9, tidy_cols(bam, remove_cols="XYZ"), bam, warning="Skipping column")
+# bam <- read_format("sample.bam", tidy_cols=FALSE)
+# tidy_cols(bam, remove_cols="MD") # tidy, but don't remove attributes col
+# test(5.8, names(bam), setdiff(names(bam_ans), "MD"))
+# # also test warning when the column to remove is not present
+# test(5.9, tidy_cols(bam, remove_cols="XYZ"), bam, warning="Skipping column")
 
-## as_granges ------------------------------------------------
+## as_data_table ------------------------------------------------
 gff = read_format("sample.gff")
 gtf = read_format("sample.gtf")
 bed = read_format("sample.bed")
 bam = read_format("sample.bam")
-test(6.1, as_granges(gff), as(as.data.frame(gff), "GRanges"))
-test(6.2, as_granges(gtf), as(as.data.frame(gtf), "GRanges"))
-test(6.3, as_granges(bed), as(as.data.frame(bed), "GRanges"))
-test(6.4, as_granges(bam), as(as.data.frame(bam), "GRanges"))
+test(6.1, class(as_data_table(gff))[1L], "gff")
+test(6.2, class(as_data_table(gtf))[1L], "gtf")
+test(6.3, class(as_data_table(bed))[1L], "bed")
+test(6.4, class(as_data_table(bam))[1L], "bam")
 
-test(6.5, as_granges(gff, ignore_strand=TRUE), 
-          as(as.data.frame(copy(gff)[, strand:=NULL]), "GRanges"))
-test(6.6, as_granges(gtf, ignore_strand=TRUE), 
-          as(as.data.frame(copy(gtf)[, strand:=NULL]), "GRanges"))
-test(6.7, as_granges(bed, ignore_strand=TRUE), 
-          as(as.data.frame(copy(bed)[, strand:=NULL]), "GRanges"))
-test(6.8, as_granges(bam, ignore_strand=TRUE), 
-          as(as.data.frame(copy(bam)[, strand:=NULL]), "GRanges"))
-test(6.9, as_granges(1:2), error = "is not TRUE")
+class = c("data.table", "data.frame") # class reset
+test(6.5, class(as_data_table(gff, TRUE)), class)
+test(6.6, class(as_data_table(gtf, TRUE)), class)
+test(6.7, class(as_data_table(bed, TRUE)), class)
+test(6.8, class(as_data_table(bam, TRUE)), class)
 
 ## as_txdb -----------------------------------------------
 ## No tests yet
@@ -217,37 +208,38 @@ test(7.1, strictly_nonunique(dt), dt[!c(2L,5L)])
 bam = read_format("sample.bam")
 bed = read_format("sample.bed")[c(1,1,2,3)]
 internal_fun <- function(x, y) {
-    ans = GenomicRanges::findOverlaps(as(as.data.frame(x), "GRanges"), 
-                                  as(as.data.frame(y), "GRanges"))
+    ans = GenomicRanges::findOverlaps(x, y)
     setDT(list(queryHits=queryHits(ans), subjectHits=subjectHits(ans)))
 }
-test(8.1, find_overlaps(bam, bed), internal_fun(bam, bed))
-test(8.2, find_overlaps(bed, bed), internal_fun(bed, bed))
-test(8.3, find_overlaps(bed, bed, ignore_redundant=TRUE), 
-          internal_fun(bed, bed)[-3L])
+test(8.1, find_overlaps(as_data_table(bam), as_data_table(bed)), 
+            internal_fun(bam, bed))
+test(8.2, find_overlaps(as_data_table(bed), as_data_table(bed)), 
+            internal_fun(bed, bed))
+test(8.3, find_overlaps(as_data_table(bed), as_data_table(bed), 
+            ignore_redundant=TRUE), internal_fun(bed, bed)[-3L])
 # TODO: test for ignore_strand. Not done now since it is just passed as 
 # arg to GenomicRanges::findOverlaps. To revisit when DT's non-equi joins 
 # will be ready.
 
 ## reduce_overlaps ------------------------------------
-gff = read_format("sample.gff")
+gff = as_data_table(read_format("sample.gff"), reset_class=TRUE)
 gff1 = gff[2L]
 gff2 = gff[2L][, `:=`(start = 27221000L, end = 27223735L)]
 gff = rbind(gff1, gff2)
 setattr(gff, 'class', c("gff", "data.table", "data.frame"))
-ans = data.table(seqname = "7", start = 27221000L, end = 27224835L, 
+ans = data.table(seqnames = "7", start = 27221000L, end = 27224835L, 
         length = 3836L, strand = "-", Parent = "ENSG00000005073")
 setattr(ans, 'class', class(gff))
 test(9.1, reduce_overlaps(gff, "Parent"), ans)
 test(9.2, reduce_overlaps(gff, "Par"), error="")
 
 ## disjoin_overlaps ------------------------------------
-gff = read_format("sample.gff")
+gff = as_data_table(read_format("sample.gff"), reset_class=TRUE)
 gff1 = gff[2L]
 gff2 = gff[2L][, `:=`(start = 27221000L, end = 27223735L)]
 gff = rbind(gff1, gff2)
 setattr(gff, 'class', c("gff", "data.table", "data.frame"))
-ans = data.table(seqname = "7", start = c(27221000L, 27221134L, 27223736L), 
+ans = data.table(seqnames = "7", start = c(27221000L, 27221134L, 27223736L), 
         end = c(27221133L, 27223735L, 27224835L), 
         length = c(134L, 2602L, 1100L), strand = "-", 
         Parent = "ENSG00000005073")
@@ -256,13 +248,13 @@ test(10.1, disjoin_overlaps(gff, "Parent"), ans)
 test(10.2, disjoin_overlaps(gff, "Par"), error="")
 
 ## intersect_overlaps ------------------------------------
-gff = read_format("sample.gff")
+gff = as_data_table(read_format("sample.gff"), reset_class=TRUE)
 gff1 = gff[2L]
 gff2 = gff[2L][, `:=`(start = 27221000L, end = 27224235)]
 gff3 = gff[2L][, `:=`(start = 27523634L, end = 27524761L)]
 gff = rbind(gff1, gff2, gff3)
 setattr(gff, 'class', c("gff", "data.table", "data.frame"))
-ans = data.table(seqname = "7", start = c(27221000L, 27523634L), 
+ans = data.table(seqnames = "7", start = c(27221000L, 27523634L), 
         end = c(27224835L, 27524761L), length = c(3836L ,1128L), 
         strand = "-", Parent = "ENSG00000005073")
 setattr(ans, 'class', class(gff))
@@ -271,15 +263,18 @@ test(11.2, intersect_overlaps(gff, "Par"), error="")
 
 ## shallow ------------------------------------
 dt = data.table(x=1, y=2, z="a")
+setattr(dt, 'class', c("bla", class(dt)))
 test(12.1, shallow(data.frame()), error="")
 test(12.2, shallow(dt), dt)
 test(12.3, shallow(dt, "x"), dt[, .(x)])
-test(12.4, shallow(dt, "x"), dt[, .(x)])
-test(12.5, shallow(dt, "xx"), error="")
+test(12.4, shallow(dt, "x", TRUE), as.data.table(dt[, .(x)]))
+test(12.5, class(shallow(dt, "x"))[1L], "bla")
+test(12.6, class(shallow(dt, "x", TRUE)), class)
+test(12.7, shallow(dt, "xx"), error="")
 
 ## construct_introns -----------------------------------------
 gtf = read_format("sample.gtf")
-ans = data.table(seqname = "7", source = "protein_coding", feature = "intron", 
+ans = data.table(seqnames = "7", source = "protein_coding", feature = "intron", 
         start = 27222648L, end = 27224054L, score = NA_real_, strand = "-", 
         frame = NA_integer_, gene_id = "ENSG00000005073", 
         transcript_id = "ENST00000006015", exon_number = NA, 
@@ -287,47 +282,49 @@ ans = data.table(seqname = "7", source = "protein_coding", feature = "intron",
         p_id = "P4212", transcript_name = "HOXA11-001", tss_id = "TSS161914", 
         protein_id = NA_character_)
 setattr(ans, 'class', c("gtf", class(ans)))
-ans2 = copy(ans)
-setattr(ans2, 'class', c("intron", class(ans2)))
+ans2 = new("intron", as(setDF(ans), "GRanges"))
 test(13.1, construct_introns(gtf, update=FALSE), ans2)
-ans = rbind(gtf, ans)[order(seqname,start,end)]
+ans = rbind(as_data_table(gtf)[, width := NULL], ans
+        )[order(seqnames,start,end)]
 setattr(ans, 'class', c("gtf", class(ans)))
-test(13.2, construct_introns(gtf), ans)
+ans2 = new("gtf", as(setDF(ans), "GRanges"))
+test(13.2, construct_introns(gtf), ans2)
 
 ## extract features ------------------------------------------
 gtf = read_format("sample.gtf")
-ans = gtf[feature=="exon", .(seqname, start, end, length=end-start+1L, 
+ans = as_data_table(gtf)[feature=="exon", 
+        .(seqnames, start, end, length=end-start+1L, 
             strand, transcript_id, gene_id)]
-setattr(ans, 'class', c("exon", class(gtf)))
-test(14.1, extract(gtf, feature="exon", type="union"), ans)
+ans2 = new("exon", as_granges(ans))
+test(14.1, extract(gtf, feature="exon", type="union"), ans2)
 ans[, overlaps := gene_id]
-setattr(ans, 'class', c("gene", class(gtf)))
-test(14.2, extract(gtf, feature="gene_exon", type="union"), ans)
-ans = data.table(seqname = "7", start = 27221134L, end = 27224835L, 
+ans2 = new("gene", as_granges(ans))
+test(14.2, extract(gtf, feature="gene_exon", type="union"), ans2)
+ans = data.table(seqnames = "7", start = 27221134L, end = 27224835L, 
         length = 3702L, strand = "-", transcript_id = "ENST00000006015", 
         gene_id = "ENSG00000005073", overlaps = "ENSG00000005073")
-setattr(ans, 'class', c("gene", class(gtf)))
-test(14.3, extract(gtf, feature="gene", type="union"), ans)
+ans2 = new("gene", as_granges(ans))
+test(14.3, extract(gtf, feature="gene", type="union"), ans2)
 # test for feature="intron" on gtf object to check that introns are 
 # constructed automatically using construct_introns() with a warning
-ans = data.table(seqname="7", start=27222648L, end=27224054L, length=1407L, 
+ans = data.table(seqnames="7", start=27222648L, end=27224054L, length=1407L, 
             strand="-", transcript_id="ENST00000006015", 
             gene_id="ENSG00000005073")
-setattr(ans, 'class', c("intron", "gtf", class(ans)))
-test(14.4, extract(gtf, "intron", "union"), ans, 
+ans2 = new("intron", as_granges(ans))
+test(14.4, extract(gtf, "intron", "union"), ans2, 
     warning="No introns found in input")
 
 ## non_overlaps ----------------------------------------------
 gtf  = read_format("sample.gtf")
-gtf2 = construct_introns(gtf)
-e = gtf2[feature=="exon", .(seqname,start,end,strand,transcript_id,gene_id)]
-i = gtf2[feature=="intron", .(seqname,start,end,strand,transcript_id,gene_id)]
+gtf2 = as_data_table(construct_introns(gtf))
+e = gtf2[feature=="exon", .(seqnames,start,end,strand,transcript_id,gene_id)]
+i = gtf2[feature=="intron", .(seqnames,start,end,strand,transcript_id,gene_id)]
 e[, transcript_id := as.list(transcript_id)]
 i[, transcript_id := as.list(transcript_id)]
-setorder(e, seqname,start,end)
-setorder(i, seqname,start,end)
-test(15.1, non_overlaps(gtf), list(non_overlapping_exon=e, 
-                                   non_overlapping_intron=i))
+setorder(e, seqnames,start,end)
+setorder(i, seqnames,start,end)
+test(15.1, non_overlaps(gtf), list(non_overlapping_exon=as_granges(e), 
+                                   non_overlapping_intron=as_granges(i)))
 
 ## test summary ----------------------------------------------
 options(warn=0L)
diff --git a/man/as_bam.Rd b/man/as_bam.Rd
index 8def378..c7bc8ee 100644
--- a/man/as_bam.Rd
+++ b/man/as_bam.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/internal-funs.R
 \name{as_bam}
 \alias{as_bam}
 \title{convert \code{GAlignments} object to \code{data.table}}
diff --git a/man/as_data_table.Rd b/man/as_data_table.Rd
new file mode 100644
index 0000000..395e0e8
--- /dev/null
+++ b/man/as_data_table.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extract-features.R
+\name{as_data_table}
+\alias{as_data_table}
+\title{Convert gtf/gff/bam/bed S4 objects to data.table and preserve class}
+\usage{
+as_data_table(x)
+}
+\value{
+A data.table object with input class preserved.
+}
+\description{
+\code{as.data.table} converts the input object to 
+\code{data.table}, but strips away all other classes except 
+\code{data.table} and \code{data.frame}. 
+
+This internal function is a simple wrapper to \code{as.data.table} but 
+also retains the extra class information of the input object.
+}
+\examples{
+\dontrun{
+setClass("foo", contains="GRanges")
+x = new("foo", GRanges(letters[1:5], IRanges(1:5, 6:10)))
+class(x) # [1] "foo"
+class(gread:::as_data_table(x)) # [1] "foo" "data.table" "data.frame"
+}
+}
+
diff --git a/man/as_granges.Rd b/man/as_granges.Rd
index 41b3129..161a79b 100644
--- a/man/as_granges.Rd
+++ b/man/as_granges.Rd
@@ -1,8 +1,8 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/as-granges.R
 \name{as_granges}
 \alias{as_granges}
-\title{Convert to GRanges object}
+\title{Convert data.table to GRanges object}
 \usage{
 as_granges(x, ignore_strand = FALSE)
 }
@@ -19,8 +19,7 @@ A \code{GRanges} object.
 }
 \description{
 Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam} 
-or a valid \code{data.table} to a GRanges object, preserving all 
-additional columns.
+or a valid \code{data.table} to a GRanges object.
 }
 \examples{
 path <- system.file("tests", package="gread")
@@ -45,7 +44,7 @@ as_granges(bed, ignore_strand=FALSE)
 as_granges(bam, ignore_strand=FALSE)
 }
 \seealso{
-\code{\link{read_format}} 
-\code{\link{tidy_cols}} \code{\link{extract}} \code{\link{construct_introns}}
+\code{\link{read_format}} \code{\link{extract}} 
+\code{\link{construct_introns}}
 }
 
diff --git a/man/construct_introns.Rd b/man/construct_introns.Rd
index 35c94fd..22651ee 100644
--- a/man/construct_introns.Rd
+++ b/man/construct_introns.Rd
@@ -1,8 +1,8 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/construct_introns.R
+% Please edit documentation in R/construct-introns.R
 \name{construct_introns}
 \alias{construct_introns}
-\title{Construct introns from \code{gtf} or \code{gff} objects}
+\title{Construct introns from gtf/gff objects}
 \usage{
 construct_introns(x, update = TRUE)
 }
@@ -16,7 +16,8 @@ returned.}
 }
 \value{
 An object of class \code{gtf/gff} with updated \code{intron} 
-coordinates or just the intron coordinates depending on \code{update}.
+coordinates or just the intron coordinates depending on \code{update}. 
+They all inherit from \code{GRanges}.
 }
 \description{
 This function generates intronic coordinates by extracting 
@@ -34,6 +35,6 @@ introns <- construct_introns(gtf, update=FALSE)
 }
 \seealso{
 \code{\link{supported_formats}} \code{\link{read_format}} 
-\code{\link{extract}} \code{\link{tidy_cols}} \code{\link{as_granges}}
+\code{\link{extract}} \code{\link{as_granges}}
 }
 
diff --git a/man/disjoin_overlaps.Rd b/man/disjoin_overlaps.Rd
index e18cadc..2188d9c 100644
--- a/man/disjoin_overlaps.Rd
+++ b/man/disjoin_overlaps.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/internal-funs.R
 \name{disjoin_overlaps}
 \alias{disjoin_overlaps}
 \title{Compute disjoint ranges on a gtf/gff/bed/bam object.}
diff --git a/man/extract.Rd b/man/extract.Rd
index 1e70dd1..96b9f6e 100644
--- a/man/extract.Rd
+++ b/man/extract.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/extract_features.R
+% Please edit documentation in R/extract-features.R
 \name{extract}
 \alias{extract}
 \alias{extract_feature}
@@ -11,7 +11,8 @@ extract(x, feature = c("gene_exon", "gene", "gene_intron", "exon", "intron"),
   gene_id = "gene_id", ...)
 }
 \arguments{
-\item{x}{Input object of class \code{gtf} or \code{gff}.}
+\item{x}{Input object of class \code{gtf} or \code{gff} which inherits 
+from \code{GRanges}.}
 
 \item{feature}{A character vector of (usually related) features to extract 
 from. One of \code{"gene_exon"}, \code{"gene"}, \code{"gene_intron"}, 
@@ -62,7 +63,7 @@ value is \code{"gene_id"}.}
 An object of class \code{"gene"} when \code{feature} is 
 \code{"gene"}, \code{"gene_exon"} or \code{"gene_intron"}, and of class 
 \code{"exon"} and \code{"intron"} when \code{feature} is \code{"exon"} or 
-\code{"intron"} respectively.
+\code{"intron"} respectively. They all inherit from \code{GRanges}.
 }
 \description{
 Provides functions for further post processing on objects of 
@@ -84,8 +85,7 @@ exons <- extract(gtf, feature="gene_exon", type="union")
 genes <- extract(gtf, feature="gene", type="default")
 }
 \seealso{
-\code{\link{read_format}} \code{\link{tidy_cols}} 
-\code{\link{as_granges}} \code{\link{extract}} 
-\code{\link{construct_introns}}
+\code{\link{read_format}} \code{\link{as_granges}} 
+\code{\link{extract}} \code{\link{construct_introns}}
 }
 
diff --git a/man/find_overlaps.Rd b/man/find_overlaps.Rd
index 4172f4c..cf74ca7 100644
--- a/man/find_overlaps.Rd
+++ b/man/find_overlaps.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/internal-funs.R
 \name{find_overlaps}
 \alias{find_overlaps}
 \title{Find overlapping indices of two gtf/gff/bed/bam objects}
diff --git a/man/intersect_bed.Rd b/man/intersect_bed.Rd
index 35954af..307cc44 100644
--- a/man/intersect_bed.Rd
+++ b/man/intersect_bed.Rd
@@ -19,8 +19,8 @@ to filter on. The column/value should be a \emph{character} vector
 containing the values to \emph{retain}.}
 }
 \value{
-A \code{gtf} object containing just those transcripts that overlap 
-with the provided bed file.
+A \code{gtf} object that inherits from \code{GRanges} containing 
+just those transcripts that overlap with the provided bed file.
 }
 \description{
 Given a \code{gtf} file, and a \code{bed} file, this function 
@@ -31,6 +31,5 @@ function is at the moment only for internal purposes.
 \seealso{
 \code{\link{read_format}}, \code{\link{non_overlaps}}, 
 \code{\link{construct_introns}}, \code{\link{extract}}
-\code{\link{tidy_cols}}
 }
 
diff --git a/man/intersect_overlaps.Rd b/man/intersect_overlaps.Rd
index 270920e..8e96c8c 100644
--- a/man/intersect_overlaps.Rd
+++ b/man/intersect_overlaps.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/internal-funs.R
 \name{intersect_overlaps}
 \alias{intersect_overlaps}
 \title{Compute intersecting ranges on a gtf/gff/bed/bam object with itself.}
diff --git a/man/non_overlaps.Rd b/man/non_overlaps.Rd
index a3d70f7..63971fd 100644
--- a/man/non_overlaps.Rd
+++ b/man/non_overlaps.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/non_overlaps.R
+% Please edit documentation in R/non-overlaps.R
 \name{non_overlaps}
 \alias{non_overlaps}
 \title{Generate non-overlapping intron coordinates}
@@ -15,8 +15,8 @@ features \code{intron} and \code{exon}.}
 \item{gene_id}{Column name in \code{x} corresponding to gene id.}
 }
 \value{
-Returns a list of data.table containing \emph{non overlapping 
-exons} and \emph{non overlapping introns}.
+Returns a list of \code{GRanges} objects containing 
+\emph{non overlapping exons} and \emph{non overlapping introns}.
 }
 \description{
 This function takes an \code{gtf/gff} object. If the 
diff --git a/man/read_format.Rd b/man/read_format.Rd
index e83c974..f0d632c 100644
--- a/man/read_format.Rd
+++ b/man/read_format.Rd
@@ -1,25 +1,24 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read_format.R
+% Please edit documentation in R/read-format.R
 \name{read_format}
 \alias{read_bam}
 \alias{read_bed}
 \alias{read_format}
 \alias{read_gff}
 \alias{read_gtf}
-\title{Quickly and easily read gtf/gff/bed/bam files}
+\title{Quick and easy reading of gtf/gff/bed/bam files}
 \usage{
 read_format(file, format = detect_format(file), filter = FALSE,
-  tidy_cols = TRUE, chromosomes = NULL, tags = c("NM", "MD"),
-  verbose = FALSE)
+  chromosomes = NULL, tags = c("NM", "MD"), verbose = FALSE)
 
-read_gtf(file, filter = FALSE, tidy_cols = TRUE, verbose = FALSE)
+read_gtf(file, filter = FALSE, verbose = FALSE)
 
-read_gff(file, filter = FALSE, tidy_cols = TRUE, verbose = FALSE)
+read_gff(file, filter = FALSE, verbose = FALSE)
 
-read_bed(file, filter = FALSE, tidy_cols = TRUE, verbose = FALSE)
+read_bed(file, filter = FALSE, verbose = FALSE)
 
-read_bam(file, filter = FALSE, tidy_cols = TRUE, chromosomes = NULL,
-  tags = c("NM", "MD"), verbose = FALSE)
+read_bam(file, filter = FALSE, chromosomes = NULL, tags = c("NM", "MD"),
+  verbose = FALSE)
 }
 \arguments{
 \item{file}{Complete path to the input file.}
@@ -29,13 +28,8 @@ read_bam(file, filter = FALSE, tidy_cols = TRUE, chromosomes = NULL,
 provided, then it will be guessed from input file's extension.}
 
 \item{filter}{Filter rows where \code{start/stop} are invalid, and where 
-\code{start > end}. Default is \code{FALSE} (not to filter, but retain all 
-invalid rows).}
-
-\item{tidy_cols}{If \code{TRUE} (default), returns only essential columns for 
-further analysis (for e.g, \code{score}, \code{itemRgb} etc. are removed), 
-\code{attributes} column is cleaned up with separate columns for 
-\code{gene}, \code{transcript} id etc. Default is \code{TRUE}.}
+\code{start > end}. Default (\code{FALSE}) is to not filter, i.e., retain 
+all invalid rows as well.}
 
 \item{chromosomes}{Argument to \code{read_bam}. If specified only reads 
 from those chromosomes will be loaded.}
@@ -48,7 +42,7 @@ Default is \code{FALSE}.}
 }
 \value{
 An object of class \code{gtf}, \code{gff}, \code{bed} or \code{bam}, 
-corresponding to the input file format, that inherits from \code{data.table}.
+corresponding to the input file format, that inherits from \code{GRanges}.
 }
 \description{
 \code{read_gtf}, \code{read_gff}, \code{read_bed} and 
@@ -58,26 +52,6 @@ formats.
 \code{read_format} is a top-level convenience function that detects and 
 reads automatically from the extension and is sufficient in most cases.
 }
-\details{
-Note that \code{gff1} uses \code{group} instead of 
-\code{attributes} as the column name, but \code{gread} always names it 
-as \code{attributes}. Similarly the first three columns of \code{bed} 
-format are named \code{seqname, start, end} instead of \code{chrom, 
-chromStart, chromEnd} for consistency.
-
-The argument \code{tidy_cols} (\code{TRUE} by default) tidies up the 
-\code{attributes} column in case of \code{gtf/gff} format files. The 
-\code{attributes} column itself is removed since it is tidied up into 
-multiple columns. If this is not desirable, use \code{tidy_cols = FALSE} to 
-load the file \emph{as-is} and then use the \code{tidy_cols} function with 
-\code{remove_cols = NULL}.
-
-In case of \code{gff} format, when `tidy_cols=TRUE`, generation of columns 
-\code{"transcript_id"} and \code{"gene_id"} will be attempted as these 
-columns are essential in most cases for downstream analyses. If possible, 
-the columns \code{"transcript_name"} and \code{"gene_name"} will also be 
-created.
-}
 \examples{
 path <- system.file("tests", package="gread")
 gff_file <- file.path(path, "sample.gff")
@@ -94,14 +68,12 @@ read_bed(bed_file)    # same as above
 read_format(bam_file) # read bam
 read_bam(bam_file)    # same as above
 
-read_format(gtf_file, tidy_cols=FALSE) # load as is, don't tidy
-
 gtf_filter_file <- file.path(path, "sample_filter.gtf")
 read_format(gtf_filter_file, filter=TRUE) # filter invalid rows
 read_gtf(gtf_filter_file, filter=TRUE)    # same as above
 }
 \seealso{
-\code{\link{supported_formats}} \code{\link{tidy_cols}} 
+\code{\link{supported_formats}} 
 \code{\link{extract}} \code{\link{construct_introns}}
 }
 \keyword{file}
diff --git a/man/reduce_overlaps.Rd b/man/reduce_overlaps.Rd
index 2245d5e..42ccdce 100644
--- a/man/reduce_overlaps.Rd
+++ b/man/reduce_overlaps.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/internal-funs.R
 \name{reduce_overlaps}
 \alias{reduce_overlaps}
 \title{Compute reduced ranges on a gtf/gff/bed/bam object.}
diff --git a/man/shallow.Rd b/man/shallow.Rd
index 46b698a..034152b 100644
--- a/man/shallow.Rd
+++ b/man/shallow.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/internal-funs.R
 \name{shallow}
 \alias{shallow}
 \title{shallow copy a \code{data.table}}
diff --git a/man/strictly_nonunique.Rd b/man/strictly_nonunique.Rd
index e36db5a..6613ff3 100644
--- a/man/strictly_nonunique.Rd
+++ b/man/strictly_nonunique.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_granges.R
+% Please edit documentation in R/internal-funs.R
 \name{strictly_nonunique}
 \alias{strictly_nonunique}
 \title{Return only those rows where rows per group is > 1.}
diff --git a/man/supported_formats.Rd b/man/supported_formats.Rd
index 8b3be0b..c0736f0 100644
--- a/man/supported_formats.Rd
+++ b/man/supported_formats.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/supported_formats.R
+% Please edit documentation in R/supported-formats.R
 \name{supported_formats}
 \alias{supported_formats}
 \title{Currently supported formats}
@@ -16,6 +16,6 @@ Maintains a vector of all the formats supported currently.
 supported_formats()
 }
 \seealso{
-\code{\link{read_format}} \code{\link{tidy_cols}}
+\code{\link{read_format}}
 }
 
diff --git a/man/test_gread.Rd b/man/test_gread.Rd
index 1f15399..ce6c140 100644
--- a/man/test_gread.Rd
+++ b/man/test_gread.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/test_gread.R
+% Please edit documentation in R/test-gread.R
 \name{test_gread}
 \alias{test_gread}
 \title{Run a set of tests}
@@ -38,7 +38,6 @@ gread:::test_gread()
 }
 }
 \seealso{
-\code{\link{read_format}} \code{\link{extract}} 
-\code{\link{tidy_cols}}
+\code{\link{read_format}} \code{\link{extract}}
 }
 
diff --git a/man/tidy_cols.Rd b/man/tidy_cols.Rd
deleted file mode 100644
index e05cb55..0000000
--- a/man/tidy_cols.Rd
+++ /dev/null
@@ -1,85 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/tidy_format.R
-\name{tidy_cols}
-\alias{tidy_cols}
-\alias{tidy_cols.bam}
-\alias{tidy_cols.bed}
-\alias{tidy_cols.default}
-\alias{tidy_cols.gff}
-\alias{tidy_cols.gtf}
-\title{Tidy up gtf/gff/bed/bam objects}
-\usage{
-tidy_cols(x, ...)
-
-\method{tidy_cols}{default}(x, ...)
-
-\method{tidy_cols}{gtf}(x, remove_cols = "attributes", verbose = FALSE, ...)
-
-\method{tidy_cols}{gff}(x, remove_cols = "attributes", verbose = FALSE, ...)
-
-\method{tidy_cols}{bed}(x, remove_cols = NULL, verbose = FALSE, ...)
-
-\method{tidy_cols}{bam}(x, remove_cols = NULL, verbose = FALSE, ...)
-}
-\arguments{
-\item{x}{Input object, of class \code{gtf}, \code{gff}, \code{bed} or 
-\code{bam}.}
-
-\item{...}{Arguments that are ignored at the moment.}
-
-\item{remove_cols}{A character vector of column names to be removed.}
-
-\item{verbose}{Logical. Default is \code{FALSE}. If \code{TRUE}, helpful 
-status messages are printed on to the console.}
-}
-\value{
-A tidied object of class \code{gtf}, \code{gff}, \code{bed} or 
-\code{bam}, that inherits from \code{data.table}.
-}
-\description{
-Tidy up data depending on the type of input format. 
-
-Note that this operation is performed \emph{by reference}, to avoid 
-unnecessary copies, for efficiency. Therefore there is no need to assign 
-the result back to a variable.
-}
-\details{
-In case of \code{gtf} and \code{gff} formats, the 
-\code{attributes} column is extracted into separate columns after which 
-it is removed (by default). Set \code{remove_cols} to \code{NULL} if you 
-would like to retain the column even after tidying up.
-
-In case of \code{bed} and \code{bam} files, no columns are removed by 
-default. However, you can use \code{remove_cols} argument if necessary.
-
-In case of \code{bam} files, the function \code{GAlignments} from the 
-\code{GenomicAlignments} Bioconductor package is used to read in the 
-file, with the additional arguments for \code{ScanBamParam()} function 
-using \code{NM} and \code{MD} tags by default.
-}
-\examples{
-path <- system.file("tests", package="gread")
-gff_file <- file.path(path, "sample.gff")
-gtf_file <- file.path(path, "sample.gtf")
-bed_file <- file.path(path, "sample.bed")
-bam_file <- file.path(path, "sample.bam")
-
-gtf <- read_format(gtf_file, tidy_cols=FALSE)
-gff <- read_format(gtf_file, tidy_cols=FALSE)
-bed <- read_format(bed_file, tidy_cols=FALSE)
-bam <- read_format(bam_file, tidy_cols=FALSE)
-
-tidy_cols(gtf, remove_cols=NULL)[]          # tidy attr. col, but not remove
-tidy_cols(gff, remove_cols=NULL)[]          # same as above, but for gff
-tidy_cols(gtf, remove_cols="attributes")[]  # tidy & remove attr. col
-tidy_cols(gff, remove_cols="attributes")[]  # same as above, but for gff
-
-tidy_cols(bed, remove_cols="name")[]        # remove name column
-tidy_cols(bam, remove_cols=c("NM", "MD"))[] # remove additional loaded tags 
-}
-\seealso{
-\code{\link{supported_formats}} \code{\link{read_format}} 
-\code{\link{extract}} \code{\link{construct_introns}} 
-\code{\link{as_granges}}
-}
-