From 94b397f2a4906379f096d621fa449a9d0aefb929 Mon Sep 17 00:00:00 2001 From: asrinivasan-oa Date: Mon, 25 Jul 2016 13:21:36 +0200 Subject: [PATCH] All inputs and outputs of exported functions are Ranges objects now. --- DESCRIPTION | 4 +- NAMESPACE | 9 - R/as-granges.R | 42 +++ ...onstruct_introns.R => construct-introns.R} | 29 +- R/{extract_features.R => extract-features.R} | 77 +++-- R/gread-format.R | 137 ++++++++ R/gread.R | 1 - R/{as_granges.R => internal-funs.R} | 84 ++--- R/intersect-bed.R | 22 +- R/{non_overlaps.R => non-overlaps.R} | 40 ++- R/read-format.R | 148 +++++++++ R/read_format.R | 311 ------------------ R/s4-setup.R | 7 + ...upported_formats.R => supported-formats.R} | 2 +- R/{test_gread.R => test-gread.R} | 3 +- R/{tidy_format.R => tidy-format.R} | 65 +--- inst/tests/tests.Rraw | 215 ++++++------ man/as_bam.Rd | 2 +- man/as_data_table.Rd | 28 ++ man/as_granges.Rd | 11 +- man/construct_introns.Rd | 9 +- man/disjoin_overlaps.Rd | 2 +- man/extract.Rd | 12 +- man/find_overlaps.Rd | 2 +- man/intersect_bed.Rd | 5 +- man/intersect_overlaps.Rd | 2 +- man/non_overlaps.Rd | 6 +- man/read_format.Rd | 52 +-- man/reduce_overlaps.Rd | 2 +- man/shallow.Rd | 2 +- man/strictly_nonunique.Rd | 2 +- man/supported_formats.Rd | 4 +- man/test_gread.Rd | 5 +- man/tidy_cols.Rd | 85 ----- 34 files changed, 641 insertions(+), 786 deletions(-) create mode 100644 R/as-granges.R rename R/{construct_introns.R => construct-introns.R} (76%) rename R/{extract_features.R => extract-features.R} (86%) create mode 100644 R/gread-format.R rename R/{as_granges.R => internal-funs.R} (80%) rename R/{non_overlaps.R => non-overlaps.R} (85%) create mode 100644 R/read-format.R delete mode 100644 R/read_format.R create mode 100644 R/s4-setup.R rename R/{supported_formats.R => supported-formats.R} (87%) rename R/{test_gread.R => test-gread.R} (99%) rename R/{tidy_format.R => tidy-format.R} (69%) create mode 100644 man/as_data_table.Rd delete mode 100644 man/tidy_cols.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 647b0f1..9a4d3a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,7 +8,8 @@ Authors@R: c( c("aut", "cre")), person("Open Analytics", role = "cph")) Maintainer: Arunkumar Srinivasan Depends: - R (>= 3.3) + R (>= 3.3), + GenomicRanges Imports: stats, utils, @@ -20,7 +21,6 @@ Imports: Rsamtools, rtracklayer, IRanges, - GenomicRanges, GenomicFeatures, GenomicAlignments Suggests: diff --git a/NAMESPACE b/NAMESPACE index 58d1f3d..79408ac 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,5 @@ # Generated by roxygen2: do not edit by hand -S3method(tidy_cols,bam) -S3method(tidy_cols,bed) -S3method(tidy_cols,default) -S3method(tidy_cols,gff) -S3method(tidy_cols,gtf) -export(as_granges) export(construct_introns) export(extract) export(read_bam) @@ -14,7 +8,6 @@ export(read_format) export(read_gff) export(read_gtf) export(supported_formats) -export(tidy_cols) import(GenomicFeatures) import(data.table) import(methods) @@ -27,8 +20,6 @@ importFrom(GenomicAlignments,readGAlignments) importFrom(GenomicAlignments,rname) importFrom(GenomicAlignments,summarizeOverlaps) importFrom(GenomicAlignments,width) -importFrom(GenomicRanges,GRanges) -importFrom(GenomicRanges,split) importFrom(IRanges,IRanges) importFrom(Rsamtools,ScanBamParam) importFrom(Rsamtools,scanBamFlag) diff --git a/R/as-granges.R b/R/as-granges.R new file mode 100644 index 0000000..525aa77 --- /dev/null +++ b/R/as-granges.R @@ -0,0 +1,42 @@ +#' @title Convert data.table to GRanges object +#' +#' @description Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam} +#' or a valid \code{data.table} to a GRanges object. +#' +#' @param x An object of class \code{gtf}, \code{gff}, \code{bed} or +#' \code{bam} or a valid \code{data.table} object. +#' @param ignore_strand Logical argument to pass to \code{GRanges} function. +#' Indicates whether \code{strand} should be ignored when constructing +#' \code{GRanges} object or not. Default is \code{FALSE}. +#' @return A \code{GRanges} object. +#' @aliases as_granges +#' @examples +#' path <- system.file("tests", package="gread") +#' gff_file <- file.path(path, "sample.gff") +#' gtf_file <- file.path(path, "sample.gtf") +#' bed_file <- file.path(path, "sample.bed") +#' bam_file <- file.path(path, "sample.bam") +#' +#' gff <- read_format(gff_file) +#' gtf <- read_format(gtf_file) +#' bed <- read_format(bed_file) +#' bam <- read_format(bam_file) +#' +#' as_granges(gff) +#' as_granges(gtf) +#' as_granges(bed) +#' as_granges(bam) +#' +#' as_granges(gff, ignore_strand=FALSE) +#' as_granges(gtf, ignore_strand=FALSE) +#' as_granges(bed, ignore_strand=FALSE) +#' as_granges(bam, ignore_strand=FALSE) +#' @seealso \code{\link{read_format}} \code{\link{extract}} +#' \code{\link{construct_introns}} +as_granges <- function(x, ignore_strand=FALSE) { + + stopifnot(is.gtf(x)||is.gff(x)||is.bed(x)||is.bam(x)||is.data.table(x)) + x = shallow(x) + if (ignore_strand) x[, "strand" := NULL] + as(setDF(x), "GRanges") +} diff --git a/R/construct_introns.R b/R/construct-introns.R similarity index 76% rename from R/construct_introns.R rename to R/construct-introns.R index 8f2a27c..e24f981 100644 --- a/R/construct_introns.R +++ b/R/construct-introns.R @@ -1,4 +1,4 @@ -#' @title Construct introns from \code{gtf} or \code{gff} objects +#' @title Construct introns from gtf/gff objects #' #' @description This function generates intronic coordinates by extracting #' all the \code{exons} from a \code{gtf} or \code{gff} object. @@ -9,7 +9,8 @@ #' and returned invisibily, else just the \code{intron} coordinates are #' returned. #' @return An object of class \code{gtf/gff} with updated \code{intron} -#' coordinates or just the intron coordinates depending on \code{update}. +#' coordinates or just the intron coordinates depending on \code{update}. +#' They all inherit from \code{GRanges}. #' @export #' @examples #' path <- system.file("tests", package="gread") @@ -21,20 +22,21 @@ #' # same as above, but returns only constructed introns #' introns <- construct_introns(gtf, update=FALSE) #' @seealso \code{\link{supported_formats}} \code{\link{read_format}} -#' \code{\link{extract}} \code{\link{tidy_cols}} \code{\link{as_granges}} +#' \code{\link{extract}} \code{\link{as_granges}} construct_introns <- function(x, update=TRUE) { - stopifnot(inherits(x, "gtf") || inherits(x, "gff"), - "feature" %chin% names(x), + stopifnot(is.gtf(x) || is.gff(x)) + x = as_data_table(x) + stopifnot("feature" %chin% names(x), "transcript_id" %chin% names(x), update %in% c(TRUE, FALSE)) # to please R CMD CHECK - feature=seqname=transcript_id=NULL + feature=seqnames=transcript_id=NULL x_class = copy(class(x)) exons = x[feature == "exon"] if (!nrow(exons)) stop("feature == 'exon' returned 0 rows.") - setorderv(exons, c("transcript_id", "seqname", "start", "end", "strand")) - introns = exons[, .(seqname = seqname[1L], + setorderv(exons, c("transcript_id", "seqnames", "start", "end", "strand")) + introns = exons[, .(seqnames = seqnames[1L], start = end[seq_len(.N-1L)]+1L, end = start[seq_len(.N-1L)+1L]-1L, feature = "intron", @@ -45,7 +47,7 @@ construct_introns <- function(x, update=TRUE) { stop("Exons for transcript ids [", paste(ids, collaspse=" "), "] have start > end") exons[, c(setdiff(names(introns), "transcript_id")) := NULL] - exons = unique(exons, by="transcript_id") + exons = unique(shallow(exons, reset_class=TRUE), by="transcript_id") ecols = names(exons) introns[exons, (ecols) := mget(ecols), on="transcript_id"] # reset all exon related cols @@ -53,14 +55,15 @@ construct_introns <- function(x, update=TRUE) { colorder = names(x) if (update) { x = rbind(x, introns) - setorderv(x, c("seqname", "start", "end")) + setorderv(x, c("seqnames", "start", "end")) x = x[, .SD, by="transcript_id"] setattr(x, 'class', x_class) setcolorder(x, colorder) - return(x) + ans = x } else { setcolorder(introns, colorder) - setattr(introns, 'class', unique(c("intron", x_class))) - return(introns[]) + setattr(introns, 'class', c("intron", "data.table", "data.frame")) + ans = introns } + new(class(ans)[1L], as(setDF(ans), "GRanges")) } diff --git a/R/extract_features.R b/R/extract-features.R similarity index 86% rename from R/extract_features.R rename to R/extract-features.R index a790818..f7e83ed 100644 --- a/R/extract_features.R +++ b/R/extract-features.R @@ -6,7 +6,8 @@ #' @details Extract features based on various criteria (usually intended for #' obtaining read counts using \code{gcount} for a given \code{bam} file. #' -#' @param x Input object of class \code{gtf} or \code{gff}. +#' @param x Input object of class \code{gtf} or \code{gff} which inherits +#' from \code{GRanges}. #' @param feature A character vector of (usually related) features to extract #' from. One of \code{"gene_exon"}, \code{"gene"}, \code{"gene_intron"}, #' \code{"exon"}, \code{"intron"}. NB: \code{"exon"} feature must be present @@ -51,10 +52,9 @@ #' @return An object of class \code{"gene"} when \code{feature} is #' \code{"gene"}, \code{"gene_exon"} or \code{"gene_intron"}, and of class #' \code{"exon"} and \code{"intron"} when \code{feature} is \code{"exon"} or -#' \code{"intron"} respectively. -#' @seealso \code{\link{read_format}} \code{\link{tidy_cols}} -#' \code{\link{as_granges}} \code{\link{extract}} -#' \code{\link{construct_introns}} +#' \code{"intron"} respectively. They all inherit from \code{GRanges}. +#' @seealso \code{\link{read_format}} \code{\link{as_granges}} +#' \code{\link{extract}} \code{\link{construct_introns}} #' @export #' @examples #' path <- system.file("tests", package="gread") @@ -74,14 +74,18 @@ extract <- function(x, feature=c("gene_exon", "gene", "gene_intron", "exon", splits = unlist(strsplit(feature, "_", fixed=TRUE)) feature = splits[1L] subfeature = if (is.na(splits[2L])) NULL else splits[2L] - stopifnot(is.gtf(x) || is.gff(x), "feature" %in% names(x), - ignore_strand %in% c(FALSE, TRUE), transcript_id %in% names(x), - gene_id %in% names(x), nrow(x)>0L) - x = shallow(x) # TODO: use data.table:::shallow when exported + stopifnot(is.gtf(x) || is.gff(x)) + x <- as_data_table(x) + stopifnot("feature" %in% names(x), + ignore_strand %in% c(FALSE, TRUE), + transcript_id %in% names(x), + gene_id %in% names(x), nrow(x)>0L) setattr(x, 'class', c(match.arg(feature), class(x))) - setnames(x, c(transcript_id, gene_id), c("transcript_id", "gene_id")) - extract_feature(x, unique(x$feature), subfeature, + setnames(x, c(transcript_id, gene_id), + c("transcript_id", "gene_id")) + ans = extract_feature(x, unique(x$feature), subfeature, match.arg(type), ignore_strand, ...) + new(class(ans)[1L], as(setDF(ans), "GRanges")) } # ----- internal functions for extracting specific features ----- # @@ -99,24 +103,24 @@ extract_feature.gene <- function(x, uniq_features, feature, type, ignore_strand=FALSE, ...) { # to pleae R CMD CHECK - seqname=gene_id=transcript_id=NULL + seqnames=gene_id=transcript_id=NULL if (is.null(feature)) { if (!"exon" %in% uniq_features) stop("'exon' must be a feature in input object 'x'.") construct_transcripts <- function(x) { - x[feature %chin% "exon", .(seqname=seqname[1L], start=min(start), + x[feature %chin% "exon", .(seqnames=seqnames[1L], start=min(start), end=max(end), strand=strand[1L], gene_id=gene_id[1L]), by="transcript_id"] } transcripts = construct_transcripts(x) } else { - cols=c("transcript_id", "seqname", "start", "end", "strand", "gene_id") + cols=c("transcript_id", "seqnames", "start", "end", "strand", "gene_id") .feature = feature transcripts = x[feature %chin% .feature, cols, with=FALSE] } ans = switch (type, default = { - genes = transcripts[, .(seqname=seqname[1L], + genes = transcripts[, .(seqnames=seqnames[1L], start=min(start), end=max(end), strand=strand[1L], transcript_id=paste(unique(transcript_id), collapse=";")), by="gene_id"] @@ -149,7 +153,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type, }, intersect = { genes = transcripts[, if (max(start) <= min(end)) - .(seqname=seqname[1L], start=max(start), + .(seqnames=seqnames[1L], start=max(start), end=min(end), strand=strand[1L], transcript_id=paste(unique(transcript_id), collapse=";")), by="gene_id"] @@ -176,7 +180,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type, collapse=";")), by="queryHits"] genes[, "overlaps":=gene_id][olaps$queryHits, "overlaps":=olaps$gene_id] - colorder = c("seqname", "start", "end", "length", "strand", + colorder = c("seqnames", "start", "end", "length", "strand", "transcript_id", "gene_id", "overlaps") setcolorder(genes, colorder) genes[] @@ -188,7 +192,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type, # to please R CMD CHECK transcript_id=NULL - cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id") + cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id") .feature = if (is.null(feature)) "exon" else feature exons = x[feature %chin% .feature, cols, with=FALSE] ans = switch (type, @@ -258,7 +262,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type, # ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" := # olaps$gene_id] - colorder = c("seqname", "start", "end", "length", "strand", + colorder = c("seqnames", "start", "end", "length", "strand", "transcript_id", "gene_id") #, "overlaps") setcolorder(exons, colorder) exons[] @@ -269,14 +273,16 @@ extract_feature.intron <- function(x, uniq_features, feature, type, stopifnot("exon" %in% uniq_features) # to please R CMD CHECK transcript_id=NULL - cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id") + cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id") .feature = if (is.null(feature)) "intron" else feature introns = x[feature %chin% .feature, cols, with=FALSE] if (nrow(introns) == 0L) { # most likely the object doesn't have introns, so construct them warning("No introns found in input object. Attempting to construct introns using construct_introns().") - introns = construct_introns(x, update=FALSE) + xx = new(class(x)[2L], as_granges(x)) + introns = as_data_table(construct_introns(xx, update=FALSE)) + setattr(introns, 'class', data.table::copy(class(x))) } if (nrow(introns) == 0L) stop("construct_introns() resulted in 0 introns as well. @@ -348,15 +354,40 @@ extract_feature.intron <- function(x, uniq_features, feature, type, # ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" := # olaps$gene_id] - colorder = c("seqname", "start", "end", "length", "strand", + colorder = c("seqnames", "start", "end", "length", "strand", "transcript_id", "gene_id") #, "overlaps") setcolorder(introns, colorder) introns[] } -# ----- Helper functions for extract_features ----- # +# ----- Helper functions for extract_features -------------------------------- is.gtf <- function(x) inherits(x, 'gtf') is.gff <- function(x) inherits(x, 'gff') is.bed <- function(x) inherits(x, 'bed') is.bam <- function(x) inherits(x, 'bam') + +#' @title Convert gtf/gff/bam/bed S4 objects to data.table and preserve class +#' +#' @description \code{as.data.table} converts the input object to +#' \code{data.table}, but strips away all other classes except +#' \code{data.table} and \code{data.frame}. +#' +#' This internal function is a simple wrapper to \code{as.data.table} but +#' also retains the extra class information of the input object. +#' @aliases as_data_table +#' @return A data.table object with input class preserved. +#' @examples +#' \dontrun{ +#' setClass("foo", contains="GRanges") +#' x = new("foo", GRanges(letters[1:5], IRanges(1:5, 6:10))) +#' class(x) # [1] "foo" +#' class(gread:::as_data_table(x)) # [1] "foo" "data.table" "data.frame" +#' } +as_data_table <- function(x, reset_class=FALSE) { + stopifnot(inherits(x, "GRanges")) + cl = if (identical(class(x), "GRanges") || reset_class) NULL else class(x) + ans = as.data.table(x) + setattr(ans, 'class', c(cl, "data.table", "data.frame")) + ans[] +} diff --git a/R/gread-format.R b/R/gread-format.R new file mode 100644 index 0000000..59b9fff --- /dev/null +++ b/R/gread-format.R @@ -0,0 +1,137 @@ +gread <- function(token) { + UseMethod("gread") +} + +gread.default <- function(token) { + stop("No method available for format ", gsub("(.*)_format$", + "", class(token))) +} + +gread.gtf_format <- function(token) { + # to please R CMD CHECK + score=phase=NULL + # rtracklayer::readGFF reads gff v1 'attributes' column as 'group', + # but we want it to be always 'attributes'. Also, it tidies up the + # attributes column by default + rtracklayer_fun <- function() { + ans = setDT(rtracklayer::readGFF(token$file, version = 2L, + columns = rtracklayer::GFFcolnames())) + ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][] + # remove additional attributes + setattr(ans, 'pragmas', NULL) + setattr(ans, 'attrcol_fmt', NULL) + setattr(ans, 'ncol0', NULL) + setattr(ans, 'ntag', NULL) + setattr(ans, 'raw_data', NULL) + ans + } + fread_fun <- function() fread(token$file, colClasses = token$types, + showProgress = token$verbose) + read_table_fun <- function() read_table(token$file, sep = "\t", + header = FALSE, comment.char = "#", nrows = -1L, colClasses = + token$types, quote = "") + ans = tryCatch(rtracklayer_fun(), error = function(o) { + if (token$verbose) cat("rtracklayer::readGFF failed to read", + " the gtf file. Reverting to data.table::fread.\n", sep="") + tryCatch(fread_fun(), error = function(o) { + if (token$verbose) cat("data.table::fread failed to read", + " as well. Reverting to base::read.table.\n", sep="") + read_table_fun() + }) + }) + setnames(ans, head(names(ans), 9L), token$names) + if (is.character(ans[["score"]])) + ans[, "score" := suppressWarnings(as.numeric(score))] + if (is.character(ans[["phase"]])) + ans[, "phase" := suppressWarnings(as.integer(phase))] + setattr(ans, 'class', c("gtf", "data.table", "data.frame")) +} + +gread.gff_format <- function(token) { + # to please R CMD CHECK + score=phase=NULL + # rtracklayer::readGFF reads gff v1 'attributes' column as 'group', + # but we want it to be always 'attributes'. Also, it tidies up the + # attributes column by default + rtracklayer_fun <- function() { + ans = setDT(as.data.frame(rtracklayer::readGFF(token$file, columns = + rtracklayer::GFFcolnames()))) + if (!token$tidy_cols) set(ans, j = tail(names(ans), -9L), value = NULL) + ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][] + list_cols = which(vapply(ans, is.list, TRUE)) + if (length(list_cols)) { + list_paste <- function(x) sapply(x, paste, collapse=",") + # paste all values of list cols together + ans[, (list_cols) := lapply(.SD, list_paste), .SDcols=list_cols] + } + # as.data.frame drops all extra attributes + ans + } + fread_fun <- function() fread(token$file, colClasses = token$types, + showProgress = token$verbose) + read_table_fun <- function() read_table(token$file, sep = "\t", + header = FALSE, comment.char = "#", nrows = -1L, colClasses = + token$types, quote = "") + ans = tryCatch(rtracklayer_fun(), error = function(o) { + if (token$verbose) cat("rtracklayer::readGFF failed to read", + " the gtf file. Reverting to data.table::fread.\n", sep="") + tryCatch(fread_fun(), error = function(o) { + if (token$verbose) cat("data.table::fread failed to read", + " as well. Reverting to base::read.table.\n", sep="") + read_table_fun() + }) + }) + setnames(ans, head(names(ans), 9L), token$names) + if (is.character(ans[["score"]])) + ans[, "score" := suppressWarnings(as.numeric(score))] + if (is.character(ans[["phase"]])) + ans[, "phase" := suppressWarnings(as.integer(phase))] + setattr(ans, 'class', c("gff", "data.table", "data.frame")) +} + +gread.bed_format <- function(token) { + ans = tryCatch(fread(token$file, colClasses = token$types, sep="\t", + showProgress = token$verbose), error = function(o) { + if (token$verbose) cat("data.table::fread failed to read", + " the bed file. Reverting to 'read.table'", + " from base R.\n", sep="") + read_table(token$file, sep = "\t", header = FALSE, + comment.char = "#", nrows = -1L, colClasses = + token$types, quote = "") + }) + # setnames will error if bed file has < 3 cols + setnames(ans, head(token$names, max(3L, ncol(ans)))) + # ans[, start := start+1L] # TODO: should we automatically set start+1L? + setattr(ans, 'class', c("bed", "data.table", "data.frame")) +} + +gread.bam_format <- function(token) { + if (token$verbose) + cat("Loading bam file: ",token$file,", please wait...\n", sep="") + stopifnot(length(token$file)==1L) + + what = c("flag") + flags = Rsamtools::scanBamFlag(isUnmappedQuery=FALSE) + if (!length(token$chromosomes)) { + param = Rsamtools::ScanBamParam(what=what, tag=token$tags, flag=flags) + } else { + hdr = Rsamtools::scanBamHeader(token$file)[[1]]$targets + hdr = hdr[names(hdr) %in% token$chromosomes] + if (!all(names(hdr) %in% token$chromosomes)) { + stop("chromosomes [", paste(setdiff(token$chromosomes, + names(hdr)), collapse=", "), "] not found in bam file.") + } + which = as_granges(data.table(seqnames = names(hdr), + start=1L, end = hdr, strand="*"), ignore_strand=TRUE) + param = Rsamtools::ScanBamParam(what = what, which = which, + tag = token$tags, flag = flags) + } + # load bam with corresponding param + ans = as_bam(GenomicAlignments::readGAlignments(token$file, param=param)) + setattr(ans, 'class', c("bam", "data.table", "data.frame")) +} + +# Internal helper functions -------------------------------------------------- +read_table <- function(file, ...) { + setDT(read.table(file, stringsAsFactors=FALSE, as.is=TRUE, ...)) +} diff --git a/R/gread.R b/R/gread.R index 8b02969..415feee 100644 --- a/R/gread.R +++ b/R/gread.R @@ -38,7 +38,6 @@ #' @importFrom utils capture.output head packageVersion read.table tail #' @importFrom stats na.omit #' @importFrom Rsamtools ScanBamParam scanBamFlag scanBamHeader -#' @importFrom GenomicRanges GRanges split #' @importFrom IRanges IRanges #' @importMethodsFrom GenomicRanges disjoin reduce intersect findOverlaps #' @importMethodsFrom GenomicRanges countOverlaps seqnames start end strand diff --git a/R/as_granges.R b/R/internal-funs.R similarity index 80% rename from R/as_granges.R rename to R/internal-funs.R index 7786fa7..bdcb527 100644 --- a/R/as_granges.R +++ b/R/internal-funs.R @@ -1,49 +1,4 @@ -## internal functions --------------------------------- - -#' @title Convert to GRanges object -#' -#' @description Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam} -#' or a valid \code{data.table} to a GRanges object, preserving all -#' additional columns. -#' -#' @param x An object of class \code{gtf}, \code{gff}, \code{bed} or -#' \code{bam} or a valid \code{data.table} object. -#' @param ignore_strand Logical argument to pass to \code{GRanges} function. -#' Indicates whether \code{strand} should be ignored when constructing -#' \code{GRanges} object or not. Default is \code{FALSE}. -#' @return A \code{GRanges} object. -#' @export -#' @aliases as_granges -#' @examples -#' path <- system.file("tests", package="gread") -#' gff_file <- file.path(path, "sample.gff") -#' gtf_file <- file.path(path, "sample.gtf") -#' bed_file <- file.path(path, "sample.bed") -#' bam_file <- file.path(path, "sample.bam") -#' -#' gff <- read_format(gff_file) -#' gtf <- read_format(gtf_file) -#' bed <- read_format(bed_file) -#' bam <- read_format(bam_file) -#' -#' as_granges(gff) -#' as_granges(gtf) -#' as_granges(bed) -#' as_granges(bam) -#' -#' as_granges(gff, ignore_strand=FALSE) -#' as_granges(gtf, ignore_strand=FALSE) -#' as_granges(bed, ignore_strand=FALSE) -#' as_granges(bam, ignore_strand=FALSE) -#' @seealso \code{\link{read_format}} -#' \code{\link{tidy_cols}} \code{\link{extract}} \code{\link{construct_introns}} -as_granges <- function(x, ignore_strand=FALSE) { - - stopifnot(is.gtf(x)||is.gff(x)||is.bed(x)||is.bam(x)||is.data.table(x)) - x = shallow(x) - if (ignore_strand) x[, "strand" := NULL] - as(setDF(x), "GRanges") -} +## internal helper functions ------------------------------------------------- #' @title Return only those rows where rows per group is > 1. #' @@ -122,7 +77,7 @@ find_overlaps <- function(x, y, ignore_redundant=FALSE, #' @return A \code{data.table} with reduced ranges. reduce_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) { # to please R CMD CHECK - group=seqname=i.strand=`..N..`=NULL + group=seqnames=i.strand=`..N..`=NULL stopifnot(length(by) == 1L, by %in% names(x), is.gtf(x) || is.gff(x) || is.bed(x) || is.bam(x), ignore_strand %in% c(FALSE, TRUE)) red = as.data.frame(GenomicRanges::reduce(GenomicRanges::split(as_granges( @@ -130,11 +85,10 @@ reduce_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) { setDT(red)[, group := NULL] setcolorder(red, c("seqnames", "start", "end", "width", "strand", "group_name")) - setnames(red, c("seqnames", "width", "group_name"), c("seqname", - "length", by)) - red[, `:=`(seqname=as.character(seqname), strand=as.character(strand))] + setnames(red, c("width", "group_name"), c("length", by)) + red[, `:=`(seqnames=as.character(seqnames), strand=as.character(strand))] # restore original order, nomatch = "errror" would be great to have here! - ux = unique(shallow(x, c(by, "strand")), by=c(by)) + ux = unique(shallow(x, c(by, "strand"), reset_class=TRUE), by=c(by)) red = red[ux, on=c(by)] # if ignore_strand, replace strand if (ignore_strand) red[, strand := i.strand] @@ -162,18 +116,17 @@ disjoin_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) { is.bed(x) || is.bam(x), ignore_strand %in% c(FALSE, TRUE)) # to please R CMD CHECK - group=seqname=i.strand=NULL + group=seqnames=i.strand=NULL dj = as.data.frame(GenomicRanges::disjoin(GenomicRanges::split(as_granges( x, ignore_strand), x[[by]]))) setDT(dj)[, group := NULL] setcolorder(dj, c("seqnames", "start", "end", "width", "strand", "group_name")) - setnames(dj, c("seqnames", "width", "group_name"), c("seqname", - "length", by)) - dj[, `:=`(seqname=as.character(seqname), strand=as.character(strand))] + setnames(dj, c("width", "group_name"), c("length", by)) + dj[, `:=`(seqnames=as.character(seqnames), strand=as.character(strand))] # restore original order # TODO: revisit when nomatch = "error" is implemented - ux = unique(shallow(x, c(by, "strand")), by=c(by)) + ux = unique(shallow(x, c(by, "strand"), reset_class=TRUE), by=c(by)) dj = dj[ux, on=c(by)] # if ignore_strand, replace strand if (ignore_strand) dj[, strand := i.strand] @@ -197,7 +150,7 @@ disjoin_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) { #' @return A \code{data.table} with intersecting ranges. intersect_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) { # to please R CMD CHECK - group=seqname=i.strand=NULL + group=seqnames=i.strand=NULL stopifnot(length(by) == 1L, by %in% names(x), is.gtf(x) || is.gff(x) || is.bed(x) || is.bam(x), ignore_strand %in% c(FALSE, TRUE)) s_gr = GenomicRanges::split(as_granges(x, ignore_strand), x[[by]]) @@ -205,11 +158,10 @@ intersect_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) { setDT(isect)[, group := NULL] setcolorder(isect, c("seqnames", "start", "end", "width", "strand", "group_name")) - setnames(isect, c("seqnames", "width", "group_name"), c("seqname", - "length", by)) - isect[, `:=`(seqname=as.character(seqname), strand=as.character(strand))] + setnames(isect, c("width", "group_name"), c("length", by)) + isect[, `:=`(seqnames=as.character(seqnames), strand=as.character(strand))] # restore original order, nomatch = "errror" would be great to have here! - ux = unique(shallow(x, c(by, "strand")), by=c(by)) + ux = unique(shallow(x, c(by, "strand"), reset_class=TRUE), by=c(by)) isect = isect[ux, on=c(by)] # if ignore_strand, replace strand if (ignore_strand) isect[, strand := i.strand] @@ -236,14 +188,16 @@ intersect_overlaps <- function(x, by="gene_id", ignore_strand=FALSE) { #' y <- gread:::shallow(x) # only copies column pointers #' class(y) # class(x) is retained #' } -shallow <- function(x, cols = names(x)) { +shallow <- function(x, cols = names(x), reset_class = FALSE) { stopifnot(is.data.table(x), all(cols %in% names(x))) ans = vector("list", length(cols)) setattr(ans, 'names', data.table::copy(cols)) for (col in cols) ans[[col]] = x[[col]] setDT(ans) - setattr(ans, 'class', data.table::copy(class(x))) + class = if (!reset_class) data.table::copy(class(x)) + else c("data.table", "data.frame") + setattr(ans, 'class', class) ans[] } @@ -272,7 +226,7 @@ shallow <- function(x, cols = names(x)) { as_bam <- function(x) { stopifnot(inherits(x, "GAlignments")) ans = list( - seqname=as.character(seqnames(x)), + seqnames=as.character(seqnames(x)), start=start(x), end=end(x), strand=as.character(strand(x)), @@ -306,7 +260,7 @@ as_bam <- function(x) { # #' @return A \code{TxDb} object. # #' @aliases as_txdb # #' @seealso \code{\link{as_granges}} \code{\link{read_format}} -# #' \code{\link{tidy_cols}} \code{\link{extract}} +# #' \code{\link{extract}} # #' @export # #' @examples # #' path <- system.file("tests", package="gread") diff --git a/R/intersect-bed.R b/R/intersect-bed.R index 6bb9dc9..759bb14 100644 --- a/R/intersect-bed.R +++ b/R/intersect-bed.R @@ -7,7 +7,7 @@ #' #' @param gtf_file Full path to a \code{gtf} file. #' @param bed_file Full path to a \code{bed} file. The file must contain at -#' least three columns, with them corresponding to \code{seqname} (or +#' least three columns, with them corresponding to \code{seqnames} (or #' \code{chr}), \code{start} and \code{end}. #' @param select_features A 1-column \code{data.table} or a named list of #' length=1. The name of the list indicates the column in the \code{gtf} file @@ -15,16 +15,15 @@ #' containing the values to \emph{retain}. #' @seealso \code{\link{read_format}}, \code{\link{non_overlaps}}, #' \code{\link{construct_introns}}, \code{\link{extract}} -#' \code{\link{tidy_cols}} -#' @return A \code{gtf} object containing just those transcripts that overlap -#' with the provided bed file. +#' @return A \code{gtf} object that inherits from \code{GRanges} containing +#' just those transcripts that overlap with the provided bed file. intersect_bed <- function(gtf_file, bed_file, select_features) { # to please R CMD CHECK - feature=seqname=transcript_id=NULL - # load seqname, start and end columns + feature=seqnames=transcript_id=NULL + # load seqnames, start and end columns bed = read_format(bed_file) - setnames(bed, names(bed)[1:3], c("seqname", "start", "end")) + setnames(bed, names(bed)[1:3], c("seqnames", "start", "end")) bed[, "start" := start+1L] setkey(bed) @@ -33,8 +32,8 @@ intersect_bed <- function(gtf_file, bed_file, select_features) { select_features = unique(as.data.table(select_features)) gtf = gtf[select_features, on=names(select_features)[1L], nomatch=0L] } - bed_chrs = unique(bed$seqname) - gtf_chrs = unique(gtf$seqname) + bed_chrs = unique(bed$seqnames) + gtf_chrs = unique(gtf$seqnames) if (length(diff_chrs <- setdiff(bed_chrs, gtf_chrs))) { if (length(diff_chrs) == length(bed_chrs)) stop("Chromosomes in bed file are completely different from @@ -45,7 +44,7 @@ intersect_bed <- function(gtf_file, bed_file, select_features) { construct_transcripts <- function(x) { x[feature %chin% "exon", - .(seqname = seqname[1L], start = min(start), + .(seqnames = seqnames[1L], start = min(start), end = max(end), strand = strand[1L]), by = "transcript_id"] } @@ -53,5 +52,6 @@ intersect_bed <- function(gtf_file, bed_file, select_features) { olaps = foverlaps(transcripts, bed, type="any", which=TRUE, nomatch=0L) tids = transcripts[olaps$xid, unique(transcript_id)] - gtf[.(tids), on="transcript_id", nomatch=0L] + ans = gtf[.(tids), on="transcript_id", nomatch=0L] + new("gtf", as(setDF(ans), "GRanges")) } diff --git a/R/non_overlaps.R b/R/non-overlaps.R similarity index 85% rename from R/non_overlaps.R rename to R/non-overlaps.R index 46221dd..8aa5b6c 100644 --- a/R/non_overlaps.R +++ b/R/non-overlaps.R @@ -14,8 +14,8 @@ #' features \code{intron} and \code{exon}. #' @param transcript_id Column name in \code{x} corresponding to transcript id. #' @param gene_id Column name in \code{x} corresponding to gene id. -#' @return Returns a list of data.table containing \emph{non overlapping -#' exons} and \emph{non overlapping introns}. +#' @return Returns a list of \code{GRanges} objects containing +#' \emph{non overlapping exons} and \emph{non overlapping introns}. #' @examples #' \dontrun{ #' path <- system.file("tests", package="gread") @@ -25,13 +25,19 @@ #' } non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){ - # to please R CMD CHECK - feature=idx=epos=ecnt=keep=edge=check=V1=seqname=pos=e.gid=i.gid=N=NULL # taken from gffutils::non_overlaps - stopifnot(is.gtf(x) || is.gff(x), "feature" %chin% names(x), - transcript_id %chin% names(x), gene_id %chin% names(x)) + stopifnot(is.gtf(x) || is.gff(x)) + x = as_data_table(x) + stopifnot("feature" %chin% names(x), + transcript_id %chin% names(x), + gene_id %chin% names(x)) + # to please R CMD CHECK + feature=idx=epos=ecnt=keep=edge=check=V1=seqnames=pos=e.gid=i.gid=N=NULL uniq_features = unique(x[["feature"]]) - if (!"intron" %in% uniq_features) x = construct_introns(x) + if (!"intron" %in% uniq_features) { + xx = new(class(x)[1L], as_granges(x)) + x = as_data_table(construct_introns(xx)) + } uniq_features = unique(x[["feature"]]) stopifnot(all(c("intron", "exon") %in% uniq_features)) i = x[feature == "intron"] @@ -104,21 +110,21 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){ # meaning there is a smaller intron at the same position for this gene # that we've already accounted for ni = i[!a_minus_w$queryHits] - setkey(ni, "seqname", "start", "end") + setkey(ni, "seqnames", "start", "end") tmp__ = ni[[transcript_id]] ni[, c(transcript_id) := as.list(tmp__)] - ni = ni[, c("seqname", "start", "end", "strand", + ni = ni[, c("seqnames", "start", "end", "strand", transcript_id, gene_id), with=FALSE] - count = ni[, .N, by = list(seqname, start, end)] + count = ni[, .N, by = list(seqnames, start, end)] # code here is moved to the end... July 25 (due to overlapping genes # interpretation difference between intron and exon) # non_overlapping exon # edited May 30th 2013, redid the whole thing: July 17 2013 # data.table ABSOLUTELY ROCKS! - ne = e[, list(seqname=seqname[1L], pos=c(start, end)), by=c(gene_id)] + ne = e[, list(seqnames=seqnames[1L], pos=c(start, end)), by=c(gene_id)] setkey(ne) - ne = ne[, list(seqname=seqname[1L], start=pos[1:(.N-1)], end=pos[2:.N]), + ne = ne[, list(seqnames=seqnames[1L], start=pos[1:(.N-1)], end=pos[2:.N]), by=c(gene_id)] nolaps = find_overlaps(ne, ni, type = "any") @@ -128,10 +134,10 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){ ne = ne[!unique(nolaps$queryHits)] ne = ne[start != end] - ne = ne[, list(seqname=seqname[1L], pos=c(start, end)), by=c(gene_id)] + ne = ne[, list(seqnames=seqnames[1L], pos=c(start, end)), by=c(gene_id)] setkeyv(ne, c(gene_id, "pos")) ne = ne[ne[, .N, by=key(ne)][N == 1]][, N := NULL] - ne = ne[, list(seqname=seqname[1L], start=pos[c(TRUE, FALSE)], + ne = ne[, list(seqnames=seqnames[1L], start=pos[c(TRUE, FALSE)], end=pos[c(FALSE, TRUE)]), by=c(gene_id)] setkeyv(ne, c(gene_id)) @@ -142,7 +148,7 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){ setnames(ne.rest, "transcript_id", transcript_id) ne = ne[ne.rest] setcolorder(ne, names(ni)) - setkey(ne, seqname, start, end) + setkey(ne, seqnames, start, end) # now get overlapping intron ni[count[N > 1], `:=`(strand=paste(unique(strand), sep="", collapse=""), @@ -150,11 +156,13 @@ non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){ gene_id=list(unique(unlist(gene_id)))), by=.EACHI] ## version 1.9.3+ setnames(ni, c("transcript_id", "gene_id"), c(transcript_id, gene_id)) - ni = unique(ni) + ni = unique(shallow(ni, reset_class=TRUE)) # calling unique on 'ni' is more efficient than this, since 'ni' is key'd. # changed July 17 # .gff[["non_overlapping_intron"]] = ni[count, mult = "first"][, N := NULL] setkey(ne, NULL) setkey(ni, NULL) + ne = as(setDF(ne), "GRanges") + ni = as(setDF(ni), "GRanges") list(non_overlapping_exon = ne, non_overlapping_intron = ni) } diff --git a/R/read-format.R b/R/read-format.R new file mode 100644 index 0000000..44c23fb --- /dev/null +++ b/R/read-format.R @@ -0,0 +1,148 @@ +#' @title Quick and easy reading of gtf/gff/bed/bam files +#' +#' @description \code{read_gtf}, \code{read_gff}, \code{read_bed} and +#' \code{read_bam} are specific functions corresponding to respective file +#' formats. +#' +#' \code{read_format} is a top-level convenience function that detects and +#' reads automatically from the extension and is sufficient in most cases. +#' +#' @param file Complete path to the input file. +#' @param format Type of annotation. Currently allowed values are +#' \code{"gtf"}, \code{"gff"}, \code{"bed"} and \code{"bam"}. If no input is +#' provided, then it will be guessed from input file's extension. +#' @param filter Filter rows where \code{start/stop} are invalid, and where +#' \code{start > end}. Default (\code{FALSE}) is to not filter, i.e., retain +#' all invalid rows as well. +#' @param chromosomes Argument to \code{read_bam}. If specified only reads +#' from those chromosomes will be loaded. +#' @param tags Additional (optional) tags to load from the bam file. By +#' default, the tags \code{"NM"} and \code{"MD"} are loaded. +#' @param verbose If \code{TRUE}, sends useful status messages to the console. +#' Default is \code{FALSE}. +#' @aliases read_format read_gtf read_gff read_bed read_bam +#' @return An object of class \code{gtf}, \code{gff}, \code{bed} or \code{bam}, +#' corresponding to the input file format, that inherits from \code{GRanges}. +#' @seealso \code{\link{supported_formats}} +#' \code{\link{extract}} \code{\link{construct_introns}} +#' @keywords file +#' @export +#' @examples +#' path <- system.file("tests", package="gread") +#' gff_file <- file.path(path, "sample.gff") +#' gtf_file <- file.path(path, "sample.gtf") +#' bed_file <- file.path(path, "sample.bed") +#' bam_file <- file.path(path, "sample.bam") +#' +#' read_format(gff_file) # read gff +#' read_gff(gff_file) # same as above +#' read_format(gtf_file) # read gtf +#' read_gtf(gtf_file) # same as above +#' read_format(bed_file) # read bed +#' read_bed(bed_file) # same as above +#' read_format(bam_file) # read bam +#' read_bam(bam_file) # same as above +#' +#' gtf_filter_file <- file.path(path, "sample_filter.gtf") +#' read_format(gtf_filter_file, filter=TRUE) # filter invalid rows +#' read_gtf(gtf_filter_file, filter=TRUE) # same as above +read_format <- function(file, format=detect_format(file), filter=FALSE, + chromosomes=NULL, tags=c("NM", "MD"), verbose=FALSE) { + + if (!file.exists(file)) + stop(toupper(format), " file '", file, "' not found.") + types = format_types(format=format) + names = format_names(format=format) + token = tokenise(file, format, filter, chromosomes, tags, verbose, + types=types, names=names) + ans = gread(token) + tidy_cols(ans, verbose=verbose) + if (token$filter && !identical(format, "bam")) { + before = nrow(ans) + # TODO: optimise this subset using internal any in data.table + ans = ans[start <= end] # will take care of NAs as well + after = nrow(ans) + if (token$verbose && before-after) + cat(before-after, " invalid rows filtered. These include rows", + " where start/end coordinates were NA/NaN and ", + " where start > end.\n", sep="") + } + # Returning 'GRanges' following Hervé's feedback + # See https://github.com/Bioconductor/Contributions/issues/25 + new(class(ans)[1L], as(setDF(ans), "GRanges")) +} + +#' @rdname read_format +#' @export +read_gtf <- function(file, filter=FALSE, verbose=FALSE) { + read_format(file, "gtf", filter, verbose) +} + +#' @rdname read_format +#' @export +read_gff <- function(file, filter=FALSE, verbose=FALSE) { + read_format(file, "gff", filter, verbose) +} + +#' @rdname read_format +#' @export +read_bed <- function(file, filter=FALSE, verbose=FALSE) { + read_format(file, "bed", filter, verbose) +} + +#' @rdname read_format +#' @export +read_bam <- function(file, filter=FALSE, chromosomes=NULL, + tags=c("NM", "MD"), verbose=FALSE) { + read_format(file, "bam", filter, chromosomes, tags, verbose) +} + +# Internal helper functions -------------------------------------------------- + +detect_format <- function(file) { + + type = substr(tolower(tools::file_ext(file)), 1L, 3L) + if (!type %in% c("gtf", "gff", "bam", "bed")) + stop("File does not have gtf/gff/bed/bam extension.") + type +} + +format_types <- function(format) { + chr = "character" + int = "integer" + num = "numeric" + switch(format, gtf=, gff=c(rep(chr, 3L), rep(int, 2L), num, chr, int, chr), + bed=c(chr, rep(int, 2L), rep(chr, 3L), rep(int, 2L), + chr, int, rep(chr, 2L)), + bam=c()) +} + +format_names <- function(format) { + switch(format, gtf=c("seqnames", "source", "feature", "start", + "end", "score", "strand", "frame", "attributes"), + gff=c("seqnames", "source", "feature", "start", + "end", "score", "strand", "phase", "attributes"), + # all 15 columns for bed, only first 3 are compulsory + bed=c("seqnames", "start", "end", "name", "score", + "strand", "thickStart", "thickEnd", "itemRgb", + "blockCount", "blockSizes", "blockStarts", "expCount", + "expIds", "expScores"), + bam=c("seqnames", "start", "end", "strand", "cigar", + "qwidth", "width", "njunc", "flag")) +} + +tokenise <- function(file, format, filter, chromosomes, + tags, verbose, ...) { + arglist = as.list(match.call(expand.dots=TRUE))[-1L] + argnames = setdiff(names(arglist), "format") + if (!length(tags)) tags = character(0) # ScanBamParam needs this + tokens = structure(c(list(file, filter, chromosomes, tags, verbose), + list(...)), class=paste(format, "format", sep="_")) + data.table::setattr(tokens, 'names', argnames) + tokens +} + +`.` <- function(...) { + stop(".() is only meant for use in `j` argument of + data.table, and is an alias to list()") +} diff --git a/R/read_format.R b/R/read_format.R deleted file mode 100644 index 07349f4..0000000 --- a/R/read_format.R +++ /dev/null @@ -1,311 +0,0 @@ -#' @title Quickly and easily read gtf/gff/bed/bam files -#' -#' @description \code{read_gtf}, \code{read_gff}, \code{read_bed} and -#' \code{read_bam} are specific functions corresponding to respective file -#' formats. -#' -#' \code{read_format} is a top-level convenience function that detects and -#' reads automatically from the extension and is sufficient in most cases. -#' -#' @details Note that \code{gff1} uses \code{group} instead of -#' \code{attributes} as the column name, but \code{gread} always names it -#' as \code{attributes}. Similarly the first three columns of \code{bed} -#' format are named \code{seqname, start, end} instead of \code{chrom, -#' chromStart, chromEnd} for consistency. -#' -#' The argument \code{tidy_cols} (\code{TRUE} by default) tidies up the -#' \code{attributes} column in case of \code{gtf/gff} format files. The -#' \code{attributes} column itself is removed since it is tidied up into -#' multiple columns. If this is not desirable, use \code{tidy_cols = FALSE} to -#' load the file \emph{as-is} and then use the \code{tidy_cols} function with -#' \code{remove_cols = NULL}. -#' -#' In case of \code{gff} format, when `tidy_cols=TRUE`, generation of columns -#' \code{"transcript_id"} and \code{"gene_id"} will be attempted as these -#' columns are essential in most cases for downstream analyses. If possible, -#' the columns \code{"transcript_name"} and \code{"gene_name"} will also be -#' created. -#' -#' @param file Complete path to the input file. -#' @param format Type of annotation. Currently allowed values are -#' \code{"gtf"}, \code{"gff"}, \code{"bed"} and \code{"bam"}. If no input is -#' provided, then it will be guessed from input file's extension. -#' @param filter Filter rows where \code{start/stop} are invalid, and where -#' \code{start > end}. Default is \code{FALSE} (not to filter, but retain all -#' invalid rows). -#' @param tidy_cols If \code{TRUE} (default), returns only essential columns for -#' further analysis (for e.g, \code{score}, \code{itemRgb} etc. are removed), -#' \code{attributes} column is cleaned up with separate columns for -#' \code{gene}, \code{transcript} id etc. Default is \code{TRUE}. -#' @param chromosomes Argument to \code{read_bam}. If specified only reads -#' from those chromosomes will be loaded. -#' @param tags Additional (optional) tags to load from the bam file. By -#' default, the tags \code{"NM"} and \code{"MD"} are loaded. -#' @param verbose If \code{TRUE}, sends useful status messages to the console. -#' Default is \code{FALSE}. -#' @aliases read_format read_gtf read_gff read_bed read_bam -#' @return An object of class \code{gtf}, \code{gff}, \code{bed} or \code{bam}, -#' corresponding to the input file format, that inherits from \code{data.table}. -#' @seealso \code{\link{supported_formats}} \code{\link{tidy_cols}} -#' \code{\link{extract}} \code{\link{construct_introns}} -#' @keywords file -#' @export -#' @examples -#' path <- system.file("tests", package="gread") -#' gff_file <- file.path(path, "sample.gff") -#' gtf_file <- file.path(path, "sample.gtf") -#' bed_file <- file.path(path, "sample.bed") -#' bam_file <- file.path(path, "sample.bam") -#' -#' read_format(gff_file) # read gff -#' read_gff(gff_file) # same as above -#' read_format(gtf_file) # read gtf -#' read_gtf(gtf_file) # same as above -#' read_format(bed_file) # read bed -#' read_bed(bed_file) # same as above -#' read_format(bam_file) # read bam -#' read_bam(bam_file) # same as above -#' -#' read_format(gtf_file, tidy_cols=FALSE) # load as is, don't tidy -#' -#' gtf_filter_file <- file.path(path, "sample_filter.gtf") -#' read_format(gtf_filter_file, filter=TRUE) # filter invalid rows -#' read_gtf(gtf_filter_file, filter=TRUE) # same as above -read_format <- function(file, format=detect_format(file), filter=FALSE, - tidy_cols=TRUE, chromosomes=NULL, tags=c("NM", "MD"), - verbose=FALSE) { - - if (!file.exists(file)) - stop(toupper(format), " file '", file, "' not found.") - types = format_types(format=format) - names = format_names(format=format) - token = tokenise(file, format, filter, tidy_cols, chromosomes, tags, verbose, - types=types, names=names) - ans = gread(token) - if (tidy_cols) tidy_cols(ans, verbose=verbose) - if (token$filter && !identical(format, "bam")) { - before = nrow(ans) - # TO DO: optimise this subset using internal any in data.table - ans = ans[start <= end] # will take care of NAs as well - after = nrow(ans) - if (token$verbose && before-after) - cat(before-after, " invalid rows filtered. These include rows", - " where start/end coordinates were NA/NaN and ", - " where start > end.\n", sep="") - } - ans -} - -#' @rdname read_format -#' @export -read_gtf <- function(file, filter=FALSE, tidy_cols=TRUE, verbose=FALSE) { - read_format(file, "gtf", filter, tidy_cols, verbose) -} - -#' @rdname read_format -#' @export -read_gff <- function(file, filter=FALSE, tidy_cols=TRUE, verbose=FALSE) { - read_format(file, "gff", filter, tidy_cols, verbose) -} - -#' @rdname read_format -#' @export -read_bed <- function(file, filter=FALSE, tidy_cols=TRUE, verbose=FALSE) { - read_format(file, "bed", filter, tidy_cols, verbose) -} - -#' @rdname read_format -#' @export -read_bam <- function(file, filter=FALSE, tidy_cols=TRUE, chromosomes=NULL, - tags=c("NM", "MD"), verbose=FALSE) { - read_format(file, "bam", filter, tidy_cols, chromosomes, tags, verbose) -} - -# Helper/Internal functions for read_format --------------------- - -detect_format <- function(file) { - - type = substr(tolower(tools::file_ext(file)), 1L, 3L) - if (!type %in% c("gtf", "gff", "bam", "bed")) - stop("File does not have gtf/gff/bed/bam extension.") - type -} - -tokenise <- function(file, format, filter, tidy_cols, chromosomes, - tags, verbose, ...) { - arglist = as.list(match.call(expand.dots=TRUE))[-1L] - argnames = setdiff(names(arglist), "format") - if (!length(tags)) tags = character(0) # ScanBamParam needs this - tokens = structure(c(list(file, filter, tidy_cols, chromosomes, tags, verbose), - list(...)), class=paste(format, "format", sep="_")) - data.table::setattr(tokens, 'names', argnames) - tokens -} - -# Helper/Internal functions for read_gtf/read_gff/read_bed ------- - -format_names <- function(format) { - switch(format, gtf=c("seqname", "source", "feature", "start", - "end", "score", "strand", "frame", "attributes"), - gff=c("seqname", "source", "feature", "start", - "end", "score", "strand", "phase", "attributes"), - # all 15 columns for bed, only first 3 are compulsory - bed=c("seqname", "start", "end", "name", "score", - "strand", "thickStart", "thickEnd", "itemRgb", - "blockCount", "blockSizes", "blockStarts", "expCount", - "expIds", "expScores"), - bam=c("seqname", "start", "end", "strand", "cigar", - "qwidth", "width", "njunc", "flag")) -} - -format_types <- function(format) { - chr = "character" - int = "integer" - num = "numeric" - switch(format, gtf=, gff=c(rep(chr, 3L), rep(int, 2L), num, chr, int, chr), - bed=c(chr, rep(int, 2L), rep(chr, 3L), rep(int, 2L), - chr, int, rep(chr, 2L)), - bam=c()) -} - -read_table <- function(file, ...) - setDT(read.table(file, stringsAsFactors=FALSE, as.is=TRUE, ...)) - -gread <- function(token) { - UseMethod("gread") -} - -gread.default <- function(token) { - stop("No method available for format ", gsub("(.*)_format$", - "", class(token))) -} - -gread.gtf_format <- function(token) { - # to please R CMD CHECK - score=phase=NULL - # rtracklayer::readGFF reads gff v1 'attributes' column as 'group', - # but we want it to be always 'attributes'. Also, it tidies up the - # attributes column by default - rtracklayer_fun <- function() { - ans = setDT(rtracklayer::readGFF(token$file, version = 2L, - columns = rtracklayer::GFFcolnames())) - if (!token$tidy_cols) set(ans, j = tail(names(ans), -9L), value = NULL) - ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][] - # remove additional attributes - setattr(ans, 'pragmas', NULL) - setattr(ans, 'attrcol_fmt', NULL) - setattr(ans, 'ncol0', NULL) - setattr(ans, 'ntag', NULL) - setattr(ans, 'raw_data', NULL) - ans - } - fread_fun <- function() fread(token$file, colClasses = token$types, - showProgress = token$verbose) - read_table_fun <- function() read_table(token$file, sep = "\t", - header = FALSE, comment.char = "#", nrows = -1L, colClasses = - token$types, quote = "") - ans = tryCatch(rtracklayer_fun(), error = function(o) { - if (token$verbose) cat("rtracklayer::readGFF failed to read", - " the gtf file. Reverting to data.table::fread.\n", sep="") - tryCatch(fread_fun(), error = function(o) { - if (token$verbose) cat("data.table::fread failed to read", - " as well. Reverting to base::read.table.\n", sep="") - read_table_fun() - }) - }) - setnames(ans, head(names(ans), 9L), token$names) - if (is.character(ans[["score"]])) - ans[, "score" := suppressWarnings(as.numeric(score))] - if (is.character(ans[["phase"]])) - ans[, "phase" := suppressWarnings(as.integer(phase))] - setattr(ans, 'class', c("gtf", "data.table", "data.frame")) -} - -gread.gff_format <- function(token) { - # to please R CMD CHECK - score=phase=NULL - # rtracklayer::readGFF reads gff v1 'attributes' column as 'group', - # but we want it to be always 'attributes'. Also, it tidies up the - # attributes column by default - rtracklayer_fun <- function() { - ans = setDT(as.data.frame(rtracklayer::readGFF(token$file, columns = - rtracklayer::GFFcolnames()))) - if (!token$tidy_cols) set(ans, j = tail(names(ans), -9L), value = NULL) - ans[, (1:3) := lapply(.SD, as.character), .SDcols=1:3][] - list_cols = which(vapply(ans, is.list, TRUE)) - if (length(list_cols)) { - list_paste <- function(x) sapply(x, paste, collapse=",") - # paste all values of list cols together - ans[, (list_cols) := lapply(.SD, list_paste), .SDcols=list_cols] - } - # as.data.frame drops all extra attributes - ans - } - fread_fun <- function() fread(token$file, colClasses = token$types, - showProgress = token$verbose) - read_table_fun <- function() read_table(token$file, sep = "\t", - header = FALSE, comment.char = "#", nrows = -1L, colClasses = - token$types, quote = "") - ans = tryCatch(rtracklayer_fun(), error = function(o) { - if (token$verbose) cat("rtracklayer::readGFF failed to read", - " the gtf file. Reverting to data.table::fread.\n", sep="") - tryCatch(fread_fun(), error = function(o) { - if (token$verbose) cat("data.table::fread failed to read", - " as well. Reverting to base::read.table.\n", sep="") - read_table_fun() - }) - }) - setnames(ans, head(names(ans), 9L), token$names) - if (is.character(ans[["score"]])) - ans[, "score" := suppressWarnings(as.numeric(score))] - if (is.character(ans[["phase"]])) - ans[, "phase" := suppressWarnings(as.integer(phase))] - setattr(ans, 'class', c("gff", "data.table", "data.frame")) -} - -gread.bed_format <- function(token) { - ans = tryCatch(fread(token$file, colClasses = token$types, sep="\t", - showProgress = token$verbose), error = function(o) { - if (token$verbose) cat("data.table::fread failed to read", - " the bed file. Reverting to 'read.table'", - " from base R.\n", sep="") - read_table(token$file, sep = "\t", header = FALSE, - comment.char = "#", nrows = -1L, colClasses = - token$types, quote = "") - }) - # setnames will error if bed file has < 3 cols - setnames(ans, head(token$names, max(3L, ncol(ans)))) - # ans[, start := start+1L] # TODO: should we automatically set start+1L? - setattr(ans, 'class', c("bed", "data.table", "data.frame")) -} - -gread.bam_format <- function(token) { - if (token$verbose) - cat("Loading bam file: ",token$file,", please wait...\n", sep="") - stopifnot(length(token$file)==1L) - - what = c("flag") - flags = Rsamtools::scanBamFlag(isUnmappedQuery=FALSE) - if (!length(token$chromosomes)) { - param = Rsamtools::ScanBamParam(what=what, tag=token$tags, flag=flags) - } else { - hdr = Rsamtools::scanBamHeader(token$file)[[1]]$targets - hdr = hdr[names(hdr) %in% token$chromosomes] - if (!all(names(hdr) %in% token$chromosomes)) { - stop("chromosomes [", paste(setdiff(token$chromosomes, - names(hdr)), collapse=", "), "] not found in bam file.") - } - which = as_granges(data.table(seqname = names(hdr), - start=1L, end = hdr, strand="*"), ignore_strand=TRUE) - param = Rsamtools::ScanBamParam(what = what, which = which, - tag = token$tags, flag = flags) - } - # load bam with corresponding param - ans = as_bam(GenomicAlignments::readGAlignments(token$file, param=param)) - setattr(ans, 'class', c("bam", "data.table", "data.frame")) -} - -`.` <- function(...) { - stop(".() is only meant for use in `j` argument of - data.table, and is an alias to list()") -} diff --git a/R/s4-setup.R b/R/s4-setup.R new file mode 100644 index 0000000..2a8b6e8 --- /dev/null +++ b/R/s4-setup.R @@ -0,0 +1,7 @@ +setClass("gtf", contains="GRanges") +setClass("gff", contains="GRanges") +setClass("bed", contains="GRanges") +setClass("bam", contains="GRanges") +setClass("gene", contains="GRanges") +setClass("exon", contains="GRanges") +setClass("intron", contains="GRanges") diff --git a/R/supported_formats.R b/R/supported-formats.R similarity index 87% rename from R/supported_formats.R rename to R/supported-formats.R index 9445c7e..5abcbab 100644 --- a/R/supported_formats.R +++ b/R/supported-formats.R @@ -4,7 +4,7 @@ #' #' @return A character vector listing all currently supported formats. #' @aliases supported_formats -#' @seealso \code{\link{read_format}} \code{\link{tidy_cols}} +#' @seealso \code{\link{read_format}} #' @examples #' supported_formats() #' @export diff --git a/R/test_gread.R b/R/test-gread.R similarity index 99% rename from R/test_gread.R rename to R/test-gread.R index ed0a085..866e1e9 100644 --- a/R/test_gread.R +++ b/R/test-gread.R @@ -19,7 +19,6 @@ #' otherwise. #' @aliases test_gread #' @seealso \code{\link{read_format}} \code{\link{extract}} -#' \code{\link{tidy_cols}} #' @examples #' \dontrun{ #' gread:::test_gread() @@ -28,7 +27,7 @@ test_gread <- function(verbose=FALSE, pkg="pkg", silent=FALSE) { if (exists("test_gread", .GlobalEnv, inherits=FALSE)) { # package developer if ("package:gread" %in% search()) stop("gread package loaded") if (.Platform$OS.type == "unix") { - d = path.expand("~/Documents/oa-git/gread/inst/tests") + d = path.expand("~/Documents/oa-github/gread/inst/tests") } } else { # user d = paste(getNamespaceInfo("gread", "path"), "/tests", sep="") diff --git a/R/tidy_format.R b/R/tidy-format.R similarity index 69% rename from R/tidy_format.R rename to R/tidy-format.R index c89a2e1..ccf900c 100644 --- a/R/tidy_format.R +++ b/R/tidy-format.R @@ -1,68 +1,11 @@ -#' @title Tidy up gtf/gff/bed/bam objects -#' -#' @description Tidy up data depending on the type of input format. -#' -#' Note that this operation is performed \emph{by reference}, to avoid -#' unnecessary copies, for efficiency. Therefore there is no need to assign -#' the result back to a variable. -#' -#' @details In case of \code{gtf} and \code{gff} formats, the -#' \code{attributes} column is extracted into separate columns after which -#' it is removed (by default). Set \code{remove_cols} to \code{NULL} if you -#' would like to retain the column even after tidying up. -#' -#' In case of \code{bed} and \code{bam} files, no columns are removed by -#' default. However, you can use \code{remove_cols} argument if necessary. -#' -#' In case of \code{bam} files, the function \code{GAlignments} from the -#' \code{GenomicAlignments} Bioconductor package is used to read in the -#' file, with the additional arguments for \code{ScanBamParam()} function -#' using \code{NM} and \code{MD} tags by default. -#' -#' @seealso \code{\link{supported_formats}} \code{\link{read_format}} -#' \code{\link{extract}} \code{\link{construct_introns}} -#' \code{\link{as_granges}} -#' @param x Input object, of class \code{gtf}, \code{gff}, \code{bed} or -#' \code{bam}. -#' @param remove_cols A character vector of column names to be removed. -#' @param verbose Logical. Default is \code{FALSE}. If \code{TRUE}, helpful -#' status messages are printed on to the console. -#' @param ... Arguments that are ignored at the moment. -#' @aliases tidy_cols tidy_cols.gtf tidy_cols.gff tidy_cols.bed tidy_cols.bam -#' @return A tidied object of class \code{gtf}, \code{gff}, \code{bed} or -#' \code{bam}, that inherits from \code{data.table}. -#' @export -#' @examples -#' path <- system.file("tests", package="gread") -#' gff_file <- file.path(path, "sample.gff") -#' gtf_file <- file.path(path, "sample.gtf") -#' bed_file <- file.path(path, "sample.bed") -#' bam_file <- file.path(path, "sample.bam") -#' -#' gtf <- read_format(gtf_file, tidy_cols=FALSE) -#' gff <- read_format(gtf_file, tidy_cols=FALSE) -#' bed <- read_format(bed_file, tidy_cols=FALSE) -#' bam <- read_format(bam_file, tidy_cols=FALSE) -#' -#' tidy_cols(gtf, remove_cols=NULL)[] # tidy attr. col, but not remove -#' tidy_cols(gff, remove_cols=NULL)[] # same as above, but for gff -#' tidy_cols(gtf, remove_cols="attributes")[] # tidy & remove attr. col -#' tidy_cols(gff, remove_cols="attributes")[] # same as above, but for gff -#' -#' tidy_cols(bed, remove_cols="name")[] # remove name column -#' tidy_cols(bam, remove_cols=c("NM", "MD"))[] # remove additional loaded tags tidy_cols <- function(x, ...) { UseMethod("tidy_cols") } -#' @rdname tidy_cols -#' @export tidy_cols.default <- function(x, ...) { stop("No default method available.") } -#' @rdname tidy_cols -#' @export tidy_cols.gtf <- function(x, remove_cols="attributes", verbose=FALSE, ...) { stopifnot(identical(head(names(x), 9L), format_names("gtf"))) meta_fun <- function(vec, attr) { @@ -101,8 +44,6 @@ tidy_cols.gtf <- function(x, remove_cols="attributes", verbose=FALSE, ...) { invisible(x) } -#' @rdname tidy_cols -#' @export tidy_cols.gff <- function(x, remove_cols="attributes", verbose=FALSE, ...) { stopifnot(identical(head(names(x), 9L), format_names("gff"))) meta_fun <- function(vec, attr) { @@ -137,22 +78,18 @@ tidy_cols.gff <- function(x, remove_cols="attributes", verbose=FALSE, ...) { invisible(x) } -#' @rdname tidy_cols -#' @export tidy_cols.bed <- function(x, remove_cols=NULL, verbose=FALSE, ...) { stopifnot(identical(names(x), head(format_names("bed"), length(x)))) remove_cols(x, remove_cols, verbose) # updates by reference invisible(x) } -#' @rdname tidy_cols -#' @export tidy_cols.bam <- function(x, remove_cols=NULL, verbose=FALSE, ...) { remove_cols(x, remove_cols, verbose) # updates by reference invisible(x) } -## internal function used in tidy_cols methods ------------------- +## internal helper functions ------------------------------------------------- remove_cols <- function(x, cols, verbose) { if (is.null(cols)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0057560..c610954 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17,6 +17,7 @@ if (!exists("test_gread", .GlobalEnv, inherits=FALSE)) { strictly_nonunique = gread:::strictly_nonunique shallow = gread:::shallow non_overlaps = gread:::non_overlaps + tidy_cols = gread:::tidy_cols } else { .devtesting=TRUE } @@ -40,7 +41,7 @@ test(0.0, read_format("unavailable.unknown"), error="File does not") ## gtf ----------------------------------------------- ff = c("exon", "stop_codon", "CDS", "start_codon") gtf_ans = data.table( - seqname="7", + seqnames="7", source="protein_coding", feature=ff[c(1:3,3,1,4)], start=INT(27221134,27222415,27222418,27224055,27224055,27224761), @@ -57,19 +58,16 @@ gtf_ans = data.table( transcript_name="HOXA11-001", tss_id="TSS161914", protein_id=c(NA,NA,"ENSP00000006015","ENSP00000006015",NA,NA)) -setattr(gtf_ans, 'class', c('gtf', class(gtf_ans))) +gtf_ans = new("gtf", as(setDF(gtf_ans), "GRanges")) test(1.0, read_format("unavailable.gtf"), error="not found") test(1.1, read_format("sample.gtf"), gtf_ans) test(1.2, read_gtf("sample.gtf"), gtf_ans) test(1.3, read_format("sample_filter.gtf", filter=TRUE), gtf_ans[-4L]) test(1.4, read_gtf("sample_filter.gtf", filter=TRUE), gtf_ans[-4L]) -gtf_ans2 = gtf_ans[, 1:8, with=FALSE - ][, attributes := fread("sample.gtf", select=9L)] -test(1.5, read_gtf("sample.gtf", tidy_cols=FALSE), gtf_ans2) ## gff ----------------------------------------------- -gff_ans = data.table(seqname="7", +gff_ans = data.table(seqnames="7", source="protein_coding", feature=c("gene","RNA","exon", "exon"), start=INT(27221129, 27221134, 27221134, 27224055), @@ -81,34 +79,29 @@ gff_ans = data.table(seqname="7", ID=c("ENSG00000005073","ENST00000006015", "ENST00000006015.2", "ENST00000006015.1"), Name=c("HOXA11","HOXA11-001",NA,NA), - length=c("3714",NA,NA,NA), Parent=c(NA,"ENSG00000005073", "ENST00000006015", "ENST00000006015"), + length=c("3714",NA,NA,NA), transcript_id=c(NA,rep("ENST00000006015", 3L)), gene_id="ENSG00000005073", transcript_name=c(NA,rep("HOXA11-001", 3L)), gene_name="HOXA11") -setattr(gff_ans, 'class', c('gff', class(gff_ans))) -# ignore colorder. v1.9.6 of data.table lacks ignore.col.order feature +gff_ans = new("gff", as(setDF(gff_ans), "GRanges")) + # so doing it using setcolorder for now. test(2.0, read_format("unavailable.gff"), error="not found") -test(2.1, setcolorder(read_format("sample.gff"), names(gff_ans)), gff_ans) -test(2.2, setcolorder(read_gff("sample.gff"), names(gff_ans)), gff_ans) -test(2.3, setcolorder(read_format("sample_filter.gff", filter=TRUE), - names(gff_ans)), gff_ans[-4L]) -test(2.4, setcolorder(read_gff("sample_filter.gff", filter=TRUE), - names(gff_ans)), gff_ans[-4L]) -gff_ans2 = gff_ans[, 1:8, with=FALSE - ][, attributes := fread("sample.gff", select=9L)] -test(2.5, read_gff("sample.gff", tidy_cols=FALSE), gff_ans2) -cols = c("seqname", "source", "feature", "start", "end", "score", - "strand", "phase", "Alias", "Name", "length", "Parent") -test(2.6, read_gff("sample_no_id.gff"), gff_ans[, cols, with=FALSE], - warning="gff file") +test(2.1, read_format("sample.gff"), gff_ans) +test(2.2, read_gff("sample.gff"), gff_ans) +test(2.3, read_format("sample_filter.gff", filter=TRUE), gff_ans[-4L]) +test(2.4, read_gff("sample_filter.gff", filter=TRUE), gff_ans[-4L]) +gff_ans2 = gff_ans +cols = c("transcript_id", "gene_id", "transcript_name", "gene_name", "ID") +mcols(gff_ans2)[, cols] = NULL +test(2.5, read_gff("sample_no_id.gff"), gff_ans2, warning="gff file") ## bam ----------------------------------------------- -bam_ans = data.table(seqname=c("1","5","X"), +bam_ans = data.table(seqnames=c("1","5","X"), start=c(220276091L,79733612L,12994378L), end=c(220276834L,79733687L,12994453L), strand="-", @@ -119,23 +112,27 @@ bam_ans = data.table(seqname=c("1","5","X"), flag=16L, NM=0L, MD=c("75","76","76")) -setattr(bam_ans, 'class', c("bam", class(bam_ans))) +bam_ans = new("bam", as(setDF(bam_ans), "GRanges")) test(3.0, read_format("unavailable.bam"), error="not found") test(3.1, read_format("sample.bam"), bam_ans) test(3.2, read_bam("sample.bam"), bam_ans) -test(3.3, read_format("sample.bam", tags="NM"), bam_ans[, !"MD", with=FALSE]) +bam_ans2 = bam_ans +mcols(bam_ans2)[, "MD"] = NULL +test(3.3, read_format("sample.bam", tags="NM"), bam_ans2) test(3.4, read_format("sample.bam", tags="XYZ"), error="invalid class") -test(3.5, read_format("sample_indexed.bam", chromosomes="1"), bam_ans[1L]) -test(3.6, nrow(read_format("sample_indexed.bam", chromosomes="Y")), 0L) -test(3.7, read_bam("sample.bam", tags=NULL), - bam_ans[, !c("NM", "MD"), with=FALSE]) +test(3.5, read_format("sample_indexed.bam", chromosomes="1"), + keepSeqlevels(bam_ans, "1")) +test(3.6, length(read_format("sample_indexed.bam", chromosomes="Y")), 0L) +bam_ans3 = bam_ans +mcols(bam_ans3)[, c("NM", "MD")] = NULL +test(3.7, read_bam("sample.bam", tags=NULL), bam_ans3) ## bed ----------------------------------------------- names = c("NS500200:12:H0RBKAGXX:3:11405:20307:5324", "NS500200:10:H0PVJAGXX:3:21608:13122:16773", "NS500200:10:H0PVJAGXX:2:13108:16527:7109") -bed_ans = data.table(seqname=c("1","5","X"), +bed_ans = data.table(seqnames=c("1","5","X"), start=c(220276090L,79733611L,12994377L), end=c(220276834L,79733687L,12994453L), name=names, @@ -147,63 +144,57 @@ bed_ans = data.table(seqname=c("1","5","X"), blockCount=c(2L,1L,1L), blockSizes=c("29,46","76","76"), blockStarts=c("0,698","0","0")) -setattr(bed_ans, 'class', c("bed", class(bed_ans))) +bed_ans = new("bed", as(setDF(bed_ans), "GRanges")) test(4.0, read_format("unavailable.bed"), error="not found") test(4.1, read_format("sample.bed"), bed_ans) test(4.2, read_bed("sample.bed"), bed_ans) -test(4.3, read_format("sample.bed", tidy_cols=FALSE), bed_ans) -test(4.4, read_bed("sample.bed", tidy_cols=FALSE), bed_ans) -## tidy_cols ------------------------------------------------------ +# ## tidy_cols ------------------------------------------------------ -gtf <- read_format("sample.gtf", tidy_cols=FALSE) -test(5.1, names(gtf), format_names("gtf")) -tidy_cols(gtf, remove_cols=NULL) # tidy, but don't remove attributes col -test(5.2, sort(names(gtf)), sort(c(format_names("gtf"), - tail(names(gtf_ans), -8L)))) # ignore order since rtracklayer returns - # columns in different order -tidy_cols(gtf, remove_cols="attributes") -test(5.3, sort(names(gtf)), sort(names(gtf_ans))) +# gtf <- read_format("sample.gtf", tidy_cols=FALSE) +# test(5.1, names(gtf), format_names("gtf")) +# tidy_cols(gtf, remove_cols=NULL) # tidy, but don't remove attributes col +# test(5.2, sort(names(gtf)), sort(c(format_names("gtf"), +# tail(names(gtf_ans), -8L)))) # ignore order since rtracklayer returns +# # columns in different order +# tidy_cols(gtf, remove_cols="attributes") +# test(5.3, sort(names(gtf)), sort(names(gtf_ans))) -gff <- read_format("sample.gff", tidy_cols=FALSE) -test(5.4, names(gff), format_names("gff")) -tidy_cols(gff, remove_cols=NULL) # tidy, but don't remove attributes col -test(5.5, sort(names(gff)), sort(c(format_names("gff"), - tail(names(gff_ans), -8L)))) # ignore order since rtracklayer returns - # columns in different order -tidy_cols(gff, remove_cols="attributes") -test(5.6, sort(names(gff)), sort(names(gff_ans))) +# gff <- read_format("sample.gff", tidy_cols=FALSE) +# test(5.4, names(gff), format_names("gff")) +# tidy_cols(gff, remove_cols=NULL) # tidy, but don't remove attributes col +# test(5.5, sort(names(gff)), sort(c(format_names("gff"), +# tail(names(gff_ans), -8L)))) # ignore order since rtracklayer returns +# # columns in different order +# tidy_cols(gff, remove_cols="attributes") +# test(5.6, sort(names(gff)), sort(names(gff_ans))) -bed <- read_format("sample.bed", tidy_cols=FALSE) -tidy_cols(bed, remove_cols="name") # tidy, but don't remove attributes col -test(5.7, names(bed), setdiff(names(bed_ans), "name")) +# bed <- read_format("sample.bed", tidy_cols=FALSE) +# tidy_cols(bed, remove_cols="name") # tidy, but don't remove attributes col +# test(5.7, names(bed), setdiff(names(bed_ans), "name")) -bam <- read_format("sample.bam", tidy_cols=FALSE) -tidy_cols(bam, remove_cols="MD") # tidy, but don't remove attributes col -test(5.8, names(bam), setdiff(names(bam_ans), "MD")) -# also test warning when the column to remove is not present -test(5.9, tidy_cols(bam, remove_cols="XYZ"), bam, warning="Skipping column") +# bam <- read_format("sample.bam", tidy_cols=FALSE) +# tidy_cols(bam, remove_cols="MD") # tidy, but don't remove attributes col +# test(5.8, names(bam), setdiff(names(bam_ans), "MD")) +# # also test warning when the column to remove is not present +# test(5.9, tidy_cols(bam, remove_cols="XYZ"), bam, warning="Skipping column") -## as_granges ------------------------------------------------ +## as_data_table ------------------------------------------------ gff = read_format("sample.gff") gtf = read_format("sample.gtf") bed = read_format("sample.bed") bam = read_format("sample.bam") -test(6.1, as_granges(gff), as(as.data.frame(gff), "GRanges")) -test(6.2, as_granges(gtf), as(as.data.frame(gtf), "GRanges")) -test(6.3, as_granges(bed), as(as.data.frame(bed), "GRanges")) -test(6.4, as_granges(bam), as(as.data.frame(bam), "GRanges")) +test(6.1, class(as_data_table(gff))[1L], "gff") +test(6.2, class(as_data_table(gtf))[1L], "gtf") +test(6.3, class(as_data_table(bed))[1L], "bed") +test(6.4, class(as_data_table(bam))[1L], "bam") -test(6.5, as_granges(gff, ignore_strand=TRUE), - as(as.data.frame(copy(gff)[, strand:=NULL]), "GRanges")) -test(6.6, as_granges(gtf, ignore_strand=TRUE), - as(as.data.frame(copy(gtf)[, strand:=NULL]), "GRanges")) -test(6.7, as_granges(bed, ignore_strand=TRUE), - as(as.data.frame(copy(bed)[, strand:=NULL]), "GRanges")) -test(6.8, as_granges(bam, ignore_strand=TRUE), - as(as.data.frame(copy(bam)[, strand:=NULL]), "GRanges")) -test(6.9, as_granges(1:2), error = "is not TRUE") +class = c("data.table", "data.frame") # class reset +test(6.5, class(as_data_table(gff, TRUE)), class) +test(6.6, class(as_data_table(gtf, TRUE)), class) +test(6.7, class(as_data_table(bed, TRUE)), class) +test(6.8, class(as_data_table(bam, TRUE)), class) ## as_txdb ----------------------------------------------- ## No tests yet @@ -217,37 +208,38 @@ test(7.1, strictly_nonunique(dt), dt[!c(2L,5L)]) bam = read_format("sample.bam") bed = read_format("sample.bed")[c(1,1,2,3)] internal_fun <- function(x, y) { - ans = GenomicRanges::findOverlaps(as(as.data.frame(x), "GRanges"), - as(as.data.frame(y), "GRanges")) + ans = GenomicRanges::findOverlaps(x, y) setDT(list(queryHits=queryHits(ans), subjectHits=subjectHits(ans))) } -test(8.1, find_overlaps(bam, bed), internal_fun(bam, bed)) -test(8.2, find_overlaps(bed, bed), internal_fun(bed, bed)) -test(8.3, find_overlaps(bed, bed, ignore_redundant=TRUE), - internal_fun(bed, bed)[-3L]) +test(8.1, find_overlaps(as_data_table(bam), as_data_table(bed)), + internal_fun(bam, bed)) +test(8.2, find_overlaps(as_data_table(bed), as_data_table(bed)), + internal_fun(bed, bed)) +test(8.3, find_overlaps(as_data_table(bed), as_data_table(bed), + ignore_redundant=TRUE), internal_fun(bed, bed)[-3L]) # TODO: test for ignore_strand. Not done now since it is just passed as # arg to GenomicRanges::findOverlaps. To revisit when DT's non-equi joins # will be ready. ## reduce_overlaps ------------------------------------ -gff = read_format("sample.gff") +gff = as_data_table(read_format("sample.gff"), reset_class=TRUE) gff1 = gff[2L] gff2 = gff[2L][, `:=`(start = 27221000L, end = 27223735L)] gff = rbind(gff1, gff2) setattr(gff, 'class', c("gff", "data.table", "data.frame")) -ans = data.table(seqname = "7", start = 27221000L, end = 27224835L, +ans = data.table(seqnames = "7", start = 27221000L, end = 27224835L, length = 3836L, strand = "-", Parent = "ENSG00000005073") setattr(ans, 'class', class(gff)) test(9.1, reduce_overlaps(gff, "Parent"), ans) test(9.2, reduce_overlaps(gff, "Par"), error="") ## disjoin_overlaps ------------------------------------ -gff = read_format("sample.gff") +gff = as_data_table(read_format("sample.gff"), reset_class=TRUE) gff1 = gff[2L] gff2 = gff[2L][, `:=`(start = 27221000L, end = 27223735L)] gff = rbind(gff1, gff2) setattr(gff, 'class', c("gff", "data.table", "data.frame")) -ans = data.table(seqname = "7", start = c(27221000L, 27221134L, 27223736L), +ans = data.table(seqnames = "7", start = c(27221000L, 27221134L, 27223736L), end = c(27221133L, 27223735L, 27224835L), length = c(134L, 2602L, 1100L), strand = "-", Parent = "ENSG00000005073") @@ -256,13 +248,13 @@ test(10.1, disjoin_overlaps(gff, "Parent"), ans) test(10.2, disjoin_overlaps(gff, "Par"), error="") ## intersect_overlaps ------------------------------------ -gff = read_format("sample.gff") +gff = as_data_table(read_format("sample.gff"), reset_class=TRUE) gff1 = gff[2L] gff2 = gff[2L][, `:=`(start = 27221000L, end = 27224235)] gff3 = gff[2L][, `:=`(start = 27523634L, end = 27524761L)] gff = rbind(gff1, gff2, gff3) setattr(gff, 'class', c("gff", "data.table", "data.frame")) -ans = data.table(seqname = "7", start = c(27221000L, 27523634L), +ans = data.table(seqnames = "7", start = c(27221000L, 27523634L), end = c(27224835L, 27524761L), length = c(3836L ,1128L), strand = "-", Parent = "ENSG00000005073") setattr(ans, 'class', class(gff)) @@ -271,15 +263,18 @@ test(11.2, intersect_overlaps(gff, "Par"), error="") ## shallow ------------------------------------ dt = data.table(x=1, y=2, z="a") +setattr(dt, 'class', c("bla", class(dt))) test(12.1, shallow(data.frame()), error="") test(12.2, shallow(dt), dt) test(12.3, shallow(dt, "x"), dt[, .(x)]) -test(12.4, shallow(dt, "x"), dt[, .(x)]) -test(12.5, shallow(dt, "xx"), error="") +test(12.4, shallow(dt, "x", TRUE), as.data.table(dt[, .(x)])) +test(12.5, class(shallow(dt, "x"))[1L], "bla") +test(12.6, class(shallow(dt, "x", TRUE)), class) +test(12.7, shallow(dt, "xx"), error="") ## construct_introns ----------------------------------------- gtf = read_format("sample.gtf") -ans = data.table(seqname = "7", source = "protein_coding", feature = "intron", +ans = data.table(seqnames = "7", source = "protein_coding", feature = "intron", start = 27222648L, end = 27224054L, score = NA_real_, strand = "-", frame = NA_integer_, gene_id = "ENSG00000005073", transcript_id = "ENST00000006015", exon_number = NA, @@ -287,47 +282,49 @@ ans = data.table(seqname = "7", source = "protein_coding", feature = "intron", p_id = "P4212", transcript_name = "HOXA11-001", tss_id = "TSS161914", protein_id = NA_character_) setattr(ans, 'class', c("gtf", class(ans))) -ans2 = copy(ans) -setattr(ans2, 'class', c("intron", class(ans2))) +ans2 = new("intron", as(setDF(ans), "GRanges")) test(13.1, construct_introns(gtf, update=FALSE), ans2) -ans = rbind(gtf, ans)[order(seqname,start,end)] +ans = rbind(as_data_table(gtf)[, width := NULL], ans + )[order(seqnames,start,end)] setattr(ans, 'class', c("gtf", class(ans))) -test(13.2, construct_introns(gtf), ans) +ans2 = new("gtf", as(setDF(ans), "GRanges")) +test(13.2, construct_introns(gtf), ans2) ## extract features ------------------------------------------ gtf = read_format("sample.gtf") -ans = gtf[feature=="exon", .(seqname, start, end, length=end-start+1L, +ans = as_data_table(gtf)[feature=="exon", + .(seqnames, start, end, length=end-start+1L, strand, transcript_id, gene_id)] -setattr(ans, 'class', c("exon", class(gtf))) -test(14.1, extract(gtf, feature="exon", type="union"), ans) +ans2 = new("exon", as_granges(ans)) +test(14.1, extract(gtf, feature="exon", type="union"), ans2) ans[, overlaps := gene_id] -setattr(ans, 'class', c("gene", class(gtf))) -test(14.2, extract(gtf, feature="gene_exon", type="union"), ans) -ans = data.table(seqname = "7", start = 27221134L, end = 27224835L, +ans2 = new("gene", as_granges(ans)) +test(14.2, extract(gtf, feature="gene_exon", type="union"), ans2) +ans = data.table(seqnames = "7", start = 27221134L, end = 27224835L, length = 3702L, strand = "-", transcript_id = "ENST00000006015", gene_id = "ENSG00000005073", overlaps = "ENSG00000005073") -setattr(ans, 'class', c("gene", class(gtf))) -test(14.3, extract(gtf, feature="gene", type="union"), ans) +ans2 = new("gene", as_granges(ans)) +test(14.3, extract(gtf, feature="gene", type="union"), ans2) # test for feature="intron" on gtf object to check that introns are # constructed automatically using construct_introns() with a warning -ans = data.table(seqname="7", start=27222648L, end=27224054L, length=1407L, +ans = data.table(seqnames="7", start=27222648L, end=27224054L, length=1407L, strand="-", transcript_id="ENST00000006015", gene_id="ENSG00000005073") -setattr(ans, 'class', c("intron", "gtf", class(ans))) -test(14.4, extract(gtf, "intron", "union"), ans, +ans2 = new("intron", as_granges(ans)) +test(14.4, extract(gtf, "intron", "union"), ans2, warning="No introns found in input") ## non_overlaps ---------------------------------------------- gtf = read_format("sample.gtf") -gtf2 = construct_introns(gtf) -e = gtf2[feature=="exon", .(seqname,start,end,strand,transcript_id,gene_id)] -i = gtf2[feature=="intron", .(seqname,start,end,strand,transcript_id,gene_id)] +gtf2 = as_data_table(construct_introns(gtf)) +e = gtf2[feature=="exon", .(seqnames,start,end,strand,transcript_id,gene_id)] +i = gtf2[feature=="intron", .(seqnames,start,end,strand,transcript_id,gene_id)] e[, transcript_id := as.list(transcript_id)] i[, transcript_id := as.list(transcript_id)] -setorder(e, seqname,start,end) -setorder(i, seqname,start,end) -test(15.1, non_overlaps(gtf), list(non_overlapping_exon=e, - non_overlapping_intron=i)) +setorder(e, seqnames,start,end) +setorder(i, seqnames,start,end) +test(15.1, non_overlaps(gtf), list(non_overlapping_exon=as_granges(e), + non_overlapping_intron=as_granges(i))) ## test summary ---------------------------------------------- options(warn=0L) diff --git a/man/as_bam.Rd b/man/as_bam.Rd index 8def378..c7bc8ee 100644 --- a/man/as_bam.Rd +++ b/man/as_bam.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/internal-funs.R \name{as_bam} \alias{as_bam} \title{convert \code{GAlignments} object to \code{data.table}} diff --git a/man/as_data_table.Rd b/man/as_data_table.Rd new file mode 100644 index 0000000..395e0e8 --- /dev/null +++ b/man/as_data_table.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract-features.R +\name{as_data_table} +\alias{as_data_table} +\title{Convert gtf/gff/bam/bed S4 objects to data.table and preserve class} +\usage{ +as_data_table(x) +} +\value{ +A data.table object with input class preserved. +} +\description{ +\code{as.data.table} converts the input object to +\code{data.table}, but strips away all other classes except +\code{data.table} and \code{data.frame}. + +This internal function is a simple wrapper to \code{as.data.table} but +also retains the extra class information of the input object. +} +\examples{ +\dontrun{ +setClass("foo", contains="GRanges") +x = new("foo", GRanges(letters[1:5], IRanges(1:5, 6:10))) +class(x) # [1] "foo" +class(gread:::as_data_table(x)) # [1] "foo" "data.table" "data.frame" +} +} + diff --git a/man/as_granges.Rd b/man/as_granges.Rd index 41b3129..161a79b 100644 --- a/man/as_granges.Rd +++ b/man/as_granges.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/as-granges.R \name{as_granges} \alias{as_granges} -\title{Convert to GRanges object} +\title{Convert data.table to GRanges object} \usage{ as_granges(x, ignore_strand = FALSE) } @@ -19,8 +19,7 @@ A \code{GRanges} object. } \description{ Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam} -or a valid \code{data.table} to a GRanges object, preserving all -additional columns. +or a valid \code{data.table} to a GRanges object. } \examples{ path <- system.file("tests", package="gread") @@ -45,7 +44,7 @@ as_granges(bed, ignore_strand=FALSE) as_granges(bam, ignore_strand=FALSE) } \seealso{ -\code{\link{read_format}} -\code{\link{tidy_cols}} \code{\link{extract}} \code{\link{construct_introns}} +\code{\link{read_format}} \code{\link{extract}} +\code{\link{construct_introns}} } diff --git a/man/construct_introns.Rd b/man/construct_introns.Rd index 35c94fd..22651ee 100644 --- a/man/construct_introns.Rd +++ b/man/construct_introns.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/construct_introns.R +% Please edit documentation in R/construct-introns.R \name{construct_introns} \alias{construct_introns} -\title{Construct introns from \code{gtf} or \code{gff} objects} +\title{Construct introns from gtf/gff objects} \usage{ construct_introns(x, update = TRUE) } @@ -16,7 +16,8 @@ returned.} } \value{ An object of class \code{gtf/gff} with updated \code{intron} -coordinates or just the intron coordinates depending on \code{update}. +coordinates or just the intron coordinates depending on \code{update}. +They all inherit from \code{GRanges}. } \description{ This function generates intronic coordinates by extracting @@ -34,6 +35,6 @@ introns <- construct_introns(gtf, update=FALSE) } \seealso{ \code{\link{supported_formats}} \code{\link{read_format}} -\code{\link{extract}} \code{\link{tidy_cols}} \code{\link{as_granges}} +\code{\link{extract}} \code{\link{as_granges}} } diff --git a/man/disjoin_overlaps.Rd b/man/disjoin_overlaps.Rd index e18cadc..2188d9c 100644 --- a/man/disjoin_overlaps.Rd +++ b/man/disjoin_overlaps.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/internal-funs.R \name{disjoin_overlaps} \alias{disjoin_overlaps} \title{Compute disjoint ranges on a gtf/gff/bed/bam object.} diff --git a/man/extract.Rd b/man/extract.Rd index 1e70dd1..96b9f6e 100644 --- a/man/extract.Rd +++ b/man/extract.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/extract_features.R +% Please edit documentation in R/extract-features.R \name{extract} \alias{extract} \alias{extract_feature} @@ -11,7 +11,8 @@ extract(x, feature = c("gene_exon", "gene", "gene_intron", "exon", "intron"), gene_id = "gene_id", ...) } \arguments{ -\item{x}{Input object of class \code{gtf} or \code{gff}.} +\item{x}{Input object of class \code{gtf} or \code{gff} which inherits +from \code{GRanges}.} \item{feature}{A character vector of (usually related) features to extract from. One of \code{"gene_exon"}, \code{"gene"}, \code{"gene_intron"}, @@ -62,7 +63,7 @@ value is \code{"gene_id"}.} An object of class \code{"gene"} when \code{feature} is \code{"gene"}, \code{"gene_exon"} or \code{"gene_intron"}, and of class \code{"exon"} and \code{"intron"} when \code{feature} is \code{"exon"} or -\code{"intron"} respectively. +\code{"intron"} respectively. They all inherit from \code{GRanges}. } \description{ Provides functions for further post processing on objects of @@ -84,8 +85,7 @@ exons <- extract(gtf, feature="gene_exon", type="union") genes <- extract(gtf, feature="gene", type="default") } \seealso{ -\code{\link{read_format}} \code{\link{tidy_cols}} -\code{\link{as_granges}} \code{\link{extract}} -\code{\link{construct_introns}} +\code{\link{read_format}} \code{\link{as_granges}} +\code{\link{extract}} \code{\link{construct_introns}} } diff --git a/man/find_overlaps.Rd b/man/find_overlaps.Rd index 4172f4c..cf74ca7 100644 --- a/man/find_overlaps.Rd +++ b/man/find_overlaps.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/internal-funs.R \name{find_overlaps} \alias{find_overlaps} \title{Find overlapping indices of two gtf/gff/bed/bam objects} diff --git a/man/intersect_bed.Rd b/man/intersect_bed.Rd index 35954af..307cc44 100644 --- a/man/intersect_bed.Rd +++ b/man/intersect_bed.Rd @@ -19,8 +19,8 @@ to filter on. The column/value should be a \emph{character} vector containing the values to \emph{retain}.} } \value{ -A \code{gtf} object containing just those transcripts that overlap -with the provided bed file. +A \code{gtf} object that inherits from \code{GRanges} containing +just those transcripts that overlap with the provided bed file. } \description{ Given a \code{gtf} file, and a \code{bed} file, this function @@ -31,6 +31,5 @@ function is at the moment only for internal purposes. \seealso{ \code{\link{read_format}}, \code{\link{non_overlaps}}, \code{\link{construct_introns}}, \code{\link{extract}} -\code{\link{tidy_cols}} } diff --git a/man/intersect_overlaps.Rd b/man/intersect_overlaps.Rd index 270920e..8e96c8c 100644 --- a/man/intersect_overlaps.Rd +++ b/man/intersect_overlaps.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/internal-funs.R \name{intersect_overlaps} \alias{intersect_overlaps} \title{Compute intersecting ranges on a gtf/gff/bed/bam object with itself.} diff --git a/man/non_overlaps.Rd b/man/non_overlaps.Rd index a3d70f7..63971fd 100644 --- a/man/non_overlaps.Rd +++ b/man/non_overlaps.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/non_overlaps.R +% Please edit documentation in R/non-overlaps.R \name{non_overlaps} \alias{non_overlaps} \title{Generate non-overlapping intron coordinates} @@ -15,8 +15,8 @@ features \code{intron} and \code{exon}.} \item{gene_id}{Column name in \code{x} corresponding to gene id.} } \value{ -Returns a list of data.table containing \emph{non overlapping -exons} and \emph{non overlapping introns}. +Returns a list of \code{GRanges} objects containing +\emph{non overlapping exons} and \emph{non overlapping introns}. } \description{ This function takes an \code{gtf/gff} object. If the diff --git a/man/read_format.Rd b/man/read_format.Rd index e83c974..f0d632c 100644 --- a/man/read_format.Rd +++ b/man/read_format.Rd @@ -1,25 +1,24 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read_format.R +% Please edit documentation in R/read-format.R \name{read_format} \alias{read_bam} \alias{read_bed} \alias{read_format} \alias{read_gff} \alias{read_gtf} -\title{Quickly and easily read gtf/gff/bed/bam files} +\title{Quick and easy reading of gtf/gff/bed/bam files} \usage{ read_format(file, format = detect_format(file), filter = FALSE, - tidy_cols = TRUE, chromosomes = NULL, tags = c("NM", "MD"), - verbose = FALSE) + chromosomes = NULL, tags = c("NM", "MD"), verbose = FALSE) -read_gtf(file, filter = FALSE, tidy_cols = TRUE, verbose = FALSE) +read_gtf(file, filter = FALSE, verbose = FALSE) -read_gff(file, filter = FALSE, tidy_cols = TRUE, verbose = FALSE) +read_gff(file, filter = FALSE, verbose = FALSE) -read_bed(file, filter = FALSE, tidy_cols = TRUE, verbose = FALSE) +read_bed(file, filter = FALSE, verbose = FALSE) -read_bam(file, filter = FALSE, tidy_cols = TRUE, chromosomes = NULL, - tags = c("NM", "MD"), verbose = FALSE) +read_bam(file, filter = FALSE, chromosomes = NULL, tags = c("NM", "MD"), + verbose = FALSE) } \arguments{ \item{file}{Complete path to the input file.} @@ -29,13 +28,8 @@ read_bam(file, filter = FALSE, tidy_cols = TRUE, chromosomes = NULL, provided, then it will be guessed from input file's extension.} \item{filter}{Filter rows where \code{start/stop} are invalid, and where -\code{start > end}. Default is \code{FALSE} (not to filter, but retain all -invalid rows).} - -\item{tidy_cols}{If \code{TRUE} (default), returns only essential columns for -further analysis (for e.g, \code{score}, \code{itemRgb} etc. are removed), -\code{attributes} column is cleaned up with separate columns for -\code{gene}, \code{transcript} id etc. Default is \code{TRUE}.} +\code{start > end}. Default (\code{FALSE}) is to not filter, i.e., retain +all invalid rows as well.} \item{chromosomes}{Argument to \code{read_bam}. If specified only reads from those chromosomes will be loaded.} @@ -48,7 +42,7 @@ Default is \code{FALSE}.} } \value{ An object of class \code{gtf}, \code{gff}, \code{bed} or \code{bam}, -corresponding to the input file format, that inherits from \code{data.table}. +corresponding to the input file format, that inherits from \code{GRanges}. } \description{ \code{read_gtf}, \code{read_gff}, \code{read_bed} and @@ -58,26 +52,6 @@ formats. \code{read_format} is a top-level convenience function that detects and reads automatically from the extension and is sufficient in most cases. } -\details{ -Note that \code{gff1} uses \code{group} instead of -\code{attributes} as the column name, but \code{gread} always names it -as \code{attributes}. Similarly the first three columns of \code{bed} -format are named \code{seqname, start, end} instead of \code{chrom, -chromStart, chromEnd} for consistency. - -The argument \code{tidy_cols} (\code{TRUE} by default) tidies up the -\code{attributes} column in case of \code{gtf/gff} format files. The -\code{attributes} column itself is removed since it is tidied up into -multiple columns. If this is not desirable, use \code{tidy_cols = FALSE} to -load the file \emph{as-is} and then use the \code{tidy_cols} function with -\code{remove_cols = NULL}. - -In case of \code{gff} format, when `tidy_cols=TRUE`, generation of columns -\code{"transcript_id"} and \code{"gene_id"} will be attempted as these -columns are essential in most cases for downstream analyses. If possible, -the columns \code{"transcript_name"} and \code{"gene_name"} will also be -created. -} \examples{ path <- system.file("tests", package="gread") gff_file <- file.path(path, "sample.gff") @@ -94,14 +68,12 @@ read_bed(bed_file) # same as above read_format(bam_file) # read bam read_bam(bam_file) # same as above -read_format(gtf_file, tidy_cols=FALSE) # load as is, don't tidy - gtf_filter_file <- file.path(path, "sample_filter.gtf") read_format(gtf_filter_file, filter=TRUE) # filter invalid rows read_gtf(gtf_filter_file, filter=TRUE) # same as above } \seealso{ -\code{\link{supported_formats}} \code{\link{tidy_cols}} +\code{\link{supported_formats}} \code{\link{extract}} \code{\link{construct_introns}} } \keyword{file} diff --git a/man/reduce_overlaps.Rd b/man/reduce_overlaps.Rd index 2245d5e..42ccdce 100644 --- a/man/reduce_overlaps.Rd +++ b/man/reduce_overlaps.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/internal-funs.R \name{reduce_overlaps} \alias{reduce_overlaps} \title{Compute reduced ranges on a gtf/gff/bed/bam object.} diff --git a/man/shallow.Rd b/man/shallow.Rd index 46b698a..034152b 100644 --- a/man/shallow.Rd +++ b/man/shallow.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/internal-funs.R \name{shallow} \alias{shallow} \title{shallow copy a \code{data.table}} diff --git a/man/strictly_nonunique.Rd b/man/strictly_nonunique.Rd index e36db5a..6613ff3 100644 --- a/man/strictly_nonunique.Rd +++ b/man/strictly_nonunique.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_granges.R +% Please edit documentation in R/internal-funs.R \name{strictly_nonunique} \alias{strictly_nonunique} \title{Return only those rows where rows per group is > 1.} diff --git a/man/supported_formats.Rd b/man/supported_formats.Rd index 8b3be0b..c0736f0 100644 --- a/man/supported_formats.Rd +++ b/man/supported_formats.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/supported_formats.R +% Please edit documentation in R/supported-formats.R \name{supported_formats} \alias{supported_formats} \title{Currently supported formats} @@ -16,6 +16,6 @@ Maintains a vector of all the formats supported currently. supported_formats() } \seealso{ -\code{\link{read_format}} \code{\link{tidy_cols}} +\code{\link{read_format}} } diff --git a/man/test_gread.Rd b/man/test_gread.Rd index 1f15399..ce6c140 100644 --- a/man/test_gread.Rd +++ b/man/test_gread.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/test_gread.R +% Please edit documentation in R/test-gread.R \name{test_gread} \alias{test_gread} \title{Run a set of tests} @@ -38,7 +38,6 @@ gread:::test_gread() } } \seealso{ -\code{\link{read_format}} \code{\link{extract}} -\code{\link{tidy_cols}} +\code{\link{read_format}} \code{\link{extract}} } diff --git a/man/tidy_cols.Rd b/man/tidy_cols.Rd deleted file mode 100644 index e05cb55..0000000 --- a/man/tidy_cols.Rd +++ /dev/null @@ -1,85 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tidy_format.R -\name{tidy_cols} -\alias{tidy_cols} -\alias{tidy_cols.bam} -\alias{tidy_cols.bed} -\alias{tidy_cols.default} -\alias{tidy_cols.gff} -\alias{tidy_cols.gtf} -\title{Tidy up gtf/gff/bed/bam objects} -\usage{ -tidy_cols(x, ...) - -\method{tidy_cols}{default}(x, ...) - -\method{tidy_cols}{gtf}(x, remove_cols = "attributes", verbose = FALSE, ...) - -\method{tidy_cols}{gff}(x, remove_cols = "attributes", verbose = FALSE, ...) - -\method{tidy_cols}{bed}(x, remove_cols = NULL, verbose = FALSE, ...) - -\method{tidy_cols}{bam}(x, remove_cols = NULL, verbose = FALSE, ...) -} -\arguments{ -\item{x}{Input object, of class \code{gtf}, \code{gff}, \code{bed} or -\code{bam}.} - -\item{...}{Arguments that are ignored at the moment.} - -\item{remove_cols}{A character vector of column names to be removed.} - -\item{verbose}{Logical. Default is \code{FALSE}. If \code{TRUE}, helpful -status messages are printed on to the console.} -} -\value{ -A tidied object of class \code{gtf}, \code{gff}, \code{bed} or -\code{bam}, that inherits from \code{data.table}. -} -\description{ -Tidy up data depending on the type of input format. - -Note that this operation is performed \emph{by reference}, to avoid -unnecessary copies, for efficiency. Therefore there is no need to assign -the result back to a variable. -} -\details{ -In case of \code{gtf} and \code{gff} formats, the -\code{attributes} column is extracted into separate columns after which -it is removed (by default). Set \code{remove_cols} to \code{NULL} if you -would like to retain the column even after tidying up. - -In case of \code{bed} and \code{bam} files, no columns are removed by -default. However, you can use \code{remove_cols} argument if necessary. - -In case of \code{bam} files, the function \code{GAlignments} from the -\code{GenomicAlignments} Bioconductor package is used to read in the -file, with the additional arguments for \code{ScanBamParam()} function -using \code{NM} and \code{MD} tags by default. -} -\examples{ -path <- system.file("tests", package="gread") -gff_file <- file.path(path, "sample.gff") -gtf_file <- file.path(path, "sample.gtf") -bed_file <- file.path(path, "sample.bed") -bam_file <- file.path(path, "sample.bam") - -gtf <- read_format(gtf_file, tidy_cols=FALSE) -gff <- read_format(gtf_file, tidy_cols=FALSE) -bed <- read_format(bed_file, tidy_cols=FALSE) -bam <- read_format(bam_file, tidy_cols=FALSE) - -tidy_cols(gtf, remove_cols=NULL)[] # tidy attr. col, but not remove -tidy_cols(gff, remove_cols=NULL)[] # same as above, but for gff -tidy_cols(gtf, remove_cols="attributes")[] # tidy & remove attr. col -tidy_cols(gff, remove_cols="attributes")[] # same as above, but for gff - -tidy_cols(bed, remove_cols="name")[] # remove name column -tidy_cols(bam, remove_cols=c("NM", "MD"))[] # remove additional loaded tags -} -\seealso{ -\code{\link{supported_formats}} \code{\link{read_format}} -\code{\link{extract}} \code{\link{construct_introns}} -\code{\link{as_granges}} -} -