Skip to content

Commit

Permalink
All inputs and outputs of exported functions are Ranges objects now.
Browse files Browse the repository at this point in the history
  • Loading branch information
asrinivasan-oa committed Jul 25, 2016
1 parent 6d9ea5d commit 94b397f
Show file tree
Hide file tree
Showing 34 changed files with 641 additions and 786 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ Authors@R: c(
c("aut", "cre")), person("Open Analytics", role = "cph"))
Maintainer: Arunkumar Srinivasan <asrinivasan@openanalytics.eu>
Depends:
R (>= 3.3)
R (>= 3.3),
GenomicRanges
Imports:
stats,
utils,
Expand All @@ -20,7 +21,6 @@ Imports:
Rsamtools,
rtracklayer,
IRanges,
GenomicRanges,
GenomicFeatures,
GenomicAlignments
Suggests:
Expand Down
9 changes: 0 additions & 9 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
# Generated by roxygen2: do not edit by hand

S3method(tidy_cols,bam)
S3method(tidy_cols,bed)
S3method(tidy_cols,default)
S3method(tidy_cols,gff)
S3method(tidy_cols,gtf)
export(as_granges)
export(construct_introns)
export(extract)
export(read_bam)
Expand All @@ -14,7 +8,6 @@ export(read_format)
export(read_gff)
export(read_gtf)
export(supported_formats)
export(tidy_cols)
import(GenomicFeatures)
import(data.table)
import(methods)
Expand All @@ -27,8 +20,6 @@ importFrom(GenomicAlignments,readGAlignments)
importFrom(GenomicAlignments,rname)
importFrom(GenomicAlignments,summarizeOverlaps)
importFrom(GenomicAlignments,width)
importFrom(GenomicRanges,GRanges)
importFrom(GenomicRanges,split)
importFrom(IRanges,IRanges)
importFrom(Rsamtools,ScanBamParam)
importFrom(Rsamtools,scanBamFlag)
Expand Down
42 changes: 42 additions & 0 deletions R/as-granges.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#' @title Convert data.table to GRanges object
#'
#' @description Convert \code{gtf}, \code{gff}, \code{bed}, \code{bam}
#' or a valid \code{data.table} to a GRanges object.
#'
#' @param x An object of class \code{gtf}, \code{gff}, \code{bed} or
#' \code{bam} or a valid \code{data.table} object.
#' @param ignore_strand Logical argument to pass to \code{GRanges} function.
#' Indicates whether \code{strand} should be ignored when constructing
#' \code{GRanges} object or not. Default is \code{FALSE}.
#' @return A \code{GRanges} object.
#' @aliases as_granges
#' @examples
#' path <- system.file("tests", package="gread")
#' gff_file <- file.path(path, "sample.gff")
#' gtf_file <- file.path(path, "sample.gtf")
#' bed_file <- file.path(path, "sample.bed")
#' bam_file <- file.path(path, "sample.bam")
#'
#' gff <- read_format(gff_file)
#' gtf <- read_format(gtf_file)
#' bed <- read_format(bed_file)
#' bam <- read_format(bam_file)
#'
#' as_granges(gff)
#' as_granges(gtf)
#' as_granges(bed)
#' as_granges(bam)
#'
#' as_granges(gff, ignore_strand=FALSE)
#' as_granges(gtf, ignore_strand=FALSE)
#' as_granges(bed, ignore_strand=FALSE)
#' as_granges(bam, ignore_strand=FALSE)
#' @seealso \code{\link{read_format}} \code{\link{extract}}
#' \code{\link{construct_introns}}
as_granges <- function(x, ignore_strand=FALSE) {

stopifnot(is.gtf(x)||is.gff(x)||is.bed(x)||is.bam(x)||is.data.table(x))
x = shallow(x)
if (ignore_strand) x[, "strand" := NULL]
as(setDF(x), "GRanges")
}
29 changes: 16 additions & 13 deletions R/construct_introns.R → R/construct-introns.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' @title Construct introns from \code{gtf} or \code{gff} objects
#' @title Construct introns from gtf/gff objects
#'
#' @description This function generates intronic coordinates by extracting
#' all the \code{exons} from a \code{gtf} or \code{gff} object.
Expand All @@ -9,7 +9,8 @@
#' and returned invisibily, else just the \code{intron} coordinates are
#' returned.
#' @return An object of class \code{gtf/gff} with updated \code{intron}
#' coordinates or just the intron coordinates depending on \code{update}.
#' coordinates or just the intron coordinates depending on \code{update}.
#' They all inherit from \code{GRanges}.
#' @export
#' @examples
#' path <- system.file("tests", package="gread")
Expand All @@ -21,20 +22,21 @@
#' # same as above, but returns only constructed introns
#' introns <- construct_introns(gtf, update=FALSE)
#' @seealso \code{\link{supported_formats}} \code{\link{read_format}}
#' \code{\link{extract}} \code{\link{tidy_cols}} \code{\link{as_granges}}
#' \code{\link{extract}} \code{\link{as_granges}}
construct_introns <- function(x, update=TRUE) {
stopifnot(inherits(x, "gtf") || inherits(x, "gff"),
"feature" %chin% names(x),
stopifnot(is.gtf(x) || is.gff(x))
x = as_data_table(x)
stopifnot("feature" %chin% names(x),
"transcript_id" %chin% names(x),
update %in% c(TRUE, FALSE))

# to please R CMD CHECK
feature=seqname=transcript_id=NULL
feature=seqnames=transcript_id=NULL
x_class = copy(class(x))
exons = x[feature == "exon"]
if (!nrow(exons)) stop("feature == 'exon' returned 0 rows.")
setorderv(exons, c("transcript_id", "seqname", "start", "end", "strand"))
introns = exons[, .(seqname = seqname[1L],
setorderv(exons, c("transcript_id", "seqnames", "start", "end", "strand"))
introns = exons[, .(seqnames = seqnames[1L],
start = end[seq_len(.N-1L)]+1L,
end = start[seq_len(.N-1L)+1L]-1L,
feature = "intron",
Expand All @@ -45,22 +47,23 @@ construct_introns <- function(x, update=TRUE) {
stop("Exons for transcript ids [", paste(ids, collaspse=" "),
"] have start > end")
exons[, c(setdiff(names(introns), "transcript_id")) := NULL]
exons = unique(exons, by="transcript_id")
exons = unique(shallow(exons, reset_class=TRUE), by="transcript_id")
ecols = names(exons)
introns[exons, (ecols) := mget(ecols), on="transcript_id"]
# reset all exon related cols
introns[, grep("^exon", names(introns), value=TRUE) := NA]
colorder = names(x)
if (update) {
x = rbind(x, introns)
setorderv(x, c("seqname", "start", "end"))
setorderv(x, c("seqnames", "start", "end"))
x = x[, .SD, by="transcript_id"]
setattr(x, 'class', x_class)
setcolorder(x, colorder)
return(x)
ans = x
} else {
setcolorder(introns, colorder)
setattr(introns, 'class', unique(c("intron", x_class)))
return(introns[])
setattr(introns, 'class', c("intron", "data.table", "data.frame"))
ans = introns
}
new(class(ans)[1L], as(setDF(ans), "GRanges"))
}
77 changes: 54 additions & 23 deletions R/extract_features.R → R/extract-features.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#' @details Extract features based on various criteria (usually intended for
#' obtaining read counts using \code{gcount} for a given \code{bam} file.
#'
#' @param x Input object of class \code{gtf} or \code{gff}.
#' @param x Input object of class \code{gtf} or \code{gff} which inherits
#' from \code{GRanges}.
#' @param feature A character vector of (usually related) features to extract
#' from. One of \code{"gene_exon"}, \code{"gene"}, \code{"gene_intron"},
#' \code{"exon"}, \code{"intron"}. NB: \code{"exon"} feature must be present
Expand Down Expand Up @@ -51,10 +52,9 @@
#' @return An object of class \code{"gene"} when \code{feature} is
#' \code{"gene"}, \code{"gene_exon"} or \code{"gene_intron"}, and of class
#' \code{"exon"} and \code{"intron"} when \code{feature} is \code{"exon"} or
#' \code{"intron"} respectively.
#' @seealso \code{\link{read_format}} \code{\link{tidy_cols}}
#' \code{\link{as_granges}} \code{\link{extract}}
#' \code{\link{construct_introns}}
#' \code{"intron"} respectively. They all inherit from \code{GRanges}.
#' @seealso \code{\link{read_format}} \code{\link{as_granges}}
#' \code{\link{extract}} \code{\link{construct_introns}}
#' @export
#' @examples
#' path <- system.file("tests", package="gread")
Expand All @@ -74,14 +74,18 @@ extract <- function(x, feature=c("gene_exon", "gene", "gene_intron", "exon",
splits = unlist(strsplit(feature, "_", fixed=TRUE))
feature = splits[1L]
subfeature = if (is.na(splits[2L])) NULL else splits[2L]
stopifnot(is.gtf(x) || is.gff(x), "feature" %in% names(x),
ignore_strand %in% c(FALSE, TRUE), transcript_id %in% names(x),
gene_id %in% names(x), nrow(x)>0L)
x = shallow(x) # TODO: use data.table:::shallow when exported
stopifnot(is.gtf(x) || is.gff(x))
x <- as_data_table(x)
stopifnot("feature" %in% names(x),
ignore_strand %in% c(FALSE, TRUE),
transcript_id %in% names(x),
gene_id %in% names(x), nrow(x)>0L)
setattr(x, 'class', c(match.arg(feature), class(x)))
setnames(x, c(transcript_id, gene_id), c("transcript_id", "gene_id"))
extract_feature(x, unique(x$feature), subfeature,
setnames(x, c(transcript_id, gene_id),
c("transcript_id", "gene_id"))
ans = extract_feature(x, unique(x$feature), subfeature,
match.arg(type), ignore_strand, ...)
new(class(ans)[1L], as(setDF(ans), "GRanges"))
}

# ----- internal functions for extracting specific features ----- #
Expand All @@ -99,24 +103,24 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
ignore_strand=FALSE, ...) {

# to pleae R CMD CHECK
seqname=gene_id=transcript_id=NULL
seqnames=gene_id=transcript_id=NULL
if (is.null(feature)) {
if (!"exon" %in% uniq_features)
stop("'exon' must be a feature in input object 'x'.")
construct_transcripts <- function(x) {
x[feature %chin% "exon", .(seqname=seqname[1L], start=min(start),
x[feature %chin% "exon", .(seqnames=seqnames[1L], start=min(start),
end=max(end), strand=strand[1L], gene_id=gene_id[1L]),
by="transcript_id"]
}
transcripts = construct_transcripts(x)
} else {
cols=c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
cols=c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
.feature = feature
transcripts = x[feature %chin% .feature, cols, with=FALSE]
}
ans = switch (type,
default = {
genes = transcripts[, .(seqname=seqname[1L],
genes = transcripts[, .(seqnames=seqnames[1L],
start=min(start), end=max(end), strand=strand[1L],
transcript_id=paste(unique(transcript_id),
collapse=";")), by="gene_id"]
Expand Down Expand Up @@ -149,7 +153,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
},
intersect = {
genes = transcripts[, if (max(start) <= min(end))
.(seqname=seqname[1L], start=max(start),
.(seqnames=seqnames[1L], start=max(start),
end=min(end), strand=strand[1L],
transcript_id=paste(unique(transcript_id),
collapse=";")), by="gene_id"]
Expand All @@ -176,7 +180,7 @@ extract_feature.gene <- function(x, uniq_features, feature, type,
collapse=";")), by="queryHits"]
genes[, "overlaps":=gene_id][olaps$queryHits, "overlaps":=olaps$gene_id]

colorder = c("seqname", "start", "end", "length", "strand",
colorder = c("seqnames", "start", "end", "length", "strand",
"transcript_id", "gene_id", "overlaps")
setcolorder(genes, colorder)
genes[]
Expand All @@ -188,7 +192,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type,

# to please R CMD CHECK
transcript_id=NULL
cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
.feature = if (is.null(feature)) "exon" else feature
exons = x[feature %chin% .feature, cols, with=FALSE]
ans = switch (type,
Expand Down Expand Up @@ -258,7 +262,7 @@ extract_feature.exon <- function(x, uniq_features, feature, type,
# ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" :=
# olaps$gene_id]

colorder = c("seqname", "start", "end", "length", "strand",
colorder = c("seqnames", "start", "end", "length", "strand",
"transcript_id", "gene_id") #, "overlaps")
setcolorder(exons, colorder)
exons[]
Expand All @@ -269,14 +273,16 @@ extract_feature.intron <- function(x, uniq_features, feature, type,
stopifnot("exon" %in% uniq_features)
# to please R CMD CHECK
transcript_id=NULL
cols = c("transcript_id", "seqname", "start", "end", "strand", "gene_id")
cols = c("transcript_id", "seqnames", "start", "end", "strand", "gene_id")
.feature = if (is.null(feature)) "intron" else feature
introns = x[feature %chin% .feature, cols, with=FALSE]
if (nrow(introns) == 0L) {
# most likely the object doesn't have introns, so construct them
warning("No introns found in input object. Attempting to construct
introns using construct_introns().")
introns = construct_introns(x, update=FALSE)
xx = new(class(x)[2L], as_granges(x))
introns = as_data_table(construct_introns(xx, update=FALSE))
setattr(introns, 'class', data.table::copy(class(x)))
}
if (nrow(introns) == 0L)
stop("construct_introns() resulted in 0 introns as well.
Expand Down Expand Up @@ -348,15 +354,40 @@ extract_feature.intron <- function(x, uniq_features, feature, type,
# ans[, "overlaps" := gene_id][olaps$queryHits, "overlaps" :=
# olaps$gene_id]

colorder = c("seqname", "start", "end", "length", "strand",
colorder = c("seqnames", "start", "end", "length", "strand",
"transcript_id", "gene_id") #, "overlaps")
setcolorder(introns, colorder)
introns[]
}

# ----- Helper functions for extract_features ----- #
# ----- Helper functions for extract_features --------------------------------

is.gtf <- function(x) inherits(x, 'gtf')
is.gff <- function(x) inherits(x, 'gff')
is.bed <- function(x) inherits(x, 'bed')
is.bam <- function(x) inherits(x, 'bam')

#' @title Convert gtf/gff/bam/bed S4 objects to data.table and preserve class
#'
#' @description \code{as.data.table} converts the input object to
#' \code{data.table}, but strips away all other classes except
#' \code{data.table} and \code{data.frame}.
#'
#' This internal function is a simple wrapper to \code{as.data.table} but
#' also retains the extra class information of the input object.
#' @aliases as_data_table
#' @return A data.table object with input class preserved.
#' @examples
#' \dontrun{
#' setClass("foo", contains="GRanges")
#' x = new("foo", GRanges(letters[1:5], IRanges(1:5, 6:10)))
#' class(x) # [1] "foo"
#' class(gread:::as_data_table(x)) # [1] "foo" "data.table" "data.frame"
#' }
as_data_table <- function(x, reset_class=FALSE) {
stopifnot(inherits(x, "GRanges"))
cl = if (identical(class(x), "GRanges") || reset_class) NULL else class(x)
ans = as.data.table(x)
setattr(ans, 'class', c(cl, "data.table", "data.frame"))
ans[]
}
Loading

0 comments on commit 94b397f

Please sign in to comment.