Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge branch 'master' of git@github.com:bioperl/Bio-FeatureIO

Conflicts (fixed):
	MANIFEST
	MANIFEST.SKIP
  • Loading branch information...
commit 890c9de1d8755d541497a2e2355f1e3b445e9b99 2 parents 13feeff + 4295ccc
Chris Fields authored
4 .gitignore
View
@@ -2,6 +2,7 @@
.tmp
*#
.#*
+.*.swp
*(Autosaved)blib*
Build
Build.bat
@@ -12,4 +13,7 @@ pm_to_blib*
cover_db
pod2htm*.tmp
.emacs.*
+t/LocalDB/SeqFeature_*.t
blib*
+*.bak
+MYMETA.yml
35 Build.PL
View
@@ -18,18 +18,51 @@ my $build = Module::Build->new(
create_license => 1,
meta_merge => {
'resources' => {
- 'repository' => 'svn://code.open-bio.org/bioperl/Bio-FeatureIO',
+ 'repository' => 'git@github.com:bioperl/bioperl-live.git',
},
},
+ auto_features => {
+ Network => {
+ description => "Enable tests that need an internet connection",
+ requires => { 'LWP::UserAgent' => 0 }
+ }
+ },
+ get_options => {
+ accept => { },
+ network => { }
+ },
requires => {
'perl' => '5.6.1',
'Bio::Root::Version' => '1.006900',
'URI::Escape' => '0', # dealing with web resources/Bio::FeatureIO::gff,Bio::FeatureIO::interpro,Bio::DB::Biblio::eutils,Bio::DB::EUtilParameters,Bio::DB::Query::GenBank,Bio::DB::NCBIHelper,Bio::SeqFeature::Annotated',
'XML::DOM::XPath' => '0.13', # parsing interpro features/Bio::FeatureIO::interpro',
},
+ recursive_test_files => 1,
dynamic_config => 1,
);
+my $accept = $build->args->{accept};
+
+# Do network tests?
+my $do_network_tests = 0;
+if ($build->args('network')) {
+ $do_network_tests = $build->feature('Network');
+}
+elsif ($build->feature('Network')) {
+ $do_network_tests = $accept ? 0 :
+ $build->y_n("Do you want to run tests that require connection to ".
+ "servers across the internet\n(likely to cause some failures)? y/n",
+ 'n');
+}
+
+if ($do_network_tests) {
+ $build->notes(network => 1);
+ $build->log_info(" - will run internet-requiring tests\n");
+} else {
+ $build->notes(network => 0);
+ $build->log_info(" - will not run internet-requiring tests\n");
+}
+
# Create the build script and exit
$build->create_build_script;
6 MANIFEST
View
@@ -11,16 +11,19 @@ inc/Test/More.pm
inc/Test/Simple.pm
inc/Test/Tutorial.pod
inc/Test/Warn.pm
-lib/Bio/Factory/FeatureFactory.pm
lib/Bio/FeatureIO.pm
lib/Bio/FeatureIO/bed.pm
lib/Bio/FeatureIO/gff.pm
lib/Bio/FeatureIO/gtf.pm
+lib/Bio/FeatureIO/Handler/GenericFeatureHandler.pm
lib/Bio/FeatureIO/interpro.pm
lib/Bio/FeatureIO/ptt.pm
lib/Bio/FeatureIO/vecscreen_simple.pm
+lib/Bio/SeqFeature/Annotated.pm
MANIFEST This list of files
README
+t/bed.t
+t/data/1.bed
t/data/directives.gff3
t/data/dna1.fa
t/data/hybrid1.gff3
@@ -33,4 +36,5 @@ t/FeatureIO.t
t/FeatureIO.x
t/gff.t
t/ptt.t
+t/SeqFeature/Annotated.t
t/vecscreen.t
24 MANIFEST.SKIP
View
@@ -1,5 +1,4 @@
-
-#!start included /usr/local/share/perl/5.10.0/ExtUtils/MANIFEST.SKIP
+#!start included /opt/perl512/lib/5.12.1/ExtUtils/MANIFEST.SKIP
# Avoid version control files.
\bRCS\b
\bCVS\b
@@ -9,6 +8,12 @@
\B\.git\b
\B\.gitignore\b
\b_darcs\b
+\B\.cvsignore$
+
+# Avoid VMS specific MakeMaker generated files
+\bDescrip.MMS$
+\bDESCRIP.MMS$
+\bdescrip.mms$
# Avoid Makemaker generated and utility files.
\bMANIFEST\.bak
@@ -22,6 +27,10 @@
# Avoid Module::Build generated and utility files.
\bBuild$
\b_build/
+\bBuild.bat$
+\bBuild.COM$
+\bBUILD.COM$
+\bbuild.com$
# Avoid temp and backup files.
~$
@@ -29,10 +38,19 @@
\#$
\b\.#
\.bak$
+\.tmp$
+\.#
+\.rej$
+
+# Avoid OS-specific files/dirs
+# Mac OSX metadata
+\B\.DS_Store
+# Mac OSX SMB mount metadata files
+\B\._
# Avoid Devel::Cover files.
\bcover_db\b
-#!end included /usr/local/share/perl/5.10.0/ExtUtils/MANIFEST.SKIP
+#!end included /opt/perl512/lib/5.12.1/ExtUtils/MANIFEST.SKIP
# Avoid configuration metadata file
^MYMETA\.
132 lib/Bio/FeatureIO.pm
View
@@ -16,7 +16,7 @@
=head1 NAME
-Bio::FeatureIO - Handler for FeatureIO
+Bio::FeatureIO - BioPerl IO classes for creating a stream of sequence features
=head1 SYNOPSIS
@@ -45,7 +45,11 @@ Bio::FeatureIO - Handler for FeatureIO
=head1 DESCRIPTION
-An I/O iterator subsystem for genomic sequence features.
+An I/O iterator subsystem for genomic sequence features. This set of parsers
+can be used on many levels:
+
+1) Simple parsing: the next_dataset() method returns hash-refs containing
+both the data parsed and
Bio::FeatureIO is a handler module for the formats in the FeatureIO set (eg,
Bio::FeatureIO::GFF). It is the officially sanctioned way of getting at the
@@ -260,10 +264,11 @@ methods. Internal methods are usually preceded with a _
package Bio::FeatureIO;
use strict;
+use warnings;
use Symbol;
-use base qw(Bio::Root::Root Bio::Root::IO);
+use base qw(Bio::Root::IO);
=head2 new
@@ -281,37 +286,37 @@ use base qw(Bio::Root::Root Bio::Root::IO);
my $entry = 0;
sub new {
- my ($caller,@args) = @_;
- my $class = ref($caller) || $caller;
-
- # or do we want to call SUPER on an object if $caller is an
- # object?
- if( $class =~ /Bio::FeatureIO::(\S+)/ ) {
-
- my ($self) = $class->SUPER::new(@args);
- $self->_initialize(@args);
- return $self;
-
- } else {
-
- my %param = @args;
-
- @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
- my $format = $param{'-format'} ||
- $class->_guess_format( $param{-file} || $ARGV[0] );
-
- if( ! $format ) {
- if ($param{-file}) {
- $format = $class->_guess_format($param{-file});
- } elsif ($param{-fh}) {
- $format = $class->_guess_format(undef);
+ my ($caller,@args) = @_;
+ my $class = ref($caller) || $caller;
+
+ # or do we want to call SUPER on an object if $caller is an
+ # object?
+ if( $class =~ /Bio::FeatureIO::(\S+)/ ) {
+
+ my ($self) = $class->SUPER::new(@args);
+ $self->_initialize(@args);
+ return $self;
+
+ } else {
+
+ my %param = @args;
+
+ @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
+ my $format = $param{'-format'} ||
+ $class->_guess_format( $param{-file} || $ARGV[0] );
+
+ if( ! $format ) {
+ if ($param{-file}) {
+ $format = $class->_guess_format($param{-file});
+ } elsif ($param{-fh}) {
+ $format = $class->_guess_format(undef);
+ }
}
- }
- $format = "\L$format"; # normalize capitalization to lower case
- return unless( $class->_load_format_module($format) );
- return "Bio::FeatureIO::$format"->new(@args);
-
- }
+ $format = "\L$format"; # normalize capitalization to lower case
+ return unless( $class->_load_format_module($format) );
+ return "Bio::FeatureIO::$format"->new(@args);
+
+ }
}
=head2 newFh
@@ -330,9 +335,9 @@ See L<Bio::FeatureIO::Fh>
=cut
sub newFh {
- my $class = shift;
- return unless my $self = $class->new(@_);
- return $self->fh;
+ my $class = shift;
+ return unless my $self = $class->new(@_);
+ return $self->fh;
}
=head2 fh
@@ -390,7 +395,7 @@ sub _initialize {
Returns : a Bio::SeqFeatureI feature object
Args : none
-See L<Bio::Root::RootI>, L<Bio::SeqFeatureI>
+See L<Bio::Root::RootI>, L<Bio::SeqFeatureI>.
=cut
@@ -414,6 +419,24 @@ sub write_feature {
$self->throw_not_implemented();
}
+=head2 seq
+
+ Title : seq
+ Usage : $obj->seq() OR $obj->seq($newSeq)
+ Example :
+ Returns : Bio::SeqI object
+ Args : newSeq (optional)
+
+=cut
+
+sub seq {
+ my $self = shift;
+ my $val = shift;
+
+ $self->{'seq'} = $val if defined($val);
+ return $self->{'seq'};
+}
+
=head2 _load_format_module
Title : _load_format_module
@@ -446,41 +469,6 @@ END
return $ok;
}
-=head2 seq
-
- Title : seq
- Usage : $obj->seq() OR $obj->seq($newSeq)
- Example :
- Returns : Bio::SeqI object
- Args : newSeq (optional)
-
-=cut
-
-sub seq {
- my $self = shift;
- my $val = shift;
-
- $self->{'seq'} = $val if defined($val);
- return $self->{'seq'};
-}
-
-=head2 _filehandle
-
- Title : _filehandle
- Usage : $obj->_filehandle($newval)
- Function: This method is deprecated. Call _fh() instead.
- Example :
- Returns : value of _filehandle
- Args : newvalue (optional)
-
-
-=cut
-
-sub _filehandle {
- my ($self,@args) = @_;
- return $self->_fh(@args);
-}
-
=head2 _guess_format
Title : _guess_format
193 lib/Bio/Factory/FeatureFactory.pm → lib/Bio/FeatureIO/Handler/GenericFeatureHandler.pm
View
@@ -1,36 +1,211 @@
-package Bio::Factory::FeatureFactory;
+package Bio::FeatureIO::Handler::GenericFeatureHandler;
-use base qw(Bio::Root::Root Bio::Factory::ObjectFactoryI);
+use base qw(Bio::Root::Root Bio::HandlerBaseI);
-sub unflattener {
+use strict;
+use warnings;
+use Data::Dumper;
+use Bio::SeqFeature::Generic;
+use Bio::SeqIO;
+
+my $ct = 0;
+my %GFF3_RESERVED_TAGS = map {$_ => $ct++ }
+ qw(ID Name Alias Parent Target Gap
+ Derives_from Note Dbxref Ontology_term Index);
+my %HANDLERS = (
+ 'directive' => \&directives,
+ 'comment' => \&comment,
+ 'feature' => \&seqfeature,
+ 'sequence' => \&sequence,
+);
+
+sub new {
+ my ($class, @args) = @_;
+ my $self = $class->SUPER::new(@args);
+ $self = {@args};
+ bless $self,$class;
+ $self->handler_methods();
+ return $self;
}
-sub
-
-sub create_feature {
+sub data_handler {
+ my ($self, $data) = @_;
+ my $nm = $data->{MODE} || $self->throw("No type tag defined!\n".Dumper($data));
+
+ $self->set_parameters('mode', $nm eq 'directive' ? $data->{DATA}->{type} : $nm);
+ # this should handle data on the fly w/o caching; any caching should be
+ # done in the driver!
+ my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
+ (exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
+ undef;
+
+ # needs a can check, but $self->can oddly isn't working here...
+ if ($method && ref $method eq 'CODE') {
+ return $method->($data, $self);
+ } else {
+ $self->debug("No handler defined for $nm\n");
+ return;
+ }
}
+sub handler_methods {
+ my $self = shift;
+ my $handlers = shift;
+ if (!($self->{'handlers'}) || defined $handlers && ref $handlers eq 'HASH') {
+ $self->{'handlers'} = $handlers || \%HANDLERS;
+ }
+ return ($self->{'handlers'});
+}
+
+sub set_handler_helper {
+ my ($self, $name, $sub) = @_;
+ return if !($name && $sub);
+ $self->throw("Passed callback must be a code ref") if $sub && ref $sub eq 'CODE';
+ $self->{'handlers'}->{$name} = $sub;
+}
+
+sub format {
+ my $self = shift;
+ return $self->{format} = shift if @_;
+ return $self->{format};
+}
+
+sub reset_parameters {
+ my ($self) = @_;
+ $self->{parameters} = {};
+}
+
+sub get_parameters {
+ my ($self, $param) = @_;
+ return if !($param);
+ $self->{parameters}->{$param};
+}
+
+sub set_parameters {
+ my ($self, $param, $value) = @_;
+ return if !($param && defined $value);
+ $self->{parameters}->{$param} = $value;
+}
+
+# this needs to be a Bio::SeqFeature::CollectionI that can distinguish
+# between sequence regions; the simplest versions don't
+
sub feature_collection {
+ my $self = shift;
+ return $self->{feature_collection} = shift if @_;
+ return $self->{feature_collection};
+}
+
+sub file_handle {
+ return shift->{-fh};
+}
+
+sub fasta_mode {
+ my $self = shift;
+ my $mode = $self->get_parameters('mode');
+ return unless $mode;
+ $mode eq 'sequence' || $mode eq 'sequence-region' ? 1 : 0;
+}
+
+sub resolve_references {
+ my $self = shift;
+ my $mode = $self->get_parameters('mode');
+ return unless $mode;
+ $mode eq 'resolve-references' || $mode eq 'sequence' || $mode eq 'sequence-region' ? 1 : 0;
+}
+
+################ HANDLERS ################
+
+# Handler methods are designed so they are called as sub references, not as
+# class or instance based calls. This decouples them from any handler class and
+# allow some customization (for instance, if we need the parent parser to
+# override them). The parser in this case is ultimately responsible for
+# determining what happens to the data.
+
+# Note this just passes in the data w/o munging it beyond recognition
+sub seqfeature {
+ my ($data, $handler) = @_;
+
+ my %sf_data = map {'-'.$_ => $data->{DATA}->{$_}}
+ grep { $data->{DATA}->{$_} ne '.' }
+ sort keys %{$data->{DATA}};
+ if ($data->{DATA}->{attributes}) {
+ delete $sf_data{-attributes};
+ my %tags;
+
+ # TODO: GFF3-specific split; need to make more general
+ for my $kv (split(/\s*;\s*/, $data->{DATA}->{attributes})) {
+ my ($key, $rest) = split(/[=\s]/, $kv, 2);
+ # add optional/required URI unescaping here
+ my @vals = split(',',$rest);
+ $tags{$key} = \@vals;
+ }
+ $sf_data{-tag} = \%tags;
+ }
+
+ return Bio::SeqFeature::Generic->new(%sf_data);
+}
+
+sub directives {
+ my ($data, $handler) = @_;
+ my $directive = $data->{DATA}->{type};
+ if ($directive eq 'sequence') {
+ my $fh = $handler->file_handle;
+ $handler->throw("Handler doesn't have a set file handle") if !$fh;
+ return Bio::SeqIO->new(-format => 'fasta',
+ -fh => $fh);
+ } elsif ($directive eq 'sequence-region') {
+ # we can make returning a features optional here, but we should do
+ # something with the data in all cases
+
+ my $sf_data = $data->{DATA};
+ return Bio::SeqFeature::Generic->new(-start => $sf_data->{start},
+ -end => $sf_data->{end},
+ -strand => 1,
+ -seq_id => $sf_data->{id},
+ -primary_tag => 'region');
+ } else {
+ # defaults for other directives?
+ }
+ return;
+}
+
+sub sequence {
+ my ($data, $handler) = @_;
+ # if we reach this point, the sequence stream has already been read, so
+ # we need to seek back to the start point. Note if the stream isn't seekable
+ # this will fail spectacularly at this point!
+ my ($start, $len) = @{$data}{qw(START LENGTH)};
+ my $fh = $handler->file_handle;
+ $handler->throw("Handler doesn't have a set file handle") if !$fh;
+ seek($fh, $start, 0);
+ return Bio::SeqIO->new(-format => 'fasta',
+ -fh => $fh);
}
+# no op, we just skip these
+sub comment {}
+
1;
__END__
=head1 NAME
-Bio::Factory::FeatureFactory - <One-line description of module's purpose>
+Bio::FeatureIO::Handler::GenericFeatureHandler.pm - <One-line description of module's
+purpose>
=head1 VERSION
-This documentation refers to Bio::Factory::FeatureFactory version Bio::Root::Root.
+This documentation refers to Bio::FeatureIO::Handler::GenericFeatureHandler.pm version
+Bio::Root::Root.
=head1 SYNOPSIS
- use Bio::Factory::FeatureFactory;
+ use Bio::FeatureIO::Handler::GenericFeatureHandler.pm;
# Brief but working code example(s) here showing the most common usage(s)
# This section will be as far as many users bother reading,
7 lib/Bio/FeatureIO/bed.pm
View
@@ -78,7 +78,6 @@ use strict;
use base qw(Bio::FeatureIO);
use Bio::SeqFeature::Annotated;
use Bio::Annotation::SimpleValue;
-use Bio::OntologyIO;
use Scalar::Util qw(looks_like_number);
=head2 _initialize
@@ -228,8 +227,10 @@ sub next_feature {
return $self->next_feature;
}
- my $feature = Bio::SeqFeature::Annotated->new(-start => $start, # start is 0 based
- -end => --$end, # end is not part of the feature
+ # start is 0 based, need it 1-based;
+ # end is one beyond the feature ends and thus already 1-based
+ my $feature = Bio::SeqFeature::Annotated->new(-start => ++$start,
+ -end => $end,
$score ? (-score => $score) : (),
$strand ? (-strand => $strand eq '+' ? 1 : -1) : ());
1,499 lib/Bio/FeatureIO/gff.pm
View
@@ -1,256 +1,205 @@
-=pod
-
-=head1 NAME
-
-Bio::FeatureIO::gff - read/write GFF feature files
-
-=head1 SYNOPSIS
-
- my $feature; #get a Bio::SeqFeature::Annotated somehow
- my $featureOut = Bio::FeatureIO->new(
- -format => 'gff',
- -version => 3,
- -fh => \*STDOUT,
- -validate_terms => 1, #boolean. validate ontology terms online? default 0 (false).
- );
- $featureOut->write_feature($feature);
-
-=head1 DESCRIPTION
-
- Currently implemented:
-
- version read? write?
- ------------------------------
- GFF 1 N N
- GFF 2 N N
- GFF 2.5 (GTF) N Y
- GFF 3 Y Y
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to
-the Bioperl mailing list. Your participation is much appreciated.
-
- bioperl-l@bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_list - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l@bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-of the bugs and their resolution. Bug reports can be submitted via
-the web:
-
- http://bugzilla.open-bio.org/
-
-=head1 AUTHOR
-
- Allen Day, <allenday@ucla.edu>
-
-=head1 CONTRIBUTORS
-
- Steffen Grossmann, <grossman@molgen.mpg.de>
- Scott Cain, <cain@cshl.edu>
- Rob Edwards <rob@salmonella.org>
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object methods.
-Internal methods are usually preceded with a _
-
-=cut
-
-
-# Let the code begin...
-
package Bio::FeatureIO::gff;
-use strict;
-
-#these are alphabetical, keep them that way.
-use Bio::Annotation::DBLink;
-use Bio::Annotation::OntologyTerm;
-use Bio::Annotation::SimpleValue;
-use Bio::Annotation::Target;
-use Bio::FeatureIO;
-use Bio::Ontology::OntologyStore;
-use Bio::OntologyIO;
-use Bio::SeqFeature::Annotated;
-use Bio::SeqIO;
-use URI::Escape;
use base qw(Bio::FeatureIO);
-use constant DEFAULT_VERSION => 3;
-my $RESERVED_TAGS = "ID|Name|Alias|Parent|Target|Gap|Derives_from|Note|Dbxref|dbxref|Ontology_term|Index|CRUD";
+use strict;
+use warnings;
+#use URI::Escape;
+use Bio::FeatureIO::Handler::GenericFeatureHandler;
+use Scalar::Util qw(blessed);
+use Data::Dumper;
-sub _initialize {
- my($self,%arg) = @_;
+=head1
- $self->SUPER::_initialize(%arg);
+Need to come up with a controlled vocabulary for parceling out this data. This
+should be in lines with a defined schema, such as Chado or BioSQL. Should
+possibly refactor the SeqIO drivers similarly.
- $self->version( $arg{-version} || DEFAULT_VERSION);
- $self->validate($arg{-validate_terms} || 0);
+=cut
- if ($arg{-file} =~ /^>.*/ ) {
- $self->_print("##gff-version " . $self->version() . "\n");
- }
- else {
- my $directive;
- while(($directive = $self->_readline()) && ($directive =~ /^##/) ){
- $self->_handle_directive($directive);
- }
- $self->_pushback($directive);
- }
+sub _initialize {
+ my($self, @args) = @_;
- #need to validate against SOFA, no SO
- if ($self->validate) {
- $self->so(
- Bio::Ontology::OntologyStore->get_ontology('Sequence Ontology Feature Annotation')
- );
- }
+ $self->SUPER::_initialize(@args);
+
+ my ($handler, $handler_args, $format) =
+ $self->_rearrange([qw(HANDLER HANDLER_ARGS FORMAT)] , @args);
+ $format ||= 'GFF3';
+ $handler ||= Bio::FeatureIO::Handler::GenericFeatureHandler->new(-verbose => $self->verbose,
+ -fh => $self->_fh);
+ if (!ref($handler) || !$handler->isa('Bio::HandlerBaseI')) {
+ $self->throw('Passed object must be a Bio::HandlerBaseI');
+ }
+ $handler->format($format);
+ $self->_init_stream();
+ $self->handler($handler);
}
-=head2 next_feature()
-
- Usage : my $feature = $featureio->next_feature();
- Function: reads a feature record from a GFF stream and returns it as an object.
- Returns : a Bio::SeqFeature::Annotated object
- Args : N/A
-
-=cut
-
+# raw feature stream; returned features are as-is, may be modified post-return
sub next_feature {
- my $self = shift;
- my $gff_string;
-
- my($f) = $self->_buffer_feature();
- if($f){
- return $f;
- }
-
- return if $self->fasta_mode();
-
- # be graceful about empty lines or comments, and make sure we return undef
- # if the input is consumed
- while(($gff_string = $self->_readline()) && defined($gff_string)) {
- next if $gff_string =~ /^\s*$/; #skip blank lines
- next if $gff_string =~ /^\#[^#]/; #skip comments, but not directives
- last;
- }
-
- return unless $gff_string;
-
- # looks like we went into FASTA mode without a directive.
- if($gff_string =~ /^>/){
- $self->_pushback($gff_string);
- $self->fasta_mode(1);
- return;
- }
-
- # got a directive
- elsif($gff_string =~ /^##/){
- $self->_handle_directive($gff_string);
- # recurse down to the next line. this will bottom out on finding a real feature or EOF
- return $self->next_feature();
- }
-
- # got a feature
- else {
- return $self->_handle_feature($gff_string);
- }
+ my $self = shift;
+ return if $self->fasta_mode;
+ DATASET:
+ while (my $ds = $self->next_dataset) {
+ # leave it to the handler to decide when a feature is returned
+ while (my $object = $self->handler->data_handler($ds)) {
+ return $object if $object->isa('Bio::SeqFeatureI');
+ if ($object->isa('Bio::SeqIO')) {
+ $self->seqio($object);
+ return;
+ }
+ }
+ }
}
-=head2 next_feature_group
+=head1
- Title : next_feature_group
- Usage : @feature_group = $stream->next_feature_group
- Function: Reads the next feature_group from $stream and returns it.
-
- Feature groups in GFF3 files are separated by '###' directives. The
- features in a group might form a hierarchical structure. The
- complete hierarchy of features is returned, i.e. the returned array
- represents only the top-level features. Lower-level features can
- be accessed using the 'get_SeqFeatures' method recursively.
+Data is passed as hash-refs, similar to a SAX-based data stream, but containing
+chunks of related information. A version of this is implemented in Bio::SeqIO
+plugins gbdriver, embldriver, and swissdriver.
- Example : # getting the complete hierarchy of features in a GFF3 file
- my @toplevel_features;
- while (my @fg = $stream->next_feature_group) {
- push(@toplevel_features, @fg);
- }
- Returns : an array of Bio::SeqFeature::Annotated objects
- Args : none
+The key issue is defining specifically how bits are bundled and passed along to
+the data handler. the other key point is that the start and length of the
+specific chunk of data passed in is also passed along, primarily if one wanted
+to create lazy feature collections .
=cut
-sub next_feature_group {
- my $self = shift;
-
- my $feat;
- my %seen_ids;
- my @all_feats;
- my @toplevel_feats;
-
- $self->{group_not_done} = 1;
-
- while ($self->{group_not_done} && ($feat = $self->next_feature()) && defined($feat)) {
- # we start by collecting all features in the group and
- # memorizing those which have an ID attribute
- my $anno_ID = $feat->get_Annotations('ID');
- if(ref($anno_ID)) {
- my $attr_ID = $anno_ID->value;
- $self->throw("Oops! ID $attr_ID exists more than once in your file!")
- if (exists($seen_ids{$attr_ID}));
- $seen_ids{$attr_ID} = $feat;
- }
- push(@all_feats, $feat);
- }
-
- # assemble the top-level features
- foreach $feat (@all_feats) {
- my @parents = $feat->get_Annotations('Parent');
- if (@parents) {
- foreach my $parent (@parents) {
- my $parent_id = $parent->value;
- $self->throw("Parent with ID $parent_id not found!") unless (exists($seen_ids{$parent_id}));
- $seen_ids{$parent_id}->add_SeqFeature($feat);
- }
- } else {
- push(@toplevel_feats, $feat);
- }
- }
-
- return @toplevel_feats;
+sub next_dataset {
+ my $self = shift;
+ local $/ = "\n";
+ my $dataset;
+ my $len = 0;
+ GFFLINE:
+ while (my $line = $self->_readline) {
+ $len += CORE::length($line);
+ if ($line =~ /^\s*$/) {
+ next GFFLINE # blank lines
+ }
+ elsif ($line =~ /^(\#{1,2})\s*(\S+)\s*([^\n]+)?$/) { # comments and directives
+ if (length($1) == 1) {
+ chomp $line;
+ @{$dataset}{qw(MODE DATA)} = ('comment', {DATA => $line});
+ } else {
+ $self->{mode} = 'directive';
+ @{$dataset}{qw(MODE DATA)} = ('directive', $self->directive($2, $3));
+ }
+ } elsif ($line =~ /^>/) { # sequence
+ chomp $line;
+ @{$dataset}{qw(MODE DATA)} = ('sequence', {'sequence-header' => $line});
+ $self->{mode} = 'sequence';
+ } elsif ($line =~ /(?:\t[^\t]+){8}/) {
+ chomp $line;
+ $self->{mode} = $dataset->{MODE} = 'feature';
+ my %feat;
+ @feat{qw(seq_id source primary_tag start end score strand phase attributes)}
+ = split("\t",$line,9);
+ $dataset->{DATA} = \%feat;
+ } else {
+ if ($self->{mode} eq 'sequence') {
+ chomp $line;
+ @{$dataset}{qw(MODE DATA)} = ('sequence', {sequence => $line});
+ } else {
+ # anything else should be sequence, but there should be some
+ # kind of directive to change the mode or a typical FASTA header
+ # should be found, if not, die
+ $self->throw("Unknown line: $line, parser was in mode ".$self->{mode});
+ }
+ }
+ if ($dataset) {
+ @$dataset{qw(START LENGTH)} = ($self->{stream_start}, $len);
+ $self->{stream_start} += $len;
+ return $dataset;
+ }
+ return;
+ }
}
-=head2 next_seq()
+sub directive {
+ my ($self, $directive, $rest) = @_;
+ $rest ||= '';
+ my %data;
+
+ if ($directive eq 'sequence-region') {
+ @data{qw(type id start end)} = ('sequence-region', split(/\s+/, $rest));
+ } elsif ($directive eq 'genome-build') {
+ @data{qw(type source buildname)} = ($directive, split(/\s+/, $rest));
+ } elsif ($directive eq '#') {
+ $data{type} = 'resolve-references';
+ } elsif ($directive eq 'FASTA') {
+ $data{type} = 'sequence';
+ } else {
+ @data{qw(type data)} = ($directive, $rest);
+ }
+ \%data;
+}
-access the FASTA section (if any) at the end of the GFF stream. note that this method
-will return undef if not all features in the stream have been handled
+sub handler {
+ my ($self, $handler) = @_;
+ if ($handler) {
+ $self->throw("Handler must be a Bio::HandlerBaseI") unless
+ blessed($handler) && $handler->isa('Bio::HandlerBaseI');
+ $self->{handler} = $handler;
+ }
+ return $self->{handler} if $self->{handler};
+ $self->throw("Handler not set");
+}
-=cut
+sub _init_stream {
+ my $self = shift;
+ my $fh = $self->_fh;
+ my $start = tell $fh;
+ @{$self}{qw(stream_start stream_type)} =
+ ($start >= 0) ? ($start, 'seekable') : (0, 'string')
+}
+
+sub next_feature_group {
+ my $self = shift;
+ return if $self->fasta_mode;
+ $self->{sf_cache} ||= [];
+ my %seen_ids;
+ my @all_feats;
+ my @toplevel_feats;
+
+ while (my $ds = $self->next_dataset) {
+ my $object = $self->handler->data_handler($ds);
+ #print STDERR "Mode:".$self->handler->get_parameters('mode')."\n";
+ if ($object && $object->isa('Bio::SeqIO')) {
+ $self->seqio($object);
+ last;
+ }
+ last if $self->handler->resolve_references;
+ next unless $object;
+ if ($object->has_tag('ID')) {
+ my ($id) = $object->get_tag_values('ID');
+ $self->throw("Oops! ID $id exists more than once in your file!")
+ if (exists($seen_ids{$id}));
+ $seen_ids{$id} = $object;
+ push @all_feats, $object;
+ push @toplevel_feats, $object if !$object->has_tag('Parent');
+ }
+ if ($object->has_tag('Parent')) {
+ my @parents = $object->get_tag_values('Parent');
+ for my $parent_id (@parents) {
+ if (exists $seen_ids{$parent_id}) {
+ $seen_ids{$parent_id}->add_SeqFeature($object);
+ } else {
+ $self->throw("Parent with ID $parent_id not found!");
+ }
+ }
+ }
+ }
+ return @toplevel_feats;
+}
sub next_seq() {
- my $self = shift;
- return unless $self->fasta_mode();
-
- #first time next_seq has been called. initialize Bio::SeqIO instance
- if(!$self->seqio){
- $self->seqio( Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh()) );
- }
- return $self->seqio->next_seq();
+ my $self = shift;
+ return unless $self->fasta_mode;
+ #first time next_seq has been called. initialize Bio::SeqIO instance
+ if(!$self->seqio){
+ $self->seqio( Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh()) );
+ }
+ return $self->seqio->next_seq();
}
=head2 write_feature()
@@ -264,25 +213,25 @@ sub next_seq() {
=cut
-sub write_feature {
- my($self,$feature) = @_;
- if (!$feature) {
- $self->throw("gff.pm cannot write_feature unless you give a feature to write.\n");
- }
- $self->throw("only Bio::SeqFeature::Annotated objects are writeable") unless $feature->isa('Bio::SeqFeature::Annotated');
-
- if($self->version == 1){
- return $self->_write_feature_1($feature);
- } elsif($self->version == 2){
- return $self->_write_feature_2($feature);
- } elsif($self->version == 2.5){
- return $self->_write_feature_25($feature);
- } elsif($self->version == 3){
- return $self->_write_feature_3($feature);
- } else {
- $self->throw(sprintf("don't know how to write GFF version %s",$self->version));
- }
-}
+#sub write_feature {
+# my($self,$feature) = @_;
+# if (!$feature) {
+# $self->throw("gff.pm cannot write_feature unless you give a feature to write.\n");
+# }
+# $self->throw("only Bio::SeqFeature::Annotated objects are writeable") unless $feature->isa('Bio::SeqFeature::Annotated');
+#
+# if($self->version == 1){
+# return $self->_write_feature_1($feature);
+# } elsif($self->version == 2){
+# return $self->_write_feature_2($feature);
+# } elsif($self->version == 2.5){
+# return $self->_write_feature_25($feature);
+# } elsif($self->version == 3){
+# return $self->_write_feature_3($feature);
+# } else {
+# $self->throw(sprintf("don't know how to write GFF version %s",$self->version));
+# }
+#}
################################################################################
@@ -295,25 +244,13 @@ sub write_feature {
Usage : $obj->fasta_mode($newval)
Function:
Example :
- Returns : value of fasta_mode (a scalar)
- Args : on set, new value (a scalar or undef, optional)
-
-Side effect when setting: rewind the file handle a little bit to get the last
-carriage return that was swallowed when the previous line was processed.
+ Returns : Value of fasta_mode (a scalar)
+ Args : None
=cut
sub fasta_mode {
- my($self,$val) = @_;
-
- $self->{'fasta_mode'} = $val if defined($val);
-
- if ($val && $val == 1) {
- # seek $self->_fh(), -1, 1; #rewind 1 byte to get the previous line's \n
- $self->_pushback("\n");
- }
-
- return $self->{'fasta_mode'};
+ return shift->handler->fasta_mode;
}
=head2 seqio()
@@ -323,15 +260,16 @@ sub fasta_mode {
Returns : value of seqio (a scalar)
Args : on set, new value (a scalar or undef, optional)
-
=cut
sub seqio {
- my($self,$val) = @_;
- $self->{'seqio'} = $val if defined($val);
- return $self->{'seqio'};
+ my($self,$val) = @_;
+ $self->{'seqio'} = $val if defined($val);
+ return $self->{'seqio'};
}
+# TODO: reimplement to call the handler's set parameters (getter only)
+
=head2 sequence_region()
Usage :
@@ -339,22 +277,21 @@ sub seqio {
Returns :
Args :
-
=cut
-sub sequence_region {
- my ($self,$k,$v) = @_;
- if(defined($k) && defined($v)){
- $self->{'sequence_region'}{$k} = $v;
- return $v;
- }
- elsif(defined($k)){
- return $self->{'sequence_region'}{$k};
- }
- else {
- return;
- }
-}
+#sub sequence_region {
+# my ($self,$k,$v) = @_;
+# if(defined($k) && defined($v)){
+# $self->{'sequence_region'}{$k} = $v;
+# return $v;
+# }
+# elsif(defined($k)){
+# return $self->{'sequence_region'}{$k};
+# }
+# else {
+# return;
+# }
+#}
=head2 so()
@@ -367,11 +304,12 @@ sub sequence_region {
=cut
sub so {
- my $self = shift;
- my $val = shift;
- ###FIXME validate $val object's type
- $self->{so} = $val if defined($val);
- return $self->{so};
+ shift->throw_not_implemented;
+ #my $self = shift;
+ #my $val = shift;
+ ####FIXME validate $val object's type
+ #$self->{so} = $val if defined($val);
+ #return $self->{so};
}
=head2 validate()
@@ -386,9 +324,10 @@ sub so {
=cut
sub validate {
- my($self,$val) = @_;
- $self->{'validate'} = $val if defined($val);
- return $self->{'validate'};
+ shift->throw_not_implemented;
+ #my($self,$val) = @_;
+ #$self->{'validate'} = $val if defined($val);
+ #return $self->{'validate'};
}
=head2 version()
@@ -401,16 +340,17 @@ sub validate {
=cut
sub version {
- my $self = shift;
- my $val = shift;
- my %valid = map {$_=>1} (1, 2, 2.5, 3);
- if(defined $val && $valid{$val}){
- return $self->{'version'} = $val;
- }
- elsif(defined($val)){
- $self->throw('invalid version. valid versions: '.join(' ', sort keys %valid));
- }
- return $self->{'version'};
+ shift->throw_not_implemented;
+ #my $self = shift;
+ #my $val = shift;
+ #my %valid = map {$_=>1} (1, 2, 2.5, 3);
+ #if(defined $val && $valid{$val}){
+ # return $self->{'version'} = $val;
+ #}
+ #elsif(defined($val)){
+ # $self->throw('invalid version. valid versions: '.join(' ', sort keys %valid));
+ #}
+ #return $self->{'version'};
}
################################################################################
@@ -419,108 +359,6 @@ sub version {
=cut
-=head2 _buffer_feature()
-
- Usage :
- Function: ###FIXME
- Returns :
- Args :
-
-=cut
-
-sub _buffer_feature {
- my ($self,$f) = @_;
-
- if ( $f ) {
- push @{ $self->{'buffer'} }, $f;
- return $f;
- }
- elsif ( $self->{'buffer'} ) {
- return shift @{ $self->{'buffer'} };
- }
- else {
- return;
- }
-}
-
-
-=head1 _handle_directive()
-
-this method is called for lines beginning with '##'.
-
-=cut
-
-sub _handle_directive {
- my($self,$directive_string) = @_;
-
- $directive_string =~ s/^##//; #remove escape
- my($directive,@arg) = split /\s+/, $directive_string;
-
- if($directive eq 'gff-version'){
- my $version = $arg[0];
- if($version != 3){
- $self->throw("this is not a gff version 3 document, it is version '$version'");
- }
- }
-
- elsif($directive eq 'sequence-region'){
- # RAE: Sequence regions are in the format sequence-region seqid start end
- # for these we want to store the seqid, start, and end. Then when we validate
- # we want to make sure that the features are within the seqid/start/end
-
- $self->throw('Both start and end for sequence region should be defined')
- unless $arg[1] && $arg[2];
- my $fta = Bio::Annotation::OntologyTerm->new();
- $fta->name( 'region');
-
- my $f = Bio::SeqFeature::Annotated->new();
- $f->seq_id( $arg[0] );
- $f->start( $arg[1] );
- $f->end( $arg[2] );
-
- $f->type( $fta );
-
- #cache this in sequence_region(), we may need it for validation later.
- $self->sequence_region($f->seq_id => $f);
-
- #NOTE: is this the right thing to do -- treat this as a feature? -allenday
- #buffer it to be returned by next_feature()
- $self->_buffer_feature($f);
- }
-
- elsif($directive eq 'feature-ontology'){
- $self->warn("'##$directive' directive handling not yet implemented");
- }
-
- elsif($directive eq 'attribute-ontology'){
- $self->warn("'##$directive' directive handling not yet implemented");
- }
-
- elsif($directive eq 'source-ontology'){
- $self->warn("'##$directive' directive handling not yet implemented");
- }
-
- elsif($directive eq 'FASTA' or $directive =~ /^>/){
- #next_seq() will take care of this.
- $self->fasta_mode(1);
- return;
- }
-
- elsif($directive eq '#'){
- #all forward references resolved
- $self->{group_not_done} = 0;
- }
-
- elsif($directive eq 'organism') {
- my $organism = $arg[0];
- $self->organism($organism);
- }
-
- else {
- $self->throw("don't know what do do with directive: '##".$directive."'");
- }
-}
-
=head1 _handle_feature()
this method is called for each line not beginning with '#'. it parses the line and returns a
@@ -528,436 +366,549 @@ Bio::SeqFeature::Annotated object.
=cut
-sub _handle_feature {
- my($self,$feature_string) = @_;
-
- my $feat = Bio::SeqFeature::Annotated->new();
-
- my($seq,$source,$type,$start,$end,$score,$strand,$phase,$attribute_string) = split /\t/, $feature_string;
-
- $feat->seq_id($seq);
- $feat->source_tag($source);
- $feat->start($start) unless $start eq '.';
- $feat->end($end) unless $end eq '.';
- $feat->strand($strand eq '+' ? 1 : $strand eq '-' ? -1 : 0);
- $feat->score($score);
- $feat->phase($phase);
+#sub _handle_feature {
+# my($self,$feature_string) = @_;
+#
+# my $feat = Bio::SeqFeature::Annotated->new();
+#
+# my($seq,$source,$type,$start,$end,$score,$strand,$phase,$attribute_string) = split /\t/, $feature_string;
+#
+# $feat->seq_id($seq);
+# $feat->source_tag($source);
+# $feat->start($start) unless $start eq '.';
+# $feat->end($end) unless $end eq '.';
+# $feat->strand($strand eq '+' ? 1 : $strand eq '-' ? -1 : 0);
+# $feat->score($score);
+# $feat->phase($phase);
+#
+# my $fta = Bio::Annotation::OntologyTerm->new();
+#
+# if($self->validate()){
+# # RAE Added a couple of validations based on the GFF3 spec at
+# # http://song.sourceforge.net/gff3.shtml
+# # 1. Validate the id
+# if ($seq =~ /[^a-zA-Z0-9\.\-\:\^\*\$\@\!\+\_\?]/) { # I just escaped everything.
+# $self->throw("Validation Error: seqid ($seq) contains characters that are not [a-zA-Z0-9.:^*\$\@!+_?\-] and not escaped");
+# }
+#
+# if ($seq =~ /\s/) {
+# $self->throw("Validation Error: seqid ($seq) contains unescaped whitespace")
+# }
+#
+# # NOTE i think we're handling this in as a directive, and this test may be removed -allenday
+# if ($seq =~ /^>/) {
+# $self->throw("Validation Error: seqid ($seq) begins with a >")
+# }
+#
+# # 2. Validate the starts and stops.
+# # these need to be within the region's bounds, and
+# # also start <= end. bail out if either is not true.
+# if ($start > $end) {
+# $self->throw("Validation Error: start ($start) must be less than or equal to end in $seq");
+# }
+#
+# my $region = $self->sequence_region($seq);
+# # NOTE: we can only validate against sequence-region that are declared in the file.
+# # if i reference some region from elsewhere, can't validate. if we want to be really strict
+# # we should bail out here. -allenday
+# if ( defined($region) && $start < $region->start() || $end > $region->end() ) {
+# $self->throw("Validation Error: sequence location ($seq from $start to $end) does not appear to lie within a defined sequence-region")
+# }
+#
+# # 3. Validate the strand.
+# # In the unvalidated version +=1 and -=-1. Everything else is 0. We just need to warn when it is not [+-.?]
+# $self->throw("Validation Error: strand is not one of [+-.?] at $seq") if ($strand =~ /^[^\+\-\.\?]$/);
+#
+# # 4. Validate the phase to be one of [.012]
+# $self->throw("Validation Error: phase is not one of [.012] at $seq") if ($phase =~ /^[^\.012]$/);
+#
+# my $feature_type;
+# if($type =~ /^\D+:\d+$/){
+# #looks like an identifier
+# ($feature_type) = $self->so->find_terms(-identifier => $type);
+# } else {
+# #looks like a name
+# ($feature_type) = $self->so->find_terms(-name => $type);
+# }
+#
+# if(!$feature_type){
+# $self->throw("Validation Error: couldn't find ontology term for '$type'.");
+# }
+# $fta->term($feature_type);
+# } else {
+#
+# if($type =~ /^\D+:\d+$/){
+# #looks like an identifier
+# $fta->identifier($type)
+# } else {
+# $fta->name($type);
+# }
+# }
+#
+# $feat->type($fta);
+#
+# my %attr = ();
+# chomp $attribute_string;
+#
+# unless ( $attribute_string eq '.' ) {
+# my @attributes = split ';', $attribute_string;
+# foreach my $attribute (@attributes){
+# my($key,$values) = split '=', $attribute;
+#
+# # remove leading and trailing quotes from values
+# $values =~ s/^["']//;
+# $values =~ s/["']$//; #' terminate the quote for emacs
+#
+# my @values = map{uri_unescape($_)} split ',', $values;
+#
+# #minor hack to allow for multiple instances of the same tag
+# if ($attr{$key}) {
+# my @tmparray = @{$attr{$key}};
+# push @tmparray, @values;
+# $attr{$key} = [@tmparray];
+# } else {
+# $attr{$key} = [@values];
+# }
+# }
+# }
+#
+# #Handle Dbxref attributes
+# if($attr{Dbxref} or $attr{dbxref}){
+# foreach my $value (@{ $attr{Dbxref} }, @{ $attr{dbxref} }){
+# my $a = Bio::Annotation::DBLink->new();
+# my($db,$accession) = $value =~ /^(.+?):(.+)$/;
+#
+# if(!$db or !$accession){ #dbxref malformed
+# $self->throw("Error in line:\n$feature_string\nDbxref value '$value' did not conform to GFF3 specification");
+# next;
+# }
+#
+# $a->database($db);
+# $a->primary_id($accession);
+# $feat->add_Annotation('Dbxref',$a);
+# }
+# }
+#
+# #Handle Ontology_term attributes
+# if($attr{Ontology_term}){
+# foreach my $id (@{ $attr{Ontology_term} }){
+# my $a = Bio::Annotation::OntologyTerm->new();
+#
+# if($self->validate()){
+# my $ont_name = Bio::Ontology::OntologyStore->guess_ontology($id);
+# my $ont = Bio::Ontology::OntologyStore->get_ontology($ont_name);
+# my($term) = $ont->find_terms(-identifier => $id);
+# $a->term($term);
+# } else {
+# $a->identifier($id);
+# }
+#
+# $feat->add_Annotation('Ontology_term',$a);
+# }
+# }
+#
+# #Handle Gap attributes
+# if($attr{Gap}){
+# for my $value (@{ $attr{Gap} }) {
+# my $a = Bio::Annotation::SimpleValue->new();
+# $a->value($value);
+# $feat->add_Annotation('Gap',$a);
+# }
+# }
+#
+# #Handle Target attributes
+# if($attr{Target}){
+# my $target_collection = Bio::Annotation::Collection->new();
+#
+# foreach my $target_string (@{ $attr{Target} } ) {
+#
+# #only replace + for space if + has been used in place of it
+# #that is, + could also mean plus strand, and we don't want
+# #to accidentally remove it
+#
+# #presumably you can't use + for space and + for strand in the same string.
+# $target_string =~ s/\+/ /g unless $target_string =~ / /;
+#
+# my ($t_id,$tstart,$tend,$strand,$extra) = split /\s+/, $target_string;
+# if (!$tend || $extra) { # too much or too little stuff in the string
+# $self->throw("The value in the Target string, $target_string, does not conform to the GFF3 specification");
+# }
+#
+# my $a = Bio::Annotation::Target->new(
+# -target_id => $t_id,
+# -start => $tstart,
+# -end => $tend,
+# );
+#
+# if ($strand && $strand eq '+') {
+# $strand = 1;
+# } elsif ($strand && $strand eq '-') {
+# $strand = -1;
+# } else {
+# $strand = '';
+# }
+#
+# $a->strand($strand) if $strand;
+# $feat->add_Annotation('Target',$a);
+# }
+# }
+#
+# #Handle ID attribute. May only have one ID, throw error otherwise
+#
+# if($attr{ID}){
+# if(scalar( @{ $attr{ID} } ) > 1){
+# $self->throw("Error in line:\n$feature_string\nA feature may have at most one ID value");
+# }
+#
+# #ID's must be unique in the file
+# if ($self->{'allIDs'}->{${$attr{ID}}[0]} && $self->validate()) {
+# $self->throw("Validation Error: The ID ${$attr{ID}}[0] occurs more than once in the file, but should be unique");
+# }
+# $self->{'allIDs'}->{${$attr{ID}}[0]} = 1;
+#
+#
+# my $a = Bio::Annotation::SimpleValue->new();
+# $a->value( @{ $attr{ID} }[0] );
+# $feat->add_Annotation('ID',$a);
+# }
+#
+# #Handle Name attribute. May only have one Name, throw error otherwise
+# if($attr{Name}){
+# if(scalar( @{ $attr{Name} } ) > 1){
+# $self->throw("Error in line:\n$feature_string\nA feature may have at most one Name value");
+# }
+#
+# my $a = Bio::Annotation::SimpleValue->new();
+# $a->value( @{ $attr{Name} }[0] );
+# $feat->add_Annotation('Name',$a);
+# }
+#
+# foreach my $other_canonical (qw(Alias Parent Note Derives_from Index CRUD)){
+# if($attr{$other_canonical}){
+# foreach my $value (@{ $attr{$other_canonical} }){
+# my $a = Bio::Annotation::SimpleValue->new();
+# $a->value($value);
+# $feat->add_Annotation($other_canonical,$a);
+# }
+# }
+# }
+#
+# my @non_reserved_tags = grep {/^[a-z]/} keys %attr;
+# foreach my $non_reserved_tag (@non_reserved_tags) {
+# next if ($non_reserved_tag eq 'dbxref');
+# foreach my $value (@{ $attr{$non_reserved_tag} }){
+# $feat = $self->_handle_non_reserved_tag($feat,$non_reserved_tag,$value);
+# }
+# }
+#
+# my @illegal_tags = grep
+# {!/($RESERVED_TAGS)/}
+# grep {/^[A-Z]/} keys %attr;
+#
+# if (@illegal_tags > 0) {
+# my $tags = join(", ", @illegal_tags);
+# $self->throw("The following tag(s) are illegal and are causing this parser to die: $tags");
+# }
+#
+# return $feat;
+#}
+
+#=head2 _handle_non_reserved_tag()
+#
+# Usage : $self->_handle_non_reserved_tag($feature,$tag,$value)
+# Function: Deal with non-reserved word tags in the ninth column
+# Returns : An updated Bio::SeqFeature::Annotated object
+# Args : A Bio::SeqFeature::Annotated and a tag/value pair
+#
+#Note that this method can be overridden in a subclass to provide
+#special handling of non-reserved word tags.
+#
+#=cut
+
+#sub _handle_non_reserved_tag {
+# my $self = shift;
+# my ($feat,$tag,$value) = @_;
+#
+# # to customize through subclassing and overriding:
+# #if ($tag eq 'someTagOfInterest') {
+# # do something different
+# # else { do what is below
+#
+# my $a;
+# if ($tag eq 'comment') {
+# $a = Bio::Annotation::Comment->new();
+# }
+# else {
+# $a = Bio::Annotation::SimpleValue->new();
+# }
+# $a->value($value);
+# $feat->add_Annotation($tag,$a);
+#
+# return $feat;
+#}
+
+#=head1 organims
+#
+#Gets/sets the organims from the organism directive
+#
+#=cut
+
+#sub organism {
+# my $self = shift;
+# my $organism = shift if defined(@_);
+# return $self->{'organism'} = $organism if defined($organism);
+# return $self->{'organism'};
+#}
- my $fta = Bio::Annotation::OntologyTerm->new();
- if($self->validate()){
- # RAE Added a couple of validations based on the GFF3 spec at http://song.sourceforge.net/gff3.shtml
- # 1. Validate the id
- if ($seq =~ /[^a-zA-Z0-9\.\-\:\^\*\$\@\!\+\_\?]/) { # I just escaped everything.
- $self->throw("Validation Error: seqid ($seq) contains characters that are not [a-zA-Z0-9.:^*\$\@!+_?\-] and not escaped");
- }
+=head1 _write_feature_1()
- if ($seq =~ /\s/) {
- $self->throw("Validation Error: seqid ($seq) contains unescaped whitespace")
- }
+write a feature in GFF v1 format. currently not implemented.
- # NOTE i think we're handling this in as a directive, and this test may be removed -allenday
- if ($seq =~ /^>/) {
- $self->throw("Validation Error: seqid ($seq) begins with a >")
- }
+=cut
- # 2. Validate the starts and stops.
- # these need to be within the region's bounds, and
- # also start <= end. bail out if either is not true.
- if ($start > $end) {
- $self->throw("Validation Error: start ($start) must be less than or equal to end in $seq");
- }
+#sub _write_feature_1 {
+# my($self,$feature) = @_;
+# $self->throw(sprintf("write_feature unimplemented for GFF version %s",$self->version));
+#}
- my $region = $self->sequence_region($seq);
- # NOTE: we can only validate against sequence-region that are declared in the file.
- # if i reference some region from elsewhere, can't validate. if we want to be really strict
- # we should bail out here. -allenday
- if ( defined($region) && $start < $region->start() || $end > $region->end() ) {
- $self->throw("Validation Error: sequence location ($seq from $start to $end) does not appear to lie within a defined sequence-region")
- }
+=head1 _write_feature_2()
- # 3. Validate the strand.
- # In the unvalidated version +=1 and -=-1. Everything else is 0. We just need to warn when it is not [+-.?]
- $self->throw("Validation Error: strand is not one of [+-.?] at $seq") if ($strand =~ /^[^\+\-\.\?]$/);
+write a feature in GFF v2 format. currently not implemented.
- # 4. Validate the phase to be one of [.012]
- $self->throw("Validation Error: phase is not one of [.012] at $seq") if ($phase =~ /^[^\.012]$/);
+=cut
- my $feature_type;
- if($type =~ /^\D+:\d+$/){
- #looks like an identifier
- ($feature_type) = $self->so->find_terms(-identifier => $type);
- } else {
- #looks like a name
- ($feature_type) = $self->so->find_terms(-name => $type);
- }
+#sub _write_feature_2 {
+# my($self,$feature) = @_;
+# $self->throw(sprintf("write_feature unimplemented for GFF version %s",$self->version));
+#}
- if(!$feature_type){
- $self->throw("Validation Error: couldn't find ontology term for '$type'.");
- }
- $fta->term($feature_type);
- } else {
+=head1 _write_feature_25()
- if($type =~ /^\D+:\d+$/){
- #looks like an identifier
- $fta->identifier($type)
- } else {
- $fta->name($type);
- }
- }
+write a feature in GFF v2.5 (aka GTF) format.
- $feat->type($fta);
+=cut
- my %attr = ();
- chomp $attribute_string;
+#sub _write_feature_25 {
+# my($self,$feature,$group) = @_;
+#
+# #the top-level feature is an aggregate of all subfeatures
+# my ($transcript_id, $gene_id) = (($feature->get_Annotations('transcript_id'))[0], ($feature->get_Annotations('gene_id'))[0]);
+# if(!defined($group)){
+# $group = ($feature->get_Annotations('ID'))[0];
+# $transcript_id ||= $group;
+# $gene_id ||= $group;
+# }
+#
+#
+# my $seq = ref($feature->seq_id) ? $feature->seq_id->value : $feature->seq_id;
+# my $source = $feature->source->value;
+# my $type = $feature->type->name;
+# $type = 'EXON' if $type eq 'exon'; #a GTF peculiarity, incosistent with the sequence ontology.
+# my $min = $feature->start || '.';
+# my $max = $feature->end || '.';
+# my $strand = $feature->strand == 1 ? '+' : $feature->strand == -1 ? '-' : '.';
+# my $score = defined($feature->score) ? (ref($feature->score) ? $feature->score->value : $feature->score) : '.'; # score is optional
+# my $frame = defined($feature->frame) ? (ref($feature->frame) ? $feature->frame->value : $feature->frame) : (ref($feature->phase) ? $feature->phase->value : $feature->phase);
+#
+# #these are the only valid types in a GTF document
+# if($type eq 'EXON' or $type eq 'CDS' or $type eq 'start_codon' or $type eq 'stop_codon'){
+# my $attr = sprintf('gene_id "%s"; transcript_id "%s";',$gene_id ? $gene_id->value : '',$transcript_id ? $transcript_id->value : '');
+# my $outstring = sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
+# $seq,$source,$type,$min,$max,$score,$strand,$frame eq '.' ? 0 : $frame,$attr);
+#
+# $self->_print($outstring);
+# }
+#
+# foreach my $subfeat ($feature->get_SeqFeatures){
+# $self->_write_feature_25($subfeat,$group);
+# }
+#}
- unless ( $attribute_string eq '.' ) {
- my @attributes = split ';', $attribute_string;
- foreach my $attribute (@attributes){
- my($key,$values) = split '=', $attribute;
+=head1 _write_feature_3()
- # remove leading and trailing quotes from values
- $values =~ s/^["']//;
- $values =~ s/["']$//; #' terminate the quote for emacs
+write a feature in GFF v3 format.
- my @values = map{uri_unescape($_)} split ',', $values;
+=cut
- #minor hack to allow for multiple instances of the same tag
- if ($attr{$key}) {
- my @tmparray = @{$attr{$key}};
- push @tmparray, @values;
- $attr{$key} = [@tmparray];
- } else {
- $attr{$key} = [@values];
- }
- }
- }
-
- #Handle Dbxref attributes
- if($attr{Dbxref} or $attr{dbxref}){
- foreach my $value (@{ $attr{Dbxref} }, @{ $attr{dbxref} }){
- my $a = Bio::Annotation::DBLink->new();
- my($db,$accession) = $value =~ /^(.+?):(.+)$/;
-
- if(!$db or !$accession){ #dbxref malformed
- $self->throw("Error in line:\n$feature_string\nDbxref value '$value' did not conform to GFF3 specification");
- next;
- }
-
- $a->database($db);
- $a->primary_id($accession);
- $feat->add_Annotation('Dbxref',$a);
- }
- }
-
- #Handle Ontology_term attributes
- if($attr{Ontology_term}){
- foreach my $id (@{ $attr{Ontology_term} }){
- my $a = Bio::Annotation::OntologyTerm->new();
-
- if($self->validate()){
- my $ont_name = Bio::Ontology::OntologyStore->guess_ontology($id);
- my $ont = Bio::Ontology::OntologyStore->get_ontology($ont_name);
- my($term) = $ont->find_terms(-identifier => $id);
- $a->term($term);
- } else {
- $a->identifier($id);
- }
-
- $feat->add_Annotation('Ontology_term',$a);
- }
- }
-
- #Handle Gap attributes
- if($attr{Gap}){
- for my $value (@{ $attr{Gap} }) {
- my $a = Bio::Annotation::SimpleValue->new();
- $a->value($value);
- $feat->add_Annotation('Gap',$a);
- }
- }
-
- #Handle Target attributes
- if($attr{Target}){
- my $target_collection = Bio::Annotation::Collection->new();
-
- foreach my $target_string (@{ $attr{Target} } ) {
-
- #only replace + for space if + has been used in place of it
- #that is, + could also mean plus strand, and we don't want
- #to accidentally remove it
-
- #presumably you can't use + for space and + for strand in the same string.
- $target_string =~ s/\+/ /g unless $target_string =~ / /;
-
- my ($t_id,$tstart,$tend,$strand,$extra) = split /\s+/, $target_string;
- if (!$tend || $extra) { # too much or too little stuff in the string
- $self->throw("The value in the Target string, $target_string, does not conform to the GFF3 specification");
- }
-
- my $a = Bio::Annotation::Target->new(
- -target_id => $t_id,
- -start => $tstart,
- -end => $tend,
- );
-
- if ($strand && $strand eq '+') {
- $strand = 1;
- } elsif ($strand && $strand eq '-') {
- $strand = -1;
- } else {
- $strand = '';
- }
-
- $a->strand($strand) if $strand;
- $feat->add_Annotation('Target',$a);
- }
- }
+#sub _write_feature_3 {
+# my($self,$feature) = @_;
+# my $seq = ref($feature->seq_id) ? $feature->seq_id->value : $feature->seq_id;
+# my $source;
+# if ($feature->source()) {
+# $source = $feature->source->value;
+# }
+# else {
+# $source = $feature->source() || "unknownsource";
+# }
+# my $type;
+# if ($feature->type()) { $type = $feature->type->name; }
+# else { $type = "region"; }
+# my $min = $feature->start || '.';
+# my $max = $feature->end || '.';
+# my $strand = $feature->strand == 1 ? '+' : $feature->strand == -1 ? '-' : '.';
+# my $score = defined($feature->score) ? (ref($feature->score) ? $feature->score->value : $feature->score) : undef;
+# my $phase = defined($feature->phase) ? (ref($feature->phase) ? $feature->phase->value : $feature->phase) : undef;
+#
+# my @attr;
+# if(my @v = ($feature->get_Annotations('Name'))){
+# my $vstring = join ',', map {uri_escape($_->value)} @v;
+# push @attr, "Name=$vstring";
+# }
+# if(my @v = ($feature->get_Annotations('ID'))){
+# my $vstring = join ',', map {uri_escape($_->value)} @v;
+# push @attr, "ID=$vstring";
+# $self->throw('GFF3 features may have at most one ID, feature with these IDs is invalid:\n'.$vstring) if scalar(@v) > 1;
+# }
+# if(my @v = ($feature->get_Annotations('Parent'))){
+# my $vstring = join ',', map {uri_escape($_->value)} @v;
+# push @attr, "Parent=$vstring";
+# }
+# if(my @v = ($feature->get_Annotations('dblink'))){
+# my $vstring = join ',', map {uri_escape($_->database .':'. $_->primary_id)} @v;
+# push @attr, "Dbxref=$vstring";
+# }
+# if(my @v = ($feature->get_Annotations('ontology_term'))){
+# my $vstring = join ',', map {uri_escape($_->identifier)} @v;
+# push @attr, "Ontology_term=$vstring";
+# }
+# if(my @v = ($feature->get_Annotations('comment'))){
+# my $vstring = join ',', map {uri_escape($_->text)} @v;
+# push @attr, "Note=$vstring";
+# }
+# if(my @v = ($feature->get_Annotations('Target'))){
+# my %strand_map = ( 1=>'+', 0=>'', -1=>'-', '+' => '+', '-' => '-' );
+# my $vstring = join ',', map {
+# uri_escape($_->target_id).' '.$_->start.' '.$_->end.(defined $_->strand ? ' '.$strand_map{$_->strand} : '')
+# } @v;
+# push @attr, "Target=$vstring";
+# }
+#
+# my $attr = join ';', @attr;
+#
+# my $outstring = sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
+# $seq,$source,$type,$min,$max,$score,$strand,$phase,$attr);
+#
+# $self->_print($outstring);
+#
+# foreach my $subfeat ($feature->get_SeqFeatures){
+# $self->_write_feature_3($subfeat);
+# }
+#}
- #Handle ID attribute. May only have one ID, throw error otherwise
+1;
- if($attr{ID}){
- if(scalar( @{ $attr{ID} } ) > 1){
- $self->throw("Error in line:\n$feature_string\nA feature may have at most one ID value");
- }
+__END__
- #ID's must be unique in the file
- if ($self->{'allIDs'}->{${$attr{ID}}[0]} && $self->validate()) {
- $self->throw("Validation Error: The ID ${$attr{ID}}[0] occurs more than once in the file, but should be unique");
- }
- $self->{'allIDs'}->{${$attr{ID}}[0]} = 1;
+=pod
+=head1 NAME
- my $a = Bio::Annotation::SimpleValue->new();
- $a->value( @{ $attr{ID} }[0] );
- $feat->add_Annotation('ID',$a);
- }
+Bio::FeatureIO::newgff - read/write GFF feature files
- #Handle Name attribute. May only have one Name, throw error otherwise
- if($attr{Name}){
- if(scalar( @{ $attr{Name} } ) > 1){
- $self->throw("Error in line:\n$feature_string\nA feature may have at most one Name value");
- }
+=head1 SYNOPSIS
- my $a = Bio::Annotation::SimpleValue->new();
- $a->value( @{ $attr{Name} }[0] );
- $feat->add_Annotation('Name',$a);
- }
-
- foreach my $other_canonical (qw(Alias Parent Note Derives_from Index CRUD)){
- if($attr{$other_canonical}){
- foreach my $value (@{ $attr{$other_canonical} }){
- my $a = Bio::Annotation::SimpleValue->new();
- $a->value($value);
- $feat->add_Annotation($other_canonical,$a);
- }
- }
- }
+ my $feature; #get a Bio::SeqFeatureI somehow
+ my $featureOut = Bio::FeatureIO->new(
+ -format => 'gff',
+ -version => 3,
+ -fh => \*STDOUT,
+ -validate_terms => 1, #boolean. validate ontology terms online? default 0 (false).
+ );
+ $featureOut->write_feature($feature);
- my @non_reserved_tags = grep {/^[a-z]/} keys %attr;
- foreach my $non_reserved_tag (@non_reserved_tags) {
- next if ($non_reserved_tag eq 'dbxref');
- foreach my $value (@{ $attr{$non_reserved_tag} }){
- $feat = $self->_handle_non_reserved_tag($feat,$non_reserved_tag,$value);
- }
- }
+=head1 DESCRIPTION
- my @illegal_tags = grep
- {!/($RESERVED_TAGS)/}
- grep {/^[A-Z]/} keys %attr;
+ Currently implemented:
- if (@illegal_tags > 0) {
- my $tags = join(", ", @illegal_tags);
- $self->throw("The following tag(s) are illegal and are causing this parser to die: $tags");
- }
+ version read? write?
+ ------------------------------
+ GFF 1 N N
+ GFF 2 N N
+ GFF 2.5 (GTF) N Y
+ GFF 3 Y Y
- return $feat;
-}
+=head1 FEEDBACK
-=head2 _handle_non_reserved_tag()
+=head2 Mailing Lists
- Usage : $self->_handle_non_reserved_tag($feature,$tag,$value)
- Function: Deal with non-reserved word tags in the ninth column
- Returns : An updated Bio::SeqFeature::Annotated object
- Args : A Bio::SeqFeature::Annotated and a tag/value pair
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list. Your participation is much appreciated.
-Note that this method can be overridden in a subclass to provide
-special handling of non-reserved word tags.
+ bioperl-l@bioperl.org - General discussion
+ http://bioperl.org/wiki/Mailing_list - About the mailing lists
-=cut
+=head2 Support
-sub _handle_non_reserved_tag {
- my $self = shift;
- my ($feat,$tag,$value) = @_;
-
- # to customize through subclassing and overriding:
- #if ($tag eq 'someTagOfInterest') {
- # do something different
- # else { do what is below
-
- my $a;
- if ($tag eq 'comment') {
- $a = Bio::Annotation::Comment->new();
- }
- else {
- $a = Bio::Annotation::SimpleValue->new();
- }
- $a->value($value);
- $feat->add_Annotation($tag,$a);
-
- return $feat;
-}
+Please direct usage questions or support issues to the mailing list:
-=head1 organims
+I<bioperl-l@bioperl.org>
-Gets/sets the organims from the organism directive
+rather than to the module maintainer directly. Many experienced and
+reponsive experts will be able look at the problem and quickly
+address it. Please include a thorough description of the problem
+with code and data examples if at all possible.
-=cut
+=head2 Reporting Bugs
-sub organism {
- my $self = shift;
- my $organism = shift if defined(@_);
- return $self->{'organism'} = $organism if defined($organism);
- return $self->{'organism'};
-}
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+the web:
+ http://bugzilla.open-bio.org/
-=head1 _write_feature_1()
+=head1 AUTHOR
-write a feature in GFF v1 format. currently not implemented.
+ Allen Day, <allenday@ucla.edu>
-=cut
+=head1 CONTRIBUTORS
-sub _write_feature_1 {
- my($self,$feature) = @_;
- $self->throw(sprintf("write_feature unimplemented for GFF version %s",$self->version));
-}
+ Steffen Grossmann, <grossman@molgen.mpg.de>
+ Scott Cain, <cain@cshl.edu>
+ Rob Edwards <rob@salmonella.org>
-=head1 _write_feature_2()
+=head1 APPENDIX
-write a feature in GFF v2 format. currently not implemented.
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
=cut
-sub _write_feature_2 {
- my($self,$feature) = @_;
- $self->throw(sprintf("write_feature unimplemented for GFF version %s",$self->version));
-}
-
-=head1 _write_feature_25()
+=head2 next_feature()
-write a feature in GFF v2.5 (aka GTF) format.
+ Usage : my $feature = $featureio->next_feature();
+ Function: reads a feature record from a GFF stream and returns it as an object.
+ Returns : a Bio::SeqFeature::Annotated object
+ Args : N/A
=cut
-sub _write_feature_25 {
- my($self,$feature,$group) = @_;
-
- #the top-level feature is an aggregate of all subfeatures
- my ($transcript_id, $gene_id) = (($feature->get_Annotations('transcript_id'))[0], ($feature->get_Annotations('gene_id'))[0]);
- if(!defined($group)){
- $group = ($feature->get_Annotations('ID'))[0];
- $transcript_id ||= $group;
- $gene_id ||= $group;
- }
-
+=head2 next_feature_group
- my $seq = ref($feature->seq_id) ? $feature->seq_id->value : $feature->seq_id;
- my $source = $feature->source->value;
- my $type = $feature->type->name;
- $type = 'EXON' if $type eq 'exon'; #a GTF peculiarity, incosistent with the sequence ontology.
- my $min = $feature->start || '.';
- my $max = $feature->end || '.';
- my $strand = $feature->strand == 1 ? '+' : $feature->strand == -1 ? '-' : '.';
- my $score = defined($feature->score) ? (ref($feature->score) ? $feature->score->value : $feature->score) : '.'; # score is optional
- my $frame = defined($feature->frame) ? (ref($feature->frame) ? $feature->frame->value : $feature->frame) : (ref($feature->phase) ? $feature->phase->value : $feature->phase);
-
- #these are the only valid types in a GTF document
- if($type eq 'EXON' or $type eq 'CDS' or $type eq 'start_codon' or $type eq 'stop_codon'){
- my $attr = sprintf('gene_id "%s"; transcript_id "%s";',$gene_id ? $gene_id->value : '',$transcript_id ? $transcript_id->value : '');
- my $outstring = sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
- $seq,$source,$type,$min,$max,$score,$strand,$frame eq '.' ? 0 : $frame,$attr);
-
- $self->_print($outstring);
- }
-
- foreach my $subfeat ($feature->get_SeqFeatures){
- $self->_write_feature_25($subfeat,$group);
- }
-}
+ Title : next_feature_group
+ Usage : @feature_group = $stream->next_feature_group
+ Function: Reads the next feature_group from $stream and returns it.
-=head1 _write_feature_3()
+ Feature groups in GFF3 files are separated by '###' directives. The
+ features in a group might form a hierarchical structure. The
+ complete hierarchy of features is returned, i.e. the returned array
+ represents only the top-level features. Lower-level features can
+ be accessed using the 'get_SeqFeatures' method recursively.
-write a feature in GFF v3 format.
+ Example : # getting the complete hierarchy of features in a GFF3 file
+ my @toplevel_features;
+ while (my @fg = $stream->next_feature_group) {
+ push(@toplevel_features, @fg);
+ }
+ Returns : an array of Bio::SeqFeature::Annotated objects
+ Args : none
=cut
-sub _write_feature_3 {
- my($self,$feature) = @_;
- my $seq = ref($feature->seq_id) ? $feature->seq_id->value : $feature->seq_id;
- my $source;
- if ($feature->source()) {
- $source = $feature->source->value;
- }
- else {
- $source = $feature->source() || "unknownsource";
- }
- my $type;
- if ($feature->type()) { $type = $feature->type->name; }
- else { $type = "region"; }
- my $min = $feature->start || '.';
- my $max = $feature->end || '.';
- my $strand = $feature->strand == 1 ? '+' : $feature->strand == -1 ? '-' : '.';
- my $score = defined($feature->score) ? (ref($feature->score) ? $feature->score->value : $feature->score) : undef;
- my $phase = defined($feature->phase) ? (ref($feature->phase) ? $feature->phase->value : $feature->phase) : undef;
-
- my @attr;
- if(my @v = ($feature->get_Annotations('Name'))){
- my $vstring = join ',', map {uri_escape($_->value)} @v;
- push @attr, "Name=$vstring";
- }
- if(my @v = ($feature->get_Annotations('ID'))){
- my $vstring = join ',', map {uri_escape($_->value)} @v;
- push @attr, "ID=$vstring";
- $self->throw('GFF3 features may have at most one ID, feature with these IDs is invalid:\n'.$vstring) if scalar(@v) > 1;
- }
- if(my @v = ($feature->get_Annotations('Parent'))){
- my $vstring = join ',', map {uri_escape($_->value)} @v;
- push @attr, "Parent=$vstring";
- }
- if(my @v = ($feature->get_Annotations('dblink'))){
- my $vstring = join ',', map {uri_escape($_->database .':'. $_->primary_id)} @v;
- push @attr, "Dbxref=$vstring";
- }
- if(my @v = ($feature->get_Annotations('ontology_term'))){
- my $vstring = join ',', map {uri_escape($_->identifier)} @v;
- push @attr, "Ontology_term=$vstring";
- }
- if(my @v = ($feature->get_Annotations('comment'))){
- my $vstring = join ',', map {uri_escape($_->text)} @v;
- push @attr, "Note=$vstring";
- }
- if(my @v = ($feature->get_Annotations('Target'))){
- my %strand_map = ( 1=>'+', 0=>'', -1=>'-', '+' => '+', '-' => '-' );
- my $vstring = join ',', map {
- uri_escape($_->target_id).' '.$_->start.' '.$_->end.(defined $_->strand ? ' '.$strand_map{$_->strand} : '')
- } @v;
- push @attr, "Target=$vstring";
- }
-
- my $attr = join ';', @attr;
-
- my $outstring = sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
- $seq,$source,$type,$min,$max,$score,$strand,$phase,$attr);
-
- $self->_print($outstring);
-
- foreach my $subfeat ($feature->get_SeqFeatures){
- $self->_write_feature_3($subfeat);
- }
-}
-
-
+=head2 next_seq()
+access the FASTA section (if any) at the end of the GFF stream. note that this
+method will return undef if not all features in the stream have been handled
-1;
+=cut
1  lib/Bio/FeatureIO/interpro.pm
View
@@ -63,7 +63,6 @@ package Bio::FeatureIO::interpro;
use strict;
use base qw(Bio::FeatureIO);
use Bio::SeqFeature::Annotated;
-use Bio::OntologyIO;
use Bio::Annotation::Comment;
use Bio::Annotation::DBLink;
1,204 lib/Bio/SeqFeature/Annotated.pm
View
@@ -0,0 +1,1204 @@
+# $Id$
+#
+# BioPerl module for Bio::SeqFeature::Annotated
+#
+# Please direct questions and support issues to <bioperl-l@bioperl.org>
+#
+# Cared for by Allen Day <allenday at ucla.edu>
+#
+# Copyright Allen Day
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SeqFeature::Annotated - PLEASE PUT SOMETHING HERE
+
+=head1 SYNOPSIS
+
+ # none yet, complain to authors
+
+=head1 DESCRIPTION
+
+None yet, complain to authors.
+
+=head1 Implemented Interfaces
+
+This class implements the following interfaces.
+
+=over 4
+
+=item Bio::SeqFeatureI
+
+Note that this includes implementing Bio::RangeI.
+
+=item Bio::AnnotatableI
+
+=item Bio::FeatureHolderI
+
+Features held by a feature are essentially sub-features.
+
+=back
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists. Your participation is much appreciated.
+
+ bioperl-l@bioperl.org - General discussion
+ http://bioperl.org/wiki/Mailing_lists - About the mailing lists
+
+=head2 Support
+
+Please direct usage questions or support issues to the mailing list:
+
+I<bioperl-l@bioperl.org>
+
+rather than to the module maintainer directly. Many experienced and
+reponsive experts will be able look at the problem and quickly
+address it. Please include a thorough description of the problem
+with code and data examples if at all possible.
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution. Bug reports can be submitted via
+the web:
+
+ http://bugzilla.open-bio.org/
+
+=head1 AUTHOR - Allen Day
+
+Allen Day E<lt>allenday at ucla.eduE<gt>
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+package Bio::SeqFeature::Annotated;
+
+use strict;
+
+use Bio::Annotation::Collection;
+use Bio::Annotation::OntologyTerm;
+use Bio::Annotation::Target;
+use Bio::LocatableSeq;
+use Bio::Location::Simple;
+use Bio::Ontology::OntologyStore;
+use Bio::Tools::GFF;
+use Bio::SeqFeature::AnnotationAdaptor;
+use Data::Dumper;
+use URI::Escape;
+
+use base qw(Bio::Root::Root
+ Bio::SeqFeature::TypedSeqFeatureI
+ Bio::AnnotatableI
+ Bio::FeatureHolderI);
+
+our %tagclass = (
+ comment => 'Bio::Annotation::Comment',
+ dblink => 'Bio::Annotation::DBLink',
+ description => 'Bio::Annotation::SimpleValue',
+ gene_name => 'Bio::Annotation::SimpleValue',
+ ontology_term => 'Bio::Annotation::OntologyTerm',
+ reference => 'Bio::Annotation::Reference',
+ __DEFAULT__ => 'Bio::Annotation::SimpleValue',
+);
+
+our %tag2text = (
+ 'Bio::Annotation::Comment' => 'text',
+ 'Bio::Annotation::DBLink' => 'primary_id',
+ 'Bio::Annotation::SimpleValue' => 'value',
+ 'Bio::Annotation::SimpleValue' => 'value',
+ 'Bio::Annotation::OntologyTerm' => 'name',
+ 'Bio::Annotation::Reference' => 'title',
+ __DEFAULT__ => 'value',
+);
+
+######################################
+#get_SeqFeatures
+#display_name
+#primary_tag
+#source_tag x with warning
+#has_tag
+#get_tag_values
+#get_tagset_values
+#get_all_tags
+#attach_seq
+#seq x
+#entire_seq x
+#seq_id
+#gff_string
+#_static_gff_handler
+#start x
+#end x
+#strand x
+#location
+#primary_id
+
+=head1 PREAMBLE
+
+Okay, where to start...
+
+The original idea for this class appears to lump all SeqFeatureI data
+(primary_tag, source_tag, etc) into AnnotationI objects into an
+Bio::Annotation::Collection. The type is then checked against SOFA.
+
+There have been several requests to have type checking be optionally run.
+
+Bio::FeatureHolderI::create_hierarchy_from_ParentIDs
+Bio::FeatureHolderI::feature_count
+Bio::FeatureHolderI::get_all_SeqFeatures
+Bio::FeatureHolderI::set_ParentIDs_from_hierarchy
+Bio::RangeI::contains
+Bio::RangeI::disconnected_ranges
+Bio::RangeI::equals
+Bio::RangeI::intersection
+Bio::RangeI::offsetStranded
+Bio::RangeI::overlap_extent
+Bio::RangeI::overlaps
+Bio::RangeI::subtract
+Bio::RangeI::union
+Bio::SeqFeature::Annotated::Dumper
+Bio::SeqFeature::Annotated::MAX_TYPE_CACHE_MEMBERS
+Bio::SeqFeature::Annotated::add_Annotation
+Bio::SeqFeature::Annotated::add_SeqFeature
+Bio::SeqFeature::Annotated::add_tag_value
+Bio::SeqFeature::Annotated::add_target
+Bio::SeqFeature::Annotated::annotation
+Bio::SeqFeature::Annotated::attach_seq
+Bio::SeqFeature::Annotated::display_name
+Bio::SeqFeature::Annotated::each_target
+Bio::SeqFeature::Annotated::end
+Bio::SeqFeature::Annotated::entire_seq
+Bio::SeqFeature::Annotated::frame
+Bio::SeqFeature::Annotated::from_feature
+Bio::SeqFeature::Annotated::get_Annotations
+Bio::SeqFeature::Annotated::get_SeqFeatures
+Bio::SeqFeature::Annotated::get_all_tags
+Bio::SeqFeature::Annotated::get_tag_values
+Bio::SeqFeature::Annotated::get_tagset_values
+Bio::SeqFeature::Annotated::has_tag
+Bio::SeqFeature::Annotated::length
+Bio::SeqFeature::Annotated::location
+Bio::SeqFeature::Annotated::name
+Bio::SeqFeature::Annotated::new
+Bio::SeqFeature::Annotated::phase
+Bio::SeqFeature::Annotated::primary_tag
+Bio::SeqFeature::Annotated::remove_Annotations
+Bio::SeqFeature::Annotated::remove_SeqFeatures
+Bio::SeqFeature::Annotated::remove_tag
+Bio::SeqFeature::Annotated::score
+Bio::SeqFeature::Annotated::seq
+Bio::SeqFeature::Annotated::seq_id
+Bio::SeqFeature::Annotated::source
+Bio::SeqFeature::Annotated::source_tag
+Bio::SeqFeature::Annotated::start
+Bio::SeqFeature::Annotated::strand
+Bio::SeqFeature::Annotated::type
+Bio::SeqFeature::Annotated::uri_escape
+Bio::SeqFeature::Annotated::uri_unescape
+Bio::SeqFeature::TypedSeqFeatureI::croak
+Bio::SeqFeature::TypedSeqFeatureI::ontology_term
+Bio::SeqFeatureI::generate_unique_persistent_id
+Bio::SeqFeatureI::gff_string
+Bio::SeqFeatureI::primary_id
+Bio::SeqFeatureI::spliced_seq
+
+=cut
+
+sub new {
+ my ( $caller, @args) = @_;
+ my ($self) = $caller->SUPER::new(@args);
+
+ $self->_initialize(@args);
+
+ return $self;
+}
+
+sub _initialize {
+ my ($self,@args) = @_;
+ my ($start, $end, $strand, $frame, $phase, $score,
+ $name, $annot, $location,
+ $display_name, # deprecate
+ $seq_id, $type,$source,$feature
+ ) =
+ $self->_rearrange([qw(START
+ END
+ STRAND
+ FRAME
+ PHASE
+ SCORE
+ NAME
+ ANNOTATION
+ LOCATION
+ DISPLAY_NAME
+ SEQ_ID
+ TYPE
+ SOURCE
+ FEATURE
+ )], @args);
+ defined $start && $self->start($start);
+ defined $end && $self->end($end);
+ defined $strand && $self->strand($strand);
+ defined $frame && $self->frame($frame);
+ defined $phase && $self->phase($phase);
+ defined $score && $self->score($score);
+ defined $source && ref($source) ? $self->source($source) : $self->source_tag($source);
+ defined $type && ref($type) ? $self->type($type) : $self->primary_tag($type);
+ defined $location && $self->location($location);
+ defined $annot && $self->annotation($annot);
+ defined $feature && $self->from_feature(