Permalink
Browse files

cleanup GFF parsing a bit more, start work on BED

  • Loading branch information...
1 parent f0d45ce commit 8d387a22dad41daf909740078dd7ca8e855e9134 Chris Fields committed Aug 17, 2010
Showing with 400 additions and 260 deletions.
  1. +1 −1 README
  2. +74 −72 lib/Bio/FeatureIO.pm
  3. +15 −8 lib/Bio/FeatureIO/Handler/GenericFeatureHandler.pm
  4. +127 −83 lib/Bio/FeatureIO/bed.pm
  5. +80 −92 lib/Bio/FeatureIO/gff.pm
  6. +4 −4 lib/Bio/FeatureIO/gtf.pm
  7. +3 −0 t/bed.t
  8. +95 −0 t/data/gtf2gff3.cfg
  9. +1 −0 t/gff.t
View
2 README
@@ -22,7 +22,7 @@ This module requires these other modules and libraries:
COPYRIGHT AND LICENCE
-Copyright (C) 2010 by Christopher Fields and Allen Day
+Copyright (C) 2010 by Chris Fields and Allen Day
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.10.1 or,
View
@@ -2,7 +2,7 @@
#
# BioPerl module for Bio::FeatureIO
#
-# Please direct questions and support issues to <bioperl-l@bioperl.org>
+# Please direct questions and support issues to <bioperl-l@bioperl.org>
#
# Reimplementation by Chris Fields <cjfields at bioperl dot org>
#
@@ -127,12 +127,12 @@ from a file and prints them out in fasta format with some HTML tags:
use Bio::FeatureIO;
use IO::String;
my $in = Bio::FeatureIO->new('-file' => "my.gff" ,
- '-format' => 'EMBL');
+ '-format' => 'EMBL');
while ( my $f = $in->next_feature() ) {
# the output handle is reset for every file
my $stringio = IO::String->new($string);
my $out = Bio::FeatureIO->new('-fh' => $stringio,
- '-format' => 'gtf');
+ '-format' => 'gtf');
# output goes into $string
$out->write_feature($f);
# modify $string
@@ -265,7 +265,8 @@ package Bio::FeatureIO;
use strict;
use warnings;
-
+use Config::Tiny;
+
use Symbol;
use base qw(Bio::Root::IO);
@@ -286,36 +287,38 @@ use base qw(Bio::Root::IO);
my $entry = 0;
sub new {
- my ($caller,@args) = @_;
+ my ( $caller, @args ) = @_;
my $class = ref($caller) || $caller;
-
+
# or do we want to call SUPER on an object if $caller is an
# object?
- if( $class =~ /Bio::FeatureIO::(\S+)/ ) {
-
- my ($self) = $class->SUPER::new(@args);
- $self->_initialize(@args);
- return $self;
-
- } else {
-
- my %param = @args;
-
- @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
- my $format = $param{'-format'} ||
- $class->_guess_format( $param{-file} || $ARGV[0] );
-
- if( ! $format ) {
- if ($param{-file}) {
- $format = $class->_guess_format($param{-file});
- } elsif ($param{-fh}) {
- $format = $class->_guess_format(undef);
+ if ( $class =~ /Bio::FeatureIO::(\S+)/ ) {
+
+ my ($self) = $class->SUPER::new(@args);
+ $self->_initialize(@args);
+ return $self;
+
+ }
+ else {
+
+ my %param = @args;
+
+ @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
+ my $format = $param{'-format'}
+ || $class->_guess_format( $param{-file} || $ARGV[0] );
+
+ if ( !$format ) {
+ if ( $param{-file} ) {
+ $format = $class->_guess_format( $param{-file} );
+ }
+ elsif ( $param{-fh} ) {
+ $format = $class->_guess_format(undef);
+ }
}
- }
- $format = "\L$format"; # normalize capitalization to lower case
- return unless( $class->_load_format_module($format) );
- return "Bio::FeatureIO::$format"->new(@args);
-
+ $format = "\L$format"; # normalize capitalization to lower case
+ return unless ( $class->_load_format_module($format) );
+ return "Bio::FeatureIO::$format"->new(@args);
+
}
}
@@ -353,24 +356,23 @@ sub newFh {
=cut
-
sub fh {
- my $self = shift;
- my $class = ref($self) || $self;
- my $s = Symbol::gensym;
- tie $$s,$class,$self;
- return $s;
+ my $self = shift;
+ my $class = ref($self) || $self;
+ my $s = Symbol::gensym;
+ tie $$s, $class, $self;
+ return $s;
}
# _initialize is chained for all FeatureIO classes
sub _initialize {
- my($self, %arg) = @_;
+ my ( $self, %arg ) = @_;
# flush is initialized by the Root::IO init
# initialize the IO part
- $self->seq($arg{-seq});
+ $self->seq( $arg{-seq} );
$self->_initialize_io(%arg);
}
@@ -400,8 +402,8 @@ See L<Bio::Root::RootI>, L<Bio::SeqFeatureI>.
=cut
sub next_feature {
- my ($self, $seq) = @_;
- $self->throw_not_implemented;
+ my ( $self, $seq ) = @_;
+ $self->throw_not_implemented;
}
=head2 write_feature
@@ -415,7 +417,7 @@ sub next_feature {
=cut
sub write_feature {
- my ($self, $seq) = @_;
+ my ( $self, $seq ) = @_;
$self->throw_not_implemented();
}
@@ -430,11 +432,11 @@ sub write_feature {
=cut
sub seq {
- my $self = shift;
- my $val = shift;
+ my $self = shift;
+ my $val = shift;
- $self->{'seq'} = $val if defined($val);
- return $self->{'seq'};
+ $self->{'seq'} = $val if defined($val);
+ return $self->{'seq'};
}
=head2 _load_format_module
@@ -449,24 +451,21 @@ sub seq {
=cut
sub _load_format_module {
- my ($self, $format) = @_;
+ my ( $self, $format ) = @_;
my $class = ref($self) || $self;
- my $module = $class."::$format";#"Bio::Feature::" . $format;
+ my $module = $class . "::$format"; #"Bio::Feature::" . $format;
my $ok;
- eval {
- $ok = $self->_load_module($module);
- };
- if ( $@ ) {
- print STDERR <<END;
+ eval { $ok = $self->_load_module($module); };
+ if ($@) {
+ print STDERR <<END;
$self: $format cannot be found
Exception $@
For more information about the FeatureIO system please see the FeatureIO docs.
This includes ways of checking for formats at compile time, not run time
END
- ;
- }
- return $ok;
+ }
+ return $ok;
}
=head2 _guess_format
@@ -482,14 +481,14 @@ END
=cut
sub _guess_format {
- my $class = shift;
- return unless $_ = shift;
- return 'gff' if /\.gff3?$/i;
- return 'gff' if /\.gtf$/i;
- return 'bed' if /\.bed$/i;
- return 'ptt' if /\.ptt$/i;
-
- return 'gff'; #the default
+ my $class = shift;
+ return unless $_ = shift;
+ return 'gff' if /\.gff3?$/i;
+ return 'gff' if /\.gtf$/i;
+ return 'bed' if /\.bed$/i;
+ return 'ptt' if /\.ptt$/i;
+
+ return 'gff'; #the default
}
sub DESTROY {
@@ -498,22 +497,25 @@ sub DESTROY {
}
sub TIEHANDLE {
- my ($class,$val) = @_;
- return bless {'featio' => $val}, $class;
+ my ( $class, $val ) = @_;
+ return bless { 'featio' => $val }, $class;
}
sub READLINE {
- my $self = shift;
- return $self->{'featio'}->next_feature() unless wantarray;
- my (@list, $obj);
- push @list, $obj while $obj = $self->{'featio'}->next_feature();
- return @list;
+ my $self = shift;
+ return $self->{'featio'}->next_feature() unless wantarray;
+ my ( @list, $obj );
+ push @list, $obj while $obj = $self->{'featio'}->next_feature();
+ return @list;
}
sub PRINT {
- my $self = shift;
- $self->{'featio'}->write_feature(@_);
+ my $self = shift;
+ $self->{'featio'}->write_feature(@_);
}
1;
+__END__
+
+
@@ -6,6 +6,7 @@ use strict;
use warnings;
use Data::Dumper;
use Bio::SeqFeature::Generic;
+use Bio::SeqFeature::Tools::Unflattener;
use Bio::SeqIO;
my $ct = 0;
@@ -20,6 +21,10 @@ my %HANDLERS = (
'sequence' => \&sequence,
);
+our $ONTOLOGY_STORE;
+our $UNFLATTENER;
+our $ID_HANDLER;
+
sub new {
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
@@ -35,15 +40,12 @@ sub data_handler {
$self->set_parameters('mode', $nm eq 'directive' ? $data->{DATA}->{type} : $nm);
- # this should handle data on the fly w/o caching; any caching should be
- # done in the driver!
my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
(exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
undef;
- # needs a can check, but $self->can oddly isn't working here...
if ($method && ref $method eq 'CODE') {
- return $method->($data, $self);
+ return $self->$method($data);
} else {
$self->debug("No handler defined for $nm\n");
return;
@@ -72,6 +74,11 @@ sub format {
return $self->{format};
}
+sub fast {
+ my $self = shift;
+ $self->{fast} || 0;
+}
+
sub reset_parameters {
my ($self) = @_;
$self->{parameters} = {};
@@ -126,7 +133,7 @@ sub resolve_references {
# Note this just passes in the data w/o munging it beyond recognition
sub seqfeature {
- my ($data, $handler) = @_;
+ my ($handler, $data) = @_;
my %sf_data = map {'-'.$_ => $data->{DATA}->{$_}}
grep { $data->{DATA}->{$_} ne '.' }
@@ -150,15 +157,15 @@ sub seqfeature {
}
sub directives {
- my ($data, $handler) = @_;
+ my ($handler, $data) = @_;
my $directive = $data->{DATA}->{type};
if ($directive eq 'sequence') {
my $fh = $handler->file_handle;
$handler->throw("Handler doesn't have a set file handle") if !$fh;
return Bio::SeqIO->new(-format => 'fasta',
-fh => $fh);
} elsif ($directive eq 'sequence-region') {
- # we can make returning a features optional here, but we should do
+ # we can make returning a feature optional here, but we should do
# something with the data in all cases
my $sf_data = $data->{DATA};
@@ -174,7 +181,7 @@ sub directives {
}
sub sequence {
- my ($data, $handler) = @_;
+ my ($handler, $data) = @_;
# if we reach this point, the sequence stream has already been read, so
# we need to seek back to the start point. Note if the stream isn't seekable
# this will fail spectacularly at this point!
Oops, something went wrong.

0 comments on commit 8d387a2

Please sign in to comment.