Permalink
Browse files

deal with OX lines and TaxIDs (patch courtesy Jacob Bunk Nielsen)

  • Loading branch information...
1 parent e8c205c commit 8b00b61d8641c636ad33e420197b9b7fc452ef42 Chris Fields committed Dec 23, 2010
Showing with 92 additions and 1 deletion.
  1. +18 −0 Bio/SeqIO/embl.pm
  2. +74 −1 t/SeqIO/embl.t
View
@@ -267,6 +267,7 @@ sub next_seq {
my $buffer = $line;
local $_;
BEFORE_FEATURE_TABLE :
+ my $ncbi_taxid;
until ( !defined $buffer ) {
$_ = $buffer;
# Exit at start of Feature table
@@ -339,6 +340,10 @@ sub next_seq {
# NCBI TaxID Xref
elsif (/^OX/) {
+ if (/NCBI_TaxID=(\d+)/) {
+ $ncbi_taxid=$1;
+ }
+
my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
foreach my $dblink ( @links ) {
$annotation->add_Annotation('dblink',$dblink);
@@ -399,6 +404,9 @@ sub next_seq {
$ftunit->_generic_seqfeature($self->location_factory(), $name);
# add taxon_id from source if available
+ # Notice, this will override what is found in the OX line.
+ # this is by design as this seems to be the official way
+ # of specifying a TaxID
if ($params{'-species'} && ($feat->primary_tag eq 'source')
&& $feat->has_tag('db_xref')
&& (! $params{'-species'}->ncbi_taxid())) {
@@ -418,6 +426,12 @@ sub next_seq {
}
}
}
+ # Set taxid found in OX line
+ if ($params{'-species'} && defined $ncbi_taxid
+ && (! $params{'-species'}->ncbi_taxid())) {
+ $params{'-species'}->ncbi_taxid($ncbi_taxid);
+ }
+
# skip comments
while ( defined ($buffer) && $buffer =~ /^XX/ ) {
$buffer = $self->_readline();
@@ -727,6 +741,10 @@ sub write_seq {
if ($spec->organelle) {
$self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80) || return;
}
+ my $ncbi_taxid = $spec->ncbi_taxid;
+ if ($ncbi_taxid) {
+ $self->_print("OX NCBI_TaxID=$ncbi_taxid\n") || return;
+ }
$self->_print("XX\n") || return;
}
# Reference lines
View
@@ -8,7 +8,7 @@ BEGIN {
use lib '../..';
use Bio::Root::Test;
- test_begin(-tests => 87);
+ test_begin(-tests => 95);
use_ok('Bio::SeqIO::embl');
}
@@ -242,3 +242,76 @@ foreach my $feature ($seq->top_SeqFeatures) {
ok($ret, 'Parse long qualifier');
is($error, undef);
}
+
+# NCBI TaxIDs should roundtrip
+{
+ my $seq=Bio::Seq->new(-seq=>'actg');
+ my $species = Bio::Species->new(-ncbi_taxid => 7165, -classification=>
+ [ 'Anopheles gambiae', 'Anopheles', 'Culicoidea',
+ 'Nematocera', 'Diptera', 'Endopterygota',
+ 'Neoptera', 'Pterygota', 'Insecta', 'Hexapoda',
+ 'Arthropoda', 'Metazoa', 'Eukaryota' ]);
+
+ $seq->species($species);
+ is($seq->species->ncbi_taxid, 7165, 'TaxID set correctly');
+
+ # Write EMBL
+ my $string;
+ open(my $str_fh, '>', \$string) || skip("Can't open string, skipping", 2);
+
+ my $out=Bio::SeqIO->new(-format=>'embl', -fh => $str_fh);
+ $out->write_seq($seq);
+
+ # Read EMBL
+ my $in=Bio::SeqIO->new(-format=>'embl', -string => $string);
+ my $embl_seq;
+ my $ret=eval { $embl_seq=$in->next_seq };
+ my $error;
+ $error=$@ if (!$ret);
+
+ # Check that TaxID has roundtripped
+ my $embl_species = $embl_seq->species;
+ ok(defined $embl_species, "The read sequence has a species object");
+ is($embl_species->ncbi_taxid, 7165, "NCBI TaxID has roundtripped");
+ is($embl_species->binomial(), 'Anopheles gambiae', "Name has roundtripped");
+}
+
+# a taxon db_xref on a source feature should override an OX line
+{
+ my $seq=Bio::Seq->new(-seq=>'actg');
+ my $species = Bio::Species->new(-ncbi_taxid => 7165, -classification=>
+ [ 'Anopheles gambiae', 'Anopheles', 'Culicoidea',
+ 'Nematocera', 'Diptera', 'Endopterygota',
+ 'Neoptera', 'Pterygota', 'Insecta', 'Hexapoda',
+ 'Arthropoda', 'Metazoa', 'Eukaryota' ]);
+
+ $seq->species($species);
+ is($seq->species->ncbi_taxid, 7165, 'TaxID set correctly');
+
+ my $seq_feature = Bio::SeqFeature::Generic->new(-primary=>'source',
+ -start => 1,
+ -end=> length($seq->seq));
+
+ $seq_feature->add_tag_value('db_xref', 'taxon:9606');
+ $seq->add_SeqFeature($seq_feature);
+
+ # Write EMBL
+ my $string;
+ open(my $str_fh, '>', \$string) || skip("Can't open string, skipping", 2);
+
+ my $out=Bio::SeqIO->new(-format=>'embl', -fh => $str_fh);
+ $out->write_seq($seq);
+
+ # Read EMBL
+ my $in=Bio::SeqIO->new(-format=>'embl', -string => $string);
+ my $embl_seq;
+ my $ret=eval { $embl_seq=$in->next_seq };
+ my $error;
+ $error=$@ if (!$ret);
+
+ # Check that TaxID has roundtripped
+ my $embl_species = $embl_seq->species;
+ ok(defined $embl_species, "The read sequence has a species object");
+ is($embl_species->ncbi_taxid, 9606, "The taxid of the source feature overrides that of the OX line");
+ is($embl_species->binomial(), 'Anopheles gambiae', "Name has roundtripped");
+}

0 comments on commit 8b00b61

Please sign in to comment.