Skip to content
Browse files

This commit was manufactured by cvs2svn to create tag

'prerelease-06'.

svn path=/bioperl-live/tags/prerelease-06/; revision=1143
  • Loading branch information...
1 parent b7af73a commit a46f61296f90018244583fbc55bc701408f2914c nobody committed Jan 27, 2000
Sorry, we could not display the entire diff because it was too big.
View
121 Bio/AnnSeq.pm
@@ -74,7 +74,7 @@ Report bugs to the Bioperl bug tracking system to help us keep track
bioperl-bugs@bio.perl.org
http://bio.perl.org/bioperl-bugs/
-=head1 AUTHOR - Ewan Birney, inspired by Ian Korf objects
+=head1 AUTHOR - Ewan Birney, inspired by Ian Korf's objects
Email birney@sanger.ac.uk
@@ -111,8 +111,6 @@ sub _initialize {
my($ann);
my $make = $self->SUPER::_initialize;
$self->{'_as_feat'} = [];
- $self->{'date'} = [];
- $self->{'secondary_accession'} = [];
$ann = new Bio::Annotation;
$self->annotation($ann);
@@ -308,28 +306,6 @@ sub species {
}
}
-=head2 sub_species
-
- Title : sub_species
- Usage :
- Function: Gets or sets the sub_species
- Example : $sub_species = $self->sub_species();
- Returns : Bio::Species object
- Args : Bio::Species object or none;
-
-
-=cut
-
-sub sub_species {
- my ($self, $sub_species) = @_;
-
- if ($sub_species) {
- $self->{'sub_species'} = $sub_species;
- } else {
- return $self->{'sub_species'}
- }
-}
-
=head1 EMBL/GenBank/DDBJ methods
These methods are here to support the EMBL/GenBank/DDBJ format.
@@ -364,63 +340,6 @@ sub division{
}
-=head2 molecule
-
- Title : molecule
- Usage : $obj->molecule($newval)
- Function:
- Returns : type of molecule (DNA, mRNA)
- Args : newvalue (optional)
-
-
-=cut
-
-sub molecule{
- my $obj = shift;
- if( @_ ) {
- my $value = shift;
- $obj->{'molecule'} = $value;
- }
- return $obj->{'molecule'};
-
-}
-
-=head2 add_date
-
- Title : add_date
- Usage : $self->add_domment($ref)
- Function: adds a date
- Example :
- Returns :
- Args :
-
-
-=cut
-
-sub add_date{
- my ($self) = shift;
- foreach my $dt ( @_ ) {
- push(@{$self->{'date'}},$dt);
- }
-}
-
-=head2 each_Comment
-
- Title : each_date
- Usage : foreach $dt ( $self->each_date() )
- Function: gets an array of dates
- Example :
- Returns :
- Args :
-
-
-=cut
-
-sub each_date{
- my ($self) = @_;
- return @{$self->{'date'}};
-}
-
=head2 accession
Title : accession
@@ -444,42 +363,6 @@ sub accession{
}
-=head2 add_secondary_accession
-
- Title : add_secondary_accession
- Usage : $self->add_domment($ref)
- Function: adds a secondary_accession
- Example :
- Returns :
- Args :
-
-
-=cut
-
-sub add_secondary_accession{
- my ($self) = shift;
- foreach my $dt ( @_ ) {
- push(@{$self->{'secondary_accession'}},$dt);
- }
-}
-
-=head2 each_Comment
-
- Title : each_secondary_accession
- Usage : foreach $dt ( $self->each_secondary_accession() )
- Function: gets an array of secondary_accessions
- Example :
- Returns :
- Args :
-
-
-=cut
-
-sub each_secondary_accession{
- my ($self) = @_;
- return @{$self->{'secondary_accession'}};
-}
-
=head2 sv
Title : sv
@@ -530,3 +413,5 @@ sub keywords{
+
+
View
4 Bio/AnnSeqIO.pm
@@ -21,8 +21,8 @@ Bio::AnnSeqIO - Handler for AnnSeqIO Formats
$in = Bio::AnnSeqIO->new(-file => "inputfilename" , -format => 'Fasta');
$out = Bio::AnnSeqIO->new(-file => ">outputfilename" , -format => 'EMBL');
- while ($seq = $in->next_annseq() ) {
- $out->write_annseq($seq);
+ while $seq ( $in->next_annseq() ) {
+ $out->write_annseq($out);
}
=head1 DESCRIPTION
View
793 Bio/AnnSeqIO/EMBL.pm
@@ -162,49 +162,25 @@ sub _initialize {
=cut
-sub next_annseq {
+sub next_annseq{
my ($self,@args) = @_;
- my ($seq,$fh,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, $date, $comment, @date_arr);
- my $annseq = Bio::AnnSeq->new();
+ my ($seq,$fh,$c,$line,$name,$desc,$acc,$seqc);
$fh = $self->_filehandle();
- $line = <$fh>; # This needs to be before the first eof() test
-
if( eof $fh ) {
return undef; # no throws - end of file
}
- if( $line =~ /^\s+$/ ) {
- while( <$fh> ) {
- /\S/ && last;
- }
- $line = $_;
- }
-
- if( !defined $line ) {
- return undef; # end of file
- }
- $line =~ /^ID\s+\S+/ || $self->throw("EMBL stream with no ID. Not embl in my book");
- $line =~ /^ID\s+(\S+)\s+\S+\; (.+)\; (\S+)/;
+ $line = <$fh>;
+ $line =~ /^ID\s+(\S+)/ || $self->throw("EMBL stream with no ID. Not embl in my book");
$name = $1;
-
- $mol= $2;
- $mol =~ s/\;//;
- if ($mol) {
- $annseq->molecule($mol);
- }
-
- if ($3) {
- $div = $3;
- $div =~ s/\;//;
- $annseq->division($div);
- }
-
+
# $self->warn("not parsing upper annotation in EMBL file yet!");
+
my $buffer = $line;
-
+ my $annseq = Bio::AnnSeq->new();
BEFORE_FEATURE_TABLE :
until( eof($fh) ) {
@@ -218,71 +194,35 @@ sub next_annseq {
if (/^DE\s+(\S.*\S)/) {
$desc .= $desc ? " $1" : $1;
}
-
- #accession number
if( /^AC\s+(\S+);?/ ) {
$acc = $1;
- $acc =~ s/\;//;
$annseq->accession($acc);
}
-
- #version number
+
if( /^SV\s+(\S+);?/ ) {
my $sv = $1;
- $sv =~ s/\;//;
$annseq->sv($sv);
}
- #date (NOTE: takes last date line)
- if( /^DT\s+(\S+)/ ) {
- my $date = $1;
- $date =~ s/\;//;
- $annseq->add_date($date);
- }
-
- #keywords
if( /^KW (.*)\S*$/ ) {
my $keywords = $1;
$annseq->keywords($keywords);
}
- # Organism name and phylogenetic information
- elsif (/^O[SC]/) {
- my $species = _read_EMBL_Species(\$buffer, $fh);
- $annseq->species( $species );
- }
-
+ # accession numbers...
+
# References
elsif (/^R/) {
my @refs = &_read_EMBL_References(\$buffer,$fh);
$annseq->annotation->add_Reference(@refs);
}
- # DB Xrefs
- elsif (/^DR/) {
- my @links = &_read_EMBL_DBLink(\$buffer,$fh);
- $annseq->annotation->add_DBLink(@links);
+ # Organism name and phylogenetic information
+ elsif (/^O[SC]/) {
+ my $species = _read_EMBL_Species(\$buffer, $fh);
+ $annseq->species( $species );
}
- # Comments
- elsif (/^CC\s+(.*)/) {
- $comment .= $1;
- $comment .= " ";
- while (<$fh>) {
- if (/^CC\s+(.*)/) {
- $comment .= $1;
- $comment .= " ";
- }
- else {
- last;
- }
- }
- my $commobj = Bio::Annotation::Comment->new();
- $commobj->text($comment);
- $annseq->annotation->add_Comment($commobj);
- $comment = "";
- }
-
# Get next line.
$buffer = <$fh>;
}
@@ -296,7 +236,7 @@ sub next_annseq {
until( eof($fh) ) {
my $ftunit = &_read_FTHelper_EMBL($fh,\$buffer);
# process ftunit
- $ftunit->_generic_seqfeature($annseq);
+ &_generic_seqfeature($annseq,$ftunit);
if( $buffer !~ /^FT/ ) {
last;
@@ -335,260 +275,202 @@ sub next_annseq {
=cut
sub write_annseq {
- my ($self,$annseq) = @_;
+ my ($self,$annseq) = @_;
- if( !defined $annseq ) {
- $self->throw("Attempting to write with no annseq!");
- }
+ if( !defined $annseq ) {
+ $self->throw("Attempting to write with no annseq!");
+ }
- if( ! ref $annseq || ! $annseq->isa('Bio::AnnSeqI') ) {
- $self->warn(" $annseq is not a AnnSeqI compliant module. Attempting to dump, but may fail!");
- }
+ if( ! ref $annseq || ! $annseq->isa('Bio::AnnSeqI') ) {
+ $self->warn(" $annseq is not a AnnSeqI compliant module. Attempting to dump, but may fail!");
+ }
- my $fh = $self->_filehandle();
- my $seq = $annseq->seq();
- my $str = $seq->seq;
+ my $fh = $self->_filehandle();
+ my $seq = $annseq->seq();
+ my $i;
+ my $str = $seq->seq;
- my $mol;
- my $div;
- my $len = $seq->seq_len();
+ my $div;
+ my $len = $seq->seq_len();
- if ($annseq->can('division') && defined $annseq->division) {
- $div = $annseq->division();
- }
- $div ||= 'UNK';
-
- if ($annseq->can('molecule')) {
- $mol = $annseq->molecule();
- }
- $mol ||= 'XXX';
+ if( !$annseq->can('division') || ($div = $annseq->division()) == undef ) {
+ $div = 'UNK';
+ }
+
+ my $temp_line;
+ if( $self->_id_generation_func ) {
+ $temp_line = &{$self->_id_generation_func}($annseq);
+ } else {
+ $temp_line = sprintf ("%-11sstandard; DNA; $div; %d BP.",$seq->id(),$len);
+ }
+
+ print $fh "ID $temp_line\n";
- my $temp_line;
- if( $self->_id_generation_func ) {
- $temp_line = &{$self->_id_generation_func}($annseq);
- } else {
- $temp_line = sprintf("%-11sstandard; $mol; $div; %d BP.", $seq->id(), $len);
- }
-
- print $fh "ID $temp_line\n",
- "XX\n";
-
- # Write the accession line if present
- {
- my( $acc );
- if( my $func = $self->_ac_generation_func ) {
- $acc = &{$func}($annseq);
- } elsif( $annseq->can('accession')) {
- $acc = $annseq->accession;
- }
- if (defined $acc) {
- print $fh "AC $acc\n",
- "XX\n";
- }
- }
+ print $fh "XX \n";
- # Write the sv line if present
- {
- my( $sv );
- if (my $func = $self->_sv_generation_func) {
- $sv = &{$func}($annseq);
- } elsif( $annseq->can('sv')) {
- $sv = $annseq->sv;
- }
- if (defined $sv) {
- print $fh "SV $sv\n",
- "XX\n";
- }
- }
- # Date lines
- my $switch=0;
- foreach my $dt ( $annseq->each_date() ) {
- _write_line_EMBL_regex($fh,"DT ","DT ",$dt,'\s+|$',80);
- $switch=1;
- }
- if ($switch == 1) {
- print $fh "XX\n";
- }
+ # if there, write the accession line
+
+ if( $self->_ac_generation_func ) {
+ $temp_line = &{$self->_ac_generation_func}($annseq);
+ print $fh "AC $temp_line\n";
+ print $fh "XX \n";
+ } else {
+ # nothing at the moment
+ }
+
+ # if there, write the sv line
+
+ if( $self->_sv_generation_func ) {
+ $temp_line = &{$self->_sv_generation_func}($annseq);
+ if( $temp_line ) {
+ print $fh "SV $temp_line\n";
+ print $fh "XX \n";
+ }
+ } else {
+ # nothing at the moment
+ }
+
+
+ # this next line screws up perl mode parsing. Sorry. It is a pain!
+ _write_line_EMBL_regex($fh,"DE ","DE ",$seq->desc(),'\s+|$',80);
+ print $fh "XX \n";
+
+ # if there, write the kw line
+
+ if( $self->_kw_generation_func ) {
+ $temp_line = &{$self->_kw_generation_func}($annseq);
+ print $fh "KW $temp_line\n";
+ print $fh "XX \n";
+ } else {
+ # nothing at the moment
+ }
+
- # Description lines
- _write_line_EMBL_regex($fh,"DE ","DE ",$seq->desc(),'\s+|$',80);
- print $fh "XX\n";
-
- # if there, write the kw line
- {
- my( $kw );
- if( my $func = $self->_kw_generation_func ) {
- $kw = &{$func}($annseq);
- } elsif( $annseq->can('keywords') ) {
- $kw = $annseq->keywords;
- }
- if (defined $kw) {
- print $fh "KW $kw\n",
- "XX\n";
- }
- }
# Organism lines
if (my $spec = $annseq->species) {
my($species, $genus, @class) = $spec->classification();
my $OS = "$genus $species";
- if (my $sub_species = $spec->sub_species) {
- $OS .= " $sub_species";
- }
if (my $common = $spec->common_name) {
$OS .= " ($common)";
}
print $fh "OS $OS\n";
- my $OC = join('; ', reverse(@class));
- $OC =~ s/\;\s+$//;
- $OC .= ".";
+ my $OC = join('; ', reverse(@class)). '.';
_write_line_EMBL_regex($fh,"OC ","OC ",$OC,'; |$',80);
- if ($spec->organelle) {
- _write_line_EMBL_regex($fh,"OG ","OG ",$spec->organelle,'; |$',80);
- }
- print $fh "XX\n";
+ print $fh "XX \n";
}
+
+ # Comment lines
+
+ foreach my $comment ( $annseq->annotation->each_Comment() ) {
+ _write_line_EMBL_regex($fh,"CC ","CC ",$comment->text(),'\s+|$',80);
+ print $fh "XX \n";
+ }
- # Reference lines
- my $t = 1;
- foreach my $ref ( $annseq->annotation->each_Reference() ) {
- print $fh "RN [$t]\n";
-
- # Having no RP line is legal, but we need both
- # start and end for a valid location.
- my $start = $ref->start;
- my $end = $ref->end;
- if ($start and $end) {
- print $fh "RP $start-$end\n";
- } elsif ($start or $end) {
- $self->throw("Both start and end are needed for a valid RP line. Got: start='$start' end='$end'");
- }
+ # Reference lines
+ my $t = 1;
+ foreach my $ref ( $annseq->annotation->each_Reference() ) {
+ print $fh "RN [$t]\n";
+ &_write_line_EMBL_regex($fh,"RA ","RA ",$ref->authors,'\s+|$',80);
+ &_write_line_EMBL_regex($fh,"RT ","RT ",$ref->title,'\s+|$',80);
+ &_write_line_EMBL_regex($fh,"RL ","RL ",$ref->location,'\s+|$',80);
+ print $fh "XX \n";
+ $t++;
+ }
- if (my $med = $ref->medline) {
- print $fh "RX MEDLINE; $med\n";
- }
- &_write_line_EMBL_regex($fh, "RA ", "RA ", $ref->authors, '\s+|$', 80);
- # If there is no title to the reference, it appears
- # as a single semi-colon. All titles must end in
- # a semi-colon.
- my $ref_title = $ref->title || '';
- $ref_title =~ s/[\s;]*$/;/;
- &_write_line_EMBL_regex($fh, "RT ", "RT ", $ref_title, '\s+|$', 80);
+ print $fh "FH Key Location/Qualifiers\n";
+ print $fh "FH \n";
- &_write_line_EMBL_regex($fh, "RL ", "RL ", $ref->location, '\s+|$', 80);
- if ($ref->comment) {
- &_write_line_EMBL_regex($fh, "RC ", "RC ", $ref->comment, '\s+|$', 80);
- }
- print $fh "XX\n";
- $t++;
- }
- # DB Xref lines
- if (my @db_xref = $annseq->annotation->each_DBLink) {
- foreach my $dr (@db_xref) {
- my $db_name = $dr->database;
- my $prim = $dr->primary_id;
- my $opt = $dr->optional_id || '';
-
- my $line = "$db_name; $prim; $opt.";
- &_write_line_EMBL_regex($fh, "DR ", "DR ", $line, '\s+|$', 80);
- }
- print $fh "XX\n";
- }
+ if( defined $self->_post_sort ) {
+ # we need to read things into an array. Process. Sort them. Print 'em
- # Comment lines
- foreach my $comment ( $annseq->annotation->each_Comment() ) {
- _write_line_EMBL_regex($fh, "CC ", "CC ", $comment->text, '\s+|$', 80);
- print $fh "XX\n";
- }
- # "\\s\+\|\$"
+ my $post_sort_func = $self->_post_sort();
+ my @fth;
- ## FEATURE TABLE
+ foreach my $sf ( $annseq->top_SeqFeatures ) {
+ push(@fth,Bio::AnnSeqIO::FTHelper::from_SeqFeature($sf,$annseq));
+ }
- print $fh "FH Key Location/Qualifiers\n";
- print $fh "FH\n";
+ @fth = sort { &$post_sort_func($a,$b) } @fth;
+
+ foreach my $fth ( @fth ) {
+ &_print_EMBL_FTHelper($fth,$fh);
+ }
+ } else {
+ # not post sorted. And so we can print as we get them.
+ # lower memory load...
+
+ foreach my $sf ( $annseq->top_SeqFeatures ) {
+ my @fth = Bio::AnnSeqIO::FTHelper::from_SeqFeature($sf,$annseq);
+ foreach my $fth ( @fth ) {
+ if( ! $fth->isa('Bio::AnnSeqIO::FTHelper') ) {
+ $sf->throw("Cannot process FTHelper... $fth");
+ }
- if( defined $self->_post_sort ) {
- # we need to read things into an array. Process. Sort them. Print 'em
+ &_print_EMBL_FTHelper($fth,$fh);
+ }
+ }
+ }
- my $post_sort_func = $self->_post_sort();
- my @fth;
+ print $fh "XX \n";
- foreach my $sf ( $annseq->top_SeqFeatures ) {
- push(@fth,Bio::AnnSeqIO::FTHelper::from_SeqFeature($sf,$annseq));
- }
+ if( $self->_show_dna() == 0 ) {
+ return;
+ }
- @fth = sort { &$post_sort_func($a,$b) } @fth;
+ # finished printing features.
- foreach my $fth ( @fth ) {
- &_print_EMBL_FTHelper($fth,$fh);
- }
- } else {
- # not post sorted. And so we can print as we get them.
- # lower memory load...
-
- foreach my $sf ( $annseq->top_SeqFeatures ) {
- my @fth = Bio::AnnSeqIO::FTHelper::from_SeqFeature($sf,$annseq);
- foreach my $fth ( @fth ) {
- &_print_EMBL_FTHelper($fth,$fh);
- }
- }
- }
+ $str =~ tr/A-Z/a-z/;
+ my $a = $str;
+ $a =~ s/[^a]//g;
+ my $alen = length $a;
- print $fh "XX\n";
+ my $t = $str;
+ $t =~ s/[^t]//g;
+ my $tlen = length $t;
- if( $self->_show_dna() == 0 ) {
- return;
- }
+ my $g = $str;
+ $g =~ s/[^g]//g;
+ my $glen = length $g;
- # finished printing features.
+ my $c = $str;
+ $c =~ s/[^c]//g;
+ my $clen = length $c;
- $str =~ tr/A-Z/a-z/;
-
- # Count each nucleotide
- my $alen = $str =~ tr/a/a/;
- my $clen = $str =~ tr/c/c/;
- my $glen = $str =~ tr/g/g/;
- my $tlen = $str =~ tr/t/t/;
-
- my $olen = $len - ($alen + $tlen + $clen + $glen);
- if( $olen < 0 ) {
- $self->warn("Weird. More atgc than bases. Problem!");
- }
+ my $olen = $len - ($alen + $tlen + $clen + $glen);
+ if( $olen < 0 ) {
+ $self->warn("Weird. More atgc than bases. Problem!");
+ }
- print $fh "SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n";
-
- my $nuc = 60; # Number of nucleotides per line
- my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
- my $out_pat = 'A11' x 6; # Pattern for packing a line
- my $length = length($str);
-
- # Calculate the number of nucleotides which fit on whole lines
- my $whole = int($length / $nuc) * $nuc;
-
- # Print the whole lines
- my( $i );
- for ($i = 0; $i < $whole; $i += $nuc) {
- my $blocks = pack $out_pat,
- unpack $whole_pat,
- substr($str, $i, $nuc);
- printf $fh " $blocks%9d\n", $i + $nuc;
- }
+ print $fh "SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n";
+ print $fh " ";
+ my $linepos;
+ for ($i = 0; $i < length($str); $i += 10) {
+
+ if( $i+10 >= length($str) ) {
+ # last line.
+ print $fh substr($str,$i);
+ $linepos += length($str)-$i;
+ print $fh ' ' x (70 - $linepos);
+ print $fh sprintf(" %-5d\n",length($str));
+ last;
+ }
+ print $fh substr($str,$i,10), " ";
+ $linepos += 11;
+ if( ($i+10)%60 == 0 ) {
+ my $end = $i+10;
+ print $fh sprintf("%-5d\n ",$end);
+ $linepos = 5;
+ }
+ }
- # Print the last line
- if (my $last = substr($str, $i)) {
- my $last_len = length($last);
- my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
- my $blocks = pack $out_pat,
- unpack($last_pat, $last);
- printf $fh " $blocks%9d\n", $length; # Add the length to the end
- }
- print $fh "//\n";
- return 1;
+ print $fh "//\n";
+ return 1;
}
=head2 _print_EMBL_FTHelper
@@ -605,7 +487,6 @@ sub write_annseq {
sub _print_EMBL_FTHelper {
my ($fth,$fh,$always_quote) = @_;
- $always_quote ||= 0;
if( ! ref $fth || ! $fth->isa('Bio::AnnSeqIO::FTHelper') ) {
$fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
@@ -614,18 +495,12 @@ sub _print_EMBL_FTHelper {
#print $fh "FH Key Location/Qualifiers\n";
#print $fh sprintf("FT %-15s %s\n",$fth->key,$fth->loc);
- &_write_line_EMBL_regex($fh,sprintf("FT %-15s ",$fth->key),"FT ",$fth->loc,'\,|$',80);
+ &_write_line_EMBL_regex($fh,sprintf("FT %-15s ",$fth->key),"FT ",$fth->loc,',|$',80);
foreach my $tag ( keys %{$fth->field} ) {
- if( ! defined $fth->field->{$tag} ) { next; }
foreach my $value ( @{$fth->field->{$tag}} ) {
- $value =~ s/\"/\"\"/g;
- if ($value eq "_no_value") {
- &_write_line_EMBL_regex($fh,"FT ","FT ","/$tag",'.|$',80);
- }
- elsif( $always_quote == 1 || $value !~ /^\d+$/ ) {
+ if( $always_quote == 1 || $value !~ /^\d+$/ ) {
&_write_line_EMBL_regex($fh,"FT ","FT ","/$tag=\"$value\"",'.|$',80);
- }
- else {
+ } else {
&_write_line_EMBL_regex($fh,"FT ","FT ","/$tag=$value",'.|$',80);
}
# print $fh "FT /", $tag, "=\"", $value, "\"\n";
@@ -647,7 +522,7 @@ sub _print_EMBL_FTHelper {
=cut
-sub _read_EMBL_References {
+sub _read_EMBL_References{
my ($buffer,$fh) = @_;
my (@refs);
@@ -656,36 +531,23 @@ sub _read_EMBL_References {
if( $$buffer !~ /^RN/ ) {
warn("Not parsing line '$$buffer' which maybe important");
}
- my $b1;
- my $b2;
my $title;
my $loc;
my $au;
- my $med;
- my $com;
-
while( <$fh> ) {
/^R/ || last;
- /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
- /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
/^RA (.*)/ && do { $au .= $1; next;};
/^RT (.*)/ && do { $title .= $1; next;};
/^RL (.*)/ && do { $loc .= $1; next;};
- /^RC (.*)/ && do { $com .= $1; next;};
}
-
+
my $ref = new Bio::Annotation::Reference;
$au =~ s/;\s*$//g;
$title =~ s/;\s*$//g;
- $ref->start($b1);
- $ref->end($b2);
$ref->authors($au);
$ref->title($title);
$ref->location($loc);
- $ref->medline($med);
- $ref->comment($com);
-
push(@refs,$ref);
$$buffer = $_;
@@ -707,29 +569,20 @@ sub _read_EMBL_References {
sub _read_EMBL_Species {
my( $buffer, $fh ) = @_;
- my $org;
-
+
$_ = $$buffer;
- my( $sub_species, $species, $genus, $common, @class );
+
+ my( $species, $genus, $common, @class );
while (defined( $_ ||= <$fh> )) {
- if (/^OS\s+(\S+)\s+(\S+)\s+(\S+)?(?:\s+\((.*)\))?/) {
+ if (/^OS\s+(\S+)\s+(\S+)(?:\s+\((\s+)\))?/) {
$genus = $1;
- if ($2) {
- $species = $2
- }
- else {
- $species = "sp.";
- }
- $sub_species = $3 if $3;
- $common = $4 if $4;
+ $species = $2;
+ $common = $3 if $3;
}
elsif (s/^OC\s+//) {
- push(@class, split /[\;\s\.]+/);
+ push(@class, split /[\s\.;]+/, $_);
}
- elsif (/^OG\s+(.*)/) {
- $org = $1;
- }
else {
last;
}
@@ -741,61 +594,17 @@ sub _read_EMBL_Species {
# Don't make a species object if it's "Unknown" or "None"
return if $genus =~ /^(Unknown|None)$/i;
-
+
# Bio::Species array needs array in Species -> Kingdom direction
- if ($sub_species) {
- push( @class, $genus, $species, $sub_species);
- }
- else {
- push( @class, $genus, $species, "");
- }
+ push( @class, $genus, $species );
@class = reverse @class;
my $make = Bio::Species->new();
$make->classification( @class );
$make->common_name( $common ) if $common;
- $make->organelle($org) if $org;
return $make;
}
-=head2 _read_EMBL_DBLink
-
- Title : _read_EMBL_DBLink
- Usage :
- Function: Reads the EMBL database cross reference ("DR") lines
- Example :
- Returns : A list of Bio::Annotation::DBLink objects
- Args :
-
-=cut
-
-sub _read_EMBL_DBLink {
- my( $buffer, $fh ) = @_;
- my( @db_link );
-
- $_ = $$buffer;
- while (defined( $_ ||= <$fh> )) {
-
- if (my($databse, $prim_id, $sec_id)
- = /^DR ([^\s;]+);\s*([^\s;]+);\s*([^\s;]+)?\.$/) {
- my $link = Bio::Annotation::DBLink->new();
- $link->database ( $databse );
- $link->primary_id ( $prim_id );
- $link->optional_id( $sec_id ) if $sec_id;
- push(@db_link, $link);
- }
- else {
- last;
- }
-
- $_ = undef; # Empty $_ to trigger read of next line
- }
-
- $$buffer = $_;
-
- return @db_link;
-}
-
=head2 _filehandle
Title : _filehandle
@@ -845,91 +654,149 @@ sub _read_FTHelper_EMBL {
# Exit loop on first qualifier - line is left in $_
last if /^XX/; # sometimes a trailing XX comment left in!
last if /^FT\s+\//;
- last if /^FT\s+(\S+)\s+(\S+)/;
-
# Get location line
/^FT\s+(\S+)/ or $out->throw("Weird location line in EMBL feature table: '$_'");
$loc .= $1;
}
- $loc =~ s/<//;
- $loc =~ s/>//;
$out->key($key);
$out->loc($loc);
# Now read in other fields
- # Loop reads $_ when defined (i.e. only in first loop), then $fh, until end of file
while( defined($_ ||= <$fh>) ) {
- # Exit loop on non FT lines!
- /^FT/ || last;
+ # Exit loop on non FT lines!
+ /^FT/ || last;
+
+ # Exit loop on new primary key
+ /^FT \w/ && last;
+
+ # Field on one line
+ if (/^FT\s+\/(\S+)=\"(.*)\"/) {
+ my $key = $1;
+ my $value = $2;
+ if(! defined $out->field->{$key} ) {
+ $out->field->{$key} = [];
+ }
+ push(@{$out->field->{$key}},$value);
+ }
+ # Field on on multilines:
+ elsif (/^FT\s+\/(\S+)=\"(.*)/) {
+ my $key = $1;
+ my $value = $2;
+ while ( <$fh> ) {
+ /FT\s+(.*)\"/ && do { $value .= $1; last; };
+ /FT\s+(.*?)\s*/ && do {$value .= $1; };
+ }
+ if(! defined $out->field->{$key} ) {
+ $out->field->{$key} = [];
+ }
+ push(@{$out->field->{$key}},$value);
+ }
+ # Field with no quoted value
+ elsif (/^FT\s+\/(\S+)=(\S+)/) {
+ my $key = $1;
+ my $value = $2;
+ if(! defined $out->field->{$key} ) {
+ $out->field->{$key} = [];
+ }
+ push(@{$out->field->{$key}},$value);
+ }
+
+ # Empty $_ to trigger read from $fh
+ undef $_;
+ }
- # Exit loop on new primary key
- /^FT [\w-]/ && last;
+ $$buffer = $_;
+
+ return $out;
+}
- my( $key, $value );
-
- # Field on one line
- if (/^FT\s+\/(\S+)=\"(.+)\"/) {
- $key = $1;
- $value = $2;
- $value =~ s/\"\"/\"/g;
- }
-
- # Field on on multilines:
- elsif (/^FT\s+\/(\S+)=\"(.*)/) {
- $key = $1;
- my @lines = ($2);
-
- # /FT\s+(.*)\"/ && do { $value .= $1; last; };
- # /FT\s+(.*?)\s*/ && do {$value .= $1; };
-
- while ( <$fh> ) {
- s/\"\"/__DOUBLE_QUOTE_STRING__/g;
-
- if (/FT\s+(.*)\"/) {
- push(@lines, $1);
- last;
- }
- elsif (/FT\s+(.*?)\s*$/) {
- push(@lines, $1);
- }
- else {
- $out->throw("Couldn't match '$_'");
- }
- }
-
- # Join list with spaces any of the elements contain them
- my( $join );
- if (grep /\s/, @lines) {
- $join = ' ';
- } else {
- $join = '';
- }
- $value = join($join, @lines);
-
- $value =~ s/__DOUBLE_QUOTE_STRING__/\"/g;
- }
-
- # Field with unquoted value
- elsif (/^FT\s+\/(\S+)=?(\S+)?/) {
- $key = $1;
- $value = $2 ? $2 : "_no_value";
- }
+=head2 _generic_seqfeature
- # Store the key and value
- $out->field->{$key} ||= [];
- push(@{$out->field->{$key}}, $value);
+ Title : _generic_seqfeature
+ Usage : &_generic_seqfeature($annseq,$fthelper)
+ Function: processes fthelper into a generic seqfeature
+ Returns : nothing (places a new seqfeature into annseq)
+ Args : Bio::AnnSeq,Bio::AnnSeqIO::FTHelper
- # Empty $_ to trigger read from $fh
- undef $_;
- }
- $$buffer = $_;
+=cut
- return $out;
-}
+sub _generic_seqfeature {
+ my ($annseq,$fth) = @_;
+ my ($sf);
+
+ # print "Converting from", $fth->key, "\n";
+ $sf = new Bio::SeqFeature::Generic;
+ if( $fth->loc =~ /join/ ) {
+ my $strand;
+ if ( $fth->loc =~ /complement/ ) {
+ $strand = -1;
+ } else {
+ $strand = 1;
+ }
+
+ $sf->strand($strand);
+ $sf->primary_tag($fth->key . "_span");
+ $sf->source_tag('EMBL');
+ $sf->has_tag("parent",1);
+ $sf->_parse->{'parent_homogenous'} = 1;
+
+ # we need to make sub features
+ my $loc = $fth->loc;
+ while( $loc =~ /(\d+)\.\.(\d+)/g ) {
+ my $start = $1;
+ my $end = $2;
+ #print "Processing $start-$end\n";
+ my $sub = new Bio::SeqFeature::Generic;
+ $sub->primary_tag($fth->key);
+ $sub->start($start);
+ $sub->end($end);
+ $sub->strand($strand);
+ $sub->source_tag('EMBL');
+ $sf->add_sub_SeqFeature($sub,'EXPAND');
+ }
+
+ } else {
+ my $lst;
+ my $len;
+
+ if( $fth->loc =~ /^(\d+)$/ ) {
+ $lst = $len = $1;
+ } else {
+ $fth->loc =~ /(\d+)\.\.(\d+)/ || do {
+ $annseq->throw("Weird location line [" . $fth->loc . "] in reading EMBL");
+ last;
+ };
+ $lst = $1;
+ $len = $2;
+ }
+
+ $sf->start($lst);
+ $sf->end($len);
+ $sf->source_tag('EMBL');
+ $sf->primary_tag($fth->key);
+ if( $fth->loc =~ /complement/ ) {
+ $sf->strand(-1);
+ } else {
+ $sf->strand(1);
+ }
+ }
+
+ #print "Adding B4 ", $sf->primary_tag , "\n";
+
+ foreach my $key ( keys %{$fth->field} ){
+ foreach my $value ( @{$fth->field->{$key}} ) {
+ $sf->add_tag_value($key,$value);
+ }
+ }
+
+
+
+ $annseq->add_SeqFeature($sf);
+}
=head2 _write_line_EMBL
@@ -943,7 +810,7 @@ sub _read_FTHelper_EMBL {
=cut
-sub _write_line_EMBL {
+sub _write_line_EMBL{
my ($fh,$pre1,$pre2,$line,$length) = @_;
$length || die "Miscalled write_line_EMBL without length. Programming error!";
@@ -979,33 +846,29 @@ sub _write_line_EMBL {
=cut
sub _write_line_EMBL_regex {
- my ($fh,$pre1,$pre2,$line,$regex,$length) = @_;
+ my ($fh,$pre1,$pre2,$line,$regex,$length) = @_;
- #print STDOUT "Going to print with $line!\n";
+
+ #print STDOUT "Going to print with $line!\n";
- $length || die "Miscalled write_line_EMBL without length. Programming error!";
+ $length || die "Miscalled write_line_EMBL without length. Programming error!";
- if( length $pre1 != length $pre2 ) {
- die "Programming error - cannot called write_line_EMBL_regex with different length pre1 and pre2 tags!";
- }
+ if( length $pre1 != length $pre2 ) {
+ die "Programming error - cannot called write_line_EMBL_regex with different length pre1 and pre2 tags!";
+ }
- my $subl = $length - (length $pre1) -1 ;
+ my $subl = $length - (length $pre1) -1 ;
+ my @lines;
- my( @lines );
- while($line =~ m/(.{1,$subl})($regex)/g) {
- push(@lines, $1.$2);
- }
- foreach (@lines) { s/\s+$//; }
- #chomp(@lines);
-
- # Print first line
- my $s = shift(@lines);
- print $fh "$pre1$s\n";
-
- # Print the rest
- foreach my $s ( @lines ) {
- print $fh "$pre2$s\n";
- }
+ while($line =~ m/(.{1,$subl})($regex)/g) {
+ push(@lines, $1.$2);
+ }
+
+ my $s = shift @lines;
+ print $fh "$pre1$s\n";
+ foreach my $s ( @lines ) {
+ print $fh "$pre2$s\n";
+ }
}
=head2 _post_sort
View
106 Bio/AnnSeqIO/FTHelper.pm
@@ -102,94 +102,8 @@ sub _initialize {
=cut
-=head2 _generic_seqfeature
-
- Title : _generic_seqfeature
- Usage : $fthelper->_generic_seqfeature($annseq)
- Function: processes fthelper into a generic seqfeature
- Returns : nothing (places a new seqfeature into annseq)
- Args : Bio::AnnSeq,Bio::AnnSeqIO::FTHelper
-
-
-=cut
-
-sub _generic_seqfeature {
- my ($fth,$annseq) = @_;
- my ($sf);
-
- # print "Converting from", $fth->key, "\n";
-
- $sf = new Bio::SeqFeature::Generic;
- if( $fth->loc =~ /join/ ) {
- my $strand;
- if ( $fth->loc =~ /complement/ ) {
- $strand = -1;
- } else {
- $strand = 1;
- }
-
- $sf->strand($strand);
- $sf->primary_tag($fth->key . "_span");
- $sf->source_tag('GenBank');
- $sf->has_tag("parent",1);
- $sf->_parse->{'parent_homogenous'} = 1;
-
- # we need to make sub features
- my $loc = $fth->loc;
- while( $loc =~ /\<?(\d+)\.\.\>?(\d+)/g ) {
- my $start = $1;
- my $end = $2;
- #print "Processing $start-$end\n";
- my $sub = new Bio::SeqFeature::Generic;
- $sub->primary_tag($fth->key);
- $sub->start($start);
- $sub->end($end);
- $sub->strand($strand);
- $sub->source_tag('GenBank');
- $sf->add_sub_SeqFeature($sub,'EXPAND');
- }
-
- } else {
- my $lst;
- my $len;
-
- if( $fth->loc =~ /^(\d+)$/ ) {
- $lst = $len = $1;
- } else {
- $fth->loc =~ /\<?(\d+)\.\.\>?(\d+)/ || do {
- $annseq->throw("Weird location line [" . $fth->loc . "] in reading GenBank");
- last;
- };
- $lst = $1;
- $len = $2;
- }
-
- $sf->start($lst);
- $sf->end($len);
- $sf->source_tag('GenBank');
- $sf->primary_tag($fth->key);
- if( $fth->loc =~ /complement/ ) {
- $sf->strand(-1);
- } else {
- $sf->strand(1);
- }
- }
-
- #print "Adding B4 ", $sf->primary_tag , "\n";
-
- foreach my $key ( keys %{$fth->field} ){
- foreach my $value ( @{$fth->field->{$key}} ) {
- $sf->add_tag_value($key,$value);
- }
- }
-
-
-
- $annseq->add_SeqFeature($sf);
-}
-
sub from_SeqFeature {
- my ($sf, $context_annseq) = @_;
+ my ($sf,$context_annseq,) = shift;
my @ret;
my $key;
@@ -199,22 +113,15 @@ sub from_SeqFeature {
# themselves to the EMBL/GenBank...
#
- if ($sf->can("to_FTHelper")) {
+ if( $sf->can("to_FTHelper") ) {
return $sf->to_FTHelper($context_annseq);
}
# build something sensible...
# if the parent homogenous flag is set, build things from the
# sub level
my $loc;
- my $ph_flag;
- if( $sf->can('_parse') ) {
- $ph_flag = $sf->_parse->{'parent_homogenous'} || 0;
- } else {
- $ph_flag = 0;
- }
-
- if( $ph_flag == 1 ) {
+ if( $sf->_parse->{'parent_homogenous'} == 1 ) {
$key = $sf->primary_tag();
$key =~ s/_span//g;
$loc = "join(";
@@ -235,7 +142,6 @@ sub from_SeqFeature {
} else {
$loc = $sf->start() . ".." . $sf->end();
$key = $sf->primary_tag();
-
if( $sf->strand == -1 ) {
$loc = "complement($loc)";
}
@@ -250,8 +156,8 @@ sub from_SeqFeature {
$fth->loc($loc);
$fth->key($key);
- $fth->field->{'note'} = [];
- #$sf->source_tag && do { push(@{$fth->field->{'note'}},"source=" . $sf->source_tag ); };
+ $fth->field->{'note'}= [];
+ $sf->source_tag && do { push(@{$fth->field->{'note'}},"source=" . $sf->source_tag ); };
$sf->score && do { push(@{$fth->field->{'note'}},"score=" . $sf->score ); };
$sf->frame && do { push(@{$fth->field->{'note'}},"frame=" . $sf->frame ); };
#$sf->strand && do { push(@{$fth->field->{'note'}},"strand=" . $sf->strand ); };
@@ -267,7 +173,7 @@ sub from_SeqFeature {
push(@ret,$fth);
- unless (@ret) {
+ if( $#ret == -1 ) {
$context_annseq->throw("Problem in processing seqfeature $sf - no fthelpers. Error!");
}
foreach my $ft ( @ret ) {
View
1,039 Bio/AnnSeqIO/GenBank.pm
@@ -1,1039 +0,0 @@
-
-
-# BioPerl module for Bio::SeqIO::GenBank
-#
-# Cared for by Elia Stupka <elia@ebi.ac.uk>
-#
-# Copyright Elia Stupka
-#
-# You may distribute this module under the same terms as perl itself
-
-# POD documentation - main docs before the code
-
-=head1 NAME
-
-Bio::AnnSeqIO::GenBank - GenBank sequence input/output stream
-
-=head1 SYNOPSIS
-
-It is probably best not to use this object directly, but
-rather go through the AnnSeqIO handler system. Go:
-
- $stream = Bio::AnnSeqIO->new(-file => $filename, -format => 'GenBank');
-
- while ( my $annseq = $stream->next_annseq() ) {
- # do something with $annseq
- }
-
-=head1 DESCRIPTION
-
-This object can transform Bio::AnnSeq objects to and from GenBank flat
-file databases.
-
-There is alot of flexibility here about how to dump things which I need
-to document fully.
-
-
-=head2 Optional functions
-
-=over
-
-=item _show_dna()
-
-(output only) shows the dna or not
-
-=item _post_sort
-
-(output only) provides a sorting func which is applied to the FTHelpers
-before printing
-
-=item _id_generation_func
-
-This is function which is called as
-
- print "ID ", $func($annseq), "\n";
-
-To generate the ID line. If it is not there, it generates a sensible ID
-line using a number of tools.
-
-
-=end
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this
-and other Bioperl modules. Send your comments and suggestions preferably
- to one of the Bioperl mailing lists.
-Your participation is much appreciated.
-
- vsns-bcd-perl@lists.uni-bielefeld.de - General discussion
- vsns-bcd-perl-guts@lists.uni-bielefeld.de - Technically-oriented discussion
- http://bio.perl.org/MailList.html - About the mailing lists
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
- the bugs and their resolution.
- Bug reports can be submitted via email or the web:
-
- bioperl-bugs@bio.perl.org
- http://bio.perl.org/bioperl-bugs/
-
-=head1 AUTHOR - Elia Stupka
-
-Email elia@ebi.ac.uk
-
-Describe contact details here
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-# Let the code begin...
-
-
-package Bio::AnnSeqIO::GenBank;
-use vars qw($AUTOLOAD @ISA);
-use strict;
-use Bio::AnnSeq;
-use Bio::AnnSeqIO::FTHelper;
-use Bio::SeqFeature::Generic;
-use Bio::Species;
-use Bio::AnnSeqIO::StreamI;
-
-# Object preamble - inheriets from Bio::Root::Object
-
-use Bio::Root::Object;
-use FileHandle;
-
-@ISA = qw(Bio::Root::Object Bio::AnnSeqIO::StreamI);
-# new() is inherited from Bio::Root::Object
-
-# _initialize is where the heavy stuff will happen when new is called
-
-sub _initialize {
- my($self,@args) = @_;
-
- my $make = $self->SUPER::_initialize;
-
- my ($file,$fh) = $self->_rearrange([qw(
- FILE
- FH
- )],
- @args,
- );
- if( $file && $fh ) {
- $self->throw("Providing both a file and a filehandle for reading from - oly one please!");
- }
-
- if( !$file && !$fh ) {
- $self->throw("Neither a file (-file) nor a filehandle (-fh) provided to GenBank opening");
- }
-
- if( $file ) {
- $fh = new FileHandle;
- $fh->open($file) || $self->throw("Could not open $file for GenBank stream reading $!");
- }
-
- # hash for functions for decoding keys.
- $self->{'_func_ftunit_hash'} = {};
- $self->_filehandle($fh);
- $self->_show_dna(1); # sets this to one by default. People can change it
-
-# set stuff in self from @args
- return $make; # success - we hope!
-}
-
-
-=head2 next_annseq
-
- Title : next_annseq
- Usage : $seq = $stream->next_annseq()
- Function: returns the next sequence in the stream
- Returns : Bio::AnnSeq object
- Args :
-
-
-=cut
-
-sub next_annseq{
- my ($self,@args) = @_;
- my ($seq,$fh,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,$date);
- my $annseq = Bio::AnnSeq->new();
-
- $fh = $self->_filehandle();
-
- if( eof $fh) {
- return undef; # no throws - end of file
- }
-
- $line = <$fh>;
- if( $line =~ /^\s+$/ ) {
- while( <$fh> ) {
- /\S/ && last;
- }
- $line = $_;
- }
-
- if( !defined $line ) {
- return undef; # end of file
- }
- $line =~ /^LOCUS\s+\S+/ || $self->throw("GenBank stream with no LOCUS. Not GenBank in my book. Got $line");
- $line =~ /^LOCUS\s+(\S+)\s+\S+\s+bp\s+(\S+)\s+(\S+)\s+(\S+)/;
-
- $name = $1;
-
- $mol=$2;
- if ($mol) {
- $annseq->molecule($mol);
- }
-
- $div=$3;
- if ($div) {
- $annseq->division($div);
- }
-
- $date=$4;
- if ($date) {
- $annseq->add_date($date);
- }
-
- my $buffer = $line;
-
- BEFORE_FEATURE_TABLE :
- until( eof($fh) ) {
- $_ = $buffer;
-
- # Exit at start of Feature table
- last if /^FEATURES/;
-
- # Description line(s)
- if (/^DEFINITION\s+(\S.*\S)/) {
- $desc .= $desc ? " $1" : $1;
- $desc .= " ";
- while ( <$fh> ) {
- /^\s+(.*)/ && do { $desc .= $1; next;};
- last;
- }
- }
-
- if( /^ACCESSION\s+(\S+)/ ) {
- $acc = $1;
- $annseq->accession($acc);
- }
-
- #Version number
- if( /^VERSION\s+(\S+)/ ) {
- my $sv = $1;
- $annseq->sv($sv);
- }
-
- #Keywords
- if( /^KEYWORDS\s+(.*)/ ) {
- my $keywords = $1;
- $keywords =~ s/\;//g;
- $annseq->keywords($keywords);
- }
-
- # Organism name and phylogenetic information
- if (/^SOURCE/) {
- my $species = _read_GenBank_Species(\$buffer, $fh);
- $annseq->species( $species );
- next;
- }
-
- #References
- if (/^REFERENCE/) {
- my @refs = &_read_GenBank_References(\$buffer,$fh);
- $annseq->annotation->add_Reference(@refs);
- next;
- }
-
- #Comments
- if (/^COMMENT\s+(.*)/) {
- my $comment = $1;
- while (<$fh>) {
- if (/^FEATURES/) {
- $comment =~ s/\n/ /g;
- $comment =~ s/ +/ /g;
- my $commobj = Bio::Annotation::Comment->new();
- $commobj->text($comment);
- $annseq->annotation->add_Comment($commobj);
- last BEFORE_FEATURE_TABLE;
- }
- $comment .= $_;
- }
- }
-
- # Get next line.
- $buffer = <$fh>;
- }
-
- # need to read the first line of the feature table
- $_ = <$fh>;
-
- $buffer = $_;
-
- FEATURE_TABLE :
- until( eof($fh) ) {
-
- last if /^BASE/;
- my $ftunit = &_read_FTHelper_GenBank($fh,\$buffer);
-
- # process ftunit
- $ftunit->_generic_seqfeature($annseq);
- }
- $seqc = "";
- while (<$fh>) {
- last if /^ORIGIN/;
- }
- while( <$fh> ) {
- /^\/\// && last;
- $_ = uc($_);
- s/[^A-Za-z]//g;
- $seqc .= $_;
- }
-
- $seq = Bio::Seq->new(-seq => $seqc , -id => $name, -desc => $desc);
- $annseq->seq($seq);
- return $annseq;
-}
-
-=head2 write_annseq
-
- Title : write_annseq
- Usage : $stream->write_annseq($seq)
- Function: writes the $seq object (must be annseq) to the stream
- Returns : 1 for success and 0 for error
- Args : Bio::AnnSeq
-
-
-=cut
-
-sub write_annseq {
- my ($self,$annseq) = @_;
-
- if( !defined $annseq ) {
- $self->throw("Attempting to write with no annseq!");
- }
-
- if( ! ref $annseq || ! $annseq->isa('Bio::AnnSeqI') ) {
- $self->warn(" $annseq is not a AnnSeqI compliant module. Attempting to dump, but may fail!");
- }
-
- my $fh = $self->_filehandle();
- my $seq = $annseq->seq();
- my $i;
- my $str = $seq->seq;
-
- my ($div, $mol);
- my $len = $seq->seq_len();
-
-
- if ( $annseq->can('division') ) {
- $div=$annseq->division;
- }
- if( !defined $div || ! $div ) { $div = 'UNK'; }
-
- if( !$annseq->can('molecule') || ($mol = $annseq->molecule()) == undef ) {
- $mol = 'DNA';
- }
- else {
- $mol = $annseq->molecule;
- }
-
-
- my $temp_line;
- if( $self->_id_generation_func ) {
- $temp_line = &{$self->_id_generation_func}($annseq);
- } else {
- my @dates = $annseq->each_date();
- my $date = shift @dates;
- $temp_line = sprintf ("%-12s%-10s%10s bp%8s%5s %3s ", 'LOCUS',$seq->id(),$len,$mol,$div,$date);
- }
-
- print $fh "$temp_line\n";
- _write_line_GenBank_regex($fh,"DEFINITION "," ",$seq->desc(),"\\s\+\|\$",80);
-
- # if there, write the accession line
-
- if( $self->_ac_generation_func ) {
- $temp_line = &{$self->_ac_generation_func}($annseq);
- print $fh "ACCESSION $temp_line\n";
- } else {
- if( $annseq->can('accession') ) {
- print $fh "ACCESSION ",$annseq->accession,"\n";
- }
- # otherwise - cannot print <sigh>
- }
-
- # if there, write the version line
-
- if( $self->_sv_generation_func ) {
- $temp_line = &{$self->_sv_generation_func}($annseq);
- if( $temp_line ) {
- print $fh "VERSION $temp_line\n";
- }
- } else {
- if( $annseq->can('sv') ) {
- print $fh "VERSION ",$annseq->sv,"\n";
- }
- }
-
- # if there, write the keywords line
-
- if( $self->_kw_generation_func ) {
- $temp_line = &{$self->_kw_generation_func}($annseq);
- print $fh "KEYWORDS $temp_line\n";
- } else {
- if( $annseq->can('keywords') ) {
- print $fh "KEYWORDS ",$annseq->keywords,"\n";
- }
- }
-
-
- # Organism lines
- if (my $spec = $annseq->species) {
- my($species, $genus, @class) = $spec->classification();
- my $sub_species = $spec->sub_species();
- my $OS = "$genus $species $sub_species";
- print $fh "SOURCE $OS\n";
- print $fh " ORGANISM $OS\n";
- my $OC = join (';', reverse(@class));
- $OC =~ s/\n//g;
- $OC =~ s/\;/\; /g;
- $OC = "$OC; $genus.";
- _write_line_GenBank_regex($fh," "," ",$OC,"\\s\+\|\$",80);
- }
-
- # Reference lines
- my $count = 1;
- foreach my $ref ( $annseq->annotation->each_Reference() ) {
- $temp_line = sprintf ("REFERENCE $count (bases %d to %d)",$ref->start,$ref->end);
- print $fh "$temp_line\n";
- &_write_line_GenBank_regex($fh," AUTHORS "," ",$ref->authors,"\\s\+\|\$",80);
- &_write_line_GenBank_regex($fh," TITLE "," ",$ref->title,"\\s\+\|\$",80);
- &_write_line_GenBank_regex($fh," JOURNAL "," ",$ref->location,"\\s\+\|\$",80);
- if ($ref->comment) {
- &_write_line_GenBank_regex($fh," REMARK "," ",$ref->comment,"\\s\+\|\$",80);
- }
- $count++;
- }
- # Comment lines
-
- foreach my $comment ( $annseq->annotation->each_Comment() ) {
- _write_line_GenBank_regex($fh,"COMMENT "," ",$comment->text,"\\s\+\|\$",80);
- }
- print $fh "FEATURES Location/Qualifiers\n";
-
- if( defined $self->_post_sort ) {
- # we need to read things into an array. Process. Sort them. Print 'em
-
- my $post_sort_func = $self->_post_sort();
- my @fth;
-
- foreach my $sf ( $annseq->top_SeqFeatures ) {
- push(@fth,Bio::AnnSeqIO::FTHelper::from_SeqFeature($sf,$annseq));
- }
-
- @fth = sort { &$post_sort_func($a,$b) } @fth;
-
- foreach my $fth ( @fth ) {
- &_print_GenBank_FTHelper($fth,$fh);
- }
- } else {
- # not post sorted. And so we can print as we get them.
- # lower memory load...
-
- foreach my $sf ( $annseq->top_SeqFeatures ) {
- my @fth = Bio::AnnSeqIO::FTHelper::from_SeqFeature($sf,$annseq);
- foreach my $fth ( @fth ) {
- if( ! $fth->isa('Bio::AnnSeqIO::FTHelper') ) {
- $sf->throw("Cannot process FTHelper... $fth");
- }
-
- &_print_GenBank_FTHelper($fth,$fh);
- }
- }
- }
-
- if( $self->_show_dna() == 0 ) {
- return;
- }
-
-# finished printing features.
-
- $str =~ tr/A-Z/a-z/;
-
-# Count each nucleotide
- my $alen = $str =~ tr/a/a/;
- my $clen = $str =~ tr/c/c/;
- my $glen = $str =~ tr/g/g/;
- my $tlen = $str =~ tr/t/t/;
-
- my $olen = $len - ($alen + $tlen + $clen + $glen);
- if( $olen < 0 ) {
- $self->warn("Weird. More atgc than bases. Problem!");
- }
- printf $fh ("BASE COUNT %8s a %6s c %6s g %6s t\n",$alen,$clen,$glen,$tlen);
- printf $fh "ORIGIN\n";
- my $di;
- for ($i = 0; $i < length($str); $i += 10) {
-
- $di=$i+11;
-
- #first line
- if ($i==0) {
- print $fh sprintf("%9d ",1);
- }
- #print sequence, spaced by 10
- print $fh substr($str,$i,10), " ";
-
- #break line and print number at beginning of next line
- if(($i+10)%60 == 0) {
- print $fh "\n";
- print $fh sprintf("%9d ",$di);
- }
- }
-
-
- print $fh "\n//\n";
- return 1;
-}
-
-=head2 _print_GenBank_FTHelper
-
- Title : _print_GenBank_FTHelper
- Usage :
- Function:
- Example :
- Returns :
- Args :
-
-
-=cut
-
-sub _print_GenBank_FTHelper {
- my ($fth,$fh,$always_quote) = @_;
-
- if( ! ref $fth || ! $fth->isa('Bio::AnnSeqIO::FTHelper') ) {
- $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
- }
- &_write_line_GenBank_regex($fh,sprintf(" %-16s",$fth->key)," ",$fth->loc,"\,\|\$",80);
-
- if( !defined $always_quote) { $always_quote = 0; }
-
- foreach my $tag ( keys %{$fth->field} ) {
- foreach my $value ( @{$fth->field->{$tag}} ) {
- $value =~ s/\"/\"\"/g;
- if ($value eq "_no_value") {
- &_write_line_GenBank_regex($fh," "," ","/$tag","\.\|\$",80);
- }
- elsif( $always_quote == 1 || $value !~ /^\d+$/ ) {
- &_write_line_GenBank_regex($fh," "," ","/$tag=\"$value\"","\.\|\$",80);
- } else {
- &_write_line_GenBank_regex($fh," "," ","/$tag=$value","\.\|\$",80);
- }
- }
- }
-
-}
-
-
-=head2 _read_GenBank_References
-
- Title : _read_GenBank_References
- Usage :
- Function: Reads references from GenBank format. Internal function really
- Example :
- Returns :
- Args :
-
-
-=cut
-
-sub _read_GenBank_References{
- my ($buffer,$fh) = @_;
- my (@refs);
- my $previous;
-
- # assumme things are starting with RN
-
- if( $$buffer !~ /^REFERENCE/ ) {
- warn("Not parsing line '$$buffer' which maybe important");
- }
- my $title;
- my $loc;
- my $au;
- my $b1;
- my $b2;
- my $com;
-
- if ($$buffer =~ /^REFERENCE\s+\d+\s+\(bases (\d+) to (\d+)/){
- $b1=$1;
- $b2=$2;
- }
- while( <$fh> ) {
- if (/^ AUTHORS\s+(.*)/) {
- $au .= $1;
- while ( <$fh> ) {
- /^ TITLE/ && last;
- /^\s+(.*)/ && do { $au .= $1; $au =~ s/\,(\S)/ $1/g;$au .= " ";next;};
- }
- }
- if (/^ TITLE\s+(.*)/) {
- $title .= $1;
- while ( <$fh> ) {
- /^ JOURNAL/ && last;
- /^\s+(.*)/ && do { $title .= $1;$title .= " ";next;};
- }
- }
- if (/^ JOURNAL\s+(.*)/) {
- $loc .= $1;
- while ( <$fh> ) {
- /^ REMARK/ && last;
- /^\s+(.*)/ && do { $loc .= $1;$loc .= " ";next;};
- last;
- }
- }
-
- if (/^ REMARK\s+(.*)/) {
- $com .= $1;
- while ( <$fh> ) {
- /^\s+(.*)/ && do { $com .= $1;$com .= " ";next;};
- last;
- }
- }
-
- /^REFERENCE/ && do {
- # put away current reference
- my $ref = new Bio::Annotation::Reference;
- $au =~ s/;\s*$//g;
- $title =~ s/;\s*$//g;
- $ref->start($b1);
- $ref->end($b2);
- $ref->authors($au);
- $ref->title($title);
- $ref->location($loc);
- $ref->comment($com);
- push(@refs,$ref);
- $au = "";
- $title = "";
- $loc = "";
- $com = "";
-
- # return to main reference loop
- next;
- };
-
- /^FEATURES||^COMMENT/ && last;
- }
-
- # put away last reference
-
- my $ref = new Bio::Annotation::Reference;
- $au =~ s/;\s*$//g;
- $title =~ s/;\s*$//g;
-
- $ref->start($b1);
- $ref->end($b2);
- $ref->authors($au);
- $ref->title($title);
- $ref->location($loc);
- $ref->comment($com);
- push(@refs,$ref);
- $$buffer = $_;
-
- return @refs;
-}
-
-
-=head2 _read_GenBank_Species
-
- Title : _read_GenBank_Species
- Usage :
- Function: Reads the GenBank Organism species and classification
- lines.
- Example :
- Returns : A Bio::Species object
- Args :
-
-=cut
-
-sub _read_GenBank_Species {
- my( $buffer, $fh ) = @_;
-
- $_ = $$buffer;
-
- my( $sub_species, $species, $genus, $common, @class );
- while (defined( $_ ||= <$fh> )) {
-
- if (/^SOURCE\s+(.*)/) {
- $common = $1;
- $common =~ s/\.//;
- }
- if (/^ ORGANISM\s+(\S+)\s+(\S+)\s+?(\S+)?/) {
- $genus = $1 if $1;
- $genus = "None" unless $1;
- if ($2) {
- $species = $2;
- }
- else {
- $species = "sp.";
- }
- $sub_species = $3 if $3;
- }
- elsif ($_ !~ /^REFERENCE/) {
- $_ =~ s/\s+//;
- $_ =~ s/^SOURCE\S+//;
- push(@class, split /[\;]+/, $_);
- }
- else {
- last;
- }
-
- $_ = undef; # Empty $_ to trigger read of next line
- }
-
- $$buffer = $_;
-
- # Don't make a species object if it's "Unknown" or "None"
- return if $genus =~ /^(Unknown|None)$/i;
-
- # Bio::Species array needs array in Species -> Kingdom direction
- if ($sub_species) {
- push( @class, $genus, $species, $sub_species );
- }
- else {
- push( @class, $genus, $species, "");
- }
- @class = reverse @class;
-
- my $make = Bio::Species->new();
- $make->classification( @class );
- $make->common_name( $common ) if $common;
- return $make;
-}
-
-=head2 _filehandle
-
- Title : _filehandle
- Usage : $obj->_filehandle($newval)
- Function:
- Example :
- Returns : value of _filehandle
- Args : newvalue (optional)
-
-
-=cut
-
-sub _filehandle{
- my ($obj,$value) = @_;
- if( defined $value) {
- $obj->{'_filehandle'} = $value;
- }
- return $obj->{'_filehandle'};
-
-}
-
-=head2 _read_FTHelper_GenBank
-
- Title : _read_FTHelper_GenBank
- Usage : &_read_FTHelper_GenBank($fh,$buffer)
- Function: reads the next FT key line
- Example :
- Returns : Bio::AnnSeqIO::FTHelper object
- Args : filehandle and reference to a scalar
-
-
-=cut
-
-sub _read_FTHelper_GenBank {
- my ($fh,$buffer) = @_;
- my ($key,$loc,$out);
-
- $_ = $$buffer;
- if( $$buffer =~ /^\s+(\S+)\s+(\S+)/ ) {
- $key = $1;
- $loc = $2;
- if ($$buffer !~ /^\s+\//) {
- while ( <$fh> ) {
- # read location line until feature, qualifier or end of feature table
- /^\s+\S+\s+\S+/ && last; # trigger for next feature
- /\s+\// && last; # trigger for qualifier
- /^BASE/ && last; # trigger for end of feature table
- /\s+\>?(\S+)/ && do {$loc .= $1;};
-
- }
- }
- }
-
- $out = new Bio::AnnSeqIO::FTHelper();
- $out->key($key);
- $out->loc($loc);
-
- # Now read in other fields
- # Loop reads $_ when defined (i.e. only in first loop), then read $fh, until eof
- while ( defined($_ ||= <$fh>) ) {
-
- # Exit loop on new primary key or end of features
- (/^ \S+/||/^BASE/) && last;
-
- # Field on one line
- if (/^\s+\/(\S+)=\"(.*)\"/) {
- my $key = $1;
- my $value = $2;
- if(! defined $out->field->{$key} ) {
- $out->field->{$key} = [];
- }
- $value =~ s/\"\"/\"/g;
-
- push(@{$out->field->{$key}},$value);
- }
- # Field on on multilines:
- elsif (/^\s+\/(\S+)=\"(.*)/) {
- my $key = $1;
- my $value = $2;
- while ( <$fh> ) {
- # first subs out the ""
- s/\"\"/__DOUBLE_QUOTE_STRING__/g;
- /\s+(.*)\"/ && do { $value .= $1; last; };
- /\s+(.*)/ && do {$value .= $1; };
- }
- $value =~ s/__DOUBLE_QUOTE_STRING__/\"/g;
-
- if(! defined $out->field->{$key} ) {
- $out->field->{$key} = [];
- }
- push(@{$out->field->{$key}},$value);
- }
- # Field with no quoted value
- elsif (/^\s+\/(\S+)=?(\S+)?/) {
- my $key = $1;
- my $value = $2 if $2;
- $value = "_no_value" unless $2;
- if(! defined $out->field->{$key} ) {
- $out->field->{$key} = [];
- }
- push(@{$out->field->{$key}},$value);
- }
-
- # Empty $_ to trigger read from $fh
- undef $_;
- }
-
- $$buffer = $_;
- return $out;
-}
-
-=head2 _write_line_GenBank
-
- Title : _write_line_GenBank
- Usage :
- Function: internal function
- Example :
- Returns :
- Args :
-
-
-=cut
-
-sub _write_line_GenBank{
- my ($fh,$pre1,$pre2,$line,$length) = @_;
-
- $length || die "Miscalled write_line_GenBank without length. Programming error!";
- my $subl = $length - length $pre2;
- my $linel = length $line;
- my $i;
-
- my $sub = substr($line,0,$length - length $pre1);
-
- print $fh "$pre1$sub\n";
-
- for($i= ($length - length $pre1);$i < $linel;) {
- $sub = substr($line,$i,($subl));
- print $fh "$pre2$sub\n";
- $i += $subl;
- }