Skip to content
Browse files

Merge branch 'master' into bug_3390

  • Loading branch information...
2 parents d4a13cd + 09f5ae8 commit f0d981132b79fc2363a7364cf43e62be73f091f7 @fangly fangly committed Nov 17, 2012
View
31 Bio/DB/Fasta.pm
@@ -307,7 +307,7 @@ sub subseq {
Title : length
Usage : my $length = $qualdb->length($id);
- Function: Get the number of residies in the indicated sequence.
+ Function: Get the number of residues in the indicated sequence.
Returns : Number
Args : ID of entry
@@ -384,17 +384,18 @@ sub subseq {
}
sub trunc {
+ # Override Bio::PrimarySeqI trunc() method. This way, we create an object
+ # that does not store the sequence in memory.
my ($self, $start, $stop) = @_;
$self->throw("Stop cannot be smaller than start") if $stop < $start;
- return $self->{start} <= $self->{stop} ?
- $self->new( $self->{db},
- $self->{id},
- $self->{start}+$start-1,
- $self->{start}+$stop-1, ) :
- $self->new( $self->{db},
- $self->{id},
- $self->{start}-($start-1),
- $self->{start}-($stop-1) );
+ if ($self->{start} <= $self->{stop}) {
+ $start = $self->{start}+$start-1;
+ $stop = $self->{start}+$stop-1;
+ } else {
+ $start = $self->{start}-($start-1);
+ $stop = $self->{start}-($stop-1);
+ }
+ return $self->new( $self->{db}, $self->{id}, $start, $stop );
}
sub is_circular {
@@ -413,6 +414,8 @@ sub accession_number {
}
sub primary_id {
+ # Following Bio::PrimarySeqI, since this sequence has no accession number,
+ # its primary_id should be a stringified memory location.
my $self = shift;
return overload::StrVal($self);
}
@@ -427,19 +430,23 @@ sub alphabet {
}
sub revcom {
+ # Override Bio::PrimarySeqI revcom() with optimized method.
my $self = shift;
return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
}
sub length {
+ # Get length from sequence location, not the sequence string (too expensive)
my $self = shift;
- return CORE::length($self->seq);
+ return $self->{start} < $self->{stop} ?
+ $self->{stop} - $self->{start} + 1 :
+ $self->{start} - $self->{stop} + 1 ;
}
sub description {
my $self = shift;
my $header = $self->{'db'}->header($self->{id});
- # remove the id from the header
+ # Remove the ID from the header
return (split(/\s+/, $header, 2))[1];
}
*desc = \&description;
View
26 Bio/DB/Qual.pm
@@ -431,23 +431,24 @@ sub subqual {
sub trunc {
+ # Override Bio::Seq::QualI trunc() method. This way, we create an object
+ # that does not store the quality array in memory.
my ($self, $start, $stop) = @_;
$self->throw(
"$stop is smaller than $stop. If you want to truncate and reverse ".
"complement, you must call trunc followed by revcom."
) if $start > $stop;
- my ($left, $right);
if ($self->{start} <= $self->{stop}) {
- $left = $self->{start}+$start-1;
- $right = $self->{start}+$stop-1;
+ $start = $self->{start}+$start-1;
+ $stop = $self->{start}+$stop-1;
} else {
- $left = $self->{start}-($start-1);
- $right = $self->{start}-($stop-1);
+ $start = $self->{start}-($start-1);
+ $stop = $self->{start}-($stop-1);
}
my $obj = $self->new( -database => $self->{db},
-id => $self->{id},
- -start => $left,
- -stop => $right
+ -start => $start,
+ -stop => $stop
);
return $obj;
}
@@ -464,11 +465,18 @@ sub primary_id {
return overload::StrVal($self);
}
+sub revcom {
+ # Override Bio::QualI revcom() with optimized method.
+ my $self = shift;
+ return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
+}
sub length {
- # number of quality scores
+ # Get length from quality location, not the quality array (too expensive)
my $self = shift;
- return scalar(@{$self->qual});
+ return $self->{start} < $self->{stop} ?
+ $self->{stop} - $self->{start} + 1 :
+ $self->{start} - $self->{stop} + 1 ;
}
View
175 Bio/LocatableSeq.pm
@@ -97,8 +97,7 @@ methods. Internal methods are usually preceded with a _
=cut
-#'
-# Let the code begin...
+
package Bio::LocatableSeq;
use strict;
@@ -120,6 +119,7 @@ $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS
use base qw(Bio::PrimarySeq Bio::RangeI);
+
sub new {
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
@@ -146,6 +146,7 @@ sub new {
return $self; # success - we hope!
}
+
=head2 start
Title : start
@@ -157,7 +158,7 @@ sub new {
=cut
-sub start{
+sub start {
my $self = shift;
if( @_ ) {
my $value = shift;
@@ -168,6 +169,7 @@ sub start{
return;
}
+
=head2 end
Title : end
@@ -214,6 +216,7 @@ sub end {
}
}
+
# changed 08.10.26 to return ungapped length, not the calculated end
# of the sequence
sub _ungapped_len {
@@ -235,6 +238,7 @@ sub _ungapped_len {
# return CORE::length($string);
#}
+
=head2 strand
Title : strand
@@ -245,7 +249,7 @@ sub _ungapped_len {
=cut
-sub strand{
+sub strand {
my $self = shift;
if( @_ ) {
my $value = shift;
@@ -254,6 +258,7 @@ sub strand{
return $self->{'strand'};
}
+
=head2 mapping
Title : mapping
@@ -282,6 +287,7 @@ sub mapping {
return @{ $self->{'_mapping'} };
}
+
=head2 frameshifts
Title : frameshifts
@@ -307,6 +313,7 @@ sub frameshifts {
return %{$self->{_frameshifts}} : return ();
}
+
=head2 get_nse
Title : get_nse
@@ -318,7 +325,7 @@ sub frameshifts {
=cut
-sub get_nse{
+sub get_nse {
my ($self,$char1,$char2) = @_;
$char1 ||= "/";
@@ -346,6 +353,7 @@ sub get_nse{
return join('',$id, $v, $char1, $st, $char2, $end);
}
+
=head2 force_nse
Title : force_nse
@@ -367,6 +375,7 @@ sub force_nse {
return $self->{'_force_nse'};
}
+
=head2 num_gaps
Title : num_gaps
@@ -440,53 +449,54 @@ sub column_from_residue_number {
unless $resnumber =~ /^\d+$/ and $resnumber > 0;
if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
- my @chunks;
- my $column_incr;
- my $current_column;
- my $current_residue = $self->start - 1;
- my $seq = $self->seq;
- my $strand = $self->strand || 0;
-
- if ($strand == -1) {
-# @chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
- @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
- $column_incr = -1;
- $current_column = (CORE::length $seq) + 1;
- }
- else {
-# @chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
- @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
- $column_incr = 1;
- $current_column = 0;
- }
-
- while (my $chunk = shift @chunks) {
-# if ($chunk =~ m|^[\.\-]|o) {
- if ($chunk =~ m|^[$GAP_SYMBOLS]|o) {
- $current_column += $column_incr * CORE::length($chunk);
- }
- else {
- if ($current_residue + CORE::length($chunk) < $resnumber) {
- $current_column += $column_incr * CORE::length($chunk);
- $current_residue += CORE::length($chunk);
- }
- else {
- if ($strand == -1) {
- $current_column -= $resnumber - $current_residue;
- }
- else {
- $current_column += $resnumber - $current_residue;
- }
- return $current_column;
- }
- }
- }
+ my @chunks;
+ my $column_incr;
+ my $current_column;
+ my $current_residue = $self->start - 1;
+ my $seq = $self->seq;
+ my $strand = $self->strand || 0;
+
+ if ($strand == -1) {
+ #@chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
+ @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
+ $column_incr = -1;
+ $current_column = (CORE::length $seq) + 1;
+ }
+ else {
+ #@chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
+ @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
+ $column_incr = 1;
+ $current_column = 0;
+ }
+
+ while (my $chunk = shift @chunks) {
+ #if ($chunk =~ m|^[\.\-]|o) {
+ if ($chunk =~ m|^[$GAP_SYMBOLS]|o) {
+ $current_column += $column_incr * CORE::length($chunk);
+ }
+ else {
+ if ($current_residue + CORE::length($chunk) < $resnumber) {
+ $current_column += $column_incr * CORE::length($chunk);
+ $current_residue += CORE::length($chunk);
+ }
+ else {
+ if ($strand == -1) {
+ $current_column -= $resnumber - $current_residue;
+ }
+ else {
+ $current_column += $resnumber - $current_residue;
+ }
+ return $current_column;
+ }
+ }
+ }
}
$self->throw("Could not find residue number $resnumber");
}
+
=head2 location_from_column
Title : location_from_column
@@ -567,6 +577,7 @@ sub location_from_column {
return $loc;
}
+
=head2 revcom
Title : revcom
@@ -596,18 +607,16 @@ sub revcom {
return $new;
}
+
=head2 trunc
Title : trunc
Usage : $subseq = $myseq->trunc(10,100);
Function: Provides a truncation of a sequence,
-
- Example :
Returns : a fresh Bio::PrimarySeqI implementing object
Args : Two integers denoting first and last columns of the
sequence to be included into sub-sequence.
-
=cut
sub trunc {
@@ -625,45 +634,44 @@ sub trunc {
return $new;
}
+
=head2 validate_seq
Title : validate_seq
- Usage : if(! $seq->validate_seq($seq_str) ) {
+ Usage : if(! $seqobj->validate_seq($seq_str) ) {
print "sequence $seq_str is not valid for an object of
- alphabet ",$seq->alphabet, "\n";
- }
- Function: Validates a given sequence string. A validating sequence string
- must be accepted by seq(). A string that does not validate will
- lead to an exception if passed to seq().
-
- The implementation provided here does not take alphabet() into
- account. Allowed are all letters (A-Z), numbers [0-9]
- and common symbols used for gaps, stop codons, unknown residues,
- and frameshifts, including '-','.','*','?','=',and '~'.
-
- Example :
- Returns : 1 if the supplied sequence string is valid for the object, and
- 0 otherwise.
- Args : The sequence string to be validated.
+ alphabet ",$seqobj->alphabet, "\n";
+ }
+ Function: Test that the given sequence is valid, i.e. contains only valid
+ characters. The allowed characters are all letters (A-Z) and '-','.',
+ '*','?','=' and '~'. Spaces are not valid. Note that this
+ implementation does not take alphabet() into account.
+ Returns : 1 if the supplied sequence string is valid, 0 otherwise.
+ Args : - Sequence string to be validated
+ - Boolean to throw an error if the sequence is invalid
=cut
sub validate_seq {
- my ($self,$seqstr) = @_;
- if( ! defined $seqstr ){ $seqstr = $self->seq(); }
- return 0 unless( defined $seqstr);
-
- if((CORE::length($seqstr) > 0) &&
- ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
- $self->warn("seq doesn't validate with [$MATCHPATTERN], mismatch is " .
- join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
+ my ($self, $seqstr, $throw) = @_;
+ $seqstr = '' if not defined $seqstr;
+ $throw = 0 if not defined $throw ; # 0 for backward compatiblity
+ if ( (CORE::length $seqstr > 0 ) &&
+ ($seqstr !~ /^([$MATCHPATTERN]+)$/) ) {
+ if ($throw) {
+ $self->throw("Failed validation of sequence '".(defined($self->id) ||
+ '[unidentified sequence]')."'. Invalid characters were: " .
+ join('',($seqstr =~ /([^$MATCHPATTERN]+)/g)));
+ }
return 0;
}
return 1;
}
+
################## DEPRECATED METHODS ##################
+
=head2 no_gap
Title : no_gaps
@@ -682,13 +690,14 @@ sub validate_seq {
=cut
sub no_gaps {
- my $self = shift;
- $self->deprecated(-warn_version => 1.0069,
- -throw_version => 1.0075,
- -message => 'Use of method no_gaps() is deprecated, use num_gaps() instead');
- $self->num_gaps(@_);
+ my $self = shift;
+ $self->deprecated( -warn_version => 1.0069,
+ -throw_version => 1.0075,
+ -message => 'Use of method no_gaps() is deprecated, use num_gaps() instead' );
+ return $self->num_gaps(@_);
}
+
=head2 no_sequences
Title : no_sequences
@@ -701,11 +710,11 @@ sub no_gaps {
=cut
sub no_sequences {
- my $self = shift;
- $self->deprecated(-warn_version => 1.0069,
- -throw_version => 1.0075,
- -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
- $self->num_sequences(@_);
+ my $self = shift;
+ $self->deprecated( -warn_version => 1.0069,
+ -throw_version => 1.0075,
+ -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead' );
+ return $self->num_sequences(@_);
}
1;
View
599 Bio/PrimarySeq.pm
@@ -13,7 +13,7 @@
=head1 NAME
-Bio::PrimarySeq - Bioperl lightweight Sequence Object
+Bio::PrimarySeq - Bioperl lightweight sequence object
=head1 SYNOPSIS
@@ -26,18 +26,22 @@ Bio::PrimarySeq - Bioperl lightweight Sequence Object
# make from memory
- $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
- -id => 'GeneFragment-12',
- -accession_number => 'X78121',
- -alphabet => 'dna',
- -is_circular => 1 );
+ $seqobj = Bio::PrimarySeq->new (
+ -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
+ -id => 'GeneFragment-12',
+ -accession_number => 'X78121',
+ -alphabet => 'dna',
+ -is_circular => 1,
+ );
print "Sequence ", $seqobj->id(), " with accession ",
$seqobj->accession_number, "\n";
# read from file
- $inputstream = Bio::SeqIO->new(-file => "myseq.fa",
- -format => 'Fasta');
+ $inputstream = Bio::SeqIO->new(
+ -file => "myseq.fa",
+ -format => 'Fasta',
+ );
$seqobj = $inputstream->next_seq();
print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
@@ -51,7 +55,7 @@ Bio::PrimarySeq - Bioperl lightweight Sequence Object
=head1 DESCRIPTION
-PrimarySeq is a lightweight Sequence object, storing the sequence, its
+PrimarySeq is a lightweight sequence object, storing the sequence, its
name, a computer-useful unique name, and other fundamental attributes.
It does not contain sequence features or other information. To have a
sequence with sequence features you should use the Seq object which uses
@@ -118,33 +122,29 @@ methods. Internal methods are usually preceded with a _
=cut
-# Let the code begin...
-
package Bio::PrimarySeq;
-use vars qw($MATCHPATTERN $GAP_SYMBOLS);
+
use strict;
-$MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
-$GAP_SYMBOLS = '-~';
+our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
+our $GAP_SYMBOLS = '-~';
use base qw(Bio::Root::Root Bio::PrimarySeqI
Bio::IdentifiableI Bio::DescribableI);
-#
-# setup the allowed values for alphabet()
-#
+# Setup the allowed values for alphabet()
my %valid_type = map {$_, 1} qw( dna rna protein );
+
=head2 new
Title : new
- Usage : $seq = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
+ Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
-id => 'human_id',
-accession_number => 'AL000012',
);
-
Function: Returns a new primary seq object from
basic constructors, being a string for the sequence
and strings for id and accession_number.
@@ -155,6 +155,7 @@ my %valid_type = map {$_, 1} qw( dna rna protein );
values.
Returns : a new Bio::PrimarySeq object
Args : -seq => sequence string
+ -ref_to_seq => ... or reference to a sequence string
-display_id => display id of the sequence (locus name)
-accession_number => accession number
-primary_id => primary id (Genbank id)
@@ -163,23 +164,22 @@ my %valid_type = map {$_, 1} qw( dna rna protein );
-authority => the authority for the namespace
-description => description text
-desc => alias for description
- -alphabet => sequence type (alphabet) (dna|rna|protein)
+ -alphabet => skip alphabet guess and set it to dna, rna or protein
-id => alias for display id
- -is_circular => boolean field for whether or not sequence is circular
- -direct => boolean field for directly setting sequence (requires alphabet also set)
- -ref_to_seq => boolean field indicating the sequence is a reference (?!?)
- -nowarnonempty => boolean field for whether or not to warn when sequence is empty
+ -is_circular => boolean to indicate that sequence is circular
+ -direct => boolean to directly set sequences. The next time -seq,
+ seq() or -ref_to_seq is use, the sequence will not be
+ validated. Be careful with this...
+ -nowarnonempty => boolean to avoid warning when sequence is empty
=cut
-
sub new {
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
-
- my($seq,$id,$acc,$pid,$ns,$auth,$v,$oid,
- $desc,$description,
- $alphabet,$given_id,$is_circular,$direct,$ref_to_seq,$len,$nowarnonempty) =
+ my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
+ $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
+ $nowarnonempty) =
$self->_rearrange([qw(SEQ
DISPLAY_ID
ACCESSION_NUMBER
@@ -199,161 +199,162 @@ sub new {
NOWARNONEMPTY
)],
@args);
-
- # private var _nowarnonempty, need to be set before calling _guess_alphabet
- $self->{'_nowarnonempty'} = $nowarnonempty;
+
+ # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
+ $self->{'_nowarnonempty'} = $nowarnonempty;
+ $self->{'_direct'} = $direct;
if( defined $id && defined $given_id ) {
- if( $id ne $given_id ) {
- $self->throw("Provided both id and display_id constructor ".
- "functions. [$id] [$given_id]");
- }
+ if( $id ne $given_id ) {
+ $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
+ }
}
if( defined $given_id ) { $id = $given_id; }
- # let's set the length before the seq -- if there is one, this length is
- # going to be invalidated
- defined $len && $self->length($len);
-
- # if alphabet is provided we set it first, so that it won't be guessed
- # when the sequence is set
- $alphabet && $self->alphabet($alphabet);
-
- # bernd's idea: define ids so that invalid sequence messages
- # can be more informative...
+ # Bernd's idea: set ids now for more informative invalid sequence messages
defined $id && $self->display_id($id);
$acc && $self->accession_number($acc);
defined $pid && $self->primary_id($pid);
- # if there is an alphabet, and direct is passed in, assume the alphabet
- # and sequence is ok
-
- if( $direct && $ref_to_seq) {
- $self->{'seq'} = $$ref_to_seq;
- if( ! $alphabet ) {
- $self->_guess_alphabet();
- } # else it has been set already above
+ # Set alphabet now to avoid guessing it later, when sequence is set
+ $alphabet && $self->alphabet($alphabet);
+
+ # Set the length before the seq. If there is a seq, length will be updated later
+ $self->{'length'} = $len || 0;
+
+ # Set the sequence (but also alphabet and length)
+ if ($ref_to_seq) {
+ $self->_set_seq_by_ref($ref_to_seq, $alphabet);
} else {
- # print STDERR "DEBUG: setting sequence to [$seq]\n";
- # note: the sequence string may be empty
- $self->seq($seq) if defined($seq);
+ if (defined $seq) {
+ # Note: the sequence string may be empty
+ $self->seq($seq);
+ }
}
- $desc && $self->desc($desc);
- $description && $self->description($description);
- $is_circular && $self->is_circular($is_circular);
- $ns && $self->namespace($ns);
- $auth && $self->authority($auth);
- defined($v) && $self->version($v);
+ $desc && $self->desc($desc);
+ $description && $self->description($description);
+ $is_circular && $self->is_circular($is_circular);
+ $ns && $self->namespace($ns);
+ $auth && $self->authority($auth);
+ defined($v) && $self->version($v);
defined($oid) && $self->object_id($oid);
-
return $self;
}
-sub direct_seq_set {
- my $obj = shift;
- return $obj->{'seq'} = shift if @_;
- return;
-}
-
=head2 seq
Title : seq
- Usage : $string = $obj->seq();
- Function: Returns the sequence as a string of letters. The
- case of the letters is left up to the implementer.
- Suggested cases are upper case for proteins and lower case for
- DNA sequence (IUPAC standard), but you should not rely on this.
+ Usage : $string = $seqobj->seq();
+ Function: Get or set the sequence as a string of letters. The case of
+ the letters is left up to the implementer. Suggested cases are
+ upper case for proteins and lower case for DNA sequence (IUPAC
+ standard), but you should not rely on this. An error is thrown if
+ the sequence contains invalid characters: see validate_seq().
Returns : A scalar
- Args : Optionally on set the new value (a string). An optional second
- argument presets the alphabet (otherwise it will be guessed).
+ Args : - Optional new sequence value (a string) to set
+ - Optional alphabet (it is guessed by default)
=cut
sub seq {
- my ($obj,@args) = @_;
-
- if( scalar(@args) == 0 ) {
- return $obj->{'seq'};
- }
-
- my ($value,$alphabet) = @args;
-
- if(@args) {
- if(defined($value) && (! $obj->validate_seq($value))) {
- $obj->throw("Attempting to set the sequence '".(defined($obj->id) ||
- "[unidentified sequence]")."' to [$value] which does not look healthy");
- }
- # if a sequence was already set we make sure that we re-adjust the
- # alphabet, otherwise we skip guessing if alphabet is already set
- # note: if the new seq is empty or undef, we don't consider that a
- # change (we wouldn't have anything to guess on anyway)
- my $is_changed_seq =
- exists($obj->{'seq'}) && (CORE::length($value || '') > 0);
- $obj->{'seq'} = $value;
- # new alphabet overridden by arguments?
- if($alphabet) {
- # yes, set it no matter what
- $obj->alphabet($alphabet);
- } elsif ($is_changed_seq || (! defined($obj->alphabet()))) {
- # if we changed a previous sequence to a new one or if there is no
- # alphabet yet at all, we need to guess the (possibly new) alphabet
- $obj->_guess_alphabet();
- } # else (seq not changed and alphabet was defined) do nothing
- # if the seq is changed, make sure we unset a possibly set length
- $obj->length(undef) if $is_changed_seq || $obj->{'seq'};
- }
- return $obj->{'seq'};
+ my ($self, @args) = @_;
+
+ if( scalar @args == 0 ) {
+ return $self->{'seq'};
+ }
+
+ my ($seq_str, $alphabet) = @args;
+ if (@args) {
+ $self->_set_seq_by_ref(\$seq_str, $alphabet);
+ }
+
+ return $self->{'seq'};
}
+
+sub _set_seq_by_ref {
+ # Set a sequence by reference. A reference is used to avoid the cost of
+ # copying the sequence (which can be very large) between functions.
+ my ($self, $seq_str_ref, $alphabet) = @_;
+
+ # Validate sequence if sequence is not empty and we are not in direct mode
+ if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
+ $self->validate_seq($$seq_str_ref, 1);
+ }
+ delete $self->{'_direct'}; # next sequence will have to be validated
+
+ # Record sequence length
+ my $len = CORE::length($$seq_str_ref || '');
+ my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
+ # Note: if the new seq is empty or undef, this is not considered a change
+ delete $self->{'_freeze_length'} if $is_changed_seq;
+ $self->{'length'} = $len if not exists $self->{'_freeze_length'};
+
+ # Set sequence
+ $self->{'seq'} = $$seq_str_ref;
+
+ # Set or guess alphabet
+ if ($alphabet) {
+ # Alphabet specified, set it no matter what
+ $self->alphabet($alphabet);
+ } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
+ # If we changed a previous sequence to a new one or if there is no
+ # alphabet yet at all, we need to guess the (possibly new) alphabet
+ $self->_guess_alphabet();
+ } # else (seq not changed and alphabet was defined) do nothing
+
+ return 1;
+}
+
+
=head2 validate_seq
Title : validate_seq
- Usage : if(! $seq->validate_seq($seq_str) ) {
+ Usage : if(! $seqobj->validate_seq($seq_str) ) {
print "sequence $seq_str is not valid for an object of
- alphabet ",$seq->alphabet, "\n";
+ alphabet ",$seqobj->alphabet, "\n";
}
- Function: Validates a given sequence string. A validating sequence string
- must be accepted by seq(). A string that does not validate will
- lead to an exception if passed to seq().
-
- The implementation provided here does not take alphabet() into
- account. Allowed are all letters (A-Z) and '-','.','*','?','=',
- and '~'.
-
- Example :
- Returns : 1 if the supplied sequence string is valid for the object, and
- 0 otherwise.
- Args : The sequence string to be validated.
-
+ Function: Test that the given sequence is valid, i.e. contains only valid
+ characters. The allowed characters are all letters (A-Z) and '-','.',
+ '*','?','=' and '~'. Spaces are not valid. Note that this
+ implementation does not take alphabet() into account and that empty
+ sequences are considered valid.
+ Returns : 1 if the supplied sequence string is valid, 0 otherwise.
+ Args : - Sequence string to be validated
+ - Boolean to optionally throw an error if the sequence is invalid
=cut
sub validate_seq {
- my ($self,$seqstr) = @_;
- if( ! defined $seqstr ){ $seqstr = $self->seq(); }
- return 0 unless( defined $seqstr);
- if((CORE::length($seqstr) > 0) &&
- ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
- $self->warn("sequence '".(defined($self->id) || "[unidentified sequence]").
- "' doesn't validate, mismatch is " .
- join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
+ my ($self, $seqstr, $throw) = @_;
+ if ( (defined $seqstr ) &&
+ ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
+ if ($throw) {
+ $self->throw("Failed validation of sequence '".(defined($self->id) ||
+ '[unidentified sequence]')."'. Invalid characters were: " .
+ join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
+ }
return 0;
}
return 1;
}
+
=head2 subseq
Title : subseq
- Usage : $substring = $obj->subseq(10,40);
- $substring = $obj->subseq(10,40,NOGAP)
- $substring = $obj->subseq(-START=>10,-END=>40,-REPLACE_WITH=>'tga')
- Function: returns the subseq from start to end, where the first sequence
+ Usage : $substring = $seqobj->subseq(10,40);
+ $substring = $seqobj->subseq(10,40,'nogap');
+ $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
+ $substring = $seqobj->subseq($location_obj);
+ $substring = $seqobj->subseq($location_obj, -nogap => 1);
+ Function: Return the subseq from start to end, where the first sequence
character has coordinate 1 number is inclusive, ie 1-2 are the
- first two characters of the sequence
+ first two characters of the sequence. The given start coordinate
+ has to be larger than the end, even if the sequence is circular.
Returns : a string
Args : integer for start position
integer for end position
@@ -366,88 +367,106 @@ sub validate_seq {
=cut
sub subseq {
- my $self = shift;
- my @args = @_;
- my ($start,$end,$nogap,$replace) = $self->_rearrange([qw(START
- END
- NOGAP
- REPLACE_WITH)],@args);
+ my $self = shift;
+ my @args = @_;
+ my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
+ END
+ NOGAP
+ REPLACE_WITH)], @args);
- # if $replace is specified, have the constructor validate it as seq
- my $dummy = new Bio::PrimarySeq(-seq=>$replace, -alphabet=>$self->alphabet) if defined($replace);
-
- if( ref($start) && $start->isa('Bio::LocationI') ) {
- my $loc = $start;
- my $seq = "";
- foreach my $subloc ($loc->each_Location()) {
- my $piece = $self->subseq(-START=>$subloc->start(),
- '-END'=>$subloc->end(),
- -REPLACE_WITH=>$replace,
- -NOGAP=>$nogap);
- $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
- if($subloc->strand() < 0) {
- $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq();
- }
- $seq .= $piece;
- }
- return $seq;
- } elsif( defined $start && defined $end ) {
- if( $start > $end ){
- $self->throw("Bad start,end parameters. Start [$start] has to be ".
- "less than end [$end]");
- }
- if( $start <= 0 ) {
- $self->throw("Bad start parameter ($start). Start must be positive.");
- }
-
- # remove one from start, and then length is end-start
- $start--;
- my @ss_args = map { eval "defined $_" ? $_ : () } qw( $self->{seq} $start $end-$start $replace);
- my $seqstr = eval join( '', "substr(", join(',',@ss_args), ")");
-
- if( $end > $self->length) {
- if ($self->is_circular) {
- my $start = 0;
- my $end = $end - $self->length;
- my @ss_args = map { eval "defined $_" ? $_ : () } qw( $self->{seq} $start $end-$start $replace);
- my $appendstr = eval join( '', "substr(", join(',',@ss_args), ")");
- $seqstr .= $appendstr;
- } else {
- $self->throw("Bad end parameter ($end). End must be less than the total length of sequence (total=".$self->length.")")
- }
- }
+ # If -replace_with is specified, validate the replacement sequence
+ if (defined $replace) {
+ $self->validate_seq( $replace ) ||
+ $self->throw("Replacement sequence does not look valid");
+ }
+
+ if( ref($start) && $start->isa('Bio::LocationI') ) {
+ my $loc = $start;
+ my $seq = '';
+ foreach my $subloc ($loc->each_Location()) {
+ my $piece = $self->subseq(-start => $subloc->start(),
+ -end => $subloc->end(),
+ -replace_with => $replace,
+ -nogap => $nogap);
+ $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
+ if ($subloc->strand() < 0) {
+ $piece = $self->_revcom_from_string($piece, $self->alphabet);
+ }
+ $seq .= $piece;
+ }
+ return $seq;
+ } elsif( defined $start && defined $end ) {
+ if( $start > $end ){
+ $self->throw("Bad start,end parameters. Start [$start] has to be ".
+ "less than end [$end]");
+ }
+ if( $start <= 0 ) {
+ $self->throw("Bad start parameter ($start). Start must be positive.");
+ }
+
+ # Remove one from start, and then length is end-start
+ $start--;
+
+ my $seqstr;
+ if (defined $replace) {
+ $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
+ } else {
+ $seqstr = substr $self->{seq}, $start, $end-$start;
+ }
+
+
+ if ($end > $self->length) {
+ if ($self->is_circular) {
+ my $start = 0;
+ my $end = $end - $self->length;
+
+ my $appendstr;
+ if (defined $replace) {
+ $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
+ } else {
+ $appendstr = substr $self->{seq}, $start, $end-$start;
+ }
+
+ $seqstr .= $appendstr;
+ } else {
+ $self->throw("Bad end parameter ($end). End must be less than ".
+ "the total length of sequence (total=".$self->length.")")
+ }
+ }
- $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
- return $seqstr;
+ $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
+ return $seqstr;
- } else {
- $self->warn("Incorrect parameters to subseq - must be two integers or a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
- return;
- }
+ } else {
+ $self->warn("Incorrect parameters to subseq - must be two integers or ".
+ "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
+ return;
+ }
}
+
=head2 length
Title : length
- Usage : $len = $seq->length();
- Function: Get the length of the sequence in number of symbols (bases
+ Usage : $len = $seqobj->length();
+ Function: Get the stored length of the sequence in number of symbols (bases
or amino acids).
- You can also set this attribute, even to a number that does
- not match the length of the sequence string. This is useful
- if you don''t want to set the sequence too, or if you want
- to free up memory by unsetting the sequence. In the latter
- case you could do e.g.
-
- $seq->length($seq->length);
- $seq->seq(undef);
-
- Note that if you set the sequence to a value other than
- undef at any time, the length attribute will be
- invalidated, and the length of the sequence string will be
- reported again. Also, we won''t let you lie about the length.
-
- Example :
+ In some circumstances, you can also set this attribute:
+ 1/ For empty sequences, you can set the length to anything you want:
+ my $seqobj = Bio::PrimarySeq->new( -length => 123 );
+ my $len = $seqobj->len; # 123
+ 2/ To save memory when using very long sequences, you can set the
+ length of the sequence to the length of the sequence (and nothing
+ else):
+ my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
+ # process $seqobj... then after you're done with it
+ $seqobj->length($seqobj->length);
+ $seqobj->seq(undef); # free memory!
+ my $len = $seqobj->len; # 1 Mbp
+
+ Note that if you set seq() to a value other than undef at any time,
+ the length attribute will be reset.
Returns : integer representing the length of the sequence.
Args : Optionally, the value on set
@@ -456,28 +475,23 @@ sub subseq {
sub length {
my ($self, $val) = @_;
if (defined $val) {
- my $len = CORE::length($self->seq() || '');
+ my $len = $self->{'length'};
if ($len && ($len != $val)) {
$self->throw("You're trying to lie about the length: ".
"is $len but you say ".$val);
}
- $self->{'_seq_length'} = $val;
- return $val;
- } else {
- if (defined $self->{'_seq_length'}) {
- return $self->{'_seq_length'};
- } else {
- return CORE::length($self->seq() || '');
- }
+ $self->{'length'} = $val;
+ $self->{'_freeze_length'} = undef;
}
+ return $self->{'length'};
}
=head2 display_id
Title : display_id or display_name
- Usage : $id_string = $obj->display_id();
- Function: returns the display id, aka the common name of the Sequence object.
+ Usage : $id_string = $seqobj->display_id();
+ Function: Get or set the display id, aka the common name of the sequence object.
The semantics of this is that it is the most likely string to
be used as an identifier of the sequence, and likely to have
@@ -492,25 +506,24 @@ sub length {
With the new Bio::DescribeableI interface, display_name aliases
to this method.
-
- Returns : A string
- Args : None
-
+ Returns : A string for the display ID
+ Args : Optional string for the display ID to set
=cut
sub display_id {
- my ($obj,$value) = @_;
- if( defined $value) {
- $obj->{'display_id'} = $value;
+ my ($self, $value) = @_;
+ if( defined $value) {
+ $self->{'display_id'} = $value;
}
- return $obj->{'display_id'};
+ return $self->{'display_id'};
}
+
=head2 accession_number
Title : accession_number or object_id
- Usage : $unique_key = $obj->accession_number;
+ Usage : $unique_key = $seqobj->accession_number;
Function: Returns the unique biological id for a sequence, commonly
called the accession_number. For sequences from established
databases, the implementors should try to use the correct
@@ -525,64 +538,61 @@ sub display_id {
With the new Bio::IdentifiableI interface, this is aliased
to object_id
-
Returns : A string
Args : A string (optional) for setting
=cut
sub accession_number {
- my( $obj, $acc ) = @_;
-
+ my( $self, $acc ) = @_;
if (defined $acc) {
- $obj->{'accession_number'} = $acc;
+ $self->{'accession_number'} = $acc;
} else {
- $acc = $obj->{'accession_number'};
+ $acc = $self->{'accession_number'};
$acc = 'unknown' unless defined $acc;
}
return $acc;
}
+
=head2 primary_id
Title : primary_id
- Usage : $unique_key = $obj->primary_id;
+ Usage : $unique_key = $seqobj->primary_id;
Function: Returns the unique id for this object in this
implementation. This allows implementations to manage their
own object ids in a way the implementaiton can control
clients can expect one id to map to one object.
For sequences with no natural primary id, this method
should return a stringified memory location.
-
Returns : A string
Args : A string (optional, for setting)
=cut
sub primary_id {
- my $obj = shift;
+ my $self = shift;
if(@_) {
- $obj->{'primary_id'} = shift;
+ $self->{'primary_id'} = shift;
}
- if( ! defined($obj->{'primary_id'}) ) {
- return "$obj";
+ if( ! defined($self->{'primary_id'}) ) {
+ return "$self";
}
- return $obj->{'primary_id'};
+ return $self->{'primary_id'};
}
=head2 alphabet
Title : alphabet
- Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
- Function: Get/Set the alphabet of sequence, one of
+ Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
+ Function: Get/set the alphabet of sequence, one of
'dna', 'rna' or 'protein'. This is case sensitive.
This is not called <type> because this would cause
upgrade problems from the 0.5 and earlier Seq objects.
-
Returns : a string either 'dna','rna','protein'. NB - the object must
make a call of the type - if there is no alphabet specified it
has to guess.
@@ -592,28 +602,27 @@ sub primary_id {
=cut
sub alphabet {
- my ($obj,$value) = @_;
+ my ($self,$value) = @_;
if (defined $value) {
$value = lc $value;
unless ( $valid_type{$value} ) {
- $obj->throw("Alphabet '$value' is not a valid alphabet (".
+ $self->throw("Alphabet '$value' is not a valid alphabet (".
join(',', map "'$_'", sort keys %valid_type) .") lowercase");
}
- $obj->{'alphabet'} = $value;
+ $self->{'alphabet'} = $value;
}
- return $obj->{'alphabet'};
+ return $self->{'alphabet'};
}
+
=head2 desc
Title : desc or description
- Usage : $obj->desc($newval)
+ Usage : $seqobj->desc($newval);
Function: Get/set description of the sequence.
'description' is an alias for this for compliance with the
Bio::DescribeableI interface.
-
- Example :
Returns : value of desc (a string)
Args : newvalue (a string or undef, optional)
@@ -627,6 +636,7 @@ sub desc{
return $self->{'desc'};
}
+
=head2 can_call_new
Title : can_call_new
@@ -636,7 +646,6 @@ sub desc{
Returns : true
Args :
-
=cut
sub can_call_new {
@@ -645,26 +654,27 @@ sub can_call_new {
return 1;
}
+
=head2 id
Title : id
- Usage : $id = $seq->id()
+ Usage : $id = $seqobj->id();
Function: This is mapped on display_id
Example :
Returns :
Args :
-
=cut
sub id {
return shift->display_id(@_);
}
+
=head2 is_circular
Title : is_circular
- Usage : if( $obj->is_circular) { /Do Something/ }
+ Usage : if( $seqobj->is_circular) { # Do something }
Function: Returns true if the molecule is circular
Returns : Boolean value
Args : none
@@ -673,44 +683,42 @@ sub id {
sub is_circular{
my $self = shift;
-
return $self->{'is_circular'} = shift if @_;
return $self->{'is_circular'};
}
=head1 Methods for Bio::IdentifiableI compliance
-=cut
-
=head2 object_id
Title : object_id
- Usage : $string = $obj->object_id()
- Function: A string which represents the stable primary identifier
+ Usage : $string = $seqobj->object_id();
+ Function: Get or set a string which represents the stable primary identifier
in this namespace of this object. For DNA sequences this
is its accession_number, similarly for protein sequences.
This is aliased to accession_number().
Returns : A scalar
-
+ Args : Optional object ID to set.
=cut
sub object_id {
return shift->accession_number(@_);
}
+
=head2 version
Title : version
- Usage : $version = $obj->version()
- Function: A number which differentiates between versions of
+ Usage : $version = $seqobj->version();
+ Function: Get or set a number which differentiates between versions of
the same object. Higher numbers are considered to be
later and more relevant, but a single object described
the same identifier should represent the same concept.
-
Returns : A number
+ Args : Optional version to set.
=cut
@@ -726,33 +734,33 @@ sub version{
=head2 authority
Title : authority
- Usage : $authority = $obj->authority()
- Function: A string which represents the organisation which
- granted the namespace, written as the DNS name for
+ Usage : $authority = $seqobj->authority();
+ Function: Get or set a string which represents the organisation which
+ granted the namespace, written as the DNS name of the
organisation (eg, wormbase.org).
-
Returns : A scalar
+ Args : Optional authority to set.
=cut
sub authority {
- my ($obj,$value) = @_;
+ my ($self, $value) = @_;
if( defined $value) {
- $obj->{'authority'} = $value;
+ $self->{'authority'} = $value;
}
- return $obj->{'authority'};
+ return $self->{'authority'};
}
+
=head2 namespace
Title : namespace
- Usage : $string = $obj->namespace()
- Function: A string representing the name space this identifier
- is valid in, often the database name or the name
- describing the collection.
-
+ Usage : $string = $seqobj->namespace();
+ Function: Get or set a string representing the name space this identifier
+ is valid in, often the database name or the name describing the
+ collection.
Returns : A scalar
-
+ Args : Optional namespace to set.
=cut
@@ -764,51 +772,54 @@ sub namespace{
return $self->{'namespace'} || "";
}
+
=head1 Methods for Bio::DescribableI compliance
This comprises of display_name and description.
-=cut
-
=head2 display_name
Title : display_name
- Usage : $string = $obj->display_name()
- Function: A string which is what should be displayed to the user.
+ Usage : $string = $seqobj->display_name();
+ Function: Get or set a string which is what should be displayed to the user.
The string should have no spaces (ideally, though a cautious
user of this interface would not assumme this) and should be
less than thirty characters (though again, double checking
this is a good idea).
This is aliased to display_id().
- Returns : A scalar
+ Returns : A string for the display name
+ Args : Optional string for the display name to set.
=cut
sub display_name {
return shift->display_id(@_);
}
+
=head2 description
Title : description
- Usage : $string = $obj->description()
- Function: A text string suitable for displaying to the user a
+ Usage : $string = $seqobj->description();
+ Function: Get or set a text string suitable for displaying to the user a
description. This string is likely to have spaces, but
should not have any newlines or formatting - just plain
text. The string should not be greater than 255 characters
and clients can feel justified at truncating strings at 255
characters for the purposes of display.
This is aliased to desc().
- Returns : A scalar
+ Returns : A string for the description
+ Args : Optional string for the description to set.
=cut
sub description {
return shift->desc(@_);
}
+
=head1 Methods Inherited from Bio::PrimarySeqI
These methods are available on Bio::PrimarySeq, although they are
@@ -817,7 +828,7 @@ actually implemented on Bio::PrimarySeqI
=head2 revcom
Title : revcom
- Usage : $rev = $seq->revcom()
+ Usage : $rev = $seqobj->revcom();
Function: Produces a new Bio::SeqI implementing object which
is the reversed complement of the sequence. For protein
sequences this throws an exception of
@@ -835,31 +846,21 @@ actually implemented on Bio::PrimarySeqI
This of course, causes Perl to handle the garbage
collection of the old object, but it is roughly speaking as
efficient as an inplace edit.
-
Returns : A new (fresh) Bio::SeqI object
Args : none
-=cut
-
=head2 trunc
Title : trunc
Usage : $subseq = $myseq->trunc(10,100);
Function: Provides a truncation of a sequence,
-
- Example :
Returns : A fresh Bio::SeqI implementing object.
- Args :
-
-
-=cut
+ Args : Numbers for the start and end positions
=head1 Internal methods
These are internal methods to PrimarySeq
-=cut
-
=head2 _guess_alphabet
Title : _guess_alphabet
@@ -871,11 +872,9 @@ These are internal methods to PrimarySeq
RNA characters are also valid characters for proteins, there is
no foolproof way of determining the right alphabet. This is our best
guess only!
- Example :
Returns : string 'dna', 'rna', 'protein' or ''.
Args : none
-
=cut
sub _guess_alphabet {
View
534 Bio/PrimarySeqI.pm
@@ -11,6 +11,7 @@
# POD documentation - main docs before the code
+
=head1 NAME
Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
@@ -36,11 +37,11 @@ Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
# Object manipulation
eval {
- $rev = $obj->revcom();
+ $rev = $obj->revcom();
};
if( $@ ) {
- $obj->throw("Could not reverse complement. ".
- "Probably not DNA. Actual exception\n$@\n");
+ $obj->throw( "Could not reverse complement. ".
+ "Probably not DNA. Actual exception\n$@\n" );
}
$trunc = $obj->trunc(12,50);
@@ -118,16 +119,14 @@ methods. Internal methods are usually preceded with a _
=cut
-# Let the code begin...
-
-
package Bio::PrimarySeqI;
use strict;
use Bio::Tools::CodonTable;
use base qw(Bio::Root::RootI);
-=head1 Implementation Specific Functions
+
+=head1 Implementation-specific Functions
These functions are the ones that a specific implementation must
define.
@@ -147,10 +146,11 @@ define.
=cut
sub seq {
- my ($self) = @_;
- $self->throw_not_implemented();
+ my ($self) = @_;
+ $self->throw_not_implemented();
}
+
=head2 subseq
Title : subseq
@@ -167,11 +167,12 @@ sub seq {
=cut
-sub subseq{
- my ($self) = @_;
- $self->throw_not_implemented();
+sub subseq {
+ my ($self) = @_;
+ $self->throw_not_implemented();
}
+
=head2 display_id
Title : display_id
@@ -196,12 +197,11 @@ sub subseq{
Args : None
Status : Virtual
-
=cut
sub display_id {
- my ($self) = @_;
- $self->throw_not_implemented();
+ my ($self) = @_;
+ $self->throw_not_implemented();
}
@@ -222,16 +222,14 @@ sub display_id {
Args : None
Status : Virtual
-
=cut
sub accession_number {
- my ($self,@args) = @_;
- $self->throw_not_implemented();
+ my ($self,@args) = @_;
+ $self->throw_not_implemented();
}
-
=head2 primary_id
Title : primary_id
@@ -248,12 +246,11 @@ sub accession_number {
Args : None
Status : Virtual
-
=cut
sub primary_id {
- my ($self,@args) = @_;
- $self->throw_not_implemented();
+ my ($self,@args) = @_;
+ $self->throw_not_implemented();
}
@@ -262,32 +259,31 @@ sub primary_id {
Title : can_call_new
Usage : if( $obj->can_call_new ) {
$newobj = $obj->new( %param );
- }
+ }
Function: Can_call_new returns 1 or 0 depending
on whether an implementation allows new
constructor to be called. If a new constructor
is allowed, then it should take the followed hashed
constructor list.
$myobject->new( -seq => $sequence_as_string,
- -display_id => $id
- -accession_number => $accession
- -alphabet => 'dna',
- );
+ -display_id => $id
+ -accession_number => $accession
+ -alphabet => 'dna',
+ );
Returns : 1 or 0
Args :
=cut
-sub can_call_new{
- my ($self,@args) = @_;
-
- # we default to 0 here
-
- return 0;
+sub can_call_new {
+ my ($self,@args) = @_;
+ # we default to 0 here
+ return 0;
}
+
=head2 alphabet
Title : alphabet
@@ -304,30 +300,29 @@ sub can_call_new{
Args : None
Status : Virtual
-
=cut
-sub alphabet{
+sub alphabet {
my ( $self ) = @_;
$self->throw_not_implemented();
}
+
=head2 moltype
Title : moltype
Usage : Deprecated. Use alphabet() instead.
=cut
-sub moltype{
- my ($self,@args) = @_;
-
- $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
- $self->alphabet(@args);
+sub moltype {
+ my ($self,@args) = @_;
+ $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
+ return $self->alphabet(@args);
}
-=head1 Optional Implementation Functions
+=head1 Implementation-optional Functions
The following functions rely on the above functions. An
implementing class does not need to provide these functions, as they
@@ -370,60 +365,55 @@ are encouraged to override these methods
=cut
-sub revcom{
- my ($self) = @_;
-
- my $seqclass;
- if($self->can_call_new()) {
- $seqclass = ref($self);
- } else {
- $seqclass = 'Bio::PrimarySeq';
- $self->_attempt_to_load_Seq();
- }
- my $out = $seqclass->new( '-seq' => $self->_revcom_from_string($self->seq, $self->alphabet),
- '-is_circular' => $self->is_circular,
- '-display_id' => $self->display_id,
- '-accession_number' => $self->accession_number,
- '-alphabet' => $self->alphabet,
- '-desc' => $self->desc(),
- '-verbose' => $self->verbose
- );
- return $out;
-
+sub revcom {
+ my ($self) = @_;
+ my ($seqclass, $opts) = $self->_setup_class;
+ my $out = $seqclass->new(
+ -seq => $self->_revcom_from_string($self->seq, $self->alphabet),
+ -is_circular => $self->is_circular,
+ -display_id => $self->display_id,
+ -accession_number => $self->accession_number,
+ -alphabet => $self->alphabet,
+ -desc => $self->desc,
+ -verbose => $self->verbose,
+ %$opts,
+ );
+ return $out;
}
+
sub _revcom_from_string {
- my ($self, $string, $alphabet) = @_;
-
- # Check that reverse-complementing makes sense
- if( $alphabet eq 'protein' ) {
- $self->throw("Sequence is a protein. Cannot revcom.");
- }
- if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
- my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
- "but unsure if this is right.";
- if( $self->can('warn') ) {
- $self->warn($msg);
- } else {
- warn("[$self] $msg");
- }
- }
-
- # If sequence is RNA, map to DNA (then map back later)
- if( $alphabet eq 'rna' ) {
- $string =~ tr/uU/tT/;
- }
-
- # Reverse-complement now
- $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
- $string = CORE::reverse $string;
-
- # Map back RNA to DNA
- if( $alphabet eq 'rna' ) {
- $string =~ tr/tT/uU/;
- }
-
- return $string;
+ my ($self, $string, $alphabet) = @_;
+
+ # Check that reverse-complementing makes sense
+ if( $alphabet eq 'protein' ) {
+ $self->throw("Sequence is a protein. Cannot revcom.");
+ }
+ if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
+ my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
+ "but unsure if this is right.";
+ if( $self->can('warn') ) {
+ $self->warn($msg);
+ } else {
+ warn("[$self] $msg");
+ }
+ }
+
+ # If sequence is RNA, map to DNA (then map back later)
+ if( $alphabet eq 'rna' ) {
+ $string =~ tr/uU/tT/;
+ }
+
+ # Reverse-complement now
+ $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
+ $string = CORE::reverse $string;
+
+ # Map back RNA to DNA
+ if( $alphabet eq 'rna' ) {
+ $string =~ tr/tT/uU/;
+ }
+
+ return $string;
}
@@ -438,40 +428,35 @@ sub _revcom_from_string {
=cut
-sub trunc{
- my ($self,$start,$end) = @_;
-
- my $str;
- if( defined $start && ref($start) &&
- $start->isa('Bio::LocationI') ) {
- $str = $self->subseq($start); # start is a location actually
- } elsif( !$end ) {
- $self->throw("trunc start,end -- there was no end for $start");
- } elsif( $end < $start ) {
- my $msg = "start [$start] is greater than end [$end]. \n".
- "If you want to truncated and reverse complement, \n".
- "you must call trunc followed by revcom. Sorry.";
- $self->throw($msg);
- } else {
- $str = $self->subseq($start,$end);
- }
-
- my $seqclass;
- if($self->can_call_new()) {
- $seqclass = ref($self);
- } else {
- $seqclass = 'Bio::PrimarySeq';
- $self->_attempt_to_load_Seq();
- }
-
- my $out = $seqclass->new( '-seq' => $str,
- '-display_id' => $self->display_id,
- '-accession_number' => $self->accession_number,
- '-alphabet' => $self->alphabet,
- '-desc' => $self->desc(),
- '-verbose' => $self->verbose
- );
- return $out;
+sub trunc {
+ my ($self,$start,$end) = @_;
+
+ my $str;
+ if( defined $start && ref($start) &&
+ $start->isa('Bio::LocationI') ) {
+ $str = $self->subseq($start); # start is a location actually
+ } elsif( !$end ) {
+ $self->throw("trunc start,end -- there was no end for $start");
+ } elsif( $end < $start ) {
+ my $msg = "start [$start] is greater than end [$end]. \n".
+ "If you want to truncated and reverse complement, \n".
+ "you must call trunc followed by revcom. Sorry.";
+ $self->throw($msg);
+ } else {
+ $str = $self->subseq($start,$end);
+ }
+
+ my ($seqclass, $opts) = $self->_setup_class;
+ my $out = $seqclass->new(
+ -seq => $str,
+ -display_id => $self->display_id,
+ -accession_number => $self->accession_number,
+ -alphabet => $self->alphabet,
+ -desc => $self->desc,
+ -verbose => $self->verbose,
+ %$opts,
+ );
+ return $out;
}
@@ -567,15 +552,15 @@ element in an array:
=cut
sub translate {
- my ($self,@args) = @_;
- my ($terminator, $unknown, $frame, $codonTableId, $complete,
- $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
-
- ## new API with named parameters, post 1.5.1
- if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
- ($terminator, $unknown, $frame, $codonTableId, $complete,
- $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
- $self->_rearrange([qw(TERMINATOR
+ my ($self,@args) = @_;
+ my ($terminator, $unknown, $frame, $codonTableId, $complete,
+ $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
+
+ ## new API with named parameters, post 1.5.1
+ if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
+ ($terminator, $unknown, $frame, $codonTableId, $complete,
+ $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
+ $self->_rearrange([qw(TERMINATOR
UNKNOWN
FRAME
CODONTABLE_ID
@@ -586,11 +571,11 @@ sub translate {
ORF
START
OFFSET)], @args);
- ## old API, 1.5.1 and preceding versions
- } else {
- ($terminator, $unknown, $frame, $codonTableId,
- $complete, $throw, $codonTable, $offset) = @args;
- }
+ ## old API, 1.5.1 and preceding versions
+ } else {
+ ($terminator, $unknown, $frame, $codonTableId,
+ $complete, $throw, $codonTable, $offset) = @args;
+ }
## Initialize termination codon, unknown codon, codon table id, frame
$terminator = '*' unless (defined($terminator) and $terminator ne '');
@@ -601,46 +586,45 @@ sub translate {
## Get a CodonTable, error if custom CodonTable is invalid
if ($codonTable) {
- $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
- unless $codonTable->isa('Bio::Tools::CodonTable');
+ $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
+ unless $codonTable->isa('Bio::Tools::CodonTable');
} else {
# shouldn't this be cached? Seems wasteful to have a new instance
# every time...
- $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
- }
+ $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
+ }
## Error if alphabet is "protein"
$self->throw("Can't translate an amino acid sequence.") if
- ($self->alphabet =~ /protein/i);
+ ($self->alphabet =~ /protein/i);
## Error if -start parameter isn't a valid codon
- if ($start_codon) {
- $self->throw("Invalid start codon: $start_codon.") if
- ( $start_codon !~ /^[A-Z]{3}$/i );
- }
-
- my $seq;
-
- if ($offset) {
- $self->throw("Offset must be 1, 2, or 3.") if
- ( $offset !~ /^[123]$/ );
- my ($start, $end) = ($offset, $self->length);
- ($seq) = $self->subseq($start, $end);
- } else {
- ($seq) = $self->seq();
- }
+ if ($start_codon) {
+ $self->throw("Invalid start codon: $start_codon.") if
+ ( $start_codon !~ /^[A-Z]{3}$/i );
+ }
+
+ my $seq;
+ if ($offset) {
+ $self->throw("Offset must be 1, 2, or 3.") if
+ ( $offset !~ /^[123]$/ );
+ my ($start, $end) = ($offset, $self->length);
+ ($seq) = $self->subseq($start, $end);
+ } else {
+ ($seq) = $self->seq();
+ }
## ignore frame if an ORF is supposed to be found
- if ( $orf ) {
- my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
- $seq = $self->_orf_sequence( $seq, $orf_region );
- } else {
- ## use frame, error if frame is not 0, 1 or 2
- $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
- unless ($frame == 0 or $frame == 1 or $frame == 2);
- $seq = substr($seq,$frame);
- }
+ if ( $orf ) {
+ my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
+ $seq = $self->_orf_sequence( $seq, $orf_region );
+ } else {
+ ## use frame, error if frame is not 0, 1 or 2
+ $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
+ unless ($frame == 0 or $frame == 1 or $frame == 2);
+ $seq = substr($seq,$frame);
+ }
## Translate it
my $output = $codonTable->translate($seq, $complete_codons);
@@ -650,51 +634,47 @@ sub translate {
## Only if we are expecting to translate a complete coding region
if ($complete) {
- my $id = $self->display_id;
- # remove the terminator character
- if( substr($output,-1,1) eq $terminator ) {
- chop $output;
- } else {
- $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
- $self->warn("Seq [$id]: Not using a valid terminator codon!");
- }
- # test if there are terminator characters inside the protein sequence!
- if ($output =~ /\Q$terminator\E/) {
- $id ||= '';
- $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
- $self->warn("Seq [$id]: Terminator codon inside CDS!");
- }
- # if the initiator codon is not ATG, the amino acid needs to be changed to M
- if ( substr($output,0,1) ne 'M' ) {
- if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
- $output = 'M'. substr($output,1);
- } elsif ($throw) {
- $self->throw("Seq [$id]: Not using a valid initiator codon!");
- } else {
- $self->warn("Seq [$id]: Not using a valid initiator codon!");
- }
- }
+ my $id = $self->display_id;
+ # remove the terminator character
+ if( substr($output,-1,1) eq $terminator ) {
+ chop $output;
+ } else {
+ $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
+ $self->warn("Seq [$id]: Not using a valid terminator codon!");
+ }
+ # test if there are terminator characters inside the protein sequence!
+ if ($output =~ /\Q$terminator\E/) {
+ $id ||= '';
+ $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
+ $self->warn("Seq [$id]: Terminator codon inside CDS!");
+ }
+ # if the initiator codon is not ATG, the amino acid needs to be changed to M
+ if ( substr($output,0,1) ne 'M' ) {
+ if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
+ $output = 'M'. substr($output,1);
+ } elsif ($throw) {
+ $self->throw("Seq [$id]: Not using a valid initiator codon!");
+ } else {
+ $self->warn("Seq [$id]: Not using a valid initiator codon!");
+ }
+ }
}
- my $seqclass;
- if ($self->can_call_new()) {
- $seqclass = ref($self);
- } else {
- $seqclass = 'Bio::PrimarySeq';
- $self->_attempt_to_load_Seq();
- }
- my $out = $seqclass->new( '-seq' => $output,
- '-display_id' => $self->display_id,
- '-accession_number' => $self->accession_number,
- # is there anything wrong with retaining the