Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Reformatting and editing

  • Loading branch information...
commit ae44e03b21ea72296f8bb21fcb356ca236cb3572 1 parent 0946759
@bosborne bosborne authored
Showing with 198 additions and 178 deletions.
  1. +198 −178 Bio/DB/NCBIHelper.pm
View
376 Bio/DB/NCBIHelper.pm
@@ -98,8 +98,9 @@ use URI::Escape qw(uri_unescape);
use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
BEGIN {
- $MAX_ENTRIES = 19000;
- $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
+ $MAX_ENTRIES = 19000;
+ $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
+ $REQUEST_DELAY = 3;
%CGILOCATION = (
'batch' => [ 'post' => '/entrez/eutils/epost.fcgi' ],
'query' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
@@ -119,23 +120,36 @@ BEGIN {
$DEFAULTFORMAT = 'gb';
}
+=head2 new
-# the new way to make modules a little more lightweight
+ Title : new
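+ Usage : my $obj = $class->new(@args);
+ Function: constructor, the new way to make modules a little more lightweight
+ Returns : the newly constructed object
+ Args : -seq_start, -seq_stop, -no_redirect, -redirect_refseq, -strand,
+ -complexity (0 to 4); all optional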
+
+=cut
sub new {
- my ($class, @args ) = @_;
+ my ( $class, @args ) = @_;
my $self = $class->SUPER::new(@args);
- my ($seq_start,$seq_stop,$no_redirect, $redirect, $complexity,$strand) =
- $self->_rearrange([qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND)],
- @args);
- $seq_start && $self->seq_start($seq_start);
- $seq_stop && $self->seq_stop($seq_stop);
- $no_redirect && $self->no_redirect($no_redirect);
- $redirect && $self->redirect_refseq($redirect);
- $strand && $self->strand($strand);
- # adjust statement to accept zero value
- defined $complexity && ($complexity >=0 && $complexity <=4)
- && $self->complexity($complexity);
+ my ($seq_start, $seq_stop, $no_redirect,
+ $redirect, $complexity, $strand
+ )
+ = $self->_rearrange(
+ [ qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND) ],
+ @args
+ );
+ $seq_start && $self->seq_start($seq_start);
+ $seq_stop && $self->seq_stop($seq_stop);
+ $no_redirect && $self->no_redirect($no_redirect);
+ $redirect && $self->redirect_refseq($redirect);
+ $strand && $self->strand($strand);
+
+ # adjust statement to accept zero value
+ defined $complexity
+ && ( $complexity >= 0 && $complexity <= 4 )
+ && $self->complexity($complexity);
return $self;
}
@@ -144,7 +158,7 @@ sub new {
Title : get_params
Usage : my %params = $self->get_params($mode)
- Function: Returns key,value pairs to be passed to NCBI database
+ Function: returns key,value pairs to be passed to NCBI database
for either 'batch' or 'single' sequence retrieval method
Returns : a key,value pair hash
Args : 'single' or 'batch' mode for retrieval
@@ -160,7 +174,7 @@ sub get_params {
Title : default_format
Usage : my $format = $self->default_format
- Function: Returns default sequence format for this module
+ Function: returns default sequence format for this module
Returns : string
Args : none
@@ -181,82 +195,94 @@ sub default_format {
=cut
sub get_request {
- my ($self, @qualifiers) = @_;
- my ($mode, $uids, $format, $query, $seq_start, $seq_stop, $strand, $complexity) =
- $self->_rearrange([qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
- @qualifiers);
- $mode = lc $mode;
- ($format) = $self->request_format() unless ( defined $format);
- if( !defined $mode || $mode eq '' ) { $mode = 'single'; }
- my %params = $self->get_params($mode);
- if( ! %params ) {
- $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'")
- }
- my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);
- unless( $mode eq 'webenv' || defined $uids || defined $query) {
- $self->throw("Must specify a query or list of uids to fetch");
- }
- if ($query && $query->can('cookie')) {
- @params{'WebEnv','query_key'} = $query->cookie;
- $params{'db'} = $query->db;
- }
- elsif ($query) {
- $params{'id'} = join ',',$query->ids;
- }
- # for batch retrieval, non-query style
- elsif ($mode eq 'webenv' && $self->can('cookie')) {
- @params{'WebEnv','query_key'} = $self->cookie;
- }
- elsif ($uids) {
- if( ref($uids) =~ /array/i ) {
- $uids = join(",", @$uids);
- }
- $params{'id'} = $uids;
- }
- $seq_start && ($params{'seq_start'} = $seq_start);
- $seq_stop && ($params{'seq_stop'} = $seq_stop);
- $strand && ($params{'strand'} = $strand);
- if (defined $complexity && ($seq_start || $seq_stop || $strand)) {
- $self->warn("Complexity set to $complexity; seq_start and seq_stop may not work!")
- if ($complexity != 1 && ($seq_start || $seq_stop));
- $self->warn("Complexity set to 0; expect strange results with strand set to 2")
- if ($complexity == 0 && $strand == 2 && $format eq 'fasta');
- }
- defined $complexity && ($params{'complexity'} = $complexity);
- $params{'rettype'} = $format unless $mode eq 'batch';
- # for now, 'post' is batch retrieval
- if ($CGILOCATION{$mode}[0] eq 'post') {
- my $response = $self->ua->request(POST $url,[%params]);
- $response->proxy_authorization_basic($self->authentication)
- if ( $self->authentication);
- $self->_parse_response($response->content);
- my ($cookie, $querykey) = $self->cookie;
- my %qualifiers = ('-mode' => 'webenv',
- '-seq_start' => $seq_start,
- '-seq_stop' => $seq_stop,
- '-strand' => $strand,
- '-complexity' => $complexity,
- '-format' => $format);
- return $self->get_request(%qualifiers);
- } else {
- $url->query_form(%params);
- return GET $url;
- }
+ my ( $self, @qualifiers ) = @_;
+ my ( $mode, $uids, $format, $query, $seq_start, $seq_stop, $strand,
+ $complexity )
+ = $self->_rearrange(
+ [qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
+ @qualifiers );
+ $mode = lc $mode;
+ ($format) = $self->request_format() unless ( defined $format );
+ if ( !defined $mode || $mode eq '' ) { $mode = 'single'; }
+ my %params = $self->get_params($mode);
+ if ( !%params ) {
+ $self->throw(
+ "must specify a valid retrieval mode 'single' or 'batch' not '$mode'"
+ );
+ }
+ my $url = URI->new( $HOSTBASE . $CGILOCATION{$mode}[1] );
+ unless ( $mode eq 'webenv' || defined $uids || defined $query ) {
+ $self->throw("Must specify a query or list of uids to fetch");
+ }
+ if ( $query && $query->can('cookie') ) {
+ @params{ 'WebEnv', 'query_key' } = $query->cookie;
+ $params{'db'} = $query->db;
+ }
+ elsif ($query) {
+ $params{'id'} = join ',', $query->ids;
+ }
+
+ # for batch retrieval, non-query style
+ elsif ( $mode eq 'webenv' && $self->can('cookie') ) {
+ @params{ 'WebEnv', 'query_key' } = $self->cookie;
+ }
+ elsif ($uids) {
+ if ( ref($uids) =~ /array/i ) {
+ $uids = join( ",", @$uids );
+ }
+ $params{'id'} = $uids;
+ }
+ $seq_start && ( $params{'seq_start'} = $seq_start );
+ $seq_stop && ( $params{'seq_stop'} = $seq_stop );
+ $strand && ( $params{'strand'} = $strand );
+ if ( defined $complexity && ( $seq_start || $seq_stop || $strand ) ) {
+ $self->warn(
+ "Complexity set to $complexity; seq_start and seq_stop may not work!"
+ ) if ( $complexity != 1 && ( $seq_start || $seq_stop ) );
+ $self->warn(
+ "Complexity set to 0; expect strange results with strand set to 2"
+ ) if ( $complexity == 0 && $strand == 2 && $format eq 'fasta' );
+ }
+ defined $complexity && ( $params{'complexity'} = $complexity );
+ $params{'rettype'} = $format unless $mode eq 'batch';
+
+ # for now, 'post' is batch retrieval
+ if ( $CGILOCATION{$mode}[0] eq 'post' ) {
+ my $response = $self->ua->request( POST $url, [%params] );
+ $response->proxy_authorization_basic( $self->authentication )
+ if ( $self->authentication );
+ $self->_parse_response( $response->content );
+ my ( $cookie, $querykey ) = $self->cookie;
+ my %qualifiers = (
+ '-mode' => 'webenv',
+ '-seq_start' => $seq_start,
+ '-seq_stop' => $seq_stop,
+ '-strand' => $strand,
+ '-complexity' => $complexity,
+ '-format' => $format
+ );
+ return $self->get_request(%qualifiers);
+ }
+ else {
+ $url->query_form(%params);
+ return GET $url;
+ }
}
+
=head2 get_Stream_by_batch
Title : get_Stream_by_batch
Usage : $seq = $db->get_Stream_by_batch($ref);
Function: Retrieves Seq objects from Entrez 'en masse', rather than one
at a time. For large numbers of sequences, this is far superior
- than get_Stream_by_[id/acc]().
+ to get_Stream_by_id or get_Stream_by_acc.
Example :
Returns : a Bio::SeqIO stream object
Args : $ref : either an array reference, a filename, or a filehandle
from which to get the list of unique ids/accession numbers.
-NOTE: deprecated API. Use get_Stream_by_id() instead.
+ NOTE: deprecated API. Use get_Stream_by_id() instead.
=cut
@@ -272,13 +298,12 @@ NOTE: deprecated API. Use get_Stream_by_id() instead.
Usage : $seq = $db->get_Stream_by_query($query);
Function: Retrieves Seq objects from Entrez 'en masse', rather than one
at a time. For large numbers of sequences, this is far superior
- than get_Stream_by_[id/acc]().
+ to get_Stream_by_id and get_Stream_by_acc.
Example :
Returns : a Bio::SeqIO stream object
- Args : $query : An Entrez query string or a
- Bio::DB::Query::GenBank object. It is suggested that you
- create a Bio::DB::Query::GenBank object and get the entry
- count before you fetch a potentially large stream.
+ Args : An Entrez query string or a Bio::DB::Query::GenBank object.
+ It is suggested that you create a Bio::DB::Query::GenBank object and get
+ the entry count before you fetch a potentially large stream.
=cut
@@ -294,17 +319,17 @@ sub get_Stream_by_query {
Title : postprocess_data
Usage : $self->postprocess_data ( 'type' => 'string',
- 'location' => \$datastr);
- Function: process downloaded data before loading into a Bio::SeqIO
+ 'location' => \$datastr );
+ Function: Process downloaded data before loading into a Bio::SeqIO. This
+ works for GenBank and GenPept; other classes should override
+ it with their own method.
Returns : void
- Args : hash with two keys - 'type' can be 'string' or 'file'
- - 'location' either file location or string
- reference containing data
+ Args : hash with two keys:
-=cut
+ 'type' can be 'string' or 'file'
+ 'location' either file location or string reference containing data
-# the default method, works for genbank/genpept, other classes should
-# override it with their own method.
+=cut
sub postprocess_data {
# retain this in case postprocessing is needed at a future date
@@ -326,20 +351,22 @@ sub postprocess_data {
=cut
sub request_format {
- my ($self, $value) = @_;
- if( defined $value ) {
- $value = lc $value;
- if( defined $FORMATMAP{$value} ) {
- $self->{'_format'} = [ $value, $FORMATMAP{$value}];
- } else {
- # Try to fall back to a default. Alternatively, we could throw
- # an exception
- $self->{'_format'} = [ $value, $value ];
- }
- }
- return @{$self->{'_format'}};
+ my ( $self, $value ) = @_;
+ if ( defined $value ) {
+ $value = lc $value;
+ if ( defined $FORMATMAP{$value} ) {
+ $self->{'_format'} = [ $value, $FORMATMAP{$value} ];
+ }
+ else {
+ # Try to fall back to a default. Alternatively, we could throw
+ # an exception
+ $self->{'_format'} = [ $value, $value ];
+ }
+ }
+ return @{ $self->{'_format'} };
}
+
=head2 redirect_refseq
Title : redirect_refseq
@@ -350,7 +377,7 @@ sub request_format {
Throws : 'unparseable output exception'
Note : This replaces 'no_redirect' as a more straightforward flag to
redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
- instead of retrievign the NCBI records
+ instead of retrieving the NCBI records
=cut
@@ -368,23 +395,21 @@ sub redirect_refseq {
Returns : value from 0-4 indicating level of complexity
Args : value from 0-4 (optional); if unset server assumes 1
Throws : if arg is not an integer or falls outside of noted range above
- Note : From efetch docs:
-
- Complexity regulates the display:
+ Note : From efetch docs, the complexity regulates the display:
- * 0 - get the whole blob
- * 1 - get the bioseq for gi of interest (default in Entrez)
- * 2 - get the minimal bioseq-set containing the gi of interest
- * 3 - get the minimal nuc-prot containing the gi of interest
- * 4 - get the minimal pub-set containing the gi of interest
+ 0 - get the whole blob
+ 1 - get the bioseq for gi of interest (default in Entrez)
+ 2 - get the minimal bioseq-set containing the gi of interest
+ 3 - get the minimal nuc-prot containing the gi of interest
+ 4 - get the minimal pub-set containing the gi of interest
=cut
sub complexity {
- my ($self, $comp) = @_;
- if (defined $comp) {
- $self->throw("Complexity value must be integer between 0 and 4") if
- $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
+ my ( $self, $comp ) = @_;
+ if ( defined $comp ) {
+ $self->throw("Complexity value must be integer between 0 and 4")
+ if $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
$self->{'_complexity'} = $comp;
}
return $self->{'_complexity'};
@@ -463,7 +488,7 @@ Overriding WebDBSeqI method to help newbies to retrieve sequences
Title : get_Stream_by_acc
Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
- Function: Gets a series of Seq objects by accession numbers
+ Function: gets a series of Seq objects by accession numbers
Returns : a Bio::SeqIO stream object
Args : $ref : a reference to an array of accession numbers for
the desired sequence entries
@@ -472,122 +497,117 @@ Overriding WebDBSeqI method to help newbies to retrieve sequences
=cut
sub get_Stream_by_acc {
- my ($self, $ids ) = @_;
+ my ( $self, $ids ) = @_;
my $newdb = $self->_check_id($ids);
- if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) {
- return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single');
- } else {
- return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
+ if ( defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq') ) {
+ return $newdb->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
+ }
+ else {
+ return $self->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
}
}
-
=head2 _check_id
Title : _check_id
Usage :
Function:
- Returns : A Bio::DB::RefSeq reference or throws
+ Returns : a Bio::DB::RefSeq reference or throws
Args : $id(s), $string
=cut
sub _check_id {
- my ($self, $ids) = @_;
-
- # NT contigs can not be retrieved
- $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
- "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
- if $ids =~ /NT_/;
-
- # Asking for a RefSeq from EMBL/GenBank
-
- if ($self->redirect_refseq) {
- if ($ids =~ /N._/) {
- $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
- " Redirecting the request.\n")
- if $self->verbose >= 0;
- return Bio::DB::RefSeq->new();
- }
- }
+ my ( $self, $ids ) = @_;
+
+ # NT contigs can not be retrieved
+ $self->throw("NT_ contigs are whole chromosome files which are not part of regular"
+ . "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
+ if $ids =~ /NT_/;
+
+ # Asking for a RefSeq from EMBL/GenBank
+ if ( $self->redirect_refseq ) {
+ if ( $ids =~ /N._/ ) {
+ $self->warn(
+ "[$ids] is not a normal sequence database but a RefSeq entry."
+ . " Redirecting the request.\n" )
+ if $self->verbose >= 0;
+ return Bio::DB::RefSeq->new();
+ }
+ }
}
+
=head2 delay_policy
Title : delay_policy
Usage : $secs = $self->delay_policy
- Function: return number of seconds to delay between calls to remote db
+ Function: NCBI requests a delay of 3 seconds between requests. This method
+ implements that policy.
Returns : number of seconds to delay
Args : none
- NOTE: NCBI requests a delay of 3 seconds between requests. This method
- implements that policy.
-
=cut
sub delay_policy {
- my $self = shift;
- return 3;
+ my $self = shift;
+ return $REQUEST_DELAY;
}
=head2 cookie
Title : cookie
Usage : ($cookie,$querynum) = $db->cookie
- Function: return the NCBI query cookie
+ Function: return the NCBI query cookie; this information is used by
+ Bio::DB::GenBank in conjunction with efetch (code copied from
+ Bio::DB::Query::GenBank)
Returns : list of (cookie,querynum)
Args : none to get; ($cookie, $querynum) to set
-NOTE: this information is used by Bio::DB::GenBank in
-conjunction with efetch.
-
=cut
-# ripped from Bio::DB::Query::GenBank
sub cookie {
- my $self = shift;
- if (@_) {
- $self->{'_cookie'} = shift;
- $self->{'_querynum'} = shift;
- }
- else {
- return @{$self}{qw(_cookie _querynum)};
- }
+ my $self = shift;
+ if (@_) {
+ $self->{'_cookie'} = shift;
+ $self->{'_querynum'} = shift;
+ }
+ else {
+ return @{$self}{qw(_cookie _querynum)};
+ }
}
=head2 _parse_response
Title : _parse_response
Usage : $db->_parse_response($content)
- Function: parse out response for cookie
+ Function: parse the response for the cookie; this is a trimmed-down version
+ of _parse_response from Bio::DB::Query::GenBank
Returns : empty
Args : $content, the response text to parse
Throws : 'unparseable output exception'
=cut
-# trimmed-down version of _parse_response from Bio::DB::Query::GenBank
sub _parse_response {
- my $self = shift;
- my $content = shift;
- if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
- $self->warn("Warning(s) from GenBank: $warning\n");
- }
- if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
- $self->throw("Error from Genbank: $error");
- }
- my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
- my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
- $self->cookie(uri_unescape($cookie),$querykey);
+ my $self = shift;
+ my $content = shift;
+ if ( my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s ) {
+ $self->warn("Warning(s) from GenBank: $warning\n");
+ }
+ if ( my ($error) = $content =~ /<OutputMessage>([^<]+)/ ) {
+ $self->throw("Error from Genbank: $error");
+ }
+ my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
+ my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
+ $self->cookie( uri_unescape($cookie), $querykey );
}
-########### DEPRECATED!!!! ###########
-
=head2 no_redirect
Title : no_redirect
Usage : $db->no_redirect($content)
- Function: Used to indicate that Bio::DB::GenBank instance retrieves
+ Function: DEPRECATED - Used to indicate that Bio::DB::GenBank instance retrieves
possible RefSeqs from EBI instead; default behavior is now to
retrieve directly from NCBI
Returns : None
Please sign in to comment.
Something went wrong with that request. Please try again.