diff --git a/Bio/Align/DNAStatistics.pm b/Bio/Align/DNAStatistics.pm
index 18b488b750..892793194c 100644
--- a/Bio/Align/DNAStatistics.pm
+++ b/Bio/Align/DNAStatistics.pm
@@ -73,21 +73,37 @@ in brackets are the pattern which will match
 
 =over 3
 
-=item JukesCantor [jc|jukes|jukescantor|jukes-cantor]
+=item *
 
-=item Uncorrected [jcuncor|uncorrected]
+JukesCantor [jc|jukes|jukescantor|jukes-cantor]
 
-=item F81 [f81|felsenstein]
+=item *
 
-=item Kimura [k2|k2p|k80|kimura]
+Uncorrected [jcuncor|uncorrected]
 
-=item Tamura [t92|tamura|tamura92]
+=item *
 
-=item F84 [f84|felsenstein84]
+F81 [f81|felsenstein]
 
-=item TajimaNei [tajimanei|tajima\-nei]
+=item *
 
-=item JinNei [jinnei|jin\-nei] (not implemented)
+Kimura [k2|k2p|k80|kimura]
+
+=item *
+
+Tamura [t92|tamura|tamura92]
+
+=item *
+
+F84 [f84|felsenstein84]
+
+=item *
+
+TajimaNei [tajimanei|tajima\-nei]
+
+=item *
+
+JinNei [jinnei|jin\-nei] (not implemented)
 
 =back
 
@@ -104,7 +120,7 @@ several pre-requisites for the alignment.
 
 =item 1
 
 DNA alignment must be based on protein alignment. Use the subroutine
-L in Bio::Align::Utilities to achieve this.
+L to achieve this.
 
 =item 2
@@ -140,49 +156,53 @@ comparisons in an MSA. The statistics returned are:
 
 =over 3
 
-=item S_d
+=item *
 
-Number of synonymous mutations between the 2 sequences.
+S_d - Number of synonymous mutations between the 2 sequences.
 
-=item N_d
+=item *
 
-Number of non-synonymous mutations between the 2 sequences.
+N_d - Number of non-synonymous mutations between the 2 sequences.
 
-=item S
+=item *
 
-Mean number of synonymous sites in both sequences.
+S - Mean number of synonymous sites in both sequences.
 
-=item N
+=item *
 
-mean number of synonymous sites in both sequences.
+N - mean number of synonymous sites in both sequences.
 
-=item P_s
+=item *
 
-proportion of synonymous differences in both sequences given by P_s = S_d/S.
+P_s - proportion of synonymous differences in both sequences given by
+P_s = S_d/S.
 
-=item P_n
+=item *
 
-proportion of non-synonymous differences in both sequences given by P_n = S_n/S.
+P_n - proportion of non-synonymous differences in both sequences given
+by P_n = S_n/S.
 
-=item D_s
+=item *
 
-estimation of synonymous mutations per synonymous site (by Jukes-Cantor).
+D_s - estimation of synonymous mutations per synonymous site (by
+Jukes-Cantor).
 
-=item D_n
+=item *
 
-estimation of non-synonymous mutations per non-synonymous site (by Jukes-Cantor).
+D_n - estimation of non-synonymous mutations per non-synonymous site (by
+Jukes-Cantor).
 
-=item D_n_var
+=item *
 
-estimation of variance of D_n .
+D_n_var - estimation of variance of D_n .
 
-=item D_s_var
+=item *
 
-estimation of variance of S_n.
+D_s_var - estimation of variance of S_n.
 
-=item z_value
+=item *
 
-calculation of z value.Positive value indicates D_n E<gt> D_s,
+z_value - calculation of z value.Positive value indicates D_n E<gt> D_s,
 negative value indicates D_s E<gt> D_n.
 
 =back
@@ -191,25 +211,25 @@ The statistics returned by calc_average_KaKs are:
 
 =over 3
 
-=item D_s
+=item *
 
-Average number of synonymous mutations/synonymous site.
+D_s - Average number of synonymous mutations/synonymous site.
 
-=item D_n
+=item *
 
-Average number of non-synonymous mutations/non-synonymous site.
+D_n - Average number of non-synonymous mutations/non-synonymous site.
 
-=item D_s_var
+=item *
 
-Estimated variance of Ds from bootstrapped alignments.
+D_s_var - Estimated variance of Ds from bootstrapped alignments.
 
-=item D_n_var
+=item *
 
-Estimated variance of Dn from bootstrapped alignments.
+D_n_var - Estimated variance of Dn from bootstrapped alignments.
 
-=item z_score
+=item *
 
-calculation of z value. Positive value indicates D_n E<gt>D_s,
+z_score - calculation of z value. Positive value indicates D_n E<gt>D_s,
 negative values vice versa.
 
 =back
@@ -222,7 +242,6 @@ the book, and reproduce those results.
 If people like having this sort of analysis in BioPerl other methods
 for estimating Ds and Dn can be provided later.
 
-
 Much of the DNA distance code is based on implementations in EMBOSS
 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein
 et al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
@@ -232,26 +251,36 @@ Mitchison.
 
 =over 3
 
-=item D_JukesCantor
+=item *
+
+D_JukesCantor
 
 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
 Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
 1969, pp. 21-132.
 
-=item D_Tamura
+=item *
+
+D_Tamura
 
 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
 
-=item D_Kimura
+=item *
+
+D_Kimura
 
 M Kimura, J. Mol. Evol., 1980, 16, 111.
 
-=item JinNei
+=item *
+
+JinNei
 
 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
 
-=item D_TajimaNei
+=item *
+
+D_TajimaNei
 
 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
 
diff --git a/Bio/AnalysisI.pm b/Bio/AnalysisI.pm
index e62de7db7a..8c10ea5b1b 100644
--- a/Bio/AnalysisI.pm
+++ b/Bio/AnalysisI.pm
@@ -198,8 +198,8 @@ sub describe { shift->throw_not_implemented(); }
 The analysis input data are named, and can be also associated with a
 default value, with allowed values and with few other attributes. The
 names are important for feeding the service with the input data (the
-inputs are given to methods C, C, and/or C
-as name/value pairs).
+inputs are given to methods C, C, and/or
+C as name/value pairs).
 
 Here is a (slightly shortened) example of an input specification:
 
@@ -324,8 +324,8 @@ tool.
 
 Call this method if you wish to "stage the scene" - to create a job
 with all input data but without actually running it. This method is
-called automatically from other methods (C and C) so
-usually you do not need to call it directly.
+called automatically from other methods (C and
+C) so usually you do not need to call it directly.
 
 The input data and prameters for this execution can be specified in
 various ways:
@@ -459,7 +459,7 @@ sub id { shift->throw_not_implemented(); }
 
 # -----------------------------------------------------------------------------
 
-=head2 run
+=head2 Bio::AnalysisI::JobI::run
 
  Usage   : $job->run
  Returns : itself
@@ -467,8 +467,8 @@ sub id { shift->throw_not_implemented(); }
 
 It starts previously created job. The job already must have all input
 data filled-in. This differs from the method of the same name of the
-C object where the C method creates
-also a new job allowing to set input data.
+C object where the C method
+creates also a new job allowing to set input data.
=cut @@ -476,7 +476,7 @@ sub run { shift->throw_not_implemented(); } # ----------------------------------------------------------------------------- -=head2 wait_for +=head2 Bio::AnalysisI::JobI::wait_for Usage : $job->wait_for Returns : itself diff --git a/Bio/Assembly/Tools/ContigSpectrum.pm b/Bio/Assembly/Tools/ContigSpectrum.pm index 9c77ba27df..f5910e32fe 100644 --- a/Bio/Assembly/Tools/ContigSpectrum.pm +++ b/Bio/Assembly/Tools/ContigSpectrum.pm @@ -785,7 +785,7 @@ sub average { } -=head2 average +=head2 score Title : score Usage : my $score = $csp->score(); diff --git a/Bio/DB/GFF.pm b/Bio/DB/GFF.pm index a5ec74b9ea..71e10b3b8b 100644 --- a/Bio/DB/GFF.pm +++ b/Bio/DB/GFF.pm @@ -94,7 +94,9 @@ directory under a subdirectory named Bio::DB::GFF: =over 4 -=item bp_load_gff.pl +=item * + +bp_load_gff.pl This script will load a Bio::DB::GFF database from a flat GFF file of sequence annotations. Only the relational database version of @@ -108,7 +110,9 @@ for most of their functionality. load_gff.pl also has a --upgrade option, which will perform a non-destructive upgrade of older schemas to newer ones. -=item bp_bulk_load_gff.pl +=item * + +bp_bulk_load_gff.pl This script will populate a Bio::DB::GFF database from a flat GFF file of sequence annotations. Only the MySQL database version of @@ -120,7 +124,9 @@ This script takes a --fasta argument to load raw DNA into the database as well. However, GFF databases do not require access to the raw DNA for most of their functionality. -=item bp_fast_load_gff.pl +=item * + +bp_fast_load_gff.pl This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe tricks to allow for incremental updates. It only supports the MySQL @@ -129,13 +135,17 @@ non-Unix platforms. Arguments are the same as bp_load_gff.pl -=item gadfly_to_gff.pl +=item * + +gadfly_to_gff.pl This script will convert the GFF-like format used by the Berkeley Drosophila Sequencing project into a format suitable for use with this module. -=item sgd_to_gff.pl +=item * + +sgd_to_gff.pl This script will convert the tab-delimited feature files used by the Saccharomyces Genome Database into a format suitable for use with this @@ -155,13 +165,17 @@ The 9 columns are as follows: =over 4 -=item 1. reference sequence +=item 1. + +reference sequence This is the ID of the sequence that is used to establish the coordinate system of the annotation. In the example above, the reference sequence is "Chr1". -=item 2. source +=item 2. + +source The source of the annotation. This field describes how the annotation was derived. In the example above, the source is "curated" to @@ -169,22 +183,30 @@ indicate that the feature is the result of human curation. The names and versions of software programs are often used for the source field, as in "tRNAScan-SE/1.2". -=item 3. method +=item 3. + +method The annotation method. This field describes the type of the annotation, such as "CDS". Together the method and source describe the annotation type. -=item 4. start position +=item 4. + +start position The start of the annotation relative to the reference sequence. -=item 5. stop position +=item 5. + +stop position The stop of the annotation relative to the reference sequence. Start is always less than or equal to stop. -=item 6. score +=item 6. + +score For annotations that are associated with a numeric score (for example, a sequence similarity), this field describes the score. The score @@ -192,20 +214,26 @@ units are completely unspecified, but for sequence similarities, it is typically percent identity. 
Annotations that don't have a score can use "." -=item 7. strand +=item 7. + +strand For those annotations which are strand-specific, this field is the strand on which the annotation resides. It is "+" for the forward strand, "-" for the reverse strand, or "." for annotations that are not stranded. -=item 8. phase +=item 8. + +phase For annotations that are linked to proteins, this field describes the phase of the annotation on the codons. It is a number from 0 to 2, or "." for features that have no phase. -=item 9. group +=item 9. + +group GFF provides a simple way of generating annotation hierarchies ("is composed of" relationships) by providing a group field. The group @@ -315,13 +343,17 @@ specifying which tag to group on: =over 4 -=item Using -preferred_groups +=item * + +Using -preferred_groups When you create a Bio::DB::GFF object, pass it a -preferred_groups=E argument. This specifies a tag that will be used for grouping. You can pass an array reference to specify a list of such tags. -=item In the GFF header +=item * + +In the GFF header The GFF file itself can specify which tags are to be used for grouping. Insert a comment like the following: @@ -409,7 +441,9 @@ it adaptable to use with a variety of databases. =over 4 -=item Adaptors +=item * + +Adaptors The core of the module handles the user API, annotation coordinate arithmetic, and other common issues. The details of fetching @@ -441,7 +475,9 @@ There are currently five adaptors recommended for general use: Check the Bio/DB/GFF/Adaptor directory and subdirectories for other, more specialized adaptors, as well as experimental ones. -=item Aggregators +=item * + +Aggregators The GFF format uses a "group" field to indicate aggregation properties of individual features. For example, a set of exons and introns may @@ -513,7 +549,7 @@ has some limitations. =over 4 -=item 1. GFF version string is required +=item GFF version string is required The GFF file B contain the version comment: @@ -523,7 +559,7 @@ Unless this version string is present at the top of the GFF file, the loader will attempt to parse the file in GFF2 format, with less-than-desirable results. -=item 2. Only one level of nesting allowed +=item Only one level of nesting allowed A major restriction is that Bio::DB::GFF only allows one level of nesting of features. For nesting, the Target tag will be used @@ -1742,27 +1778,37 @@ This method takes a single overloaded argument, which can be any of: =over 4 -=item 1. a scalar corresponding to a GFF file on the system +=item * + +a scalar corresponding to a GFF file on the system A pathname to a local GFF file. Any files ending with the .gz, .Z, or .bz2 suffixes will be transparently decompressed with the appropriate command-line utility. -=item 2. an array reference containing a list of GFF files on the system +=item * + +an array reference containing a list of GFF files on the system For example ['/home/gff/gff1.gz','/home/gff/gff2.gz'] -=item 3. directory path +=item * + +directory path The indicated directory will be searched for all files ending in the suffixes .gff, .gff.gz, .gff.Z or .gff.bz2. -=item 4. filehandle +=item * + +filehandle An open filehandle from which to read the GFF data. Tied filehandles now work as well. -=item 5. a pipe expression +=item * + +a pipe expression A pipe expression will also work. For example, a GFF file on a remote web server can be loaded with an expression like this: @@ -1837,27 +1883,37 @@ This method takes a single overloaded argument, which can be any of: =over 4 -=item 1. 
scalar corresponding to a FASTA file on the system +=item * + +scalar corresponding to a FASTA file on the system A pathname to a local FASTA file. Any files ending with the .gz, .Z, or .bz2 suffixes will be transparently decompressed with the appropriate command-line utility. -=item 2. array reference containing a list of FASTA files on the +=item * + +array reference containing a list of FASTA files on the system For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz'] -=item 3. path to a directory +=item * + +path to a directory The indicated directory will be searched for all files ending in the suffixes .fa, .fa.gz, .fa.Z or .fa.bz2. -a=item 4. filehandle +=item * + +filehandle An open filehandle from which to read the FASTA data. -=item 5. pipe expression +=item * + +pipe expression A pipe expression will also work. For example, a FASTA file on a remote web server can be loaded with an expression like this: @@ -3775,7 +3831,6 @@ fixed. =head1 SEE ALSO -L, L, L, L, diff --git a/Bio/DB/GFF/Aggregator.pm b/Bio/DB/GFF/Aggregator.pm index 6de9519a9d..d9162d7915 100644 --- a/Bio/DB/GFF/Aggregator.pm +++ b/Bio/DB/GFF/Aggregator.pm @@ -39,20 +39,26 @@ Instances of Bio::DB::GFF::Aggregator have three attributes: =over 3 -=item method +=item * + +method This is the GFF method field of the composite feature as a whole. For example, "transcript" may be used for a composite feature created by aggregating individual intron, exon and UTR features. -=item main method +=item * + +main method Sometimes GFF groups are organized hierarchically, with one feature logically containing another. For example, in the C. elegans schema, methods of type "Sequence:curated" correspond to regions covered by curated genes. There can be zero or one main methods. -=item subparts +=item * + +subparts This is a list of one or more methods that correspond to the component features of the aggregates. For example, in the C. elegans database, @@ -65,14 +71,18 @@ subclasses: =over 4 -=item disaggregate() +=item * + +disaggregate() This method is called by the Adaptor object prior to fetching a list of features. The method is passed an associative array containing the [method,source] pairs that the user has requested, and it returns a list of raw features that it would like the adaptor to fetch. -=item aggregate() +=item * + +aggregate() This method is called by the Adaptor object after it has fetched features. The method is passed a list of raw features and is expected @@ -86,15 +96,21 @@ case, it suffices for subclasses to override the following methods: =over 4 -=item method() +=item * + +method() Return the default method for the composite feature as a whole. -=item main_name() +=item * + +main_name() Return the default main method name. -=item part_names() +=item * + +part_names() Return a list of subpart method names. 
diff --git a/Bio/DB/HIV/HIVQueryHelper.pm b/Bio/DB/HIV/HIVQueryHelper.pm index d22f5a3f2e..67922d0334 100755 --- a/Bio/DB/HIV/HIVQueryHelper.pm +++ b/Bio/DB/HIV/HIVQueryHelper.pm @@ -95,7 +95,7 @@ BEGIN { =head2 HIVSchema - objects/methods to manipulate a version of the LANL HIV DB schema -=head3 SYNOPSIS +=head3 HIVSchema SYNOPSIS $schema = new HIVSchema( 'lanl-schema.xml' ); @tables = $schema->tables; @@ -109,7 +109,7 @@ BEGIN { $table = $schema->tablepart('SEQ_SAMple.SSAM_badseq'); # returns 'SEQ_SAMple' $column = $schema->columnpart('SEQ_SAMple.SSAM_badseq'); # returns 'SSAM_badseq' -=head3 DESCRIPTION +=head3 HIVSchema DESCRIPTION HIVSchema methods are used in L for table, column, primary/foreign key manipulations based on the observed Los @@ -131,9 +131,9 @@ use strict; ### constructor -=head3 CONSTRUCTOR +=head3 HIVSchema CONSTRUCTOR -=head4 new +=head4 HIVSchema::new Title : new Usage : $schema = new HIVSchema( "lanl-schema.xml "); @@ -157,9 +157,9 @@ sub new { ### object methods -=head3 INSTANCE METHODS +=head3 HIVSchema INSTANCE METHODS -=head4 tables +=head4 HIVSchema tables Title : tables Usage : $schema->tables() @@ -186,7 +186,7 @@ sub tables { return @k; } -=head4 columns +=head4 HIVSchema columns Title : columns Usage : $schema->columns( [$tablename] ); @@ -218,7 +218,7 @@ sub columns { return @k; } -=head4 fields +=head4 HIVSchema fields Title : fields Usage : $schema->fields(); @@ -238,7 +238,7 @@ sub fields { return @k; } -=head4 options +=head4 HIVSchema options Title : options Usage : $schema->options(@fieldnames) @@ -259,7 +259,7 @@ sub options { return $$sref{$sfield}{option} ? @{$$sref{$sfield}{option}} : (); } -=head4 aliases +=head4 HIVSchema aliases Title : aliases Usage : $schema->aliases(@fieldnames) @@ -286,7 +286,7 @@ sub aliases { } } -=head4 ankh +=head4 HIVSchema ankh Title : ankh (annotation key hash) Usage : $schema->ankh(@fieldnames) @@ -314,7 +314,7 @@ sub ankh { return %ret; } -=head4 tablepart +=head4 HIVSchema tablepart Title : tablepart (alias: tbl) Usage : $schema->tbl(@fieldnames) @@ -353,7 +353,7 @@ sub tbl { shift->tablepart(@_); } -=head4 columnpart +=head4 HIVSchema columnpart Title : columnpart (alias: col) Usage : $schema->col(@fieldnames) @@ -382,7 +382,7 @@ sub col { shift->columnpart(@_); } -=head4 primarykey +=head4 HIVSchema primarykey Title : primarykey [alias: pk] Usage : $schema->pk(@tablenames); @@ -416,7 +416,7 @@ sub pk { shift->primarykey(@_); } -=head4 foreignkey +=head4 HIVSchema foreignkey Title : foreignkey [alias: fk] Usage : $schema->fk($intable [, $totable]) @@ -461,7 +461,7 @@ sub fk { shift->foreignkey(@_); } -=head4 foreigntable +=head4 HIVSchema foreigntable Title : foreigntable [alias ftbl] Usage : $schema->ftbl( @foreign_key_fieldnames ); @@ -495,7 +495,7 @@ sub ftbl { shift->foreigntable(@_); } -=head4 find_join +=head4 HIVSchema find_join Title : find_join Usage : $sch->find_join('Table1', 'Table2') @@ -527,7 +527,7 @@ sub find_join { } } -=head4 _find_join_guts +=head4 HIVSchema _find_join_guts Title : _find_join_guts Usage : $sch->_find_join_guts($table1, $table2, $stackref, \$found, $reverse) @@ -610,7 +610,7 @@ sub _find_join_guts { } } -=head4 loadSchema +=head4 HIVSchema loadSchema Title : loadHIVSchema [alias: loadSchema] Usage : $schema->loadSchema( $XMLfilename ) @@ -686,7 +686,7 @@ sub loadSchema { # below, dangerous -=head4 _sfieldh +=head4 HIVSchema _sfieldh Title : _sfieldh Usage : $schema->_sfieldh($fieldname) @@ -708,7 +708,7 @@ sub _sfieldh { =head2 Class QRY - a query algebra for HIVQuery -=head3 
SYNOPSIS +=head3 QRY SYNOPSIS $Q = new QRY( new R( @@ -729,7 +729,7 @@ sub _sfieldh { $Q3 = QRY::Or($Q, $Q2); print $Q3->A; # prints '(CCR5 CXCR4)[coreceptor] (ZA)[country]' -=head3 DESCRIPTION +=head3 QRY DESCRIPTION The QRY package provides a query parser for L. Currently, the parser supports AND, OR, @@ -823,9 +823,7 @@ use overload # QRY object will be translated into (possibly multiple) hashes # conforming to HIVQuery parameter requirements. -=head3 CLASS METHODS - -=head4 _make_q +=head4 QRY _make_q Title : _make_q Usage : QRY::_make_q($parsetree) @@ -862,7 +860,7 @@ sub _make_q { return @dbq; } -=head4 _make_q_guts +=head4 QRY _make_q_guts Title : _make_q_guts (Internal class method) Usage : _make_q_guts($ptree, $q_expr, $qarry, $anarry) @@ -974,7 +972,7 @@ sub _make_q_guts { : return 1; } -=head4 _parse_q +=head4 QRY _parse_q Title : _parse_q Usage : QRY::_parse_q($query_string) @@ -1045,7 +1043,7 @@ sub _parse_q { ## QRY constructor -=head3 CONSTRUCTOR +=head3 QRY CONSTRUCTOR =head4 QRY Constructor @@ -1070,9 +1068,9 @@ sub new { ## QRY instance methods -=head3 INSTANCE METHODS +=head3 QRY INSTANCE METHODS -=head4 requests +=head4 QRY requests Title : requests Usage : $QRY->requests @@ -1089,7 +1087,7 @@ sub requests { return @{$self->{'requests'}}; } -=head4 put_requests +=head4 QRY put_requests Title : put_requests Usage : $QRY->put_request(@R) @@ -1110,7 +1108,7 @@ sub put_requests { return @args; } -=head4 isnull +=head4 QRY isnull Title : isnull Usage : $QRY->isnull @@ -1126,7 +1124,7 @@ sub isnull { return ($self->requests) ? 0 : 1; } -=head4 A +=head4 QRY A Title : A Usage : print $QRY->A @@ -1142,7 +1140,7 @@ sub A { return join( "\n", map {$_->A} $self->requests ); } -=head4 len +=head4 QRY len Title : len Usage : $QRY->len @@ -1158,7 +1156,7 @@ sub len { return scalar @{$self->{'requests'}}; } -=head4 clone +=head4 QRY clone Title : clone Usage : $QRY2 = $QRY1->clone; @@ -1181,9 +1179,9 @@ sub clone { ## QRY class methods -=head3 CLASS METHODS +=head3 QRY CLASS METHODS -=head4 Or +=head4 QRY Or Title : Or Usage : $QRY3 = QRY::Or($QRY1, $QRY2) @@ -1237,7 +1235,7 @@ sub Or { return new QRY( @ret_rq ); } -=head4 And +=head4 QRY And Title : And Usage : $QRY3 = QRY::And($QRY1, $QRY2) @@ -1268,7 +1266,7 @@ sub And { return new QRY( @ret_rq ); } -=head4 Bool +=head4 QRY Bool Title : Bool Usage : QRY::Bool($QRY1) @@ -1285,7 +1283,7 @@ sub Bool { return $q->isnull ? 0 : 1; } -=head4 Eq +=head4 QRY Eq Title : Eq Usage : QRY::Eq($QRY1, $QRY2) @@ -1319,7 +1317,7 @@ sub Eq { =head2 Class R - request objects for QRY algebra -=head3 SYNOPSIS +=head3 R SYNOPSIS $R = new R( $q1, $q2 ); $R->put_atoms($q3); @@ -1334,7 +1332,7 @@ sub Eq { QRY::Eq( new QRY(R::Or($R1, $R2)), new QRY($R1, $R2) ); # returns 1 R::In( (R::And($R1, $R2))[0], $R1 ); # returns 1 -=head3 DESCRIPTION +=head3 R DESCRIPTION Class R objects contain a list of atomic queries (class Q objects). Each class R object represents a single HTTP request to the @@ -1350,7 +1348,7 @@ $R::NULL = new R(); ## R constructor -=head3 CONSTRUCTOR +=head3 R CONSTRUCTOR =head4 R constructor @@ -1375,9 +1373,9 @@ sub new { ## R instance methods -=head3 INSTANCE METHODS +=head3 R INSTANCE METHODS -=head4 len +=head4 R len Title : len Usage : $R->len @@ -1393,7 +1391,7 @@ sub len { return scalar @{[keys %{$self->{'atoms'}}]}; } -=head4 atoms +=head4 R atoms Title : atoms Usage : $R->atoms( [optional $field]) @@ -1415,7 +1413,7 @@ sub atoms { return wantarray ? 
map { $self->{'atoms'}->{$_} } @flds : $self->{'atoms'}->{$flds[0]}; } -=head4 fields +=head4 R fields Title : fields Usage : $R->fields @@ -1431,7 +1429,7 @@ sub fields { return keys %{$self->{'atoms'}}; } -=head4 put_atoms +=head4 R put_atoms Title : put_atoms Usage : $R->put_atoms( @q ) @@ -1465,7 +1463,7 @@ sub put_atoms { return; } -=head4 del_atoms +=head4 R del_atoms Title : del_atoms Usage : $R->del_atoms( @qfields ) @@ -1490,7 +1488,7 @@ sub del_atoms { return @ret; } -=head4 isnull +=head4 R isnull Title : isnull Usage : $R->isnull @@ -1506,7 +1504,7 @@ sub isnull { return ($self->len) ? 0 : 1; } -=head4 A +=head4 R A Title : A Usage : print $R->A @@ -1523,7 +1521,7 @@ sub A { return join(" ", map {$_->A} @a); } -=head4 clone +=head4 R clone Title : clone Usage : $R2 = $R1->clone; @@ -1546,9 +1544,9 @@ sub clone { ## R class methods -=head3 CLASS METHODS +=head3 R CLASS METHODS -=head4 In +=head4 R In Title : In Usage : R::In($R1, $R2) @@ -1578,7 +1576,7 @@ sub In { return 1; } -=head4 And +=head4 R And Title : And Usage : @Rresult = R::And($R1, $R2) @@ -1624,7 +1622,7 @@ sub And { } -=head4 Or +=head4 R Or Title : Or Usage : @Rresult = R::Or($R1, $R2) @@ -1672,7 +1670,7 @@ sub Or { } -=head4 Eq +=head4 R Eq Title : Eq Usage : R::Eq($R1, $R2) @@ -1703,7 +1701,7 @@ sub Eq { =head2 Class Q - atomic query objects for QRY algebra -=head3 SYNOPSIS +=head3 Q SYNOPSIS $q = new Q('coreceptor', 'CXCR4 CCR5'); $u = new Q('coreceptor', 'CXCR4'); @@ -1715,7 +1713,7 @@ sub Eq { Q::qin($u, $q) # returns 1 Q::qeq(Q::qand($u, $q), $u ); # returns 1 -=head3 DESCRIPTION +=head3 Q DESCRIPTION Class Q objects represent atomic queries, that can be described by a single LANL cgi parameter=value pair. Class R objects (requests) are @@ -1731,7 +1729,7 @@ $Q::NULL = new Q(); ## Q constructor -=head3 CONSTRUCTOR +=head3 Q CONSTRUCTOR =head4 Q constructor @@ -1758,9 +1756,9 @@ sub new { ## Q instance methods -=head3 INSTANCE METHODS +=head3 Q INSTANCE METHODS -=head4 isnull +=head4 Q isnull Title : isnull Usage : $q->isnull @@ -1778,7 +1776,7 @@ sub isnull { return 0; } -=head4 fld +=head4 Q fld Title : fld Usage : $q->fld($field) @@ -1802,7 +1800,7 @@ sub fld { } -=head4 dta +=head4 Q dta Title : dta Usage : $q->dta($data) @@ -1825,7 +1823,7 @@ sub dta { return $self->{dta}; } -=head4 A +=head4 Q A Title : A Usage : print $q->A @@ -1844,7 +1842,7 @@ sub A { return "(".join(' ', sort {$a cmp $b} @a).")[".$self->fld."]"; } -=head4 clone +=head4 Q clone Title : clone Usage : $q2 = $q1->clone; @@ -1864,9 +1862,9 @@ sub clone { ### Q class methods -=head3 CLASS METHODS +=head3 Q CLASS METHODS -=head4 qin +=head4 Q qin Title : qin Usage : Q::qin($q1, $q2) @@ -1885,7 +1883,7 @@ sub qin { return Q::qeq( $b, Q::qor($a, $b) ); } -=head4 qeq +=head4 Q qeq Title : qeq Usage : Q::qeq($q1, $q2) @@ -1909,7 +1907,7 @@ sub qeq { return @cd == @bd; } -=head4 qor +=head4 Q qor Title : qor Usage : @qresult = Q::qor($q1, $q2) @@ -1941,7 +1939,7 @@ sub qor { return @ret; } -=head4 qand +=head4 Q qand Title : qand Usage : @qresult = Q::And($q1, $q2) @@ -1992,9 +1990,9 @@ sub qand { } } -=head3 INTERNALS +=head3 Q INTERNALS -=head4 unique +=head4 Q unique Title : unique Usage : @ua = unique(@a) @@ -2016,7 +2014,7 @@ sub unique { =head2 Additional tools for Bio::AnnotationCollectionI -=head3 SYNOPSIS +=head3 Bio::AnnotationCollectionI SYNOPSIS (additional methods) $seq->annotation->put_value('patient_id', 1401) $seq->annotation->get_value('patient_ids') # returns 1401 @@ -2027,9 +2025,11 @@ sub unique { $blood_readings{$_} = 
$seq->annonation->get_value(['clinical', $_]); } -=head3 DESCRIPTION +=head3 Bio::AnnotationCollectionI DESCRIPTION (additional methods) -C and C allow easy creation of and access to an annotation collection tree with nodes of L. These methods obiviate direct accession of the SimpleValue objects. +C and C allow easy creation of and access to an +annotation collection tree with nodes of L. These +methods obiviate direct accession of the SimpleValue objects. =cut @@ -2082,7 +2082,8 @@ sub get_value { \@tagnames, $value (or as -KEYS=>\@tagnames, -VALUE=>$value ) Note : If intervening nodes do not exist, put_value creates them, replacing existing nodes. So if $ac->put_value('x', 10) was done, then later, - $ac->put_value(['x', 'y'], 20), the original value of 'x' is trashed, and $ac->get_value('x') will now return the annotation collection + $ac->put_value(['x', 'y'], 20), the original value of 'x' is trashed, + and $ac->get_value('x') will now return the annotation collection with tagname 'y'. =cut diff --git a/Bio/DB/SeqFeature/Store.pm b/Bio/DB/SeqFeature/Store.pm index d6b259fc88..2a87c7e3f7 100644 --- a/Bio/DB/SeqFeature/Store.pm +++ b/Bio/DB/SeqFeature/Store.pm @@ -114,18 +114,24 @@ with the following differences: =over 4 -=item 1. No limitation on Bio::SeqFeatureI implementations +=item 1. + +No limitation on Bio::SeqFeatureI implementations Unlike Bio::DB::GFF, Bio::DB::SeqFeature::Store works with any Bio::SeqFeatureI object. -=item 2. No limitation on nesting of features & subfeatures +=item 2. + +No limitation on nesting of features & subfeatures Bio::DB::GFF is limited to features that have at most one level of subfeature. Bio::DB::SeqFeature::Store can work with features that have unlimited levels of nesting. -=item 3. No aggregators +=item 3. + +No aggregators The aggregator architecture, which was necessary to impose order on the GFF2 files that Bio::DB::GFF works with, does not apply to @@ -133,7 +139,9 @@ Bio::DB::SeqFeature::Store. It is intended to store features that obey well-defined ontologies, such as the Sequence Ontology (http://song.sourceforge.net). -=item 4. No relative locations +=item 4. + +No relative locations All locations defined by this module are relative to an absolute sequence ID, unlike Bio::DB::GFF which allows you to define the @@ -2506,7 +2514,6 @@ use the BioPerl bug tracking system to report bugs. =head1 SEE ALSO -L, L, L, L, diff --git a/Bio/Index/Stockholm.pm b/Bio/Index/Stockholm.pm index 6740796519..08e3bd9801 100644 --- a/Bio/Index/Stockholm.pm +++ b/Bio/Index/Stockholm.pm @@ -172,10 +172,10 @@ sub fetch_report{ return $report->next_aln; } -=head2 fetch_report +=head2 fetch_aln - Title : fetch_report - Usage : my $align = $idx->fetch_report($id); + Title : fetch_aln + Usage : my $align = $idx->fetch_aln($id); Function: Returns a Bio::SimpleAlign object for a specific alignment Returns : Bio::SimpleAlign diff --git a/Bio/Root/IO.pm b/Bio/Root/IO.pm index 3042aa3059..079eed9674 100644 --- a/Bio/Root/IO.pm +++ b/Bio/Root/IO.pm @@ -98,6 +98,10 @@ web: Email hlapp@gmx.net +=head1 CONTRIBUTORS + +Mark A. Jensen ( maj -at- fortinbras -dot- us ) + =head1 APPENDIX The rest of the documentation details each of the object methods. 
Internal methods are usually preceded with a _ @@ -144,10 +148,10 @@ BEGIN { } eval { - require LWP::Simple; + require LWP::UserAgent; }; if( $@ ) { - print STDERR "Cannot load LWP::Simple: $@" if( $VERBOSE > 0 ); + print STDERR "Cannot load LWP::UserAgent: $@" if( $VERBOSE > 0 ); $HAS_LWP = 0; } else { $HAS_LWP = 1; @@ -251,6 +255,13 @@ sub new { -flush boolean flag to autoflush after each write -noclose boolean flag, when set to true will not close a filehandle (must explictly call close($io->_fh) + -retries number of times to try a web fetch before failure + + -ua_parms hashref of key => value parameters to pass + to LWP::UserAgent->new() + (only meaningful with -url is set) + A useful value might be, for example, + { timeout => 60 } (ua default is 180 sec) Returns : TRUE Args : named parameters @@ -262,27 +273,34 @@ sub _initialize_io { $self->_register_for_cleanup(\&_io_cleanup); - my ($input, $noclose, $file, $fh, $flush, $url) = $self->_rearrange([qw(INPUT - NOCLOSE - FILE FH - FLUSH URL)], @args); + my ($input, $noclose, $file, $fh, $flush, $url, + $retries, $ua_parms) = + $self->_rearrange([qw(INPUT + NOCLOSE + FILE + FH + FLUSH + URL + RETRIES + UA_PARMS)], @args); if($url){ - my $trymax = 5; + $retries ||= 5; - if($HAS_LWP){ #use LWP::Simple::getstore() - require LWP::Simple; - #$self->warn("has lwp"); + if($HAS_LWP){ #use LWP::UserAgent + require LWP::UserAgent; + my $ua = LWP::UserAgent->new(%$ua_parms); my $http_result; my($handle,$tempfile) = $self->tempfile(); CORE::close($handle); + - for(my $try = 1 ; $try <= $trymax ; $try++){ - $http_result = LWP::Simple::getstore($url, $tempfile); - $self->warn("[$try/$trymax] tried to fetch $url, but server threw $http_result. retrying...") if $http_result != 200; - last if $http_result == 200; + for(my $try = 1 ; $try <= $retries ; $try++){ + $http_result = $ua->get($url, ':content_file' => $tempfile); + $self->warn("[$try/$retries] tried to fetch $url, but server threw " . $http_result->code . ". retrying...") if !$http_result->is_success; + last if $http_result->is_success; } - $self->throw("failed to fetch $url, server threw $http_result") if $http_result != 200; + $self->throw("failed to fetch $url, server threw " . $http_result->code) if !$http_result->is_success; $input = $tempfile; $file = $tempfile; diff --git a/Bio/Search/HSP/ModelHSP.pm b/Bio/Search/HSP/ModelHSP.pm index bdf78f9c1f..28914d1f08 100644 --- a/Bio/Search/HSP/ModelHSP.pm +++ b/Bio/Search/HSP/ModelHSP.pm @@ -404,33 +404,6 @@ sub get_aln { return $aln; } -=head2 seq_inds - - Title : seq_inds - Purpose : Get a list of residue positions (indices) for all identical - : or conserved residues in the query or sbjct sequence. - Example : @s_ind = $hsp->seq_inds('query', 'identical'); - : @h_ind = $hsp->seq_inds('hit', 'conserved'); - : @h_ind = $hsp->seq_inds('hit', 'conserved', 1); - Returns : List of integers - : May include ranges if collapse is true. - Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query) - : ('sbjct' is synonymous with 'hit') - : class = 'identical' or 'conserved' or 'nomatch' or 'gap' - : (default = identical) - : (can be shortened to 'id' or 'cons') - : - : collapse = boolean, if true, consecutive positions are merged - : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" - : collapses to "1-5 7 9-11". This is useful for - : consolidating long lists. Default = no collapse. - Throws : n/a. 
- Comments : - -See Also : L, L - -=cut - =head2 Inherited from Bio::SeqFeature::SimilarityPair These methods come from Bio::SeqFeature::SimilarityPair @@ -488,7 +461,7 @@ These methods come from Bio::SeqFeature::SimilarityPair The following methods have been overridden due to their current reliance on sequence-based queries. They may be implemented in future versions of this class. -=head2 frac_identical +=head2 seq_inds =cut diff --git a/Bio/SeqIO/chadoxml.pm b/Bio/SeqIO/chadoxml.pm index 4beeed9cd5..9021b4649c 100644 --- a/Bio/SeqIO/chadoxml.pm +++ b/Bio/SeqIO/chadoxml.pm @@ -55,7 +55,7 @@ This is currently a write-only module. -seq_so_type=>'gene', -src_feature=>'X', -src_feat_type=>'chromosome_arm', - -nounflatten=>1, + -nounflatten=>1, -is_analysis=>'true', -data_source=>'GenBank'); @@ -80,64 +80,64 @@ containment hierarchy conforming to chado central dogma model: gene Destination of data in the subject Bio::Seq object $seq is as following: - *$seq->display_id: name of the top-level feature; + *$seq->display_id: name of the top-level feature; - *$seq->accession_number: if defined, uniquename and - feature_dbxref of the top-level - feature if not defined, - $seq->display_id is used as the - uniquename of the top-level feature; + *$seq->accession_number: if defined, uniquename and + feature_dbxref of the top-level + feature if not defined, + $seq->display_id is used as the + uniquename of the top-level feature; - *$seq->molecule: transformed to SO type, used as the feature - type of the top-level feature if -seq_so_type - argument is supplied, use the supplied SO type - as the feature type of the top-level feature; + *$seq->molecule: transformed to SO type, used as the feature + type of the top-level feature if -seq_so_type + argument is supplied, use the supplied SO type + as the feature type of the top-level feature; - *$seq->species: organism of the top-level feature; + *$seq->species: organism of the top-level feature; - *$seq->seq: residues of the top-level feature; + *$seq->seq: residues of the top-level feature; - *$seq->is_circular, $seq->division: feature_cvterm; + *$seq->is_circular, $seq->division: feature_cvterm; - *$seq->keywords, $seq->desc, comments: featureprop; + *$seq->keywords, $seq->desc, comments: featureprop; - *references: pub and feature_pub; - medline/pubmed ids: pub_dbxref; - comments: pubprop; + *references: pub and feature_pub; + medline/pubmed ids: pub_dbxref; + comments: pubprop; - *feature "source" span: featureloc for top-level feature; + *feature "source" span: featureloc for top-level feature; - *feature "source" db_xref: feature_dbxref for top-level feature; + *feature "source" db_xref: feature_dbxref for top-level feature; - *feature "source" other tags: featureprop for top-level feature; + *feature "source" other tags: featureprop for top-level feature; - *subfeature 'symbol' or 'label' tag: feature uniquename, if + *subfeature 'symbol' or 'label' tag: feature uniquename, if none of these is present, the chadoxml object generates feature uniquenames as: -- (e.g. 
foo-mRNA--1000..3000); - *gene model: feature_relationship built based on the + *gene model: feature_relationship built based on the containment hierarchy; - *feature span: featureloc; + *feature span: featureloc; - *feature accession numbers: feature_dbxref; + *feature accession numbers: feature_dbxref; - *feature tags (except db_xref, symbol and gene): featureprop; + *feature tags (except db_xref, symbol and gene): featureprop; Things to watch out for: - *chado schema change: this version works with the chado + *chado schema change: this version works with the chado version tagged chado_1_01 in GMOD CVS. - *feature uniquenames: especially important if using XORT + *feature uniquenames: especially important if using XORT loader to do incremental load into chado. may need pre-processing of the source data to put the correct uniquenames in place. - *pub uniquenames: chadoxml->write_seq() has the FlyBase policy + *pub uniquenames: chadoxml->write_seq() has the FlyBase policy on pub uniquenames hard-coded, it assigns pub uniquenames in the following way: for journals and books, use ISBN number; for @@ -147,7 +147,7 @@ Things to watch out for: implement your policy. look for the comments in the code. - *for pubs possibly existing in chado but with no knowledge of + *for pubs possibly existing in chado but with no knowledge of its uniquename:put "op" as "match", then need to run the output chadoxml through a special filter that talks to chado database and tries to find the @@ -160,9 +160,9 @@ Things to watch out for: case. please modify to work according to your rules. - *chado initialization for loading: + *chado initialization for loading: - cv & cvterm: in the output chadoxml, all cv's and + cv & cvterm: in the output chadoxml, all cv's and cvterm's are lookup only. 
Therefore, before using XORT loader to load the output into chado, chado must be @@ -247,29 +247,29 @@ undef(my %datahash); #data from Bio::Seq object stored in a hash my $chadotables = 'feature featureprop feature_relationship featureloc feature_cvterm cvterm cv feature_pub pub pub_dbxref pub_author author pub_relationship pubprop feature_dbxref dbxref db synonym feature_synonym'; my %fkey = ( - "cvterm.cv_id" => "cv", + "cvterm.cv_id" => "cv", "cvterm.dbxref_id" => "dbxref", - "dbxref.db_id" => "db", - "feature.type_id" => "cvterm", - "feature.organism_id" => "organism", - "feature.dbxref_id" => "dbxref", - "featureprop.type_id" => "cvterm", - "feature_pub.pub_id" => "pub", - "feature_cvterm.cvterm_id" => "cvterm", - "feature_cvterm.pub_id" => "pub", + "dbxref.db_id" => "db", + "feature.type_id" => "cvterm", + "feature.organism_id" => "organism", + "feature.dbxref_id" => "dbxref", + "featureprop.type_id" => "cvterm", + "feature_pub.pub_id" => "pub", + "feature_cvterm.cvterm_id" => "cvterm", + "feature_cvterm.pub_id" => "pub", "feature_cvterm.feature_id" => "feature", - "feature_dbxref.dbxref_id" => "dbxref", - "feature_relationship.object_id" => "feature", - "feature_relationship.subject_id" => "feature", - "feature_relationship.type_id" => "cvterm", - "featureloc.srcfeature_id" => "feature", - "pub.type_id" => "cvterm", - "pub_dbxref.dbxref_id" => "dbxref", - "pub_author.author_id" => "author", - "pub_relationship.obj_pub_id" => "pub", - "pub_relationship.subj_pub_id" => "pub", - "pub_relationship.type_id" => "cvterm", - "pubprop.type_id" => "cvterm", + "feature_dbxref.dbxref_id" => "dbxref", + "feature_relationship.object_id" => "feature", + "feature_relationship.subject_id" => "feature", + "feature_relationship.type_id" => "cvterm", + "featureloc.srcfeature_id" => "feature", + "pub.type_id" => "cvterm", + "pub_dbxref.dbxref_id" => "dbxref", + "pub_author.author_id" => "author", + "pub_relationship.obj_pub_id" => "pub", + "pub_relationship.subj_pub_id" => "pub", + "pub_relationship.type_id" => "cvterm", + "pubprop.type_id" => "cvterm", "feature_synonym.feature_id" => "feature", "feature_synonym.synonym_id" => "synonym", "feature_synonym.pub_id" => "pub", @@ -283,22 +283,22 @@ my %cv_name = ( ); my %feattype_args2so = ( - "aberr" => "aberration_junction", -# "conflict" => "sequence_difference", -# "polyA_signal" => "polyA_signal_sequence", - "variation" => "sequence_variant", - "mutation1" => "point_mutation", #for single-base mutation - "mutation2" => "sequence_variant", #for multi-base mutation - "rescue" => "rescue_fragment", -# "rfrag" => "restriction_fragment", - "protein_bind" => "protein_binding_site", - "misc_feature" => "region", -# "prim_transcript" => "primary_transcript", - "CDS" => "polypeptide", - "reg_element" => "regulatory_region", - "seq_variant" => "sequence_variant", - "mat_peptide" => "mature_peptide", - "sig_peptide" => "signal_peptide", + "aberr" => "aberration_junction", +# "conflict" => "sequence_difference", +# "polyA_signal" => "polyA_signal_sequence", + "variation" => "sequence_variant", + "mutation1" => "point_mutation", #for single-base mutation + "mutation2" => "sequence_variant", #for multi-base mutation + "rescue" => "rescue_fragment", +# "rfrag" => "restriction_fragment", + "protein_bind" => "protein_binding_site", + "misc_feature" => "region", +# "prim_transcript" => "primary_transcript", + "CDS" => "polypeptide", + "reg_element" => "regulatory_region", + "seq_variant" => "sequence_variant", + "mat_peptide" => "mature_peptide", + "sig_peptide" => 
"signal_peptide", ); undef(my %organism); @@ -328,99 +328,103 @@ sub _initialize { Title : write_seq Usage : $stream->write_seq(-seq=>$seq, -seq_so_type=>$seqSOtype, - -src_feature=>$srcfeature, - -src_feat_type=>$srcfeattype, - -nounflatten=>0 or 1, - -is_analysis=>'true' or 'false', - -data_source=>$datasource) + -src_feature=>$srcfeature, + -src_feat_type=>$srcfeattype, + -nounflatten=>0 or 1, + -is_analysis=>'true' or 'false', + -data_source=>$datasource) Function: writes the $seq object (must be seq) into chadoxml. - Current implementation: - 1. for non-mRNA records, - a top-level feature of type $seq->alphabet is - generated for the whole GenBank record, features listed - are unflattened for DNA records to build gene model - feature graph, and for the other types of records all - features in $seq are treated as subfeatures of the top-level - feature. - 2. for mRNA records, - if a 'gene' feature is present, it B have a /symbol - or /label tag to contain the uniquename of the gene. a top- - level feature of type 'gene' is generated. the mRNA is written - as a subfeature of the top-level gene feature, and the other - sequence features listed in $seq are treated as subfeatures - of the mRNA feature. Returns : 1 for success and 0 for error + Args : A Bio::Seq object $seq, optional $seqSOtype, $srcfeature, + $srcfeattype, $nounflatten, $is_analysis and $data_source. +When $srcfeature (a string, the uniquename of the source feature) is given, the +location and strand information of the top-level feature against the source +feature will be derived from the sequence feature called 'source' of the $seq +object, a featureloc record is generated for the top -level feature on +$srcfeature. when $srcfeature is given, $srcfeattype must also be present. All +feature coordinates in $seq should be against $srcfeature. $seqSOtype is the +optional SO term to use as the type of the top-level feature. For example, a +GenBank data file for a Drosophila melanogaster genome scaffold has the molecule +type of "DNA", when converting to chadoxml, a $seqSOtype argument of +"golden_path_region" can be supplied to save the scaffold as a feature of type +"golden_path_region" in chadoxml, instead of "DNA". a feature with primary tag +of 'source' must be present in the sequence feature list of $seq, to decribe the +whole sequence record. - Args : A Bio::Seq object $seq, optional $seqSOtype, $srcfeature, - $srcfeattype, $nounflatten, $is_analysis and $data_source. - when $srcfeature (a string, the uniquename of the source - feature) is given, the location and strand information of - the top-level feature against the source feature will be - derived from the sequence feature called 'source' of the - $seq object, a featureloc record is generated for the top - -level feature on $srcfeature. when $srcfeature is given, - $srcfeattype must also be present. All feature coordinates - in $seq should be against $srcfeature. $seqSOtype is the - optional SO term to use as the type of the top-level feature. - For example, a GenBank data file for a Drosophila melanogaster - genome scaffold has the molecule type of "DNA", when - converting to chadoxml, a $seqSOtype argument of - "golden_path_region" can be supplied to save the scaffold - as a feature of type "golden_path_region" in chadoxml, instead - of "DNA". a feature with primary tag of 'source' must be - present in the sequence feature list of $seq, to decribe the - whole sequence record. 
+In the current implementation: + +=over 3 + +=item * + +non-mRNA records + +A top-level feature of type $seq-Ealphabet is generated for the whole GenBank +record, features listed are unflattened for DNA records to build gene model +feature graph, and for the other types of records all features in $seq are +treated as subfeatures of the top-level feature. +=item * + +mRNA records + +If a 'gene' feature is present, it B have a /symbol or /label tag to +contain the uniquename of the gene. a top-level feature of type 'gene' is +generated. the mRNA is written as a subfeature of the top-level gene feature, +and the other sequence features listed in $seq are treated as subfeatures of the +mRNA feature. + +=back =cut sub write_seq { - my $usage = <write_seq() Usage : \$stream->write_seq(-seq=>\$seq, - -seq_so_type=>\$SOtype, - -src_feature=>\$srcfeature, - -src_feat_type=>\$srcfeattype, - -nounflatten=>0 or 1, + -seq_so_type=>\$SOtype, + -src_feature=>\$srcfeature, + -src_feat_type=>\$srcfeattype, + -nounflatten=>0 or 1, -is_analysis=>'true' or 'false', -data_source=>\$datasource) -Args : \$seq : a Bio::Seq object - \$SOtype : the SO term to use as the feature type of - the \$seq record, optional - \$srcfeature : unique name of the source feature, a string - containing at least one alphabetical letter - (a-z, A-Z), optional - \$srcfeattype : feature type of \$srcfeature. one of SO terms. - optional - when \$srcfeature is given, \$srcfeattype becomes mandatory, - \$datasource : source of the sequence annotation data, - e.g. 'GenBank' or 'GFF'. +Args : \$seq : a Bio::Seq object + \$SOtype : the SO term to use as the feature type of + the \$seq record, optional + \$srcfeature : unique name of the source feature, a string + containing at least one alphabetical letter + (a-z, A-Z), optional + \$srcfeattype : feature type of \$srcfeature. one of SO terms. + optional + when \$srcfeature is given, \$srcfeattype becomes mandatory, + \$datasource : source of the sequence annotation data, + e.g. 'GenBank' or 'GFF'. EOUSAGE - my ($self,@args) = @_; + my ($self,@args) = @_; - my ($seq, $seq_so_type, $srcfeature, $srcfeattype, $nounflatten, $isanalysis, $datasource, $genus, $species) = - $self->_rearrange([qw(SEQ - SEQ_SO_TYPE - SRC_FEATURE - SRC_FEAT_TYPE - NOUNFLATTEN - IS_ANALYSIS - DATA_SOURCE + my ($seq, $seq_so_type, $srcfeature, $srcfeattype, $nounflatten, $isanalysis, $datasource, $genus, $species) = + $self->_rearrange([qw(SEQ + SEQ_SO_TYPE + SRC_FEATURE + SRC_FEAT_TYPE + NOUNFLATTEN + IS_ANALYSIS + DATA_SOURCE GENUS SPECIES - )], - @args); - #print "$seq_so_type, $srcfeature, $srcfeattype\n"; + )], + @args); + #print "$seq_so_type, $srcfeature, $srcfeattype\n"; - if( !defined $seq ) { - $self->throw("Attempting to write with no seq!"); - } + if( !defined $seq ) { + $self->throw("Attempting to write with no seq!"); + } - if( ! ref $seq || ! $seq->isa('Bio::Seq::RichSeqI') ) { - ## FIXME $self->warn(" $seq is not a RichSeqI compliant module. Attempting to dump, but may fail!"); - } + if( ! ref $seq || ! $seq->isa('Bio::Seq::RichSeqI') ) { + ## FIXME $self->warn(" $seq is not a RichSeqI compliant module. 
Attempting to dump, but may fail!"); + } # try to get the srcfeature from the seqFeature object # for this to work, the user has to pass in the srcfeature type @@ -430,124 +434,124 @@ EOUSAGE } } - #$srcfeature, when provided, should contain at least one alphabetical letter - if (defined $srcfeature) - { - if ($srcfeature =~ /[a-zA-Z]/) - { - chomp($srcfeature); - } else { - $self->throw( $usage ); - } - - #check for mandatory $srcfeattype - if (! defined $srcfeattype) - { - $self->throw( $usage ); - #$srcfeattype must be a string of non-whitespace characters - } else { - if ($srcfeattype =~ /\S+/) { - chomp($srcfeattype); - } else { - $self->throw( $usage ); - } - } - } - - # variables local to write_seq() + #$srcfeature, when provided, should contain at least one alphabetical letter + if (defined $srcfeature) + { + if ($srcfeature =~ /[a-zA-Z]/) + { + chomp($srcfeature); + } else { + $self->throw( $usage ); + } + + #check for mandatory $srcfeattype + if (! defined $srcfeattype) + { + $self->throw( $usage ); + #$srcfeattype must be a string of non-whitespace characters + } else { + if ($srcfeattype =~ /\S+/) { + chomp($srcfeattype); + } else { + $self->throw( $usage ); + } + } + } + + # variables local to write_seq() my $div = undef; - my $hkey = undef; - undef(my @top_featureprops); + my $hkey = undef; + undef(my @top_featureprops); undef(my @featuresyns); undef(my @top_featurecvterms); - my $name = $seq->display_id if $seq->can('display_id'); + my $name = $seq->display_id if $seq->can('display_id'); $name = $seq->display_name if $seq->can('display_name'); - undef(my @feature_cvterms); - undef(my %sthash); - undef(my %dvhash); - undef(my %h1); - undef(my %h2); - my $temp = undef; - my $ann = undef; - undef(my @references); - undef(my @feature_pubs); - my $ref = undef; - my $location = undef; - my $fbrf = undef; - my $journal = undef; - my $issue = undef; - my $volume = undef; - my $volumeissue = undef; - my $pages = undef; - my $year = undef; - my $pubtype = undef; -# my $miniref= undef; - my $uniquename = undef; - my $refhash = undef; - my $feat = undef; - my $tag = undef; - my $tag_cv = undef; - my $ftype = undef; - my $subfeatcnt = undef; - undef(my @top_featrels); - undef (my %srcfhash); - - local($^W) = 0; # supressing warnings about uninitialized fields. + undef(my @feature_cvterms); + undef(my %sthash); + undef(my %dvhash); + undef(my %h1); + undef(my %h2); + my $temp = undef; + my $ann = undef; + undef(my @references); + undef(my @feature_pubs); + my $ref = undef; + my $location = undef; + my $fbrf = undef; + my $journal = undef; + my $issue = undef; + my $volume = undef; + my $volumeissue = undef; + my $pages = undef; + my $year = undef; + my $pubtype = undef; +# my $miniref= undef; + my $uniquename = undef; + my $refhash = undef; + my $feat = undef; + my $tag = undef; + my $tag_cv = undef; + my $ftype = undef; + my $subfeatcnt = undef; + undef(my @top_featrels); + undef (my %srcfhash); + + local($^W) = 0; # supressing warnings about uninitialized fields. 
if (!$name && $seq->can('attributes') ) { ($name) = $seq->attributes('Alias'); } - if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') { - $uniquename = $seq->accession_number; - } elsif ($seq->can('accession') && defined $seq->accession && $seq->accession ne 'unknown') { - $uniquename = $seq->accession; - } elsif ($seq->can('attributes')) { + if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') { + $uniquename = $seq->accession_number; + } elsif ($seq->can('accession') && defined $seq->accession && $seq->accession ne 'unknown') { + $uniquename = $seq->accession; + } elsif ($seq->can('attributes')) { ($uniquename) = $seq->attributes('load_id'); } else { - $uniquename = $name; - } + $uniquename = $name; + } my $len = $seq->length(); - if ($len == 0) { - $len = undef; - } - - undef(my $gb_type); - if (!$seq->can('molecule') || ! defined ($gb_type = $seq->molecule()) ) { - $gb_type = $seq->can('alphabet') ? $seq->alphabet : 'DNA'; - } - $gb_type = 'DNA' if $ftype eq 'dna'; - $gb_type = 'RNA' if $ftype eq 'rna'; - - if(length $seq_so_type > 0) { - if (defined $seq_so_type) { - $ftype = $seq_so_type; - } - elsif ($seq->type) { - $ftype = ($seq->type =~ /(.*):/) - ? $1 - : $seq->type; - } - else { - $ftype = $gb_type; - } - } - else { - $ftype = $gb_type; - } - - my %ftype_hash = $self->return_ftype_hash($ftype); + if ($len == 0) { + $len = undef; + } + + undef(my $gb_type); + if (!$seq->can('molecule') || ! defined ($gb_type = $seq->molecule()) ) { + $gb_type = $seq->can('alphabet') ? $seq->alphabet : 'DNA'; + } + $gb_type = 'DNA' if $ftype eq 'dna'; + $gb_type = 'RNA' if $ftype eq 'rna'; + + if(length $seq_so_type > 0) { + if (defined $seq_so_type) { + $ftype = $seq_so_type; + } + elsif ($seq->type) { + $ftype = ($seq->type =~ /(.*):/) + ? $1 + : $seq->type; + } + else { + $ftype = $gb_type; + } + } + else { + $ftype = $gb_type; + } + + my %ftype_hash = $self->return_ftype_hash($ftype); if ($species) { %organism = ("genus"=>$genus, "species" => $species); } else { - my $spec = $seq->species(); - if (!defined $spec) { - $self->throw("$seq does not know what organism it is from, which is required by chado. cannot proceed!\n"); - } else { - %organism = ("genus"=>$spec->genus(), "species" => $spec->species()); - } + my $spec = $seq->species(); + if (!defined $spec) { + $self->throw("$seq does not know what organism it is from, which is required by chado. 
cannot proceed!\n"); + } else { + %organism = ("genus"=>$spec->genus(), "species" => $spec->species()); + } } my $residues; @@ -561,22 +565,22 @@ EOUSAGE $residues = ''; } - #set is_analysis flag for gene model features - undef(my $isanal); - if ($ftype eq 'gene' || $ftype eq 'mRNA' || $ftype eq 'exon' || $ftype eq 'protein' || $ftype eq 'polypeptide') { - $isanal = $isanalysis; - $isanal = 'false' if !defined $isanal; - } - - %datahash = ( - "name" => $name, - "uniquename" => $uniquename, - "seqlen" => $len, - "residues" => $residues, - "type_id" => \%ftype_hash, - "organism_id" => \%organism, - "is_analysis" => $isanal || 'false', - ); + #set is_analysis flag for gene model features + undef(my $isanal); + if ($ftype eq 'gene' || $ftype eq 'mRNA' || $ftype eq 'exon' || $ftype eq 'protein' || $ftype eq 'polypeptide') { + $isanal = $isanalysis; + $isanal = 'false' if !defined $isanal; + } + + %datahash = ( + "name" => $name, + "uniquename" => $uniquename, + "seqlen" => $len, + "residues" => $residues, + "type_id" => \%ftype_hash, + "organism_id" => \%organism, + "is_analysis" => $isanal || 'false', + ); if (defined $srcfeature) { %srcfhash = $self->_srcf_hash($srcfeature, @@ -604,130 +608,130 @@ EOUSAGE } - #if $srcfeature is not given, use the Bio::Seq object itself as the srcfeature for featureloc's - if (!defined $srcfeature) { - $srcfeature = $uniquename; - $srcfeattype = $ftype; - } - - #default data source is 'GenBank' - if (!defined $datasource) { - $datasource = 'GenBank'; - } - - if ($datasource =~ /GenBank/i) { - #sequence topology as feature_cvterm - if ($seq->can('is_circular') && $seq->is_circular) { - %sthash = ( - "cvterm_id" => {'name' => 'circular', - 'cv_id' => { - 'name' => 'sequence topology', - }, - }, - "pub_id" => {'uniquename' => 'nullpub', - 'type_id' => { - 'name' => 'null pub', - 'cv_id' => { - 'name'=> 'pub type', - }, - }, - }, - ); - } else { - %sthash = ( - "cvterm_id" => { 'name' => 'linear', - 'cv_id' => { - 'name' => 'sequence topology', - } - }, - "pub_id" => {'uniquename' => 'nullpub', - 'type_id' => { - 'name' => 'null pub', - 'cv_id' => { - 'name'=> 'pub type', - }, - }, - }, - ); - } - push(@feature_cvterms, \%sthash); - - #division as feature_cvterm - if ($seq->can('division') && defined $seq->division()) { - $div = $seq->division(); - %dvhash = ( - "cvterm_id" => {'name' => $div, - 'cv_id' => { - 'name' => 'GenBank division'}}, - "pub_id" => {'uniquename' => 'nullpub', - 'type_id' => { - 'name' => 'null pub', - 'cv_id' => { - 'name'=> 'pub type'}, - }}, - ); - push(@feature_cvterms, \%dvhash); - } - - $datahash{'feature_cvterm'} = \@feature_cvterms; - } # closes if GenBank - - #featureprop's - #DEFINITION - if ($seq->can('desc') && defined $seq->desc()) { - $temp = $seq->desc(); - - my %prophash = ( - "type_id" => {'name' => 'description', - 'cv_id' => { - 'name' => + #if $srcfeature is not given, use the Bio::Seq object itself as the srcfeature for featureloc's + if (!defined $srcfeature) { + $srcfeature = $uniquename; + $srcfeattype = $ftype; + } + + #default data source is 'GenBank' + if (!defined $datasource) { + $datasource = 'GenBank'; + } + + if ($datasource =~ /GenBank/i) { + #sequence topology as feature_cvterm + if ($seq->can('is_circular') && $seq->is_circular) { + %sthash = ( + "cvterm_id" => {'name' => 'circular', + 'cv_id' => { + 'name' => 'sequence topology', + }, + }, + "pub_id" => {'uniquename' => 'nullpub', + 'type_id' => { + 'name' => 'null pub', + 'cv_id' => { + 'name'=> 'pub type', + }, + }, + }, + ); + } else { + %sthash = ( + 
"cvterm_id" => { 'name' => 'linear', + 'cv_id' => { + 'name' => 'sequence topology', + } + }, + "pub_id" => {'uniquename' => 'nullpub', + 'type_id' => { + 'name' => 'null pub', + 'cv_id' => { + 'name'=> 'pub type', + }, + }, + }, + ); + } + push(@feature_cvterms, \%sthash); + + #division as feature_cvterm + if ($seq->can('division') && defined $seq->division()) { + $div = $seq->division(); + %dvhash = ( + "cvterm_id" => {'name' => $div, + 'cv_id' => { + 'name' => 'GenBank division'}}, + "pub_id" => {'uniquename' => 'nullpub', + 'type_id' => { + 'name' => 'null pub', + 'cv_id' => { + 'name'=> 'pub type'}, + }}, + ); + push(@feature_cvterms, \%dvhash); + } + + $datahash{'feature_cvterm'} = \@feature_cvterms; + } # closes if GenBank + + #featureprop's + #DEFINITION + if ($seq->can('desc') && defined $seq->desc()) { + $temp = $seq->desc(); + + my %prophash = ( + "type_id" => {'name' => 'description', + 'cv_id' => { + 'name' => $cv_name{'feature_property'} }, }, - "value" => $temp, - ); + "value" => $temp, + ); - push(@top_featureprops, \%prophash); + push(@top_featureprops, \%prophash); } - #KEYWORDS - if ($seq->can('keywords')) { - $temp = $seq->keywords(); + #KEYWORDS + if ($seq->can('keywords')) { + $temp = $seq->keywords(); - if (defined $temp && $temp ne '.' && $temp ne '') { - my %prophash = ( - "type_id" => {'name' => 'keywords', - 'cv_id' => { + if (defined $temp && $temp ne '.' && $temp ne '') { + my %prophash = ( + "type_id" => {'name' => 'keywords', + 'cv_id' => { 'name' => $cv_name{'feature_property'} } - }, - "value" => $temp, - ); + }, + "value" => $temp, + ); - push(@top_featureprops, \%prophash); - } + push(@top_featureprops, \%prophash); + } } - #COMMENT - if ($seq->can('annotation')) { - $ann = $seq->annotation(); - foreach my $comment ($ann->get_Annotations('comment')) { - $temp = $comment->as_text(); - #print "fcomment: $temp\n"; - my %prophash = ( - "type_id" => {'name' => 'comment', - 'cv_id' => { + #COMMENT + if ($seq->can('annotation')) { + $ann = $seq->annotation(); + foreach my $comment ($ann->get_Annotations('comment')) { + $temp = $comment->as_text(); + #print "fcomment: $temp\n"; + my %prophash = ( + "type_id" => {'name' => 'comment', + 'cv_id' => { 'name' => $cv_name{'feature_property'} } }, - "value" => $temp, - ); + "value" => $temp, + ); - push(@top_featureprops, \%prophash); - } - } + push(@top_featureprops, \%prophash); + } + } my @top_dbxrefs = (); #feature object from Bio::DB::SeqFeature::Store @@ -762,422 +766,422 @@ EOUSAGE @top_dbxrefs = $self->handle_source($seq,@top_dbxrefs); } - #accession and version as feature_dbxref - if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') { - my $db = $self->_guess_acc_db($seq, $seq->accession_number); - my %acchash = ( - "db_id" => {'name' => $db}, - "accession" => $seq->accession_number, - "version" => $seq->seq_version, - ); - my %fdbx = ('dbxref_id' => \%acchash); - push(@top_dbxrefs, \%fdbx); - } - - if( $seq->isa('Bio::Seq::RichSeqI') && defined $seq->get_secondary_accessions() ) { - my @secacc = $seq->get_secondary_accessions(); - my $acc; - foreach $acc (@secacc) { - my %acchash = ( - "db_id" => {'name' => 'GB'}, - "accession" => $acc, - ); - my %fdbx = ('dbxref_id' => \%acchash); - push(@top_dbxrefs, \%fdbx); - } - } - - #GI number - if( $seq->isa('Bio::Seq::RichSeqI') && defined ($seq->pid)) { - my $id = $seq->pid; - #print "reftype: ", ref($id), "\n"; - - #if (ref($id) eq 'HASH') { - my %acchash = ( - "db_id" => {'name' => 'GI'}, - "accession" => $id, 
- ); - my %fdbx = ('dbxref_id' => \%acchash); - push (@top_dbxrefs, \%fdbx); - } - - #REFERENCES as feature_pub - if (defined $ann) { - #get the references - @references = $ann->get_Annotations('reference'); - foreach $ref (@references) { - undef(my %pubhash); - $refhash = $ref->hash_tree(); - $location = $ref->location || $refhash->{'location'}; - #print "location: $location\n"; - - #get FBrf#, special for FlyBase SEAN loading - if (index($location, ' ==') >= 0) { - $location =~ /\s==/; - #print "match: $MATCH\n"; - #print "prematch: $PREMATCH\n"; - #print "postmatch: $POSTMATCH\n"; - $fbrf = $PREMATCH; - $location = $POSTMATCH; - $location =~ s/^\s//; - } - - #print "location: $location\n"; - #unpublished reference - if ($location =~ /Unpublished/) { - $pubtype = 'unpublished'; - %pubhash = ( - "title" => $ref->title || $refhash->{'title'}, - #"miniref" => substr($location, 0, 255), - #"uniquename" => $fbrf, - "type_id" => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}} - ); - } - #submitted - elsif ($location =~ /Submitted/) { - $pubtype = 'submitted'; - - %pubhash = ( - "title" => $ref->title || $refhash->{'title'}, - #"miniref" => substr($location, 0, 255), - #"uniquename" => $fbrf, - "type_id" => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}} - ); - - undef(my $pyear); - $pyear = $self->_getSubmitYear($location); - if (defined $pyear) { - $pubhash{'pyear'} = $pyear; - } - } - - #published journal paper - elsif ($location =~ /\D+\s\d+\s\((\d+|\d+-\d+)\),\s(\d+-\d+|\d+--\d+)\s\(\d\d\d\d\)$/) { - $pubtype = 'paper'; - - #parse location to get journal, volume, issue, pages & year - $location =~ /\(\d\d\d\d\)$/; - - $year = $MATCH; - my $stuff = $PREMATCH; - $year =~ s/\(//; #remove the leading parenthesis - $year =~ s/\)//; #remove the trailing parenthesis - - $stuff =~ /,\s(\d+-\d+|\d+--\d+)\s$/; - - $pages = $MATCH; - $stuff = $PREMATCH; - $pages =~ s/^, //; #remove the leading comma and space - $pages =~ s/ $//; #remove the last space - - $stuff =~ /\s\d+\s\((\d+|\d+-\d+)\)$/; - - $volumeissue = $MATCH; - $journal = $PREMATCH; - $volumeissue =~ s/^ //; #remove the leading space - $volumeissue =~ /\((\d+|\d+-\d+)\)$/; - $issue = $MATCH; - $volume = $PREMATCH; - $issue =~ s/^\(//; #remove the leading parentheses - $issue =~ s/\)$//; #remove the last parentheses - $volume =~ s/^\s//; #remove the leading space - $volume =~ s/\s$//; #remove the last space - - %pubhash = ( - "title" => $ref->title || $refhash->{'title'}, - "volume" => $volume, - "issue" => $issue, - "pyear" => $year, - "pages" => $pages, - #"miniref" => substr($location, 0, 255), - #"miniref" => ' ', - #"uniquename" => $fbrf, - "type_id" => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}, - "pub_relationship" => { - 'obj_pub_id' => { - 'uniquename' => $journal, - 'title' => $journal, - #'miniref' => substr($journal, 0, 255), - 'type_id' =>{'name' => 'journal', - 'cv_id' => - {'name' => 'pub type' - }, - }, - #'pubprop' =>{'value'=> $journal, - # 'type_id'=>{'name' => 'abbreviation', 'cv_id' => {'name' => 'pubprop type'}}, - # }, - }, - 'type_id' => { - 'name' => 'published_in', - 'cv_id' => { - 'name' => 'pub relationship type'}, - }, - }, - ); - } - - #other references - else { - $pubtype = 'other'; - %pubhash = ( - "title" => $ref->title || $refhash->{'title'}, - #"miniref" => $fbrf, - "type_id" => { - 'name' => $pubtype, - 'cv_id' => {'name' =>'pub type'} - } - ); - } - - #pub_author - my $autref = $self->_getRefAuthors($ref); - if (defined $autref) { - $pubhash{'pub_author'} = $autref; - } - # if 
no author and is type 'submitted' and has submitter address, use the first 100 characters of submitter address as the author lastname. - else { - if ($pubtype eq 'submitted') { - my $autref = $self->_getSubmitAddr($ref); - if (defined $autref) { - $pubhash{'pub_author'} = $autref; - } - } - } - - #$ref->comment as pubprop - #print "ref comment: ", $ref->comment, "\n"; - #print "ref comment: ", $refhash->{'comment'}, "\n"; - if (defined $ref->comment || defined $refhash->{'comment'}) { - my $comnt = $ref->comment || $refhash->{'comment'}; - #print "remark: ", $comnt, "\n"; - $pubhash{'pubprop'} = { - "type_id" => {'name' => 'comment', 'cv_id' => {'name' => 'pubprop type'}}, - "value" => $comnt, - }; - } - - #pub_dbxref - undef(my @pub_dbxrefs); - if (defined $fbrf) { - push(@pub_dbxrefs, {dbxref_id => {accession => $fbrf, db_id => {'name' => 'FlyBase'}}}); - } - if (defined ($temp = $ref->medline)) { - push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'MEDLINE'}}}); - #use medline # as the pub's uniquename - $pubhash{'uniquename'} = $temp; - } - if (defined ($temp = $ref->pubmed)) { - push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'PUBMED'}}}); - } - $pubhash{'pub_dbxref'} = \@pub_dbxrefs; - - #if the pub uniquename is not defined or blank, put its FBrf# as its uniquename - #this is unique to FlyBase - #USERS OF THIS MODULE: PLEASE MODIFY HERE TO IMPLEMENT YOUR POLICY - # ON PUB UNIQUENAME!!! - if (!defined $pubhash{'uniquename'} || $pubhash{'uniquename'} eq '') { - if (defined $fbrf) { - $pubhash{'uniquename'} = $fbrf; - } - #else { - # $pubhash{'uniquename'} = $self->_CreatePubUname($ref); - #} - } - - #add to collection of references - #if the pub covers the entire sequence of the top-level feature, add it to feature_pubs - if (($ref->start == 1 && $ref->end == $len) || (!defined $ref->start && !defined $ref->end)) { - push(@feature_pubs, {"pub_id" => \%pubhash}); - } - #the pub is about a sub-sequence of the top-level feature - #create a feature for the sub-sequence and add pub as its feature_pub - #featureloc of this sub-sequence is against the top-level feature, in interbase coordinates. - else { - my %parf = ( - 'uniquename' => $uniquename . ':' . $ref->start . "\.\." . 
$ref->end, - 'organism_id' =>\%organism, - 'type_id' =>{'name' =>'region', 'cv_id' => {'name' => $cv_name{'sequence'} }}, - ); - my %parfsrcf = ( - 'uniquename' => $uniquename, - 'organism_id' =>\%organism, - ); - my %parfloc = ( - 'srcfeature_id' => \%parfsrcf, - 'fmin' => $ref->start - 1, - 'fmax' => $ref->end, - ); - $parf{'featureloc'} = \%parfloc; - $parf{'feature_pub'} = {'pub_id' => \%pubhash}; - my %ffr = ( - 'subject_id' => \%parf, - 'type_id' => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'}}}, - ); - push(@top_featrels, \%ffr); - } - } - $datahash{'feature_pub'} = \@feature_pubs; - } - - ##construct srcfeature hash for use in featureloc - if (defined $srcfeature) { + #accession and version as feature_dbxref + if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') { + my $db = $self->_guess_acc_db($seq, $seq->accession_number); + my %acchash = ( + "db_id" => {'name' => $db}, + "accession" => $seq->accession_number, + "version" => $seq->seq_version, + ); + my %fdbx = ('dbxref_id' => \%acchash); + push(@top_dbxrefs, \%fdbx); + } + + if( $seq->isa('Bio::Seq::RichSeqI') && defined $seq->get_secondary_accessions() ) { + my @secacc = $seq->get_secondary_accessions(); + my $acc; + foreach $acc (@secacc) { + my %acchash = ( + "db_id" => {'name' => 'GB'}, + "accession" => $acc, + ); + my %fdbx = ('dbxref_id' => \%acchash); + push(@top_dbxrefs, \%fdbx); + } + } + + #GI number + if( $seq->isa('Bio::Seq::RichSeqI') && defined ($seq->pid)) { + my $id = $seq->pid; + #print "reftype: ", ref($id), "\n"; + + #if (ref($id) eq 'HASH') { + my %acchash = ( + "db_id" => {'name' => 'GI'}, + "accession" => $id, + ); + my %fdbx = ('dbxref_id' => \%acchash); + push (@top_dbxrefs, \%fdbx); + } + + #REFERENCES as feature_pub + if (defined $ann) { + #get the references + @references = $ann->get_Annotations('reference'); + foreach $ref (@references) { + undef(my %pubhash); + $refhash = $ref->hash_tree(); + $location = $ref->location || $refhash->{'location'}; + #print "location: $location\n"; + + #get FBrf#, special for FlyBase SEAN loading + if (index($location, ' ==') >= 0) { + $location =~ /\s==/; + #print "match: $MATCH\n"; + #print "prematch: $PREMATCH\n"; + #print "postmatch: $POSTMATCH\n"; + $fbrf = $PREMATCH; + $location = $POSTMATCH; + $location =~ s/^\s//; + } + + #print "location: $location\n"; + #unpublished reference + if ($location =~ /Unpublished/) { + $pubtype = 'unpublished'; + %pubhash = ( + "title" => $ref->title || $refhash->{'title'}, + #"miniref" => substr($location, 0, 255), + #"uniquename" => $fbrf, + "type_id" => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}} + ); + } + #submitted + elsif ($location =~ /Submitted/) { + $pubtype = 'submitted'; + + %pubhash = ( + "title" => $ref->title || $refhash->{'title'}, + #"miniref" => substr($location, 0, 255), + #"uniquename" => $fbrf, + "type_id" => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}} + ); + + undef(my $pyear); + $pyear = $self->_getSubmitYear($location); + if (defined $pyear) { + $pubhash{'pyear'} = $pyear; + } + } + + #published journal paper + elsif ($location =~ /\D+\s\d+\s\((\d+|\d+-\d+)\),\s(\d+-\d+|\d+--\d+)\s\(\d\d\d\d\)$/) { + $pubtype = 'paper'; + + #parse location to get journal, volume, issue, pages & year + $location =~ /\(\d\d\d\d\)$/; + + $year = $MATCH; + my $stuff = $PREMATCH; + $year =~ s/\(//; #remove the leading parenthesis + $year =~ s/\)//; #remove the trailing parenthesis + + $stuff =~ /,\s(\d+-\d+|\d+--\d+)\s$/; + + 
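+                # Note: the piecewise $MATCH/$PREMATCH parsing of $location above and
+                # below could also be written as one named-capture regex. A minimal
+                # sketch, assuming a citation of the form
+                # 'J. Mol. Biol. 300 (5), 1005-1016 (2000)' (values illustrative):
+                #
+                #   if ($location =~ /^(?<journal>.+?)\s+(?<volume>\d+)\s+\((?<issue>\d+(?:-\d+)?)\),\s+(?<pages>\d+-{1,2}\d+)\s+\((?<year>\d{4})\)$/) {
+                #       ($journal, $volume, $issue, $pages, $year) =
+                #           @+{qw(journal volume issue pages year)};
+                #   }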
$pages = $MATCH; + $stuff = $PREMATCH; + $pages =~ s/^, //; #remove the leading comma and space + $pages =~ s/ $//; #remove the last space + + $stuff =~ /\s\d+\s\((\d+|\d+-\d+)\)$/; + + $volumeissue = $MATCH; + $journal = $PREMATCH; + $volumeissue =~ s/^ //; #remove the leading space + $volumeissue =~ /\((\d+|\d+-\d+)\)$/; + $issue = $MATCH; + $volume = $PREMATCH; + $issue =~ s/^\(//; #remove the leading parentheses + $issue =~ s/\)$//; #remove the last parentheses + $volume =~ s/^\s//; #remove the leading space + $volume =~ s/\s$//; #remove the last space + + %pubhash = ( + "title" => $ref->title || $refhash->{'title'}, + "volume" => $volume, + "issue" => $issue, + "pyear" => $year, + "pages" => $pages, + #"miniref" => substr($location, 0, 255), + #"miniref" => ' ', + #"uniquename" => $fbrf, + "type_id" => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}, + "pub_relationship" => { + 'obj_pub_id' => { + 'uniquename' => $journal, + 'title' => $journal, + #'miniref' => substr($journal, 0, 255), + 'type_id' =>{'name' => 'journal', + 'cv_id' => + {'name' => 'pub type' + }, + }, + #'pubprop' =>{'value'=> $journal, + # 'type_id'=>{'name' => 'abbreviation', 'cv_id' => {'name' => 'pubprop type'}}, + # }, + }, + 'type_id' => { + 'name' => 'published_in', + 'cv_id' => { + 'name' => 'pub relationship type'}, + }, + }, + ); + } + + #other references + else { + $pubtype = 'other'; + %pubhash = ( + "title" => $ref->title || $refhash->{'title'}, + #"miniref" => $fbrf, + "type_id" => { + 'name' => $pubtype, + 'cv_id' => {'name' =>'pub type'} + } + ); + } + + #pub_author + my $autref = $self->_getRefAuthors($ref); + if (defined $autref) { + $pubhash{'pub_author'} = $autref; + } + # if no author and is type 'submitted' and has submitter address, use the first 100 characters of submitter address as the author lastname. + else { + if ($pubtype eq 'submitted') { + my $autref = $self->_getSubmitAddr($ref); + if (defined $autref) { + $pubhash{'pub_author'} = $autref; + } + } + } + + #$ref->comment as pubprop + #print "ref comment: ", $ref->comment, "\n"; + #print "ref comment: ", $refhash->{'comment'}, "\n"; + if (defined $ref->comment || defined $refhash->{'comment'}) { + my $comnt = $ref->comment || $refhash->{'comment'}; + #print "remark: ", $comnt, "\n"; + $pubhash{'pubprop'} = { + "type_id" => {'name' => 'comment', 'cv_id' => {'name' => 'pubprop type'}}, + "value" => $comnt, + }; + } + + #pub_dbxref + undef(my @pub_dbxrefs); + if (defined $fbrf) { + push(@pub_dbxrefs, {dbxref_id => {accession => $fbrf, db_id => {'name' => 'FlyBase'}}}); + } + if (defined ($temp = $ref->medline)) { + push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'MEDLINE'}}}); + #use medline # as the pub's uniquename + $pubhash{'uniquename'} = $temp; + } + if (defined ($temp = $ref->pubmed)) { + push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'PUBMED'}}}); + } + $pubhash{'pub_dbxref'} = \@pub_dbxrefs; + + #if the pub uniquename is not defined or blank, put its FBrf# as its uniquename + #this is unique to FlyBase + #USERS OF THIS MODULE: PLEASE MODIFY HERE TO IMPLEMENT YOUR POLICY + # ON PUB UNIQUENAME!!! 
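+        # A minimal sketch of one such site-specific policy (commented out; the
+        # PMID- and title-based fallbacks are illustrative and are NOT the
+        # FlyBase FBrf rule implemented just below):
+        #
+        #   if (!defined $pubhash{'uniquename'} || $pubhash{'uniquename'} eq '') {
+        #       $pubhash{'uniquename'} = defined $ref->pubmed
+        #           ? 'PMID:' . $ref->pubmed
+        #           : substr($ref->title || 'anonymous', 0, 255);
+        #   }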
+ if (!defined $pubhash{'uniquename'} || $pubhash{'uniquename'} eq '') { + if (defined $fbrf) { + $pubhash{'uniquename'} = $fbrf; + } + #else { + # $pubhash{'uniquename'} = $self->_CreatePubUname($ref); + #} + } + + #add to collection of references + #if the pub covers the entire sequence of the top-level feature, add it to feature_pubs + if (($ref->start == 1 && $ref->end == $len) || (!defined $ref->start && !defined $ref->end)) { + push(@feature_pubs, {"pub_id" => \%pubhash}); + } + #the pub is about a sub-sequence of the top-level feature + #create a feature for the sub-sequence and add pub as its feature_pub + #featureloc of this sub-sequence is against the top-level feature, in interbase coordinates. + else { + my %parf = ( + 'uniquename' => $uniquename . ':' . $ref->start . "\.\." . $ref->end, + 'organism_id' =>\%organism, + 'type_id' =>{'name' =>'region', 'cv_id' => {'name' => $cv_name{'sequence'} }}, + ); + my %parfsrcf = ( + 'uniquename' => $uniquename, + 'organism_id' =>\%organism, + ); + my %parfloc = ( + 'srcfeature_id' => \%parfsrcf, + 'fmin' => $ref->start - 1, + 'fmax' => $ref->end, + ); + $parf{'featureloc'} = \%parfloc; + $parf{'feature_pub'} = {'pub_id' => \%pubhash}; + my %ffr = ( + 'subject_id' => \%parf, + 'type_id' => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'}}}, + ); + push(@top_featrels, \%ffr); + } + } + $datahash{'feature_pub'} = \@feature_pubs; + } + + ##construct srcfeature hash for use in featureloc + if (defined $srcfeature) { %srcfhash = $self->_srcf_hash($srcfeature, $srcfeattype, \%organism); - # my %fr = ( - # "object_id" => \%srcfhash, - # "type_id" => { 'name' => 'partof', 'cv_id' => { 'name' => 'relationship type'}}, - # ); - - # push (@top_featrels, \%fr); - } - - #unflatten the seq features in $seq if $seq is a gene or a DNA sequence - if (($gb_type eq 'gene' || $gb_type eq 'DNA') && - !$nounflatten) { - my $u = Bio::SeqFeature::Tools::Unflattener->new; - $u->unflatten_seq(-seq=>$seq, -use_magic=>1); - } - - my @top_sfs = $seq->get_SeqFeatures; - #print $#top_sfs, "\n"; - - #SUBFEATURES - - if ($datasource =~ /GenBank/i) { - $tag_cv = 'GenBank feature qualifier'; - } elsif ($datasource =~ /GFF/i) { - $tag_cv = 'feature_property'; - } else { - $tag_cv = $cv_name{'feature_property'}; - } - - my $si = 0; - foreach $feat (@top_sfs) { - #$feat = $top_sfs[$si]; - #print "si: $si\n"; - my $prim_tag = $feat->primary_tag; - #print $prim_tag, "\n"; - - # get all qualifiers of the 'source' feature, load these as top_featureprops of the top level feature - if ($prim_tag eq 'source') { - foreach $tag ($feat->all_tags()) { - #db_xref - if ($tag eq 'db_xref' + # my %fr = ( + # "object_id" => \%srcfhash, + # "type_id" => { 'name' => 'partof', 'cv_id' => { 'name' => 'relationship type'}}, + # ); + + # push (@top_featrels, \%fr); + } + + #unflatten the seq features in $seq if $seq is a gene or a DNA sequence + if (($gb_type eq 'gene' || $gb_type eq 'DNA') && + !$nounflatten) { + my $u = Bio::SeqFeature::Tools::Unflattener->new; + $u->unflatten_seq(-seq=>$seq, -use_magic=>1); + } + + my @top_sfs = $seq->get_SeqFeatures; + #print $#top_sfs, "\n"; + + #SUBFEATURES + + if ($datasource =~ /GenBank/i) { + $tag_cv = 'GenBank feature qualifier'; + } elsif ($datasource =~ /GFF/i) { + $tag_cv = 'feature_property'; + } else { + $tag_cv = $cv_name{'feature_property'}; + } + + my $si = 0; + foreach $feat (@top_sfs) { + #$feat = $top_sfs[$si]; + #print "si: $si\n"; + my $prim_tag = $feat->primary_tag; + #print $prim_tag, "\n"; + + # get all qualifiers of 
the 'source' feature, load these as top_featureprops of the top level feature + if ($prim_tag eq 'source') { + foreach $tag ($feat->all_tags()) { + #db_xref + if ($tag eq 'db_xref' or $tag eq 'Dbxref' or $tag eq 'dbxref') { - my @t1 = $feat->each_tag_value($tag); - foreach $temp (@t1) { - $temp =~ /([^:]*?):(.*)/; + my @t1 = $feat->each_tag_value($tag); + foreach $temp (@t1) { + $temp =~ /([^:]*?):(.*)/; my $db = $1; my $xref = $2; #PRE/POST very inefficent - #my $db = $PREMATCH; - #my $xref = $POSTMATCH; - my %acchash = ( - "db_id" => {'name' => $db}, - "accession" => $xref, - ); - my %fdbx = ('dbxref_id' => \%acchash); - push (@top_dbxrefs, \%fdbx); - } + #my $db = $PREMATCH; + #my $xref = $POSTMATCH; + my %acchash = ( + "db_id" => {'name' => $db}, + "accession" => $xref, + ); + my %fdbx = ('dbxref_id' => \%acchash); + push (@top_dbxrefs, \%fdbx); + } #Ontology_term } elsif ($tag eq 'Ontology_term') { my @t1 = $feat->each_tag_value($tag); foreach $temp (@t1) { ###FIXME } - #other tags as featureprop - } elsif ($tag ne 'gene') { - my %prophash = undef; - %prophash = ( - "type_id" => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}}, - "value" => join(' ',$feat->each_tag_value($tag)), - ); - push(@top_featureprops, \%prophash); - } - } + #other tags as featureprop + } elsif ($tag ne 'gene') { + my %prophash = undef; + %prophash = ( + "type_id" => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}}, + "value" => join(' ',$feat->each_tag_value($tag)), + ); + push(@top_featureprops, \%prophash); + } + } if ($feat->can('source')) { my $source = $feat->source(); @top_dbxrefs = $self->handle_source($feat, @top_dbxrefs); } - #featureloc for the top-level feature - my $fmin = undef; - my $fmax = undef; - my $strand = undef; + #featureloc for the top-level feature + my $fmin = undef; + my $fmax = undef; + my $strand = undef; my $phase = undef; - my %fl = undef; + my %fl = undef; - $fmin = $feat->start - 1; - $fmax = $feat->end; - $strand = $feat->strand; + $fmin = $feat->start - 1; + $fmax = $feat->end; + $strand = $feat->strand; if ($feat->can('phase')) { $phase = $feat->phase; } - %fl = ( - "srcfeature_id" => \%srcfhash, - "fmin" => $fmin, - "fmax" => $fmax, - "strand" => $strand, + %fl = ( + "srcfeature_id" => \%srcfhash, + "fmin" => $fmin, + "fmax" => $fmax, + "strand" => $strand, "phase" => $phase, - ); + ); - $datahash{'featureloc'} = \%fl; + $datahash{'featureloc'} = \%fl; - #delete 'source' feature from @top_sfs - splice(@top_sfs, $si, 1); - } - $si ++; - #close loop over top_sfs - } + #delete 'source' feature from @top_sfs + splice(@top_sfs, $si, 1); + } + $si ++; + #close loop over top_sfs + } - #the top-level features other than 'source' - foreach $feat (@top_sfs) { - #print $feat->primary_tag, "\n"; + #the top-level features other than 'source' + foreach $feat (@top_sfs) { + #print $feat->primary_tag, "\n"; - my $r = $self->_subfeat2featrelhash($name, $ftype, $feat, \%srcfhash, $tag_cv, $isanalysis); + my $r = $self->_subfeat2featrelhash($name, $ftype, $feat, \%srcfhash, $tag_cv, $isanalysis); - if (!($ftype eq 'mRNA' && $feat->primary_tag eq 'gene')) { - my %fr = %$r; - push(@top_featrels, \%fr); - } else { - %finaldatahash = %$r; - } - } + if (!($ftype eq 'mRNA' && $feat->primary_tag eq 'gene')) { + my %fr = %$r; + push(@top_featrels, \%fr); + } else { + %finaldatahash = %$r; + } + } - if (@top_dbxrefs) { - $datahash{'feature_dbxref'} = \@top_dbxrefs; - } + if (@top_dbxrefs) { + $datahash{'feature_dbxref'} = \@top_dbxrefs; + } - if (@top_featureprops) { - $datahash{'featureprop'} = 
\@top_featureprops; - } + if (@top_featureprops) { + $datahash{'featureprop'} = \@top_featureprops; + } - if (@top_featrels) { - $datahash{'feature_relationship'} = \@top_featrels; - } + if (@top_featrels) { + $datahash{'feature_relationship'} = \@top_featrels; + } if (@top_featurecvterms) { $datahash{'feature_cvterm'} = \@top_featurecvterms; } - if ($ftype eq 'mRNA' && %finaldatahash) { - $finaldatahash{'feature_relationship'} = { - 'subject_id' => \%datahash, - 'type_id' => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'} }}, - }; - } else { - %finaldatahash = %datahash; - } + if ($ftype eq 'mRNA' && %finaldatahash) { + $finaldatahash{'feature_relationship'} = { + 'subject_id' => \%datahash, + 'type_id' => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'} }}, + }; + } else { + %finaldatahash = %datahash; + } - my $mainTag = 'feature'; - $self->_hash2xml(undef, $mainTag, \%finaldatahash); + my $mainTag = 'feature'; + $self->_hash2xml(undef, $mainTag, \%finaldatahash); - return 1; + return 1; } sub _hash2xml { @@ -1198,7 +1202,7 @@ sub _hash2xml { my $root = shift if (@_); #print "ult: $ult\n"; if (!defined $self->{'writer'}) { - $root = 1; + $root = 1; $self->_create_writer(); } my $temp; @@ -1209,278 +1213,278 @@ sub _hash2xml { #requires that the journal name itself is also stored as a pubprop record for the journal with value equal #to the journal name and type of 'abbreviation'. if ($ult eq 'pub' && $mh{'type_id'}->{'name'} eq 'journal') { - $self->{'writer'}->startTag($ult, 'ref' => $mh{'title'} . ':journal:abbreviation'); + $self->{'writer'}->startTag($ult, 'ref' => $mh{'title'} . ':journal:abbreviation'); } #special pub match if pub uniquename not known elsif ($ult eq 'pub' && !defined $mh{'uniquename'}) { - $self->{'writer'}->startTag($ult, 'op' => 'match'); - #set the match flag, all the sub tags should also have "op"="match" - $isMatch = 1; + $self->{'writer'}->startTag($ult, 'op' => 'match'); + #set the match flag, all the sub tags should also have "op"="match" + $isMatch = 1; } #if cvterm or cv, lookup only elsif (($ult eq 'cvterm') || ($ult eq 'cv')) { - $self->{'writer'}->startTag($ult, 'op' => 'lookup'); + $self->{'writer'}->startTag($ult, 'op' => 'lookup'); } #if nested tables of match table, match too elsif ($isMatch) { - $self->{'writer'}->startTag($ult, 'op' => 'match'); + $self->{'writer'}->startTag($ult, 'op' => 'match'); } else { - $self->{'writer'}->startTag($ult); + $self->{'writer'}->startTag($ult); } #first loop to produce xml for all the table columns foreach $key (keys %mh) { - #print "key: $key\n"; - $xx = ' ' . $key; - $yy = $key . ' '; - if (index($chadotables, $xx) < 0 && index($chadotables, $yy) < 0) - { - if ($isMatch) { - $self->{'writer'}->startTag($key, 'op' => 'match'); - } else { - $self->{'writer'}->startTag($key); - } - - my $x = $ult . '.' . $key; - #the column is a foreign key - if (defined $fkey{$x}) - { - $nt = $fkey{$x}; - $sh = $mh{$key}; - $self->_hash2xml($isMatch, $nt, $sh, 0); - } else - { - #print "$key: $mh{$key}\n"; - $self->{'writer'}->characters($mh{$key}); - } - $self->{'writer'}->endTag($key); - } + #print "key: $key\n"; + $xx = ' ' . $key; + $yy = $key . ' '; + if (index($chadotables, $xx) < 0 && index($chadotables, $yy) < 0) + { + if ($isMatch) { + $self->{'writer'}->startTag($key, 'op' => 'match'); + } else { + $self->{'writer'}->startTag($key); + } + + my $x = $ult . '.' . 
$key; + #the column is a foreign key + if (defined $fkey{$x}) + { + $nt = $fkey{$x}; + $sh = $mh{$key}; + $self->_hash2xml($isMatch, $nt, $sh, 0); + } else + { + #print "$key: $mh{$key}\n"; + $self->{'writer'}->characters($mh{$key}); + } + $self->{'writer'}->endTag($key); + } } #second loop to produce xml for all the nested tables foreach $key (keys %mh) { - #print "key: $key\n"; - $xx = ' ' . $key; - $yy = $key . ' '; - #a nested table - if (index($chadotables, $xx) > 0 || index($chadotables, $yy) > 0) - { - #$writer->startTag($key); - $ntref = $mh{$key}; - #print "$key: ", ref($ntref), "\n"; - if (ref($ntref) =~ 'HASH') { - $self->_hash2xml($isMatch, $key, $ntref, 0); - } elsif (ref($ntref) =~ 'ARRAY') { - #print "array dim: ", $#$ntref, "\n"; - foreach $ref (@$ntref) { - #print "\n"; - $self->_hash2xml($isMatch, $key, $ref, 0); - } - } - #$writer->endTag($key); - } + #print "key: $key\n"; + $xx = ' ' . $key; + $yy = $key . ' '; + #a nested table + if (index($chadotables, $xx) > 0 || index($chadotables, $yy) > 0) + { + #$writer->startTag($key); + $ntref = $mh{$key}; + #print "$key: ", ref($ntref), "\n"; + if (ref($ntref) =~ 'HASH') { + $self->_hash2xml($isMatch, $key, $ntref, 0); + } elsif (ref($ntref) =~ 'ARRAY') { + #print "array dim: ", $#$ntref, "\n"; + foreach $ref (@$ntref) { + #print "\n"; + $self->_hash2xml($isMatch, $key, $ref, 0); + } + } + #$writer->endTag($key); + } } #end tag $self->{'writer'}->endTag($ult); #if ($root == 1) { -# $self->{'writer'}->endTag('chado'); +# $self->{'writer'}->endTag('chado'); # } } sub _guess_acc_db { - my $self = shift; - my $seq = shift; - my $acc = shift; - #print "acc: $acc\n"; - - if ($acc =~ /^NM_\d{6}/ || $acc =~ /^NP_\d{6}/ || $acc =~ /^NT_\d{6}/ || $acc =~ /^NC_\d{6}/) { - return "RefSeq"; - } elsif ($acc =~ /^XM_\d{6}/ || $acc =~ /^XP_\d{6}/ || $acc =~ /^XR_\d{6}/) { - return "RefSeq"; - } elsif ($acc =~ /^[a-zA-Z]{1,2}\d{5,6}/) { - return "GB"; - } elsif ($seq->molecule() eq 'protein' && $acc =~ /^[a-zA-z]\d{5}/) { - return "PIR"; - } elsif ($seq->molecule() eq 'protein' && $acc =~ /^\d{6,7}[a-zA-Z]/) { - return "PRF"; - } elsif ($acc =~ /\d+/ && $acc !~ /[a-zA-Z]/) { - return "LocusID"; - } elsif ($acc =~ /^CG\d+/ || $acc =~ /^FB[a-z][a-z]\d+/) { - return "FlyBase"; - } else { - return "unknown"; - } + my $self = shift; + my $seq = shift; + my $acc = shift; + #print "acc: $acc\n"; + + if ($acc =~ /^NM_\d{6}/ || $acc =~ /^NP_\d{6}/ || $acc =~ /^NT_\d{6}/ || $acc =~ /^NC_\d{6}/) { + return "RefSeq"; + } elsif ($acc =~ /^XM_\d{6}/ || $acc =~ /^XP_\d{6}/ || $acc =~ /^XR_\d{6}/) { + return "RefSeq"; + } elsif ($acc =~ /^[a-zA-Z]{1,2}\d{5,6}/) { + return "GB"; + } elsif ($seq->molecule() eq 'protein' && $acc =~ /^[a-zA-z]\d{5}/) { + return "PIR"; + } elsif ($seq->molecule() eq 'protein' && $acc =~ /^\d{6,7}[a-zA-Z]/) { + return "PRF"; + } elsif ($acc =~ /\d+/ && $acc !~ /[a-zA-Z]/) { + return "LocusID"; + } elsif ($acc =~ /^CG\d+/ || $acc =~ /^FB[a-z][a-z]\d+/) { + return "FlyBase"; + } else { + return "unknown"; + } } sub _subfeat2featrelhash { - my $self = shift; - my $genename = shift; - my $seqtype = shift; - my $feat = shift; - my $r = shift; - my %srcf = %$r; #srcfeature hash for featureloc.srcfeature_id - my $tag_cv = shift; - my $isanalysis = shift; - - my $prim_tag = $feat->primary_tag; - - my $sfunique = undef; #subfeature uniquename - my $sfname = undef; #subfeature name - my $sftype = undef; #subfeature type - - if ($feat->has_tag('symbol')) { - ($sfunique) = $feat->each_tag_value("symbol"); - } elsif ($feat->has_tag('label')) { 
- ($sfunique) = $feat->each_tag_value("label"); - } else { - #$self->throw("$prim_tag at " . $feat->start . "\.\." . $feat->end . " does not have symbol or label! To convert into chadoxml, a seq feature must have a /symbol or /label tag holding its unique name."); - #generate feature unique name as -- - $sfunique = $self->_genFeatUniqueName($genename, $feat); - } - - if ($feat->has_tag('Name')) { - ($sfname) = $feat->each_tag_value("Name"); - } - - #feature type translation - if (defined $feattype_args2so{$prim_tag}) { - $sftype = $feattype_args2so{$prim_tag}; - } else { - $sftype = $prim_tag; - } - - if ($prim_tag eq 'mutation') { - if ($feat->start == $feat->end) { - $sftype = $feattype_args2so{'mutation1'}; - } else { - $sftype = $feattype_args2so{'mutation2'}; - } - } - - #set is_analysis flag for gene model features - undef(my $isanal); - if ($sftype eq 'gene' || $sftype eq 'mRNA' || $sftype eq 'exon' || $sftype eq 'protein' || $sftype eq 'polypeptide') { - $isanal = $isanalysis; - } - - my %sfhash = ( - "name" => $sfname, - "uniquename" => $sfunique, - "organism_id" => \%organism, - "type_id" => { 'name' => $sftype, 'cv_id' => { 'name' => $cv_name{'sequence'} }}, - "is_analysis" => $isanal || 'false', - ); - - #make a copy of %sfhash for passing to this method when recursively called - #my %srcfeat = ( + my $self = shift; + my $genename = shift; + my $seqtype = shift; + my $feat = shift; + my $r = shift; + my %srcf = %$r; #srcfeature hash for featureloc.srcfeature_id + my $tag_cv = shift; + my $isanalysis = shift; + + my $prim_tag = $feat->primary_tag; + + my $sfunique = undef; #subfeature uniquename + my $sfname = undef; #subfeature name + my $sftype = undef; #subfeature type + + if ($feat->has_tag('symbol')) { + ($sfunique) = $feat->each_tag_value("symbol"); + } elsif ($feat->has_tag('label')) { + ($sfunique) = $feat->each_tag_value("label"); + } else { + #$self->throw("$prim_tag at " . $feat->start . "\.\." . $feat->end . " does not have symbol or label! 
To convert into chadoxml, a seq feature must have a /symbol or /label tag holding its unique name."); + #generate feature unique name as -- + $sfunique = $self->_genFeatUniqueName($genename, $feat); + } + + if ($feat->has_tag('Name')) { + ($sfname) = $feat->each_tag_value("Name"); + } + + #feature type translation + if (defined $feattype_args2so{$prim_tag}) { + $sftype = $feattype_args2so{$prim_tag}; + } else { + $sftype = $prim_tag; + } + + if ($prim_tag eq 'mutation') { + if ($feat->start == $feat->end) { + $sftype = $feattype_args2so{'mutation1'}; + } else { + $sftype = $feattype_args2so{'mutation2'}; + } + } + + #set is_analysis flag for gene model features + undef(my $isanal); + if ($sftype eq 'gene' || $sftype eq 'mRNA' || $sftype eq 'exon' || $sftype eq 'protein' || $sftype eq 'polypeptide') { + $isanal = $isanalysis; + } + + my %sfhash = ( + "name" => $sfname, + "uniquename" => $sfunique, + "organism_id" => \%organism, + "type_id" => { 'name' => $sftype, 'cv_id' => { 'name' => $cv_name{'sequence'} }}, + "is_analysis" => $isanal || 'false', + ); + + #make a copy of %sfhash for passing to this method when recursively called + #my %srcfeat = ( # "name" => $sfname, # "uniquename" => $sfunique, # "organism_id" => \%organism, # "type_id" => { 'name' => $sftype, 'cv_id' => { 'name' => 'SO'}}, # ); - #featureloc for subfeatures - undef(my $sfmin); - undef(my $sfmax); - undef(my $is_sfmin_partial); - undef(my $is_sfmax_partial); - undef(my $sfstrand); + #featureloc for subfeatures + undef(my $sfmin); + undef(my $sfmax); + undef(my $is_sfmin_partial); + undef(my $is_sfmax_partial); + undef(my $sfstrand); undef(my $sfphase); - $sfmin = $feat->start - 1; - $sfmax = $feat->end; - $sfstrand = $feat->strand(); + $sfmin = $feat->start - 1; + $sfmax = $feat->end; + $sfstrand = $feat->strand(); if ($feat->can('phase')) { $sfphase = $feat->phase; } - #if the gene feature in an mRNA record, cannot use its coordinates, omit featureloc - if ($seqtype eq 'mRNA' && $sftype eq 'gene') { - } else { - if ($feat->location->isa('Bio::Location::FuzzyLocationI')) { - if ($feat->location->start_pos_type() ne 'EXACT') { - $is_sfmin_partial = 'true'; - } - if ($feat->location->end_pos_type() ne 'EXACT') { - $is_sfmax_partial = 'true'; - } - } - - my %sfl = ( - "srcfeature_id" => \%srcf, - "fmin" => $sfmin, - "is_fmin_partial" => $is_sfmin_partial || 'false', - "fmax" => $sfmax, - "is_fmax_partial" => $is_sfmax_partial || 'false', - "strand" => $sfstrand, + #if the gene feature in an mRNA record, cannot use its coordinates, omit featureloc + if ($seqtype eq 'mRNA' && $sftype eq 'gene') { + } else { + if ($feat->location->isa('Bio::Location::FuzzyLocationI')) { + if ($feat->location->start_pos_type() ne 'EXACT') { + $is_sfmin_partial = 'true'; + } + if ($feat->location->end_pos_type() ne 'EXACT') { + $is_sfmax_partial = 'true'; + } + } + + my %sfl = ( + "srcfeature_id" => \%srcf, + "fmin" => $sfmin, + "is_fmin_partial" => $is_sfmin_partial || 'false', + "fmax" => $sfmax, + "is_fmax_partial" => $is_sfmax_partial || 'false', + "strand" => $sfstrand, "phase" => $sfphase, - ); + ); - $sfhash{'featureloc'} = \%sfl; - } + $sfhash{'featureloc'} = \%sfl; + } - #subfeature tags - undef(my @sfdbxrefs); #subfeature dbxrefs - undef(my @sub_featureprops); #subfeature props + #subfeature tags + undef(my @sfdbxrefs); #subfeature dbxrefs + undef(my @sub_featureprops); #subfeature props undef(my @sub_featuresyns); #subfeature synonyms undef(my @sub_featurecvterms); #subfeature cvterms - foreach my $tag ($feat->all_tags()) { - 
#feature_dbxref for features - if ($tag eq 'db_xref' or $tag eq 'dbxref' or $tag eq 'Dbxref') { - my @t1 = $feat->each_tag_value($tag); - #print "# of dbxref: @t1\n"; - for my $temp (@t1) { - $temp =~ /:/; - my $db = $PREMATCH; - my $xref = $POSTMATCH; - #print "db: $db; xref: $xref\n"; - my %acchash = ( - "db_id" => {'name' => $db}, - "accession" => $xref, - ); - my %sfdbx = ('dbxref_id' => \%acchash); - push (@sfdbxrefs, \%sfdbx); - } + foreach my $tag ($feat->all_tags()) { + #feature_dbxref for features + if ($tag eq 'db_xref' or $tag eq 'dbxref' or $tag eq 'Dbxref') { + my @t1 = $feat->each_tag_value($tag); + #print "# of dbxref: @t1\n"; + for my $temp (@t1) { + $temp =~ /:/; + my $db = $PREMATCH; + my $xref = $POSTMATCH; + #print "db: $db; xref: $xref\n"; + my %acchash = ( + "db_id" => {'name' => $db}, + "accession" => $xref, + ); + my %sfdbx = ('dbxref_id' => \%acchash); + push (@sfdbxrefs, \%sfdbx); + } #Alias tags } elsif ($tag eq 'Alias') { @sub_featuresyns = $self->handle_Alias_tag($feat, @sub_featuresyns); } elsif ($tag eq 'Ontology_term') { @sub_featurecvterms = $self->handle_Ontology_tag($feat, @sub_featurecvterms); - #featureprop for features, excluding GFF Name & Parent tags - } elsif ($tag ne 'gene' && $tag ne 'symbol' && $tag ne 'Name' && $tag ne 'Parent') { + #featureprop for features, excluding GFF Name & Parent tags + } elsif ($tag ne 'gene' && $tag ne 'symbol' && $tag ne 'Name' && $tag ne 'Parent') { next if ($tag eq 'parent_id'); next if ($tag eq 'load_id'); - foreach my $val ($feat->each_tag_value($tag)) { - my %prophash = undef; - %prophash = ( - "type_id" => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}}, - "value" => $val, - ); - push(@sub_featureprops, \%prophash); - } - } - } + foreach my $val ($feat->each_tag_value($tag)) { + my %prophash = undef; + %prophash = ( + "type_id" => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}}, + "value" => $val, + ); + push(@sub_featureprops, \%prophash); + } + } + } if ($feat->can('source')) { @sfdbxrefs = $self->handle_source($feat,@sfdbxrefs); } - if (@sub_featureprops) { - $sfhash{'featureprop'} = \@sub_featureprops; - } - if (@sfdbxrefs) { - $sfhash{'feature_dbxref'} = \@sfdbxrefs; - } + if (@sub_featureprops) { + $sfhash{'featureprop'} = \@sub_featureprops; + } + if (@sfdbxrefs) { + $sfhash{'feature_dbxref'} = \@sfdbxrefs; + } if (@sub_featuresyns) { $sfhash{'feature_synonym'} = \@sub_featuresyns; } @@ -1488,62 +1492,62 @@ sub _subfeat2featrelhash { $sfhash{'feature_cvterm'} = \@sub_featurecvterms; } - undef(my @ssfeatrel); - if ($feat->has_tag('locus_tag')) { - ($genename)= $feat->each_tag_value('locus_tag'); - } elsif ($feat->has_tag('gene')) { - ($genename)= $feat->each_tag_value('gene'); - } - - foreach my $sf ($feat->get_SeqFeatures()) { - #print $sf->primary_tag, "\n"; - my $rref = $self->_subfeat2featrelhash($genename, $sftype, $sf, \%srcf, $tag_cv, $isanalysis); - if (defined $rref) { - push(@ssfeatrel, $rref); - } - } - - if (@ssfeatrel) { - $sfhash{'feature_relationship'} = \@ssfeatrel; - } - - #subj-obj relationship type - undef(my $reltypename); + undef(my @ssfeatrel); + if ($feat->has_tag('locus_tag')) { + ($genename)= $feat->each_tag_value('locus_tag'); + } elsif ($feat->has_tag('gene')) { + ($genename)= $feat->each_tag_value('gene'); + } + + foreach my $sf ($feat->get_SeqFeatures()) { + #print $sf->primary_tag, "\n"; + my $rref = $self->_subfeat2featrelhash($genename, $sftype, $sf, \%srcf, $tag_cv, $isanalysis); + if (defined $rref) { + push(@ssfeatrel, $rref); + } + } + + if (@ssfeatrel) { + 
$sfhash{'feature_relationship'} = \@ssfeatrel; + } + + #subj-obj relationship type + undef(my $reltypename); $reltypename = return_reltypename($sftype); - my %fr = ( - "subject_id" => \%sfhash, - "type_id" => { 'name' => $reltypename, + my %fr = ( + "subject_id" => \%sfhash, + "type_id" => { 'name' => $reltypename, 'cv_id' => { 'name' => $cv_name{'relationship'} }}, - ); + ); - if ($seqtype eq 'mRNA' && $sftype eq 'gene') { - return \%sfhash; - } else { - return \%fr; - } + if ($seqtype eq 'mRNA' && $sftype eq 'gene') { + return \%sfhash; + } else { + return \%fr; + } } #generate uniquename for feature as: -- (foo-mRNA-10..1000) sub _genFeatUniqueName { - my $self = shift; - my $genename = shift; - my $feat = shift; - undef(my $uniquename); - my $ftype = $feat->primary_tag; - my $start = $feat->start; - my $end = $feat->end; - - if ($feat->has_tag('locus_tag')) { - ($genename) = $feat->each_tag_value("locus_tag"); - } elsif ($feat->has_tag('gene')) { - ($genename) = $feat->each_tag_value("gene"); - } - - $uniquename = $genename . '-' . $ftype . '-' . $start . "\.\." . $end; - - return $uniquename; + my $self = shift; + my $genename = shift; + my $feat = shift; + undef(my $uniquename); + my $ftype = $feat->primary_tag; + my $start = $feat->start; + my $end = $feat->end; + + if ($feat->has_tag('locus_tag')) { + ($genename) = $feat->each_tag_value("locus_tag"); + } elsif ($feat->has_tag('gene')) { + ($genename) = $feat->each_tag_value("gene"); + } + + $uniquename = $genename . '-' . $ftype . '-' . $start . "\.\." . $end; + + return $uniquename; } #create uniquename for pubs with no medline id and no FBrf# @@ -1552,60 +1556,60 @@ sub _genFeatUniqueName { # or and if two, # or et al. if more #sub _CreatePubUname { -# my $self = shift; -# my $pub = shift; -# undef(my $pubuname); +# my $self = shift; +# my $pub = shift; +# undef(my $pubuname); # -# return $pubuname; +# return $pubuname; #} #get authors of a reference #returns ref to the array of author hashes sub _getRefAuthors { - my $self = shift; - my $ref = shift; - - my $temp = $ref->authors; - undef(my @authors); - undef(my @aut); - - #there are authors - if ($temp ne '.') { - if (index($temp, ' and ') > 0) { - $temp =~ / and /; - my $lastauthor = $POSTMATCH; - @authors = split(/\, /, $PREMATCH); - push (@authors, $lastauthor); - } else { - @authors = split(/\, /, $temp); - } - - my $a; - my $i = 0; - foreach $a (@authors) { - $i ++; - #parse the author lastname and givennames - undef(my $last); - undef(my $given); - if (index($a, ',') > 0) { #genbank format, last,f.m. - ($last, $given) = split(/\,/, $a); - } elsif (index($a, ' ') > 0) { #embl format, last f.m. - ($last, $given) = split(/ /, $a); - } - my %au = ( - 'surname' => $last, - 'givennames' => $given, - ); - push(@aut, {author_id => \%au, arank => $i}); - } - - return \@aut; - } - - #no authors, Bio::SeqIO::genbank doesn't pick up 'CONSRTM' line. - else { - return; - } + my $self = shift; + my $ref = shift; + + my $temp = $ref->authors; + undef(my @authors); + undef(my @aut); + + #there are authors + if ($temp ne '.') { + if (index($temp, ' and ') > 0) { + $temp =~ / and /; + my $lastauthor = $POSTMATCH; + @authors = split(/\, /, $PREMATCH); + push (@authors, $lastauthor); + } else { + @authors = split(/\, /, $temp); + } + + my $a; + my $i = 0; + foreach $a (@authors) { + $i ++; + #parse the author lastname and givennames + undef(my $last); + undef(my $given); + if (index($a, ',') > 0) { #genbank format, last,f.m. 
+ ($last, $given) = split(/\,/, $a); + } elsif (index($a, ' ') > 0) { #embl format, last f.m. + ($last, $given) = split(/ /, $a); + } + my %au = ( + 'surname' => $last, + 'givennames' => $given, + ); + push(@aut, {author_id => \%au, arank => $i}); + } + + return \@aut; + } + + #no authors, Bio::SeqIO::genbank doesn't pick up 'CONSRTM' line. + else { + return; + } } @@ -1616,15 +1620,15 @@ sub _getSubmitYear { my $citation = shift; if ($citation !~ /Submitted/) { - $self->warn("not citation for a submitted reference. cannot extract submission year."); - return; + $self->warn("not citation for a submitted reference. cannot extract submission year."); + return; } else { - $citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-\d{4}\)/; - my $a = $MATCH; - $a =~ /\d{4}/; - my $year = $MATCH; + $citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-\d{4}\)/; + my $a = $MATCH; + $a =~ /\d{4}/; + my $year = $MATCH; - return $year; + return $year; } } @@ -1635,47 +1639,33 @@ sub _getSubmitAddr { my $citation = $ref->location; if ($citation !~ /Submitted/) { - $self->warn("not citation for a submitted reference. cannot extract submission year."); - return; + $self->warn("not citation for a submitted reference. cannot extract submission year."); + return; } else { - $citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-\d{4}\)/; - my $a = $POSTMATCH; - if (defined $a) { - $a =~ s/^\s//; - %author = ( - 'author_id' => {'surname' => substr($a, 0, 100)}, - ); - return \%author; - } else { - return; - } + $citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-\d{4}\)/; + my $a = $POSTMATCH; + if (defined $a) { + $a =~ s/^\s//; + %author = ( + 'author_id' => {'surname' => substr($a, 0, 100)}, + ); + return \%author; + } else { + return; + } } } - =head2 suppress_residues -=over - -=item Usage - - $obj->suppress_residues() #get existing value - $obj->suppress_residues($newval) #set new value - -=item Function - -Keep track of the flag to suppress printing of residues in the chadoxml file. -The default it to allow all residues to go into the file. - -=item Returns - -value of suppress_residues (a scalar) - -=item Arguments - -new value of suppress_residues (to set) - -=back + Title : suppress_residues + Usage : $obj->suppress_residues() #get existing value + $obj->suppress_residues($newval) #set new value + Function : Keep track of the flag to suppress printing of residues in the + chadoxml file. The default it to allow all residues to go into the + file. + Returns : value of suppress_residues (a scalar) + Args : new value of suppress_residues (to set) =cut @@ -1688,32 +1678,19 @@ sub suppress_residues { =head2 allow_residues -=over - -=item Usage - - $obj->allow_residues() #get existing value - $obj->allow_residues($feature_type) #set new value - -=item Function - -Track the allow_residues type. This can be used in conjunction with the -suppress_residues flag to only allow residues from a specific feature type -to be printed in the xml file, for example, only printing chromosome -residues. When suppress_residues is set to true, then only chromosome -features would would go into the xml file. If suppress_residues is not -set, this function has no effect (since the default is to put all residues -in the xml file). - -=item Returns - -value of allow_residues (a string that corresponds to a feature type) - -=item Arguments - -new value of allow_residues (to set) - -=back + Title : allow_residues + Usage : $obj->allow_residues() #get existing value + $obj->allow_residues($feature_type) #set new value + Function : Track the allow_residues type. 
This can be used in conjunction + with the suppress_residues flag to only allow residues from a + specific feature type to be printed in the xml file, for example, + only printing chromosome residues. When suppress_residues is set to + true, then only chromosome features would would go into the xml + file. If suppress_residues is not set, this function has no effect + (since the default is to put all residues in the xml file). + Returns : value of allow_residues (string that corresponds to a feature type) + Args : new value of allow_residues (to set) + Status : =cut @@ -1726,27 +1703,14 @@ sub allow_residues { =head2 return_ftype_hash -=over - -=item Usage - - $obj->return_ftype_hash() - -=item Function - -A simple hash where returning it has be factored out of the main -code to allow subclasses to override it. - -=item Returns - -A hash that indicates what the name of the SO term is and what -the name of the Sequence Ontology is in the cv table. - -=item Arguments - -The string that represents the SO term. - -=back + Title : return_ftype_hash + Usage : $obj->return_ftype_hash() + Function : A simple hash where returning it has be factored out of the main + code to allow subclasses to override it. + Returns : A hash that indicates what the name of the SO term is and what + the name of the Sequence Ontology is in the cv table. + Args : The string that represents the SO term. + Status : =cut @@ -1760,26 +1724,13 @@ sub return_ftype_hash { =head2 return_reltypename -=over - -=item Usage - - $obj->return_reltypename() - -=item Function - -Return the appropriate relationship type name depending on the -feature type (typically part_of, but derives_from for polypeptide). - -=item Returns - -A relationship type name. - -=item Arguments - -A SO type name. - -=back + Title : return_reltypename + Usage : $obj->return_reltypename + Function : Return the appropriate relationship type name depending on the + feature type (typically part_of, but derives_from for polypeptide). + Returns : A relationship type name. + Args : A SO type name. + Status : =cut @@ -1799,21 +1750,12 @@ sub return_reltypename { =head2 next_seq -=over - -=item Usage - - $obj->next_seq() - -=item Function - -Not implemented--this is a write-only adapter. 
- -=item Returns - -=item Arguments - -=back + Title : next_seq + Usage : $obj->next_seq + Function : + Returns : + Args : + Status : Not implemented (write only adaptor) =cut @@ -1824,28 +1766,14 @@ sub next_seq { } - =head2 _create_writer -=over - -=item Usage - - $obj->_create_writer() - -=item Function - -Creates XML::Writer object and writes start tag - -=item Returns - -Nothing, though the writer persists as part of the chadoxml object - -=item Arguments - -None - -=back + Title : _create_writer + Usage : $obj->_create_writer + Function : Creates XML::Writer object and writes start tag + Returns : Nothing, though the writer persists as part of the chadoxml object + Args : None + Status : =cut @@ -1869,25 +1797,12 @@ sub _create_writer { =head2 close_chadoxml -=over - -=item Usage - - $obj->close_chadoxml() - -=item Function - -Writes the closing xml tag - -=item Returns - -Nothing - -=item Arguments - -None - -=back + Title : close_chadoxml + Usage : $obj->close_chadoxml + Function : Writes the closing xml tag + Returns : None + Args : None + Status : =cut @@ -1900,25 +1815,12 @@ sub close_chadoxml { =head2 handle_unreserved_tags -=over - -=item Usage - - $obj->handle_unreserved_tags() - -=item Function - -Converts tag value pairs to xml-ready hashrefs - -=item Returns - -The array containing the hashrefs - -=item Arguments - -In order: the Seq or SeqFeature object, the key, and the hasharray - -=back + Title : handle_unreserved_tags + Usage : $obj->handle_unreserved_tags + Function : Converts tag value pairs to xml-ready hashrefs + Returns : The array containing the hashrefs + Args : In order: the Seq or SeqFeature object, the key, and the hasharray + Status : =cut @@ -1944,25 +1846,12 @@ sub handle_unreserved_tags { =head2 handle_Alias_tag -=over - -=item Usage - - $obj->handle_Alias_tag() - -=item Function - -Convert Alias values to synonym hash refs - -=item Returns - -An array of synonym hash tags - -=item Arguments - -The seq or seqFeature object and the synonym hash array - -=back + Title : handle_Alias_tag + Usage : $obj->handle_Alias_tag + Function : Convert Alias values to synonym hash refs + Returns : An array of synonym hash tags + Args : The seq or seqFeature object and the synonym hash array + Status : =cut @@ -1993,27 +1882,14 @@ sub handle_Alias_tag { return @arr; } -=head2 handle_Ontology_tag - -=over - -=item Usage - - $obj->handle_Ontology_tag () - -=item Function - -Convert Ontology_term values to ontology term hash refs - -=item Returns - -An array of ontology term hash refs +=head2 handle_Ontology_tag -=item Arguments - -The seq or seqFeature object and the ontology term array - -=back + Title : handle_Ontology_tag + Usage : $obj->handle_Ontology_tag + Function : Convert Ontology_term values to ontology term hash refs + Returns : An array of ontology term hash refs + Args : The seq or seqFeature object and the ontology term array + Status : =cut @@ -2045,25 +1921,12 @@ sub handle_Ontology_tag { =head2 handle_dbxref -=over - -=item Usage - - $obj->handle_dbxref() - -=item Function - -Convert Dbxref values to dbxref hashref - -=item Returns - -An array of dbxref hashrefs - -=item Arguments - -A seq or seqFeature object and the dbxref array - -=back + Title : handle_dbxref + Usage : $obj->handle_dbxref + Function : Convert Dbxref values to dbxref hashref + Returns : An array of dbxref hashrefs + Args : A seq or seqFeature object and the dbxref array + Status : =cut @@ -2102,19 +1965,12 @@ sub handle_dbxref { =head2 handle_source -=over - -=item Usage - - 
$obj->handle_source() - -=item Function - -=item Returns - -=item Arguments - -=back + Title : handle_source + Usage : $obj->handle_source + Function : + Returns : + Args : + Status : =cut @@ -2139,26 +1995,13 @@ sub handle_source { =head2 _srcf_hash -=over - -=item Usage - - $obj->_srcf_hash() - -=item Function - -Creates the srcfeature hash for use in featureloc hashes - -=item Returns - -The srcfeature hash - -=item Arguments - -The srcfeature name, the srcfeature type and a reference to the -organism hash. - -=back + Title : _srcf_hash + Usage : $obj->_srcf_hash + Function : Creates the srcfeature hash for use in featureloc hashes + Returns : The srcfeature hash + Args : The srcfeature name, the srcfeature type and a reference to the + organism hash. + Status : =cut diff --git a/Bio/SeqIO/flybase_chadoxml.pm b/Bio/SeqIO/flybase_chadoxml.pm index 7c1da88bf6..b0254e6cdf 100644 --- a/Bio/SeqIO/flybase_chadoxml.pm +++ b/Bio/SeqIO/flybase_chadoxml.pm @@ -91,30 +91,16 @@ sub _initialize { return; } - =head2 return_ftype_hash -=over - -=item Usage - - $obj->return_ftype_hash() - -=item Function - -A simple hash where returning it has be factored out of the main -code to allow subclasses to override it. - -=item Returns - -A hash that indicates what the name of the SO term is and what -the name of the Sequence Ontology is in the cv table. - -=item Arguments - -The string that represents the SO term. - -=back + Title : return_ftype_hash + Usage : $obj->return_ftype_hash() + Function : A simple hash where returning it has be factored out of the main + code to allow subclasses to override it. + Returns : A hash that indicates what the name of the SO term is and what + the name of the Sequence Ontology is in the cv table. + Args : The string that represents the SO term. + Status : =cut @@ -128,26 +114,13 @@ sub return_ftype_hash { =head2 return_reltypename -=over - -=item Usage - - $obj->return_reltypename() - -=item Function - -Return the appropriate relationship type name depending on the -feature type (typically partof, but producedby for proteins). - -=item Returns - -A relationship type name. - -=item Arguments - -A SO type name. - -=back + Title : return_reltypename + Usage : $obj->return_reltypename + Function : Return the appropriate relationship type name depending on the + feature type (typically part_of, but derives_from for polypeptide). + Returns : A relationship type name. + Args : A SO type name. + Status : =cut @@ -167,23 +140,20 @@ sub return_reltypename { =head2 write_seq -=over - -=item Usage - - $obj->write_seq() - -=item Function - -Overrides Bio::SeqIO::chadoxml's write_seq method just -to add an internal close_chadoxml (mimics original use -by FlyBase). - -=item Returns - -=item Arguments - -=back + Title : write_seq + Usage : $stream->write_seq(-seq=>$seq, -seq_so_type=>$seqSOtype, + -src_feature=>$srcfeature, + -src_feat_type=>$srcfeattype, + -nounflatten=>0 or 1, + -is_analysis=>'true' or 'false', + -data_source=>$datasource) + Function: writes the $seq object (must be seq) into chadoxml. + Returns : 1 for success and 0 for error + Args : A Bio::Seq object $seq, optional $seqSOtype, $srcfeature, + $srcfeattype, $nounflatten, $is_analysis and $data_source. + +Overrides Bio::SeqIO::chadoxml's write_seq method just to add an internal +close_chadoxml (mimics original use by FlyBase). 
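+
+A minimal usage sketch (file names are illustrative; the input record must
+carry an organism, which the chado mapping requires):
+
+  my $in  = Bio::SeqIO->new(-format => 'genbank',
+                            -file   => 'AE003734.gbk');
+  my $out = Bio::SeqIO->new(-format => 'flybase_chadoxml',
+                            -file   => '>AE003734.chado.xml');
+  $out->write_seq(-seq => $in->next_seq);
+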
=cut diff --git a/Bio/Tools/TargetP.pm b/Bio/Tools/TargetP.pm index 2c617f4854..33df0ff12b 100644 --- a/Bio/Tools/TargetP.pm +++ b/Bio/Tools/TargetP.pm @@ -87,7 +87,7 @@ of the bugs and their resolution. Bug reports can be submitted via the web: http://bugzilla.open-bio.org/ - + =head1 AUTHORS - Emmanuel Quevillon Email emmanuel.quevillon@versailles.inra.fr diff --git a/Build.PL b/Build.PL index f8b2258ccd..815d24310e 100755 --- a/Build.PL +++ b/Build.PL @@ -203,9 +203,10 @@ sub make_dbi_test { my $dsn = $build->notes('test_dsn') || return; my $path0 = File::Spec->catfile('t', 'LocalDB', 'SeqFeature.t'); my $driver = $build->notes('dbd_driver'); - my $path = File::Spec->catfile('t', 'LocalDB', ($driver eq 'mysql') - ? 'SeqFeature_mysql.t' - : 'SeqFeature_Pg.t'); + my $path = File::Spec->catfile('t', 'LocalDB', + ($driver eq 'mysql') ? 'SeqFeature_mysql.t' : + ($driver eq 'SQLite') ? 'SeqFeature_SQLite.t' : + 'SeqFeature_Pg.t'); my $test_db = $build->notes('test_db'); my $user = $build->notes('test_user'); my $pass = $build->notes('test_pass'); @@ -263,6 +264,9 @@ sub prompt_for_biodb { elsif ($driver =~ /^[oO]/) { $driver = 'Oracle'; } + elsif ($driver =~ /^[sS]/) { + $driver = 'SQLite'; + } my $test_db = $build->prompt("Which database should I use for testing the $driver driver?\n". "This database should already be present but doesn't have to ". diff --git a/maintenance/deprecated.pl b/maintenance/deprecated.pl index 109d707ab9..50f3ed784d 100644 --- a/maintenance/deprecated.pl +++ b/maintenance/deprecated.pl @@ -195,11 +195,6 @@ =head1 OPTIONS Name of output file to write deprecation table to. DEPRECATED.NEW is the default name -=item B<-o | --outfile> - -Name of output file to write deprecation table to. DEPRECATED.NEW is the default -name - =item B<-v | --verbose> Show the progress through files during the checking. diff --git a/scripts/Bio-DB-EUtilities/einfo.PLS b/scripts/Bio-DB-EUtilities/einfo.PLS new file mode 100644 index 0000000000..bef96fb723 --- /dev/null +++ b/scripts/Bio-DB-EUtilities/einfo.PLS @@ -0,0 +1,95 @@ +#!perl +# $Id: einfo.PLS 15088 2008-12-04 02:49:09Z bosborne $ +use strict; +use warnings; + +=head1 NAME + +einfo.pl - query einfo to find all available databases or information about a + specific database (field information or links to other NCBI + databases) + +=head1 SYNOPSIS + + einfo [-d database] [-f Field Code] [-l Link Name] [-o outfile] + +=head1 DESCRIPTION + +Command line options: + + -d/--db/--database + NCBI database to query + (default = none, which shows available databases) + + -f/--field + print out information about a specific field code + (default = none) + + -l/--link + print out information about a specific link name + (default = none) + + -o/--out + outfile + (default = STDOUT) + + -h/--help + show this documentation + +If -d is not specified, field and link arguments are ignored and all available +databases are printed instead. + +If either link names or field codes (or both) are specified, nothing else is +printed out (only the info requested). You can specify as many fields and/or +links as you want by using multiple -f/-l EARGE on the command line. 
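+
+Example invocations (a sketch; the database, field code and link name are
+illustrative values only):
+
+  # list every available database (to STDOUT or, with -o, to a file)
+  einfo
+  einfo -o databases.txt
+
+  # print all field and link information for one database
+  einfo -d gene
+
+  # restrict the report to a single field code and a single link name
+  einfo -d gene -f ALL -l gene_pubmed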
+ +=head1 AUTHOR - Chris Fields + +Chris Fields cjfields at bioperl dot org + +=cut + +use Getopt::Long; +use Bio::DB::EUtilities; + +my ($db, @fields, @links, $outfile); +GetOptions( + 'd|db|database:s' => \$db, + 'f|field:s' => \@fields, + 'l|link:s' => \@links, + 'o|out|outfile:s' => \$outfile, + 'h|help' => sub { exec('perldoc',$0); exit; } + ); + +my $outfh; +if( $outfile ) { + open($outfh, ">$outfile") || die("$outfile: $!"); +} else { + $outfh = \*STDOUT; +} + +if (!defined $db) { + my $eutil = Bio::DB::EUtilities->new(-eutil => 'einfo'); + print $outfh join("\n",$eutil->get_available_databases); + exit; +} else { + my $eutil = Bio::DB::EUtilities->new(-eutil => 'einfo', + -db => $db); + if (@links || @fields) { + for my $fi ($eutil->get_FieldInfo) { + my $code = $fi->get_field_code; + if (grep {$_ eq $code} @fields) { + print $outfh $fi->to_string."\n"; + } + } + for my $li ($eutil->get_LinkInfo) { + my $nm = $li->get_link_name; + if (grep {$_ eq $nm} @links) { + print $outfh $li->to_string."\n"; + } + } + } else { + $eutil->print_FieldInfo; + $eutil->print_LinkInfo; + } +} diff --git a/t/LocalDB/SeqFeature_SQLite.t b/t/LocalDB/SeqFeature_SQLite.t index d0315e0ffe..0da88dcb6c 100644 --- a/t/LocalDB/SeqFeature_SQLite.t +++ b/t/LocalDB/SeqFeature_SQLite.t @@ -1 +1 @@ -system 'perl t/LocalDB/SeqFeature.t -adaptor DBI::SQLite -create 1 -temp 1 -dsn dbi:SQLite: -debug 0'; +system 'perl t/LocalDB/SeqFeature.t -adaptor DBI::SQLite -create 1 -temp 1 -dsn dbi:SQLite:database=test'; diff --git a/t/PodSyntax.t b/t/PodSyntax.t index e983ddc20c..47e658abc7 100644 --- a/t/PodSyntax.t +++ b/t/PodSyntax.t @@ -10,4 +10,4 @@ BEGIN { } # check pod is syntactically correct -all_pod_files_ok( all_pod_files('.') ) +all_pod_files_ok( all_pod_files(qw(Bio scripts examples maintenance)) ) diff --git a/t/gmap_f9-searchio.t b/t/SearchIO/gmap_f9.t similarity index 94% rename from t/gmap_f9-searchio.t rename to t/SearchIO/gmap_f9.t index 14fc3e376a..69b5984cea 100644 --- a/t/gmap_f9-searchio.t +++ b/t/SearchIO/gmap_f9.t @@ -1,12 +1,16 @@ -#!perl +# -*-Perl-*- Test Harness script for Bioperl +# $Id: gmap_f9.t 14995 2008-11-16 06:20:00Z cjfields $ use strict; use warnings; - -use Test::More tests => 46; - -use Bio::SearchIO; -use Data::Dumper; +BEGIN { + use lib '.'; + use Bio::Root::Test; + + test_begin(-tests => 47); + + use_ok('Bio::SearchIO'); +} my $searchio = Bio::SearchIO->new(-format => 'gmap_f9', @@ -81,17 +85,12 @@ $searchio = Bio::SearchIO->new(-format => 'gmap_f9', my $result_count = 0; while (my $result = $searchio->next_result) { $result_count++; -# while(my $hit = $result->next_hit){ -# my @hsps = $hit->hsps(); -# print $hit->name, " ", $hit->length, scalar(@hsps); -# } } is($result_count, 58, "Can we loop over multiple results properly (expecting 58)?"); exit(0); - sub _check_hit { my ($hit, $info) = @_;