From a2afbafcd7f035d133e3722dfc6668c00b0eec43 Mon Sep 17 00:00:00 2001 From: carlosp420 Date: Wed, 3 Dec 2014 15:39:35 +0200 Subject: [PATCH 1/3] copy code in Biopython --- Bio/bold/__init__.py | 24 ++ Bio/bold/api.py | 684 +++++++++++++++++++++++++++++++++++++++++++ Bio/bold/utils.py | 32 ++ 3 files changed, 740 insertions(+) create mode 100755 Bio/bold/__init__.py create mode 100644 Bio/bold/api.py create mode 100644 Bio/bold/utils.py diff --git a/Bio/bold/__init__.py b/Bio/bold/__init__.py new file mode 100755 index 00000000000..a6e88217512 --- /dev/null +++ b/Bio/bold/__init__.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 by Carlos Pena. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code to invoke the BOLD server over the internet. + +This module provides code to work with the BOLD API provided by BOLDSYSTEMS +http://www.boldsystems.org/index.php/Resources + +""" +import logging + +from .api import call_id +from .api import call_taxon_search +from .api import call_taxon_data +from .api import call_specimen_data +from .api import call_sequence_data +from .api import call_full_data +from .api import call_trace_files + + +logging.basicConfig(format='[bold module]:%(levelname)s:%(message)s', level=logging.DEBUG) diff --git a/Bio/bold/api.py b/Bio/bold/api.py new file mode 100644 index 00000000000..2278b236488 --- /dev/null +++ b/Bio/bold/api.py @@ -0,0 +1,684 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 by Carlos Pena. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code to invoke the BOLD server over the internet. 
class Response(object):
    """Parsed result of a call to the BOLD API.

    An instance is filled in by :meth:`_parse_data`, which selects the parser
    that matches the service alias the payload came from.

    Attributes:
        items (list or str): Parsed records. A list of dictionaries for the
            JSON and XML services, a list of Biopython SeqRecord objects for
            ``call_sequence_data``, or the untouched text when the payload of
            an XML service could not be parsed as XML (e.g. tab-separated
            values). ``None`` until something has been parsed.
        method (str): Alias of the service whose payload was parsed.
        file_contents (bytes): Raw payload of ``call_trace_files``. ``None``
            for every other service.

    """

    # JSON keys that are renamed while copying a taxon entry; every key that
    # is not listed here is copied under its original name.
    _JSON_KEY_ALIASES = {
        'taxid': 'tax_id',
        'parentid': 'parent_id',
        'parentname': 'parent_name',
        'taxonrep': 'taxon_rep',
    }

    # A JSON object whose keys all start with digits is treated as a mapping
    # of ids to taxon entries; otherwise the object itself is the entry.
    _ID_KEY = re.compile('^[0-9]+')

    # (path below a <match>/<record> element, friendlier key name).
    # Built once for the class instead of once per parsed record.
    _XML_FIELDS = (
        # Fields returned by call_id
        ('ID', 'bold_id'),
        ('sequencedescription', 'sequence_description'),
        ('database', 'database'),
        ('citation', 'citation'),
        ('taxonomicidentification', 'taxonomic_identification'),
        ('similarity', 'similarity'),
        ('specimen/url', 'specimen_url'),
        ('specimen/collectionlocation/country', 'specimen_collection_location_country'),
        ('specimen/collectionlocation/coord/lat', 'specimen_collection_location_latitude'),
        ('specimen/collectionlocation/coord/lon', 'specimen_collection_location_longitude'),

        # Fields returned by the specimen / combined data services
        ('record_id', 'record_id'),
        ('processid', 'process_id'),
        ('bin_uri', 'bin_uri'),
        ('specimen_identifiers/sampleid', 'specimen_identifiers_sample_id'),
        ('specimen_identifiers/catalognum', 'specimen_identifiers_catalog_num'),
        ('specimen_identifiers/fieldnum', 'specimen_identifiers_field_num'),
        ('specimen_identifiers/institution_storing', 'specimen_identifiers_institution_storing'),
        ('taxonomy/identification_provided_by', 'taxonomy_identification_provided_by'),
        ('taxonomy/phylum/taxon/taxID', 'taxonomy_phylum_taxon_id'),
        ('taxonomy/phylum/taxon/name', 'taxonomy_phylum_taxon_name'),
        ('taxonomy/class/taxon/taxID', 'taxonomy_class_taxon_id'),
        ('taxonomy/class/taxon/name', 'taxonomy_class_taxon_name'),
        ('taxonomy/order/taxon/taxID', 'taxonomy_order_taxon_id'),
        ('taxonomy/order/taxon/name', 'taxonomy_order_taxon_name'),
        ('taxonomy/family/taxon/taxID', 'taxonomy_family_taxon_id'),
        ('taxonomy/family/taxon/name', 'taxonomy_family_taxon_name'),
        ('taxonomy/genus/taxon/taxID', 'taxonomy_genus_taxon_id'),
        ('taxonomy/genus/taxon/name', 'taxonomy_genus_taxon_name'),
        ('taxonomy/species/taxon/taxID', 'taxonomy_species_taxon_id'),
        ('taxonomy/species/taxon/name', 'taxonomy_species_taxon_name'),
        ('specimen_details/voucher_type', 'specimen_details_voucher_type'),
        ('specimen_details/voucher_desc', 'specimen_details_voucher_desc'),
        ('specimen_details/extrainfo', 'specimen_details_extra_info'),
        ('specimen_details/lifestage', 'specimen_details_lifestage'),
        ('collection_event/collector', 'collection_event_collector'),
        ('collection_event/collectors', 'collection_event_collectors'),
        ('collection_event/collectiondate', 'collection_event_collection_date'),
        ('collection_event/coordinates/lat', 'collection_event_coordinates_latitude'),
        ('collection_event/coordinates/long', 'collection_event_coordinates_longitude'),
        ('collection_event/exactsite', 'collection_event_exact_site'),
        ('collection_event/country', 'collection_event_country'),
        ('collection_event/province', 'collection_event_province'),
        ('specimen_imagery/media/mediaID', 'specimen_imagery_media_id'),
        ('specimen_imagery/media/caption', 'specimen_imagery_media_caption'),
        ('specimen_imagery/media/metatags', 'specimen_imagery_media_metatags'),
        ('specimen_imagery/media/copyright', 'specimen_imagery_media_copyright'),
        ('specimen_imagery/media/image_file', 'specimen_imagery_media_image_file'),
        ('tracefiles/read/read_id', 'tracefiles_read_read_id'),
        ('tracefiles/read/run_date', 'tracefiles_read_run_date'),
        ('tracefiles/read/sequencing_center', 'tracefiles_read_sequencing_center'),
        ('tracefiles/read/direction', 'tracefiles_read_direction'),
        ('tracefiles/read/seq_primer', 'tracefiles_read_seq_primer'),
        ('tracefiles/read/trace_link', 'tracefiles_read_trace_link'),
        ('tracefiles/read/markercode', 'tracefiles_read_marker_code'),
        ('sequences/sequence/sequenceID', 'sequences_sequence_sequence_id'),
        ('sequences/sequence/markercode', 'sequences_sequence_marker_code'),
        ('sequences/sequence/genbank_accession', 'sequences_sequence_genbank_accession'),
        ('sequences/sequence/nucleotides', 'sequences_sequence_nucleotides'),
    )

    def __init__(self):
        # Give every documented attribute a value up front so that reading
        # e.g. ``items`` on a trace-file response yields None instead of
        # raising AttributeError.
        self.items = None
        self.method = None
        self.file_contents = None

    def _parse_data(self, service, result_string):
        """Dispatch a raw BOLD payload to the parser for ``service``.

        Args:
            service: Alias of the method used to interact with BOLD.
            result_string: Payload returned from BOLD: JSON, XML, TSV or
                FASTA text, or bytes for ``call_trace_files``.

        Raises:
            ValueError: If the payload is empty or only whitespace, or if a
                JSON payload is not a JSON object.

        """
        self.method = service

        # ``not x.strip()`` also catches an empty *bytes* payload; comparing
        # bytes with '' is never true on Python 3, so the old equality test
        # let empty trace-file responses through.
        if not result_string.strip():
            raise ValueError("BOLD did not return any result.")

        if service in ('call_taxon_search', 'call_taxon_data'):
            self._parse_json(result_string)
        elif service in ('call_specimen_data', 'call_full_data', 'call_id'):
            # Python 2.6 has no ET.ParseError and raises ExpatError instead;
            # detect the feature rather than sniffing the version string.
            xml_error = getattr(ET, 'ParseError', None)
            if xml_error is None:
                xml_error = xml.parsers.expat.ExpatError
            try:
                self._parse_xml(result_string)
            except xml_error:
                # Not XML: the payload could be tab-separated values, so
                # hand the text back untouched.
                self.items = result_string
        elif service == 'call_sequence_data':
            self._parse_fasta(result_string)
        elif service == 'call_trace_files':
            # The payload is in binary form; keep it as-is.
            self.file_contents = result_string

    def _parse_json(self, result_string):
        """Parse a JSON response from BOLD into ``self.items``.

        A JSON object whose keys all start with digits is read as a mapping
        of ids to taxon entries; any other object is read as one single
        entry. Only dictionary-like entries produce an item.

        Args:
            result_string: JSON string returned from BOLD.

        Raises:
            ValueError: "BOLD did not return any result." when the decoded
                JSON is not an object (for example a list).

        """
        response = json.loads(result_string)
        if not hasattr(response, 'items'):
            raise ValueError("BOLD did not return any result.")

        if all(self._ID_KEY.search(key) for key in response):
            entries = response.values()
        else:
            entries = [response]

        aliases = self._JSON_KEY_ALIASES
        items_from_bold = []
        for entry in entries:
            if hasattr(entry, 'items'):
                # dict(generator) rather than a dict comprehension, because
                # this module still caters for Python 2.6.
                items_from_bold.append(
                    dict((aliases.get(k, k), v) for k, v in entry.items()))
        self.items = items_from_bold

    def _parse_xml(self, result_string):
        """Parse an XML response from BOLD into ``self.items``.

        Reads ``<match>`` elements when ``self.method`` is ``call_id`` and
        ``<record>`` elements otherwise. For every path in ``_XML_FIELDS``
        that is present, the item gets the element text, or a list of texts
        when the path matches several elements. Absent paths add no key.

        Args:
            result_string: XML string returned from BOLD.

        """
        xml_tag = 'match' if self.method == 'call_id' else 'record'

        root = ET.fromstring(result_string)
        items_from_bold = []
        for record in root.findall(xml_tag):
            item = {}
            for path, key in self._XML_FIELDS:
                # One lookup per field instead of find + findall + find.
                matched = record.findall(path)
                if len(matched) == 1:
                    item[key] = matched[0].text
                elif matched:
                    item[key] = [node.text for node in matched]
            items_from_bold.append(item)
        self.items = items_from_bold

    def _parse_fasta(self, result_string):
        """Parse FASTA text from BOLD into a list of SeqRecord objects.

        The text is parsed from an in-memory handle. Writing it to a randomly
        named file in the working directory could collide with another call,
        fails in read-only directories and left the file behind whenever
        parsing raised.

        Args:
            result_string: FASTA sequences as string returned from BOLD.

        """
        # Local import: StringIO comes from the compatibility module this
        # file already depends on, and nothing else in the class needs it.
        from Bio._py3k import StringIO

        self.items = list(SeqIO.parse(StringIO(result_string), "fasta"))
def request(service, **kwargs):
    """Build and send the request for a BOLD service. Used internally.

    Looks up the endpoint URL for ``service``, warns when a filter that can
    match very many records is in use, and delegates the HTTP call to
    ``Request.get``.

    Args:
        service: The BOLD API alias to interact with. Examples: `call_id`,
            `call_taxon_search`.
        kwargs: Arguments passed by users when calling our methods.

    Returns:
        Whatever ``Request().get`` returns for the service, URL and user
        arguments.

    Raises:
        ValueError: If ``service`` is not a known alias. Before, an unknown
            alias surfaced as an unrelated KeyError or unbound ``url``.

    """
    endpoints = {
        # Note: the ID engine is served from the bare domain, without "www".
        'call_id': "http://boldsystems.org/index.php/Ids_xml",
        'call_taxon_search': "http://www.boldsystems.org/index.php/API_Tax/TaxonSearch",
        'call_taxon_data': "http://www.boldsystems.org/index.php/API_Tax/TaxonData",
        'call_trace_files': "http://www.boldsystems.org/index.php/API_Public/trace",
        'call_specimen_data': "http://www.boldsystems.org/index.php/API_Public/specimen",
        'call_sequence_data': "http://www.boldsystems.org/index.php/API_Public/sequence",
        'call_full_data': "http://www.boldsystems.org/index.php/API_Public/combined",
    }
    try:
        url = endpoints[service]
    except KeyError:
        raise ValueError('Unknown BOLD service: %r' % (service,))

    # Only the public data services take the broad filters below; the ID
    # engine and the taxonomy services never triggered the warning.
    if service not in ('call_id', 'call_taxon_search', 'call_taxon_data'):
        for arg in ('institutions', 'researchers', 'geo'):
            # .get(): a caller that leaves a filter out entirely is treated
            # the same as one passing None.
            if kwargs.get(arg) is not None:
                warnings.warn('Requesting ``' + arg + '`` data from BOLD will '
                              'possibly return a lot of records and the transfer '
                              'of data might take a lot of time to complete as '
                              'many Megabytes are expected.',
                              BiopythonWarning
                              )

    return Request().get(service=service, url=url, **kwargs)
def call_taxon_data(tax_id, data_type=None, include_tree=None):
    """Call the TaxonData API to retrieve metadata for one taxon.

    Args:
        tax_id: Taxon to get information for.
        data_type: ``basic|all|images``. ``None`` is sent as ``basic``.
        include_tree: Optional. ``True`` also asks for information on the
            parent taxa. ``None`` is treated like ``False`` (default).

    Returns:
        The result of ``request('call_taxon_data', ...)``; the examples
        below read the parsed records from its ``items`` attribute.

    Raises:
        ValueError: If `include_tree` is not True, False or None.

    Examples:

        >>> from Bio import bold
        >>> tax_id = 88899
        >>> res = bold.call_taxon_data(tax_id, data_type='basic,images')
        >>> item = res.items[0]
        >>> item['taxon']
        'Momotus'
        >>> [(i['image'], i['photographer']) for i in item['images']]
        [('BSPBB/MJM_7364_IMG_2240_d+1345758620.JPG', 'Oscar Lopez')]

    """
    # Identity checks on purpose: look-alikes such as 1 or 'true' must be
    # rejected, not coerced.
    if include_tree is None:
        include_tree = False
    elif include_tree is not True and include_tree is not False:
        raise ValueError('Invalid value for ``include_tree``. Use True or False.')

    # BOLD is always told which data types to send; 'basic' is our default.
    chosen_type = 'basic' if data_type is None else data_type

    return request('call_taxon_data', tax_id=tax_id, data_type=chosen_type,
                   include_tree=include_tree)
def call_sequence_data(taxon=None, ids=None, bin=None, container=None,
                       institutions=None, researchers=None, geo=None,
                       marker=None):
    """Call the Sequence Data Retrieval API.

    Every argument is an optional filter and is forwarded unchanged to
    ``request('call_sequence_data', ...)``.

    Args:
        taxon: Taxon name including the ranks: phylum, class, order, family,
            subfamily, genus and species. Example: `taxon='Bos taurus'`.
        ids: Sample ids, process ids, museum ids and field ids. Example:
            `ids='ACRJP618|ACRJP619-11'`.
        bin: BIN stands for Barcode Index Number URI. Example:
            `bin='BOLD:AAA5125'`.
        container: Containers include project codes and dataset codes.
            Example: `container='DS-EZROM'`.
        institutions: Name of Specimen Storing Sites. Example:
            `institutions='Biodiversity Institute of Ontario'`.
        researchers: Collectors and specimen identifiers. Example:
            `researchers='Thibaud Decaens'`.
        geo: Geographic sites such as countries, provinces and states.
            Example: `geo='Alaska'`.
        marker: Genetic marker code. Example: `marker='COI-5P'`.

    Returns:
        The result of ``request('call_sequence_data', ...)``; the example
        below reads the matching sequence records from its ``items``
        attribute.

    Examples:

        >>> from Bio import bold
        >>> res = bold.call_sequence_data(taxon='Hermeuptychia', geo='Peru')
        >>> items = res.items
        >>> [item.id for item in items]
        ['GBLN4477-14|Hermeuptychia', 'GBLN4478-14|Hermeuptychia', 'GBLN4479-14|Hermeuptychia']

    """
    filters = dict(
        taxon=taxon,
        ids=ids,
        bin=bin,
        container=container,
        institutions=institutions,
        researchers=researchers,
        geo=geo,
        marker=marker,
    )
    return request('call_sequence_data', **filters)
def call_trace_files(taxon=None, ids=None, bin=None, container=None,
                     institutions=None, researchers=None, geo=None,
                     marker=None):
    """Retrieve trace files from BOLD, filtered by several optional parameters.

    Every argument is forwarded unchanged to
    ``request('call_trace_files', ...)``.

    Args:
        taxon: Taxon name including the ranks: phylum, class, order, family,
            subfamily, genus and species. Example: `taxon='Bos taurus'`.
        ids: Sample ids, process ids, museum ids and field ids. Example:
            `ids='ACRJP618|ACRJP619-11'`.
        bin: BIN stands for Barcode Index Number URI. Example:
            `bin='BOLD:AAA5125'`.
        container: Containers include project codes and dataset codes.
            Example: `container='DS-EZROM'`.
        institutions: Name of Specimen Storing Sites. Example:
            `institutions='Biodiversity Institute of Ontario'`.
        researchers: Collectors and specimen identifiers. Example:
            `researchers='Thibaud Decaens'`.
        geo: Geographic sites such as countries, provinces and states.
            Example: `geo='Alaska'`.
        marker: Genetic marker code. Example: `marker='COI-5P'`.

    Returns:
        The result of ``request('call_trace_files', ...)``. As the example
        shows, its ``file_contents`` attribute holds the bytes of a TAR file
        consisting of compressed Trace Files (traces in either .ab1 or .scf
        format) along with a file listing the Process ID, taxon and marker
        for each Trace File included.

    Examples:

        >>> from Bio import bold
        >>> res = bold.call_trace_files(taxon='Euptychia mollis',
        ...                             institutions='York University')
        >>> with open("trace_files.tar", "wb") as handle:
        ...     handle.write(res.file_contents)
        4106240

    """
    query = {
        'taxon': taxon,
        'ids': ids,
        'bin': bin,
        'container': container,
        'institutions': institutions,
        'researchers': researchers,
        'geo': geo,
        'marker': marker,
    }
    return request('call_trace_files', **query)
% seq_record) From bf4cb5e74cd0072edfac838174c89f9686751160 Mon Sep 17 00:00:00 2001 From: carlosp420 Date: Wed, 3 Dec 2014 16:23:35 +0200 Subject: [PATCH 2/3] added Experimental Warning --- Bio/bold/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Bio/bold/__init__.py b/Bio/bold/__init__.py index a6e88217512..c631b26f87a 100755 --- a/Bio/bold/__init__.py +++ b/Bio/bold/__init__.py @@ -11,6 +11,9 @@ """ import logging +import warnings + +from Bio import BiopythonExperimentalWarning from .api import call_id from .api import call_taxon_search @@ -21,4 +24,10 @@ from .api import call_trace_files +warnings.warn('Bio.bold is an experimental submodule which may undergo ' + 'significant changes prior to its future official release.', + BiopythonExperimentalWarning) + +__docformat__ = "restructuredtext en" + logging.basicConfig(format='[bold module]:%(levelname)s:%(message)s', level=logging.DEBUG) From 8d0e2c6d72f0366c2e44d2c5af8f02adb87ee08a Mon Sep 17 00:00:00 2001 From: carlosp420 Date: Wed, 3 Dec 2014 16:34:45 +0200 Subject: [PATCH 3/3] added tests --- Tests/test_bold_api.py | 261 +++++++++++++++++++++++++++++++++++++++ Tests/test_bold_utils.py | 40 ++++++ setup.py | 1 + 3 files changed, 302 insertions(+) create mode 100755 Tests/test_bold_api.py create mode 100755 Tests/test_bold_utils.py diff --git a/Tests/test_bold_api.py b/Tests/test_bold_api.py new file mode 100755 index 00000000000..74698d4a28e --- /dev/null +++ b/Tests/test_bold_api.py @@ -0,0 +1,261 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 by Carlos Pena. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. 
+import unittest +import warnings + +from Bio import BiopythonWarning +from Bio._py3k import HTTPError +from Bio import MissingExternalDependencyError + +from Bio import bold +from Bio.bold import api + + +class TestApi(unittest.TestCase): + + def setUp(self): + warnings.simplefilter('ignore', BiopythonWarning) + + def test_call_id(self): + seq = "TTTTTGGTATTTGAGCAGGAATAGTAGGAACTTCTCTCAGTTTAATTATTCGAATAGAATTAGGTAATCCAGGTTTCTTAATTGGAGATGATCAAATTTATAATACTATTGTAACAGCCCATGCTTTTATTATAATTTTTTTTATAGTTATACCTATTGTAATTGGAGGATTTGGAAATTGACTAGTTCCCCTAATATTAGGTGCACCTGATATAGCTTTCCCTCGTATAAATAATATAAGATATTGACTACTTCCACCATCTTTAATATTATTAATTTCAAGTAGTATTGTAGAAAATGGAGCTGGAACAGGTTGAACAGTTTACCCCCCTCTTTCCTCTAATATTGCTCATAGAGGAACCTCAGTAGACTTAGCAATTTTTTCTCTTCATTTAGCTGGTATTTCTTCTATTTTAGGAGCTATTAATTTTATTACTACAATTATTAATATACGAGTTAATGGAATATCCTATGATCAAATACCTTTATTTGTTTGAGCTGTTGGAATTACAGCTCTTCTTTTACTTCTTTCTTTACCTGTTTTAGCAGGAGCTATCACAATACTTCTTACAGATCGAAATTTAAATACATCATTTTTTGATCCTGCAGGAGGAGGTGATCCAATTTTATACCAACATTTATTTTGATTTTTTGGTCACCC" + db = "COX1_SPECIES_PUBLIC" + res = bold.call_id(seq, db) + for item in res.items: + if item['similarity'] == 1: + self.assertEqual('Euptychia ordinata', item['taxonomic_identification']) + + def test_call_taxon_search(self): + taxonomic_identification = 'Euptychia ordinata' + expected = 302603 + res = bold.call_taxon_search(taxonomic_identification, fuzzy=False) + item = res.items[0] + self.assertEqual(expected, item['tax_id']) + + taxonomic_identification = 'Fabaceae' + res = bold.call_taxon_search(taxonomic_identification, fuzzy=False) + item = res.items[0] + self.assertEqual('Plants', item['tax_division']) + self.assertEqual(187, item['parent_id']) + self.assertEqual('Fabales', item['parent_name']) + self.assertEqual('Fabaceae', item['taxon_rep']) + + taxonomic_identification = 'Diplura' + res = bold.call_taxon_search(taxonomic_identification, fuzzy=False) + self.assertEqual(2, len(res.items)) + + def test_call_taxon_search_returns_empty(self): + 
taxonomic_identification = 'Fake species name'
+        self.assertRaises(ValueError, bold.call_taxon_search, taxonomic_identification, fuzzy=False)
+
+    def test_call_taxon_search_fuzzy_true(self):
+        """A fuzzy search for 'Fabaceae' returns five items."""
+        res = bold.call_taxon_search('Fabaceae', fuzzy=True)
+        self.assertEqual(5, len(res.items))
+
+    def test_call_taxon_search_fuzzy_error(self):
+        """The string 'true' as second positional argument raises ValueError."""
+        self.assertRaises(ValueError, bold.call_taxon_search, 'Fabaceae', 'true')
+
+    def test_call_specimen_data(self):
+        """The first Euptychia specimen is placed in family Nymphalidae."""
+        res = bold.call_specimen_data('Euptychia')
+        self.assertEqual('Nymphalidae', res.items[0]['taxonomy_family_taxon_name'])
+
+    def test_call_specimen_data_several_taxa(self):
+        """A '|'-separated taxon string returns Mycalesis records as well."""
+        res = bold.call_specimen_data('Euptychia|Mycalesis')
+        self.assertTrue('Mycalesis' in [item['taxonomy_genus_taxon_name'] for item in res.items])
+
+    def test_call_specimen_data_bin(self):
+        """BIN BOLD:AAE2777 holds a specimen identified by Jose Montero."""
+        bin_uri = 'BOLD:AAE2777'
+        res = bold.call_specimen_data(bin=bin_uri)
+        # Records lacking the identifier field are skipped.
+        identifiers = [item['taxonomy_identification_provided_by']
+                       for item in res.items
+                       if 'taxonomy_identification_provided_by' in item]
+        self.assertTrue('Jose Montero' in identifiers)
+
+    def test_call_specimen_data_container(self):
+        """Container ACRJP holds a specimen identified by Jacques L. Pierre."""
+        try:
+            res = bold.call_specimen_data(container='ACRJP')
+        except HTTPError:
+            # e.g. due to timeout: surface it as an unavailable external
+            # dependency instead of an unexpected HTTP error.
+            raise MissingExternalDependencyError("internet connection failed")
+
+        # Records lacking the identifier field are skipped.
+        identifiers = [item['taxonomy_identification_provided_by']
+                       for item in res.items
+                       if 'taxonomy_identification_provided_by' in item]
+        self.assertTrue('Jacques L. Pierre' in identifiers)
+
+    def test_call_specimen_data_institutions(self):
+        """University of Turku has a specimen identified by Meri Lindqvist."""
+        institutions = 'University of Turku'
+        res = bold.call_specimen_data(institutions=institutions)
+        # Records lacking the identifier field are skipped.
+        identifiers = [item['taxonomy_identification_provided_by']
+                       for item in res.items
+                       if 'taxonomy_identification_provided_by' in item]
+        self.assertTrue('Meri Lindqvist' in identifiers)
+
+    def test_call_specimen_data_researchers(self):
+        """Specimens for researcher Thibaud Decaens include some from Peru."""
+        researchers = 'Thibaud Decaens'
+        res = bold.call_specimen_data(researchers=researchers)
+        # Records lacking the country field are skipped.
+        countries = [item['collection_event_country']
+                     for item in res.items
+                     if 'collection_event_country' in item]
+        self.assertTrue('Peru' in countries)
+
+    def test_call_specimen_data_geo(self):
+        """A geo search for Iceland returns records collected in Iceland."""
+        geo = 'Iceland'
+        res = bold.call_specimen_data(geo=geo)
+        # Records lacking the country field are skipped.
+        countries = [item['collection_event_country']
+                     for item in res.items
+                     if 'collection_event_country' in item]
+        self.assertTrue('Iceland' in countries)
+
+    def test_call_specimen_data_format_tsv(self):
+        """Requesting format='tsv' yields items containing 'Iceland'."""
+        res = bold.call_specimen_data(geo='Iceland', format='tsv')
+        self.assertTrue('Iceland' in res.items)
+
+    def test_call_specimen_data_wrong_format(self):
+        """format='csv' is rejected with ValueError."""
+        self.assertRaises(ValueError, bold.call_specimen_data, geo='Iceland', format='csv')
+
+    def test_call_specimen_data_return_empty(self):
+        """A made-up geo value raises ValueError."""
+        self.assertRaises(ValueError, bold.call_specimen_data, geo='Fake country name')
+
+    def test_call_taxon_data_basic(self):
+        """Taxon 302603 reports parent id 7044 with data_type='basic'."""
+        tax_id = 302603
+        # data_type='basic' is passed explicitly here
+        res = bold.call_taxon_data(tax_id, data_type='basic')
+        self.assertEqual(7044, res.items[0]['parent_id'])
+
+    def test_call_taxon_data_basic_empty(self):
+        """Omitting data_type gives the same parent id as data_type='basic'."""
+        tax_id = 302603
+        res = bold.call_taxon_data(tax_id)
+        self.assertEqual(7044, res.items[0]['parent_id'])
+
+    def test_call_taxon_data_includetree_false(self):
+        """With include_tree=False the first item has parent id 7044."""
+        tax_id = 302603
+        # data_type='basic' is passed explicitly here
+        res = bold.call_taxon_data(tax_id, data_type='basic', include_tree=False)
+        self.assertEqual(7044, res.items[0]['parent_id'])
+
+    def test_call_taxon_data_includetree_true(self):
+        """With include_tree=True seven items are returned."""
+        tax_id = 302603
+        res = bold.call_taxon_data(tax_id, data_type='basic', include_tree=True)
+        self.assertEqual(7, len(res.items))
+
+    def test_call_taxon_data_includetree_error(self):
+        """The string 'true' as include_tree raises ValueError."""
+        # Pass arguments separately: one tuple would all land in the first parameter.
+        self.assertRaises(ValueError, bold.call_taxon_data, 302603, data_type='basic', include_tree='true')
+
+    def test_call_sequence_data(self):
+        """Hermeuptychia sequences from Peru include record GBLN4477-14."""
+        taxon = 'Hermeuptychia'
+        geo = 'Peru'
+        res = bold.call_sequence_data(taxon=taxon, geo=geo)
+        seq_record_ids = [record.id for record in res.items]
+        self.assertTrue('GBLN4477-14|Hermeuptychia' in seq_record_ids)
+
+    def test_call_sequence_data_returns_empty(self):
+        """Made-up taxon and geo values raise ValueError."""
+        # Keywords, as in test_call_sequence_data, so each value reaches the intended filter.
+        self.assertRaises(ValueError, bold.call_sequence_data, taxon='Fake taxon', geo='Fake country')
+
+    def test_call_full_data(self):
+        """Full data for Hermeuptychia from Peru includes sample id KF466142."""
+        res = bold.call_full_data(taxon='Hermeuptychia', geo='Peru')
+        # Unlike the specimen tests, every record is indexed without a key check.
+        sample_ids = [item['specimen_identifiers_sample_id'] for item in res.items]
+        self.assertTrue('KF466142' in sample_ids)
+
+    def test_call_full_data_invalid(self):
+        """format='csv' is rejected with ValueError."""
+        # Passed inline so the builtin format() is not shadowed by a local.
+        self.assertRaises(ValueError, bold.call_full_data, geo='Peru', format='csv')
+
+    def test_call_trace_files(self):
+        """A trace-file request yields a response whose file_contents is not None."""
+        taxon = 'Euptychia mollis'
+        institutions = 'York University'
+        res = bold.call_trace_files(taxon=taxon, institutions=institutions)
+        self.assertNotEqual(res.file_contents, None)
+
+    def test_parse_json(self):
+        """_parse_json copes with each JSON layout exercised below."""
+        res = api.Response()
+
+        # call_taxon_search
+        # The assertions read 'tax_id'/'parent_id' although the JSON says taxid/parentid.
+        json_string = '{"302603":{"taxid":302603,"taxon":"Euptychia ordinata","tax_rank":"species","tax_division":"Animals","parentid":7044,"parentname":"Euptychia"}}'
+        res._parse_json(json_string)
+        item = res.items[0]
+        self.assertEqual(302603, item['tax_id'])
+        self.assertEqual(7044, item['parent_id'])
+
+        # data_type = basic
+        json_string = '{"taxid":891,"taxon":"Fabaceae","tax_rank":"family","tax_division":"Plants","parentid":187,"parentname":"Fabales","taxonrep":"Fabaceae"}'
+        res._parse_json(json_string)
+        item = res.items[0]
+        self.assertEqual('Fabaceae', item['taxon'])
+        self.assertEqual('Plants', item['tax_division'])
+
+        # data_type = images
+        # The nested list of image dicts is reachable through the 'images' key.
+        json_string = '{"images":[{"copyright_institution":"Smithsonian Tropical Research Institute","specimenid":2616716,"copyright":"Matthew J. MIller","imagequality":4,"photographer":"Oscar Lopez","image":"BSPBB\/MJM_7364_IMG_2240_d+1345758620.JPG","fieldnum":"MJM 7364","sampleid":"MJM 7364","mam_uri":null,"copyright_license":"CreativeCommons - Attribution Non-Commercial","meta":"Dorsal","copyright_holder":"Matthew J. MIller","catalognum":"","copyright_contact":"millerm@si.edu","copyright_year":"2012","taxonrep":"Momotus momota","aspectratio":1.608,"original":true,"external":null}]}'
+        res._parse_json(json_string)
+        item = res.items[0]
+        self.assertEqual('Oscar Lopez', item['images'][0]['photographer'])
+
+        # data_type = geo
+        json_string = '{"country":{"Brazil":3,"Mexico":2,"Panama":10,"Guatemala":1,"Peru":13,"Bolivia":6,"Ecuador":2},"sitemap":"http:\/\/www.boldsystems.org\/index.php\/TaxBrowser_Maps_CollectionSites?taxid=88899"}'
+        res._parse_json(json_string)
+        self.assertTrue('Brazil' in res.items[0]['country'])
+
+        # data_type = stats
+        json_string = '{"stats":{"publicspecies":2,"publicbins":3,"publicmarkersequences":{"COI-5P":6},"publicrecords":6,"specimenrecords":"45","sequencedspecimens":"25","barcodespecimens":"22","species":"3","barcodespecies":"3"}}'
+        res._parse_json(json_string)
+        self.assertTrue('publicspecies' in res.items[0]['stats'])
+
+        # data_type = sequencinglabs
+        json_string = '{"sequencinglabs":{"Smithsonian Tropical Research Institute":7,"Biodiversity Institute of Ontario":13,"Universidade Federal de Minas Gerais":1,"Mined from GenBank":2,"Royal Ontario Museum":2}}'
+        res._parse_json(json_string)
+        self.assertTrue('Royal Ontario Museum' in res.items[0]['sequencinglabs'])
+
+        # data_type = thirdparty
+        # Raw string: the \n\n escapes are handed to the parser exactly as written.
+        json_string = r'{"taxid": 88899, "taxon": "Momotus", "tax_rank": "genus", "tax_division": "Animals", "parentid": 88898, "parentname": "Momotidae", "wikipedia_summary": "Momotus is a small genus of the motmots, a family of near passerine birds found in forest and woodland of the Neotropics. They have a colourful plumage, which is green on the back becoming blue on the flight feathers and the long tails. The barbs near the ends of the two longest central tail feathers fall off, leaving a length of bare shaft so that tails appear racket-shaped. \n\nMomotus species, like other motmots, eat small prey such as insects and lizards, and will also take fruit. They nest in tunnels in banks, laying about four white eggs.", "wikipedia_link": "http://en.wikipedia.org/wiki/Momotus", "gbif_map": "http://data.gbif.org/species/2475289/overviewMap.png"}'
+        res._parse_json(json_string)
+        self.assertTrue('wikipedia_summary' in res.items[0])
+
+    def test_parse_data_empty(self):
+        """An empty result string makes _parse_data raise ValueError."""
+        response = api.Response()
+        self.assertRaises(ValueError, response._parse_data, 'call_id', '')
+
+    def tearDown(self):
+        """Nothing to clean up."""
+
+
+if __name__ == '__main__':
+    runner = unittest.TextTestRunner(verbosity=2)
+    unittest.main(testRunner=runner)
diff --git a/Tests/test_bold_utils.py b/Tests/test_bold_utils.py
new file mode 100755
index 00000000000..7a4068e9023
--- /dev/null
+++ b/Tests/test_bold_utils.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+# Copyright 2014 by Carlos Pena. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+import unittest
+
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from Bio.bold import utils
+
+
+class TestUtils(unittest.TestCase):
+    """Checks for the sequence helper in Bio.bold.utils."""
+
+    def setUp(self):
+        """No fixtures are required."""
+
+    def test_call_id(self):
+        # Despite its name, this exercises utils._prepare_sequence.
+        raw = "TTTTTGGTATTTGAGCAGGAATAGTAGGAACTTCTCTCAGTTTAATTATTCGAATAGAATTAGGTAATCCAGGTTTCTTAATTGGAGATGATCAAATTTATAATACTATTGTAACAGCCCATGCTTTTATTATAATTTTTTTTATAGTTATACCTATTGTAATTGGAGGATTTGGAAATTGACTAGTTCCCCTAATATTAGGTGCACCTGATATAGCTTTCCCTCGTATAAATAATATAAGATATTGACTACTTCCACCATCTTTAATATTATTAATTTCAAGTAGTATTGTAGAAAATGGAGCTGGAACAGGTTGAACAGTTTACCCCCCTCTTTCCTCTAATATTGCTCATAGAGGAACCTCAGTAGACTTAGCAATTTTTTCTCTTCATTTAGCTGGTATTTCTTCTATTTTAGGAGCTATTAATTTTATTACTACAATTATTAATATACGAGTTAATGGAATATCCTATGATCAAATACCTTTATTTGTTTGAGCTGTTGGAATTACAGCTCTTCTTTTACTTCTTTCTTTACCTGTTTTAGCAGGAGCTATCACAATACTTCTTACAGATCGAAATTTAAATACATCATTTTTTGATCCTGCAGGAGGAGGTGATCCAATTTTATACCAACATTTATTTTGATTTTTTGGTCACCC"
+        expected = "TTTTTGGTATTTGAGCAGGAATAGTAGGAACTTCTCTCAGTTTAATTATTCGAATAGAATTAGGTAATCCAGGTTTCTTAATTGGAGATGATCAAATTTATAATACTATTGTAACAGCCCATGCTTTTATTATAATTTTTTTTATAGTTATACCTATTGTAATTGGAGGATTTGGAAATTGACTAGTTCCCCTAATATTAGGTGCACCTGATATAGCTTTCCCTCGTATAAATAATATAAGATATTGACTACTTCCACCATCTTTAATATTATTAATTTCAAGTAGTATTGTAGAAAATGGAGCTGGAACAGGTTGAACAGTTTACCCCCCTCTTTCCTCTAATATTGCTCATAGAGGAACCTCAGTAGACTTAGCAATTTTTTCTCTTCATTTAGCTGGTATTTCTTCTATTTTAGGAGCTATTAATTTTATTACTACAATTATTAATATACGAGTTAATGGAATATCCTATGATCAAATACCTTTATTTGTTTGAGCTGTTGGAATTACAGCTCTTCTTTTACTTCTTTCTTTACCTGTTTTAGCAGGAGCTATCACAATACTTCTTACAGATCGAAATTTAAATACATCATTTTTTGATCCTGCAGGAGGAGGTGATCCAATTTTATACCAACATTTATTTTGATTTTTTGGTCACCC"
+        self.assertEqual(expected, utils._prepare_sequence(raw))
+
+        # A SeqRecord wrapping the same residues must give the same result.
+        record = SeqRecord(Seq(raw))
+        self.assertEqual(expected, utils._prepare_sequence(record))
+
+        # A dict is not a sequence object and must be rejected.
+        not_a_sequence = {'not a seq object': 'dummy'}
+        self.assertRaises(AttributeError, utils._prepare_sequence, not_a_sequence)
+
+    def tearDown(self):
+        """Nothing to clean up."""
+
+
+if __name__ == '__main__':
+    runner = unittest.TextTestRunner(verbosity=2)
+    unittest.main(testRunner=runner)
diff --git a/setup.py b/setup.py
index 21976ad3dc2..3b2c6d29d83 100644
--- a/setup.py
+++ b/setup.py
@@ -308,6 +308,7 @@ def is_Numpy_installed():
     'Bio.Alphabet',
     'Bio.Application',
     'Bio.Blast',
+    'Bio.bold',
     'Bio.CAPS',
     'Bio.codonalign',
     'Bio.Compass',