Skip to content

Commit

Permalink
Basics for TogoWS REST API support
Browse files Browse the repository at this point in the history
  • Loading branch information
peterjc committed Dec 16, 2011
1 parent edf8f26 commit 01ba2f2
Show file tree
Hide file tree
Showing 3 changed files with 631 additions and 0 deletions.
344 changes: 344 additions & 0 deletions Bio/TogoWS/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,344 @@
# Copyright 2010-2011 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

"""Provides code to access the TogoWS integrated websevices of DBCLS, Japan.
This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See:
http://togows.dbcls.jp/
The TogoWS REST service provides simple access to a range of databases, acting
as a proxy to shield you from all the different provider APIs. This works using
simple URLs (which this module will construct for you). For more details, see
http://togows.dbcls.jp/site/en/rest.html
The functionality is somewhat similar to Biopython's Bio.Entrez module which
provides access to the NCBI's Entrez Utilities (E-Utils) which also covers a
wide range of databases.
Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose
requirements are reasonably clear). To avoid risking overloading the service,
Biopython will only allow three calls per second.
The TogoWS SOAP service offers a more complex API for calling web services
(essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For
example, this allows you to run a remote BLAST search at the DDBJ. This is
not yet covered by this module, however there are lots of Python examples
on the TogoWS website using the SOAPpy python library. See:
http://togows.dbcls.jp/site/en/soap.html
http://soapy.sourceforge.net/
"""

import urllib
import time
from Bio import File

#Caches:
#These start out empty and are consulted by the public functions below so
#that the lists of valid names need not be requested from TogoWS every call.
_search_db_names = None #database names for search, set by tsearch/tsearch_count
_fetch_db_names = None #database names for entry fetch, set by tfetch
_fetch_db_fields = {} #database name -> list of field names (see tfetch)
_fetch_db_formats = {} #database name -> list of format names (see tfetch)

def _get_fields(url):
    """Queries a TogoWS URL for a plain text list of values (PRIVATE).

    url - full TogoWS URL (string) whose reply is a plain text list of
          values separated by whitespace.

    Returns the values as a list of strings. Any exception raised by _open
    or while reading the reply is passed on to the caller.
    """
    handle = _open(url)
    try:
        #split() with no argument breaks on any run of whitespace
        return handle.read().strip().split()
    finally:
        #Release the connection even if reading the reply failed
        handle.close()

def _get_entry_dbs():
    """Asks TogoWS for the database names of its entry service (PRIVATE).

    Returns a list of strings as parsed by _get_fields.
    """
    url = "http://togows.dbcls.jp/entry"
    return _get_fields(url)

def _get_entry_fields(db):
    """Asks TogoWS for the field names of an entry database (PRIVATE).

    db - database name (string), inserted into the URL as is.

    Returns a list of strings as parsed by _get_fields.
    """
    url = "http://togows.dbcls.jp/entry/%s?fields" % db
    return _get_fields(url)

def _get_entry_formats(db):
    """Asks TogoWS for the format names of an entry database (PRIVATE).

    db - database name (string), inserted into the URL as is.

    Returns a list of strings as parsed by _get_fields.
    """
    url = "http://togows.dbcls.jp/entry/%s?formats" % db
    return _get_fields(url)

def tfetch(db, id, format=None, field=None):
    """TogoWS fetch entry (returns a handle).

    db - database (string), see list below.
    id - identifier (string) or a list of identifiers (either as a list of
         strings or a single string with comma separators).
    format - return data file format (string), options depend on the database
             e.g. "xml", "json", "gff", "fasta", "ttl" (RDF)
    field - specific field from within the database record (string)
            e.g. "au" or "authors" for pubmed.

    Raises ValueError if the database is not listed by TogoWS, or if the
    requested format is not listed for that database. An unlisted field
    only triggers a warning. Errors reported by TogoWS itself are raised
    as IOError by the underlying _open call.

    At the time of writing, TogoWS website claims it supports the following
    databases:

    KEGG: gene, orthology, enzyme, compound, drug, glycan, reaction
    DDBJ: ddbj, dad
    PDBj: pdb
    NCBI: gene, genome, genomeprj, geo, journals, mesh, nucleotide, omim,
          pmc, protein, pubmed, taxonomy, cdd, popset, snp, unigene,
          homologene, nuccore, nucest, nucgss, unists
    EBI: biomodels, chebi, ensembl, go, interpro, reactome, uniprot,
         uniparc, uniref100, uniref90, uniref50, msdchem, msdpdb

    However, the list given at http://togows.dbcls.jp/entry/ is much smaller.

    The name of this function (tfetch) mimics that of the related NCBI
    Entrez service EFetch, available in Biopython as Bio.Entrez.efetch(...)
    """
    global _fetch_db_names, _fetch_db_fields, _fetch_db_formats
    if _fetch_db_names is None:
        _fetch_db_names = _get_entry_dbs()
    if db not in _fetch_db_names:
        raise ValueError("TogoWS entry fetch does not officially support "
                         "database '%s'." % db)
    if field:
        try:
            fields = _fetch_db_fields[db]
        except KeyError:
            fields = _get_entry_fields(db)
            _fetch_db_fields[db] = fields
        if field not in fields:
            #TODO - Make this a ValueError? Right now TogoWS appears to support
            #some undocumented fields like "length" for "embl".
            import warnings
            warnings.warn("TogoWS entry fetch does not explicitly support "
                          "field '%s' for database '%s'." % (field, db))
    if format:
        try:
            formats = _fetch_db_formats[db]
        except KeyError:
            formats = _get_entry_formats(db)
            #Cache under the formats dictionary (not the fields one), so the
            #field list for this database is not overwritten and the formats
            #are not requested again on the next call.
            _fetch_db_formats[db] = formats
        if format not in formats:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "format '%s' for database '%s'." % (format, db))

    if isinstance(id, list):
        #TogoWS takes multiple identifiers as one comma separated string
        id = ",".join(id)
    url = "http://togows.dbcls.jp/entry/%s/%s" % (db, id)
    if field:
        url += "/" + field
    if format:
        url += "." + format
    return _open(url)

def tsearch_count(db, query):
    """TogoWS search count (returns an integer).

    db - database (string), see http://togows.dbcls.jp/search
    query - search term (string), inserted into the URL as is.

    A database name not listed by TogoWS only triggers a warning. Raises
    ValueError if the reply cannot be parsed as an integer.

    You could then use the count to download a large set of search results in
    batches using the offset and limit options to Bio.TogoWS.tsearch().
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields("http://togows.dbcls.jp/search")
    if db not in _search_db_names:
        #TODO - Make this a ValueError? Right now despite the HTML website
        #claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not officially support database '%s'. "
                      "See http://togows.dbcls.jp/search/ for options." % db)
    #TODO - Encode spaces etc
    handle = _open("http://togows.dbcls.jp/search/%s/%s/count" % (db, query))
    try:
        return int(handle.read().strip())
    finally:
        #Release the connection even if the reply was not an integer
        handle.close()

def tsearch_iter(db, query, batch=100):
    """TogoWS search iterating over the results (generator function).

    db - database (string), see http://togows.dbcls.jp/search
    query - search term (string)
    batch - number of search results to pull back each time talk to TogoWS.

    Yields the identifiers (strings) one at a time. The total is taken from
    tsearch_count() up front, and the results are then requested from
    tsearch() in slices of at most batch entries.

    You would use this function within a for loop, e.g.

        for id in tsearch_iter("pubmed", "lung+cancer+drug"):
            print(id) #maybe fetch data with tfetch?
    """
    count = tsearch_count(db, query)
    if not count:
        #A plain return ends a generator cleanly; raising StopIteration here
        #is turned into a RuntimeError by newer versions of Python (PEP 479).
        return
    remain = count
    offset = 1 #They don't use zero based counting
    while remain:
        batch = min(batch, remain)
        handle = tsearch(db, query, offset, batch)
        try:
            ids = handle.read().strip().split()
        finally:
            #Do not leave one open connection behind per batch
            handle.close()
        assert len(ids)==batch, "Got %i, expected %i" % (len(ids), batch)
        for identifier in ids:
            yield identifier
        offset += batch
        remain -= batch


def tsearch(db, query, offset=None, count=None, format=None):
    """TogoWS search (returns a handle).

    db - database (string), see http://togows.dbcls.jp/search/
    query - search term (string), inserted into the URL as is.
    offset, count - optional integers specifying which result to start from
                    (1 based) and the number of results to return. Give
                    both or neither, otherwise a ValueError is raised.
    format - return data file format (string), e.g. "json", "ttl" (RDF)
             By default plain text is returned.

    A database name not listed by TogoWS only triggers a warning.

    At the time of writing, TogoWS supports a long list of databases, including
    many from the NCBI (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or
    "genbank", "ncbi-taxonomy"), EBI (e.g. "ebi-embl" or "embl", "ebi-uniprot"
    or "uniprot", "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").

    The name of this function (tsearch) mimics that of the related NCBI
    Entrez service ESearch, available in Biopython as Bio.Entrez.esearch(...)

    See also the function Bio.TogoWS.tsearch_count() which returns the number
    of matches found, and the Bio.TogoWS.tsearch_iter() function which allows
    you to iterate over the search results (taking care of batching for you).
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields("http://togows.dbcls.jp/search")
    if db not in _search_db_names:
        #TODO - Make this a ValueError? Right now despite the HTML website
        #claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not explicitly support database '%s'. "
                      "See http://togows.dbcls.jp/search/ for options." % db)
    #TODO - Encode spaces etc
    #Collect the pieces of the URL, then join them once at the end
    pieces = ["http://togows.dbcls.jp/search/%s/%s" % (db, query)]
    have_offset = offset is not None
    have_count = count is not None
    if have_offset != have_count:
        #Only one of the pair was supplied
        raise ValueError("Expect BOTH offset AND count to be provided (or neither)")
    if have_offset:
        if offset<=0:
            raise ValueError("Offset should be at least one")
        if count<=0:
            raise ValueError("Count should be at least one")
        pieces.append("/%i,%i" % (offset, count))
    if format:
        pieces.append("." + format)
    return _open("".join(pieces))

def tconvert(data, in_format, out_format):
    """TogoWS convert (returns a handle).

    data - string or handle containing input record(s)
    in_format - string describing the input file format (e.g. "genbank")
    out_format - string describing the requested output format (e.g. "fasta")

    The record(s) are sent to TogoWS as the "data" value of an HTTP POST.

    Note that Biopython has built in support for conversion of sequence and
    alignment file formats (functions Bio.SeqIO.convert and Bio.AlignIO.convert)
    """
    #TODO - Check the formats are supported
    url = "http://togows.dbcls.jp/convert/%s.%s" % (in_format, out_format)
    #TODO - Should we just accept a string not a handle? What about a filename?
    #Anything with a read method is treated as a handle, all else as a string
    if hasattr(data, "read"):
        payload = data.read()
    else:
        payload = data
    return _open(url, post={"data":payload})

def _open(url, post=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    url - the TogoWS URL to open (string).
    post - optional dictionary of values to send as an HTTP POST, by
           default (or if empty) a plain GET request is made.

    Open a handle to TogoWS. Does some very simple error checking on the
    first ten lines of the reply, and will raise an IOError if it encounters
    an error (the connection is closed first). Otherwise returns the reply
    wrapped in a Bio.File.UndoHandle with those lines pushed back, so the
    caller can read it from the start.

    In the absence of clear guidelines, this function also enforces "up to
    three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333 #one third of a second
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        #Record when the sleep should have finished, i.e. when we connect
        _open.previous = current + wait
    else:
        _open.previous = current

    #print url
    if post:
        handle = urllib.urlopen(url, urllib.urlencode(post))
    else:
        handle = urllib.urlopen(url)

    # Wrap the handle inside an UndoHandle.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 10 lines.
    # This is kind of ugly.
    lines = []
    for i in range(10):
        lines.append(uhandle.readline())
    # Push the lines back last one first, so they are read again in order.
    for i in range(9, -1, -1):
        uhandle.saveline(lines[i])
    data = ''.join(lines)

    error = None
    if data == '':
        #ValueError? This can occur with an invalid formats or fields
        #e.g. http://togows.dbcls.jp/entry/pubmed/16381885.au
        #which is an invalid file format, I meant to try this
        #instead http://togows.dbcls.jp/entry/pubmed/16381885/au
        error = "TogoWS replied with no data:\n%s" % url
    elif data == ' ':
        #I've seen this on things which should work, e.g.
        #e.g. http://togows.dbcls.jp/entry/genome/X52960.fasta
        error = "TogoWS replied with just a single space:\n%s" % url
    elif data.startswith("Error: "):
        #TODO - Should this be a value error (in some cases?)
        error = "TogoWS replied with an error message:\n\n%s\n\n%s" \
                % (data, url)
    elif "<title>We're sorry, but something went wrong</title>" in data:
        #ValueError? This can occur with an invalid formats or fields
        error = "TogoWS replied: We're sorry, but something went wrong:\n%s" \
                % url

    if error is not None:
        #The caller never gets the handle in this case, so close it here
        handle.close()
        raise IOError(error)

    return uhandle

#Time of the previous request, zero means no request has been made yet
_open.previous = 0


if __name__ == "__main__":

try:
_get_fields("http://togows.dbcls.jp/entry/invalid?fields")
assert False, "Should fail"
except IOError, e:
assert "Error: Invalid database." in str(e)
pass

print tfetch("pubmed", "16381885", field="au").read()
print tfetch("pubmed", "16381885", field="authors").read()
print tfetch("ddbj", "X52960").read()
print tfetch("ddbj", "X52960", "fasta").read()
print tfetch("ddbj", "X52960", "gff").read()
try:
print tfetch("ddbj", "X52960", "text").read()
except Exception, e:
print e
print tfetch("uniprot", ["A1AG1_HUMAN","A1AG1_MOUSE"]).read()

"""
names1, names2 = tfetch("pubmed", "16381885,19850725", field="authors").read().strip().split("\n")
assert names1.split("\t") == ['Kanehisa, M.', 'Goto, S.', 'Hattori, M.', 'Aoki-Kinoshita, K. F.', 'Itoh, M.', 'Kawashima, S.', 'Katayama, T.', 'Araki, M.', 'Hirakawa, M.']
assert names2.split("\t") == ['Kaminuma, E.', 'Mashima, J.', 'Kodama, Y.', 'Gojobori, T.', 'Ogasawara, O.', 'Okubo, K.', 'Takagi, T.', 'Nakamura, Y.']
assert tsearch_count("uniprot", "lung+cancer") > 1000
#print tsearch("uniprot", "lung+cancer").read().strip().split()
from Bio import SeqIO
print SeqIO.read(tfetch("ddbj", "X52960", "fasta"), "fasta")
print SeqIO.read(tfetch("protein", "16130152", "fasta"), "fasta")
print SeqIO.read(tfetch("protein", "16130152"), "gb")
"""

#Current count is 1276, so compare all in one to batched:
#assert list(tsearch_iter("uniprot", "lung+cancer",batch=50)) \
# == list(tsearch_iter("uniprot", "lung+cancer",batch=100))
all_in_one = tsearch("uniprot", "lung+cancer").read().strip().split("\n")
if len(all_in_one) == 100:
print "Oh, search was capped at 100."
else:
batched = list(tsearch_iter("uniprot", "lung+cancer"))
assert all_in_one == batched, "All: %s\nBatched: %s" % (all_in_one, batched)
Loading

0 comments on commit 01ba2f2

Please sign in to comment.