From 00c94d438a57d385dd5a1660099974cf5e6ffef1 Mon Sep 17 00:00:00 2001 From: jchang Date: Sun, 14 May 2000 21:19:03 +0000 Subject: [PATCH] added search_for function --- Bio/Medline/PubMed.py | 89 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/Bio/Medline/PubMed.py b/Bio/Medline/PubMed.py index d153e33a6ce..92230ae6adf 100644 --- a/Bio/Medline/PubMed.py +++ b/Bio/Medline/PubMed.py @@ -13,11 +13,17 @@ Classes: -Dictionary +Dictionary Access PubMed articles using a dictionary interface. + +Functions: +search_for Search PubMed. """ import time +import string +import re +import sgmllib from Bio.WWW import NCBI @@ -28,7 +34,7 @@ class Dictionary: """ def __init__(self, delay=5.0, parser=None): - """__init__(self, delay=5.0, parser=None) + """Dictionary(delay=5.0, parser=None) Create a new Dictionary to access PubMed. parser is an optional parser (e.g. Medline.RecordParser) object to change the results @@ -59,7 +65,7 @@ def values(self): raise NotImplementedError, "You don't really want to do this..." def has_key(self, id): - """has_key(self, id) -> bool""" + """S.has_key(id) -> bool""" try: self[id] except KeyError: @@ -74,7 +80,7 @@ def get(self, id, failobj=None): raise "How did I get here?" def __getitem__(self, id): - """__getitem__(self, id) -> object + """S.__getitem__(id) -> object Return the Medline entry. id is either the Medline Unique ID or the Pubmed ID of the article. Raises a KeyError if there's an @@ -94,7 +100,82 @@ def __getitem__(self, id): db='PubMed', id=id, report='medlars', mode='text') except IOError, x: # raise a KeyError instead of an IOError + # XXX I really should distinguish between a real IOError and + # if the id is not in the database. 
def search_for(search, batchsize=10000, delay=1, callback_fn=None):
    """search_for(search, batchsize=10000, delay=1, callback_fn=None) -> ids

    Search PubMed and return a list of the PMID's that match the criteria.

    search is the search string used to search the database.  PubMed only
    allows users to retrieve the search results in batches of up to 10000
    ID's at a time.  batchsize is the size of the batch to use.  delay
    is the minimum number of seconds to leave between the start of one
    query and the start of the next.  callback_fn is an optional callback
    function that will be called once for each PMID as results are
    retrieved.  It should take the PMID (a string) as its only argument.

    Returns a list of PMID strings, in the order PubMed returned them.
    Raises SyntaxError if the body of a result page contains text that
    is not made up purely of digits.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Parse the ID's out of the HTML-formatted page that PubMed
        # returns.  The ID's appear as plain text inside the <body>
        # element of the page, e.g.:
        #     10807727
        #     [...]
        # Text outside the body (such as the page title) is ignored.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_body = 0

        def start_body(self, attributes):
            self.in_body = 1

        def end_body(self):
            self.in_body = 0

        # Matches any character that is not a decimal digit.
        _not_pmid_re = re.compile(r'\D')

        def handle_data(self, data):
            # The ID's only appear in the body.  If I'm not in the body,
            # then don't do anything.
            if not self.in_body:
                return
            # If data is just whitespace, then ignore it.
            data = string.strip(data)
            if not data:
                return
            # Everything here should be a PMID.  Check and make sure
            # data really is one.  A PMID should be a string consisting
            # of only digits.  Should I check to make sure it
            # meets a certain minimum length?
            if self._not_pmid_re.search(data):
                raise SyntaxError(
                    "I expected an ID, but '%s' doesn't look like one." %
                    repr(data))
            self.ids.append(data)

    last_search = None
    ids = []
    while 1:
        parser = ResultParser()

        # Check to make sure enough time has passed since my last
        # search.  If not, then wait out the remainder of the delay.
        # The remaining time is (last_search + delay) - now; it is only
        # slept on when positive, because time.sleep() rejects negative
        # values and there is nothing to wait for once the delay has
        # already elapsed.
        if last_search is not None:
            remaining = (last_search + delay) - time.time()
            if remaining > 0:
                time.sleep(remaining)
        last_search = time.time()

        # Do a query against PmQty.  Search medline, using the
        # search string, and get only the ID's in the results.
        # dispstart=len(ids) resumes after the ID's already collected.
        h = NCBI.pmqty('m', search, dopt='d',
                       dispmax=batchsize, dispstart=len(ids))
        parser.feed(h.read())
        # Flush anything the parser is still buffering (e.g. a trailing,
        # incomplete piece of markup) so no data is silently held back.
        parser.close()
        if not parser.ids:  # no more id's to read
            break
        if callback_fn is not None:
            # Call the callback function with each of the new ID's.
            for pmid in parser.ids:
                callback_fn(pmid)
        ids.extend(parser.ids)
    return ids