/
Entrez.py
332 lines (260 loc) · 12.2 KB
/
Entrez.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# Copyright 1999-2000 by Jeffrey Chang. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Provides code to access NCBI over the WWW.
The main Entrez web page is available at:
http://www.ncbi.nlm.nih.gov/Entrez/
A list of the Entrez utilities is available at:
http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
Functions:
query Query Entrez; retrieve results in HTML format.
efetch Retrieves records in the requested format from a list of one or
more primary IDs or from the user's environment
epost Posts a file containing a list of primary IDs for future use in
the user's environment to use with subsequent search strategies
esearch Searches and retrieves primary IDs (for use in EFetch, ELink,
and ESummary) and term translations and optionally retains
results for future use in the user's environment.
elink Checks for the existence of an external or Related Articles link
from a list of one or more primary IDs. Retrieves primary IDs
and relevancy scores for links to Entrez databases or Related
Articles; creates a hyperlink to the primary LinkOut provider
for a specific ID and database, or lists LinkOut URLs
and Attributes for multiple IDs.
einfo Provides field index term counts, last update, and available
links for each database.
esummary Retrieves document summaries from a list of primary IDs or from
the user's environment.
egquery Provides Entrez database counts in XML for a single search
using Global Query.
espell Retrieves spelling suggestions.
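_open Internal helper used by the functions above: waits so that
queries are at least three seconds apart, opens the URL, and
raises IOError if the first lines of the reply look like an
error page.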
"""
import urllib, time
from Bio import File
def query(cmd, db, cgi='http://www.ncbi.nlm.nih.gov/sites/entrez',
          **keywds):
    """query(cmd, db, cgi='http://www.ncbi.nlm.nih.gov/sites/entrez',
    **keywds) -> handle

    Send a query to the Entrez web front end and return a handle to the
    reply, which is a web page in HTML format.

    cmd and db are sent as the 'cmd' and 'db' CGI parameters; any extra
    keyword arguments are passed along as further parameters.  The
    parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp

    Raises an IOError exception if there's a network error.

    """
    params = {}
    params['cmd'] = cmd
    params['db'] = db
    # Extra keyword arguments go in after the two required parameters.
    params.update(keywds)
    return _open(cgi, params)
def pmfetch(db, id, report=None, mode=None,
            cgi="http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch.fcgi"):
    """pmfetch(db, id, report=None, mode=None,
    cgi="http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch.fcgi") -> handle

    DEPRECATED: NCBI has retired PmFetch.  Calling this function issues
    a DeprecationWarning before the query is attempted.

    Query PmFetch and return a handle to the results.  db and id are
    always sent; report and mode are only sent when they are not None.
    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch_help.html

    Raises an IOError exception if there's a network error.

    """
    # NCBI has retired PmFetch!!!
    import warnings
    warnings.warn("pmfetch is deprecated, as NCBI has retired PmFetch. Please let the Biopython developers know (biopython-dev@biopython.org) if you still use this function", DeprecationWarning)
    params = {'db' : db, 'id' : id}
    # Optional parameters are left out of the request when unset.
    for name, value in (('report', report), ('mode', mode)):
        if value is not None:
            params[name] = value
    return _open(cgi, params)
def pmqty(db, term, dopt=None,
          cgi='http://www.ncbi.nlm.nih.gov/entrez/utils/pmqty.fcgi',
          **keywds):
    """pmqty(db, term, dopt=None,
    cgi='http://www.ncbi.nlm.nih.gov/entrez/utils/pmqty.fcgi',
    **keywds) -> handle

    DEPRECATED: NCBI has retired PmQty.  Calling this function issues a
    DeprecationWarning before the query is attempted.

    Query PmQty and return a handle to the results.  db and term are
    always sent, dopt only when it is not None, and any extra keyword
    arguments are passed along as further parameters.  The parameters
    are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/utils/pmqty_help.html

    Raises an IOError exception if there's a network error.

    """
    # NCBI has retired PmQty!!!
    import warnings
    warnings.warn("pmqty is deprecated, as NCBI has retired PmQty. Please let the Biopython developers know (biopython-dev@biopython.org) if you still use this function", DeprecationWarning)
    params = {}
    params['db'] = db
    params['term'] = term
    if dopt is not None:
        params['dopt'] = dopt
    params.update(keywds)
    return _open(cgi, params)
def pmneighbor(pmid, display,
               cgi='http://www.ncbi.nlm.nih.gov/entrez/utils/pmneighbor.fcgi'):
    """pmneighbor(pmid, display,
    cgi='http://www.ncbi.nlm.nih.gov/entrez/utils/pmneighbor.fcgi') -> handle

    DEPRECATED: NCBI has retired PmNeighbor.  Calling this function
    issues a DeprecationWarning before the query is attempted.

    Query PMNeighbor and return a handle to the results.  The
    parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/utils/pmneighbor_help.html

    Raises an IOError exception if there's a network error.

    """
    # NCBI has retired PmNeighbor!!!
    import warnings
    warnings.warn("pmneighbor is deprecated, as NCBI has retired PmNeighbor. Please let the Biopython developers know (biopython-dev@biopython.org) if you still use this function", DeprecationWarning)

    # HACK: pmneighbor expects the display parameter as a bare tag with
    # no value.  urllib.urlencode (and therefore a parameter dictionary
    # given to _open) cannot express that, so the query string is
    # assembled by hand and the finished URL is passed to _open with no
    # parameter dictionary.  A cleaner workaround is still needed.
    return _open("%s?pmid=%s&%s" % (cgi, pmid, display))
# XXX retmode?
def epost(db, id, cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi',
          **keywds):
    """epost(db, id[, cgi][...]) -> handle

    Query Entrez with EPost and return a handle to the results.

    EPost posts a file containing a list of UIs for future use in the
    user's environment, to be used with subsequent search strategies.
    db and id are sent as the 'db' and 'id' parameters; extra keyword
    arguments are passed along as further parameters.  The parameters
    are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Raises an IOError exception if there's a network error.

    """
    params = {}
    params['db'] = db
    params['id'] = id
    params.update(keywds)
    return _open(cgi, params)
def efetch(db, cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
           **keywds):
    """efetch(db[, cgi][...]) -> handle

    Query Entrez with EFetch and return a handle to the results.

    EFetch retrieves records in the requested format from a list of one
    or more UIs or from the user's environment.  db is sent as the 'db'
    parameter; extra keyword arguments are passed along as further
    parameters.  The parameters are explained in the online
    documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Raises an IOError exception if there's a network error.

    """
    params = {}
    params['db'] = db
    params.update(keywds)
    return _open(cgi, params)
def esearch(db, term,
            cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
            **keywds):
    """esearch(db, term[, cgi][...]) -> handle

    Query Entrez with ESearch and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch,
    ELink and ESummary) and term translations, and optionally retains
    results for future use in the user's environment.  db and term are
    sent as the 'db' and 'term' parameters; extra keyword arguments are
    passed along as further parameters.  The parameters are explained
    in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Raises an IOError exception if there's a network error.

    """
    params = {}
    params['db'] = db
    params['term'] = term
    params.update(keywds)
    return _open(cgi, params)
def elink(cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi',
          **keywds):
    """elink([, cgi][...]) -> handle

    Query Entrez with ELink and return a handle to the results.

    ELink checks for the existence of an external or Related Articles
    link from a list of one or more primary IDs; retrieves IDs and
    relevancy scores for links to Entrez databases or Related Articles;
    creates a hyperlink to the primary LinkOut provider for a specific
    ID and database, or lists LinkOut URLs and attributes for multiple
    IDs.  All keyword arguments are sent as parameters; they are
    explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html

    Raises an IOError exception if there's a network error.

    """
    # Hand _open its own dictionary holding just the keyword arguments.
    return _open(cgi, dict(keywds))
def einfo(cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi',
          **keywds):
    """einfo([, cgi][...]) -> handle

    Query Entrez with EInfo and return a handle to the results.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.  All keyword arguments
    are sent as parameters; they are explained in the online
    documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Raises an IOError exception if there's a network error.

    """
    # Hand _open its own dictionary holding just the keyword arguments.
    return _open(cgi, dict(keywds))
def esummary(cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi',
             **keywds):
    """esummary([, cgi][...]) -> handle

    Query Entrez with ESummary and return a handle to the results.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.  All keyword arguments are sent as
    parameters; they are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Raises an IOError exception if there's a network error.

    """
    # Hand _open its own dictionary holding just the keyword arguments.
    return _open(cgi, dict(keywds))
def egquery(cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi',
            **keywds):
    """egquery([, cgi][...]) -> handle

    Query Entrez with EGQuery and return a handle to the results.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.  All keyword arguments are sent as parameters;
    they are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Raises an IOError exception if there's a network error.

    """
    # Hand _open its own dictionary holding just the keyword arguments.
    return _open(cgi, dict(keywds))
def espell(cgi='http://www.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi',
           **keywds):
    """espell([, cgi][...]) -> handle

    Query Entrez with ESpell and return a handle to the results.

    ESpell retrieves spelling suggestions, if available.  All keyword
    arguments are sent as parameters; they are explained in the online
    documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Raises an IOError exception if there's a network error.

    """
    # Hand _open its own dictionary holding just the keyword arguments.
    return _open(cgi, dict(keywds))
def _open(cgi, params=None):
    """_open(cgi, params=None) -> UndoHandle

    Open a handle to Entrez.

    cgi is the URL for the cgi script to access.  It may already carry
    a query string, in which case the encoded parameters are appended
    to it with "&" instead of "?".

    params is a dictionary with the options to pass to the script; it
    may be omitted.  The dictionary is copied, so the caller's object
    is never modified.  A "tool" option of "biopython" is added to the
    request unless the caller supplied a "tool" of their own.

    Calls are spaced at least three seconds apart: if the previous call
    was more recent than that, this function sleeps for the remainder
    before contacting NCBI.  The time of the last call is kept in the
    function attribute _open.previous.

    Does some simple error checking on the first five lines of the
    reply and raises an IOError if they contain one of the known error
    messages.  The lines that were inspected are pushed back, so the
    returned handle still yields the complete reply.

    """
    # NCBI requirement: At least three seconds between queries
    delay = 3.0
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Work on a private copy so that adding "tool" below changes neither
    # the caller's dictionary nor a default shared between calls.
    if params is None:
        params = {}
    else:
        params = dict(params)
    if "tool" not in params:
        params["tool"] = "biopython"
    options = urllib.urlencode(params, doseq=True)

    # Some callers build part of the query string themselves (for
    # value-less tags that urlencode cannot express); extend such a URL
    # with "&" rather than starting a second query string with "?".
    if "?" in cgi:
        separator = "&"
    else:
        separator = "?"

    # Open a handle to Entrez.
    handle = urllib.urlopen(cgi + separator + options)

    # Wrap the handle inside an UndoHandle.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 5 lines.
    # This is kind of ugly.
    lines = []
    for i in range(5):
        lines.append(uhandle.readline())
    # Push the lines back last-to-first so they are read again in their
    # original order.
    for i in range(4, -1, -1):
        uhandle.saveline(lines[i])
    data = ''.join(lines)

    if "500 Proxy Error" in data:
        # Sometimes Entrez returns a Proxy Error instead of results
        raise IOError("500 Proxy Error (NCBI busy?)")
    elif "502 Proxy Error" in data:
        raise IOError("502 Proxy Error (NCBI busy?)")
    elif "WWW Error 500 Diagnostic" in data:
        raise IOError("WWW Error 500 Diagnostic (NCBI busy?)")
    elif data[:5] == "ERROR":
        # XXX Possible bug here, because I don't know whether this really
        # occurs on the first line.  I need to check this!
        raise IOError("ERROR, possibly because id not available?")
    # Should I check for 404?  timeout?  etc?
    return uhandle

# Time of the most recent query; 0 means no query has been made yet, so
# the first call never has to wait.
_open.previous = 0