/
search.py
105 lines (87 loc) · 2.74 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
def metadata_full(query, categories=False, get=None):
    """Find samples or categories

    The query is partitioned by ``query_plan`` into an optional set
    expression and an optional where clause. The set expression is evaluated
    against the text-search (or category-search) index; the where clause, if
    present, restricts the result to the samples it selects.

    Parameters
    ----------
    query : str
        The query to execute
    categories : boolean, optional
        Whether to search for categories (True) or samples (False, default).
    get : function
        A getter. If None, one is built from the redbiom configuration.

    Raises
    ------
    TypeError
        When unexpected operators are used (presumably raised by the
        expression evaluators; not raised directly here).
    ValueError
        When a where query is used with a categories search

    Returns
    -------
    set
        The observed sample IDs (or the matched categories when
        ``categories`` is True -- assumed from the search target, confirm
        against ``redbiom.set_expr.seteval``).
    """
    from os.path import join, dirname
    import redbiom
    import redbiom.set_expr
    import redbiom.where_expr
    import redbiom._requests
    import redbiom.util
    import functools
    import nltk

    if get is None:
        config = redbiom.get_config()
        get = redbiom._requests.make_get(config)

    if categories:
        target = 'category-search'
    else:
        target = 'text-search'

    stemmer = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)

    # Put the packaged nltk_data directory first on nltk's search path so the
    # stopwords lookup below consults it before any other location.
    nltk_data_path = join(dirname(__file__), 'assets', 'nltk_data')
    if nltk.data.path[0] != nltk_data_path:
        nltk.data.path = [nltk_data_path] + nltk.data.path

    stops = frozenset(nltk.corpus.stopwords.words('english'))
    stem_f = functools.partial(redbiom.util.stems, stops, stemmer)

    samples = set()
    # Track whether a set expression was actually evaluated. Testing the
    # truthiness of ``samples`` instead would conflate "no set clause" with
    # "set clause matched nothing", and the latter would wrongly return every
    # sample selected by the where clause rather than an empty intersection.
    set_evaluated = False

    for plan_type, q in query_plan(query):
        if plan_type == 'set':
            samples.update(redbiom.set_expr.seteval(q, get=get,
                                                    target=target,
                                                    stemmer=stem_f))
            set_evaluated = True
        elif plan_type == 'where':
            if categories:
                raise ValueError("where clauses not allowed with a category "
                                 "search")

            obs = set(redbiom.where_expr.whereeval(q, get=get).index)
            if set_evaluated:
                # restrict the set-expression hits to those passing the where
                samples &= obs
            else:
                # where-only query: the where result is the answer
                samples = obs

    return samples
def query_plan(query):
    """Light sanity checking and query partitioning

    The query is split at the first standalone ``where`` keyword. Text before
    the keyword is the set expression, text after it is the where clause.
    Both pieces are stripped of surrounding whitespace before being checked
    for emptiness.

    Parameters
    ----------
    query : str
        The query to operate on

    Returns
    -------
    list of tuple
        The (query type, query). The query type is 'set' or 'where'; when
        both are present the 'set' entry comes first.

    Raises
    ------
    ValueError
        When there are no queries, i.e. the query is empty or
        whitespace-only, or a ``where`` keyword is not followed by a clause.
    """
    import re

    # Split on "where" only when it is a whole word, so that search terms
    # merely containing it (e.g. "somewhere") are not cut in half.
    parts = [part.strip()
             for part in re.split(r'\bwhere\b', query, maxsplit=1)]

    if len(parts) == 1:
        # no where keyword: the whole query is a set expression
        set_part = parts[0]
        if not set_part:
            raise ValueError('No query')
        return [('set', set_part)]

    set_part, where_part = parts

    # Strip before testing so a whitespace-only clause counts as missing.
    if not where_part:
        raise ValueError('No query')

    if not set_part:
        # the query began with the where keyword: where-only plan
        return [('where', where_part)]

    return [('set', set_part), ('where', where_part)]