-
Notifications
You must be signed in to change notification settings - Fork 23
/
authors.py
251 lines (207 loc) · 9.64 KB
/
authors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
"""Query-builders and helpers for searching by author name."""
from typing import Tuple, Optional, List
import re
from functools import reduce, wraps
from operator import ior, iand
from elasticsearch_dsl import Search, Q, SF
from arxiv.base import logging
from .util import wildcard_escape, escape, STRING_LITERAL, \
remove_single_characters, has_wildcard
# Module-level logger named after this module; ``propagate`` is switched off.
logger = logging.getLogger(__name__)
logger.propagate = False

# We don't remove stopwords from author names at index time because
# institutions and collaborations are often treated as authors just like
# people.
# These are the words that ``_remove_stopwords`` strips from the non-literal
# parts of a query.
STOP = ["and", "or", "the", "of", "a", "for"]
def _remove_stopwords(term: str) -> str:
    """
    Remove common stopwords, except in literal queries.

    The term is split on ``STRING_LITERAL``. Pieces that begin with a single
    or double quote are passed through untouched; in every other piece, each
    whitespace-delimited occurrence of a word from ``STOP`` (together with the
    whitespace around it) is replaced by a single space.

    Parameters
    ----------
    term : str
        Query text to clean.

    Returns
    -------
    str
        The pieces joined back together, with stopwords removed from the
        non-literal pieces.
    """
    parts = re.split(STRING_LITERAL, term)
    for stopword in STOP:
        # Raw string so that ``\s`` reaches the regex engine unchanged, and
        # ``re.escape`` so the stopword is always matched literally. The
        # repeated group lets a run of the same stopword ("the the") be
        # removed in one match; a plain ``(^|\s+)word(\s+|$)`` consumes the
        # separating whitespace and therefore skips every second occurrence.
        pattern = re.compile(
            rf"(^|\s+)(?:{re.escape(stopword)}(\s+|$))+"
        )
        parts = [
            part if part.startswith(('"', "'")) else pattern.sub(" ", part)
            for part in parts
        ]
    return "".join(parts)
def Q_(qtype: str, field: str, value: str) -> Q:
    """
    Build a :class:`Q` for ``field``, switching to a wildcard query if needed.

    When ``has_wildcard(value)`` is true a ``wildcard`` query is returned;
    otherwise a query of type ``qtype`` is returned. In both cases the value
    is passed through ``escape`` first.
    """
    if not has_wildcard(value):
        return Q(qtype, **{field: escape(value)})
    wildcard_clause = {field: {"value": escape(value)}}
    return Q("wildcard", **wildcard_clause)
def part_query(term: str, path: str = "authors") -> Q:
    """
    Build a nested query that matches name parts within a single author.

    If the term contains a comma, the text before the first comma is searched
    as the surname and everything after it (remaining pieces joined with
    spaces) is searched as the forename or initials. Without a comma, the
    whole term is matched across the author's name fields.

    Parameters
    ----------
    term : str
        Search term for a single author.
    path : str
        Nested document path.

    Returns
    -------
    :class:`.Q`
        A ``nested`` query on ``path`` with ``score_mode='sum'``.
    """
    term = term.strip()
    logger.debug(f'{path} part_query for {term}')

    if "," not in term:
        # No comma, so we cannot tell which words belong to which part of the
        # name: match across all name fields of one author. The query_string
        # query takes care of wildcards, literals, etc.
        across_fields = Q("query_string",
                          fields=[
                              f"{path}.full_name",
                              f"{path}.last_name",
                              f"{path}.full_name_initialized",
                          ],
                          default_operator='AND',
                          allow_leading_wildcard=False,
                          type="cross_fields", query=escape(term))
        return Q("nested", path=path, query=across_fields, score_mode='sum')

    # A comma separates surname from forename; the whole term describes one
    # author.
    surname, *remainder = [piece.strip() for piece in term.split(",")]
    forename = " ".join(remainder).strip()

    # query_string lets wildcards and literals in the surname just work.
    by_surname = Q("query_string", fields=[f"{path}.last_name"],
                   query=escape(surname),
                   default_operator='AND',
                   allow_leading_wildcard=False)
    if not forename:
        return Q("nested", path=path, query=by_surname, score_mode='sum')

    logger.debug(f'Forename: {forename}')
    if has_wildcard(forename):
        # With a wildcard we fall back to query_string: term order is lost,
        # but wildcards behave the way users expect.
        by_forename = Q("query_string", fields=[f"{path}.first_name"],
                        query=escape(forename),
                        auto_generate_phrase_queries=True,
                        default_operator='AND',
                        allow_leading_wildcard=False)
    else:
        # Phrase match where the final word may be just a prefix of the
        # corresponding indexed term.
        by_forename = Q("match_phrase_prefix",
                        **{f"{path}__first_name": forename})

    if path == 'authors':
        # The forename may really be initials or some other partial forename,
        # so also try it against the initials field.
        logger.debug('Consider initials: %s', forename)
        by_forename |= Q("match_phrase_prefix",
                         **{f"{path}__initials": forename})

    # Surname and forename must both match inside the same nested author.
    same_author = by_surname & by_forename
    return Q("nested", path=path, query=same_author, score_mode='sum')
def string_query(term: str, path: str = 'authors', operator: str = 'AND') -> Q:
    """
    Build a nested ``query_string`` query on the full name of one author.

    The escaped ``term`` is searched in ``{path}.full_name`` using
    ``operator`` as the default operator, and the result is wrapped in a
    ``nested`` query on ``path`` with ``score_mode='sum'``.
    """
    full_name_field = f"{path}.full_name"
    within_author = Q("query_string", fields=[full_name_field],
                      default_operator=operator,
                      allow_leading_wildcard=False,
                      type="cross_fields",
                      query=escape(term))
    return Q('nested', path=path, query=within_author, score_mode='sum')
def author_query(term: str, operator: str = 'and') -> Q:
    """
    Construct a query based on author (and owner) names.

    If the term contains balanced double quotes, it is split on
    ``STRING_LITERAL`` and each non-blank piece is applied as its own
    full-name query.

    Substrings delimited by semicolons should only match if the terms in that
    substring match within a single author. Blank substrings (for example
    the one after a trailing semicolon) are ignored.

    If a substring (delimited or not) contains a comma, everything before the
    first comma will be treated as a surname, and the remainder treated as
    either the forename or initials. In this scenario, all terms must match
    within a single author.

    Otherwise, we will simply match all of the parts of the query across all
    of the available author/owner fields. Each part of the query must match in
    at least one field in at least one author/owner.

    Parameters
    ----------
    term: str
        Raw querystring. Should not be escaped or normalized in any way.
    operator : str
        Default: 'and' (compared case-insensitively); anything else is
        treated as 'OR'. If 'OR', relaxes the requirement that all parts of
        the query match. This is useful for "all fields" searches, in which
        only part of the query may be expected to match on an author/owner
        name.

    Returns
    -------
    :class:`.Q`
        An Elasticsearch DSL query part.

    """
    logger.debug(f"Author query for {term}")
    term = term.lower()

    # How the per-part queries are combined: intersection for AND, union
    # for anything else.
    combine = iand if operator.upper() == 'AND' else ior

    # Check for balanced double-quotes.
    if '"' in term and term.count('"') % 2 == 0:    # Probably a literal search.
        logger.debug(f"Contains literal: {term}")

        # Apply literal parts of the query separately.
        return reduce(combine, [
            (string_query(part, operator=operator)
             | string_query(part, path="owners", operator=operator))
            for part in re.split(STRING_LITERAL, term) if part.strip()
        ])

    term = term.replace('"', '')    # Just ignore unbalanced quotes.

    if ";" in term:     # Authors are individuated.
        logger.debug(f"Authors are individuated: {term}")
        logger.debug(f"Operator: {operator}")
        # Skip blank parts so that "smith; " does not add an empty query.
        author_parts = [
            author_part for author_part in term.split(";")
            if author_part.strip()
        ]
        if author_parts:
            return reduce(combine, [
                (part_query(author_part) | part_query(author_part, "owners"))
                for author_part in author_parts
            ])
        # Nothing but semicolons/whitespace: reduce() over an empty list
        # would raise TypeError, so fall through to the general search.

    if "," in term:     # Forename is individuated.
        logger.debug(f"Forename is individuated: {term}")
        return part_query(term) | part_query(term, "owners")

    logger.debug(f"General author search: {term}")
    # We include both w/in author and among author matches, so that more
    # precise matches get more weight.
    #
    # A query_string query on the combined field will yield matches among
    # authors.
    q = Q('query_string', fields=['authors_combined'],
          query=escape(term, quotes=True),
          default_operator='and')

    # A nested query_string query on full name will match within individual
    # authors.
    q |= (
        Q('nested', path='authors', score_mode='sum',
          query=Q("query_string", fields=['authors.full_name'],
                  default_operator=operator, allow_leading_wildcard=False,
                  query=escape(term, quotes=True)))
        | Q('nested', path='owners', score_mode='sum',
            query=Q("query_string", fields=['owners.full_name'],
                    default_operator=operator, allow_leading_wildcard=False,
                    query=escape(term, quotes=True)))
    )
    return q
def author_id_query(term: str, operator: str = 'and') -> Q:
    """
    Generate a query part for Author ID using the ES DSL.

    Each ID is looked up both in the nested ``owners`` documents and on the
    submitter.

    Parameters
    ----------
    term : str
        One or more whitespace-separated author IDs. Lowercased before use.
    operator : str
        ``'or'`` (compared case-insensitively) matches documents with any of
        the IDs; anything else requires every ID to match.

    Returns
    -------
    :class:`.Q`
        An Elasticsearch DSL query part.

    """
    author_ids = term.lower().split()    # Lowercase just in case.

    if operator.lower() == 'or' or not author_ids:
        # One ``terms`` clause matches any of the IDs. This form is also used
        # when there are no IDs at all, because reduce() over an empty list
        # would raise TypeError.
        return (
            Q("nested", path="owners",
              query=Q("terms", **{"owners__author_id": author_ids}))
            | Q("terms", **{"submitter__author_id": author_ids})
        )

    # Every ID must match, either on an owner or on the submitter.
    return reduce(iand, [(
        Q("nested", path="owners",
          query=Q("term", **{"owners__author_id": author_id}))
        | Q("term", **{"submitter__author_id": author_id})
    ) for author_id in author_ids])
def orcid_query(term: str, operator: str = 'and') -> Q:
    """
    Generate a query part for ORCID ID using the ES DSL.

    Each ORCID is looked up both in the nested ``owners`` documents and on
    the submitter.

    Parameters
    ----------
    term : str
        One or more whitespace-separated ORCID IDs, used as given.
    operator : str
        ``'or'`` (compared case-insensitively) matches documents with any of
        the IDs; anything else requires every ID to match.

    Returns
    -------
    :class:`.Q`
        An Elasticsearch DSL query part.

    """
    orcids = term.split()

    if operator.lower() == 'or' or not orcids:
        # One ``terms`` clause matches any of the IDs. This form is also used
        # when there are no IDs at all, because reduce() over an empty list
        # would raise TypeError.
        return (
            Q("nested", path="owners",
              query=Q("terms", **{"owners__orcid": orcids}))
            | Q("terms", **{"submitter__orcid": orcids})
        )

    # Every ORCID must match, either on an owner or on the submitter.
    return reduce(iand, [(
        Q("nested", path="owners",
          query=Q("term", **{"owners__orcid": orcid}))
        | Q("term", **{"submitter__orcid": orcid})
    ) for orcid in orcids])