-
Notifications
You must be signed in to change notification settings - Fork 57
/
identifier.py
212 lines (177 loc) · 6.98 KB
/
identifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""Base domain classes for browse service."""
import json
import re
from re import RegexFlag
from typing import Match, Optional, Union, Tuple, Callable, List
from arxiv import taxonomy
# Old-style identifiers, minted from 1991 through 2007-03:
#   <archive>[.<subject class>]/<yymm><3-digit number>[v<version>]
# A trailing "#..." or "/..." suffix is tolerated and ignored.
RE_ARXIV_OLD_ID = re.compile(
    r'^(?P<archive>[a-z]{1,}(\-[a-z]{2,})?)(\.([a-zA-Z\-]{2,}))?\/'
    r'(?P<yymm>(?P<yy>\d\d)(?P<mm>\d\d))(?P<num>\d\d\d)'
    r'(v(?P<version>[1-9]\d*))?([#\/].*)?$'
)

# New-style identifiers, minted from 2007-04 onwards:
#   <yymm>.<4- or 5-digit number>[v<version>]
# A trailing "#..." or "/..." suffix is tolerated and ignored.
RE_ARXIV_NEW_ID = re.compile(
    r'^(?P<yymm>(?P<yy>\d\d)(?P<mm>\d\d))\.(?P<num>\d{4,5})'
    r'(v(?P<version>[1-9]\d*))?([#\/].*)?$'
)

# One clean-up rule: (pattern, replacement, count, flags), i.e. the
# arguments handed to ``re.sub`` for each rule.
Sub_type = List[
    Tuple[
        str,
        Union[str, Callable[[Match[str]], str]],
        int,
        Union[int, RegexFlag],
    ]
]

# Clean-up rules for a raw identifier string, listed in the order in which
# they are meant to be applied.
SUBSTITUTIONS: Sub_type = [
    # Strip a trailing file extension (.pdf, .ps, .gz or .ps.gz).
    (r'\.(pdf|ps|gz|ps\.gz)$', '', 0, 0),
    # Drop a single leading slash.
    (r'^/', '', 0, 0),
    # Drop a leading "arxiv:" prefix, whatever its letter case.
    (r'^arxiv:', '', 1, re.IGNORECASE),
    # Collapse runs of slashes into a single slash.
    (r'//+', '/', 0, 0),
    # Collapse runs of hyphens into a single hyphen.
    (r'--+', '-', 0, 0),
    # Lower-case everything in front of the first slash.
    (r'^([^/]+)', lambda head: head.group(0).lower(), 1, 0),
    # Insert a hyphen in front of a ph/ex/th/qc/mat/lat/sci suffix when the
    # character before it is neither "a" nor "-" (first occurrence only).
    (r'([^a\-])(ph|ex|th|qc|mat|lat|sci)(\/|$)', r'\g<1>-\g<2>\g<3>', 1, 0),
]
class IdentifierException(Exception):
    """Base error for problems with an arXiv identifier."""
class IdentifierIsArchiveException(IdentifierException):
    """Error raised when the supplied arXiv identifier is an archive."""
class Identifier:
    """Class for arXiv identifiers of published papers.

    The supplied string is cleaned up with the module-level
    ``SUBSTITUTIONS`` and then matched against ``RE_ARXIV_OLD_ID`` and
    ``RE_ARXIV_NEW_ID``, in that order.  The first pattern that matches
    populates the instance.

    Attributes
    ----------
    ids : str
        The identifier exactly as supplied by the caller.
    id : str
        Identifier without version: ``archive/YYMMNNN`` for old
        identifiers, ``YYMM.NNNN`` or ``YYMM.NNNNN`` for new ones.
    idv : str
        ``id`` followed by ``v<version>`` when a version was supplied,
        otherwise the same as ``id``.
    archive : Optional[str]
        Archive parsed from an old identifier; ``'arxiv'`` for new ones.
    filename : Optional[str]
        ``YYMMNNN`` for old identifiers; the same as ``id`` for new ones.
    year, month : Optional[int]
        Four-digit year and month number derived from the ``yymm`` part.
    yymm : str
        The ``yymm`` part as matched.
    num : Optional[int]
        Sequence number parsed from the identifier.
    version : int
        Version number, ``0`` when the identifier carries none.
    has_version : bool
        Whether the identifier carried a version.
    is_old_id : Optional[bool]
        ``True`` for old-style identifiers, ``False`` for new-style ones.
    squashed, squashedv : str
        ``id`` and ``idv`` with every ``/`` removed.
    """

    def __init__(self, arxiv_id: str) -> None:
        """Validate the provided arXiv ID and parse its constituent parts.

        Parameters
        ----------
        arxiv_id : str
            The identifier as supplied, e.g. by a user or a URL.

        Raises
        ------
        IdentifierIsArchiveException
            If ``arxiv_id`` is a key of ``taxonomy.definitions.ARCHIVES``.
        IdentifierException
            If the identifier matches neither format, or its sequence
            number, month or year is outside the accepted range.
        """
        # The ID as specified.
        self.ids = arxiv_id
        self.id: str = arxiv_id
        self.archive: Optional[str] = None
        self.filename: Optional[str] = None
        self.year: Optional[int] = None
        self.month: Optional[int] = None
        self.is_old_id: Optional[bool] = None

        # The archive check runs on the raw input, before any clean-up.
        if self.ids in taxonomy.definitions.ARCHIVES:
            raise IdentifierIsArchiveException(
                taxonomy.definitions.ARCHIVES[self.ids]['name'])

        for pattern, replacement, count, flags in SUBSTITUTIONS:
            arxiv_id = re.sub(pattern, replacement, arxiv_id,
                              count=count, flags=flags)

        invalid_message = f'invalid arXiv identifier {self.ids}'

        self.version = 0
        parse_actions = ((RE_ARXIV_OLD_ID, self._parse_old_id),
                         (RE_ARXIV_NEW_ID, self._parse_new_id))
        id_match = None
        for regex, parse_action in parse_actions:
            id_match = regex.match(arxiv_id)
            if id_match:
                parse_action(id_match)
                break
        if not id_match:
            raise IdentifierException(invalid_message)

        # The 'num' group is mandatory in both patterns, so it is always a
        # digit string here.
        self.num: Optional[int] = int(id_match.group('num'))

        # Both parse actions set the year; this guard also narrows the
        # Optional type for the comparisons below.
        if self.year is None:
            raise IdentifierException('year is empty')

        if (self.num == 0
                or (self.num > 99999 and self.year >= 2015)
                or (self.num > 9999 and self.year < 2015)
                or (self.num > 999 and self.is_old_id)):
            raise IdentifierException(invalid_message)

        self.has_version: bool = False
        self.idv: str = self.id
        version = id_match.group('version')
        if version:
            self.version = int(version)
            self.idv = f'{self.id}v{self.version}'
            self.has_version = True

        self.squashed = self.id.replace('/', '')
        self.squashedv = self.idv.replace('/', '')
        self.yymm: str = id_match.group('yymm')

        self.month = int(id_match.group('mm'))
        if self.month > 12 or self.month < 1:
            raise IdentifierException(invalid_message)

        # Old identifiers cover 1991 through 2007-03; new identifiers
        # start at 2007-04.
        if self.is_old_id:
            out_of_range = (self.year < 1991 or self.year > 2007
                            or (self.year == 2007 and self.month > 3))
        else:
            out_of_range = (self.year < 2007
                            or (self.year == 2007 and self.month < 4))
        if out_of_range:
            raise IdentifierException(invalid_message)

    def _parse_old_id(self, match_obj: Match[str]) -> None:
        """
        Populate instance attributes parsed from old arXiv identifier.

        The old identifiers were minted from 1991 until March 2007.

        Parameters
        ----------
        match_obj : Match[str]
            A regex match on RE_ARXIV_OLD_ID

        Returns
        -------
        None

        """
        self.is_old_id = True
        self.archive = match_obj.group('archive')

        # Two-digit years 91-99 belong to the 1900s, 00-90 to the 2000s.
        two_digit_year = int(match_obj.group('yy'))
        century = 2000 if two_digit_year < 91 else 1900
        self.year = century + two_digit_year

        version = match_obj.group('version')
        if version:
            self.version = int(version)

        sequence = int(match_obj.group('num'))
        self.filename = f"{match_obj.group('yymm')}{sequence:03d}"
        self.id = f'{self.archive}/{self.filename}'

    def _parse_new_id(self, match_obj: Match[str]) -> None:
        """
        Populate instance attributes from a new arXiv identifier.

        New identifiers started 2007-04 with 4-digit suffix;
        starting 2015 they have a 5-digit suffix.
        e.g. 0704.1234
             1412.0001
             1501.00001
             1711.01234

        Parameters
        ----------
        match_obj : Match[str]
            A regex match on RE_ARXIV_NEW_ID

        Returns
        -------
        None

        """
        self.is_old_id = False
        self.archive = 'arxiv'
        # NB: this works only until 2099
        self.year = int(match_obj.group('yy')) + 2000

        yymm = int(match_obj.group('yymm'))
        sequence = int(match_obj.group('num'))
        # The sequence number is zero-padded to 5 digits from 2015 on,
        # and to 4 digits before that.
        width = 5 if self.year >= 2015 else 4
        self.id = f'{yymm:04d}.{sequence:0{width}d}'
        self.filename = self.id

    def __str__(self) -> str:
        """Return the string representation of the instance in json."""
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=True)

    def __repr__(self) -> str:
        """Return the instance representation."""
        # !r quotes and escapes the original string, so the output stays
        # unambiguous even when the id contains quotes or backslashes.
        return f"Identifier(arxiv_id={self.ids!r})"

    def __eq__(self, other: object) -> bool:
        """
        Return instance equality: other should be type <= Instance.

        Two objects compare equal when their ``__dict__`` attributes are
        equal; an ``other`` without a ``__dict__`` is never equal.

        Note that 'other' can't be statically checked to be type Instance
        by design: https://stackoverflow.com/a/37557540/3096687

        Because this class defines ``__eq__`` without ``__hash__``,
        instances are not hashable.
        """
        try:
            return self.__dict__ == other.__dict__
        except AttributeError:
            return False