-
Notifications
You must be signed in to change notification settings - Fork 0
/
naturalnum.py
339 lines (295 loc) · 11.6 KB
/
naturalnum.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import re
import logging
# Regular-expression sources used when validating and translating rule text.

# LHS characters: ASCII digits and letters.
lhsValidatorRegex = r"[0-9a-zA-Z]+"

# RHS characters: ASCII digits and letters, hyphen, or any of: ( ) , $
rhsValidatorRegex = r"[0-9a-zA-Z\(\),\$-]+"

# Whole rule: at least one character on each side of the '=' delimiter.
ruleValidatorRegex = r".+=.+"

# RHS placeholder: a '$' followed by one ASCII letter.
rhsPlaceholderRegex = r"\$[A-Za-z]"
# Logging setup.  The module attaches only a do-nothing handler to its own
# "naturalnum" logger; users are expected to add their own handlers if they
# want to see the output.
class NullHandler(logging.Handler):
    """Logging handler that discards every record it is given."""

    def emit(self, record):
        """Ignore 'record'."""
        return None


logger = logging.getLogger("naturalnum")
logger.setLevel(logging.DEBUG)
h = NullHandler()
logger.addHandler(h)
class Rule:
    """Holds all attributes of a Rule,
    including matching and translation logic.

    A rule is written ``<lhs>=<rhs>``.  Digits on the lhs match literally and
    every non-digit character is a placeholder for exactly one character of
    the input.  The rhs is a comma-separated list of output tokens which may
    refer back to a placeholder as ``$<char>`` and may be enclosed in brackets
    to request that the token be fed back through the rule engine.

    After construction, call init() to validate the rule and to build the
    derived attributes (lhsRegex, lhsRegexPattern, lhsGroupDict,
    rhsWithBackrefs, rhsTokenList) that matches() and resolve() depend on.
    """

    # A '$' that is NOT immediately followed by an ASCII letter.
    _danglingDollarPatt = re.compile(r"\$(?![A-Za-z])")
    # Shortest bracketed span; group 1 is the text between the brackets.
    _withinBracketsPatt = re.compile(r"\((.*?)\)")
    # One or more digits and/or digivars ('$' + letter), nothing else.
    _digitsOrDigivarsPatt = re.compile(r"^(\$[a-zA-Z]|[0-9])+$")
    # The character following a '$' (used to collect rhs placeholder names).
    _placeholderCharPatt = re.compile(r"\$([^$])")

    def __init__(self, lhs=None, rhs=None):
        """Store the raw lhs and rhs strings; no validation happens here."""
        self.lhs = lhs
        self.rhs = rhs
        # Becomes True once init() has completed without raising.
        self.initd = False

    def init(self):
        """Validate the rule and build everything needed to apply it.

        Raises RuleValidationException if any validation step fails.
        """
        self.validateLhs()
        self.validateRhs()
        self.validateLhsWithRhs()
        self.buildLhsRegex()
        self.buildLhsGroupDict()
        self.buildRhsWithBackrefs()
        self.splitRhsTokens()
        self.validateRhsTokenList()
        self.initd = True

    @staticmethod
    def _matchesEntirely(regex, text):
        """Return True only if 'regex' matches the whole of 'text'.

        re.match() anchors at the start only, so the pattern is wrapped and
        anchored with \\Z to reject trailing characters as well.
        """
        return re.match("(?:" + regex + r")\Z", text) is not None

    def validateLhs(self):
        """Validates string in LHS of the rule. All of the following must be true:
        - not empty or null
        - alphanumeric chars only i.e. [0-9a-zA-Z]
        - if alpha chars are present they must all be unique, as they will be
          used as identifiers

        Raises RuleValidationException if any condition is violated.
        """
        # Ensure lhs is present
        if self.lhs is None or self.lhs == "":
            raise RuleValidationException(
                "Could not validate lhs of Rule: [" +
                ("" if self.lhs is None else self.lhs) +
                "] because it is empty.")
        # Ensure lhs is alphanumeric only (the whole string, not just a prefix)
        if not self._matchesEntirely(lhsValidatorRegex, self.lhs):
            raise RuleValidationException(
                "Could not validate lhs of Rule: [" +
                self.lhs + "]. Only alpha/digits allowed.")
        # If alpha chars present, ensure they are unique
        alphas = [x for x in self.lhs if not x.isdigit()]
        if len(set(alphas)) != len(alphas):
            raise RuleValidationException(
                "Could not validate lhs of Rule: [" +
                self.lhs + "]. Alpha characters must be unique.")

    def splitRhsTokens(self):
        """Splits rhs string into a list of tokens, separated by ',' delimiter"""
        self.rhsTokenList = self.rhsWithBackrefs.split(',')

    def validateRhs(self):
        """Validates string in RHS of a rule. All of the following must be true:
        - Not empty or null
        - Only alphanumeric chars, hyphen, or any of: (),$
        - Every '$' is immediately followed by an alpha char
        - Brackets must be balanced
        - Anything within brackets should resolve to digits

        Raises RuleValidationException if any condition is violated.
        """
        # Check rhs is present
        if self.rhs is None or self.rhs == "":
            raise RuleValidationException(
                "Could not validate rhs of Rule: [" +
                ("" if self.rhs is None else self.rhs) +
                "] because it is empty.")
        logger.debug("Validating rhs: " + self.rhs)
        # Check only allowed chars present (the whole string, not just a prefix)
        if not self._matchesEntirely(rhsValidatorRegex, self.rhs):
            raise RuleValidationException(
                "Could not validate rhs of Rule: [" +
                self.rhs + "]. Only alpha/digits or the following chars allowed: (),$")
        # Check '$' only precedes an alpha character; this also rejects a
        # '$' at the very end of the string and a '$' followed by another '$'.
        if self._danglingDollarPatt.search(self.rhs) is not None:
            raise RuleValidationException(
                "Could not validate rhs of Rule: [" +
                self.rhs + "]. '$' must always be folowed by an alpha char.")
        # Check brackets are balanced: never close more than were opened, and
        # nothing left open at the end.
        depth = 0
        for x in self.rhs:
            if x == '(':
                depth = depth + 1
            elif x == ')':
                depth = depth - 1
                if depth < 0:
                    break
        if depth != 0:
            raise RuleValidationException(
                "Could not validate rhs of Rule: [" +
                self.rhs + "]. Brackets must be balanced.")
        # Ensure everything within brackets is either digits or digivars
        for match in self._withinBracketsPatt.finditer(self.rhs):
            matchedVal = match.group(1)
            logger.debug("Checking rhs token within brackets [" + matchedVal +
                         "] is digits or digivars")
            if self._digitsOrDigivarsPatt.match(matchedVal) is None:
                raise RuleValidationException(
                    "Bracketed terms must contain only digits or digivars")

    def validateRhsTokenList(self):
        """Checks bracket placement within each token of rhsTokenList.

        A '(' may only be the first character of a token and a ')' may only
        be the last one.  Requires splitRhsTokens() to have been called.

        Raises RuleValidationException on the first misplaced bracket.
        """
        for token in self.rhsTokenList:
            lastPos = len(token) - 1
            for pos, x in enumerate(token):
                if x == '(' and pos != 0:
                    raise RuleValidationException(
                        "Could not validate rhs token: [" +
                        token + "] - can only start recursion at beginning of token.")
                elif x == ')' and pos != lastPos:
                    raise RuleValidationException(
                        "Could not validate rhs token: [" +
                        token + "] - can only end recursion at end of token.")

    def validateLhsWithRhs(self):
        """Checks that every placeholder used on the rhs is declared on the lhs.

        The reverse is not required: lhs placeholders may go unused.

        Raises RuleValidationException if the rhs uses an unknown placeholder.
        """
        # 1. Find set of all LHS placeholders (every non-digit character)
        lhsPlaceholders = set(x for x in self.lhs if not x.isdigit())
        # 2. Find set of all RHS placeholders (the character after each '$')
        rhsPlaceholders = set(self._placeholderCharPatt.findall(self.rhs))
        # 3. Check RHS placeholders is a subset of LHS placeholders
        if not rhsPlaceholders <= lhsPlaceholders:
            raise RuleValidationException(
                "Could not validate Rule lhs [" +
                self.lhs + "] with rhs: [" + self.rhs +
                "] - placeholder(s) on rhs do not appear on lhs.")

    def buildLhsRegex(self):
        """Builds the regex that decides whether a value matches this rule.

        Digits are copied literally; every other lhs character becomes a
        capturing group matching a single character.  The result is stored
        as text in lhsRegex and compiled in lhsRegexPattern.
        """
        body = "".join(x if x.isdigit() else "(.)" for x in self.lhs)
        self.lhsRegex = "^" + body + "$"
        self.lhsRegexPattern = re.compile(self.lhsRegex)

    def buildLhsGroupDict(self):
        """Builds a dictionary of placeholder character to group number.

        Group numbers start at 1 and follow the left-to-right order of the
        placeholders in the lhs, matching the groups made by buildLhsRegex().
        """
        self.lhsGroupDict = {}
        groupNum = 0
        for x in self.lhs:
            if not x.isdigit():
                groupNum = groupNum + 1
                self.lhsGroupDict[x] = groupNum

    def buildRhsWithBackrefs(self):
        """Builds the pattern for the resulting output string for this rule.

        This can either be a single token, e.g.:
            1=one
        or a list of tokens (just a comma-separated string at this stage),
        e.g.:
            21=twenty,one
        In more complex cases (when we want to capture digits from the lhs
        and feed them back into the rule engine), we will replace instances
        of $<digivar> on the rhs with the appropriate regex backreference.
        E.g. for:
            tu=$t0,$u
        The resulting rhs expression will be: \\g<1>0,\\g<2>
        So '21' would be replaced with 20,1. Note the use of g<num> format
        for regex backreferences, this is to disambiguate cases such as
        \\10 - which should be interpreted as backreference to group number 1,
        followed by a literal 0, not as a backreference to group number 10.
        Additionally, when recursion is to be used upon a token (the resulting
        rhs token should be fed back into the rule engine), the rhs expression
        is enclosed in parentheses, e.g.:
            tu=($t0),($u)
        would result in each of the two rhs tokens being fed back through the
        rule engine, and being replaced with the result. This strategy allows
        reuse among rules.

        Requires buildLhsGroupDict() to have been called.  Raises KeyError if
        the rhs names a placeholder that is not in lhsGroupDict.
        """
        def toBackref(match):
            # match.group(0) is "$<char>"; drop the '$' to get the dict key.
            return "\\g<" + str(self.lhsGroupDict[match.group(0)[1:]]) + ">"

        # A callable replacement is inserted verbatim, so the backslash in
        # the backreference is not interpreted at this stage.
        self.rhsWithBackrefs = re.sub(rhsPlaceholderRegex, toBackref, self.rhs)

    def matches(self, value):
        """Returns True if the whole of 'value' matches this rule, else False"""
        return self.lhsRegexPattern.match(value) is not None

    def resolve(self, value):
        """Applies this rule to 'value' and returns the list of output tokens.

        Each rhs token is used as a replacement template against the lhs
        regex, so backreferences are filled in with the characters captured
        from 'value'.  Bracketed tokens are returned with their brackets
        intact; recursion is left to the caller.

        Raises RuleUsageException if 'value' does not match this rule.
        """
        if not self.matches(value):
            raise RuleUsageException("Rule does not match value, cannot resolve")
        tokenList = []
        for rhsToken in self.rhsTokenList:
            logger.debug("replacing [" + value + "] with [" + rhsToken +
                         "] in context of match regex [" + self.lhsRegex + "]")
            tokenList.append(self.lhsRegexPattern.sub(rhsToken, value))
        return tokenList

    def __str__(self):
        """Shows the raw rule followed by its compiled regex/backref form."""
        return ("[" + self.lhs + "=" + self.rhs + "], [" +
                self.lhsRegex + "=" + self.rhsWithBackrefs + "]")
class RuleValidationException(Exception):
    """Signals that a rule, or its lhs or rhs, failed validation.

    The message (or other payload) is kept in the 'value' attribute.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        # Render as the repr of the payload, e.g. with quotes for a string.
        return "%r" % (self.value,)
class RuleUsageException(Exception):
    """Signals that a rule was used incorrectly, e.g. asked to resolve a
    value that it does not match.

    The message (or other payload) is kept in the 'value' attribute.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        # Render as the repr of the payload, e.g. with quotes for a string.
        return "%r" % (self.value,)
class RuleEvaluationException(Exception):
    """Signals a failure while evaluating rules, e.g. a recursed fragment
    that no rule matches.

    The message (or other payload) is kept in the 'value' attribute.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        # Render as the repr of the payload, e.g. with quotes for a string.
        return "%r" % (self.value,)
class RuleList:
    """Ordered collection of rules.

    Rules are searched in insertion order, so a rule added earlier takes
    precedence over a later rule that matches the same value.
    """

    def __init__(self):
        self.rules = []

    def __len__(self):
        """Number of rules currently held."""
        return len(self.rules)

    def add(self, rule):
        """Append 'rule' to the end of the list."""
        self.rules.append(rule)

    def search(self, value):
        """Return the first rule whose matches(value) is true, else None."""
        candidates = (candidate for candidate in self.rules
                      if candidate.matches(value))
        return next(candidates, None)
class RuleEngine:
    """Resolves values to lists of output tokens using a list of rules."""

    def __init__(self, ruleList):
        """Create an engine backed by 'ruleList'.

        'ruleList' must provide search(value), returning the matching rule
        or None.
        """
        self.ruleList = ruleList

    @classmethod
    def fromLangFilename(cls, fileName):
        """Build an engine from a rule file, one rule per line.

        Each line is passed to validateAndParseRule(); lines for which it
        returns None are skipped.

        Raises whatever open() raises if the file cannot be opened, and
        RuleValidationException if a line holds an invalid rule.
        """
        logger.debug("fromLangFilename()")
        ruleList = RuleList()
        # 'with' guarantees the file is closed, even if a rule fails to parse.
        with open(fileName, 'r') as f:
            logger.debug("opened config file [" + fileName + "]")
            for line in f:
                # Strip only the line terminator for logging; the last line
                # of a file may not have one.
                logger.debug("read line: [" + line.rstrip("\r\n") + "]")
                rule = validateAndParseRule(line)
                if rule is not None:
                    logger.debug("Adding Rule to RuleList")
                    ruleList.add(rule)
                else:
                    logger.debug("No rule found, skipping this line")
        # Use cls so that subclasses get an instance of their own type.
        engine = cls(ruleList)
        logger.debug("finished loading RuleEngine")
        return engine

    def resolve(self, value):
        """Translate 'value' into a flat list of output tokens.

        The first matching rule is applied.  Any resulting token enclosed in
        brackets is stripped of them and resolved recursively, and its
        tokens are spliced into the result in place.

        Returns None if no rule matches 'value'.
        Raises RuleEvaluationException if a bracketed fragment matches no rule.

        NOTE: there is no guard against rules that recurse into themselves.
        """
        logger.debug("resolve() value=[" + value + "]")
        matchedRule = self.ruleList.search(value)
        if matchedRule is None:
            logger.debug("could not find matching rule")
            return None
        logger.debug("found matched rule: " + str(matchedRule))
        rhsTokensFollowingRecursion = []
        for rhsToken in matchedRule.resolve(value):
            ## If current rhs token is enclosed in brackets, replace it with
            ## the result of feeding the value (without brackets) back through
            ## the rule engine.
            if rhsToken[0:1] == '(' and rhsToken[-1:] == ')':
                rhsTokenToRecurse = rhsToken[1:-1]
                logger.debug("Recursing value [" + rhsTokenToRecurse +
                             "] back through rule engine")
                resultAfterRecursion = self.resolve(rhsTokenToRecurse)
                if resultAfterRecursion is None:
                    raise RuleEvaluationException(
                        "Could not match fragment of result [" +
                        rhsTokenToRecurse + "] to a rule")
                rhsTokensFollowingRecursion.extend(resultAfterRecursion)
            else:
                rhsTokensFollowingRecursion.append(rhsToken)
        return rhsTokensFollowingRecursion
def validateAndParseRule(rule):
    """Validates and parses entire rule string, returning it as a Rule object.

    Any characters after '#' in the rule string are ignored, as is trailing
    whitespace.  The rule string must have exactly one '=' delimiter,
    surrounded by a valid LHS and RHS.

    Return values:
    - If entire rule is valid, a Rule object on which init() has been run
    - If rule is invalid, a RuleValidationException will be raised
    - If rule was empty or only a comment, None will be returned
    """
    logger.debug("validateAndParseRule()")
    # Get rule without comments, return None if rule is only a comment
    ruleWithoutComment = rule.split('#', 1)[0].rstrip()
    if not ruleWithoutComment:
        return None
    # Check that rule without comment is some chars, delimited by '='
    if re.match(ruleValidatorRegex, ruleWithoutComment) is None:
        raise RuleValidationException("Could not validate rule: [" +
            rule + "]. Format should be: <lhs>=<rhs>")
    # Split rule into LHS and RHS.  More than one '=' is rejected rather
    # than silently discarding everything after the second delimiter.
    parts = ruleWithoutComment.split('=')
    if len(parts) != 2:
        raise RuleValidationException("Could not validate rule: [" +
            rule + "]. Format should be: <lhs>=<rhs>")
    lhs, rhs = parts
    # Create Rule object and run its validations
    parsedRule = Rule(lhs, rhs)
    parsedRule.init()
    return parsedRule