/
scanner_run.clj
380 lines (340 loc) · 13 KB
/
scanner_run.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
(ns active.ephemerol.scanner-run
(:require [active.clojure.record :refer :all])
(:import [java.io Reader IOException]))
(definterface IState
(position_row [])
(set_position_row [x])
(position_column [])
(set_position_column [x]))
(deftype Position
[^:volatile-mutable row
^:volatile-mutable column]
Object
(equals
[this other]
(and (instance? Position other)
(let [^Position other-pos other]
(= row (.position_row other-pos))
(= column (.position_column other-pos)))))
(hashCode
[this]
(+ row column))
IState
(position_row [_] row)
(set_position_row [_ x] (set! row x))
(position_column [_] column)
(set_position_column [_ x] (set! column x)))
(defn make-position
[r c]
(Position. r c))
(defn position-row
[^Position p]
(.position_row p))
(defn set-position-row!
[^Position p r]
(.set_position_row p r))
(defn position-column
[^Position p]
(.position_column p))
(defn set-position-column!
[^Position p c]
(.set_position_column p c))
(defmethod print-method Position [v ^java.io.Writer w]
(.write w "[")
(print-method (position-row v) w)
(.write w ",")
(print-method (position-column v) w)
(.write w "]"))
(defn copy-position
[pos]
(Position. (position-row pos)
(position-column pos)))
(defn position=?
[pos1 pos2]
(and (= (position-row pos1) (position-row pos2))
(= (position-column pos1) (position-column pos2))))
(def ^:private linefeed (int \newline))
(def ^:private tab (int \tab))
(defn update-position!
[pos ^long ch]
(case ch
10 ; linefeed
(do
(set-position-column! pos 0)
(set-position-row! pos (+ 1 (position-row pos))))
9 ; tab
(let [col (position-column pos)]
(set-position-column! pos (* 8 (quot (+ 7 col) 8))))
(set-position-column! pos (+ 1 (position-column pos)))))
(define-record-type ScanError
(make-scan-error cause)
scan-error?
[cause scan-error-cause])
(def stuck-scan-error (make-scan-error :stuck))
(def eof-scan-error (make-scan-error :eof))
(define-record-type Scanner
(make-scanner automaton bot-state? states final partition-size partition-bits indices encodings eof-action)
scanner?
;; for debugging only; may be nil; not preserved across scanner->expression
[automaton scanner-automaton
;; says whether state #1 is the after-bot state
bot-state? scanner-bot-state?
states scanner-states
; array of final states, where each entry is either a regular action or a EolAction record
final scanner-final
partition-size scanner-partition-size
partition-bits scanner-partition-bits
indices scanner-indices
encodings scanner-encodings
eof-action scanner-eof-action])
; internal wrapper to mark actions valid only at eol
(define-record-type EolAction
(make-eol-action at-eol vanilla)
eol-action?
[at-eol eol-action-at-eol
;; action valid at the same place, without eol
vanilla eol-action-vanilla])
(defn- new-bindings+map
[bindings action->name action new-name]
(if (contains? action->name action)
[bindings action->name]
[(conj bindings new-name action)
(assoc action->name action new-name)]))
(defn- fill-final-expression
[^objects final final-name]
(let [size (count final)]
(loop [bindings []
action->name {}
i 0]
(if (< i size)
(if-let [thing (aget final i)]
(if (eol-action? thing)
(let [[bindings action->name] (new-bindings+map bindings action->name (eol-action-at-eol thing) (symbol (str "eol" i)))
[bindings action->name] (new-bindings+map bindings action->name (eol-action-vanilla thing) (symbol (str "vanilla" i)))]
(recur bindings action->name (+ 1 i)))
(let [[bindings action->name] (new-bindings+map bindings action->name thing (symbol (str "action" i)))]
(recur bindings action->name (+ 1 i))))
(recur bindings action->name (+ 1 i)))
;; next up
(loop [statements []
i 0]
(if (< i size)
(if-let [thing (aget final i)]
(if (eol-action? thing)
(recur (conj statements `(aset ~final-name ~i
(make-eol-action ~(get action->name (eol-action-at-eol thing))
~(get action->name (eol-action-vanilla thing)))))
(+ 1 i))
(recur (conj statements `(aset ~final-name ~i
~(get action->name thing)))
(+ 1 i)))
(recur statements (+ 1 i)))
;; ... and one
`(let ~bindings ~@statements)))))))
(defn- encode-int-array
[ar]
;; work around "method size too large" and string literals < 64k
(loop [s (str (vec ar))
ss '()]
(cond
(= "" s)
(if (= (count ss) 1)
`(int-array (read-string ~(first ss)))
`(int-array (read-string (string/join [~@(reverse ss)]))))
(> (count s) 65535)
(recur (subs s 65535)
(cons (subs s 0 65535) ss))
:else
(recur "" (cons s ss)))))
(defn scanner->expression
[scanner]
(let [final (scanner-final scanner)]
`(let [~'final (object-array ~(count final))
~'scanner (make-scanner nil
~(scanner-bot-state? scanner)
~(encode-int-array (scanner-states scanner))
~'final
~(scanner-partition-size scanner)
~(scanner-partition-bits scanner)
~(encode-int-array (scanner-indices scanner))
~(encode-int-array (scanner-encodings scanner))
~(scanner-eof-action scanner))]
~(fill-final-expression final 'final)
~'scanner)))
(defn write-scanner-ns
[scanner ns-name reqs writer-arg]
(with-open [writer (clojure.java.io/writer writer-arg)]
(binding [*out* writer
*print-meta* true
*print-length* nil
*print-level* nil]
(pr `(ns ~ns-name
(:require [clojure.string :as ~'string]
[active.ephemerol.scanner-run :refer :all]
~@reqs)))
(println)
(println `(declare ~'scanner ~'scan-one))
(pr `(def ~'scanner ~(scanner->expression scanner)))
(println)
(pr `(def ~'scan-one (make-scan-one ~'scanner)))
(println)
(pr `(defn ~'scan
[x#]
(with-open [r# (clojure.java.io/reader x#)]
(scan-to-list ~'scan-one (reader->list r#) (make-position 1 0)))))
(println))))
(defn reverse-list->string
[rlis]
(let [sb (StringBuilder. (count rlis))]
(loop [rlis rlis]
(if (empty? rlis)
(do
(.reverse sb)
(.toString sb))
(do
(.appendCodePoint sb ^int (first rlis))
(recur (rest rlis)))))))
(define-record-type ScanResult
(make-scan-result data input input-position)
scan-result?
[data scan-result-data ; holds the return in the end either scan-error or empty list
input scan-result-input ; the rest of input
input-position scan-result-input-position])
(defn make-scan-one
[scanner]
(let [^ints states (scanner-states scanner)
bot-state? (scanner-bot-state? scanner)
^objects final (scanner-final scanner)
partition-size (scanner-partition-size scanner)
bits (scanner-partition-bits scanner)
^ints indices (scanner-indices scanner)
^ints encodings (scanner-encodings scanner)
eof-action (scanner-eof-action scanner)]
(let [mask (- (bit-shift-left 1 bits) 1)
state-size (+ 1 partition-size)
scalar-value->class (fn [sc]
(aget encodings
(+ (aget indices
(bit-shift-right sc bits))
(bit-and sc mask))))
state-next (fn [state-index sc]
(let [class (scalar-value->class sc)]
(if (= class -1)
-1
(loop [state-index state-index]
(let [base (* state-index state-size)
next-index (aget states (+ base class))]
(if (= next-index -1)
(let [tunnel-index (aget states (+ base partition-size))]
(if (= tunnel-index -1)
-1
(recur tunnel-index)))
next-index))))))]
(fn [start-input start-position]
(let [position (copy-position start-position) ; updated
;; lexeme read so far
lexeme-builder (StringBuilder.)]
(loop [state (if (and bot-state?
(zero? (position-column position)))
1
0)
;; to be prepended to port
input start-input
;; these are the values for the last final state
last-action nil
last-lexeme ""
last-input '()
last-position nil]
;; (write (list 'loop state input (reverse rev-lexeme) last-action (reverse last-rev-lexeme) last-input)) (newline)
(cond
(not-empty input)
(let [c (int (first input))
input (rest input)]
(update-position! position c)
(let [new-state (long (state-next state c))]
(cond
(not= new-state -1)
;; successful transition
(do
(.appendCodePoint lexeme-builder c)
(if-let [action (aget final new-state)]
;; final state
(if (eol-action? action) ; EOL action
(recur new-state input
(if (or (empty? input)
(= linefeed (first input)))
(eol-action-at-eol action)
(eol-action-vanilla action))
(.toString lexeme-builder)
input (copy-position position))
(recur new-state input action (.toString lexeme-builder)
input (copy-position position)))
;; non-final state
(recur new-state input
last-action last-lexeme last-input last-position)))
last-action
;; stuck
(last-action last-lexeme
start-position
last-input last-position)
:else
;; stuck, no action
(make-scan-result stuck-scan-error start-input start-position))))
;; eof
last-action
(last-action last-lexeme
start-position
last-input last-position)
;; eof at the beginning
(zero? (.length lexeme-builder))
;; call either the default or user specified eof handler.
(eof-action "" start-position '() last-position)
;; eof, no action
:else
(make-scan-result eof-scan-error start-input start-position)))))))) ;the end position
(defn scan-to-list
[scan-one input input-position]
(loop [v (transient [])
input input
input-position input-position]
(if (empty? input)
[(persistent! v) input input-position]
(let [scan-result (scan-one input input-position)]
(if-let [data (scan-result-data scan-result)]
(if (scan-error? data)
[data (scan-result-input scan-result) (scan-result-input-position scan-result)]
(recur (conj! v data)
(scan-result-input scan-result) (scan-result-input-position scan-result)))
[(persistent! v) input input-position])))))
(defn string->list
[^String str]
(let [sz (.length str)]
(loop [i 0
v (transient [])]
(if (< i sz)
(let [sv (.codePointAt str i)]
(recur (+ i (Character/charCount sv))
(conj! v sv)))
(persistent! v)))))
(defn read-scalar-value
^long [^Reader r]
(let [high (.read r)]
(if (= -1 high)
-1
(let [highc (char high)]
(if (Character/isHighSurrogate highc)
(let [next (.read r)]
(when (= -1 next)
(throw (IOException. "malformed Unicode encoding")))
(let [lowc (char next)]
(when-not (Character/isLowSurrogate lowc)
(throw (IOException. "malformed Unicode encoding")))
(Character/toCodePoint highc lowc)))
high)))))
(defn reader->list
[^Reader r]
(loop [v (transient [])]
(let [sc (read-scalar-value r)]
(if (= sc -1)
(persistent! v)
(recur (conj! v sc))))))