/
walkthrough.clj
243 lines (185 loc) · 7.26 KB
/
walkthrough.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
(ns walkthrough
(:import (org.apache.beam.sdk.transforms GroupByKey Combine)))
;; This walkthrough introduces the core concepts of thurber.
;; The thurber namespace contains the public API.
(require '[thurber :as th])
;; Beam standardizes on Joda time and slf4j logging:
(require '[clj-time.core :as t]
'[clojure.tools.logging :as log])
;; It will be common to interop with Beam's Java classes:
(import 'org.apache.beam.runners.direct.DirectOptions
'org.apache.beam.sdk.io.TextIO
'org.apache.beam.sdk.values.KV
'org.apache.beam.sdk.transforms.Count)
;;;; PIPELINES
;; Create a default Beam pipeline:
(th/create-pipeline)
;; Create a Beam pipeline from command-line arguments:
(def pipeline-from-args
(th/create-pipeline ["--targetParallelism=7" "--jobName=thurber-walkthrough"]))
(assert (= "thurber-walkthrough" (-> pipeline-from-args (.getOptions) (.getJobName))))
;; Create a Beam pipeline from args provided as a Clojure map:
(def pipeline-from-args-map
(th/create-pipeline {:target-parallelism 7
:job-name "thurber-walkthrough"}))
(assert (= "thurber-walkthrough" (-> pipeline-from-args-map (.getOptions) (.getJobName))))
(assert (= 7 (-> pipeline-from-args-map (.getOptions) (.as DirectOptions) (.getTargetParallelism))))
;; A "custom config" arg can be provided; this allows for dynamic arguments (i.e., no need
;; to define a static class extension to PipelineOptions). The custom config will be available
;; to pipeline (at pipeline construction-time or run-time) as a Clojure map.
(def pipeline-with-custom-config
(th/create-pipeline {:target-parallelism 11
:custom-config {:my-custom-config-val 5}}))
(assert (= 5 (:my-custom-config-val (th/get-custom-config pipeline-with-custom-config))))
;;;; SOURCES
;; Pipelines read from sources.
;; We can create a source from hard-coded Clojure data. This is often
;; useful for testing:
(def data-source (th/create [1 2 3]))
;; We can also create any Beam Java-based source:
(def file-source (-> (TextIO/read) (.from "word_count/lorem.txt")))
;;;; SINKS
;; Pipelines write to sinks.
;; We can create a sink using any of Beam Java-based sink:
(def file-sink (-> (TextIO/write) (.to "word-counts")))
;; We can simply sink to our logging system. This is often useful
;; for testing:
(def log-sink #'th/log-elem*)
;;;; SIMPLEST PIPELINE
;; thurber's `apply!` is used to build pipelines.
;; Here we read from our simple hard-coded source and write to our
;; log sink:
(def simplest-pipeline
(doto (th/create-pipeline)
(th/apply! data-source log-sink)))
;; Run the pipeline. This will log each input element from the
;; source (1, 2, 3...not necessarily in this order)
(.run simplest-pipeline)
;;;; FUNCTIONS (ParDo)
;; The simplest Beam transform is a ParDo ("parallel do").
;; Here is a simple function:
(defn double [elem] (+ elem elem))
;; thurber treats Clojure functions as a ParDos automatically:
(def simple-pipeline
(doto (th/create-pipeline)
(th/apply! data-source #'double log-sink)))
;; This logs 2 and 4 and 6 in some order:
(.run simple-pipeline)
;;;; SERIALIZABLE FUNCTIONS
;; When constructing our pipeline, why did we refer to
;; the function's var?
;;
;; #'double
;;
;; Beam distributes functions across the cluster and vars
;; are serializable. In this case #'double is serialized
;; and sent to Beam cluster nodes. When the var is deserialized,
;; thurber ensures that it is rebound to its function.
;;;; INLINE FUNCTIONS
;; The pipeline above refers to a named function in our
;; namespace. However thurber supports inlining functions,
;; which is useful in some cases for readability.
;; thurber's `inline` must be used, which ensures the inline
;; function is properly serializable. Inlined functions
;; must be given an explicit name:
(def simple-pipeline
(doto (th/create-pipeline)
(th/apply! data-source
(th/inline
(fn triple [elem]
(* elem 3)))
log-sink)))
;; This logs 3, 6, and 9:
(.run simple-pipeline)
;;;; PARTIAL FUNCTIONS
;; During runtime stream processing, ParDo functions receive a
;; single element, the element being processed. However thurber
;; supports multi-arity ParDo functions where the last arg is
;; the processing element and prior (serializable) args are
;; bound early using th/partial*:
(def simple-pipeline
(doto (th/create-pipeline)
(th/apply! data-source
(th/partial* #'* 4)
log-sink)))
;; This logs 4, 8, and 12:
(.run simple-pipeline)
;;;; MULTIPLE OUTPUTS
;; A ParDo function can emit zero, one, or many values downstream.
;; When a seq (per Clojure's seq?) is returned from a function, all
;; values are emitted individually downstream.
;;
;; Lazy sequences are common when emitting large streams:
(defn to-words [^String sentence]
(remove empty? (.split sentence "[^\\p{L}]+")))
(def words-pipeline
(doto (th/create-pipeline)
(th/apply!
(th/create ["The quick brown fox jumps over the lazy dog."
"Pack my box with five dozen liquor jugs."])
#'to-words
log-sink)))
;; This logs each word in each sentence individually:
(.run words-pipeline)
;;;; GROUP BY KEY
;; Most Beam pipelines require "shuffle" steps where related data is
;; gathered together by some value (i.e, by a "key" value). Before
;; a shuffle (e.g., GroupByKey) can occur, Beam requires the elements
;; to exist in KV form (as org.apache.beam.sdk.values.KV instances):
;; Let's group values in our source stream by whether they are even or odd:
(def example-pipeline
(doto (th/create-pipeline)
(th/apply! data-source
;; KV elements can be constructed by any ParDo function;
;; however thurber's th/->kv, with th/partial*, is quite useful:
(th/partial* #'th/->kv
(th/inline
(fn classify-even-or-odd [v]
(if (even? v) :even :odd))))
(GroupByKey/create)
log-sink)))
;; This logs grouped values, KV{:even [2]} and KV{:odd [3, 1]}...
(.run example-pipeline)
;;;; COMPOSITE TRANSFORMS
;; Composite transforms can be created with `comp*`:
(def count-even-and-odd-xf
(th/comp* "count-even-and-odd"
(th/partial* #'th/->kv #'classify-even-or-odd)
(Count/perKey)
;; We can (optionally!) use kv->clj to convert Beam's Java KV values
;; to Clojure (i.e., MapEntry) values. This allows for downstream
;; destructuring of key/val pairs.
#'th/kv->clj))
;; Using our composite transform:
(def example-pipeline
(doto (th/create-pipeline)
(th/apply!
data-source
count-even-and-odd-xf
(th/inline
(fn count-sink [[k v]]
(log/infof "There are %d %s numbers."
v (name k)))))))
;; This logs "There are 2 odd numbers."
;; "There are 1 even numbers."
(.run example-pipeline)
;;;; COMBINE
;; Beam's Combine transforms are like Clojure Reducers (https://clojure.org/reference/reducers),
;; and thurber uses similar concepts of reducef and combinef as `clojure.core.reducers/fold`.
(def example-pipeline
(doto (th/create-pipeline)
(th/apply!
data-source
(Combine/globally
(th/combiner #'+))
#'th/log-elem*)))
;; The combine sum, 6, is logged:
(.run example-pipeline)
;;;;
;; todo optional name,, name prefixes
;; todo coders!!!
;; todo ser-fn
;; todo thread-local bindings etc.
;; todo state and timer API
;; todo output tags
;; todo side inputs