// pipe_executor_cluster_map.go
package compute_pipes
import (
"context"
"fmt"
"log"
"math/rand"
"net"
"net/http"
"net/rpc"
"runtime/debug"
"sync"
"time"
)
// Pipe executor that shards the input channel onto the sub-cluster based
// on the shard key. The shard key is hashed and mapped onto one of the
// sub-cluster nodes (the number of nodes in the sub-cluster is specified
// by nbrSubClusterNodes). Cluster nodes shard the data using the splitter key.
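//
// A minimal sketch of the sharding rule applied below (EvalHash is assumed
// to return the key's hash modulo nbrSubClusterNodes, or nil for a null key):
//
//	v := EvalHash(inRow[spliterColumnIdx], uint64(nbrSubClusterNodes))
//	if v != nil {
//		destination = int(*v) // deterministic shard for this key
//	} else {
//		destination = rand.Intn(nbrSubClusterNodes) // null key: random shard
//	}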
func (ctx *BuilderContext) StartClusterMap(spec *PipeSpec, source *InputChannel, clusterMapResultCh chan chan ComputePipesResult) {
var cpErr, err error
var evaluatorsWg sync.WaitGroup
// remainingPeerInWg: peers to join for sending records; wait until all peers have joined
// peersInWg: peers joining to send records; wait until all peers have completed sending records
var peersInWg, remainingPeerInWg sync.WaitGroup
var distributionWg sync.WaitGroup
var distributionCh []chan []interface{}
var distributionResultCh, consumedLocallyResultCh chan ComputePipesResult
var receivedFromPeersResultCh []chan ComputePipesResult
var nbrRecordsConsumedLocally int64
var incommingDataCh chan []interface{}
var server net.Listener
var outPeers []Peer
var evaluators []PipeTransformationEvaluator
var destinationSubClusterNodeId int
var spliterColumnIdx int
var ok bool
var addr string
var pcServer *PeerServer
var peerBatchSize int
defer func() {
// Catch the panic that might be generated downstream
if r := recover(); r != nil {
cpErr := fmt.Errorf("StartClusterMap: recovered error: %v", r)
log.Println(cpErr)
debug.PrintStack()
ctx.errCh <- cpErr
close(ctx.done)
}
// Make sure the PeerServer has closed all the receivedFromPeersResultCh channels
if pcServer != nil {
for i, isClosed := range pcServer.peersResultClosed {
if isClosed != nil && !*isClosed {
close(pcServer.receivedFromPeersResultCh[i])
}
}
}
// Closing the output channels
// fmt.Println("**!@@ CLUSTER_MAP: Closing Output Channels")
oc := make(map[string]bool)
for i := range spec.Apply {
oc[spec.Apply[i].Output] = true
}
for i := range oc {
// fmt.Println("**!@@ CLUSTER_MAP: Closing Output Channel", i)
ctx.channelRegistry.CloseChannel(i)
}
}()
// fmt.Println("**!@@ CLUSTER_MAP *1 Called, shuffle on column", *spec.Column)
if ctx.cpConfig.ClusterConfig == nil {
cpErr = fmt.Errorf("error: missing ClusterConfig section in compute_pipes_config")
goto gotError
}
spliterColumnIdx, ok = source.columns[*spec.Column]
if !ok {
cpErr = fmt.Errorf("error: invalid column name %s for distribute_data with source channel %s", *spec.Column, source.config.Name)
goto gotError
}
// Open connections with the peer nodes.
// With each node, have 2 connections: one to send and one to receive.
// Start the connection listener for the incoming (server) side -- receive data, input sources.
// Create an intermediate channel for all the incoming connections to use to forward the
// input records.
incommingDataCh = make(chan []interface{}, 1)
// Keep track of how many records are received by the current node from peers
receivedFromPeersResultCh = make([]chan ComputePipesResult, ctx.nbrSubClusterNodes-1)
for i := range receivedFromPeersResultCh {
receivedFromPeersResultCh[i] = make(chan ComputePipesResult, 2)
clusterMapResultCh <- receivedFromPeersResultCh[i]
}
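// Each incoming peer gets its own result channel, registered with the
// caller up front via clusterMapResultCh; the buffer of 2 presumably lets
// the PeerServer report its record count without blocking on the collector.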
// Handle the incoming connections
addr = ctx.env["$CPIPES_SERVER_ADDR"].(string)
// Register the rpc server
pcServer = &PeerServer{
nodeId: int32(ctx.nodeId),
subClusterNodeId: int32(ctx.subClusterNodeId),
recordCount: make(map[int]*int64, ctx.nbrSubClusterNodes-1),
peersWg: &peersInWg,
remainingPeerInWg: &remainingPeerInWg,
incommingDataCh: incommingDataCh,
peersResultClosed: make(map[int]*bool, ctx.nbrSubClusterNodes-1),
receivedFromPeersResultCh: receivedFromPeersResultCh,
errCh: ctx.errCh,
done: ctx.done,
}
for i := 0; i < ctx.nbrSubClusterNodes-1; i++ {
pcServer.recordCount[i] = new(int64)
pcServer.peersResultClosed[i] = new(bool)
}
err = rpc.Register(pcServer)
if err != nil {
cpErr = fmt.Errorf("while registering the rpc server: %v", err)
goto gotError
}
// Registers an HTTP handler for RPC messages
rpc.HandleHTTP()
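// Note: rpc.HandleHTTP registers the RPC handler on http.DefaultServeMux
// (at rpc.DefaultRPCPath), so the http.Serve call below will route peer
// requests to pcServer; this registration is process-wide and done once.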
// Start listening for the requests
server, err = net.Listen("tcp", addr)
if err != nil {
cpErr = fmt.Errorf("while opening a listener on %s: %v", addr, err)
goto gotError
}
remainingPeerInWg.Add(ctx.nbrSubClusterNodes - 1)
// Serve accepts incoming HTTP connections on the listener l, creating
// a new service goroutine for each. The service goroutines read requests
// and then call handler to reply to them
go func() {
defer func() {
// Catch the panic that might be generated by the PeerServer
if r := recover(); r != nil {
cpErr := fmt.Errorf("StartClusterMap: recovered error: %v", r)
log.Println(cpErr)
debug.PrintStack()
ctx.errCh <- cpErr
close(ctx.done)
}
}()
// log.Println("**!@@ CLUSTER_MAP *2 RPC server registered, listening on", addr)
http.Serve(server, nil)
// err := http.Serve(server, nil)
// log.Println("**!@@ CLUSTER_MAP *2 RPC server DONE listening on", addr, "::", err)
}()
// Note: when evaluatorsWg and the source are done, Close() must be called on server to
// terminate the Accept loop, and the intermediate channel incommingDataCh must be closed
// Get this node's address and the peer addresses for the sub-cluster
err = ctx.updateClusterInfo()
if err != nil {
cpErr = fmt.Errorf("while calling updateClusterInfo: %v", err)
goto gotError
}
// Open the client connections with peers -- send data, output sources
outPeers = make([]Peer, len(ctx.peersAddress))
for i, peerAddress := range ctx.peersAddress {
// log.Printf("**!@@ CLUSTER_MAP *3 (%s) connecting to %s", ctx.selfAddress, peerAddress)
if peerAddress != ctx.selfAddress {
retry := 0
start := time.Now()
for {
// DialHTTP connects to an HTTP RPC server at the specified network address
client, err := rpc.DialHTTP("tcp", peerAddress)
if err == nil {
// log.Printf("**!@@ CLUSTER_MAP *3 (%s) CONNECTED to %s on try #%d", ctx.selfAddress, peerAddress, retry)
outPeers[i] = Peer{
peerAddress: peerAddress,
client: client,
}
// Register the client with the peer server
args := &PeerRecordMessage{Sender: int32(ctx.subClusterNodeId)}
// log.Printf("**!@@ PeerServer: sending ClientReady to peer %d", i)
err = client.Call("PeerServer.ClientReady", args, &PeerReply{})
if err != nil {
cpErr = fmt.Errorf("while calling PeerServer.ClientReady to node %d: %v", i, err)
goto gotError
}
break
}
if time.Since(start) > time.Duration(ctx.cpConfig.ClusterConfig.PeerRegistrationTimeout)*time.Second {
cpErr = fmt.Errorf("too many retry to open comm with peer %d at %s for distribute_data with source channel %s: %v", i, peerAddress, source.config.Name, err)
goto gotError
}
// log.Printf("**!@@ CLUSTER_MAP *3 (%s) failed to connect to %s on try #%d, will retry :: %v", ctx.selfAddress, peerAddress, retry, err)
time.Sleep(1 * time.Second)
err = ctx.updatePeerAddr(i)
if err != nil {
cpErr = fmt.Errorf("while refreshing peer %d addr: %v", i, err)
goto gotError
}
retry++
}
} else {
// log.Printf("**!@@ CLUSTER_MAP *3 (%s) stand-in for %s", ctx.selfAddress, peerAddress)
// Put a stand-in for self
outPeers[i] = Peer{
peerAddress: ctx.selfAddress,
}
}
}
// log.Printf("**!@@ CLUSTER_MAP *3 (%s) All %d peer connections established", ctx.selfAddress, len(ctx.peersAddress))
// log.Printf("**!@@ CLUSTER_MAP *4 WAIT for all incomming PEER client to be established")
remainingPeerInWg.Wait()
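// At this point all outgoing clients are connected and, assuming
// PeerServer.ClientReady releases remainingPeerInWg, all of the
// nbrSubClusterNodes-1 incoming peers have announced themselves;
// records can now flow in both directions.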
// log.Printf("**!@@ CLUSTER_MAP *4 DONE WAIT got all incomming PEER client established")
// Build the PipeTransformationEvaluators
evaluators = make([]PipeTransformationEvaluator, len(spec.Apply))
for j := range spec.Apply {
if spec.Apply[j].Type == "partition_writer" {
cpErr = fmt.Errorf("error in StartClusterMap, cannot have an Apply of Type partition_writer")
goto gotError
}
eval, err := ctx.buildPipeTransformationEvaluator(source, nil, nil, &spec.Apply[j])
if err != nil {
cpErr = fmt.Errorf("while calling buildPipeTransformationEvaluator in StartClusterMap for %s: %v", spec.Apply[j].Type, err)
goto gotError
}
evaluators[j] = eval
}
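// Note that the evaluators consume a single merged stream: rows pushed by
// peers (via the PeerServer) and rows kept locally both arrive on
// incommingDataCh, so each evaluator sees the full shard owned by this node.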
// Have the evaluators process records from incommingDataCh in a goroutine
evaluatorsWg.Add(1)
go func() {
defer func() {
// Catch the panic that might be generated downstream
if r := recover(); r != nil {
cpErr := fmt.Errorf("StartClusterMap: recovered error while evaluators are processing incommingDataCh: %v", r)
log.Println(cpErr)
debug.PrintStack()
ctx.errCh <- cpErr
close(ctx.done)
}
evaluatorsWg.Done()
}()
// Process the channel
// log.Printf("**!@@ CLUSTER_MAP *5 Processing intermediate channel incommingDataCh")
for inRow := range incommingDataCh {
for i := range evaluators {
err = evaluators[i].apply(&inRow)
if err != nil {
cpErr = fmt.Errorf("while calling apply on PipeTransformationEvaluator (in StartClusterMap): %v", err)
goto gotError
}
}
}
// Done, close the evaluators
for i := range spec.Apply {
if evaluators[i] != nil {
err = evaluators[i].done()
if err != nil {
log.Printf("while calling done on PipeTransformationEvaluator (in StartClusterMap): %v", err)
}
evaluators[i].finally()
}
}
// All good!
// log.Printf("**!@@ CLUSTER_MAP *5 Processing intermediate channel incommingDataCh - All good!")
return
gotError:
for i := range spec.Apply {
if evaluators[i] != nil {
evaluators[i].finally()
}
}
log.Println(cpErr)
ctx.errCh <- cpErr
close(ctx.done)
}()
// Process the source channel and distribute the input records across the cluster.
// The records for this node are sent to incommingDataCh.
// Add a layer of intermediate channels so the main loop does not serialize all the sending of inRow.
// This allows sending to peer nodes in parallel.
distributionCh = make([]chan []interface{}, ctx.nbrSubClusterNodes)
if ctx.cpConfig.ClusterConfig != nil {
peerBatchSize = ctx.cpConfig.ClusterConfig.PeerBatchSize
}
if peerBatchSize == 0 {
peerBatchSize = 100
}
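// With the default PeerBatchSize of 100, each peer channel below is
// buffered to peerBatchSize*1.5 rows, which lets the main distribution
// loop keep hashing records while a batch is in flight to a slower peer.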
for i := range distributionCh {
if i == ctx.subClusterNodeId {
// Consume the record locally -- no need for another goroutine, just switch the channel
distributionCh[i] = incommingDataCh
} else {
distributionCh[i] = make(chan []interface{}, int(float32(peerBatchSize)*1.5))
distributionResultCh = make(chan ComputePipesResult, 1)
clusterMapResultCh <- distributionResultCh
distributionWg.Add(1)
// Send record to peer node
go func(iWorker int, resultCh chan ComputePipesResult) {
defer func() {
// Catch the panic that might be generated downstream
if r := recover(); r != nil {
cpErr := fmt.Errorf("StartClusterMap: recovered error while sending records to peer %d: %v", iWorker, r)
log.Println(cpErr)
debug.PrintStack()
ctx.errCh <- cpErr
close(ctx.done)
}
distributionWg.Done()
}()
// log.Printf("**!@@ CLUSTER_MAP *6 Distributing records :: sending to peer %d - starting", iWorker)
var sentRowCount int64
for {
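// Batching protocol: fill up to peerBatchSize rows from this worker's
// channel, push them in one PeerServer.PushRecords call, and repeat.
// A short batch (iCount < peerBatchSize) means distributionCh[iWorker]
// was closed upstream, so PeerServer.ClientDone is sent and the loop exits.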
peerMsg := PeerRecordMessage{
Sender: int32(ctx.subClusterNodeId),
Records: make([][]interface{}, peerBatchSize),
}
iCount := 0
for inRow := range distributionCh[iWorker] {
peerMsg.Records[iCount] = inRow
iCount++
if iCount >= peerBatchSize {
break
}
}
if iCount > 0 {
// Send the records to peer
peerMsg.RecordsCount = int32(iCount)
err = outPeers[iWorker].client.Call("PeerServer.PushRecords", &peerMsg, &PeerReply{})
if err != nil {
cpErr = fmt.Errorf("while calling PeerServer.PushRecords to node %d of sub-cluster %d: %v", iWorker, ctx.subClusterId, err)
goto gotError
}
sentRowCount += int64(iCount)
}
if iCount < peerBatchSize {
// We're done, the channel is closed; let the peer node know
peerMsg = PeerRecordMessage{
Sender: int32(ctx.subClusterNodeId),
}
err = outPeers[iWorker].client.Call("PeerServer.ClientDone", &peerMsg, &PeerReply{})
if err != nil {
cpErr = fmt.Errorf("while calling PeerServer.ClientDone to node %d of sub-cluster %d: %v", iWorker, ctx.subClusterId, err)
goto gotError
}
break
}
}
// All good!
// log.Printf("**!@@ CLUSTER_MAP *6 Distributing records :: sending to peer %d of sub-cluster %d - All good!", iWorker, ctx.subClusterId)
resultCh <- ComputePipesResult{
TableName: fmt.Sprintf("Record sent to peer %d of sub-cluster %d", iWorker, ctx.subClusterId),
CopyRowCount: sentRowCount,
}
close(resultCh)
return
gotError:
log.Printf("**!@@ CLUSTER_MAP *6 Distributing records :: sending to peer %d of sub-cluster %d - gotError", iWorker, ctx.subClusterId)
log.Println(cpErr)
ctx.errCh <- cpErr
close(ctx.done)
resultCh <- ComputePipesResult{
TableName: fmt.Sprintf("Record sent to peer %d of sub-cluster %d (error)", iWorker, ctx.subClusterId),
CopyRowCount: sentRowCount,
Err: cpErr,
}
close(resultCh)
}(i, distributionResultCh)
}
}
// Keep track of how many records are consumed locally by the current node
consumedLocallyResultCh = make(chan ComputePipesResult, 1)
clusterMapResultCh <- consumedLocallyResultCh
// All the peer distribution goroutines for sending records are established; clusterMapResultCh can now be closed
close(clusterMapResultCh)
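// (Every result channel has now been registered: one per incoming peer,
// one per outgoing peer, plus the local-consumption channel above.)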
// log.Printf("**!@@ CLUSTER_MAP *5 Processing input source channel: %s", source.config.Name)
for inRow := range source.channel {
v := EvalHash(inRow[spliterColumnIdx], uint64(ctx.nbrSubClusterNodes))
// if v != nil {
// log.Printf("##### EvalHash k: %v, nbr: %d => %v", inRow[spliterColumnIdx], nbrSubClusterNodes, *v)
// } else {
// log.Printf("##### EvalHash k: %v, nbr: %d => NULL", inRow[spliterColumnIdx], nbrSubClusterNodes)
// }
if v != nil {
destinationSubClusterNodeId = int(*v)
} else {
// pick a random shard
destinationSubClusterNodeId = rand.Intn(ctx.nbrSubClusterNodes)
}
if destinationSubClusterNodeId == ctx.subClusterNodeId {
nbrRecordsConsumedLocally++
}
// log.Printf("**!@@ CLUSTER_MAP *5 INPUT key: %s, hash: %d => %d", key, keyHash, destinationSubClusterNodeId)
// Consume locally or send the record via the distribution channels
select {
case distributionCh[destinationSubClusterNodeId] <- inRow:
case <-ctx.done:
log.Printf("ClusterMap: writing to incommingDataCh intermediate channel interrupted")
goto doneSource // so we can clean up
}
}
doneSource:
// log.Printf("**!@@ CLUSTER_MAP *5 DONE Processing input source channel: %s", source.config.Name)
consumedLocallyResultCh <- ComputePipesResult{
CopyRowCount: nbrRecordsConsumedLocally,
TableName: fmt.Sprintf("Records consumed locally by node %d of sub-cluster %d", ctx.nodeId, ctx.subClusterId),
}
close(consumedLocallyResultCh)
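// Shutdown sequence from here: (1) close the outgoing distribution
// channels so the sender goroutines flush their last batch and call
// ClientDone, (2) wait for them and close the peer clients, (3) wait for
// the incoming peers to finish (peersInWg), (4) close incommingDataCh and
// the listener, then (5) wait for the evaluators to drain incommingDataCh.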
// Close the distribution channels to the peers since processing the source has completed
for i := range distributionCh {
if i == ctx.subClusterNodeId {
// Local shard, corresponds to incommingDataCh; it will be closed once the incoming peer
// connections are closed
} else {
close(distributionCh[i])
}
}
// Wait for the distribution channels to be completed
// log.Printf("**!@@ CLUSTER_MAP *7 WAIT on distributionWg so we can close the connection to PEER")
distributionWg.Wait()
// log.Printf("**!@@ CLUSTER_MAP *7 DONE WAIT on distributionWg CLOSING connections to PEER")
// Close the outgoing connection to peer nodes
for i := range outPeers {
if outPeers[i].client != nil {
outPeers[i].client.Close()
}
}
// Source channel completed, now wait for the peers with incoming records to complete
// log.Printf("**!@@ CLUSTER_MAP *8 WAIT on peersInWg - incomming PEER")
peersInWg.Wait()
// log.Printf("**!@@ CLUSTER_MAP *8 DONE WAIT on peersInWg - incomming PEER")
// Close incommingDataCh and the server listener
close(incommingDataCh)
server.Close()
server = nil
// When the evaluators have completed processing incommingDataCh, close the output channels
evaluatorsWg.Wait()
// All good!
return
gotError:
close(clusterMapResultCh)
log.Println("**!@@ CLUSTER_MAP gotError:", cpErr)
ctx.errCh <- cpErr
close(ctx.done)
if server != nil {
server.Close()
}
}
// Get the IP addresses of the nodes that are in this sub-cluster
func (ctx *BuilderContext) updateClusterInfo() error {
stmt := "SELECT node_address FROM jetsapi.cpipes_cluster_node_registry WHERE session_id = $1 AND sc_id = $2 ORDER BY sc_node_id ASC"
ctx.peersAddress = make([]string, 0)
rows, err := ctx.dbpool.Query(context.Background(), stmt, ctx.SessionId(), ctx.subClusterId)
if err != nil {
return fmt.Errorf("while querying peer's address from db (in updateClusterInfo): %v", err)
}
defer rows.Close()
for rows.Next() {
var addr string
if err := rows.Scan(&addr); err != nil {
return fmt.Errorf("while scanning node's address from db (in updateClusterInfo): %v", err)
}
ctx.peersAddress = append(ctx.peersAddress, addr)
}
if len(ctx.peersAddress) != ctx.nbrSubClusterNodes {
return fmt.Errorf("error got %d node addresses from database, expecting %d", len(ctx.peersAddress), ctx.nbrSubClusterNodes)
}
ctx.selfAddress = ctx.peersAddress[ctx.subClusterNodeId]
return nil
}
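// Refresh the registered address of a single peer (by its sc_node_id).
// Used in the connection retry loop above, presumably to pick up a new
// address if the peer node restarted and re-registered.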
func (ctx *BuilderContext) updatePeerAddr(peer int) error {
var addr string
stmt := "SELECT node_address FROM jetsapi.cpipes_cluster_node_registry WHERE session_id = $1 AND sc_id = $2 AND sc_node_id = $3"
err := ctx.dbpool.QueryRow(context.Background(), stmt, ctx.SessionId(), ctx.subClusterId, peer).Scan(&addr)
if err != nil {
return fmt.Errorf("while querying peer's address from db (in updatePeerAddr): %v", err)
}
ctx.peersAddress[peer] = addr
return nil
}