sdks/go/pkg/beam/pardo.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements.  See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License.  You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package beam

import (
	"fmt"
	"github.com/apache/beam/sdks/go/pkg/beam/core/graph"
	"github.com/apache/beam/sdks/go/pkg/beam/core/graph/coder"
	"github.com/apache/beam/sdks/go/pkg/beam/core/typex"
	"github.com/apache/beam/sdks/go/pkg/beam/internal/errors"
)

// addParDoCtx wraps err using errors.WithContextf, attaching a message that
// names the scope s in which the ParDo insertion was attempted.
func addParDoCtx(err error, s Scope) error {
	const format = "inserting ParDo in scope %s"
	return errors.WithContextf(err, format, s)
}

// TryParDo attempts to insert a ParDo transform into the pipeline and returns
// the resulting output PCollections. Insertion can fail for several reasons,
// most notably when the dofn is invalid or cannot be bound to the incoming
// PCollections (because of a type mismatch, for example). Every error
// returned has been passed through addParDoCtx.
func TryParDo(s Scope, dofn interface{}, col PCollection, opts ...Option) ([]PCollection, error) {
	sideInputs, typedefs, err := validate(s, col, opts)
	if err != nil {
		return nil, addParDoCtx(err, s)
	}

	// Pick the main-input option from the shape of the incoming PCollection:
	// a single element unless the collection is a KV or a CoGBK.
	mainInputOpt := graph.NumMainInputs(graph.MainSingle)
	switch {
	case typex.IsKV(col.Type()):
		mainInputOpt = graph.NumMainInputs(graph.MainKv)
	case typex.IsCoGBK(col.Type()):
		mainInputOpt = graph.CoGBKMainInput(len(col.Type().Components()))
	}

	fn, err := graph.NewDoFn(dofn, mainInputOpt)
	if err != nil {
		return nil, addParDoCtx(err, s)
	}

	// The main input node comes first, followed by the side inputs in the
	// order validate returned them.
	inputs := make([]*graph.Node, 0, 1+len(sideInputs))
	inputs = append(inputs, col.n)
	for _, si := range sideInputs {
		inputs = append(inputs, si.Input.n)
	}

	// Only splittable DoFns get a restriction coder; it stays nil otherwise.
	var restrictionCoder *coder.Coder
	if fn.IsSplittable() {
		restrictionT := (*graph.SplittableDoFn)(fn).RestrictionT()
		restrictionCoder, err = inferCoder(typex.New(restrictionT))
		if err != nil {
			return nil, addParDoCtx(err, s)
		}
	}

	edge, err := graph.NewParDo(s.real, s.scope, fn, inputs, restrictionCoder, typedefs)
	if err != nil {
		return nil, addParDoCtx(err, s)
	}

	// Wrap every output node in a PCollection and give it a coder built from
	// its own type.
	var outputs []PCollection
	for _, outbound := range edge.Output {
		pc := PCollection{outbound.To}
		pc.SetCoder(NewCoder(pc.Type()))
		outputs = append(outputs, pc)
	}
	return outputs, nil
}

// ParDoN inserts a ParDo with any number of outputs into the pipeline and
// returns all of its output PCollections. The result of TryParDo is handed
// to MustN unchanged.
func ParDoN(s Scope, dofn interface{}, col PCollection, opts ...Option) []PCollection {
	outs, err := TryParDo(s, dofn, col, opts...)
	return MustN(outs, err)
}

// ParDo0 inserts a ParDo transform with zero outputs into the pipeline. It
// panics with a formatParDoError message if the DoFn yields any output
// PCollections.
func ParDo0(s Scope, dofn interface{}, col PCollection, opts ...Option) {
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got > 0 {
		panic(formatParDoError(dofn, got, 0))
	}
}

// ParDo is the core element-wise PTransform in Apache Beam, invoking a
// user-specified function on each of the elements of the input PCollection
// to produce zero or more output elements, all of which are collected into
// the output PCollection. Use one of the ParDo variants for a different
// number of output PCollections. The PCollections do not need to have the
// same types.
//
// Elements are processed independently, and possibly in parallel across
// distributed cloud resources. The ParDo processing style is similar to what
// happens inside the "Mapper" or "Reducer" class of a MapReduce-style
// algorithm.
//
// DoFns
//
// The function to use to process each element is specified by a DoFn, either as
// a single function or as a struct with methods, notably ProcessElement. The
// struct may also define Setup, StartBundle, FinishBundle and Teardown methods.
// The struct is JSON-serialized and may contain construction-time values.
//
// Conceptually, when a ParDo transform is executed, the elements of the input
// PCollection are first divided up into some number of "bundles". These are
// farmed off to distributed worker machines (or run locally, if using the
// direct runner). For each bundle of input elements processing proceeds as
// follows:
//
//  * If a struct, a fresh instance of the argument DoFn is created on a
//    worker from json serialization, and the Setup method is called on this
//    instance, if present. A runner may reuse DoFn instances for multiple
//    bundles. A DoFn that has terminated abnormally (by returning an error)
//    will never be reused.
//  * The DoFn's StartBundle method, if provided, is called to initialize it.
//  * The DoFn's ProcessElement method is called on each of the input elements
//    in the bundle.
//  * The DoFn's FinishBundle method, if provided, is called to complete its
//    work. After FinishBundle is called, the framework will not again invoke
//    ProcessElement or FinishBundle until a new call to StartBundle has
//    occurred.
//  * If any of Setup, StartBundle, ProcessElement or FinishBundle methods
//    return an error, the Teardown method, if provided, will be called on the
//    DoFn instance.
//  * If a runner will no longer use a DoFn, the Teardown method, if provided,
//    will be called on the discarded instance.
//
// Each of the calls to any of the DoFn's processing methods can produce zero
// or more output elements. All of the output elements from all of the DoFn
// instances are included in an output PCollection.
//
// For example:
//
//    words := beam.ParDo(s, &Foo{...}, ...)
//    lengths := beam.ParDo(s, func (word string) int {
//          return len(word)
//    }, words)
//
//
// Each output element has the same timestamp and is in the same windows as its
// corresponding input element. The timestamp can be accessed and/or emitted by
// including an EventTime-typed parameter. The name of the function or struct is
// used as the DoFn name. Function literals do not have stable names and should
// thus not be used in production code.
//
// Side Inputs
//
// While a ParDo processes elements from a single "main input" PCollection, it
// can take additional "side input" PCollections. These side inputs, together
// with the form of the corresponding DoFn parameter, express styles of
// accessing PCollections computed by earlier pipeline operations. They are
// passed in to the ParDo transform using SideInput options, and their contents
// are accessible to each of the DoFn operations. For
// example:
//
//     words := ...
//     cutoff := ...  // Singleton PCollection<int>
//     smallWords := beam.ParDo(s, func (word string, cutoff int, emit func(string)) {
//           if len(word) < cutoff {
//                emit(word)
//           }
//     }, words, beam.SideInput{Input: cutoff})
//
// Additional Outputs
//
// Optionally, a ParDo transform can produce zero or multiple output
// PCollections. Note the use of ParDo2 to specify 2 outputs. For example:
//
//     words := ...
//     cutoff := ...  // Singleton PCollection<int>
//     small, big := beam.ParDo2(s, func (word string, cutoff int, small, big func(string)) {
//           if len(word) < cutoff {
//                small(word)
//           } else {
//                big(word)
//           }
//     }, words, beam.SideInput{Input: cutoff})
//
//
// By default, the Coder for the elements of each output PCollection is
// inferred from the concrete type.
//
// No Global Shared State
//
// There are three main ways to initialize the state of a DoFn instance
// processing a bundle:
//
//  * Define public instance variable state. This state will be automatically
//    JSON serialized and then deserialized in the DoFn instances created for
//    bundles. This method is good for state known when the original DoFn is
//    created in the main program, if it's not overly large. This is not
//    suitable for any state which must only be used for a single bundle, as
//    DoFn's may be used to process multiple bundles.
//
//  * Compute the state as a singleton PCollection and pass it in as a side
//    input to the DoFn. This is good if the state needs to be computed by the
//    pipeline, or if the state is very large and so is best read from file(s)
//    rather than sent as part of the DoFn's serialized state.
//
//  * Initialize the state in each DoFn instance, in a StartBundle method.
//    This is good if the initialization doesn't depend on any information
//    known only by the main program or computed by earlier pipeline
//    operations, but is the same for all instances of this DoFn for all
//    program executions, say setting up empty caches or initializing constant
//    data.
//
// ParDo operations are intended to be able to run in parallel across multiple
// worker machines. This precludes easy sharing and updating mutable state
// across those machines. There is no support in the Beam model for
// communicating and synchronizing updates to shared state across worker
// machines, so programs should not access any mutable global variable state in
// their DoFn, without understanding that the Go processes for the main program
// and workers will each have its own independent copy of such state, and there
// won't be any automatic copying of that state across those processes. All
// information should be communicated to DoFn instances via main and side
// inputs and serialized state, and all output should be communicated from a
// DoFn instance via output PCollections, in the absence of external
// communication mechanisms written by user code.
//
// Splittable DoFns (Experimental)
//
// Warning: Splittable DoFns are still experimental, largely untested, and
// likely to have bugs.
//
// Splittable DoFns are DoFns that are able to split work within an element,
// as opposed to only at element boundaries like normal DoFns. This is useful
// for DoFns that emit many outputs per input element and can distribute that
// work among multiple workers. The most common examples of this are sources.
//
// In order to split work within an element, splittable DoFns use the concept of
// restrictions, which are objects that are associated with an element and
// describe a portion of work on that element. For example, a restriction
// associated with a filename might describe what byte range within that file to
// process. In addition to restrictions, splittable DoFns also rely on
// restriction trackers to track progress and perform splits on a restriction
// currently being processed. See the `RTracker` interface in core/sdf/sdf.go
// for more details.
//
// Splitting
//
// Splitting means taking one restriction and splitting into two or more that
// cover the entire input space of the original one. In other words, processing
// all the split restrictions should produce identical output to processing
// the original one.
//
// Splitting occurs in two stages. The initial splitting occurs before any
// restrictions have started processing. This step is used to split large
// restrictions into smaller ones that can then be distributed among multiple
// workers for processing. Initial splitting is user-defined and optional.
//
// Dynamic splitting occurs during the processing of a restriction in runners
// that have implemented it. If there are available workers, runners may split
// the unprocessed portion of work from a busy worker and shard it to available
// workers in order to better distribute work. With unsplittable DoFns this can
// only occur on element boundaries, but for splittable DoFns this split
// can land within a restriction and will require splitting that restriction.
//
// * Note: The Go SDK currently does not support dynamic splitting for SDFs,
//   only initial splitting. Only initially split restrictions can be
//   distributed by liquid sharding. Stragglers will not be split during
//   execution with dynamic splitting.
//
// Splittable DoFn Methods
//
// Making a splittable DoFn requires the following methods to be implemented on
// a DoFn in addition to the usual DoFn requirements. In the following
// method signatures `elem` represents the main input elements to the DoFn, and
// should match the types used in ProcessElement. `restriction` represents the
// user-defined restriction, and can be any type as long as it is consistent
// throughout all the splittable DoFn methods:
//
// * `CreateInitialRestriction(element) restriction`
//     CreateInitialRestriction creates an initial restriction encompassing an
//     entire element. The restriction created stays associated with the element
//     it describes.
// * `SplitRestriction(elem, restriction) []restriction`
//     SplitRestriction takes an element and its initial restriction, and
//     optionally performs an initial split on it, returning a slice of all the
//     split restrictions. If no splits are desired, the method returns a slice
//     containing only the original restriction. This method will always be
//     called on each newly created restriction before they are processed.
// * `RestrictionSize(elem, restriction) float64`
//     RestrictionSize returns a cheap size estimation for a restriction. This
//     size is an abstract scalar value that represents how much work a
//     restriction takes compared to other restrictions in the same DoFn. For
//     example, a size of 200 represents twice as much work as a size of
//     100, but the numbers do not represent anything on their own. Size is
//     used by runners to estimate work for liquid sharding.
// * `CreateTracker(restriction) restrictionTracker`
//     CreateTracker creates and returns a restriction tracker (a concrete type
//     implementing the `sdf.RTracker` interface) given a restriction. The
//     restriction tracker is used to track progress processing a restriction,
//     and to allow for dynamic splits. This method is called on each
//     restriction right before processing begins.
// * `ProcessElement(sdf.RTracker, element, func emit(output))`
//     For splittable DoFns, ProcessElement requires a restriction tracker
//     before inputs, and generally requires emits to be used for outputs, since
//     restrictions will generally produce multiple outputs. For more details
//     on processing restrictions in a splittable DoFn, see `sdf.RTracker`.
//
// Fault Tolerance
//
// In a distributed system, things can fail: machines can crash, machines can
// be unable to communicate across the network, etc. While individual failures
// are rare, the larger the job, the greater the chance that something,
// somewhere, will fail. Beam runners may strive to mask such failures by
// retrying failed DoFn bundles. This means that a DoFn instance might process
// a bundle partially, then crash for some reason, then be rerun (often as a
// new process) on that same bundle and on the same elements as before.
// Sometimes two or more DoFn instances will be running on the same bundle
// simultaneously, with the system taking the results of the first instance to
// complete successfully. Consequently, the code in a DoFn needs to be written
// such that these duplicate (sequential or concurrent) executions do not cause
// problems. If the outputs of a DoFn are a pure function of its inputs, then
// this requirement is satisfied. However, if a DoFn's execution has external
// side-effects, such as performing updates to external HTTP services, then
// the DoFn's code needs to take care to ensure that those updates are
// idempotent and that concurrent updates are acceptable. This property can be
// difficult to achieve, so it is advisable to strive to keep DoFns as pure
// functions as much as possible.
//
// Optimization
//
// Beam runners may choose to apply optimizations to a pipeline before it is
// executed. A key optimization, fusion, relates to ParDo operations. If one
// ParDo operation produces a PCollection that is then consumed as the main
// input of another ParDo operation, the two ParDo operations will be fused
// together into a single ParDo operation and run in a single pass; this is
// "producer-consumer fusion". Similarly, if two or more ParDo operations
// have the same PCollection main input, they will be fused into a single ParDo
// that makes just one pass over the input PCollection; this is "sibling
// fusion".
//
// If after fusion there are no more unfused references to a PCollection (e.g.,
// one between a producer ParDo and a consumer ParDo), the PCollection itself
// is "fused away" and won't ever be written to disk, saving all the I/O and
// space expense of constructing it.
//
// When Beam runners apply fusion optimization, it is essentially "free" to
// write ParDo operations in a very modular, composable style, each ParDo
// operation doing one clear task, and stringing together sequences of ParDo
// operations to get the desired overall effect. Such programs can be easier to
// understand, easier to unit-test, easier to extend and evolve, and easier to
// reuse in new programs. The predefined library of PTransforms that come with
// Beam makes heavy use of this modular, composable style, trusting to the
// runner to "flatten out" all the compositions into highly optimized stages.
//
// See https://beam.apache.org/documentation/programming-guide/#pardo
// for the web documentation for ParDo
func ParDo(s Scope, dofn interface{}, col PCollection, opts ...Option) PCollection {
	const want = 1
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got != want {
		// Wrong number of outputs: the panic message names the ParDo variant
		// that matches this DoFn.
		panic(formatParDoError(dofn, got, want))
	}
	return outs[0]
}

// TODO(herohde) 6/1/2017: add windowing aspects to above documentation.

// ParDo2 inserts a ParDo with 2 outputs into the pipeline and returns them in
// order. It panics with a formatParDoError message if the DoFn does not yield
// exactly 2 output PCollections.
func ParDo2(s Scope, dofn interface{}, col PCollection, opts ...Option) (PCollection, PCollection) {
	const want = 2
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got != want {
		panic(formatParDoError(dofn, got, want))
	}
	return outs[0], outs[1]
}

// ParDo3 inserts a ParDo with 3 outputs into the pipeline and returns them in
// order. It panics with a formatParDoError message if the DoFn does not yield
// exactly 3 output PCollections.
func ParDo3(s Scope, dofn interface{}, col PCollection, opts ...Option) (PCollection, PCollection, PCollection) {
	const want = 3
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got != want {
		panic(formatParDoError(dofn, got, want))
	}
	return outs[0], outs[1], outs[2]
}

// ParDo4 inserts a ParDo with 4 outputs into the pipeline and returns them in
// order. It panics with a formatParDoError message if the DoFn does not yield
// exactly 4 output PCollections.
func ParDo4(s Scope, dofn interface{}, col PCollection, opts ...Option) (PCollection, PCollection, PCollection, PCollection) {
	const want = 4
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got != want {
		panic(formatParDoError(dofn, got, want))
	}
	return outs[0], outs[1], outs[2], outs[3]
}

// ParDo5 inserts a ParDo with 5 outputs into the pipeline and returns them in
// order. It panics with a formatParDoError message if the DoFn does not yield
// exactly 5 output PCollections.
func ParDo5(s Scope, dofn interface{}, col PCollection, opts ...Option) (PCollection, PCollection, PCollection, PCollection, PCollection) {
	const want = 5
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got != want {
		panic(formatParDoError(dofn, got, want))
	}
	return outs[0], outs[1], outs[2], outs[3], outs[4]
}

// ParDo6 inserts a ParDo with 6 outputs into the pipeline and returns them in
// order. It panics with a formatParDoError message if the DoFn does not yield
// exactly 6 output PCollections.
func ParDo6(s Scope, dofn interface{}, col PCollection, opts ...Option) (PCollection, PCollection, PCollection, PCollection, PCollection, PCollection) {
	const want = 6
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got != want {
		panic(formatParDoError(dofn, got, want))
	}
	return outs[0], outs[1], outs[2], outs[3], outs[4], outs[5]
}

// ParDo7 inserts a ParDo with 7 outputs into the pipeline and returns them in
// order. It panics with a formatParDoError message if the DoFn does not yield
// exactly 7 output PCollections.
func ParDo7(s Scope, dofn interface{}, col PCollection, opts ...Option) (PCollection, PCollection, PCollection, PCollection, PCollection, PCollection, PCollection) {
	const want = 7
	outs := MustN(TryParDo(s, dofn, col, opts...))
	if got := len(outs); got != want {
		panic(formatParDoError(dofn, got, want))
	}
	return outs[0], outs[1], outs[2], outs[3], outs[4], outs[5], outs[6]
}

// formatParDoError builds a concise, user-facing message for the case where a
// DoFn was handed to a ParDo variant whose output count does not match the
// number of outputs the DoFn has.
//
// emitSize is the number of outputs the DoFn has; parDoSize is the number of
// outputs required by the ParDo variant that was called. The message names
// both variants so the user knows which one to switch to.
//
// A graph.Fn is built from doFn only to obtain its name. The error from
// graph.NewFn is deliberately ignored: the callers in this file only reach
// this helper after TryParDo has already accepted doFn, so it is expected to
// be a valid DoFn at this point.
func formatParDoError(doFn interface{}, emitSize int, parDoSize int) string {
	fn, _ := graph.NewFn(doFn) // Error intentionally dropped; see doc comment.

	return fmt.Sprintf(
		"DoFn %v has %v outputs, but %v requires %v outputs, use %v instead.",
		fn.Name(),
		emitSize,
		parDoForSize(parDoSize), // The variant that was called.
		parDoSize,
		parDoForSize(emitSize), // The variant that should be used.
	)
}

// parDoForSize takes a DoFn's emit dimension (its number of outputs) and
// returns the name of the ParDo variant to use for it: "ParDo" for exactly
// one output, "ParDo<n>" for 0 and for 2 through 7, and "ParDoN" for anything
// else.
func parDoForSize(emitDim int) string {
	// The single-output variant carries no numeric suffix.
	if emitDim == 1 {
		return "ParDo"
	}
	// Outside the fixed-arity range only the variadic variant applies.
	if emitDim < 0 || emitDim > 7 {
		return "ParDoN"
	}
	return fmt.Sprintf("ParDo%d", emitDim)
}