From 2bdda86170c25d0a9fadca65d33077b53b0734fc Mon Sep 17 00:00:00 2001
From: Stephan Ewen
Date: Mon, 26 Feb 2018 16:41:24 +0100
Subject: [PATCH] [FLINK-8791] [docs] Fix documentation about configuring dependencies

---
 docs/dev/linking.md                             |  96 -------
 docs/dev/linking_with_flink.md                  | 146 -----------
 docs/redirects/linking_with_flink.md            |  25 ++
 .../linking_with_optional_modules.md            |  25 ++
 docs/start/dependencies.md                      | 244 ++++++++++++++++++
 5 files changed, 294 insertions(+), 242 deletions(-)
 delete mode 100644 docs/dev/linking.md
 delete mode 100644 docs/dev/linking_with_flink.md
 create mode 100644 docs/redirects/linking_with_flink.md
 create mode 100644 docs/redirects/linking_with_optional_modules.md
 create mode 100644 docs/start/dependencies.md

diff --git a/docs/dev/linking.md b/docs/dev/linking.md
deleted file mode 100644
index 78ef54494831f..0000000000000
--- a/docs/dev/linking.md
+++ /dev/null
@@ -1,96 +0,0 @@
----
-nav-title: "Linking with Optional Modules"
-title: "Linking with modules not contained in the binary distribution"
-nav-parent_id: start
-nav-pos: 10
----
-
-The binary distribution contains jar packages in the `lib` folder that are automatically
-provided to the classpath of your distributed programs. Almost all of Flink classes are
-located there with a few exceptions, for example the streaming connectors and some freshly
-added modules. To run code depending on these modules you need to make them accessible
-during runtime, for which we suggest two options:
-
-1. Either copy the required jar files to the `lib` folder onto all of your TaskManagers.
-Note that you have to restart your TaskManagers after this.
-2. Or package them with your code.
-
-The latter version is recommended as it respects the classloader management in Flink.
-
-### Packaging dependencies with your usercode with Maven
-
-To provide these dependencies not included by Flink we suggest two options with Maven.
-
-1. The maven assembly plugin builds a so-called uber-jar (executable jar) containing all your dependencies.
-The assembly configuration is straight-forward, but the resulting jar might become bulky.
-See [maven-assembly-plugin](http://maven.apache.org/plugins/maven-assembly-plugin/usage.html) for further information.
-2. The maven unpack plugin unpacks the relevant parts of the dependencies and
-then packages it with your code.
-
-Using the latter approach in order to bundle the Kafka connector, `flink-connector-kafka`
-you would need to add the classes from both the connector and the Kafka API itself. Add
-the following to your plugins section.
-
-~~~xml
-<plugin>
-    <groupId>org.apache.maven.plugins</groupId>
-    <artifactId>maven-dependency-plugin</artifactId>
-    <version>2.9</version>
-    <executions>
-        <execution>
-            <id>unpack</id>
-            <phase>prepare-package</phase>
-            <goals>
-                <goal>unpack</goal>
-            </goals>
-            <configuration>
-                <artifactItems>
-                    <!-- For Flink connector classes -->
-                    <artifactItem>
-                        <groupId>org.apache.flink</groupId>
-                        <artifactId>flink-connector-kafka</artifactId>
-                        <version>{{ site.version }}</version>
-                        <type>jar</type>
-                        <overWrite>false</overWrite>
-                        <outputDirectory>${project.build.directory}/classes</outputDirectory>
-                        <includes>org/apache/flink/**</includes>
-                    </artifactItem>
-                    <!-- For Kafka API classes -->
-                    <artifactItem>
-                        <groupId>org.apache.kafka</groupId>
-                        <artifactId>kafka_<YOUR_SCALA_VERSION></artifactId>
-                        <version><YOUR_KAFKA_VERSION></version>
-                        <type>jar</type>
-                        <overWrite>false</overWrite>
-                        <outputDirectory>${project.build.directory}/classes</outputDirectory>
-                        <includes>kafka/**</includes>
-                    </artifactItem>
-                </artifactItems>
-            </configuration>
-        </execution>
-    </executions>
-</plugin>
-~~~
-
-Now when running `mvn clean package` the produced jar includes the required dependencies.
-
-{% top %}
diff --git a/docs/dev/linking_with_flink.md b/docs/dev/linking_with_flink.md
deleted file mode 100644
index f2380b23a07f6..0000000000000
--- a/docs/dev/linking_with_flink.md
+++ /dev/null
@@ -1,146 +0,0 @@
----
-title: "Linking with Flink"
-nav-parent_id: start
-nav-pos: 2
----
-
-To write programs with Flink, you need to include the Flink library corresponding to
-your programming language in your project.
-
-The simplest way to do this is to use one of the quickstart scripts: either for
-[Java]({{ site.baseurl }}/quickstart/java_api_quickstart.html) or for [Scala]({{ site.baseurl }}/quickstart/scala_api_quickstart.html). They
-create a blank project from a template (a Maven Archetype), which sets up everything for you. To
-manually create the project, you can use the archetype and create a project by calling:
-
-<div class="codetabs" markdown="1">
-<div data-lang="java" markdown="1">
-{% highlight bash %}
-mvn archetype:generate \
-    -DarchetypeGroupId=org.apache.flink \
-    -DarchetypeArtifactId=flink-quickstart-java \
-    -DarchetypeVersion={{site.version }}
-{% endhighlight %}
-</div>
-<div data-lang="scala" markdown="1">
-{% highlight bash %}
-mvn archetype:generate \
-    -DarchetypeGroupId=org.apache.flink \
-    -DarchetypeArtifactId=flink-quickstart-scala \
-    -DarchetypeVersion={{site.version }}
-{% endhighlight %}
-</div>
-</div>
-
-The archetypes are working for stable releases and preview versions (`-SNAPSHOT`).
-
-If you want to add Flink to an existing Maven project, add the following entry to your
-*dependencies* section in the *pom.xml* file of your project:
-
-<div class="codetabs" markdown="1">
-<div data-lang="java" markdown="1">
-{% highlight xml %}
-<!-- Use this dependency if you are using the DataStream API -->
-<dependency>
-  <groupId>org.apache.flink</groupId>
-  <artifactId>flink-streaming-java{{ site.scala_version_suffix }}</artifactId>
-  <version>{{site.version }}</version>
-</dependency>
-<!-- Use this dependency if you are using the DataSet API -->
-<dependency>
-  <groupId>org.apache.flink</groupId>
-  <artifactId>flink-java</artifactId>
-  <version>{{site.version }}</version>
-</dependency>
-<dependency>
-  <groupId>org.apache.flink</groupId>
-  <artifactId>flink-clients{{ site.scala_version_suffix }}</artifactId>
-  <version>{{site.version }}</version>
-</dependency>
-{% endhighlight %}
-</div>
-<div data-lang="scala" markdown="1">
-{% highlight xml %}
-<!-- Use this dependency if you are using the DataStream API -->
-<dependency>
-  <groupId>org.apache.flink</groupId>
-  <artifactId>flink-streaming-scala{{ site.scala_version_suffix }}</artifactId>
-  <version>{{site.version }}</version>
-</dependency>
-<!-- Use this dependency if you are using the DataSet API -->
-<dependency>
-  <groupId>org.apache.flink</groupId>
-  <artifactId>flink-scala{{ site.scala_version_suffix }}</artifactId>
-  <version>{{site.version }}</version>
-</dependency>
-<dependency>
-  <groupId>org.apache.flink</groupId>
-  <artifactId>flink-clients{{ site.scala_version_suffix }}</artifactId>
-  <version>{{site.version }}</version>
-</dependency>
-{% endhighlight %}
-
-**Important:** When working with the Scala API you must have one of these two imports:
-{% highlight scala %}
-import org.apache.flink.api.scala._
-{% endhighlight %}
-
-or
-
-{% highlight scala %}
-import org.apache.flink.api.scala.createTypeInformation
-{% endhighlight %}
-
-The reason is that Flink analyzes the types that are used in a program and generates serializers
-and comparators for them. By having either of those imports you enable an implicit conversion
-that creates the type information for Flink operations.
-
-If you would rather use SBT, see [here]({{ site.baseurl }}/quickstart/scala_api_quickstart.html#sbt).
-</div>
-</div>
-
-#### Scala Dependency Versions
-
-Because Scala 2.10 binary is not compatible with Scala 2.11 binary, we provide multiple artifacts
-to support both Scala versions.
-
-Starting from the 0.10 line, we cross-build all Flink modules for both 2.10 and 2.11. If you want
-to run your program on Flink with Scala 2.11, you need to add a `_2.11` suffix to the `artifactId`
-values of the Flink modules in your dependencies section.
-
-If you are looking for building Flink with Scala 2.11, please check
-[build guide]({{ site.baseurl }}/start/building.html#scala-versions).
-
-#### Hadoop Dependency Versions
-
-If you are using Flink together with Hadoop, the version of the dependency may vary depending on the
-version of Hadoop (or more specifically, HDFS) that you want to use Flink with. Please refer to the
-[downloads page](http://flink.apache.org/downloads.html) for a list of available versions, and instructions
-on how to link with custom versions of Hadoop.
-
-In order to link against the latest SNAPSHOT versions of the code, please follow
-[this guide](http://flink.apache.org/how-to-contribute.html#snapshots-nightly-builds).
-
-The *flink-clients* dependency is only necessary to invoke the Flink program locally (for example to
-run it standalone for testing and debugging). If you intend to only export the program as a JAR
-file and [run it on a cluster]({{ site.baseurl }}/dev/cluster_execution.html), you can skip that dependency.
-
-{% top %}
diff --git a/docs/redirects/linking_with_flink.md b/docs/redirects/linking_with_flink.md
new file mode 100644
index 0000000000000..1289487c9ff22
--- /dev/null
+++ b/docs/redirects/linking_with_flink.md
@@ -0,0 +1,25 @@
+---
+title: "Linking with Flink"
+layout: redirect
+redirect: /start/dependencies.html
+permalink: /dev/linking_with_flink.html
+---
+
diff --git a/docs/redirects/linking_with_optional_modules.md b/docs/redirects/linking_with_optional_modules.md
new file mode 100644
index 0000000000000..e494fbc54d082
--- /dev/null
+++ b/docs/redirects/linking_with_optional_modules.md
@@ -0,0 +1,25 @@
+---
+title: "Linking with Optional Modules"
+layout: redirect
+redirect: /start/dependencies.html
+permalink: /dev/linking.html
+---
+
diff --git a/docs/start/dependencies.md b/docs/start/dependencies.md
new file mode 100644
index 0000000000000..1375c6f30f8ea
--- /dev/null
+++ b/docs/start/dependencies.md
@@ -0,0 +1,244 @@
+---
+title: "Configuring Dependencies, Connectors, Libraries"
+nav-parent_id: start
+nav-pos: 2
+---
+
+Every Flink application depends on a set of Flink libraries. At the bare minimum, the application depends
+on the Flink APIs. Many applications additionally depend on certain connector libraries (like Kafka, Cassandra, etc.).
+When running Flink applications (either in a distributed deployment, or in the IDE for testing), the Flink
+runtime library must be available as well.
+
+
+## Flink Core and Application Dependencies
+
+As with most systems that run user-defined applications, there are two broad categories of dependencies and libraries in Flink:
+
+  - **Flink Core Dependencies**: Flink itself consists of a set of classes and dependencies that are needed to run the system, for example
+    coordination, networking, checkpoints, failover, APIs, operations (such as windowing), resource management, etc.
+    The set of all these classes and dependencies forms the core of Flink's runtime and must be present when a Flink
+    application is started.
+
+    These core classes and dependencies are packaged in the `flink-dist` jar.
+    They are part of Flink's `lib` folder and
+    part of the basic Flink container images. Think of these dependencies as similar to Java's core library (`rt.jar`, `charsets.jar`, etc.),
+    which contains classes like `String` and `List`.
+
+    The Flink Core Dependencies do not contain any connectors or libraries (CEP, SQL, ML, etc.) in order to avoid having an excessive
+    number of dependencies and classes on the classpath by default. In fact, we try to keep the core dependencies as slim as possible
+    to keep the default classpath small and to avoid dependency clashes.
+
+  - The **User Application Dependencies** are all connectors, formats, or libraries that a specific user application needs.
+
+    The user application is typically packaged into an *application jar*, which contains the application code and the required
+    connector and library dependencies.
+
+    The user application dependencies explicitly do not include the Flink DataSet / DataStream APIs and runtime dependencies,
+    because those are already part of Flink's Core Dependencies.
+
+
+## Setting up a Project: Basic Dependencies
+
+Every Flink application needs, as the bare minimum, the API dependencies to develop against.
+For Maven, you can use the [Java Project Template]({{ site.baseurl }}/quickstart/java_api_quickstart.html)
+or the [Scala Project Template]({{ site.baseurl }}/quickstart/scala_api_quickstart.html) to create
+a program skeleton with these initial dependencies.
+
+When setting up a project manually, you need to add the following dependencies for the Java/Scala API
+(here presented in Maven syntax, but the same dependencies apply to other build tools (Gradle, SBT, etc.) as well):
+
+<div class="codetabs" markdown="1">
+<div data-lang="java" markdown="1">
+{% highlight xml %}
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-java</artifactId>
+  <version>{{site.version }}</version>
+  <scope>provided</scope>
+</dependency>
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-streaming-java{{ site.scala_version_suffix }}</artifactId>
+  <version>{{site.version }}</version>
+  <scope>provided</scope>
+</dependency>
+{% endhighlight %}
+</div>
+<div data-lang="scala" markdown="1">
+{% highlight xml %}
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-scala{{ site.scala_version_suffix }}</artifactId>
+  <version>{{site.version }}</version>
+  <scope>provided</scope>
+</dependency>
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-streaming-scala{{ site.scala_version_suffix }}</artifactId>
+  <version>{{site.version }}</version>
+  <scope>provided</scope>
+</dependency>
+{% endhighlight %}
+</div>
+</div>
+
+**Important:** Please note that all these dependencies have their scope set to *provided*.
+That means that they are needed to compile against, but that they should not be packaged into the
+project's resulting application jar file - these dependencies are Flink Core Dependencies,
+which are already available in any setup.
+
+It is highly recommended to keep the dependencies in scope *provided*. If they are not set to *provided*,
+the best case is that the resulting JAR becomes excessively large, because it also contains all Flink core
+dependencies. The worst case is that the Flink core dependencies added to the application's jar file
+clash with some of your own dependency versions (which is normally avoided through inverted classloading).
+
+**Note on IntelliJ:** To make the applications run within IntelliJ IDEA, the Flink dependencies need
+to be declared in scope *compile* rather than *provided*. Otherwise IntelliJ will not add them to the classpath and
+the in-IDE execution will fail with a `NoClassDefFoundError`. To avoid having to declare the
+dependency scope as *compile* (which is not recommended, see above), the Java and Scala
+project templates linked above use a trick: they add a profile that is selectively activated when the application
+is run in IntelliJ and only then promotes the dependencies to scope *compile*, without affecting
+the packaging of the JAR files.
+
+
+## Adding Connector and Library Dependencies
+
+Most applications need specific connectors or libraries to run, for example a connector to Kafka, Cassandra, etc.
+These connectors are not part of Flink's core dependencies and must hence be added as dependencies to the application.
+
+Below is an example of adding the connector for Kafka 0.10 as a dependency (Maven syntax):
+{% highlight xml %}
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-connector-kafka-0.10{{ site.scala_version_suffix }}</artifactId>
+    <version>{{site.version }}</version>
+</dependency>
+{% endhighlight %}
+
+We recommend packaging the application code and all its required dependencies into one *jar-with-dependencies*, which
+we refer to as the *application jar*. The application jar can be submitted to an already running Flink cluster,
+or added to a Flink application container image.
+
+Projects created from the [Java Project Template]({{ site.baseurl }}/quickstart/java_api_quickstart.html) or
+[Scala Project Template]({{ site.baseurl }}/quickstart/scala_api_quickstart.html) are configured to automatically include
+the application dependencies in the application jar when running `mvn clean package`. For projects that are
+not set up from those templates, we recommend adding the Maven Shade Plugin (as listed in the Appendix below)
+to build the application jar with all required dependencies.
+
+**Important:** For Maven (and other build tools) to correctly package the dependencies into the application jar,
+these application dependencies must be specified in scope *compile* (unlike the core dependencies, which
+must be specified in scope *provided*).
+
+
+## Scala Versions
+
+Scala versions (2.10, 2.11, 2.12, etc.) are not binary compatible with one another.
+For that reason, Flink for Scala 2.11 cannot be used with an application that uses
+Scala 2.12.
+
+All Flink dependencies that (transitively) depend on Scala are suffixed with the
+Scala version that they are built for, for example `flink-streaming-scala_2.11`.
+
+Developers that only use Java can pick any Scala version; Scala developers need to
+pick the Scala version that matches their application's Scala version.
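+
+As a minimal sketch of one way to keep that suffix consistent across all Flink artifacts (the
+`scala.binary.version` property name below is merely a common convention for your own `pom.xml`,
+not something Flink requires), you can put the Scala version into a Maven property and reference
+it in every suffixed `artifactId`:
+
+{% highlight xml %}
+<properties>
+    <!-- illustrative property: the Scala version shared by your application and the Flink artifacts -->
+    <scala.binary.version>2.11</scala.binary.version>
+</properties>
+
+<dependencies>
+    <dependency>
+        <groupId>org.apache.flink</groupId>
+        <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
+        <version>{{site.version }}</version>
+        <scope>provided</scope>
+    </dependency>
+</dependencies>
+{% endhighlight %}
+
+Changing the project's Scala version is then a single property change rather than an edit of every Flink dependency.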
+
+Please refer to the [build guide]({{ site.baseurl }}/start/building.html#scala-versions)
+for details on how to build Flink for a specific Scala version.
+
+**Note:** Because of major breaking changes in Scala 2.12, Flink 1.5 currently builds only for Scala 2.11.
+We aim to add support for Scala 2.12 in one of the next versions.
+
+
+## Hadoop Dependencies
+
+**General rule: It should never be necessary to add Hadoop dependencies directly to your application.**
+*(The only exception is using existing Hadoop input/output formats with Flink's Hadoop compatibility wrappers.)*
+
+If you want to use Flink with Hadoop, you need to have a Flink setup that includes the Hadoop dependencies, rather than
+adding Hadoop as an application dependency. Please refer to the [Hadoop Setup Guide]({{ site.baseurl }}/ops/deployment/hadoop.html)
+for details.
+
+There are two main reasons for that design:
+
+  - Some Hadoop interaction happens in Flink's core, possibly before the user application is started, for example
+    setting up HDFS for checkpoints, authenticating via Hadoop's Kerberos tokens, or deployment on YARN.
+
+  - Flink's inverted classloading approach hides many transitive dependencies from the core dependencies. That applies not only
+    to Flink's own core dependencies, but also to Hadoop's dependencies when present in the setup.
+    That way, applications can use different versions of the same dependencies without running into dependency conflicts (and
+    trust us, that's a big deal, because Hadoop's dependency tree is huge).
+
+If you need Hadoop dependencies during testing or development inside the IDE (for example for HDFS access), please set
+the scope of these dependencies to *test* or to *provided*.
+
+
+## Appendix: Template for building a Jar with Dependencies
+
+To build an application JAR that contains all dependencies required for declared connectors and libraries,
+you can use the following shade plugin definition:
+
+{% highlight xml %}
+<build>
+    <plugins>
+        <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-shade-plugin</artifactId>
+            <version>3.0.0</version>
+            <executions>
+                <execution>
+                    <phase>package</phase>
+                    <goals>
+                        <goal>shade</goal>
+                    </goals>
+                    <configuration>
+                        <artifactSet>
+                            <excludes>
+                                <exclude>com.google.code.findbugs:jsr305</exclude>
+                                <exclude>org.slf4j:*</exclude>
+                                <exclude>log4j:*</exclude>
+                            </excludes>
+                        </artifactSet>
+                        <filters>
+                            <filter>
+                                <!-- Do not copy the signatures in the META-INF folder.
+                                     Otherwise, this might cause SecurityExceptions when using the JAR. -->
+                                <artifact>*:*</artifact>
+                                <excludes>
+                                    <exclude>META-INF/*.SF</exclude>
+                                    <exclude>META-INF/*.DSA</exclude>
+                                    <exclude>META-INF/*.RSA</exclude>
+                                </excludes>
+                            </filter>
+                        </filters>
+                        <transformers>
+                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                <mainClass>my.programs.main.clazz</mainClass>
+                            </transformer>
+                        </transformers>
+                    </configuration>
+                </execution>
+            </executions>
+        </plugin>
+    </plugins>
+</build>
+{% endhighlight %}
+
+Running `mvn clean package` then produces the application jar with the connector and library dependencies included.
+
+{% top %}