diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml
index 19255e778daa..070adc33683a 100644
--- a/.github/workflows/flink-ci.yml
+++ b/.github/workflows/flink-ci.yml
@@ -93,7 +93,7 @@ jobs:
     strategy:
       matrix:
         jvm: [8, 11]
-        flink: ['1.13', '1.14', '1.15']
+        flink: ['1.14', '1.15']
     env:
       SPARK_LOCAL_IP: localhost
     steps:
diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml
index 2e52ea960d38..756bcaab578b 100644
--- a/.github/workflows/java-ci.yml
+++ b/.github/workflows/java-ci.yml
@@ -87,7 +87,7 @@ jobs:
       with:
         distribution: zulu
         java-version: 8
-    - run: ./gradlew -DflinkVersions=1.13,1.14,1.15 -DsparkVersions=2.4,3.0,3.1,3.2,3.3 -DhiveVersions=2,3 build -x test -x javadoc -x integrationTest
+    - run: ./gradlew -DflinkVersions=1.14,1.15 -DsparkVersions=2.4,3.0,3.1,3.2,3.3 -DhiveVersions=2,3 build -x test -x javadoc -x integrationTest
 
   build-javadoc:
     runs-on: ubuntu-20.04
diff --git a/.github/workflows/publish-snapshot.yml b/.github/workflows/publish-snapshot.yml
index b9b667a8180a..e25387fab7cf 100644
--- a/.github/workflows/publish-snapshot.yml
+++ b/.github/workflows/publish-snapshot.yml
@@ -40,5 +40,5 @@ jobs:
         java-version: 8
     - run: |
         ./gradlew printVersion
-        ./gradlew -DflinkVersions=1.13,1.14,1.15 -DsparkVersions=2.4,3.0,3.1,3.2,3.3 -DhiveVersions=2,3 publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
+        ./gradlew -DflinkVersions=1.14,1.15 -DsparkVersions=2.4,3.0,3.1,3.2,3.3 -DhiveVersions=2,3 publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
         ./gradlew -DflinkVersions= -DsparkVersions=3.2,3.3 -DscalaVersion=2.13 -DhiveVersions= publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
diff --git a/dev/stage-binaries.sh b/dev/stage-binaries.sh
index 58bc388eb3d6..bf4f61ee4106 100755
--- a/dev/stage-binaries.sh
+++ b/dev/stage-binaries.sh
@@ -19,7 +19,7 @@
 #
 
 SCALA_VERSION=2.12
-FLINK_VERSIONS=1.13,1.14,1.15
+FLINK_VERSIONS=1.14,1.15
 SPARK_VERSIONS=2.4,3.0,3.1,3.2,3.3
 HIVE_VERSIONS=2,3
 
diff --git a/flink/build.gradle b/flink/build.gradle
index 38caa806077c..4a6bab3bfbed 100644
--- a/flink/build.gradle
+++ b/flink/build.gradle
@@ -19,10 +19,6 @@
 
 def flinkVersions = (System.getProperty("flinkVersions") != null ? System.getProperty("flinkVersions") : System.getProperty("defaultFlinkVersions")).split(",")
 
-if (flinkVersions.contains("1.13")) {
-  apply from: file("$projectDir/v1.13/build.gradle")
-}
-
 if (flinkVersions.contains("1.14")) {
   apply from: file("$projectDir/v1.14/build.gradle")
 }
diff --git a/flink/v1.13/build.gradle b/flink/v1.13/build.gradle
deleted file mode 100644
index cb0ac8ad3f6a..000000000000
--- a/flink/v1.13/build.gradle
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-String flinkVersion = '1.13.2'
-String flinkMajorVersion = '1.13'
-String scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion")
-
-project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
-
-  dependencies {
-    implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
-    api project(':iceberg-api')
-    implementation project(':iceberg-common')
-    implementation project(':iceberg-core')
-    api project(':iceberg-data')
-    implementation project(':iceberg-orc')
-    implementation project(':iceberg-parquet')
-    implementation project(':iceberg-hive-metastore')
-
-    compileOnly "org.apache.flink:flink-streaming-java_${scalaVersion}:${flinkVersion}"
-    compileOnly "org.apache.flink:flink-streaming-java_${scalaVersion}:${flinkVersion}:tests"
-    compileOnly "org.apache.flink:flink-table-api-java-bridge_${scalaVersion}:${flinkVersion}"
-    compileOnly "org.apache.flink:flink-table-planner-blink_${scalaVersion}:${flinkVersion}"
-    compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${flinkVersion}"
-    compileOnly "org.apache.hadoop:hadoop-hdfs"
-    compileOnly "org.apache.hadoop:hadoop-common"
-    compileOnly("org.apache.hadoop:hadoop-minicluster") {
-      exclude group: 'org.apache.avro', module: 'avro'
-    }
-
-    implementation("org.apache.parquet:parquet-avro") {
-      exclude group: 'org.apache.avro', module: 'avro'
-      // already shaded by Parquet
-      exclude group: 'it.unimi.dsi'
-      exclude group: 'org.codehaus.jackson'
-    }
-
-    compileOnly "org.apache.avro:avro"
-
-    implementation("org.apache.orc:orc-core::nohive") {
-      exclude group: 'org.apache.hadoop'
-      exclude group: 'commons-lang'
-      // These artifacts are shaded and included in the orc-core fat jar
-      exclude group: 'com.google.protobuf', module: 'protobuf-java'
-      exclude group: 'org.apache.hive', module: 'hive-storage-api'
-    }
-
-    testImplementation "org.apache.flink:flink-core:${flinkVersion}"
-    testImplementation "org.apache.flink:flink-runtime_${scalaVersion}:${flinkVersion}"
-    testImplementation "org.apache.flink:flink-table-planner-blink_${scalaVersion}:${flinkVersion}"
-    testImplementation ("org.apache.flink:flink-test-utils-junit:${flinkVersion}") {
-      exclude group: 'junit'
-    }
-    testImplementation("org.apache.flink:flink-test-utils_${scalaVersion}:${flinkVersion}") {
-      exclude group: "org.apache.curator", module: 'curator-test'
-      exclude group: 'junit'
-    }
-
-    testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
-    testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
-    testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
-    testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
-
-    // By default, hive-exec is a fat/uber jar and it exports a guava library
-    // that's really old. We use the core classifier to be able to override our guava
-    // version. Luckily, hive-exec seems to work okay so far with this version of guava
-    // See: https://github.com/apache/hive/blob/master/ql/pom.xml#L911 for more context.
-    testImplementation("org.apache.hive:hive-exec::core") {
-      exclude group: 'org.apache.avro', module: 'avro'
-      exclude group: 'org.slf4j', module: 'slf4j-log4j12'
-      exclude group: 'org.pentaho' // missing dependency
-      exclude group: 'org.apache.hive', module: 'hive-llap-tez'
-      exclude group: 'org.apache.logging.log4j'
-      exclude group: 'com.google.protobuf', module: 'protobuf-java'
-      exclude group: 'org.apache.calcite'
-      exclude group: 'org.apache.calcite.avatica'
-      exclude group: 'com.google.code.findbugs', module: 'jsr305'
-    }
-
-    testImplementation("org.apache.hive:hive-metastore") {
-      exclude group: 'org.apache.avro', module: 'avro'
-      exclude group: 'org.slf4j', module: 'slf4j-log4j12'
-      exclude group: 'org.pentaho' // missing dependency
-      exclude group: 'org.apache.hbase'
-      exclude group: 'org.apache.logging.log4j'
-      exclude group: 'co.cask.tephra'
-      exclude group: 'com.google.code.findbugs', module: 'jsr305'
-      exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all'
-      exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet'
-      exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle'
-      exclude group: 'com.tdunning', module: 'json'
-      exclude group: 'javax.transaction', module: 'transaction-api'
-      exclude group: 'com.zaxxer', module: 'HikariCP'
-    }
-  }
-}
-
-project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") {
-  apply plugin: 'com.github.johnrengelman.shadow'
-
-  tasks.jar.dependsOn tasks.shadowJar
-
-  configurations {
-    implementation {
-      exclude group: 'org.apache.flink'
-      // included in Flink
-      exclude group: 'org.slf4j'
-      exclude group: 'org.apache.commons'
-      exclude group: 'commons-pool'
-      exclude group: 'commons-codec'
-      exclude group: 'org.xerial.snappy'
-      exclude group: 'javax.xml.bind'
-      exclude group: 'javax.annotation'
-    }
-  }
-
-  dependencies {
-    implementation project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}")
-    implementation project(':iceberg-aws')
-    implementation(project(':iceberg-aliyun')) {
-      exclude group: 'edu.umd.cs.findbugs', module: 'findbugs'
-      exclude group: 'org.apache.httpcomponents', module: 'httpclient'
-      exclude group: 'commons-logging', module: 'commons-logging'
-    }
-    implementation(project(':iceberg-nessie')) {
-      exclude group: 'com.google.code.findbugs', module: 'jsr305'
-    }
-  }
-
-  shadowJar {
-    configurations = [project.configurations.runtimeClasspath]
-
-    zip64 true
-
-    // include the LICENSE and NOTICE files for the shaded Jar
-    from(projectDir) {
-      include 'LICENSE'
-      include 'NOTICE'
-    }
-
-    // Relocate dependencies to avoid conflicts
-    relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro'
-    relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet'
-    relocate 'com.google', 'org.apache.iceberg.shaded.com.google'
-    relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml'
-    relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes'
-    relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework'
-    relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded'
-    relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc'
-    relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift'
-    relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra'
-    relocate 'org.apache.httpcomponents.client5',
'org.apache.iceberg.shaded.org.apache.httpcomponents.client5' - - classifier null - } - - jar { - enabled = false - } -} diff --git a/flink/v1.13/flink-runtime/LICENSE b/flink/v1.13/flink-runtime/LICENSE deleted file mode 100644 index a6161156db8a..000000000000 --- a/flink/v1.13/flink-runtime/LICENSE +++ /dev/null @@ -1,492 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. 
We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Avro. - -Copyright: 2014-2020 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains the Jackson JSON processor. - -Copyright: 2007-2020 Tatu Saloranta and other contributors -Home page: http://jackson.codehaus.org/ -License: http://www.apache.org/licenses/LICENSE-2.0.txt - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Parquet. - -Copyright: 2014-2020 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Thrift. - -Copyright: 2006-2010 The Apache Software Foundation. -Home page: https://thrift.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains fastutil. - -Copyright: 2002-2014 Sebastiano Vigna -Home page: http://fastutil.di.unimi.it/ -License: http://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains Apache ORC. - -Copyright: 2013-2020 The Apache Software Foundation. -Home page: https://orc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Hive's storage API via ORC. - -Copyright: 2013-2020 The Apache Software Foundation. -Home page: https://hive.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Airlift Aircompressor. - -Copyright: 2011-2020 Aircompressor authors. -Home page: https://github.com/airlift/aircompressor -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Airlift Slice. - -Copyright: 2013-2020 Slice authors. -Home page: https://github.com/airlift/slice -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains JetBrains annotations. - -Copyright: 2000-2020 JetBrains s.r.o. 
-Home page: https://github.com/JetBrains/java-annotations -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Guava. - -Copyright: 2006-2020 The Guava Authors -Home page: https://github.com/google/guava -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Error Prone Annotations. - -Copyright: Copyright 2011-2019 The Error Prone Authors -Home page: https://github.com/google/error-prone -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains checkerframework checker-qual Annotations. - -Copyright: 2004-2020 the Checker Framework developers -Home page: https://github.com/typetools/checker-framework -License: https://github.com/typetools/checker-framework/blob/master/LICENSE.txt (MIT license) - -License text: -| The annotations are licensed under the MIT License. (The text of this -| license appears below.) More specifically, all the parts of the Checker -| Framework that you might want to include with your own program use the -| MIT License. This is the checker-qual.jar file and all the files that -| appear in it: every file in a qual/ directory, plus utility files such -| as NullnessUtil.java, RegexUtil.java, SignednessUtil.java, etc. -| In addition, the cleanroom implementations of third-party annotations, -| which the Checker Framework recognizes as aliases for its own -| annotations, are licensed under the MIT License. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This binary artifact contains Animal Sniffer Annotations. - -Copyright: 2009-2018 codehaus.org -Home page: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/ -License: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/license.html (MIT license) - -License text: -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This binary artifact contains Caffeine by Ben Manes. - -Copyright: 2014-2020 Ben Manes and contributors -Home page: https://github.com/ben-manes/caffeine -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Yetus audience annotations. - -Copyright: 2008-2020 The Apache Software Foundation. -Home page: https://yetus.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google protobuf. - -Copyright: 2008 Google Inc. -Home page: https://developers.google.com/protocol-buffers -License: https://github.com/protocolbuffers/protobuf/blob/master/LICENSE (BSD) - -License text: - -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This binary artifact contains ThreeTen. - -Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. -Home page: https://www.threeten.org/threeten-extra/ -License: https://github.com/ThreeTen/threeten-extra/blob/master/LICENSE.txt (BSD 3-clause) - -License text: - -| All rights reserved. -| -| * Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact includes Project Nessie with the following in its NOTICE -file: - -| Dremio -| Copyright 2015-2017 Dremio Corporation -| -| This product includes software developed at -| The Apache Software Foundation (http://www.apache.org/). - --------------------------------------------------------------------------------- - -This binary includes code from Apache Commons. - -* Core ArrayUtil. - -Copyright: 2020 The Apache Software Foundation -Home page: https://commons.apache.org/ -License: https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache HttpComponents Client. - -Copyright: 1999-2022 The Apache Software Foundation. -Home page: https://hc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v1.13/flink-runtime/NOTICE b/flink/v1.13/flink-runtime/NOTICE deleted file mode 100644 index 81aa1660456a..000000000000 --- a/flink/v1.13/flink-runtime/NOTICE +++ /dev/null @@ -1,91 +0,0 @@ - -Apache Iceberg -Copyright 2017-2022 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). 
- --------------------------------------------------------------------------------- - -This binary artifact includes Apache ORC with the following in its NOTICE file: - -| Apache ORC -| Copyright 2013-2019 The Apache Software Foundation -| -| This product includes software developed by The Apache Software -| Foundation (http://www.apache.org/). -| -| This product includes software developed by Hewlett-Packard: -| (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P - --------------------------------------------------------------------------------- - -This binary artifact includes Airlift Aircompressor with the following in its -NOTICE file: - -| Snappy Copyright Notices -| ========================= -| -| * Copyright 2011 Dain Sundstrom -| * Copyright 2011, Google Inc. -| -| -| Snappy License -| =============== -| Copyright 2011, Google Inc. -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact includes Apache Yetus with the following in its NOTICE -file: - -| Apache Yetus -| Copyright 2008-2020 The Apache Software Foundation -| -| This product includes software developed at -| The Apache Software Foundation (https://www.apache.org/). -| -| --- -| Additional licenses for the Apache Yetus Source/Website: -| --- -| -| -| See LICENSE for terms. - --------------------------------------------------------------------------------- - -This binary artifact includes Project Nessie with the following in its NOTICE -file: - -| Dremio -| Copyright 2015-2017 Dremio Corporation -| -| This product includes software developed at -| The Apache Software Foundation (http://www.apache.org/). 
diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java deleted file mode 100644 index 7c098cf20d03..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.Serializable; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** Serializable loader to load an Iceberg {@link Catalog}. */ -public interface CatalogLoader extends Serializable { - - /** - * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the - * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this - * catalog loader to task manager, finally deserialize it and create a new catalog at task manager - * side. 
- * - * @return a newly created {@link Catalog} - */ - Catalog loadCatalog(); - - static CatalogLoader hadoop( - String name, Configuration hadoopConf, Map properties) { - return new HadoopCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader hive(String name, Configuration hadoopConf, Map properties) { - return new HiveCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader custom( - String name, Map properties, Configuration hadoopConf, String impl) { - return new CustomCatalogLoader(name, properties, hadoopConf, impl); - } - - class HadoopCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final String warehouseLocation; - private final Map properties; - - private HadoopCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("warehouseLocation", warehouseLocation) - .toString(); - } - } - - class HiveCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final String uri; - private final String warehouse; - private final int clientPoolSize; - private final Map properties; - - private HiveCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.uri = properties.get(CatalogProperties.URI); - this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.clientPoolSize = - properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) - ? 
Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) - : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("uri", uri) - .add("warehouse", warehouse) - .add("clientPoolSize", clientPoolSize) - .toString(); - } - } - - class CustomCatalogLoader implements CatalogLoader { - - private final SerializableConfiguration hadoopConf; - private final Map properties; - private final String name; - private final String impl; - - private CustomCatalogLoader( - String name, Map properties, Configuration conf, String impl) { - this.hadoopConf = new SerializableConfiguration(conf); - this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization - this.name = name; - this.impl = - Preconditions.checkNotNull( - impl, "Cannot initialize custom Catalog, impl class name is null"); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(impl, name, properties, hadoopConf.get()); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java deleted file mode 100644 index 75d09732189f..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java +++ /dev/null @@ -1,772 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.io.Closeable; -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.catalog.CatalogDatabase; -import org.apache.flink.table.catalog.CatalogDatabaseImpl; -import org.apache.flink.table.catalog.CatalogFunction; -import org.apache.flink.table.catalog.CatalogPartition; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.CatalogTableImpl; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.CatalogException; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; -import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; -import org.apache.flink.table.catalog.stats.CatalogTableStatistics; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.factories.Factory; -import org.apache.flink.util.StringUtils; -import org.apache.iceberg.CachingCatalog; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.UpdateProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.SupportsNamespaces; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.exceptions.NamespaceNotEmptyException; -import org.apache.iceberg.exceptions.NoSuchNamespaceException; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; - -/** - * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}. - * - *
The mapping between Flink database and Iceberg namespace: Supplying a base namespace for a - * given catalog, so if you have a catalog that supports a 2-level namespace, you would supply the - * first level in the catalog configuration and the second level would be exposed as Flink - * databases. - * - *
The Iceberg table manages its partitions by itself. The partition of the Iceberg table is - * independent of the partition of Flink. - */ -public class FlinkCatalog extends AbstractCatalog { - - private final CatalogLoader catalogLoader; - private final Catalog icebergCatalog; - private final Namespace baseNamespace; - private final SupportsNamespaces asNamespaceCatalog; - private final Closeable closeable; - private final boolean cacheEnabled; - - public FlinkCatalog( - String catalogName, - String defaultDatabase, - Namespace baseNamespace, - CatalogLoader catalogLoader, - boolean cacheEnabled) { - super(catalogName, defaultDatabase); - this.catalogLoader = catalogLoader; - this.baseNamespace = baseNamespace; - this.cacheEnabled = cacheEnabled; - - Catalog originalCatalog = catalogLoader.loadCatalog(); - icebergCatalog = cacheEnabled ? CachingCatalog.wrap(originalCatalog) : originalCatalog; - asNamespaceCatalog = - originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; - closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; - } - - @Override - public void open() throws CatalogException { - // Create the default database if it does not exist. - try { - createDatabase(getDefaultDatabase(), ImmutableMap.of(), true); - } catch (DatabaseAlreadyExistException e) { - // Ignore the exception if it's already exist. - } - } - - @Override - public void close() throws CatalogException { - if (closeable != null) { - try { - closeable.close(); - } catch (IOException e) { - throw new CatalogException(e); - } - } - } - - public Catalog catalog() { - return icebergCatalog; - } - - private Namespace toNamespace(String database) { - String[] namespace = new String[baseNamespace.levels().length + 1]; - System.arraycopy(baseNamespace.levels(), 0, namespace, 0, baseNamespace.levels().length); - namespace[baseNamespace.levels().length] = database; - return Namespace.of(namespace); - } - - TableIdentifier toIdentifier(ObjectPath path) { - return TableIdentifier.of(toNamespace(path.getDatabaseName()), path.getObjectName()); - } - - @Override - public List listDatabases() throws CatalogException { - if (asNamespaceCatalog == null) { - return Collections.singletonList(getDefaultDatabase()); - } - - return asNamespaceCatalog.listNamespaces(baseNamespace).stream() - .map(n -> n.level(n.levels().length - 1)) - .collect(Collectors.toList()); - } - - @Override - public CatalogDatabase getDatabase(String databaseName) - throws DatabaseNotExistException, CatalogException { - if (asNamespaceCatalog == null) { - if (!getDefaultDatabase().equals(databaseName)) { - throw new DatabaseNotExistException(getName(), databaseName); - } else { - return new CatalogDatabaseImpl(Maps.newHashMap(), ""); - } - } else { - try { - Map metadata = - Maps.newHashMap(asNamespaceCatalog.loadNamespaceMetadata(toNamespace(databaseName))); - String comment = metadata.remove("comment"); - return new CatalogDatabaseImpl(metadata, comment); - } catch (NoSuchNamespaceException e) { - throw new DatabaseNotExistException(getName(), databaseName, e); - } - } - } - - @Override - public boolean databaseExists(String databaseName) throws CatalogException { - try { - getDatabase(databaseName); - return true; - } catch (DatabaseNotExistException ignore) { - return false; - } - } - - @Override - public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) - throws DatabaseAlreadyExistException, CatalogException { - createDatabase( - name, 
mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); - } - - private void createDatabase( - String databaseName, Map metadata, boolean ignoreIfExists) - throws DatabaseAlreadyExistException, CatalogException { - if (asNamespaceCatalog != null) { - try { - asNamespaceCatalog.createNamespace(toNamespace(databaseName), metadata); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new DatabaseAlreadyExistException(getName(), databaseName, e); - } - } - } else { - throw new UnsupportedOperationException( - "Namespaces are not supported by catalog: " + getName()); - } - } - - private Map mergeComment(Map metadata, String comment) { - Map ret = Maps.newHashMap(metadata); - if (metadata.containsKey("comment")) { - throw new CatalogException("Database properties should not contain key: 'comment'."); - } - - if (!StringUtils.isNullOrWhitespaceOnly(comment)) { - ret.put("comment", comment); - } - return ret; - } - - @Override - public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) - throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { - if (asNamespaceCatalog != null) { - try { - boolean success = asNamespaceCatalog.dropNamespace(toNamespace(name)); - if (!success && !ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } catch (NoSuchNamespaceException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name, e); - } - } catch (NamespaceNotEmptyException e) { - throw new DatabaseNotEmptyException(getName(), name, e); - } - } else { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } - } - - @Override - public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) - throws DatabaseNotExistException, CatalogException { - if (asNamespaceCatalog != null) { - Namespace namespace = toNamespace(name); - Map updates = Maps.newHashMap(); - Set removals = Sets.newHashSet(); - - try { - Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); - Map newProperties = - mergeComment(newDatabase.getProperties(), newDatabase.getComment()); - - for (String key : oldProperties.keySet()) { - if (!newProperties.containsKey(key)) { - removals.add(key); - } - } - - for (Map.Entry entry : newProperties.entrySet()) { - if (!entry.getValue().equals(oldProperties.get(entry.getKey()))) { - updates.put(entry.getKey(), entry.getValue()); - } - } - - if (!updates.isEmpty()) { - asNamespaceCatalog.setProperties(namespace, updates); - } - - if (!removals.isEmpty()) { - asNamespaceCatalog.removeProperties(namespace, removals); - } - - } catch (NoSuchNamespaceException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name, e); - } - } - } else { - if (getDefaultDatabase().equals(name)) { - throw new CatalogException( - "Can not alter the default database when the iceberg catalog doesn't support namespaces."); - } - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } - } - - @Override - public List listTables(String databaseName) - throws DatabaseNotExistException, CatalogException { - try { - return icebergCatalog.listTables(toNamespace(databaseName)).stream() - .map(TableIdentifier::name) - .collect(Collectors.toList()); - } catch (NoSuchNamespaceException e) { - throw new DatabaseNotExistException(getName(), databaseName, e); - } - } - - @Override - public CatalogTable getTable(ObjectPath tablePath) - throws 
TableNotExistException, CatalogException { - Table table = loadIcebergTable(tablePath); - return toCatalogTable(table); - } - - private Table loadIcebergTable(ObjectPath tablePath) throws TableNotExistException { - try { - Table table = icebergCatalog.loadTable(toIdentifier(tablePath)); - if (cacheEnabled) { - table.refresh(); - } - - return table; - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - throw new TableNotExistException(getName(), tablePath, e); - } - } - - @Override - public boolean tableExists(ObjectPath tablePath) throws CatalogException { - return icebergCatalog.tableExists(toIdentifier(tablePath)); - } - - @Override - public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException { - try { - icebergCatalog.dropTable(toIdentifier(tablePath)); - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(getName(), tablePath, e); - } - } - } - - @Override - public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) - throws TableNotExistException, TableAlreadyExistException, CatalogException { - try { - icebergCatalog.renameTable( - toIdentifier(tablePath), - toIdentifier(new ObjectPath(tablePath.getDatabaseName(), newTableName))); - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(getName(), tablePath, e); - } - } catch (AlreadyExistsException e) { - throw new TableAlreadyExistException(getName(), tablePath, e); - } - } - - @Override - public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - if (Objects.equals( - table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { - throw new IllegalArgumentException( - "Cannot create the table with 'connector'='iceberg' table property in " - + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " - + "create table without 'connector'='iceberg' related properties in an iceberg table."); - } - - createIcebergTable(tablePath, table, ignoreIfExists); - } - - void createIcebergTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - validateFlinkTable(table); - - Schema icebergSchema = FlinkSchemaUtil.convert(table.getSchema()); - PartitionSpec spec = toPartitionSpec(((CatalogTable) table).getPartitionKeys(), icebergSchema); - - ImmutableMap.Builder properties = ImmutableMap.builder(); - String location = null; - for (Map.Entry entry : table.getOptions().entrySet()) { - if ("location".equalsIgnoreCase(entry.getKey())) { - location = entry.getValue(); - } else { - properties.put(entry.getKey(), entry.getValue()); - } - } - - try { - icebergCatalog.createTable( - toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new TableAlreadyExistException(getName(), tablePath, e); - } - } - } - - @Override - public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) - throws CatalogException, TableNotExistException { - validateFlinkTable(newTable); - - Table icebergTable; - try { - icebergTable = loadIcebergTable(tablePath); - } catch (TableNotExistException e) { - if (!ignoreIfNotExists) { - throw e; - } else { - 
return; - } - } - - CatalogTable table = toCatalogTable(icebergTable); - - // Currently, Flink SQL only support altering table properties. - - // For current Flink Catalog API, support for adding/removing/renaming columns cannot be done by - // comparing - // CatalogTable instances, unless the Flink schema contains Iceberg column IDs. - if (!table.getSchema().equals(newTable.getSchema())) { - throw new UnsupportedOperationException("Altering schema is not supported yet."); - } - - if (!table.getPartitionKeys().equals(((CatalogTable) newTable).getPartitionKeys())) { - throw new UnsupportedOperationException("Altering partition keys is not supported yet."); - } - - Map oldProperties = table.getOptions(); - Map setProperties = Maps.newHashMap(); - - String setLocation = null; - String setSnapshotId = null; - String pickSnapshotId = null; - - for (Map.Entry entry : newTable.getOptions().entrySet()) { - String key = entry.getKey(); - String value = entry.getValue(); - - if (Objects.equals(value, oldProperties.get(key))) { - continue; - } - - if ("location".equalsIgnoreCase(key)) { - setLocation = value; - } else if ("current-snapshot-id".equalsIgnoreCase(key)) { - setSnapshotId = value; - } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(key)) { - pickSnapshotId = value; - } else { - setProperties.put(key, value); - } - } - - oldProperties - .keySet() - .forEach( - k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); - - commitChanges(icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); - } - - private static void validateFlinkTable(CatalogBaseTable table) { - Preconditions.checkArgument( - table instanceof CatalogTable, "The Table should be a CatalogTable."); - - TableSchema schema = table.getSchema(); - schema - .getTableColumns() - .forEach( - column -> { - if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { - throw new UnsupportedOperationException( - "Creating table with computed columns is not supported yet."); - } - }); - - if (!schema.getWatermarkSpecs().isEmpty()) { - throw new UnsupportedOperationException( - "Creating table with watermark specs is not supported yet."); - } - } - - private static PartitionSpec toPartitionSpec(List partitionKeys, Schema icebergSchema) { - PartitionSpec.Builder builder = PartitionSpec.builderFor(icebergSchema); - partitionKeys.forEach(builder::identity); - return builder.build(); - } - - private static List toPartitionKeys(PartitionSpec spec, Schema icebergSchema) { - ImmutableList.Builder partitionKeysBuilder = ImmutableList.builder(); - for (PartitionField field : spec.fields()) { - if (field.transform().isIdentity()) { - partitionKeysBuilder.add(icebergSchema.findColumnName(field.sourceId())); - } else { - // Not created by Flink SQL. - // For compatibility with iceberg tables, return empty. - // TODO modify this after Flink support partition transform. 
- return Collections.emptyList(); - } - } - return partitionKeysBuilder.build(); - } - - private static void commitChanges( - Table table, - String setLocation, - String setSnapshotId, - String pickSnapshotId, - Map setProperties) { - // don't allow setting the snapshot and picking a commit at the same time because order is - // ambiguous and choosing - // one order leads to different results - Preconditions.checkArgument( - setSnapshotId == null || pickSnapshotId == null, - "Cannot set the current snapshot ID and cherry-pick snapshot changes"); - - if (setSnapshotId != null) { - long newSnapshotId = Long.parseLong(setSnapshotId); - table.manageSnapshots().setCurrentSnapshot(newSnapshotId).commit(); - } - - // if updating the table snapshot, perform that update first in case it fails - if (pickSnapshotId != null) { - long newSnapshotId = Long.parseLong(pickSnapshotId); - table.manageSnapshots().cherrypick(newSnapshotId).commit(); - } - - Transaction transaction = table.newTransaction(); - - if (setLocation != null) { - transaction.updateLocation().setLocation(setLocation).commit(); - } - - if (!setProperties.isEmpty()) { - UpdateProperties updateProperties = transaction.updateProperties(); - setProperties.forEach( - (k, v) -> { - if (v == null) { - updateProperties.remove(k); - } else { - updateProperties.set(k, v); - } - }); - updateProperties.commit(); - } - - transaction.commitTransaction(); - } - - static CatalogTable toCatalogTable(Table table) { - TableSchema schema = FlinkSchemaUtil.toSchema(table.schema()); - List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - - // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer - // may use - // CatalogTableImpl to copy a new catalog table. - // Let's re-loading table from Iceberg catalog when creating source/sink operators. - // Iceberg does not have Table comment, so pass a null (Default comment value in Flink). 
- return new CatalogTableImpl(schema, partitionKeys, table.properties(), null); - } - - @Override - public Optional getFactory() { - return Optional.of(new FlinkDynamicTableFactory(this)); - } - - CatalogLoader getCatalogLoader() { - return catalogLoader; - } - - // ------------------------------ Unsupported methods - // --------------------------------------------- - - @Override - public List listViews(String databaseName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void createPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition partition, - boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropPartition( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition newPartition, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listFunctions(String dbName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogFunction getFunction(ObjectPath functionPath) - throws FunctionNotExistException, CatalogException { - throw new FunctionNotExistException(getName(), functionPath); - } - - @Override - public boolean functionExists(ObjectPath functionPath) throws CatalogException { - return false; - } - - @Override - public void createFunction( - ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterFunction( - ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterTableStatistics( - ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterTableColumnStatistics( - ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionColumnStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - 
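The alterTable path above only accepts property changes: plain keys are committed through Transaction#updateProperties(), while the special keys 'location', 'current-snapshot-id' and 'cherry-pick-snapshot-id' are routed to UpdateLocation and ManageSnapshots. A minimal sketch of driving that path from Flink SQL, assuming an already registered Iceberg catalog; the catalog, database, table names and the snapshot id are placeholders:

```java
import org.apache.flink.table.api.TableEnvironment;

class AlterIcebergTableSketch {
  /** Applies property-only changes to a table managed by the Iceberg catalog. */
  static void applyChanges(TableEnvironment tEnv) {
    tEnv.useCatalog("iceberg_catalog");
    tEnv.useDatabase("db");

    // Plain keys end up in Transaction#updateProperties().
    tEnv.executeSql("ALTER TABLE sample SET ('write.format.default'='orc')");

    // Intercepted and applied via ManageSnapshots#setCurrentSnapshot, not stored as a property.
    tEnv.executeSql("ALTER TABLE sample SET ('current-snapshot-id'='5963614713519432882')");
  }
}
```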
@Override - public List listPartitions(ObjectPath tablePath) - throws TableNotExistException, TableNotPartitionedException, CatalogException { - Table table = loadIcebergTable(tablePath); - - if (table.spec().isUnpartitioned()) { - throw new TableNotPartitionedException(icebergCatalog.name(), tablePath); - } - - Set set = Sets.newHashSet(); - try (CloseableIterable tasks = table.newScan().planFiles()) { - for (DataFile dataFile : CloseableIterable.transform(tasks, FileScanTask::file)) { - Map map = Maps.newHashMap(); - StructLike structLike = dataFile.partition(); - PartitionSpec spec = table.specs().get(dataFile.specId()); - for (int i = 0; i < structLike.size(); i++) { - map.put(spec.fields().get(i).name(), String.valueOf(structLike.get(i, Object.class))); - } - set.add(new CatalogPartitionSpec(map)); - } - } catch (IOException e) { - throw new CatalogException( - String.format("Failed to list partitions of table %s", tablePath), e); - } - - return Lists.newArrayList(set); - } - - @Override - public List listPartitions( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listPartitionsByFilter( - ObjectPath tablePath, List filters) throws CatalogException { - throw new UnsupportedOperationException(); - } - - // After partition pruning and filter push down, the statistics have become very inaccurate, so - // the statistics from - // here are of little significance. - // Flink will support something like SupportsReportStatistics in future. - - @Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) - throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } - - @Override - public CatalogTableStatistics getPartitionStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getPartitionColumnStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java deleted file mode 100644 index aee3cfb38daa..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import org.apache.flink.configuration.GlobalConfiguration; -import org.apache.flink.runtime.util.HadoopUtils; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.factories.CatalogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Strings; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** - * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. - * - *

<p>This supports the following catalog configuration options:
- *
- * <ul>
- *   <li>type - Flink catalog factory key, should be "iceberg"
- *   <li>catalog-type - iceberg catalog type, "hive" or "hadoop"
- *   <li>uri - the Hive Metastore URI (Hive catalog only)
- *   <li>clients - the Hive Client Pool Size (Hive catalog only)
- *   <li>warehouse - the warehouse path (Hadoop catalog only)
- *   <li>default-database - a database name to use as the default
- *   <li>base-namespace - a base namespace as the prefix for all databases (Hadoop
- *       catalog only)
- *   <li>cache-enabled - whether to enable catalog cache
- * </ul>
- *
- * <p>
To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override - * {@link #createCatalogLoader(String, Map, Configuration)}. - */ -public class FlinkCatalogFactory implements CatalogFactory { - - // Can not just use "type", it conflicts with CATALOG_TYPE. - public static final String ICEBERG_CATALOG_TYPE = "catalog-type"; - public static final String ICEBERG_CATALOG_TYPE_HADOOP = "hadoop"; - public static final String ICEBERG_CATALOG_TYPE_HIVE = "hive"; - - public static final String HIVE_CONF_DIR = "hive-conf-dir"; - public static final String HADOOP_CONF_DIR = "hadoop-conf-dir"; - public static final String DEFAULT_DATABASE = "default-database"; - public static final String DEFAULT_DATABASE_NAME = "default"; - public static final String BASE_NAMESPACE = "base-namespace"; - public static final String CACHE_ENABLED = "cache-enabled"; - - public static final String TYPE = "type"; - public static final String PROPERTY_VERSION = "property-version"; - - /** - * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink - * catalog adapter. - * - * @param name Flink's catalog name - * @param properties Flink's catalog properties - * @param hadoopConf Hadoop configuration for catalog - * @return an Iceberg catalog loader - */ - static CatalogLoader createCatalogLoader( - String name, Map properties, Configuration hadoopConf) { - String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); - if (catalogImpl != null) { - String catalogType = properties.get(ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument( - catalogType == null, - "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", - name, - catalogType, - catalogImpl); - return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); - } - - String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); - switch (catalogType.toLowerCase(Locale.ENGLISH)) { - case ICEBERG_CATALOG_TYPE_HIVE: - // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in - // that case it will - // fallback to parse those values from hadoop configuration which is loaded from classpath. 
- String hiveConfDir = properties.get(HIVE_CONF_DIR); - String hadoopConfDir = properties.get(HADOOP_CONF_DIR); - Configuration newHadoopConf = mergeHiveConf(hadoopConf, hiveConfDir, hadoopConfDir); - return CatalogLoader.hive(name, newHadoopConf, properties); - - case ICEBERG_CATALOG_TYPE_HADOOP: - return CatalogLoader.hadoop(name, hadoopConf, properties); - - default: - throw new UnsupportedOperationException( - "Unknown catalog-type: " + catalogType + " (Must be 'hive' or 'hadoop')"); - } - } - - @Override - public Map requiredContext() { - Map context = Maps.newHashMap(); - context.put(TYPE, "iceberg"); - context.put(PROPERTY_VERSION, "1"); - return context; - } - - @Override - public List supportedProperties() { - return ImmutableList.of("*"); - } - - @Override - public Catalog createCatalog(String name, Map properties) { - return createCatalog(name, properties, clusterHadoopConf()); - } - - protected Catalog createCatalog( - String name, Map properties, Configuration hadoopConf) { - CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); - String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); - - Namespace baseNamespace = Namespace.empty(); - if (properties.containsKey(BASE_NAMESPACE)) { - baseNamespace = Namespace.of(properties.get(BASE_NAMESPACE).split("\\.")); - } - - boolean cacheEnabled = Boolean.parseBoolean(properties.getOrDefault(CACHE_ENABLED, "true")); - return new FlinkCatalog(name, defaultDatabase, baseNamespace, catalogLoader, cacheEnabled); - } - - private static Configuration mergeHiveConf( - Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { - Configuration newConf = new Configuration(hadoopConf); - if (!Strings.isNullOrEmpty(hiveConfDir)) { - Preconditions.checkState( - Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), - "There should be a hive-site.xml file under the directory %s", - hiveConfDir); - newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); - } else { - // If don't provide the hive-site.xml path explicitly, it will try to load resource from - // classpath. If still - // couldn't load the configuration file, then it will throw exception in HiveCatalog. - URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); - if (configFile != null) { - newConf.addResource(configFile); - } - } - - if (!Strings.isNullOrEmpty(hadoopConfDir)) { - Preconditions.checkState( - Files.exists(Paths.get(hadoopConfDir, "hdfs-site.xml")), - "Failed to load Hadoop configuration: missing %s", - Paths.get(hadoopConfDir, "hdfs-site.xml")); - newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); - Preconditions.checkState( - Files.exists(Paths.get(hadoopConfDir, "core-site.xml")), - "Failed to load Hadoop configuration: missing %s", - Paths.get(hadoopConfDir, "core-site.xml")); - newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); - } - return newConf; - } - - public static Configuration clusterHadoopConf() { - return HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java deleted file mode 100644 index 83fa09de544c..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
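The catalog options documented in the FlinkCatalogFactory Javadoc above are normally supplied through CREATE CATALOG statements. A hedged sketch of registering a Hive-backed and a Hadoop-backed Iceberg catalog; catalog names, the metastore URI and warehouse paths are placeholders:

```java
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class RegisterIcebergCatalogSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build());

    // Hive-backed catalog; 'uri' may be omitted to fall back to a hive-site.xml on the classpath.
    tEnv.executeSql(
        "CREATE CATALOG hive_iceberg WITH ("
            + " 'type'='iceberg',"
            + " 'catalog-type'='hive',"
            + " 'uri'='thrift://metastore:9083',"
            + " 'warehouse'='hdfs://nn:8020/warehouse'"
            + ")");

    // Hadoop-backed catalog resolved through CatalogLoader.hadoop().
    tEnv.executeSql(
        "CREATE CATALOG hadoop_iceberg WITH ("
            + " 'type'='iceberg',"
            + " 'catalog-type'='hadoop',"
            + " 'warehouse'='hdfs://nn:8020/warehouse/path'"
            + ")");
  }
}
```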
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.Map; -import java.util.function.Function; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.Table; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class FlinkConfParser { - - private final Map tableProperties; - private final Map options; - private final ReadableConfig readableConfig; - - FlinkConfParser(Table table, Map options, ReadableConfig readableConfig) { - this.tableProperties = table.properties(); - this.options = options; - this.readableConfig = readableConfig; - } - - public BooleanConfParser booleanConf() { - return new BooleanConfParser(); - } - - public IntConfParser intConf() { - return new IntConfParser(); - } - - public LongConfParser longConf() { - return new LongConfParser(); - } - - public StringConfParser stringConf() { - return new StringConfParser(); - } - - class BooleanConfParser extends ConfParser { - private Boolean defaultValue; - - @Override - protected BooleanConfParser self() { - return this; - } - - public BooleanConfParser defaultValue(boolean value) { - this.defaultValue = value; - return self(); - } - - public BooleanConfParser defaultValue(String value) { - this.defaultValue = Boolean.parseBoolean(value); - return self(); - } - - public boolean parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Boolean::parseBoolean, defaultValue); - } - } - - class IntConfParser extends ConfParser { - private Integer defaultValue; - - @Override - protected IntConfParser self() { - return this; - } - - public IntConfParser defaultValue(int value) { - this.defaultValue = value; - return self(); - } - - public int parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Integer::parseInt, defaultValue); - } - - public Integer parseOptional() { - return parse(Integer::parseInt, null); - } - } - - class LongConfParser extends ConfParser { - private Long defaultValue; - - @Override - protected LongConfParser self() { - return this; - } - - public LongConfParser defaultValue(long value) { - this.defaultValue = value; - return self(); - } - - public long parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Long::parseLong, defaultValue); - } - - public Long parseOptional() { - return parse(Long::parseLong, null); - } - } - - class StringConfParser extends ConfParser { - private String defaultValue; - - @Override - protected StringConfParser self() { - return this; - } - - public StringConfParser defaultValue(String value) { - this.defaultValue = value; - 
return self(); - } - - public String parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Function.identity(), defaultValue); - } - - public String parseOptional() { - return parse(Function.identity(), null); - } - } - - abstract class ConfParser { - private final List optionNames = Lists.newArrayList(); - private String tablePropertyName; - private ConfigOption configOption; - - protected abstract ThisT self(); - - public ThisT option(String name) { - this.optionNames.add(name); - return self(); - } - - public ThisT flinkConfig(ConfigOption newConfigOption) { - this.configOption = newConfigOption; - return self(); - } - - public ThisT tableProperty(String name) { - this.tablePropertyName = name; - return self(); - } - - protected T parse(Function conversion, T defaultValue) { - if (!optionNames.isEmpty()) { - for (String optionName : optionNames) { - String optionValue = options.get(optionName); - if (optionValue != null) { - return conversion.apply(optionValue); - } - } - } - - if (configOption != null) { - T propertyValue = readableConfig.get(configOption); - if (propertyValue != null) { - return propertyValue; - } - } - - if (tablePropertyName != null) { - String propertyValue = tableProperties.get(tablePropertyName); - if (propertyValue != null) { - return conversion.apply(propertyValue); - } - } - - return defaultValue; - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java deleted file mode 100644 index 603cb3961c02..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
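FlinkConfParser above resolves a value by checking the write options first, then the Flink ReadableConfig, then the table property, and finally the default. A sketch of how a caller such as FlinkWriteConf might use it; the keys are illustrative, and the class is package-private, so this assumes the org.apache.iceberg.flink package:

```java
package org.apache.iceberg.flink;

import java.util.Map;
import org.apache.flink.configuration.Configuration;
import org.apache.iceberg.Table;

class FlinkConfParserSketch {
  static boolean upsertEnabled(Table table, Map<String, String> writeOptions) {
    // Lookup order: writeOptions -> Flink config -> table property -> default.
    return new FlinkConfParser(table, writeOptions, new Configuration())
        .booleanConf()
        .option("upsert-enabled")              // per-sink write option (illustrative key)
        .tableProperty("write.upsert.enabled") // Iceberg table property (illustrative key)
        .defaultValue(false)
        .parse();
  }
}
```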
- */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; - -public class FlinkConfigOptions { - - private FlinkConfigOptions() {} - - public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM = - ConfigOptions.key("table.exec.iceberg.infer-source-parallelism") - .booleanType() - .defaultValue(true) - .withDescription( - "If is false, parallelism of source are set by config.\n" - + "If is true, source parallelism is inferred according to splits number.\n"); - - public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX = - ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max") - .intType() - .defaultValue(100) - .withDescription("Sets max infer parallelism for source operator."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO = - ConfigOptions.key("table.exec.iceberg.expose-split-locality-info") - .booleanType() - .noDefaultValue() - .withDescription( - "Expose split host information to use Flink's locality aware split assigner."); -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java deleted file mode 100644 index 97fb4196795d..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
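The options declared in FlinkConfigOptions above are session-level settings; a minimal sketch of setting them on the table environment configuration, with arbitrary values:

```java
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.iceberg.flink.FlinkConfigOptions;

public class SourceParallelismConfigSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build());

    Configuration conf = tEnv.getConfig().getConfiguration();
    // Turn off split-based parallelism inference and cap the inferred value.
    conf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false);
    conf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 10);
  }
}
```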
- */ -package org.apache.iceberg.flink; - -import java.util.Map; -import java.util.Set; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.catalog.CatalogDatabaseImpl; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.ObjectIdentifier; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.factories.DynamicTableSinkFactory; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.utils.TableSchemaUtils; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; - -public class FlinkDynamicTableFactory - implements DynamicTableSinkFactory, DynamicTableSourceFactory { - static final String FACTORY_IDENTIFIER = "iceberg"; - - private static final ConfigOption CATALOG_NAME = - ConfigOptions.key("catalog-name") - .stringType() - .noDefaultValue() - .withDescription("Catalog name"); - - private static final ConfigOption CATALOG_TYPE = - ConfigOptions.key(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE) - .stringType() - .noDefaultValue() - .withDescription("Catalog type, the optional types are: custom, hadoop, hive."); - - private static final ConfigOption CATALOG_DATABASE = - ConfigOptions.key("catalog-database") - .stringType() - .defaultValue(FlinkCatalogFactory.DEFAULT_DATABASE_NAME) - .withDescription("Database name managed in the iceberg catalog."); - - private static final ConfigOption CATALOG_TABLE = - ConfigOptions.key("catalog-table") - .stringType() - .noDefaultValue() - .withDescription("Table name managed in the underlying iceberg catalog and database."); - - private final FlinkCatalog catalog; - - public FlinkDynamicTableFactory() { - this.catalog = null; - } - - public FlinkDynamicTableFactory(FlinkCatalog catalog) { - this.catalog = catalog; - } - - @Override - public DynamicTableSource createDynamicTableSource(Context context) { - ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); - CatalogTable catalogTable = context.getCatalogTable(); - Map tableProps = catalogTable.getOptions(); - TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(catalogTable.getSchema()); - - TableLoader tableLoader; - if (catalog != null) { - tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); - } else { - tableLoader = - createTableLoader( - catalogTable, - tableProps, - objectIdentifier.getDatabaseName(), - objectIdentifier.getObjectName()); - } - - return new IcebergTableSource(tableLoader, tableSchema, tableProps, context.getConfiguration()); - } - - @Override - public DynamicTableSink createDynamicTableSink(Context context) { - ObjectPath objectPath = context.getObjectIdentifier().toObjectPath(); - CatalogTable catalogTable = context.getCatalogTable(); - Map writeProps = 
catalogTable.getOptions(); - TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(catalogTable.getSchema()); - - TableLoader tableLoader; - if (catalog != null) { - tableLoader = createTableLoader(catalog, objectPath); - } else { - tableLoader = - createTableLoader( - catalogTable, writeProps, objectPath.getDatabaseName(), objectPath.getObjectName()); - } - - return new IcebergTableSink(tableLoader, tableSchema, writeProps); - } - - @Override - public Set> requiredOptions() { - Set> options = Sets.newHashSet(); - options.add(CATALOG_TYPE); - options.add(CATALOG_NAME); - return options; - } - - @Override - public Set> optionalOptions() { - Set> options = Sets.newHashSet(); - options.add(CATALOG_DATABASE); - options.add(CATALOG_TABLE); - return options; - } - - @Override - public String factoryIdentifier() { - return FACTORY_IDENTIFIER; - } - - private static TableLoader createTableLoader( - CatalogBaseTable catalogBaseTable, - Map tableProps, - String databaseName, - String tableName) { - Configuration flinkConf = new Configuration(); - tableProps.forEach(flinkConf::setString); - - String catalogName = flinkConf.getString(CATALOG_NAME); - Preconditions.checkNotNull( - catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key()); - - String catalogDatabase = flinkConf.getString(CATALOG_DATABASE, databaseName); - Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null"); - - String catalogTable = flinkConf.getString(CATALOG_TABLE, tableName); - Preconditions.checkNotNull(catalogTable, "The iceberg table name cannot be null"); - - org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf(); - FlinkCatalogFactory factory = new FlinkCatalogFactory(); - FlinkCatalog flinkCatalog = - (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf); - ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable); - - // Create database if not exists in the external catalog. - if (!flinkCatalog.databaseExists(catalogDatabase)) { - try { - flinkCatalog.createDatabase( - catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true); - } catch (DatabaseAlreadyExistException e) { - throw new AlreadyExistsException( - e, - "Database %s already exists in the iceberg catalog %s.", - catalogName, - catalogDatabase); - } - } - - // Create table if not exists in the external catalog. 
- if (!flinkCatalog.tableExists(objectPath)) { - try { - flinkCatalog.createIcebergTable(objectPath, catalogBaseTable, true); - } catch (TableAlreadyExistException e) { - throw new AlreadyExistsException( - e, - "Table %s already exists in the database %s and catalog %s", - catalogTable, - catalogDatabase, - catalogName); - } - } - - return TableLoader.fromCatalog( - flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable)); - } - - private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) { - Preconditions.checkNotNull(catalog, "Flink catalog cannot be null"); - return TableLoader.fromCatalog(catalog.getCatalogLoader(), catalog.toIdentifier(objectPath)); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java deleted file mode 100644 index 717de9ef5acc..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
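FlinkDynamicTableFactory above resolves a table either through an enclosing FlinkCatalog or, when 'connector'='iceberg' is used in a non-Iceberg catalog, through the catalog-name/catalog-database/catalog-table options, creating the backing database and Iceberg table on demand. A hedged sketch of the latter path; the table, catalog names and warehouse path are placeholders:

```java
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class IcebergConnectorTableSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build());

    // Defined in Flink's current catalog; the factory creates/loads the backing Iceberg table.
    tEnv.executeSql(
        "CREATE TABLE flink_table ("
            + "  id BIGINT,"
            + "  data STRING"
            + ") WITH ("
            + "  'connector'='iceberg',"
            + "  'catalog-name'='hadoop_prod',"
            + "  'catalog-type'='hadoop',"
            + "  'catalog-database'='db',"
            + "  'catalog-table'='sample',"
            + "  'warehouse'='hdfs://nn:8020/warehouse/path'"
            + ")");
  }
}
```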
- */ -package org.apache.iceberg.flink; - -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.function.BiFunction; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.ValueLiteralExpression; -import org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.flink.table.functions.FunctionDefinition; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expression.Operation; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.NaNUtil; - -public class FlinkFilters { - private FlinkFilters() {} - - private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%"); - - private static final Map FILTERS = - ImmutableMap.builder() - .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ) - .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ) - .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT) - .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ) - .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT) - .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ) - .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL) - .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL) - .put(BuiltInFunctionDefinitions.AND, Operation.AND) - .put(BuiltInFunctionDefinitions.OR, Operation.OR) - .put(BuiltInFunctionDefinitions.NOT, Operation.NOT) - .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH) - .build(); - - /** - * Convert flink expression to iceberg expression. - * - *

the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the - * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR - * GT_EQ), the IN will be converted to OR, so we do not add the conversion here - * - * @param flinkExpression the flink expression - * @return the iceberg expression - */ - public static Optional convert( - org.apache.flink.table.expressions.Expression flinkExpression) { - if (!(flinkExpression instanceof CallExpression)) { - return Optional.empty(); - } - - CallExpression call = (CallExpression) flinkExpression; - Operation op = FILTERS.get(call.getFunctionDefinition()); - if (op != null) { - switch (op) { - case IS_NULL: - return onlyChildAs(call, FieldReferenceExpression.class) - .map(FieldReferenceExpression::getName) - .map(Expressions::isNull); - - case NOT_NULL: - return onlyChildAs(call, FieldReferenceExpression.class) - .map(FieldReferenceExpression::getName) - .map(Expressions::notNull); - - case LT: - return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); - - case LT_EQ: - return convertFieldAndLiteral( - Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); - - case GT: - return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); - - case GT_EQ: - return convertFieldAndLiteral( - Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); - - case EQ: - return convertFieldAndLiteral( - (ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.isNaN(ref); - } else { - return Expressions.equal(ref, lit); - } - }, - call); - - case NOT_EQ: - return convertFieldAndLiteral( - (ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.notNaN(ref); - } else { - return Expressions.notEqual(ref, lit); - } - }, - call); - - case NOT: - return onlyChildAs(call, CallExpression.class) - .flatMap(FlinkFilters::convert) - .map(Expressions::not); - - case AND: - return convertLogicExpression(Expressions::and, call); - - case OR: - return convertLogicExpression(Expressions::or, call); - - case STARTS_WITH: - return convertLike(call); - } - } - - return Optional.empty(); - } - - private static Optional onlyChildAs( - CallExpression call, Class expectedChildClass) { - List children = call.getResolvedChildren(); - if (children.size() != 1) { - return Optional.empty(); - } - - ResolvedExpression child = children.get(0); - if (!expectedChildClass.isInstance(child)) { - return Optional.empty(); - } - - return Optional.of(expectedChildClass.cast(child)); - } - - private static Optional convertLike(CallExpression call) { - List args = call.getResolvedChildren(); - if (args.size() != 2) { - return Optional.empty(); - } - - org.apache.flink.table.expressions.Expression left = args.get(0); - org.apache.flink.table.expressions.Expression right = args.get(1); - - if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { - String name = ((FieldReferenceExpression) left).getName(); - return convertLiteral((ValueLiteralExpression) right) - .flatMap( - lit -> { - if (lit instanceof String) { - String pattern = (String) lit; - Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); - // exclude special char of LIKE - // '_' is the wildcard of the SQL LIKE - if (!pattern.contains("_") && matcher.matches()) { - return Optional.of(Expressions.startsWith(name, matcher.group(1))); - } - } - - return Optional.empty(); - }); - } - - return Optional.empty(); - } - - private static Optional 
convertLogicExpression( - BiFunction function, CallExpression call) { - List args = call.getResolvedChildren(); - if (args == null || args.size() != 2) { - return Optional.empty(); - } - - Optional left = convert(args.get(0)); - Optional right = convert(args.get(1)); - if (left.isPresent() && right.isPresent()) { - return Optional.of(function.apply(left.get(), right.get())); - } - - return Optional.empty(); - } - - private static Optional convertLiteral(ValueLiteralExpression expression) { - Optional value = - expression.getValueAs( - expression.getOutputDataType().getLogicalType().getDefaultConversion()); - return value.map( - o -> { - if (o instanceof LocalDateTime) { - return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); - } else if (o instanceof Instant) { - return DateTimeUtil.microsFromInstant((Instant) o); - } else if (o instanceof LocalTime) { - return DateTimeUtil.microsFromTime((LocalTime) o); - } else if (o instanceof LocalDate) { - return DateTimeUtil.daysFromDate((LocalDate) o); - } - - return o; - }); - } - - private static Optional convertFieldAndLiteral( - BiFunction expr, CallExpression call) { - return convertFieldAndLiteral(expr, expr, call); - } - - private static Optional convertFieldAndLiteral( - BiFunction convertLR, - BiFunction convertRL, - CallExpression call) { - List args = call.getResolvedChildren(); - if (args.size() != 2) { - return Optional.empty(); - } - - org.apache.flink.table.expressions.Expression left = args.get(0); - org.apache.flink.table.expressions.Expression right = args.get(1); - - if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { - String name = ((FieldReferenceExpression) left).getName(); - Optional lit = convertLiteral((ValueLiteralExpression) right); - if (lit.isPresent()) { - return Optional.of(convertLR.apply(name, lit.get())); - } - } else if (left instanceof ValueLiteralExpression - && right instanceof FieldReferenceExpression) { - Optional lit = convertLiteral((ValueLiteralExpression) left); - String name = ((FieldReferenceExpression) right).getName(); - if (lit.isPresent()) { - return Optional.of(convertRL.apply(name, lit.get())); - } - } - - return Optional.empty(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java deleted file mode 100644 index 767d4497ac91..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
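The conversions above land on the Iceberg expression API; a small sketch of the equivalents the converter produces, with hypothetical column names:

```java
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

public class FlinkFilterEquivalentsSketch {
  public static void main(String[] args) {
    // EQUALS with a plain literal, and the NaN-safe form used when the literal is NaN.
    Expression eq = Expressions.equal("data", "x");
    Expression isNan = Expressions.isNaN("measurement");

    // LIKE 'abc%' (no '_' wildcard) becomes startsWith.
    Expression prefix = Expressions.startsWith("data", "abc");

    // AND/OR/NOT compose converted children.
    Expression combined = Expressions.and(eq, Expressions.not(prefix));
    System.out.println(combined);
    System.out.println(isNan);
  }
}
```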
- */ -package org.apache.iceberg.flink; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.FixupTypes; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -/** - * The uuid and fixed are converted to the same Flink type. Conversion back can produce only one, - * which may not be correct. - */ -class FlinkFixupTypes extends FixupTypes { - - private FlinkFixupTypes(Schema referenceSchema) { - super(referenceSchema); - } - - static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema( - TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); - } - - @Override - protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { - if (type instanceof Types.FixedType) { - int length = ((Types.FixedType) type).length(); - return source.typeId() == Type.TypeID.UUID && length == 16; - } - return false; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java deleted file mode 100644 index 97439b7bb0d6..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.Set; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -/** - * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not - * allows back-and-forth conversion. So some information might get lost during the back-and-forth - * conversion. - * - *

<p>This inconsistent types:
- *
- * <ul>
- *   <li>map Iceberg UUID type to Flink BinaryType(16)
- *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type
- *   <li>map Flink VarBinaryType(_) to Iceberg Binary type
- *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)
- *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)
- *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)
- *   <li>map Flink MultiSetType to Iceberg Map type(element, int)
- * </ul>
- *
- */ -public class FlinkSchemaUtil { - - private FlinkSchemaUtil() {} - - /** Convert the flink table schema to apache iceberg schema. */ - public static Schema convert(TableSchema schema) { - LogicalType schemaType = schema.toRowDataType().getLogicalType(); - Preconditions.checkArgument( - schemaType instanceof RowType, "Schema logical type should be RowType."); - - RowType root = (RowType) schemaType; - Type converted = root.accept(new FlinkTypeToType(root)); - - Schema iSchema = new Schema(converted.asStructType().fields()); - return freshIdentifierFieldIds(iSchema, schema); - } - - private static Schema freshIdentifierFieldIds(Schema iSchema, TableSchema schema) { - // Locate the identifier field id list. - Set identifierFieldIds = Sets.newHashSet(); - if (schema.getPrimaryKey().isPresent()) { - for (String column : schema.getPrimaryKey().get().getColumns()) { - Types.NestedField field = iSchema.findField(column); - Preconditions.checkNotNull( - field, - "Cannot find field ID for the primary key column %s in schema %s", - column, - iSchema); - identifierFieldIds.add(field.fieldId()); - } - } - - return new Schema(iSchema.schemaId(), iSchema.asStruct().fields(), identifierFieldIds); - } - - /** - * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. - * - *

This conversion does not assign new ids; it uses ids from the base schema. - * - *

Data types, field order, and nullability will match the Flink type. This conversion may - * return a schema that is not compatible with base schema. - * - * @param baseSchema a Schema on which conversion is based - * @param flinkSchema a Flink TableSchema - * @return the equivalent Schema - * @throws IllegalArgumentException if the type cannot be converted or there are missing ids - */ - public static Schema convert(Schema baseSchema, TableSchema flinkSchema) { - // convert to a type with fresh ids - Types.StructType struct = convert(flinkSchema).asStruct(); - // reassign ids to match the base schema - Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); - // fix types that can't be represented in Flink (UUID) - Schema fixedSchema = FlinkFixupTypes.fixup(schema, baseSchema); - return freshIdentifierFieldIds(fixedSchema, flinkSchema); - } - - /** - * Convert a {@link Schema} to a {@link RowType Flink type}. - * - * @param schema a Schema - * @return the equivalent Flink type - * @throws IllegalArgumentException if the type cannot be converted to Flink - */ - public static RowType convert(Schema schema) { - return (RowType) TypeUtil.visit(schema, new TypeToFlinkType()); - } - - /** - * Convert a {@link Type} to a {@link LogicalType Flink type}. - * - * @param type a Type - * @return the equivalent Flink type - * @throws IllegalArgumentException if the type cannot be converted to Flink - */ - public static LogicalType convert(Type type) { - return TypeUtil.visit(type, new TypeToFlinkType()); - } - - /** - * Convert a {@link RowType} to a {@link TableSchema}. - * - * @param rowType a RowType - * @return Flink TableSchema - */ - public static TableSchema toSchema(RowType rowType) { - TableSchema.Builder builder = TableSchema.builder(); - for (RowType.RowField field : rowType.getFields()) { - builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); - } - return builder.build(); - } - - /** - * Convert a {@link Schema} to a {@link TableSchema}. - * - * @param schema iceberg schema to convert. - * @return Flink TableSchema. - */ - public static TableSchema toSchema(Schema schema) { - TableSchema.Builder builder = TableSchema.builder(); - - // Add columns. - for (RowType.RowField field : convert(schema).getFields()) { - builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); - } - - // Add primary key. - Set identifierFieldIds = schema.identifierFieldIds(); - if (!identifierFieldIds.isEmpty()) { - List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); - for (Integer identifierFieldId : identifierFieldIds) { - String columnName = schema.findColumnName(identifierFieldId); - Preconditions.checkNotNull( - columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); - - columns.add(columnName); - } - builder.primaryKey(columns.toArray(new String[0])); - } - - return builder.build(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java deleted file mode 100644 index 6f8bfef2ef44..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
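FlinkSchemaUtil above is the public entry point for these conversions; a short sketch of round-tripping a Flink TableSchema, with arbitrary field names:

```java
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;

public class SchemaConversionSketch {
  public static void main(String[] args) {
    TableSchema flinkSchema =
        TableSchema.builder()
            .field("id", DataTypes.BIGINT().notNull())
            .field("data", DataTypes.STRING())
            .primaryKey("id")
            .build();

    // Flink -> Iceberg: assigns fresh field ids and maps the primary key to identifier fields.
    Schema icebergSchema = FlinkSchemaUtil.convert(flinkSchema);

    // Iceberg -> Flink logical type, and back to a TableSchema with the primary key restored.
    RowType rowType = FlinkSchemaUtil.convert(icebergSchema);
    TableSchema roundTripped = FlinkSchemaUtil.toSchema(icebergSchema);

    System.out.println(rowType);
    System.out.println(roundTripped);
  }
}
```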
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.BigIntType; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.BooleanType; -import org.apache.flink.table.types.logical.CharType; -import org.apache.flink.table.types.logical.DateType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.DoubleType; -import org.apache.flink.table.types.logical.FloatType; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.MultisetType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.SmallIntType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.TinyIntType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -class FlinkTypeToType extends FlinkTypeVisitor { - - private final RowType root; - private int nextId; - - FlinkTypeToType(RowType root) { - this.root = root; - // the root struct's fields use the first ids - this.nextId = root.getFieldCount(); - } - - private int getNextId() { - int next = nextId; - nextId += 1; - return next; - } - - @Override - public Type visit(CharType charType) { - return Types.StringType.get(); - } - - @Override - public Type visit(VarCharType varCharType) { - return Types.StringType.get(); - } - - @Override - public Type visit(BooleanType booleanType) { - return Types.BooleanType.get(); - } - - @Override - public Type visit(BinaryType binaryType) { - return Types.FixedType.ofLength(binaryType.getLength()); - } - - @Override - public Type visit(VarBinaryType varBinaryType) { - return Types.BinaryType.get(); - } - - @Override - public Type visit(DecimalType decimalType) { - return Types.DecimalType.of(decimalType.getPrecision(), decimalType.getScale()); - } - - @Override - public Type visit(TinyIntType tinyIntType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(SmallIntType smallIntType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(IntType intType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(BigIntType bigIntType) { - return Types.LongType.get(); - } - - @Override - public Type visit(FloatType floatType) { - return Types.FloatType.get(); - } - - @Override - public Type visit(DoubleType doubleType) { - return 
Types.DoubleType.get(); - } - - @Override - public Type visit(DateType dateType) { - return Types.DateType.get(); - } - - @Override - public Type visit(TimeType timeType) { - return Types.TimeType.get(); - } - - @Override - public Type visit(TimestampType timestampType) { - return Types.TimestampType.withoutZone(); - } - - @Override - public Type visit(LocalZonedTimestampType localZonedTimestampType) { - return Types.TimestampType.withZone(); - } - - @Override - public Type visit(ArrayType arrayType) { - Type elementType = arrayType.getElementType().accept(this); - if (arrayType.getElementType().isNullable()) { - return Types.ListType.ofOptional(getNextId(), elementType); - } else { - return Types.ListType.ofRequired(getNextId(), elementType); - } - } - - @Override - public Type visit(MultisetType multisetType) { - Type elementType = multisetType.getElementType().accept(this); - return Types.MapType.ofRequired(getNextId(), getNextId(), elementType, Types.IntegerType.get()); - } - - @Override - public Type visit(MapType mapType) { - // keys in map are not allowed to be null. - Type keyType = mapType.getKeyType().accept(this); - Type valueType = mapType.getValueType().accept(this); - if (mapType.getValueType().isNullable()) { - return Types.MapType.ofOptional(getNextId(), getNextId(), keyType, valueType); - } else { - return Types.MapType.ofRequired(getNextId(), getNextId(), keyType, valueType); - } - } - - @Override - @SuppressWarnings("ReferenceEquality") - public Type visit(RowType rowType) { - List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); - boolean isRoot = root == rowType; - - List types = - rowType.getFields().stream() - .map(f -> f.getType().accept(this)) - .collect(Collectors.toList()); - - for (int i = 0; i < rowType.getFieldCount(); i++) { - int id = isRoot ? i : getNextId(); - - RowType.RowField field = rowType.getFields().get(i); - String name = field.getName(); - String comment = field.getDescription().orElse(null); - - if (field.getType().isNullable()) { - newFields.add(Types.NestedField.optional(id, name, types.get(i), comment)); - } else { - newFields.add(Types.NestedField.required(id, name, types.get(i), comment)); - } - } - - return Types.StructType.of(newFields); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java deleted file mode 100644 index f3de2416088c..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import org.apache.flink.table.types.logical.DayTimeIntervalType; -import org.apache.flink.table.types.logical.DistinctType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeVisitor; -import org.apache.flink.table.types.logical.NullType; -import org.apache.flink.table.types.logical.RawType; -import org.apache.flink.table.types.logical.StructuredType; -import org.apache.flink.table.types.logical.SymbolType; -import org.apache.flink.table.types.logical.YearMonthIntervalType; -import org.apache.flink.table.types.logical.ZonedTimestampType; - -public abstract class FlinkTypeVisitor implements LogicalTypeVisitor { - - // ------------------------- Unsupported types ------------------------------ - - @Override - public T visit(ZonedTimestampType zonedTimestampType) { - throw new UnsupportedOperationException("Unsupported ZonedTimestampType."); - } - - @Override - public T visit(YearMonthIntervalType yearMonthIntervalType) { - throw new UnsupportedOperationException("Unsupported YearMonthIntervalType."); - } - - @Override - public T visit(DayTimeIntervalType dayTimeIntervalType) { - throw new UnsupportedOperationException("Unsupported DayTimeIntervalType."); - } - - @Override - public T visit(DistinctType distinctType) { - throw new UnsupportedOperationException("Unsupported DistinctType."); - } - - @Override - public T visit(StructuredType structuredType) { - throw new UnsupportedOperationException("Unsupported StructuredType."); - } - - @Override - public T visit(NullType nullType) { - throw new UnsupportedOperationException("Unsupported NullType."); - } - - @Override - public T visit(RawType rawType) { - throw new UnsupportedOperationException("Unsupported RawType."); - } - - @Override - public T visit(SymbolType symbolType) { - throw new UnsupportedOperationException("Unsupported SymbolType."); - } - - @Override - public T visit(LogicalType other) { - throw new UnsupportedOperationException("Unsupported type: " + other); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java deleted file mode 100644 index bcdf9324cda9..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.util.Map; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; - -/** - * A class for common Iceberg configs for Flink writes. - * - *

<p>If a config is set at multiple levels, the following order of precedence is used (top to - * bottom): - * - * <ol> - *   <li>Write options - *   <li>flink ReadableConfig - *   <li>Table metadata - * </ol> - * - * The most specific value is set in write options and takes precedence over all other configs. If - * no write option is provided, this class checks the flink configuration for any overrides. If no - * applicable value is found in the write options, this class uses the table metadata. - * - *

Note this class is NOT meant to be serialized. - */ -public class FlinkWriteConf { - - private final FlinkConfParser confParser; - - public FlinkWriteConf( - Table table, Map writeOptions, ReadableConfig readableConfig) { - this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); - } - - public boolean overwriteMode() { - return confParser - .booleanConf() - .option(FlinkWriteOptions.OVERWRITE_MODE.key()) - .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) - .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) - .parse(); - } - - public boolean upsertMode() { - return confParser - .booleanConf() - .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) - .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) - .tableProperty(TableProperties.UPSERT_ENABLED) - .defaultValue(TableProperties.UPSERT_ENABLED_DEFAULT) - .parse(); - } - - public FileFormat dataFileFormat() { - String valueAsString = - confParser - .stringConf() - .option(FlinkWriteOptions.WRITE_FORMAT.key()) - .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); - return FileFormat.fromString(valueAsString); - } - - public long targetDataFileSize() { - return confParser - .longConf() - .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) - .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) - .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) - .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) - .parse(); - } - - public DistributionMode distributionMode() { - String modeName = - confParser - .stringConf() - .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) - .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) - .parse(); - return DistributionMode.fromName(modeName); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java deleted file mode 100644 index a3091d5779c7..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; - -/** Flink sink write options */ -public class FlinkWriteOptions { - - private FlinkWriteOptions() {} - - // File format for write operations(default: Table write.format.default ) - public static final ConfigOption WRITE_FORMAT = - ConfigOptions.key("write-format").stringType().noDefaultValue(); - - // Overrides this table's write.target-file-size-bytes - public static final ConfigOption TARGET_FILE_SIZE_BYTES = - ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); - - // Overrides this table's write.upsert.enabled - public static final ConfigOption WRITE_UPSERT_ENABLED = - ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); - - public static final ConfigOption OVERWRITE_MODE = - ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); - - // Overrides the table's write.distribution-mode - public static final ConfigOption DISTRIBUTION_MODE = - ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java deleted file mode 100644 index 2a4a5302300c..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
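To illustrate the precedence described in the FlinkWriteConf javadoc above, a minimal sketch follows (the Table instance is assumed to already exist; the option key is one of the FlinkWriteOptions shown in this diff):

import java.util.Map;
import org.apache.flink.configuration.Configuration;
import org.apache.iceberg.DistributionMode;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.FlinkWriteConf;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

public class WriteConfExample {

  static void inspect(Table table) {
    // 1. Write options: highest precedence, e.g. force ORC for this job only.
    Map<String, String> writeOptions = ImmutableMap.of("write-format", "orc");
    // 2. Flink ReadableConfig: empty here, so it contributes nothing.
    Configuration flinkConf = new Configuration();

    FlinkWriteConf conf = new FlinkWriteConf(table, writeOptions, flinkConf);

    FileFormat format = conf.dataFileFormat();       // ORC, taken from the write option
    long targetSize = conf.targetDataFileSize();     // 3. falls back to write.target-file-size-bytes or its default
    DistributionMode mode = conf.distributionMode(); // falls back to write.distribution-mode, default "none"
    System.out.printf("format=%s targetSize=%d mode=%s%n", format, targetSize, mode);
  }
}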
- */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.constraints.UniqueConstraint; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.sink.DataStreamSinkProvider; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; -import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.flink.sink.FlinkSink; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; - -public class IcebergTableSink implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { - private final TableLoader tableLoader; - private final TableSchema tableSchema; - private final Map writeProps; - - private boolean overwrite = false; - - private IcebergTableSink(IcebergTableSink toCopy) { - this.tableLoader = toCopy.tableLoader; - this.tableSchema = toCopy.tableSchema; - this.overwrite = toCopy.overwrite; - this.writeProps = toCopy.writeProps; - } - - public IcebergTableSink( - TableLoader tableLoader, TableSchema tableSchema, Map writeProps) { - this.tableLoader = tableLoader; - this.tableSchema = tableSchema; - this.writeProps = writeProps; - } - - @Override - public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { - Preconditions.checkState( - !overwrite || context.isBounded(), - "Unbounded data stream doesn't support overwrite operation."); - - List equalityColumns = - tableSchema.getPrimaryKey().map(UniqueConstraint::getColumns).orElseGet(ImmutableList::of); - - return (DataStreamSinkProvider) - dataStream -> - FlinkSink.forRowData(dataStream) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .setAll(writeProps) - .append(); - } - - @Override - public void applyStaticPartition(Map partition) { - // The flink's PartitionFanoutWriter will handle the static partition write policy - // automatically. - } - - @Override - public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { - ChangelogMode.Builder builder = ChangelogMode.newBuilder(); - for (RowKind kind : requestedMode.getContainedKinds()) { - builder.addContainedKind(kind); - } - return builder.build(); - } - - @Override - public DynamicTableSink copy() { - return new IcebergTableSink(this); - } - - @Override - public String asSummaryString() { - return "Iceberg table sink"; - } - - @Override - public void applyOverwrite(boolean newOverwrite) { - this.overwrite = newOverwrite; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java deleted file mode 100644 index 3bd7335f74c5..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
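The getSinkRuntimeProvider method above ultimately delegates to FlinkSink; a minimal sketch of using that builder directly with the DataStream API (the incoming DataStream<RowData> and the table path are placeholders):

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

public class SinkExample {

  static void appendTo(StreamExecutionEnvironment env, DataStream<RowData> rows) throws Exception {
    FlinkSink.forRowData(rows)
        .tableLoader(TableLoader.fromHadoopTable("file:///tmp/warehouse/db/tbl")) // placeholder path
        .overwrite(false) // overwrite is only legal for bounded input, as the precondition above checks
        .append();

    env.execute("append to iceberg");
  }
}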
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.types.DataType; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.source.FlinkSource; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** Flink Iceberg table source. 
*/ -public class IcebergTableSource - implements ScanTableSource, - SupportsProjectionPushDown, - SupportsFilterPushDown, - SupportsLimitPushDown { - - private int[] projectedFields; - private long limit; - private List filters; - - private final TableLoader loader; - private final TableSchema schema; - private final Map properties; - private final boolean isLimitPushDown; - private final ReadableConfig readableConfig; - - private IcebergTableSource(IcebergTableSource toCopy) { - this.loader = toCopy.loader; - this.schema = toCopy.schema; - this.properties = toCopy.properties; - this.projectedFields = toCopy.projectedFields; - this.isLimitPushDown = toCopy.isLimitPushDown; - this.limit = toCopy.limit; - this.filters = toCopy.filters; - this.readableConfig = toCopy.readableConfig; - } - - public IcebergTableSource( - TableLoader loader, - TableSchema schema, - Map properties, - ReadableConfig readableConfig) { - this(loader, schema, properties, null, false, -1, ImmutableList.of(), readableConfig); - } - - private IcebergTableSource( - TableLoader loader, - TableSchema schema, - Map properties, - int[] projectedFields, - boolean isLimitPushDown, - long limit, - List filters, - ReadableConfig readableConfig) { - this.loader = loader; - this.schema = schema; - this.properties = properties; - this.projectedFields = projectedFields; - this.isLimitPushDown = isLimitPushDown; - this.limit = limit; - this.filters = filters; - this.readableConfig = readableConfig; - } - - @Override - public void applyProjection(int[][] projectFields) { - this.projectedFields = new int[projectFields.length]; - for (int i = 0; i < projectFields.length; i++) { - Preconditions.checkArgument( - projectFields[i].length == 1, "Don't support nested projection in iceberg source now."); - this.projectedFields[i] = projectFields[i][0]; - } - } - - private DataStream createDataStream(StreamExecutionEnvironment execEnv) { - return FlinkSource.forRowData() - .env(execEnv) - .tableLoader(loader) - .properties(properties) - .project(getProjectedSchema()) - .limit(limit) - .filters(filters) - .flinkConf(readableConfig) - .build(); - } - - private TableSchema getProjectedSchema() { - if (projectedFields == null) { - return schema; - } else { - String[] fullNames = schema.getFieldNames(); - DataType[] fullTypes = schema.getFieldDataTypes(); - return TableSchema.builder() - .fields( - Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new), - Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)) - .build(); - } - } - - @Override - public void applyLimit(long newLimit) { - this.limit = newLimit; - } - - @Override - public Result applyFilters(List flinkFilters) { - List acceptedFilters = Lists.newArrayList(); - List expressions = Lists.newArrayList(); - - for (ResolvedExpression resolvedExpression : flinkFilters) { - Optional icebergExpression = FlinkFilters.convert(resolvedExpression); - if (icebergExpression.isPresent()) { - expressions.add(icebergExpression.get()); - acceptedFilters.add(resolvedExpression); - } - } - - this.filters = expressions; - return Result.of(acceptedFilters, flinkFilters); - } - - @Override - public boolean supportsNestedProjection() { - // TODO: support nested projection - return false; - } - - @Override - public ChangelogMode getChangelogMode() { - return ChangelogMode.insertOnly(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - return new DataStreamScanProvider() { - @Override - public 
DataStream produceDataStream(StreamExecutionEnvironment execEnv) { - return createDataStream(execEnv); - } - - @Override - public boolean isBounded() { - return FlinkSource.isBounded(properties); - } - }; - } - - @Override - public DynamicTableSource copy() { - return new IcebergTableSource(this); - } - - @Override - public String asSummaryString() { - return "Iceberg table source"; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java deleted file mode 100644 index d4cec7a3e80b..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.lang.reflect.Array; -import java.nio.ByteBuffer; -import java.time.LocalDateTime; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.UUIDUtil; - -public class RowDataWrapper implements StructLike { - - private final LogicalType[] types; - private final PositionalGetter[] getters; - private RowData rowData = null; - - public RowDataWrapper(RowType rowType, Types.StructType struct) { - int size = rowType.getFieldCount(); - - types = (LogicalType[]) Array.newInstance(LogicalType.class, size); - getters = (PositionalGetter[]) Array.newInstance(PositionalGetter.class, size); - - for (int i = 0; i < size; i++) { - types[i] = rowType.getTypeAt(i); - getters[i] = buildGetter(types[i], struct.fields().get(i).type()); - } - } - - public RowDataWrapper wrap(RowData data) { - this.rowData = data; - return this; - } - - @Override - public int size() { - return types.length; - } - - @Override - public T get(int pos, Class javaClass) { - if (rowData.isNullAt(pos)) { - return null; - } else if (getters[pos] != null) { - return javaClass.cast(getters[pos].get(rowData, pos)); - } - - Object value = RowData.createFieldGetter(types[pos], pos).getFieldOrNull(rowData); - return javaClass.cast(value); - } - - @Override - public void set(int pos, T value) { - throw new UnsupportedOperationException( - "Could not set a field in the RowDataWrapper because rowData is read-only"); - } - - private interface 
PositionalGetter { - T get(RowData data, int pos); - } - - private static PositionalGetter buildGetter(LogicalType logicalType, Type type) { - switch (logicalType.getTypeRoot()) { - case TINYINT: - return (row, pos) -> (int) row.getByte(pos); - case SMALLINT: - return (row, pos) -> (int) row.getShort(pos); - case CHAR: - case VARCHAR: - return (row, pos) -> row.getString(pos).toString(); - - case BINARY: - case VARBINARY: - if (Type.TypeID.UUID == type.typeId()) { - return (row, pos) -> UUIDUtil.convert(row.getBinary(pos)); - } else { - return (row, pos) -> ByteBuffer.wrap(row.getBinary(pos)); - } - - case DECIMAL: - DecimalType decimalType = (DecimalType) logicalType; - return (row, pos) -> - row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); - - case TIME_WITHOUT_TIME_ZONE: - // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds - // (Long). - return (row, pos) -> ((long) row.getInt(pos)) * 1_000; - - case TIMESTAMP_WITHOUT_TIME_ZONE: - TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; - - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; - - case ROW: - RowType rowType = (RowType) logicalType; - Types.StructType structType = (Types.StructType) type; - - RowDataWrapper nestedWrapper = new RowDataWrapper(rowType, structType); - return (row, pos) -> nestedWrapper.wrap(row.getRow(pos, rowType.getFieldCount())); - - default: - return null; - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java deleted file mode 100644 index e128badb8461..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
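Correspondingly, IcebergTableSource's createDataStream above wires everything into FlinkSource; a minimal standalone sketch of that builder (the path is a placeholder, and the limit mirrors the push-down applied by the table source):

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

public class SourceExample {

  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    DataStream<RowData> rows =
        FlinkSource.forRowData()
            .env(env)
            .tableLoader(TableLoader.fromHadoopTable("file:///tmp/warehouse/db/tbl")) // placeholder path
            .limit(10L) // same limit push-down the table source applies
            .build();

    rows.print();
    env.execute("scan iceberg");
  }
}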
- */ -package org.apache.iceberg.flink; - -import java.io.Closeable; -import java.io.IOException; -import java.io.Serializable; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -/** - * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in - * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg - * table loader to get the {@link Table} object. - */ -public interface TableLoader extends Closeable, Serializable { - - void open(); - - Table loadTable(); - - static TableLoader fromCatalog(CatalogLoader catalogLoader, TableIdentifier identifier) { - return new CatalogTableLoader(catalogLoader, identifier); - } - - static TableLoader fromHadoopTable(String location) { - return fromHadoopTable(location, FlinkCatalogFactory.clusterHadoopConf()); - } - - static TableLoader fromHadoopTable(String location, Configuration hadoopConf) { - return new HadoopTableLoader(location, hadoopConf); - } - - class HadoopTableLoader implements TableLoader { - - private static final long serialVersionUID = 1L; - - private final String location; - private final SerializableConfiguration hadoopConf; - - private transient HadoopTables tables; - - private HadoopTableLoader(String location, Configuration conf) { - this.location = location; - this.hadoopConf = new SerializableConfiguration(conf); - } - - @Override - public void open() { - tables = new HadoopTables(hadoopConf.get()); - } - - @Override - public Table loadTable() { - return tables.load(location); - } - - @Override - public void close() {} - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("location", location).toString(); - } - } - - class CatalogTableLoader implements TableLoader { - - private static final long serialVersionUID = 1L; - - private final CatalogLoader catalogLoader; - private final String identifier; - - private transient Catalog catalog; - - private CatalogTableLoader(CatalogLoader catalogLoader, TableIdentifier tableIdentifier) { - this.catalogLoader = catalogLoader; - this.identifier = tableIdentifier.toString(); - } - - @Override - public void open() { - catalog = catalogLoader.loadCatalog(); - } - - @Override - public Table loadTable() { - return catalog.loadTable(TableIdentifier.parse(identifier)); - } - - @Override - public void close() throws IOException { - if (catalog instanceof Closeable) { - ((Closeable) catalog).close(); - } - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableIdentifier", identifier) - .add("catalogLoader", catalogLoader) - .toString(); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java deleted file mode 100644 index f8f1b74b1ceb..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
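A minimal sketch of the TableLoader contract described above: open() on the worker, loadTable() to obtain the Table, close() when done (the HDFS location is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

public class LoaderExample {

  public static void main(String[] args) throws Exception {
    TableLoader loader =
        TableLoader.fromHadoopTable("hdfs://namenode:8020/warehouse/db/tbl", new Configuration());

    loader.open(); // creates HadoopTables, typically after the loader has been shipped to a task
    try {
      Table table = loader.loadTable();
      System.out.println(table.schema());
    } finally {
      loader.close();
    }
  }
}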
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.BigIntType; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.BooleanType; -import org.apache.flink.table.types.logical.DateType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.DoubleType; -import org.apache.flink.table.types.logical.FloatType; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -class TypeToFlinkType extends TypeUtil.SchemaVisitor { - TypeToFlinkType() {} - - @Override - public LogicalType schema(Schema schema, LogicalType structType) { - return structType; - } - - @Override - public LogicalType struct(Types.StructType struct, List fieldResults) { - List fields = struct.fields(); - - List flinkFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - LogicalType type = fieldResults.get(i); - RowType.RowField flinkField = - new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); - flinkFields.add(flinkField); - } - - return new RowType(flinkFields); - } - - @Override - public LogicalType field(Types.NestedField field, LogicalType fieldResult) { - return fieldResult; - } - - @Override - public LogicalType list(Types.ListType list, LogicalType elementResult) { - return new ArrayType(elementResult.copy(list.isElementOptional())); - } - - @Override - public LogicalType map(Types.MapType map, LogicalType keyResult, LogicalType valueResult) { - // keys in map are not allowed to be null. - return new MapType(keyResult.copy(false), valueResult.copy(map.isValueOptional())); - } - - @Override - public LogicalType primitive(Type.PrimitiveType primitive) { - switch (primitive.typeId()) { - case BOOLEAN: - return new BooleanType(); - case INTEGER: - return new IntType(); - case LONG: - return new BigIntType(); - case FLOAT: - return new FloatType(); - case DOUBLE: - return new DoubleType(); - case DATE: - return new DateType(); - case TIME: - // For the type: Flink only support TimeType with default precision (second) now. 
The - // precision of time is - // not supported in Flink, so we can think of it as a simple time type directly. - // For the data: Flink uses int that support mills to represent time data, so it supports - // mills precision. - return new TimeType(); - case TIMESTAMP: - Types.TimestampType timestamp = (Types.TimestampType) primitive; - if (timestamp.shouldAdjustToUTC()) { - // MICROS - return new LocalZonedTimestampType(6); - } else { - // MICROS - return new TimestampType(6); - } - case STRING: - return new VarCharType(VarCharType.MAX_LENGTH); - case UUID: - // UUID length is 16 - return new BinaryType(16); - case FIXED: - Types.FixedType fixedType = (Types.FixedType) primitive; - return new BinaryType(fixedType.length()); - case BINARY: - return new VarBinaryType(VarBinaryType.MAX_LENGTH); - case DECIMAL: - Types.DecimalType decimal = (Types.DecimalType) primitive; - return new DecimalType(decimal.precision(), decimal.scale()); - default: - throw new UnsupportedOperationException( - "Cannot convert unknown type to Flink: " + primitive); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java deleted file mode 100644 index 06ac54617ae6..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.actions; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.Table; - -public class Actions { - - public static final Configuration CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. 
- .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private StreamExecutionEnvironment env; - private Table table; - - private Actions(StreamExecutionEnvironment env, Table table) { - this.env = env; - this.table = table; - } - - public static Actions forTable(StreamExecutionEnvironment env, Table table) { - return new Actions(env, table); - } - - public static Actions forTable(Table table) { - return new Actions(StreamExecutionEnvironment.getExecutionEnvironment(CONFIG), table); - } - - public RewriteDataFilesAction rewriteDataFiles() { - return new RewriteDataFilesAction(env, table); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java deleted file mode 100644 index 9876bb3861c4..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.actions; - -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.BaseRewriteDataFilesAction; -import org.apache.iceberg.flink.source.RowDataRewriter; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class RewriteDataFilesAction extends BaseRewriteDataFilesAction { - - private StreamExecutionEnvironment env; - private int maxParallelism; - - public RewriteDataFilesAction(StreamExecutionEnvironment env, Table table) { - super(table); - this.env = env; - this.maxParallelism = env.getParallelism(); - } - - @Override - protected FileIO fileIO() { - return table().io(); - } - - @Override - protected List rewriteDataForTasks(List combinedScanTasks) { - int size = combinedScanTasks.size(); - int parallelism = Math.min(size, maxParallelism); - DataStream dataStream = env.fromCollection(combinedScanTasks); - RowDataRewriter rowDataRewriter = - new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); - try { - return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); - } catch (Exception e) { - throw new RuntimeException("Rewrite data file error.", e); - } - } - - @Override - protected RewriteDataFilesAction self() { - return this; - } - - public RewriteDataFilesAction maxParallelism(int parallelism) { - Preconditions.checkArgument(parallelism > 0, "Invalid max parallelism %s", parallelism); - this.maxParallelism = parallelism; - return this; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java deleted file mode 100644 index 8103224a0b6c..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
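A minimal sketch of invoking the removed Flink rewrite action (the table location is a placeholder; execute() is inherited from BaseRewriteDataFilesAction and is assumed here rather than shown in this diff):

import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.actions.Actions;

public class RewriteExample {

  public static void main(String[] args) throws Exception {
    try (TableLoader loader = TableLoader.fromHadoopTable("file:///tmp/warehouse/db/tbl")) {
      loader.open();
      Table table = loader.loadTable();

      Actions.forTable(table) // builds a local environment with the classloader-leak check disabled
          .rewriteDataFiles()
          .maxParallelism(4)
          .execute(); // assumed: standard BaseRewriteDataFilesAction entry point
    }
  }
}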
- */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeFamily; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.NullType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.Pair; - -public abstract class AvroWithFlinkSchemaVisitor - extends AvroWithPartnerByStructureVisitor { - - @Override - protected boolean isStringType(LogicalType logicalType) { - return logicalType.getTypeRoot().getFamilies().contains(LogicalTypeFamily.CHARACTER_STRING); - } - - @Override - protected boolean isMapType(LogicalType logicalType) { - return logicalType instanceof MapType; - } - - @Override - protected LogicalType arrayElementType(LogicalType arrayType) { - Preconditions.checkArgument( - arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); - return ((ArrayType) arrayType).getElementType(); - } - - @Override - protected LogicalType mapKeyType(LogicalType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).getKeyType(); - } - - @Override - protected LogicalType mapValueType(LogicalType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).getValueType(); - } - - @Override - protected Pair fieldNameAndType(LogicalType structType, int pos) { - Preconditions.checkArgument( - structType instanceof RowType, "Invalid struct: %s is not a struct", structType); - RowType.RowField field = ((RowType) structType).getFields().get(pos); - return Pair.of(field.getName(), field.getType()); - } - - @Override - protected LogicalType nullType() { - return new NullType(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java deleted file mode 100644 index 86404959735a..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import org.apache.avro.LogicalType; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.DatumReader; -import org.apache.avro.io.Decoder; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.avro.AvroSchemaWithTypeVisitor; -import org.apache.iceberg.avro.SupportsRowPosition; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.data.avro.DecoderResolver; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -public class FlinkAvroReader implements DatumReader, SupportsRowPosition { - - private final Schema readSchema; - private final ValueReader reader; - private Schema fileSchema = null; - - public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema) { - this(expectedSchema, readSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public FlinkAvroReader( - org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { - this.readSchema = readSchema; - this.reader = - (ValueReader) - AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); - } - - @Override - public void setSchema(Schema newFileSchema) { - this.fileSchema = Schema.applyAliases(newFileSchema, readSchema); - } - - @Override - public RowData read(RowData reuse, Decoder decoder) throws IOException { - return DecoderResolver.resolveAndRead(decoder, readSchema, fileSchema, reader, reuse); - } - - @Override - public void setRowPositionSupplier(Supplier posSupplier) { - if (reader instanceof SupportsRowPosition) { - ((SupportsRowPosition) reader).setRowPositionSupplier(posSupplier); - } - } - - private static class ReadBuilder extends AvroSchemaWithTypeVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public ValueReader record( - Types.StructType expected, Schema record, List names, List> fields) { - return FlinkValueReaders.struct(fields, expected.asStructType(), idToConstant); - } - - @Override - public ValueReader union(Type expected, Schema union, List> options) { - return ValueReaders.union(options); - } - - @Override - public ValueReader array( - Types.ListType expected, Schema array, ValueReader elementReader) { - return FlinkValueReaders.array(elementReader); - } - - @Override - public ValueReader map( - Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { - return FlinkValueReaders.arrayMap(keyReader, valueReader); - } - - @Override - public ValueReader map(Types.MapType expected, Schema map, ValueReader valueReader) { - return FlinkValueReaders.map(FlinkValueReaders.strings(), valueReader); - } - - @Override - public ValueReader primitive(Type.PrimitiveType expected, Schema primitive) { - LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - return ValueReaders.ints(); - - case "time-micros": - return FlinkValueReaders.timeMicros(); - - case "timestamp-millis": - return FlinkValueReaders.timestampMills(); - - case "timestamp-micros": - return FlinkValueReaders.timestampMicros(); - - case "decimal": - LogicalTypes.Decimal decimal = 
(LogicalTypes.Decimal) logicalType; - return FlinkValueReaders.decimal( - ValueReaders.decimalBytesReader(primitive), - decimal.getPrecision(), - decimal.getScale()); - - case "uuid": - return FlinkValueReaders.uuids(); - - default: - throw new IllegalArgumentException("Unknown logical type: " + logicalType); - } - } - - switch (primitive.getType()) { - case NULL: - return ValueReaders.nulls(); - case BOOLEAN: - return ValueReaders.booleans(); - case INT: - return ValueReaders.ints(); - case LONG: - return ValueReaders.longs(); - case FLOAT: - return ValueReaders.floats(); - case DOUBLE: - return ValueReaders.doubles(); - case STRING: - return FlinkValueReaders.strings(); - case FIXED: - return ValueReaders.fixed(primitive.getFixedSize()); - case BYTES: - return ValueReaders.bytes(); - case ENUM: - return FlinkValueReaders.enums(primitive.getEnumSymbols()); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java deleted file mode 100644 index 873e65783119..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
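For context, FlinkAvroReader is normally plugged into Iceberg's generic Avro read builder. A hedged sketch under that assumption follows (Avro.read, project and createReaderFunc are assumed from Iceberg's core Avro API and are not part of this diff; the file argument is a placeholder):

import java.io.File;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.flink.data.FlinkAvroReader;
import org.apache.iceberg.io.CloseableIterable;

public class AvroReadExample {

  static void readAll(Schema projection, File avroFile) throws Exception {
    try (CloseableIterable<RowData> rows =
        Avro.read(Files.localInput(avroFile))
            .project(projection)
            // the builder hands the file's Avro schema to the reader function
            .createReaderFunc(fileSchema -> new FlinkAvroReader(projection, fileSchema))
            .build()) {
      for (RowData row : rows) {
        System.out.println(row);
      }
    }
  }
}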
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.Encoder; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.avro.MetricsAwareDatumWriter; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.avro.ValueWriters; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class FlinkAvroWriter implements MetricsAwareDatumWriter { - private final RowType rowType; - private ValueWriter writer = null; - - public FlinkAvroWriter(RowType rowType) { - this.rowType = rowType; - } - - @Override - @SuppressWarnings("unchecked") - public void setSchema(Schema schema) { - this.writer = - (ValueWriter) - AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); - } - - @Override - public void write(RowData datum, Encoder out) throws IOException { - writer.write(datum, out); - } - - @Override - public Stream metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { - @Override - public ValueWriter record( - LogicalType struct, Schema record, List names, List> fields) { - return FlinkValueWriters.row( - fields, - IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()) - .collect(Collectors.toList())); - } - - @Override - public ValueWriter union(LogicalType type, Schema union, List> options) { - Preconditions.checkArgument( - options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", - union); - Preconditions.checkArgument( - options.size() == 2, "Cannot create writer for non-option union: %s", union); - if (union.getTypes().get(0).getType() == Schema.Type.NULL) { - return ValueWriters.option(0, options.get(1)); - } else { - return ValueWriters.option(1, options.get(0)); - } - } - - @Override - public ValueWriter array(LogicalType sArray, Schema array, ValueWriter elementWriter) { - return FlinkValueWriters.array(elementWriter, arrayElementType(sArray)); - } - - @Override - public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { - return FlinkValueWriters.map( - FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); - } - - @Override - public ValueWriter map( - LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return FlinkValueWriters.arrayMap( - keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); - } - - @Override - public ValueWriter primitive(LogicalType type, Schema primitive) { - org.apache.avro.LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - return ValueWriters.ints(); - - case "time-micros": - return FlinkValueWriters.timeMicros(); - - case "timestamp-micros": - return FlinkValueWriters.timestampMicros(); - - case "decimal": - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; - return FlinkValueWriters.decimal(decimal.getPrecision(), decimal.getScale()); - - case "uuid": - return ValueWriters.uuids(); - - default: - throw new IllegalArgumentException("Unsupported logical type: " + logicalType); - } - } - - 
switch (primitive.getType()) { - case NULL: - return ValueWriters.nulls(); - case BOOLEAN: - return ValueWriters.booleans(); - case INT: - switch (type.getTypeRoot()) { - case TINYINT: - return ValueWriters.tinyints(); - case SMALLINT: - return ValueWriters.shorts(); - default: - return ValueWriters.ints(); - } - case LONG: - return ValueWriters.longs(); - case FLOAT: - return ValueWriters.floats(); - case DOUBLE: - return ValueWriters.doubles(); - case STRING: - return FlinkValueWriters.strings(); - case FIXED: - return ValueWriters.fixed(primitive.getFixedSize()); - case BYTES: - return ValueWriters.bytes(); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java deleted file mode 100644 index 65b9d44ad4b8..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.orc.OrcRowReader; -import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.ql.exec.vector.StructColumnVector; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; - -public class FlinkOrcReader implements OrcRowReader { - private final OrcValueReader reader; - - public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { - this(iSchema, readSchema, ImmutableMap.of()); - } - - public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { - this.reader = - OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); - } - - @Override - public RowData read(VectorizedRowBatch batch, int row) { - return (RowData) reader.read(new StructColumnVector(batch.size, batch.cols), row); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - reader.setBatchContext(batchOffsetInFile); - } - - private static class ReadBuilder extends OrcSchemaWithTypeVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public OrcValueReader record( - Types.StructType iStruct, - TypeDescription record, - List names, - List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); - } - - @Override - public OrcValueReader list( - Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { - return FlinkOrcReaders.array(elementReader); - } - - @Override - public OrcValueReader map( - Types.MapType iMap, - TypeDescription map, - OrcValueReader keyReader, - OrcValueReader valueReader) { - return FlinkOrcReaders.map(keyReader, valueReader); - } - - @Override - public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { - switch (iPrimitive.typeId()) { - case BOOLEAN: - return OrcValueReaders.booleans(); - case INTEGER: - return OrcValueReaders.ints(); - case LONG: - return OrcValueReaders.longs(); - case FLOAT: - return OrcValueReaders.floats(); - case DOUBLE: - return OrcValueReaders.doubles(); - case DATE: - return FlinkOrcReaders.dates(); - case TIME: - return FlinkOrcReaders.times(); - case TIMESTAMP: - Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; - if (timestampType.shouldAdjustToUTC()) { - return FlinkOrcReaders.timestampTzs(); - } else { - return FlinkOrcReaders.timestamps(); - } - case STRING: - return FlinkOrcReaders.strings(); - case UUID: - case FIXED: - case BINARY: - return OrcValueReaders.bytes(); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; - return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); - default: - throw new IllegalArgumentException( - String.format( - "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); - } - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java 
b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java deleted file mode 100644 index 7a4a15c7e600..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.LongColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; -import org.apache.orc.storage.serde2.io.HiveDecimalWritable; - -class FlinkOrcReaders { - private FlinkOrcReaders() {} - - static OrcValueReader strings() { - return StringReader.INSTANCE; - } - - static OrcValueReader dates() { - return DateReader.INSTANCE; - } - - static OrcValueReader decimals(int precision, int scale) { - if (precision <= 18) { - return new Decimal18Reader(precision, scale); - } else if (precision <= 38) { - return new Decimal38Reader(precision, scale); - } else { - throw new IllegalArgumentException("Invalid precision: " + precision); - } - } - - static OrcValueReader times() { - return TimeReader.INSTANCE; - } - - static OrcValueReader timestamps() { - return TimestampReader.INSTANCE; - } - - static OrcValueReader timestampTzs() { - return TimestampTzReader.INSTANCE; - } - - static OrcValueReader array(OrcValueReader elementReader) { - return new ArrayReader<>(elementReader); - } - - public static OrcValueReader map( - OrcValueReader keyReader, OrcValueReader valueReader) { - 
return new MapReader<>(keyReader, valueReader); - } - - public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - private static class StringReader implements OrcValueReader { - private static final StringReader INSTANCE = new StringReader(); - - @Override - public StringData nonNullRead(ColumnVector vector, int row) { - BytesColumnVector bytesVector = (BytesColumnVector) vector; - return StringData.fromBytes( - bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - } - } - - private static class DateReader implements OrcValueReader { - private static final DateReader INSTANCE = new DateReader(); - - @Override - public Integer nonNullRead(ColumnVector vector, int row) { - return (int) ((LongColumnVector) vector).vector[row]; - } - } - - private static class Decimal18Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal18Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData nonNullRead(ColumnVector vector, int row) { - HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - - // The hive ORC writer may will adjust the scale of decimal data. - Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); - } - } - - private static class Decimal38Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal38Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData nonNullRead(ColumnVector vector, int row) { - BigDecimal value = - ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return DecimalData.fromBigDecimal(value, precision, scale); - } - } - - private static class TimeReader implements OrcValueReader { - private static final TimeReader INSTANCE = new TimeReader(); - - @Override - public Integer nonNullRead(ColumnVector vector, int row) { - long micros = ((LongColumnVector) vector).vector[row]; - // Flink only support time mills, just erase micros. 
- return (int) (micros / 1000); - } - } - - private static class TimestampReader implements OrcValueReader { - private static final TimestampReader INSTANCE = new TimestampReader(); - - @Override - public TimestampData nonNullRead(ColumnVector vector, int row) { - TimestampColumnVector tcv = (TimestampColumnVector) vector; - LocalDateTime localDate = - Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime(); - return TimestampData.fromLocalDateTime(localDate); - } - } - - private static class TimestampTzReader implements OrcValueReader { - private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - - @Override - public TimestampData nonNullRead(ColumnVector vector, int row) { - TimestampColumnVector tcv = (TimestampColumnVector) vector; - Instant instant = - Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toInstant(); - return TimestampData.fromInstant(instant); - } - } - - private static class ArrayReader implements OrcValueReader { - private final OrcValueReader elementReader; - - private ArrayReader(OrcValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public ArrayData nonNullRead(ColumnVector vector, int row) { - ListColumnVector listVector = (ListColumnVector) vector; - int offset = (int) listVector.offsets[row]; - int length = (int) listVector.lengths[row]; - List elements = Lists.newArrayListWithExpectedSize(length); - for (int c = 0; c < length; ++c) { - elements.add(elementReader.read(listVector.child, offset + c)); - } - return new GenericArrayData(elements.toArray()); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - elementReader.setBatchContext(batchOffsetInFile); - } - } - - private static class MapReader implements OrcValueReader { - private final OrcValueReader keyReader; - private final OrcValueReader valueReader; - - private MapReader(OrcValueReader keyReader, OrcValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData nonNullRead(ColumnVector vector, int row) { - MapColumnVector mapVector = (MapColumnVector) vector; - int offset = (int) mapVector.offsets[row]; - long length = mapVector.lengths[row]; - - Map map = Maps.newHashMap(); - for (int c = 0; c < length; c++) { - K key = keyReader.read(mapVector.keys, offset + c); - V value = valueReader.read(mapVector.values, offset + c); - map.put(key, value); - } - - return new GenericMapData(map); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - keyReader.setBatchContext(batchOffsetInFile); - valueReader.setBatchContext(batchOffsetInFile); - } - } - - private static class StructReader extends OrcValueReaders.StructReader { - private final int numFields; - - StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = struct.fields().size(); - } - - @Override - protected RowData create() { - return new GenericRowData(numFields); - } - - @Override - protected void set(RowData struct, int pos, Object value) { - ((GenericRowData) struct).setField(pos, value); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java deleted file mode 100644 index 6a31accffd22..000000000000 --- 
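Note on the ORC readers deleted above: TimestampReader and TimestampTzReader rebuild a timestamp from TimestampColumnVector.time[row] (epoch milliseconds) and nanos[row] (nanosecond-of-second), using Math.floorDiv so that pre-1970 values keep the correct seconds component. A minimal standalone sketch of that recombination, using only the JDK and hypothetical names (not part of the deleted sources):

import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneOffset;

class OrcTimestampDecodeSketch {
  // timeMillis and nanosOfSecond stand in for TimestampColumnVector.time[row] / nanos[row]
  static LocalDateTime decode(long timeMillis, int nanosOfSecond) {
    // floorDiv rounds toward negative infinity, so the seconds component stays
    // correct for values before the epoch where plain division rounds toward zero
    long epochSecond = Math.floorDiv(timeMillis, 1_000L);
    return Instant.ofEpochSecond(epochSecond, nanosOfSecond)
        .atOffset(ZoneOffset.UTC)
        .toLocalDateTime();
  }

  public static void main(String[] args) {
    System.out.println(decode(-1_500L, 500_000_000)); // 1969-12-31T23:59:58.500
  }
}

In the deleted classes, TimestampReader then hands back a LocalDateTime-based TimestampData, while TimestampTzReader keeps the Instant.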
a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Deque; -import java.util.List; -import java.util.stream.Stream; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.orc.GenericOrcWriters; -import org.apache.iceberg.orc.OrcRowWriter; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; - -public class FlinkOrcWriter implements OrcRowWriter { - private final FlinkOrcWriters.RowDataWriter writer; - - private FlinkOrcWriter(RowType rowType, Schema iSchema) { - this.writer = - (FlinkOrcWriters.RowDataWriter) - FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); - } - - public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { - return new FlinkOrcWriter(rowType, iSchema); - } - - @Override - public void write(RowData row, VectorizedRowBatch output) { - Preconditions.checkArgument(row != null, "value must not be null"); - writer.writeRow(row, output); - } - - @Override - public List> writers() { - return writer.writers(); - } - - @Override - public Stream> metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends FlinkSchemaVisitor> { - private final Deque fieldIds = Lists.newLinkedList(); - - private WriteBuilder() {} - - @Override - public void beforeField(Types.NestedField field) { - fieldIds.push(field.fieldId()); - } - - @Override - public void afterField(Types.NestedField field) { - fieldIds.pop(); - } - - @Override - public OrcValueWriter record( - Types.StructType iStruct, List> results, List fieldType) { - return FlinkOrcWriters.struct(results, fieldType); - } - - @Override - public OrcValueWriter map( - Types.MapType iMap, - OrcValueWriter key, - OrcValueWriter value, - LogicalType keyType, - LogicalType valueType) { - return FlinkOrcWriters.map(key, value, keyType, valueType); - } - - @Override - public OrcValueWriter list( - Types.ListType iList, OrcValueWriter element, LogicalType elementType) { - return FlinkOrcWriters.list(element, elementType); - } - - @Override - public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { - switch (iPrimitive.typeId()) { - case BOOLEAN: - 
return GenericOrcWriters.booleans(); - case INTEGER: - switch (flinkPrimitive.getTypeRoot()) { - case TINYINT: - return GenericOrcWriters.bytes(); - case SMALLINT: - return GenericOrcWriters.shorts(); - } - return GenericOrcWriters.ints(); - case LONG: - return GenericOrcWriters.longs(); - case FLOAT: - Preconditions.checkArgument( - fieldIds.peek() != null, - String.format( - "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " - + "information is not properly pushed during schema visiting.", - iPrimitive)); - return GenericOrcWriters.floats(fieldIds.peek()); - case DOUBLE: - Preconditions.checkArgument( - fieldIds.peek() != null, - String.format( - "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " - + "information is not properly pushed during schema visiting.", - iPrimitive)); - return GenericOrcWriters.doubles(fieldIds.peek()); - case DATE: - return FlinkOrcWriters.dates(); - case TIME: - return FlinkOrcWriters.times(); - case TIMESTAMP: - Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; - if (timestampType.shouldAdjustToUTC()) { - return FlinkOrcWriters.timestampTzs(); - } else { - return FlinkOrcWriters.timestamps(); - } - case STRING: - return FlinkOrcWriters.strings(); - case UUID: - case FIXED: - case BINARY: - return GenericOrcWriters.byteArrays(); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; - return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); - default: - throw new IllegalArgumentException( - String.format( - "Invalid iceberg type %s corresponding to Flink logical type %s", - iPrimitive, flinkPrimitive)); - } - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java deleted file mode 100644 index da2f95cf822f..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
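Note on the FlinkOrcWriter.WriteBuilder deleted above: it pushes each field id onto a Deque in beforeField and pops it in afterField, so that the FLOAT and DOUBLE branches can pass the enclosing field id to the metrics-collecting writers; the precondition guards against a missing id. A small standalone sketch of that push/pop bookkeeping (illustrative names, plain JDK):

import java.util.ArrayDeque;
import java.util.Deque;

class FieldIdTrackingSketch {
  private final Deque<Integer> fieldIds = new ArrayDeque<>();

  void beforeField(int fieldId) {
    fieldIds.push(fieldId);          // entering a field: remember its id
  }

  void afterField(int fieldId) {
    fieldIds.pop();                  // leaving the field: discard it again
  }

  Integer currentFieldId() {
    return fieldIds.peek();          // id of the field currently being visited, or null
  }

  public static void main(String[] args) {
    FieldIdTrackingSketch visitor = new FieldIdTrackingSketch();
    visitor.beforeField(1);                           // outer struct field
    visitor.beforeField(3);                           // nested float field
    System.out.println(visitor.currentFieldId());     // 3
    visitor.afterField(3);
    visitor.afterField(1);
  }
}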
- */ -package org.apache.iceberg.flink.data; - -import java.time.Instant; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.stream.Stream; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.data.orc.GenericOrcWriters; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.orc.storage.common.type.HiveDecimal; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.LongColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; - -class FlinkOrcWriters { - - private FlinkOrcWriters() {} - - static OrcValueWriter strings() { - return StringWriter.INSTANCE; - } - - static OrcValueWriter dates() { - return DateWriter.INSTANCE; - } - - static OrcValueWriter times() { - return TimeWriter.INSTANCE; - } - - static OrcValueWriter timestamps() { - return TimestampWriter.INSTANCE; - } - - static OrcValueWriter timestampTzs() { - return TimestampTzWriter.INSTANCE; - } - - static OrcValueWriter decimals(int precision, int scale) { - if (precision <= 18) { - return new Decimal18Writer(precision, scale); - } else if (precision <= 38) { - return new Decimal38Writer(precision, scale); - } else { - throw new IllegalArgumentException("Invalid precision: " + precision); - } - } - - static OrcValueWriter list( - OrcValueWriter elementWriter, LogicalType elementType) { - return new ListWriter<>(elementWriter, elementType); - } - - static OrcValueWriter map( - OrcValueWriter keyWriter, - OrcValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); - } - - static OrcValueWriter struct(List> writers, List types) { - return new RowDataWriter(writers, types); - } - - private static class StringWriter implements OrcValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - @Override - public void nonNullWrite(int rowId, StringData data, ColumnVector output) { - byte[] value = data.toBytes(); - ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); - } - } - - private static class DateWriter implements OrcValueWriter { - private static final DateWriter INSTANCE = new DateWriter(); - - @Override - public void nonNullWrite(int rowId, Integer data, ColumnVector output) { - ((LongColumnVector) output).vector[rowId] = data; - } - } - - private static class TimeWriter implements OrcValueWriter { - private static final TimeWriter INSTANCE = new TimeWriter(); - - @Override - public void nonNullWrite(int rowId, Integer millis, ColumnVector output) { - // The time in flink is in millisecond, while the standard time in iceberg is microsecond. - // So we need to transform it to microsecond. 
- ((LongColumnVector) output).vector[rowId] = millis * 1000L; - } - } - - private static class TimestampWriter implements OrcValueWriter { - private static final TimestampWriter INSTANCE = new TimestampWriter(); - - @Override - public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { - TimestampColumnVector cv = (TimestampColumnVector) output; - cv.setIsUTC(true); - // millis - OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); - cv.time[rowId] = - offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; - // truncate nanos to only keep microsecond precision. - cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; - } - } - - private static class TimestampTzWriter implements OrcValueWriter { - private static final TimestampTzWriter INSTANCE = new TimestampTzWriter(); - - @SuppressWarnings("JavaInstantGetSecondsGetNano") - @Override - public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { - TimestampColumnVector cv = (TimestampColumnVector) output; - // millis - Instant instant = data.toInstant(); - cv.time[rowId] = instant.toEpochMilli(); - // truncate nanos to only keep microsecond precision. - cv.nanos[rowId] = (instant.getNano() / 1_000) * 1_000; - } - } - - private static class Decimal18Writer implements OrcValueWriter { - private final int precision; - private final int scale; - - Decimal18Writer(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument( - scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - data); - Preconditions.checkArgument( - data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - data); - - ((DecimalColumnVector) output) - .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); - } - } - - private static class Decimal38Writer implements OrcValueWriter { - private final int precision; - private final int scale; - - Decimal38Writer(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument( - scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - data); - Preconditions.checkArgument( - data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - data); - - ((DecimalColumnVector) output) - .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); - } - } - - static class ListWriter implements OrcValueWriter { - private final OrcValueWriter elementWriter; - private final ArrayData.ElementGetter elementGetter; - - ListWriter(OrcValueWriter elementWriter, LogicalType elementType) { - this.elementWriter = elementWriter; - this.elementGetter = ArrayData.createElementGetter(elementType); - } - - @Override - @SuppressWarnings("unchecked") - public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { - ListColumnVector cv = (ListColumnVector) output; - cv.lengths[rowId] = data.size(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough. 
- growColumnVector(cv.child, cv.childCount); - - for (int e = 0; e < cv.lengths[rowId]; ++e) { - Object value = elementGetter.getElementOrNull(data, e); - elementWriter.write((int) (e + cv.offsets[rowId]), (T) value, cv.child); - } - } - - @Override - public Stream> metrics() { - return elementWriter.metrics(); - } - } - - static class MapWriter implements OrcValueWriter { - private final OrcValueWriter keyWriter; - private final OrcValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - MapWriter( - OrcValueWriter keyWriter, - OrcValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.valueWriter = valueWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void nonNullWrite(int rowId, MapData data, ColumnVector output) { - MapColumnVector cv = (MapColumnVector) output; - ArrayData keyArray = data.keyArray(); - ArrayData valArray = data.valueArray(); - - // record the length and start of the list elements - cv.lengths[rowId] = data.size(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough - growColumnVector(cv.keys, cv.childCount); - growColumnVector(cv.values, cv.childCount); - // Add each element - for (int e = 0; e < cv.lengths[rowId]; ++e) { - int pos = (int) (e + cv.offsets[rowId]); - keyWriter.write(pos, (K) keyGetter.getElementOrNull(keyArray, e), cv.keys); - valueWriter.write(pos, (V) valueGetter.getElementOrNull(valArray, e), cv.values); - } - } - - @Override - public Stream> metrics() { - return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); - } - } - - static class RowDataWriter extends GenericOrcWriters.StructWriter { - private final List fieldGetters; - - RowDataWriter(List> writers, List types) { - super(writers); - - this.fieldGetters = Lists.newArrayListWithExpectedSize(types.size()); - for (int i = 0; i < types.size(); i++) { - fieldGetters.add(RowData.createFieldGetter(types.get(i), i)); - } - } - - @Override - protected Object get(RowData struct, int index) { - return fieldGetters.get(index).getFieldOrNull(struct); - } - } - - private static void growColumnVector(ColumnVector cv, int requestedSize) { - if (cv.isNull.length < requestedSize) { - // Use growth factor of 3 to avoid frequent array allocations - cv.ensureSize(requestedSize * 3, true); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java deleted file mode 100644 index 2b21d77b70e0..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java +++ /dev/null @@ -1,823 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
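Note on the unit handling in the ORC writers deleted above: Flink TIME values are milliseconds while Iceberg stores microseconds, so TimeWriter multiplies by 1000, and the timestamp writers truncate nanoseconds to microsecond precision before filling the TimestampColumnVector. A standalone sketch of those two conversions (plain JDK, illustrative names):

import java.time.Instant;

class TimeUnitSketch {
  static long timeMillisToMicros(int millis) {
    return millis * 1000L;                          // TIME: Flink millis -> Iceberg micros
  }

  static int truncateNanosToMicros(Instant instant) {
    return (instant.getNano() / 1_000) * 1_000;     // drop sub-microsecond digits
  }

  public static void main(String[] args) {
    System.out.println(timeMillisToMicros(1_234));  // 1234000
    System.out.println(truncateNanosToMicros(Instant.ofEpochSecond(0, 123_456_789))); // 123456000
  }
}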
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.TypeWithSchemaVisitor; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class FlinkParquetReaders { - private FlinkParquetReaders() {} - - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema) { - return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - return (ParquetValueReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); - } - - private static class ReadBuilder extends TypeWithSchemaVisitor> { - private final MessageType type; - private final Map idToConstant; - - ReadBuilder(MessageType type, Map idToConstant) { - this.type = type; - this.idToConstant = idToConstant; - } - - @Override - public ParquetValueReader message( - Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); - } - - @Override - public ParquetValueReader struct( - Types.StructType expected, GroupType struct, List> fieldReaders) { - // match the expected struct's order - Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - List fields = struct.getFields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i); - if (fieldReaders.get(i) != null) { - int fieldD = 
type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; - if (fieldType.getId() != null) { - int id = fieldType.getId().intValue(); - readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - typesById.put(id, fieldType); - } - } - } - - List expectedFields = - expected != null ? expected.fields() : ImmutableList.of(); - List> reorderedFields = - Lists.newArrayListWithExpectedSize(expectedFields.size()); - List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); - for (Types.NestedField field : expectedFields) { - int id = field.fieldId(); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - reorderedFields.add(ParquetValueReaders.constant(idToConstant.get(id))); - types.add(null); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - types.add(null); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - types.add(null); - } else { - ParquetValueReader reader = readersById.get(id); - if (reader != null) { - reorderedFields.add(reader); - types.add(typesById.get(id)); - } else { - reorderedFields.add(ParquetValueReaders.nulls()); - types.add(null); - } - } - } - - return new RowDataReader(types, reorderedFields); - } - - @Override - public ParquetValueReader list( - Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { - if (expectedList == null) { - return null; - } - - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type elementType = repeated.getType(0); - int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - - return new ArrayReader<>( - repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); - } - - @Override - public ParquetValueReader map( - Types.MapType expectedMap, - GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - if (expectedMap == null) { - return null; - } - - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type keyType = repeatedKeyValue.getType(0); - int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; - Type valueType = repeatedKeyValue.getType(1); - int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - - return new MapReader<>( - repeatedD, - repeatedR, - ParquetValueReaders.option(keyType, keyD, keyReader), - ParquetValueReaders.option(valueType, valueD, valueReader)); - } - - @Override - @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive( - org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { - if (expected == null) { - return null; - } - - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return new StringReader(desc); - case INT_8: - case INT_16: - case INT_32: - if (expected.typeId() == Types.LongType.get().typeId()) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new 
ParquetValueReaders.UnboxedReader<>(desc); - } - case TIME_MICROS: - return new LossyMicrosToMillisTimeReader(desc); - case TIME_MILLIS: - return new MillisTimeReader(desc); - case DATE: - case INT_64: - return new ParquetValueReaders.UnboxedReader<>(desc); - case TIMESTAMP_MICROS: - if (((Types.TimestampType) expected).shouldAdjustToUTC()) { - return new MicrosToTimestampTzReader(desc); - } else { - return new MicrosToTimestampReader(desc); - } - case TIMESTAMP_MILLIS: - if (((Types.TimestampType) expected).shouldAdjustToUTC()) { - return new MillisToTimestampTzReader(desc); - } else { - return new MillisToTimestampReader(desc); - } - case DECIMAL: - DecimalLogicalTypeAnnotation decimal = - (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); - switch (primitive.getPrimitiveTypeName()) { - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return new BinaryDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - case INT64: - return new LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - case INT32: - return new IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return new ParquetValueReaders.ByteArrayReader(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return new ParquetValueReaders.ByteArrayReader(desc); - case INT32: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case FLOAT: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { - return new ParquetValueReaders.FloatAsDoubleReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case BOOLEAN: - case INT64: - case DOUBLE: - return new ParquetValueReaders.UnboxedReader<>(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static class BinaryDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - Binary binary = column.nextBinary(); - BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); - // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader - return DecimalData.fromBigDecimal(bigDecimal, precision, scale); - } - } - - private static class IntegerDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); - } - } - - private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - LongDecimalReader(ColumnDescriptor desc, int precision, int 
scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); - } - } - - private static class MicrosToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MicrosToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromInstant( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000)); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromEpochMillis(millis); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class StringReader extends ParquetValueReaders.PrimitiveReader { - StringReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public StringData read(StringData ignored) { - Binary binary = column.nextBinary(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - return StringData.fromBytes( - buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); - } else { - return StringData.fromBytes(binary.getBytes()); - } - } - } - - private static class LossyMicrosToMillisTimeReader - extends ParquetValueReaders.PrimitiveReader { - LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - // Discard microseconds since Flink uses millisecond unit for TIME type. 
- return (int) Math.floorDiv(column.nextLong(), 1000); - } - } - - private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { - MillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - return (int) column.nextLong(); - } - } - - private static class ArrayReader - extends ParquetValueReaders.RepeatedReader { - private int readPos = 0; - private int writePos = 0; - - ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { - super(definitionLevel, repetitionLevel, reader); - } - - @Override - protected ReusableArrayData newListData(ArrayData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableArrayData) { - return (ReusableArrayData) reuse; - } else { - return new ReusableArrayData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected E getElement(ReusableArrayData list) { - E value = null; - if (readPos < list.capacity()) { - value = (E) list.values[readPos]; - } - - readPos += 1; - - return value; - } - - @Override - protected void addElement(ReusableArrayData reused, E element) { - if (writePos >= reused.capacity()) { - reused.grow(); - } - - reused.values[writePos] = element; - - writePos += 1; - } - - @Override - protected ArrayData buildList(ReusableArrayData list) { - // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk - // around it. - // Revert this to use ReusableArrayData once it is fixed in Flink. - // For your reference, https://issues.apache.org/jira/browse/FLINK-25238. - return new GenericArrayData(Arrays.copyOf(list.values, writePos)); - } - } - - private static class MapReader - extends ParquetValueReaders.RepeatedKeyValueReader { - private int readPos = 0; - private int writePos = 0; - - private final ParquetValueReaders.ReusableEntry entry = - new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = - new ParquetValueReaders.ReusableEntry<>(); - - MapReader( - int definitionLevel, - int repetitionLevel, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - super(definitionLevel, repetitionLevel, keyReader, valueReader); - } - - @Override - protected ReusableMapData newMapData(MapData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableMapData) { - return (ReusableMapData) reuse; - } else { - return new ReusableMapData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected Map.Entry getPair(ReusableMapData map) { - Map.Entry kv = nullEntry; - if (readPos < map.capacity()) { - entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); - kv = entry; - } - - readPos += 1; - - return kv; - } - - @Override - protected void addPair(ReusableMapData map, K key, V value) { - if (writePos >= map.capacity()) { - map.grow(); - } - - map.keys.values[writePos] = key; - map.values.values[writePos] = value; - - writePos += 1; - } - - @Override - protected MapData buildMap(ReusableMapData map) { - map.setNumElements(writePos); - return map; - } - } - - private static class RowDataReader - extends ParquetValueReaders.StructReader { - private final int numFields; - - RowDataReader(List types, List> readers) { - super(types, readers); - this.numFields = readers.size(); - } - - @Override - protected GenericRowData newStructData(RowData reuse) { - if (reuse instanceof GenericRowData) { - return (GenericRowData) reuse; - } else { - return new GenericRowData(numFields); - } - } - 
- @Override - protected Object getField(GenericRowData intermediate, int pos) { - return intermediate.getField(pos); - } - - @Override - protected RowData buildStruct(GenericRowData struct) { - return struct; - } - - @Override - protected void set(GenericRowData row, int pos, Object value) { - row.setField(pos, value); - } - - @Override - protected void setNull(GenericRowData row, int pos) { - row.setField(pos, null); - } - - @Override - protected void setBoolean(GenericRowData row, int pos, boolean value) { - row.setField(pos, value); - } - - @Override - protected void setInteger(GenericRowData row, int pos, int value) { - row.setField(pos, value); - } - - @Override - protected void setLong(GenericRowData row, int pos, long value) { - row.setField(pos, value); - } - - @Override - protected void setFloat(GenericRowData row, int pos, float value) { - row.setField(pos, value); - } - - @Override - protected void setDouble(GenericRowData row, int pos, double value) { - row.setField(pos, value); - } - } - - private static class ReusableMapData implements MapData { - private final ReusableArrayData keys; - private final ReusableArrayData values; - - private int numElements; - - private ReusableMapData() { - this.keys = new ReusableArrayData(); - this.values = new ReusableArrayData(); - } - - private void grow() { - keys.grow(); - values.grow(); - } - - private int capacity() { - return keys.capacity(); - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - keys.setNumElements(numElements); - values.setNumElements(numElements); - } - - @Override - public int size() { - return numElements; - } - - @Override - public ReusableArrayData keyArray() { - return keys; - } - - @Override - public ReusableArrayData valueArray() { - return values; - } - } - - private static class ReusableArrayData implements ArrayData { - private static final Object[] EMPTY = new Object[0]; - - private Object[] values = EMPTY; - private int numElements = 0; - - private void grow() { - if (values.length == 0) { - this.values = new Object[20]; - } else { - Object[] old = values; - this.values = new Object[old.length << 1]; - // copy the old array in case it has values that can be reused - System.arraycopy(old, 0, values, 0, old.length); - } - } - - private int capacity() { - return values.length; - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - } - - @Override - public int size() { - return numElements; - } - - @Override - public boolean isNullAt(int ordinal) { - return null == values[ordinal]; - } - - @Override - public boolean getBoolean(int ordinal) { - return (boolean) values[ordinal]; - } - - @Override - public byte getByte(int ordinal) { - return (byte) values[ordinal]; - } - - @Override - public short getShort(int ordinal) { - return (short) values[ordinal]; - } - - @Override - public int getInt(int ordinal) { - return (int) values[ordinal]; - } - - @Override - public long getLong(int ordinal) { - return (long) values[ordinal]; - } - - @Override - public float getFloat(int ordinal) { - return (float) values[ordinal]; - } - - @Override - public double getDouble(int ordinal) { - return (double) values[ordinal]; - } - - @Override - public StringData getString(int pos) { - return (StringData) values[pos]; - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) values[pos]; - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) values[pos]; - } - - 
@SuppressWarnings("unchecked") - @Override - public RawValueData getRawValue(int pos) { - return (RawValueData) values[pos]; - } - - @Override - public byte[] getBinary(int ordinal) { - return (byte[]) values[ordinal]; - } - - @Override - public ArrayData getArray(int ordinal) { - return (ArrayData) values[ordinal]; - } - - @Override - public MapData getMap(int ordinal) { - return (MapData) values[ordinal]; - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) values[pos]; - } - - @Override - public boolean[] toBooleanArray() { - return ArrayUtil.toPrimitive((Boolean[]) values); - } - - @Override - public byte[] toByteArray() { - return ArrayUtil.toPrimitive((Byte[]) values); - } - - @Override - public short[] toShortArray() { - return ArrayUtil.toPrimitive((Short[]) values); - } - - @Override - public int[] toIntArray() { - return ArrayUtil.toPrimitive((Integer[]) values); - } - - @Override - public long[] toLongArray() { - return ArrayUtil.toPrimitive((Long[]) values); - } - - @Override - public float[] toFloatArray() { - return ArrayUtil.toPrimitive((Float[]) values); - } - - @Override - public double[] toDoubleArray() { - return ArrayUtil.toPrimitive((Double[]) values); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java deleted file mode 100644 index db4f1730a134..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java +++ /dev/null @@ -1,504 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
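Note on the Parquet readers deleted above: the TIMESTAMP_MICROS variants split the microsecond value into whole seconds plus a nanosecond adjustment with Math.floorDiv/floorMod, which keeps negative (pre-1970) values intact. A self-contained sketch of that conversion (plain JDK, illustrative names):

import java.time.Instant;

class MicrosToInstantSketch {
  static Instant fromMicros(long micros) {
    long seconds = Math.floorDiv(micros, 1_000_000L);
    long nanoAdjustment = Math.floorMod(micros, 1_000_000L) * 1_000L;
    return Instant.ofEpochSecond(seconds, nanoAdjustment);
  }

  public static void main(String[] args) {
    System.out.println(fromMicros(1L));   // 1970-01-01T00:00:00.000001Z
    System.out.println(fromMicros(-1L));  // 1969-12-31T23:59:59.999999Z
  }
}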
- */ -package org.apache.iceberg.flink.data; - -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.flink.table.types.logical.SmallIntType; -import org.apache.flink.table.types.logical.TinyIntType; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.ParquetValueWriter; -import org.apache.iceberg.parquet.ParquetValueWriters; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class FlinkParquetWriters { - private FlinkParquetWriters() {} - - @SuppressWarnings("unchecked") - public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { - return (ParquetValueWriter) - ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); - } - - private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { - private final MessageType type; - - WriteBuilder(MessageType type) { - this.type = type; - } - - @Override - public ParquetValueWriter message( - RowType sStruct, MessageType message, List> fields) { - return struct(sStruct, message.asGroupType(), fields); - } - - @Override - public ParquetValueWriter struct( - RowType sStruct, GroupType struct, List> fieldWriters) { - List fields = struct.getFields(); - List flinkFields = sStruct.getFields(); - List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List flinkTypes = Lists.newArrayList(); - for (int i = 0; i < fields.size(); i += 1) { - writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - flinkTypes.add(flinkFields.get(i).getType()); - } - - return new RowDataWriter(writers, flinkTypes); - } - - @Override - public ParquetValueWriter list( - ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new ArrayDataWriter<>( - repeatedD, - repeatedR, - newOption(repeated.getType(0), elementWriter), - sArray.getElementType()); - } - - @Override - public ParquetValueWriter map( - MapType sMap, - GroupType map, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = 
type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new MapDataWriter<>( - repeatedD, - repeatedR, - newOption(repeatedKeyValue.getType(0), keyWriter), - newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.getKeyType(), - sMap.getValueType()); - } - - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { - int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); - return ParquetValueWriters.option(fieldType, maxD, writer); - } - - @Override - public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitive) { - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return strings(desc); - case DATE: - case INT_8: - case INT_16: - case INT_32: - return ints(fType, desc); - case INT_64: - return ParquetValueWriters.longs(desc); - case TIME_MICROS: - return timeMicros(desc); - case TIMESTAMP_MICROS: - return timestamps(desc); - case DECIMAL: - DecimalLogicalTypeAnnotation decimal = - (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); - switch (primitive.getPrimitiveTypeName()) { - case INT32: - return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); - case INT64: - return decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return byteArrays(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return byteArrays(desc); - case BOOLEAN: - return ParquetValueWriters.booleans(desc); - case INT32: - return ints(fType, desc); - case INT64: - return ParquetValueWriters.longs(desc); - case FLOAT: - return ParquetValueWriters.floats(desc); - case DOUBLE: - return ParquetValueWriters.doubles(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static ParquetValueWriters.PrimitiveWriter ints( - LogicalType type, ColumnDescriptor desc) { - if (type instanceof TinyIntType) { - return ParquetValueWriters.tinyints(desc); - } else if (type instanceof SmallIntType) { - return ParquetValueWriters.shorts(desc); - } - return ParquetValueWriters.ints(desc); - } - - private static ParquetValueWriters.PrimitiveWriter strings(ColumnDescriptor desc) { - return new StringDataWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDescriptor desc) { - return new TimeMicrosWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsInteger( - ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 9, - "Cannot write decimal value as integer with precision larger than 9," - + " wrong precision %s", - precision); - return new IntegerDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsLong( - ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 18, - "Cannot write decimal value as long with precision larger than 18, " - + " wrong 
precision %s", - precision); - return new LongDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( - ColumnDescriptor desc, int precision, int scale) { - return new FixedDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter timestamps( - ColumnDescriptor desc) { - return new TimestampDataWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter byteArrays(ColumnDescriptor desc) { - return new ByteArrayWriter(desc); - } - - private static class StringDataWriter extends ParquetValueWriters.PrimitiveWriter { - private StringDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, StringData value) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.toBytes())); - } - } - - private static class TimeMicrosWriter extends ParquetValueWriters.PrimitiveWriter { - private TimeMicrosWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, Integer value) { - long micros = value.longValue() * 1000; - column.writeLong(repetitionLevel, micros); - } - } - - private static class IntegerDecimalWriter - extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); - } - } - - private static class LongDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeLong(repetitionLevel, decimal.toUnscaledLong()); - } - } - - private static class FixedDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - byte[] binary = - DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); - } - } - - private static class TimestampDataWriter - extends ParquetValueWriters.PrimitiveWriter { - private 
TimestampDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, TimestampData value) { - column.writeLong( - repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); - } - } - - private static class ByteArrayWriter extends ParquetValueWriters.PrimitiveWriter { - private ByteArrayWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, byte[] bytes) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); - } - } - - private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { - private final LogicalType elementType; - - private ArrayDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter writer, - LogicalType elementType) { - super(definitionLevel, repetitionLevel, writer); - this.elementType = elementType; - } - - @Override - protected Iterator elements(ArrayData list) { - return new ElementIterator<>(list); - } - - private class ElementIterator implements Iterator { - private final int size; - private final ArrayData list; - private final ArrayData.ElementGetter getter; - private int index; - - private ElementIterator(ArrayData list) { - this.list = list; - size = list.size(); - getter = ArrayData.createElementGetter(elementType); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public E next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - E element = (E) getter.getElementOrNull(list, index); - index += 1; - - return element; - } - } - } - - private static class MapDataWriter - extends ParquetValueWriters.RepeatedKeyValueWriter { - private final LogicalType keyType; - private final LogicalType valueType; - - private MapDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - super(definitionLevel, repetitionLevel, keyWriter, valueWriter); - this.keyType = keyType; - this.valueType = valueType; - } - - @Override - protected Iterator> pairs(MapData map) { - return new EntryIterator<>(map); - } - - private class EntryIterator implements Iterator> { - private final int size; - private final ArrayData keys; - private final ArrayData values; - private final ParquetValueReaders.ReusableEntry entry; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - private int index; - - private EntryIterator(MapData map) { - size = map.size(); - keys = map.keyArray(); - values = map.valueArray(); - entry = new ParquetValueReaders.ReusableEntry<>(); - keyGetter = ArrayData.createElementGetter(keyType); - valueGetter = ArrayData.createElementGetter(valueType); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public Map.Entry next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - entry.set( - (K) keyGetter.getElementOrNull(keys, index), - (V) valueGetter.getElementOrNull(values, index)); - index += 1; - - return entry; - } - } - } - - private static class RowDataWriter extends ParquetValueWriters.StructWriter { - private final RowData.FieldGetter[] fieldGetter; - - RowDataWriter(List> writers, List types) { - super(writers); - fieldGetter = new RowData.FieldGetter[types.size()]; - for (int i = 0; i < types.size(); i += 1) { - 
fieldGetter[i] = RowData.createFieldGetter(types.get(i), i); - } - } - - @Override - protected Object get(RowData struct, int index) { - return fieldGetter[index].getFieldOrNull(struct); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java deleted file mode 100644 index ba4e1a7a7aec..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -abstract class FlinkSchemaVisitor { - - static T visit(RowType flinkType, Schema schema, FlinkSchemaVisitor visitor) { - return visit(flinkType, schema.asStruct(), visitor); - } - - private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor visitor) { - switch (iType.typeId()) { - case STRUCT: - return visitRecord(flinkType, iType.asStructType(), visitor); - - case MAP: - MapType mapType = (MapType) flinkType; - Types.MapType iMapType = iType.asMapType(); - T key; - T value; - - Types.NestedField keyField = iMapType.field(iMapType.keyId()); - visitor.beforeMapKey(keyField); - try { - key = visit(mapType.getKeyType(), iMapType.keyType(), visitor); - } finally { - visitor.afterMapKey(keyField); - } - - Types.NestedField valueField = iMapType.field(iMapType.valueId()); - visitor.beforeMapValue(valueField); - try { - value = visit(mapType.getValueType(), iMapType.valueType(), visitor); - } finally { - visitor.afterMapValue(valueField); - } - - return visitor.map(iMapType, key, value, mapType.getKeyType(), mapType.getValueType()); - - case LIST: - ArrayType listType = (ArrayType) flinkType; - Types.ListType iListType = iType.asListType(); - T element; - - Types.NestedField elementField = iListType.field(iListType.elementId()); - visitor.beforeListElement(elementField); - try { - element = visit(listType.getElementType(), iListType.elementType(), visitor); - } finally { - visitor.afterListElement(elementField); - } - - return visitor.list(iListType, element, listType.getElementType()); - - default: - return visitor.primitive(iType.asPrimitiveType(), flinkType); - } - } - - 
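[Editorial sketch, not part of this change] The schema visitor deleted above pairs every child visit with before/after callbacks and relies on try/finally so the "after" hook always runs, keeping any visitor-side state (such as a field-name stack) balanced even when a child visit throws. A minimal standalone sketch of that callback pattern, with hypothetical names rather than the removed class:

```java
// Sketch only: the before/after callback pattern used by the deleted visitor,
// where try/finally guarantees afterField() fires even if visitChild() throws.
abstract class FieldVisitor<T> {
  public void beforeField(String name) {}

  public void afterField(String name) {}

  protected abstract T visitChild(String name);

  public T visitField(String name) {
    beforeField(name);
    try {
      return visitChild(name);
    } finally {
      afterField(name);
    }
  }
}
```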
private static T visitRecord( - LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { - Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); - RowType rowType = (RowType) flinkType; - - int fieldSize = struct.fields().size(); - List results = Lists.newArrayListWithExpectedSize(fieldSize); - List fieldTypes = Lists.newArrayListWithExpectedSize(fieldSize); - List nestedFields = struct.fields(); - - for (int i = 0; i < fieldSize; i++) { - Types.NestedField iField = nestedFields.get(i); - int fieldIndex = rowType.getFieldIndex(iField.name()); - Preconditions.checkArgument( - fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); - - LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); - - fieldTypes.add(fieldFlinkType); - - visitor.beforeField(iField); - try { - results.add(visit(fieldFlinkType, iField.type(), visitor)); - } finally { - visitor.afterField(iField); - } - } - - return visitor.record(struct, results, fieldTypes); - } - - public T record(Types.StructType iStruct, List results, List fieldTypes) { - return null; - } - - public T list(Types.ListType iList, T element, LogicalType elementType) { - return null; - } - - public T map(Types.MapType iMap, T key, T value, LogicalType keyType, LogicalType valueType) { - return null; - } - - public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { - return null; - } - - public void beforeField(Types.NestedField field) {} - - public void afterField(Types.NestedField field) {} - - public void beforeListElement(Types.NestedField elementField) { - beforeField(elementField); - } - - public void afterListElement(Types.NestedField elementField) { - afterField(elementField); - } - - public void beforeMapKey(Types.NestedField keyField) { - beforeField(keyField); - } - - public void afterMapKey(Types.NestedField keyField) { - afterField(keyField); - } - - public void beforeMapValue(Types.NestedField valueField) { - beforeField(valueField); - } - - public void afterMapValue(Types.NestedField valueField) { - afterField(valueField); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java deleted file mode 100644 index 32f6c3a2ccfd..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Map; -import org.apache.avro.io.Decoder; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public class FlinkValueReaders { - - private FlinkValueReaders() {} - - static ValueReader strings() { - return StringReader.INSTANCE; - } - - static ValueReader enums(List symbols) { - return new EnumReader(symbols); - } - - static ValueReader uuids() { - return ValueReaders.fixed(16); - } - - static ValueReader timeMicros() { - return TimeMicrosReader.INSTANCE; - } - - static ValueReader timestampMills() { - return TimestampMillsReader.INSTANCE; - } - - static ValueReader timestampMicros() { - return TimestampMicrosReader.INSTANCE; - } - - static ValueReader decimal( - ValueReader unscaledReader, int precision, int scale) { - return new DecimalReader(unscaledReader, precision, scale); - } - - static ValueReader array(ValueReader elementReader) { - return new ArrayReader(elementReader); - } - - static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { - return new ArrayMapReader(keyReader, valueReader); - } - - static ValueReader map(ValueReader keyReader, ValueReader valueReader) { - return new MapReader(keyReader, valueReader); - } - - static ValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - private static class StringReader implements ValueReader { - private static final StringReader INSTANCE = new StringReader(); - - private StringReader() {} - - @Override - public StringData read(Decoder decoder, Object reuse) throws IOException { - // use the decoder's readString(Utf8) method because it may be a resolving decoder - Utf8 utf8 = null; - if (reuse instanceof StringData) { - utf8 = new Utf8(((StringData) reuse).toBytes()); - } - - Utf8 string = decoder.readString(utf8); - return StringData.fromBytes(string.getBytes(), 0, string.getByteLength()); - } - } - - private static class EnumReader implements ValueReader { - private final StringData[] symbols; - - private EnumReader(List symbols) { - this.symbols = new StringData[symbols.size()]; - for (int i = 0; i < this.symbols.length; i += 1) { - this.symbols[i] = StringData.fromBytes(symbols.get(i).getBytes(StandardCharsets.UTF_8)); - } - } - - @Override - public StringData read(Decoder decoder, Object ignore) throws IOException { - int index = decoder.readEnum(); - return symbols[index]; - } - } - - private static class DecimalReader implements ValueReader { - private final ValueReader bytesReader; - private final int precision; - private final int scale; - - private DecimalReader(ValueReader bytesReader, int precision, int scale) 
{ - this.bytesReader = bytesReader; - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(Decoder decoder, Object reuse) throws IOException { - byte[] bytes = bytesReader.read(decoder, null); - return DecimalData.fromBigDecimal( - new BigDecimal(new BigInteger(bytes), scale), precision, scale); - } - } - - private static class TimeMicrosReader implements ValueReader { - private static final TimeMicrosReader INSTANCE = new TimeMicrosReader(); - - @Override - public Integer read(Decoder decoder, Object reuse) throws IOException { - long micros = decoder.readLong(); - // Flink only support time mills, just erase micros. - return (int) (micros / 1000); - } - } - - private static class TimestampMillsReader implements ValueReader { - private static final TimestampMillsReader INSTANCE = new TimestampMillsReader(); - - @Override - public TimestampData read(Decoder decoder, Object reuse) throws IOException { - return TimestampData.fromEpochMillis(decoder.readLong()); - } - } - - private static class TimestampMicrosReader implements ValueReader { - private static final TimestampMicrosReader INSTANCE = new TimestampMicrosReader(); - - @Override - public TimestampData read(Decoder decoder, Object reuse) throws IOException { - long micros = decoder.readLong(); - long mills = micros / 1000; - int nanos = ((int) (micros % 1000)) * 1000; - if (nanos < 0) { - nanos += 1_000_000; - mills -= 1; - } - return TimestampData.fromEpochMillis(mills, nanos); - } - } - - private static class ArrayReader implements ValueReader { - private final ValueReader elementReader; - private final List reusedList = Lists.newArrayList(); - - private ArrayReader(ValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public GenericArrayData read(Decoder decoder, Object reuse) throws IOException { - reusedList.clear(); - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedList.add(elementReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - // this will convert the list to an array so it is okay to reuse the list - return new GenericArrayData(reusedList.toArray()); - } - } - - private static MapData kvArrayToMap(List keyList, List valueList) { - Map map = Maps.newHashMap(); - Object[] keys = keyList.toArray(); - Object[] values = valueList.toArray(); - for (int i = 0; i < keys.length; i++) { - map.put(keys[i], values[i]); - } - - return new GenericMapData(map); - } - - private static class ArrayMapReader implements ValueReader { - private final ValueReader keyReader; - private final ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private ArrayMapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - return kvArrayToMap(reusedKeyList, reusedValueList); - } - } - - private static class MapReader implements ValueReader { - private final ValueReader keyReader; - private final 
ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private MapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readMapStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.mapNext(); - } - - return kvArrayToMap(reusedKeyList, reusedValueList); - } - } - - private static class StructReader extends ValueReaders.StructReader { - private final int numFields; - - private StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = readers.size(); - } - - @Override - protected RowData reuseOrCreate(Object reuse) { - if (reuse instanceof GenericRowData && ((GenericRowData) reuse).getArity() == numFields) { - return (GenericRowData) reuse; - } - return new GenericRowData(numFields); - } - - @Override - protected Object get(RowData struct, int pos) { - return null; - } - - @Override - protected void set(RowData struct, int pos, Object value) { - ((GenericRowData) struct).setField(pos, value); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java deleted file mode 100644 index 4e86ecce28b5..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
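[Editorial sketch, not part of this change] The TimestampMicrosReader removed above converts epoch microseconds into the millisecond-plus-nanosecond form that Flink's TimestampData expects, borrowing one millisecond when the remainder is negative. A self-contained sketch of that conversion, as an assumed helper with no Flink dependency:

```java
// Sketch only: split epoch microseconds into (epochMillis, nanoOfMilli),
// borrowing a millisecond when the remainder is negative, e.g. -1 micro
// becomes (-1 ms, 999_000 ns), which is -1,000 ns overall.
final class MicrosSplit {
  final long epochMillis;
  final int nanoOfMilli;

  MicrosSplit(long epochMicros) {
    long millis = epochMicros / 1000;
    int nanos = ((int) (epochMicros % 1000)) * 1000;
    if (nanos < 0) {
      nanos += 1_000_000;
      millis -= 1;
    }
    this.epochMillis = millis;
    this.nanoOfMilli = nanos;
  }
}
```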
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.lang.reflect.Array; -import java.util.List; -import org.apache.avro.io.Encoder; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; - -public class FlinkValueWriters { - - private FlinkValueWriters() {} - - static ValueWriter strings() { - return StringWriter.INSTANCE; - } - - static ValueWriter timeMicros() { - return TimeMicrosWriter.INSTANCE; - } - - static ValueWriter timestampMicros() { - return TimestampMicrosWriter.INSTANCE; - } - - static ValueWriter decimal(int precision, int scale) { - return new DecimalWriter(precision, scale); - } - - static ValueWriter array(ValueWriter elementWriter, LogicalType elementType) { - return new ArrayWriter<>(elementWriter, elementType); - } - - static ValueWriter arrayMap( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter map( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter row(List> writers, List types) { - return new RowWriter(writers, types); - } - - private static class StringWriter implements ValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - private StringWriter() {} - - @Override - public void write(StringData s, Encoder encoder) throws IOException { - // toBytes is cheaper than Avro calling toString, which incurs encoding costs - encoder.writeString(new Utf8(s.toBytes())); - } - } - - private static class DecimalWriter implements ValueWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private DecimalWriter(int precision, int scale) { - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(DecimalData d, Encoder encoder) throws IOException { - encoder.writeFixed( - DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); - } - } - - private static class TimeMicrosWriter implements ValueWriter { - private static final TimeMicrosWriter INSTANCE = new TimeMicrosWriter(); - - @Override - public void write(Integer timeMills, Encoder encoder) throws IOException { - encoder.writeLong(timeMills * 1000L); - } - } - - private static class TimestampMicrosWriter implements ValueWriter { - private static final TimestampMicrosWriter INSTANCE = new TimestampMicrosWriter(); - - @Override - public void write(TimestampData timestampData, Encoder encoder) throws IOException { - long micros = - timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; - encoder.writeLong(micros); - } - } - - private static class ArrayWriter implements ValueWriter { - private final ValueWriter elementWriter; - private final ArrayData.ElementGetter elementGetter; - - 
private ArrayWriter(ValueWriter elementWriter, LogicalType elementType) { - this.elementWriter = elementWriter; - this.elementGetter = ArrayData.createElementGetter(elementType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(ArrayData array, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = array.size(); - encoder.setItemCount(numElements); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - elementWriter.write((T) elementGetter.getElementOrNull(array, i), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class ArrayMapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - private ArrayMapWriter( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueWriter = valueWriter; - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = map.size(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); - valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class MapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - private MapWriter( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueWriter = valueWriter; - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeMapStart(); - int numElements = map.size(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); - valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); - } - encoder.writeMapEnd(); - } - } - - static class RowWriter implements ValueWriter { - private final ValueWriter[] writers; - private final RowData.FieldGetter[] getters; - - private RowWriter(List> writers, List types) { - this.writers = (ValueWriter[]) Array.newInstance(ValueWriter.class, writers.size()); - this.getters = new RowData.FieldGetter[writers.size()]; - for (int i = 0; i < writers.size(); i += 1) { - this.writers[i] = writers.get(i); - this.getters[i] = RowData.createFieldGetter(types.get(i), i); - } - } - - @Override - public void write(RowData row, Encoder encoder) throws IOException { - for (int i = 0; i < writers.length; i += 1) { - if (row.isNullAt(i)) { - writers[i].write(null, encoder); - } else { - write(row, i, writers[i], 
encoder); - } - } - } - - @SuppressWarnings("unchecked") - private void write(RowData row, int pos, ValueWriter writer, Encoder encoder) - throws IOException { - writer.write((T) getters[pos].getFieldOrNull(row), encoder); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java deleted file mode 100644 index 33feb2e32118..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Deque; -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class ParquetWithFlinkSchemaVisitor { - private final Deque fieldNames = Lists.newLinkedList(); - - public static T visit( - LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { - Preconditions.checkArgument(sType != null, "Invalid DataType: null"); - if (type instanceof MessageType) { - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.message( - struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); - } else if (type.isPrimitive()) { - return visitor.primitive(sType, type.asPrimitiveType()); - } else { - // if not a primitive, the typeId must be a group - GroupType group = type.asGroupType(); - OriginalType annotation = group.getOriginalType(); - if (annotation != null) { - switch (annotation) { - case LIST: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", - group); - - GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument( - 
repeatedElement.isRepetition(Type.Repetition.REPEATED), - "Invalid list: inner group is not repeated"); - Preconditions.checkArgument( - repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", - group); - - Preconditions.checkArgument( - sType instanceof ArrayType, "Invalid list: %s is not an array", sType); - ArrayType array = (ArrayType) sType; - RowType.RowField element = - new RowField( - "element", array.getElementType(), "element of " + array.asSummaryString()); - - visitor.fieldNames.push(repeatedElement.getName()); - try { - T elementResult = null; - if (repeatedElement.getFieldCount() > 0) { - elementResult = visitField(element, repeatedElement.getType(0), visitor); - } - - return visitor.list(array, group, elementResult); - - } finally { - visitor.fieldNames.pop(); - } - - case MAP: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", - group); - - GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument( - repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), - "Invalid map: inner group is not repeated"); - Preconditions.checkArgument( - repeatedKeyValue.getFieldCount() <= 2, - "Invalid map: repeated group does not have 2 fields"); - - Preconditions.checkArgument( - sType instanceof MapType, "Invalid map: %s is not a map", sType); - MapType map = (MapType) sType; - RowField keyField = - new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); - RowField valueField = - new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); - - visitor.fieldNames.push(repeatedKeyValue.getName()); - try { - T keyResult = null; - T valueResult = null; - switch (repeatedKeyValue.getFieldCount()) { - case 2: - // if there are 2 fields, both key and value are projected - keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); - valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); - break; - case 1: - // if there is just one, use the name to determine what it is - Type keyOrValue = repeatedKeyValue.getType(0); - if (keyOrValue.getName().equalsIgnoreCase("key")) { - keyResult = visitField(keyField, keyOrValue, visitor); - // value result remains null - } else { - valueResult = visitField(valueField, keyOrValue, visitor); - // key result remains null - } - break; - default: - // both results will remain null - } - - return visitor.map(map, group, keyResult, valueResult); - - } finally { - visitor.fieldNames.pop(); - } - - default: - } - } - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.struct(struct, group, visitFields(struct, group, visitor)); - } - } - - private static T visitField( - RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { - visitor.fieldNames.push(field.getName()); - try { - return visit(sField.getType(), field, visitor); - } finally { - visitor.fieldNames.pop(); - } - } - - private static List visitFields( - RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { - List sFields = struct.getFields(); - Preconditions.checkArgument( - sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); - List results = 
Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.size(); i += 1) { - Type field = group.getFields().get(i); - RowType.RowField sField = sFields.get(i); - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), - "Structs do not match: field %s != %s", - field.getName(), - sField.getName()); - results.add(visitField(sField, field, visitor)); - } - - return results; - } - - public T message(RowType sStruct, MessageType message, List fields) { - return null; - } - - public T struct(RowType sStruct, GroupType struct, List fields) { - return null; - } - - public T list(ArrayType sArray, GroupType array, T element) { - return null; - } - - public T map(MapType sMap, GroupType map, T key, T value) { - return null; - } - - public T primitive(LogicalType sPrimitive, PrimitiveType primitive) { - return null; - } - - protected String[] currentPath() { - return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); - } - - protected String[] path(String name) { - List list = Lists.newArrayList(fieldNames.descendingIterator()); - list.add(name); - return list.toArray(new String[0]); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java deleted file mode 100644 index e41bae686d1e..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public class RowDataProjection implements RowData { - /** - * Creates a projecting wrapper for {@link RowData} rows. - * - *
<p>
This projection will not project the nested children types of repeated types like lists and - * maps. - * - * @param schema schema of rows wrapped by this projection - * @param projectedSchema result schema of the projected rows - * @return a wrapper to project rows - */ - public static RowDataProjection create(Schema schema, Schema projectedSchema) { - return RowDataProjection.create( - FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); - } - - /** - * Creates a projecting wrapper for {@link RowData} rows. - * - *
<p>
This projection will not project the nested children types of repeated types like lists and - * maps. - * - * @param rowType flink row type of rows wrapped by this projection - * @param schema schema of rows wrapped by this projection - * @param projectedSchema result schema of the projected rows - * @return a wrapper to project rows - */ - public static RowDataProjection create( - RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { - return new RowDataProjection(rowType, schema, projectedSchema); - } - - private final RowData.FieldGetter[] getters; - private RowData rowData; - - private RowDataProjection( - RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { - Map fieldIdToPosition = Maps.newHashMap(); - for (int i = 0; i < rowStruct.fields().size(); i++) { - fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); - } - - this.getters = new RowData.FieldGetter[projectType.fields().size()]; - for (int i = 0; i < getters.length; i++) { - Types.NestedField projectField = projectType.fields().get(i); - Types.NestedField rowField = rowStruct.field(projectField.fieldId()); - - Preconditions.checkNotNull( - rowField, - "Cannot locate the project field <%s> in the iceberg struct <%s>", - projectField, - rowStruct); - - getters[i] = - createFieldGetter( - rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); - } - } - - private static RowData.FieldGetter createFieldGetter( - RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { - Preconditions.checkArgument( - rowField.type().typeId() == projectField.type().typeId(), - "Different iceberg type between row field <%s> and project field <%s>", - rowField, - projectField); - - switch (projectField.type().typeId()) { - case STRUCT: - RowType nestedRowType = (RowType) rowType.getTypeAt(position); - return row -> { - RowData nestedRow = - row.isNullAt(position) ? null : row.getRow(position, nestedRowType.getFieldCount()); - return RowDataProjection.create( - nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) - .wrap(nestedRow); - }; - - case MAP: - Types.MapType projectedMap = projectField.type().asMapType(); - Types.MapType originalMap = rowField.type().asMapType(); - - boolean keyProjectable = - !projectedMap.keyType().isNestedType() - || projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = - !projectedMap.valueType().isNestedType() - || projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument( - keyProjectable && valueProjectable, - "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", - projectField, - rowField); - - return RowData.createFieldGetter(rowType.getTypeAt(position), position); - - case LIST: - Types.ListType projectedList = projectField.type().asListType(); - Types.ListType originalList = rowField.type().asListType(); - - boolean elementProjectable = - !projectedList.elementType().isNestedType() - || projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument( - elementProjectable, - "Cannot project a partial list element with non-primitive type. 
Trying to project <%s> out of <%s>", - projectField, - rowField); - - return RowData.createFieldGetter(rowType.getTypeAt(position), position); - - default: - return RowData.createFieldGetter(rowType.getTypeAt(position), position); - } - } - - public RowData wrap(RowData row) { - this.rowData = row; - return this; - } - - private Object getValue(int pos) { - return getters[pos].getFieldOrNull(rowData); - } - - @Override - public int getArity() { - return getters.length; - } - - @Override - public RowKind getRowKind() { - return rowData.getRowKind(); - } - - @Override - public void setRowKind(RowKind kind) { - throw new UnsupportedOperationException("Cannot set row kind in the RowDataProjection"); - } - - @Override - public boolean isNullAt(int pos) { - return rowData == null || getValue(pos) == null; - } - - @Override - public boolean getBoolean(int pos) { - return (boolean) getValue(pos); - } - - @Override - public byte getByte(int pos) { - return (byte) getValue(pos); - } - - @Override - public short getShort(int pos) { - return (short) getValue(pos); - } - - @Override - public int getInt(int pos) { - return (int) getValue(pos); - } - - @Override - public long getLong(int pos) { - return (long) getValue(pos); - } - - @Override - public float getFloat(int pos) { - return (float) getValue(pos); - } - - @Override - public double getDouble(int pos) { - return (double) getValue(pos); - } - - @Override - public StringData getString(int pos) { - return (StringData) getValue(pos); - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) getValue(pos); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) getValue(pos); - } - - @Override - @SuppressWarnings("unchecked") - public RawValueData getRawValue(int pos) { - return (RawValueData) getValue(pos); - } - - @Override - public byte[] getBinary(int pos) { - return (byte[]) getValue(pos); - } - - @Override - public ArrayData getArray(int pos) { - return (ArrayData) getValue(pos); - } - - @Override - public MapData getMap(int pos) { - return (MapData) getValue(pos); - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) getValue(pos); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java deleted file mode 100644 index c5cb51b7eae4..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
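[Editorial sketch, not part of this change] The RowDataProjection removed above resolves each projected field against the source row by Iceberg field id rather than by position, and only then builds the per-field getters. A short sketch of that id-to-position resolution, using a hypothetical helper rather than the deleted class:

```java
// Sketch only: for every projected field id, find its position in the source row.
import java.util.HashMap;
import java.util.List;
import java.util.Map;

final class ProjectionPositions {
  /** Returns, per projected field id, the position of that field in the source row. */
  static int[] resolve(List<Integer> sourceFieldIds, List<Integer> projectedFieldIds) {
    Map<Integer, Integer> idToPos = new HashMap<>();
    for (int i = 0; i < sourceFieldIds.size(); i++) {
      idToPos.put(sourceFieldIds.get(i), i);
    }

    int[] positions = new int[projectedFieldIds.size()];
    for (int i = 0; i < projectedFieldIds.size(); i++) {
      Integer pos = idToPos.get(projectedFieldIds.get(i));
      if (pos == null) {
        throw new IllegalArgumentException("Cannot locate field id " + projectedFieldIds.get(i));
      }
      positions[i] = pos;
    }
    return positions;
  }
}
```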
- */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import org.apache.avro.generic.GenericData; -import org.apache.avro.util.Utf8; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; -import org.apache.iceberg.util.DateTimeUtil; - -public class RowDataUtil { - - private RowDataUtil() {} - - public static Object convertConstant(Type type, Object value) { - if (value == null) { - return null; - } - - switch (type.typeId()) { - case DECIMAL: // DecimalData - Types.DecimalType decimal = (Types.DecimalType) type; - return DecimalData.fromBigDecimal((BigDecimal) value, decimal.precision(), decimal.scale()); - case STRING: // StringData - if (value instanceof Utf8) { - Utf8 utf8 = (Utf8) value; - return StringData.fromBytes(utf8.getBytes(), 0, utf8.getByteLength()); - } - return StringData.fromString(value.toString()); - case FIXED: // byte[] - if (value instanceof byte[]) { - return value; - } else if (value instanceof GenericData.Fixed) { - return ((GenericData.Fixed) value).bytes(); - } - return ByteBuffers.toByteArray((ByteBuffer) value); - case BINARY: // byte[] - return ByteBuffers.toByteArray((ByteBuffer) value); - case TIME: // int mills instead of long - return (int) ((Long) value / 1000); - case TIMESTAMP: // TimestampData - return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); - default: - } - return value; - } - - /** - * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This - * skips the check the arity of rowType and from, because the from RowData may contains additional - * column for position deletes. Using {@link RowDataSerializer#copy(RowData, RowData)} will fail - * the arity check. - */ - public static RowData clone( - RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { - GenericRowData ret; - if (reuse instanceof GenericRowData) { - ret = (GenericRowData) reuse; - } else { - ret = new GenericRowData(from.getArity()); - } - ret.setRowKind(from.getRowKind()); - for (int i = 0; i < rowType.getFieldCount(); i++) { - if (!from.isNullAt(i)) { - RowData.FieldGetter getter = RowData.createFieldGetter(rowType.getTypeAt(i), i); - ret.setField(i, fieldSerializers[i].copy(getter.getFieldOrNull(from))); - } else { - ret.setField(i, null); - } - } - return ret; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java deleted file mode 100644 index 9e0bc69bd54e..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.io.BaseTaskWriter; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; - -abstract class BaseDeltaTaskWriter extends BaseTaskWriter { - - private final Schema schema; - private final Schema deleteSchema; - private final RowDataWrapper wrapper; - private final RowDataWrapper keyWrapper; - private final RowDataProjection keyProjection; - private final boolean upsert; - - BaseDeltaTaskWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.schema = schema; - this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); - this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - this.upsert = upsert; - this.keyWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); - this.keyProjection = RowDataProjection.create(schema, deleteSchema); - } - - abstract RowDataDeltaWriter route(RowData row); - - RowDataWrapper wrapper() { - return wrapper; - } - - @Override - public void write(RowData row) throws IOException { - RowDataDeltaWriter writer = route(row); - - switch (row.getRowKind()) { - case INSERT: - case UPDATE_AFTER: - if (upsert) { - writer.deleteKey(keyProjection.wrap(row)); - } - writer.write(row); - break; - - case UPDATE_BEFORE: - if (upsert) { - break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one - // row twice - } - writer.delete(row); - break; - case DELETE: - writer.delete(row); - break; - - default: - throw new UnsupportedOperationException("Unknown row kind: " + row.getRowKind()); - } - } - - protected class RowDataDeltaWriter extends BaseEqualityDeltaWriter { - RowDataDeltaWriter(PartitionKey partition) { - super(partition, schema, deleteSchema); - } - - @Override - protected StructLike asStructLike(RowData data) { - return wrapper.wrap(data); - } - - @Override - protected StructLike asStructLikeKey(RowData data) { - return keyWrapper.wrap(data); - } - } -} diff --git 
a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java deleted file mode 100644 index 036970c06d5b..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class DeltaManifests { - - private static final CharSequence[] EMPTY_REF_DATA_FILES = new CharSequence[0]; - - private final ManifestFile dataManifest; - private final ManifestFile deleteManifest; - private final CharSequence[] referencedDataFiles; - - DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest) { - this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); - } - - DeltaManifests( - ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { - Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); - - this.dataManifest = dataManifest; - this.deleteManifest = deleteManifest; - this.referencedDataFiles = referencedDataFiles; - } - - ManifestFile dataManifest() { - return dataManifest; - } - - ManifestFile deleteManifest() { - return deleteManifest; - } - - CharSequence[] referencedDataFiles() { - return referencedDataFiles; - } - - List manifests() { - List manifests = Lists.newArrayListWithCapacity(2); - if (dataManifest != null) { - manifests.add(dataManifest); - } - - if (deleteManifest != null) { - manifests.add(deleteManifest); - } - - return manifests; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java deleted file mode 100644 index c4d6e713bb73..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class DeltaManifestsSerializer implements SimpleVersionedSerializer { - private static final int VERSION_1 = 1; - private static final int VERSION_2 = 2; - private static final byte[] EMPTY_BINARY = new byte[0]; - - static final DeltaManifestsSerializer INSTANCE = new DeltaManifestsSerializer(); - - @Override - public int getVersion() { - return VERSION_2; - } - - @Override - public byte[] serialize(DeltaManifests deltaManifests) throws IOException { - Preconditions.checkNotNull( - deltaManifests, "DeltaManifests to be serialized should not be null"); - - ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); - DataOutputStream out = new DataOutputStream(binaryOut); - - byte[] dataManifestBinary = EMPTY_BINARY; - if (deltaManifests.dataManifest() != null) { - dataManifestBinary = ManifestFiles.encode(deltaManifests.dataManifest()); - } - - out.writeInt(dataManifestBinary.length); - out.write(dataManifestBinary); - - byte[] deleteManifestBinary = EMPTY_BINARY; - if (deltaManifests.deleteManifest() != null) { - deleteManifestBinary = ManifestFiles.encode(deltaManifests.deleteManifest()); - } - - out.writeInt(deleteManifestBinary.length); - out.write(deleteManifestBinary); - - CharSequence[] referencedDataFiles = deltaManifests.referencedDataFiles(); - out.writeInt(referencedDataFiles.length); - for (int i = 0; i < referencedDataFiles.length; i++) { - out.writeUTF(referencedDataFiles[i].toString()); - } - - return binaryOut.toByteArray(); - } - - @Override - public DeltaManifests deserialize(int version, byte[] serialized) throws IOException { - if (version == VERSION_1) { - return deserializeV1(serialized); - } else if (version == VERSION_2) { - return deserializeV2(serialized); - } else { - throw new RuntimeException("Unknown serialize version: " + version); - } - } - - private DeltaManifests deserializeV1(byte[] serialized) throws IOException { - return new DeltaManifests(ManifestFiles.decode(serialized), null); - } - - private DeltaManifests deserializeV2(byte[] serialized) throws IOException { - ManifestFile dataManifest = null; - ManifestFile deleteManifest = null; - - ByteArrayInputStream binaryIn = new ByteArrayInputStream(serialized); - DataInputStream in = new DataInputStream(binaryIn); - - int dataManifestSize = in.readInt(); - if (dataManifestSize > 0) { - byte[] dataManifestBinary = new byte[dataManifestSize]; - Preconditions.checkState(in.read(dataManifestBinary) == dataManifestSize); - - dataManifest = ManifestFiles.decode(dataManifestBinary); - } - - int deleteManifestSize = in.readInt(); - if (deleteManifestSize > 0) { - byte[] deleteManifestBinary = new byte[deleteManifestSize]; - 
Preconditions.checkState(in.read(deleteManifestBinary) == deleteManifestSize); - - deleteManifest = ManifestFiles.decode(deleteManifestBinary); - } - - int referenceDataFileNum = in.readInt(); - CharSequence[] referencedDataFiles = new CharSequence[referenceDataFileNum]; - for (int i = 0; i < referenceDataFileNum; i++) { - referencedDataFiles[i] = in.readUTF(); - } - - return new DeltaManifests(dataManifest, deleteManifest, referencedDataFiles); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java deleted file mode 100644 index 18b269d6c3e9..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.StructLikeWrapper; -import org.apache.iceberg.util.StructProjection; - -/** - * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record - * will be emitted to same writer in order. - */ -class EqualityFieldKeySelector implements KeySelector { - - private final Schema schema; - private final RowType flinkSchema; - private final Schema deleteSchema; - - private transient RowDataWrapper rowDataWrapper; - private transient StructProjection structProjection; - private transient StructLikeWrapper structLikeWrapper; - - EqualityFieldKeySelector(Schema schema, RowType flinkSchema, List equalityFieldIds) { - this.schema = schema; - this.flinkSchema = flinkSchema; - this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); - } - - /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not - * serializable. In this way, we don't have to serialize them with forcing. - */ - protected RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - return rowDataWrapper; - } - - /** Construct the {@link StructProjection} lazily because it is not serializable. 
*/ - protected StructProjection lazyStructProjection() { - if (structProjection == null) { - structProjection = StructProjection.create(schema, deleteSchema); - } - return structProjection; - } - - /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. */ - protected StructLikeWrapper lazyStructLikeWrapper() { - if (structLikeWrapper == null) { - structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); - } - return structLikeWrapper; - } - - @Override - public Integer getKey(RowData row) { - RowDataWrapper wrappedRowData = lazyRowDataWrapper().wrap(row); - StructProjection projectedRowData = lazyStructProjection().wrap(wrappedRowData); - StructLikeWrapper wrapper = lazyStructLikeWrapper().set(projectedRowData); - return wrapper.hashCode(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java deleted file mode 100644 index b5d08b46be58..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.flink.data.FlinkParquetWriters; -import org.apache.iceberg.io.DataWriter; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class FlinkAppenderFactory implements FileAppenderFactory, Serializable { - private final Schema schema; - private final RowType flinkSchema; - private final Map props; - private final PartitionSpec spec; - private final int[] equalityFieldIds; - private final Schema eqDeleteRowSchema; - private final Schema posDeleteRowSchema; - - private RowType eqDeleteFlinkSchema = null; - private RowType posDeleteFlinkSchema = null; - - public FlinkAppenderFactory( - Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { - this(schema, flinkSchema, props, spec, null, null, null); - } - - public FlinkAppenderFactory( - Schema schema, - RowType flinkSchema, - Map props, - PartitionSpec spec, - int[] equalityFieldIds, - Schema eqDeleteRowSchema, - Schema posDeleteRowSchema) { - this.schema = schema; - this.flinkSchema = flinkSchema; - this.props = props; - this.spec = spec; - this.equalityFieldIds = equalityFieldIds; - this.eqDeleteRowSchema = eqDeleteRowSchema; - this.posDeleteRowSchema = posDeleteRowSchema; - } - - private RowType lazyEqDeleteFlinkSchema() { - if (eqDeleteFlinkSchema == null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); - this.eqDeleteFlinkSchema = FlinkSchemaUtil.convert(eqDeleteRowSchema); - } - return eqDeleteFlinkSchema; - } - - private RowType lazyPosDeleteFlinkSchema() { - if (posDeleteFlinkSchema == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Pos-delete row schema shouldn't be null"); - this.posDeleteFlinkSchema = FlinkSchemaUtil.convert(posDeleteRowSchema); - } - return this.posDeleteFlinkSchema; - } - - @Override - public FileAppender newAppender(OutputFile outputFile, FileFormat format) { - MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); - try { - switch (format) { - case AVRO: - return Avro.write(outputFile) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .setAll(props) - .schema(schema) - .metricsConfig(metricsConfig) - .overwrite() - .build(); - - case ORC: - return ORC.write(outputFile) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - 
.overwrite() - .build(); - - case PARQUET: - return Parquet.write(outputFile) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkSchema, msgType)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - .overwrite() - .build(); - - default: - throw new UnsupportedOperationException("Cannot write unknown file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public DataWriter newDataWriter( - EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>( - newAppender(file.encryptingOutputFile(), format), - format, - file.encryptingOutputFile().location(), - spec, - partition, - file.keyMetadata()); - } - - @Override - public EqualityDeleteWriter newEqDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - Preconditions.checkState( - equalityFieldIds != null && equalityFieldIds.length > 0, - "Equality field ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull( - eqDeleteRowSchema, - "Equality delete row schema shouldn't be null when creating equality-delete writer"); - - MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - case ORC: - return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - case PARQUET: - return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write equality-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public PositionDeleteWriter newPosDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyPosDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .buildPositionWriter(); - - case ORC: - RowType orcPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, 
iSchema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .transformPaths(path -> StringData.fromString(path.toString())) - .buildPositionWriter(); - - case PARQUET: - RowType flinkPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .transformPaths(path -> StringData.fromString(path.toString())) - .buildPositionWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write pos-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java deleted file mode 100644 index 2183fe062af4..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - -import java.io.Serializable; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.BaseFileWriterFactory; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.flink.data.FlinkParquetWriters; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { - private RowType dataFlinkType; - private RowType equalityDeleteFlinkType; - private RowType positionDeleteFlinkType; - - FlinkFileWriterFactory( - Table table, - FileFormat dataFileFormat, - Schema dataSchema, - RowType dataFlinkType, - SortOrder dataSortOrder, - FileFormat deleteFileFormat, - int[] equalityFieldIds, - Schema equalityDeleteRowSchema, - RowType equalityDeleteFlinkType, - SortOrder equalityDeleteSortOrder, - Schema positionDeleteRowSchema, - RowType positionDeleteFlinkType) { - - super( - table, - dataFileFormat, - dataSchema, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteSortOrder, - positionDeleteRowSchema); - - this.dataFlinkType = dataFlinkType; - this.equalityDeleteFlinkType = equalityDeleteFlinkType; - this.positionDeleteFlinkType = positionDeleteFlinkType; - } - - static Builder builderFor(Table table) { - return new Builder(table); - } - - @Override - protected void configureDataWrite(Avro.DataWriteBuilder builder) { - builder.createWriterFunc(ignore -> new FlinkAvroWriter(dataFlinkType())); - } - - @Override - protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(ignored -> new FlinkAvroWriter(equalityDeleteFlinkType())); - } - - @Override - protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); - if (rowFieldIndex >= 0) { - // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos - RowType positionDeleteRowFlinkType = - (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); - builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); - } - } - - @Override - protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(dataFlinkType(), msgType)); - } - - @Override - protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); - } - - @Override - protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) 
{ - builder.createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - @Override - protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); - } - - @Override - protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); - } - - @Override - protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - private RowType dataFlinkType() { - if (dataFlinkType == null) { - Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); - this.dataFlinkType = FlinkSchemaUtil.convert(dataSchema()); - } - - return dataFlinkType; - } - - private RowType equalityDeleteFlinkType() { - if (equalityDeleteFlinkType == null) { - Preconditions.checkNotNull( - equalityDeleteRowSchema(), "Equality delete schema must not be null"); - this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); - } - - return equalityDeleteFlinkType; - } - - private RowType positionDeleteFlinkType() { - if (positionDeleteFlinkType == null) { - // wrap the optional row schema into the position delete schema that contains path and - // position - Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); - this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); - } - - return positionDeleteFlinkType; - } - - static class Builder { - private final Table table; - private FileFormat dataFileFormat; - private Schema dataSchema; - private RowType dataFlinkType; - private SortOrder dataSortOrder; - private FileFormat deleteFileFormat; - private int[] equalityFieldIds; - private Schema equalityDeleteRowSchema; - private RowType equalityDeleteFlinkType; - private SortOrder equalityDeleteSortOrder; - private Schema positionDeleteRowSchema; - private RowType positionDeleteFlinkType; - - Builder(Table table) { - this.table = table; - - Map properties = table.properties(); - - String dataFileFormatName = - properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); - this.dataFileFormat = FileFormat.fromString(dataFileFormatName); - - String deleteFileFormatName = - properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); - this.deleteFileFormat = FileFormat.fromString(deleteFileFormatName); - } - - Builder dataFileFormat(FileFormat newDataFileFormat) { - this.dataFileFormat = newDataFileFormat; - return this; - } - - Builder dataSchema(Schema newDataSchema) { - this.dataSchema = newDataSchema; - return this; - } - - /** - * Sets a Flink type for data. - * - *
<p>
If not set, the value is derived from the provided Iceberg schema. - */ - Builder dataFlinkType(RowType newDataFlinkType) { - this.dataFlinkType = newDataFlinkType; - return this; - } - - Builder dataSortOrder(SortOrder newDataSortOrder) { - this.dataSortOrder = newDataSortOrder; - return this; - } - - Builder deleteFileFormat(FileFormat newDeleteFileFormat) { - this.deleteFileFormat = newDeleteFileFormat; - return this; - } - - Builder equalityFieldIds(int[] newEqualityFieldIds) { - this.equalityFieldIds = newEqualityFieldIds; - return this; - } - - Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { - this.equalityDeleteRowSchema = newEqualityDeleteRowSchema; - return this; - } - - /** - * Sets a Flink type for equality deletes. - * - *
<p>
If not set, the value is derived from the provided Iceberg schema. - */ - Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { - this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; - return this; - } - - Builder equalityDeleteSortOrder(SortOrder newEqualityDeleteSortOrder) { - this.equalityDeleteSortOrder = newEqualityDeleteSortOrder; - return this; - } - - Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { - this.positionDeleteRowSchema = newPositionDeleteRowSchema; - return this; - } - - /** - * Sets a Flink type for position deletes. - * - *
<p>
If not set, the value is derived from the provided Iceberg schema. - */ - Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { - this.positionDeleteFlinkType = newPositionDeleteFlinkType; - return this; - } - - FlinkFileWriterFactory build() { - boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; - boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument( - noEqualityDeleteConf || fullEqualityDeleteConf, - "Equality field IDs and equality delete row schema must be set together"); - - return new FlinkFileWriterFactory( - table, - dataFileFormat, - dataSchema, - dataFlinkType, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteFlinkType, - equalityDeleteSortOrder, - positionDeleteRowSchema, - positionDeleteFlinkType); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java deleted file mode 100644 index 996e4bbb1b01..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import java.util.function.Supplier; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class FlinkManifestUtil { - private static final int FORMAT_V2 = 2; - private static final Long DUMMY_SNAPSHOT_ID = 0L; - - private FlinkManifestUtil() {} - - static ManifestFile writeDataFiles( - OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { - ManifestWriter writer = - ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); - - try (ManifestWriter closeableWriter = writer) { - closeableWriter.addAll(dataFiles); - } - - return writer.toManifestFile(); - } - - static List readDataFiles(ManifestFile manifestFile, FileIO io) throws IOException { - try (CloseableIterable dataFiles = ManifestFiles.read(manifestFile, io)) { - return Lists.newArrayList(dataFiles); - } - } - - static ManifestOutputFileFactory createOutputFileFactory( - Table table, String flinkJobId, int subTaskId, long attemptNumber) { - TableOperations ops = ((HasTableOperations) table).operations(); - return new ManifestOutputFileFactory( - ops, table.io(), table.properties(), flinkJobId, subTaskId, attemptNumber); - } - - static DeltaManifests writeCompletedFiles( - WriteResult result, Supplier outputFileSupplier, PartitionSpec spec) - throws IOException { - - ManifestFile dataManifest = null; - ManifestFile deleteManifest = null; - - // Write the completed data files into a newly created data manifest file. - if (result.dataFiles() != null && result.dataFiles().length > 0) { - dataManifest = - writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); - } - - // Write the completed delete files into a newly created delete manifest file. - if (result.deleteFiles() != null && result.deleteFiles().length > 0) { - OutputFile deleteManifestFile = outputFileSupplier.get(); - - ManifestWriter deleteManifestWriter = - ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); - try (ManifestWriter writer = deleteManifestWriter) { - for (DeleteFile deleteFile : result.deleteFiles()) { - writer.add(deleteFile); - } - } - - deleteManifest = deleteManifestWriter.toManifestFile(); - } - - return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); - } - - static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) - throws IOException { - WriteResult.Builder builder = WriteResult.builder(); - - // Read the completed data files from persisted data manifest file. - if (deltaManifests.dataManifest() != null) { - builder.addDataFiles(readDataFiles(deltaManifests.dataManifest(), io)); - } - - // Read the completed delete files from persisted delete manifests file. 
- if (deltaManifests.deleteManifest() != null) { - try (CloseableIterable deleteFiles = - ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, null)) { - builder.addDeleteFiles(deleteFiles); - } - } - - return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java deleted file mode 100644 index 8846bb137fe7..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java +++ /dev/null @@ -1,562 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.function.Function; -import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.functions.sink.DiscardingSink; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import 
org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class FlinkSink { - private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); - - private static final String ICEBERG_STREAM_WRITER_NAME = - IcebergStreamWriter.class.getSimpleName(); - private static final String ICEBERG_FILES_COMMITTER_NAME = - IcebergFilesCommitter.class.getSimpleName(); - - private FlinkSink() {} - - /** - * Initialize a {@link Builder} to export the data from generic input data stream into iceberg - * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper - * function and a {@link TypeInformation} to convert those generic records to a RowData - * DataStream. - * - * @param input the generic source input data stream. - * @param mapper function to convert the generic data to {@link RowData} - * @param outputType to define the {@link TypeInformation} for the input data. - * @param the data type of records. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder builderFor( - DataStream input, MapFunction mapper, TypeInformation outputType) { - return new Builder().forMapperOutputType(input, mapper, outputType); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into - * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a - * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. - * - * @param input the source input data stream with {@link Row}s. - * @param tableSchema defines the {@link TypeInformation} for input data. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRow(DataStream input, TableSchema tableSchema) { - RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); - DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); - - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(fieldDataTypes); - return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) - .tableSchema(tableSchema); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s - * into iceberg table. - * - * @param input the source input data stream with {@link RowData}s. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRowData(DataStream input) { - return new Builder().forRowData(input); - } - - public static class Builder { - private Function> inputCreator = null; - private TableLoader tableLoader; - private Table table; - private TableSchema tableSchema; - private Integer writeParallelism = null; - private List equalityFieldColumns = null; - private String uidPrefix = null; - private ReadableConfig readableConfig = new Configuration(); - private final Map writeOptions = Maps.newHashMap(); - private FlinkWriteConf flinkWriteConf = null; - - private Builder() {} - - private Builder forRowData(DataStream newRowDataInput) { - this.inputCreator = ignored -> newRowDataInput; - return this; - } - - private Builder forMapperOutputType( - DataStream input, MapFunction mapper, TypeInformation outputType) { - this.inputCreator = - newUidPrefix -> { - // Input stream order is crucial for some situation(e.g. in cdc case). 
Therefore, we - // need to set the parallelism - // of map operator same as its input to keep map operator chaining its input, and avoid - // rebalanced by default. - SingleOutputStreamOperator inputStream = - input.map(mapper, outputType).setParallelism(input.getParallelism()); - if (newUidPrefix != null) { - inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); - } - return inputStream; - }; - return this; - } - - /** - * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} - * which will write all the records into {@link DataFile}s and emit them to downstream operator. - * Providing a table would avoid so many table loading from each separate task. - * - * @param newTable the loaded iceberg table instance. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - /** - * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need - * this loader because {@link Table} is not serializable and could not just use the loaded table - * from Builder#table in the remote task manager. - * - * @param newTableLoader to load iceberg table inside tasks. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder tableLoader(TableLoader newTableLoader) { - this.tableLoader = newTableLoader; - return this; - } - - /** - * Set the write properties for Flink sink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder set(String property, String value) { - writeOptions.put(property, value); - return this; - } - - /** - * Set the write properties for Flink sink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder setAll(Map properties) { - writeOptions.putAll(properties); - return this; - } - - public Builder tableSchema(TableSchema newTableSchema) { - this.tableSchema = newTableSchema; - return this; - } - - public Builder overwrite(boolean newOverwrite) { - writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - /** - * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink - * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. - * - * @param mode to specify the write distribution mode. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder distributionMode(DistributionMode mode) { - Preconditions.checkArgument( - !DistributionMode.RANGE.equals(mode), - "Flink does not support 'range' write distribution mode now."); - if (mode != null) { - writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); - } - return this; - } - - /** - * Configuring the write parallel number for iceberg stream writer. - * - * @param newWriteParallelism the number of parallel iceberg stream writer. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder writeParallelism(int newWriteParallelism) { - this.writeParallelism = newWriteParallelism; - return this; - } - - /** - * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which - * means it will DELETE the old records and then INSERT the new records. 
In partitioned table, - * the partition fields should be a subset of equality fields, otherwise the old row that - * located in partition-A could not be deleted by the new row that located in partition-B. - * - * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder upsert(boolean enabled) { - writeOptions.put(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), Boolean.toString(enabled)); - return this; - } - - /** - * Configuring the equality field columns for iceberg table that accept CDC or UPSERT events. - * - * @param columns defines the iceberg table's key. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder equalityFieldColumns(List columns) { - this.equalityFieldColumns = columns; - return this; - } - - /** - * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of - * multiple operators (like writer, committer, dummy sink etc.) Actually operator uid will be - * appended with a suffix like "uidPrefix-writer".
- *
- * If provided, this prefix is also applied to operator names.
- *
- * Flink auto generates operator uid if not set explicitly. It is a recommended - * best-practice to set uid for all operators before deploying to production. Flink has an - * option to {@code pipeline.auto-generate-uid=false} to disable auto-generation and force - * explicit setting of all operator uid.
- *
- * Be careful with setting this for an existing job, because now we are changing the operator - * uid from an auto-generated one to this new value. When deploying the change with a - * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more - * specifically the committer operator state). You need to use {@code --allowNonRestoredState} - * to ignore the previous sink state. During restore Flink sink state is used to check if last - * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss - * if the Iceberg commit failed in the last completed checkpoint. - * - * @param newPrefix prefix for Flink sink operator uid and name - * @return {@link Builder} to connect the iceberg table. - */ - public Builder uidPrefix(String newPrefix) { - this.uidPrefix = newPrefix; - return this; - } - - private DataStreamSink chainIcebergOperators() { - Preconditions.checkArgument( - inputCreator != null, - "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); - Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); - - DataStream rowDataInput = inputCreator.apply(uidPrefix); - - if (table == null) { - tableLoader.open(); - try (TableLoader loader = tableLoader) { - this.table = loader.loadTable(); - } catch (IOException e) { - throw new UncheckedIOException( - "Failed to load iceberg table from table loader: " + tableLoader, e); - } - } - - flinkWriteConf = new FlinkWriteConf(table, writeOptions, readableConfig); - - // Find out the equality field id list based on the user-provided equality field column names. - List equalityFieldIds = checkAndGetEqualityFieldIds(); - - // Convert the requested flink table schema to flink row type. - RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema); - - // Distribute the records from input data stream based on the write.distribution-mode and - // equality fields. - DataStream distributeStream = - distributeDataStream( - rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); - - // Add parallel writers that append rows to files - SingleOutputStreamOperator writerStream = - appendWriter(distributeStream, flinkRowType, equalityFieldIds); - - // Add single-parallelism committer that commits files - // after successful checkpoint or end of input - SingleOutputStreamOperator committerStream = appendCommitter(writerStream); - - // Add dummy discard sink - return appendDummySink(committerStream); - } - - /** - * Append the iceberg sink operators to write records to iceberg table. - * - * @return {@link DataStreamSink} for sink. - */ - public DataStreamSink append() { - return chainIcebergOperators(); - } - - private String operatorName(String suffix) { - return uidPrefix != null ? 
uidPrefix + "-" + suffix : suffix; - } - - @VisibleForTesting - List checkAndGetEqualityFieldIds() { - List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); - if (equalityFieldColumns != null && equalityFieldColumns.size() > 0) { - Set equalityFieldSet = - Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); - for (String column : equalityFieldColumns) { - org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); - Preconditions.checkNotNull( - field, - "Missing required equality field column '%s' in table schema %s", - column, - table.schema()); - equalityFieldSet.add(field.fieldId()); - } - - if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { - LOG.warn( - "The configured equality field column IDs {} are not matched with the schema identifier field IDs" - + " {}, use job specified equality field columns as the equality fields by default.", - equalityFieldSet, - table.schema().identifierFieldIds()); - } - equalityFieldIds = Lists.newArrayList(equalityFieldSet); - } - return equalityFieldIds; - } - - @SuppressWarnings("unchecked") - private DataStreamSink appendDummySink( - SingleOutputStreamOperator committerStream) { - DataStreamSink resultStream = - committerStream - .addSink(new DiscardingSink()) - .name(operatorName(String.format("IcebergSink %s", this.table.name()))) - .setParallelism(1); - if (uidPrefix != null) { - resultStream = resultStream.uid(uidPrefix + "-dummysink"); - } - return resultStream; - } - - private SingleOutputStreamOperator appendCommitter( - SingleOutputStreamOperator writerStream) { - IcebergFilesCommitter filesCommitter = - new IcebergFilesCommitter(tableLoader, flinkWriteConf.overwriteMode()); - SingleOutputStreamOperator committerStream = - writerStream - .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) - .setParallelism(1) - .setMaxParallelism(1); - if (uidPrefix != null) { - committerStream = committerStream.uid(uidPrefix + "-committer"); - } - return committerStream; - } - - private SingleOutputStreamOperator appendWriter( - DataStream input, RowType flinkRowType, List equalityFieldIds) { - // Validate the equality fields and partition fields if we enable the upsert mode. - if (flinkWriteConf.upsertMode()) { - Preconditions.checkState( - !flinkWriteConf.overwriteMode(), - "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - Preconditions.checkState( - !equalityFieldIds.isEmpty(), - "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); - if (!table.spec().isUnpartitioned()) { - for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", - partitionField, - equalityFieldColumns); - } - } - } - - IcebergStreamWriter streamWriter = - createStreamWriter(table, flinkWriteConf, flinkRowType, equalityFieldIds); - - int parallelism = writeParallelism == null ? 
input.getParallelism() : writeParallelism; - SingleOutputStreamOperator writerStream = - input - .transform( - operatorName(ICEBERG_STREAM_WRITER_NAME), - TypeInformation.of(WriteResult.class), - streamWriter) - .setParallelism(parallelism); - if (uidPrefix != null) { - writerStream = writerStream.uid(uidPrefix + "-writer"); - } - return writerStream; - } - - private DataStream distributeDataStream( - DataStream input, - List equalityFieldIds, - PartitionSpec partitionSpec, - Schema iSchema, - RowType flinkRowType) { - DistributionMode writeMode = flinkWriteConf.distributionMode(); - - LOG.info("Write distribution mode is '{}'", writeMode.modeName()); - switch (writeMode) { - case NONE: - if (equalityFieldIds.isEmpty()) { - return input; - } else { - LOG.info("Distribute rows by equality fields, because there are equality fields set"); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - - case HASH: - if (equalityFieldIds.isEmpty()) { - if (partitionSpec.isUnpartitioned()) { - LOG.warn( - "Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and table is unpartitioned"); - return input; - } else { - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } else { - if (partitionSpec.isUnpartitioned()) { - LOG.info( - "Distribute rows by equality fields, because there are equality fields set " - + "and table is unpartitioned"); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } else { - for (PartitionField partitionField : partitionSpec.fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, partition field '%s' " - + "should be included in equality fields: '%s'", - partitionField, - equalityFieldColumns); - } - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } - - case RANGE: - if (equalityFieldIds.isEmpty()) { - LOG.warn( - "Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and {}=range is not supported yet in flink", - WRITE_DISTRIBUTION_MODE); - return input; - } else { - LOG.info( - "Distribute rows by equality fields, because there are equality fields set " - + "and{}=range is not supported yet in flink", - WRITE_DISTRIBUTION_MODE); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - - default: - throw new RuntimeException("Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + writeMode); - } - } - } - - static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { - if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing - // iceberg schema. - Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); - TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will - // be promoted to - // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 - // 'byte'), we will - // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here - // we must use flink - // schema. 
- return (RowType) requestedSchema.toRowDataType().getLogicalType(); - } else { - return FlinkSchemaUtil.convert(schema); - } - } - - static IcebergStreamWriter createStreamWriter( - Table table, - FlinkWriteConf flinkWriteConf, - RowType flinkRowType, - List equalityFieldIds) { - Preconditions.checkArgument(table != null, "Iceberg table shouldn't be null"); - - Table serializableTable = SerializableTable.copyOf(table); - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - serializableTable, - flinkRowType, - flinkWriteConf.targetDataFileSize(), - flinkWriteConf.dataFileFormat(), - equalityFieldIds, - flinkWriteConf.upsertMode()); - return new IcebergStreamWriter<>(table.name(), taskWriterFactory); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java deleted file mode 100644 index 11910b06058e..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.SortedMap; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeinfo.BasicTypeInfo; -import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.runtime.typeutils.SortedMapTypeInfo; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ReplacePartitions; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Strings; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -class IcebergFilesCommitter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private static final long serialVersionUID = 1L; - private static final long INITIAL_CHECKPOINT_ID = -1L; - private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; - - private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); - private static final String FLINK_JOB_ID = "flink.job-id"; - - // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always - // increasing, so we could - // correctly commit all the data files whose checkpoint id is greater than the max committed one - // to iceberg table, for - // avoiding committing the same data files twice. This id will be attached to iceberg's meta when - // committing the - // iceberg transaction. - private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; - static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; - - // TableLoader to load iceberg table lazily. - private final TableLoader tableLoader; - private final boolean replacePartitions; - - // A sorted map to maintain the completed data files for each pending checkpointId (which have not - // been committed - // to iceberg table). We need a sorted map here because there's possible that few checkpoints - // snapshot failed, for - // example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 - // data files - // <2, >. 
Snapshot for checkpoint#1 interrupted because of network/disk failure etc, while - // we don't expect - // any data loss in iceberg table. So we keep the finished files <1, > in memory and - // retry to commit - // iceberg table when the next checkpoint happen. - private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); - - // The completed files cache for current checkpoint. Once the snapshot barrier received, it will - // be flushed to the - // 'dataFilesPerCheckpoint'. - private final List writeResultsOfCurrentCkpt = Lists.newArrayList(); - - // It will have an unique identifier for one job. - private transient String flinkJobId; - private transient Table table; - private transient ManifestOutputFileFactory manifestOutputFileFactory; - private transient long maxCommittedCheckpointId; - private transient int continuousEmptyCheckpoints; - private transient int maxContinuousEmptyCommits; - // There're two cases that we restore from flink checkpoints: the first case is restoring from - // snapshot created by the - // same flink job; another case is restoring from snapshot created by another different job. For - // the second case, we - // need to maintain the old flink job's id in flink state backend to find the - // max-committed-checkpoint-id when - // traversing iceberg table's snapshots. - private static final ListStateDescriptor JOB_ID_DESCRIPTOR = - new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); - private transient ListState jobIdState; - // All pending checkpoints states for this function. - private static final ListStateDescriptor> STATE_DESCRIPTOR = - buildStateDescriptor(); - private transient ListState> checkpointsState; - - IcebergFilesCommitter(TableLoader tableLoader, boolean replacePartitions) { - this.tableLoader = tableLoader; - this.replacePartitions = replacePartitions; - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); - - // Open the table loader and load the table. - this.tableLoader.open(); - this.table = tableLoader.loadTable(); - - maxContinuousEmptyCommits = - PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument( - maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); - - int subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - int attemptId = getRuntimeContext().getAttemptNumber(); - this.manifestOutputFileFactory = - FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, subTaskId, attemptId); - this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; - - this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); - this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); - if (context.isRestored()) { - Iterable jobIdIterable = jobIdState.get(); - if (jobIdIterable == null || !jobIdIterable.iterator().hasNext()) { - LOG.warn( - "Failed to restore committer state. This can happen when operator uid changed and Flink " - + "allowNonRestoredState is enabled. Best practice is to explicitly set the operator id " - + "via FlinkSink#Builder#uidPrefix() so that the committer operator uid is stable. " - + "Otherwise, Flink auto generate an operator uid based on job topology." 
- + "With that, operator uid is subjective to change upon topology change."); - return; - } - - String restoredFlinkJobId = jobIdIterable.iterator().next(); - Preconditions.checkState( - !Strings.isNullOrEmpty(restoredFlinkJobId), - "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); - - // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new - // flink job even if - // it's restored from a snapshot created by another different flink job, so it's safe to - // assign the max committed - // checkpoint id from restored flink job to the current flink job. - this.maxCommittedCheckpointId = getMaxCommittedCheckpointId(table, restoredFlinkJobId); - - NavigableMap uncommittedDataFiles = - Maps.newTreeMap(checkpointsState.get().iterator().next()) - .tailMap(maxCommittedCheckpointId, false); - if (!uncommittedDataFiles.isEmpty()) { - // Committed all uncommitted data files from the old flink job to iceberg table. - long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); - commitUpToCheckpoint(uncommittedDataFiles, restoredFlinkJobId, maxUncommittedCheckpointId); - } - } - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - long checkpointId = context.getCheckpointId(); - LOG.info( - "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", - table, - checkpointId); - - // Update the checkpoint state. - dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)); - // Reset the snapshot state to the latest state. - checkpointsState.clear(); - checkpointsState.add(dataFilesPerCheckpoint); - - jobIdState.clear(); - jobIdState.add(flinkJobId); - - // Clear the local buffer for current checkpoint. - writeResultsOfCurrentCkpt.clear(); - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - super.notifyCheckpointComplete(checkpointId); - // It's possible that we have the following events: - // 1. snapshotState(ckpId); - // 2. snapshotState(ckpId+1); - // 3. notifyCheckpointComplete(ckpId+1); - // 4. notifyCheckpointComplete(ckpId); - // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all - // the files, - // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. - if (checkpointId > maxCommittedCheckpointId) { - commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, checkpointId); - this.maxCommittedCheckpointId = checkpointId; - } - } - - private void commitUpToCheckpoint( - NavigableMap deltaManifestsMap, String newFlinkJobId, long checkpointId) - throws IOException { - NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); - List manifests = Lists.newArrayList(); - NavigableMap pendingResults = Maps.newTreeMap(); - for (Map.Entry e : pendingMap.entrySet()) { - if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) { - // Skip the empty flink manifest. - continue; - } - - DeltaManifests deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, e.getValue()); - pendingResults.put( - e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io())); - manifests.addAll(deltaManifests.manifests()); - } - - int totalFiles = - pendingResults.values().stream() - .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length) - .sum(); - continuousEmptyCheckpoints = totalFiles == 0 ? 
continuousEmptyCheckpoints + 1 : 0; - if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { - if (replacePartitions) { - replacePartitions(pendingResults, newFlinkJobId, checkpointId); - } else { - commitDeltaTxn(pendingResults, newFlinkJobId, checkpointId); - } - continuousEmptyCheckpoints = 0; - } - pendingMap.clear(); - - // Delete the committed manifests. - for (ManifestFile manifest : manifests) { - try { - table.io().deleteFile(manifest.path()); - } catch (Exception e) { - // The flink manifests cleaning failure shouldn't abort the completed checkpoint. - String details = - MoreObjects.toStringHelper(this) - .add("flinkJobId", newFlinkJobId) - .add("checkpointId", checkpointId) - .add("manifestPath", manifest.path()) - .toString(); - LOG.warn( - "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", - details, - e); - } - } - } - - private void replacePartitions( - NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { - // Partition overwrite does not support delete files. - int deleteFilesNum = - pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); - Preconditions.checkState(deleteFilesNum == 0, "Cannot overwrite partitions with delete files."); - - // Commit the overwrite transaction. - ReplacePartitions dynamicOverwrite = table.newReplacePartitions(); - - int numFiles = 0; - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, "Should have no referenced data files."); - - numFiles += result.dataFiles().length; - Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); - } - - commitOperation( - dynamicOverwrite, numFiles, 0, "dynamic partition overwrite", newFlinkJobId, checkpointId); - } - - private void commitDeltaTxn( - NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { - int deleteFilesNum = - pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); - - if (deleteFilesNum == 0) { - // To be compatible with iceberg format V1. - AppendFiles appendFiles = table.newAppend(); - - int numFiles = 0; - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, "Should have no referenced data files."); - - numFiles += result.dataFiles().length; - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - } - - commitOperation(appendFiles, numFiles, 0, "append", newFlinkJobId, checkpointId); - } else { - // To be compatible with iceberg format V2. - for (Map.Entry e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential - // transaction txn1 and - // txn2, the equality-delete files of txn2 are required to be applied to data files from - // txn1. Committing the - // merged one will lead to the incorrect delete semantic. - WriteResult result = e.getValue(); - - // Row delta validations are not needed for streaming changes that write equality deletes. - // Equality deletes - // are applied to data in all previous sequence numbers, so retries may push deletes further - // in the future, - // but do not affect correctness. Position deletes committed to the table in this path are - // used only to delete - // rows from data files that are being added in this commit. 
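// Editorial aside, not part of the patch: a compile-level sketch of the per-checkpoint commit
// order that the comment above motivates. Results are committed as one RowDelta per checkpoint,
// in ascending checkpoint order, so the equality deletes of a later transaction are applied on
// top of the data files of an earlier one; merging everything into a single commit would lose
// that ordering. Illustrative only; the table and pending results are assumed to come from the
// caller.
import java.util.Arrays;
import java.util.Map;
import java.util.NavigableMap;
import org.apache.iceberg.RowDelta;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.WriteResult;

class OrderedRowDeltaCommitSketch {
  static void commitInOrder(Table table, NavigableMap<Long, WriteResult> pendingResults) {
    for (Map.Entry<Long, WriteResult> entry : pendingResults.entrySet()) { // ascending key order
      RowDelta rowDelta = table.newRowDelta();
      Arrays.stream(entry.getValue().dataFiles()).forEach(rowDelta::addRows);
      Arrays.stream(entry.getValue().deleteFiles()).forEach(rowDelta::addDeletes);
      rowDelta.commit(); // one snapshot per checkpoint keeps the delete ordering intact
    }
  }
}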
There is no way for data files - // added along with - // the delete files to be concurrently removed, so there is no need to validate the files - // referenced by the - // position delete files that are being committed. - RowDelta rowDelta = table.newRowDelta(); - - int numDataFiles = result.dataFiles().length; - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - - int numDeleteFiles = result.deleteFiles().length; - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - - commitOperation( - rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey()); - } - } - } - - private void commitOperation( - SnapshotUpdate operation, - int numDataFiles, - int numDeleteFiles, - String description, - String newFlinkJobId, - long checkpointId) { - LOG.info( - "Committing {} with {} data files and {} delete files to table {}", - description, - numDataFiles, - numDeleteFiles, - table); - operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); - operation.set(FLINK_JOB_ID, newFlinkJobId); - - long start = System.currentTimeMillis(); - operation.commit(); // abort is automatically called if this fails. - long duration = System.currentTimeMillis() - start; - LOG.info("Committed in {} ms", duration); - } - - @Override - public void processElement(StreamRecord element) { - this.writeResultsOfCurrentCkpt.add(element.getValue()); - } - - @Override - public void endInput() throws IOException { - // Flush the buffered data files into 'dataFilesPerCheckpoint' firstly. - long currentCheckpointId = Long.MAX_VALUE; - dataFilesPerCheckpoint.put(currentCheckpointId, writeToManifest(currentCheckpointId)); - writeResultsOfCurrentCkpt.clear(); - - commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, currentCheckpointId); - } - - /** - * Write all the complete data files to a newly created manifest file and return the manifest's - * avro serialized bytes. - */ - private byte[] writeToManifest(long checkpointId) throws IOException { - if (writeResultsOfCurrentCkpt.isEmpty()) { - return EMPTY_MANIFEST_DATA; - } - - WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build(); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - result, () -> manifestOutputFileFactory.create(checkpointId), table.spec()); - - return SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, deltaManifests); - } - - @Override - public void dispose() throws Exception { - if (tableLoader != null) { - tableLoader.close(); - } - } - - private static ListStateDescriptor> buildStateDescriptor() { - Comparator longComparator = Comparators.forType(Types.LongType.get()); - // Construct a SortedMapTypeInfo. 
- SortedMapTypeInfo sortedMapTypeInfo = - new SortedMapTypeInfo<>( - BasicTypeInfo.LONG_TYPE_INFO, - PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, - longComparator); - return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); - } - - static long getMaxCommittedCheckpointId(Table table, String flinkJobId) { - Snapshot snapshot = table.currentSnapshot(); - long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID; - - while (snapshot != null) { - Map summary = snapshot.summary(); - String snapshotFlinkJobId = summary.get(FLINK_JOB_ID); - if (flinkJobId.equals(snapshotFlinkJobId)) { - String value = summary.get(MAX_COMMITTED_CHECKPOINT_ID); - if (value != null) { - lastCommittedCheckpointId = Long.parseLong(value); - break; - } - } - Long parentSnapshotId = snapshot.parentId(); - snapshot = parentSnapshotId != null ? table.snapshot(parentSnapshotId) : null; - } - - return lastCommittedCheckpointId; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java deleted file mode 100644 index de70ac4c4643..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.ChainingStrategy; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -class IcebergStreamWriter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private static final long serialVersionUID = 1L; - - private final String fullTableName; - private final TaskWriterFactory taskWriterFactory; - - private transient TaskWriter writer; - private transient int subTaskId; - private transient int attemptId; - - IcebergStreamWriter(String fullTableName, TaskWriterFactory taskWriterFactory) { - this.fullTableName = fullTableName; - this.taskWriterFactory = taskWriterFactory; - setChainingStrategy(ChainingStrategy.ALWAYS); - } - - @Override - public void open() { - this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getAttemptNumber(); - - // Initialize the task writer factory. 
- this.taskWriterFactory.initialize(subTaskId, attemptId); - - // Initialize the task writer. - this.writer = taskWriterFactory.create(); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - flush(); - this.writer = taskWriterFactory.create(); - } - - @Override - public void processElement(StreamRecord element) throws Exception { - writer.write(element.getValue()); - } - - @Override - public void dispose() throws Exception { - super.dispose(); - if (writer != null) { - writer.close(); - writer = null; - } - } - - @Override - public void endInput() throws IOException { - // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the - // remaining completed files to downstream before closing the writer so that we won't miss any - // of them. - // Note that if the task is not closed after calling endInput, checkpoint may be triggered again - // causing files to be sent repeatedly, the writer is marked as null after the last file is sent - // to guard against duplicated writes. - flush(); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("table_name", fullTableName) - .add("subtask_id", subTaskId) - .add("attempt_id", attemptId) - .toString(); - } - - /** close all open files and emit files to downstream committer operator */ - private void flush() throws IOException { - if (writer == null) { - return; - } - - WriteResult result = writer.complete(); - output.collect(new StreamRecord<>(result)); - - // Set writer to null to prevent duplicate flushes in the corner case of - // prepareSnapshotPreBarrier happening after endInput. - writer = null; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java deleted file mode 100644 index a5f5adef7dad..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.base.Strings; - -class ManifestOutputFileFactory { - // Users could define their own flink manifests directory by setting this value in table - // properties. 
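// Editorial aside, not part of the patch: a minimal sketch of the "flush once" guard used by
// IcebergStreamWriter above — the writer reference is nulled out after a flush, so endInput()
// followed by another prepareSnapshotPreBarrier() (or vice versa) cannot emit the same completed
// files twice. The Emitter type and StringBuilder stand-in are made up for illustration.
class FlushOnceSketch {
  interface Emitter {
    void emit(String completedFiles);
  }

  private StringBuilder writer = new StringBuilder(); // stands in for the real TaskWriter

  void write(String record) {
    if (writer == null) {
      writer = new StringBuilder(); // lazily recreate the writer for the next checkpoint's records
    }
    writer.append(record).append(';');
  }

  void flush(Emitter out) {
    if (writer == null) {
      return; // already flushed, nothing to emit
    }
    out.emit(writer.toString());
    writer = null; // guard against emitting the same completed files twice
  }

  public static void main(String[] args) {
    FlushOnceSketch sketch = new FlushOnceSketch();
    sketch.write("row-1");
    sketch.flush(System.out::println); // emits the completed "files" once
    sketch.flush(System.out::println); // second flush is a no-op
  }
}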
- static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; - - private final TableOperations ops; - private final FileIO io; - private final Map props; - private final String flinkJobId; - private final int subTaskId; - private final long attemptNumber; - private final AtomicInteger fileCount = new AtomicInteger(0); - - ManifestOutputFileFactory( - TableOperations ops, - FileIO io, - Map props, - String flinkJobId, - int subTaskId, - long attemptNumber) { - this.ops = ops; - this.io = io; - this.props = props; - this.flinkJobId = flinkJobId; - this.subTaskId = subTaskId; - this.attemptNumber = attemptNumber; - } - - private String generatePath(long checkpointId) { - return FileFormat.AVRO.addExtension( - String.format( - "%s-%05d-%d-%d-%05d", - flinkJobId, subTaskId, attemptNumber, checkpointId, fileCount.incrementAndGet())); - } - - OutputFile create(long checkpointId) { - String flinkManifestDir = props.get(FLINK_MANIFEST_LOCATION); - - String newManifestFullPath; - if (Strings.isNullOrEmpty(flinkManifestDir)) { - // User don't specify any flink manifest directory, so just use the default metadata path. - newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); - } else { - newManifestFullPath = - String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); - } - - return io.newOutputFile(newManifestFullPath); - } - - private static String stripTrailingSlash(String path) { - String result = path; - while (result.endsWith("/")) { - result = result.substring(0, result.length() - 1); - } - return result; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java deleted file mode 100644 index df951684b446..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; - -/** - * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be - * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy - * for {@link FlinkSink}. 
- */ -class PartitionKeySelector implements KeySelector { - - private final Schema schema; - private final PartitionKey partitionKey; - private final RowType flinkSchema; - - private transient RowDataWrapper rowDataWrapper; - - PartitionKeySelector(PartitionSpec spec, Schema schema, RowType flinkSchema) { - this.schema = schema; - this.partitionKey = new PartitionKey(spec, schema); - this.flinkSchema = flinkSchema; - } - - /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not - * serializable. In this way, we don't have to serialize them with forcing. - */ - private RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - return rowDataWrapper; - } - - @Override - public String getKey(RowData row) { - partitionKey.partition(lazyRowDataWrapper().wrap(row)); - return partitionKey.toPath(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java deleted file mode 100644 index 38062dd1a2c4..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Tasks; - -class PartitionedDeltaWriter extends BaseDeltaTaskWriter { - - private final PartitionKey partitionKey; - - private final Map writers = Maps.newHashMap(); - - PartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - flinkSchema, - equalityFieldIds, - upsert); - this.partitionKey = new PartitionKey(spec, schema); - } - - @Override - RowDataDeltaWriter route(RowData row) { - partitionKey.partition(wrapper().wrap(row)); - - RowDataDeltaWriter writer = writers.get(partitionKey); - if (writer == null) { - // NOTICE: we need to copy a new partition key here, in case of messing up the keys in - // writers. - PartitionKey copiedKey = partitionKey.copy(); - writer = new RowDataDeltaWriter(copiedKey); - writers.put(copiedKey, writer); - } - - return writer; - } - - @Override - public void close() { - try { - Tasks.foreach(writers.values()) - .throwFailureWhenFinished() - .noRetry() - .run(RowDataDeltaWriter::close, IOException.class); - - writers.clear(); - } catch (IOException e) { - throw new UncheckedIOException("Failed to close equality delta writer", e); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java deleted file mode 100644 index 1c330434d019..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.PartitionedFanoutWriter; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.UnpartitionedWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.ArrayUtil; - -public class RowDataTaskWriterFactory implements TaskWriterFactory { - private final Table table; - private final Schema schema; - private final RowType flinkSchema; - private final PartitionSpec spec; - private final FileIO io; - private final long targetFileSizeBytes; - private final FileFormat format; - private final List equalityFieldIds; - private final boolean upsert; - private final FileAppenderFactory appenderFactory; - - private transient OutputFileFactory outputFileFactory; - - public RowDataTaskWriterFactory( - Table table, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - List equalityFieldIds, - boolean upsert) { - this.table = table; - this.schema = table.schema(); - this.flinkSchema = flinkSchema; - this.spec = table.spec(); - this.io = table.io(); - this.targetFileSizeBytes = targetFileSizeBytes; - this.format = format; - this.equalityFieldIds = equalityFieldIds; - this.upsert = upsert; - - if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - this.appenderFactory = - new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); - } else if (upsert) { - // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of - // the inserted row - // may differ from the deleted row other than the primary key fields, and the delete file must - // contain values - // that are correct for the deleted row. Therefore, only write the equality delete fields. - this.appenderFactory = - new FlinkAppenderFactory( - schema, - flinkSchema, - table.properties(), - spec, - ArrayUtil.toIntArray(equalityFieldIds), - TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), - null); - } else { - this.appenderFactory = - new FlinkAppenderFactory( - schema, - flinkSchema, - table.properties(), - spec, - ArrayUtil.toIntArray(equalityFieldIds), - schema, - null); - } - } - - @Override - public void initialize(int taskId, int attemptId) { - this.outputFileFactory = - OutputFileFactory.builderFor(table, taskId, attemptId).format(format).build(); - } - - @Override - public TaskWriter create() { - Preconditions.checkNotNull( - outputFileFactory, - "The outputFileFactory shouldn't be null if we have invoked the initialize()."); - - if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - // Initialize a task writer to write INSERT only. 
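// Editorial aside, not part of the patch: a sketch of how the factory built above is driven by
// the writer operator — initialize once per subtask, create a TaskWriter, write rows, then
// complete() to obtain the data/delete files handed to the committer. Compile-level only; the
// factory, rows and ids are assumed to be supplied by the caller, and the generic parameter is
// taken to be the Flink row type.
import java.io.IOException;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.sink.TaskWriterFactory;
import org.apache.iceberg.io.TaskWriter;
import org.apache.iceberg.io.WriteResult;

class TaskWriterUsageSketch {
  static WriteResult writeAll(
      TaskWriterFactory<RowData> factory, Iterable<RowData> rows, int subTaskId, int attemptId)
      throws IOException {
    factory.initialize(subTaskId, attemptId);
    TaskWriter<RowData> writer = factory.create();
    for (RowData row : rows) {
      writer.write(row);
    }
    // complete() closes the open files and returns the data/delete files for the committer.
    return writer.complete();
  }
}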
- if (spec.isUnpartitioned()) { - return new UnpartitionedWriter<>( - spec, format, appenderFactory, outputFileFactory, io, targetFileSizeBytes); - } else { - return new RowDataPartitionedFanoutWriter( - spec, - format, - appenderFactory, - outputFileFactory, - io, - targetFileSizeBytes, - schema, - flinkSchema); - } - } else { - // Initialize a task writer to write both INSERT and equality DELETE. - if (spec.isUnpartitioned()) { - return new UnpartitionedDeltaWriter( - spec, - format, - appenderFactory, - outputFileFactory, - io, - targetFileSizeBytes, - schema, - flinkSchema, - equalityFieldIds, - upsert); - } else { - return new PartitionedDeltaWriter( - spec, - format, - appenderFactory, - outputFileFactory, - io, - targetFileSizeBytes, - schema, - flinkSchema, - equalityFieldIds, - upsert); - } - } - } - - private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWriter { - - private final PartitionKey partitionKey; - private final RowDataWrapper rowDataWrapper; - - RowDataPartitionedFanoutWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.partitionKey = new PartitionKey(spec, schema); - this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - - @Override - protected PartitionKey partition(RowData row) { - partitionKey.partition(rowDataWrapper.wrap(row)); - return partitionKey; - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java deleted file mode 100644 index e3a1245e8cbd..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.Serializable; -import org.apache.iceberg.io.TaskWriter; - -/** - * Factory to create {@link TaskWriter} - * - * @param data type of record. - */ -public interface TaskWriterFactory extends Serializable { - - /** - * Initialize the factory with a given taskId and attemptId. - * - * @param taskId the identifier of task. - * @param attemptId the attempt id of this task. - */ - void initialize(int taskId, int attemptId); - - /** - * Initialize a {@link TaskWriter} with given task id and attempt id. - * - * @return a newly created task writer. 
- */ - TaskWriter create(); -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java deleted file mode 100644 index 7680fb933b20..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; - -class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { - private final RowDataDeltaWriter writer; - - UnpartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - flinkSchema, - equalityFieldIds, - upsert); - this.writer = new RowDataDeltaWriter(null); - } - - @Override - RowDataDeltaWriter route(RowData row) { - return writer; - } - - @Override - public void close() throws IOException { - writer.close(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java deleted file mode 100644 index 85c848b3d8ea..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Iterator; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.FileIO; - -/** - * Flink data iterator that reads {@link CombinedScanTask} into a {@link CloseableIterator} - * - * @param is the output data type returned by this iterator. - */ -@Internal -public class DataIterator implements CloseableIterator { - - private final FileScanTaskReader fileScanTaskReader; - - private final InputFilesDecryptor inputFilesDecryptor; - private Iterator tasks; - private CloseableIterator currentIterator; - - public DataIterator( - FileScanTaskReader fileScanTaskReader, - CombinedScanTask task, - FileIO io, - EncryptionManager encryption) { - this.fileScanTaskReader = fileScanTaskReader; - - this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); - this.tasks = task.files().iterator(); - this.currentIterator = CloseableIterator.empty(); - } - - @Override - public boolean hasNext() { - updateCurrentIterator(); - return currentIterator.hasNext(); - } - - @Override - public T next() { - updateCurrentIterator(); - return currentIterator.next(); - } - - /** Updates the current iterator field to ensure that the current Iterator is not exhausted. */ - private void updateCurrentIterator() { - try { - while (!currentIterator.hasNext() && tasks.hasNext()) { - currentIterator.close(); - currentIterator = openTaskIterator(tasks.next()); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - private CloseableIterator openTaskIterator(FileScanTask scanTask) { - return fileScanTaskReader.open(scanTask, inputFilesDecryptor); - } - - @Override - public void close() throws IOException { - // close the current iterator - currentIterator.close(); - tasks = null; - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java deleted file mode 100644 index 927a804a4792..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.Serializable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; - -/** - * Read a {@link FileScanTask} into a {@link CloseableIterator} - * - * @param is the output data type returned by this iterator. - */ -@Internal -public interface FileScanTaskReader extends Serializable { - CloseableIterator open(FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor); -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java deleted file mode 100644 index 6f8d6e3461aa..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import org.apache.flink.api.common.io.DefaultInputSplitAssigner; -import org.apache.flink.api.common.io.InputFormat; -import org.apache.flink.api.common.io.LocatableInputSplitAssigner; -import org.apache.flink.api.common.io.RichInputFormat; -import org.apache.flink.api.common.io.statistics.BaseStatistics; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.io.InputSplitAssigner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; - -/** Flink {@link InputFormat} for Iceberg. 
*/ -public class FlinkInputFormat extends RichInputFormat { - - private static final long serialVersionUID = 1L; - - private final TableLoader tableLoader; - private final FileIO io; - private final EncryptionManager encryption; - private final ScanContext context; - private final RowDataFileScanTaskReader rowDataReader; - - private transient DataIterator iterator; - private transient long currentReadCount = 0L; - - FlinkInputFormat( - TableLoader tableLoader, - Schema tableSchema, - FileIO io, - EncryptionManager encryption, - ScanContext context) { - this.tableLoader = tableLoader; - this.io = io; - this.encryption = encryption; - this.context = context; - this.rowDataReader = - new RowDataFileScanTaskReader( - tableSchema, context.project(), context.nameMapping(), context.caseSensitive()); - } - - @VisibleForTesting - Schema projectedSchema() { - return context.project(); - } - - @Override - public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { - // Legacy method, not be used. - return null; - } - - @Override - public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException { - // Called in Job manager, so it is OK to load table from catalog. - tableLoader.open(); - try (TableLoader loader = tableLoader) { - Table table = loader.loadTable(); - return FlinkSplitGenerator.createInputSplits(table, context); - } - } - - @Override - public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { - return context.exposeLocality() - ? new LocatableInputSplitAssigner(inputSplits) - : new DefaultInputSplitAssigner(inputSplits); - } - - @Override - public void configure(Configuration parameters) {} - - @Override - public void open(FlinkInputSplit split) { - this.iterator = new DataIterator<>(rowDataReader, split.getTask(), io, encryption); - } - - @Override - public boolean reachedEnd() { - if (context.limit() > 0 && currentReadCount >= context.limit()) { - return true; - } else { - return !iterator.hasNext(); - } - } - - @Override - public RowData nextRecord(RowData reuse) { - currentReadCount++; - return iterator.next(); - } - - @Override - public void close() throws IOException { - if (iterator != null) { - iterator.close(); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java deleted file mode 100644 index 16fd4f39596c..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import javax.annotation.Nullable; -import org.apache.flink.core.io.LocatableInputSplit; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -public class FlinkInputSplit extends LocatableInputSplit { - - private final CombinedScanTask task; - - FlinkInputSplit(int splitNumber, CombinedScanTask task, @Nullable String[] hostnames) { - super(splitNumber, hostnames); - this.task = task; - } - - CombinedScanTask getTask() { - return task; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("splitNumber", getSplitNumber()) - .add("task", task) - .add("hosts", Arrays.toString(getHostnames())) - .toString(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java deleted file mode 100644 index 7ad81a7e9e2b..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.Set; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.config.ExecutionConfigOptions; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class FlinkSource { - private static final Logger LOG = LoggerFactory.getLogger(FlinkSource.class); - - private FlinkSource() {} - - /** - * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link - * TableScan}. See more options in {@link ScanContext}. - * - *

- * The Source can read static data in bounded mode. It can also continuously check the
- * arrival of new data and read records incrementally.
- *
- *   • Without startSnapshotId: Bounded
- *   • With startSnapshotId and with endSnapshotId: Bounded
- *   • With startSnapshotId (-1 means unbounded preceding) and without endSnapshotId: Unbounded
- *
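// Editorial aside, not part of the patch: a sketch of using the builder the list above refers to
// for a plain bounded read. The warehouse path is made up and the environment is assumed to be
// created by the caller.
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

class BoundedReadSketch {
  static DataStream<RowData> boundedRead(StreamExecutionEnvironment env, String tablePath) {
    return FlinkSource.forRowData()
        .env(env)
        .tableLoader(TableLoader.fromHadoopTable(tablePath))
        .streaming(false) // no startSnapshotId, so the source is bounded, per the list above
        .build();
  }
}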

- * - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRowData() { - return new Builder(); - } - - /** Source builder to build {@link DataStream}. */ - public static class Builder { - private static final Set FILE_SYSTEM_SUPPORT_LOCALITY = ImmutableSet.of("hdfs"); - - private StreamExecutionEnvironment env; - private Table table; - private TableLoader tableLoader; - private TableSchema projectedSchema; - private ReadableConfig readableConfig = new Configuration(); - private final ScanContext.Builder contextBuilder = ScanContext.builder(); - private Boolean exposeLocality; - - public Builder tableLoader(TableLoader newLoader) { - this.tableLoader = newLoader; - return this; - } - - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - public Builder env(StreamExecutionEnvironment newEnv) { - this.env = newEnv; - return this; - } - - public Builder filters(List filters) { - contextBuilder.filters(filters); - return this; - } - - public Builder project(TableSchema schema) { - this.projectedSchema = schema; - return this; - } - - public Builder limit(long newLimit) { - contextBuilder.limit(newLimit); - return this; - } - - public Builder properties(Map properties) { - contextBuilder.fromProperties(properties); - return this; - } - - public Builder caseSensitive(boolean caseSensitive) { - contextBuilder.caseSensitive(caseSensitive); - return this; - } - - public Builder snapshotId(Long snapshotId) { - contextBuilder.useSnapshotId(snapshotId); - return this; - } - - public Builder startSnapshotId(Long startSnapshotId) { - contextBuilder.startSnapshotId(startSnapshotId); - return this; - } - - public Builder endSnapshotId(Long endSnapshotId) { - contextBuilder.endSnapshotId(endSnapshotId); - return this; - } - - public Builder asOfTimestamp(Long asOfTimestamp) { - contextBuilder.asOfTimestamp(asOfTimestamp); - return this; - } - - public Builder splitSize(Long splitSize) { - contextBuilder.splitSize(splitSize); - return this; - } - - public Builder splitLookback(Integer splitLookback) { - contextBuilder.splitLookback(splitLookback); - return this; - } - - public Builder splitOpenFileCost(Long splitOpenFileCost) { - contextBuilder.splitOpenFileCost(splitOpenFileCost); - return this; - } - - public Builder streaming(boolean streaming) { - contextBuilder.streaming(streaming); - return this; - } - - public Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder nameMapping(String nameMapping) { - contextBuilder.nameMapping(nameMapping); - return this; - } - - public Builder monitorInterval(Duration interval) { - contextBuilder.monitorInterval(interval); - return this; - } - - public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - contextBuilder.maxPlanningSnapshotCount(newMaxPlanningSnapshotCount); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - public FlinkInputFormat buildFormat() { - Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); - - Schema icebergSchema; - FileIO io; - EncryptionManager encryption; - if (table == null) { - // load required fields by table loader. 
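// Editorial aside, not part of the patch: a sketch of the lazy table-loading step performed above
// when no Table instance was supplied to the builder — the loader is opened and closed around a
// single loadTable() call so the schema (and FileIO/EncryptionManager) can be captured without
// keeping the catalog connection open. The loader is assumed to come from the caller.
import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

class LazyTableLoadSketch {
  static Schema loadSchemaOnce(TableLoader tableLoader) {
    tableLoader.open();
    try (TableLoader loader = tableLoader) {
      Table table = loader.loadTable();
      return table.schema();
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }
}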
- tableLoader.open(); - try (TableLoader loader = tableLoader) { - table = loader.loadTable(); - icebergSchema = table.schema(); - io = table.io(); - encryption = table.encryption(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } else { - icebergSchema = table.schema(); - io = table.io(); - encryption = table.encryption(); - } - - if (projectedSchema == null) { - contextBuilder.project(icebergSchema); - } else { - contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedSchema)); - } - contextBuilder.exposeLocality(localityEnabled()); - - return new FlinkInputFormat( - tableLoader, icebergSchema, io, encryption, contextBuilder.build()); - } - - public DataStream build() { - Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); - FlinkInputFormat format = buildFormat(); - - ScanContext context = contextBuilder.build(); - TypeInformation typeInfo = - FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); - - if (!context.isStreaming()) { - int parallelism = inferParallelism(format, context); - if (env.getMaxParallelism() > 0) { - parallelism = Math.min(parallelism, env.getMaxParallelism()); - } - return env.createInput(format, typeInfo).setParallelism(parallelism); - } else { - StreamingMonitorFunction function = new StreamingMonitorFunction(tableLoader, context); - - String monitorFunctionName = String.format("Iceberg table (%s) monitor", table); - String readerOperatorName = String.format("Iceberg table (%s) reader", table); - - return env.addSource(function, monitorFunctionName) - .transform(readerOperatorName, typeInfo, StreamingReaderOperator.factory(format)); - } - } - - int inferParallelism(FlinkInputFormat format, ScanContext context) { - int parallelism = - readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); - if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { - int maxInferParallelism = - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); - Preconditions.checkState( - maxInferParallelism >= 1, - FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() - + " cannot be less than 1"); - int splitNum; - try { - FlinkInputSplit[] splits = format.createInputSplits(0); - splitNum = splits.length; - } catch (IOException e) { - throw new UncheckedIOException( - "Failed to create iceberg input splits for table: " + table, e); - } - - parallelism = Math.min(splitNum, maxInferParallelism); - } - - if (context.limit() > 0) { - int limit = - context.limit() >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) context.limit(); - parallelism = Math.min(parallelism, limit); - } - - // parallelism must be positive. - parallelism = Math.max(1, parallelism); - return parallelism; - } - - private boolean localityEnabled() { - Boolean localityEnabled = - this.exposeLocality != null - ? 
this.exposeLocality - : readableConfig.get( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); - - if (localityEnabled != null && !localityEnabled) { - return false; - } - - FileIO fileIO = table.io(); - if (fileIO instanceof HadoopFileIO) { - HadoopFileIO hadoopFileIO = (HadoopFileIO) fileIO; - try { - String scheme = - new Path(table.location()).getFileSystem(hadoopFileIO.getConf()).getScheme(); - return FILE_SYSTEM_SUPPORT_LOCALITY.contains(scheme); - } catch (IOException e) { - LOG.warn( - "Failed to determine whether the locality information can be exposed for table: {}", - table, - e); - } - } - - return false; - } - } - - public static boolean isBounded(Map properties) { - return !ScanContext.builder().fromProperties(properties).build().isStreaming(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java deleted file mode 100644 index 2473d167ff68..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Tasks; -import org.apache.iceberg.util.ThreadPools; - -class FlinkSplitGenerator { - private FlinkSplitGenerator() {} - - static FlinkInputSplit[] createInputSplits(Table table, ScanContext context) { - List tasks = tasks(table, context); - FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; - boolean exposeLocality = context.exposeLocality(); - - Tasks.range(tasks.size()) - .stopOnFailure() - .executeWith(exposeLocality ? 
ThreadPools.getWorkerPool() : null) - .run( - index -> { - CombinedScanTask task = tasks.get(index); - String[] hostnames = null; - if (exposeLocality) { - hostnames = Util.blockLocations(table.io(), task); - } - splits[index] = new FlinkInputSplit(index, task, hostnames); - }); - return splits; - } - - private static List tasks(Table table, ScanContext context) { - TableScan scan = - table.newScan().caseSensitive(context.caseSensitive()).project(context.project()); - - if (context.snapshotId() != null) { - scan = scan.useSnapshot(context.snapshotId()); - } - - if (context.asOfTimestamp() != null) { - scan = scan.asOfTime(context.asOfTimestamp()); - } - - if (context.startSnapshotId() != null) { - if (context.endSnapshotId() != null) { - scan = scan.appendsBetween(context.startSnapshotId(), context.endSnapshotId()); - } else { - scan = scan.appendsAfter(context.startSnapshotId()); - } - } - - if (context.splitSize() != null) { - scan = scan.option(TableProperties.SPLIT_SIZE, context.splitSize().toString()); - } - - if (context.splitLookback() != null) { - scan = scan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); - } - - if (context.splitOpenFileCost() != null) { - scan = - scan.option(TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); - } - - if (context.filters() != null) { - for (Expression filter : context.filters()) { - scan = scan.filter(filter); - } - } - - try (CloseableIterable tasksIterable = scan.planTasks()) { - return Lists.newArrayList(tasksIterable); - } catch (IOException e) { - throw new UncheckedIOException("Failed to close table scan: " + scan, e); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java deleted file mode 100644 index 5fada27d5471..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DeleteFilter; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.FlinkAvroReader; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.flink.data.FlinkParquetReaders; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PartitionUtil; - -@Internal -public class RowDataFileScanTaskReader implements FileScanTaskReader { - - private final Schema tableSchema; - private final Schema projectedSchema; - private final String nameMapping; - private final boolean caseSensitive; - - public RowDataFileScanTaskReader( - Schema tableSchema, Schema projectedSchema, String nameMapping, boolean caseSensitive) { - this.tableSchema = tableSchema; - this.projectedSchema = projectedSchema; - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - } - - @Override - public CloseableIterator open( - FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { - Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); - - Map idToConstant = - partitionSchema.columns().isEmpty() - ? ImmutableMap.of() - : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); - - FlinkDeleteFilter deletes = - new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); - CloseableIterable iterable = - deletes.filter( - newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); - - // Project the RowData to remove the extra meta columns. 
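// Note: the delete filter can widen the read schema beyond the requested projection (for
// example with the columns it needs to apply positional or equality deletes), so when
// requiredSchema() differs from projectedSchema the extra fields are projected away below.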
- if (!projectedSchema.sameSchema(deletes.requiredSchema())) { - RowDataProjection rowDataProjection = - RowDataProjection.create( - deletes.requiredRowType(), - deletes.requiredSchema().asStruct(), - projectedSchema.asStruct()); - iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); - } - - return iterable.iterator(); - } - - private CloseableIterable newIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - CloseableIterable iter; - if (task.isDataTask()) { - throw new UnsupportedOperationException("Cannot read data task."); - } else { - switch (task.file().format()) { - case PARQUET: - iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case AVRO: - iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case ORC: - iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - default: - throw new UnsupportedOperationException( - "Cannot read unknown format: " + task.file().format()); - } - } - - return iter; - } - - private CloseableIterable newAvroIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = - Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newParquetIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = - Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), task.length()) - .project(schema) - .createReaderFunc( - fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newOrcIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = - ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc( - readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private static class FlinkDeleteFilter extends DeleteFilter { - private final RowType requiredRowType; - private final RowDataWrapper asStructLike; - private final InputFilesDecryptor inputFilesDecryptor; - - FlinkDeleteFilter( - FileScanTask task, - Schema tableSchema, - Schema requestedSchema, - InputFilesDecryptor inputFilesDecryptor) { - super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); - this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); - this.asStructLike = new 
RowDataWrapper(requiredRowType, requiredSchema().asStruct()); - this.inputFilesDecryptor = inputFilesDecryptor; - } - - public RowType requiredRowType() { - return requiredRowType; - } - - @Override - protected StructLike asStructLike(RowData row) { - return asStructLike.wrap(row); - } - - @Override - protected InputFile getInputFile(String location) { - return inputFilesDecryptor.getInputFile(location); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java deleted file mode 100644 index 1468879097de..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -import java.util.Collection; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class RowDataRewriter { - - private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); - - private final Schema schema; - private final String nameMapping; - private final FileIO io; - private final boolean caseSensitive; - private final EncryptionManager encryptionManager; - private final TaskWriterFactory taskWriterFactory; - private final String tableName; - - public RowDataRewriter( - Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { - this.schema = table.schema(); - this.caseSensitive = caseSensitive; - this.io = io; - this.encryptionManager = encryptionManager; - this.nameMapping = - 
PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); - this.tableName = table.name(); - - String formatString = - PropertyUtil.propertyAsString( - table.properties(), - TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); - FileFormat format = FileFormat.fromString(formatString); - RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); - this.taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), flinkSchema, Long.MAX_VALUE, format, null, false); - } - - public List rewriteDataForTasks( - DataStream dataStream, int parallelism) throws Exception { - RewriteMap map = - new RewriteMap( - schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); - DataStream> ds = dataStream.map(map).setParallelism(parallelism); - return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); - } - - public static class RewriteMap extends RichMapFunction> { - - private TaskWriter writer; - private int subTaskId; - private int attemptId; - - private final Schema schema; - private final String nameMapping; - private final FileIO io; - private final boolean caseSensitive; - private final EncryptionManager encryptionManager; - private final TaskWriterFactory taskWriterFactory; - private final RowDataFileScanTaskReader rowDataReader; - - public RewriteMap( - Schema schema, - String nameMapping, - FileIO io, - boolean caseSensitive, - EncryptionManager encryptionManager, - TaskWriterFactory taskWriterFactory) { - this.schema = schema; - this.nameMapping = nameMapping; - this.io = io; - this.caseSensitive = caseSensitive; - this.encryptionManager = encryptionManager; - this.taskWriterFactory = taskWriterFactory; - this.rowDataReader = - new RowDataFileScanTaskReader(schema, schema, nameMapping, caseSensitive); - } - - @Override - public void open(Configuration parameters) { - this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getAttemptNumber(); - // Initialize the task writer factory. - this.taskWriterFactory.initialize(subTaskId, attemptId); - } - - @Override - public List map(CombinedScanTask task) throws Exception { - // Initialize the task writer. 
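// A fresh task writer is created per CombinedScanTask: on success its completed data files
// are returned, and on any failure the writer is aborted so the partially written files are
// never committed.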
- this.writer = taskWriterFactory.create(); - try (DataIterator iterator = - new DataIterator<>(rowDataReader, task, io, encryptionManager)) { - while (iterator.hasNext()) { - RowData rowData = iterator.next(); - writer.write(rowData); - } - return Lists.newArrayList(writer.dataFiles()); - } catch (Throwable originalThrowable) { - try { - LOG.error("Aborting commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); - writer.abort(); - LOG.error("Aborted commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); - } catch (Throwable inner) { - if (originalThrowable != inner) { - originalThrowable.addSuppressed(inner); - LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner); - } - } - - if (originalThrowable instanceof Exception) { - throw originalThrowable; - } else { - throw new RuntimeException(originalThrowable); - } - } - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java deleted file mode 100644 index b78d0e643aa9..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -import java.io.Serializable; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; - -/** Context object with optional arguments for a Flink Scan. 
*/ -class ScanContext implements Serializable { - - private static final long serialVersionUID = 1L; - - private static final ConfigOption SNAPSHOT_ID = - ConfigOptions.key("snapshot-id").longType().defaultValue(null); - - private static final ConfigOption CASE_SENSITIVE = - ConfigOptions.key("case-sensitive").booleanType().defaultValue(false); - - private static final ConfigOption AS_OF_TIMESTAMP = - ConfigOptions.key("as-of-timestamp").longType().defaultValue(null); - - private static final ConfigOption START_SNAPSHOT_ID = - ConfigOptions.key("start-snapshot-id").longType().defaultValue(null); - - private static final ConfigOption END_SNAPSHOT_ID = - ConfigOptions.key("end-snapshot-id").longType().defaultValue(null); - - private static final ConfigOption SPLIT_SIZE = - ConfigOptions.key("split-size").longType().defaultValue(null); - - private static final ConfigOption SPLIT_LOOKBACK = - ConfigOptions.key("split-lookback").intType().defaultValue(null); - - private static final ConfigOption SPLIT_FILE_OPEN_COST = - ConfigOptions.key("split-file-open-cost").longType().defaultValue(null); - - private static final ConfigOption STREAMING = - ConfigOptions.key("streaming").booleanType().defaultValue(false); - - private static final ConfigOption MONITOR_INTERVAL = - ConfigOptions.key("monitor-interval").durationType().defaultValue(Duration.ofSeconds(10)); - - private static final ConfigOption MAX_PLANNING_SNAPSHOT_COUNT = - ConfigOptions.key("max-planning-snapshot-count").intType().defaultValue(Integer.MAX_VALUE); - - private final boolean caseSensitive; - private final boolean exposeLocality; - private final Long snapshotId; - private final Long startSnapshotId; - private final Long endSnapshotId; - private final Long asOfTimestamp; - private final Long splitSize; - private final Integer splitLookback; - private final Long splitOpenFileCost; - private final boolean isStreaming; - private final Duration monitorInterval; - private final int maxPlanningSnapshotCount; - - private final String nameMapping; - private final Schema schema; - private final List filters; - private final long limit; - - private ScanContext( - boolean caseSensitive, - Long snapshotId, - Long startSnapshotId, - Long endSnapshotId, - Long asOfTimestamp, - Long splitSize, - Integer splitLookback, - Long splitOpenFileCost, - boolean isStreaming, - Duration monitorInterval, - String nameMapping, - Schema schema, - List filters, - long limit, - boolean exposeLocality, - int maxPlanningSnapshotCount) { - this.caseSensitive = caseSensitive; - this.snapshotId = snapshotId; - this.startSnapshotId = startSnapshotId; - this.endSnapshotId = endSnapshotId; - this.asOfTimestamp = asOfTimestamp; - this.splitSize = splitSize; - this.splitLookback = splitLookback; - this.splitOpenFileCost = splitOpenFileCost; - this.isStreaming = isStreaming; - this.monitorInterval = monitorInterval; - - this.nameMapping = nameMapping; - this.schema = schema; - this.filters = filters; - this.limit = limit; - this.exposeLocality = exposeLocality; - this.maxPlanningSnapshotCount = maxPlanningSnapshotCount; - } - - boolean caseSensitive() { - return caseSensitive; - } - - Long snapshotId() { - return snapshotId; - } - - Long startSnapshotId() { - return startSnapshotId; - } - - Long endSnapshotId() { - return endSnapshotId; - } - - Long asOfTimestamp() { - return asOfTimestamp; - } - - Long splitSize() { - return splitSize; - } - - Integer splitLookback() { - return splitLookback; - } - - Long splitOpenFileCost() { - return splitOpenFileCost; - } - - boolean 
isStreaming() { - return isStreaming; - } - - Duration monitorInterval() { - return monitorInterval; - } - - String nameMapping() { - return nameMapping; - } - - Schema project() { - return schema; - } - - List filters() { - return filters; - } - - long limit() { - return limit; - } - - boolean exposeLocality() { - return exposeLocality; - } - - public int maxPlanningSnapshotCount() { - return maxPlanningSnapshotCount; - } - - ScanContext copyWithAppendsBetween(long newStartSnapshotId, long newEndSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(null) - .startSnapshotId(newStartSnapshotId) - .endSnapshotId(newEndSnapshotId) - .asOfTimestamp(null) - .splitSize(splitSize) - .splitLookback(splitLookback) - .splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .exposeLocality(exposeLocality) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); - } - - ScanContext copyWithSnapshotId(long newSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(newSnapshotId) - .startSnapshotId(null) - .endSnapshotId(null) - .asOfTimestamp(null) - .splitSize(splitSize) - .splitLookback(splitLookback) - .splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .exposeLocality(exposeLocality) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); - } - - static Builder builder() { - return new Builder(); - } - - static class Builder { - private boolean caseSensitive = CASE_SENSITIVE.defaultValue(); - private Long snapshotId = SNAPSHOT_ID.defaultValue(); - private Long startSnapshotId = START_SNAPSHOT_ID.defaultValue(); - private Long endSnapshotId = END_SNAPSHOT_ID.defaultValue(); - private Long asOfTimestamp = AS_OF_TIMESTAMP.defaultValue(); - private Long splitSize = SPLIT_SIZE.defaultValue(); - private Integer splitLookback = SPLIT_LOOKBACK.defaultValue(); - private Long splitOpenFileCost = SPLIT_FILE_OPEN_COST.defaultValue(); - private boolean isStreaming = STREAMING.defaultValue(); - private Duration monitorInterval = MONITOR_INTERVAL.defaultValue(); - private int maxPlanningSnapshotCount = MAX_PLANNING_SNAPSHOT_COUNT.defaultValue(); - private String nameMapping; - private Schema projectedSchema; - private List filters; - private long limit = -1L; - private boolean exposeLocality; - - private Builder() {} - - Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; - return this; - } - - Builder useSnapshotId(Long newSnapshotId) { - this.snapshotId = newSnapshotId; - return this; - } - - Builder startSnapshotId(Long newStartSnapshotId) { - this.startSnapshotId = newStartSnapshotId; - return this; - } - - Builder endSnapshotId(Long newEndSnapshotId) { - this.endSnapshotId = newEndSnapshotId; - return this; - } - - Builder asOfTimestamp(Long newAsOfTimestamp) { - this.asOfTimestamp = newAsOfTimestamp; - return this; - } - - Builder splitSize(Long newSplitSize) { - this.splitSize = newSplitSize; - return this; - } - - Builder splitLookback(Integer newSplitLookback) { - this.splitLookback = newSplitLookback; - return this; - } - - Builder splitOpenFileCost(Long newSplitOpenFileCost) { - this.splitOpenFileCost = newSplitOpenFileCost; - return this; - } - - Builder streaming(boolean streaming) { - this.isStreaming = 
streaming; - return this; - } - - Builder monitorInterval(Duration newMonitorInterval) { - this.monitorInterval = newMonitorInterval; - return this; - } - - Builder nameMapping(String newNameMapping) { - this.nameMapping = newNameMapping; - return this; - } - - Builder project(Schema newProjectedSchema) { - this.projectedSchema = newProjectedSchema; - return this; - } - - Builder filters(List newFilters) { - this.filters = newFilters; - return this; - } - - Builder limit(long newLimit) { - this.limit = newLimit; - return this; - } - - Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; - return this; - } - - Builder fromProperties(Map properties) { - Configuration config = new Configuration(); - properties.forEach(config::setString); - - return this.useSnapshotId(config.get(SNAPSHOT_ID)) - .caseSensitive(config.get(CASE_SENSITIVE)) - .asOfTimestamp(config.get(AS_OF_TIMESTAMP)) - .startSnapshotId(config.get(START_SNAPSHOT_ID)) - .endSnapshotId(config.get(END_SNAPSHOT_ID)) - .splitSize(config.get(SPLIT_SIZE)) - .splitLookback(config.get(SPLIT_LOOKBACK)) - .splitOpenFileCost(config.get(SPLIT_FILE_OPEN_COST)) - .streaming(config.get(STREAMING)) - .monitorInterval(config.get(MONITOR_INTERVAL)) - .nameMapping(properties.get(DEFAULT_NAME_MAPPING)) - .maxPlanningSnapshotCount(config.get(MAX_PLANNING_SNAPSHOT_COUNT)); - } - - public ScanContext build() { - return new ScanContext( - caseSensitive, - snapshotId, - startSnapshotId, - endSnapshotId, - asOfTimestamp, - splitSize, - splitLookback, - splitOpenFileCost, - isStreaming, - monitorInterval, - nameMapping, - projectedSchema, - filters, - limit, - exposeLocality, - maxPlanningSnapshotCount); - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java deleted file mode 100644 index 59eb1366d136..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.runtime.state.FunctionInitializationContext; -import org.apache.flink.runtime.state.FunctionSnapshotContext; -import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; -import org.apache.flink.streaming.api.functions.source.RichSourceFunction; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.SnapshotUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is - * responsible for: - * - *

- *   1. Monitoring snapshots of the Iceberg table.
- *   2. Creating the {@link FlinkInputSplit splits} corresponding to the incremental files
- *   3. Assigning them to downstream tasks for further processing.
- *
- *
The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which - * can have parallelism greater than one. - */ -public class StreamingMonitorFunction extends RichSourceFunction - implements CheckpointedFunction { - - private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); - - private static final long INIT_LAST_SNAPSHOT_ID = -1L; - - private final TableLoader tableLoader; - private final ScanContext scanContext; - - private volatile boolean isRunning = true; - - // The checkpoint thread is not the same thread that running the function for SourceStreamTask - // now. It's necessary to - // mark this as volatile. - private volatile long lastSnapshotId = INIT_LAST_SNAPSHOT_ID; - - private transient SourceContext sourceContext; - private transient Table table; - private transient ListState lastSnapshotIdState; - - public StreamingMonitorFunction(TableLoader tableLoader, ScanContext scanContext) { - Preconditions.checkArgument( - scanContext.snapshotId() == null, "Cannot set snapshot-id option for streaming reader"); - Preconditions.checkArgument( - scanContext.asOfTimestamp() == null, - "Cannot set as-of-timestamp option for streaming reader"); - Preconditions.checkArgument( - scanContext.endSnapshotId() == null, - "Cannot set end-snapshot-id option for streaming reader"); - Preconditions.checkArgument( - scanContext.maxPlanningSnapshotCount() > 0, - "The max-planning-snapshot-count must be greater than zero"); - this.tableLoader = tableLoader; - this.scanContext = scanContext; - } - - @Override - public void initializeState(FunctionInitializationContext context) throws Exception { - // Load iceberg table from table loader. - tableLoader.open(); - table = tableLoader.loadTable(); - - // Initialize the flink state for last snapshot id. - lastSnapshotIdState = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); - - // Restore the last-snapshot-id from flink's state if possible. - if (context.isRestored()) { - LOG.info("Restoring state for the {}.", getClass().getSimpleName()); - lastSnapshotId = lastSnapshotIdState.get().iterator().next(); - } else if (scanContext.startSnapshotId() != null) { - Preconditions.checkNotNull( - table.currentSnapshot(), "Don't have any available snapshot in table."); - - long currentSnapshotId = table.currentSnapshot().snapshotId(); - Preconditions.checkState( - SnapshotUtil.isAncestorOf(table, currentSnapshotId, scanContext.startSnapshotId()), - "The option start-snapshot-id %s is not an ancestor of the current snapshot.", - scanContext.startSnapshotId()); - - lastSnapshotId = scanContext.startSnapshotId(); - } - } - - @Override - public void snapshotState(FunctionSnapshotContext context) throws Exception { - lastSnapshotIdState.clear(); - lastSnapshotIdState.add(lastSnapshotId); - } - - @Override - public void run(SourceContext ctx) throws Exception { - this.sourceContext = ctx; - while (isRunning) { - monitorAndForwardSplits(); - Thread.sleep(scanContext.monitorInterval().toMillis()); - } - } - - private long toSnapshotIdInclusive( - long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { - List snapshotIds = - SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); - if (snapshotIds.size() <= maxPlanningSnapshotCount) { - return currentSnapshotId; - } else { - // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed - // time descending. 
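// Worked example (hypothetical snapshot IDs): if snapshotIdsBetween(table, 1, 5) returns
// [5, 4, 3, 2] (newest first) and maxPlanningSnapshotCount is 2, this branch returns
// snapshotIds.get(4 - 2) = 3, so the current cycle only plans appends in (1, 3] and later
// monitor cycles advance towards 5.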
- return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); - } - } - - @VisibleForTesting - void sourceContext(SourceContext ctx) { - this.sourceContext = ctx; - } - - @VisibleForTesting - void monitorAndForwardSplits() { - // Refresh the table to get the latest committed snapshot. - table.refresh(); - - Snapshot snapshot = table.currentSnapshot(); - if (snapshot != null && snapshot.snapshotId() != lastSnapshotId) { - long snapshotId = snapshot.snapshotId(); - - ScanContext newScanContext; - if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { - newScanContext = scanContext.copyWithSnapshotId(snapshotId); - } else { - snapshotId = - toSnapshotIdInclusive( - lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); - newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); - } - - LOG.debug( - "Start discovering splits from {} (exclusive) to {} (inclusive)", - lastSnapshotId, - snapshotId); - long start = System.currentTimeMillis(); - FlinkInputSplit[] splits = FlinkSplitGenerator.createInputSplits(table, newScanContext); - LOG.debug( - "Discovered {} splits, time elapsed {}ms", - splits.length, - System.currentTimeMillis() - start); - - // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId - start = System.currentTimeMillis(); - synchronized (sourceContext.getCheckpointLock()) { - for (FlinkInputSplit split : splits) { - sourceContext.collect(split); - } - - lastSnapshotId = snapshotId; - } - LOG.debug( - "Forwarded {} splits, time elapsed {}ms", - splits.length, - System.currentTimeMillis() - start); - } - } - - @Override - public void cancel() { - // this is to cover the case where cancel() is called before the run() - if (sourceContext != null) { - synchronized (sourceContext.getCheckpointLock()) { - isRunning = false; - } - } else { - isRunning = false; - } - - // Release all the resources here. - if (tableLoader != null) { - try { - tableLoader.close(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - - @Override - public void close() { - cancel(); - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java deleted file mode 100644 index 6dcf6c3e3f98..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.Queue; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.runtime.state.JavaSerializer; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.MailboxExecutor; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.streaming.api.operators.StreamSourceContexts; -import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link - * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a - * parallelism of 1, this operator can have multiple parallelism. - * - *

As soon as a split descriptor is received, it is put in a queue, and use {@link - * MailboxExecutor} read the actual data of the split. This architecture allows the separation of - * the reading thread from the one split processing the checkpoint barriers, thus removing any - * potential back-pressure. - */ -public class StreamingReaderOperator extends AbstractStreamOperator - implements OneInputStreamOperator { - - private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); - - // It's the same thread that is running this operator and checkpoint actions. we use this executor - // to schedule only - // one split for future reading, so that a new checkpoint could be triggered without blocking long - // time for exhausting - // all scheduled splits. - private final MailboxExecutor executor; - private FlinkInputFormat format; - - private transient SourceFunction.SourceContext sourceContext; - - private transient ListState inputSplitsState; - private transient Queue splits; - - // Splits are read by the same thread that calls processElement. Each read task is submitted to - // that thread by adding - // them to the executor. This state is used to ensure that only one read task is in that queue at - // a time, so that read - // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this - // is set to RUNNING. - // When there are no more files to read, this will be set to IDLE. - private transient SplitState currentSplitState; - - private StreamingReaderOperator( - FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { - this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); - this.processingTimeService = timeService; - this.executor = - Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - - // TODO Replace Java serialization with Avro approach to keep state compatibility. - // See issue: https://github.com/apache/iceberg/issues/1698 - inputSplitsState = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); - - // Initialize the current split state to IDLE. - currentSplitState = SplitState.IDLE; - - // Recover splits state from flink state backend if possible. - splits = Lists.newLinkedList(); - if (context.isRestored()) { - int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask(); - LOG.info("Restoring state for the {} (taskIdx: {}).", getClass().getSimpleName(), subtaskIdx); - - for (FlinkInputSplit split : inputSplitsState.get()) { - splits.add(split); - } - } - - this.sourceContext = - StreamSourceContexts.getSourceContext( - getOperatorConfig().getTimeCharacteristic(), - getProcessingTimeService(), - new Object(), // no actual locking needed - getContainingTask().getStreamStatusMaintainer(), - output, - getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), - -1); - - // Enqueue to process the recovered input splits. 
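// Recovered splits go through the same scheduling path as newly received ones: at most one
// read task is queued on the mailbox executor at a time, so checkpoint mails can still run
// in between splits instead of waiting for the whole backlog.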
- enqueueProcessSplits(); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - - inputSplitsState.clear(); - inputSplitsState.addAll(Lists.newArrayList(splits)); - } - - @Override - public void processElement(StreamRecord element) { - splits.add(element.getValue()); - enqueueProcessSplits(); - } - - private void enqueueProcessSplits() { - if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) { - currentSplitState = SplitState.RUNNING; - executor.execute(this::processSplits, this.getClass().getSimpleName()); - } - } - - private void processSplits() throws IOException { - FlinkInputSplit split = splits.poll(); - if (split == null) { - currentSplitState = SplitState.IDLE; - return; - } - - format.open(split); - try { - RowData nextElement = null; - while (!format.reachedEnd()) { - nextElement = format.nextRecord(nextElement); - sourceContext.collect(nextElement); - } - } finally { - currentSplitState = SplitState.IDLE; - format.close(); - } - - // Re-schedule to process the next split. - enqueueProcessSplits(); - } - - @Override - public void processWatermark(Watermark mark) { - // we do nothing because we emit our own watermarks if needed. - } - - @Override - public void dispose() throws Exception { - super.dispose(); - - if (format != null) { - format.close(); - format.closeInputFormat(); - format = null; - } - - sourceContext = null; - } - - @Override - public void close() throws Exception { - super.close(); - output.close(); - if (sourceContext != null) { - sourceContext.emitWatermark(Watermark.MAX_WATERMARK); - sourceContext.close(); - sourceContext = null; - } - } - - static OneInputStreamOperatorFactory factory(FlinkInputFormat format) { - return new OperatorFactory(format); - } - - private enum SplitState { - IDLE, - RUNNING - } - - private static class OperatorFactory extends AbstractStreamOperatorFactory - implements YieldingOperatorFactory, - OneInputStreamOperatorFactory { - - private final FlinkInputFormat format; - - private transient MailboxExecutor mailboxExecutor; - - private OperatorFactory(FlinkInputFormat format) { - this.format = format; - } - - @Override - public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { - this.mailboxExecutor = mailboxExecutor; - } - - @SuppressWarnings("unchecked") - @Override - public > O createStreamOperator( - StreamOperatorParameters parameters) { - StreamingReaderOperator operator = - new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); - operator.setup( - parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); - return (O) operator; - } - - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return StreamingReaderOperator.class; - } - } -} diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java deleted file mode 100644 index 2c5c587f4ebf..000000000000 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.util; - -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.table.types.logical.RowType; - -/** - * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as - * Flink can change those APIs during minor version release. - */ -public class FlinkCompatibilityUtil { - - private FlinkCompatibilityUtil() {} - - public static TypeInformation toTypeInfo(RowType rowType) { - return InternalTypeInfo.of(rowType); - } - - public static boolean isPhysicalColumn(TableColumn column) { - return column.isPhysical(); - } -} diff --git a/flink/v1.13/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.13/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory deleted file mode 100644 index 29a9955a7e20..000000000000 --- a/flink/v1.13/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.iceberg.flink.FlinkDynamicTableFactory diff --git a/flink/v1.13/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory b/flink/v1.13/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory deleted file mode 100644 index 2b6bfa3cd579..000000000000 --- a/flink/v1.13/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.iceberg.flink.FlinkCatalogFactory diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java deleted file mode 100644 index d4da736dcd83..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.util.ArrayUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.SupportsNamespaces; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public abstract class FlinkCatalogTestBase extends FlinkTestBase { - - protected static final String DATABASE = "db"; - private static TemporaryFolder hiveWarehouse = new TemporaryFolder(); - private static TemporaryFolder hadoopWarehouse = new TemporaryFolder(); - - @BeforeClass - public static void createWarehouse() throws IOException { - hiveWarehouse.create(); - hadoopWarehouse.create(); - } - - @AfterClass - public static void dropWarehouse() { - hiveWarehouse.delete(); - hadoopWarehouse.delete(); - } - - @Before - public void before() { - sql("CREATE CATALOG %s WITH %s", catalogName, toWithClause(config)); - } - - @After - public void clean() { - sql("DROP CATALOG IF EXISTS %s", catalogName); - } - - @Parameterized.Parameters(name = "catalogName = {0} baseNamespace = {1}") - public static Iterable parameters() { - return Lists.newArrayList( - new Object[] {"testhive", Namespace.empty()}, - new Object[] {"testhadoop", Namespace.empty()}, - new Object[] {"testhadoop_basenamespace", 
Namespace.of("l0", "l1")}); - } - - protected final String catalogName; - protected final Namespace baseNamespace; - protected final Catalog validationCatalog; - protected final SupportsNamespaces validationNamespaceCatalog; - protected final Map config = Maps.newHashMap(); - - protected final String flinkDatabase; - protected final Namespace icebergNamespace; - protected final boolean isHadoopCatalog; - - public FlinkCatalogTestBase(String catalogName, Namespace baseNamespace) { - this.catalogName = catalogName; - this.baseNamespace = baseNamespace; - this.isHadoopCatalog = catalogName.startsWith("testhadoop"); - this.validationCatalog = - isHadoopCatalog - ? new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getRoot()) - : catalog; - this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; - - config.put("type", "iceberg"); - if (!baseNamespace.isEmpty()) { - config.put(FlinkCatalogFactory.BASE_NAMESPACE, baseNamespace.toString()); - } - if (isHadoopCatalog) { - config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hadoop"); - } else { - config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); - config.put(CatalogProperties.URI, getURI(hiveConf)); - } - config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); - - this.flinkDatabase = catalogName + "." + DATABASE; - this.icebergNamespace = - Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); - } - - protected String warehouseRoot() { - if (isHadoopCatalog) { - return hadoopWarehouse.getRoot().getAbsolutePath(); - } else { - return hiveWarehouse.getRoot().getAbsolutePath(); - } - } - - protected String getFullQualifiedTableName(String tableName) { - final List levels = Lists.newArrayList(icebergNamespace.levels()); - levels.add(tableName); - return Joiner.on('.').join(levels); - } - - static String getURI(HiveConf conf) { - return conf.get(HiveConf.ConfVars.METASTOREURIS.varname); - } - - static String toWithClause(Map props) { - StringBuilder builder = new StringBuilder(); - builder.append("("); - int propCount = 0; - for (Map.Entry entry : props.entrySet()) { - if (propCount > 0) { - builder.append(","); - } - builder - .append("'") - .append(entry.getKey()) - .append("'") - .append("=") - .append("'") - .append(entry.getValue()) - .append("'"); - propCount++; - } - builder.append(")"); - return builder.toString(); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java deleted file mode 100644 index 3b9f6268eb22..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.test.util.TestBaseUtils; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.rules.TemporaryFolder; - -public abstract class FlinkTestBase extends TestBaseUtils { - - @ClassRule - public static MiniClusterWithClientResource miniClusterResource = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - - private static TestHiveMetastore metastore = null; - protected static HiveConf hiveConf = null; - protected static HiveCatalog catalog = null; - - private volatile TableEnvironment tEnv = null; - - @BeforeClass - public static void startMetastore() { - FlinkTestBase.metastore = new TestHiveMetastore(); - metastore.start(); - FlinkTestBase.hiveConf = metastore.hiveConf(); - FlinkTestBase.catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - } - - @AfterClass - public static void stopMetastore() throws Exception { - metastore.stop(); - FlinkTestBase.catalog = null; - } - - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings settings = - EnvironmentSettings.newInstance().useBlinkPlanner().inBatchMode().build(); - - TableEnvironment env = TableEnvironment.create(settings); - env.getConfig() - .getConfiguration() - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - tEnv = env; - } - } - } - return tEnv; - } - - protected static TableResult exec(TableEnvironment env, String query, Object... args) { - return env.executeSql(String.format(query, args)); - } - - protected TableResult exec(String query, Object... args) { - return exec(getTableEnv(), query, args); - } - - protected List sql(String query, Object... 
args) { - TableResult tableResult = exec(query, args); - try (CloseableIterator iter = tableResult.collect()) { - return Lists.newArrayList(iter); - } catch (Exception e) { - throw new RuntimeException("Failed to collect table result", e); - } - } - - protected void assertSameElements(Iterable expected, Iterable actual) { - Assertions.assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); - } - - protected void assertSameElements(String message, Iterable expected, Iterable actual) { - Assertions.assertThat(actual) - .isNotNull() - .as(message) - .containsExactlyInAnyOrderElementsOf(expected); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java deleted file mode 100644 index 45af9241b743..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.util.MiniClusterWithClientResource; - -public class MiniClusterResource { - - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() {} - - /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't - * break the unit tests because of the class loader leak issue. In our iceberg integration tests, - * there're some that will assert the results after finished the flink jobs, so actually we may - * access the class loader that has been closed by the flink task managers if we enable the switch - * classloader.check-leaked-classloader by default. 
- */ - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java deleted file mode 100644 index c73fa1e4bc97..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.time.temporal.ChronoUnit; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -public class RowDataConverter { - private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); - private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - - private RowDataConverter() {} - - public static RowData convert(Schema iSchema, Record record) { - return convert(iSchema.asStruct(), record); - } - - private static RowData convert(Types.StructType struct, Record record) { - GenericRowData rowData = new GenericRowData(struct.fields().size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - - Type fieldType = field.type(); - - switch (fieldType.typeId()) { - case STRUCT: - rowData.setField(i, convert(fieldType.asStructType(), record.get(i))); - break; - case LIST: - rowData.setField(i, convert(fieldType.asListType(), record.get(i))); - break; - case MAP: - rowData.setField(i, convert(fieldType.asMapType(), record.get(i))); - break; - default: - rowData.setField(i, 
convert(fieldType, record.get(i))); - } - } - return rowData; - } - - private static Object convert(Type type, Object object) { - if (object == null) { - return null; - } - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - case FIXED: - return object; - case DATE: - return (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) object); - case TIME: - // Iceberg's time is in microseconds, while flink's time is in milliseconds. - LocalTime localTime = (LocalTime) object; - return (int) TimeUnit.NANOSECONDS.toMillis(localTime.toNanoOfDay()); - case TIMESTAMP: - if (((Types.TimestampType) type).shouldAdjustToUTC()) { - return TimestampData.fromInstant(((OffsetDateTime) object).toInstant()); - } else { - return TimestampData.fromLocalDateTime((LocalDateTime) object); - } - case STRING: - return StringData.fromString((String) object); - case UUID: - UUID uuid = (UUID) object; - ByteBuffer bb = ByteBuffer.allocate(16); - bb.putLong(uuid.getMostSignificantBits()); - bb.putLong(uuid.getLeastSignificantBits()); - return bb.array(); - case BINARY: - ByteBuffer buffer = (ByteBuffer) object; - return Arrays.copyOfRange( - buffer.array(), - buffer.arrayOffset() + buffer.position(), - buffer.arrayOffset() + buffer.remaining()); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) type; - return DecimalData.fromBigDecimal( - (BigDecimal) object, decimalType.precision(), decimalType.scale()); - case STRUCT: - return convert(type.asStructType(), (Record) object); - case LIST: - List list = (List) object; - Object[] convertedArray = new Object[list.size()]; - for (int i = 0; i < convertedArray.length; i++) { - convertedArray[i] = convert(type.asListType().elementType(), list.get(i)); - } - return new GenericArrayData(convertedArray); - case MAP: - Map convertedMap = Maps.newLinkedHashMap(); - Map map = (Map) object; - for (Map.Entry entry : map.entrySet()) { - convertedMap.put( - convert(type.asMapType().keyType(), entry.getKey()), - convert(type.asMapType().valueType(), entry.getValue())); - } - return new GenericMapData(convertedMap); - default: - throw new UnsupportedOperationException("Not a supported type: " + type); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java deleted file mode 100644 index 54c226bffc6e..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.hadoop.HadoopInputFile; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.iceberg.util.StructLikeWrapper; -import org.junit.Assert; - -public class SimpleDataUtil { - - private SimpleDataUtil() {} - - public static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - public static final TableSchema FLINK_SCHEMA = - TableSchema.builder().field("id", DataTypes.INT()).field("data", DataTypes.STRING()).build(); - - public static final RowType ROW_TYPE = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); - - public static final Record RECORD = GenericRecord.create(SCHEMA); - - public static Table createTable( - String path, Map properties, boolean partitioned) { - PartitionSpec spec; - if (partitioned) { - spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - } else { - spec = PartitionSpec.unpartitioned(); - } - return new HadoopTables().create(SCHEMA, spec, properties, path); - } - - public static Record createRecord(Integer id, String data) { - Record record = RECORD.copy(); - record.setField("id", id); - record.setField("data", data); - return record; - } - - public static RowData createRowData(Integer id, String data) { - return GenericRowData.of(id, 
StringData.fromString(data)); - } - - public static RowData createInsert(Integer id, String data) { - return GenericRowData.ofKind(RowKind.INSERT, id, StringData.fromString(data)); - } - - public static RowData createDelete(Integer id, String data) { - return GenericRowData.ofKind(RowKind.DELETE, id, StringData.fromString(data)); - } - - public static RowData createUpdateBefore(Integer id, String data) { - return GenericRowData.ofKind(RowKind.UPDATE_BEFORE, id, StringData.fromString(data)); - } - - public static RowData createUpdateAfter(Integer id, String data) { - return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); - } - - public static DataFile writeFile( - Schema schema, - PartitionSpec spec, - Configuration conf, - String location, - String filename, - List rows) - throws IOException { - Path path = new Path(location, filename); - FileFormat fileFormat = FileFormat.fromFileName(filename); - Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename); - - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - FileAppenderFactory appenderFactory = - new FlinkAppenderFactory(schema, flinkSchema, ImmutableMap.of(), spec); - - FileAppender appender = appenderFactory.newAppender(fromPath(path, conf), fileFormat); - try (FileAppender closeableAppender = appender) { - closeableAppender.addAll(rows); - } - - return DataFiles.builder(spec) - .withInputFile(HadoopInputFile.fromPath(path, conf)) - .withMetrics(appender.metrics()) - .build(); - } - - public static DeleteFile writeEqDeleteFile( - Table table, - FileFormat format, - String tablePath, - String filename, - FileAppenderFactory appenderFactory, - List deletes) - throws IOException { - EncryptedOutputFile outputFile = - table.encryption().encrypt(fromPath(new Path(tablePath, filename), new Configuration())); - - EqualityDeleteWriter eqWriter = - appenderFactory.newEqDeleteWriter(outputFile, format, null); - try (EqualityDeleteWriter writer = eqWriter) { - writer.write(deletes); - } - return eqWriter.toDeleteFile(); - } - - public static DeleteFile writePosDeleteFile( - Table table, - FileFormat format, - String tablePath, - String filename, - FileAppenderFactory appenderFactory, - List> positions) - throws IOException { - EncryptedOutputFile outputFile = - table.encryption().encrypt(fromPath(new Path(tablePath, filename), new Configuration())); - - PositionDeleteWriter posWriter = - appenderFactory.newPosDeleteWriter(outputFile, format, null); - PositionDelete posDelete = PositionDelete.create(); - try (PositionDeleteWriter writer = posWriter) { - for (Pair p : positions) { - writer.write(posDelete.set(p.first(), p.second(), null)); - } - } - return posWriter.toDeleteFile(); - } - - private static List convertToRecords(List rows) { - List records = Lists.newArrayList(); - for (RowData row : rows) { - Integer id = row.isNullAt(0) ? null : row.getInt(0); - String data = row.isNullAt(1) ? 
null : row.getString(1).toString(); - records.add(createRecord(id, data)); - } - return records; - } - - public static void assertTableRows(String tablePath, List expected) throws IOException { - assertTableRecords(tablePath, convertToRecords(expected)); - } - - public static void assertTableRows(Table table, List expected) throws IOException { - assertTableRecords(table, convertToRecords(expected)); - } - - public static void assertTableRecords(Table table, List expected) throws IOException { - table.refresh(); - - Types.StructType type = table.schema().asStruct(); - StructLikeSet expectedSet = StructLikeSet.create(type); - expectedSet.addAll(expected); - - try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { - StructLikeSet actualSet = StructLikeSet.create(type); - - for (Record record : iterable) { - actualSet.add(record); - } - - Assert.assertEquals("Should produce the expected record", expectedSet, actualSet); - } - } - - public static void assertTableRecords(String tablePath, List expected) - throws IOException { - Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); - assertTableRecords(new HadoopTables().load(tablePath), expected); - } - - public static StructLikeSet expectedRowSet(Table table, Record... records) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - Collections.addAll(set, records); - return set; - } - - public static StructLikeSet actualRowSet(Table table, String... columns) throws IOException { - return actualRowSet(table, null, columns); - } - - public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) - throws IOException { - table.refresh(); - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = - IcebergGenerics.read(table) - .useSnapshot(snapshotId == null ? table.currentSnapshot().snapshotId() : snapshotId) - .select(columns) - .build()) { - reader.forEach(set::add); - } - return set; - } - - public static List partitionDataFiles(Table table, Map partitionValues) - throws IOException { - table.refresh(); - Types.StructType partitionType = table.spec().partitionType(); - - Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expectedWrapper = - StructLikeWrapper.forType(partitionType).set(partitionRecord); - - List dataFiles = Lists.newArrayList(); - try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { - for (FileScanTask scanTask : fileScanTasks) { - StructLikeWrapper wrapper = - StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); - - if (expectedWrapper.equals(wrapper)) { - dataFiles.add(scanTask.file()); - } - } - } - - return dataFiles; - } - - public static Map> snapshotToDataFiles(Table table) throws IOException { - table.refresh(); - - Map> result = Maps.newHashMap(); - Snapshot current = table.currentSnapshot(); - while (current != null) { - TableScan tableScan = table.newScan(); - if (current.parentId() != null) { - // Collect the data files that was added only in current snapshot. - tableScan = tableScan.appendsBetween(current.parentId(), current.snapshotId()); - } else { - // Collect the data files that was added in the oldest snapshot. 
- tableScan = tableScan.useSnapshot(current.snapshotId()); - } - try (CloseableIterable scanTasks = tableScan.planFiles()) { - result.put( - current.snapshotId(), - ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); - } - - // Continue to traverse the parent snapshot if exists. - if (current.parentId() == null) { - break; - } - // Iterate to the parent snapshot. - current = table.snapshot(current.parentId()); - } - return result; - } - - public static List matchingPartitions( - List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { - Types.StructType partitionType = partitionSpec.partitionType(); - Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); - return dataFiles.stream() - .filter( - df -> { - StructLikeWrapper wrapper = - StructLikeWrapper.forType(partitionType).set(df.partition()); - return wrapper.equals(expected); - }) - .collect(Collectors.toList()); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java deleted file mode 100644 index 3b0fe69c5655..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.util.Map; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Test; - -/** Test for {@link CatalogLoader} and {@link TableLoader}. 
*/ -public class TestCatalogTableLoader extends FlinkTestBase { - - private static File warehouse = null; - private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = - new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); - - @BeforeClass - public static void createWarehouse() throws IOException { - warehouse = File.createTempFile("warehouse", null); - Assert.assertTrue(warehouse.delete()); - hiveConf.set("my_key", "my_value"); - } - - @AfterClass - public static void dropWarehouse() { - if (warehouse != null && warehouse.exists()) { - warehouse.delete(); - } - } - - @Test - public void testHadoopCatalogLoader() throws IOException, ClassNotFoundException { - Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, "file:" + warehouse); - CatalogLoader loader = CatalogLoader.hadoop("my_catalog", hiveConf, properties); - validateCatalogLoader(loader); - } - - @Test - public void testHiveCatalogLoader() throws IOException, ClassNotFoundException { - CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - validateCatalogLoader(loader); - } - - @Test - public void testHadoopTableLoader() throws IOException, ClassNotFoundException { - String location = "file:" + warehouse + "/my_table"; - new HadoopTables(hiveConf).create(SCHEMA, location); - validateTableLoader(TableLoader.fromHadoopTable(location, hiveConf)); - } - - @Test - public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundException { - CatalogLoader catalogLoader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); - } - - private static void validateCatalogLoader(CatalogLoader loader) - throws IOException, ClassNotFoundException { - Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); - validateHadoopConf(table); - } - - private static void validateTableLoader(TableLoader loader) - throws IOException, ClassNotFoundException { - TableLoader copied = javaSerAndDeSer(loader); - copied.open(); - try { - validateHadoopConf(copied.loadTable()); - } finally { - copied.close(); - } - } - - private static void validateHadoopConf(Table table) { - FileIO io = table.io(); - Assertions.assertThat(io) - .as("FileIO should be a HadoopFileIO") - .isInstanceOf(HadoopFileIO.class); - HadoopFileIO hadoopIO = (HadoopFileIO) io; - Assert.assertEquals("my_value", hadoopIO.conf().get("my_key")); - } - - @SuppressWarnings("unchecked") - private static T javaSerAndDeSer(T object) throws IOException, ClassNotFoundException { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(object); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - return (T) in.readObject(); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java deleted file mode 100644 index 9987a16c7682..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.flink.source.ChangeLogTableTestBase; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -/** - * In this test case, we mainly cover the impact of primary key selection, multiple operations - * within a single transaction, and multiple operations between different txn on the correctness of - * the data. 
- */ -@RunWith(Parameterized.class) -public class TestChangeLogTable extends ChangeLogTableTestBase { - private static final Configuration CONF = new Configuration(); - private static final String SOURCE_TABLE = "default_catalog.default_database.source_change_logs"; - - private static final String CATALOG_NAME = "test_catalog"; - private static final String DATABASE_NAME = "test_db"; - private static final String TABLE_NAME = "test_table"; - private static String warehouse; - - private final boolean partitioned; - - @Parameterized.Parameters(name = "PartitionedTable={0}") - public static Iterable parameters() { - return ImmutableList.of(new Object[] {true}, new Object[] {false}); - } - - public TestChangeLogTable(boolean partitioned) { - this.partitioned = partitioned; - } - - @BeforeClass - public static void createWarehouse() throws IOException { - File warehouseFile = TEMPORARY_FOLDER.newFolder(); - Assert.assertTrue("The warehouse should be deleted", warehouseFile.delete()); - warehouse = String.format("file:%s", warehouseFile); - } - - @Before - public void before() { - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - } - - @After - @Override - public void clean() { - sql("DROP TABLE IF EXISTS %s", TABLE_NAME); - sql("DROP DATABASE IF EXISTS %s", DATABASE_NAME); - sql("DROP CATALOG IF EXISTS %s", CATALOG_NAME); - BoundedTableFactory.clearDataSets(); - } - - @Test - public void testSqlChangeLogOnIdKey() throws Exception { - List> inputRowsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb")), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd")), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd"))); - - List> expectedRecordsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), - ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); - - testSqlChangeLog( - TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); - } - - @Test - public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa")), - ImmutableList.of( - updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), - ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "ccc"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(1, "ccc"), - insertRow(2, "aaa"), - insertRow(2, "ccc"))); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); - } - - @Test - public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, 
"aaa")), - ImmutableList.of( - updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), - ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(1, "ccc"), - insertRow(2, "aaa"), - insertRow(2, "bbb"))); - - testSqlChangeLog( - TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); - } - - @Test - public void testPureInsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), - ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd"), - insertRow(5, "eee"), - insertRow(6, "fff"))); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); - } - - private static Record record(int id, String data) { - return SimpleDataUtil.createRecord(id, data); - } - - private Table createTable(String tableName, List key, boolean isPartitioned) { - String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; - sql( - "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", - tableName, Joiner.on(',').join(key), partitionByCause); - - // Upgrade the iceberg table to format v2. 
- CatalogLoader loader = - CatalogLoader.hadoop( - "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); - Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(2)); - - return table; - } - - private void testSqlChangeLog( - String tableName, - List key, - List> inputRowsPerCheckpoint, - List> expectedRecordsPerCheckpoint) - throws Exception { - String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - Assert.assertEquals( - "Should have the expected rows", - listJoin(inputRowsPerCheckpoint), - sql("SELECT * FROM %s", SOURCE_TABLE)); - - Table table = createTable(tableName, key, partitioned); - sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - table.refresh(); - List snapshots = findValidSnapshots(table); - int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - Assert.assertEquals( - "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); - - for (int i = 0; i < expectedSnapshotNum; i++) { - long snapshotId = snapshots.get(i).snapshotId(); - List expectedRows = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals( - "Should have the expected records for the checkpoint#" + i, - expectedRowSet(table, expectedRows), - actualRowSet(table, snapshotId)); - } - - if (expectedSnapshotNum > 0) { - Assert.assertEquals( - "Should have the expected rows in the final table", - Sets.newHashSet(expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)), - Sets.newHashSet(sql("SELECT * FROM %s", tableName))); - } - } - - private List findValidSnapshots(Table table) { - List validSnapshots = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream() - .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { - validSnapshots.add(snapshot); - } - } - return validSnapshots; - } - - private static StructLikeSet expectedRowSet(Table table, List rows) { - Record[] records = new Record[rows.size()]; - for (int i = 0; i < records.length; i++) { - records[i] = record((int) rows.get(i).getField(0), (String) rows.get(i).getField(1)); - } - return SimpleDataUtil.expectedRowSet(table, records); - } - - private static StructLikeSet actualRowSet(Table table, long snapshotId) throws IOException { - return SimpleDataUtil.actualRowSet(table, snapshotId, "*"); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java deleted file mode 100644 index e9372adda4c1..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Map; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileMetadata; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Test; - -public class TestDataFileSerialization { - - private static final Schema DATE_SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec PARTITION_SPEC = - PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); - - private static final Map COLUMN_SIZES = Maps.newHashMap(); - private static final Map VALUE_COUNTS = Maps.newHashMap(); - private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); - private static final Map NAN_VALUE_COUNTS = Maps.newHashMap(); - private static final Map LOWER_BOUNDS = Maps.newHashMap(); - private static final Map UPPER_BOUNDS = Maps.newHashMap(); - - static { - COLUMN_SIZES.put(1, 2L); - COLUMN_SIZES.put(2, 3L); - VALUE_COUNTS.put(1, 5L); - VALUE_COUNTS.put(2, 3L); - VALUE_COUNTS.put(4, 2L); - NULL_VALUE_COUNTS.put(1, 0L); - NULL_VALUE_COUNTS.put(2, 2L); - NAN_VALUE_COUNTS.put(4, 1L); - LOWER_BOUNDS.put(1, longToBuffer(0L)); - UPPER_BOUNDS.put(1, longToBuffer(4L)); - } - - private static final Metrics METRICS = - new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); - - private static final DataFile DATA_FILE = - DataFiles.builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - private static final DeleteFile POS_DELETE_FILE = - FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/pos-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - 
.withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .build(); - - private static final DeleteFile EQ_DELETE_FILE = - FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes(2, 3) - .withPath("/path/to/equality-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Test - public void testJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(DATA_FILE); - out.writeObject(DATA_FILE.copy()); - - out.writeObject(POS_DELETE_FILE); - out.writeObject(POS_DELETE_FILE.copy()); - - out.writeObject(EQ_DELETE_FILE); - out.writeObject(EQ_DELETE_FILE.copy()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); - TestHelpers.assertEquals(DATA_FILE, (DataFile) obj); - } - - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - Assertions.assertThat(obj) - .as("Should be a position DeleteFile") - .isInstanceOf(DeleteFile.class); - TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); - } - - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - Assertions.assertThat(obj) - .as("Should be a equality DeleteFile") - .isInstanceOf(DeleteFile.class); - TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); - } - } - } - - @Test - public void testDataFileKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(DataFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - kryo.serialize(DATA_FILE, outputView); - kryo.serialize(DATA_FILE.copy(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - DataFile dataFile1 = kryo.deserialize(inputView); - DataFile dataFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(DATA_FILE, dataFile1); - TestHelpers.assertEquals(DATA_FILE, dataFile2); - } - - @Test - public void testDeleteFileKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(DeleteFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - kryo.serialize(POS_DELETE_FILE, outputView); - kryo.serialize(POS_DELETE_FILE.copy(), outputView); - - kryo.serialize(EQ_DELETE_FILE, outputView); - kryo.serialize(EQ_DELETE_FILE.copy(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - - DeleteFile posDeleteFile1 = kryo.deserialize(inputView); - DeleteFile posDeleteFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile1); - TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile2); - - DeleteFile eqDeleteFile1 = kryo.deserialize(inputView); - DeleteFile eqDeleteFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile1); - TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile2); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} 
diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java deleted file mode 100644 index c4cb78f034ad..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.types.Types; - -public class TestFixtures { - - private TestFixtures() {} - - public static final Schema SCHEMA = - new Schema( - required(1, "data", Types.StringType.get()), - required(2, "id", Types.LongType.get()), - required(3, "dt", Types.StringType.get())); - - public static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); - - public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); - - public static final String DATABASE = "default"; - public static final String TABLE = "t"; - - public static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DATABASE, TABLE); -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java deleted file mode 100644 index d4de12c62300..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.io.File; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; -import org.apache.flink.types.Row; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.After; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Test; - -public class TestFlinkCatalogDatabase extends FlinkCatalogTestBase { - - public TestFlinkCatalogDatabase(String catalogName, Namespace baseNamepace) { - super(catalogName, baseNamepace); - } - - @After - @Override - public void clean() { - sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @Test - public void testCreateNamespace() { - Assert.assertFalse( - "Database should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s", flinkDatabase); - - Assert.assertTrue( - "Database should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue( - "Database should still exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - Assert.assertFalse( - "Database should be dropped", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue( - "Database should be created", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - } - - @Test - public void testDefaultDatabase() { - sql("USE CATALOG %s", catalogName); - sql("SHOW TABLES"); - - Assert.assertEquals( - "Should use the current catalog", getTableEnv().getCurrentCatalog(), catalogName); - Assert.assertEquals( - "Should use the configured default namespace", - getTableEnv().getCurrentDatabase(), - "default"); - } - - @Test - public void testDropEmptyDatabase() { - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s", flinkDatabase); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("DROP DATABASE %s", flinkDatabase); - - Assert.assertFalse( - "Namespace should have been dropped", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - } - - @Test - public void testDropNonEmptyNamespace() { - Assume.assumeFalse( - "Hadoop catalog throws IOException: Directory is not empty.", isHadoopCatalog); - - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s", flinkDatabase); - - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Assert.assertTrue( - "Table should exist", - validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))); - - AssertHelpers.assertThrowsCause( - "Should fail if trying to delete a non-empty database", - 
DatabaseNotEmptyException.class, - String.format("Database %s in catalog %s is not empty.", DATABASE, catalogName), - () -> sql("DROP DATABASE %s", flinkDatabase)); - - sql("DROP TABLE %s.tl", flinkDatabase); - } - - @Test - public void testListTables() { - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - Assert.assertEquals("Should not list any tables", 0, sql("SHOW TABLES").size()); - - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - - List tables = sql("SHOW TABLES"); - Assert.assertEquals("Only 1 table", 1, tables.size()); - Assert.assertEquals("Table name should match", "tl", tables.get(0).getField(0)); - } - - @Test - public void testListNamespace() { - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - List databases = sql("SHOW DATABASES"); - - if (isHadoopCatalog) { - Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals( - "Should have db and default database", - Sets.newHashSet("default", "db"), - Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); - - if (!baseNamespace.isEmpty()) { - // test namespace not belongs to this catalog - validationNamespaceCatalog.createNamespace( - Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); - databases = sql("SHOW DATABASES"); - Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals( - "Should have db and default database", - Sets.newHashSet("default", "db"), - Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); - } - } else { - // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the - // creation for default - // database. See HiveMetaStore.HMSHandler.init. 
- Assert.assertTrue( - "Should have db database", - databases.stream().anyMatch(d -> Objects.equals(d.getField(0), "db"))); - } - } - - @Test - public void testCreateNamespaceWithMetadata() { - Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - - Assert.assertEquals( - "Namespace should have expected prop value", "value", nsMetadata.get("prop")); - } - - @Test - public void testCreateNamespaceWithComment() { - Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - - Assert.assertEquals( - "Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); - } - - @Test - public void testCreateNamespaceWithLocation() throws Exception { - Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - File location = TEMPORARY_FOLDER.newFile(); - Assert.assertTrue(location.delete()); - - sql("CREATE DATABASE %s WITH ('location'='%s')", flinkDatabase, location); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - - Assert.assertEquals( - "Namespace should have expected location", - "file:" + location.getPath(), - nsMetadata.get("location")); - } - - @Test - public void testSetProperties() { - Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - sql("CREATE DATABASE %s", flinkDatabase); - - Assert.assertTrue( - "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - Map defaultMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertFalse( - "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); - - sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); - - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - - Assert.assertEquals( - "Namespace should have expected prop value", "value", nsMetadata.get("prop")); - } - - @Test - public void testHadoopNotSupportMeta() { - Assume.assumeTrue("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - - Assert.assertFalse( - "Namespace should not already exist", - validationNamespaceCatalog.namespaceExists(icebergNamespace)); - - AssertHelpers.assertThrowsCause( - "Should fail if trying to create database with location in 
hadoop catalog.", - UnsupportedOperationException.class, - String.format("Cannot create namespace %s: metadata is not supported", icebergNamespace), - () -> sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java deleted file mode 100644 index f7edd5653ebd..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.assertj.core.api.Assertions; -import org.junit.Before; -import org.junit.Test; - -public class TestFlinkCatalogFactory { - - private Map props; - - @Before - public void before() { - props = Maps.newHashMap(); - props.put("type", "iceberg"); - props.put(CatalogProperties.WAREHOUSE_LOCATION, "/tmp/location"); - } - - @Test - public void testCreateCreateCatalogHive() { - String catalogName = "hiveCatalog"; - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - Assertions.assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); - } - - @Test - public void testCreateCreateCatalogHadoop() { - String catalogName = "hadoopCatalog"; - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - Assertions.assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); - } - - @Test - public void testCreateCreateCatalogCustom() { - String catalogName = "customCatalog"; - props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - Assertions.assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); - } - - @Test - public void testCreateCreateCatalogCustomWithHiveCatalogTypeSet() { - String catalogName = 
"customCatalog"; - props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - - AssertHelpers.assertThrows( - "Should throw when both catalog-type and catalog-impl are set", - IllegalArgumentException.class, - "both catalog-type and catalog-impl are set", - () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); - } - - @Test - public void testLoadCatalogUnknown() { - String catalogName = "unknownCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "fooType"); - - AssertHelpers.assertThrows( - "Should throw when an unregistered / unknown catalog is set as the catalog factor's`type` setting", - UnsupportedOperationException.class, - "Unknown catalog-type", - () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); - } - - public static class CustomHadoopCatalog extends HadoopCatalog { - - public CustomHadoopCatalog() {} - - public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { - setConf(conf); - initialize( - "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java deleted file mode 100644 index 897480d495c3..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java +++ /dev/null @@ -1,419 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.util.Arrays; -import java.util.Collections; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.api.constraints.UniqueConstraint; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DataOperations; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.After; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Before; -import org.junit.Test; - -public class TestFlinkCatalogTable extends FlinkCatalogTestBase { - - public TestFlinkCatalogTable(String catalogName, Namespace baseNamepace) { - super(catalogName, baseNamepace); - } - - @Override - @Before - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @After - public void cleanNamespaces() { - sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); - sql("DROP TABLE IF EXISTS %s.tl2", flinkDatabase); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @Test - public void testGetTable() { - sql("CREATE TABLE tl(id BIGINT, strV STRING)"); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); - Schema iSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "strV", Types.StringType.get())); - Assert.assertEquals( - "Should load the expected iceberg schema", iSchema.toString(), table.schema().toString()); - } - - @Test - public void testRenameTable() { - Assume.assumeFalse("HadoopCatalog does not support rename table", isHadoopCatalog); - - final Schema tableSchema = - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); - validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); - sql("ALTER TABLE tl RENAME TO tl2"); - AssertHelpers.assertThrows( - "Should fail if trying to get a nonexistent table", - ValidationException.class, - "Table `tl` was not found.", - () -> getTableEnv().from("tl")); - Schema actualSchema = 
FlinkSchemaUtil.convert(getTableEnv().from("tl2").getSchema()); - Assert.assertEquals(tableSchema.asStruct(), actualSchema.asStruct()); - } - - @Test - public void testCreateTable() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - - Table table = table("tl"); - Assert.assertEquals( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct(), - table.schema().asStruct()); - Assert.assertEquals(Maps.newHashMap(), table.properties()); - - CatalogTable catalogTable = catalogTable("tl"); - Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); - Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); - } - - @Test - public void testCreateTableWithPrimaryKey() throws Exception { - sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); - - Table table = table("tl"); - Assert.assertEquals( - "Should have the expected row key.", - Sets.newHashSet(table.schema().findField("key").fieldId()), - table.schema().identifierFieldIds()); - - CatalogTable catalogTable = catalogTable("tl"); - Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue( - "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals( - "Should have the expected columns", - ImmutableList.of("key"), - uniqueConstraintOptional.get().getColumns()); - } - - @Test - public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { - sql( - "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); - - Table table = table("tl"); - Assert.assertEquals( - "Should have the expected RowKey", - Sets.newHashSet( - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId()), - table.schema().identifierFieldIds()); - - CatalogTable catalogTable = catalogTable("tl"); - Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue( - "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals( - "Should have the expected columns", - ImmutableSet.of("data", "id"), - ImmutableSet.copyOf(uniqueConstraintOptional.get().getColumns())); - } - - @Test - public void testCreateTableIfNotExists() { - sql("CREATE TABLE tl(id BIGINT)"); - - // Assert that table does exist. 
- Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); - - sql("DROP TABLE tl"); - AssertHelpers.assertThrows( - "Table 'tl' should be dropped", - NoSuchTableException.class, - "Table does not exist: " + getFullQualifiedTableName("tl"), - () -> table("tl")); - - sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); - - final Map expectedProperties = ImmutableMap.of("key", "value"); - table("tl").updateProperties().set("key", "value").commit(); - Assert.assertEquals(expectedProperties, table("tl").properties()); - - sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - Assert.assertEquals( - "Should still be the old table.", expectedProperties, table("tl").properties()); - } - - @Test - public void testCreateTableLike() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - sql("CREATE TABLE tl2 LIKE tl"); - - Table table = table("tl2"); - Assert.assertEquals( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct(), - table.schema().asStruct()); - Assert.assertEquals(Maps.newHashMap(), table.properties()); - - CatalogTable catalogTable = catalogTable("tl2"); - Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); - Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); - } - - @Test - public void testCreateTableLocation() { - Assume.assumeFalse( - "HadoopCatalog does not support creating table with location", isHadoopCatalog); - - sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); - - Table table = table("tl"); - Assert.assertEquals( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct(), - table.schema().asStruct()); - Assert.assertEquals("file:///tmp/location", table.location()); - Assert.assertEquals(Maps.newHashMap(), table.properties()); - } - - @Test - public void testCreatePartitionTable() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT, dt STRING) PARTITIONED BY(dt)"); - - Table table = table("tl"); - Assert.assertEquals( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct(), - table.schema().asStruct()); - Assert.assertEquals( - PartitionSpec.builderFor(table.schema()).identity("dt").build(), table.spec()); - Assert.assertEquals(Maps.newHashMap(), table.properties()); - - CatalogTable catalogTable = catalogTable("tl"); - Assert.assertEquals( - TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("dt", DataTypes.STRING()) - .build(), - catalogTable.getSchema()); - Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); - Assert.assertEquals(Collections.singletonList("dt"), catalogTable.getPartitionKeys()); - } - - @Test - public void testCreateTableWithFormatV2ThroughTableProperty() throws Exception { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); - - Table table = table("tl"); - Assert.assertEquals( - "should create table using format v2", - 2, - ((BaseTable) table).operations().current().formatVersion()); - } - - @Test - public void testUpgradeTableWithFormatV2ThroughTableProperty() throws Exception { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='1')"); - - Table table = table("tl"); - TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); - - sql("ALTER TABLE tl 
SET('format-version'='2')"); - Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); - } - - @Test - public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws Exception { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); - - Table table = table("tl"); - TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - - AssertHelpers.assertThrowsRootCause( - "should fail to downgrade to v1", - IllegalArgumentException.class, - "Cannot downgrade v2 table to v1", - () -> sql("ALTER TABLE tl SET('format-version'='1')")); - } - - @Test - public void testLoadTransformPartitionTable() throws TableNotExistException { - Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - schema, - PartitionSpec.builderFor(schema).bucket("id", 100).build()); - - CatalogTable catalogTable = catalogTable("tl"); - Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); - Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); - Assert.assertEquals(Collections.emptyList(), catalogTable.getPartitionKeys()); - } - - @Test - public void testAlterTable() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT) WITH ('oldK'='oldV')"); - Map properties = Maps.newHashMap(); - properties.put("oldK", "oldV"); - - // new - sql("ALTER TABLE tl SET('newK'='newV')"); - properties.put("newK", "newV"); - Assert.assertEquals(properties, table("tl").properties()); - - // update old - sql("ALTER TABLE tl SET('oldK'='oldV2')"); - properties.put("oldK", "oldV2"); - Assert.assertEquals(properties, table("tl").properties()); - - // remove property - CatalogTable catalogTable = catalogTable("tl"); - properties.remove("oldK"); - getTableEnv() - .getCatalog(getTableEnv().getCurrentCatalog()) - .get() - .alterTable(new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); - Assert.assertEquals(properties, table("tl").properties()); - } - - @Test - public void testRelocateTable() { - Assume.assumeFalse("HadoopCatalog does not support relocate table", isHadoopCatalog); - - sql("CREATE TABLE tl(id BIGINT)"); - sql("ALTER TABLE tl SET('location'='file:///tmp/location')"); - Assert.assertEquals("file:///tmp/location", table("tl").location()); - } - - @Test - public void testSetCurrentAndCherryPickSnapshotId() { - sql("CREATE TABLE tl(c1 INT, c2 STRING, c3 STRING) PARTITIONED BY (c1)"); - - Table table = table("tl"); - - DataFile fileA = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile fileB = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile replacementFile = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-a-replacement.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - table.newAppend().appendFile(fileA).commit(); - long snapshotId = table.currentSnapshot().snapshotId(); - - // stage an overwrite that replaces FILE_A - 
table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); - - Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals( - "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); - - // add another append so that the original commit can't be fast-forwarded - table.newAppend().appendFile(fileB).commit(); - - // test cherry pick - sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); - validateTableFiles(table, fileB, replacementFile); - - // test set current snapshot - sql("ALTER TABLE tl SET('current-snapshot-id'='%s')", snapshotId); - validateTableFiles(table, fileA); - } - - private void validateTableFiles(Table tbl, DataFile... expectedFiles) { - tbl.refresh(); - Set expectedFilePaths = - Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); - Set actualFilePaths = - StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) - .map(FileScanTask::file) - .map(ContentFile::path) - .collect(Collectors.toSet()); - Assert.assertEquals("Files should match", expectedFilePaths, actualFilePaths); - } - - private Table table(String name) { - return validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, name)); - } - - private CatalogTable catalogTable(String name) throws TableNotExistException { - return (CatalogTable) - getTableEnv() - .getCatalog(getTableEnv().getCurrentCatalog()) - .get() - .getTable(new ObjectPath(DATABASE, name)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java deleted file mode 100644 index 839700f50127..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.flink.FlinkCatalogFactory.CACHE_ENABLED; - -import java.util.List; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runners.Parameterized; - -public class TestFlinkCatalogTablePartitions extends FlinkCatalogTestBase { - - private String tableName = "test_table"; - - private final FileFormat format; - - @Parameterized.Parameters( - name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") - public static Iterable parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { - for (Boolean cacheEnabled : new Boolean[] {true, false}) { - for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { - String catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add(new Object[] {catalogName, baseNamespace, format, cacheEnabled}); - } - } - } - return parameters; - } - - public TestFlinkCatalogTablePartitions( - String catalogName, Namespace baseNamespace, FileFormat format, boolean cacheEnabled) { - super(catalogName, baseNamespace); - this.format = format; - config.put(CACHE_ENABLED, String.valueOf(cacheEnabled)); - } - - @Override - @Before - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @After - public void cleanNamespaces() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @Test - public void testListPartitionsWithUnpartitionedTable() { - sql( - "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", - tableName, format.name()); - sql("INSERT INTO %s SELECT 1,'a'", tableName); - - ObjectPath objectPath = new ObjectPath(DATABASE, tableName); - FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); - AssertHelpers.assertThrows( - "Should not list partitions for unpartitioned table.", - TableNotPartitionedException.class, - () -> flinkCatalog.listPartitions(objectPath)); - } - - @Test - public void testListPartitionsWithPartitionedTable() - throws TableNotExistException, TableNotPartitionedException { - sql( - "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " - + "with ('write.format.default'='%s')", - tableName, format.name()); - sql("INSERT INTO %s SELECT 1,'a'", tableName); - sql("INSERT INTO %s SELECT 2,'b'", tableName); - - ObjectPath objectPath = new ObjectPath(DATABASE, tableName); - FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); - List list = flinkCatalog.listPartitions(objectPath); - Assert.assertEquals("Should have 2 partitions", 2, list.size()); - - List expected = Lists.newArrayList(); - CatalogPartitionSpec partitionSpec1 = 
new CatalogPartitionSpec(ImmutableMap.of("data", "a")); - CatalogPartitionSpec partitionSpec2 = new CatalogPartitionSpec(ImmutableMap.of("data", "b")); - expected.add(partitionSpec1); - expected.add(partitionSpec2); - Assert.assertEquals("Should produce the expected catalog partition specs.", list, expected); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java deleted file mode 100644 index c89ea4f53054..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Expressions; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.expressions.ApiExpressionUtils; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.UnresolvedCallExpression; -import org.apache.flink.table.expressions.UnresolvedReferenceExpression; -import org.apache.flink.table.expressions.ValueLiteralExpression; -import org.apache.flink.table.expressions.utils.ApiExpressionDefaultVisitor; -import org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.iceberg.expressions.And; -import org.apache.iceberg.expressions.BoundLiteralPredicate; -import org.apache.iceberg.expressions.Not; -import org.apache.iceberg.expressions.Or; -import org.apache.iceberg.expressions.UnboundPredicate; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.Pair; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; - -public class TestFlinkFilters { - - private static final TableSchema TABLE_SCHEMA = - TableSchema.builder() - .field("field1", DataTypes.INT()) - .field("field2", DataTypes.BIGINT()) - .field("field3", DataTypes.FLOAT()) - .field("field4", DataTypes.DOUBLE()) - .field("field5", DataTypes.STRING()) - .field("field6", DataTypes.BOOLEAN()) - .field("field7", DataTypes.BINARY(2)) - .field("field8", 
DataTypes.DECIMAL(10, 2)) - .field("field9", DataTypes.DATE()) - .field("field10", DataTypes.TIME()) - .field("field11", DataTypes.TIMESTAMP()) - .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .build(); - - // A map list of fields and values used to verify the conversion of flink expression to iceberg - // expression - private static final List> FIELD_VALUE_LIST = - ImmutableList.of( - Pair.of("field1", 1), - Pair.of("field2", 2L), - Pair.of("field3", 3F), - Pair.of("field4", 4D), - Pair.of("field5", "iceberg"), - Pair.of("field6", true), - Pair.of("field7", new byte[] {'a', 'b'}), - Pair.of("field8", BigDecimal.valueOf(10.12)), - Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), - Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), - Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), - Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); - - @Test - public void testFlinkDataTypeEqual() { - matchLiteral("field1", 1, 1); - matchLiteral("field2", 10L, 10L); - matchLiteral("field3", 1.2F, 1.2F); - matchLiteral("field4", 3.4D, 3.4D); - matchLiteral("field5", "abcd", "abcd"); - matchLiteral("field6", true, true); - matchLiteral("field7", new byte[] {'a', 'b'}, ByteBuffer.wrap(new byte[] {'a', 'b'})); - matchLiteral("field8", BigDecimal.valueOf(10.12), BigDecimal.valueOf(10.12)); - - LocalDate date = LocalDate.parse("2020-12-23"); - matchLiteral("field9", date, DateTimeUtil.daysFromDate(date)); - - LocalTime time = LocalTime.parse("12:13:14"); - matchLiteral("field10", time, DateTimeUtil.microsFromTime(time)); - - LocalDateTime dateTime = LocalDateTime.parse("2020-12-23T12:13:14"); - matchLiteral("field11", dateTime, DateTimeUtil.microsFromTimestamp(dateTime)); - - Instant instant = Instant.parse("2020-12-23T12:13:14.00Z"); - matchLiteral("field12", instant, DateTimeUtil.microsFromInstant(instant)); - } - - @Test - public void testEquals() { - for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - } - - @Test - public void testEqualsNaN() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNaN("field3"); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field3").isEqual(Expressions.lit(Float.NaN)))); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isEqual(Expressions.$("field3")))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testNotEquals() { - for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); - 
Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - } - - @Test - public void testNotEqualsNaN() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testGreaterThan() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isLess(Expressions.$("field1")))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testGreaterThanEquals() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isLessOrEqual(Expressions.$("field1")))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testLessThan() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isGreater(Expressions.$("field1")))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testLessThanEquals() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isGreaterOrEqual(Expressions.$("field1")))); - Assert.assertTrue("Conversion should succeed", actual1.isPresent()); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testIsNull() { - Expression expr = 
resolve(Expressions.$("field1").isNull()); - Optional actual = FlinkFilters.convert(expr); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNull("field1"); - assertPredicatesMatch(expected, actual.get()); - } - - @Test - public void testIsNotNull() { - Expression expr = resolve(Expressions.$("field1").isNotNull()); - Optional actual = FlinkFilters.convert(expr); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.notNull("field1"); - assertPredicatesMatch(expected, actual.get()); - } - - @Test - public void testAnd() { - Expression expr = - resolve( - Expressions.$("field1") - .isEqual(Expressions.lit(1)) - .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); - Optional actual = FlinkFilters.convert(expr); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - And and = (And) actual.get(); - And expected = - (And) - org.apache.iceberg.expressions.Expressions.and( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); - - assertPredicatesMatch(expected.left(), and.left()); - assertPredicatesMatch(expected.right(), and.right()); - } - - @Test - public void testOr() { - Expression expr = - resolve( - Expressions.$("field1") - .isEqual(Expressions.lit(1)) - .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); - Optional actual = FlinkFilters.convert(expr); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - Or or = (Or) actual.get(); - Or expected = - (Or) - org.apache.iceberg.expressions.Expressions.or( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); - - assertPredicatesMatch(expected.left(), or.left()); - assertPredicatesMatch(expected.right(), or.right()); - } - - @Test - public void testNot() { - Expression expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.NOT, - Expressions.$("field1").isEqual(Expressions.lit(1)))); - Optional actual = FlinkFilters.convert(expr); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - Not not = (Not) actual.get(); - Not expected = - (Not) - org.apache.iceberg.expressions.Expressions.not( - org.apache.iceberg.expressions.Expressions.equal("field1", 1)); - - Assert.assertEquals("Predicate operation should match", expected.op(), not.op()); - assertPredicatesMatch(expected.child(), not.child()); - } - - @Test - public void testLike() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); - Expression expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); - Optional actual = FlinkFilters.convert(expr); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - assertPredicatesMatch(expected, actual.get()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); - actual = FlinkFilters.convert(expr); - Assert.assertFalse("Conversion should fail", actual.isPresent()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, - Expressions.$("field5"), - Expressions.lit("%abc%"))); - actual = FlinkFilters.convert(expr); - 
Assert.assertFalse("Conversion should fail", actual.isPresent()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, - Expressions.$("field5"), - Expressions.lit("abc%d"))); - actual = FlinkFilters.convert(expr); - Assert.assertFalse("Conversion should fail", actual.isPresent()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); - actual = FlinkFilters.convert(expr); - Assert.assertFalse("Conversion should fail", actual.isPresent()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); - actual = FlinkFilters.convert(expr); - Assert.assertFalse("Conversion should fail", actual.isPresent()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); - actual = FlinkFilters.convert(expr); - Assert.assertFalse("Conversion should fail", actual.isPresent()); - } - - @SuppressWarnings("unchecked") - private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLiteral) { - Expression expr = resolve(Expressions.$(fieldName).isEqual(Expressions.lit(flinkLiteral))); - Optional actual = FlinkFilters.convert(expr); - Assert.assertTrue("Conversion should succeed", actual.isPresent()); - org.apache.iceberg.expressions.Expression expression = actual.get(); - Assertions.assertThat(expression) - .as("The expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - UnboundPredicate unboundPredicate = (UnboundPredicate) expression; - - org.apache.iceberg.expressions.Expression expression1 = - unboundPredicate.bind(FlinkSchemaUtil.convert(TABLE_SCHEMA).asStruct(), false); - Assertions.assertThat(expression1) - .as("The expression should be a BoundLiteralPredicate") - .isInstanceOf(BoundLiteralPredicate.class); - - BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; - Assert.assertTrue("Should match the literal", predicate.test(icebergLiteral)); - } - - private static Expression resolve(Expression originalExpression) { - return originalExpression.accept( - new ApiExpressionDefaultVisitor() { - @Override - public Expression visit(UnresolvedReferenceExpression unresolvedReference) { - String name = unresolvedReference.getName(); - Optional field = TABLE_SCHEMA.getTableColumn(name); - if (field.isPresent()) { - int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); - return new FieldReferenceExpression(name, field.get().getType(), 0, index); - } else { - return null; - } - } - - @Override - public Expression visit(UnresolvedCallExpression unresolvedCall) { - List children = - unresolvedCall.getChildren().stream() - .map(e -> (ResolvedExpression) e.accept(this)) - .collect(Collectors.toList()); - return new CallExpression( - unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); - } - - @Override - public Expression visit(ValueLiteralExpression valueLiteral) { - return valueLiteral; - } - - @Override - protected Expression defaultMethod(Expression expression) { - throw new UnsupportedOperationException( - String.format("unsupported expression: %s", expression)); - } - }); - } - - private void assertPredicatesMatch( - org.apache.iceberg.expressions.Expression expected, - org.apache.iceberg.expressions.Expression actual) { - Assertions.assertThat(expected) - .as("The expected expression should be a UnboundPredicate") - 
.isInstanceOf(UnboundPredicate.class); - Assertions.assertThat(actual) - .as("The actual expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - UnboundPredicate predicateExpected = (UnboundPredicate) expected; - UnboundPredicate predicateActual = (UnboundPredicate) actual; - Assert.assertEquals( - "Predicate operation should match", predicateExpected.op(), predicateActual.op()); - Assert.assertEquals( - "Predicate literal should match", predicateExpected.literal(), predicateActual.literal()); - Assert.assertEquals( - "Predicate name should match", - predicateExpected.ref().name(), - predicateActual.ref().name()); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java deleted file mode 100644 index 64746356636b..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestFlinkHiveCatalog extends FlinkTestBase { - - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - - @Test - public void testCreateCatalogWithWarehouseLocation() throws IOException { - Map props = Maps.newHashMap(); - props.put("type", "iceberg"); - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); - props.put(CatalogProperties.URI, FlinkCatalogTestBase.getURI(hiveConf)); - - File warehouseDir = tempFolder.newFolder(); - props.put(CatalogProperties.WAREHOUSE_LOCATION, "file://" + warehouseDir.getAbsolutePath()); - - checkSQLQuery(props, warehouseDir); - } - - @Test - public void testCreateCatalogWithHiveConfDir() throws IOException { - // Dump the hive conf into a local file. - File hiveConfDir = tempFolder.newFolder(); - File hiveSiteXML = new File(hiveConfDir, "hive-site.xml"); - File warehouseDir = tempFolder.newFolder(); - try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) { - Configuration newConf = new Configuration(hiveConf); - // Set another new directory which is different with the hive metastore's warehouse path. 
- newConf.set( - HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); - newConf.writeXml(fos); - } - Assert.assertTrue("hive-site.xml should be created now.", Files.exists(hiveSiteXML.toPath())); - - // Construct the catalog attributions. - Map props = Maps.newHashMap(); - props.put("type", "iceberg"); - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); - props.put(CatalogProperties.URI, FlinkCatalogTestBase.getURI(hiveConf)); - // Set the 'hive-conf-dir' instead of 'warehouse' - props.put(FlinkCatalogFactory.HIVE_CONF_DIR, hiveConfDir.getAbsolutePath()); - - checkSQLQuery(props, warehouseDir); - } - - private void checkSQLQuery(Map catalogProperties, File warehouseDir) - throws IOException { - sql( - "CREATE CATALOG test_catalog WITH %s", - FlinkCatalogTestBase.toWithClause(catalogProperties)); - sql("USE CATALOG test_catalog"); - sql("CREATE DATABASE test_db"); - sql("USE test_db"); - sql("CREATE TABLE test_table(c1 INT, c2 STRING)"); - sql("INSERT INTO test_table SELECT 1, 'a'"); - - Path databasePath = warehouseDir.toPath().resolve("test_db.db"); - Assert.assertTrue("Database path should exist", Files.exists(databasePath)); - - Path tablePath = databasePath.resolve("test_table"); - Assert.assertTrue("Table path should exist", Files.exists(tablePath)); - - Path dataPath = tablePath.resolve("data"); - Assert.assertTrue("Table data path should exist", Files.exists(dataPath)); - Assert.assertEquals( - "Should have a .crc file and a .parquet file", 2, Files.list(dataPath).count()); - - sql("DROP TABLE test_table"); - sql("DROP DATABASE test_db"); - sql("DROP CATALOG test_catalog"); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java deleted file mode 100644 index b5dfb9cb2f6b..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.CharType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; - -public class TestFlinkSchemaUtil { - - @Test - public void testConvertFlinkSchemaToIcebergSchema() { - TableSchema flinkSchema = - TableSchema.builder() - .field("id", DataTypes.INT().notNull()) - .field("name", DataTypes.STRING()) /* optional by default */ - .field("salary", DataTypes.DOUBLE().notNull()) - .field( - "locations", - DataTypes.MAP( - DataTypes.STRING(), - DataTypes.ROW( - DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), - DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) - .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) - .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) - .field("char", DataTypes.CHAR(10).notNull()) - .field("varchar", DataTypes.VARCHAR(10).notNull()) - .field("boolean", DataTypes.BOOLEAN().nullable()) - .field("tinyint", DataTypes.TINYINT()) - .field("smallint", DataTypes.SMALLINT()) - .field("bigint", DataTypes.BIGINT()) - .field("varbinary", DataTypes.VARBINARY(10)) - .field("binary", DataTypes.BINARY(10)) - .field("time", DataTypes.TIME()) - .field("timestampWithoutZone", DataTypes.TIMESTAMP()) - .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .field("date", DataTypes.DATE()) - .field("decimal", DataTypes.DECIMAL(2, 2)) - .field("decimal2", DataTypes.DECIMAL(38, 2)) - .field("decimal3", DataTypes.DECIMAL(10, 1)) - .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get(), null), - Types.NestedField.optional(1, "name", Types.StringType.get(), null), - Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), - Types.NestedField.optional( - 3, - "locations", - Types.MapType.ofOptional( - 24, - 25, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), - Types.NestedField.required( - 23, "posY", Types.DoubleType.get(), "Y field")))), - Types.NestedField.optional( - 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), - Types.NestedField.optional( - 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), - Types.NestedField.required(6, "char", Types.StringType.get()), - Types.NestedField.required(7, "varchar", Types.StringType.get()), - Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), - 
Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), - Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(11, "bigint", Types.LongType.get()), - Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), - Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), - Types.NestedField.optional(14, "time", Types.TimeType.get()), - Types.NestedField.optional( - 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.optional(17, "date", Types.DateType.get()), - Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), - Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), - Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), - Types.NestedField.optional( - 21, - "multiset", - Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); - - checkSchema(flinkSchema, icebergSchema); - } - - @Test - public void testMapField() { - TableSchema flinkSchema = - TableSchema.builder() - .field( - "map_int_long", - DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ - .field( - "map_int_array_string", - DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) - .field( - "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) - .field( - "map_fields_fields", - DataTypes.MAP( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), - DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) - .notNull(), /* Required */ - DataTypes.ROW( - DataTypes.FIELD( - "field_array", - DataTypes.ARRAY(DataTypes.STRING()), - "doc - array")) - .notNull() /* Required */) - .notNull() /* Required */) - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "map_int_long", - Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), - null), - Types.NestedField.optional( - 1, - "map_int_array_string", - Types.MapType.ofOptional( - 7, - 8, - Types.ListType.ofOptional(6, Types.IntegerType.get()), - Types.StringType.get()), - null), - Types.NestedField.optional( - 2, - "map_decimal_string", - Types.MapType.ofOptional( - 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), - Types.NestedField.required( - 3, - "map_fields_fields", - Types.MapType.ofRequired( - 15, - 16, - Types.StructType.of( - Types.NestedField.optional( - 11, "field_int", Types.IntegerType.get(), "doc - int"), - Types.NestedField.optional( - 12, "field_string", Types.StringType.get(), "doc - string")), - Types.StructType.of( - Types.NestedField.optional( - 14, - "field_array", - Types.ListType.ofOptional(13, Types.StringType.get()), - "doc - array"))))); - - checkSchema(flinkSchema, icebergSchema); - } - - @Test - public void testStructField() { - TableSchema flinkSchema = - TableSchema.builder() - .field( - "struct_int_string_decimal", - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()), - DataTypes.FIELD("field_string", DataTypes.STRING()), - DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), - DataTypes.FIELD( - "field_struct", - DataTypes.ROW( - DataTypes.FIELD("inner_struct_int", DataTypes.INT()), - DataTypes.FIELD( - "inner_struct_float_array", - DataTypes.ARRAY(DataTypes.FLOAT()))) - .notNull()) /* Row is required */) - .notNull()) /* Required */ - .field( - "struct_map_int_int", - DataTypes.ROW( - 
DataTypes.FIELD( - "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) - .nullable()) /* Optional */ - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "struct_int_string_decimal", - Types.StructType.of( - Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), - Types.NestedField.optional(6, "field_string", Types.StringType.get()), - Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), - Types.NestedField.required( - 8, - "field_struct", - Types.StructType.of( - Types.NestedField.optional( - 3, "inner_struct_int", Types.IntegerType.get()), - Types.NestedField.optional( - 4, - "inner_struct_float_array", - Types.ListType.ofOptional(2, Types.FloatType.get())))))), - Types.NestedField.optional( - 1, - "struct_map_int_int", - Types.StructType.of( - Types.NestedField.optional( - 11, - "field_map", - Types.MapType.ofOptional( - 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); - - checkSchema(flinkSchema, icebergSchema); - } - - @Test - public void testListField() { - TableSchema flinkSchema = - TableSchema.builder() - .field( - "list_struct_fields", - DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) - .notNull()) /* Required */ - .field( - "list_optional_struct_fields", - DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD( - "field_timestamp_with_local_time_zone", - DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) - .nullable()) /* Optional */ - .field( - "list_map_fields", - DataTypes.ARRAY( - DataTypes.MAP( - DataTypes.ARRAY( - DataTypes.INT().notNull()), /* Key of map must be required */ - DataTypes.ROW( - DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) - .notNull()) - .notNull()) /* Required */ - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "list_struct_fields", - Types.ListType.ofOptional( - 4, - Types.StructType.of( - Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), - Types.NestedField.optional( - 1, - "list_optional_struct_fields", - Types.ListType.ofOptional( - 6, - Types.StructType.of( - Types.NestedField.optional( - 5, - "field_timestamp_with_local_time_zone", - Types.TimestampType.withZone())))), - Types.NestedField.required( - 2, - "list_map_fields", - Types.ListType.ofRequired( - 11, - Types.MapType.ofOptional( - 9, - 10, - Types.ListType.ofRequired(7, Types.IntegerType.get()), - Types.StructType.of( - Types.NestedField.optional( - 8, "field_0", Types.IntegerType.get(), "doc - int")))))); - - checkSchema(flinkSchema, icebergSchema); - } - - private void checkSchema(TableSchema flinkSchema, Schema icebergSchema) { - Assert.assertEquals(icebergSchema.asStruct(), FlinkSchemaUtil.convert(flinkSchema).asStruct()); - // The conversion is not a 1:1 mapping, so we just check iceberg types. 
- Assert.assertEquals( - icebergSchema.asStruct(), - FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) - .asStruct()); - } - - @Test - public void testInconsistentTypes() { - checkInconsistentType( - Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); - checkInconsistentType( - Types.StringType.get(), - new VarCharType(VarCharType.MAX_LENGTH), - new CharType(100), - Types.StringType.get()); - checkInconsistentType( - Types.BinaryType.get(), - new VarBinaryType(VarBinaryType.MAX_LENGTH), - new VarBinaryType(100), - Types.BinaryType.get()); - checkInconsistentType( - Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); - checkInconsistentType( - Types.TimestampType.withoutZone(), - new TimestampType(6), - new TimestampType(3), - Types.TimestampType.withoutZone()); - checkInconsistentType( - Types.TimestampType.withZone(), - new LocalZonedTimestampType(6), - new LocalZonedTimestampType(3), - Types.TimestampType.withZone()); - } - - private void checkInconsistentType( - Type icebergType, - LogicalType flinkExpectedType, - LogicalType flinkType, - Type icebergExpectedType) { - Assert.assertEquals(flinkExpectedType, FlinkSchemaUtil.convert(icebergType)); - Assert.assertEquals( - Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType)), - FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(RowType.of(flinkType))).asStruct()); - } - - @Test - public void testConvertFlinkSchemaBaseOnIcebergSchema() { - Schema baseSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required(101, "int", Types.IntegerType.get()), - Types.NestedField.optional(102, "string", Types.StringType.get())), - Sets.newHashSet(101)); - - TableSchema flinkSchema = - TableSchema.builder() - .field("int", DataTypes.INT().notNull()) - .field("string", DataTypes.STRING().nullable()) - .primaryKey("int") - .build(); - Schema convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); - Assert.assertEquals(baseSchema.asStruct(), convertedSchema.asStruct()); - Assert.assertEquals(ImmutableSet.of(101), convertedSchema.identifierFieldIds()); - } - - @Test - public void testConvertFlinkSchemaWithPrimaryKeys() { - Schema icebergSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required(1, "int", Types.IntegerType.get()), - Types.NestedField.required(2, "string", Types.StringType.get())), - Sets.newHashSet(1, 2)); - - TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); - Assert.assertTrue(tableSchema.getPrimaryKey().isPresent()); - Assert.assertEquals( - ImmutableSet.of("int", "string"), - ImmutableSet.copyOf(tableSchema.getPrimaryKey().get().getColumns())); - } - - @Test - public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { - Schema icebergSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required( - 1, - "struct", - Types.StructType.of( - Types.NestedField.required(2, "inner", Types.IntegerType.get())))), - Sets.newHashSet(2)); - AssertHelpers.assertThrows( - "Does not support the nested columns in flink schema's primary keys", - ValidationException.class, - "Column 'struct.inner' does not exist", - () -> FlinkSchemaUtil.toSchema(icebergSchema)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java deleted file mode 100644 index 23bd7cf47c17..000000000000 --- 
a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ /dev/null @@ -1,343 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.flink.streaming.api.TimeCharacteristic; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.Expressions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.After; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestFlinkTableSink extends FlinkCatalogTestBase { - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - - private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; - private static final String TABLE_NAME = "test_table"; - private TableEnvironment tEnv; - private Table icebergTable; - - private final FileFormat format; - private final boolean isStreamingJob; - - @Parameterized.Parameters( - name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") - public static Iterable parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { - for (Boolean isStreaming : new Boolean[] {true, false}) { - for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { - String 
catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); - } - } - } - return parameters; - } - - public TestFlinkTableSink( - String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { - super(catalogName, baseNamespace); - this.format = format; - this.isStreamingJob = isStreamingJob; - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = - EnvironmentSettings.newInstance().useBlinkPlanner(); - if (isStreamingJob) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - } - } - return tEnv; - } - - @Override - @Before - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - sql( - "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", - TABLE_NAME, format.name()); - icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - } - - @Override - @After - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - BoundedTableFactory.clearDataSets(); - super.clean(); - } - - @Test - public void testInsertFromSourceTable() throws Exception { - // Register the rows into a temporary table. - getTableEnv() - .createTemporaryView( - "sourceTable", - getTableEnv() - .fromValues( - SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), - Expressions.row(1, "hello"), - Expressions.row(2, "world"), - Expressions.row(3, (String) null), - Expressions.row(null, "bar"))); - - // Redirect the records from source table to destination table. - sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); - - // Assert the table records as expected. 
- SimpleDataUtil.assertTableRecords( - icebergTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, null), - SimpleDataUtil.createRecord(null, "bar"))); - } - - @Test - public void testOverwriteTable() throws Exception { - Assume.assumeFalse( - "Flink unbounded streaming does not support overwrite operation", isStreamingJob); - - sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); - SimpleDataUtil.assertTableRecords( - icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); - - sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); - SimpleDataUtil.assertTableRecords( - icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); - } - - @Test - public void testReplacePartitions() throws Exception { - Assume.assumeFalse( - "Flink unbounded streaming does not support overwrite operation", isStreamingJob); - String tableName = "test_partition"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", - tableName, format.name()); - - try { - Table partitionedTable = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - - sql("INSERT INTO %s SELECT 1, 'a'", tableName); - sql("INSERT INTO %s SELECT 2, 'b'", tableName); - sql("INSERT INTO %s SELECT 3, 'c'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"))); - - sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); - sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(5, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c"))); - - sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(6, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @Test - public void testInsertIntoPartition() throws Exception { - String tableName = "test_insert_into_partition"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", - tableName, format.name()); - - try { - Table partitionedTable = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - - // Full partition. - sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); - sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); - sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"))); - - // Partial partition. 
- sql("INSERT INTO %s SELECT 4, 'c'", tableName); - sql("INSERT INTO %s SELECT 5, 'd'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"), - SimpleDataUtil.createRecord(4, "c"), - SimpleDataUtil.createRecord(5, "d"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @Test - public void testHashDistributeMode() throws Exception { - String tableName = "test_hash_distribution_mode"; - Map tableProps = - ImmutableMap.of( - "write.format.default", - format.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, - DistributionMode.HASH.modeName()); - - // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. - List dataSet = - IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); - String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - Assert.assertEquals( - "Should have the expected rows in source table.", - Sets.newHashSet(dataSet), - Sets.newHashSet(sql("SELECT * FROM %s", SOURCE_TABLE))); - - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableProps)); - - try { - // Insert data set. - sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - Assert.assertEquals( - "Should have the expected rows in sink table.", - Sets.newHashSet(dataSet), - Sets.newHashSet(sql("SELECT * FROM %s", tableName))); - - // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, - // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per - // partition. - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); - for (List dataFiles : snapshotToDataFiles.values()) { - if (dataFiles.isEmpty()) { - continue; - } - - Assert.assertEquals( - "There should be 1 data file in partition 'aaa'", - 1, - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "aaa")) - .size()); - Assert.assertEquals( - "There should be 1 data file in partition 'bbb'", - 1, - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "bbb")) - .size()); - Assert.assertEquals( - "There should be 1 data file in partition 'ccc'", - 1, - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "ccc")) - .size()); - } - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java deleted file mode 100644 index 8f30f13db7e0..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java +++ /dev/null @@ -1,653 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.SqlParserException; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.events.Listeners; -import org.apache.iceberg.events.ScanEvent; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.assertj.core.api.Assertions; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -public class TestFlinkTableSource extends FlinkTestBase { - - private static final String CATALOG_NAME = "test_catalog"; - private static final String DATABASE_NAME = "test_db"; - private static final String TABLE_NAME = "test_table"; - private final FileFormat format = FileFormat.AVRO; - private static String warehouse; - - private int scanEventCount = 0; - private ScanEvent lastScanEvent = null; - - public TestFlinkTableSource() { - // register a scan event listener to validate pushdown - Listeners.register( - event -> { - scanEventCount += 1; - lastScanEvent = event; - }, - ScanEvent.class); - } - - @Override - protected TableEnvironment getTableEnv() { - super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - @BeforeClass - public static void createWarehouse() throws IOException { - File warehouseFile = TEMPORARY_FOLDER.newFolder(); - Assert.assertTrue("The warehouse should be deleted", warehouseFile.delete()); - // before variables - warehouse = "file:" + warehouseFile; - } - - @Before - public void before() { - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - sql( - "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", - TABLE_NAME, format.name()); - sql( - "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", - TABLE_NAME); - - this.scanEventCount = 0; - this.lastScanEvent = null; - } - - @After - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, TABLE_NAME); - sql("DROP DATABASE IF EXISTS %s", DATABASE_NAME); - sql("DROP CATALOG IF EXISTS %s", CATALOG_NAME); - } - - @Test - public void testLimitPushDown() { - - AssertHelpers.assertThrows( - "Invalid limit number: -1 ", - SqlParserException.class, - () -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)); - - Assert.assertEquals( - "Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size()); - - String sqlLimitExceed = 
String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); - List resultExceed = sql(sqlLimitExceed); - Assert.assertEquals("Should have 3 records", 3, resultExceed.size()); - List expectedList = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedList, resultExceed); - - String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); - String explain = getTableEnv().explainSql(querySql); - String expectedExplain = "limit=[1]"; - Assert.assertTrue("Explain should contain LimitPushDown", explain.contains(expectedExplain)); - List result = sql(querySql); - Assert.assertEquals("Should have 1 record", 1, result.size()); - Assertions.assertThat(result).containsAnyElementsOf(expectedList); - - String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); - List mixedResult = sql(sqlMixed); - Assert.assertEquals("Should have 1 record", 1, mixedResult.size()); - Assert.assertEquals( - "Should produce the expected records", Row.of(1, "iceberg", 10.0), mixedResult.get(0)); - } - - @Test - public void testNoFilterPushDown() { - String sql = String.format("SELECT * FROM %s ", TABLE_NAME); - List result = sql(sql); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedRecords, result); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - } - - @Test - public void testFilterPushDownEqual() { - String sqlLiteralRight = String.format("SELECT * FROM %s WHERE id = 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") == 1"; - - List result = sql(sqlLiteralRight); - Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownEqualNull() { - String sqlEqualNull = String.format("SELECT * FROM %s WHERE data = NULL ", TABLE_NAME); - - List result = sql(sqlEqualNull); - Assert.assertEquals("Should have 0 record", 0, result.size()); - Assert.assertNull("Should not push down a filter", lastScanEvent); - } - - @Test - public void testFilterPushDownEqualLiteralOnLeft() { - String sqlLiteralLeft = String.format("SELECT * FROM %s WHERE 1 = id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") == 1"; - - List resultLeft = sql(sqlLiteralLeft); - Assert.assertEquals("Should have 1 record", 1, resultLeft.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLeft.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownNoEqual() { - String sqlNE = String.format("SELECT * FROM %s WHERE id <> 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") != 1"; - - List resultNE = sql(sqlNE); - Assert.assertEquals("Should have 2 records", 2, resultNE.size()); - - List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedNE, resultNE); - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - 
"Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownNoEqualNull() { - String sqlNotEqualNull = String.format("SELECT * FROM %s WHERE data <> NULL ", TABLE_NAME); - - List resultNE = sql(sqlNotEqualNull); - Assert.assertEquals("Should have 0 records", 0, resultNE.size()); - Assert.assertNull("Should not push down a filter", lastScanEvent); - } - - @Test - public void testFilterPushDownAnd() { - String sqlAnd = - String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); - - List resultAnd = sql(sqlAnd); - Assert.assertEquals("Should have 1 record", 1, resultAnd.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultAnd.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; - Assert.assertEquals( - "Should contain the push down filter", expected, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownOr() { - String sqlOr = String.format("SELECT * FROM %s WHERE id = 1 OR data = 'b' ", TABLE_NAME); - String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"data\") == \"b\")"; - - List resultOr = sql(sqlOr); - Assert.assertEquals("Should have 2 record", 2, resultOr.size()); - - List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedOR, resultOr); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownGreaterThan() { - String sqlGT = String.format("SELECT * FROM %s WHERE id > 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") > 1"; - - List resultGT = sql(sqlGT); - Assert.assertEquals("Should have 2 record", 2, resultGT.size()); - - List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedGT, resultGT); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownGreaterThanNull() { - String sqlGT = String.format("SELECT * FROM %s WHERE data > null ", TABLE_NAME); - - List resultGT = sql(sqlGT); - Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertNull("Should not push down a filter", lastScanEvent); - } - - @Test - public void testFilterPushDownGreaterThanLiteralOnLeft() { - String sqlGT = String.format("SELECT * FROM %s WHERE 3 > id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") < 3"; - - List resultGT = sql(sqlGT); - Assert.assertEquals("Should have 2 records", 2, resultGT.size()); - - List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedGT, resultGT); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownGreaterThanEqual() { - String sqlGTE = String.format("SELECT * FROM %s WHERE id >= 2 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") >= 2"; - - List resultGTE = sql(sqlGTE); - Assert.assertEquals("Should have 2 
records", 2, resultGTE.size()); - - List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedGTE, resultGTE); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownGreaterThanEqualNull() { - String sqlGTE = String.format("SELECT * FROM %s WHERE data >= null ", TABLE_NAME); - - List resultGT = sql(sqlGTE); - Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertNull("Should not push down a filter", lastScanEvent); - } - - @Test - public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { - String sqlGTE = String.format("SELECT * FROM %s WHERE 2 >= id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") <= 2"; - - List resultGTE = sql(sqlGTE); - Assert.assertEquals("Should have 2 records", 2, resultGTE.size()); - - List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedGTE, resultGTE); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownLessThan() { - String sqlLT = String.format("SELECT * FROM %s WHERE id < 2 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") < 2"; - - List resultLT = sql(sqlLT); - Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLT.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownLessThanNull() { - String sqlLT = String.format("SELECT * FROM %s WHERE data < null ", TABLE_NAME); - - List resultGT = sql(sqlLT); - Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertNull("Should not push down a filter", lastScanEvent); - } - - @Test - public void testFilterPushDownLessThanLiteralOnLeft() { - String sqlLT = String.format("SELECT * FROM %s WHERE 2 < id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") > 2"; - - List resultLT = sql(sqlLT); - Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(3, null, 30.0), resultLT.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownLessThanEqual() { - String sqlLTE = String.format("SELECT * FROM %s WHERE id <= 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") <= 1"; - - List resultLTE = sql(sqlLTE); - Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLTE.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownLessThanEqualNull() { - String sqlLTE = String.format("SELECT * FROM %s WHERE data <= null 
", TABLE_NAME); - - List resultGT = sql(sqlLTE); - Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertNull("Should not push down a filter", lastScanEvent); - } - - @Test - public void testFilterPushDownLessThanEqualLiteralOnLeft() { - String sqlLTE = String.format("SELECT * FROM %s WHERE 3 <= id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") >= 3"; - - List resultLTE = sql(sqlLTE); - Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(3, null, 30.0), resultLTE.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownIn() { - String sqlIN = String.format("SELECT * FROM %s WHERE id IN (1,2) ", TABLE_NAME); - String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"id\") == 2)"; - List resultIN = sql(sqlIN); - Assert.assertEquals("Should have 2 records", 2, resultIN.size()); - - List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedIN, resultIN); - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownInNull() { - String sqlInNull = - String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); - - List result = sql(sqlInNull); - Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - } - - @Test - public void testFilterPushDownNotIn() { - String sqlNotIn = String.format("SELECT * FROM %s WHERE id NOT IN (3,2) ", TABLE_NAME); - - List resultNotIn = sql(sqlNotIn); - Assert.assertEquals("Should have 1 record", 1, resultNotIn.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotIn.get(0)); - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; - Assert.assertEquals( - "Should contain the push down filter", expectedScan, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownNotInNull() { - String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); - List resultGT = sql(sqlNotInNull); - Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - } - - @Test - public void testFilterPushDownIsNotNull() { - String sqlNotNull = String.format("SELECT * FROM %s WHERE data IS NOT NULL", TABLE_NAME); - String expectedFilter = "not_null(ref(name=\"data\"))"; - - List resultNotNull = sql(sqlNotNull); - Assert.assertEquals("Should have 2 record", 2, resultNotNull.size()); - - List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expected, resultNotNull); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, 
lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownIsNull() { - String sqlNull = String.format("SELECT * FROM %s WHERE data IS NULL", TABLE_NAME); - String expectedFilter = "is_null(ref(name=\"data\"))"; - - List resultNull = sql(sqlNull); - Assert.assertEquals("Should have 1 record", 1, resultNull.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(3, null, 30.0), resultNull.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownNot() { - String sqlNot = String.format("SELECT * FROM %s WHERE NOT (id = 1 OR id = 2 ) ", TABLE_NAME); - - List resultNot = sql(sqlNot); - Assert.assertEquals("Should have 1 record", 1, resultNot.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(3, null, 30.0), resultNot.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownBetween() { - String sqlBetween = String.format("SELECT * FROM %s WHERE id BETWEEN 1 AND 2 ", TABLE_NAME); - - List resultBetween = sql(sqlBetween); - Assert.assertEquals("Should have 2 record", 2, resultBetween.size()); - - List expectedBetween = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedBetween, resultBetween); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; - Assert.assertEquals( - "Should contain the push down filter", expected, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownNotBetween() { - String sqlNotBetween = - String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); - String expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; - - List resultNotBetween = sql(sqlNotBetween); - Assert.assertEquals("Should have 1 record", 1, resultNotBetween.size()); - Assert.assertEquals( - "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotBetween.get(0)); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterPushDownLike() { - String expectedFilter = "ref(name=\"data\") startsWith \"\"ice\"\""; - - String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; - List resultLike = sql(sqlLike); - Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals( - "The like result should produce the expected record", - Row.of(1, "iceberg", 10.0), - resultLike.get(0)); - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); - } - - @Test - public void testFilterNotPushDownLike() { - Row expectRecord = Row.of(1, "iceberg", 10.0); - String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; - List resultLike = sql(sqlNoPushDown); - Assert.assertEquals("Should have 1 record", 0, resultLike.size()); - 
Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; - resultLike = sql(sqlNoPushDown); - Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; - resultLike = sql(sqlNoPushDown); - Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; - resultLike = sql(sqlNoPushDown); - Assert.assertEquals("Should have 3 records", 3, resultLike.size()); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedRecords, resultLike); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; - resultLike = sql(sqlNoPushDown); - Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; - resultLike = sql(sqlNoPushDown); - Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - } - - @Test - public void testFilterPushDown2Literal() { - String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); - List result = sql(sql2Literal); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedRecords, result); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - } - - /** - * NaN is not supported by flink now, so we add the test case to assert the parse error, when we - * upgrade the flink that supports NaN, we will delele the method, and add some test case to test - * NaN. - */ - @Test - public void testSqlParseError() { - String sqlParseErrorEqual = - String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows( - "The NaN is not supported by flink now. ", - NumberFormatException.class, - () -> sql(sqlParseErrorEqual)); - - String sqlParseErrorNotEqual = - String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows( - "The NaN is not supported by flink now. ", - NumberFormatException.class, - () -> sql(sqlParseErrorNotEqual)); - - String sqlParseErrorGT = - String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows( - "The NaN is not supported by flink now. 
", - NumberFormatException.class, - () -> sql(sqlParseErrorGT)); - - String sqlParseErrorLT = - String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows( - "The NaN is not supported by flink now. ", - NumberFormatException.class, - () -> sql(sqlParseErrorLT)); - - String sqlParseErrorGTE = - String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows( - "The NaN is not supported by flink now. ", - NumberFormatException.class, - () -> sql(sqlParseErrorGTE)); - - String sqlParseErrorLTE = - String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows( - "The NaN is not supported by flink now. ", - NumberFormatException.class, - () -> sql(sqlParseErrorLTE)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java deleted file mode 100644 index e5cd8741c2a5..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.LocalDate; -import java.util.List; -import java.util.Map; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.After; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestFlinkUpsert extends FlinkCatalogTestBase { - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - - private final boolean isStreamingJob; - private final Map tableUpsertProps = Maps.newHashMap(); - private TableEnvironment tEnv; - - public TestFlinkUpsert( - String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { - super(catalogName, baseNamespace); - this.isStreamingJob = isStreamingJob; - tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); - tableUpsertProps.put(TableProperties.UPSERT_ENABLED, "true"); - tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - } - - @Parameterized.Parameters( - name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") - public static Iterable parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { - for (Boolean isStreaming : new Boolean[] {true, false}) { - // Only test with one catalog as this is a file operation concern. - // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop - // catalog. 
- String catalogName = "testhadoop"; - Namespace baseNamespace = Namespace.of("default"); - parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); - } - } - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreamingJob) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - } - } - return tEnv; - } - - @Override - @Before - public void before() { - super.before(); - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @Override - @After - public void clean() { - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @Test - public void testUpsertAndQuery() { - String tableName = "test_upsert_query"; - LocalDate dt20220301 = LocalDate.of(2022, 3, 1); - LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - - sql( - "CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " - + "PARTITIONED BY (province) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - try { - sql( - "INSERT INTO %s VALUES " - + "(1, 'a', DATE '2022-03-01')," - + "(2, 'b', DATE '2022-03-01')," - + "(1, 'b', DATE '2022-03-01')", - tableName); - - sql( - "INSERT INTO %s VALUES " - + "(4, 'a', DATE '2022-03-02')," - + "(5, 'b', DATE '2022-03-02')," - + "(1, 'b', DATE '2022-03-02')", - tableName); - - List rowsOn20220301 = - Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - - List rowsOn20220302 = - Lists.newArrayList( - Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @Test - public void testUpsertOptions() { - String tableName = "test_upsert_options"; - LocalDate dt20220301 = LocalDate.of(2022, 3, 1); - LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - - Map optionsUpsertProps = Maps.newHashMap(tableUpsertProps); - optionsUpsertProps.remove(TableProperties.UPSERT_ENABLED); - sql( - "CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " - + "PARTITIONED BY (province) WITH %s", - tableName, toWithClause(optionsUpsertProps)); - - try { - sql( - "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " - + "(1, 'a', DATE '2022-03-01')," - + "(2, 'b', DATE '2022-03-01')," - + "(1, 'b', DATE '2022-03-01')", - tableName); - - sql( - "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " - + "(4, 'a', DATE '2022-03-02')," - + "(5, 'b', DATE '2022-03-02')," - + "(1, 'b', DATE '2022-03-02')", - tableName); - - List rowsOn20220301 = - 
Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - - List rowsOn20220302 = - Lists.newArrayList( - Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @Test - public void testPrimaryKeyEqualToPartitionKey() { - // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey - String tableName = "upsert_on_data_key"; - try { - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL, PRIMARY KEY(data) NOT ENFORCED) " - + "PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql("INSERT INTO %s VALUES " + "(1, 'aaa')," + "(2, 'aaa')," + "(3, 'bbb')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(2, "aaa"), Row.of(3, "bbb"))); - - sql("INSERT INTO %s VALUES " + "(4, 'aaa')," + "(5, 'bbb')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(4, "aaa"), Row.of(5, "bbb"))); - - sql("INSERT INTO %s VALUES " + "(6, 'aaa')," + "(7, 'bbb')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(6, "aaa"), Row.of(7, "bbb"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @Test - public void testPrimaryKeyFieldsAtBeginningOfSchema() { - String tableName = "upsert_on_pk_at_schema_start"; - LocalDate dt = LocalDate.of(2022, 3, 1); - try { - sql( - "CREATE TABLE %s(data STRING NOT NULL, dt DATE NOT NULL, id INT, PRIMARY KEY(data,dt) NOT ENFORCED) " - + "PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql( - "INSERT INTO %s VALUES " - + "('aaa', DATE '2022-03-01', 1)," - + "('aaa', DATE '2022-03-01', 2)," - + "('bbb', DATE '2022-03-01', 3)", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of("aaa", dt, 2), Row.of("bbb", dt, 3))); - - sql( - "INSERT INTO %s VALUES " - + "('aaa', DATE '2022-03-01', 4)," - + "('bbb', DATE '2022-03-01', 5)", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of("aaa", dt, 4), Row.of("bbb", dt, 5))); - - sql( - "INSERT INTO %s VALUES " - + "('aaa', DATE '2022-03-01', 6)," - + "('bbb', DATE '2022-03-01', 7)", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of("aaa", dt, 6), Row.of("bbb", dt, 7))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @Test - public void testPrimaryKeyFieldsAtEndOfTableSchema() { - // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key - // fields - // are located at the end of the flink schema. 
- String tableName = "upsert_on_pk_at_schema_end"; - LocalDate dt = LocalDate.of(2022, 3, 1); - try { - sql( - "CREATE TABLE %s(id INT, data STRING NOT NULL, dt DATE NOT NULL, PRIMARY KEY(data,dt) NOT ENFORCED) " - + "PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql( - "INSERT INTO %s VALUES " - + "(1, 'aaa', DATE '2022-03-01')," - + "(2, 'aaa', DATE '2022-03-01')," - + "(3, 'bbb', DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(2, "aaa", dt), Row.of(3, "bbb", dt))); - - sql( - "INSERT INTO %s VALUES " - + "(4, 'aaa', DATE '2022-03-01')," - + "(5, 'bbb', DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(4, "aaa", dt), Row.of(5, "bbb", dt))); - - sql( - "INSERT INTO %s VALUES " - + "(6, 'aaa', DATE '2022-03-01')," - + "(7, 'bbb', DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(6, "aaa", dt), Row.of(7, "bbb", dt))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java deleted file mode 100644 index e840ba842bef..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.function.Consumer; -import java.util.stream.Collectors; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.data.conversion.DataStructureConverter; -import org.apache.flink.table.data.conversion.DataStructureConverters; -import org.apache.flink.table.runtime.typeutils.InternalSerializers; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.Row; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.apache.iceberg.flink.source.FlinkInputSplit; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.assertj.core.api.Assertions; -import org.junit.Assert; - -public class TestHelpers { - private TestHelpers() {} - - public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { - KryoSerializer kryo = new KryoSerializer<>(clazz, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - kryo.serialize(table, outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - return kryo.deserialize(inputView); - } - - public static RowData copyRowData(RowData from, RowType rowType) { - TypeSerializer[] fieldSerializers = - rowType.getChildren().stream() - .map((LogicalType type) -> InternalSerializers.create(type)) - .toArray(TypeSerializer[]::new); - return RowDataUtil.clone(from, null, rowType, fieldSerializers); - } - - public static void readRowData(FlinkInputFormat input, Consumer visitor) - throws IOException { - for (FlinkInputSplit s : input.createInputSplits(0)) { - input.open(s); - try { - while (!input.reachedEnd()) { - RowData row = input.nextRecord(null); - visitor.accept(row); - } - } finally { - input.close(); - } - } - } - - public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) - throws IOException { - List results = Lists.newArrayList(); - readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); - return results; - } - - public static List readRows(FlinkInputFormat inputFormat, RowType rowType) - throws 
IOException { - return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); - } - - public static List convertRowDataToRow(List rowDataList, RowType rowType) { - DataStructureConverter converter = - DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); - return rowDataList.stream() - .map(converter::toExternal) - .map(Row.class::cast) - .collect(Collectors.toList()); - } - - public static void assertRecords(List results, List expectedRecords, Schema schema) { - List expected = Lists.newArrayList(); - @SuppressWarnings("unchecked") - DataStructureConverter converter = - (DataStructureConverter) - DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); - expectedRecords.forEach( - r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); - assertRows(results, expected); - } - - public static void assertRows(List results, List expected, RowType rowType) { - assertRows(convertRowDataToRow(results, rowType), convertRowDataToRow(expected, rowType)); - } - - public static void assertRows(List results, List expected) { - Assertions.assertThat(results).containsExactlyInAnyOrderElementsOf(expected); - } - - public static void assertRowData(Schema schema, StructLike expected, RowData actual) { - assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); - } - - public static void assertRowData( - Types.StructType structType, - LogicalType rowType, - StructLike expectedRecord, - RowData actualRowData) { - if (expectedRecord == null && actualRowData == null) { - return; - } - - Assert.assertTrue( - "expected Record and actual RowData should be both null or not null", - expectedRecord != null && actualRowData != null); - - List types = Lists.newArrayList(); - for (Types.NestedField field : structType.fields()) { - types.add(field.type()); - } - - for (int i = 0; i < types.size(); i += 1) { - LogicalType logicalType = ((RowType) rowType).getTypeAt(i); - Object expected = expectedRecord.get(i, Object.class); - // The RowData.createFieldGetter won't return null for the required field. But in the - // projection case, if we are - // projecting a nested required field from an optional struct, then we should give a null for - // the projected field - // if the outer struct value is null. So we need to check the nullable for actualRowData here. - // For more details - // please see issue #2738. - Object actual = - actualRowData.isNullAt(i) - ? 
null - : RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); - assertEquals(types.get(i), logicalType, expected, actual); - } - } - - private static void assertEquals( - Type type, LogicalType logicalType, Object expected, Object actual) { - - if (expected == null && actual == null) { - return; - } - - Assert.assertTrue( - "expected and actual should be both null or not null", expected != null && actual != null); - - switch (type.typeId()) { - case BOOLEAN: - Assert.assertEquals("boolean value should be equal", expected, actual); - break; - case INTEGER: - Assert.assertEquals("int value should be equal", expected, actual); - break; - case LONG: - Assert.assertEquals("long value should be equal", expected, actual); - break; - case FLOAT: - Assert.assertEquals("float value should be equal", expected, actual); - break; - case DOUBLE: - Assert.assertEquals("double value should be equal", expected, actual); - break; - case STRING: - Assertions.assertThat(expected) - .as("Should expect a CharSequence") - .isInstanceOf(CharSequence.class); - Assert.assertEquals("string should be equal", String.valueOf(expected), actual.toString()); - break; - case DATE: - Assertions.assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); - LocalDate date = DateTimeUtil.dateFromDays((int) actual); - Assert.assertEquals("date should be equal", expected, date); - break; - case TIME: - Assertions.assertThat(expected) - .as("Should expect a LocalTime") - .isInstanceOf(LocalTime.class); - int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); - Assert.assertEquals("time millis should be equal", milliseconds, actual); - break; - case TIMESTAMP: - if (((Types.TimestampType) type).shouldAdjustToUTC()) { - Assertions.assertThat(expected) - .as("Should expect a OffsetDataTime") - .isInstanceOf(OffsetDateTime.class); - OffsetDateTime ts = (OffsetDateTime) expected; - Assert.assertEquals( - "OffsetDataTime should be equal", - ts.toLocalDateTime(), - ((TimestampData) actual).toLocalDateTime()); - } else { - Assertions.assertThat(expected) - .as("Should expect a LocalDataTime") - .isInstanceOf(LocalDateTime.class); - LocalDateTime ts = (LocalDateTime) expected; - Assert.assertEquals( - "LocalDataTime should be equal", ts, ((TimestampData) actual).toLocalDateTime()); - } - break; - case BINARY: - Assertions.assertThat(expected) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class); - Assert.assertEquals("binary should be equal", expected, ByteBuffer.wrap((byte[]) actual)); - break; - case DECIMAL: - Assertions.assertThat(expected) - .as("Should expect a BigDecimal") - .isInstanceOf(BigDecimal.class); - BigDecimal bd = (BigDecimal) expected; - Assert.assertEquals( - "decimal value should be equal", bd, ((DecimalData) actual).toBigDecimal()); - break; - case LIST: - Assertions.assertThat(expected) - .as("Should expect a Collection") - .isInstanceOf(Collection.class); - Collection expectedArrayData = (Collection) expected; - ArrayData actualArrayData = (ArrayData) actual; - LogicalType elementType = ((ArrayType) logicalType).getElementType(); - Assert.assertEquals( - "array length should be equal", expectedArrayData.size(), actualArrayData.size()); - assertArrayValues( - type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); - break; - case MAP: - Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - assertMapValues(type.asMapType(), logicalType, (Map) expected, (MapData) actual); - break; - 
case STRUCT: - Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(StructLike.class); - assertRowData(type.asStructType(), logicalType, (StructLike) expected, (RowData) actual); - break; - case UUID: - Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assert.assertEquals( - "UUID should be equal", - expected.toString(), - UUID.nameUUIDFromBytes((byte[]) actual).toString()); - break; - case FIXED: - Assertions.assertThat(expected).as("Should expect byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("binary should be equal", (byte[]) expected, (byte[]) actual); - break; - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - private static void assertArrayValues( - Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { - List expectedElements = Lists.newArrayList(expectedArray); - for (int i = 0; i < expectedArray.size(); i += 1) { - if (expectedElements.get(i) == null) { - Assert.assertTrue(actualArray.isNullAt(i)); - continue; - } - - Object expected = expectedElements.get(i); - - assertEquals( - type, - logicalType, - expected, - ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); - } - } - - private static void assertMapValues( - Types.MapType mapType, LogicalType type, Map expected, MapData actual) { - Assert.assertEquals("map size should be equal", expected.size(), actual.size()); - - ArrayData actualKeyArrayData = actual.keyArray(); - ArrayData actualValueArrayData = actual.valueArray(); - LogicalType actualKeyType = ((MapType) type).getKeyType(); - LogicalType actualValueType = ((MapType) type).getValueType(); - Type keyType = mapType.keyType(); - Type valueType = mapType.valueType(); - - ArrayData.ElementGetter keyGetter = ArrayData.createElementGetter(actualKeyType); - ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(actualValueType); - - for (Map.Entry entry : expected.entrySet()) { - Object matchedActualKey = null; - int matchedKeyIndex = 0; - for (int i = 0; i < actual.size(); i += 1) { - try { - Object key = keyGetter.getElementOrNull(actualKeyArrayData, i); - assertEquals(keyType, actualKeyType, entry.getKey(), key); - matchedActualKey = key; - matchedKeyIndex = i; - break; - } catch (AssertionError e) { - // not found - } - } - Assert.assertNotNull("Should have a matching key", matchedActualKey); - final int valueIndex = matchedKeyIndex; - assertEquals( - valueType, - actualValueType, - entry.getValue(), - valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); - } - } - - public static void assertEquals(ManifestFile expected, ManifestFile actual) { - if (expected == actual) { - return; - } - Assert.assertTrue("Should not be null.", expected != null && actual != null); - Assert.assertEquals("Path must match", expected.path(), actual.path()); - Assert.assertEquals("Length must match", expected.length(), actual.length()); - Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); - Assert.assertEquals("ManifestContent must match", expected.content(), actual.content()); - Assert.assertEquals( - "SequenceNumber must match", expected.sequenceNumber(), actual.sequenceNumber()); - Assert.assertEquals( - "MinSequenceNumber must match", expected.minSequenceNumber(), actual.minSequenceNumber()); - Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals( - "Added files flag must match", 
expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals( - "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals( - "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals( - "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals( - "Existing files count must match", - expected.existingFilesCount(), - actual.existingFilesCount()); - Assert.assertEquals( - "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals( - "Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals( - "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals( - "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); - - List expectedSummaries = expected.partitions(); - List actualSummaries = actual.partitions(); - Assert.assertEquals( - "PartitionFieldSummary size does not match", - expectedSummaries.size(), - actualSummaries.size()); - for (int i = 0; i < expectedSummaries.size(); i++) { - Assert.assertEquals( - "Null flag in partition must match", - expectedSummaries.get(i).containsNull(), - actualSummaries.get(i).containsNull()); - Assert.assertEquals( - "NaN flag in partition must match", - expectedSummaries.get(i).containsNaN(), - actualSummaries.get(i).containsNaN()); - Assert.assertEquals( - "Lower bounds in partition must match", - expectedSummaries.get(i).lowerBound(), - actualSummaries.get(i).lowerBound()); - Assert.assertEquals( - "Upper bounds in partition must match", - expectedSummaries.get(i).upperBound(), - actualSummaries.get(i).upperBound()); - } - } - - public static void assertEquals(ContentFile expected, ContentFile actual) { - if (expected == actual) { - return; - } - Assert.assertTrue("Shouldn't be null.", expected != null && actual != null); - Assert.assertEquals("SpecId", expected.specId(), actual.specId()); - Assert.assertEquals("Content", expected.content(), actual.content()); - Assert.assertEquals("Path", expected.path(), actual.path()); - Assert.assertEquals("Format", expected.format(), actual.format()); - Assert.assertEquals("Partition size", expected.partition().size(), actual.partition().size()); - for (int i = 0; i < expected.partition().size(); i++) { - Assert.assertEquals( - "Partition data at index " + i, - expected.partition().get(i, Object.class), - actual.partition().get(i, Object.class)); - } - Assert.assertEquals("Record count", expected.recordCount(), actual.recordCount()); - Assert.assertEquals("File size in bytes", expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Column sizes", expected.columnSizes(), actual.columnSizes()); - Assert.assertEquals("Value counts", expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Null value counts", expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Lower bounds", expected.lowerBounds(), actual.lowerBounds()); - Assert.assertEquals("Upper bounds", expected.upperBounds(), actual.upperBounds()); - Assert.assertEquals("Key metadata", expected.keyMetadata(), actual.keyMetadata()); - Assert.assertEquals("Split offsets", expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals( - "Equality field id list", actual.equalityFieldIds(), expected.equalityFieldIds()); - } -} diff --git 
a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java deleted file mode 100644 index 088e1cf4731e..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ /dev/null @@ -1,348 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Map; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.thrift.TException; -import org.junit.After; -import org.junit.Assert; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestIcebergConnector extends FlinkTestBase { - - private static final String TABLE_NAME = "test_table"; - - @ClassRule public static final TemporaryFolder WAREHOUSE = new TemporaryFolder(); - - private final String catalogName; - private final Map properties; - private final boolean isStreaming; - private volatile TableEnvironment tEnv; - - @Parameterized.Parameters(name = "catalogName = {0}, properties = {1}, isStreaming={2}") - public static Iterable parameters() { - return Lists.newArrayList( - // Create iceberg table in the hadoop catalog and default database. 
- new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop"), - false - }, - // Create iceberg table in the hadoop catalog and not_existing_db. - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db"), - false - }, - // Create iceberg table in the hive catalog and default database. - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive"), - false - }, - // Create iceberg table in the hive catalog and not_existing_db. - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db"), - false - }); - } - - public TestIcebergConnector( - String catalogName, Map properties, boolean isStreaming) { - this.catalogName = catalogName; - this.properties = properties; - this.isStreaming = isStreaming; - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = - EnvironmentSettings.newInstance().useBlinkPlanner(); - if (isStreaming) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - // Set only one parallelism. - tEnv.getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1) - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - } - } - } - return tEnv; - } - - @After - public void after() throws TException { - sql("DROP TABLE IF EXISTS %s", TABLE_NAME); - - // Clean the created orphan databases and tables from hive-metastore. 
- if (isHiveCatalog()) { - HiveMetaStoreClient metaStoreClient = new HiveMetaStoreClient(hiveConf); - try { - metaStoreClient.dropTable(databaseName(), tableName()); - if (!isDefaultDatabaseName()) { - try { - metaStoreClient.dropDatabase(databaseName()); - } catch (Exception ignored) { - // Ignore - } - } - } finally { - metaStoreClient.close(); - } - } - } - - private void testCreateConnectorTable() { - Map tableProps = createTableProps(); - - // Create table under the flink's current database. - sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); - Assert.assertEquals( - "Should have expected rows", - Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), - Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); - - FlinkCatalogFactory factory = new FlinkCatalogFactory(); - Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); - Assert.assertTrue( - "Should have created the expected database", flinkCatalog.databaseExists(databaseName())); - Assert.assertTrue( - "Should have created the expected table", - flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))); - - // Drop and create it again. - sql("DROP TABLE %s", TABLE_NAME); - sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - Assert.assertEquals( - "Should have expected rows", - Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), - Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); - } - - @Test - public void testCreateTableUnderDefaultDatabase() { - testCreateConnectorTable(); - } - - @Test - public void testCatalogDatabaseConflictWithFlinkDatabase() { - sql("CREATE DATABASE IF NOT EXISTS `%s`", databaseName()); - sql("USE `%s`", databaseName()); - - try { - testCreateConnectorTable(); - // Ensure that the table was created under the specific database. - AssertHelpers.assertThrows( - "Table should already exists", - ValidationException.class, - "Could not execute CreateTable in path", - () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)); - } finally { - sql("DROP TABLE IF EXISTS `%s`.`%s`", databaseName(), TABLE_NAME); - if (!isDefaultDatabaseName()) { - sql("DROP DATABASE `%s`", databaseName()); - } - } - } - - @Test - public void testConnectorTableInIcebergCatalog() { - // Create the catalog properties - Map catalogProps = Maps.newHashMap(); - catalogProps.put("type", "iceberg"); - if (isHiveCatalog()) { - catalogProps.put("catalog-type", "hive"); - catalogProps.put(CatalogProperties.URI, FlinkCatalogTestBase.getURI(hiveConf)); - } else { - catalogProps.put("catalog-type", "hadoop"); - } - catalogProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); - - // Create the table properties - Map tableProps = createTableProps(); - - // Create a connector table in an iceberg catalog. 
- sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); - try { - AssertHelpers.assertThrowsCause( - "Cannot create the iceberg connector table in iceberg catalog", - IllegalArgumentException.class, - "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog", - () -> - sql( - "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", - FlinkCatalogFactory.DEFAULT_DATABASE_NAME, TABLE_NAME, toWithClause(tableProps))); - } finally { - sql("DROP CATALOG IF EXISTS `test_catalog`"); - } - } - - private Map createTableProps() { - Map tableProps = Maps.newHashMap(properties); - tableProps.put("catalog-name", catalogName); - tableProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); - if (isHiveCatalog()) { - tableProps.put(CatalogProperties.URI, FlinkCatalogTestBase.getURI(hiveConf)); - } - return tableProps; - } - - private boolean isHiveCatalog() { - return "testhive".equalsIgnoreCase(catalogName); - } - - private boolean isDefaultDatabaseName() { - return FlinkCatalogFactory.DEFAULT_DATABASE_NAME.equalsIgnoreCase(databaseName()); - } - - private String tableName() { - return properties.getOrDefault("catalog-table", TABLE_NAME); - } - - private String databaseName() { - return properties.getOrDefault("catalog-database", "default_database"); - } - - private String toWithClause(Map props) { - return FlinkCatalogTestBase.toWithClause(props); - } - - private static String createWarehouse() { - try { - return String.format("file://%s", WAREHOUSE.newFolder().getAbsolutePath()); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java deleted file mode 100644 index 6bd94e9ca61c..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.GenericManifestFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestManifestFileSerialization { - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("double").build(); - - private static final DataFile FILE_A = - DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics( - new Metrics( - 5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = - DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics( - new Metrics( - 1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); - - private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Test - public void testKryoSerialization() throws IOException { - KryoSerializer kryo = - new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - kryo.serialize(manifest, outputView); - kryo.serialize(manifest.copy(), outputView); - 
kryo.serialize(GenericManifestFile.copyOf(manifest).build(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - ManifestFile m1 = kryo.deserialize(inputView); - ManifestFile m2 = kryo.deserialize(inputView); - ManifestFile m3 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(manifest, m1); - TestHelpers.assertEquals(manifest, m2); - TestHelpers.assertEquals(manifest, m3); - } - - @Test - public void testJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(manifest); - out.writeObject(manifest.copy()); - out.writeObject(GenericManifestFile.copyOf(manifest).build()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 3; i += 1) { - Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); - TestHelpers.assertEquals(manifest, (ManifestFile) obj); - } - } - } - - private ManifestFile writeManifest(DataFile... files) throws IOException { - File manifestFile = temp.newFile("input.m0.avro"); - Assert.assertTrue(manifestFile.delete()); - OutputFile outputFile = FILE_IO.newOutputFile(manifestFile.getCanonicalPath()); - - ManifestWriter writer = ManifestFiles.write(SPEC, outputFile); - try { - for (DataFile file : files) { - writer.add(file); - } - } finally { - writer.close(); - } - - return writer.toManifestFile(); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java deleted file mode 100644 index c78fa51215dd..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.util.Iterator; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.RecordWrapperTest; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.util.StructLikeWrapper; -import org.assertj.core.api.Assertions; -import org.junit.Assert; - -public class TestRowDataWrapper extends RecordWrapperTest { - - /** - * Flink's time type has been truncated to millis seconds, so we need a customized assert method - * to check the values. - */ - @Override - public void testTime() { - generateAndValidate( - new Schema(TIME.fields()), - (message, expectedWrapper, actualWrapper) -> { - for (int pos = 0; pos < TIME.fields().size(); pos++) { - Object expected = expectedWrapper.get().get(pos, Object.class); - Object actual = actualWrapper.get().get(pos, Object.class); - if (expected == actual) { - return; - } - - Assertions.assertThat(actual).isNotNull(); - Assertions.assertThat(expected).isNotNull(); - - int expectedMilliseconds = (int) ((long) expected / 1000_000); - int actualMilliseconds = (int) ((long) actual / 1000_000); - Assert.assertEquals(message, expectedMilliseconds, actualMilliseconds); - } - }); - } - - @Override - protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod assertMethod) { - int numRecords = 100; - Iterable recordList = RandomGenericData.generate(schema, numRecords, 101L); - Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); - - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); - RowDataWrapper rowDataWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - - Iterator actual = recordList.iterator(); - Iterator expected = rowDataList.iterator(); - - StructLikeWrapper actualWrapper = StructLikeWrapper.forType(schema.asStruct()); - StructLikeWrapper expectedWrapper = StructLikeWrapper.forType(schema.asStruct()); - for (int i = 0; i < numRecords; i++) { - Assert.assertTrue("Should have more records", actual.hasNext()); - Assert.assertTrue("Should have more RowData", expected.hasNext()); - - StructLike recordStructLike = recordWrapper.wrap(actual.next()); - StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); - - assertMethod.assertEquals( - "Should have expected StructLike values", - actualWrapper.set(recordStructLike), - expectedWrapper.set(rowDataStructLike)); - } - - Assert.assertFalse("Shouldn't have more record", actual.hasNext()); - Assert.assertFalse("Shouldn't have more RowData", expected.hasNext()); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java deleted file mode 100644 index 61a821a9ac5a..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.File; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestTables; - -public class TestTableLoader implements TableLoader { - private File dir; - - public static TableLoader of(String dir) { - return new TestTableLoader(dir); - } - - public TestTableLoader(String dir) { - this.dir = new File(dir); - } - - @Override - public void open() {} - - @Override - public Table loadTable() { - return TestTables.load(dir, "test"); - } - - @Override - public void close() {} -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java deleted file mode 100644 index 3ad1d53db8d5..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestTableSerialization { - private static final HadoopTables TABLES = new HadoopTables(); - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("date").build(); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - private Table table; - - @Before - public void initTable() throws IOException { - Map props = ImmutableMap.of("k1", "v1", "k2", "v2"); - - File tableLocation = temp.newFolder(); - Assert.assertTrue(tableLocation.delete()); - - this.table = TABLES.create(SCHEMA, SPEC, SORT_ORDER, props, tableLocation.toString()); - } - - @Test - public void testSerializableTableKryoSerialization() throws IOException { - SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); - org.apache.iceberg.TestHelpers.assertSerializedAndLoadedMetadata( - table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); - } - - @Test - public void testSerializableMetadataTableKryoSerialization() throws IOException { - for (MetadataTableType type : MetadataTableType.values()) { - TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = - MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); - SerializableTable serializableMetadataTable = - (SerializableTable) SerializableTable.copyOf(metadataTable); - - org.apache.iceberg.TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - roundTripKryoSerialize(SerializableTable.class, serializableMetadataTable)); - } - } - - @Test - public void testSerializableTransactionTableKryoSerialization() throws IOException { - Transaction txn = table.newTransaction(); - - txn.updateProperties().set("k1", "v1").commit(); - - Table txnTable = txn.table(); - SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); - - TestHelpers.assertSerializedMetadata( - txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java 
b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java deleted file mode 100644 index e59d7dacd978..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.actions; - -import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.UUID; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.RewriteDataFilesActionResult; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.FlinkCatalogTestBase; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestRewriteDataFilesAction extends FlinkCatalogTestBase { - - private static final String TABLE_NAME_UNPARTITIONED = "test_table_unpartitioned"; - private static final String TABLE_NAME_PARTITIONED = "test_table_partitioned"; - private final FileFormat format; - private Table icebergTableUnPartitioned; - private Table icebergTablePartitioned; - - public TestRewriteDataFilesAction( - String catalogName, Namespace baseNamespace, FileFormat format) { - super(catalogName, baseNamespace); - this.format = format; - } - - @Override - protected TableEnvironment getTableEnv() { - super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}") - 
public static Iterable parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { - for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { - String catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add(new Object[] {catalogName, baseNamespace, format}); - } - } - return parameters; - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Override - @Before - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - sql( - "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", - TABLE_NAME_UNPARTITIONED, format.name()); - icebergTableUnPartitioned = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); - - sql( - "CREATE TABLE %s (id int, data varchar,spec varchar) " - + " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", - TABLE_NAME_PARTITIONED, format.name()); - icebergTablePartitioned = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); - } - - @Override - @After - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_UNPARTITIONED); - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_PARTITIONED); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @Test - public void testRewriteDataFilesEmptyTable() throws Exception { - Assert.assertNull("Table must be empty", icebergTableUnPartitioned.currentSnapshot()); - Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); - Assert.assertNull("Table must stay empty", icebergTableUnPartitioned.currentSnapshot()); - } - - @Test - public void testRewriteDataFilesUnpartitionedTable() throws Exception { - sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); - - RewriteDataFilesActionResult result = - Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); - - Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - Assert.assertEquals("Should have 1 data files after rewrite", 1, dataFiles1.size()); - - // Assert the table records as expected. 
- SimpleDataUtil.assertTableRecords( - icebergTableUnPartitioned, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); - } - - @Test - public void testRewriteDataFilesPartitionedTable() throws Exception { - sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 3, 'world' ,'b'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size()); - - RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); - - Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size()); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles1.size()); - - // Assert the table records as expected. - Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords( - icebergTablePartitioned, - Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "b"), - record.copy("id", 4, "data", "world", "spec", "b"))); - } - - @Test - public void testRewriteDataFilesWithFilter() throws Exception { - sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 3, 'world' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 5, 'world' ,'b'", TABLE_NAME_PARTITIONED); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 5 data files before rewrite", 5, dataFiles.size()); - - RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned) - .rewriteDataFiles() - .filter(Expressions.equal("spec", "a")) - .filter(Expressions.startsWith("data", "he")) - .execute(); - - Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - Assert.assertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size()); - - // Assert the table 
records as expected. - Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords( - icebergTablePartitioned, - Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "a"), - record.copy("id", 4, "data", "world", "spec", "b"), - record.copy("id", 5, "data", "world", "spec", "b"))); - } - - @Test - public void testRewriteLargeTableHasResiduals() throws IOException { - // all records belong to the same partition - List records1 = Lists.newArrayList(); - List records2 = Lists.newArrayList(); - List expected = Lists.newArrayList(); - for (int i = 0; i < 100; i++) { - int id = i; - String data = String.valueOf(i % 3); - if (i % 2 == 0) { - records1.add("(" + id + ",'" + data + "')"); - } else { - records2.add("(" + id + ",'" + data + "')"); - } - Record record = RECORD.copy(); - record.setField("id", id); - record.setField("data", data); - expected.add(record); - } - - sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = - icebergTableUnPartitioned - .newScan() - .ignoreResiduals() - .filter(Expressions.equal("data", "0")) - .planFiles(); - for (FileScanTask task : tasks) { - Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); - } - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); - - Actions actions = Actions.forTable(icebergTableUnPartitioned); - - RewriteDataFilesActionResult result = - actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); - Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); - - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); - } - - /** - * a test case to test avoid repeate compress - * - *
If a data file cannot be combined into a CombinedScanTask with other data files, the - * resulting CombinedScanTask list has a size of 1, so we remove these CombinedScanTasks to - * avoid compacting the same file repeatedly. - * - *
In this test case,we generated 3 data files and set targetSizeInBytes greater than the - * largest file size so that it cannot be combined a CombinedScanTask with other datafiles. The - * datafile with the largest file size will not be compressed. - * - * @throws IOException IOException - */ - @Test - public void testRewriteAvoidRepeateCompress() throws IOException { - List expected = Lists.newArrayList(); - Schema schema = icebergTableUnPartitioned.schema(); - GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); - File file = temp.newFile(); - int count = 0; - try (FileAppender fileAppender = - genericAppenderFactory.newAppender(Files.localOutput(file), format)) { - long filesize = 20000; - for (; fileAppender.length() < filesize; count++) { - Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); - fileAppender.add(record); - expected.add(record); - } - } - - DataFile dataFile = - DataFiles.builder(icebergTableUnPartitioned.spec()) - .withPath(file.getAbsolutePath()) - .withFileSizeInBytes(file.length()) - .withFormat(format) - .withRecordCount(count) - .build(); - - icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); - - sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 3 data files before rewrite", 3, dataFiles.size()); - - Actions actions = Actions.forTable(icebergTableUnPartitioned); - - long targetSizeInBytes = file.length() + 10; - RewriteDataFilesActionResult result = - actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); - Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFilesRewrote = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFilesRewrote.size()); - - // the biggest file do not be rewrote - List rewroteDataFileNames = - dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); - Assert.assertTrue(rewroteDataFileNames.contains(file.getAbsolutePath())); - - // Assert the table records as expected. - expected.add(SimpleDataUtil.createRecord(1, "a")); - expected.add(SimpleDataUtil.createRecord(2, "b")); - SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java deleted file mode 100644 index cc58d9817ac6..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; - -public class RandomRowData { - private RandomRowData() {} - - public static Iterable generate(Schema schema, int numRecords, long seed) { - return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); - } - - public static Iterable convert(Schema schema, Iterable records) { - return Iterables.transform(records, record -> RowDataConverter.convert(schema, record)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java deleted file mode 100644 index 64acecfb0415..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.avro.DataReader; -import org.apache.iceberg.data.avro.DataWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.Assert; - -public class TestFlinkAvroReaderWriter extends DataTest { - - private static final int NUM_RECORDS = 100; - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1991L); - List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); - - File recordsFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", recordsFile.delete()); - - // Write the expected records into AVRO file, then read them into RowData and assert with the - // expected Record list. - try (FileAppender writer = - Avro.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(DataWriter::create) - .build()) { - writer.addAll(expectedRecords); - } - - try (CloseableIterable reader = - Avro.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(FlinkAvroReader::new) - .build()) { - Iterator expected = expectedRecords.iterator(); - Iterator rows = reader.iterator(); - for (int i = 0; i < NUM_RECORDS; i++) { - Assert.assertTrue("Should have expected number of records", rows.hasNext()); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); - } - Assert.assertFalse("Should not have extra records", rows.hasNext()); - } - - File rowDataFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - - // Write the expected RowData into AVRO file, then read them into Record and assert with the - // expected RowData list. 
- try (FileAppender writer = - Avro.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .build()) { - writer.addAll(expectedRows); - } - - try (CloseableIterable reader = - Avro.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(DataReader::create) - .build()) { - Iterator expected = expectedRows.iterator(); - Iterator records = reader.iterator(); - for (int i = 0; i < NUM_RECORDS; i += 1) { - Assert.assertTrue("Should have expected number of records", records.hasNext()); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); - } - Assert.assertFalse("Should not have extra records", records.hasNext()); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java deleted file mode 100644 index fdffc0e01c20..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.orc.GenericOrcWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.Assert; - -public class TestFlinkOrcReaderWriter extends DataTest { - private static final int NUM_RECORDS = 100; - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); - List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); - - File recordsFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", recordsFile.delete()); - - // Write the expected records into ORC file, then read them into RowData and assert with the - // expected Record list. 
- try (FileAppender writer = - ORC.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { - writer.addAll(expectedRecords); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(type -> new FlinkOrcReader(schema, type)) - .build()) { - Iterator expected = expectedRecords.iterator(); - Iterator rows = reader.iterator(); - for (int i = 0; i < NUM_RECORDS; i++) { - Assert.assertTrue("Should have expected number of records", rows.hasNext()); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); - } - Assert.assertFalse("Should not have extra records", rows.hasNext()); - } - - File rowDataFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - - // Write the expected RowData into ORC file, then read them into Record and assert with the - // expected RowData list. - RowType rowType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = - ORC.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) - .build()) { - writer.addAll(expectedRows); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) - .build()) { - Iterator expected = expectedRows.iterator(); - Iterator records = reader.iterator(); - for (int i = 0; i < NUM_RECORDS; i += 1) { - Assert.assertTrue("Should have expected number of records", records.hasNext()); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); - } - Assert.assertFalse("Should not have extra records", records.hasNext()); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java deleted file mode 100644 index 51060e14e1ae..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.parquet.GenericParquetWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.junit.Assert; - -public class TestFlinkParquetReader extends DataTest { - private static final int NUM_RECORDS = 100; - - private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { - writer.addAll(iterable); - } - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { - Iterator expected = iterable.iterator(); - Iterator rows = reader.iterator(); - LogicalType rowType = FlinkSchemaUtil.convert(schema); - for (int i = 0; i < NUM_RECORDS; i += 1) { - Assert.assertTrue("Should have expected number of rows", rows.hasNext()); - TestHelpers.assertRowData(schema.asStruct(), rowType, expected.next(), rows.next()); - } - Assert.assertFalse("Should not have extra rows", rows.hasNext()); - } - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema); - writeAndValidate( - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); - writeAndValidate( - RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), - schema); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java deleted file mode 100644 index 7b868eafc311..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.parquet.GenericParquetReaders; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; - -public class TestFlinkParquetWriter extends DataTest { - private static final int NUM_RECORDS = 100; - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - LogicalType logicalType = FlinkSchemaUtil.convert(schema); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) - .build()) { - writer.addAll(iterable); - } - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) - .build()) { - Iterator expected = iterable.iterator(); - Iterator actual = reader.iterator(); - LogicalType rowType = FlinkSchemaUtil.convert(schema); - for (int i = 0; i < NUM_RECORDS; i += 1) { - Assert.assertTrue("Should have expected number of rows", actual.hasNext()); - TestHelpers.assertRowData(schema.asStruct(), rowType, actual.next(), expected.next()); - } - Assert.assertFalse("Should not have extra rows", actual.hasNext()); - } - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); - - writeAndValidate( - RandomRowData.convert( - schema, - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), - schema); - - writeAndValidate( - RandomRowData.convert( - schema, - RandomGenericData.generateFallbackRecords( - schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), - schema); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java deleted file mode 100644 index 4cb77b11fd7b..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Iterator; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.StructProjection; -import org.junit.Assert; -import org.junit.Test; - -public class TestRowDataProjection { - - @Test - public void testFullProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - generateAndValidate(schema, schema); - } - - @Test - public void testReorderedFullProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Schema reordered = - new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get())); - - generateAndValidate(schema, reordered); - } - - @Test - public void testBasicProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - Schema id = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - Schema data = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); - generateAndValidate(schema, id); - generateAndValidate(schema, data); - } - - @Test - public void testEmptyProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - generateAndValidate(schema, schema.select()); - } - - @Test - public void testRename() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Schema renamed = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get())); - generateAndValidate(schema, renamed); - } - - @Test - public void testNestedProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - // Project id only. - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - generateAndValidate(schema, idOnly); - - // Project lat only. - Schema latOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); - generateAndValidate(schema, latOnly); - - // Project long only. 
- Schema longOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); - generateAndValidate(schema, longOnly); - - // Project location. - Schema locationOnly = schema.select("location"); - generateAndValidate(schema, locationOnly); - } - - @Test - public void testPrimitiveTypeProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(2, "b", Types.BooleanType.get()), - Types.NestedField.optional(3, "i", Types.IntegerType.get()), - Types.NestedField.required(4, "l", Types.LongType.get()), - Types.NestedField.optional(5, "f", Types.FloatType.get()), - Types.NestedField.required(6, "d", Types.DoubleType.get()), - Types.NestedField.optional(7, "date", Types.DateType.get()), - Types.NestedField.optional(8, "time", Types.TimeType.get()), - Types.NestedField.required(9, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(10, "ts_tz", Types.TimestampType.withZone()), - Types.NestedField.required(11, "s", Types.StringType.get()), - Types.NestedField.required(12, "fixed", Types.FixedType.ofLength(7)), - Types.NestedField.optional(13, "bytes", Types.BinaryType.get()), - Types.NestedField.required(14, "dec_9_0", Types.DecimalType.of(9, 0)), - Types.NestedField.required(15, "dec_11_2", Types.DecimalType.of(11, 2)), - Types.NestedField.required( - 16, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - ); - - generateAndValidate(schema, schema); - } - - @Test - public void testPrimitiveMapTypeProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "map", - Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get()))); - - // Project id only. - Schema idOnly = schema.select("id"); - generateAndValidate(schema, idOnly); - - // Project map only. - Schema mapOnly = schema.select("map"); - generateAndValidate(schema, mapOnly); - - // Project all. - generateAndValidate(schema, schema); - } - - @Test - public void testNestedMapTypeProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 7, - "map", - Types.MapType.ofOptional( - 5, - 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get())), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()))))); - - // Project id only. - Schema idOnly = schema.select("id"); - generateAndValidate(schema, idOnly); - - // Project map only. - Schema mapOnly = schema.select("map"); - generateAndValidate(schema, mapOnly); - - // Project all. - generateAndValidate(schema, schema); - - // Project partial map key. 
- Schema partialMapKey = - new Schema( - Types.NestedField.optional( - 7, - "map", - Types.MapType.ofOptional( - 5, - 6, - Types.StructType.of(Types.NestedField.required(1, "key", Types.LongType.get())), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()))))); - AssertHelpers.assertThrows( - "Should not allow to project a partial map key with non-primitive type.", - IllegalArgumentException.class, - "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapKey)); - - // Project partial map key. - Schema partialMapValue = - new Schema( - Types.NestedField.optional( - 7, - "map", - Types.MapType.ofOptional( - 5, - 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get())), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()))))); - AssertHelpers.assertThrows( - "Should not allow to project a partial map value with non-primitive type.", - IllegalArgumentException.class, - "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapValue)); - } - - @Test - public void testPrimitiveListTypeProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 2, "list", Types.ListType.ofOptional(1, Types.StringType.get()))); - - // Project id only. - Schema idOnly = schema.select("id"); - generateAndValidate(schema, idOnly); - - // Project list only. - Schema mapOnly = schema.select("list"); - generateAndValidate(schema, mapOnly); - - // Project all. - generateAndValidate(schema, schema); - } - - @Test - public void testNestedListTypeProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "list", - Types.ListType.ofOptional( - 4, - Types.StructType.of( - Types.NestedField.required(1, "nestedListField1", Types.LongType.get()), - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()), - Types.NestedField.required(3, "nestedListField3", Types.LongType.get()))))); - - // Project id only. - Schema idOnly = schema.select("id"); - generateAndValidate(schema, idOnly); - - // Project list only. - Schema mapOnly = schema.select("list"); - generateAndValidate(schema, mapOnly); - - // Project all. - generateAndValidate(schema, schema); - - // Project partial list value. 
- Schema partialList = - new Schema( - Types.NestedField.optional( - 5, - "list", - Types.ListType.ofOptional( - 4, - Types.StructType.of( - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()))))); - AssertHelpers.assertThrows( - "Should not allow to project a partial list element with non-primitive type.", - IllegalArgumentException.class, - "Cannot project a partial list element", - () -> generateAndValidate(schema, partialList)); - } - - private void generateAndValidate(Schema schema, Schema projectSchema) { - int numRecords = 100; - Iterable recordList = RandomGenericData.generate(schema, numRecords, 102L); - Iterable rowDataList = RandomRowData.generate(schema, numRecords, 102L); - - StructProjection structProjection = StructProjection.create(schema, projectSchema); - RowDataProjection rowDataProjection = RowDataProjection.create(schema, projectSchema); - - Iterator recordIter = recordList.iterator(); - Iterator rowDataIter = rowDataList.iterator(); - - for (int i = 0; i < numRecords; i++) { - Assert.assertTrue("Should have more records", recordIter.hasNext()); - Assert.assertTrue("Should have more RowData", rowDataIter.hasNext()); - - StructLike expected = structProjection.wrap(recordIter.next()); - RowData actual = rowDataProjection.wrap(rowDataIter.next()); - - TestHelpers.assertRowData(projectSchema, expected, actual); - } - - Assert.assertFalse("Shouldn't have more record", recordIter.hasNext()); - Assert.assertFalse("Shouldn't have more RowData", rowDataIter.hasNext()); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java deleted file mode 100644 index df2e6ae21c7e..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java +++ /dev/null @@ -1,580 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.File; -import java.io.IOException; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestRowProjection { - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) - throws IOException { - File file = temp.newFile(desc + ".avro"); - Assert.assertTrue(file.delete()); - - try (FileAppender appender = - Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) - .build()) { - appender.add(row); - } - - Iterable records = - Avro.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(FlinkAvroReader::new) - .build(); - - return Iterables.getOnlyElement(records); - } - - @Test - public void testFullProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData projected = writeAndRead("full_projection", schema, schema, row); - - Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - - int cmp = Comparators.charSequences().compare("test", projected.getString(1).toString()); - Assert.assertEquals("Should contain the correct data value", cmp, 0); - } - - @Test - public void testSpecialCharacterProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "user id", Types.LongType.get()), - Types.NestedField.optional(1, "data%0", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData full = writeAndRead("special_chars", schema, schema, row); - - Assert.assertEquals("Should contain the correct id value", 34L, full.getLong(0)); - Assert.assertEquals( - "Should contain the correct data value", - 0, - Comparators.charSequences().compare("test", full.getString(1).toString())); - - RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full); - - Assert.assertEquals("Should not contain id value", 1, projected.getArity()); - Assert.assertEquals( - "Should contain the correct data value", - 0, - Comparators.charSequences().compare("test", projected.getString(0).toString())); - } - - @Test - public void testReorderedFullProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", 
Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema reordered = - new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("full_projection", schema, reordered, row); - - Assert.assertEquals( - "Should contain the correct 0 value", "test", projected.getString(0).toString()); - Assert.assertEquals("Should contain the correct 1 value", 34L, projected.getLong(1)); - } - - @Test - public void testReorderedProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema reordered = - new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get())); - - RowData projected = writeAndRead("full_projection", schema, reordered, row); - - Assert.assertTrue("Should contain the correct 0 value", projected.isNullAt(0)); - Assert.assertEquals( - "Should contain the correct 1 value", "test", projected.getString(1).toString()); - Assert.assertTrue("Should contain the correct 2 value", projected.isNullAt(2)); - } - - @Test - public void testRenamedAddedField() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()), - Types.NestedField.required(2, "b", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.LongType.get())); - - RowData row = GenericRowData.of(100L, 200L, 300L); - - Schema renamedAdded = - new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.LongType.get()), - Types.NestedField.optional(3, "c", Types.LongType.get()), - Types.NestedField.optional(4, "d", Types.LongType.get())); - - RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); - Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); - Assert.assertEquals("Should contain the correct value in column 2", projected.getLong(1), 200L); - Assert.assertEquals("Should contain the correct value in column 3", projected.getLong(2), 300L); - Assert.assertTrue("Should contain empty value on new column 4", projected.isNullAt(3)); - } - - @Test - public void testEmptyProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData projected = writeAndRead("empty_projection", schema, schema.select(), row); - - Assert.assertNotNull("Should read a non-null record", projected); - Assert.assertEquals(0, projected.getArity()); - } - - @Test - public void testBasicProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); - 
Assert.assertEquals("Should not project data", 1, projected.getArity()); - Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - - Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); - - projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); - - Assert.assertEquals("Should not project id", 1, projected.getArity()); - int cmp = Comparators.charSequences().compare("test", projected.getString(0).toString()); - Assert.assertEquals("Should contain the correct data value", 0, cmp); - } - - @Test - public void testRename() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema readSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get())); - - RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); - - Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - int cmp = Comparators.charSequences().compare("test", projected.getString(1).toString()); - Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); - } - - @Test - public void testNestedStructProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - RowData location = GenericRowData.of(52.995143f, -1.539054f); - RowData record = GenericRowData.of(34L, location); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should not project location", 1, projected.getArity()); - Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - - Schema latOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); - - projected = writeAndRead("latitude_only", writeSchema, latOnly, record); - RowData projectedLocation = projected.getRow(0, 1); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertFalse("Should project location", projected.isNullAt(0)); - Assert.assertEquals("Should not project longitude", 1, projectedLocation.getArity()); - Assert.assertEquals( - "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); - - Schema longOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); - - projected = writeAndRead("longitude_only", writeSchema, longOnly, record); - projectedLocation = projected.getRow(0, 1); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertFalse("Should project location", projected.isNullAt(0)); - Assert.assertEquals("Should not project latitutde", 1, projectedLocation.getArity()); - Assert.assertEquals( - "Should project longitude", -1.539054f, projectedLocation.getFloat(0), 0.000001f); - - Schema locationOnly = 
writeSchema.select("location"); - projected = writeAndRead("location_only", writeSchema, locationOnly, record); - projectedLocation = projected.getRow(0, 1); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertFalse("Should project location", projected.isNullAt(0)); - Assert.assertEquals( - "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); - Assert.assertEquals( - "Should project longitude", -1.539054f, projectedLocation.getFloat(1), 0.000001f); - } - - @Test - public void testMapProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); - - GenericMapData properties = - new GenericMapData( - ImmutableMap.of( - StringData.fromString("a"), - StringData.fromString("A"), - StringData.fromString("b"), - StringData.fromString("B"))); - - RowData row = GenericRowData.of(34L, properties); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Assert.assertEquals("Should not project properties map", 1, projected.getArity()); - - Schema keyOnly = writeSchema.select("properties.key"); - projected = writeAndRead("key_only", writeSchema, keyOnly, row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertEquals("Should project entire map", properties, projected.getMap(0)); - - Schema valueOnly = writeSchema.select("properties.value"); - projected = writeAndRead("value_only", writeSchema, valueOnly, row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertEquals("Should project entire map", properties, projected.getMap(0)); - - Schema mapOnly = writeSchema.select("properties"); - projected = writeAndRead("map_only", writeSchema, mapOnly, row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertEquals("Should project entire map", properties, projected.getMap(0)); - } - - private Map toStringMap(Map map) { - Map stringMap = Maps.newHashMap(); - for (Map.Entry entry : map.entrySet()) { - if (entry.getValue() instanceof CharSequence) { - stringMap.put(entry.getKey().toString(), entry.getValue().toString()); - } else { - stringMap.put(entry.getKey().toString(), entry.getValue()); - } - } - return stringMap; - } - - @Test - public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()))))); - - RowData l1 = GenericRowData.of(53.992811f, -1.542616f); - RowData l2 = GenericRowData.of(52.995143f, -1.539054f); - GenericMapData map = - new GenericMapData( - ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); - RowData row = GenericRowData.of(34L, map); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - Assert.assertEquals("Should contain 
the correct id value", 34L, projected.getLong(0)); - Assert.assertEquals("Should not project locations map", 1, projected.getArity()); - - projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertEquals("Should project locations map", row.getMap(1), projected.getMap(0)); - - projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), row); - GenericMapData locations = (GenericMapData) projected.getMap(0); - Assert.assertNotNull("Should project locations map", locations); - GenericArrayData l1l2Array = - new GenericArrayData( - new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); - Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); - RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", 53.992811f, projectedL1.getFloat(0), 0.000001); - Assert.assertEquals("L1 should not contain long", 1, projectedL1.getArity()); - RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", 52.995143f, projectedL2.getFloat(0), 0.000001); - Assert.assertEquals("L2 should not contain long", 1, projectedL2.getArity()); - - projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - locations = (GenericMapData) projected.getMap(0); - Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); - projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should not contain lat", 1, projectedL1.getArity()); - Assert.assertEquals("L1 should contain long", -1.542616f, projectedL1.getFloat(0), 0.000001); - projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should not contain lat", 1, projectedL2.getArity()); - Assert.assertEquals("L2 should contain long", -1.539054f, projectedL2.getFloat(0), 0.000001); - - Schema latitiudeRenamed = - new Schema( - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); - - projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - locations = (GenericMapData) projected.getMap(0); - Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); - projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals( - "L1 should contain latitude", 53.992811f, projectedL1.getFloat(0), 0.000001); - projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals( - "L2 should contain latitude", 52.995143f, projectedL2.getFloat(0), 0.000001); - } - - 
@Test - public void testListProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); - - GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); - - RowData row = GenericRowData.of(34L, values); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Assert.assertEquals("Should not project values list", 1, projected.getArity()); - - Schema elementOnly = writeSchema.select("values.element"); - projected = writeAndRead("element_only", writeSchema, elementOnly, row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertEquals("Should project entire list", values, projected.getArray(0)); - - Schema listOnly = writeSchema.select("values"); - projected = writeAndRead("list_only", writeSchema, listOnly, row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertEquals("Should project entire list", values, projected.getArray(0)); - } - - @Test - @SuppressWarnings("unchecked") - public void testListOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); - - RowData p1 = GenericRowData.of(1, 2); - RowData p2 = GenericRowData.of(3, null); - GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); - RowData row = GenericRowData.of(34L, arrayData); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Assert.assertEquals("Should not project points list", 1, projected.getArity()); - - projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertEquals("Should project points list", row.getArray(1), projected.getArray(0)); - - projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertFalse("Should project points list", projected.isNullAt(0)); - ArrayData points = projected.getArray(0); - Assert.assertEquals("Should read 2 points", 2, points.size()); - RowData projectedP1 = points.getRow(0, 2); - Assert.assertEquals("Should project x", 1, projectedP1.getInt(0)); - Assert.assertEquals("Should not project y", 1, projectedP1.getArity()); - RowData projectedP2 = points.getRow(1, 2); - Assert.assertEquals("Should not project y", 1, projectedP2.getArity()); - Assert.assertEquals("Should project x", 3, projectedP2.getInt(0)); - - projected = writeAndRead("y_only", writeSchema, writeSchema.select("points.y"), row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertFalse("Should project points list", projected.isNullAt(0)); - points = projected.getArray(0); - 
Assert.assertEquals("Should read 2 points", 2, points.size()); - projectedP1 = points.getRow(0, 2); - Assert.assertEquals("Should not project x", 1, projectedP1.getArity()); - Assert.assertEquals("Should project y", 2, projectedP1.getInt(0)); - projectedP2 = points.getRow(1, 2); - Assert.assertEquals("Should not project x", 1, projectedP2.getArity()); - Assert.assertTrue("Should project null y", projectedP2.isNullAt(0)); - - Schema yRenamed = - new Schema( - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); - - projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); - Assert.assertEquals("Should not project id", 1, projected.getArity()); - Assert.assertFalse("Should project points list", projected.isNullAt(0)); - points = projected.getArray(0); - Assert.assertEquals("Should read 2 points", 2, points.size()); - projectedP1 = points.getRow(0, 2); - Assert.assertEquals("Should not project x and y", 1, projectedP1.getArity()); - Assert.assertEquals("Should project z", 2, projectedP1.getInt(0)); - projectedP2 = points.getRow(1, 2); - Assert.assertEquals("Should not project x and y", 1, projectedP2.getArity()); - Assert.assertTrue("Should project null z", projectedP2.isNullAt(0)); - } - - @Test - public void testAddedFieldsWithRequiredChildren() throws Exception { - Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); - - RowData row = GenericRowData.of(100L); - - Schema addedFields = - new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional( - 2, - "b", - Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), - Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), - Types.NestedField.optional( - 6, - "e", - Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); - - RowData projected = - writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); - Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); - Assert.assertTrue("Should contain empty value in new column 2", projected.isNullAt(1)); - Assert.assertTrue("Should contain empty value in new column 4", projected.isNullAt(2)); - Assert.assertTrue("Should contain empty value in new column 6", projected.isNullAt(3)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java deleted file mode 100644 index 2e5e7121bb2b..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; -import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestDeltaTaskWriter extends TableTestBase { - private static final int FORMAT_V2 = 2; - - private final FileFormat format; - - @Parameterized.Parameters(name = "FileFormat = {0}") - public static Object[][] parameters() { - return new Object[][] {{"avro"}, {"orc"}, {"parquet"}}; - } - - public TestDeltaTaskWriter(String fileFormat) { - super(FORMAT_V2); - this.format = FileFormat.fromString(fileFormat); - } - - @Override - @Before - public void setupTable() throws IOException { - this.tableDir = temp.newFolder(); - Assert.assertTrue(tableDir.delete()); // created by table create - - this.metadataDir = new File(tableDir, "metadata"); - } - - private void initTable(boolean partitioned) { - if (partitioned) { - this.table = create(SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("data").build()); - } else { - this.table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - table - .updateProperties() - .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024)) - .defaultFormat(format) - .commit(); - } - - private int idFieldId() { - return table.schema().findField("id").fieldId(); - } - - private int dataFieldId() { - return table.schema().findField("data").fieldId(); - } - - private void testCdcEvents(boolean partitioned) throws IOException { - List equalityFieldIds = Lists.newArrayList(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - // Start the 1th transaction. - TaskWriter writer = taskWriterFactory.create(); - - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "bbb")); - writer.write(createInsert(3, "ccc")); - - // Update <2, 'bbb'> to <2, 'ddd'> - writer.write(createUpdateBefore(2, "bbb")); // 1 pos-delete and 1 eq-delete. 
- writer.write(createUpdateAfter(2, "ddd")); - - // Update <1, 'aaa'> to <1, 'eee'> - writer.write(createUpdateBefore(1, "aaa")); // 1 pos-delete and 1 eq-delete. - writer.write(createUpdateAfter(1, "eee")); - - // Insert <4, 'fff'> - writer.write(createInsert(4, "fff")); - // Insert <5, 'ggg'> - writer.write(createInsert(5, "ggg")); - - // Delete <3, 'ccc'> - writer.write(createDelete(3, "ccc")); // 1 pos-delete and 1 eq-delete. - - WriteResult result = writer.complete(); - Assert.assertEquals(partitioned ? 7 : 1, result.dataFiles().length); - Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); - commitTransaction(result); - - Assert.assertEquals( - "Should have expected records.", - expectedRowSet( - createRecord(1, "eee"), - createRecord(2, "ddd"), - createRecord(4, "fff"), - createRecord(5, "ggg")), - actualRowSet("*")); - - // Start the 2nd transaction. - writer = taskWriterFactory.create(); - - // Update <2, 'ddd'> to <6, 'hhh'> - (Update both key and value) - writer.write(createUpdateBefore(2, "ddd")); // 1 eq-delete - writer.write(createUpdateAfter(6, "hhh")); - - // Update <5, 'ggg'> to <5, 'iii'> - writer.write(createUpdateBefore(5, "ggg")); // 1 eq-delete - writer.write(createUpdateAfter(5, "iii")); - - // Delete <4, 'fff'> - writer.write(createDelete(4, "fff")); // 1 eq-delete. - - result = writer.complete(); - Assert.assertEquals(partitioned ? 2 : 1, result.dataFiles().length); - Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); - commitTransaction(result); - - Assert.assertEquals( - "Should have expected records", - expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh")), - actualRowSet("*")); - } - - @Test - public void testUnpartitioned() throws IOException { - initTable(false); - testCdcEvents(false); - } - - @Test - public void testPartitioned() throws IOException { - initTable(true); - testCdcEvents(true); - } - - private void testWritePureEqDeletes(boolean partitioned) throws IOException { - initTable(partitioned); - List equalityFieldIds = Lists.newArrayList(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - writer.write(createDelete(1, "aaa")); - writer.write(createDelete(2, "bbb")); - writer.write(createDelete(3, "ccc")); - - WriteResult result = writer.complete(); - Assert.assertEquals(0, result.dataFiles().length); - Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); - commitTransaction(result); - - Assert.assertEquals("Should have no record", expectedRowSet(), actualRowSet("*")); - } - - @Test - public void testUnpartitionedPureEqDeletes() throws IOException { - testWritePureEqDeletes(false); - } - - @Test - public void testPartitionedPureEqDeletes() throws IOException { - testWritePureEqDeletes(true); - } - - private void testAbort(boolean partitioned) throws IOException { - initTable(partitioned); - List equalityFieldIds = Lists.newArrayList(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - for (int i = 0; i < 8_000; i += 2) { - writer.write(createUpdateBefore(i + 1, "aaa")); - writer.write(createUpdateAfter(i + 1, "aaa")); - - writer.write(createUpdateBefore(i + 2, "bbb")); - writer.write(createUpdateAfter(i + 2, "bbb")); - } - - // Assert the current data/delete file count. 
- List files = - Files.walk(Paths.get(tableDir.getPath(), "data")) - .filter(p -> p.toFile().isFile()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals( - "Should have expected file count, but files are: " + files, - partitioned ? 4 : 2, - files.size()); - - writer.abort(); - for (Path file : files) { - Assert.assertFalse(Files.exists(file)); - } - } - - @Test - public void testUnpartitionedAbort() throws IOException { - testAbort(false); - } - - @Test - public void testPartitionedAbort() throws IOException { - testAbort(true); - } - - @Test - public void testPartitionedTableWithDataAsKey() throws IOException { - initTable(true); - List equalityFieldIds = Lists.newArrayList(dataFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - // Start the 1th transaction. - TaskWriter writer = taskWriterFactory.create(); - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "aaa")); - writer.write(createInsert(3, "bbb")); - writer.write(createInsert(4, "ccc")); - - WriteResult result = writer.complete(); - Assert.assertEquals(3, result.dataFiles().length); - Assert.assertEquals(1, result.deleteFiles().length); - commitTransaction(result); - - Assert.assertEquals( - "Should have expected records", - expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc")), - actualRowSet("*")); - - // Start the 2nd transaction. - writer = taskWriterFactory.create(); - writer.write(createInsert(5, "aaa")); - writer.write(createInsert(6, "bbb")); - writer.write(createDelete(7, "ccc")); // 1 eq-delete. - - result = writer.complete(); - Assert.assertEquals(2, result.dataFiles().length); - Assert.assertEquals(1, result.deleteFiles().length); - commitTransaction(result); - - Assert.assertEquals( - "Should have expected records", - expectedRowSet( - createRecord(2, "aaa"), - createRecord(5, "aaa"), - createRecord(3, "bbb"), - createRecord(6, "bbb")), - actualRowSet("*")); - } - - @Test - public void testPartitionedTableWithDataAndIdAsKey() throws IOException { - initTable(true); - List equalityFieldIds = Lists.newArrayList(dataFieldId(), idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "aaa")); - - writer.write(createDelete(2, "aaa")); // 1 pos-delete. - - WriteResult result = writer.complete(); - Assert.assertEquals(1, result.dataFiles().length); - Assert.assertEquals(1, result.deleteFiles().length); - Assert.assertEquals( - Sets.newHashSet(FileContent.POSITION_DELETES), - Sets.newHashSet(result.deleteFiles()[0].content())); - commitTransaction(result); - - Assert.assertEquals( - "Should have expected records", expectedRowSet(createRecord(1, "aaa")), actualRowSet("*")); - } - - private void commitTransaction(WriteResult result) { - RowDelta rowDelta = table.newRowDelta(); - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - rowDelta - .validateDeletedFiles() - .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) - .commit(); - } - - private StructLikeSet expectedRowSet(Record... records) { - return SimpleDataUtil.expectedRowSet(table, records); - } - - private StructLikeSet actualRowSet(String... 
columns) throws IOException { - return SimpleDataUtil.actualRowSet(table, columns); - } - - private TaskWriterFactory createTaskWriterFactory(List equalityFieldIds) { - return new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - FlinkSchemaUtil.convert(table.schema()), - 128 * 1024 * 1024, - format, - equalityFieldIds, - false); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java deleted file mode 100644 index 4c17cd7607df..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.TestAppenderFactory; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkAppenderFactory extends TestAppenderFactory { - - private final RowType rowType; - - public TestFlinkAppenderFactory(String fileFormat, boolean partitioned) { - super(fileFormat, partitioned); - this.rowType = FlinkSchemaUtil.convert(SCHEMA); - } - - @Override - protected FileAppenderFactory createAppenderFactory( - List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { - return new FlinkAppenderFactory( - table.schema(), - rowType, - table.properties(), - table.spec(), - ArrayUtil.toIntArray(equalityFieldIds), - eqDeleteSchema, - posDeleteRowSchema); - } - - @Override - protected RowData createRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet expectedRowSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(rowType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java deleted file mode 100644 index da45241256f5..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestFileWriterFactory; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkFileWriterFactory extends TestFileWriterFactory { - - public TestFlinkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { - super(fileFormat, partitioned); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - RowType flinkType = FlinkSchemaUtil.convert(table.schema()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java deleted file mode 100644 index 57edc56d9acd..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.types.Row; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.MiniClusterResource; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.Assert; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestFlinkIcebergSink { - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - - private static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - private static final DataFormatConverters.RowConverter CONVERTER = - new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); - - private String tablePath; - private Table table; - private StreamExecutionEnvironment env; - private TableLoader tableLoader; - - private final FileFormat format; - private final int parallelism; - private final boolean partitioned; - - @Parameterized.Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}") - public static Object[][] parameters() { - return new Object[][] { - {"avro", 1, true}, - {"avro", 1, false}, - {"avro", 2, true}, - {"avro", 2, false}, - {"orc", 1, true}, - {"orc", 1, false}, - {"orc", 2, true}, - {"orc", 2, false}, - {"parquet", 1, true}, - {"parquet", 1, false}, - {"parquet", 2, true}, - {"parquet", 2, false} - }; - } - - public TestFlinkIcebergSink(String format, int parallelism, boolean partitioned) { - this.format = FileFormat.fromString(format); - this.parallelism = parallelism; - this.partitioned = partitioned; - 
} - - @Before - public void before() throws IOException { - File folder = TEMPORARY_FOLDER.newFolder(); - String warehouse = folder.getAbsolutePath(); - - tablePath = warehouse.concat("/test"); - Assert.assertTrue("Should create the table path correctly.", new File(tablePath).mkdir()); - - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - table = SimpleDataUtil.createTable(tablePath, props, partitioned); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - tableLoader = TableLoader.fromHadoopTable(tablePath); - } - - private List convertToRowData(List rows) { - return rows.stream().map(CONVERTER::toInternal).collect(Collectors.toList()); - } - - private BoundedTestSource createBoundedSource(List rows) { - return new BoundedTestSource<>(rows.toArray(new Row[0])); - } - - @Test - public void testWriteRowData() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. - SimpleDataUtil.assertTableRows(tablePath, convertToRowData(rows)); - } - - private List createRows(String prefix) { - return Lists.newArrayList( - Row.of(1, prefix + "aaa"), - Row.of(1, prefix + "bbb"), - Row.of(1, prefix + "ccc"), - Row.of(2, prefix + "aaa"), - Row.of(2, prefix + "bbb"), - Row.of(2, prefix + "ccc"), - Row.of(3, prefix + "aaa"), - Row.of(3, prefix + "bbb"), - Row.of(3, prefix + "ccc")); - } - - private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) - throws Exception { - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .writeParallelism(parallelism) - .distributionMode(distributionMode) - .append(); - - // Execute the program. 
- env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(tablePath, convertToRowData(rows)); - } - - private int partitionFiles(String partition) throws IOException { - return SimpleDataUtil.partitionDataFiles(table, ImmutableMap.of("data", partition)).size(); - } - - @Test - public void testWriteRow() throws Exception { - testWriteRow(null, DistributionMode.NONE); - } - - @Test - public void testWriteRowWithTableSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); - } - - @Test - public void testJobNoneDistributeMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(null, DistributionMode.NONE); - - if (parallelism > 1) { - if (partitioned) { - int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); - Assert.assertTrue("Should have more than 3 files in iceberg table.", files > 3); - } - } - } - - @Test - public void testJobHashDistributionMode() { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - AssertHelpers.assertThrows( - "Does not support range distribution-mode now.", - IllegalArgumentException.class, - "Flink does not support 'range' write distribution mode now.", - () -> { - testWriteRow(null, DistributionMode.RANGE); - return null; - }); - } - - @Test - public void testJobNullDistributionMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(null, null); - - if (partitioned) { - Assert.assertEquals( - "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals( - "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals( - "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); - } - } - - @Test - public void testPartitionWriteMode() throws Exception { - testWriteRow(null, DistributionMode.HASH); - if (partitioned) { - Assert.assertEquals( - "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals( - "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals( - "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); - } - } - - @Test - public void testShuffleByPartitionWithSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); - if (partitioned) { - Assert.assertEquals( - "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals( - "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals( - "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); - } - } - - @Test - public void testTwoSinksInDisjointedDAG() throws Exception { - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - String leftTablePath = TEMPORARY_FOLDER.newFolder().getAbsolutePath().concat("/left"); - Assert.assertTrue("Should create the table path correctly.", new File(leftTablePath).mkdir()); - Table leftTable = SimpleDataUtil.createTable(leftTablePath, props, partitioned); - TableLoader leftTableLoader = TableLoader.fromHadoopTable(leftTablePath); - - String rightTablePath = 
TEMPORARY_FOLDER.newFolder().getAbsolutePath().concat("/right"); - Assert.assertTrue("Should create the table path correctly.", new File(rightTablePath).mkdir()); - Table rightTable = SimpleDataUtil.createTable(rightTablePath, props, partitioned); - TableLoader rightTableLoader = TableLoader.fromHadoopTable(rightTablePath); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - env.getConfig().disableAutoGeneratedUIDs(); - - List leftRows = createRows("left-"); - DataStream leftStream = - env.fromCollection(leftRows, ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); - FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) - .table(leftTable) - .tableLoader(leftTableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .distributionMode(DistributionMode.NONE) - .uidPrefix("leftIcebergSink") - .append(); - - List rightRows = createRows("right-"); - DataStream rightStream = - env.fromCollection(rightRows, ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); - FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) - .table(rightTable) - .tableLoader(rightTableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidPrefix("rightIcebergSink") - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(leftTablePath, convertToRowData(leftRows)); - SimpleDataUtil.assertTableRows(rightTablePath, convertToRowData(rightRows)); - } - - @Test - public void testOverrideWriteConfigWithUnknownDistributionMode() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows( - "Should fail with invalid distribution mode.", - IllegalArgumentException.class, - "Invalid distribution mode: UNRECOGNIZED", - () -> { - builder.append(); - - // Execute the program. - env.execute("Test Iceberg DataStream."); - return null; - }); - } - - @Test - public void testOverrideWriteConfigWithUnknownFileFormat() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows( - "Should fail with invalid file format.", - IllegalArgumentException.class, - "Invalid file format: UNRECOGNIZED", - () -> { - builder.append(); - - // Execute the program. 
- env.execute("Test Iceberg DataStream."); - return null; - }); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java deleted file mode 100644 index 9f63d2f9f1c9..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ /dev/null @@ -1,497 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.MiniClusterResource; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.Assert; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestFlinkIcebergSinkV2 extends TableTestBase { - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - - private static final int FORMAT_V2 = 2; - private static final TypeInformation ROW_TYPE_INFO = - new 
RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - - private static final Map ROW_KIND_MAP = - ImmutableMap.of( - "+I", RowKind.INSERT, - "-D", RowKind.DELETE, - "-U", RowKind.UPDATE_BEFORE, - "+U", RowKind.UPDATE_AFTER); - - private static final int ROW_ID_POS = 0; - private static final int ROW_DATA_POS = 1; - - private final FileFormat format; - private final int parallelism; - private final boolean partitioned; - private final String writeDistributionMode; - - private StreamExecutionEnvironment env; - private TestTableLoader tableLoader; - - @Parameterized.Parameters( - name = "FileFormat = {0}, Parallelism = {1}, Partitioned={2}, WriteDistributionMode ={3}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {"avro", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"orc", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"parquet", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} - }; - } - - public TestFlinkIcebergSinkV2( - String format, int parallelism, boolean partitioned, String writeDistributionMode) { - super(FORMAT_V2); - this.format = FileFormat.fromString(format); - this.parallelism = parallelism; - this.partitioned = partitioned; - this.writeDistributionMode = writeDistributionMode; - } - - @Before - public void setupTable() throws IOException { - this.tableDir = temp.newFolder(); - this.metadataDir = new File(tableDir, "metadata"); - Assert.assertTrue(tableDir.delete()); - - if (!partitioned) { - table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); - } else { - table = - create( - SimpleDataUtil.SCHEMA, - PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()); - } - - table - .updateProperties() - .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) - .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) - .commit(); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100L) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - tableLoader = new TestTableLoader(tableDir.getAbsolutePath()); - } - - private List findValidSnapshots(Table table) { - List validSnapshots = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream() - .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { - validSnapshots.add(snapshot); - } - } - return validSnapshots; - } - - private void testChangeLogs( - List equalityFieldColumns, - KeySelector keySelector, - boolean insertAsUpsert, - List> elementsPerCheckpoint, - List> expectedRecordsPerCheckpoint) - throws Exception { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), 
ROW_TYPE_INFO); - - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .equalityFieldColumns(equalityFieldColumns) - .upsert(insertAsUpsert) - .append(); - - // Execute the program. - env.execute("Test Iceberg Change-Log DataStream."); - - table.refresh(); - List snapshots = findValidSnapshots(table); - int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - Assert.assertEquals( - "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); - - for (int i = 0; i < expectedSnapshotNum; i++) { - long snapshotId = snapshots.get(i).snapshotId(); - List expectedRecords = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals( - "Should have the expected records for the checkpoint#" + i, - expectedRowSet(expectedRecords.toArray(new Record[0])), - actualRowSet(snapshotId, "*")); - } - } - - private Row row(String rowKind, int id, String data) { - RowKind kind = ROW_KIND_MAP.get(rowKind); - if (kind == null) { - throw new IllegalArgumentException("Unknown row kind: " + rowKind); - } - - return Row.ofKind(kind, id, data); - } - - private Record record(int id, String data) { - return SimpleDataUtil.createRecord(id, data); - } - - @Test - public void testCheckAndGetEqualityFieldIds() { - table - .updateSchema() - .allowIncompatibleChanges() - .addRequiredColumn("type", Types.StringType.get()) - .setIdentifierFields("type") - .commit(); - - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); - - // Use schema identifier field IDs as equality field id list by default - Assert.assertEquals( - table.schema().identifierFieldIds(), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); - - // Use user-provided equality field column as equality field id list - builder.equalityFieldColumns(Lists.newArrayList("id")); - Assert.assertEquals( - Sets.newHashSet(table.schema().findField("id").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); - - builder.equalityFieldColumns(Lists.newArrayList("type")); - Assert.assertEquals( - Sets.newHashSet(table.schema().findField("type").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); - } - - @Test - public void testChangeLogOnIdKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa"), - row("-D", 2, "aaa"), - row("+I", 2, "bbb")), - ImmutableList.of( - row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 1, "ccc"), - row("-D", 1, "ccc"), - row("+I", 1, "ddd"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "bbb")), - ImmutableList.of(record(1, "bbb"), record(2, "ddd")), - ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); - - if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { - AssertHelpers.assertThrows( - "Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, - "should be included in equality fields", - () -> { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - false, - elementsPerCheckpoint, - expectedRecords); - return null; - 
}); - } else { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - false, - elementsPerCheckpoint, - expectedRecords); - } - } - - @Test - public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa")), - ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), - ImmutableList.of( - record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - false, - elementsPerCheckpoint, - expectedRecords); - } - - @Test - public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa")), - ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), - ImmutableList.of( - record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), - ImmutableList.of( - record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); - - testChangeLogs( - ImmutableList.of("data", "id"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords); - } - - @Test - public void testChangeLogOnSameKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - // Checkpoint #1 - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), - // Checkpoint #2 - ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), - // Checkpoint #3 - ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), - // Checkpoint #4 - ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); - - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords); - } - - @Test - public void testUpsertModeCheck() throws Exception { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .upsert(true); - - AssertHelpers.assertThrows( - "Should be error because upsert mode and overwrite mode enable at the same time.", - IllegalStateException.class, - "OVERWRITE mode shouldn't be enable", - () -> - builder.equalityFieldColumns(ImmutableList.of("id", "data")).overwrite(true).append()); - - AssertHelpers.assertThrows( - "Should be error because equality field columns are empty.", - 
IllegalStateException.class, - "Equality field columns shouldn't be empty", - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()); - } - - @Test - public void testUpsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), - ImmutableList.of(row("+I", 1, "ccc")), - ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb")), - ImmutableList.of(record(1, "ccc")), - ImmutableList.of(record(1, "eee"))); - - if (!partitioned) { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - true, - elementsPerCheckpoint, - expectedRecords); - } else { - AssertHelpers.assertThrows( - "Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, - "should be included in equality fields", - () -> { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - true, - elementsPerCheckpoint, - expectedRecords); - return null; - }); - } - } - - @Test - public void testUpsertOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, "bbb")), - ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), - ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(2, "aaa"), record(3, "bbb")), - ImmutableList.of(record(4, "aaa"), record(5, "bbb")), - ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - true, - elementsPerCheckpoint, - expectedRecords); - } - - @Test - public void testUpsertOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), - ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(2, "ccc")), - ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); - - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - true, - elementsPerCheckpoint, - expectedRecords); - } - - private StructLikeSet expectedRowSet(Record... records) { - return SimpleDataUtil.expectedRowSet(table, records); - } - - private StructLikeSet actualRowSet(long snapshotId, String... 
columns) throws IOException { - table.refresh(); - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = - IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { - reader.forEach(set::add); - } - return set; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java deleted file mode 100644 index 3c67662f6c34..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Paths; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Pair; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestFlinkManifest { - private static final Configuration CONF = new Configuration(); - - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - - private String tablePath; - private Table table; - private FileAppenderFactory appenderFactory; - private final AtomicInteger fileCount = new AtomicInteger(0); - - @Before - public void before() throws IOException { - File folder = tempFolder.newFolder(); - String warehouse = folder.getAbsolutePath(); - - tablePath = warehouse.concat("/test"); - Assert.assertTrue("Should create the table directory correctly.", new File(tablePath).mkdir()); - - // Construct the iceberg table. 
- table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); - - int[] equalityFieldIds = - new int[] { - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() - }; - this.appenderFactory = - new FlinkAppenderFactory( - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - equalityFieldIds, - table.schema(), - null); - } - - @Test - public void testIO() throws IOException { - String flinkJobId = newFlinkJobId(); - for (long checkpointId = 1; checkpointId <= 3; checkpointId++) { - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, 1, 1); - final long curCkpId = checkpointId; - - List dataFiles = generateDataFiles(10); - List eqDeleteFiles = generateEqDeleteFiles(5); - List posDeleteFiles = generatePosDeleteFiles(5); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(curCkpId), - table.spec()); - - WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); - Assert.assertEquals("Size of data file list are not equal.", 10, result.deleteFiles().length); - for (int i = 0; i < dataFiles.size(); i++) { - TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); - } - Assert.assertEquals("Size of delete file list are not equal.", 10, result.dataFiles().length); - for (int i = 0; i < 5; i++) { - TestHelpers.assertEquals(eqDeleteFiles.get(i), result.deleteFiles()[i]); - } - for (int i = 0; i < 5; i++) { - TestHelpers.assertEquals(posDeleteFiles.get(i), result.deleteFiles()[5 + i]); - } - } - } - - @Test - public void testUserProvidedManifestLocation() throws IOException { - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - File userProvidedFolder = tempFolder.newFolder(); - Map props = - ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); - ManifestOutputFileFactory factory = - new ManifestOutputFileFactory( - ((HasTableOperations) table).operations(), table.io(), props, flinkJobId, 1, 1); - - List dataFiles = generateDataFiles(5); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder().addDataFiles(dataFiles).build(), - () -> factory.create(checkpointId), - table.spec()); - - Assert.assertNotNull("Data manifest shouldn't be null", deltaManifests.dataManifest()); - Assert.assertNull("Delete manifest should be null", deltaManifests.deleteManifest()); - Assert.assertEquals( - "The newly created manifest file should be located under the user provided directory", - userProvidedFolder.toPath(), - Paths.get(deltaManifests.dataManifest().path()).getParent()); - - WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); - - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(5, result.dataFiles().length); - - Assert.assertEquals( - "Size of data file list are not equal.", dataFiles.size(), result.dataFiles().length); - for (int i = 0; i < dataFiles.size(); i++) { - TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); - } - } - - @Test - public void testVersionedSerializer() throws IOException { - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, 1, 1); - - List dataFiles = 
generateDataFiles(10); - List eqDeleteFiles = generateEqDeleteFiles(10); - List posDeleteFiles = generatePosDeleteFiles(10); - DeltaManifests expected = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(checkpointId), - table.spec()); - - byte[] versionedSerializeData = - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, expected); - DeltaManifests actual = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, versionedSerializeData); - TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); - TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); - - byte[] versionedSerializeData2 = - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, actual); - Assert.assertArrayEquals(versionedSerializeData, versionedSerializeData2); - } - - @Test - public void testCompatibility() throws IOException { - // The v2 deserializer should be able to deserialize the v1 binary. - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, 1, 1); - - List dataFiles = generateDataFiles(10); - ManifestFile manifest = - FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); - byte[] dataV1 = - SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); - - DeltaManifests delta = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, dataV1); - Assert.assertNull("Serialization v1 don't include delete files.", delta.deleteManifest()); - Assert.assertNotNull( - "Serialization v1 should not have null data manifest.", delta.dataManifest()); - TestHelpers.assertEquals(manifest, delta.dataManifest()); - - List actualFiles = FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io()); - Assert.assertEquals(10, actualFiles.size()); - for (int i = 0; i < 10; i++) { - TestHelpers.assertEquals(dataFiles.get(i), actualFiles.get(i)); - } - } - - private static class V1Serializer implements SimpleVersionedSerializer { - - @Override - public int getVersion() { - return 1; - } - - @Override - public byte[] serialize(ManifestFile m) throws IOException { - return ManifestFiles.encode(m); - } - - @Override - public ManifestFile deserialize(int version, byte[] serialized) throws IOException { - return ManifestFiles.decode(serialized); - } - } - - private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile( - table.schema(), - table.spec(), - CONF, - tablePath, - FileFormat.PARQUET.addExtension(filename), - rows); - } - - private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile( - table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); - } - - private DeleteFile writePosDeleteFile(String filename, List> positions) - throws IOException { - return SimpleDataUtil.writePosDeleteFile( - table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); - } - - private List generateDataFiles(int fileNum) throws IOException { - List rowDataList = Lists.newArrayList(); - List dataFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - 
rowDataList.add(SimpleDataUtil.createRowData(i, "a" + i)); - dataFiles.add(writeDataFile("data-file-" + fileCount.incrementAndGet(), rowDataList)); - } - return dataFiles; - } - - private List generateEqDeleteFiles(int fileNum) throws IOException { - List rowDataList = Lists.newArrayList(); - List deleteFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); - deleteFiles.add( - writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); - } - return deleteFiles; - } - - private List generatePosDeleteFiles(int fileNum) throws IOException { - List> positions = Lists.newArrayList(); - List deleteFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - positions.add(Pair.of("data-file-1", (long) i)); - deleteFiles.add( - writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); - } - return deleteFiles; - } - - private static String newFlinkJobId() { - return UUID.randomUUID().toString(); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java deleted file mode 100644 index 3951c2e70f65..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestPartitioningWriters; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkPartitioningWriters extends TestPartitioningWriters { - - public TestFlinkPartitioningWriters(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - RowType flinkType = FlinkSchemaUtil.convert(table.schema()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java deleted file mode 100644 index 9e846efe6fc9..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestPositionDeltaWriters; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkPositionDeltaWriters extends TestPositionDeltaWriters { - - public TestFlinkPositionDeltaWriters(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - RowType flinkType = FlinkSchemaUtil.convert(table.schema()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java deleted file mode 100644 index 07716b9c3e60..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestRollingFileWriters; -import org.apache.iceberg.util.ArrayUtil; - -public class TestFlinkRollingFileWriters extends TestRollingFileWriters { - - public TestFlinkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { - super(fileFormat, partitioned); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java deleted file mode 100644 index e6d64ef2c720..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestWriterMetrics; - -public class TestFlinkWriterMetrics extends TestWriterMetrics { - - public TestFlinkWriterMetrics(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileWriterFactory newWriterFactory(Table sourceTable) { - return FlinkFileWriterFactory.builderFor(sourceTable) - .dataSchema(sourceTable.schema()) - .dataFileFormat(fileFormat) - .deleteFileFormat(fileFormat) - .positionDeleteRowSchema(sourceTable.schema()) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data, boolean boolValue, Long longValue) { - GenericRowData nested = GenericRowData.of(boolValue, longValue); - GenericRowData row = GenericRowData.of(id, StringData.fromString(data), nested); - return row; - } - - @Override - public RowData toGenericRow(int value, int repeated) { - GenericRowData row = new GenericRowData(repeated); - for (int i = 0; i < repeated; i++) { - row.setField(i, value); - } - return row; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java deleted file mode 100644 index ddb0fda69ab6..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java +++ /dev/null @@ -1,866 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.JobID; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.runtime.operators.testutils.MockEnvironment; -import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.GenericManifestFile; -import org.apache.iceberg.ManifestContent; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Pair; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestIcebergFilesCommitter extends TableTestBase { - private static final Configuration CONF = new Configuration(); - - private String tablePath; - private File flinkManifestFolder; - - private final FileFormat format; - - @Parameterized.Parameters(name = "FileFormat = {0}, FormatVersion={1}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {"avro", 1}, - new Object[] {"avro", 2}, - new Object[] {"parquet", 1}, - new Object[] {"parquet", 2}, - new Object[] {"orc", 1}, - new Object[] {"orc", 2} - }; - } - - public TestIcebergFilesCommitter(String format, int formatVersion) { - super(formatVersion); - this.format = FileFormat.fromString(format); - } - - @Override - @Before - public void setupTable() throws IOException { - flinkManifestFolder = temp.newFolder(); - - this.tableDir = temp.newFolder(); - this.metadataDir = new File(tableDir, "metadata"); - Assert.assertTrue(tableDir.delete()); - - tablePath = tableDir.getAbsolutePath(); - - // Construct the iceberg table. 
- table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); - - table - .updateProperties() - .set(DEFAULT_FILE_FORMAT, format.name()) - .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) - .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") - .commit(); - } - - @Test - public void testCommitTxnWithoutDataFiles() throws Exception { - long checkpointId = 0; - long timestamp = 0; - JobID jobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - SimpleDataUtil.assertTableRows(table, Lists.newArrayList()); - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, -1L); - - // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the - // future flink job - // failover won't fail. - for (int i = 1; i <= 3; i++) { - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(0); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobId, checkpointId); - } - } - } - - @Test - public void testMaxContinuousEmptyCommits() throws Exception { - table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); - - JobID jobId = new JobID(); - long checkpointId = 0; - long timestamp = 0; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(0); - - for (int i = 1; i <= 9; i++) { - harness.snapshot(++checkpointId, ++timestamp); - harness.notifyOfCompletedCheckpoint(checkpointId); - - assertSnapshotSize(i / 3); - } - } - } - - private WriteResult of(DataFile dataFile) { - return WriteResult.builder().addDataFiles(dataFile).build(); - } - - @Test - public void testCommitTxn() throws Exception { - // Test with 3 continues checkpoints: - // 1. snapshotState for checkpoint#1 - // 2. notifyCheckpointComplete for checkpoint#1 - // 3. snapshotState for checkpoint#2 - // 4. notifyCheckpointComplete for checkpoint#2 - // 5. snapshotState for checkpoint#3 - // 6. notifyCheckpointComplete for checkpoint#3 - long timestamp = 0; - - JobID jobID = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobID)) { - harness.setup(); - harness.open(); - assertSnapshotSize(0); - - List rows = Lists.newArrayListWithExpectedSize(3); - for (int i = 1; i <= 3; i++) { - RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i); - DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData)); - harness.processElement(of(dataFile), ++timestamp); - rows.add(rowData); - - harness.snapshot(i, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(i); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows)); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobID, i); - } - } - } - - @Test - public void testOrderedEventsBetweenCheckpoints() throws Exception { - // It's possible that two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#1; - // 4. 
notifyCheckpointComplete for checkpoint#2; - long timestamp = 0; - - JobID jobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 1. snapshotState for checkpoint#1 - long firstCheckpointId = 1; - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - harness.processElement(of(dataFile2), ++timestamp); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 2. snapshotState for checkpoint#2 - long secondCheckpointId = 2; - harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1)); - assertMaxCommittedCheckpointId(jobId, firstCheckpointId); - assertFlinkManifests(1); - - // 4. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2)); - assertMaxCommittedCheckpointId(jobId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @Test - public void testDisorderedEventsBetweenCheckpoints() throws Exception { - // It's possible that the two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#2; - // 4. notifyCheckpointComplete for checkpoint#1; - long timestamp = 0; - - JobID jobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 1. snapshotState for checkpoint#1 - long firstCheckpointId = 1; - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - harness.processElement(of(dataFile2), ++timestamp); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 2. snapshotState for checkpoint#2 - long secondCheckpointId = 2; - harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2)); - assertMaxCommittedCheckpointId(jobId, secondCheckpointId); - assertFlinkManifests(0); - - // 4. 
notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2)); - assertMaxCommittedCheckpointId(jobId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @Test - public void testRecoveryFromValidSnapshot() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List expectedRows = Lists.newArrayList(); - OperatorSubtaskState snapshot; - - JobID jobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row = SimpleDataUtil.createRowData(1, "hello"); - expectedRows.add(row); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row)); - - harness.processElement(of(dataFile1), ++timestamp); - snapshot = harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row)); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, checkpointId); - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - SimpleDataUtil.assertTableRows(table, expectedRows); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, checkpointId); - - RowData row = SimpleDataUtil.createRowData(2, "world"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, checkpointId); - } - } - - @Test - public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { - // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's - // possible that we - // flink job will restore from a checkpoint with only step#1 finished. - long checkpointId = 0; - long timestamp = 0; - OperatorSubtaskState snapshot; - List expectedRows = Lists.newArrayList(); - JobID jobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row = SimpleDataUtil.createRowData(1, "hello"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - snapshot = harness.snapshot(++checkpointId, ++timestamp); - SimpleDataUtil.assertTableRows(table, ImmutableList.of()); - assertMaxCommittedCheckpointId(jobId, -1L); - assertFlinkManifests(1); - } - - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. 
- assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows); - assertMaxCommittedCheckpointId(jobId, checkpointId); - - harness.snapshot(++checkpointId, ++timestamp); - // Did not write any new record, so it won't generate new manifest. - assertFlinkManifests(0); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, checkpointId); - - RowData row = SimpleDataUtil.createRowData(2, "world"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - snapshot = harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - } - - // Redeploying flink job from external checkpoint. - JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(newJobId)) { - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. - assertFlinkManifests(0); - - assertMaxCommittedCheckpointId(newJobId, -1); - assertMaxCommittedCheckpointId(jobId, checkpointId); - SimpleDataUtil.assertTableRows(table, expectedRows); - assertSnapshotSize(3); - - RowData row = SimpleDataUtil.createRowData(3, "foo"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(newJobId, checkpointId); - } - } - - @Test - public void testStartAnotherJobToWriteSameTable() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List rows = Lists.newArrayList(); - List tableRows = Lists.newArrayList(); - - JobID oldJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(oldJobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(oldJobId, -1L); - - for (int i = 1; i <= 3; i++) { - rows.add(SimpleDataUtil.createRowData(i, "hello" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - harness.processElement(of(dataFile), ++timestamp); - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, tableRows); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(oldJobId, checkpointId); - } - } - - // The new started job will start with checkpoint = 1 again. 
- checkpointId = 0; - timestamp = 0; - JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(newJobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(3); - assertMaxCommittedCheckpointId(oldJobId, 3); - assertMaxCommittedCheckpointId(newJobId, -1); - - rows.add(SimpleDataUtil.createRowData(2, "world")); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile("data-new-1", rows); - harness.processElement(of(dataFile), ++timestamp); - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(newJobId, checkpointId); - } - } - - @Test - public void testMultipleJobsWriteSameTable() throws Exception { - long timestamp = 0; - List tableRows = Lists.newArrayList(); - - JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()}; - for (int i = 0; i < 20; i++) { - int jobIndex = i % 3; - int checkpointId = i / 3; - JobID jobId = jobs[jobIndex]; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobId, checkpointId == 0 ? -1 : checkpointId); - - List rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - harness.processElement(of(dataFile), ++timestamp); - harness.snapshot(checkpointId + 1, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId + 1); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows); - assertSnapshotSize(i + 1); - assertMaxCommittedCheckpointId(jobId, checkpointId + 1); - } - } - } - - @Test - public void testBoundedStream() throws Exception { - JobID jobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertFlinkManifests(0); - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, -1L); - - List tableRows = Lists.newArrayList(SimpleDataUtil.createRowData(1, "word-1")); - - DataFile dataFile = writeDataFile("data-1", tableRows); - harness.processElement(of(dataFile), 1); - ((BoundedOneInput) harness.getOneInputOperator()).endInput(); - - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, Long.MAX_VALUE); - } - } - - @Test - public void testFlinkManifests() throws Exception { - long timestamp = 0; - final long checkpoint = 10; - - JobID jobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(checkpoint, ++timestamp); - List manifestPaths = assertFlinkManifests(1); - Path manifestPath = manifestPaths.get(0); - Assert.assertEquals( - "File name should have the expected pattern.", - String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), - manifestPath.getFileName().toString()); - - // 2. 
Read the data files from manifests and assert. - List dataFiles = - FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); - Assert.assertEquals(1, dataFiles.size()); - TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1)); - assertMaxCommittedCheckpointId(jobId, checkpoint); - assertFlinkManifests(0); - } - } - - @Test - public void testDeleteFiles() throws Exception { - Assume.assumeFalse("Only support equality-delete in format v2.", formatVersion < 2); - - long timestamp = 0; - long checkpoint = 10; - - JobID jobId = new JobID(); - FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); - - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row1 = SimpleDataUtil.createInsert(1, "aaa"); - DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1)); - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(checkpoint, ++timestamp); - List manifestPaths = assertFlinkManifests(1); - Path manifestPath = manifestPaths.get(0); - Assert.assertEquals( - "File name should have the expected pattern.", - String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), - manifestPath.getFileName().toString()); - - // 2. Read the data files from manifests and assert. - List dataFiles = - FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); - Assert.assertEquals(1, dataFiles.size()); - TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1)); - assertMaxCommittedCheckpointId(jobId, checkpoint); - assertFlinkManifests(0); - - // 4. process both data files and delete files. - RowData row2 = SimpleDataUtil.createInsert(2, "bbb"); - DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); - - RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); - DeleteFile deleteFile1 = - writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); - harness.processElement( - WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(), - ++timestamp); - assertMaxCommittedCheckpointId(jobId, checkpoint); - - // 5. snapshotState for checkpoint#2 - harness.snapshot(++checkpoint, ++timestamp); - assertFlinkManifests(2); - - // 6. 
notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2)); - assertMaxCommittedCheckpointId(jobId, checkpoint); - assertFlinkManifests(0); - } - } - - @Test - public void testCommitTwoCheckpointsInSingleTxn() throws Exception { - Assume.assumeFalse("Only support equality-delete in format v2.", formatVersion < 2); - - long timestamp = 0; - long checkpoint = 10; - - JobID jobId = new JobID(); - FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); - - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData insert1 = SimpleDataUtil.createInsert(1, "aaa"); - RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); - RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); - DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); - DeleteFile deleteFile1 = - writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); - harness.processElement( - WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build(), - ++timestamp); - - // The 1th snapshotState. - harness.snapshot(checkpoint, ++timestamp); - - RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); - RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); - DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); - DeleteFile deleteFile2 = - writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); - harness.processElement( - WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build(), - ++timestamp); - - // The 2nd snapshotState. - harness.snapshot(++checkpoint, ++timestamp); - - // Notify the 2nd snapshot to complete. 
- harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4)); - assertMaxCommittedCheckpointId(jobId, checkpoint); - assertFlinkManifests(0); - Assert.assertEquals( - "Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size()); - } - } - - private DeleteFile writeEqDeleteFile( - FileAppenderFactory appenderFactory, String filename, List deletes) - throws IOException { - return SimpleDataUtil.writeEqDeleteFile( - table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); - } - - private DeleteFile writePosDeleteFile( - FileAppenderFactory appenderFactory, - String filename, - List> positions) - throws IOException { - return SimpleDataUtil.writePosDeleteFile( - table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); - } - - private FileAppenderFactory createDeletableAppenderFactory() { - int[] equalityFieldIds = - new int[] { - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() - }; - return new FlinkAppenderFactory( - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - equalityFieldIds, - table.schema(), - null); - } - - private ManifestFile createTestingManifestFile(Path manifestPath) { - return new GenericManifestFile( - manifestPath.toAbsolutePath().toString(), - manifestPath.toFile().length(), - 0, - ManifestContent.DATA, - 0, - 0, - 0L, - 0, - 0, - 0, - 0, - 0, - 0, - null, - null); - } - - private List assertFlinkManifests(int expectedCount) throws IOException { - List manifests = - Files.list(flinkManifestFolder.toPath()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals( - String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), - expectedCount, - manifests.size()); - return manifests; - } - - private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile( - table.schema(), table.spec(), CONF, tablePath, format.addExtension(filename), rows); - } - - private void assertMaxCommittedCheckpointId(JobID jobID, long expectedId) { - table.refresh(); - long actualId = IcebergFilesCommitter.getMaxCommittedCheckpointId(table, jobID.toString()); - Assert.assertEquals(expectedId, actualId); - } - - private void assertSnapshotSize(int expectedSnapshotSize) { - table.refresh(); - Assert.assertEquals(expectedSnapshotSize, Lists.newArrayList(table.snapshots()).size()); - } - - private OneInputStreamOperatorTestHarness createStreamSink(JobID jobID) - throws Exception { - TestOperatorFactory factory = TestOperatorFactory.of(tablePath); - return new OneInputStreamOperatorTestHarness<>(factory, createEnvironment(jobID)); - } - - private static MockEnvironment createEnvironment(JobID jobID) { - return new MockEnvironmentBuilder() - .setTaskName("test task") - .setManagedMemorySize(32 * 1024) - .setInputSplitProvider(new MockInputSplitProvider()) - .setBufferSize(256) - .setTaskConfiguration(new org.apache.flink.configuration.Configuration()) - .setExecutionConfig(new ExecutionConfig()) - .setMaxParallelism(16) - .setJobID(jobID) - .build(); - } - - private static class TestOperatorFactory extends AbstractStreamOperatorFactory - implements OneInputStreamOperatorFactory { - private final String tablePath; - - private TestOperatorFactory(String tablePath) { - this.tablePath = tablePath; - } - - private static TestOperatorFactory of(String tablePath) { - return new 
TestOperatorFactory(tablePath); - } - - @Override - @SuppressWarnings("unchecked") - public > T createStreamOperator( - StreamOperatorParameters param) { - IcebergFilesCommitter committer = - new IcebergFilesCommitter(new TestTableLoader(tablePath), false); - committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput()); - return (T) committer; - } - - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return IcebergFilesCommitter.class; - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java deleted file mode 100644 index bd959bfb31c4..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Set; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Before; -import 
org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestIcebergStreamWriter { - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - - private String tablePath; - private Table table; - - private final FileFormat format; - private final boolean partitioned; - - @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} - }; - } - - public TestIcebergStreamWriter(String format, boolean partitioned) { - this.format = FileFormat.fromString(format); - this.partitioned = partitioned; - } - - @Before - public void before() throws IOException { - File folder = tempFolder.newFolder(); - tablePath = folder.getAbsolutePath(); - - // Construct the iceberg table. - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - table = SimpleDataUtil.createTable(tablePath, props, partitioned); - } - - @Test - public void testWritingTable() throws Exception { - long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - // The first checkpoint - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(3, "hello"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - long expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement(SimpleDataUtil.createRowData(4, "foo"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(5, "bar"), 2); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - expectedDataFiles = partitioned ? 4 : 2; - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - - // Commit the iceberg transaction. - AppendFiles appendFiles = table.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - - // Assert the table records. - SimpleDataUtil.assertTableRecords( - tablePath, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, "hello"), - SimpleDataUtil.createRecord(4, "foo"), - SimpleDataUtil.createRecord(5, "bar"))); - } - } - - @Test - public void testSnapshotTwice() throws Exception { - long checkpointId = 1; - long timestamp = 1; - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); - - testHarness.prepareSnapshotPreBarrier(checkpointId++); - long expectedDataFiles = partitioned ? 
2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - - // snapshot again immediately. - for (int i = 0; i < 5; i++) { - testHarness.prepareSnapshotPreBarrier(checkpointId++); - - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - } - } - } - - @Test - public void testTableWithoutSnapshot() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - Assert.assertEquals(0, testHarness.extractOutputValues().size()); - } - // Even if we closed the iceberg stream writer, there's no orphan data file. - Assert.assertEquals(0, scanDataFiles().size()); - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - // Still not emit the data file yet, because there is no checkpoint. - Assert.assertEquals(0, testHarness.extractOutputValues().size()); - } - // Once we closed the iceberg stream writer, there will left an orphan data file. - Assert.assertEquals(1, scanDataFiles().size()); - } - - private Set scanDataFiles() throws IOException { - Path dataDir = new Path(tablePath, "data"); - FileSystem fs = FileSystem.get(new Configuration()); - if (!fs.exists(dataDir)) { - return ImmutableSet.of(); - } else { - Set paths = Sets.newHashSet(); - RemoteIterator iterators = fs.listFiles(dataDir, true); - while (iterators.hasNext()) { - LocatedFileStatus status = iterators.next(); - if (status.isFile()) { - Path path = status.getPath(); - if (path.getName().endsWith("." + format.toString().toLowerCase())) { - paths.add(path.toString()); - } - } - } - return paths; - } - } - - @Test - public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); - - Assertions.assertThat(testHarness.getOneInputOperator()).isInstanceOf(BoundedOneInput.class); - ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); - - long expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - - // invoke endInput again. - ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); - - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - // Datafiles should not be sent again - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - } - } - - @Test - public void testBoundedStreamTriggeredEndInputBeforeTriggeringCheckpoint() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); - - testHarness.endInput(); - - long expectedDataFiles = partitioned ? 
2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - - testHarness.prepareSnapshotPreBarrier(1L); - - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - // It should be ensured that after endInput is triggered, when prepareSnapshotPreBarrier - // is triggered, write should only send WriteResult once - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - } - } - - @Test - public void testTableWithTargetFileSize() throws Exception { - // Adjust the target-file-size in table properties. - table - .updateProperties() - .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger - .commit(); - - List rows = Lists.newArrayListWithCapacity(8000); - List records = Lists.newArrayListWithCapacity(8000); - for (int i = 0; i < 2000; i++) { - for (String data : new String[] {"a", "b", "c", "d"}) { - rows.add(SimpleDataUtil.createRowData(i, data)); - records.add(SimpleDataUtil.createRecord(i, data)); - } - } - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - for (RowData row : rows) { - testHarness.processElement(row, 1); - } - - // snapshot the operator. - testHarness.prepareSnapshotPreBarrier(1); - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(8, result.dataFiles().length); - - // Assert that the data file have the expected records. - for (DataFile dataFile : result.dataFiles()) { - Assert.assertEquals(1000, dataFile.recordCount()); - } - - // Commit the iceberg transaction. - AppendFiles appendFiles = table.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - } - - // Assert the table records. 
- SimpleDataUtil.assertTableRecords(tablePath, records); - } - - @Test - public void testPromotedFlinkDataType() throws Exception { - Schema iSchema = - new Schema( - Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), - Types.NestedField.required(2, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get())); - TableSchema flinkSchema = - TableSchema.builder() - .field("tinyint", DataTypes.TINYINT().notNull()) - .field("smallint", DataTypes.SMALLINT().notNull()) - .field("int", DataTypes.INT().nullable()) - .build(); - - PartitionSpec spec; - if (partitioned) { - spec = - PartitionSpec.builderFor(iSchema) - .identity("smallint") - .identity("tinyint") - .identity("int") - .build(); - } else { - spec = PartitionSpec.unpartitioned(); - } - - String location = tempFolder.newFolder().getAbsolutePath(); - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); - - List rows = - Lists.newArrayList( - GenericRowData.of((byte) 0x01, (short) -32768, 101), - GenericRowData.of((byte) 0x02, (short) 0, 102), - GenericRowData.of((byte) 0x03, (short) 32767, 103)); - - Record record = GenericRecord.create(iSchema); - List expected = - Lists.newArrayList( - record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), - record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), - record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter(icebergTable, flinkSchema)) { - for (RowData row : rows) { - testHarness.processElement(row, 1); - } - testHarness.prepareSnapshotPreBarrier(1); - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(partitioned ? 3 : 1, result.dataFiles().length); - - // Commit the iceberg transaction. 
- AppendFiles appendFiles = icebergTable.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - } - - SimpleDataUtil.assertTableRecords(location, expected); - } - - private OneInputStreamOperatorTestHarness createIcebergStreamWriter() - throws Exception { - return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); - } - - private OneInputStreamOperatorTestHarness createIcebergStreamWriter( - Table icebergTable, TableSchema flinkSchema) throws Exception { - RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); - FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf( - icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = - FlinkSink.createStreamWriter(icebergTable, flinkWriteConfig, flinkRowType, null); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); - - harness.setup(); - harness.open(); - - return harness; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java deleted file mode 100644 index b6c785cb144b..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; - -public class TestRowDataPartitionKey { - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(0, "boolType", Types.BooleanType.get()), - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "longType", Types.LongType.get()), - Types.NestedField.required(3, "dateType", Types.DateType.get()), - Types.NestedField.required(4, "timeType", Types.TimeType.get()), - Types.NestedField.required(5, "stringType", Types.StringType.get()), - Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), - Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), - Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), - Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), - Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), - Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), - Types.NestedField.required(14, "floatType", Types.FloatType.get()), - Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); - - private static final List SUPPORTED_PRIMITIVES = - SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); - - private static final Schema NESTED_SCHEMA = - new Schema( - Types.NestedField.required( - 1, - "structType", - Types.StructType.of( - Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), - Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); - - @Test - public void testNullPartitionValue() { - Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); - - List rows = - Lists.newArrayList( - GenericRowData.of(1, StringData.fromString("a")), - GenericRowData.of(2, StringData.fromString("b")), - GenericRowData.of(3, null)); - - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - - for (RowData row : rows) { - PartitionKey partitionKey = new PartitionKey(spec, schema); - partitionKey.partition(rowWrapper.wrap(row)); - Assert.assertEquals(partitionKey.size(), 1); - - String expectedStr = row.isNullAt(1) ? 
null : row.getString(1).toString(); - Assert.assertEquals(expectedStr, partitionKey.get(0, String.class)); - } - } - - @Test - public void testPartitionWithOneNestedField() { - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); - List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); - List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - - PartitionSpec spec1 = - PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); - PartitionSpec spec2 = - PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); - - for (int i = 0; i < rows.size(); i++) { - RowData row = rows.get(i); - Record record = (Record) records.get(i).get(0); - - PartitionKey partitionKey1 = new PartitionKey(spec1, NESTED_SCHEMA); - partitionKey1.partition(rowWrapper.wrap(row)); - Assert.assertEquals(partitionKey1.size(), 1); - - Assert.assertEquals(record.get(0), partitionKey1.get(0, String.class)); - - PartitionKey partitionKey2 = new PartitionKey(spec2, NESTED_SCHEMA); - partitionKey2.partition(rowWrapper.wrap(row)); - Assert.assertEquals(partitionKey2.size(), 1); - - Assert.assertEquals(record.get(1), partitionKey2.get(0, Integer.class)); - } - } - - @Test - public void testPartitionMultipleNestedField() { - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); - List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); - List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - - PartitionSpec spec1 = - PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerIntegerType") - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = - PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .identity("structType.innerIntegerType") - .build(); - - PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); - PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); - - for (int i = 0; i < rows.size(); i++) { - RowData row = rows.get(i); - Record record = (Record) records.get(i).get(0); - - pk1.partition(rowWrapper.wrap(row)); - Assert.assertEquals(2, pk1.size()); - - Assert.assertEquals(record.get(1), pk1.get(0, Integer.class)); - Assert.assertEquals(record.get(0), pk1.get(1, String.class)); - - pk2.partition(rowWrapper.wrap(row)); - Assert.assertEquals(2, pk2.size()); - - Assert.assertEquals(record.get(0), pk2.get(0, String.class)); - Assert.assertEquals(record.get(1), pk2.get(1, Integer.class)); - } - } - - @Test - public void testPartitionValueTypes() { - RowType rowType = FlinkSchemaUtil.convert(SCHEMA); - RowDataWrapper rowWrapper = new RowDataWrapper(rowType, SCHEMA.asStruct()); - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(SCHEMA.asStruct()); - - List records = RandomGenericData.generate(SCHEMA, 10, 1993); - List rows = Lists.newArrayList(RandomRowData.convert(SCHEMA, records)); - - for (String column : SUPPORTED_PRIMITIVES) { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity(column).build(); - Class[] javaClasses = spec.javaClasses(); - - PartitionKey pk = new PartitionKey(spec, SCHEMA); - PartitionKey expectedPK = new PartitionKey(spec, SCHEMA); - - for (int j = 0; j < rows.size(); j++) { - RowData row = rows.get(j); - Record record = records.get(j); - - pk.partition(rowWrapper.wrap(row)); - expectedPK.partition(recordWrapper.wrap(record)); - - Assert.assertEquals( - 
"Partition with column " + column + " should have one field.", 1, pk.size()); - - if (column.equals("timeType")) { - Assert.assertEquals( - "Partition with column " + column + " should have the expected values", - expectedPK.get(0, Long.class) / 1000, - pk.get(0, Long.class) / 1000); - } else { - Assert.assertEquals( - "Partition with column " + column + " should have the expected values", - expectedPK.get(0, javaClasses[0]), - pk.get(0, javaClasses[0])); - } - } - } - } - - @Test - public void testNestedPartitionValues() { - Schema nestedSchema = new Schema(Types.NestedField.optional(1001, "nested", SCHEMA.asStruct())); - RowType rowType = FlinkSchemaUtil.convert(nestedSchema); - - RowDataWrapper rowWrapper = new RowDataWrapper(rowType, nestedSchema.asStruct()); - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(nestedSchema.asStruct()); - - List records = RandomGenericData.generate(nestedSchema, 10, 1994); - List rows = Lists.newArrayList(RandomRowData.convert(nestedSchema, records)); - - for (int i = 0; i < SUPPORTED_PRIMITIVES.size(); i++) { - String column = String.format("nested.%s", SUPPORTED_PRIMITIVES.get(i)); - - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity(column).build(); - Class[] javaClasses = spec.javaClasses(); - - PartitionKey pk = new PartitionKey(spec, nestedSchema); - PartitionKey expectedPK = new PartitionKey(spec, nestedSchema); - - for (int j = 0; j < rows.size(); j++) { - pk.partition(rowWrapper.wrap(rows.get(j))); - expectedPK.partition(recordWrapper.wrap(records.get(j))); - - Assert.assertEquals( - "Partition with nested column " + column + " should have one field.", 1, pk.size()); - - if (column.equals("nested.timeType")) { - Assert.assertEquals( - "Partition with nested column " + column + " should have the expected values.", - expectedPK.get(0, Long.class) / 1000, - pk.get(0, Long.class) / 1000); - } else { - Assert.assertEquals( - "Partition with nested column " + column + " should have the expected values.", - expectedPK.get(0, javaClasses[0]), - pk.get(0, javaClasses[0])); - } - } - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java deleted file mode 100644 index d83dd8530f98..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestTaskWriters { - private static final Configuration CONF = new Configuration(); - private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - - @Rule public final TemporaryFolder tempFolder = new TemporaryFolder(); - - @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} - }; - } - - private final FileFormat format; - private final boolean partitioned; - - private String path; - private Table table; - - public TestTaskWriters(String format, boolean partitioned) { - this.format = FileFormat.fromString(format); - this.partitioned = partitioned; - } - - @Before - public void before() throws IOException { - File folder = tempFolder.newFolder(); - path = folder.getAbsolutePath(); - - // Construct the iceberg table with the specified file format. - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - table = SimpleDataUtil.createTable(path, props, partitioned); - } - - @Test - public void testWriteZeroRecord() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.close(); - - DataFile[] dataFiles = taskWriter.dataFiles(); - Assert.assertNotNull(dataFiles); - Assert.assertEquals(0, dataFiles.length); - - // Close again. - taskWriter.close(); - dataFiles = taskWriter.dataFiles(); - Assert.assertNotNull(dataFiles); - Assert.assertEquals(0, dataFiles.length); - } - } - - @Test - public void testCloseTwice() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); - taskWriter.write(SimpleDataUtil.createRowData(2, "world")); - taskWriter.close(); // The first close - taskWriter.close(); // The second close - - int expectedFiles = partitioned ? 
2 : 1; - DataFile[] dataFiles = taskWriter.dataFiles(); - Assert.assertEquals(expectedFiles, dataFiles.length); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - Assert.assertTrue(fs.exists(new Path(dataFile.path().toString()))); - } - } - } - - @Test - public void testAbort() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); - taskWriter.write(SimpleDataUtil.createRowData(2, "world")); - - taskWriter.abort(); - DataFile[] dataFiles = taskWriter.dataFiles(); - - int expectedFiles = partitioned ? 2 : 1; - Assert.assertEquals(expectedFiles, dataFiles.length); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - Assert.assertFalse(fs.exists(new Path(dataFile.path().toString()))); - } - } - } - - @Test - public void testCompleteFiles() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "a")); - taskWriter.write(SimpleDataUtil.createRowData(2, "b")); - taskWriter.write(SimpleDataUtil.createRowData(3, "c")); - taskWriter.write(SimpleDataUtil.createRowData(4, "d")); - - DataFile[] dataFiles = taskWriter.dataFiles(); - int expectedFiles = partitioned ? 4 : 1; - Assert.assertEquals(expectedFiles, dataFiles.length); - - dataFiles = taskWriter.dataFiles(); - Assert.assertEquals(expectedFiles, dataFiles.length); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - Assert.assertTrue(fs.exists(new Path(dataFile.path().toString()))); - } - - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. - SimpleDataUtil.assertTableRecords( - path, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"), - SimpleDataUtil.createRecord(4, "d"))); - } - } - - @Test - public void testRollingWithTargetFileSize() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(4)) { - List rows = Lists.newArrayListWithCapacity(8000); - List records = Lists.newArrayListWithCapacity(8000); - for (int i = 0; i < 2000; i++) { - for (String data : new String[] {"a", "b", "c", "d"}) { - rows.add(SimpleDataUtil.createRowData(i, data)); - records.add(SimpleDataUtil.createRecord(i, data)); - } - } - - for (RowData row : rows) { - taskWriter.write(row); - } - - DataFile[] dataFiles = taskWriter.dataFiles(); - Assert.assertEquals(8, dataFiles.length); - - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. - SimpleDataUtil.assertTableRecords(path, records); - } - } - - @Test - public void testRandomData() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - Iterable rows = RandomRowData.generate(SimpleDataUtil.SCHEMA, 100, 1996); - for (RowData row : rows) { - taskWriter.write(row); - } - - taskWriter.close(); - DataFile[] dataFiles = taskWriter.dataFiles(); - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. 
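For orientation, the write/close/commit cycle that the TaskWriter tests above drive can be condensed to roughly the following sketch (using the same helpers as the deleted test):

try (TaskWriter<RowData> taskWriter = createTaskWriter(TARGET_FILE_SIZE)) {
  taskWriter.write(SimpleDataUtil.createRowData(1, "a"));
  taskWriter.close();                        // completes the open data files
  DataFile[] dataFiles = taskWriter.dataFiles();

  AppendFiles append = table.newAppend();    // commit the completed files to the Iceberg table
  for (DataFile dataFile : dataFiles) {
    append.appendFile(dataFile);
  }
  append.commit();
}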
- SimpleDataUtil.assertTableRows(path, Lists.newArrayList(rows)); - } - } - - private TaskWriter createTaskWriter(long targetFileSize) { - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), - targetFileSize, - format, - null, - false); - taskWriterFactory.initialize(1, 1); - return taskWriterFactory.create(); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java deleted file mode 100644 index b0be3daf7b49..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; -import java.util.stream.Stream; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.utils.TableSchemaUtils; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public class BoundedTableFactory implements DynamicTableSourceFactory { - private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); - private static final Map>> DATA_SETS = Maps.newHashMap(); - - private static final ConfigOption DATA_ID = - 
ConfigOptions.key("data-id").stringType().noDefaultValue(); - - public static String registerDataSet(List> dataSet) { - String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); - DATA_SETS.put(dataSetId, dataSet); - return dataSetId; - } - - public static void clearDataSets() { - DATA_SETS.clear(); - } - - @Override - public DynamicTableSource createDynamicTableSource(Context context) { - TableSchema tableSchema = - TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); - - Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); - String dataId = configuration.getString(DATA_ID); - Preconditions.checkArgument( - DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); - - return new BoundedTableSource(DATA_SETS.get(dataId), tableSchema); - } - - @Override - public String factoryIdentifier() { - return "BoundedSource"; - } - - @Override - public Set> requiredOptions() { - return ImmutableSet.of(); - } - - @Override - public Set> optionalOptions() { - return ImmutableSet.of(DATA_ID); - } - - private static class BoundedTableSource implements ScanTableSource { - - private final List> elementsPerCheckpoint; - private final TableSchema tableSchema; - - private BoundedTableSource(List> elementsPerCheckpoint, TableSchema tableSchema) { - this.elementsPerCheckpoint = elementsPerCheckpoint; - this.tableSchema = tableSchema; - } - - private BoundedTableSource(BoundedTableSource toCopy) { - this.elementsPerCheckpoint = toCopy.elementsPerCheckpoint; - this.tableSchema = toCopy.tableSchema; - } - - @Override - public ChangelogMode getChangelogMode() { - Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); - - // Add the INSERT row kind by default. - ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { - builder.addContainedKind(RowKind.DELETE); - } - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_BEFORE)) { - builder.addContainedKind(RowKind.UPDATE_BEFORE); - } - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_AFTER)) { - builder.addContainedKind(RowKind.UPDATE_AFTER); - } - - return builder.build(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - return new DataStreamScanProvider() { - @Override - public DataStream produceDataStream(StreamExecutionEnvironment env) { - boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); - SourceFunction source = - new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); - - RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); - // Converter to convert the Row to RowData. 
- DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(tableSchema.getFieldDataTypes()); - - return env.addSource(source, new RowTypeInfo(tableSchema.getFieldTypes())) - .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); - } - - @Override - public boolean isBounded() { - return true; - } - }; - } - - @Override - public DynamicTableSource copy() { - return new BoundedTableSource(this); - } - - @Override - public String asSummaryString() { - return "Bounded test table source"; - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java deleted file mode 100644 index 7b435d059845..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.common.state.CheckpointListener; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing - * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from - * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to - * complete. 5) ... - * - *
Util all the list from elementsPerCheckpoint are exhausted. - */ -public final class BoundedTestSource implements SourceFunction, CheckpointListener { - - private final List> elementsPerCheckpoint; - private final boolean checkpointEnabled; - private volatile boolean running = true; - - private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); - - /** Emits all those elements in several checkpoints. */ - public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { - this.elementsPerCheckpoint = elementsPerCheckpoint; - this.checkpointEnabled = checkpointEnabled; - } - - public BoundedTestSource(List> elementsPerCheckpoint) { - this(elementsPerCheckpoint, true); - } - - /** Emits all those elements in a single checkpoint. */ - public BoundedTestSource(T... elements) { - this(Collections.singletonList(Arrays.asList(elements))); - } - - @Override - public void run(SourceContext ctx) throws Exception { - if (!checkpointEnabled) { - Preconditions.checkArgument( - elementsPerCheckpoint.size() <= 1, - "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); - elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); - return; - } - - for (List elements : elementsPerCheckpoint) { - - final int checkpointToAwait; - synchronized (ctx.getCheckpointLock()) { - // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of - // delta should not - // affect the final table records because we only need to make sure that there will be - // exactly - // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original - // elementsPerCheckpoint. - // Even if the checkpoints that emitted results are not continuous, the correctness of the - // data should not be - // affected in the end. Setting the delta to be 2 is introducing the variable that produce - // un-continuous - // checkpoints that emit the records buffer from elementsPerCheckpoints. - checkpointToAwait = numCheckpointsComplete.get() + 2; - for (T element : elements) { - ctx.collect(element); - } - } - - synchronized (ctx.getCheckpointLock()) { - while (running && numCheckpointsComplete.get() < checkpointToAwait) { - ctx.getCheckpointLock().wait(1); - } - } - } - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - numCheckpointsComplete.incrementAndGet(); - } - - @Override - public void cancel() { - running = false; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java deleted file mode 100644 index 4f210abff729..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
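A sketch of how the removed BoundedTestSource is typically wired into a job (element values illustrative): each inner list is emitted, then the source blocks until a checkpoint completes before emitting the next list.

List<List<RowData>> elementsPerCheckpoint = ImmutableList.of(
    ImmutableList.of(SimpleDataUtil.createRowData(1, "a")),
    ImmutableList.of(SimpleDataUtil.createRowData(2, "b")));

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(400);   // with checkpointing disabled, only a single element list is allowed

DataStream<RowData> stream = env.addSource(
    new BoundedTestSource<>(elementsPerCheckpoint),
    FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(SimpleDataUtil.SCHEMA)));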
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.FlinkTestBase; -import org.apache.iceberg.flink.MiniClusterResource; -import org.junit.After; -import org.junit.Rule; -import org.junit.rules.TestName; - -public class ChangeLogTableTestBase extends FlinkTestBase { - private volatile TableEnvironment tEnv = null; - - @Rule public TestName name = new TestName(); - - @After - public void clean() { - sql("DROP TABLE IF EXISTS %s", name.getMethodName()); - BoundedTableFactory.clearDataSets(); - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings settings = - EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build(); - - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(400) - .setMaxParallelism(1) - .setParallelism(1); - - tEnv = StreamTableEnvironment.create(env, settings); - } - } - } - return tEnv; - } - - protected static Row insertRow(Object... values) { - return Row.ofKind(RowKind.INSERT, values); - } - - protected static Row deleteRow(Object... values) { - return Row.ofKind(RowKind.DELETE, values); - } - - protected static Row updateBeforeRow(Object... values) { - return Row.ofKind(RowKind.UPDATE_BEFORE, values); - } - - protected static Row updateAfterRow(Object... values) { - return Row.ofKind(RowKind.UPDATE_AFTER, values); - } - - protected static List listJoin(List> lists) { - return lists.stream().flatMap(List::stream).collect(Collectors.toList()); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java deleted file mode 100644 index 7b5f9328694c..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import org.apache.flink.types.Row; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Streams; -import org.junit.Assert; -import org.junit.Test; - -public class TestBoundedTableFactory extends ChangeLogTableTestBase { - - @Test - public void testEmptyDataSet() { - String table = name.getMethodName(); - List> emptyDataSet = ImmutableList.of(); - - String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); - sql( - "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", - table, dataId); - - Assert.assertEquals( - "Should have caught empty change log set.", - ImmutableList.of(), - sql("SELECT * FROM %s", table)); - } - - @Test - public void testBoundedTableFactory() { - String table = name.getMethodName(); - List> dataSet = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb")), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd")), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd"))); - - String dataId = BoundedTableFactory.registerDataSet(dataSet); - sql( - "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", - table, dataId); - - List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); - Assert.assertEquals( - "Should have the expected change log events.", rowSet, sql("SELECT * FROM %s", table)); - - Assert.assertEquals( - "Should have the expected change log events", - rowSet.stream() - .filter(r -> Objects.equals(r.getField(1), "aaa")) - .collect(Collectors.toList()), - sql("SELECT * FROM %s WHERE data='aaa'", table)); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java deleted file mode 100644 index 69b8ac269267..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
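The removed BoundedSource connector is driven from SQL by registering a changelog data set and pointing a table at its id, roughly as follows (table name illustrative, sql() is the FlinkTestBase helper):

List<List<Row>> dataSet = ImmutableList.of(
    ImmutableList.of(insertRow(1, "aaa"), deleteRow(1, "aaa"), insertRow(1, "bbb")));
String dataId = BoundedTableFactory.registerDataSet(dataSet);

sql("CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')",
    "changelog_table", dataId);
List<Row> results = sql("SELECT * FROM %s", "changelog_table");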
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.Test; - -/** Test {@link FlinkInputFormat}. */ -public class TestFlinkInputFormat extends TestFlinkSource { - - public TestFlinkInputFormat(String fileFormat) { - super(fileFormat); - } - - @Override - public void before() throws IOException { - super.before(); - } - - @Override - protected List run( - FlinkSource.Builder formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) - throws Exception { - return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); - } - - @Test - public void testNestedProjection() throws Exception { - Schema schema = - new Schema( - required(1, "data", Types.StringType.get()), - required( - 2, - "nested", - Types.StructType.of( - Types.NestedField.required(3, "f1", Types.StringType.get()), - Types.NestedField.required(4, "f2", Types.StringType.get()), - Types.NestedField.required(5, "f3", Types.LongType.get()))), - required(6, "id", Types.LongType.get())); - - Table table = catalog.createTable(TableIdentifier.of("default", "t"), schema); - - List writeRecords = RandomGenericData.generate(schema, 2, 0L); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(writeRecords); - - // Schema: [data, nested[f1, f2, f3], id] - // Projection: [nested.f2, data] - // The Flink SQL output: [f2, data] - // The FlinkInputFormat output: [nested[f2], data] - - TableSchema projectedSchema = - TableSchema.builder() - .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) - .field("data", DataTypes.STRING()) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : writeRecords) { - Row nested = Row.of(((Record) record.get(1)).get(1)); - expected.add(Row.of(nested, record.get(0))); - } - - TestHelpers.assertRows(result, expected); - } - - @Test - public void testBasicProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); - - Table table = 
catalog.createTable(TableIdentifier.of("default", "t"), writeSchema); - - List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(writeRecords); - - TableSchema projectedSchema = - TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("data", DataTypes.STRING()) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : writeRecords) { - expected.add(Row.of(record.get(0), record.get(1))); - } - - TestHelpers.assertRows(result, expected); - } - - private List runFormat(FlinkInputFormat inputFormat) throws IOException { - RowType rowType = FlinkSchemaUtil.convert(inputFormat.projectedSchema()); - return TestHelpers.readRows(inputFormat, rowType); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java deleted file mode 100644 index b2f914e51299..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.Map; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkInputFormatReaderDeletes extends TestFlinkReaderDeletesBase { - - public TestFlinkInputFormatReaderDeletes(FileFormat inputFormat) { - super(inputFormat); - } - - @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... 
columns) - throws IOException { - Schema projected = testTable.schema().select(columns); - RowType rowType = FlinkSchemaUtil.convert(projected); - Map properties = Maps.newHashMap(); - properties.put( - CatalogProperties.WAREHOUSE_LOCATION, - hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); - properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put( - CatalogProperties.CLIENT_POOL_SIZE, - Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); - CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - FlinkInputFormat inputFormat = - FlinkSource.forRowData() - .tableLoader( - TableLoader.fromCatalog( - hiveCatalogLoader, TableIdentifier.of("default", tableName))) - .project(FlinkSchemaUtil.toSchema(rowType)) - .buildFormat(); - - StructLikeSet set = StructLikeSet.create(projected.asStruct()); - TestHelpers.readRowData(inputFormat, rowType) - .forEach( - rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); - - return set; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java deleted file mode 100644 index 3a7ec96cb1d6..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
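The read path these tests cover builds a FlinkInputFormat from a TableLoader plus an optional projection; condensed from the deleted code:

TableLoader tableLoader = TableLoader.fromHadoopTable(location);   // 'location' as prepared by the test setup
TableSchema projectedSchema = TableSchema.builder()
    .field("id", DataTypes.BIGINT())
    .field("data", DataTypes.STRING())
    .build();

FlinkInputFormat inputFormat = FlinkSource.forRowData()
    .tableLoader(tableLoader)
    .project(projectedSchema)
    .buildFormat();

RowType rowType = FlinkSchemaUtil.convert(inputFormat.projectedSchema());
List<Row> rows = TestHelpers.readRows(inputFormat, rowType);        // iceberg-flink test helper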
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.TestMergingMetrics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; - -public class TestFlinkMergingMetrics extends TestMergingMetrics { - - public TestFlinkMergingMetrics(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileAppender writeAndGetAppender(List records) throws IOException { - RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); - - FileAppender appender = - new FlinkAppenderFactory( - SCHEMA, flinkSchema, ImmutableMap.of(), PartitionSpec.unpartitioned()) - .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); - try (FileAppender fileAppender = appender) { - records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); - } - return appender; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java deleted file mode 100644 index 987d79fed3c3..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Map; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.DeleteReadTests; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { - - @ClassRule public static final TemporaryFolder TEMP_FOLDER = new TemporaryFolder(); - - protected static String databaseName = "default"; - - protected static HiveConf hiveConf = null; - protected static HiveCatalog catalog = null; - private static TestHiveMetastore metastore = null; - - protected final FileFormat format; - - @Parameterized.Parameters(name = "fileFormat={0}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {FileFormat.PARQUET}, - new Object[] {FileFormat.AVRO}, - new Object[] {FileFormat.ORC} - }; - } - - TestFlinkReaderDeletesBase(FileFormat fileFormat) { - this.format = fileFormat; - } - - @BeforeClass - public static void startMetastore() { - metastore = new TestHiveMetastore(); - metastore.start(); - hiveConf = metastore.hiveConf(); - catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - } - - @AfterClass - public static void stopMetastore() throws Exception { - metastore.stop(); - catalog = null; - } - - @Override - protected Table createTable(String name, Schema schema, PartitionSpec spec) { - Map props = Maps.newHashMap(); - props.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - Table table = catalog.createTable(TableIdentifier.of(databaseName, name), schema, spec, props); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(2)); - - return table; - } - - @Override - protected void dropTable(String name) { - catalog.dropTable(TableIdentifier.of(databaseName, name)); - } - - @Override - protected boolean expectPruned() { - return false; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java deleted file mode 100644 index 206f3f8beb5f..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
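The delete-read base class above requires a format version 2 table; the upgrade it performs on the freshly created table is simply:

TableOperations ops = ((BaseTable) table).operations();
TableMetadata meta = ops.current();
ops.commit(meta, meta.upgradeToFormatVersion(2));   // row-level deletes require format v2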
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.File; -import java.io.IOException; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.MiniClusterResource; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public abstract class TestFlinkScan { - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - - protected HadoopCatalog catalog; - protected String warehouse; - protected String location; - - // parametrized variables - protected final FileFormat fileFormat; - - @Parameterized.Parameters(name = "format={0}") - public static Object[] parameters() { - return new Object[] {"avro", "parquet", "orc"}; - } - - TestFlinkScan(String fileFormat) { - this.fileFormat = FileFormat.fromString(fileFormat); - } - - @Before - public void before() throws IOException { - File warehouseFile = TEMPORARY_FOLDER.newFolder(); - Assert.assertTrue(warehouseFile.delete()); - // before variables - warehouse = "file:" + warehouseFile; - Configuration conf = new Configuration(); - catalog = new HadoopCatalog(conf, warehouse); - location = String.format("%s/%s/%s", warehouse, TestFixtures.DATABASE, TestFixtures.TABLE); - } - - @After - public void after() throws IOException {} - - protected TableLoader tableLoader() { - return 
TableLoader.fromHadoopTable(location); - } - - protected abstract List runWithProjection(String... projected) throws Exception; - - protected abstract List runWithFilter(Expression filter, String sqlFilter) throws Exception; - - protected abstract List runWithOptions(Map options) throws Exception; - - protected abstract List run() throws Exception; - - @Test - public void testUnpartitionedTable() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @Test - public void testPartitionedTable() throws Exception { - Table table = - catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - expectedRecords.get(0).set(2, "2020-03-20"); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @Test - public void testProjection() throws Exception { - Table table = - catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); - assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); - } - - @Test - public void testIdentityPartitionProjections() throws Exception { - Schema logSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get())); - PartitionSpec spec = - PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); - - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, logSchema, spec); - List inputRecords = RandomGenericData.generate(logSchema, 10, 0L); - - int idx = 0; - AppendFiles append = table.newAppend(); - for (Record record : inputRecords) { - record.set(1, "2020-03-2" + idx); - record.set(2, Integer.toString(idx)); - append.appendFile( - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) - .writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), - ImmutableList.of(record))); - idx += 1; - } - append.commit(); - - // individual fields - validateIdentityPartitionProjections(table, Collections.singletonList("dt"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("level"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("message"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("id"), inputRecords); - // field pairs - validateIdentityPartitionProjections(table, Arrays.asList("dt", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("dt", "level"), inputRecords); - // 
out-of-order pairs - validateIdentityPartitionProjections(table, Arrays.asList("message", "dt"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); - // out-of-order triplets - validateIdentityPartitionProjections( - table, Arrays.asList("dt", "level", "message"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("level", "dt", "message"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("dt", "message", "level"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("level", "message", "dt"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("message", "dt", "level"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("message", "level", "dt"), inputRecords); - } - - private void validateIdentityPartitionProjections( - Table table, List projectedFields, List inputRecords) throws Exception { - List rows = runWithProjection(projectedFields.toArray(new String[0])); - - for (int pos = 0; pos < inputRecords.size(); pos++) { - Record inputRecord = inputRecords.get(pos); - Row actualRecord = rows.get(pos); - - for (int i = 0; i < projectedFields.size(); i++) { - String name = projectedFields.get(i); - Assert.assertEquals( - "Projected field " + name + " should match", - inputRecord.getField(name), - actualRecord.getField(i)); - } - } - } - - @Test - public void testSnapshotReads() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecords); - long snapshotId = table.currentSnapshot().snapshotId(); - - long timestampMillis = table.currentSnapshot().timestampMillis(); - - // produce another timestamp - waitUntilAfter(timestampMillis); - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L)); - - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), - expectedRecords, - TestFixtures.SCHEMA); - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), - expectedRecords, - TestFixtures.SCHEMA); - } - - @Test - public void testIncrementalRead() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - - List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(records1); - long snapshotId1 = table.currentSnapshot().snapshotId(); - - // snapshot 2 - List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(records2); - - List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(records3); - long snapshotId3 = table.currentSnapshot().snapshotId(); - - // snapshot 4 - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L)); - - List expected2 = Lists.newArrayList(); - expected2.addAll(records2); - expected2.addAll(records3); - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - 
.put("end-snapshot-id", Long.toString(snapshotId3)) - .build()), - expected2, - TestFixtures.SCHEMA); - } - - @Test - public void testFilterExp() throws Exception { - Table table = - catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - expectedRecords.get(0).set(2, "2020-03-20"); - expectedRecords.get(1).set(2, "2020-03-20"); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = - helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - DataFile dataFile2 = - helper.writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - helper.appendToTable(dataFile1, dataFile2); - TestHelpers.assertRecords( - runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), - expectedRecords, - TestFixtures.SCHEMA); - } - - @Test - public void testPartitionTypes() throws Exception { - Schema typesSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), - Types.NestedField.optional(3, "str", Types.StringType.get()), - Types.NestedField.optional(4, "binary", Types.BinaryType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); - PartitionSpec spec = - PartitionSpec.builderFor(typesSchema) - .identity("decimal") - .identity("str") - .identity("binary") - .identity("date") - .identity("time") - .identity("timestamp") - .build(); - - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); - List records = RandomGenericData.generate(typesSchema, 10, 0L); - GenericAppenderHelper appender = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = - org.apache.iceberg.TestHelpers.Row.of( - record.get(1), - record.get(2), - record.get(3), - record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), - record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), - record.get(6) == null - ? null - : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); - appender.appendToTable(partition, Collections.singletonList(record)); - } - - TestHelpers.assertRecords(run(), records, typesSchema); - } - - @Test - public void testCustomizedFlinkDataTypes() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required( - 1, - "map", - Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), - Types.NestedField.required( - 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, schema); - List records = RandomGenericData.generate(schema, 10, 0L); - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - helper.appendToTable(records); - TestHelpers.assertRecords(run(), records, schema); - } - - private static void assertRows(List results, Row... 
expected) { - TestHelpers.assertRows(results, Arrays.asList(expected)); - } - - private static void waitUntilAfter(long timestampMillis) { - long current = System.currentTimeMillis(); - while (current <= timestampMillis) { - current = System.currentTimeMillis(); - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java deleted file mode 100644 index c1a813417b46..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.PipelineOptions; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.common.DynMethods; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.Assert; -import org.junit.Test; - -/** Test Flink SELECT SQLs. 
*/ -public class TestFlinkScanSql extends TestFlinkSource { - - private volatile TableEnvironment tEnv; - - public TestFlinkScanSql(String fileFormat) { - super(fileFormat); - } - - @Override - public void before() throws IOException { - super.before(); - sql( - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - warehouse); - sql("use catalog iceberg_catalog"); - getTableEnv() - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - private TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - TableEnvironment.create( - EnvironmentSettings.newInstance().useBlinkPlanner().inBatchMode().build()); - } - } - } - return tEnv; - } - - @Override - protected List run( - FlinkSource.Builder formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) { - String select = String.join(",", sqlSelectedFields); - - StringBuilder builder = new StringBuilder(); - sqlOptions.forEach((key, value) -> builder.append(optionToKv(key, value)).append(",")); - - String optionStr = builder.toString(); - - if (optionStr.endsWith(",")) { - optionStr = optionStr.substring(0, optionStr.length() - 1); - } - - if (!optionStr.isEmpty()) { - optionStr = String.format("/*+ OPTIONS(%s)*/", optionStr); - } - - return sql("select %s from t %s %s", select, optionStr, sqlFilter); - } - - @Test - public void testResiduals() throws Exception { - Table table = - catalog.createTable( - TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); - - List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - writeRecords.get(0).set(1, 123L); - writeRecords.get(0).set(2, "2020-03-20"); - writeRecords.get(1).set(1, 456L); - writeRecords.get(1).set(2, "2020-03-20"); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.add(writeRecords.get(0)); - - DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); - DataFile dataFile2 = - helper.writeFile( - TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - helper.appendToTable(dataFile1, dataFile2); - - Expression filter = - Expressions.and(Expressions.equal("dt", "2020-03-20"), Expressions.equal("id", 123)); - org.apache.iceberg.flink.TestHelpers.assertRecords( - runWithFilter(filter, "where dt='2020-03-20' and id=123"), - expectedRecords, - TestFixtures.SCHEMA); - } - - @Test - public void testInferedParallelism() throws IOException { - Table table = - catalog.createTable( - TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); - - TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); - FlinkInputFormat flinkInputFormat = - FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat(); - ScanContext scanContext = ScanContext.builder().build(); - - // Empty table, infer parallelism should be at least 1 - int parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext); - Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = - helper.writeFile( - TestHelpers.Row.of("2020-03-20", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - DataFile 
dataFile2 = - helper.writeFile( - TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - helper.appendToTable(dataFile1, dataFile2); - - // Make sure to generate 2 CombinedScanTasks - long maxFileLen = Math.max(dataFile1.fileSizeInBytes(), dataFile2.fileSizeInBytes()); - sql( - "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", - maxFileLen); - - // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits - // num : 2 - parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext); - Assert.assertEquals("Should produce the expected parallelism.", 2, parallelism); - - // 2 splits and limit is 1 , max infer parallelism is default 100, - // which is greater than splits num and limit, the parallelism is the limit value : 1 - parallelism = - FlinkSource.forRowData() - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build()); - Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - - // 2 splits and max infer parallelism is 1 (max < splits num), the parallelism is 1 - Configuration configuration = new Configuration(); - configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1); - parallelism = - FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().build()); - Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - - // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : - // 1 - parallelism = - FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); - Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - - // 2 splits, infer parallelism is disabled, the parallelism is flink default parallelism 1 - configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - parallelism = - FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); - Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - } - - @Test - public void testInferParallelismWithGlobalSetting() throws IOException { - Configuration cfg = tEnv.getConfig().getConfiguration(); - cfg.set(PipelineOptions.MAX_PARALLELISM, 1); - - Table table = - catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, null); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - List expectedRecords = Lists.newArrayList(); - long maxFileLen = 0; - for (int i = 0; i < 5; i++) { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); - DataFile dataFile = helper.writeFile(null, records); - helper.appendToTable(dataFile); - expectedRecords.addAll(records); - maxFileLen = Math.max(dataFile.fileSizeInBytes(), maxFileLen); - } - - // Make sure to generate multiple CombinedScanTasks - sql( - "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", - maxFileLen); - - List results = run(null, Maps.newHashMap(), "", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - } - - @Test - public void testExposeLocality() throws Exception { - Table table = - catalog.createTable( - TableIdentifier.of("default", "t"), 
TestFixtures.SCHEMA, TestFixtures.SPEC); - - TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); - expectedRecords.forEach(expectedRecord -> expectedRecord.set(2, "2020-03-20")); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - helper.appendToTable(dataFile); - - // test sql api - Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.setBoolean( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); - - List results = sql("select * from t"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - - // test table api - tableConf.setBoolean( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); - FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); - - Boolean localityEnabled = - DynMethods.builder("localityEnabled") - .hiddenImpl(builder.getClass()) - .build() - .invoke(builder); - // When running with CI or local, `localityEnabled` will be false even if this configuration is - // enabled - Assert.assertFalse("Expose split locality info should be false.", localityEnabled); - - results = run(builder, Maps.newHashMap(), "where dt='2020-03-20'", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - } - - private List sql(String query, Object... args) { - TableResult tableResult = getTableEnv().executeSql(String.format(query, args)); - try (CloseableIterator iter = tableResult.collect()) { - List results = Lists.newArrayList(iter); - return results; - } catch (Exception e) { - throw new RuntimeException("Failed to collect table result", e); - } - } - - private String optionToKv(String key, Object value) { - return "'" + key + "'='" + value + "'"; - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java deleted file mode 100644 index 3a01952cd9ec..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.types.Row; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public abstract class TestFlinkSource extends TestFlinkScan { - - TestFlinkSource(String fileFormat) { - super(fileFormat); - } - - @Override - protected List runWithProjection(String... projected) throws Exception { - TableSchema.Builder builder = TableSchema.builder(); - TableSchema schema = - FlinkSchemaUtil.toSchema( - FlinkSchemaUtil.convert( - catalog.loadTable(TableIdentifier.of("default", "t")).schema())); - for (String field : projected) { - TableColumn column = schema.getTableColumn(field).get(); - builder.field(column.getName(), column.getType()); - } - return run(FlinkSource.forRowData().project(builder.build()), Maps.newHashMap(), "", projected); - } - - @Override - protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { - FlinkSource.Builder builder = - FlinkSource.forRowData().filters(Collections.singletonList(filter)); - return run(builder, Maps.newHashMap(), sqlFilter, "*"); - } - - @Override - protected List runWithOptions(Map options) throws Exception { - FlinkSource.Builder builder = FlinkSource.forRowData(); - Optional.ofNullable(options.get("snapshot-id")) - .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("start-snapshot-id")) - .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("end-snapshot-id")) - .ifPresent(value -> builder.endSnapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("as-of-timestamp")) - .ifPresent(value -> builder.asOfTimestamp(Long.parseLong(value))); - return run(builder, options, "", "*"); - } - - @Override - protected List run() throws Exception { - return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); - } - - protected abstract List run( - FlinkSource.Builder formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) - throws Exception; -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java deleted file mode 100644 index bc63e4a0b282..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestProjectMetaColumn { - - @Rule public final TemporaryFolder folder = new TemporaryFolder(); - private final FileFormat format; - - @Parameterized.Parameters(name = "fileFormat={0}") - public static Iterable parameters() { - return Lists.newArrayList( - new Object[] {FileFormat.PARQUET}, - new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.AVRO}); - } - - public TestProjectMetaColumn(FileFormat format) { - this.format = format; - } - - private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { - // Create the table with given format version. - String location = folder.getRoot().getAbsolutePath(); - Table table = - SimpleDataUtil.createTable( - location, - ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), - false); - - List rows = - Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createInsert(2, "BBB"), - SimpleDataUtil.createInsert(3, "CCC")); - writeAndCommit(table, ImmutableList.of(), false, rows); - - FlinkInputFormat input = - FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); - - List results = Lists.newArrayList(); - TestHelpers.readRowData( - input, - rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof GenericRowData); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); - - // Assert the results. - TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); - } - - @Test - public void testV1SkipToRemoveMetaColumn() throws IOException { - testSkipToRemoveMetaColumn(1); - } - - @Test - public void testV2SkipToRemoveMetaColumn() throws IOException { - testSkipToRemoveMetaColumn(2); - } - - @Test - public void testV2RemoveMetaColumn() throws Exception { - // Create the v2 table. 
- String location = folder.getRoot().getAbsolutePath(); - Table table = - SimpleDataUtil.createTable( - location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); - - List rows = - Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createDelete(1, "AAA"), - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB")); - int eqFieldId = table.schema().findField("data").fieldId(); - writeAndCommit(table, ImmutableList.of(eqFieldId), true, rows); - - FlinkInputFormat input = - FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); - - List results = Lists.newArrayList(); - TestHelpers.readRowData( - input, - rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof RowDataProjection); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); - - // Assert the results. - TestHelpers.assertRows( - ImmutableList.of( - SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), - results, - SimpleDataUtil.ROW_TYPE); - } - - private void writeAndCommit( - Table table, List eqFieldIds, boolean upsert, List rows) - throws IOException { - TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); - try (TaskWriter io = writer) { - for (RowData row : rows) { - io.write(row); - } - } - - RowDelta delta = table.newRowDelta(); - WriteResult result = writer.complete(); - - for (DataFile dataFile : result.dataFiles()) { - delta.addRows(dataFile); - } - - for (DeleteFile deleteFile : result.deleteFiles()) { - delta.addDeletes(deleteFile); - } - - delta.commit(); - } - - private TaskWriter createTaskWriter( - Table table, List equalityFieldIds, boolean upsert) { - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - SimpleDataUtil.ROW_TYPE, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, - format, - equalityFieldIds, - upsert); - - taskWriterFactory.initialize(1, 1); - return taskWriterFactory.create(); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java deleted file mode 100644 index 1ab77b9b7039..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.streaming.api.TimeCharacteristic; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkCatalogTestBase; -import org.apache.iceberg.flink.MiniClusterResource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -public class TestStreamScanSql extends FlinkCatalogTestBase { - private static final String TABLE = "test_table"; - private static final FileFormat FORMAT = FileFormat.PARQUET; - - private TableEnvironment tEnv; - - public TestStreamScanSql(String catalogName, Namespace baseNamespace) { - super(catalogName, baseNamespace); - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = - EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode(); - - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); - env.enableCheckpointing(400); - - StreamTableEnvironment streamTableEnv = - StreamTableEnvironment.create(env, settingsBuilder.build()); - streamTableEnv - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - tEnv = streamTableEnv; - } - } - } - return tEnv; - } - - @Override - @Before - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @Override - @After - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - private void insertRows(String partition, Table table, Row... rows) throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, FORMAT, TEMPORARY_FOLDER); - - GenericRecord gRecord = GenericRecord.create(table.schema()); - List records = Lists.newArrayList(); - for (Row row : rows) { - records.add( - gRecord.copy( - "id", row.getField(0), - "data", row.getField(1), - "dt", row.getField(2))); - } - - if (partition != null) { - appender.appendToTable(TestHelpers.Row.of(partition, 0), records); - } else { - appender.appendToTable(records); - } - } - - private void insertRows(Table table, Row... 
rows) throws IOException { - insertRows(null, table, rows); - } - - private void assertRows(List expectedRows, Iterator iterator) { - for (Row expectedRow : expectedRows) { - Assert.assertTrue("Should have more records", iterator.hasNext()); - - Row actualRow = iterator.next(); - Assert.assertEquals("Should have expected fields", 3, actualRow.getArity()); - Assert.assertEquals( - "Should have expected id", expectedRow.getField(0), actualRow.getField(0)); - Assert.assertEquals( - "Should have expected data", expectedRow.getField(1), actualRow.getField(1)); - Assert.assertEquals( - "Should have expected dt", expectedRow.getField(2), actualRow.getField(2)); - } - } - - @Test - public void testUnPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - - Row row1 = Row.of(1, "aaa", "2021-01-01"); - insertRows(table, row1); - assertRows(ImmutableList.of(row1), iterator); - - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row2); - assertRows(ImmutableList.of(row2), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @Test - public void testPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - Row row1 = Row.of(1, "aaa", "2021-01-01"); - insertRows("2021-01-01", table, row1); - assertRows(ImmutableList.of(row1), iterator); - - Row row2 = Row.of(2, "bbb", "2021-01-02"); - insertRows("2021-01-02", table, row2); - assertRows(ImmutableList.of(row2), iterator); - - Row row3 = Row.of(1, "aaa", "2021-01-02"); - insertRows("2021-01-02", table, row3); - assertRows(ImmutableList.of(row3), iterator); - - Row row4 = Row.of(2, "bbb", "2021-01-01"); - insertRows("2021-01-01", table, row4); - assertRows(ImmutableList.of(row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @Test - public void testConsumeFromBeginning() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1, row2); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - assertRows(ImmutableList.of(row1, row2), iterator); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - insertRows(table, row3); - assertRows(ImmutableList.of(row3), iterator); - - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row4); - assertRows(ImmutableList.of(row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @Test - public void testConsumeFromStartSnapshotId() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots. 
- Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1); - insertRows(table, row2); - - long startSnapshotId = table.currentSnapshot().snapshotId(); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row3, row4); - - TableResult result = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " - + "'start-snapshot-id'='%d')*/", - TABLE, startSnapshotId); - try (CloseableIterator iterator = result.collect()) { - // The row2 in start snapshot will be excluded. - assertRows(ImmutableList.of(row3, row4), iterator); - - Row row5 = Row.of(5, "eee", "2021-01-01"); - Row row6 = Row.of(6, "fff", "2021-01-01"); - insertRows(table, row5, row6); - assertRows(ImmutableList.of(row5, row6), iterator); - - Row row7 = Row.of(7, "ggg", "2021-01-01"); - insertRows(table, row7); - assertRows(ImmutableList.of(row7), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java deleted file mode 100644 index 91b11cbeb24d..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.File; -import java.io.IOException; -import java.time.Duration; -import java.util.List; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.TimeUnit; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.StreamSource; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestStreamingMonitorFunction extends TableTestBase { - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; - private static final long WAIT_TIME_MILLIS = 10 * 1000L; - - @Parameterized.Parameters(name = "FormatVersion={0}") - public static Iterable parameters() { - return ImmutableList.of(new Object[] {1}, new Object[] {2}); - } - - public TestStreamingMonitorFunction(int formatVersion) { - super(formatVersion); - } - - @Before - @Override - public void setupTable() throws IOException { - this.tableDir = temp.newFolder(); - this.metadataDir = new File(tableDir, "metadata"); - Assert.assertTrue(tableDir.delete()); - - // Construct the iceberg table. - table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - private void runSourceFunctionInTask( - TestSourceContext sourceContext, StreamingMonitorFunction function) { - Thread task = - new Thread( - () -> { - try { - function.run(sourceContext); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - task.start(); - } - - @Test - public void testConsumeWithoutStartSnapshotId() throws Exception { - List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - CountDownLatch latch = new CountDownLatch(1); - TestSourceContext sourceContext = new TestSourceContext(latch); - runSourceFunctionInTask(sourceContext, function); - - Assert.assertTrue( - "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); - Thread.sleep(1000L); - - // Stop the stream task. 
- function.close(); - - Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @Test - public void testConsumeFromStartSnapshotId() throws Exception { - // Commit the first five transactions. - generateRecordsAndCommitTxn(5); - long startSnapshotId = table.currentSnapshot().snapshotId(); - - // Commit the next five transactions. - List> recordsList = generateRecordsAndCommitTxn(5); - - ScanContext scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .startSnapshotId(startSnapshotId) - .build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - CountDownLatch latch = new CountDownLatch(1); - TestSourceContext sourceContext = new TestSourceContext(latch); - runSourceFunctionInTask(sourceContext, function); - - Assert.assertTrue( - "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); - Thread.sleep(1000L); - - // Stop the stream task. - function.close(); - - Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @Test - public void testCheckpointRestore() throws Exception { - List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); - - StreamingMonitorFunction func = createFunction(scanContext); - OperatorSubtaskState state; - try (AbstractStreamOperatorTestHarness harness = createHarness(func)) { - harness.setup(); - harness.open(); - - CountDownLatch latch = new CountDownLatch(1); - TestSourceContext sourceContext = new TestSourceContext(latch); - runSourceFunctionInTask(sourceContext, func); - - Assert.assertTrue( - "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); - Thread.sleep(1000L); - - state = harness.snapshot(1, 1); - - // Stop the stream task. - func.close(); - - Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - - List> newRecordsList = generateRecordsAndCommitTxn(10); - StreamingMonitorFunction newFunc = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(newFunc)) { - harness.setup(); - // Recover to process the remaining snapshots. - harness.initializeState(state); - harness.open(); - - CountDownLatch latch = new CountDownLatch(1); - TestSourceContext sourceContext = new TestSourceContext(latch); - runSourceFunctionInTask(sourceContext, newFunc); - - Assert.assertTrue( - "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); - Thread.sleep(1000L); - - // Stop the stream task. 
- newFunc.close(); - - Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); - } - } - - @Test - public void testInvalidMaxPlanningSnapshotCount() { - ScanContext scanContext1 = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(0) - .build(); - - AssertHelpers.assertThrows( - "Should throw exception because of invalid config", - IllegalArgumentException.class, - "must be greater than zero", - () -> { - createFunction(scanContext1); - return null; - }); - - ScanContext scanContext2 = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(-10) - .build(); - - AssertHelpers.assertThrows( - "Should throw exception because of invalid config", - IllegalArgumentException.class, - "must be greater than zero", - () -> { - createFunction(scanContext2); - return null; - }); - } - - @Test - public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { - generateRecordsAndCommitTxn(10); - - // Use the oldest snapshot as starting to avoid the initial case. - long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); - - ScanContext scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .splitSize(1000L) - .startSnapshotId(oldestSnapshotId) - .maxPlanningSnapshotCount(Integer.MAX_VALUE) - .build(); - - FlinkInputSplit[] expectedSplits = FlinkSplitGenerator.createInputSplits(table, scanContext); - - Assert.assertEquals("should produce 9 splits", 9, expectedSplits.length); - - // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the - // total splits number - for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { - scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(500)) - .startSnapshotId(oldestSnapshotId) - .splitSize(1000L) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - CountDownLatch latch = new CountDownLatch(1); - TestSourceContext sourceContext = new TestSourceContext(latch); - function.sourceContext(sourceContext); - function.monitorAndForwardSplits(); - - if (maxPlanningSnapshotCount < 10) { - Assert.assertEquals( - "Should produce same splits as max-planning-snapshot-count", - maxPlanningSnapshotCount, - sourceContext.splits.size()); - } - } - } - } - - private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { - List> expectedRecords = Lists.newArrayList(); - for (int i = 0; i < commitTimes; i++) { - List records = RandomGenericData.generate(SCHEMA, 100, 0L); - expectedRecords.add(records); - - // Commit those records to iceberg table. 
- writeRecords(records); - } - return expectedRecords; - } - - private void writeRecords(List records) throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); - appender.appendToTable(records); - } - - private StreamingMonitorFunction createFunction(ScanContext scanContext) { - return new StreamingMonitorFunction( - TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); - } - - private AbstractStreamOperatorTestHarness createHarness( - StreamingMonitorFunction function) throws Exception { - StreamSource streamSource = - new StreamSource<>(function); - return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); - } - - private class TestSourceContext implements SourceFunction.SourceContext { - private final List splits = Lists.newArrayList(); - private final Object checkpointLock = new Object(); - private final CountDownLatch latch; - - TestSourceContext(CountDownLatch latch) { - this.latch = latch; - } - - @Override - public void collect(FlinkInputSplit element) { - splits.add(element); - latch.countDown(); - } - - @Override - public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { - collect(element); - } - - @Override - public void emitWatermark(Watermark mark) {} - - @Override - public void markAsTemporarilyIdle() {} - - @Override - public Object getCheckpointLock() { - return checkpointLock; - } - - @Override - public void close() {} - - private List toRows() throws IOException { - FlinkInputFormat format = - FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - List rows = Lists.newArrayList(); - for (FlinkInputSplit split : splits) { - format.open(split); - - RowData element = null; - try { - while (!format.reachedEnd()) { - element = format.nextRecord(element); - rows.add(Row.of(element.getInt(0), element.getString(1).toString())); - } - } finally { - format.close(); - } - } - - return rows; - } - } -} diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java deleted file mode 100644 index e51afaa22f9b..000000000000 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java +++ /dev/null @@ -1,284 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.File; -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.TimeCharacteristic; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor; -import org.apache.flink.streaming.runtime.tasks.mailbox.MailboxDefaultAction; -import org.apache.flink.streaming.runtime.tasks.mailbox.SteppingMailboxProcessor; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestStreamingReaderOperator extends TableTestBase { - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; - - @Parameterized.Parameters(name = "FormatVersion={0}") - public static Iterable parameters() { - return ImmutableList.of(new Object[] {1}, new Object[] {2}); - } - - public TestStreamingReaderOperator(int formatVersion) { - super(formatVersion); - } - - @Before - @Override - public void setupTable() throws IOException { - this.tableDir = temp.newFolder(); - this.metadataDir = new File(tableDir, "metadata"); - Assert.assertTrue(tableDir.delete()); - - // Construct the iceberg table. - table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - @Test - public void testProcessAllRecords() throws Exception { - List> expectedRecords = generateRecordsAndCommitTxn(10); - - List splits = generateSplits(); - Assert.assertEquals("Should have 10 splits", 10, splits.size()); - - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - SteppingMailboxProcessor processor = createLocalMailbox(harness); - - List expected = Lists.newArrayList(); - for (int i = 0; i < splits.size(); i++) { - // Process this element to enqueue to mail-box. - harness.processElement(splits.get(i), -1); - - // Run the mail-box once to read all records from the given split. - Assert.assertTrue("Should processed 1 split", processor.runMailboxStep()); - - // Assert the output has expected elements. 
- expected.addAll(expectedRecords.get(i)); - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - } - } - - @Test - public void testTriggerCheckpoint() throws Exception { - // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading - // records from - // split1. - List> expectedRecords = generateRecordsAndCommitTxn(3); - - List splits = generateSplits(); - Assert.assertEquals("Should have 3 splits", 3, splits.size()); - - long timestamp = 0; - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - SteppingMailboxProcessor processor = createLocalMailbox(harness); - - harness.processElement(splits.get(0), ++timestamp); - harness.processElement(splits.get(1), ++timestamp); - harness.processElement(splits.get(2), ++timestamp); - - // Trigger snapshot state, it will start to work once all records from split0 are read. - processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); - - Assert.assertTrue("Should have processed the split0", processor.runMailboxStep()); - Assert.assertTrue( - "Should have processed the snapshot state action", processor.runMailboxStep()); - - TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA); - - // Read records from split1. - Assert.assertTrue("Should have processed the split1", processor.runMailboxStep()); - - // Read records from split2. - Assert.assertTrue("Should have processed the split2", processor.runMailboxStep()); - - TestHelpers.assertRecords( - readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); - } - } - - @Test - public void testCheckpointRestore() throws Exception { - List> expectedRecords = generateRecordsAndCommitTxn(15); - - List splits = generateSplits(); - Assert.assertEquals("Should have 10 splits", 15, splits.size()); - - OperatorSubtaskState state; - List expected = Lists.newArrayList(); - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - // Enqueue all the splits. - for (FlinkInputSplit split : splits) { - harness.processElement(split, -1); - } - - // Read all records from the first five splits. - SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); - for (int i = 0; i < 5; i++) { - expected.addAll(expectedRecords.get(i)); - Assert.assertTrue("Should have processed the split#" + i, localMailbox.runMailboxStep()); - - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - - // Snapshot state now, there're 10 splits left in the state. - state = harness.snapshot(1, 1); - } - - expected.clear(); - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - // Recover to process the remaining splits. - harness.initializeState(state); - harness.open(); - - SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); - - for (int i = 5; i < 10; i++) { - expected.addAll(expectedRecords.get(i)); - Assert.assertTrue("Should have processed one split#" + i, localMailbox.runMailboxStep()); - - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - - // Let's process the final 5 splits now. 
- for (int i = 10; i < 15; i++) { - expected.addAll(expectedRecords.get(i)); - harness.processElement(splits.get(i), 1); - - Assert.assertTrue("Should have processed the split#" + i, localMailbox.runMailboxStep()); - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - } - } - - private List readOutputValues( - OneInputStreamOperatorTestHarness harness) { - List results = Lists.newArrayList(); - for (RowData rowData : harness.extractOutputValues()) { - results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); - } - return results; - } - - private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { - List> expectedRecords = Lists.newArrayList(); - for (int i = 0; i < commitTimes; i++) { - List records = RandomGenericData.generate(SCHEMA, 100, 0L); - expectedRecords.add(records); - - // Commit those records to iceberg table. - writeRecords(records); - } - return expectedRecords; - } - - private void writeRecords(List records) throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); - appender.appendToTable(records); - } - - private List generateSplits() { - List inputSplits = Lists.newArrayList(); - - List snapshotIds = SnapshotUtil.currentAncestorIds(table); - for (int i = snapshotIds.size() - 1; i >= 0; i--) { - ScanContext scanContext; - if (i == snapshotIds.size() - 1) { - // Generate the splits from the first snapshot. - scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); - } else { - // Generate the splits between the previous snapshot and current snapshot. - scanContext = - ScanContext.builder() - .startSnapshotId(snapshotIds.get(i + 1)) - .endSnapshotId(snapshotIds.get(i)) - .build(); - } - - Collections.addAll(inputSplits, FlinkSplitGenerator.createInputSplits(table, scanContext)); - } - - return inputSplits; - } - - private OneInputStreamOperatorTestHarness createReader() - throws Exception { - // This input format is used to opening the emitted split. - FlinkInputFormat inputFormat = - FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - OneInputStreamOperatorFactory factory = - StreamingReaderOperator.factory(inputFormat); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); - harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); - - return harness; - } - - private SteppingMailboxProcessor createLocalMailbox( - OneInputStreamOperatorTestHarness harness) { - return new SteppingMailboxProcessor( - MailboxDefaultAction.Controller::suspendDefaultAction, - harness.getTaskMailbox(), - StreamTaskActionExecutor.IMMEDIATE); - } -} diff --git a/flink/v1.13/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.13/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory deleted file mode 100644 index 47a3c94aa991..000000000000 --- a/flink/v1.13/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.iceberg.flink.source.BoundedTableFactory diff --git a/gradle.properties b/gradle.properties index 7fd7b3d35d75..de3cd8b056d4 100644 --- a/gradle.properties +++ b/gradle.properties @@ -16,7 +16,7 @@ jmhOutputPath=build/reports/jmh/human-readable-output.txt jmhIncludeRegex=.* systemProp.defaultFlinkVersions=1.15 -systemProp.knownFlinkVersions=1.13,1.14,1.15 +systemProp.knownFlinkVersions=1.14,1.15 systemProp.defaultHiveVersions=2 systemProp.knownHiveVersions=2,3 systemProp.defaultSparkVersions=3.3 diff --git a/settings.gradle b/settings.gradle index 3f6e2cf036e9..bb23aec52d51 100644 --- a/settings.gradle +++ b/settings.gradle @@ -88,15 +88,6 @@ if (!flinkVersions.isEmpty()) { project(':flink').name = 'iceberg-flink' } -if (flinkVersions.contains("1.13")) { - include ":iceberg-flink:flink-1.13" - include ":iceberg-flink:flink-runtime-1.13" - project(":iceberg-flink:flink-1.13").projectDir = file('flink/v1.13/flink') - project(":iceberg-flink:flink-1.13").name = "iceberg-flink-1.13" - project(":iceberg-flink:flink-runtime-1.13").projectDir = file('flink/v1.13/flink-runtime') - project(":iceberg-flink:flink-runtime-1.13").name = "iceberg-flink-runtime-1.13" -} - if (flinkVersions.contains("1.14")) { include ":iceberg-flink:flink-1.14" include ":iceberg-flink:flink-runtime-1.14"