diff --git a/Makefile b/Makefile
index a9def81b1..437987c25 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@
 # golangci-lint version (keep in sync with CI and README)
 GOLANGCI_LINT_VERSION := v2.8.0
 
-.PHONY: test lint lint-install integration-setup integration-test integration-scanner integration-io integration-rest integration-spark docs-gen
+.PHONY: test lint lint-install integration-setup integration-test integration-scanner integration-io integration-rest integration-spark integration-hive integration-hadoop integration-hadoop-clean integration-down integration-logs integration-env docs-gen
 
 test:
 	go test -v ./...
@@ -32,12 +32,23 @@ lint-install:
 	go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@$(GOLANGCI_LINT_VERSION)
 
 integration-setup:
-	docker compose -f internal/recipe/docker-compose.yml up -d
-	sleep 10
+	mkdir -p /tmp/iceberg-hadoop-warehouse
+	docker compose -f internal/recipe/docker-compose.yml up -d --wait
 	docker compose -f internal/recipe/docker-compose.yml exec -T spark-iceberg ipython ./provision.py
-	sleep 10
 
-integration-test: integration-scanner integration-io integration-rest integration-spark integration-hive
+integration-down:
+	docker compose -f internal/recipe/docker-compose.yml down -v
+
+integration-logs:
+	docker compose -f internal/recipe/docker-compose.yml logs
+
+integration-env:
+	@echo "export AWS_S3_ENDPOINT=http://$$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' minio):9000"
+	@echo "export AWS_REGION=us-east-1"
+	@echo "export SPARK_CONTAINER_ID=$$(docker ps -qf 'name=spark-iceberg')"
+	@echo "export DOCKER_API_VERSION=$$(docker version -f '{{.Server.APIVersion}}')"
+
+integration-test: integration-scanner integration-io integration-rest integration-spark integration-hive integration-hadoop
 
 integration-scanner:
 	go test -tags=integration -v -run="^TestScanner" ./table
@@ -53,3 +64,9 @@ integration-spark:
 
 integration-hive:
 	go test -tags=integration -v ./catalog/hive/...
+
+integration-hadoop:
+	go test -tags=integration -v -run="^TestHadoopIntegration" ./catalog/hadoop/...
+
+integration-hadoop-clean:
+	rm -rf /tmp/iceberg-hadoop-warehouse/*
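Usage note (commentary, not part of the patch): integration-env only prints export statements, so it is presumably meant to be eval'd into the calling shell before the Go integration tests run. A minimal sketch, assuming the minio and spark-iceberg container names used elsewhere in this patch:

    eval "$(make integration-env)"
    make integration-hadoop
    make integration-hadoop-clean   # wipe /tmp/iceberg-hadoop-warehouse between runs

If the compose services are ever renamed, the docker inspect and docker ps -qf 'name=spark-iceberg' filters inside integration-env must be updated to match.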
diff --git a/internal/recipe/Dockerfile b/internal/recipe/Dockerfile
index 07937a292..073a276b1 100644
--- a/internal/recipe/Dockerfile
+++ b/internal/recipe/Dockerfile
@@ -21,6 +21,7 @@ RUN pip3 install pyarrow
 
 COPY provision.py .
 COPY validation.py .
+COPY hadoop_validation.py .
 
 ENTRYPOINT ["./entrypoint.sh"]
 CMD ["notebook"]
diff --git a/internal/recipe/docker-compose.yml b/internal/recipe/docker-compose.yml
index 26456d8bc..ab8bd4d16 100644
--- a/internal/recipe/docker-compose.yml
+++ b/internal/recipe/docker-compose.yml
@@ -17,16 +17,17 @@ services:
   spark-iceberg:
     image: pyiceberg-spark
-    container_name: spark-iceberg 
+    container_name: spark-iceberg
     build: .
     networks:
       iceberg_net:
-    depends_on: 
+    depends_on:
       - rest
       - minio
     volumes:
       - ./warehouse:/home/iceberg/warehouse
       - ./notebooks:/home/iceberg/notebooks/notebooks
+      - /tmp/iceberg-hadoop-warehouse:/tmp/iceberg-hadoop-warehouse
     environment:
       - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
       - AWS_REGION=us-east-1
@@ -36,6 +37,12 @@
       - 8080:8080
       - 10000:10000
       - 10001:10001
+    healthcheck:
+      test: ["CMD-SHELL", "curl -sf http://localhost:8080 || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 12
+      start_period: 30s
   rest:
     image: apache/iceberg-rest-fixture:1.10.1
     container_name: iceberg-rest
diff --git a/internal/recipe/hadoop_validation.py b/internal/recipe/hadoop_validation.py
new file mode 100644
index 000000000..b3f90a08c
--- /dev/null
+++ b/internal/recipe/hadoop_validation.py
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+from pyspark.sql import SparkSession
+
+spark = (
+    SparkSession.builder
+    .config("spark.sql.catalog.hadoop_test", "org.apache.iceberg.spark.SparkCatalog")
+    .config("spark.sql.catalog.hadoop_test.type", "hadoop")
+    .config("spark.sql.catalog.hadoop_test.warehouse", "/tmp/iceberg-hadoop-warehouse")
+    .getOrCreate()
+)
+
+
+def runSQL(sql):
+    result = spark.sql(sql)
+    result.show(truncate=False)
+
+    return result
+
+
+def runSQLAssert(sql):
+    """Execute SQL and assert at least one row is returned."""
+    result = runSQL(sql)
+    count = result.count()
+    assert count > 0, f"Expected at least one row, got {count} for query: {sql}"
+    print(f"OK: {count} row(s) returned")
+
+    return result
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--sql", type=str, required=True, help="Validation SQL statement to execute")
+    parser.add_argument("--assert-rows", action="store_true",
+                        help="Assert that the query returns at least one row")
+    args = parser.parse_args()
+
+    # --sql is required, so argparse has already rejected an empty invocation
+    if args.assert_rows:
+        runSQLAssert(args.sql)
+    else:
+        runSQL(args.sql)
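For completeness, a sketch of how hadoop_validation.py might be driven from the host once the stack is up; the spark-submit entry point and the hadoop_test.db.taxis table are illustrative assumptions, while the --sql and --assert-rows flags and the hadoop_test catalog name come from the script itself:

    docker compose -f internal/recipe/docker-compose.yml exec -T spark-iceberg \
      spark-submit hadoop_validation.py \
      --sql "SELECT * FROM hadoop_test.db.taxis" --assert-rows

Because the script's warehouse path matches the bind mount added to docker-compose.yml, tables written to /tmp/iceberg-hadoop-warehouse by the Go hadoop catalog tests are visible to Spark inside the container, which is what the TestHadoopIntegration suite can lean on for cross-validation.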