diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 61711f8472eb..9905d5685f5d 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -35,14 +35,6 @@ rust-version = { workspace = true } [lints] workspace = true -[[example]] -name = "dataframe_to_s3" -path = "examples/external_dependency/dataframe-to-s3.rs" - -[[example]] -name = "query_aws_s3" -path = "examples/external_dependency/query-aws-s3.rs" - [dev-dependencies] arrow = { workspace = true } # arrow_schema is required for record_batch! macro :sad: diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 1fd6d9f48273..b74622cf957c 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -60,7 +60,7 @@ cargo run --example dataframe - [`examples/custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs): Run queries against a custom datasource (TableProvider) - [`examples/custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs): Implement custom casting rules to adapt file schemas - [`examples/custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs): Write data to a custom file format -- [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 +- [`examples/external_dependency/dataframe_to_s3.rs`](examples/external_dependency/dataframe_to_s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 - [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file. 
- [`examples/builtin_functions/date_time`](examples/builtin_functions/date_time.rs): Examples of date-time related functions and queries - [`default_column_values.rs`](examples/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter @@ -82,7 +82,7 @@ cargo run --example dataframe - [`examples/query_planning/planner_api.rs`](examples/query_planning/planner_api.rs) APIs to manipulate logical and physical plans - [`examples/query_planning/pruning.rs`](examples/query_planning/pruning.rs): Use pruning to rule out files based on statistics - [`examples/query_planning/thread_pools.rs`](examples/query_planning/thread_pools.rs): Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages and shows how to implement memory-aware ExecutionPlan with memory reservation and spilling -- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 +- [`examples/external_dependency/query_aws_s3.rs`](examples/external_dependency/query_aws_s3.rs): Configure `object_store` and run a query against files stored in AWS S3 - [`examples/data_io/query_http_csv.rs`](examples/data_io/query_http_csv.rs): Configure `object_store` and run a query against files via HTTP - [`examples/builtin_functions/regexp.rs`](examples/builtin_functions/regexp.rs): Examples of using regular expression functions - [`examples/data_io/remote_catalog.rs`](examples/data_io/remote_catalog.rs): Examples of interfacing with a remote catalog (e.g. 
over a network) diff --git a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs b/datafusion-examples/examples/external_dependency/dataframe_to_s3.rs similarity index 87% rename from datafusion-examples/examples/external_dependency/dataframe-to-s3.rs rename to datafusion-examples/examples/external_dependency/dataframe_to_s3.rs index e75ba5dd5328..d7fb17ba57d9 100644 --- a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs +++ b/datafusion-examples/examples/external_dependency/dataframe_to_s3.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use std::env; use std::sync::Arc; @@ -28,14 +30,18 @@ use datafusion::prelude::*; use object_store::aws::AmazonS3Builder; use url::Url; -/// This example demonstrates querying data from AmazonS3 and writing -/// the result of a query back to AmazonS3 -#[tokio::main] -async fn main() -> Result<()> { +/// This example demonstrates querying data from Amazon S3 and writing +/// the result of a query back to Amazon S3. 
+/// +/// The following environment variables must be defined: +/// +/// - AWS_ACCESS_KEY_ID +/// - AWS_SECRET_ACCESS_KEY +pub async fn dataframe_to_s3() -> Result<()> { // create local execution context let ctx = SessionContext::new(); - //enter region and bucket to which your credentials have GET and PUT access + // enter region and bucket to which your credentials have GET and PUT access let region = ""; let bucket_name = ""; @@ -66,13 +72,13 @@ async fn main() -> Result<()> { .write_parquet(&out_path, DataFrameWriteOptions::new(), None) .await?; - //write as JSON to s3 + // write as JSON to s3 let json_out = format!("s3://{bucket_name}/json_out"); df.clone() .write_json(&json_out, DataFrameWriteOptions::new(), None) .await?; - //write as csv to s3 + // write as csv to s3 let csv_out = format!("s3://{bucket_name}/csv_out"); df.write_csv(&csv_out, DataFrameWriteOptions::new(), None) .await?; diff --git a/datafusion-examples/examples/external_dependency/main.rs b/datafusion-examples/examples/external_dependency/main.rs new file mode 100644 index 000000000000..f6fc1fb0ef14 --- /dev/null +++ b/datafusion-examples/examples/external_dependency/main.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +//! # Amazon S3 examples +//! +//! These examples demonstrate how to work with data from Amazon S3. +//! +//! ## Usage +//! ```bash +//! cargo run --example external_dependency -- [dataframe_to_s3|query_aws_s3] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `dataframe_to_s3` — run a query using a DataFrame against a parquet file from AWS S3 and write back to AWS S3 +//! - `query_aws_s3` — configure `object_store` and run a query against files stored in AWS S3 + +mod dataframe_to_s3; +mod query_aws_s3; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + DataframeToS3, + QueryAwsS3, +} + +impl AsRef<str> for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::DataframeToS3 => "dataframe_to_s3", + Self::QueryAwsS3 => "query_aws_s3", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "dataframe_to_s3" => Ok(Self::DataframeToS3), + "query_aws_s3" => Ok(Self::QueryAwsS3), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 2] = [Self::DataframeToS3, Self::QueryAwsS3]; + + const EXAMPLE_NAME: &str = "external_dependency"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::<ExampleKind>()?
{ + ExampleKind::DataframeToS3 => dataframe_to_s3::dataframe_to_s3().await?, + ExampleKind::QueryAwsS3 => query_aws_s3::query_aws_s3().await?, + } + + Ok(()) +} diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query_aws_s3.rs similarity index 90% rename from datafusion-examples/examples/external_dependency/query-aws-s3.rs rename to datafusion-examples/examples/external_dependency/query_aws_s3.rs index cd0b4562d5f2..63507bb3eed1 100644 --- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs +++ b/datafusion-examples/examples/external_dependency/query_aws_s3.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use datafusion::error::Result; use datafusion::prelude::*; use object_store::aws::AmazonS3Builder; @@ -22,14 +24,13 @@ use std::env; use std::sync::Arc; use url::Url; -/// This example demonstrates querying data in an S3 bucket. +/// This example demonstrates querying data in a public S3 bucket +/// (the NYC TLC open dataset: `s3://nyc-tlc`). /// /// The following environment variables must be defined: -/// -/// - AWS_ACCESS_KEY_ID -/// - AWS_SECRET_ACCESS_KEY -#[tokio::main] -async fn main() -> Result<()> { +/// - `AWS_ACCESS_KEY_ID` +/// - `AWS_SECRET_ACCESS_KEY` +pub async fn query_aws_s3() -> Result<()> { let ctx = SessionContext::new(); // the region must be set to the region where the bucket exists until the following