Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions datafusion-examples/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,6 @@ rust-version = { workspace = true }
[lints]
workspace = true

[[example]]
name = "dataframe_to_s3"
path = "examples/external_dependency/dataframe-to-s3.rs"

[[example]]
name = "query_aws_s3"
path = "examples/external_dependency/query-aws-s3.rs"

[dev-dependencies]
arrow = { workspace = true }
# arrow_schema is required for record_batch! macro :sad:
Expand Down
4 changes: 2 additions & 2 deletions datafusion-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ cargo run --example dataframe
- [`examples/custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs): Run queries against a custom datasource (TableProvider)
- [`examples/custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs): Implement custom casting rules to adapt file schemas
- [`examples/custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs): Write data to a custom file format
- [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3
- [`examples/external_dependency/dataframe_to_s3.rs`](examples/external_dependency/dataframe_to_s3.rs): Run a query using a DataFrame against a parquet file from S3 and write the results back to S3
- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file.
- [`examples/builtin_functions/date_time`](examples/builtin_functions/date_time.rs): Examples of date-time related functions and queries
- [`default_column_values.rs`](examples/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter
Expand All @@ -82,7 +82,7 @@ cargo run --example dataframe
- [`examples/query_planning/planner_api.rs`](examples/query_planning/planner_api.rs) APIs to manipulate logical and physical plans
- [`examples/query_planning/pruning.rs`](examples/query_planning/pruning.rs): Use pruning to rule out files based on statistics
- [`examples/query_planning/thread_pools.rs`](examples/query_planning/thread_pools.rs): Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages and shows how to implement memory-aware ExecutionPlan with memory reservation and spilling
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
- [`examples/external_dependency/query_aws_s3.rs`](examples/external_dependency/query_aws_s3.rs): Configure `object_store` and run a query against files stored in AWS S3
- [`examples/data_io/query_http_csv.rs`](examples/data_io/query_http_csv.rs): Configure `object_store` and run a query against files via HTTP
- [`examples/builtin_functions/regexp.rs`](examples/builtin_functions/regexp.rs): Examples of using regular expression functions
- [`examples/data_io/remote_catalog.rs`](examples/data_io/remote_catalog.rs): Examples of interfacing with a remote catalog (e.g. over a network)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

//! See `main.rs` for how to run it.

use std::env;
use std::sync::Arc;

Expand All @@ -28,14 +30,18 @@ use datafusion::prelude::*;
use object_store::aws::AmazonS3Builder;
use url::Url;

/// This example demonstrates querying data from AmazonS3 and writing
/// the result of a query back to AmazonS3
#[tokio::main]
async fn main() -> Result<()> {
/// This example demonstrates querying data from Amazon S3 and writing
/// the result of a query back to Amazon S3.
///
/// The following environment variables must be defined:
///
/// - AWS_ACCESS_KEY_ID
/// - AWS_SECRET_ACCESS_KEY
pub async fn dataframe_to_s3() -> Result<()> {
// create local execution context
let ctx = SessionContext::new();

//enter region and bucket to which your credentials have GET and PUT access
// enter region and bucket to which your credentials have GET and PUT access
let region = "<bucket-region-here>";
let bucket_name = "<bucket-name-here>";

Expand Down Expand Up @@ -66,13 +72,13 @@ async fn main() -> Result<()> {
.write_parquet(&out_path, DataFrameWriteOptions::new(), None)
.await?;

//write as JSON to s3
// write as JSON to s3
let json_out = format!("s3://{bucket_name}/json_out");
df.clone()
.write_json(&json_out, DataFrameWriteOptions::new(), None)
.await?;

//write as csv to s3
// write as csv to s3
let csv_out = format!("s3://{bucket_name}/csv_out");
df.write_csv(&csv_out, DataFrameWriteOptions::new(), None)
.await?;
Expand Down
93 changes: 93 additions & 0 deletions datafusion-examples/examples/external_dependency/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! # Amazon S3 examples
//!
//! These examples demonstrate how to work with data from Amazon S3.
//!
//! ## Usage
//! ```bash
//! cargo run --example external_dependency -- [dataframe_to_s3|query_aws_s3]
//! ```
//!
//! Each subcommand runs a corresponding example:
//! - `dataframe_to_s3` — run a query using a DataFrame against a parquet file from AWS S3 and write the results back to AWS S3
//! - `query_aws_s3` — configure `object_store` and run a query against files stored in AWS S3

mod dataframe_to_s3;
mod query_aws_s3;

use std::str::FromStr;

use datafusion::error::{DataFusionError, Result};

/// The examples that can be selected from this binary's command line.
///
/// Each variant corresponds to one subcommand accepted by `main`.
// Derives follow the usual convention for small field-less enums: they make
// the type printable in error messages/tests and freely copyable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ExampleKind {
    /// Query a parquet file on S3 with a DataFrame and write results back to S3.
    DataframeToS3,
    /// Configure `object_store` and query files stored in AWS S3.
    QueryAwsS3,
}

impl AsRef<str> for ExampleKind {
fn as_ref(&self) -> &str {
match self {
Self::DataframeToS3 => "dataframe_to_s3",
Self::QueryAwsS3 => "query_aws_s3",
}
}
}

impl FromStr for ExampleKind {
    type Err = DataFusionError;

    /// Parses a CLI subcommand name into the matching [`ExampleKind`].
    ///
    /// Any string other than a known subcommand yields an
    /// [`DataFusionError::Execution`] naming the unknown example.
    fn from_str(s: &str) -> Result<Self> {
        match s {
            "dataframe_to_s3" => Ok(Self::DataframeToS3),
            "query_aws_s3" => Ok(Self::QueryAwsS3),
            other => Err(DataFusionError::Execution(format!(
                "Unknown example: {other}"
            ))),
        }
    }
}

impl ExampleKind {
    /// Every selectable example, in the order shown in the usage string.
    const ALL: [Self; 2] = [Self::DataframeToS3, Self::QueryAwsS3];

    /// Name of this example binary (`cargo run --example <name>`).
    const EXAMPLE_NAME: &str = "external_dependency";

    /// Subcommand names of all examples, in declaration order.
    fn variants() -> Vec<&'static str> {
        let mut names = Vec::with_capacity(Self::ALL.len());
        for kind in Self::ALL.iter() {
            names.push(kind.as_ref());
        }
        names
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let usage = format!(
        "Usage: cargo run --example {} -- [{}]",
        ExampleKind::EXAMPLE_NAME,
        ExampleKind::variants().join("|")
    );

    // Print usage when no subcommand is given.
    let arg = std::env::args().nth(1).ok_or_else(|| {
        eprintln!("{usage}");
        DataFusionError::Execution("Missing argument".to_string())
    })?;

    // Also print usage when the subcommand is unrecognized; previously the
    // parse error was propagated without showing the user the valid options.
    let example = arg.parse::<ExampleKind>().map_err(|e| {
        eprintln!("{usage}");
        e
    })?;

    match example {
        ExampleKind::DataframeToS3 => dataframe_to_s3::dataframe_to_s3().await?,
        ExampleKind::QueryAwsS3 => query_aws_s3::query_aws_s3().await?,
    }

    Ok(())
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,22 @@
// specific language governing permissions and limitations
// under the License.

//! See `main.rs` for how to run it.

use datafusion::error::Result;
use datafusion::prelude::*;
use object_store::aws::AmazonS3Builder;
use std::env;
use std::sync::Arc;
use url::Url;

/// This example demonstrates querying data in an S3 bucket.
/// This example demonstrates querying data in a public S3 bucket
/// (the NYC TLC open dataset: `s3://nyc-tlc`).
///
/// The following environment variables must be defined:
///
/// - AWS_ACCESS_KEY_ID
/// - AWS_SECRET_ACCESS_KEY
#[tokio::main]
async fn main() -> Result<()> {
/// - `AWS_ACCESS_KEY_ID`
/// - `AWS_SECRET_ACCESS_KEY`
pub async fn query_aws_s3() -> Result<()> {
let ctx = SessionContext::new();

// the region must be set to the region where the bucket exists until the following
Expand Down