Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Compressed CSV/JSON support #3642

Merged
merged 5 commits into from
Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions datafusion-examples/examples/parquet_sql_multiple_files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use datafusion::datasource::file_format::parquet::{
ParquetFormat, DEFAULT_PARQUET_EXTENSION,
};
use datafusion::datasource::file_format::file_type::{FileType, GetExt};
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;
use datafusion::error::Result;
use datafusion::prelude::*;
Expand All @@ -35,7 +34,7 @@ async fn main() -> Result<()> {
// Configure listing options
let file_format = ParquetFormat::default().with_enable_pruning(true);
let listing_options = ListingOptions {
file_extension: DEFAULT_PARQUET_EXTENSION.to_owned(),
file_extension: FileType::PARQUET.get_ext(),
format: Arc::new(file_format),
table_partition_cols: vec![],
collect_stat: true,
Expand Down
6 changes: 6 additions & 0 deletions datafusion/common/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,12 @@ impl Display for DataFusionError {

impl error::Error for DataFusionError {}

impl From<DataFusionError> for io::Error {
    /// Wraps a [`DataFusionError`] in an [`io::Error`], preserving the
    /// original error as the source via `io::ErrorKind::Other`.
    fn from(e: DataFusionError) -> Self {
        // `Other` is the catch-all kind for errors that do not map onto a
        // specific OS-level I/O failure.
        Self::new(io::ErrorKind::Other, e)
    }
}

#[cfg(test)]
mod test {
use crate::error::DataFusionError;
Expand Down
5 changes: 5 additions & 0 deletions datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
apache-avro = { version = "0.14", optional = true }
arrow = { version = "24.0.0", features = ["prettyprint"] }
async-compression = { version = "0.3.14", features = ["bzip2", "gzip", "futures-io", "tokio"] }
async-trait = "0.1.41"
bytes = "1.1"
bzip2 = "0.4.3"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any ideas how many dependencies we are adding? I guess some of them are already transitive dependencies through parquet.

chrono = { version = "0.4", default-features = false }
datafusion-common = { path = "../common", version = "13.0.0", features = ["parquet", "object_store"] }
datafusion-expr = { path = "../expr", version = "13.0.0" }
Expand All @@ -67,6 +69,7 @@ datafusion-optimizer = { path = "../optimizer", version = "13.0.0" }
datafusion-physical-expr = { path = "../physical-expr", version = "13.0.0" }
datafusion-row = { path = "../row", version = "13.0.0" }
datafusion-sql = { path = "../sql", version = "13.0.0" }
flate2 = "1.0.24"
futures = "0.3"
glob = "0.3.0"
hashbrown = { version = "0.12", features = ["raw"] }
Expand All @@ -89,6 +92,7 @@ sqlparser = "0.25"
tempfile = "3"
tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
tokio-stream = "0.1"
tokio-util = { version = "0.7.4", features = ["io"] }
url = "2.2"
uuid = { version = "1.0", features = ["v4"] }

Expand All @@ -101,6 +105,7 @@ ctor = "0.1.22"
doc-comment = "0.3"
env_logger = "0.9"
fuzz-utils = { path = "fuzz-utils" }
rstest = "0.15.0"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Dandandan what do we think about adding this new testing library?


[[bench]]
harness = false
Expand Down
27 changes: 25 additions & 2 deletions datafusion/core/src/datasource/file_format/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,21 @@
//! CSV format abstractions

use std::any::Any;

use std::sync::Arc;

use arrow::datatypes::Schema;
use arrow::{self, datatypes::SchemaRef};
use async_trait::async_trait;
use bytes::Buf;

use datafusion_common::DataFusionError;

use futures::TryFutureExt;
use object_store::{ObjectMeta, ObjectStore};

use super::FileFormat;
use crate::datasource::file_format::file_type::FileCompressionType;
use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
use crate::error::Result;
use crate::logical_plan::Expr;
Expand All @@ -43,6 +48,7 @@ pub struct CsvFormat {
has_header: bool,
delimiter: u8,
schema_infer_max_rec: Option<usize>,
file_compression_type: FileCompressionType,
}

impl Default for CsvFormat {
Expand All @@ -51,6 +57,7 @@ impl Default for CsvFormat {
schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD),
has_header: true,
delimiter: b',',
file_compression_type: FileCompressionType::UNCOMPRESSED,
}
}
}
Expand Down Expand Up @@ -82,6 +89,16 @@ impl CsvFormat {
self
}

/// Set a `FileCompressionType` of CSV
/// - defaults to `FileCompressionType::UNCOMPRESSED`
pub fn with_file_compression_type(
mut self,
file_compression_type: FileCompressionType,
) -> Self {
self.file_compression_type = file_compression_type;
self
}

/// The delimiter character.
pub fn delimiter(&self) -> u8 {
self.delimiter
Expand Down Expand Up @@ -110,8 +127,9 @@ impl FileFormat for CsvFormat {
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;

let decoder = self.file_compression_type.convert_read(data.reader());
let (schema, records_read) = arrow::csv::reader::infer_reader_schema(
&mut data.as_ref(),
decoder,
self.delimiter,
Some(records_to_read),
self.has_header,
Expand Down Expand Up @@ -144,7 +162,12 @@ impl FileFormat for CsvFormat {
conf: FileScanConfig,
_filters: &[Expr],
) -> Result<Arc<dyn ExecutionPlan>> {
let exec = CsvExec::new(conf, self.has_header, self.delimiter);
let exec = CsvExec::new(
conf,
self.has_header,
self.delimiter,
self.file_compression_type.to_owned(),
);
Ok(Arc::new(exec))
}
}
Expand Down
Loading