Compressed CSV/JSON support (#3642)
* Compression text support

* Fix the path joining issue on Windows test

* Debug code for Windows CI

* Utilize `std::path::Path`, instead of `url::Url`
Licht-T committed Oct 11, 2022
1 parent 58afdf7 commit b8a3a78
Showing 18 changed files with 976 additions and 177 deletions.
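
Taken together, these changes let DataFusion scan gzip- or bzip2-compressed CSV and JSON through the listing-table machinery. A minimal sketch of the resulting configuration surface, modeled on the Parquet example in the first file below (the ".csv.gz" extension literal and the `target_partitions` value are illustrative, not taken from this commit):

    use std::sync::Arc;

    use datafusion::datasource::file_format::csv::CsvFormat;
    use datafusion::datasource::file_format::file_type::FileCompressionType;
    use datafusion::datasource::listing::ListingOptions;

    /// Listing options for a directory of gzip-compressed CSV files.
    fn gzip_csv_options() -> ListingOptions {
        // CsvFormat now carries a FileCompressionType (see csv.rs below).
        let format = CsvFormat::default()
            .with_file_compression_type(FileCompressionType::GZIP);
        ListingOptions {
            file_extension: ".csv.gz".to_owned(),
            format: Arc::new(format),
            table_partition_cols: vec![],
            collect_stat: true,
            target_partitions: 1,
        }
    }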
7 changes: 3 additions & 4 deletions datafusion-examples/examples/parquet_sql_multiple_files.rs
@@ -15,9 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::datasource::file_format::parquet::{
-    ParquetFormat, DEFAULT_PARQUET_EXTENSION,
-};
+use datafusion::datasource::file_format::file_type::{FileType, GetExt};
+use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::ListingOptions;
 use datafusion::error::Result;
 use datafusion::prelude::*;
@@ -35,7 +34,7 @@ async fn main() -> Result<()> {
     // Configure listing options
     let file_format = ParquetFormat::default().with_enable_pruning(true);
     let listing_options = ListingOptions {
-        file_extension: DEFAULT_PARQUET_EXTENSION.to_owned(),
+        file_extension: FileType::PARQUET.get_ext(),
         format: Arc::new(file_format),
         table_partition_cols: vec![],
         collect_stat: true,
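
For context, the `GetExt` trait gives each `FileType` its canonical extension, replacing per-format constants like `DEFAULT_PARQUET_EXTENSION`. A quick check of the replacement call — the value named in the comment is an assumption, inferred from the constant this call replaces:

    use datafusion::datasource::file_format::file_type::{FileType, GetExt};

    fn main() {
        // Expected to print ".parquet", matching the old
        // DEFAULT_PARQUET_EXTENSION constant.
        println!("{}", FileType::PARQUET.get_ext());
    }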
6 changes: 6 additions & 0 deletions datafusion/common/src/error.rs
@@ -318,6 +318,12 @@ impl Display for DataFusionError {
 
 impl error::Error for DataFusionError {}
 
+impl From<DataFusionError> for io::Error {
+    fn from(e: DataFusionError) -> Self {
+        io::Error::new(io::ErrorKind::Other, e)
+    }
+}
+
 #[cfg(test)]
 mod test {
     use crate::error::DataFusionError;
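
The new `From` impl lets a `DataFusionError` cross an `io::Result` boundary with `?`, presumably because the stream adapters used for decompression speak `io::Error`. A small illustration (the function itself is hypothetical, not part of the commit):

    use std::io;

    use datafusion::error::DataFusionError;

    fn check() -> io::Result<()> {
        let res: Result<(), DataFusionError> =
            Err(DataFusionError::Plan("example".to_string()));
        // `?` converts via the new From<DataFusionError> for io::Error.
        res?;
        Ok(())
    }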
5 changes: 5 additions & 0 deletions datafusion/core/Cargo.toml
@@ -57,8 +57,10 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion
 ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
 apache-avro = { version = "0.14", optional = true }
 arrow = { version = "24.0.0", features = ["prettyprint"] }
+async-compression = { version = "0.3.14", features = ["bzip2", "gzip", "futures-io", "tokio"] }
 async-trait = "0.1.41"
 bytes = "1.1"
+bzip2 = "0.4.3"
 chrono = { version = "0.4", default-features = false }
 datafusion-common = { path = "../common", version = "13.0.0", features = ["parquet", "object_store"] }
 datafusion-expr = { path = "../expr", version = "13.0.0" }
@@ -67,6 +69,7 @@ datafusion-optimizer = { path = "../optimizer", version = "13.0.0" }
 datafusion-physical-expr = { path = "../physical-expr", version = "13.0.0" }
 datafusion-row = { path = "../row", version = "13.0.0" }
 datafusion-sql = { path = "../sql", version = "13.0.0" }
+flate2 = "1.0.24"
 futures = "0.3"
 glob = "0.3.0"
 hashbrown = { version = "0.12", features = ["raw"] }
@@ -90,6 +93,7 @@ sqlparser = "0.25"
 tempfile = "3"
 tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
 tokio-stream = "0.1"
+tokio-util = { version = "0.7.4", features = ["io"] }
 url = "2.2"
 uuid = { version = "1.0", features = ["v4"] }
 
@@ -102,6 +106,7 @@ ctor = "0.1.22"
 doc-comment = "0.3"
 env_logger = "0.9"
 fuzz-utils = { path = "fuzz-utils" }
+rstest = "0.15.0"
 
 [[bench]]
 harness = false
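
The dependency picture: `async-compression` supplies streaming gzip/bzip2 codecs for the scan path, `tokio-util`'s `io` feature bridges byte streams and async readers, `flate2`/`bzip2` presumably cover the synchronous read path used in schema inference, and `rstest` parameterizes the new tests. A self-contained sketch of the async mechanism these crates enable (not code from this commit):

    use std::io;

    use async_compression::tokio::bufread::GzipDecoder;
    use bytes::Bytes;
    use futures::Stream;
    use tokio::io::AsyncReadExt;
    use tokio_util::io::StreamReader;

    /// Decompress a gzip-encoded stream of byte chunks into memory.
    async fn decode_gzip(
        chunks: impl Stream<Item = io::Result<Bytes>> + Unpin,
    ) -> io::Result<Vec<u8>> {
        // StreamReader adapts the Stream into an AsyncBufRead...
        let reader = StreamReader::new(chunks);
        // ...and GzipDecoder inflates it incrementally as it is read.
        let mut decoder = GzipDecoder::new(reader);
        let mut out = Vec::new();
        decoder.read_to_end(&mut out).await?;
        Ok(out)
    }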
27 changes: 25 additions & 2 deletions datafusion/core/src/datasource/file_format/csv.rs
@@ -18,16 +18,21 @@
 //! CSV format abstractions
 
 use std::any::Any;
+
 use std::sync::Arc;
 
 use arrow::datatypes::Schema;
 use arrow::{self, datatypes::SchemaRef};
 use async_trait::async_trait;
+use bytes::Buf;
+
 use datafusion_common::DataFusionError;
+
 use futures::TryFutureExt;
 use object_store::{ObjectMeta, ObjectStore};
 
 use super::FileFormat;
+use crate::datasource::file_format::file_type::FileCompressionType;
 use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
 use crate::error::Result;
 use crate::logical_plan::Expr;
@@ -43,6 +48,7 @@ pub struct CsvFormat {
     has_header: bool,
     delimiter: u8,
     schema_infer_max_rec: Option<usize>,
+    file_compression_type: FileCompressionType,
 }
 
 impl Default for CsvFormat {
@@ -51,6 +57,7 @@ impl Default for CsvFormat {
             schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD),
             has_header: true,
             delimiter: b',',
+            file_compression_type: FileCompressionType::UNCOMPRESSED,
         }
     }
 }
@@ -82,6 +89,16 @@ impl CsvFormat {
         self
     }
 
+    /// Set a `FileCompressionType` of CSV
+    /// - defaults to `FileCompressionType::UNCOMPRESSED`
+    pub fn with_file_compression_type(
+        mut self,
+        file_compression_type: FileCompressionType,
+    ) -> Self {
+        self.file_compression_type = file_compression_type;
+        self
+    }
+
     /// The delimiter character.
     pub fn delimiter(&self) -> u8 {
         self.delimiter
@@ -110,8 +127,9 @@ impl FileFormat for CsvFormat {
             .await
             .map_err(|e| DataFusionError::External(Box::new(e)))?;
 
+            let decoder = self.file_compression_type.convert_read(data.reader());
             let (schema, records_read) = arrow::csv::reader::infer_reader_schema(
-                &mut data.as_ref(),
+                decoder,
                 self.delimiter,
                 Some(records_to_read),
                 self.has_header,
@@ -144,7 +162,12 @@
         conf: FileScanConfig,
         _filters: &[Expr],
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let exec = CsvExec::new(conf, self.has_header, self.delimiter);
+        let exec = CsvExec::new(
+            conf,
+            self.has_header,
+            self.delimiter,
+            self.file_compression_type.to_owned(),
+        );
         Ok(Arc::new(exec))
     }
 }
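
The pivotal call above is `FileCompressionType::convert_read`, which wraps the raw byte reader with a matching decoder before `infer_reader_schema` consumes it. The commit's implementation of that method is not shown in this excerpt; a rough re-creation of the idea using the `flate2` and `bzip2` crates added above (names and signature are assumptions):

    use std::io::Read;

    use bzip2::read::BzDecoder;
    use flate2::read::GzDecoder;

    enum Compression {
        Gzip,
        Bzip2,
        Uncompressed,
    }

    impl Compression {
        /// Wrap `r` with the decoder matching this compression, or pass
        /// it through untouched — the same shape as `convert_read`.
        fn convert_read<R: Read + 'static>(&self, r: R) -> Box<dyn Read> {
            match self {
                Compression::Gzip => Box::new(GzDecoder::new(r)),
                Compression::Bzip2 => Box::new(BzDecoder::new(r)),
                Compression::Uncompressed => Box::new(r),
            }
        }
    }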