From 78d2c1f8e70443aaab8aebe66584115105e255a1 Mon Sep 17 00:00:00 2001 From: Vince Date: Wed, 12 Jul 2023 23:10:00 +0200 Subject: [PATCH] Pass infer max records to JsonFormat. --- .../src/datasource/file_format/options.rs | 1 + .../core/src/datasource/physical_plan/json.rs | 30 +++++++++++++++++++ datafusion/core/tests/data/4.json | 4 +++ 3 files changed, 35 insertions(+) create mode 100644 datafusion/core/tests/data/4.json diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index 5694bf5380d5..6155dc6640fa 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -489,6 +489,7 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> { impl ReadOptions<'_> for NdJsonReadOptions<'_> { fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions { let file_format = JsonFormat::default() + .with_schema_infer_max_rec(Some(self.schema_infer_max_records)) .with_file_compression_type(self.file_compression_type.to_owned()); ListingOptions::new(Arc::new(file_format)) diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index e9082b80847f..541e448cfef7 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -733,4 +733,34 @@ mod tests { assert_eq!("Arrow error: Parser error: Error while parsing value d for column 0 at line 4", format!("{e}")); Ok(()) } + + #[tokio::test] + async fn ndjson_schema_infer_max_records() -> Result<()> { + async fn read_test_data(schema_infer_max_records: usize) -> Result { + let ctx = SessionContext::new(); + + let options = NdJsonReadOptions { + schema_infer_max_records, + ..Default::default() + }; + + let batches = ctx + .read_json("tests/data/4.json", options) + .await? + .collect() + .await?; + + Ok(batches[0].schema()) + } + + // Use only the first 2 rows to infer the schema, those have 2 fields. + let schema = read_test_data(2).await?; + assert_eq!(schema.fields().len(), 2); + + // Use all rows to infer the schema, those have 5 fields. + let schema = read_test_data(10).await?; + assert_eq!(schema.fields().len(), 5); + + Ok(()) + } } diff --git a/datafusion/core/tests/data/4.json b/datafusion/core/tests/data/4.json new file mode 100644 index 000000000000..f0c67cd7cf0e --- /dev/null +++ b/datafusion/core/tests/data/4.json @@ -0,0 +1,4 @@ +{"a":1, "b":[2.0, 1.3, -6.1]} +{"a":2, "b":[3.0, 4.3]} +{"c":[false, true], "d":{"c1": 23, "c2": 32}} +{"e": {"e1": 2, "e2": 12.3}} \ No newline at end of file