-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Allow listing tables to be created via TableFactories #4112
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -141,7 +141,13 @@ mod tests { | |
| assert!(err.to_string().contains("Generic S3 error: Missing region")); | ||
|
|
||
| env::set_var("AWS_REGION", "us-east-1"); | ||
| assert!(provider.get_by_url(&Url::from_str(s3).unwrap()).is_ok()); | ||
| let url = Url::from_str(s3).expect("Unable to parse s3 url"); | ||
| let res = provider.get_by_url(&url); | ||
| let msg = match res { | ||
| Err(e) => format!("{}", e), | ||
| Ok(_) => "".to_string() | ||
| }; | ||
| assert_eq!("".to_string(), msg); // Fail with error message | ||
|
||
| env::remove_var("AWS_REGION"); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,8 +19,10 @@ | |
| use crate::catalog::schema::SchemaProvider; | ||
| use crate::datasource::datasource::TableProviderFactory; | ||
| use crate::datasource::TableProvider; | ||
| use datafusion_common::DataFusionError; | ||
| use crate::execution::context::SessionState; | ||
| use datafusion_common::{context, DataFusionError}; | ||
| use futures::TryStreamExt; | ||
| use itertools::Itertools; | ||
| use object_store::ObjectStore; | ||
| use std::any::Any; | ||
| use std::collections::{HashMap, HashSet}; | ||
|
|
@@ -72,7 +74,7 @@ impl ListingSchemaProvider { | |
| } | ||
|
|
||
| /// Reload table information from ObjectStore | ||
| pub async fn refresh(&self) -> datafusion_common::Result<()> { | ||
| pub async fn refresh(&self, state: &SessionState) -> datafusion_common::Result<()> { | ||
|
||
| let entries: Vec<_> = self | ||
| .store | ||
| .list(Some(&self.path)) | ||
|
|
@@ -100,13 +102,20 @@ impl ListingSchemaProvider { | |
| .ok_or_else(|| { | ||
| DataFusionError::Internal("Cannot parse file name!".to_string()) | ||
| })?; | ||
| let table_name = table.to_str().ok_or_else(|| { | ||
| let table_name = file_name.split('.').collect_vec()[0]; | ||
|
||
| let table_path = table.to_str().ok_or_else(|| { | ||
| DataFusionError::Internal("Cannot parse file name!".to_string()) | ||
| })?; | ||
| if !self.table_exist(file_name) { | ||
| let table_name = format!("{}/{}", self.authority, table_name); | ||
| let provider = self.factory.create(table_name.as_str()).await?; | ||
| let _ = self.register_table(file_name.to_string(), provider.clone())?; | ||
| if !self.table_exist(table_name) { | ||
| let table_url = format!("{}/{}", self.authority, table_path); | ||
| let provider = self | ||
| .factory | ||
| .create(state, table_url.as_str()) | ||
| .await | ||
| .map_err(|e| { | ||
| context!(format!("Could not create table for {}", table_url), e) | ||
| })?; | ||
| let _ = self.register_table(table_name.to_string(), provider.clone())?; | ||
| } | ||
| } | ||
| Ok(()) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| //! Factory for creating ListingTables with default options | ||
|
|
||
| use crate::datasource::datasource::TableProviderFactory; | ||
| use crate::datasource::file_format::avro::AvroFormat; | ||
| use crate::datasource::file_format::csv::CsvFormat; | ||
| use crate::datasource::file_format::file_type::{FileType, GetExt}; | ||
| use crate::datasource::file_format::json::JsonFormat; | ||
| use crate::datasource::file_format::parquet::ParquetFormat; | ||
| use crate::datasource::file_format::FileFormat; | ||
| use crate::datasource::listing::{ | ||
| ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, | ||
| }; | ||
| use crate::datasource::TableProvider; | ||
| use crate::execution::context::SessionState; | ||
| use async_trait::async_trait; | ||
| use std::sync::Arc; | ||
|
|
||
| /// A `TableProviderFactory` capable of creating new `ListingTable`s | ||
| pub struct ListingTableFactory { | ||
| file_type: FileType, | ||
| } | ||
|
|
||
| impl ListingTableFactory { | ||
| /// Creates a new `ListingTableFactory` | ||
| pub fn new(file_type: FileType) -> Self { | ||
| Self { file_type } | ||
| } | ||
| } | ||
|
|
||
| #[async_trait] | ||
| impl TableProviderFactory for ListingTableFactory { | ||
| async fn create( | ||
| &self, | ||
| state: &SessionState, | ||
| url: &str, | ||
| ) -> datafusion_common::Result<Arc<dyn TableProvider>> { | ||
| let file_extension = self.file_type.get_ext(); | ||
|
|
||
| let file_format: Arc<dyn FileFormat> = match self.file_type { | ||
|
||
| FileType::CSV => Arc::new(CsvFormat::default()), | ||
| FileType::PARQUET => Arc::new(ParquetFormat::default()), | ||
| FileType::AVRO => Arc::new(AvroFormat::default()), | ||
| FileType::JSON => Arc::new(JsonFormat::default()), | ||
| }; | ||
|
|
||
| let options = ListingOptions { | ||
| format: file_format, | ||
| collect_stat: true, | ||
| file_extension: file_extension.to_owned(), | ||
| target_partitions: 1, | ||
| table_partition_cols: vec![], | ||
| }; | ||
|
|
||
| let table_path = ListingTableUrl::parse(url)?; | ||
| let resolved_schema = options.infer_schema(state, &table_path).await?; | ||
| let config = ListingTableConfig::new(table_path) | ||
| .with_listing_options(options) | ||
| .with_schema(resolved_schema); | ||
| let table = ListingTable::try_new(config)?; | ||
| Ok(Arc::new(table)) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -175,6 +175,26 @@ impl SessionContext { | |||||
| Self::with_config(SessionConfig::new()) | ||||||
| } | ||||||
|
|
||||||
| /// Finds any ListSchemaProviders and instructs them to reload tables from "disk" | ||||||
|
||||||
| /// Finds any ListSchemaProviders and instructs them to reload tables from "disk" | |
| /// Invokes `ListingSchemaProvider::refresh()` for all registered providers |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This moves the code into a common function on the context, which we can then use from datafusion-cli, the tests, Ballista, etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
makes sense to me
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`not found` is a hard error message to Ctrl-F for. Adding the word `table` will hopefully make the message statistically more likely to be found.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You might also consider using `assert_batches_eq!` in this test.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The proof is in the pudding: you can't select from a table without registering it first, so the table must have been auto-registered.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add `TableFactory`s for the default formats.