ARROW-4263: [Rust] Donate DataFusion

This PR is to donate the DataFusion source code (assuming that the vote passes!) Author: Andy Grove <andygrove73@gmail.com> Closes #3399 from andygrove/ARROW-4263 and squashes the following commits: 990d06f <Andy Grove> formatting 6603091 <Andy Grove> update path again, update testing submodule 38fa63b <Andy Grove> remove test csv file, update tests to use test data from new testing submodule 16e4cff <Andy Grove> remove test csv file, update tests to use test data from new testing submodule 91f6e90 <Andy Grove> update example to use new data file 4ebeee5 <Andy Grove> formatting ae88a90 <Andy Grove> convert tests to use new test data file that was randomly generated d7bea8e <Andy Grove> update test to use uk_cities.csv and remove people.csv 061d788 <Andy Grove> remove unused test data files f60e50d <Andy Grove> remove unused test data files, manually recreate uk_cities.csv because I can't trace where the original data came from 28d914a <Andy Grove> Update 00-prepare.sh to handle datafusion versioning c4e1a26 <Andy Grove> DataFusion Donation
apache · Feb 4, 2019 · 29f14ca · 29f14ca
1 parent 7f96b6f
commit 29f14ca
Show file tree

Hide file tree

Showing 21 changed files with 4,590 additions and 4 deletions.
diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh
@@ -100,9 +100,9 @@ update_versions() {
   cd "${SOURCE_DIR}/../../rust"
   sed -i.bak -E -e \
     "s/^version = \".+\"/version = \"${version}\"/g" \
-    arrow/Cargo.toml parquet/Cargo.toml
-  rm -f arrow/Cargo.toml.bak parquet/Cargo.toml.bak
-  git add arrow/Cargo.toml parquet/Cargo.toml
+    arrow/Cargo.toml parquet/Cargo.toml datafusion/Cargo.toml
+  rm -f arrow/Cargo.toml.bak parquet/Cargo.toml.bak datafusion/Cargo.toml.bak
+  git add arrow/Cargo.toml parquet/Cargo.toml datafusion/Cargo.toml
 
   # Update version number for parquet README
   sed -i.bak -E -e \

diff --git a/rust/Cargo.toml b/rust/Cargo.toml
@@ -19,4 +19,5 @@
 members = [
         "arrow",
         "parquet",
+        "datafusion",
 ]
diff --git a/rust/datafusion/Cargo.toml b/rust/datafusion/Cargo.toml
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "datafusion"
+description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model"
+version = "0.13.0-SNAPSHOT"
+homepage = "https://github.com/apache/arrow"
+repository = "https://github.com/apache/arrow"
+authors = ["Apache Arrow <dev@arrow.apache.org>"]
+license = "Apache-2.0"
+keywords = [ "arrow", "query", "sql" ]
+include = [
+    "src/**/*.rs",
+    "Cargo.toml",
+]
+edition = "2018"
+
+[lib]
+name = "datafusion"
+path = "src/lib.rs"
+
+[dependencies]
+clap = "2.31.2"
+fnv = "1.0.3"
+arrow = { path = "../arrow" }
+parquet = { path = "../parquet" }
+datafusion-rustyline = "2.0.0-alpha-20180628"
+serde = { version = "1.0.80", features = ["alloc", "rc"] }
+serde_derive = "1.0.80"
+serde_json = "1.0.33"
+sqlparser = "0.2.0"
+
+[dev-dependencies]
+criterion = "0.2.0"
+
diff --git a/rust/datafusion/README.md b/rust/datafusion/README.md
@@ -0,0 +1,94 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# DataFusion
+
+DataFusion is an in-memory query engine that uses Apache Arrow as the memory model.
+
+# Status
+
+The current code supports single-threaded execution of a limited subset of SQL (projection, selection, and aggregates) against CSV files. Parquet files will be supported shortly.
+
+Here is a brief example for running a SQL query against a CSV file. See the [examples](examples) directory for full examples.
+
+```rust
+// Registers a CSV file as the table `cities`, runs a SQL projection/selection
+// over it, and prints every row of each resulting record batch.
+fn main() {
+    // create local execution context
+    let mut ctx = ExecutionContext::new();
+
+    // define schema for data source (csv file)
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("city", DataType::Utf8, false),
+        Field::new("lat", DataType::Float64, false),
+        Field::new("lng", DataType::Float64, false),
+    ]));
+
+    // register csv file with the execution context under the table name "cities"
+    let csv_datasource = CsvDataSource::new("../../testing/data/csv/uk_cities.csv", schema.clone(), 1024);
+    ctx.register_datasource("cities", Rc::new(RefCell::new(csv_datasource)));
+
+    // simple projection and selection
+    let sql = "SELECT city, lat, lng FROM cities WHERE lat > 51.0 AND lat < 53";
+
+    // execute the query
+    let relation = ctx.sql(&sql).unwrap();
+
+    // display the relation
+    let mut results = relation.borrow_mut();
+
+    // keep pulling batches until `next()` yields `None`; an error panics via `unwrap`
+    while let Some(batch) = results.next().unwrap() {
+
+        println!(
+            "RecordBatch has {} rows and {} columns",
+            batch.num_rows(),
+            batch.num_columns()
+        );
+
+        // the columns come back in SELECT order: city, lat, lng
+        let city = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<BinaryArray>()
+            .unwrap();
+
+        let lat = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+
+        let lng = batch
+            .column(2)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+
+        for i in 0..batch.num_rows() {
+            // the Utf8 column is read as raw bytes and decoded into a `String`
+            let city_name: String = String::from_utf8(city.value(i).to_vec()).unwrap();
+
+            println!(
+                "City: {}, Latitude: {}, Longitude: {}",
+                city_name,
+                lat.value(i),
+                lng.value(i),
+            );
+        }
+    }
+}
+```
+
diff --git a/rust/datafusion/examples/csv_sql.rs b/rust/datafusion/examples/csv_sql.rs
@@ -0,0 +1,101 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::cell::RefCell;
+use std::rc::Rc;
+use std::sync::Arc;
+
+extern crate arrow;
+extern crate datafusion;
+
+use arrow::array::{BinaryArray, Float64Array};
+use arrow::datatypes::{DataType, Field, Schema};
+
+use datafusion::execution::context::ExecutionContext;
+use datafusion::execution::datasource::CsvDataSource;
+
+/// This example demonstrates executing a SQL aggregate query (MIN/MAX with GROUP BY) against a CSV data source and printing the results
+fn main() {
+    // create local execution context
+    let mut ctx = ExecutionContext::new();
+
+    // define schema for data source (csv file)
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Utf8, false),
+        Field::new("c2", DataType::UInt32, false),
+        Field::new("c3", DataType::Int8, false),
+        Field::new("c4", DataType::Int16, false),
+        Field::new("c5", DataType::Int32, false),
+        Field::new("c6", DataType::Int64, false),
+        Field::new("c7", DataType::UInt8, false),
+        Field::new("c8", DataType::UInt16, false),
+        Field::new("c9", DataType::UInt32, false),
+        Field::new("c10", DataType::UInt64, false),
+        Field::new("c11", DataType::Float32, false),
+        Field::new("c12", DataType::Float64, false),
+        Field::new("c13", DataType::Utf8, false),
+    ]));
+
+    // register csv file with the execution context under the table name "aggregate_test_100"
+    let csv_datasource = CsvDataSource::new(
+        "../../testing/data/csv/aggregate_test_100.csv",
+        schema.clone(),
+        1024,
+    );
+    ctx.register_datasource("aggregate_test_100", Rc::new(RefCell::new(csv_datasource)));
+
+    // aggregate query: MIN and MAX of c12 for each c1 group, restricted to rows where 0.1 < c11 < 0.9
+    let sql = "SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 WHERE c11 > 0.1 AND c11 < 0.9 GROUP BY c1";
+
+    // execute the query
+    let relation = ctx.sql(&sql).unwrap();
+
+    // display the relation
+    let mut results = relation.borrow_mut();
+
+    while let Some(batch) = results.next().unwrap() {
+        println!(
+            "RecordBatch has {} rows and {} columns",
+            batch.num_rows(),
+            batch.num_columns()
+        );
+
+        let c1 = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<BinaryArray>()
+            .unwrap();
+
+        let min = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+
+        let max = batch
+            .column(2)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+
+        for i in 0..batch.num_rows() {
+            let c1_value: String = String::from_utf8(c1.value(i).to_vec()).unwrap();
+
+            println!("{}, Min: {}, Max: {}", c1_value, min.value(i), max.value(i),);
+        }
+    }
+}