From dd97e1653b49bbe0d73f3bc61d7d67f8cc567e3c Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 15:50:52 +0800 Subject: [PATCH 1/5] add random and now function --- python/src/functions.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/src/functions.rs b/python/src/functions.rs index f46dd3e0e5f7..8a1b58dc1c69 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -157,6 +157,13 @@ fn md5(value: expression::Expression) -> expression::Expression { } } +#[pyfunction] +fn now() -> expression::Expression { + expression::Expression { + expr: logical_plan::now(), + } +} + #[pyfunction] fn octet_length(value: expression::Expression) -> expression::Expression { expression::Expression { @@ -164,6 +171,13 @@ fn octet_length(value: expression::Expression) -> expression::Expression { } } +#[pyfunction] +fn random() -> expression::Expression { + expression::Expression { + expr: logical_plan::random(), + } +} + #[pyfunction] fn regexp_replace(value: expression::Expression) -> expression::Expression { expression::Expression { @@ -414,8 +428,10 @@ pub fn init(module: &PyModule) -> PyResult<()> { module.add_function(wrap_pyfunction!(lower, module)?)?; module.add_function(wrap_pyfunction!(lpad, module)?)?; module.add_function(wrap_pyfunction!(md5, module)?)?; + module.add_function(wrap_pyfunction!(now, module)?)?; module.add_function(wrap_pyfunction!(ltrim, module)?)?; module.add_function(wrap_pyfunction!(octet_length, module)?)?; + module.add_function(wrap_pyfunction!(random, module)?)?; module.add_function(wrap_pyfunction!(regexp_replace, module)?)?; module.add_function(wrap_pyfunction!(repeat, module)?)?; module.add_function(wrap_pyfunction!(replace, module)?)?; From 666fa09949288f5116de2df2c6400723b5bb0912 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 18:12:34 +0800 Subject: [PATCH 2/5] fix --- python/src/functions.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/src/functions.rs b/python/src/functions.rs index 8a1b58dc1c69..3fe212f4f4bf 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -158,9 +158,9 @@ fn md5(value: expression::Expression) -> expression::Expression { } #[pyfunction] -fn now() -> expression::Expression { +fn now(value: expression::Expression) -> expression::Expression { expression::Expression { - expr: logical_plan::now(), + expr: logical_plan::now(value.expr), } } @@ -172,9 +172,9 @@ fn octet_length(value: expression::Expression) -> expression::Expression { } #[pyfunction] -fn random() -> expression::Expression { +fn random(value: expression::Expression) -> expression::Expression { expression::Expression { - expr: logical_plan::random(), + expr: logical_plan::random(value.expr), } } From ed0f62500878f96ab6a28e9557d6370378f3b009 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 18:21:16 +0800 Subject: [PATCH 3/5] simplify python function definition --- python/src/functions.rs | 364 ++++++++-------------------------------- 1 file changed, 67 insertions(+), 297 deletions(-) diff --git a/python/src/functions.rs b/python/src/functions.rs index 3fe212f4f4bf..68c3fea2ff75 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -15,34 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; - -use datafusion::arrow::datatypes::DataType; -use pyo3::{prelude::*, wrap_pyfunction}; - -use datafusion::logical_plan; - use crate::udaf; use crate::udf; use crate::{expression, types::PyDataType}; - -/// Expression representing a column on the existing plan. -#[pyfunction] -#[text_signature = "(name)"] -fn col(name: &str) -> expression::Expression { - expression::Expression { - expr: logical_plan::col(name), - } -} - -/// Expression representing a constant value -#[pyfunction] -#[text_signature = "(value)"] -fn lit(value: i32) -> expression::Expression { - expression::Expression { - expr: logical_plan::lit(value), - } -} +use datafusion::arrow::datatypes::DataType; +use datafusion::logical_plan; +use pyo3::{prelude::*, wrap_pyfunction}; +use std::sync::Arc; #[pyfunction] fn array(value: Vec) -> expression::Expression { @@ -51,55 +30,6 @@ fn array(value: Vec) -> expression::Expression { } } -#[pyfunction] -fn ascii(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::ascii(value.expr), - } -} - -#[pyfunction] -fn sum(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sum(value.expr), - } -} - -#[pyfunction] -fn bit_length(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::bit_length(value.expr), - } -} - -#[pyfunction] -fn btrim(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::btrim(value.expr), - } -} - -#[pyfunction] -fn character_length(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::character_length(value.expr), - } -} - -#[pyfunction] -fn chr(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::chr(value.expr), - } -} - -#[pyfunction] -fn concat_ws(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::concat_ws(value.expr), - } -} - #[pyfunction] fn in_list( expr: expression::Expression, @@ -115,229 +45,69 @@ fn in_list( } } -#[pyfunction] -fn initcap(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::initcap(value.expr), - } -} - -#[pyfunction] -fn left(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::left(value.expr), - } -} - -#[pyfunction] -fn lower(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::lower(value.expr), - } -} - -#[pyfunction] -fn lpad(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::lpad(value.expr), - } -} - -#[pyfunction] -fn ltrim(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::ltrim(value.expr), - } -} - -#[pyfunction] -fn md5(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::md5(value.expr), - } -} - -#[pyfunction] -fn now(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::now(value.expr), - } -} - -#[pyfunction] -fn octet_length(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::octet_length(value.expr), - } -} - -#[pyfunction] -fn random(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::random(value.expr), - } -} - -#[pyfunction] -fn regexp_replace(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::regexp_replace(value.expr), - } -} - -#[pyfunction] -fn repeat(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::repeat(value.expr), - } -} - -#[pyfunction] -fn replace(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::replace(value.expr), - } -} - -#[pyfunction] -fn reverse(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::reverse(value.expr), - } -} - -#[pyfunction] -fn right(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::right(value.expr), - } -} - -#[pyfunction] -fn rpad(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::rpad(value.expr), - } -} - -#[pyfunction] -fn rtrim(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::rtrim(value.expr), - } -} - -#[pyfunction] -fn sha224(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha224(value.expr), - } -} - -#[pyfunction] -fn sha256(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha256(value.expr), - } -} - -#[pyfunction] -fn sha384(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha384(value.expr), - } -} - -#[pyfunction] -fn sha512(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha512(value.expr), - } -} - -#[pyfunction] -fn split_part(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::split_part(value.expr), - } -} - -#[pyfunction] -fn starts_with(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::starts_with(value.expr), - } -} - -#[pyfunction] -fn strpos(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::strpos(value.expr), - } -} - -#[pyfunction] -fn substr(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::substr(value.expr), - } -} - -#[pyfunction] -fn to_hex(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::to_hex(value.expr), - } -} - -#[pyfunction] -fn translate(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::translate(value.expr), - } -} - -#[pyfunction] -fn trim(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::trim(value.expr), - } -} - -#[pyfunction] -fn upper(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::upper(value.expr), - } -} - -#[pyfunction] -fn avg(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::avg(value.expr), - } -} - -#[pyfunction] -fn min(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::min(value.expr), - } -} - -#[pyfunction] -fn max(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::max(value.expr), - } -} - -#[pyfunction] -fn count(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::count(value.expr), - } -} +macro_rules! define_function { + ($NAME: ident) => {{ + /// Expression representing a $NAME function + #[pyfunction] + fn $NAME(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::$NAME(value.expr), + } + } + }}; + ($NAME: ident, $SIGNATURE: expr) => {{ + /// Expression representing a $NAME function + #[pyfunction] + #[text_signature = $SIGNATURE] + fn $NAME(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::$NAME(value.expr), + } + } + }}; +} + +define_function!(col, "(name)"); +define_function!(lit, "(value)"); +define_function!(ascii); +define_function!(sum); +define_function!(bit_length); +define_function!(btrim); +define_function!(character_length); +define_function!(chr); +define_function!(concat_ws); +define_function!(initcap); +define_function!(left); +define_function!(lower); +define_function!(lpad); +define_function!(ltrim); +define_function!(md5); +define_function!(now); +define_function!(octet_length); +define_function!(random); +define_function!(replace); +define_function!(repeat); +define_function!(regexp_replace); +define_function!(reverse); +define_function!(right); +define_function!(rpad); +define_function!(rtrim); +define_function!(sha224); +define_function!(sha256); +define_function!(sha384); +define_function!(sha512); +define_function!(split_part); +define_function!(starts_with); +define_function!(strpos); +define_function!(substr); +define_function!(to_hex); +define_function!(translate); +define_function!(trim); +define_function!(upper); +define_function!(avg); +define_function!(min); +define_function!(max); +define_function!(count); /* #[pyfunction] From 81158ff1f07680fadf661a88389c83b897139fd3 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 20:18:14 +0800 Subject: [PATCH 4/5] adding more docs --- python/src/functions.rs | 110 ++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 37 deletions(-) diff --git a/python/src/functions.rs b/python/src/functions.rs index 68c3fea2ff75..b03004fae431 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -23,6 +23,24 @@ use datafusion::logical_plan; use pyo3::{prelude::*, wrap_pyfunction}; use std::sync::Arc; +/// Expression representing a column on the existing plan. +#[pyfunction] +#[text_signature = "(name)"] +fn col(name: &str) -> expression::Expression { + expression::Expression { + expr: logical_plan::col(name), + } +} + +/// Expression representing a constant value +#[pyfunction] +#[text_signature = "(value)"] +fn lit(value: i32) -> expression::Expression { + expression::Expression { + expr: logical_plan::lit(value), + } +} + #[pyfunction] fn array(value: Vec) -> expression::Expression { expression::Expression { @@ -46,64 +64,82 @@ fn in_list( } macro_rules! define_function { - ($NAME: ident) => {{ - /// Expression representing a $NAME function + ($NAME: ident) => { + #[doc = "This function is not documented yet"] #[pyfunction] fn $NAME(value: expression::Expression) -> expression::Expression { expression::Expression { expr: logical_plan::$NAME(value.expr), } } - }}; - ($NAME: ident, $SIGNATURE: expr) => {{ - /// Expression representing a $NAME function + }; + ($NAME: ident, $DOC: expr) => { + #[doc = $DOC] #[pyfunction] - #[text_signature = $SIGNATURE] fn $NAME(value: expression::Expression) -> expression::Expression { expression::Expression { expr: logical_plan::$NAME(value.expr), } } - }}; + }; } -define_function!(col, "(name)"); -define_function!(lit, "(value)"); -define_function!(ascii); +define_function!(ascii, "Returns the numeric code of the first character of the argument. In UTF8 encoding, returns the Unicode code point of the character. In other multibyte encodings, the argument must be an ASCII character."); define_function!(sum); -define_function!(bit_length); -define_function!(btrim); -define_function!(character_length); -define_function!(chr); -define_function!(concat_ws); -define_function!(initcap); -define_function!(left); -define_function!(lower); -define_function!(lpad); -define_function!(ltrim); -define_function!(md5); +define_function!( + bit_length, + "Returns number of bits in the string (8 times the octet_length)." +); +define_function!(btrim, "Removes the longest string containing only characters in characters (a space by default) from the start and end of string."); +define_function!( + character_length, + "Returns number of characters in the string." +); +define_function!(chr, "Returns the character with the given code."); +define_function!(concat_ws, "Concatenates all but the first argument, with separators. The first argument is used as the separator string, and should not be NULL. Other NULL arguments are ignored."); +define_function!(initcap, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); +define_function!(left, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); +define_function!(lower, "Converts the string to all lower case"); +define_function!(lpad, "Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right)."); +define_function!(ltrim, "Removes the longest string containing only characters in characters (a space by default) from the start of string."); +define_function!( + md5, + "Computes the MD5 hash of the argument, with the result written in hexadecimal." +); define_function!(now); -define_function!(octet_length); -define_function!(random); -define_function!(replace); -define_function!(repeat); -define_function!(regexp_replace); -define_function!(reverse); -define_function!(right); -define_function!(rpad); -define_function!(rtrim); +define_function!(octet_length, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); +define_function!(random, "Returns a random value in the range 0.0 <= x < 1.0"); +define_function!( + replace, + "Replaces all occurrences in string of substring from with substring to." +); +define_function!(repeat, "Repeats string the specified number of times."); +define_function!( + regexp_replace, + "Replaces substring(s) matching a POSIX regular expression" +); +define_function!( + reverse, + "Reverses the order of the characters in the string." +); +define_function!(right, "Returns last n characters in the string, or when n is negative, returns all but first |n| characters."); +define_function!(rpad, "Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated."); +define_function!(rtrim, "Removes the longest string containing only characters in characters (a space by default) from the end of string."); define_function!(sha224); define_function!(sha256); define_function!(sha384); define_function!(sha512); -define_function!(split_part); -define_function!(starts_with); -define_function!(strpos); +define_function!(split_part, "Splits string at occurrences of delimiter and returns the n'th field (counting from one)."); +define_function!(starts_with, "Returns true if string starts with prefix."); +define_function!(strpos,"Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)"); define_function!(substr); -define_function!(to_hex); -define_function!(translate); -define_function!(trim); -define_function!(upper); +define_function!( + to_hex, + "Converts the number to its equivalent hexadecimal representation." +); +define_function!(translate, "Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted."); +define_function!(trim, "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string."); +define_function!(upper, "Converts the string to all upper case."); define_function!(avg); define_function!(min); define_function!(max); From 03432e577491147f78e5fec6f7a38d8a07d846d9 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 20:37:51 +0800 Subject: [PATCH 5/5] update readme --- python/README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/README.md b/python/README.md index 1859fca9811c..50143aef42c5 100644 --- a/python/README.md +++ b/python/README.md @@ -115,7 +115,7 @@ df = df.aggregate( ) ``` -## How to install +## How to install (from pip) ```bash pip install datafusion @@ -135,12 +135,18 @@ cd arrow-datafusion/python # prepare development environment (used to build wheel / install in development) python3 -m venv venv -pip install maturin==0.10.4 toml==0.10.1 pyarrow==1.0.0 + +# activate the venv +source venv/bin/activate + +# install dependencies +pip install maturin==0.10.6 toml==0.10.1 pyarrow==4.0.0 ``` -Whenever rust code changes (your changes or via git pull): +Whenever rust code changes (your changes or via `git pull`): ```bash -venv/bin/maturin develop -venv/bin/python -m unittest discover tests +# make sure you activate the venv using "source venv/bin/activate" first +maturin develop +python -m unittest discover tests ```