diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs index 027ff9970057..26fddcd226a9 100644 --- a/datafusion-examples/examples/dataframe.rs +++ b/datafusion-examples/examples/dataframe.rs @@ -51,10 +51,7 @@ async fn main() -> Result<()> { // Reading PARQUET file and print describe let parquet_df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(filename, ParquetReadOptions::default()) .await?; parquet_df.describe().await.unwrap().show().await?; diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index 30e14a2afa59..82f8deb3c20d 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -29,7 +29,10 @@ use parquet::file::properties::WriterProperties; use datafusion_common::from_slice::FromSlice; use datafusion_common::{Column, DFSchema, ScalarValue}; -use datafusion_expr::{TableProviderFilterPushDown, UNNAMED_TABLE}; +use datafusion_expr::{ + avg, count, is_null, max, median, min, stddev, TableProviderFilterPushDown, + UNNAMED_TABLE, +}; use crate::arrow::datatypes::Schema; use crate::arrow::datatypes::SchemaRef; @@ -324,7 +327,7 @@ impl DataFrame { pub async fn describe(self) -> Result { //the functions now supported let supported_describe_functions = - vec!["count", "null_count", "mean", "min", "max"]; + vec!["count", "null_count", "mean", "std", "min", "max", "median"]; let fields_iter = self.schema().fields().iter(); @@ -349,7 +352,7 @@ impl DataFrame { vec![], fields_iter .clone() - .map(|f| datafusion_expr::count(col(f.name())).alias(f.name())) + .map(|f| count(col(f.name())).alias(f.name())) .collect::>(), )? .collect() @@ -360,12 +363,7 @@ impl DataFrame { vec![], fields_iter .clone() - .map(|f| { - datafusion_expr::count(datafusion_expr::is_null( - col(f.name()), - )) - .alias(f.name()) - }) + .map(|f| count(is_null(col(f.name()))).alias(f.name())) .collect::>(), )? .collect() @@ -377,7 +375,19 @@ impl DataFrame { fields_iter .clone() .filter(|f| f.data_type().is_numeric()) - .map(|f| datafusion_expr::avg(col(f.name())).alias(f.name())) + .map(|f| avg(col(f.name())).alias(f.name())) + .collect::>(), + )? + .collect() + .await?, + // std aggregation + self.clone() + .aggregate( + vec![], + fields_iter + .clone() + .filter(|f| f.data_type().is_numeric()) + .map(|f| stddev(col(f.name())).alias(f.name())) .collect::>(), )? .collect() @@ -391,7 +401,7 @@ impl DataFrame { .filter(|f| { !matches!(f.data_type(), DataType::Binary | DataType::Boolean) }) - .map(|f| datafusion_expr::min(col(f.name())).alias(f.name())) + .map(|f| min(col(f.name())).alias(f.name())) .collect::>(), )? .collect() @@ -405,7 +415,19 @@ impl DataFrame { .filter(|f| { !matches!(f.data_type(), DataType::Binary | DataType::Boolean) }) - .map(|f| datafusion_expr::max(col(f.name())).alias(f.name())) + .map(|f| max(col(f.name())).alias(f.name())) + .collect::>(), + )? + .collect() + .await?, + // median aggregation + self.clone() + .aggregate( + vec![], + fields_iter + .clone() + .filter(|f| f.data_type().is_numeric()) + .map(|f| median(col(f.name())).alias(f.name())) .collect::>(), )? .collect() diff --git a/datafusion/core/tests/dataframe.rs b/datafusion/core/tests/dataframe.rs index 85d968398efd..ede74b2272ce 100644 --- a/datafusion/core/tests/dataframe.rs +++ b/datafusion/core/tests/dataframe.rs @@ -49,15 +49,17 @@ async fn describe() -> Result<()> { let describe_record_batch = df.describe().await.unwrap().collect().await.unwrap(); #[rustfmt::skip] let expected = vec![ - "+------------+-----+----------+-------------+--------------+---------+------------+-------------------+------------+-----------------+------------+---------------------+", - "| describe | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", - "+------------+-----+----------+-------------+--------------+---------+------------+-------------------+------------+-----------------+------------+---------------------+", - "| count | 8.0 | 8 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8 | 8 | 8 |", - "| null_count | 8.0 | 8 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8 | 8 | 8 |", - "| mean | 3.5 | null | 0.5 | 0.5 | 0.5 | 5.0 | 0.550000011920929 | 5.05 | null | null | null |", - "| min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | null | null | 2009-01-01T00:00:00 |", - "| max | 7.0 | null | 1.0 | 1.0 | 1.0 | 10.0 | 1.100000023841858 | 10.1 | null | null | 2009-04-01T00:01:00 |", - "+------------+-----+----------+-------------+--------------+---------+------------+-------------------+------------+-----------------+------------+---------------------+", + "+------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----------------+------------+---------------------+", + "| describe | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", + "+------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----------------+------------+---------------------+", + "| count | 8.0 | 8 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8 | 8 | 8 |", + "| null_count | 8.0 | 8 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8.0 | 8 | 8 | 8 |", + "| mean | 3.5 | null | 0.5 | 0.5 | 0.5 | 5.0 | 0.550000011920929 | 5.05 | null | null | null |", + "| std | 2.4494897427831783 | null | 0.5345224838248488 | 0.5345224838248488 | 0.5345224838248488 | 5.3452248382484875 | 0.5879747449513427 | 5.398677086630973 | null | null | null |", + "| min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | null | null | 2009-01-01T00:00:00 |", + "| max | 7.0 | null | 1.0 | 1.0 | 1.0 | 10.0 | 1.100000023841858 | 10.1 | null | null | 2009-04-01T00:01:00 |", + "| median | 3.0 | null | 0.0 | 0.0 | 0.0 | 5.0 | 0.550000011920929 | 5.05 | null | null | null |", + "+------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----------------+------------+---------------------+", ]; assert_batches_eq!(expected, &describe_record_batch);