From bb1b9ed9c590c626cd240c6f7f0e4e2523f21f30 Mon Sep 17 00:00:00 2001
From: Kumar Ujjawal
Date: Fri, 10 Apr 2026 12:02:22 +0530
Subject: [PATCH] feat(parquet): add StatisticsConverter::from_column_index

---
 parquet/src/arrow/arrow_reader/statistics.rs | 64 +++++++++++++++++++-
 parquet/tests/arrow_reader/statistics.rs     | 76 ++++++++++++++++++++
 2 files changed, 139 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs
index 19d3e34f5243..f1d264f57408 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -1353,7 +1353,10 @@ where
 /// Note: The Parquet schema and Arrow schema do not have to be identical (for
 /// example, the columns may be in different orders and one or the other schemas
 /// may have additional columns). The function [`parquet_column`] is used to
-/// match the column in the Parquet schema to the column in the Arrow schema.
+/// match the column in the Parquet schema to the column in the Arrow schema
+/// when using [`Self::try_new`]. For nested leaf fields (for example, fields
+/// within a struct), use [`Self::from_column_index`] with a pre-resolved
+/// Parquet leaf column index.
 #[derive(Debug)]
 pub struct StatisticsConverter<'a> {
     /// the index of the matched column in the Parquet schema
@@ -1454,6 +1457,9 @@ impl<'a> StatisticsConverter<'a> {
     /// arrays will be null. This can happen if the column is in the arrow
     /// schema but not in the parquet schema due to schema evolution.
     ///
+    /// This constructor only supports top-level, non-nested columns. For nested
+    /// leaf fields, use [`Self::from_column_index`].
+    ///
     /// See example on [`Self::row_group_mins`] for usage
     ///
     /// # Errors
@@ -1495,6 +1501,62 @@
         })
     }
 
+    /// Create a new `StatisticsConverter` from a Parquet leaf column index.
+    ///
+    /// Unlike [`Self::try_new`], this constructor bypasses schema resolution
+    /// and accepts a pre-resolved Parquet leaf column index directly. This is
+    /// useful for nested leaf fields where the caller already knows how the
+    /// Arrow field maps to the Parquet schema.
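+    ///
+    /// # Example
+    ///
+    /// A minimal sketch (it assumes a file whose Parquet schema contains a
+    /// nested leaf column with the dotted path `struct.int32_col`):
+    ///
+    /// ```no_run
+    /// # use arrow_schema::{DataType, Field};
+    /// # use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
+    /// # use parquet::schema::types::SchemaDescriptor;
+    /// # fn example(parquet_schema: &SchemaDescriptor) {
+    /// // resolve the leaf index of the nested field by its dotted path
+    /// let leaf_idx = parquet_schema
+    ///     .columns()
+    ///     .iter()
+    ///     .position(|c| c.path().string() == "struct.int32_col")
+    ///     .unwrap();
+    /// // the Arrow field describes the leaf itself, not the parent struct
+    /// let field = Field::new("int32_col", DataType::Int32, true);
+    /// let converter =
+    ///     StatisticsConverter::from_column_index(leaf_idx, &field, parquet_schema)
+    ///         .unwrap();
+    /// # let _ = converter;
+    /// # }
+    /// ```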
+    ///
+    /// # Errors
+    ///
+    /// * If `parquet_column_index` is out of bounds for `parquet_schema`
+    pub fn from_column_index(
+        parquet_column_index: usize,
+        arrow_field: &'a Field,
+        parquet_schema: &'a SchemaDescriptor,
+    ) -> Result<Self> {
+        if parquet_column_index >= parquet_schema.columns().len() {
+            return Err(arrow_err!(format!(
+                "Parquet column index {} out of bounds: schema has {} leaf columns",
+                parquet_column_index,
+                parquet_schema.columns().len()
+            )));
+        }
+
+        Ok(Self {
+            parquet_column_index: Some(parquet_column_index),
+            arrow_field,
+            missing_null_counts_as_zero: true,
+            physical_type: Some(parquet_schema.column(parquet_column_index).physical_type()),
+        })
+    }
+
     /// Extract the minimum values from row group statistics in [`RowGroupMetaData`]
     ///
     /// # Return Value
diff --git a/parquet/tests/arrow_reader/statistics.rs b/parquet/tests/arrow_reader/statistics.rs
index 4f7ddcff4ad1..62225ddf276f 100644
--- a/parquet/tests/arrow_reader/statistics.rs
+++ b/parquet/tests/arrow_reader/statistics.rs
@@ -2340,6 +2340,82 @@ async fn test_boolean() {
     .run();
 }
 
+// struct array: statistics for nested leaf columns resolved by column index
+#[tokio::test]
+async fn test_struct_children_from_column_index() {
+    let reader = TestReader {
+        scenario: Scenario::StructArray,
+        row_per_group: 5,
+    }
+    .build()
+    .await;
+
+    let schema = reader.schema();
+    let parquet_schema = reader.parquet_schema();
+    let (_idx, struct_field) = schema.column_with_name("struct").unwrap();
+    let DataType::Struct(fields) = struct_field.data_type() else {
+        panic!("expected struct field, got {:?}", struct_field.data_type());
+    };
+
+    let cases = vec![
+        (
+            "struct.int32_col",
+            "int32_col",
+            Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef,
+            Arc::new(Int32Array::from(vec![Some(2)])) as ArrayRef,
+            UInt64Array::from(vec![1]),
+        ),
+        (
+            "struct.float32_col",
+            "float32_col",
+            Arc::new(Float32Array::from(vec![Some(6.0)])) as ArrayRef,
+            Arc::new(Float32Array::from(vec![Some(8.5)])) as ArrayRef,
+            UInt64Array::from(vec![0]),
+        ),
+        (
+            "struct.float64_col",
+            "float64_col",
+            Arc::new(Float64Array::from(vec![Some(12.0)])) as ArrayRef,
+            Arc::new(Float64Array::from(vec![Some(14.0)])) as ArrayRef,
+            UInt64Array::from(vec![1]),
+        ),
+    ];
+
+    for (parquet_path, field_name, expected_min, expected_max, expected_null_counts) in cases {
+        // resolve the Parquet leaf index from the dotted column path
+        let parquet_column_index = parquet_schema
+            .columns()
+            .iter()
+            .position(|col| col.path().string() == parquet_path)
+            .unwrap();
+        // the Arrow field is the leaf child of the parent struct field
+        let (_field_idx, child_field) = fields.find(field_name).unwrap();
+
+        let converter = StatisticsConverter::from_column_index(
+            parquet_column_index,
+            child_field.as_ref(),
+            parquet_schema,
+        )
+        .unwrap();
+
+        assert_eq!(converter.parquet_column_index(), Some(parquet_column_index));
+        assert_eq!(converter.arrow_field(), child_field.as_ref());
+
+        Test {
+            reader: &reader,
+            expected_min,
+            expected_max,
+            expected_null_counts,
+            expected_row_counts: Some(UInt64Array::from(vec![3])),
+            expected_max_value_exact: BooleanArray::from(vec![true]),
+            expected_min_value_exact: BooleanArray::from(vec![true]),
+            column_name: field_name,
+            check: Check::RowGroup,
+        }
+        .run_checks(converter);
+    }
+}
+
 // struct array
 // BUG
 // https://github.com/apache/datafusion/issues/10609