From 35803b96f97e5aaf159aa8d3aebdd9af0fe1d1f7 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Mon, 24 Nov 2025 17:51:31 +0530 Subject: [PATCH 1/2] Added shared validator helper that records context for mismatches and eliminates false-positive diffs --- datafusion/sqllogictest/bin/sqllogictests.rs | 33 +++- datafusion/sqllogictest/src/util.rs | 185 +++++++++++++++---- 2 files changed, 177 insertions(+), 41 deletions(-) diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index ec705fc9ba06..42720b983ec8 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -21,8 +21,9 @@ use datafusion::common::utils::get_available_parallelism; use datafusion::common::{exec_datafusion_err, exec_err, DataFusionError, Result}; use datafusion_sqllogictest::{ df_value_validator, read_dir_recursive, setup_scratch_dir, should_skip_file, - should_skip_record, value_normalizer, CurrentlyExecutingSqlTracker, DataFusion, - DataFusionSubstraitRoundTrip, Filter, TestContext, + should_skip_record, take_last_validation_failure, value_normalizer, + CurrentlyExecutingSqlTracker, DataFusion, DataFusionSubstraitRoundTrip, Filter, + TestContext, ValidationFailure, }; use futures::stream::StreamExt; use indicatif::{ @@ -33,7 +34,7 @@ use log::Level::Info; use log::{info, log_enabled}; use sqllogictest::{ parse_file, strict_column_validator, AsyncDB, Condition, MakeConnection, Normalizer, - Record, Validator, + Record, TestErrorKind, Validator, }; #[cfg(feature = "postgres")] @@ -43,6 +44,7 @@ use crate::postgres_container::{ use datafusion::common::runtime::SpawnedTask; use futures::FutureExt; use std::ffi::OsStr; +use std::fmt::Write; use std::fs; use std::path::{Path, PathBuf}; @@ -384,6 +386,31 @@ async fn run_file_in_runner>( continue; } if let Err(err) = runner.run_async(record).await { + if let TestErrorKind::QueryResultMismatch { sql, .. } = err.kind() { + if let Some(failure) = take_last_validation_failure() { + let ValidationFailure { + expected_snapshot, + actual_snapshot, + note, + } = failure; + let mut rendered = String::new(); + let _ = writeln!( + &mut rendered, + "query result mismatch (DataFusion validator):" + ); + let _ = writeln!(&mut rendered, "[SQL] {sql}"); + if let Some(note) = note { + let _ = writeln!(&mut rendered, "[Reason] {note}"); + } + let _ = writeln!(&mut rendered, "[Expected]"); + let _ = writeln!(&mut rendered, "{}", expected_snapshot); + let _ = writeln!(&mut rendered, "[Actual]"); + let _ = writeln!(&mut rendered, "{}", actual_snapshot); + let _ = writeln!(&mut rendered, "at {}", err.location()); + errs.push(rendered); + continue; + } + } errs.push(format!("{err}")); } } diff --git a/datafusion/sqllogictest/src/util.rs b/datafusion/sqllogictest/src/util.rs index 487bafc4c961..e28e2e881969 100644 --- a/datafusion/sqllogictest/src/util.rs +++ b/datafusion/sqllogictest/src/util.rs @@ -20,6 +20,7 @@ use itertools::Itertools; use log::Level::Warn; use log::{info, log_enabled, warn}; use sqllogictest::Normalizer; +use std::cell::RefCell; use std::fs; use std::path::{Path, PathBuf}; @@ -82,63 +83,135 @@ pub fn df_value_validator( actual: &[Vec], expected: &[String], ) -> bool { - // Support ignore marker to skip volatile parts of output. - const IGNORE_MARKER: &str = ""; + reset_validation_failure(); + let contains_ignore_marker = expected.iter().any(|line| line.contains(IGNORE_MARKER)); - let normalized_expected = expected.iter().map(normalizer).collect::>(); - let normalized_actual = actual + let normalized_expected: Vec = + expected.iter().map(normalizer).collect::>(); + let normalized_actual: Vec = actual .iter() .map(|strs| strs.iter().join(" ")) .map(|str| str.trim_end().to_string()) .collect_vec(); - // If ignore marker present, perform fragment-based matching on the full snapshot. if contains_ignore_marker { let expected_snapshot = normalized_expected.join("\n"); let actual_snapshot = normalized_actual.join("\n"); - let fragments: Vec<&str> = expected_snapshot.split(IGNORE_MARKER).collect(); - let mut pos = 0; - for (i, frag) in fragments.iter().enumerate() { - if frag.is_empty() { - continue; + match compare_with_ignore_markers(&expected_snapshot, &actual_snapshot) { + Ok(()) => true, + Err(note) => { + record_validation_failure(ValidationFailure { + expected_snapshot, + actual_snapshot, + note: Some(note), + }); + log_value_mismatch(&normalized_actual, &normalized_expected); + false } - if let Some(idx) = actual_snapshot[pos..].find(frag) { - // Edge case: The following example is expected to fail - // Actual - 'foo bar baz' - // Expected - 'bar ' - if (i == 0) && (idx != 0) { - return false; - } - - pos += idx + frag.len(); + } + } else if normalized_actual == normalized_expected { + true + } else { + log_value_mismatch(&normalized_actual, &normalized_expected); + false + } +} + +pub fn is_spark_path(relative_path: &Path) -> bool { + relative_path.starts_with("spark/") +} + +/// Details captured when validation fails so that the error reporter can render a consistent +/// message (e.g. suppressing `` false positives). +#[derive(Debug, Clone)] +pub struct ValidationFailure { + pub expected_snapshot: String, + pub actual_snapshot: String, + pub note: Option, +} + +thread_local! { + static LAST_VALIDATION_FAILURE: RefCell> = const { RefCell::new(None) }; +} + +pub fn take_last_validation_failure() -> Option { + LAST_VALIDATION_FAILURE.with(|cell| cell.borrow_mut().take()) +} + +fn record_validation_failure(failure: ValidationFailure) { + LAST_VALIDATION_FAILURE.with(|cell| { + *cell.borrow_mut() = Some(failure); + }); +} + +fn reset_validation_failure() { + LAST_VALIDATION_FAILURE.with(|cell| { + cell.borrow_mut().take(); + }); +} + +fn log_value_mismatch(actual: &[String], expected: &[String]) { + if !log_enabled!(Warn) { + return; + } + + warn!("df validation failed. actual vs expected:"); + for i in 0..actual.len() { + warn!("[{i}] {}", actual[i]); + warn!( + "[{i}] {}", + if expected.len() > i { + &expected[i] } else { - return false; + "No more results" } - } - return true; + ); } +} + +const IGNORE_MARKER: &str = ""; + +fn compare_with_ignore_markers( + expected_snapshot: &str, + actual_snapshot: &str, +) -> Result<(), String> { + let fragments: Vec<&str> = expected_snapshot.split(IGNORE_MARKER).collect(); + let mut pos = 0usize; + let trailing_wildcard = expected_snapshot.ends_with(IGNORE_MARKER); + for (i, frag) in fragments.iter().enumerate() { + if frag.is_empty() { + continue; + } - if log_enabled!(Warn) && normalized_actual != normalized_expected { - warn!("df validation failed. actual vs expected:"); - for i in 0..normalized_actual.len() { - warn!("[{i}] {}", normalized_actual[i]); - warn!( - "[{i}] {}", - if normalized_expected.len() > i { - &normalized_expected[i] - } else { - "No more results" - } - ); + if let Some(idx) = actual_snapshot[pos..].find(frag) { + if i == 0 && idx != 0 { + let unexpected = actual_snapshot[pos..pos + idx] + .chars() + .take(32) + .collect::(); + return Err(format!( + "expected output should start with fragment \"{frag}\" but actual output begins with \"{unexpected}\"" + )); + } + pos += idx + frag.len(); + } else { + let tail = &actual_snapshot[pos..]; + let preview = tail.chars().take(32).collect::(); + return Err(format!( + "fragment \"{frag}\" not found in actual output after byte offset {pos} (remaining output starts with \"{preview}\")" + )); } } - normalized_actual == normalized_expected -} + if !trailing_wildcard && pos != actual_snapshot.len() { + let preview = actual_snapshot[pos..].chars().take(32).collect::(); + return Err(format!( + "actual output contains additional trailing data starting with \"{preview}\"" + )); + } -pub fn is_spark_path(relative_path: &Path) -> bool { - relative_path.starts_with("spark/") + Ok(()) } #[cfg(test)] @@ -156,4 +229,40 @@ mod tests { assert!(!df_value_validator(value_normalizer, &actual, &expected)); } + + #[test] + fn ignore_marker_failure_records_context() { + let actual = vec![ + vec!["0".to_string()], + vec!["1".to_string()], + vec!["2".to_string()], + vec!["3".to_string()], + ]; + let expected = vec![ + "".to_string(), + "1".to_string(), + "3".to_string(), + ]; + + assert!(!df_value_validator(value_normalizer, &actual, &expected)); + let failure = take_last_validation_failure().expect("failure recorded"); + assert!(failure.expected_snapshot.contains("")); + assert!(failure.actual_snapshot.contains("2")); + assert!(failure.note.expect("note").contains("fragment")); + } + + #[test] + fn ignore_marker_requires_exact_end_without_trailing_wildcard() { + let actual = vec![ + vec!["0".to_string()], + vec!["1".to_string()], + vec!["2".to_string()], + vec!["3".to_string()], + ]; + let expected = vec!["".to_string(), "2".to_string()]; + + assert!(!df_value_validator(value_normalizer, &actual, &expected)); + let failure = take_last_validation_failure().expect("failure recorded"); + assert!(failure.note.expect("note").contains("trailing data")); + } } From 74f287e0f53b08f3a08bc8770ffaf6d65c7a4601 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Mon, 24 Nov 2025 18:51:22 +0530 Subject: [PATCH 2/2] fix clippy issues --- datafusion/sqllogictest/bin/sqllogictests.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index 42720b983ec8..2f7b93fd95f4 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -403,9 +403,9 @@ async fn run_file_in_runner>( let _ = writeln!(&mut rendered, "[Reason] {note}"); } let _ = writeln!(&mut rendered, "[Expected]"); - let _ = writeln!(&mut rendered, "{}", expected_snapshot); + let _ = writeln!(&mut rendered, "{expected_snapshot}"); let _ = writeln!(&mut rendered, "[Actual]"); - let _ = writeln!(&mut rendered, "{}", actual_snapshot); + let _ = writeln!(&mut rendered, "{actual_snapshot}"); let _ = writeln!(&mut rendered, "at {}", err.location()); errs.push(rendered); continue;