From cfae1360b578e24daa18f65fae8374cfb9cbf537 Mon Sep 17 00:00:00 2001
From: maskedsyntax
Date: Tue, 3 Mar 2026 16:15:46 +0530
Subject: [PATCH 1/2] feat: add normality and variance homogeneity statistical tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- config.py: add StatisticalTestThresholds (normality p-value, Shapiro
  max-n cutoff, min sample size, Levene p-value and min group size)
  wired into HashPrepConfig

- checks/statistical_tests.py: two new checks —
  - normality: Shapiro-Wilk (n ≤ 5000) or D'Agostino-Pearson (n > 5000)
    per numeric column; flags non-normal distributions with stat +
    p-value in the issue description
  - variance_homogeneity: Levene's test (median-centred, robust) across
    target-column groups; reports std ratio alongside the test result;
    skipped when no target column is set or groups are too small

- summaries/variables.py: embed normality result (test name, statistic,
  p_value, is_normal bool) into each numeric column's summary dict;
  also fix a pre-existing crash where infinite values in a column
  caused np.histogram to receive range=(-inf, inf) — all distribution
  stats now computed on finite-only values

- checks/__init__.py + core/analyzer.py: register normality and
  variance_homogeneity in CHECKS registry and ALL_CHECKS list

- tests/test_statistical_tests.py: 30 tests covering normality check
  unit, Levene check unit, summary embedding, and end-to-end
  integration via DatasetAnalyzer; all 180 tests pass (150 existing +
  30 new)
---
 hashprep/checks/__init__.py          |   3 +
 hashprep/checks/statistical_tests.py | 142 ++++++++++++++
 hashprep/config.py                   |  17 ++
 hashprep/core/analyzer.py            |   2 +
 hashprep/summaries/variables.py      |  56 ++++--
 tests/test_statistical_tests.py      | 271 +++++++++++++++++++++++++++
 6 files changed, 474 insertions(+), 17 deletions(-)
 create mode 100644 hashprep/checks/statistical_tests.py
 create mode 100644 tests/test_statistical_tests.py

diff --git a/hashprep/checks/__init__.py
b/hashprep/checks/__init__.py index ccd8379..cf63f83 100644 --- a/hashprep/checks/__init__.py +++ b/hashprep/checks/__init__.py @@ -22,6 +22,7 @@ _check_outliers, _check_skewness, ) +from .statistical_tests import _check_normality, _check_variance_homogeneity def _check_dataset_drift(analyzer): @@ -57,6 +58,8 @@ def _check_dataset_drift(analyzer): "infinite_values": _check_infinite_values, "constant_length": _check_constant_length, "empty_dataset": _check_empty_dataset, + "normality": _check_normality, + "variance_homogeneity": _check_variance_homogeneity, } CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"} diff --git a/hashprep/checks/statistical_tests.py b/hashprep/checks/statistical_tests.py new file mode 100644 index 0000000..27a6351 --- /dev/null +++ b/hashprep/checks/statistical_tests.py @@ -0,0 +1,142 @@ +""" +Statistical tests: normality (Shapiro-Wilk / D'Agostino-Pearson) and +variance homogeneity (Levene's test across target-column groups). +""" + +import numpy as np +from scipy.stats import levene, normaltest, shapiro + +from ..config import DEFAULT_CONFIG +from .core import Issue + +_ST = DEFAULT_CONFIG.statistical_tests + + +def _run_normality_test(series) -> tuple[str, float, float]: + """ + Return (test_name, statistic, p_value) for the most appropriate normality test. + Uses Shapiro-Wilk for n <= shapiro_max_n, D'Agostino-Pearson otherwise. + """ + n = len(series) + if n <= _ST.shapiro_max_n: + stat, p = shapiro(series) + return "shapiro_wilk", float(stat), float(p) + else: + stat, p = normaltest(series) + return "dagostino_pearson", float(stat), float(p) + + +def _check_normality(analyzer) -> list[Issue]: + """ + Flag numeric columns whose distribution is significantly non-normal. + Uses Shapiro-Wilk for n <= 5000, D'Agostino-Pearson for larger samples. + Non-normality matters for linear models, t-tests, and certain imputation strategies. 
+ """ + issues = [] + + for col in analyzer.df.select_dtypes(include="number").columns: + series = analyzer.df[col].dropna() + n = len(series) + if n < _ST.normality_min_n: + continue + if series.nunique() <= 1: + continue + + test_name, stat, p_val = _run_normality_test(series) + + if p_val < _ST.normality_p_value: + # Severity: very small p → critical (strong evidence), otherwise warning + severity = "critical" if p_val < 0.001 else "warning" + impact = "high" if severity == "critical" else "medium" + test_label = "Shapiro-Wilk" if test_name == "shapiro_wilk" else "D'Agostino-Pearson" + + issues.append( + Issue( + category="normality", + severity=severity, + column=col, + description=( + f"Column '{col}' is non-normal ({test_label}: stat={stat:.4f}, p={p_val:.4g}, n={n})" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Transform: Log, sqrt, or Box-Cox/Yeo-Johnson often normalise skewed data.\n" + "- Use robust models: Tree-based models (XGBoost, RF) make no normality assumption.\n" + "- Normalise for linear models: Required for OLS residuals and LDA.\n" + "- Investigate outliers: Extreme values are a common cause of non-normality." + ), + ) + ) + + return issues + + +def _check_variance_homogeneity(analyzer) -> list[Issue]: + """ + Run Levene's test across groups defined by the target column. + Unequal variances (heteroscedasticity) across classes violate assumptions of + linear discriminant analysis and ANOVA; they also indicate scale differences + that may harm distance-based models. + + Only runs when a target column is set and has at least 2 groups with + sufficient data. 
+ """ + issues = [] + + if analyzer.target_col is None: + return issues + + target = analyzer.df[analyzer.target_col].dropna() + groups_labels = target.unique() + + for col in analyzer.df.select_dtypes(include="number").columns: + if col == analyzer.target_col: + continue + + series = analyzer.df[col] + + # Build per-group arrays, filtering out groups that are too small + groups = [] + for label in groups_labels: + mask = analyzer.df[analyzer.target_col] == label + grp = series[mask].dropna().values + if len(grp) >= _ST.levene_min_group_size: + groups.append(grp) + + if len(groups) < 2: + continue + + try: + stat, p_val = levene(*groups, center="median") # median-centre is most robust + except ValueError: + continue + + if p_val < _ST.levene_p_value: + # Compute per-group stds to add colour to the description + stds = [float(np.std(g, ddof=1)) for g in groups] + std_ratio = max(stds) / min(stds) if min(stds) > 0 else float("inf") + severity = "critical" if std_ratio > 3.0 else "warning" + impact = "high" if severity == "critical" else "medium" + + issues.append( + Issue( + category="variance_homogeneity", + severity=severity, + column=col, + description=( + f"Column '{col}' has unequal variances across '{analyzer.target_col}' groups " + f"(Levene: stat={stat:.4f}, p={p_val:.4g}; std ratio={std_ratio:.2f}×)" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Scale per class: Normalise within each target group before training.\n" + "- Transform feature: Log or sqrt often equalises spread.\n" + "- Use Welch's t-test / robust ANOVA: Accounts for unequal variances.\n" + "- Use tree-based models: Decision trees are invariant to feature scaling." 
+ ), + ) + ) + + return issues diff --git a/hashprep/config.py b/hashprep/config.py index 5c97a4f..8305e3a 100644 --- a/hashprep/config.py +++ b/hashprep/config.py @@ -126,6 +126,22 @@ class ImbalanceThresholds: majority_class_ratio: float = 0.9 +@dataclass(frozen=True) +class StatisticalTestThresholds: + """Thresholds for normality and variance homogeneity tests.""" + + # p-value below which we flag non-normality + normality_p_value: float = 0.05 + # Shapiro-Wilk is used up to this sample size; D'Agostino-Pearson above it + shapiro_max_n: int = 5000 + # Minimum samples to run any normality test + normality_min_n: int = 8 + # p-value below which Levene's test flags unequal variances across groups + levene_p_value: float = 0.05 + # Minimum group size to include a target group in Levene's test + levene_min_group_size: int = 8 + + @dataclass(frozen=True) class DateTimeThresholds: """Thresholds for datetime-specific checks.""" @@ -190,6 +206,7 @@ class HashPrepConfig: drift: DriftThresholds = field(default_factory=DriftThresholds) distribution: DistributionThresholds = field(default_factory=DistributionThresholds) imbalance: ImbalanceThresholds = field(default_factory=ImbalanceThresholds) + statistical_tests: StatisticalTestThresholds = field(default_factory=StatisticalTestThresholds) datetime: DateTimeThresholds = field(default_factory=DateTimeThresholds) type_inference: TypeInferenceConfig = field(default_factory=TypeInferenceConfig) sampling: SamplingDefaults = field(default_factory=SamplingDefaults) diff --git a/hashprep/core/analyzer.py b/hashprep/core/analyzer.py index ab696c8..2c44047 100644 --- a/hashprep/core/analyzer.py +++ b/hashprep/core/analyzer.py @@ -59,6 +59,8 @@ class DatasetAnalyzer: "unique_values", "infinite_values", "constant_length", + "normality", + "variance_homogeneity", ] def __init__( diff --git a/hashprep/summaries/variables.py b/hashprep/summaries/variables.py index a23ef07..afac225 100644 --- a/hashprep/summaries/variables.py +++ 
b/hashprep/summaries/variables.py @@ -4,11 +4,12 @@ import numpy as np import pandas as pd -from scipy.stats import median_abs_deviation +from scipy.stats import median_abs_deviation, normaltest, shapiro from ..config import DEFAULT_CONFIG _SUMMARY = DEFAULT_CONFIG.summaries +_ST = DEFAULT_CONFIG.statistical_tests def get_monotonicity(series: pd.Series) -> str: @@ -85,10 +86,12 @@ def _summarize_numeric(df, col): zeros_percentage = zeros_count / n * 100 negative_count = int((series < 0).sum()) negative_percentage = negative_count / n * 100 - mean_val = float(series.mean()) - min_val = float(series.min()) - max_val = float(series.max()) - q = series.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0]) + # Use finite-only series for distribution statistics to avoid np.histogram crashing + finite = series[np.isfinite(series)] + mean_val = float(finite.mean()) if not finite.empty else float("nan") + min_val = float(finite.min()) if not finite.empty else float("nan") + max_val = float(finite.max()) if not finite.empty else float("nan") + q = finite.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0]) quantiles = { "minimum": float(q[0]), "p5": float(q[0.05]), @@ -100,29 +103,47 @@ def _summarize_numeric(df, col): "range": float(q[1.0] - q[0]), "iqr": float(q[0.75] - q[0.25]), } - cv = float(series.std() / abs(mean_val)) if mean_val != 0 else None + cv = float(finite.std() / abs(mean_val)) if mean_val != 0 else None descriptive = { - "standard_deviation": float(series.std()), + "standard_deviation": float(finite.std()), "coefficient_of_variation": cv, - "kurtosis": float(series.kurtosis()), + "kurtosis": float(finite.kurtosis()), "mean": mean_val, - "mad": float(median_abs_deviation(series)), - "skewness": float(series.skew()), - "sum": float(series.sum()), - "variance": float(series.var()), - "monotonicity": get_monotonicity(series), + "mad": float(median_abs_deviation(finite)), + "skewness": float(finite.skew()), + "sum": float(finite.sum()), + "variance": float(finite.var()), + 
"monotonicity": get_monotonicity(finite), } - hist, bin_edges = np.histogram(series, bins=_SUMMARY.histogram_bins, range=(min_val, max_val)) + hist, bin_edges = np.histogram(finite, bins=_SUMMARY.histogram_bins, range=(min_val, max_val)) histogram = { "bin_edges": [float(x) for x in bin_edges], "counts": [int(x) for x in hist], } - vc = series.value_counts().head(_SUMMARY.top_n_values) + vc = finite.value_counts().head(_SUMMARY.top_n_values) common_values = {str(v): {"count": int(c), "percentage": float(c / n * 100)} for v, c in vc.items()} extremes = { - "minimum_10": [float(x) for x in sorted(series)[: _SUMMARY.extreme_values_count]], - "maximum_10": [float(x) for x in sorted(series)[-_SUMMARY.extreme_values_count :]], + "minimum_10": [float(x) for x in sorted(finite)[: _SUMMARY.extreme_values_count]], + "maximum_10": [float(x) for x in sorted(finite)[-_SUMMARY.extreme_values_count :]], } + # Normality test (Shapiro-Wilk for small n, D'Agostino-Pearson for large n) + normality = None + if n >= _ST.normality_min_n and series.nunique() > 1: + finite = series[np.isfinite(series)] + if len(finite) >= _ST.normality_min_n: + if len(finite) <= _ST.shapiro_max_n: + norm_stat, norm_p = shapiro(finite) + norm_test = "shapiro_wilk" + else: + norm_stat, norm_p = normaltest(finite) + norm_test = "dagostino_pearson" + normality = { + "test": norm_test, + "statistic": float(norm_stat), + "p_value": float(norm_p), + "is_normal": float(norm_p) >= _ST.normality_p_value, + } + stats = { "infinite_count": infinite_count, "infinite_percentage": float(infinite_percentage), @@ -137,6 +158,7 @@ def _summarize_numeric(df, col): "histogram": histogram, "common_values": common_values, "extreme_values": extremes, + "normality": normality, } return stats diff --git a/tests/test_statistical_tests.py b/tests/test_statistical_tests.py new file mode 100644 index 0000000..8ae64d5 --- /dev/null +++ b/tests/test_statistical_tests.py @@ -0,0 +1,271 @@ +"""Tests for statistical checks: normality and 
variance homogeneity.""" + +import numpy as np +import pandas as pd +import pytest + +from hashprep import DatasetAnalyzer +from hashprep.checks.statistical_tests import _check_normality, _check_variance_homogeneity +from hashprep.summaries.variables import _summarize_numeric + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +class _FakeAnalyzer: + def __init__(self, df, target_col=None): + self.df = df + self.target_col = target_col + from hashprep.utils.type_inference import infer_types + + self.column_types = infer_types(df) + + +rng = np.random.default_rng(42) + + +# --------------------------------------------------------------------------- +# Normality check +# --------------------------------------------------------------------------- + + +class TestNormalityCheck: + def test_normal_data_no_issue(self): + df = pd.DataFrame({"x": rng.standard_normal(200)}) + issues = _check_normality(_FakeAnalyzer(df)) + # Normal data should not trigger a flag + assert all(i.column != "x" for i in issues) or len(issues) == 0 + + def test_heavily_skewed_data_flagged(self): + # Exponential distribution is very non-normal + df = pd.DataFrame({"x": rng.exponential(scale=1.0, size=300)}) + issues = _check_normality(_FakeAnalyzer(df)) + assert any(i.category == "normality" and i.column == "x" for i in issues) + + def test_uniform_data_flagged(self): + df = pd.DataFrame({"x": rng.uniform(0, 1, size=200)}) + issues = _check_normality(_FakeAnalyzer(df)) + assert any(i.category == "normality" for i in issues) + + def test_too_few_rows_skipped(self): + df = pd.DataFrame({"x": [1.0, 2.0, 3.0]}) + issues = _check_normality(_FakeAnalyzer(df)) + assert issues == [] + + def test_constant_column_skipped(self): + df = pd.DataFrame({"x": [5.0] * 50}) + issues = _check_normality(_FakeAnalyzer(df)) + assert issues == [] + + def test_all_nan_column_skipped(self): + df = 
pd.DataFrame({"x": [np.nan] * 50, "y": rng.standard_normal(50)}) + categories = [i.column for i in _check_normality(_FakeAnalyzer(df))] + assert "x" not in categories + + def test_large_normal_data_uses_dagostino(self): + # n > 5000 → should switch to D'Agostino-Pearson + df = pd.DataFrame({"big": rng.standard_normal(6000)}) + issues = _check_normality(_FakeAnalyzer(df)) + # Large normal data should not be flagged + assert all(i.column != "big" for i in issues) + + def test_large_non_normal_data_flagged(self): + df = pd.DataFrame({"big": rng.exponential(scale=1.0, size=6000)}) + issues = _check_normality(_FakeAnalyzer(df)) + assert any(i.category == "normality" and i.column == "big" for i in issues) + + def test_issue_fields_populated(self): + df = pd.DataFrame({"x": rng.exponential(1.0, size=200)}) + issues = _check_normality(_FakeAnalyzer(df)) + issue = next(i for i in issues if i.column == "x") + assert issue.category == "normality" + assert issue.severity in ("warning", "critical") + assert "stat=" in issue.description + assert "p=" in issue.description + + def test_new_checks_in_all_checks(self): + assert "normality" in DatasetAnalyzer.ALL_CHECKS + assert "variance_homogeneity" in DatasetAnalyzer.ALL_CHECKS + + +# --------------------------------------------------------------------------- +# Variance homogeneity check (Levene's) +# --------------------------------------------------------------------------- + + +class TestVarianceHomogeneityCheck: + def test_no_target_col_returns_empty(self): + df = pd.DataFrame({"x": rng.standard_normal(100), "y": rng.standard_normal(100)}) + issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col=None)) + assert issues == [] + + def test_equal_variance_across_groups_no_issue(self): + df = pd.DataFrame( + { + "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(5, 1, 100)]), + "target": ["A"] * 100 + ["B"] * 100, + } + ) + issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target")) + assert 
all(i.category != "variance_homogeneity" for i in issues) + + def test_unequal_variance_flagged(self): + # Group A: std=1, Group B: std=20 → Levene should fire + df = pd.DataFrame( + { + "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 20, 100)]), + "target": ["A"] * 100 + ["B"] * 100, + } + ) + issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target")) + assert any(i.category == "variance_homogeneity" and i.column == "x" for i in issues) + + def test_issue_contains_levene_info(self): + df = pd.DataFrame( + { + "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 15, 100)]), + "target": ["A"] * 100 + ["B"] * 100, + } + ) + issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target")) + issue = next(i for i in issues if i.category == "variance_homogeneity") + assert "Levene" in issue.description + assert "stat=" in issue.description + assert "p=" in issue.description + assert "std ratio" in issue.description + + def test_target_col_excluded_from_check(self): + df = pd.DataFrame( + { + "target": [0] * 100 + [1] * 100, + "x": rng.standard_normal(200), + } + ) + issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target")) + assert all(i.column != "target" for i in issues) + + def test_too_few_per_group_skipped(self): + # Only 3 samples per group — below min_group_size (8) + df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 10.0, 20.0, 30.0], "target": ["A", "A", "A", "B", "B", "B"]}) + issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target")) + assert issues == [] + + def test_only_one_valid_group_skipped(self): + # Group B has < min_group_size samples + x = np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 5, 3)]) + target = ["A"] * 100 + ["B"] * 3 + df = pd.DataFrame({"x": x, "target": target}) + issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target")) + assert issues == [] + + +# --------------------------------------------------------------------------- +# Normality in 
numeric summaries +# --------------------------------------------------------------------------- + + +class TestNormalitySummary: + def test_normality_key_present(self): + series = rng.standard_normal(100) + df = pd.DataFrame({"x": series}) + result = _summarize_numeric(df, "x") + assert "normality" in result + + def test_normality_fields(self): + df = pd.DataFrame({"x": rng.standard_normal(100)}) + norm = _summarize_numeric(df, "x")["normality"] + assert norm is not None + assert "test" in norm + assert "statistic" in norm + assert "p_value" in norm + assert "is_normal" in norm + assert isinstance(norm["is_normal"], bool) + assert norm["test"] == "shapiro_wilk" # n=100 < 5000 + + def test_large_sample_uses_dagostino(self): + df = pd.DataFrame({"x": rng.standard_normal(6000)}) + norm = _summarize_numeric(df, "x")["normality"] + assert norm["test"] == "dagostino_pearson" + + def test_normality_none_for_too_few_rows(self): + df = pd.DataFrame({"x": [1.0, 2.0, 3.0]}) + result = _summarize_numeric(df, "x") + assert result["normality"] is None + + def test_normality_none_for_constant_col(self): + df = pd.DataFrame({"x": [5.0] * 50}) + result = _summarize_numeric(df, "x") + assert result["normality"] is None + + def test_non_normal_data_is_normal_false(self): + # Very heavy exponential tail — p should be well below 0.05 + df = pd.DataFrame({"x": rng.exponential(1.0, size=500)}) + norm = _summarize_numeric(df, "x")["normality"] + assert norm["is_normal"] is False + + def test_normality_handles_inf_values(self): + # Infinite values should be excluded before the test + values = list(rng.standard_normal(100)) + [np.inf, -np.inf] + df = pd.DataFrame({"x": values}) + result = _summarize_numeric(df, "x") + # Should not raise; normality may be None or a valid result + assert "normality" in result + + +# --------------------------------------------------------------------------- +# Integration: DatasetAnalyzer end-to-end +# 
--------------------------------------------------------------------------- + + +class TestStatisticalTestsIntegration: + def test_normality_check_runs_in_full_analysis(self): + df = pd.DataFrame( + { + "normal_col": rng.standard_normal(200), + "skewed_col": rng.exponential(1.0, size=200), + } + ) + analyzer = DatasetAnalyzer(df, selected_checks=["normality"], auto_sample=False) + summary = analyzer.analyze() + categories = [i["category"] for i in summary["issues"]] + assert "normality" in categories + + def test_variance_homogeneity_runs_with_target(self): + df = pd.DataFrame( + { + "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 15, 100)]), + "target": ["A"] * 100 + ["B"] * 100, + } + ) + analyzer = DatasetAnalyzer(df, target_col="target", selected_checks=["variance_homogeneity"], auto_sample=False) + summary = analyzer.analyze() + categories = [i["category"] for i in summary["issues"]] + assert "variance_homogeneity" in categories + + def test_normality_summary_present_in_full_analysis(self): + df = pd.DataFrame({"val": rng.standard_normal(100), "cat": ["A", "B"] * 50}) + analyzer = DatasetAnalyzer(df, auto_sample=False) + summary = analyzer.analyze() + var = summary["summaries"]["variables"]["val"] + assert "normality" in var + assert var["normality"] is not None + + def test_no_crash_without_target(self): + df = pd.DataFrame({"a": rng.standard_normal(100), "b": rng.exponential(1, 100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["variance_homogeneity"], auto_sample=False) + summary = analyzer.analyze() + assert summary["total_issues"] == 0 + + @pytest.mark.parametrize("check", ["normality", "variance_homogeneity"]) + def test_check_selectable_via_selected_checks(self, check): + df = pd.DataFrame( + { + "x": rng.exponential(1.0, size=100), + "target": ["A"] * 50 + ["B"] * 50, + } + ) + analyzer = DatasetAnalyzer(df, target_col="target", selected_checks=[check], auto_sample=False) + summary = analyzer.analyze() + # Should run without error and 
return a dict + assert "issues" in summary From aee4bcf4d21730c24f5e6a8c0767909189742bac Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Tue, 3 Mar 2026 16:20:16 +0530 Subject: [PATCH 2/2] style: apply ruff format to statistical_tests.py --- hashprep/checks/statistical_tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hashprep/checks/statistical_tests.py b/hashprep/checks/statistical_tests.py index 27a6351..04c42e4 100644 --- a/hashprep/checks/statistical_tests.py +++ b/hashprep/checks/statistical_tests.py @@ -55,9 +55,7 @@ def _check_normality(analyzer) -> list[Issue]: category="normality", severity=severity, column=col, - description=( - f"Column '{col}' is non-normal ({test_label}: stat={stat:.4f}, p={p_val:.4g}, n={n})" - ), + description=(f"Column '{col}' is non-normal ({test_label}: stat={stat:.4f}, p={p_val:.4g}, n={n})"), impact_score=impact, quick_fix=( "Options:\n"