Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions hashprep/checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
_check_outliers,
_check_skewness,
)
from .statistical_tests import _check_normality, _check_variance_homogeneity


def _check_dataset_drift(analyzer):
Expand Down Expand Up @@ -57,6 +58,8 @@ def _check_dataset_drift(analyzer):
"infinite_values": _check_infinite_values,
"constant_length": _check_constant_length,
"empty_dataset": _check_empty_dataset,
"normality": _check_normality,
"variance_homogeneity": _check_variance_homogeneity,
}

CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"}
Expand Down
140 changes: 140 additions & 0 deletions hashprep/checks/statistical_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""
Statistical tests: normality (Shapiro-Wilk / D'Agostino-Pearson) and
variance homogeneity (Levene's test across target-column groups).
"""

import numpy as np
from scipy.stats import levene, normaltest, shapiro

from ..config import DEFAULT_CONFIG
from .core import Issue

_ST = DEFAULT_CONFIG.statistical_tests


def _run_normality_test(series) -> tuple[str, float, float]:
    """
    Pick and run the normality test best suited to the sample size.

    Shapiro-Wilk is applied when the sample has at most ``shapiro_max_n``
    observations; the D'Agostino-Pearson test is used for larger samples.

    Returns:
        A ``(test_name, statistic, p_value)`` tuple, where ``test_name`` is
        either ``"shapiro_wilk"`` or ``"dagostino_pearson"``.
    """
    small_sample = len(series) <= _ST.shapiro_max_n
    test_fn = shapiro if small_sample else normaltest
    label = "shapiro_wilk" if small_sample else "dagostino_pearson"
    statistic, p_value = test_fn(series)
    return label, float(statistic), float(p_value)


def _check_normality(analyzer) -> list[Issue]:
    """
    Flag numeric columns whose distribution is significantly non-normal.

    The test is selected by sample size via ``_run_normality_test``
    (Shapiro-Wilk for small samples, D'Agostino-Pearson for large ones).
    Non-normality matters for linear models, t-tests, and certain
    imputation strategies.
    """
    quick_fix_text = (
        "Options:\n"
        "- Transform: Log, sqrt, or Box-Cox/Yeo-Johnson often normalise skewed data.\n"
        "- Use robust models: Tree-based models (XGBoost, RF) make no normality assumption.\n"
        "- Normalise for linear models: Required for OLS residuals and LDA.\n"
        "- Investigate outliers: Extreme values are a common cause of non-normality."
    )
    found: list[Issue] = []

    for column in analyzer.df.select_dtypes(include="number").columns:
        values = analyzer.df[column].dropna()
        sample_size = len(values)
        # Too few observations, or a constant column: no meaningful test.
        if sample_size < _ST.normality_min_n or values.nunique() <= 1:
            continue

        test_name, stat, p_val = _run_normality_test(values)
        if p_val >= _ST.normality_p_value:
            continue  # consistent with normality — nothing to flag

        # Very strong evidence (p < 0.001) escalates severity and impact.
        is_critical = p_val < 0.001
        pretty_name = "Shapiro-Wilk" if test_name == "shapiro_wilk" else "D'Agostino-Pearson"
        found.append(
            Issue(
                category="normality",
                severity="critical" if is_critical else "warning",
                column=column,
                description=(
                    f"Column '{column}' is non-normal "
                    f"({pretty_name}: stat={stat:.4f}, p={p_val:.4g}, n={sample_size})"
                ),
                impact_score="high" if is_critical else "medium",
                quick_fix=quick_fix_text,
            )
        )

    return found


def _check_variance_homogeneity(analyzer) -> list[Issue]:
    """
    Run Levene's test across groups defined by the target column.

    Heteroscedasticity across classes violates the assumptions of linear
    discriminant analysis and ANOVA, and also signals scale differences
    that can hurt distance-based models.

    The check is a no-op unless a target column is set and at least two
    groups contain enough data.
    """
    if analyzer.target_col is None:
        return []

    quick_fix_text = (
        "Options:\n"
        "- Scale per class: Normalise within each target group before training.\n"
        "- Transform feature: Log or sqrt often equalises spread.\n"
        "- Use Welch's t-test / robust ANOVA: Accounts for unequal variances.\n"
        "- Use tree-based models: Decision trees are invariant to feature scaling."
    )

    target_name = analyzer.target_col
    labels = analyzer.df[target_name].dropna().unique()
    target_values = analyzer.df[target_name]
    results: list[Issue] = []

    for column in analyzer.df.select_dtypes(include="number").columns:
        if column == target_name:
            continue

        feature = analyzer.df[column]
        # Per-group value arrays; undersized groups are excluded entirely.
        samples = [
            arr
            for arr in (feature[target_values == lbl].dropna().values for lbl in labels)
            if len(arr) >= _ST.levene_min_group_size
        ]
        if len(samples) < 2:
            continue

        try:
            # Median-centred Levene is the most robust variant.
            stat, p_val = levene(*samples, center="median")
        except ValueError:
            continue

        if p_val >= _ST.levene_p_value:
            continue

        # Spread of per-group standard deviations adds colour to the
        # description and scales the severity.
        group_stds = [float(np.std(arr, ddof=1)) for arr in samples]
        smallest = min(group_stds)
        std_ratio = max(group_stds) / smallest if smallest > 0 else float("inf")
        is_critical = std_ratio > 3.0
        results.append(
            Issue(
                category="variance_homogeneity",
                severity="critical" if is_critical else "warning",
                column=column,
                description=(
                    f"Column '{column}' has unequal variances across '{target_name}' groups "
                    f"(Levene: stat={stat:.4f}, p={p_val:.4g}; std ratio={std_ratio:.2f}×)"
                ),
                impact_score="high" if is_critical else "medium",
                quick_fix=quick_fix_text,
            )
        )

    return results
17 changes: 17 additions & 0 deletions hashprep/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,22 @@ class ImbalanceThresholds:
majority_class_ratio: float = 0.9


@dataclass(frozen=True)
class StatisticalTestThresholds:
    """Thresholds for normality and variance homogeneity tests.

    Immutable configuration consumed by the normality check (Shapiro-Wilk /
    D'Agostino-Pearson) and the variance-homogeneity check (Levene's test
    across target-column groups).
    """

    # p-value below which we flag non-normality
    normality_p_value: float = 0.05
    # Shapiro-Wilk is used up to this sample size; D'Agostino-Pearson above it
    shapiro_max_n: int = 5000
    # Minimum samples (after dropping NaNs) to run any normality test
    normality_min_n: int = 8
    # p-value below which Levene's test flags unequal variances across groups
    levene_p_value: float = 0.05
    # Minimum group size for a target group to be included in Levene's test
    levene_min_group_size: int = 8


@dataclass(frozen=True)
class DateTimeThresholds:
"""Thresholds for datetime-specific checks."""
Expand Down Expand Up @@ -190,6 +206,7 @@ class HashPrepConfig:
drift: DriftThresholds = field(default_factory=DriftThresholds)
distribution: DistributionThresholds = field(default_factory=DistributionThresholds)
imbalance: ImbalanceThresholds = field(default_factory=ImbalanceThresholds)
statistical_tests: StatisticalTestThresholds = field(default_factory=StatisticalTestThresholds)
datetime: DateTimeThresholds = field(default_factory=DateTimeThresholds)
type_inference: TypeInferenceConfig = field(default_factory=TypeInferenceConfig)
sampling: SamplingDefaults = field(default_factory=SamplingDefaults)
Expand Down
2 changes: 2 additions & 0 deletions hashprep/core/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class DatasetAnalyzer:
"unique_values",
"infinite_values",
"constant_length",
"normality",
"variance_homogeneity",
]

def __init__(
Expand Down
56 changes: 39 additions & 17 deletions hashprep/summaries/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import numpy as np
import pandas as pd
from scipy.stats import median_abs_deviation
from scipy.stats import median_abs_deviation, normaltest, shapiro

from ..config import DEFAULT_CONFIG

_SUMMARY = DEFAULT_CONFIG.summaries
_ST = DEFAULT_CONFIG.statistical_tests


def get_monotonicity(series: pd.Series) -> str:
Expand Down Expand Up @@ -85,10 +86,12 @@ def _summarize_numeric(df, col):
zeros_percentage = zeros_count / n * 100
negative_count = int((series < 0).sum())
negative_percentage = negative_count / n * 100
mean_val = float(series.mean())
min_val = float(series.min())
max_val = float(series.max())
q = series.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
# Use finite-only series for distribution statistics to avoid np.histogram crashing
finite = series[np.isfinite(series)]
mean_val = float(finite.mean()) if not finite.empty else float("nan")
min_val = float(finite.min()) if not finite.empty else float("nan")
max_val = float(finite.max()) if not finite.empty else float("nan")
q = finite.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
quantiles = {
"minimum": float(q[0]),
"p5": float(q[0.05]),
Expand All @@ -100,29 +103,47 @@ def _summarize_numeric(df, col):
"range": float(q[1.0] - q[0]),
"iqr": float(q[0.75] - q[0.25]),
}
cv = float(series.std() / abs(mean_val)) if mean_val != 0 else None
cv = float(finite.std() / abs(mean_val)) if mean_val != 0 else None
descriptive = {
"standard_deviation": float(series.std()),
"standard_deviation": float(finite.std()),
"coefficient_of_variation": cv,
"kurtosis": float(series.kurtosis()),
"kurtosis": float(finite.kurtosis()),
"mean": mean_val,
"mad": float(median_abs_deviation(series)),
"skewness": float(series.skew()),
"sum": float(series.sum()),
"variance": float(series.var()),
"monotonicity": get_monotonicity(series),
"mad": float(median_abs_deviation(finite)),
"skewness": float(finite.skew()),
"sum": float(finite.sum()),
"variance": float(finite.var()),
"monotonicity": get_monotonicity(finite),
}
hist, bin_edges = np.histogram(series, bins=_SUMMARY.histogram_bins, range=(min_val, max_val))
hist, bin_edges = np.histogram(finite, bins=_SUMMARY.histogram_bins, range=(min_val, max_val))
histogram = {
"bin_edges": [float(x) for x in bin_edges],
"counts": [int(x) for x in hist],
}
vc = series.value_counts().head(_SUMMARY.top_n_values)
vc = finite.value_counts().head(_SUMMARY.top_n_values)
common_values = {str(v): {"count": int(c), "percentage": float(c / n * 100)} for v, c in vc.items()}
extremes = {
"minimum_10": [float(x) for x in sorted(series)[: _SUMMARY.extreme_values_count]],
"maximum_10": [float(x) for x in sorted(series)[-_SUMMARY.extreme_values_count :]],
"minimum_10": [float(x) for x in sorted(finite)[: _SUMMARY.extreme_values_count]],
"maximum_10": [float(x) for x in sorted(finite)[-_SUMMARY.extreme_values_count :]],
}
# Normality test (Shapiro-Wilk for small n, D'Agostino-Pearson for large n)
normality = None
if n >= _ST.normality_min_n and series.nunique() > 1:
finite = series[np.isfinite(series)]
if len(finite) >= _ST.normality_min_n:
if len(finite) <= _ST.shapiro_max_n:
norm_stat, norm_p = shapiro(finite)
norm_test = "shapiro_wilk"
else:
norm_stat, norm_p = normaltest(finite)
norm_test = "dagostino_pearson"
normality = {
"test": norm_test,
"statistic": float(norm_stat),
"p_value": float(norm_p),
"is_normal": float(norm_p) >= _ST.normality_p_value,
}

stats = {
"infinite_count": infinite_count,
"infinite_percentage": float(infinite_percentage),
Expand All @@ -137,6 +158,7 @@ def _summarize_numeric(df, col):
"histogram": histogram,
"common_values": common_values,
"extreme_values": extremes,
"normality": normality,
}
return stats

Expand Down
Loading