From cfae1360b578e24daa18f65fae8374cfb9cbf537 Mon Sep 17 00:00:00 2001
From: maskedsyntax <aftaab@aftaab.xyz>
Date: Tue, 3 Mar 2026 16:15:46 +0530
Subject: [PATCH 1/2] feat: add normality and variance homogeneity statistical
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- config.py: add StatisticalTestThresholds (normality p-value, Shapiro
  max-n cutoff, min sample size, Levene p-value and min group size)
  wired into HashPrepConfig
- checks/statistical_tests.py: two new checks —
    - normality: Shapiro-Wilk (n ≤ 5000) or D'Agostino-Pearson (n > 5000)
      per numeric column; flags non-normal distributions with stat + p-value
      in the issue description
    - variance_homogeneity: Levene's test (median-centred, robust) across
      target-column groups; reports std ratio alongside the test result;
      skipped when no target column is set or groups are too small
- summaries/variables.py: embed normality result (test name, statistic,
  p_value, is_normal bool) into each numeric column's summary dict; also
  fix a pre-existing crash where infinite values in a column caused
  np.histogram to receive range=(-inf, inf) — all distribution stats
  now computed on finite-only values
- checks/__init__.py + core/analyzer.py: register normality and
  variance_homogeneity in CHECKS registry and ALL_CHECKS list
- tests/test_statistical_tests.py: 30 tests covering normality check
  unit, Levene check unit, summary embedding, and end-to-end integration
  via DatasetAnalyzer; all 180 tests pass (150 existing + 30 new)
---
 hashprep/checks/__init__.py          |   3 +
 hashprep/checks/statistical_tests.py | 142 ++++++++++++++
 hashprep/config.py                   |  17 ++
 hashprep/core/analyzer.py            |   2 +
 hashprep/summaries/variables.py      |  56 ++++--
 tests/test_statistical_tests.py      | 271 +++++++++++++++++++++++++++
 6 files changed, 474 insertions(+), 17 deletions(-)
 create mode 100644 hashprep/checks/statistical_tests.py
 create mode 100644 tests/test_statistical_tests.py

diff --git a/hashprep/checks/__init__.py b/hashprep/checks/__init__.py
index ccd8379..cf63f83 100644
--- a/hashprep/checks/__init__.py
+++ b/hashprep/checks/__init__.py
@@ -22,6 +22,7 @@
     _check_outliers,
     _check_skewness,
 )
+from .statistical_tests import _check_normality, _check_variance_homogeneity
 
 
 def _check_dataset_drift(analyzer):
@@ -57,6 +58,8 @@ def _check_dataset_drift(analyzer):
     "infinite_values": _check_infinite_values,
     "constant_length": _check_constant_length,
     "empty_dataset": _check_empty_dataset,
+    "normality": _check_normality,
+    "variance_homogeneity": _check_variance_homogeneity,
 }
 
 CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"}
diff --git a/hashprep/checks/statistical_tests.py b/hashprep/checks/statistical_tests.py
new file mode 100644
index 0000000..27a6351
--- /dev/null
+++ b/hashprep/checks/statistical_tests.py
@@ -0,0 +1,142 @@
+"""
+Statistical tests: normality (Shapiro-Wilk / D'Agostino-Pearson) and
+variance homogeneity (Levene's test across target-column groups).
+"""
+
+import numpy as np
+from scipy.stats import levene, normaltest, shapiro
+
+from ..config import DEFAULT_CONFIG
+from .core import Issue
+
+_ST = DEFAULT_CONFIG.statistical_tests
+
+
+def _run_normality_test(series) -> tuple[str, float, float]:
+    """
+    Choose a normality test by sample size and return (test_name, statistic, p_value).
+    Samples of up to shapiro_max_n values get Shapiro-Wilk; larger ones D'Agostino-Pearson.
+    """
+    # Resolve the label and the scipy callable first so the call happens in one place.
+    if len(series) <= _ST.shapiro_max_n:
+        test_name, run_test = "shapiro_wilk", shapiro
+    else:
+        test_name, run_test = "dagostino_pearson", normaltest
+    statistic, p_value = run_test(series)
+    return test_name, float(statistic), float(p_value)
+
+
+def _check_normality(analyzer) -> list[Issue]:
+    """
+    Flag numeric columns whose distribution is significantly non-normal.
+    Uses Shapiro-Wilk for n <= shapiro_max_n, D'Agostino-Pearson for larger samples.
+    Non-normality matters for linear models, t-tests, and certain imputation strategies.
+    Columns with fewer than normality_min_n finite values, or one distinct value, are skipped.
+    """
+    issues = []
+
+    for col in analyzer.df.select_dtypes(include="number").columns:
+        # Test finite values only: ±inf would make the p-value NaN and silently hide the column.
+        series = analyzer.df[col].dropna()
+        series = series[np.isfinite(series)]
+        n = len(series)
+        if n < _ST.normality_min_n or series.nunique() <= 1:
+            continue
+
+        test_name, stat, p_val = _run_normality_test(series)
+        if p_val < _ST.normality_p_value:
+            # Severity: very small p → critical (strong evidence), otherwise warning
+            severity = "critical" if p_val < 0.001 else "warning"
+            impact = "high" if severity == "critical" else "medium"
+            test_label = "Shapiro-Wilk" if test_name == "shapiro_wilk" else "D'Agostino-Pearson"
+
+            issues.append(
+                Issue(
+                    category="normality",
+                    severity=severity,
+                    column=col,
+                    description=(
+                        f"Column '{col}' is non-normal ({test_label}: stat={stat:.4f}, p={p_val:.4g}, n={n})"
+                    ),
+                    impact_score=impact,
+                    quick_fix=(
+                        "Options:\n"
+                        "- Transform: Log, sqrt, or Box-Cox/Yeo-Johnson often normalise skewed data.\n"
+                        "- Use robust models: Tree-based models (XGBoost, RF) make no normality assumption.\n"
+                        "- Normalise for linear models: Required for OLS residuals and LDA.\n"
+                        "- Investigate outliers: Extreme values are a common cause of non-normality."
+                    ),
+                )
+            )
+
+    return issues
+
+
+def _check_variance_homogeneity(analyzer) -> list[Issue]:
+    """
+    Run Levene's test across groups defined by the target column.
+    Unequal variances (heteroscedasticity) across classes violate assumptions of
+    linear discriminant analysis and ANOVA; they also indicate scale differences
+    that may harm distance-based models.
+
+    Only runs when a target column is set and has at least 2 groups with
+    sufficient finite data; returns one Issue per numeric column that is flagged.
+    """
+    issues = []
+
+    if analyzer.target_col is None:
+        return issues
+
+    # Row positions per target label (NaN labels dropped), computed once for all columns.
+    target = analyzer.df[analyzer.target_col]
+    group_rows = list(target.groupby(target, sort=False, observed=True).indices.values())
+
+    for col in analyzer.df.select_dtypes(include="number").columns:
+        if col == analyzer.target_col:
+            continue
+
+        series = analyzer.df[col]
+        # Build per-group arrays, filtering out groups that are too small
+        groups = []
+        for rows in group_rows:
+            grp = series.iloc[rows].dropna()
+            grp = grp[np.isfinite(grp)].values  # ±inf would make Levene's statistic NaN
+            if len(grp) >= _ST.levene_min_group_size:
+                groups.append(grp)
+
+        if len(groups) < 2:
+            continue
+
+        try:
+            stat, p_val = levene(*groups, center="median")  # median-centre is most robust
+        except ValueError:
+            continue
+
+        if p_val < _ST.levene_p_value:
+            # Compute per-group stds to add colour to the description
+            stds = [float(np.std(g, ddof=1)) for g in groups]
+            std_ratio = max(stds) / min(stds) if min(stds) > 0 else float("inf")
+            severity = "critical" if std_ratio > 3.0 else "warning"
+            impact = "high" if severity == "critical" else "medium"
+
+            issues.append(
+                Issue(
+                    category="variance_homogeneity",
+                    severity=severity,
+                    column=col,
+                    description=(
+                        f"Column '{col}' has unequal variances across '{analyzer.target_col}' groups "
+                        f"(Levene: stat={stat:.4f}, p={p_val:.4g}; std ratio={std_ratio:.2f}×)"
+                    ),
+                    impact_score=impact,
+                    quick_fix=(
+                        "Options:\n"
+                        "- Scale per class: Normalise within each target group before training.\n"
+                        "- Transform feature: Log or sqrt often equalises spread.\n"
+                        "- Use Welch's t-test / robust ANOVA: Accounts for unequal variances.\n"
+                        "- Use tree-based models: Decision trees are invariant to feature scaling."
+                    ),
+                )
+            )
+
+    return issues
diff --git a/hashprep/config.py b/hashprep/config.py
index 5c97a4f..8305e3a 100644
--- a/hashprep/config.py
+++ b/hashprep/config.py
@@ -126,6 +126,22 @@ class ImbalanceThresholds:
     majority_class_ratio: float = 0.9
 
 
+@dataclass(frozen=True)
+class StatisticalTestThresholds:
+    """Thresholds for normality and variance homogeneity tests."""
+
+    # Significance level: a normality p-value strictly below this flags the column
+    normality_p_value: float = 0.05
+    # Largest sample size (inclusive) tested with Shapiro-Wilk; D'Agostino-Pearson above it
+    shapiro_max_n: int = 5000
+    # Minimum number of values required before any normality test is run
+    normality_min_n: int = 8
+    # Significance level: a Levene p-value strictly below this flags unequal group variances
+    levene_p_value: float = 0.05
+    # Minimum number of values a target group needs to be included in Levene's test
+    levene_min_group_size: int = 8
+
+
 @dataclass(frozen=True)
 class DateTimeThresholds:
     """Thresholds for datetime-specific checks."""
@@ -190,6 +206,7 @@ class HashPrepConfig:
     drift: DriftThresholds = field(default_factory=DriftThresholds)
     distribution: DistributionThresholds = field(default_factory=DistributionThresholds)
     imbalance: ImbalanceThresholds = field(default_factory=ImbalanceThresholds)
+    statistical_tests: StatisticalTestThresholds = field(default_factory=StatisticalTestThresholds)
     datetime: DateTimeThresholds = field(default_factory=DateTimeThresholds)
     type_inference: TypeInferenceConfig = field(default_factory=TypeInferenceConfig)
     sampling: SamplingDefaults = field(default_factory=SamplingDefaults)
diff --git a/hashprep/core/analyzer.py b/hashprep/core/analyzer.py
index ab696c8..2c44047 100644
--- a/hashprep/core/analyzer.py
+++ b/hashprep/core/analyzer.py
@@ -59,6 +59,8 @@ class DatasetAnalyzer:
         "unique_values",
         "infinite_values",
         "constant_length",
+        "normality",
+        "variance_homogeneity",
     ]
 
     def __init__(
diff --git a/hashprep/summaries/variables.py b/hashprep/summaries/variables.py
index a23ef07..afac225 100644
--- a/hashprep/summaries/variables.py
+++ b/hashprep/summaries/variables.py
@@ -4,11 +4,12 @@
 
 import numpy as np
 import pandas as pd
-from scipy.stats import median_abs_deviation
+from scipy.stats import median_abs_deviation, normaltest, shapiro
 
 from ..config import DEFAULT_CONFIG
 
 _SUMMARY = DEFAULT_CONFIG.summaries
+_ST = DEFAULT_CONFIG.statistical_tests
 
 
 def get_monotonicity(series: pd.Series) -> str:
@@ -85,10 +86,12 @@ def _summarize_numeric(df, col):
     zeros_percentage = zeros_count / n * 100
     negative_count = int((series < 0).sum())
     negative_percentage = negative_count / n * 100
-    mean_val = float(series.mean())
-    min_val = float(series.min())
-    max_val = float(series.max())
-    q = series.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
+    # Use finite-only series for distribution statistics to avoid np.histogram crashing
+    finite = series[np.isfinite(series)]
+    mean_val = float(finite.mean()) if not finite.empty else float("nan")
+    min_val = float(finite.min()) if not finite.empty else float("nan")
+    max_val = float(finite.max()) if not finite.empty else float("nan")
+    q = finite.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
     quantiles = {
         "minimum": float(q[0]),
         "p5": float(q[0.05]),
@@ -100,29 +103,47 @@ def _summarize_numeric(df, col):
         "range": float(q[1.0] - q[0]),
         "iqr": float(q[0.75] - q[0.25]),
     }
-    cv = float(series.std() / abs(mean_val)) if mean_val != 0 else None
+    cv = float(finite.std() / abs(mean_val)) if mean_val != 0 else None
     descriptive = {
-        "standard_deviation": float(series.std()),
+        "standard_deviation": float(finite.std()),
         "coefficient_of_variation": cv,
-        "kurtosis": float(series.kurtosis()),
+        "kurtosis": float(finite.kurtosis()),
         "mean": mean_val,
-        "mad": float(median_abs_deviation(series)),
-        "skewness": float(series.skew()),
-        "sum": float(series.sum()),
-        "variance": float(series.var()),
-        "monotonicity": get_monotonicity(series),
+        "mad": float(median_abs_deviation(finite)),
+        "skewness": float(finite.skew()),
+        "sum": float(finite.sum()),
+        "variance": float(finite.var()),
+        "monotonicity": get_monotonicity(finite),
     }
-    hist, bin_edges = np.histogram(series, bins=_SUMMARY.histogram_bins, range=(min_val, max_val))
+    hist, bin_edges = np.histogram(finite, bins=_SUMMARY.histogram_bins, range=(min_val, max_val))
     histogram = {
         "bin_edges": [float(x) for x in bin_edges],
         "counts": [int(x) for x in hist],
     }
-    vc = series.value_counts().head(_SUMMARY.top_n_values)
+    vc = finite.value_counts().head(_SUMMARY.top_n_values)
     common_values = {str(v): {"count": int(c), "percentage": float(c / n * 100)} for v, c in vc.items()}
     extremes = {
-        "minimum_10": [float(x) for x in sorted(series)[: _SUMMARY.extreme_values_count]],
-        "maximum_10": [float(x) for x in sorted(series)[-_SUMMARY.extreme_values_count :]],
+        "minimum_10": [float(x) for x in sorted(finite)[: _SUMMARY.extreme_values_count]],
+        "maximum_10": [float(x) for x in sorted(finite)[-_SUMMARY.extreme_values_count :]],
     }
+    # Normality test (Shapiro-Wilk for small n, D'Agostino-Pearson for large n)
+    normality = None
+    if n >= _ST.normality_min_n and series.nunique() > 1:
+        finite = series[np.isfinite(series)]
+        if len(finite) >= _ST.normality_min_n:
+            if len(finite) <= _ST.shapiro_max_n:
+                norm_stat, norm_p = shapiro(finite)
+                norm_test = "shapiro_wilk"
+            else:
+                norm_stat, norm_p = normaltest(finite)
+                norm_test = "dagostino_pearson"
+            normality = {
+                "test": norm_test,
+                "statistic": float(norm_stat),
+                "p_value": float(norm_p),
+                "is_normal": float(norm_p) >= _ST.normality_p_value,
+            }
+
     stats = {
         "infinite_count": infinite_count,
         "infinite_percentage": float(infinite_percentage),
@@ -137,6 +158,7 @@ def _summarize_numeric(df, col):
         "histogram": histogram,
         "common_values": common_values,
         "extreme_values": extremes,
+        "normality": normality,
     }
     return stats
 
diff --git a/tests/test_statistical_tests.py b/tests/test_statistical_tests.py
new file mode 100644
index 0000000..8ae64d5
--- /dev/null
+++ b/tests/test_statistical_tests.py
@@ -0,0 +1,271 @@
+"""Tests for statistical checks: normality and variance homogeneity."""
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from hashprep import DatasetAnalyzer
+from hashprep.checks.statistical_tests import _check_normality, _check_variance_homogeneity
+from hashprep.summaries.variables import _summarize_numeric
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+class _FakeAnalyzer:
+    def __init__(self, df, target_col=None):
+        self.df = df
+        self.target_col = target_col
+        from hashprep.utils.type_inference import infer_types
+        # Lightweight test double: only df, target_col and column_types are set.
+        self.column_types = infer_types(df)
+
+
+rng = np.random.default_rng(42)
+
+
+# ---------------------------------------------------------------------------
+# Normality check
+# ---------------------------------------------------------------------------
+
+
+class TestNormalityCheck:
+    def test_normal_data_no_issue(self):
+        df = pd.DataFrame({"x": rng.standard_normal(200)})
+        issues = _check_normality(_FakeAnalyzer(df))
+        # Normal data should not trigger a flag (all() is already True for an empty list)
+        assert all(i.column != "x" for i in issues)
+
+    def test_heavily_skewed_data_flagged(self):
+        # Exponential distribution is very non-normal
+        df = pd.DataFrame({"x": rng.exponential(scale=1.0, size=300)})
+        issues = _check_normality(_FakeAnalyzer(df))
+        assert any(i.category == "normality" and i.column == "x" for i in issues)
+
+    def test_uniform_data_flagged(self):
+        df = pd.DataFrame({"x": rng.uniform(0, 1, size=200)})
+        issues = _check_normality(_FakeAnalyzer(df))
+        assert any(i.category == "normality" for i in issues)
+
+    def test_too_few_rows_skipped(self):
+        df = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
+        issues = _check_normality(_FakeAnalyzer(df))
+        assert issues == []
+
+    def test_constant_column_skipped(self):
+        df = pd.DataFrame({"x": [5.0] * 50})
+        issues = _check_normality(_FakeAnalyzer(df))
+        assert issues == []
+
+    def test_all_nan_column_skipped(self):
+        df = pd.DataFrame({"x": [np.nan] * 50, "y": rng.standard_normal(50)})
+        flagged_columns = [i.column for i in _check_normality(_FakeAnalyzer(df))]
+        assert "x" not in flagged_columns
+
+    def test_large_normal_data_uses_dagostino(self):
+        # n > 5000 → the check takes the D'Agostino-Pearson branch
+        df = pd.DataFrame({"big": rng.standard_normal(6000)})
+        issues = _check_normality(_FakeAnalyzer(df))
+        # Large normal data should not be flagged
+        assert all(i.column != "big" for i in issues)
+
+    def test_large_non_normal_data_flagged(self):
+        df = pd.DataFrame({"big": rng.exponential(scale=1.0, size=6000)})
+        issues = _check_normality(_FakeAnalyzer(df))
+        assert any(i.category == "normality" and i.column == "big" for i in issues)
+
+    def test_issue_fields_populated(self):
+        df = pd.DataFrame({"x": rng.exponential(1.0, size=200)})
+        issues = _check_normality(_FakeAnalyzer(df))
+        issue = next(i for i in issues if i.column == "x")
+        assert issue.category == "normality"
+        assert issue.severity in ("warning", "critical")
+        assert "stat=" in issue.description
+        assert "p=" in issue.description
+
+    def test_new_checks_in_all_checks(self):
+        assert "normality" in DatasetAnalyzer.ALL_CHECKS
+        assert "variance_homogeneity" in DatasetAnalyzer.ALL_CHECKS
+
+
+# ---------------------------------------------------------------------------
+# Variance homogeneity check (Levene's)
+# ---------------------------------------------------------------------------
+
+
+class TestVarianceHomogeneityCheck:
+    def test_no_target_col_returns_empty(self):
+        df = pd.DataFrame({"x": rng.standard_normal(100), "y": rng.standard_normal(100)})
+        issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col=None))
+        assert issues == []
+
+    def test_equal_variance_across_groups_no_issue(self):
+        df = pd.DataFrame(
+            {
+                "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(5, 1, 100)]),
+                "target": ["A"] * 100 + ["B"] * 100,
+            }
+        )
+        issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target"))
+        assert all(i.category != "variance_homogeneity" for i in issues)
+
+    def test_unequal_variance_flagged(self):
+        # Group A: std=1, Group B: std=20 → Levene's test should reject equal variances
+        df = pd.DataFrame(
+            {
+                "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 20, 100)]),
+                "target": ["A"] * 100 + ["B"] * 100,
+            }
+        )
+        issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target"))
+        assert any(i.category == "variance_homogeneity" and i.column == "x" for i in issues)
+
+    def test_issue_contains_levene_info(self):
+        df = pd.DataFrame(
+            {
+                "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 15, 100)]),
+                "target": ["A"] * 100 + ["B"] * 100,
+            }
+        )
+        issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target"))
+        issue = next(i for i in issues if i.category == "variance_homogeneity")
+        assert "Levene" in issue.description
+        assert "stat=" in issue.description
+        assert "p=" in issue.description
+        assert "std ratio" in issue.description
+
+    def test_target_col_excluded_from_check(self):
+        df = pd.DataFrame(
+            {
+                "target": [0] * 100 + [1] * 100,
+                "x": rng.standard_normal(200),
+            }
+        )
+        issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target"))
+        assert all(i.column != "target" for i in issues)
+
+    def test_too_few_per_group_skipped(self):
+        # Only 3 samples per group — below levene_min_group_size (default 8), so no group qualifies
+        df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 10.0, 20.0, 30.0], "target": ["A", "A", "A", "B", "B", "B"]})
+        issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target"))
+        assert issues == []
+
+    def test_only_one_valid_group_skipped(self):
+        # Group B has only 3 samples (< levene_min_group_size), leaving a single usable group
+        x = np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 5, 3)])
+        target = ["A"] * 100 + ["B"] * 3
+        df = pd.DataFrame({"x": x, "target": target})
+        issues = _check_variance_homogeneity(_FakeAnalyzer(df, target_col="target"))
+        assert issues == []
+
+
+# ---------------------------------------------------------------------------
+# Normality in numeric summaries
+# ---------------------------------------------------------------------------
+
+
+class TestNormalitySummary:
+    def test_normality_key_present(self):
+        series = rng.standard_normal(100)
+        df = pd.DataFrame({"x": series})
+        result = _summarize_numeric(df, "x")
+        assert "normality" in result
+
+    def test_normality_fields(self):
+        df = pd.DataFrame({"x": rng.standard_normal(100)})
+        norm = _summarize_numeric(df, "x")["normality"]
+        assert norm is not None
+        assert "test" in norm
+        assert "statistic" in norm
+        assert "p_value" in norm
+        assert "is_normal" in norm
+        assert isinstance(norm["is_normal"], bool)
+        assert norm["test"] == "shapiro_wilk"  # n=100 <= shapiro_max_n (5000 by default)
+
+    def test_large_sample_uses_dagostino(self):
+        df = pd.DataFrame({"x": rng.standard_normal(6000)})
+        norm = _summarize_numeric(df, "x")["normality"]
+        assert norm["test"] == "dagostino_pearson"
+
+    def test_normality_none_for_too_few_rows(self):
+        df = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
+        result = _summarize_numeric(df, "x")
+        assert result["normality"] is None
+
+    def test_normality_none_for_constant_col(self):
+        df = pd.DataFrame({"x": [5.0] * 50})
+        result = _summarize_numeric(df, "x")
+        assert result["normality"] is None
+
+    def test_non_normal_data_is_normal_false(self):
+        # Exponential data is strongly right-skewed — p should be well below normality_p_value (0.05)
+        df = pd.DataFrame({"x": rng.exponential(1.0, size=500)})
+        norm = _summarize_numeric(df, "x")["normality"]
+        assert norm["is_normal"] is False
+
+    def test_normality_handles_inf_values(self):
+        # ±inf must be filtered out before the normality test is run
+        values = list(rng.standard_normal(100)) + [np.inf, -np.inf]
+        df = pd.DataFrame({"x": values})
+        result = _summarize_numeric(df, "x")
+        # Should not raise; normality may be None or a valid result
+        assert "normality" in result
+
+
+# ---------------------------------------------------------------------------
+# Integration: DatasetAnalyzer end-to-end
+# ---------------------------------------------------------------------------
+
+
+class TestStatisticalTestsIntegration:
+    def test_normality_check_runs_in_full_analysis(self):
+        df = pd.DataFrame(
+            {
+                "normal_col": rng.standard_normal(200),
+                "skewed_col": rng.exponential(1.0, size=200),
+            }
+        )
+        analyzer = DatasetAnalyzer(df, selected_checks=["normality"], auto_sample=False)
+        summary = analyzer.analyze()
+        categories = [i["category"] for i in summary["issues"]]
+        assert "normality" in categories
+
+    def test_variance_homogeneity_runs_with_target(self):
+        df = pd.DataFrame(
+            {
+                "x": np.concatenate([rng.normal(0, 1, 100), rng.normal(0, 15, 100)]),
+                "target": ["A"] * 100 + ["B"] * 100,
+            }
+        )
+        analyzer = DatasetAnalyzer(df, target_col="target", selected_checks=["variance_homogeneity"], auto_sample=False)
+        summary = analyzer.analyze()
+        categories = [i["category"] for i in summary["issues"]]
+        assert "variance_homogeneity" in categories
+
+    def test_normality_summary_present_in_full_analysis(self):
+        df = pd.DataFrame({"val": rng.standard_normal(100), "cat": ["A", "B"] * 50})
+        analyzer = DatasetAnalyzer(df, auto_sample=False)
+        summary = analyzer.analyze()
+        var = summary["summaries"]["variables"]["val"]
+        assert "normality" in var
+        assert var["normality"] is not None
+
+    def test_no_crash_without_target(self):
+        df = pd.DataFrame({"a": rng.standard_normal(100), "b": rng.exponential(1, 100)})
+        analyzer = DatasetAnalyzer(df, selected_checks=["variance_homogeneity"], auto_sample=False)
+        summary = analyzer.analyze()
+        assert summary["total_issues"] == 0
+
+    @pytest.mark.parametrize("check", ["normality", "variance_homogeneity"])
+    def test_check_selectable_via_selected_checks(self, check):
+        df = pd.DataFrame(
+            {
+                "x": rng.exponential(1.0, size=100),
+                "target": ["A"] * 50 + ["B"] * 50,
+            }
+        )
+        analyzer = DatasetAnalyzer(df, target_col="target", selected_checks=[check], auto_sample=False)
+        summary = analyzer.analyze()
+        # Smoke test: the analysis completes and the summary exposes an "issues" entry
+        assert "issues" in summary

From aee4bcf4d21730c24f5e6a8c0767909189742bac Mon Sep 17 00:00:00 2001
From: maskedsyntax <aftaab@aftaab.xyz>
Date: Tue, 3 Mar 2026 16:20:16 +0530
Subject: [PATCH 2/2] style: apply ruff format to statistical_tests.py

---
 hashprep/checks/statistical_tests.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hashprep/checks/statistical_tests.py b/hashprep/checks/statistical_tests.py
index 27a6351..04c42e4 100644
--- a/hashprep/checks/statistical_tests.py
+++ b/hashprep/checks/statistical_tests.py
@@ -55,9 +55,7 @@ def _check_normality(analyzer) -> list[Issue]:
                     category="normality",
                     severity=severity,
                     column=col,
-                    description=(
-                        f"Column '{col}' is non-normal ({test_label}: stat={stat:.4f}, p={p_val:.4g}, n={n})"
-                    ),
+                    description=(f"Column '{col}' is non-normal ({test_label}: stat={stat:.4f}, p={p_val:.4g}, n={n})"),
                     impact_score=impact,
                     quick_fix=(
                         "Options:\n"