Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions hashprep/checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
_check_outliers,
_check_skewness,
)
from .statistical_tests import _check_normality, _check_variance_homogeneity


def _check_dataset_drift(analyzer):
Expand Down Expand Up @@ -57,6 +58,8 @@ def _check_dataset_drift(analyzer):
"infinite_values": _check_infinite_values,
"constant_length": _check_constant_length,
"empty_dataset": _check_empty_dataset,
"normality": _check_normality,
"variance_homogeneity": _check_variance_homogeneity,
}

CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"}
Expand Down
140 changes: 140 additions & 0 deletions hashprep/checks/statistical_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""
Statistical tests: normality (Shapiro-Wilk / D'Agostino-Pearson) and
variance homogeneity (Levene's test across target-column groups).
"""

import numpy as np
from scipy.stats import levene, normaltest, shapiro

from ..config import DEFAULT_CONFIG
from .core import Issue

_ST = DEFAULT_CONFIG.statistical_tests


def _run_normality_test(series) -> tuple[str, float, float]:
    """
    Pick and run the normality test best suited to the sample size.

    Shapiro-Wilk is applied when the sample has at most ``shapiro_max_n``
    observations; the D'Agostino-Pearson test is used for larger samples.

    Returns:
        A ``(test_name, statistic, p_value)`` tuple, where ``test_name`` is
        either ``"shapiro_wilk"`` or ``"dagostino_pearson"``.
    """
    small_sample = len(series) <= _ST.shapiro_max_n
    test_fn = shapiro if small_sample else normaltest
    label = "shapiro_wilk" if small_sample else "dagostino_pearson"
    statistic, p_value = test_fn(series)
    return label, float(statistic), float(p_value)


def _check_normality(analyzer) -> list[Issue]:
    """
    Flag numeric columns whose distribution is significantly non-normal.

    The test is selected by sample size via ``_run_normality_test``
    (Shapiro-Wilk for small samples, D'Agostino-Pearson for large ones).
    Non-normality matters for linear models, t-tests, and certain
    imputation strategies.
    """
    quick_fix_text = (
        "Options:\n"
        "- Transform: Log, sqrt, or Box-Cox/Yeo-Johnson often normalise skewed data.\n"
        "- Use robust models: Tree-based models (XGBoost, RF) make no normality assumption.\n"
        "- Normalise for linear models: Required for OLS residuals and LDA.\n"
        "- Investigate outliers: Extreme values are a common cause of non-normality."
    )
    found: list[Issue] = []

    for column in analyzer.df.select_dtypes(include="number").columns:
        values = analyzer.df[column].dropna()
        sample_size = len(values)
        # Too few observations, or a constant column: no meaningful test.
        if sample_size < _ST.normality_min_n or values.nunique() <= 1:
            continue

        test_name, stat, p_val = _run_normality_test(values)
        if p_val >= _ST.normality_p_value:
            continue  # consistent with normality — nothing to flag

        # Very strong evidence (p < 0.001) escalates severity and impact.
        is_critical = p_val < 0.001
        pretty_name = "Shapiro-Wilk" if test_name == "shapiro_wilk" else "D'Agostino-Pearson"
        found.append(
            Issue(
                category="normality",
                severity="critical" if is_critical else "warning",
                column=column,
                description=(
                    f"Column '{column}' is non-normal "
                    f"({pretty_name}: stat={stat:.4f}, p={p_val:.4g}, n={sample_size})"
                ),
                impact_score="high" if is_critical else "medium",
                quick_fix=quick_fix_text,
            )
        )

    return found


def _check_variance_homogeneity(analyzer) -> list[Issue]:
    """
    Run Levene's test across groups defined by the target column.

    Heteroscedasticity across classes violates the assumptions of linear
    discriminant analysis and ANOVA, and also signals scale differences
    that can hurt distance-based models.

    The check is a no-op unless a target column is set and at least two
    groups contain enough data.
    """
    if analyzer.target_col is None:
        return []

    quick_fix_text = (
        "Options:\n"
        "- Scale per class: Normalise within each target group before training.\n"
        "- Transform feature: Log or sqrt often equalises spread.\n"
        "- Use Welch's t-test / robust ANOVA: Accounts for unequal variances.\n"
        "- Use tree-based models: Decision trees are invariant to feature scaling."
    )

    target_name = analyzer.target_col
    labels = analyzer.df[target_name].dropna().unique()
    target_values = analyzer.df[target_name]
    results: list[Issue] = []

    for column in analyzer.df.select_dtypes(include="number").columns:
        if column == target_name:
            continue

        feature = analyzer.df[column]
        # Per-group value arrays; undersized groups are excluded entirely.
        samples = [
            arr
            for arr in (feature[target_values == lbl].dropna().values for lbl in labels)
            if len(arr) >= _ST.levene_min_group_size
        ]
        if len(samples) < 2:
            continue

        try:
            # Median-centred Levene is the most robust variant.
            stat, p_val = levene(*samples, center="median")
        except ValueError:
            continue

        if p_val >= _ST.levene_p_value:
            continue

        # Spread of per-group standard deviations adds colour to the
        # description and scales the severity.
        group_stds = [float(np.std(arr, ddof=1)) for arr in samples]
        smallest = min(group_stds)
        std_ratio = max(group_stds) / smallest if smallest > 0 else float("inf")
        is_critical = std_ratio > 3.0
        results.append(
            Issue(
                category="variance_homogeneity",
                severity="critical" if is_critical else "warning",
                column=column,
                description=(
                    f"Column '{column}' has unequal variances across '{target_name}' groups "
                    f"(Levene: stat={stat:.4f}, p={p_val:.4g}; std ratio={std_ratio:.2f}×)"
                ),
                impact_score="high" if is_critical else "medium",
                quick_fix=quick_fix_text,
            )
        )

    return results
17 changes: 17 additions & 0 deletions hashprep/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,22 @@ class ImbalanceThresholds:
majority_class_ratio: float = 0.9


@dataclass(frozen=True)
class StatisticalTestThresholds:
    """Thresholds for normality and variance homogeneity tests.

    Immutable configuration consumed by the normality check (Shapiro-Wilk /
    D'Agostino-Pearson) and the variance-homogeneity check (Levene's test
    across target-column groups).
    """

    # p-value below which we flag non-normality
    normality_p_value: float = 0.05
    # Shapiro-Wilk is used up to this sample size; D'Agostino-Pearson above it
    shapiro_max_n: int = 5000
    # Minimum samples (after dropping NaNs) to run any normality test
    normality_min_n: int = 8
    # p-value below which Levene's test flags unequal variances across groups
    levene_p_value: float = 0.05
    # Minimum group size for a target group to be included in Levene's test
    levene_min_group_size: int = 8


@dataclass(frozen=True)
class DateTimeThresholds:
"""Thresholds for datetime-specific checks."""
Expand Down Expand Up @@ -190,6 +206,7 @@ class HashPrepConfig:
drift: DriftThresholds = field(default_factory=DriftThresholds)
distribution: DistributionThresholds = field(default_factory=DistributionThresholds)
imbalance: ImbalanceThresholds = field(default_factory=ImbalanceThresholds)
statistical_tests: StatisticalTestThresholds = field(default_factory=StatisticalTestThresholds)
datetime: DateTimeThresholds = field(default_factory=DateTimeThresholds)
type_inference: TypeInferenceConfig = field(default_factory=TypeInferenceConfig)
sampling: SamplingDefaults = field(default_factory=SamplingDefaults)
Expand Down
2 changes: 2 additions & 0 deletions hashprep/core/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class DatasetAnalyzer:
"unique_values",
"infinite_values",
"constant_length",
"normality",
"variance_homogeneity",
]

def __init__(
Expand Down
56 changes: 39 additions & 17 deletions hashprep/summaries/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import numpy as np
import pandas as pd
from scipy.stats import median_abs_deviation
from scipy.stats import median_abs_deviation, normaltest, shapiro

from ..config import DEFAULT_CONFIG

_SUMMARY = DEFAULT_CONFIG.summaries
_ST = DEFAULT_CONFIG.statistical_tests


def get_monotonicity(series: pd.Series) -> str:
Expand Down Expand Up @@ -85,10 +86,12 @@ def _summarize_numeric(df, col):
zeros_percentage = zeros_count / n * 100
negative_count = int((series < 0).sum())
negative_percentage = negative_count / n * 100
mean_val = float(series.mean())
min_val = float(series.min())
max_val = float(series.max())
q = series.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
# Use finite-only series for distribution statistics to avoid np.histogram crashing
finite = series[np.isfinite(series)]
mean_val = float(finite.mean()) if not finite.empty else float("nan")
min_val = float(finite.min()) if not finite.empty else float("nan")
max_val = float(finite.max()) if not finite.empty else float("nan")
q = finite.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
quantiles = {
"minimum": float(q[0]),
"p5": float(q[0.05]),
Expand All @@ -100,29 +103,47 @@ def _summarize_numeric(df, col):
"range": float(q[1.0] - q[0]),
"iqr": float(q[0.75] - q[0.25]),
}
cv = float(series.std() / abs(mean_val)) if mean_val != 0 else None
cv = float(finite.std() / abs(mean_val)) if mean_val != 0 else None
descriptive = {
"standard_deviation": float(series.std()),
"standard_deviation": float(finite.std()),
"coefficient_of_variation": cv,
"kurtosis": float(series.kurtosis()),
"kurtosis": float(finite.kurtosis()),
"mean": mean_val,
"mad": float(median_abs_deviation(series)),
"skewness": float(series.skew()),
"sum": float(series.sum()),
"variance": float(series.var()),
"monotonicity": get_monotonicity(series),
"mad": float(median_abs_deviation(finite)),
"skewness": float(finite.skew()),
"sum": float(finite.sum()),
"variance": float(finite.var()),
"monotonicity": get_monotonicity(finite),
}
hist, bin_edges = np.histogram(series, bins=_SUMMARY.histogram_bins, range=(min_val, max_val))
hist, bin_edges = np.histogram(finite, bins=_SUMMARY.histogram_bins, range=(min_val, max_val))
histogram = {
"bin_edges": [float(x) for x in bin_edges],
"counts": [int(x) for x in hist],
}
vc = series.value_counts().head(_SUMMARY.top_n_values)
vc = finite.value_counts().head(_SUMMARY.top_n_values)
common_values = {str(v): {"count": int(c), "percentage": float(c / n * 100)} for v, c in vc.items()}
extremes = {
"minimum_10": [float(x) for x in sorted(series)[: _SUMMARY.extreme_values_count]],
"maximum_10": [float(x) for x in sorted(series)[-_SUMMARY.extreme_values_count :]],
"minimum_10": [float(x) for x in sorted(finite)[: _SUMMARY.extreme_values_count]],
"maximum_10": [float(x) for x in sorted(finite)[-_SUMMARY.extreme_values_count :]],
}
# Normality test (Shapiro-Wilk for small n, D'Agostino-Pearson for large n)
normality = None
if n >= _ST.normality_min_n and series.nunique() > 1:
finite = series[np.isfinite(series)]
if len(finite) >= _ST.normality_min_n:
if len(finite) <= _ST.shapiro_max_n:
norm_stat, norm_p = shapiro(finite)
norm_test = "shapiro_wilk"
else:
norm_stat, norm_p = normaltest(finite)
norm_test = "dagostino_pearson"
normality = {
"test": norm_test,
"statistic": float(norm_stat),
"p_value": float(norm_p),
"is_normal": float(norm_p) >= _ST.normality_p_value,
}

stats = {
"infinite_count": infinite_count,
"infinite_percentage": float(infinite_percentage),
Expand All @@ -137,6 +158,7 @@ def _summarize_numeric(df, col):
"histogram": histogram,
"common_values": common_values,
"extreme_values": extremes,
"normality": normality,
}
return stats

Expand Down
Loading