# In [42]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class PreprocessConfig:
    """Bundle of report/verbosity settings for a preprocessing run.

    NOTE(review): nothing in the visible source reads this class; the field
    names match keyword arguments of ``AutomataPreprocessor.__init__`` —
    presumably it is unpacked into that constructor; confirm against callers.
    """

    # Whether a report should be produced.
    report: bool = True
    # Output path for the report file.
    report_path: str = "prep_dynamic.pdf"
    # Project title string.
    project_name: str = "Automata AI - Preprocessing Report"
    # Optional path to a logo image; None means no logo.
    logo_path: Optional[str] = None
    # Verbosity flag.
    verbose: bool = True


# In [36]:
# =========================
# CELL 1 — AutomataPreprocessor (main code) [IMPROVED + FIXED]
# =========================

from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Tuple, Any, Callable
import re
import joblib
import logging
import numpy as np
import pandas as pd
import warnings
import importlib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# ---- logging ----
# Named logger used by the preprocessing code in this file.
logger = logging.getLogger("AutomataPreprocessor")
# Attach the stream handler only when none is present, so executing this
# code again does not add a second handler (and duplicate every message).
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setFormatter(logging.Formatter("[%(levelname)s] %(name)s: %(message)s"))
    logger.addHandler(ch)
# Default level; AutomataPreprocessor lowers it to DEBUG when verbose=True.
logger.setLevel(logging.INFO)


@dataclass
class PreprocessingConfig:
    """Tunable settings consumed by ``build_rule_based_config`` and ``AutomataPreprocessor``.

    Each field is a plain value; the string-valued fields act as switches
    whose accepted values are listed in the comment next to the field.
    """

    # Columns whose missing fraction is strictly greater than this are dropped.
    drop_missing_threshold: float = 0.8
    high_cardinality_threshold: float = 0.2   # kept for backward-compat (no longer used in split)
    # Categorical columns with at least this many distinct values are frequency-encoded
    # instead of one-hot encoded.
    high_cardinality_min_unique: int = 15

    numeric_imputer: str = "median"          # 'mean','median','most_frequent'
    numeric_scaler: str = "standard"         # 'standard','minmax','robust','none'

    categorical_imputer: str = "most_frequent"  # 'most_frequent','constant'

    # OHE controls (applied only if your sklearn supports them)
    ohe_min_frequency: Optional[float] = None   # e.g. 0.01 (or an int count in newer sklearn)
    ohe_max_categories: Optional[int] = None

    high_cardinality_encoder: str = "frequency" # currently: 'frequency'

    # datetime handling: drop or extract into numeric parts
    datetime_handling: str = "drop"  # 'drop' | 'extract'
    datetime_extract_parts: Tuple[str, ...] = ("year", "month", "day", "dayofweek")

    # 'auto' is resolved by build_rule_based_config to 'mutual_info' or 'none'.
    feature_selection: str = "auto"          # 'auto','none','mutual_info'
    # Fraction of preprocessed features kept when feature selection runs.
    feature_fraction: float = 0.75

    balancing: str = "class_weight"          # 'none','class_weight'
    imbalance_threshold: float = 1.5         # apply class_weight only if imbalance_ratio > this


class DatasetAnalyzer:
    """Infer coarse column types and summary statistics for a tabular dataset."""

    # Patterns matched against the start of a stringified value to spot
    # common date layouts without invoking the (slower) pandas parser.
    DATE_REGEXES = [
        re.compile(r"^\d{4}-\d{2}-\d{2}"),        # YYYY-MM-DD
        re.compile(r"^\d{2}/\d{2}/\d{4}"),        # MM/DD/YYYY
        re.compile(r"^\d{4}/\d{2}/\d{2}"),        # YYYY/MM/DD
        re.compile(r"^\d{2}-[A-Za-z]{3}-\d{4}"),  # 01-Jan-2020
        re.compile(r"^\d{8}$"),                   # YYYYMMDD
    ]

    def _looks_like_datetime(self, series: pd.Series, sample_n: int = 50) -> bool:
        """Return True when ``series`` appears to hold datetime values.

        A datetime64 dtype is accepted immediately. Otherwise the first
        ``sample_n`` non-null values are stringified and the column counts as
        datetime when more than 60% match one of ``DATE_REGEXES`` or more than
        80% are parseable by ``pd.to_datetime``.
        """
        if pd.api.types.is_datetime64_any_dtype(series):
            return True

        # Take the sample BEFORE converting to str: only the sample is
        # inspected, so stringifying the entire column is wasted work.
        sample = series.dropna().head(sample_n).astype(str)
        if sample.empty:
            return False

        hits = sum(
            1 for value in sample
            if any(rx.search(value) for rx in self.DATE_REGEXES)
        )
        if hits / len(sample) > 0.6:
            return True

        with warnings.catch_warnings():
            # pandas warns when it has to fall back to per-element parsing;
            # that is expected for this heuristic probe.
            warnings.filterwarnings("ignore", message="Could not infer format")
            parsed = pd.to_datetime(sample, errors="coerce")
        return bool(parsed.notna().mean() > 0.8)

    def infer_column_types(self, X: pd.DataFrame) -> Dict[str, str]:
        """Map every column of ``X`` to 'numeric', 'categorical' or 'datetime'.

        Boolean columns are reported as categorical; the boolean check runs
        before the numeric one so they are never classified as numeric.
        """
        types: Dict[str, str] = {}
        for col in X.columns:
            s = X[col]
            if pd.api.types.is_bool_dtype(s):
                types[col] = "categorical"
            elif pd.api.types.is_numeric_dtype(s):
                types[col] = "numeric"
            elif self._looks_like_datetime(s):
                types[col] = "datetime"
            else:
                types[col] = "categorical"
        return types

    def compute_meta(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> Dict[str, Any]:
        """Summarise ``X`` (and optionally the target ``y``).

        Returns a dict with keys ``n_rows``, ``n_cols``, ``col_types``,
        ``missing_frac`` and ``cardinality``. When ``y`` is given,
        ``class_counts`` and ``imbalance_ratio`` (majority count divided by
        minority count, 1.0 for an empty target) are added.
        """
        n_rows, n_cols = X.shape

        meta: Dict[str, Any] = {
            "n_rows": n_rows,
            "n_cols": n_cols,
            "col_types": self.infer_column_types(X),
            "missing_frac": X.isna().mean().to_dict(),
            "cardinality": X.nunique(dropna=True).to_dict(),
        }

        if y is not None:
            counts = pd.Series(y).value_counts().to_dict()
            if counts:
                # max(1, ...) guards against a zero count (avoids division by zero).
                imbalance_ratio = max(counts.values()) / max(1, min(counts.values()))
            else:
                imbalance_ratio = 1.0
            meta["class_counts"] = counts
            meta["imbalance_ratio"] = imbalance_ratio

        return meta


class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each value by its relative frequency observed during ``fit``.

    Every input column gets its own value -> frequency mapping. Values that
    have no entry in the mapping at transform time (including missing values,
    which ``value_counts`` does not count) become ``unseen_value``.
    """

    def __init__(self, unseen_value: float = 0.0):
        self.unseen_value = unseen_value
        # Kept in __init__ for backward compatibility with code that reads
        # these attributes on an unfitted encoder.
        self.mappings_: List[Dict[Any, float]] = []
        self.n_features_in_: Optional[int] = None

    def fit(self, X, y=None):
        """Learn one frequency mapping per column of ``X``; ``y`` is ignored."""
        arr = self._to_2d(X)
        self.n_features_in_ = arr.shape[1]
        self.mappings_ = [
            pd.Series(arr[:, j]).value_counts(normalize=True).to_dict()
            for j in range(arr.shape[1])
        ]
        return self

    def transform(self, X):
        """Return a float array of the same shape with values mapped to frequencies.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if ``X`` has a different number of columns than at fit time.
        """
        if self.n_features_in_ is None:
            raise RuntimeError("Call fit before transform.")
        arr = self._to_2d(X)
        if arr.shape[1] != self.n_features_in_:
            # Without this check extra columns raised IndexError and missing
            # columns were silently accepted.
            raise ValueError(
                f"X has {arr.shape[1]} columns, but FrequencyEncoder was fitted "
                f"with {self.n_features_in_}."
            )
        out = np.zeros((arr.shape[0], arr.shape[1]), dtype=float)
        for j, mapping in enumerate(self.mappings_):
            out[:, j] = pd.Series(arr[:, j]).map(mapping).fillna(self.unseen_value).to_numpy(dtype=float)
        return out

    def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> List[str]:
        """Return output names: each input name (or ``x<j>``) suffixed with ``_freq``."""
        if input_features is None:
            n = self.n_features_in_ or 0
            return [f"x{j}_freq" for j in range(n)]
        return [f"{name}_freq" for name in input_features]

    @staticmethod
    def _to_2d(X):
        """Coerce a DataFrame, Series or array-like into a NumPy array with at least 2 dims."""
        if isinstance(X, pd.DataFrame):
            return X.to_numpy()
        if isinstance(X, pd.Series):
            return X.to_numpy().reshape(-1, 1)
        arr = np.asarray(X)
        if arr.ndim == 1:
            arr = arr.reshape(-1, 1)
        return arr


def _make_ohe(cfg: PreprocessingConfig):
    """Build a dense, unknown-tolerant OneHotEncoder for the installed scikit-learn.

    Keyword sets are tried from most to least specific; a ``TypeError`` from
    the constructor (unsupported keyword) moves on to the next set. The
    optional ``min_frequency`` / ``max_categories`` controls from ``cfg`` are
    therefore dropped when no attempt that includes them is accepted. The last
    ``TypeError`` is re-raised if every attempt fails.
    """
    common = {"handle_unknown": "ignore"}

    # Optional controls, present only when configured.
    extras = {}
    if cfg.ohe_min_frequency is not None:
        extras["min_frequency"] = cfg.ohe_min_frequency
    if cfg.ohe_max_categories is not None:
        extras["max_categories"] = cfg.ohe_max_categories

    # The dense-output switch is tried under both spellings.
    dense_flags = ("sparse_output", "sparse")
    attempts = [{**common, flag: False, **extras} for flag in dense_flags]
    attempts.extend({**common, flag: False} for flag in dense_flags)
    attempts.append(common)

    failure = None
    for kwargs in attempts:
        try:
            return OneHotEncoder(**kwargs)
        except TypeError as exc:
            failure = exc
    raise failure


def build_rule_based_config(meta: Dict[str, Any], cfg: Optional[PreprocessingConfig] = None) -> Tuple[PreprocessingConfig, Dict[str, Any]]:
    """Resolve dataset-dependent settings and pick the columns to drop.

    Args:
        meta: dataset summary; the keys ``n_cols``, ``missing_frac`` and
            ``cardinality`` are read, each optional.
        cfg: base configuration; defaults are used when omitted. The caller's
            object is never modified — a copy is returned.

    Returns:
        ``(config, aux)`` where ``aux["drop_cols"]`` lists columns whose
        missing fraction exceeds ``drop_missing_threshold`` followed by
        columns with at most one distinct value.
    """
    if not cfg:
        cfg = PreprocessingConfig()
    resolved = PreprocessingConfig(**asdict(cfg))

    # 'auto' turns feature selection on only for wide datasets.
    if resolved.feature_selection == "auto":
        wide = meta.get("n_cols", 0) > 30
        resolved.feature_selection = "mutual_info" if wide else "none"

    too_sparse = [
        name
        for name, fraction in meta.get("missing_frac", {}).items()
        if fraction > resolved.drop_missing_threshold
    ]
    constant = [
        name
        for name, n_unique in meta.get("cardinality", {}).items()
        if n_unique <= 1 and name not in too_sparse
    ]

    return resolved, {"drop_cols": too_sparse + constant}


class AutomataPreprocessor(BaseEstimator, TransformerMixin):
    """Rule-driven preprocessing transformer for tabular data.

    ``fit`` analyses the input, drops sparse/constant columns, drops or
    expands datetime columns, and builds a scikit-learn pipeline that
    imputes/scales numeric columns, one-hot encodes low-cardinality
    categoricals and frequency-encodes high-cardinality ones. Optional
    mutual-information feature selection and class-weight computation follow.

    Args:
        config: base ``PreprocessingConfig``. Only honoured when ``strategy``
            is not ``"auto"``; with ``"auto"`` the defaults are used.
        strategy: ``"auto"`` (default configuration) or any other value to
            apply ``config``.
        verbose: when True the shared logger is switched to DEBUG.
        report: when True a report generator is invoked at the end of ``fit``.
        report_path: path handed to the report generator.
        project_name, author, logo_path: stored on the instance so that a
            report generator can read them.

    Fitted state lives in the trailing-underscore attributes (``pipeline_``,
    ``config_``, ``meta_``, ``applied_``, ``class_weights_`` ...).
    """

    # Datetime components that can be extracted, in the order the derived
    # ``<col>__<part>`` columns are created.
    _DATETIME_PARTS: Tuple[str, ...] = ("year", "month", "day", "dayofweek")

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        strategy: str = "auto",
        verbose: bool = False,

        report: bool = False,
        report_path: str = "prep_report.pdf",
        project_name: str = "Automata AI Project",
        author: str = "",
        logo_path: Optional[str] = None,
    ):
        self.config = config
        self.strategy = strategy
        self.verbose = verbose
        if self.verbose:
            # NOTE: this changes the level of the shared module logger.
            logger.setLevel(logging.DEBUG)

        self.report = report
        self.report_path = report_path
        self.project_name = project_name
        self.author = author
        self.logo_path = logo_path

        # Report generator stored as (module name, function name) so the
        # instance stays picklable; resolved lazily in _maybe_generate_report.
        self.report_generator_ref_: Optional[Tuple[str, str]] = None

        self.analyzer_ = DatasetAnalyzer()

        self.meta_: Optional[Dict[str, Any]] = None
        self.meta_used_: Optional[Dict[str, Any]] = None

        self.config_: Optional[PreprocessingConfig] = None
        self.aux_: Optional[Dict[str, Any]] = None
        self.pipeline_: Optional[Pipeline] = None

        self.feature_names_in_: Optional[List[str]] = None
        self.output_feature_names_: Optional[List[str]] = None

        self.class_weights_: Optional[Dict[Any, float]] = None
        self.applied_: Dict[str, Any] = {}
        self.fs_k_: Optional[int] = None

        self.drop_cols_: List[str] = []
        self.datetime_cols_: List[str] = []
        self.datetime_generated_cols_: List[str] = []

    def set_report_generator(self, fn: Callable):
        """Register ``fn(preprocessor, path)`` as report generator; returns ``self``."""
        self.report_generator_ref_ = (fn.__module__, fn.__name__)
        return self

    def fit(self, X, y=None):
        """Analyse ``X`` (and ``y``), build and fit the preprocessing pipeline.

        Returns:
            ``self``.
        """
        X = self._ensure_df(X)
        self.feature_names_in_ = list(X.columns)
        # Reset up front so a refit that takes the empty-pipeline path does
        # not report the k of a previous fit.
        self.fs_k_ = None

        self.meta_ = self.analyzer_.compute_meta(X, y)
        if self.strategy == "auto" or self.config is None:
            if self.config is not None:
                # Behaviour is unchanged, but no longer silent.
                logger.warning(
                    "strategy='auto' ignores the supplied config; "
                    "pass a different strategy value to apply it."
                )
            self.config_, self.aux_ = build_rule_based_config(self.meta_, None)
        else:
            self.config_, self.aux_ = build_rule_based_config(self.meta_, self.config)

        self.drop_cols_ = list((self.aux_ or {}).get("drop_cols", []))
        X_used = X.drop(columns=self.drop_cols_, errors="ignore")

        X_used = self._handle_datetime_fit(X_used)

        # Re-infer types on the frame that is actually fed to the pipeline.
        col_types_used = self.analyzer_.infer_column_types(X_used)
        numeric_cols = [c for c, t in col_types_used.items() if t == "numeric"]
        categorical_cols = [c for c, t in col_types_used.items() if t == "categorical"]

        missing_frac_used = X_used.isna().mean().to_dict()
        cardinality_used = X_used.nunique(dropna=True).to_dict()
        self.meta_used_ = {
            "n_rows": int(X_used.shape[0]),
            "n_cols": int(X_used.shape[1]),
            "col_types": col_types_used,
            "missing_frac": missing_frac_used,
            "cardinality": cardinality_used,
        }

        numeric_missing_cols = [c for c in numeric_cols if missing_frac_used.get(c, 0.0) > 0.0]

        # Split categoricals by distinct-value count only.
        card = X_used[categorical_cols].nunique(dropna=True) if categorical_cols else pd.Series(dtype=int)
        min_unique = int(self.config_.high_cardinality_min_unique)
        low_card_cols: List[str] = []
        high_card_cols: List[str] = []
        for c in categorical_cols:
            if int(card.get(c, 0)) >= min_unique:
                high_card_cols.append(c)
            else:
                low_card_cols.append(c)

        transformers = []

        if numeric_cols:
            num_steps = []
            # NOTE(review): the imputer is added only when the training data
            # has missing numeric values, so NaNs appearing first at transform
            # time pass through un-imputed.
            if numeric_missing_cols:
                num_steps.append(("imputer", SimpleImputer(strategy=self.config_.numeric_imputer)))

            if self.config_.numeric_scaler == "standard":
                num_steps.append(("scaler", StandardScaler()))
            elif self.config_.numeric_scaler == "minmax":
                num_steps.append(("scaler", MinMaxScaler()))
            elif self.config_.numeric_scaler == "robust":
                num_steps.append(("scaler", RobustScaler()))

            transformers.append(("num", Pipeline(num_steps), numeric_cols) if num_steps else ("num", "passthrough", numeric_cols))

        if low_card_cols:
            low_steps = []
            if any(missing_frac_used.get(c, 0.0) > 0.0 for c in low_card_cols):
                low_steps.append(("imputer", SimpleImputer(strategy=self.config_.categorical_imputer)))
            low_steps.append(("ohe", _make_ohe(self.config_)))
            transformers.append(("cat_low", Pipeline(low_steps), low_card_cols))

        if high_card_cols:
            high_steps = []
            if any(missing_frac_used.get(c, 0.0) > 0.0 for c in high_card_cols):
                high_steps.append(("imputer", SimpleImputer(strategy=self.config_.categorical_imputer)))
            high_steps.append(("freq", FrequencyEncoder(unseen_value=0.0)))
            transformers.append(("cat_high", Pipeline(high_steps), high_card_cols))

        if not transformers:
            # Nothing usable: emit a zero-column matrix. A named function is
            # used instead of a lambda so that save() can pickle the pipeline.
            empty = FunctionTransformer(AutomataPreprocessor._empty_features, validate=False)
            self.pipeline_ = Pipeline([("empty", empty)])
            self.pipeline_.fit(X_used, y)
            self.output_feature_names_ = []
            self.class_weights_ = None
            self._populate_applied(
                numeric_cols, categorical_cols, numeric_missing_cols,
                low_card_cols, high_card_cols, False, False, missing_frac_used
            )
            self._maybe_generate_report()
            return self

        ct = ColumnTransformer(transformers=transformers, remainder="drop")

        pre_pipe = Pipeline([("preprocess", ct)])
        pre_pipe.fit(X_used, y)

        fs_used = False
        self.pipeline_ = pre_pipe

        names_pre = self._safe_get_preprocess_feature_names(X_used) or []
        n_pre = len(names_pre) if names_pre else int(pre_pipe.transform(X_used.head(1)).shape[1])

        if (self.config_.feature_selection == "mutual_info") and (y is not None) and (n_pre > 0):
            k = max(int(self.config_.feature_fraction * max(1, n_pre)), 1)
            k = min(k, n_pre)
            fs = SelectKBest(score_func=mutual_info_classif, k=k)
            full_pipe = Pipeline([("preprocess", ct), ("fs", fs)])
            try:
                full_pipe.fit(X_used, y)
                self.pipeline_ = full_pipe
                fs_used = True
                self.fs_k_ = k
            except Exception as e:
                # Best effort: fall back to the pipeline without selection,
                # but always tell the user that selection was skipped.
                logger.warning("Feature selection disabled due to error: %s", e)
                self.pipeline_ = pre_pipe
                fs_used = False
                self.fs_k_ = None

        self.output_feature_names_ = self._compute_output_feature_names(X_used)

        self.class_weights_ = None
        balancing_used = False
        if y is not None and self.config_.balancing == "class_weight":
            ir = float(self.meta_.get("imbalance_ratio", 1.0))
            if ir > float(self.config_.imbalance_threshold):
                self.class_weights_ = self._balanced_class_weights(y)
                balancing_used = True

        self._populate_applied(
            numeric_cols, categorical_cols, numeric_missing_cols,
            low_card_cols, high_card_cols, fs_used, balancing_used, missing_frac_used
        )

        self._maybe_generate_report()
        return self

    def transform(self, X):
        """Apply the fitted pipeline to ``X``.

        Columns missing from ``X`` are added as all-NaN and columns unknown at
        fit time are discarded before the pipeline runs.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.pipeline_ is None:
            raise RuntimeError("Call fit before transform.")

        X = self._ensure_df(X)
        X = self._align_schema(X)

        X_used = X.drop(columns=self.drop_cols_, errors="ignore")
        X_used = self._handle_datetime_transform(X_used)

        return self.pipeline_.transform(X_used)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``X``/``y`` and return the transformed ``X``; extra kwargs are ignored."""
        return self.fit(X, y).transform(X)

    def get_feature_names_out(self) -> List[str]:
        """Return a copy of the output feature names computed during ``fit``.

        Raises:
            RuntimeError: if the transformer has not been fitted.
        """
        if self.output_feature_names_ is None:
            raise RuntimeError("Feature names not available; fit first.")
        return list(self.output_feature_names_)

    def save(self, path: str):
        """Serialise this instance to ``path`` with joblib."""
        joblib.dump(self, path)
        logger.info("Saved AutomataPreprocessor to %s", path)

    @staticmethod
    def load(path: str):
        """Load an instance previously written by ``save``.

        Raises:
            ValueError: if the file does not contain an AutomataPreprocessor.
        """
        # SECURITY: joblib.load unpickles arbitrary objects; only load files
        # from trusted sources.
        obj = joblib.load(path)
        if not isinstance(obj, AutomataPreprocessor):
            raise ValueError("Loaded object is not AutomataPreprocessor")
        logger.info("Loaded AutomataPreprocessor from %s", path)
        return obj

    @staticmethod
    def _empty_features(Z):
        """Return a zero-column float matrix with one row per row of ``Z``."""
        return np.zeros((len(Z), 0))

    @staticmethod
    def _balanced_class_weights(y) -> Dict[Any, float]:
        """Return ``total / (n_classes * count)`` for every class label in ``y``."""
        y_s = pd.Series(y)
        counts = y_s.value_counts().to_dict()
        total = int(len(y_s))
        n_classes = max(len(counts), 1)
        return {cls: total / (n_classes * cnt) for cls, cnt in counts.items()}

    @staticmethod
    def _ensure_df(X):
        """Return ``X`` as a fresh DataFrame whose column labels are all strings.

        ColumnTransformer interprets integer column specifiers as positions,
        not labels. Integer labels (e.g. from a NumPy array) would therefore
        select the wrong columns, or fail, as soon as a column is dropped, so
        non-string labels are converted with ``str``.
        """
        df = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        if not all(isinstance(c, str) for c in df.columns):
            df.columns = [str(c) for c in df.columns]
        return df

    def _align_schema(self, X: pd.DataFrame) -> pd.DataFrame:
        """Reorder ``X`` to the fit-time columns, adding absent ones as NaN."""
        if not self.feature_names_in_:
            return X
        X2 = X.copy()
        for c in self.feature_names_in_:
            if c not in X2.columns:
                X2[c] = np.nan
        X2 = X2[self.feature_names_in_]
        return X2

    def _requested_datetime_parts(self, cfg: PreprocessingConfig) -> List[str]:
        """Return the supported parts requested by ``cfg`` in canonical order."""
        requested = cfg.datetime_extract_parts
        if isinstance(requested, str):
            # Tolerate a bare string instead of a tuple of part names.
            requested = (requested,)
        unknown = [p for p in requested if p not in self._DATETIME_PARTS]
        if unknown:
            logger.warning("Ignoring unsupported datetime_extract_parts: %s", unknown)
        return [p for p in self._DATETIME_PARTS if p in requested]

    @staticmethod
    def _add_datetime_parts(frame: pd.DataFrame, col: str, parts: List[str]) -> List[str]:
        """Add ``<col>__<part>`` columns to ``frame`` in place; return their names.

        Values that cannot be parsed become NaT and yield missing parts.
        """
        ser = pd.to_datetime(frame[col], errors="coerce")
        created: List[str] = []
        for part in parts:
            name = f"{col}__{part}"
            frame[name] = getattr(ser.dt, part)
            created.append(name)
        return created

    def _handle_datetime_fit(self, X: pd.DataFrame) -> pd.DataFrame:
        """Detect datetime columns and drop or expand them per the configuration.

        Records the detected columns in ``datetime_cols_`` and the derived
        column names in ``datetime_generated_cols_``.
        """
        cfg = self.config_ or PreprocessingConfig()
        col_types = self.analyzer_.infer_column_types(X)
        dt_cols = [c for c, t in col_types.items() if t == "datetime"]
        self.datetime_cols_ = dt_cols

        if not dt_cols:
            self.datetime_generated_cols_ = []
            return X

        if cfg.datetime_handling == "drop":
            self.datetime_generated_cols_ = []
            return X.drop(columns=dt_cols, errors="ignore")

        parts = self._requested_datetime_parts(cfg)
        out = X.copy()
        gen_cols: List[str] = []
        for c in dt_cols:
            gen_cols.extend(self._add_datetime_parts(out, c, parts))

        out = out.drop(columns=dt_cols, errors="ignore")
        self.datetime_generated_cols_ = gen_cols
        return out

    def _handle_datetime_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Replay the fit-time datetime handling on new data."""
        cfg = self.config_ or PreprocessingConfig()
        if not self.datetime_cols_:
            return X

        if cfg.datetime_handling == "drop":
            return X.drop(columns=self.datetime_cols_, errors="ignore")

        generated = set(self.datetime_generated_cols_)
        out = X.copy()
        for c in self.datetime_cols_:
            if c not in out.columns:
                out[c] = pd.NaT
            # Recreate exactly the derived columns that were produced at fit time.
            parts = [p for p in self._DATETIME_PARTS if f"{c}__{p}" in generated]
            self._add_datetime_parts(out, c, parts)

        out = out.drop(columns=self.datetime_cols_, errors="ignore")
        return out

    def _populate_applied(
        self,
        numeric_cols: List[str],
        categorical_cols: List[str],
        numeric_missing_cols: List[str],
        low_card_cols: List[str],
        high_card_cols: List[str],
        feature_selection_used: bool,
        balancing_used: bool,
        missing_frac_used: Dict[str, float],
    ):
        """Record in ``applied_`` which steps were applied to which columns."""
        cfg = self.config_ or PreprocessingConfig()
        low_missing_cols = [c for c in low_card_cols if missing_frac_used.get(c, 0.0) > 0.0]
        high_missing_cols = [c for c in high_card_cols if missing_frac_used.get(c, 0.0) > 0.0]

        self.applied_ = {
            "drop_cols": list(self.drop_cols_),
            "datetime_cols": list(self.datetime_cols_),
            "datetime_handling": cfg.datetime_handling,
            "datetime_generated_cols": list(self.datetime_generated_cols_),

            "numeric_cols": list(numeric_cols),
            "numeric_missing_cols": list(numeric_missing_cols),
            "numeric_imputer_used": bool(numeric_missing_cols),
            "numeric_scaler_used": bool(numeric_cols) and (cfg.numeric_scaler != "none"),

            "categorical_cols": list(categorical_cols),

            "low_card_cols": list(low_card_cols),
            "low_card_missing_cols": list(low_missing_cols),
            "low_card_imputer_used": bool(low_missing_cols),
            "low_card_encoder_used": bool(low_card_cols),

            "high_card_cols": list(high_card_cols),
            "high_card_missing_cols": list(high_missing_cols),
            "high_card_imputer_used": bool(high_missing_cols),
            "high_card_encoder_used": bool(high_card_cols),

            "feature_selection_used": bool(feature_selection_used),
            "feature_selection_method": (cfg.feature_selection if feature_selection_used else "none"),
            "feature_fraction": float(cfg.feature_fraction),
            "fs_k": self.fs_k_,

            "balancing_used": bool(balancing_used),
            "balancing_method": ("class_weight" if balancing_used else "none"),
            "imbalance_threshold": float(cfg.imbalance_threshold),
        }

    def _compute_output_feature_names(self, X_used: pd.DataFrame) -> List[str]:
        """Return the final feature names, filtered by the selector mask if present."""
        names_pre = self._safe_get_preprocess_feature_names(X_used) or []

        if self.pipeline_ is not None and "fs" in getattr(self.pipeline_, "named_steps", {}):
            fs = self.pipeline_.named_steps["fs"]
            try:
                mask = fs.get_support()
                if len(names_pre) == len(mask):
                    return list(np.array(names_pre, dtype=object)[mask])
            except Exception as e:
                # Fall back to the unfiltered names below.
                logger.debug("Could not apply feature-selection mask to names: %s", e)

        return names_pre

    def _safe_get_preprocess_feature_names(self, X_used: pd.DataFrame) -> Optional[List[str]]:
        """Best-effort names of the 'preprocess' step outputs.

        Returns an empty list when there is no such step and ``None`` when
        name resolution fails entirely.
        """
        try:
            if self.pipeline_ is None or "preprocess" not in self.pipeline_.named_steps:
                return []
            ct = self.pipeline_.named_steps["preprocess"]

            if hasattr(ct, "get_feature_names_out"):
                try:
                    return list(ct.get_feature_names_out(input_features=list(X_used.columns)))
                except Exception:
                    try:
                        return list(ct.get_feature_names_out())
                    except Exception:
                        pass

            # Manual fallback: assemble names transformer by transformer.
            names: List[str] = []
            for name, trans, cols in getattr(ct, "transformers_", []):
                if name == "remainder":
                    continue
                if trans == "passthrough":
                    if isinstance(cols, (list, tuple)):
                        names.extend([f"{name}__{c}" for c in cols])
                    else:
                        names.append(f"{name}__{cols}")
                    continue
                if hasattr(trans, "get_feature_names_out"):
                    try:
                        out = trans.get_feature_names_out(cols if isinstance(cols, (list, tuple)) else None)
                        names.extend(list(out))
                        continue
                    except Exception:
                        pass
                if isinstance(cols, (list, tuple)):
                    names.extend([f"{name}__{c}" for c in cols])
                else:
                    names.append(f"{name}__{cols}")

            return names
        except Exception:
            return None

    def _maybe_generate_report(self):
        """Run the report generator when ``report`` is enabled; never raises.

        The registered generator reference is tried first; if it is missing or
        fails, ``generate_preprocessing_report`` from ``__main__`` is used.
        """
        if not self.report:
            return

        # 1) preferred: use stored generator ref (works local + notebook)
        if self.report_generator_ref_:
            mod_name, fn_name = self.report_generator_ref_
            try:
                mod = importlib.import_module(mod_name)
                fn = getattr(mod, fn_name)
                fn(self, self.report_path)
                return
            except Exception as e:
                logger.warning("Report generator ref failed (%s.%s): %s", mod_name, fn_name, e)

        # 2) fallback: old behavior (notebook-friendly if present)
        try:
            from __main__ import generate_preprocessing_report
            generate_preprocessing_report(self, self.report_path)
        except Exception as e:
            logger.warning("Report requested but generator is unavailable or failed: %s", e)


# In [38]:
# =========================
# CELL 2 — PDF Report Generator (charts + better tables + richer overview)
# =========================

import os
import tempfile
import datetime
from collections import Counter

from PIL import Image as PILImage

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak,
    Image as RLImage, LongTable, KeepTogether
)
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from reportlab.lib.units import cm


def _wrap_token(s: str) -> str:
    return str(s)


# Paragraph styles are defined at module level so that `mono_wrap` (below)
# can inherit from `small`.
styles = getSampleStyleSheet()
# Headings and body text. alignment=1 is presumably ReportLab's centered
# alignment (TA_CENTER) — verify against reportlab.lib.enums.
H1 = ParagraphStyle("H1", parent=styles["Heading1"], alignment=1, spaceAfter=10)
H2 = ParagraphStyle("H2", parent=styles["Heading2"], spaceBefore=6, spaceAfter=8)
body = ParagraphStyle("body", parent=styles["BodyText"], spaceAfter=6, leading=13)
small = ParagraphStyle("small", parent=styles["BodyText"], fontSize=9, leading=11, spaceAfter=6)
caption = ParagraphStyle("cap", parent=styles["BodyText"], fontSize=9, leading=11, alignment=1, spaceAfter=10)
# Indented variant of `body`.
sub = ParagraphStyle("sub", parent=body, leftIndent=18, spaceAfter=6)

# Cover page styles
cover_title = ParagraphStyle("cover_title", parent=styles["Title"], alignment=1, fontSize=24, spaceAfter=18, leading=30)
cover_subtitle = ParagraphStyle("cover_subtitle", parent=styles["h2"], alignment=1, fontSize=16, spaceAfter=12, leading=20)
cover_meta = ParagraphStyle("cover_meta", parent=styles["Normal"], alignment=1, fontSize=10, textColor=colors.gray, spaceAfter=24)
cover_desc = ParagraphStyle("cover_desc", parent=styles["Normal"], alignment=1, fontSize=12, leading=16, spaceAfter=0)

# Small monospaced style for long tokens.
mono_wrap = ParagraphStyle(
    "mono_wrap",
    parent=small,
    fontName="Courier",
    fontSize=7.5,
    leading=9,
    wordWrap="CJK",
    splitLongWords=1,   # allow breaking inside long words that have no spaces
)



def _convert_image_to_png(img_path: str) -> str:
    """Convert the image at ``img_path`` to a temporary RGBA PNG.

    Returns:
        Path of the newly created PNG file. The file is not deleted
        automatically; the caller is responsible for removing it.

    Raises:
        Whatever PIL raises when the image cannot be opened or saved. No
        temporary file is left behind in that case.
    """
    # The context manager releases the source file handle; convert() yields
    # an independent in-memory image that stays valid afterwards.
    with PILImage.open(img_path) as src:
        rgba = src.convert("RGBA")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    tmp.close()
    try:
        rgba.save(tmp.name, format="PNG")
    except Exception:
        # Do not leak the placeholder file when saving fails.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise
    return tmp.name

def _save_fig_to_png(fig) -> str:
    """Save a matplotlib figure as a high-DPI PNG suitable for embedding in PDF.

    The figure is always closed, even when saving fails, so figures do not
    accumulate in pyplot's registry. Returns the path of the temporary PNG,
    which the caller must delete; on failure the temporary file is removed
    and the error is re-raised.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    tmp.close()
    try:
        fig.savefig(
            tmp.name,
            dpi=300,
            bbox_inches="tight",
            pad_inches=0.06,
            facecolor="white",
        )
    except Exception:
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise
    finally:
        plt.close(fig)
    return tmp.name


def _fit_rl_image(img_path: str, max_w: float, max_h: float) -> RLImage:
    """Build a ReportLab image scaled to fit a ``max_w`` x ``max_h`` box.

    The pixel dimensions are read with PIL; a single scale factor (the
    smaller of the two axis ratios) is applied to both sides, so the aspect
    ratio is preserved.
    """
    probe = PILImage.open(img_path)
    px_w, px_h = probe.size
    probe.close()

    ratio_w = max_w / float(px_w)
    ratio_h = max_h / float(px_h)
    factor = min(ratio_w, ratio_h)

    return RLImage(img_path, width=px_w * factor, height=px_h * factor)


def _set_pub_rcparams():
    """Apply conservative, print-oriented matplotlib defaults globally."""
    # Font sizes for the individual text elements.
    font_sizes = {
        "font.size": 9,
        "axes.titlesize": 11,
        "axes.labelsize": 9,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        "legend.fontsize": 8,
    }
    # Resolution and line weight.
    rendering = {
        "figure.dpi": 150,
        "savefig.dpi": 300,
        "axes.linewidth": 0.8,
    }
    plt.rcParams.update({**font_sizes, **rendering})

# Applied once at import time so every chart created below uses these defaults.
_set_pub_rcparams()


def _style_axes(ax, grid_axis: str):
    """Give ``ax`` a light dashed grid on ``grid_axis`` and open top/right sides.

    Returns the same axes object to allow chaining.
    """
    ax.set_axisbelow(True)  # draw the grid behind the bars
    ax.grid(True, axis=grid_axis, linestyle="--", linewidth=0.6, alpha=0.35)

    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    for side in ("left", "bottom"):
        spine = ax.spines[side]
        spine.set_linewidth(0.8)
        spine.set_alpha(0.7)

    ax.tick_params(axis="both", which="both", length=3, width=0.8)
    return ax


def _add_bar_labels(ax, bars, labels, orientation: str):
    """Annotate ``bars`` with ``labels`` (a list[str] of the same length).

    Labelling is best effort: a failure (for example ``Axes.bar_label`` not
    being available) never propagates, but it is now logged at debug level
    instead of being discarded silently.

    ``orientation`` is currently unused; it is kept so existing callers that
    pass it keep working.
    """
    import logging  # local import keeps this helper self-contained

    try:
        ax.bar_label(bars, labels=labels, padding=3, fontsize=8)
    except Exception as exc:
        logging.getLogger("AutomataPreprocessor").debug("Skipping bar labels: %s", exc)



def _chart_missingness(missing_frac: dict, top_n: int = 12, threshold: float | None = None) -> str | None:
    """Render a horizontal bar chart of the ``top_n`` columns with most missing data.

    Args:
        missing_frac: column name -> fraction of missing values.
        top_n: number of columns to show, highest fraction first.
        threshold: optional fraction drawn as a dotted vertical "drop" line.

    Returns:
        Path to the PNG, or ``None`` when there is nothing to plot (no data,
        or no shown column has missing values).
    """
    if not missing_frac:
        return None

    ranked = sorted(missing_frac.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
    if not ranked:
        return None
    if max(frac for _, frac in ranked) <= 0:
        return None

    # Truncate long column names so the tick labels stay readable.
    max_chars = 22
    display_names = []
    for name, _ in ranked:
        text = str(name)
        if len(text) > max_chars:
            text = text[: max_chars - 1] + "…"
        display_names.append(text)

    percents = [frac * 100 for _, frac in ranked]
    positions = range(len(display_names))

    fig, ax = plt.subplots(figsize=(7.1, 3.9))
    bars = ax.barh(positions, percents)
    ax.set_yticks(positions)
    ax.set_yticklabels(display_names)
    ax.invert_yaxis()  # largest value on top
    ax.set_xlim(0, max(100, max(percents) * 1.15))
    ax.set_xlabel("Missing values (%)")
    ax.set_title("Missingness (top columns)")
    _style_axes(ax, grid_axis="x")

    if threshold is not None:
        # The marker is decorative; a bad threshold value must not break the chart.
        try:
            cutoff = float(threshold) * 100.0
            ax.axvline(cutoff, linestyle=":", linewidth=1.2)
            ax.text(cutoff, ax.get_ylim()[0], f"  drop @ {cutoff:.0f}%", va="bottom", fontsize=8)
        except Exception:
            pass

    value_labels = [f"{pct:.1f}%" for pct in percents]
    _add_bar_labels(ax, bars, value_labels, orientation="h")
    fig.tight_layout()
    return _save_fig_to_png(fig)


def _chart_coltype_counts(col_types: dict) -> str | None:
    """Render a bar chart with the number of columns per inferred type.

    Known types are shown first in a fixed order (numeric, categorical,
    datetime); any other type follows alphabetically. Returns the PNG path,
    or ``None`` when ``col_types`` is empty.
    """
    if not col_types:
        return None

    tally = Counter(col_types.values())

    known_first = ["numeric", "categorical", "datetime"]
    order = [kind for kind in known_first if kind in tally]
    order.extend(kind for kind in sorted(tally.keys()) if kind not in known_first)

    heights = [tally[kind] for kind in order]
    if sum(heights) == 0:
        return None

    fig, ax = plt.subplots(figsize=(6.6, 3.4))
    bars = ax.bar(order, heights, width=0.58)
    ax.set_ylabel("Number of columns")
    ax.set_title("Feature type composition (before encoding)")
    _style_axes(ax, grid_axis="y")
    _add_bar_labels(ax, bars, [str(h) for h in heights], orientation="v")
    fig.tight_layout()
    return _save_fig_to_png(fig)


def _chart_class_counts(class_counts: dict) -> str | None:
    """Render a bar chart of the target class distribution.

    Classes are ordered by descending count and each bar is labelled with its
    count and share of the total. Returns the PNG path, or ``None`` when there
    are no classes or no samples.
    """
    if not class_counts:
        return None

    ranked = sorted(class_counts.items(), key=lambda kv: kv[1], reverse=True)
    names = [str(label) for label, _ in ranked]
    sizes = [count for _, count in ranked]

    n_samples = sum(sizes)
    if n_samples == 0:
        return None

    fig, ax = plt.subplots(figsize=(6.9, 3.4))
    bars = ax.bar(names, sizes, width=0.58)
    ax.set_ylabel("Samples")
    ax.set_title("Target class distribution")
    _style_axes(ax, grid_axis="y")

    annotations = []
    for size in sizes:
        annotations.append(f"{size}\n({(size/n_samples)*100:.1f}%)")
    _add_bar_labels(ax, bars, annotations, orientation="v")

    ax.tick_params(axis="x", rotation=0)
    fig.tight_layout()
    return _save_fig_to_png(fig)


def _chart_feature_source_breakdown(feature_names: list[str]) -> str | None:
    if not feature_names:
        return None

    def src(n: str) -> str:
        return n.split("__", 1)[0] if "__" in n else "features"

    c = Counter(src(n) for n in feature_names)
    items = sorted(c.items(), key=lambda x: x[1], reverse=True)
    labels = [k for k, _ in items]
    values = [v for _, v in items]
    if sum(values) == 0:
        return None

    fig, ax = plt.subplots(figsize=(6.8, 3.4))
    bars = ax.barh(labels, values, height=0.58)
    ax.invert_yaxis()
    ax.set_xlabel("Number of output features")
    ax.set_title("Final feature contribution by transformer")
    _style_axes(ax, grid_axis="x")
    _add_bar_labels(ax, bars, [str(v) for v in values], orientation="h")
    fig.tight_layout()
    return _save_fig_to_png(fig)


def generate_preprocessing_report(prep, path: str = "prep_report.pdf"):
    """Render a PDF report describing a fitted preprocessor.

    The report contains a cover page, a dataset overview (summary table,
    charts, configuration snapshot), a narrative of the preprocessing steps
    that were applied, and a grouped listing of the output feature names.

    Parameters
    ----------
    prep
        Preprocessor object. It is read via ``getattr`` for ``meta_``,
        ``meta_used_``, ``applied_``, ``config_``, ``logo_path`` and
        ``project_name``; missing attributes fall back to empty defaults.
        Output names come from ``prep.get_feature_names_out()``, falling
        back to ``prep.output_feature_names_`` if that call fails.
    path
        Destination path of the generated PDF.

    Notes
    -----
    Logo and chart rendering are best-effort: a failure prints a ``[WARN]``
    line and the report is built without that element. Temporary PNG files
    are removed even when building the PDF fails. Any exception raised while
    building the document itself propagates to the caller.
    """
    tmp_files = []
    try:
        _render_preprocessing_report(prep, path, tmp_files)
    finally:
        # Clean up in ``finally`` so a failed build does not leak temp PNGs.
        for f in tmp_files:
            try:
                os.remove(f)
            except OSError:
                pass

    print(f"[INFO] Preprocessing report saved to {path}")


def _render_preprocessing_report(prep, path, tmp_files):
    """Assemble the report story and build the PDF at ``path``.

    Every temporary file created along the way is appended to ``tmp_files``
    as soon as it exists, so the caller can delete it whatever happens next.
    """
    # Local import keeps this block self-contained (stdlib only).
    from xml.sax.saxutils import escape

    def _names(cols, limit=None):
        # Column names may be non-strings (e.g. integer labels) and may contain
        # '<' or '&', which Paragraph would otherwise treat as markup.
        cols = list(cols or [])
        if limit is not None:
            cols = cols[:limit]
        return ", ".join(escape(str(c)) for c in cols)

    def _more(cols, limit):
        # Suffix shown when a name list was truncated to ``limit`` entries.
        return " ..." if len(cols) > limit else ""

    meta = getattr(prep, "meta_", None) or {}
    meta_used = getattr(prep, "meta_used_", None) or {}
    applied = getattr(prep, "applied_", None) or {}
    cfg = getattr(prep, "config_", None)

    doc = SimpleDocTemplate(
        path,
        pagesize=A4,
        rightMargin=2*cm,
        leftMargin=2*cm,
        topMargin=3.1*cm,   # room for header
        bottomMargin=2.0*cm
    )

    story = []

    # -------------------------
    # Logo handling (supports .webp via PIL conversion)
    # -------------------------
    logo_path = getattr(prep, "logo_path", None)
    logo_png = None
    if logo_path and os.path.exists(logo_path):
        try:
            logo_png = _convert_image_to_png(logo_path)
            # Guard: never schedule the user's own logo file for deletion,
            # in case the converter hands back the input path unchanged.
            if logo_png and os.path.abspath(logo_png) != os.path.abspath(logo_path):
                tmp_files.append(logo_png)
        except Exception as exc:
            print(f"[WARN] Could not load logo '{logo_path}': {exc}")
            logo_png = None

    # -------------------------
    # Header / Footer
    # -------------------------
    def draw_header_footer(canvas, doc_):
        canvas.saveState()

        # header baseline
        header_top = doc_.pagesize[1] - 1.0*cm
        header_bottom = doc_.pagesize[1] - doc_.topMargin + 0.25*cm

        # logo (smaller + better placement)
        if logo_png:
            lw, lh = (1.25*cm, 1.25*cm)
            x = doc_.leftMargin
            y = header_top - lh
            canvas.drawImage(
                logo_png, x, y,
                width=lw, height=lh,
                preserveAspectRatio=True, mask="auto"
            )

        # title on the right
        canvas.setFont("Helvetica-Bold", 10)
        canvas.drawRightString(
            doc_.pagesize[0] - doc_.rightMargin,
            doc_.pagesize[1] - 1.35*cm,
            getattr(prep, "project_name", "Automata AI - Preprocessing Report")
        )

        # subtle header line
        canvas.setLineWidth(0.4)
        canvas.setStrokeColor(colors.grey)
        canvas.line(doc_.leftMargin, header_bottom, doc_.pagesize[0] - doc_.rightMargin, header_bottom)

        # footer
        canvas.setFont("Helvetica", 8)
        canvas.setFillColor(colors.black)
        canvas.drawString(doc_.leftMargin, 1.15*cm, f"Page {doc_.page}")
        canvas.drawRightString(
            doc_.pagesize[0] - doc_.rightMargin,
            1.15*cm,
            f"© {datetime.datetime.now().year} Automata AI — All rights reserved"
        )

        canvas.restoreState()

    # -------------------------
    # Cover
    # -------------------------

    # Push content down from top
    story.append(Spacer(1, 2.6 * cm))

    # Bigger logo, centered, lower on page
    if logo_png:
        cover_logo = _fit_rl_image(logo_png, max_w=6.5 * cm, max_h=6.5 * cm)
        cover_logo.hAlign = "CENTER"
        story.append(cover_logo)

    # Space between logo and title
    story.append(Spacer(1, 1.2 * cm))

    # Title block (centered)
    story.append(Paragraph(getattr(prep, "project_name", "Automata AI"), cover_title))
    story.append(Paragraph("Automated Preprocessing Report", cover_subtitle))
    story.append(Paragraph(
        f"Generated on {datetime.datetime.now():%Y-%m-%d %H:%M:%S}",
        cover_meta
    ))

    # Extra breathing room before the description
    story.append(Spacer(1, 1.2 * cm))

    story.append(Paragraph(
        "This report summarizes dataset characteristics, preprocessing decisions, and the resulting feature space.",
        cover_desc
    ))

    story.append(PageBreak())

    # -------------------------
    # 1) Dataset Overview
    # -------------------------
    story.append(Paragraph("1. Dataset Overview", H2))

    # prefer "used" meta when available
    n_rows = meta_used.get("n_rows", meta.get("n_rows", ""))
    n_cols_raw = meta.get("n_cols", "")
    n_cols_used = meta_used.get("n_cols", "")

    col_types_used = meta_used.get("col_types", meta.get("col_types", {})) or {}
    missing_used = meta_used.get("missing_frac", meta.get("missing_frac", {})) or {}
    card_used = meta_used.get("cardinality", meta.get("cardinality", {})) or {}

    type_counts = Counter(col_types_used.values()) if col_types_used else Counter()
    dropped_cols = applied.get("drop_cols", []) or []
    dt_cols = applied.get("datetime_cols", []) or []
    low_cols = applied.get("low_card_cols", []) or []
    high_cols = applied.get("high_card_cols", []) or []

    avg_missing = (sum(missing_used.values()) / max(len(missing_used), 1)) if missing_used else 0.0
    max_missing = max(missing_used.values()) if missing_used else 0.0

    overview_rows = [
        ["Samples", str(n_rows)],
        ["Raw features (before drops)", str(n_cols_raw)],
        ["Features used (after drops/datetime)", str(n_cols_used)],
        ["Numeric columns", str(type_counts.get("numeric", 0))],
        ["Categorical columns", str(type_counts.get("categorical", 0))],
        ["Datetime columns (detected)", str(type_counts.get("datetime", 0))],
        ["Dropped columns", str(len(dropped_cols))],
        ["Avg missing rate (across columns)", f"{avg_missing*100:.2f}%"],
        ["Max missing rate (single column)", f"{max_missing*100:.2f}%"],
        ["Low-card categorical columns", str(len(low_cols))],
        ["High-card categorical columns", str(len(high_cols))],
    ]

    if "class_counts" in meta:
        counts = meta.get("class_counts", {}) or {}
        ir = meta.get("imbalance_ratio", None)
        overview_rows += [
            ["# Classes", str(len(counts))],
            ["Imbalance ratio", f"{float(ir):.3f}" if ir is not None else ""],
        ]

    t = Table(overview_rows, colWidths=[7.5*cm, 8.5*cm])
    t.setStyle(TableStyle([
        ("GRID", (0,0), (-1,-1), 0.3, colors.grey),
        ("BACKGROUND", (0,0), (-1,0), colors.whitesmoke),
        ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
        ("VALIGN", (0,0), (-1,-1), "TOP"),
        ("ROWBACKGROUNDS", (0,1), (-1,-1), [colors.white, colors.Color(0.97,0.97,0.97)]),
        ("LEFTPADDING", (0,0), (-1,-1), 6),
        ("RIGHTPADDING", (0,0), (-1,-1), 6),
        ("TOPPADDING", (0,0), (-1,-1), 4),
        ("BOTTOMPADDING", (0,0), (-1,-1), 4),
    ]))
    story.append(t)
    story.append(Spacer(1, 0.4*cm))

    # charts: each one is best-effort on its own, so a single failing chart
    # no longer hides the others, and every PNG is registered for cleanup
    # the moment it exists.
    chart_jobs = [
        ("Column types", lambda: _chart_coltype_counts(col_types_used)),
        ("Missingness", lambda: _chart_missingness(
            missing_used, top_n=12, threshold=getattr(cfg, "drop_missing_threshold", None))),
        ("Target classes", lambda: _chart_class_counts(meta.get("class_counts") or {})),
    ]
    for title, make_chart in chart_jobs:
        try:
            png = make_chart()
            if not png:
                continue
            tmp_files.append(png)
            story.append(_fit_rl_image(png, max_w=doc.width, max_h=8.2*cm))
            story.append(Paragraph(title, caption))
        except Exception as exc:
            print(f"[WARN] Skipping chart '{title}': {exc}")

    # optional: list dropped/datetime columns (compact)
    if dropped_cols:
        story.append(Paragraph(
            f"<b>Dropped columns ({len(dropped_cols)}):</b> {_names(dropped_cols, 80)}"
            + _more(dropped_cols, 80) + ".",
            small))
    if dt_cols and applied.get("datetime_handling", "") == "drop":
        story.append(Paragraph(
            f"<b>Datetime columns dropped ({len(dt_cols)}):</b> {_names(dt_cols, 80)}"
            + _more(dt_cols, 80) + ".",
            small))

    # config summary (uses the cfg object if present)
    story.append(PageBreak())
    if cfg is not None:
        cfg_keys = [
            "drop_missing_threshold",
            "high_cardinality_min_unique",
            "high_cardinality_threshold",
            "numeric_imputer",
            "numeric_scaler",
            "categorical_imputer",
            "datetime_handling",
            "feature_selection",
            "feature_fraction",
            "balancing",
            "imbalance_threshold",
        ]
        cfg_rows = [[key, str(getattr(cfg, key, ""))] for key in cfg_keys]
        story.append(Spacer(1, 0.2*cm))
        story.append(Paragraph("Configuration Snapshot", ParagraphStyle("h3", parent=styles["Heading3"], spaceAfter=6)))
        tc = Table(cfg_rows, colWidths=[7.5*cm, 8.5*cm])
        tc.setStyle(TableStyle([
            ("GRID", (0,0), (-1,-1), 0.25, colors.grey),
            ("BACKGROUND", (0,0), (-1,0), colors.whitesmoke),
            ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
            ("ROWBACKGROUNDS", (0,1), (-1,-1), [colors.white, colors.Color(0.97,0.97,0.97)]),
            ("LEFTPADDING", (0,0), (-1,-1), 6),
            ("RIGHTPADDING", (0,0), (-1,-1), 6),
            ("TOPPADDING", (0,0), (-1,-1), 4),
            ("BOTTOMPADDING", (0,0), (-1,-1), 4),
        ]))
        story.append(tc)

    story.append(PageBreak())

    # -------------------------
    # 2) Preprocessing Steps Applied
    # -------------------------
    story.append(Paragraph("2. Preprocessing Steps Applied", H2))

    if applied.get("drop_cols"):
        story.append(Paragraph(
            f"<b>• Column removal:</b> Columns with excessive missing values or constant values were removed. "
            f"Dropped: <b>{_names(applied['drop_cols'], 120)}</b>"
            + _more(applied["drop_cols"], 120) + ".",
            body
        ))

    if applied.get("datetime_cols"):
        story.append(Paragraph(
            f"<b>• Datetime handling:</b> Detected datetime columns: <b>{_names(applied['datetime_cols'])}</b>. "
            f"Handling mode: <b>{applied.get('datetime_handling','')}</b>.",
            body
        ))
        if applied.get("datetime_generated_cols"):
            story.append(Paragraph(
                f"– Extracted datetime parts into: <b>{_names(applied['datetime_generated_cols'])}</b>.",
                sub
            ))

    if applied.get("numeric_cols"):
        story.append(Paragraph(
            f"<b>• Numeric processing:</b> Numeric features: <b>{_names(applied['numeric_cols'], 80)}</b>"
            + _more(applied["numeric_cols"], 80) + ".",
            body
        ))
        if applied.get("numeric_imputer_used"):
            story.append(Paragraph(
                f"– Missing numeric values imputed using <b>{getattr(cfg,'numeric_imputer','')}</b> for: "
                f"<b>{_names(applied.get('numeric_missing_cols'))}</b>.",
                sub
            ))
        if applied.get("numeric_scaler_used"):
            story.append(Paragraph(
                f"– Scaling applied using <b>{getattr(cfg,'numeric_scaler','')}</b>.",
                sub
            ))

    if applied.get("low_card_cols"):
        story.append(Paragraph(
            f"<b>• Low-card categorical:</b> <b>{_names(applied['low_card_cols'], 80)}</b>"
            + _more(applied["low_card_cols"], 80) + ".",
            body
        ))
        if applied.get("low_card_imputer_used"):
            story.append(Paragraph(
                f"– Missing values imputed using <b>{getattr(cfg,'categorical_imputer','')}</b>.",
                sub
            ))
        story.append(Paragraph("– One-hot encoding applied.", sub))

    if applied.get("high_card_cols"):
        story.append(Paragraph(
            f"<b>• High-card categorical:</b> <b>{_names(applied['high_card_cols'], 80)}</b>"
            + _more(applied["high_card_cols"], 80) + ".",
            body
        ))
        story.append(Paragraph("– Frequency encoding applied to avoid feature explosion.", sub))

    if applied.get("feature_selection_used"):
        story.append(Paragraph(
            f"<b>• Feature selection:</b> Method <b>{applied.get('feature_selection_method','')}</b>, "
            f"kept fraction <b>{applied.get('feature_fraction','')}</b>, k = <b>{applied.get('fs_k','')}</b>.",
            body
        ))

    if applied.get("balancing_used"):
        story.append(Paragraph(
            f"<b>• Imbalance handling:</b> Class weights computed (threshold {applied.get('imbalance_threshold','')}).",
            body
        ))

    story.append(PageBreak())

    # -------------------------
    # 3) Output Feature Summary
    # -------------------------
    story.append(Paragraph("3. Output Feature Summary", H2))

    try:
        out_names = list(prep.get_feature_names_out())
    except Exception:
        # Fall back to the stored names when the accessor is missing or fails.
        out_names = list(getattr(prep, "output_feature_names_", []) or [])

    story.append(Paragraph(f"The preprocessing pipeline produced <b>{len(out_names)}</b> final features.", body))

    # chart: feature breakdown by transformer prefix (best-effort)
    try:
        p_feat = _chart_feature_source_breakdown(out_names)
        if p_feat:
            tmp_files.append(p_feat)
            img = _fit_rl_image(p_feat, max_w=doc.width, max_h=7.8*cm)
            story.append(img)
            story.append(Paragraph("Final feature contribution by transformer (count)", caption))
    except Exception as exc:
        print(f"[WARN] Skipping chart 'Final feature contribution': {exc}")

    # output feature summary (grouped)
    if out_names:
        story.append(Spacer(1, 0.15*cm))

        # ---- group by transformer prefix ----
        def _grp(n: str) -> str:
            n = str(n)
            if n.startswith("num__"):
                return "Numeric"
            if n.startswith("cat_low__"):
                return "Low-card categorical (one-hot)"
            if n.startswith("cat_high__"):
                return "High-card categorical (frequency)"
            if "__" in n:
                return n.split("__", 1)[0]
            return "Other"

        groups: dict[str, list[str]] = {}
        for n in out_names:
            groups.setdefault(_grp(n), []).append(str(n))

        preferred_order = ["Numeric", "Low-card categorical (one-hot)", "High-card categorical (frequency)"]
        ordered_groups = [g for g in preferred_order if g in groups] + [g for g in sorted(groups.keys()) if g not in preferred_order]

        # ---- compact summary table ----
        summary_rows = [["Group", "Count", "Examples (first 3)"]]
        for g in ordered_groups:
            feats = groups[g]
            ex = ", ".join(_wrap_token(x) for x in feats[:3]) + (" ..." if len(feats) > 3 else "")
            summary_rows.append([Paragraph(g, small), str(len(feats)), Paragraph(ex, mono_wrap)])

        summary_tbl = Table(summary_rows, colWidths=[6.0*cm, 1.6*cm, doc.width - 7.6*cm])
        summary_tbl.setStyle(TableStyle([
            ("GRID", (0,0), (-1,-1), 0.25, colors.grey),
            ("BACKGROUND", (0,0), (-1,0), colors.whitesmoke),
            ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
            ("VALIGN", (0,0), (-1,-1), "TOP"),
            ("LEFTPADDING", (0,0), (-1,-1), 6),
            ("RIGHTPADDING", (0,0), (-1,-1), 6),
            ("TOPPADDING", (0,0), (-1,-1), 4),
            ("BOTTOMPADDING", (0,0), (-1,-1), 4),
            ("ROWBACKGROUNDS", (0,1), (-1,-1), [colors.white, colors.Color(0.97,0.97,0.97)]),
        ]))
        story.append(summary_tbl)
        story.append(Spacer(1, 0.35*cm))

        # ---- detailed per-group tables (multi-column) ----
        max_items_per_group = 240   # keep PDF readable for very wide feature spaces
        ncols = 3
        colw = doc.width / float(ncols)

        for g in ordered_groups:
            feats = groups[g]
            shown = feats[:max_items_per_group]
            truncated = len(feats) > max_items_per_group

            header = Paragraph(f"<b>{g} ({len(feats)})</b>" + (f" — showing first {max_items_per_group}" if truncated else ""), small)
            data = [[header, "", ""]]
            for i in range(0, len(shown), ncols):
                row = shown[i:i+ncols]
                if len(row) < ncols:
                    # pad the last row so every row has ``ncols`` cells
                    row += [""] * (ncols - len(row))
                data.append([Paragraph(_wrap_token(x), mono_wrap) if x else "" for x in row])

            tf = LongTable(data, colWidths=[colw]*ncols, repeatRows=1)
            tf.setStyle(TableStyle([
                ("SPAN", (0,0), (-1,0)),
                ("GRID", (0,0), (-1,-1), 0.25, colors.grey),
                ("BACKGROUND", (0,0), (-1,0), colors.whitesmoke),
                ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
                ("VALIGN", (0,0), (-1,-1), "TOP"),
                ("ROWBACKGROUNDS", (0,1), (-1,-1), [colors.white, colors.Color(0.97,0.97,0.97)]),
                ("LEFTPADDING", (0,0), (-1,-1), 5),
                ("RIGHTPADDING", (0,0), (-1,-1), 5),
                ("TOPPADDING", (0,0), (-1,-1), 3),
                ("BOTTOMPADDING", (0,0), (-1,-1), 3),
            ]))
            story.append(tf)
            story.append(Spacer(1, 0.30*cm))

    story.append(Paragraph("End of report.", ParagraphStyle("end", fontSize=9, alignment=1)))

    # -------------------------
    # Build
    # -------------------------
    doc.build(story, onFirstPage=draw_header_footer, onLaterPages=draw_header_footer)


In [43]:
def run_preprocessing(X, y, cfg=None):
    """Fit an ``AutomataPreprocessor`` on ``X``/``y`` and transform ``X``.

    Parameters
    ----------
    X, y
        Features and target, passed straight to ``fit_transform``.
    cfg
        Object providing ``verbose``, ``report``, ``report_path``,
        ``project_name`` and ``logo_path``. A default ``PreprocessConfig()``
        is used when omitted.

    Returns
    -------
    tuple
        ``(Xp, yp)`` where ``Xp`` is the result of ``fit_transform`` and
        ``yp`` is ``y`` unchanged.
    """
    settings = PreprocessConfig() if cfg is None else cfg

    preprocessor = AutomataPreprocessor(
        verbose=settings.verbose,
        report=settings.report,
        report_path=settings.report_path,
        project_name=settings.project_name,
        logo_path=settings.logo_path,
    )
    # The report generator is registered before fitting.
    preprocessor.set_report_generator(generate_preprocessing_report)

    transformed = preprocessor.fit_transform(X, y)

    # The target is handed back as-is.
    return transformed, y
