# In [None]:
import numpy as np
import pandas as pd
import re


def _expit(x: np.ndarray) -> np.ndarray:
    """Evaluate the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The input is converted to a float array. ``exp`` is only ever taken of a
    non-positive number (``-|x|``), so large magnitudes cannot overflow:

      * x >= 0:  1 / (1 + exp(-x))
      * x <  0:  exp(x) / (1 + exp(x))

    NaN inputs yield NaN. The result is a new float array with the same shape
    as the input.
    """
    z = np.asarray(x, dtype=float)
    # exp(-|z|) equals exp(-z) on the non-negative side and exp(z) on the
    # negative side, i.e. exactly the quantity each branch needs.
    decay = np.exp(-np.abs(z))
    denom = 1.0 + decay
    return np.where(z >= 0, 1.0 / denom, decay / denom)


def _is_glm_binomial(fitted_model) -> bool:
    """
    Report whether `fitted_model` carries a family object whose class is named
    "binomial" (case-insensitive).

    The family is looked up on `fitted_model.model` first and, if absent or
    None there, on `fitted_model` itself. Only the class *name* is inspected;
    the link function is not checked. Any exception raised while probing the
    object is treated as "not binomial".

    Used to decide whether trimming rules should be applied.
    """
    try:
        inner = getattr(fitted_model, "model", None)
        for holder in (inner, fitted_model):
            family = getattr(holder, "family", None)
            if family is not None:
                return family.__class__.__name__.lower() == "binomial"
        return False
    except Exception:
        return False


def var_extract(term: str, vars_: list[str]) -> dict:
    """
    Python analogue of the R helper `var_extract()` used in lpredict.R.

    It parses a coefficient name that encodes values as digits and returns:
      - var: list of variable names
      - val: list of numeric values (as floats)

    This is only used when `factor=True` and the coefficient name contains "FACT".

    How it works:
      1. Every known variable name in `vars_` is swapped for a digit-free
         placeholder (A_XX, B_XX, ...) so that digits belonging to a variable
         name are not mistaken for encoded values.
      2. All remaining digit runs become `val`; the non-digit remainder is
         concatenated, placeholders are swapped back, and the result is split
         on ":" to give `var`.

    Both substitutions are done in a single regex pass with longer names tried
    first. Compared with chained `str.replace` calls this keeps names that are
    prefixes/substrings of one another apart (e.g. "D" and "D2") and prevents
    a restored name from being rewritten again when it happens to look like a
    placeholder.

    Parameters
    ----------
    term : coefficient name to parse (converted with `str`).
    vars_ : known variable names; at most 25. Empty names are ignored and,
        for duplicated names, the first occurrence determines the placeholder.

    Returns
    -------
    dict with keys "var" (list of str) and "val" (list of float).

    Raises
    ------
    ValueError
        If more than 25 variable names are supplied.
    """
    if len(vars_) > 25:
        raise ValueError("Interaction limit (25) exceeded. Reduce number of other treatments.")

    # name -> placeholder (A_XX, B_XX, ...), positionally tied to vars_
    name_to_token = {}
    for pos, name in enumerate(vars_):
        if name and name not in name_to_token:
            name_to_token[name] = f"{chr(65 + pos)}_XX"
    token_to_name = {token: name for name, token in name_to_token.items()}

    text = str(term)
    if name_to_token:
        # Longest alternative first so "D2" wins over its prefix "D".
        ordered = sorted(name_to_token, key=len, reverse=True)
        forward = "|".join(re.escape(name) for name in ordered)
        text = re.sub(forward, lambda hit: name_to_token[hit.group(0)], text)

    values = [float(digits) for digits in re.findall(r"\d+", text)]
    remainder = "".join(re.findall(r"\D+", text))  # keep only non-digits, concat

    if token_to_name:
        backward = "|".join(re.escape(token) for token in token_to_name)
        remainder = re.sub(backward, lambda hit: token_to_name[hit.group(0)], remainder)

    names = [piece for piece in remainder.split(":") if piece != ""]
    return {"var": names, "val": values}


def _manual_linear_predict(
    df: pd.DataFrame,
    outcol: str,
    params: pd.Series,
    varlist: list[str],
    const: bool = True,
) -> pd.DataFrame:
    """
    R-style linear predictor:
      df[outcol] = sum over v of beta_v * x_v (+ intercept)

    Supports:
      - plain terms that are columns in df
      - interactions "a:b:c" (product of columns)
      - limited "I(expr)" where expr can reference df columns and use + - * / ** ()

    Parameters
    ----------
    df : frame holding the regressors. It is modified IN PLACE (column
        `outcol` is created or overwritten) and the same object is returned.
    outcol : name of the column receiving the linear predictor.
    params : coefficients indexed by term name.
    varlist : terms to include. A term contributes only if it is present in
        `params.index` with a non-missing coefficient.
    const : when true, add the first non-missing coefficient found under the
        names "Intercept", "const", "(Intercept)" (in that order).

    Behaviour worth knowing
    -----------------------
    - Terms whose name contains "FACT" are skipped here (handled by the caller).
    - Interactions with any part missing from `df`, terms that match none of
      the supported shapes, and I(expr) terms whose evaluation fails are
      skipped without error (best effort, by design).
    - Column values are coerced with `pd.to_numeric(errors="coerce")`, so
      non-numeric entries become NaN and propagate into the result.
    - The sum is accumulated in a NumPy array and written to `df[outcol]`
      once at the end; while terms are evaluated `df[outcol]` holds zeros.
    """
    n_rows = len(df)
    # Create the output column up front so it exists (as zeros) for the whole
    # evaluation and keeps its position among the columns.
    df[outcol] = 0.0
    total = np.zeros(n_rows, dtype=float)

    # Regex matching any string column name as a whole word; built lazily the
    # first time an I(expr) term is met.
    col_regex = None
    col_regex_built = False

    for term in varlist:
        if term not in params.index:
            continue

        coef = params[term]
        if pd.isna(coef):
            continue
        coef = float(coef)
        name = str(term)

        # FACT terms (handled elsewhere in lpredict)
        if "FACT" in name:
            continue

        # Interaction a:b:c -> product of the named columns
        if ":" in name:
            parts = name.split(":")
            if all(p in df.columns for p in parts):
                prod = np.ones(n_rows, dtype=float)
                for p in parts:
                    prod *= pd.to_numeric(df[p], errors="coerce").to_numpy(dtype=float)
                total += prod * coef
            continue

        # Plain column
        if name in df.columns:
            total += pd.to_numeric(df[name], errors="coerce").to_numpy(dtype=float) * coef
            continue

        # I(expr) term (patsy-style)
        m = re.match(r"^I\((.+)\)$", name)
        if m is None:
            # Unknown term: skip
            continue

        if not col_regex_built:
            # Only non-empty string labels can appear in an expression; other
            # labels (ints, tuples, ...) are ignored instead of crashing.
            # Longest names first so a longer name wins over one it contains.
            str_cols = sorted(
                (c for c in df.columns if isinstance(c, str) and c),
                key=len,
                reverse=True,
            )
            if str_cols:
                col_regex = re.compile(
                    "|".join(rf"\b{re.escape(c)}\b" for c in str_cols)
                )
            col_regex_built = True

        safe_expr = m.group(1)
        if col_regex is not None:
            # Single pass: text inserted for one column is never rescanned, so
            # a column called e.g. "df" or "float" cannot corrupt it.
            safe_expr = col_regex.sub(
                lambda hit: f"df[{hit.group(0)!r}].astype(float).to_numpy()",
                safe_expr,
            )
        try:
            # SECURITY NOTE: `eval` runs arbitrary Python taken from the
            # coefficient name, with builtins available. Only use with models
            # whose term names come from a trusted source.
            val = eval(safe_expr, {"np": np, "df": df})
            # In-place add raises on an incompatible shape, which lands in the
            # except below and leaves `total` untouched.
            total += np.asarray(val, dtype=float) * coef
        except Exception:
            # Skip silently (closer to "do not crash" behavior)
            pass

    # Add intercept if present
    if const:
        for icpt_name in ("Intercept", "const", "(Intercept)"):
            if icpt_name in params.index and not pd.isna(params[icpt_name]):
                total += float(params[icpt_name])
                break

    df[outcol] = total
    return df


def lpredict(
    df: pd.DataFrame,
    outcol: str,
    fitted_model,
    varlist=None,
    const: bool = True,
    prob: bool = False,
    factor: bool = False,
    sensitivity: float = 1e-10,
) -> pd.DataFrame:
    """
    Add predicted values into `df[outcol]` and return a modified COPY of `df`.

    Intended as a Python analogue of the R internal helper `lpredict()`.

    Parameters
    ----------
    df : input data; never modified (a copy is made first).
    outcol : name of the column to create/overwrite with the prediction.
    fitted_model : object exposing `.params` (ideally a named `pd.Series`) and,
        optionally, `.predict(df)` and `.model.exog_names`.
    varlist : terms for the manual linear predictor. Defaults to
        `fitted_model.model.exog_names` minus intercept names.
    const : include the intercept in the manual linear predictor. When false,
        the manual path is always used, because `fitted_model.predict` would
        include the intercept.
    prob : on the manual path, map the linear predictor through the logistic
        function; also forces the trimming rule below.
    factor : use the manual path with all non-intercept coefficient names as
        terms, and treat names containing "FACT" as encoded indicator
        interactions (decoded with `var_extract` against the non-interaction
        entries of the original `varlist`); their coefficient is added on
        rows where every decoded variable equals its decoded value.
    sensitivity : probabilities strictly below this value are set to 0.

    Prediction path
    ---------------
    - If `factor` is false and `const` is true, `fitted_model.predict(df)` is
      tried first; its output is stored as-is (no logistic transform).
    - Otherwise, or if `predict` raises, the linear predictor is built
      term-by-term by `_manual_linear_predict` (+ FACT terms, + logistic
      transform when `prob`).

    Trimming
    --------
    Applied when `prob` is true, `outcol` starts with "PS_", or the model's
    family class is named "binomial":  NaN -> 1,  value < sensitivity -> 0.

    Raises
    ------
    TypeError
        If `fitted_model.params` is missing or cannot be turned into a Series.
    """
    df = df.copy()

    # ---- Coefficients as a Series ----
    try:
        params = fitted_model.params
        if not isinstance(params, pd.Series):
            # best-effort; unnamed inputs end up with a positional index
            params = pd.Series(params, index=getattr(params, "index", None))
    except Exception as e:
        raise TypeError("lpredict: fitted_model must expose `.params` with named coefficients.") from e

    intercept_names = ("Intercept", "const", "(Intercept)")
    if varlist is None:
        exog_names = getattr(getattr(fitted_model, "model", None), "exog_names", None)
        if exog_names is None:
            # attribute missing or explicitly None -> no terms
            exog_names = []
        varlist = [v for v in exog_names if v not in intercept_names]

    # R behavior when factor=TRUE:
    singletons = []
    if factor:
        singletons = [v for v in (varlist or []) if ":" not in str(v)]
        # override varlist to coefficient names excluding intercept (as in R)
        varlist = [v for v in params.index if v not in intercept_names]

    # ---- Preferred path: model.predict(df) when it can honor the request ----
    # predict() always uses every fitted coefficient, intercept included, so it
    # is skipped when the caller asked for const=False (or factor=True).
    # NOTE(review): an explicitly passed `varlist` that is a strict subset of
    # the model terms is likewise not honored by predict(); left unchanged
    # here because the manual path cannot evaluate every kind of model term.
    pred = None
    if const and not factor:
        try:
            pred = fitted_model.predict(df)
        except Exception:
            pred = None

    if pred is not None:
        df[outcol] = np.asarray(pred, dtype=float)
    else:
        # Manual linear predictor
        df = _manual_linear_predict(df, outcol, params=params, varlist=list(varlist), const=const)

        # Add FACT terms if requested
        if factor:
            for term in varlist:
                if "FACT" not in str(term):
                    continue
                if term not in params.index or pd.isna(params[term]):
                    continue
                coef = float(params[term])

                decoded = var_extract(str(term), singletons)
                names = decoded.get("var", [])
                levels = decoded.get("val", [])
                if not names or not levels:
                    continue

                # Rows where every decoded variable equals its decoded level;
                # zip() stops at the shorter of the two lists.
                sel = np.ones(len(df), dtype=bool)
                for v, level in zip(names, levels):
                    if v not in df.columns:
                        # unknown variable -> the indicator matches no row
                        sel[:] = False
                        break
                    x = pd.to_numeric(df[v], errors="coerce").to_numpy(dtype=float)
                    sel &= (x == float(level))
                df.loc[sel, outcol] = df.loc[sel, outcol].to_numpy(dtype=float) + coef

        # If prob requested, convert linear predictor -> probability (R behavior)
        if prob:
            df[outcol] = _expit(df[outcol].astype(float).to_numpy())

    # ---- Trimming rule (R) for prob/propensity scores ----
    # NOTE(review): on the manual path with prob=False the column holds a
    # linear predictor, yet it is still trimmed when the name starts with
    # "PS_" or the family is binomial - confirm that this is intended.
    looks_like_ps = isinstance(outcol, str) and outcol.startswith("PS_")
    if prob or looks_like_ps or _is_glm_binomial(fitted_model):
        p = df[outcol].astype(float).to_numpy()
        p = np.where(np.isnan(p), 1.0, p)
        p = np.where(p < sensitivity, 0.0, p)
        df[outcol] = p

    return df
