# In [3]:
import numpy as np
from scipy.optimize import minimize

def clipped_ce_and_grad(beta, X, y, eps=1e-6, l2=0.0):
    """
    Return the clipped linear cross-entropy loss and its gradient.

    The predicted probability is ``p = clip(X @ beta, eps, 1 - eps)`` and the
    loss is the summed binary cross-entropy
    ``-sum(y * log(p) + (1 - y) * log(1 - p))``, plus an optional ridge
    penalty ``0.5 * l2 * beta @ beta``.

    Parameters
    ----------
    beta : array-like, shape (n_features,)
        Coefficient vector.
    X : array-like, shape (n_samples, n_features)
        Design matrix.
    y : array-like, shape (n_samples,)
        Targets entering the cross-entropy formula.
    eps : float, optional
        Clipping margin; predictions are forced into ``[eps, 1 - eps]``.
    l2 : float, optional
        Ridge penalty strength. ``0`` (the default) disables the penalty.

    Returns
    -------
    loss : float
        Summed cross-entropy, including the ridge term when ``l2`` is set.
    grad : numpy.ndarray, shape (n_features,)
        Piecewise gradient of ``loss`` with respect to ``beta``.

    Notes
    -----
    Rows whose linear prediction is clipped contribute nothing to the data
    gradient (``dp/dz = 0`` there). If *every* row is clipped -- for example
    at ``beta = 0``, where ``X @ beta`` is 0 for all rows -- the data gradient
    is exactly zero.
    """
    # Coerce to float arrays: lists are accepted, and integer inputs no longer
    # yield an integer gradient buffer that cannot absorb the ridge term.
    beta = np.asarray(beta, dtype=float)
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    z = X @ beta                          # (n,)
    p = np.clip(z, eps, 1.0 - eps)        # (n,)

    # ----- Loss -----
    # CE(p, y) = -[ y log p + (1 - y) log(1 - p) ]
    # log1p(-p) evaluates log(1 - p) without first rounding 1 - p, which
    # matters when p sits at the lower clipping bound.
    loss = -np.sum(y * np.log(p) + (1.0 - y) * np.log1p(-p))

    # ----- Gradient wrt beta -----
    # dCE/dp = (p - y) / (p (1 - p)); dp/dz = 1 on the interior, 0 when
    # clipped, so the chain rule zeroes out every clipped row.
    interior = (z > eps) & (z < 1.0 - eps)
    dloss_dz = np.where(interior, (p - y) / (p * (1.0 - p)), 0.0)
    grad = X.T @ dloss_dz

    if l2:
        loss += 0.5 * l2 * np.dot(beta, beta)
        grad = grad + l2 * beta

    return loss, grad

def fit_clipped_ce(X, y, beta0=None, eps=1e-6, l2=0.0, tol=1e-8, maxiter=10_000):
    """
    Fit ``beta`` by minimizing the clipped cross-entropy with L-BFGS-B.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Design matrix.
    y : array-like, shape (n_samples,)
        Targets passed to ``clipped_ce_and_grad``.
    beta0 : array-like, shape (n_features,), optional
        Starting point. When omitted, the least-squares solution of
        ``X @ beta ~= y`` is used (see Notes).
    eps : float, optional
        Clipping margin forwarded to ``clipped_ce_and_grad``.
    l2 : float, optional
        Ridge penalty strength forwarded to ``clipped_ce_and_grad``.
    tol : float, optional
        Tolerance passed to ``scipy.optimize.minimize``.
    maxiter : int, optional
        Iteration cap passed to the optimizer via ``options``.

    Returns
    -------
    beta : numpy.ndarray
        The optimizer's final point (``res.x``).
    res : scipy.optimize.OptimizeResult
        The full optimizer result.

    Notes
    -----
    An all-zero start must be avoided: there ``X @ beta`` is 0 on every row,
    every prediction is clipped, and the objective's gradient is exactly zero
    (the ridge term also vanishes at zero), so the optimizer would stop
    immediately and hand back the zero vector. The least-squares start is
    intended to place predictions inside the unclipped region so the
    gradient carries information.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    if beta0 is None:
        # Linear-probability (OLS) fit as a start with a usable gradient.
        beta0 = np.linalg.lstsq(X, y, rcond=None)[0]
    else:
        beta0 = np.asarray(beta0, dtype=float)

    # jac=True: the objective returns (loss, grad) in a single call.
    res = minimize(
        clipped_ce_and_grad,
        beta0,
        args=(X, y, eps, l2),
        method="L-BFGS-B",
        jac=True,
        tol=tol,
        options={"maxiter": maxiter},
    )
    return res.x, res