# `pyensmallen` for Maximum Likelihood

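L-BFGS works well for most smooth convex objectives, notably the negative log-likelihood of many common models (maximizing a likelihood is done by minimizing its negative log, which is the loss optimized below). I generally find that optimization convergence is so fast that bootstrapping the entire procedure may be feasible.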

Benchmarks against scipy, cvxpy (hand-rolled), and statsmodels are provided.

In [1]:
import numpy as np
import pyensmallen
import scipy.optimize
import cvxpy as cp
from scipy.special import expit
import time

import statsmodels.api as sm

## Data Generation

In [2]:
# Seed NumPy's global RNG once; every later np.random call in the notebook
# continues from this state, so the order of draws below matters.
np.random.seed(42)
# n observations and k regressors for the linear benchmark.
n, k = 1_000_000, 20

# Linear Regression Data
# Design matrix of standard-normal draws, shape (n, k); no intercept column.
X_linear = np.random.randn(n, k)
# True coefficients drawn uniformly from [0, 1); printed so the fitted
# values in the cells below can be compared against them.
print(true_params_linear := np.random.rand(k))
# Response: linear signal plus standard-normal noise.
y_linear = X_linear @ true_params_linear + np.random.randn(n)

[0.51639859 0.94598022 0.23380001 0.55162275 0.97811966 0.24254699
 0.64702478 0.70271041 0.26476461 0.77362184 0.7817448  0.36874977
 0.72697004 0.06518613 0.72705723 0.38967364 0.03826155 0.39386005
 0.0438693  0.72142769]


## Linear Regression

### pyensmallen

In [3]:
def linear_objective(params, gradient, X, y):
    """Sum-of-squared-residuals objective for least squares.

    Written in the callback shape used with the optimizer in this notebook:
    the value is returned and the gradient is written into a caller-owned
    buffer.

    Parameters
    ----------
    params : ndarray
        Coefficient vector; reshaped to a column internally.
    gradient : ndarray
        Writable buffer with one entry per coefficient. It is overwritten
        in place with ``2 * X.T @ (X @ params - y)``.
    X : ndarray
        Design matrix with one column per coefficient.
    y : ndarray
        Responses; reshaped to a column internally.

    Returns
    -------
    float
        ``sum((X @ params - y) ** 2)``.
    """
    params = params.reshape(-1, 1)
    residuals = X @ params - y.reshape(-1, 1)
    objective = np.sum(residuals**2)
    # `*` and `@` share precedence and associate left-to-right, so the
    # unparenthesised `2 * X.T @ residuals` is `(2 * X.T) @ residuals` and
    # builds a scaled copy of the whole design matrix on every evaluation.
    # Scaling the small product instead yields the same values (doubling is
    # exact in binary floating point) without that allocation.
    grad = 2 * (X.T @ residuals)
    gradient[:] = grad.flatten()
    return objective

In [4]:
%%time
# Least squares via pyensmallen's L-BFGS, started from a uniform random point.
# The lambda closes over the data so the optimizer only sees the
# (params, gradient) pair.
optimizer = pyensmallen.L_BFGS()
result_linear_ens = optimizer.optimize(
    lambda beta, grad_out: linear_objective(
        params=beta, gradient=grad_out, X=X_linear, y=y_linear
    ),
    np.random.rand(k),
)
print(result_linear_ens)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 5.47 s, sys: 151 ms, total: 5.62 s
Wall time: 371 ms


### scipy

In [5]:
%%time
# Same least-squares problem handed to scipy with an analytic gradient.
# NOTE(review): no `method` is passed, so scipy picks its own default solver
# for this unconstrained problem rather than an explicitly requested L-BFGS;
# confirm that is the comparison intended.
result_linear_scipy = scipy.optimize.minimize(
    fun=lambda b: np.sum((X_linear @ b - y_linear) ** 2),
    x0=np.random.rand(k),
    # Parenthesised on purpose: `2 * X_linear.T @ r` is evaluated as
    # `(2 * X_linear.T) @ r`, which copies and scales the full design matrix
    # on every gradient call. Scaling the length-k product gives the same
    # values without that allocation.
    jac=lambda b: 2 * (X_linear.T @ (X_linear @ b - y_linear)),
).x
print(result_linear_scipy)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 1min 30s, sys: 2.62 s, total: 1min 32s
Wall time: 6.03 s


### cvxpy

In [6]:
%%time
# Decision variable: one entry per regression coefficient.
b_linear = cp.Variable(k)
# Squared 2-norm of the residual divided by n (mean squared error); the 1/n
# factor rescales the objective without moving its minimiser.
cost_linear = cp.norm(X_linear @ b_linear - y_linear, p=2) ** 2 / n
prob_linear = cp.Problem(cp.Minimize(cost_linear))
# Solve with the SCS solver; the fitted coefficients are read from the variable.
prob_linear.solve(solver=cp.SCS)
print(b_linear.value)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 23.1 s, sys: 3.6 s, total: 26.7 s
Wall time: 24.9 s


### closed form

In [7]:
%%time
# Direct least-squares solve as the reference answer; element [0] of the
# returned tuple is the coefficient vector, displayed as the cell output.
np.linalg.lstsq(X_linear, y_linear, rcond=None)[0]

CPU times: user 2.76 s, sys: 11 ms, total: 2.77 s
Wall time: 411 ms


array([0.51556024, 0.94691468, 0.23404849, 0.55121759, 0.97818756,
       0.24338623, 0.64700696, 0.70195589, 0.26487498, 0.77280983,
       0.78267599, 0.36787315, 0.72791074, 0.06571446, 0.72615144,
       0.38766298, 0.03820425, 0.39468909, 0.04304362, 0.72195013])

In [8]:
%%time
# OLS through statsmodels on the same arrays; X_linear is passed as-is, so
# the model has no intercept column, matching the other fits.
sm.OLS(y_linear, X_linear).fit().params

CPU times: user 7.65 s, sys: 339 ms, total: 7.99 s
Wall time: 1.63 s


array([0.51556024, 0.94691468, 0.23404849, 0.55121759, 0.97818756,
       0.24338623, 0.64700696, 0.70195589, 0.26487498, 0.77280983,
       0.78267599, 0.36787315, 0.72791074, 0.06571446, 0.72615144,
       0.38766298, 0.03820425, 0.39468909, 0.04304362, 0.72195013])

## Logistic Regression

In [9]:
# Logistic Regression Data
# Rebinds the notebook-level n and k; the logistic cells below read these
# values. The RNG is not re-seeded here, so draws continue from the state
# left by the earlier cells.
n, k = 10_000, 20
X_logistic = np.random.randn(n, k)
# True coefficients, printed for comparison with the estimates below.
print(true_params_logistic := np.random.rand(k))
# Success probability of each observation via the logistic (sigmoid) function.
p = expit(X_logistic @ true_params_logistic)
# One 0/1 draw per observation with those probabilities.
y_logistic = np.random.binomial(1, p)

[0.17265046 0.1778864  0.49767087 0.64843282 0.98414584 0.21942117
 0.53109792 0.68926063 0.9222397  0.90592967 0.08626337 0.45876915
 0.07621689 0.47511573 0.2636066  0.66777898 0.76603666 0.01132669
 0.01504104 0.02569576]


### pyensmallen

In [10]:
def logistic_objective(params, gradient, X, y):
    """Negative Bernoulli log-likelihood of a logit model.

    Written in the callback shape used with the optimizer in this notebook:
    the value is returned and the gradient is written into a caller-owned
    buffer.

    Parameters
    ----------
    params : ndarray
        Coefficient vector.
    gradient : ndarray
        Writable buffer with one entry per coefficient. It is overwritten
        in place with ``X.T @ (expit(X @ params) - y)``.
    X : ndarray
        Design matrix with one column per coefficient.
    y : ndarray
        0/1 outcomes, one per row of ``X``.

    Returns
    -------
    float
        ``sum(log(1 + exp(z)) - y * z)`` with ``z = X @ params``. This is
        algebraically ``-sum(y * log(h) + (1 - y) * log(1 - h))`` for
        ``h = expit(z)``.
    """
    z = X @ params
    # Evaluate the loss on the linear predictor rather than through the
    # probabilities: once expit(z) saturates to exactly 0 or 1, log(h) or
    # log1p(-h) becomes -inf and multiplying it by a zero label gives nan.
    # logaddexp(0, z) computes log(1 + exp(z)) without overflow.
    objective = np.sum(np.logaddexp(0.0, z) - y * z)
    # The gradient only needs the probabilities themselves, which expit
    # returns as finite values in [0, 1] for any z.
    gradient[:] = X.T @ (expit(z) - y)
    return objective

In [11]:
%%time
# Hand the optimizer a C-contiguous design matrix (better performance) and a
# flat outcome vector.
X_logistic2 = np.ascontiguousarray(X_logistic)
y_logistic2 = y_logistic.ravel()

# Logit fit via pyensmallen's L-BFGS from a uniform random starting point;
# the lambda binds the data so only (params, gradient) is exposed.
optimizer = pyensmallen.L_BFGS()
result_logistic_ens = optimizer.optimize(
    lambda beta, grad_out: logistic_objective(
        params=beta, gradient=grad_out, X=X_logistic2, y=y_logistic2
    ),
    np.random.rand(k),
)
print(result_logistic_ens)

[ 0.17934344  0.17613996  0.47516449  0.65546909  0.99649103  0.18322668
  0.49237341  0.73499685  0.92133107  0.92753401  0.05997722  0.45651545
  0.08532323  0.49863473  0.30227163  0.63633992  0.77514856  0.04266289
 -0.03735041  0.03002617]
CPU times: user 66.6 ms, sys: 144 μs, total: 66.8 ms
Wall time: 10.2 ms


### scipy

In [12]:
%%time
# Logit negative log-likelihood minimised by scipy with an analytic gradient.
result_logistic_scipy = scipy.optimize.minimize(
    # sum(log(1 + exp(z)) - y * z) with z = X @ b. This is algebraically the
    # same loss as -sum(y * log(h) + (1 - y) * log(1 - h)), but it is evaluated
    # on the linear predictor, so it stays finite when expit(z) saturates
    # (log(1 - expit(z)) would be -inf there, and 0 * -inf is nan).
    fun=lambda b: np.sum(
        np.logaddexp(0.0, X_logistic @ b) - y_logistic * (X_logistic @ b)
    ),
    x0=np.random.rand(k),
    jac=lambda b: X_logistic.T @ (expit(X_logistic @ b) - y_logistic),
).x
print(result_logistic_scipy)

[ 0.17934344  0.17613996  0.47516449  0.65546909  0.99649103  0.18322668
  0.49237341  0.73499685  0.92133107  0.92753401  0.05997722  0.45651545
  0.08532323  0.49863473  0.30227163  0.63633992  0.77514856  0.04266289
 -0.03735041  0.03002617]
CPU times: user 243 ms, sys: 4.16 ms, total: 247 ms
Wall time: 66.9 ms


### cvxpy

In [13]:
%%time
# Decision variable: one entry per logit coefficient.
b_logistic = cp.Variable(k)
# Bernoulli log-likelihood in the form sum(y * z - log(1 + exp(z))) with
# z = X_logistic @ b_logistic; cp.logistic supplies the log(1 + exp(.)) term.
log_likelihood = cp.sum(
    cp.multiply(y_logistic, X_logistic @ b_logistic)
    - cp.logistic(X_logistic @ b_logistic)
)
# Maximise the log-likelihood directly (rather than minimising its negative).
prob_logistic = cp.Problem(cp.Maximize(log_likelihood))
prob_logistic.solve(solver=cp.SCS)
print(b_logistic.value)

[ 0.17934256  0.17613886  0.47516186  0.65546576  0.99648578  0.18322555
  0.4923709   0.73499296  0.92132606  0.92752899  0.05997698  0.45651312
  0.08532281  0.49863207  0.30227002  0.63633648  0.77514439  0.0426626
 -0.03735022  0.0300261 ]
CPU times: user 1.1 s, sys: 93 μs, total: 1.1 s
Wall time: 1.1 s


### statsmodels

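statsmodels' `Logit.fit()` uses Newton–Raphson by default, which for the logistic model is equivalent to IRLS.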

In [14]:
%%time
# Logit fit with statsmodels' default fit settings on the same arrays;
# X_logistic is passed as-is, so no intercept column is included.
sm.Logit(y_logistic, X_logistic).fit().params

Optimization terminated successfully.
         Current function value: 0.416715
         Iterations 7
CPU times: user 111 ms, sys: 0 ns, total: 111 ms
Wall time: 17.6 ms


array([ 0.17934344,  0.17613996,  0.47516449,  0.65546909,  0.99649103,
        0.18322668,  0.49237341,  0.73499685,  0.92133107,  0.92753401,
        0.05997722,  0.45651545,  0.08532323,  0.49863473,  0.30227163,
        0.63633992,  0.77514856,  0.04266289, -0.03735041,  0.03002617])

## Poisson Regression

In [15]:
# Rebinds the notebook-level n and k; the Poisson cells below read these
# values. The RNG is not re-seeded here, so draws continue from the state
# left by the earlier cells.
n, k = 100_000, 10
# Poisson Regression Data
X_poisson = np.random.randn(n, k)
# True coefficients, printed for comparison with the estimates below.
print(true_params_poisson := np.random.rand(k))
# Conditional mean of each count under the log link: exp(X @ beta).
lambda_ = np.exp(X_poisson @ true_params_poisson)
# One Poisson draw per observation with that mean.
y_poisson = np.random.poisson(lambda_)

[0.86917029 0.96687352 0.57554146 0.2946829  0.65323721 0.74023376
 0.28311498 0.59695143 0.19782333 0.94529361]


### pyensmallen

In [16]:
def poisson_objective(params, gradient, X, y):
    """Negative Poisson log-likelihood with a log link, up to a constant.

    Written in the callback shape used with the optimizer in this notebook:
    the value is returned and the gradient is written into a caller-owned
    buffer. The ``log(y!)`` term does not depend on ``params`` and is left
    out.

    Parameters
    ----------
    params : ndarray
        Coefficient vector; reshaped to a column internally.
    gradient : ndarray
        Writable buffer with one entry per coefficient. It is overwritten
        in place with ``X.T @ (exp(X @ params) - y)``.
    X : ndarray
        Design matrix with one column per coefficient.
    y : ndarray
        Observed counts; reshaped to a column internally.

    Returns
    -------
    float
        ``sum(exp(z) - y * z)`` with ``z = X @ params``.
    """
    params = params.reshape(-1, 1)
    y = y.reshape(-1, 1)
    Xbeta = X @ params
    lambda_ = np.exp(Xbeta)
    # log(lambda_) is Xbeta by construction, so use the linear predictor
    # directly. Going through exp and back through log returns -inf when
    # exp underflows to 0 (and 0 * -inf = nan for zero counts) and inf when
    # it overflows, even though y * Xbeta itself is perfectly finite.
    objective = np.sum(lambda_ - np.multiply(y, Xbeta))
    # Gradient of the expression above with respect to the coefficients.
    grad = X.T @ (lambda_ - y)
    gradient[:] = grad.ravel()
    return objective

In [17]:
%%time
# Poisson regression via pyensmallen's L-BFGS, started from the zero vector.
# The lambda binds the data so only (params, gradient) is exposed.
optimizer = pyensmallen.L_BFGS()
result_poisson_ens = optimizer.optimize(
    lambda beta, grad_out: poisson_objective(
        params=beta, gradient=grad_out, X=X_poisson, y=y_poisson
    ),
    np.zeros(k),
)
print(result_poisson_ens)

[0.86877556 0.96685947 0.57539955 0.29520506 0.65312095 0.73924052
 0.28189688 0.59807216 0.1970171  0.94496917]
CPU times: user 1.89 s, sys: 4.9 ms, total: 1.89 s
Wall time: 129 ms


### scipy

In [18]:
%%time
# Poisson negative log-likelihood (constant term dropped) minimised by scipy
# with an analytic gradient, from a uniform random starting point.
result_poisson_scipy = scipy.optimize.minimize(
    x0=np.random.rand(k),
    fun=lambda beta: (
        np.exp(X_poisson @ beta) - y_poisson * (X_poisson @ beta)
    ).sum(),
    jac=lambda beta: X_poisson.T @ (np.exp(X_poisson @ beta) - y_poisson),
).x
print(result_poisson_scipy)

[0.86877556 0.96685947 0.57539955 0.29520505 0.65312095 0.73924053
 0.28189688 0.59807215 0.1970171  0.94496917]
CPU times: user 5.01 s, sys: 0 ns, total: 5.01 s
Wall time: 347 ms


### cvxpy

In [None]:
%%capture
# Decision variable: one entry per Poisson coefficient.
b_poisson = cp.Variable(k)
# Linear predictor as a cvxpy expression; note this rebinds the notebook-level
# name `z`.
z = X_poisson @ b_poisson
# Mean Poisson negative log-likelihood (constant term dropped):
# sum(exp(z) - y * z) / n.
cost_poisson = cp.sum(cp.exp(z) - cp.multiply(y_poisson, z)) / n
prob_poisson = cp.Problem(cp.Minimize(cost_poisson))
prob_poisson.solve(solver=cp.SCS)
# The cell runs under %%capture, so this print is not shown in the notebook.
print(b_poisson.value)

The cvxpy/SCS solve runs out of memory at this problem size, so no cvxpy estimates are reported for the Poisson model.

### statsmodels

In [20]:
%%time
# Poisson fit with statsmodels' default fit settings; X_poisson is passed
# as-is, so no intercept column is included.
# NOTE(review): the recorded output shows a final objective of ~7.5e16 and no
# "Optimization terminated successfully" line, i.e. this fit did not converge.
# A different `method=` or explicit start values may be needed — TODO confirm.
sm.Poisson(y_poisson, X_poisson).fit().params

         Current function value: 75075747038101008.000000
         Iterations: 35
CPU times: user 6.16 s, sys: 3.54 ms, total: 6.17 s
Wall time: 424 ms




array([2.52946653, 4.6910591 , 3.09004472, 0.60682215, 4.08554177,
       4.53577489, 0.33532728, 4.82258398, 1.23212272, 4.19929592])

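Way off: the statsmodels fit did not converge (note the enormous final objective value and the missing "Optimization terminated successfully" message), so these coefficients are nowhere near the true parameters or the pyensmallen and scipy estimates above.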