# `pyensmallen` for Maximum Likelihood

LBFGS works well for most smooth convex functions, notably a convex loss such as a likelihood. I generally find that optimization convergence is so fast that bootstrapping the entire procedure may be feasible.

Benchmarks against scipy, cvxpy (hand-rolled), and statsmodels are provided.

In [1]:
import numpy as np
import pyensmallen
import scipy.optimize
import cvxpy as cp
from scipy.special import expit
import time

import statsmodels.api as sm
import nlopt

## Data Generation

In [2]:
np.random.seed(42)
n, k = 1_000_000, 20

# Linear Regression Data
X_linear = np.random.randn(n, k)
print(true_params_linear := np.random.rand(k))
y_linear = X_linear @ true_params_linear + np.random.randn(n)

[0.51639859 0.94598022 0.23380001 0.55162275 0.97811966 0.24254699
 0.64702478 0.70271041 0.26476461 0.77362184 0.7817448  0.36874977
 0.72697004 0.06518613 0.72705723 0.38967364 0.03826155 0.39386005
 0.0438693  0.72142769]


## Linear Regression

### pyensmallen

In [3]:
def linear_objective(params, gradient, X, y):
    params = params.reshape(-1, 1)
    residuals = X @ params - y.reshape(-1, 1)
    objective = np.sum(residuals**2)
    grad = 2 * X.T @ residuals
    gradient[:] = grad.flatten()
    return objective

linear_start = np.random.rand(k)

In [4]:
%%time
optimizer = pyensmallen.L_BFGS()
result_linear_ens = optimizer.optimize(
    lambda params, gradient: linear_objective(params, gradient, X_linear, y_linear),
    linear_start,
)
print(result_linear_ens)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 5.61 s, sys: 192 ms, total: 5.8 s
Wall time: 385 ms


### nlopt

In [5]:
%%time
opt = nlopt.opt(nlopt.LD_LBFGS, k)
opt.set_min_objective(lambda params, gradient: linear_objective(params, gradient, X_linear, y_linear))
result_linear_nlopt = opt.optimize(linear_start)
print(result_linear_nlopt)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 9.12 s, sys: 282 ms, total: 9.4 s
Wall time: 614 ms


### scipy

In [6]:
%%time
result_linear_scipy = scipy.optimize.minimize(
    fun=lambda b: np.sum((X_linear @ b - y_linear) ** 2),
    x0=linear_start,
    jac=lambda b: 2 * X_linear.T @ (X_linear @ b - y_linear),
).x
print(result_linear_scipy)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 1min 34s, sys: 2.21 s, total: 1min 36s
Wall time: 6.31 s


### cvxpy

In [7]:
%%time
b_linear = cp.Variable(k)
cost_linear = cp.norm(X_linear @ b_linear - y_linear, p=2) ** 2 / n
prob_linear = cp.Problem(cp.Minimize(cost_linear))
prob_linear.solve(solver=cp.SCS)
print(b_linear.value)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 21.5 s, sys: 3.59 s, total: 25.1 s
Wall time: 25 s


### closed form

In [8]:
%%time
np.linalg.lstsq(X_linear, y_linear, rcond=None)[0]

CPU times: user 1.62 s, sys: 1.05 s, total: 2.67 s
Wall time: 798 ms


array([0.51556024, 0.94691468, 0.23404849, 0.55121759, 0.97818756,
       0.24338623, 0.64700696, 0.70195589, 0.26487498, 0.77280983,
       0.78267599, 0.36787315, 0.72791074, 0.06571446, 0.72615144,
       0.38766298, 0.03820425, 0.39468909, 0.04304362, 0.72195013])

In [9]:
%%time
sm.OLS(y_linear, X_linear).fit().params

CPU times: user 7.05 s, sys: 3.25 s, total: 10.3 s
Wall time: 3.51 s


array([0.51556024, 0.94691468, 0.23404849, 0.55121759, 0.97818756,
       0.24338623, 0.64700696, 0.70195589, 0.26487498, 0.77280983,
       0.78267599, 0.36787315, 0.72791074, 0.06571446, 0.72615144,
       0.38766298, 0.03820425, 0.39468909, 0.04304362, 0.72195013])

## Logistic Regression

In [10]:
# Logistic Regression Data
n, k = 10_000, 20
X_logistic = np.random.randn(n, k)
print(true_params_logistic := np.random.rand(k))
p = expit(X_logistic @ true_params_logistic)
y_logistic = np.random.binomial(1, p)

[0.74560931 0.96037048 0.30659196 0.78557459 0.04726211 0.89027718
 0.24117347 0.89673488 0.36550133 0.97993472 0.25816369 0.58473241
 0.38081169 0.95958797 0.92324205 0.08471913 0.33333406 0.5071405
 0.20493563 0.33786489]


### pyensmallen

In [11]:
def logistic_objective(params, gradient, X, y):
    z = X @ params
    h = expit(z)
    objective = -np.sum(y * np.log(h) + (1 - y) * np.log1p(-h))
    if np.isnan(objective):
        objective = np.inf
    grad = X.T @ (h - y)
    gradient[:] = grad
    return objective

logistic_start = np.random.rand(k)

In [12]:
%%time
X_logistic2 = np.ascontiguousarray(
    X_logistic
)  # Ensure C-contiguous array for better performance
y_logistic2 = y_logistic.ravel()

optimizer = pyensmallen.L_BFGS()
result_logistic_ens = optimizer.optimize(
    lambda params, gradient: logistic_objective(
        params, gradient, X_logistic2, y_logistic2
    ),
    logistic_start,
)
print(result_logistic_ens)

[0.78103852 0.8975452  0.32361818 0.75455507 0.06058863 0.87528454
 0.21365507 0.90129849 0.40673921 0.97992142 0.30068721 0.50152611
 0.40801893 0.97259116 0.93292288 0.11679675 0.31657035 0.53056071
 0.22217501 0.36503279]
CPU times: user 26.2 ms, sys: 129 ms, total: 155 ms
Wall time: 27.8 ms


### nlopt

In [13]:
%%time
opt = nlopt.opt(nlopt.LD_LBFGS, k)
opt.set_min_objective(lambda params, gradient: logistic_objective(
        params, gradient, X_logistic2, y_logistic2
    ))
result_logistic_nlopt = opt.optimize(logistic_start)
print(result_logistic_nlopt)

[0.78103852 0.8975452  0.32361818 0.75455507 0.06058863 0.87528454
 0.21365507 0.90129849 0.40673921 0.97992142 0.30068721 0.50152611
 0.40801892 0.97259117 0.93292288 0.11679676 0.31657035 0.53056071
 0.22217501 0.36503279]
CPU times: user 54.5 ms, sys: 184 ms, total: 239 ms
Wall time: 36.8 ms


  objective = -np.sum(y * np.log(h) + (1 - y) * np.log1p(-h))
  objective = -np.sum(y * np.log(h) + (1 - y) * np.log1p(-h))
  objective = -np.sum(y * np.log(h) + (1 - y) * np.log1p(-h))


### scipy

In [14]:
%%time
result_logistic_scipy = scipy.optimize.minimize(
    fun=lambda b: -np.sum(
        y_logistic * np.log(expit(X_logistic @ b))
        + (1 - y_logistic) * np.log(1 - expit(X_logistic @ b))
    ),
    x0=logistic_start,
    jac=lambda b: X_logistic.T @ (expit(X_logistic @ b) - y_logistic),
).x
print(result_logistic_scipy)



[0.78103852 0.89754519 0.32361818 0.75455506 0.06058863 0.87528454
 0.21365506 0.90129849 0.40673921 0.97992141 0.30068721 0.50152611
 0.40801892 0.97259116 0.93292288 0.11679676 0.31657034 0.53056071
 0.22217501 0.36503279]
CPU times: user 581 ms, sys: 2.21 s, total: 2.79 s
Wall time: 549 ms


### cvxpy

In [15]:
%%time
b_logistic = cp.Variable(k)
log_likelihood = cp.sum(
    cp.multiply(y_logistic, X_logistic @ b_logistic)
    - cp.logistic(X_logistic @ b_logistic)
)
prob_logistic = cp.Problem(cp.Maximize(log_likelihood))
prob_logistic.solve(solver=cp.SCS)
print(b_logistic.value)

[0.78103139 0.8975368  0.32361526 0.75454815 0.06058797 0.87527657
 0.21365308 0.90129021 0.40673545 0.97991244 0.30068428 0.50152154
 0.40801517 0.97258238 0.93291413 0.11679587 0.31656755 0.53055581
 0.22217273 0.36502933]
CPU times: user 3.09 s, sys: 408 ms, total: 3.5 s
Wall time: 3.01 s


### statsmodels

does IRLS

In [16]:
%%time
sm.Logit(y_logistic, X_logistic).fit().params

Optimization terminated successfully.
         Current function value: 0.379271
         Iterations 7
CPU times: user 434 ms, sys: 739 ms, total: 1.17 s
Wall time: 181 ms


array([0.78103852, 0.8975452 , 0.32361818, 0.75455507, 0.06058863,
       0.87528454, 0.21365507, 0.90129849, 0.40673921, 0.97992142,
       0.30068721, 0.50152611, 0.40801892, 0.97259117, 0.93292288,
       0.11679676, 0.31657035, 0.53056071, 0.22217501, 0.36503279])

## Poisson Regression

In [17]:
n, k = 100_000, 10
# Poisson Regression Data
X_poisson = np.random.randn(n, k)
print(true_params_poisson := np.random.rand(k))
lambda_ = np.exp(X_poisson @ true_params_poisson)
y_poisson = np.random.poisson(lambda_)

[0.09990002 0.0547525  0.40475966 0.20774043 0.24657925 0.85989722
 0.86845961 0.14166248 0.09512955 0.92680615]


## pyensmallen

In [18]:
def poisson_objective(params, gradient, X, y):
    params = params.reshape(-1, 1)
    y = y.reshape(-1, 1)
    Xbeta = X @ params
    lambda_ = np.exp(Xbeta)
    objective = np.sum(lambda_ - np.multiply(y, np.log(lambda_)))
    # Compute the gradient
    grad = X.T @ (lambda_ - y)
    gradient[:] = grad.ravel()
    return objective

poisson_start = np.random.rand(k)

In [19]:
%%time
optimizer = pyensmallen.L_BFGS()
result_poisson_ens = optimizer.optimize(
    lambda params, gradient: poisson_objective(params, gradient, X_poisson, y_poisson),
    poisson_start,
)
print(result_poisson_ens)

[0.0978387  0.05226027 0.40495176 0.20798044 0.24750742 0.85914355
 0.86512068 0.14383657 0.09889716 0.92837268]
CPU times: user 244 ms, sys: 427 ms, total: 671 ms
Wall time: 115 ms


### nlopt

In [20]:
%%time
opt = nlopt.opt(nlopt.LD_LBFGS, k)
opt.set_min_objective(lambda params, gradient: poisson_objective(params, gradient, X_poisson, y_poisson))
opt.set_maxeval(100000)
result_poisson_nlopt = opt.optimize(poisson_start)
print(result_poisson_nlopt)

  lambda_ = np.exp(Xbeta)
  objective = np.sum(lambda_ - np.multiply(y, np.log(lambda_)))
  objective = np.sum(lambda_ - np.multiply(y, np.log(lambda_)))
  objective = np.sum(lambda_ - np.multiply(y, np.log(lambda_)))
  grad = X.T @ (lambda_ - y)
  grad = X.T @ (lambda_ - y)


RuntimeError: nlopt failure

### scipy

In [21]:
%%time
result_poisson_scipy = scipy.optimize.minimize(
    fun=lambda b: np.sum(np.exp(X_poisson @ b) - y_poisson * (X_poisson @ b)),
    x0=poisson_start,
    jac=lambda b: X_poisson.T @ (np.exp(X_poisson @ b) - y_poisson),
).x
print(result_poisson_scipy)

[0.09783871 0.05226027 0.40495176 0.20798044 0.24750742 0.85914355
 0.86512068 0.14383657 0.09889716 0.92837268]
CPU times: user 945 ms, sys: 1.79 s, total: 2.73 s
Wall time: 408 ms


### cvxpy

In [None]:
%%capture
b_poisson = cp.Variable(k)
z = X_poisson @ b_poisson
cost_poisson = cp.sum(cp.exp(z) - cp.multiply(y_poisson, z)) / n
prob_poisson = cp.Problem(cp.Minimize(cost_poisson))
prob_poisson.solve(solver=cp.SCS)
print(b_poisson.value)

Runs out of memory.

### statsmodels

In [22]:
%%time
sm.Poisson(y_poisson, X_poisson).fit().params

Optimization terminated successfully.
         Current function value: 1.374653
         Iterations 27
CPU times: user 1.81 s, sys: 3.58 s, total: 5.39 s
Wall time: 838 ms


array([0.09783871, 0.05226027, 0.40495176, 0.20798044, 0.24750742,
       0.85914355, 0.86512068, 0.14383657, 0.09889716, 0.92837268])

Way off.