# Ensmallen for Maximum Likelihood

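Benchmark of pyensmallen's L-BFGS against scipy and cvxpy on three maximum-likelihood problems: linear, logistic, and Poisson regression.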

In [1]:
import numpy as np
import pyensmallen
import scipy.optimize
import cvxpy as cp
from scipy.special import expit
import time

## Data Generation

In [2]:
# Seed the legacy global RNG so every fresh run draws the same synthetic data.
np.random.seed(42)
n = 1_000_000
k = 20

# Linear regression data: Gaussian design matrix, uniform[0, 1) coefficients,
# and unit-variance Gaussian noise. All three share the global random stream,
# so the draw order (design, coefficients, noise) is kept fixed.
X_linear = np.random.randn(n, k)
true_params_linear = np.random.rand(k)
print(true_params_linear)
y_linear = np.add(X_linear @ true_params_linear, np.random.randn(n))

[0.51639859 0.94598022 0.23380001 0.55162275 0.97811966 0.24254699
 0.64702478 0.70271041 0.26476461 0.77362184 0.7817448  0.36874977
 0.72697004 0.06518613 0.72705723 0.38967364 0.03826155 0.39386005
 0.0438693  0.72142769]


## Linear Regression

### pyensmallen

In [3]:
def linear_objective(params, gradient, X, y):
    """Sum-of-squares loss for linear regression, with an in-place gradient.

    Parameters
    ----------
    params : ndarray
        Coefficient vector; reshaped to a column internally.
    gradient : ndarray
        Writable array with one entry per coefficient. It is overwritten
        with the gradient of the loss with respect to ``params``.
    X : ndarray
        Design matrix with one column per coefficient.
    y : ndarray
        Responses; reshaped to a column internally.

    Returns
    -------
    float
        ``sum((X @ params - y) ** 2)``.
    """
    params = params.reshape(-1, 1)
    residuals = X @ params - y.reshape(-1, 1)
    objective = np.sum(residuals**2)
    # Parenthesize so the factor 2 scales the small vector X.T @ residuals.
    # `2 * X.T @ residuals` parses as `(2 * X.T) @ residuals`, which allocates
    # a scaled copy of the entire design matrix on every evaluation.
    grad = 2 * (X.T @ residuals)
    gradient[:] = grad.flatten()
    return objective

In [4]:
%%time
# Fit the linear model with pyensmallen's L-BFGS. The callback returns the
# loss and writes the gradient into `gradient` in place.
optimizer = pyensmallen.L_BFGS()
result_linear_ens = optimizer.optimize(
    lambda params, gradient: linear_objective(params, gradient, X_linear, y_linear),
    # Size the random start from the data: the global `k` is rebound by the
    # later logistic and Poisson sections, so relying on it breaks this cell
    # when it is re-run after them.
    np.random.rand(X_linear.shape[1]),
)
print(result_linear_ens)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 5.74 s, sys: 147 ms, total: 5.89 s
Wall time: 416 ms


### scipy

In [5]:
%%time
# Same least-squares problem through scipy.optimize.minimize (default method),
# with the analytic gradient supplied via `jac`.
result_linear_scipy = scipy.optimize.minimize(
    fun=lambda b: np.sum((X_linear @ b - y_linear) ** 2),
    # Sized from the data: the global `k` is rebound by later sections.
    x0=np.random.rand(X_linear.shape[1]),
    # Parenthesized so the factor 2 scales the small gradient vector;
    # `2 * X_linear.T @ r` would first build a scaled copy of the whole
    # design matrix on every gradient call.
    jac=lambda b: 2 * (X_linear.T @ (X_linear @ b - y_linear)),
).x
print(result_linear_scipy)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 1min 43s, sys: 2.7 s, total: 1min 45s
Wall time: 7.18 s


### cvxpy

In [6]:
%%time
# Same least-squares problem stated declaratively in cvxpy and solved with SCS.
# Sizes are read from the data rather than from the globals `n` and `k`, which
# the later logistic and Poisson sections rebind.
b_linear = cp.Variable(X_linear.shape[1])
# Dividing by the number of rows only rescales the objective; the minimizer
# is unchanged.
cost_linear = cp.norm(X_linear @ b_linear - y_linear, p=2) ** 2 / X_linear.shape[0]
prob_linear = cp.Problem(cp.Minimize(cost_linear))
prob_linear.solve(solver=cp.SCS)
print(b_linear.value)

[0.51556024 0.94691468 0.23404849 0.55121759 0.97818756 0.24338623
 0.64700696 0.70195589 0.26487498 0.77280983 0.78267599 0.36787315
 0.72791074 0.06571446 0.72615144 0.38766298 0.03820425 0.39468909
 0.04304362 0.72195013]
CPU times: user 22.8 s, sys: 3.59 s, total: 26.4 s
Wall time: 24.7 s


## Logistic Regression

In [7]:
# Logistic Regression Data
n, k = 10_000, 20
X_logistic = np.random.randn(n, k)
print(true_params_logistic := np.random.rand(k))
p = expit(X_logistic @ true_params_logistic)
y_logistic = np.random.binomial(1, p)

[0.17265046 0.1778864  0.49767087 0.64843282 0.98414584 0.21942117
 0.53109792 0.68926063 0.9222397  0.90592967 0.08626337 0.45876915
 0.07621689 0.47511573 0.2636066  0.66777898 0.76603666 0.01132669
 0.01504104 0.02569576]


### pyensmallen

In [8]:
def logistic_objective(params, gradient, X, y):
    """Negative Bernoulli log-likelihood for logistic regression.

    Parameters
    ----------
    params : ndarray
        Coefficient vector.
    gradient : ndarray
        Writable array with one entry per coefficient. It is overwritten
        with the gradient of the loss with respect to ``params``.
    X : ndarray
        Design matrix with one column per coefficient.
    y : ndarray
        Binary (0/1) responses, one per row of ``X``.

    Returns
    -------
    float
        ``sum(log(1 + exp(z)) - y * z)`` with ``z = X @ params``.
    """
    z = X @ params
    # log(1 + exp(z)) - y*z is algebraically equal to
    # -[y*log(h) + (1 - y)*log(1 - h)] with h = expit(z), but it stays finite
    # when expit saturates to exactly 0 or 1. There the log form evaluates
    # 0 * -inf and returns nan.
    objective = np.sum(np.logaddexp(0, z) - y * z)
    grad = X.T @ (expit(z) - y)
    gradient[:] = grad
    return objective

In [9]:
%%time
X_logistic2 = np.ascontiguousarray(
    X_logistic
)  # Ensure C-contiguous array for better performance
y_logistic2 = y_logistic.ravel()

# Fit the logistic model with pyensmallen's L-BFGS. The callback returns the
# loss and writes the gradient into `gradient` in place.
optimizer = pyensmallen.L_BFGS()
result_logistic_ens = optimizer.optimize(
    lambda params, gradient: logistic_objective(
        params, gradient, X_logistic2, y_logistic2
    ),
    # Size the random start from the data: the global `k` is rebound by the
    # Poisson section, so relying on it breaks this cell when it is re-run
    # afterwards.
    np.random.rand(X_logistic2.shape[1]),
)
print(result_logistic_ens)

[ 0.17934344  0.17613996  0.47516449  0.65546909  0.99649103  0.18322668
  0.49237341  0.73499685  0.92133107  0.92753401  0.05997722  0.45651545
  0.08532323  0.49863473  0.30227163  0.63633992  0.77514856  0.04266289
 -0.03735041  0.03002617]
CPU times: user 120 ms, sys: 0 ns, total: 120 ms
Wall time: 7.04 ms


### scipy

In [10]:
%%time
def _logistic_nll_scipy(b):
    """Negative Bernoulli log-likelihood of `b` on the logistic data."""
    z = X_logistic @ b
    # Stable form of -[y*log(h) + (1 - y)*log(1 - h)] with h = expit(z):
    # it avoids log(0) when expit saturates and needs only one matrix product.
    return np.sum(np.logaddexp(0, z) - y_logistic * z)


# Same logistic problem through scipy.optimize.minimize (default method), with
# the analytic gradient supplied via `jac`.
result_logistic_scipy = scipy.optimize.minimize(
    fun=_logistic_nll_scipy,
    # Sized from the data: the global `k` is rebound by the Poisson section.
    x0=np.random.rand(X_logistic.shape[1]),
    jac=lambda b: X_logistic.T @ (expit(X_logistic @ b) - y_logistic),
).x
print(result_logistic_scipy)

[ 0.17934344  0.17613996  0.47516449  0.65546909  0.99649103  0.18322668
  0.49237341  0.73499685  0.92133107  0.92753401  0.05997722  0.45651545
  0.08532323  0.49863473  0.30227163  0.63633992  0.77514856  0.04266289
 -0.03735041  0.03002617]
CPU times: user 909 ms, sys: 731 μs, total: 910 ms
Wall time: 78 ms


### cvxpy

In [11]:
%%time
# Same logistic problem stated declaratively in cvxpy and solved with SCS.
# The variable is sized from the data rather than from the global `k`, which
# the Poisson section rebinds.
b_logistic = cp.Variable(X_logistic.shape[1])
# Bernoulli log-likelihood written as sum(y * z - logistic(z)) with
# z = X @ b; per the cvxpy atom reference, cp.logistic(z) is log(1 + exp(z)).
log_likelihood = cp.sum(
    cp.multiply(y_logistic, X_logistic @ b_logistic)
    - cp.logistic(X_logistic @ b_logistic)
)
prob_logistic = cp.Problem(cp.Maximize(log_likelihood))
prob_logistic.solve(solver=cp.SCS)
print(b_logistic.value)

[ 0.17934256  0.17613886  0.47516186  0.65546576  0.99648578  0.18322555
  0.4923709   0.73499296  0.92132606  0.92752899  0.05997698  0.45651312
  0.08532281  0.49863207  0.30227002  0.63633648  0.77514439  0.0426626
 -0.03735022  0.0300261 ]
CPU times: user 1.13 s, sys: 0 ns, total: 1.13 s
Wall time: 1.14 s


## Poisson Regression

In [12]:
# Poisson regression data: Gaussian design matrix, uniform[0, 1) coefficients,
# and counts drawn with mean exp(X @ coefficients).
n = 100_000
k = 10
X_poisson = np.random.randn(n, k)
true_params_poisson = np.random.rand(k)
print(true_params_poisson)
lambda_ = np.exp(X_poisson @ true_params_poisson)
y_poisson = np.random.poisson(lam=lambda_)

[0.86917029 0.96687352 0.57554146 0.2946829  0.65323721 0.74023376
 0.28311498 0.59695143 0.19782333 0.94529361]


### pyensmallen

In [13]:
def poisson_objective(params, gradient, X, y):
    """Negative Poisson log-likelihood (dropping the log(y!) constant).

    Parameters
    ----------
    params : ndarray
        Coefficient vector; reshaped to a column internally.
    gradient : ndarray
        Writable array with one entry per coefficient. It is overwritten
        with the gradient of the loss with respect to ``params``.
    X : ndarray
        Design matrix with one column per coefficient.
    y : ndarray
        Observed counts; reshaped to a column internally.

    Returns
    -------
    float
        ``sum(exp(X @ params) - y * (X @ params))``.
    """
    params = params.reshape(-1, 1)
    y = y.reshape(-1, 1)
    Xbeta = X @ params
    lambda_ = np.exp(Xbeta)
    # Use the linear predictor directly instead of log(exp(Xbeta)). The round
    # trip is redundant and gives 0 * -inf = nan when exp underflows to 0, or
    # inf when it overflows.
    objective = np.sum(lambda_ - np.multiply(y, Xbeta))
    # Gradient of the loss above with respect to params.
    grad = X.T @ (lambda_ - y)
    gradient[:] = grad.ravel()
    return objective

In [14]:
%%time
# Fit the Poisson model with pyensmallen's L-BFGS, starting from the zero
# vector. The callback returns the loss and writes the gradient into
# `gradient` in place.
optimizer = pyensmallen.L_BFGS()
result_poisson_ens = optimizer.optimize(
    lambda params, gradient: poisson_objective(params, gradient, X_poisson, y_poisson),
    # Size the start from the data: the global `k` is also assigned (with a
    # different value) by the linear and logistic sections, so it is wrong
    # here if one of those cells was re-run last.
    np.zeros(X_poisson.shape[1]),
)
print(result_poisson_ens)

[0.86877556 0.96685947 0.57539955 0.29520506 0.65312095 0.73924052
 0.28189688 0.59807216 0.1970171  0.94496917]
CPU times: user 2.45 s, sys: 0 ns, total: 2.45 s
Wall time: 182 ms


### scipy

In [15]:
%%time
# Same Poisson problem through scipy.optimize.minimize (default method), with
# the analytic gradient supplied via `jac`.
result_poisson_scipy = scipy.optimize.minimize(
    fun=lambda b: np.sum(np.exp(X_poisson @ b) - y_poisson * (X_poisson @ b)),
    # Sized from the data: the global `k` is also assigned (with a different
    # value) by the linear and logistic sections.
    x0=np.random.rand(X_poisson.shape[1]),
    jac=lambda b: X_poisson.T @ (np.exp(X_poisson @ b) - y_poisson),
).x
print(result_poisson_scipy)

[0.86877556 0.96685947 0.57539955 0.29520505 0.65312095 0.73924053
 0.28189688 0.59807215 0.1970171  0.94496917]
CPU times: user 5.68 s, sys: 0 ns, total: 5.68 s
Wall time: 409 ms


### cvxpy

In [16]:
%%time
# Same Poisson problem stated declaratively in cvxpy and solved with SCS.
# Sizes are read from the data rather than from the globals `n` and `k`,
# which other sections also assign.
try:
    b_poisson = cp.Variable(X_poisson.shape[1])
    z = X_poisson @ b_poisson
    # Dividing by the number of rows only rescales the objective; the
    # minimizer is unchanged.
    cost_poisson = (
        cp.sum(cp.exp(z) - cp.multiply(y_poisson, z)) / X_poisson.shape[0]
    )
    prob_poisson = cp.Problem(cp.Minimize(cost_poisson))
    prob_poisson.solve(solver=cp.SCS)
    print(b_poisson.value)
except (OverflowError, MemoryError) as err:
    # cvxpy is known to fail on this problem size. The failure is the
    # benchmark result, so report it instead of aborting a Run All.
    print(f"{type(err).__name__}: {err}")

OverflowError: Python integer 30003300000 out of bounds for int32

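cvxpy cannot solve the Poisson problem at this size: it either fails with the `OverflowError` shown above or runs out of memory, so there is no cvxpy result to compare here.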