# Newton method for logistic regression

Author: Alexandre Gramfort

We consider the L2-regularized logistic regression model with a hyperparameter $\lambda$:

$$
\hat{w} = \arg\min_{w} \sum_{i=1}^n \log \{1 + \exp(-y_i (x_i^\top w)) \} + \frac{\lambda}{2} \|w\|^2
$$

In [None]:
%matplotlib inline
import math
import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg

In [None]:
from sklearn import datasets

iris = datasets.load_iris()

# Restrict the design matrix to its first two columns; keep all targets for now.
X, y = iris.data[:, :2], iris.target

# Keep only the samples whose target is below 2. Both right-hand sides are
# evaluated with the unfiltered ``y`` before either name is rebound.
X, y = X[y < 2], y[y < 2]

# Relabel target 0 as -1 (presumably leaving labels in {-1, +1}; this relies
# on the remaining targets being 0 or 1).
y[y == 0] = -1

In [None]:
# One scatter call per class: positive labels in red, negative labels in blue.
# X has two columns here, so transposing yields the (x, y) coordinate rows.
plt.scatter(*X[y > 0].T, color='r')
plt.scatter(*X[y < 0].T, color='b');

## Now implement Newton method for logistic regression

In [None]:
def newton_logistic(X, y, lambd, n_iter=10):
    """Fit L2-regularized logistic regression (no intercept) by Newton's method.

    Minimizes ``sum_i log(1 + exp(-y_i * x_i.dot(w))) + lambd / 2 * ||w||^2``
    starting from ``w = 0`` and taking ``n_iter`` full (undamped) Newton steps.
    The final objective value is printed before returning.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Design matrix; converted to a float array.
    y : array-like, shape (n_samples,)
        Labels; converted to a float array. The Hessian used here is exact
        when the labels are in {-1, +1}.
    lambd : float
        L2 regularization strength.
    n_iter : int, default=10
        Number of Newton iterations to run. Must be at least 1.

    Returns
    -------
    w : ndarray, shape (n_features,)
        Weight vector after the last iteration.
    pobj : list of float
        Objective value after each iteration.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1, got %r" % (n_iter,))

    # ``np.float`` was removed from NumPy; the builtin ``float`` is the
    # equivalent dtype.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Unpacking also enforces that X is two-dimensional.
    _, n_features = X.shape

    w = np.zeros(n_features)
    pobj = []

    for _ in range(n_iter):
        margins = y * np.dot(X, w)
        # 1 / (1 + exp(margin)), written as exp(-log(1 + exp(margin))) so a
        # large positive margin cannot overflow np.exp.
        temp = np.exp(-np.logaddexp(0., margins))
        grad = -np.dot(X.T, y * temp) + lambd * w
        hess = np.dot(X.T, (temp * (1. - temp))[:, None] * X)
        # Add lambd on the diagonal (stride n_features + 1 in the flat view).
        hess.flat[::n_features + 1] += lambd

        w -= linalg.solve(hess, grad)

        # log(1 + exp(-m)) via logaddexp stays finite for very negative m.
        this_pobj = np.sum(np.logaddexp(0., -y * np.dot(X, w)))
        this_pobj += lambd * np.dot(w, w) / 2.
        pobj.append(this_pobj)

    print("Global minimum : %s" % pobj[-1])

    return w, pobj

In [None]:
# Regularization strength. Kept at module level because f and f_grad, defined
# further down, read ``lambd`` as a global.
lambd = 0.1

# Fit with Newton's method; pobj holds the objective value after each iteration.
w, pobj = newton_logistic(X, y, lambd)

### Did the cost function go down?

In [None]:
# Gap between each iterate's objective and the last one, on a log10 scale.
# The explicit array conversion avoids relying on list - numpy-scalar coercion;
# machine epsilon keeps the final (zero) gap out of log10(0).
plt.plot(np.log10(np.asarray(pobj) - pobj[-1] + np.finfo('float').eps), 'b')
plt.xlabel('Iterations')
# The curve is the log10 of the gap, so the label says so.
plt.ylabel(r'$\log_{10}(f(x^k) - f(x^*))$')
plt.show()

### Does it do the job?

In [None]:
# Scatter both classes (positive in red, negative in blue); X has two columns,
# so its transpose unpacks into the x and y coordinates.
plt.scatter(*X[y > 0].T, color='r')
plt.scatter(*X[y < 0].T, color='b')

# Overlay the line w[0] * x1 + w[1] * x2 = 0 for x1 between 4 and 8.
xx = np.linspace(4, 8, 10)
plt.plot(xx, -xx * w[0] / w[1], 'k');

# Logistic Regression with L-BFGS

In [None]:
def f(w):
    """Return the L2-regularized logistic loss at ``w``.

    Reads the module-level ``X``, ``y`` and ``lambd``. The value is
    ``sum_i log(1 + exp(-y_i * x_i.dot(w))) + lambd / 2 * ||w||^2``.
    """
    margins = y * np.dot(X, w)
    # log(1 + exp(-m)) via logaddexp: the naive form overflows to inf for very
    # negative margins, which a line search can probe.
    pobj = np.sum(np.logaddexp(0., -margins))
    pobj += lambd * np.dot(w, w) / 2.
    return pobj

def f_grad(w):
    """Return the gradient of the L2-regularized logistic loss at ``w``.

    Reads the module-level ``X``, ``y`` and ``lambd``. The gradient is
    ``-X.T.dot(y / (1 + exp(y * X.dot(w)))) + lambd * w``.
    """
    ywTx = y * np.dot(X, w)
    # 1 / (1 + exp(m)) written as exp(-log(1 + exp(m))) so that a large
    # positive margin cannot overflow np.exp.
    temp = np.exp(-np.logaddexp(0., ywTx))
    grad = -np.dot(X.T, (y * temp)) + lambd * w
    return grad

from scipy.optimize import fmin_l_bfgs_b

# Minimize f from the zero vector using the analytic gradient f_grad; only the
# first returned item (the minimizer) is kept.
w, _, _ = fmin_l_bfgs_b(
    f,
    x0=np.zeros(X.shape[1]),
    fprime=f_grad,
)

In [None]:
# Positive class in red, negative class in blue; X has two columns, so the
# transposed selection unpacks into the x and y coordinates.
plt.scatter(*X[y > 0].T, color='r')
plt.scatter(*X[y < 0].T, color='b')

# Draw the line w[0] * x1 + w[1] * x2 = 0 over x1 in [4, 8].
xx = np.linspace(4, 8, 10)
plt.plot(xx, -xx * w[0] / w[1], 'k');

<div class="alert alert-success">
    <b>QUESTION 1:</b>
     <ul>
       <li>Modify f and f_grad to add support for a bias term: b. The problem reads:
$$
\hat{w} = \arg\min_{w, b} \sum_{i=1}^n \log \{1 + \exp(-y_i (x_i^\top w + b)) \} + \frac{\lambda}{2} \|w\|^2
$$
You will check your gradient with scipy.optimize.check_grad.
         </li>
       <li>Implement a solver for logistic regression with bias and L2 regularization using L-BFGS-B.</li>
       <li>Implement a solver for logistic regression with bias and L2 regularization using Newton's method.</li>
    </ul>
</div>