In [23]:
import numpy as np
import scipy.linalg as la
from Logistic_funcs import grad_lb, grad2_lb, negloglike

# Exercise 3

## Line Search

### Problem A

We have, in our basic steepest descent method (with $l$ the objective being minimized), $$\beta_{k+1} = \beta_k - \alpha_k \nabla l(\beta_k)$$

Thus $p_k = -\nabla l(\beta_k)$ in this instance.

We use the backtracking method in place of an explicit curvature condition: starting from the largest candidate step and shrinking it, we accept the largest alpha that satisfies the sufficient decrease condition.
This prevents us from choosing an alpha that is "too small" -- one that necessarily satisfies the sufficient decrease condition but does not give the progress we want in our algorithm.

### Problem B

In [20]:
def linesearch(x, p, f, gradf, c=.5, iters=1000):
    '''
     This function uses the sufficient decrease (Armijo) condition, the first of the
     Wolfe conditions, to pick the stepsize for the next step in a descent method.
     Candidate stepsizes are tried from largest to smallest, so the first one that
     passes is the largest candidate satisfying

         f(x + a * p) <= f(x) + c * a * gradf(x).dot(p)

     Inputs:
     x: A P vector. The current guess vector for the optimizer of the algorithm
     p: A P vector. The descent direction.
     f: A function that takes in x as its only non-keyword argument. 
     gradf: A function that takes in x as its only non-keyword argument. This computes
         the gradient of f at x.
     c: A float between 0 and 1, this constant value is used in the sufficient decrease
         condition
     iters: the number of evenly spaced points between 0 and 1 (inclusive) to
         generate as candidates for alpha (the stepsize). The point alpha = 0 is
         never accepted, so at most iters - 1 candidates are actually tested.
     
     Returns:
     suff: A boolean parameter that indicates whether the sufficiency condition was
         satisfied by some strictly positive stepsize
     a: The desired stepsize, or 0.0 (take no step) when suff is False
    '''
    # These do not depend on the stepsize, so evaluate them once up front.
    fx = f(x)
    slope = gradf(x).dot(p)  # directional derivative of f at x along p

    # Candidate stepsizes, largest first.
    alphas = np.linspace(0, 1, iters)[::-1]
    for a in alphas:
        # a == 0 satisfies the inequality trivially (f(x) <= f(x)) but makes no
        # progress; accepting it would hide a failed search behind suff=True.
        if a <= 0:
            break
        if f(x + a * p) <= fx + c * a * slope:
            return True, a
    print("Failed to converge")
    return False, 0.0

In [24]:
# Synthetic test problem. A fixed seed and a private generator make the data
# identical on every Restart & Run All without touching numpy's global state.
SEED = 0
rng = np.random.RandomState(SEED)
n_obs, n_features = 500, 11

X = rng.rand(n_obs, n_features)            # features, uniform on [0, 1)
y = (rng.rand(n_obs) > .5).astype(float)   # random 0/1 outcomes stored as floats
beta = rng.rand(n_features)                # initial guess for the coefficients
m = np.ones(n_obs)                         # one trial per observation

In [25]:
def gradient_descent(X, y, beta, m, stepsize=1e-4, tol=1e-10, iters=1000):
    '''
    This function performs a simple fixed-stepsize gradient descent method to find
    the beta that minimizes the negative log likelihood function.
    
    Inputs:
    X: an m by n array of features
    y: an m vector of successes
    beta: an initial guess for the minimizer (for which we are solving)
    m: an m vector of the number of trials
    stepsize: a float, optional to specify the size of each gradient step
    tol: a float, optional to specify the error size used to indicate convergence
    iters: an integer, optional to specify the max number of iterations for the
         algorithm
         
    Returns:
    beta0: the minimizer of the negative log-likelihood (the last iterate if the
         iteration limit is reached before convergence)
    err: a list of the absolute change in negative log-likelihood produced by
         each step
    betas: a list of the beta each step started from
    ll: a list of the negative log likelihood at each entry of betas

    The same four values are returned whether or not the method converged.
    '''
    beta0 = beta
    err = []    # Stores |change in negative log likelihood| for each step
    betas = []  # Stores the value of beta at the start of each step
    ll = []     # Stores the negative log likelihood at the start of each step

    # Carry the current objective value forward so it is evaluated once per iterate.
    nll0 = negloglike(X, y, beta0, m)
    for i in range(iters):
        betas.append(beta0)
        ll.append(nll0)

        # NOTE(review): assumes grad_lb returns the gradient of negloglike, so that
        # subtracting it is a descent step -- confirm against Logistic_funcs.
        beta1 = beta0 - stepsize * grad_lb(X, y, beta0, m)
        nll1 = negloglike(X, y, beta1, m)

        change = np.abs(nll1 - nll0)
        err.append(change)

        if change > tol:
            beta0, nll0 = beta1, nll1
        else:
            print("%d iterations" % i)  # Print the number of iterations.
            return beta1, err, betas, ll
    return beta0, err, betas, ll

526.45977804568952

## Quasi Newton

### Problem A

### Problem B