# HMC in ND with F-BFGS autotuning of the mass matrix

This notebook implements an autotuning HMC for an N-dimensional distribution based on factorised BFGS (F-BFGS) updating of the (inverse) Hessian.

## 0. Import packages

In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
from importlib import reload

import testfunctions
import samplestatistics

# Global figure style: font family and size, and padding of the major tick labels.
plt.rcParams.update({
    "font.family": "Times",
    "font.size": 50,
    "xtick.major.pad": "12",
    "ytick.major.pad": "12",
})

## 1. Input

We first define several input parameters, including the model space dimension, the total number of samples, the number of leapfrog timesteps, and the length of the timestep.

In [None]:
import input_parameters

# Reload so that edits to input_parameters.py take effect without restarting the kernel.
reload(input_parameters)

# Unpack all run settings returned by input_parameters.input_parameters().
(
    test_function, dim, N, Nit, dt, m0, Minv,
    autotune, ell, update_interval, preco, S0_min,
    plot_interval, dimension1, dimension2,
    m1_min, m1_max, m2_min, m2_max,
) = input_parameters.input_parameters()

## 4. Class for F-BFGS autotuning

This class takes care of the factorised BFGS updating. The class must be initialised with the first model and the first gradient. The *update* function then takes the next model and gradient, and computes updates of the approximate matrix factor $\mathbf{S}$, of the approximate Hessian $\mathbf{H}$, and of the approximate inverse Hessian $\mathbf{H}^{-1}$.

**Word of caution**: There is an experimental component in this class. In principle, BFGS updates can only be made when $\mathbf{s}_k^T\mathbf{y}_k>0$. However, when this quantity is very small, the resulting Hessian approximation may still be close to singular. Empirically, it is better for stability to require $\mathbf{s}_k^T\mathbf{y}_k>\gamma$, with some tuning parameter $\gamma>0$. For many examples, $\gamma=2$ works very well.

In [None]:
class fbfgs:
    """
    Factorised BFGS (F-BFGS) estimate of a Hessian, its inverse, and a matrix factor.

    The object stores the factor S, the Hessian estimate H, and the inverse
    Hessian estimate Hinv. They are initialised from a diagonal inverse mass
    matrix such that H = S S^T and H Hinv = I, and every accepted update
    modifies all three so that these relations continue to hold.
    """
    
    def __init__(self,dim,Minv,m,g,gamma=2.0):
        """
        Initialise the BFGS iteration.
        
        :param dim: number of model-space dimensions
        :param Minv: initial inverse mass matrix (must be diagonal)
        :param m: current model vector
        :param g: current gradient
        :param gamma: curvature threshold; an update is only applied when
            s^T y > gamma, where s and y are the model and gradient differences
            (default 2.0, must be non-negative)
        :raises ValueError: if gamma is negative
        """
        
        # A negative threshold would admit s^T y <= 0, for which the square
        # roots in the update are not real.
        if gamma<0.0:
            raise ValueError('gamma must be non-negative (got %r)' % (gamma,))
        
        self.dim=dim   # Model space dimension.
        self.gamma=gamma   # Curvature threshold for accepting an update.
        
        # Initial (current) estimate of the inverse Hessian H^{-1}. Copied so that
        # later in-place changes of the caller's matrix cannot affect this object.
        self.Hinv=np.array(Minv,dtype=float)
        
        # Initial (current) matrix factor S and estimate of the Hessian H. Only the
        # diagonal of Minv is used: H=diag(1/Minv_ii), S=diag(1/sqrt(Minv_ii)).
        diagonal=np.diag(self.Hinv)
        self.H=np.diag(1.0/diagonal)
        self.S=np.diag(1.0/np.sqrt(diagonal))
        
        # Accumulated logarithm of the determinant of H.
        # NOTE(review): starting from 0.0 equals log det H only when Minv is the
        # identity; for a general diagonal Minv this tracks the change of log det H
        # relative to the initial H. Kept as is to preserve existing results.
        self.logdet=0.0
        
        self.m=m   # Initial (current) reference model.
        self.g=g   # Initial (current) reference gradient.
        self.I=np.identity(dim)   # Identity matrix. 
        
    def update(self,m,g):
        """
        Update BFGS matrix and its factorised form.
        
        With s = m - (stored model) and y = g - (stored gradient), the update is
        applied only when s^T y > gamma. Otherwise a message is printed and
        nothing is changed, including the stored reference model and gradient.
        
        :param m: current model vector
        :param g: current gradient
        """
        
        # Compute differences and the curvature s^T y (= 1/rho).
        s=m-self.m
        y=g-self.g
        check=np.dot(s,y)
        
        # Do nothing unless the curvature exceeds the threshold gamma.
        if check>self.gamma:
            
            # Set new to current vectors.
            self.m=m
            self.g=g
            
            # Precompute inverse Hessian-vector product.
            Hinv_y=np.dot(self.Hinv,y)
            
            # Auxiliary scalars.
            rho=1.0/check
            gamma2=rho**2 * np.dot(y,Hinv_y) + rho
            beta=gamma2 * np.dot(s,np.dot(self.H,s))
            theta=-np.sqrt(rho/(beta*gamma2))
        
            # Auxiliary vectors a, b, and u, v.
            a=np.sqrt(gamma2)*s
            b=(rho/np.sqrt(gamma2))*Hinv_y
            u=a
            v=-np.dot(self.H,b+theta*a)
            
            # Update inverse Hessian estimate Hinv.
            # Expanded form of (I + u v^T) Hinv (I + v u^T).
            alpha=1.0/(1.0+np.dot(u,v))
            Hinv_v=np.dot(self.Hinv,v)
            self.Hinv=self.Hinv+np.outer(Hinv_v,u)+np.outer(u,Hinv_v)+np.outer(u,u)*np.dot(v,Hinv_v)
        
            # Update Hessian estimate H.
            # Expanded form of (I - alpha v u^T) H (I - alpha u v^T).
            H_u=np.dot(self.H,u)
            self.H=self.H-alpha*np.outer(H_u,v)-alpha*np.outer(v,H_u)+(alpha**2)*np.outer(v,v)*np.dot(u,H_u)
        
            # Update factor S: S <- (I - alpha v u^T) S.
            ST_u=np.dot(self.S.transpose(),u)
            self.S=self.S-alpha*np.outer(v,ST_u)
            
            # Update log-determinant of H = S S^T. The rank-one factor
            # (I - alpha v u^T) has determinant alpha, so det H scales by alpha^2.
            self.logdet+=np.log(alpha**2)
            
        else: 
            print('F-BFGS check failed (1/rho=%f)' % check)

## 5. Leapfrog integrator

For clarity, we define the leap-frog integrator as a separate function.

In [None]:
def leapfrog(m,p,Nt,dt,Minv,fct,plot=False):
    """
    Integrate a trajectory with the leapfrog scheme.
    
    Each step performs a half momentum step, a full model step, a gradient
    evaluation at the new model, and a second half momentum step.
    
    :param m: initial model vector
    :param p: initial momentum vector
    :param Nt: nominal number of time steps; the number actually taken is
        int(Nt*(1-0.5*r)) with r drawn uniformly from [0,1)
    :param dt: length of the time step
    :param Minv: inverse mass matrix (its dot method is applied to p)
    :param fct: object providing J(m), the gradient of the potential energy,
        and, when plotting, plotU(...)
    :param plot: if True, draw the potential energy, the starting model and the
        trajectory; this reads the module-level names dim, dimension1,
        dimension2, m1_min, m1_max, m2_min and m2_max
    :return: tuple (m, p) of model and momentum at the end of the trajectory
    """
    
    # Plot probability density in the background.
    if plot:
        fct.plotU(dim,dimension1,dimension2,m1_min,m1_max,m2_min,m2_max)
        plt.plot(m[dimension1],m[dimension2],'bo',markersize=15)
    
    # Evaluate initial gradient.
    J=fct.J(m)
    
    # Determine randomised integration length. The builtin int is used because
    # the np.int alias no longer exists in current NumPy releases.
    Nti=int(Nt*(1.0-0.5*np.random.rand()))
    
    # Leapfrog integration.
    for k in range(Nti):
        
        # Remember the previous model so the trajectory segment can be drawn.
        if plot: m_old=m.copy()
        
        p=p-0.5*dt*J
        m=m+dt*Minv.dot(p)
        J=fct.J(m)
        p=p-0.5*dt*J
        
        # Plot trajectory segment.
        if plot: 
            if k==0: print('number of time steps: %d' % Nti)
            plt.plot([m_old[dimension1],m[dimension1]],[m_old[dimension2],m[dimension2]],'r',linewidth=3)
            plt.plot(m[dimension1],m[dimension2],'kx',markersize=15)
        
    return m, p

## 6. HMC initialisations

Before running the actual HMC sampler, we perform several initialisations. This includes the test function class, the first random model $\mathbf{m}$, and the corresponding gradient of the potential energy $\mathbf{g}=\nabla U$. With this, we can initialise the F-BFGS class, which takes $\mathbf{m}$ and $\mathbf{g}$ as input.

In [None]:
# Initialisation. =============================================================

# Initial model, taken from the input parameters.
m = m0

# Test function object; provides the potential energy gradient J(m).
fct = testfunctions.f(dim, test_function)

# F-BFGS object, initialised with the initial model and its gradient.
g = fct.J(m)
M = fbfgs(dim, Minv, m, g)

# On-the-fly posterior statistics, fed with the initial model as sample 0.
s = samplestatistics.stats(dimension1, dimension2, N)
s.get(m, 0.0, 0)

# Two diagonal elements of the inverse mass matrix, monitored over all samples.
m11 = np.ones(N) * Minv[dimension1, dimension1]
m22 = np.ones(N) * Minv[dimension2, dimension2]

# Number of accepted models.
accept = 0

## 7. Run HMC

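We finally run the HMC sampler. In each iteration, we first draw random momenta $\mathbf{p}$ from a normal distribution whose covariance is the F-BFGS-updated mass matrix $\mathbf{M}$, which is defined to be the approximate Hessian $\mathbf{H}$ of the potential energy $U$. In practice, the momenta are computed as $\mathbf{p}=\mathbf{S}\mathbf{z}$, where $\mathbf{z}$ is a vector of standard normal random numbers and $\mathbf{S}\mathbf{S}^T=\mathbf{H}$. The inverse mass matrix $\mathbf{M}^{-1}=\mathbf{H}^{-1}$ enters the kinetic energy and the leapfrog integration.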

Using the mass matrix, we compute energies and run a leapfrog iteration to solve Hamilton's equations. Following this, we compute the energies of the proposed model and evaluate the modified Metropolis rule (in logarithmic form, to avoid over- or underflow).

In [None]:
accept=0
start=time.time()

# The initial model is sample 0, so N-1 further samples are produced.
for it in range(N-1):

    # Randomly choose momentum: p = S z with standard normal z, so that the
    # covariance of p is S S^T.
    p=np.random.randn(dim)
    p=M.S.dot(p)
    
    # Evaluate energies.
    U=fct.U(m)
    K=0.5*np.dot(p,np.dot(M.Hinv,p))
    H=U+K
    
    # Check if models and trajectories should be plotted.
    if (not it % plot_interval) and it>0: 
        plot=True
        print('iteration: %d' % it)
    else:
        plot=False
    
    # Run leapfrog iteration.
    m_new,p_new=leapfrog(m,p,Nit,dt,M.Hinv,fct,plot)
    
    if plot:
        # Show the trajectory figure drawn by leapfrog. Only done when something
        # was actually plotted.
        plt.show()
        
        # Plot proposed models.
        plt.subplots(1, figsize=(30,10))
        plt.plot(m_new)
        plt.xlabel('model parameter index')
        plt.show()
    
    # Evaluate new energies.
    U_new=fct.U(m_new)
    K_new=0.5*np.dot(p_new,M.Hinv.dot(p_new))
    H_new=U_new+K_new
    
    # Evaluate Metropolis rule in logarithmic form. A scalar uniform random
    # number is drawn so that the comparison is between scalars.
    alpha=np.minimum(0.0,H-H_new)
    if alpha>=np.log(np.random.rand()):
        # Update model.
        m=m_new
        accept+=1
        # Update BFGS matrix.
        if autotune:
            g=fct.J(m)
            M.update(m,g)
    
    # Accumulate on-the-fly statistics
    s.get(m,M.logdet,it+1)
    m11[it+1]=M.Hinv[dimension1,dimension1]
    m22[it+1]=M.Hinv[dimension2,dimension2]
    
stop=time.time()
# NOTE(review): the rate is normalised by N although only N-1 proposals are made.
# The builtin float is used because the np.float alias no longer exists in
# current NumPy releases.
print('acceptance rate: %f (%d of %d samples)' % (float(accept)/float(N),accept,N))
print('elapsed time: %f s' % (stop-start))

## 8. Analyse results

### 8.1. Sample statistics collected on the fly

In [None]:
# Show the statistics object that was fed via s.get(...) during sampling
# (presumably prints and/or plots its summaries; see samplestatistics).
s.display()

### 8.2. Analysis of the mass matrix

In [None]:
# Matrix images: (matrix, title, colour map, symmetric colour limits).
matrix_views=[
    (Minv,'initial inverse mass matrix','Blues',False),
    (M.Hinv,'final inverse mass matrix','Blues',False),
    (M.S,'final S','RdBu',True),
    (np.dot(M.S,M.S.transpose()),'final SS^T','Blues',False),
]

for matrix,title,cmap,symmetric in matrix_views:
    plt.subplots(1, figsize=(20,20))
    plt.pcolor(matrix,cmap=cmap)
    plt.title(title,pad=20)
    if symmetric:
        # Centre the diverging colour map on zero.
        c=np.max(np.abs(matrix))
        plt.clim([-c,c])
    plt.colorbar()
    plt.show()

# Diagonals of the final (black) and initial (red) inverse mass matrix.
plt.subplots(1, figsize=(20,10))
for values,colour in ((np.diag(M.Hinv),'k'),(np.diag(Minv),'r')):
    plt.plot(values,colour,linewidth=4)
plt.xlabel('index')
plt.title('diagonal of inverse mass matrix (final=black, initial=red)')
plt.grid()
plt.show()

# History of the two monitored diagonal elements over the iterations.
plt.subplots(1, figsize=(20,10))
for values,colour in ((m11,'k'),(m22,'r')):
    plt.plot(values,colour,linewidth=4)
plt.xlabel('iteration')
plt.title('diagonal elements (black=parameter1, red=parameter2)')
plt.grid()
plt.show()