###    Variationally Sparse GP

In this notebook, we will sample from the variationally optimal posterior distribution over u(inducing outputs), for simple Gaussian likelihoods, this distribution can be calculated analytically. While for non Gaussian likelihoods, we need to write the resulting density in terms of expectation(the expectation of loglikelihood logp(y|f) wrt p(f|u)). 

In [1]:
%matplotlib inline
import  matplotlib
import matplotlib.pyplot as plt
import pystan

In [2]:
import numpy as np
import scipy.io as sio

In [3]:
df = sio.loadmat('snelson1d.mat')
X = df['X']
Y = df['Y']
X_t = df['tX']
Y_t = df['tY']

In [4]:
def kmm_init(X, m = 20):
    """
    This is the same initialization algorithm that is used
    in Kmeans++. It's quite simple and very useful to initialize
    the locations of the inducing points in sparse GPs.
    
    http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf
    1. Take one center c1, initially chosen at random
    2. Take a new center ci with prob. \frac{D(x)^2}{\sum D(x)^2}
    3. Repeat step 2 until we have K centers.
    4. Proceed with standard K-means clustering..
    
    where D(x) is the distance to the closest cluster center. 
    If we dont want to invest time on that, just use the initialisation which is good enough atleast for the 1D case.
    
    :param X: data
    :param m: number of inducing points
    """

    # compute the distances
    XXT = np.dot(X, X.T)
    D = (-2.*XXT + np.diag(XXT)[:,np.newaxis] + np.diag(XXT)[np.newaxis,:])

    # select the first point
    s = np.random.permutation(X.shape[0])[0]
    inducing = [s]
    prob = D[s]/D[s].sum()

    for z in range(m-1):
        s = np.random.multinomial(1, prob.flatten()).argmax()
        inducing.append(s)
        prob = D[s]/D[s].sum()

    inducing = np.array(inducing)
    return X[inducing]

In [5]:
X_u = kmm_init(X)
M = len(X_u)

In [6]:
X_u_noisy = np.linspace(0,6, M)[:, None]
X_u_noisy = X_u + 0.1*np.random.rand(*X_u.shape) 

### Code

In [43]:
stan_code_var ="""
data {
    int<lower=1> N;
    int<lower=1> M;
    real x[N];
    vector[N] y;
    real xu[M];

}
transformed data {
    real delta = 1e-6;
#     real rho = 1;
#     real alpha = 1;

}
parameters {
    vector[N] f;
    
    real<lower=1e-6> sigma;
    real <lower=1e-4> alpha;
    real<lower=1e-4> rho;
    
    vector[M] eta2;
    vector[M] q_u_hat;
}

model {

    vector[N] f_mean;
    matrix[N, N] f_cov;
    matrix[N, N] A;
    matrix[N, N] diff;
    vector[M] mu_hat;
    matrix[M, M] S_hat;
    vector[M] u_hat;
    vector[M] Kuf_y;
    vector[M] Kuu_uf_y;
        
    {
        matrix[M, M] L_U;
        matrix[M, M] Kuu = cov_exp_quad(xu, alpha, rho);
        matrix[N, M] Kfu = cov_exp_quad(x, xu, alpha, rho);
        matrix[N, N] K = cov_exp_quad(x, alpha, rho);
        vector[M] Kuu_inv_u;
        matrix[M, N] v_pred;
        vector[M] u;
        matrix[M,M] sum_mat;
        matrix[M, M] v_hat;
        matrix[M,M] L_S;
        matrix[M,M] L;
        
        for (m in 1:M)
            Kuu[m, m] = Kuu[m, m] + delta;
                    
        L_U = cholesky_decompose(Kuu);
             
        
     
        // q_hat : q_opt(u) : optimal distrubiton for u.
        
        // mu_hat = (1/sigma**2) * Kuu*S*Kuf *y
        // S_hat = Kff * S * Kuu
        // S = ( Kmm + (1/sigma**2)*Kuf*Kfu )^{-1} 
        
        sum_mat = Kuu + (1/square(sigma))*Kfu'*Kfu;
        L_S = cholesky_decompose(sum_mat);
        v_hat = mdivide_left_tri_low(L_S, Kuu);
        v_hat = mdivide_right_tri_low(v_hat',L_S)';

//        S_hat = v_hat'*v_hat;
        S_hat = Kuu*v_hat;
        
        Kuf_y = Kfu'*y;
        Kuu_uf_y = mdivide_left_tri_low(L_S, Kuf_y);
        Kuu_uf_y = mdivide_right_tri_low(Kuu_uf_y',L_S)';

        mu_hat = (1/square(sigma)) * Kuu *Kuu_uf_y; 

        Kuu_inv_u = mdivide_left_tri_low(L_U, u_hat);
        Kuu_inv_u = mdivide_right_tri_low(Kuu_inv_u', L_U)';
        
        f_mean = Kfu*Kuu_inv_u;

        diff = K - v_pred'*v_pred;
        for (i in 1:N){
            for(j in 1:N){
                A[i,j] = 0.0;
            }
            A[i,i] = diff[i,i];
        }
        
        
    }
  
#     rho ~ inv_gamma(5, 5);
#     alpha ~ normal(0, 1);
    
    rho ~ normal(0.90, 0.25);
    alpha ~ normal(0.85, 0.25);

    sigma ~ normal(0, 0.35);
    eta2 ~ normal(0, 1);
    u_hat ~ multi_normal(mu_hat, S_hat);
    f ~ multi_normal(f_mean, A);
    y ~ normal(f, sigma);
}
"""


In [44]:
stan_model_var = pystan.StanModel(model_code=stan_code_var)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_92a1d67cb776a6b064de4cc4f9d5a9ca NOW.


In [45]:
gp_reg_data = {'N':200, 'x':X.flatten(), 'y':Y.flatten(), 'M':20, 'xu':X_u_noisy.flatten()}

In [46]:
fit_var_gp = stan_model_var.sampling(data=gp_reg_data, iter=800, chains=2)

  elif np.issubdtype(np.asarray(v).dtype, float):


RuntimeError: Initialization failed.

In [None]:
fit_dtc = stan_model_dtc.sampling(data=gp_full_data2, iter=600, chains=2)

In [None]:
# print(fit_fitc)
print(fit_dtc)

In [None]:
print(fit_fitc)

In [None]:
samples = fit_dtc.extract(permuted=True)
f = samples['f']
f_mean = np.mean(f, axis=0)
sigma = samples['sigma']
sigma_mean = np.mean(sigma, axis=0)
                

In [None]:
sort_idx = np.argsort(X.ravel())

plt.plot(X[sort_idx], Y[sort_idx], 'r.')
plt.plot(X[sort_idx], f[-1, sort_idx], 'y.');
# plt.plot(X[sort_idx], f[-100, sort_idx], 'g.');
plt.plot(X[sort_idx], f_mean[sort_idx], 'b.');
# plt.plot(X[sort_idx], sigma[-1, sort_idx], 'b.');

In [None]:
plt.plot(X,Y, 'ro')

In [None]:
eta = samples['eta2']
print(eta.shape)
plt.plot(eta[:, 0])
plt.plot(eta[:, 1])
plt.plot(eta[:, 2])

In [None]:

samples_fitc = fit_fitc.extract(permuted=True)
f = samples_fitc['f']
f_mean = np.mean(f, axis=0)
sigma = samples_fitc['sigma']
sigma_mean = np.mean(sigma, axis=0)

In [None]:
sort_idx = np.argsort(X.ravel())

plt.plot(X[sort_idx], Y[sort_idx], 'r.')
plt.plot(X[sort_idx], f[-1, sort_idx], 'y.');
# plt.plot(X[sort_idx], f[-20, sort_idx], 'b.');
# plt.plot(X[sort_idx], f[-40, sort_idx], 'g.');
# plt.plot(X[sort_idx], f[-100, sort_idx], 'g.');
# plt.plot(X[sort_idx], f_mean[sort_idx], 'b.');
# plt.plot(X[sort_idx], sigma[-1, sort_idx], 'b.');

In [None]:
print(f.shape)


In [None]:
plt.hist(sigma);

In [None]:
eta_fitc = samples_fitc['eta2']
print(eta_fitc.shape)
plt.plot(eta_fitc[:, 0])
plt.plot(eta_fitc[:, 1])
plt.plot(eta_fitc[:, 2])