In [1]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d

from scipy import optimize
from utils import color_cycle
from perceptron_code import train_perceptron, generate_teacher_and_test_set

In [2]:
# define some helper functions
def plot_points(X, t, ax, lim = 9, alpha = 1):
    """Draw the labelled 2-D inputs together with dotted coordinate axes.

    Parameters
    ----------
    X : array
        Input points, indexed as ``X[mask, 0]`` (horizontal coordinate) and
        ``X[mask, 1]`` (vertical coordinate).
    t : array
        Labels; points with ``t == 1`` are drawn as crosses, points with
        ``t == 0`` as squares.
    ax : matplotlib Axes (or the ``matplotlib.pyplot`` module)
        Target to draw on. Anything exposing ``plot``, ``hlines`` and
        ``vlines`` works; the notebook also passes ``plt`` itself.
    lim : float
        Half-length of the dotted horizontal/vertical lines through the origin.
    alpha : float
        Opacity of the markers.
    """
    ax.plot(X[t==1,0], X[t==1,1], 'x', color="black", alpha=alpha)
    ax.plot(X[t==0,0], X[t==0,1], 's', color="black", alpha=alpha)
    ax.hlines(y=0, xmin=-lim, xmax=lim, ls=':', color='gray')
    ax.vlines(x=0, ymin=-lim, ymax=lim, ls=':', color='gray')
    # Apply the aspect ratio to the axes that were actually drawn on.
    # Falling back to the current axes keeps `plot_points(X, t, plt)` working,
    # because the pyplot module has no `set_aspect` of its own.
    target = ax if hasattr(ax, "set_aspect") else plt.gca()
    target.set_aspect('equal')


def plot_vec(w, fact=1, wlim = 10, label=None, color=None, alpha=0.5):
    """Draw a 2-D vector and the line through the origin perpendicular to it.

    `w[0]` and `w[1]` are scaled by `fact` before plotting. The vector itself
    is drawn dashed (and carries `label`); the perpendicular line is drawn
    solid. `color` and `alpha` are forwarded to both `plt.plot` calls.
    `wlim` is accepted for interface compatibility but not used here.
    """
    tip_x, tip_y = w[0] * fact, w[1] * fact
    # dashed segment from the origin to the scaled tip of w
    plt.plot([0, tip_x], [0, tip_y], ls='--', alpha=alpha, color=color, label=label)
    # solid segment from (-tip_y, tip_x) to (tip_y, -tip_x), i.e. orthogonal to w
    plt.plot([-tip_y, tip_y], [tip_x, -tip_x], alpha=alpha, color=color)


def get_wgrid(wlim = 10, num = 100):
    """Build a square grid of 2-D weight vectors covering [-wlim, wlim]^2.

    Returns
    -------
    w1grid, w2grid : arrays of shape (num, num)
        Mesh-grid coordinates of the first and second weight component.
    wgrid_stacked : array of shape (num, num, 2)
        The two coordinate grids stacked along a trailing axis, so that
        `wgrid_stacked[i, j]` is the weight vector at grid node (i, j).
    """
    axis_values = np.linspace(-wlim, wlim, num)
    grid_w1, grid_w2 = np.meshgrid(axis_values, axis_values)
    stacked = np.stack((grid_w1, grid_w2), axis=-1)
    return grid_w1, grid_w2, stacked


def compute_output(w, X):
    """Logistic output sigma(w . x) for every weight vector / input pair.

    The pre-activation is `np.dot(w, X.T)`, so `w` may be a single weight
    vector or a stack of them along leading axes, and `X` may be a single
    input or a matrix with one input per row.
    """
    preactivation = np.dot(w, X.T)
    return 1. / (1 + np.exp(-preactivation))


def compute_E(w, alpha, X, t, eps=1e-10):
    """Return the data error, the weight penalty and their sum.

    Returns
    -------
    err : cross-entropy between the labels `t` and the outputs
        `compute_output(w, X)`, summed over the last axis.
    reg : `0.5 * alpha * sum(w**2)` over the last axis of `w`
        (the factor `alpha` is already included).
    E : `err + reg`.
    """
    y = compute_output(w, X)
    # eps keeps both logarithms finite when y reaches 0 or 1
    log_lik_terms = t * np.log(y + eps) + (1 - t) * np.log(1 - y + eps)
    err = -log_lik_terms.sum(-1)
    reg = 0.5 * alpha * (w**2).sum(-1)
    return err, reg, err + reg


def compute_gradE(w, alpha, X, t):
    """Gradient of the objective: sum over inputs of (y - t) * x, plus alpha * w.

    `y` is `compute_output(w, X)`. The residual is indexed as `[:, None]`
    and multiplied with `X`, then summed over axis 0 (one row of `X` per input).
    """
    residual = compute_output(w, X) - t
    data_term = (residual[:, None] * X).sum(0)
    return data_term + alpha * w

# Where are we going with the first part of this course

MacKay, first part of **Chapter 41**.

_Recommended readings and watching_:

* [Understanding deep learning is also a job for physicists](https://www.nature.com/articles/s41567-020-0929-2) by [Lenka Zdeborová](https://people.epfl.ch/lenka.zdeborova/?lang=en)
* [A new look at independence](https://projecteuclid.org/journals/annals-of-probability/volume-24/issue-1/A-new-look-at-independence/10.1214/aop/1042644705.full) by [Michel Talagrand](https://en.wikipedia.org/wiki/Michel_Talagrand)
* A free online course on [High Dimensional Probability](https://www.math.uci.edu/~rvershyn/teaching/hdp/hdp.html) by [Roman Vershynin](https://www.math.uci.edu/~rvershyn/)
* A free online course on [High Dimensional Analysis: Random Matrices and Machine Learning](https://www.youtube.com/playlist?list=PLY11JnnnTUCabY4nc0hKptrd5qEWtLoo2) by [Roland Speicher](https://www.uni-saarland.de/lehrstuhl/speicher/roland-speicher.html)

# Learning as an inference problem

**Data set**:
$$D=\{x^\mu,t^\mu\}$$ 

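with $t^\mu=\pm 1$ and $\mu=1,\ldots,P$. (The code below stores the labels as $t\in\{0,1\}$ instead; the cross-entropy it evaluates is the same $\mathcal{E}(w)$ after the relabelling $t\to 2t-1$.)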

**Conditional model** (probability of label $t^\mu$ given $x^\mu$):
$$p(t^\mu|x^\mu,w)=\sigma(t^\mu w\cdot x^\mu)$$

**Likelihood**:
$$p(D|w)=\prod_\mu p(t^\mu|x^\mu,w)=e^{-\mathcal{E}(w)}\qquad \mathcal{E}(w)=-\sum_\mu \log p(t^\mu|x^\mu,w)$$

**Prior**:
$$p(w)=\frac{e^{-R(w,\alpha)}}{Z_w(\alpha)}\qquad R(w,\alpha)=\frac{\alpha}{2} \sum_i w_i^2$$

**Posterior**:
$$p(w|D)=\frac{p(D|w)p(w)}{p(D)} \propto e^{-L(w)}$$
with
$$L(w)=\mathcal{E}(w)+ R(w,\alpha)$$

# ML versus Bayesian approach

In neural network learning one typically computes the **maximum likelihood** or **maximum posterior** solution.

For new test point $x^\text{test}$, using an ML approach:
$$D \rightarrow w_\mathsf{ml}$$
$$p(t|x^\text{test},w_\mathsf{ml})$$

Bayesian approach requires integration over multiple solutions:
$$D \rightarrow p(w|D)$$
$$p(t|x^\text{test})=\int dw p(w|D) p(t|x^\text{test},w)$$

#### Let's start with an example in low dimension: a two-dimensional logistic regression problem

In [12]:
alpha = 0.01
Ps = [2, 4, 6]
num_P = len(Ps)
use_mackay_inputs = True

# input dimension; must stay 2 because the weight grid below is two-dimensional
# (previously undefined, which made the random-input branch raise NameError)
dim = 2

# precompute weight grid
wlim = 9
w1grid, w2grid, wgrid_stacked = get_wgrid(wlim=wlim, num=100)

# generate dataset: the first P rows of Xall / entries of tall form the
# training set of size P, for every P in Ps
if use_mackay_inputs:
    # reproduction of Mackay Fig. 41.1
    Xall = np.array([[5., 5.],
                     [-4., -2.],
                     [1.5, 1.5],
                     [-1, 1.],
                     [1.5, 3.5],
                     [-9., 0.]])
    tall = np.array([1., 0., 0., 1., 1., 0])
else:
    # random Gaussian inputs with random labels in {0, 1}
    Xall = np.random.randn(Ps[-1], dim)
    tall = np.random.randint(2, size=Ps[-1])

#### Let's train on subsets of the problem using GD

In [13]:
eta = 0.01
num_iter_gd = 10000

# set a common initial condition
w0 = np.array([5.,5.])

# allocate records: one trajectory per training-set size in Ps
ws_gd = np.zeros((num_P, num_iter_gd, 2))
errs_gd = np.zeros((num_P, num_iter_gd))
regs_gd = np.zeros((num_P, num_iter_gd))
Es_gd = np.zeros((num_P, num_iter_gd))

for iP, P in enumerate(Ps):

    # select the first P inputs and labels
    X = Xall[:P]
    t = tall[:P]

    w = w0.copy()
    for it in range(num_iter_gd):
        # record the state *before* the update, so index 0 holds w0
        err, reg, E = compute_E(w, alpha, X, t)
        ws_gd[iP,it] = w
        errs_gd[iP,it] = err
        regs_gd[iP,it] = reg
        # `reg` returned by compute_E already contains the factor alpha, so the
        # objective is E = err + reg; the former `err + alpha * reg` scaled the
        # penalty by alpha**2.
        Es_gd[iP,it] = E
        gradE = compute_gradE(w, alpha, X, t)
        w -= eta * gradE

print("done.")

done.


#### Let's now represent the entire posterior over weights by discretizing the weight values

In [None]:
# One row per training-set size P, four panels per row:
# inputs | likelihood surface | posterior surface | posterior contours + GD path
n_rows = len(Ps)
n_cols = 4
surface_style = dict(edgecolor='royalblue', lw=0.5, rstride=8, cstride=8, alpha=0.4)

count_fig = 1
fig = plt.figure(figsize=(10, 2 * n_rows))
for iP, P in enumerate(Ps):

    # select inputs
    X = Xall[:P]
    t = tall[:P]

    # plot inputs
    ax1 = fig.add_subplot(n_rows, n_cols, count_fig)
    plot_points(X, t, ax1)
    ax1.set_ylabel(f"P = {P}")
    count_fig += 1

    # compute likelihood and un-normalized posterior on the weight grid
    errgrid, reggrid, Egrid = compute_E(wgrid_stacked, alpha, X, t)
    likwgrid = np.exp(-errgrid)
    pwgrid = np.exp(-Egrid)

    # plot likelihood
    ax2 = fig.add_subplot(n_rows, n_cols, count_fig, projection='3d')
    ax2.plot_surface(w1grid, w2grid, likwgrid, **surface_style)
    count_fig += 1

    # plot posterior
    ax3 = fig.add_subplot(n_rows, n_cols, count_fig, projection='3d')
    ax3.plot_surface(w1grid, w2grid, pwgrid, **surface_style)
    count_fig += 1

    # plot contour of the posterior with the GD trajectory on top:
    # dot = initial condition, line = path, cross = final iterate
    ax4 = fig.add_subplot(n_rows, n_cols, count_fig)
    ax4.imshow(pwgrid, origin="lower", extent=[-wlim,wlim,-wlim,wlim], alpha=0.5)
    ax4.contour(w1grid, w2grid, pwgrid)
    # draw on ax4 explicitly instead of relying on it being the current axes
    ax4.plot(ws_gd[iP,0,0], ws_gd[iP,0,1], '.', c='black')
    ax4.plot(ws_gd[iP,:,0], ws_gd[iP,:,1], c='black')
    ax4.plot(ws_gd[iP,-1,0], ws_gd[iP,-1,1], 'x', c='black')
    count_fig += 1

    # column headers on the first row only, to keep the grid uncluttered
    if iP == 0:
        ax1.set_title("inputs")
        ax2.set_title("likelihood")
        ax3.set_title("posterior (unnormalized)")
        ax4.set_title("posterior + GD path")

fig.tight_layout();

#### Compare the ML prediction with the Bayesian one

In [None]:
x_test = np.array([-2.,2.])
t_test = +1

# Work on the largest training set explicitly instead of relying on the
# variables left behind by the last iteration of the plotting loop.
P_full = Ps[-1]
X_full = Xall[:P_full]
t_full = tall[:P_full]

fig_test, ax_test = plt.subplots()
plot_points(X_full, t_full, ax_test)
ax_test.plot(x_test[0], x_test[1], 'x', c='red');

# compute MP output: final GD iterate of the run trained on all Ps[-1] points
y_gd = compute_output(ws_gd[-1,-1], x_test)

# approximate posterior with a grid (expected to work well for large P):
# weight every grid node's prediction by its normalized posterior mass
all_outs = compute_output(wgrid_stacked, x_test)
pwgrid_full = np.exp(-compute_E(wgrid_stacked, alpha, X_full, t_full)[2])
pwgrid_normalized = pwgrid_full / pwgrid_full.sum()
y_bayes = (all_outs * pwgrid_normalized).sum()

print("p(t = +1| x_test, w_MP) =", y_gd)
print("p(t = +1| x_test) =", y_bayes)

# Question:

* How do we compute the integral over the two-dimensional posterior
$$p(t|x^\text{test})=\int dw p(w|D) p(t|x^\text{test},w)$$
  in order to get the best predictor for a test input?

This is particularly important in high dimension. We will use both **Monte Carlo** and **Variational approximations** in the next lectures.

# But are high dimensions always bad?

### Self-averaging / Concentration: the bliss of high dimension

**Self-averaging**: *Well-behaved* observables in learning and optimization have negligible fluctuations across random realizations of problem instances

**Concentration**: *Well-behaved* functions in high dimensions are almost everywhere constant.

### An example: computing the average generalization error of a perceptron in a Teacher-student scenario

In [None]:
# teacher vector T plus a held-out test set used to estimate the generalization error
N = 200
Ptest = 10000
T, Xtest, ytest_sign = generate_teacher_and_test_set(Ptest, N)

# training hyper-parameters and switches forwarded to train_perceptron
lr = 0.1
num_epochs = 1000
print_every = 1e10
parallel = True
renormalize = True
verbose = True
learn_bias = False

# one record per load value α (the training-set size is P = α N)
α_span = np.arange(1, 20, 1)
num_α = len(α_span)
w_span = np.zeros((num_α, N))
normw_span, err_span, ov_span, gen_err_span = (np.zeros(num_α) for _ in range(4))

for iα, α in enumerate(α_span):
    if verbose:
        print(f"doing α: {α}")

    # P random Gaussian patterns, labelled in {0, 1} by the sign of X @ T
    P = int(α * N)
    X = np.random.randn(P, N)
    y = 1. * (X @ T > 0)

    w, err, errs, ep = train_perceptron(
        X, y,
        learn_bias=learn_bias, lr=lr, num_epochs=num_epochs,
        print_every=print_every, parallel=parallel, renormalize=renormalize,
    )

    # test patterns on which the student's sign agrees with ytest_sign
    agree = np.sign(Xtest @ w) * ytest_sign > 0

    w_span[iα] = w
    normw_span[iα] = (w**2).sum()
    err_span[iα] = err
    ov_span[iα] = w @ T / N
    gen_err_span[iα] = 1 - agree.sum() / Ptest

print("done.")

In [17]:
# load the precomputed theory curves (the ones revealed to you in a dream):
# rows 0, 1, 2 of the comma-separated file are read as α values, overlaps R
# and generalization errors respectively
res = np.loadtxt("theory_perceptron_intro.txt", delimiter=",")
αs, Rs, gen_errors = res[0], res[1], res[2]

In [None]:
# plot the theory (lines) and the experimental points (dots)
# The labels are raw strings: "\c" is not a valid Python escape sequence, so
# the non-raw form triggers an invalid-escape warning on current Python versions.
plt.plot(α_span, ov_span, '.', color='blue', label=r"$W \cdot T$ / N")
plt.plot(αs, Rs, '--', color='blue', label=r"$W \cdot T$ / N theory");

plt.plot(α_span, gen_err_span, '.', color='red', label="gen error");
plt.plot(αs, gen_errors, color='red', label="gen error theory");

plt.legend();
plt.xlabel('α');
plt.ylabel('overlap / gen error');

#### By the end-of-year break you will be able to compute the analytical curve and put experimental points on top of it

...even in those cases where no gradient-based methods exist to find the correct solution to a problem.