In [None]:
import os 
import sys
import pickle 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time 

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

sys.path.append("../src")
from load import *
from util import * 
from pegasos import *

%matplotlib inline

In [None]:
# Notebook-wide matplotlib defaults: 14pt text, grid lines on every axes,
# and a 10x10 (width, height) default figure size.
plt.rcParams.update({
    'font.size': 14,
    'axes.grid': True,
    'figure.figsize': (10, 10),
})

# Problem 2

The kernel matrix is given by $K = XX^T \in \mathbf{R}^{n\times n}$. Explicitly in terms of training vectors it's given by 

$$ K = 
\begin{bmatrix}
x_1^Tx_1 & x_1^Tx_2 & \cdots & x_1^Tx_n\\
x_2^Tx_1 & x_2^Tx_2 & \cdots & x_2^Tx_n\\
\vdots & \vdots & \ddots & \vdots\\
x_n^Tx_1 & x_n^Tx_2 & \cdots & x_n^Tx_n
\end{bmatrix}
$$

The squared Euclidean distance between two training vectors is given by $d(x_i,x_j) = ||x_i-x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2x_i^Tx_j = K_{ii} + K_{jj} - 2K_{ij}$. Every term on the right-hand side is an entry of $K$, so the Gram matrix contains all the information needed to compute the pairwise distances between training examples.

# Problem 3

Consider the regularized least squares objective

$$J(w) = ||Xw-y||^2+ \lambda ||w||^2$$
where $\lambda > 0$

This problem can be written as an ordinary least squares problem of the form $\min_w ||Aw - b||^2$ where 

$$ 
A = \begin{bmatrix}
X\\
\sqrt{\lambda} I
\end{bmatrix},
b=
\begin{bmatrix}
y \\
0
\end{bmatrix}
$$

This is easily solved as 
$$
\begin{align*}
w^\star &= (A^TA)^{-1}A^Tb\\
&= \left(\begin{bmatrix}
X\\
\sqrt{\lambda} I
\end{bmatrix}^T
\begin{bmatrix}
X\\
\sqrt{\lambda} I
\end{bmatrix}\right)^{-1}
\begin{bmatrix}
X\\
\sqrt{\lambda} I
\end{bmatrix}^T
\begin{bmatrix}
y \\
0
\end{bmatrix}\\
&=(X^TX + \lambda I)^{-1}X^Ty
\end{align*}
$$

The normal equations for the regularized least squares problem are

$$
\begin{align*}
(X^TX + \lambda I)w^\star &= X^Ty\\
w^\star &= \frac{1}{\lambda} (X^Ty - X^TXw^\star)\\
w^\star &= \frac{1}{\lambda} X^T(y - Xw^\star)\\
         &= X^T\alpha\qquad \text{where } \alpha := \frac{1}{\lambda} (y - Xw^\star)
\end{align*}
$$

It is evident that the optimal weight vector $w^\star$ is in the span of the data since by the above it is given as a linear combination of the training vectors

$$w^\star = \sum_{i=1}^n \alpha_i x_i$$

The value of the coefficient vector $\alpha$ is obtained by substituting $w^\star = X^T\alpha$ into its definition:
$$
\begin{align*}
\alpha &= \frac{1}{\lambda}(y - Xw^\star)\\
\alpha &= \frac{1}{\lambda}(y - XX^T\alpha)\\
\lambda \alpha &= y - XX^T\alpha\\
(XX^T + \lambda I)\alpha &= y\\
\alpha &= (XX^T + \lambda I)^{-1}y
\end{align*}
$$

The prediction of kernelized ridge regression on the training data is given by

$$
\begin{align*}
\hat{y} &= Xw^\star\\
&= X(X^T\alpha)\\
&= XX^T(XX^T + \lambda I)^{-1}y\\
&= K(K + \lambda I)^{-1}y
\end{align*}
$$

For the prediction on a new (unseen) example $x$ we have 
$$
\begin{align*}
f(x) &= x^Tw^\star\\
&= \sum_{i=1}^n \alpha_i x^Tx_i\\
&= \sum_{i=1}^n \alpha_i k(x, x_i)\\
&= \alpha^T k_x, \qquad k_x := \begin{bmatrix} k(x, x_1) & \cdots & k(x, x_n) \end{bmatrix}^T
\end{align*}
$$

# Problem 4

# Appendix A

# Appendix B