# KL Divergence

$$
KL(p||q)=\sum_i p_i\log\frac{p_i}{q_i}
$$

# Distance

$$\begin{array}{lll}
(1)&&d(x,y)\ge 0\quad\mbox{for all $x$ and $y$}\\
(2)&&d(x,y)=0\quad\mbox{if and only if}\quad x=y\\
(3)&&d(x,y)=d(y,x)\quad\mbox{for all $x$ and $y$}\\
(4)&&d(x,y)+d(y,z)\ge d(x,z)\quad\mbox{for all $x$, $y$, and $z$}\\
\end{array}$$

$$\begin{array}{lll}
KL(f||g)&=&\int f(x)\log \frac{f(x)}{g(x)}dx\\
&=&-\int f(x)\log \frac{g(x)}{f(x)}dx\\
&\ge& -\log \int f(x)\frac{g(x)}{f(x)}dx\quad\mbox{(Jensen's inequality, since $-\log$ is convex)}\\
&=&-\log \int g(x)dx\\
&=&-\log 1=0
\end{array}$$

$$
KL(f||f)=\int f(x)\log \frac{f(x)}{f(x)}dx=\int f(x)\log 1dx=0
$$

In [1]:
import numpy as np
# Pin NumPy's global random state so every np.random draw after this point
# is reproducible from run to run.
SEED = 2
np.random.seed(SEED)

# Three probability distributions 

In [2]:
# Draw three length-3 vectors of uniform(0, 1) weights. The generator is
# consumed left to right, so the global RNG is sampled in the order p, q, r.
p, q, r = (np.random.uniform(0., 1., 3) for _ in range(3))

# Rescale each weight vector by its own total so that it sums to one,
# i.e. becomes a discrete probability distribution over three outcomes.
p, q, r = (weights / weights.sum() for weights in (p, q, r))

# Show the three distributions, one per line.
print(p, q, r, sep="\n")

[0.43100234 0.02562935 0.54336831]
[0.36704318 0.35443418 0.27852264]
[0.18214073 0.55116157 0.2666977 ]


# KL Divergence is not symmetric.

In [3]:
# Discrete KL divergence in both directions:
#   KL(p||q) = sum_i p_i * log(p_i / q_i)
#   KL(q||p) = sum_i q_i * log(q_i / p_i)
log_p_over_q = np.log(p / q)
log_q_over_p = np.log(q / p)
d_pq = (p * log_p_over_q).sum()
d_qp = (q * log_q_over_p).sum()

# A symmetric quantity would give exactly 0 here.
print(d_pq - d_qp)

-0.3208918219422609


# KL Divergence does not satisfy the triangle inequality.

In [4]:
# Discrete KL divergence sum_i a_i * log(a_i / b_i) for the three pairs
# that appear in the triangle inequality d(p,q) + d(q,r) >= d(p,r).
d_pq, d_qr, d_pr = (
    (a * np.log(a / b)).sum()
    for a, b in ((p, q), (q, r), (p, r))
)

# The triangle inequality would require this gap to be non-negative.
triangle_gap = d_pq + d_qr - d_pr
print(triangle_gap)

-0.20147521203251506


# Relation to Fisher information metric


$$\begin{array}{lll}
KL(f_{\theta_0}||f_{\theta})
&=&\int f_{\theta_0}\log \frac{f_{\theta_0}}{f_{\theta}}\\
&\approx& KL(f_{\theta_0}||f_{\theta_0})+ {\bf g}^T(\theta-\theta_0)+\frac{1}{2}(\theta-\theta_0)^T{\bf H}(\theta-\theta_0)
\end{array}$$
$$$$
$$
KL(f_{\theta_0}||f_{\theta_0})=0
$$
$$$$
$$\begin{array}{llllll}
{\bf g}=0\\
\\
\left[\frac{\partial}{\partial\theta_i}KL(f_{\theta_0}||f_{\theta})\right]_{\theta=\theta_0}
&=&\left[
\int f_{\theta_0}\frac{-\frac{f_{\theta_0}}{f^2_\theta}\frac{\partial f_\theta}{\partial\theta_i}}{\frac{f_{\theta_0}}{f_\theta}}
\right]_{\theta=\theta_0}
&=&-\left[
\int f_{\theta_0}\frac{\partial \log f_\theta}{\partial\theta_i}
\right]_{\theta=\theta_0}\\\\
&=&-\left[
\int f_{\theta_0}\frac{\frac{\partial f_\theta}{\partial\theta_i}}{f_\theta}
\right]_{\theta=\theta_0}\\
&=&-\left[
\int \frac{\partial f_\theta}{\partial\theta_i}
\right]_{\theta=\theta_0}\\
&=&-\left[
\frac{\partial }{\partial\theta_i}\int f_\theta
\right]_{\theta=\theta_0}\\
&=&-\left[
\frac{\partial }{\partial\theta_i}1
\right]_{\theta=\theta_0}=0\\
\end{array}$$
$$$$
$$\begin{array}{lll}
\left[\frac{\partial^2}{\partial\theta_i\partial\theta_j}KL(f_{\theta_0}||f_{\theta})\right]_{\theta=\theta_0}
&=&-\left[
\frac{\partial}{\partial\theta_j}\int f_{\theta_0}\frac{\partial \log f_\theta}{\partial\theta_i}
\right]_{\theta=\theta_0}\\
&=&-\left[
\int f_{\theta_0}\frac{\partial}{\partial\theta_j}\frac{\partial \log f_\theta}{\partial\theta_i}
\right]_{\theta=\theta_0}\\
&=&-\left[
\int f_{\theta_0}\frac{\partial}{\partial\theta_j}\frac{\frac{\partial f_\theta}{\partial\theta_i}}{f_\theta}
\right]_{\theta=\theta_0}\\
&=&\left[
\int f_{\theta_0}\frac{1}{f^2_\theta}\frac{\partial f_\theta}{\partial\theta_i}\frac{\partial f_\theta}{\partial\theta_j}
-\int f_{\theta_0}\frac{\frac{\partial^2}{\partial\theta_i\partial\theta_j}f_\theta}{f_\theta}
\right]_{\theta=\theta_0}\\
&=&\left[
\int f_{\theta_0}\frac{1}{f^2_\theta}\frac{\partial f_\theta}{\partial\theta_i}\frac{\partial f_\theta}{\partial\theta_j}
-\int \frac{\partial^2}{\partial\theta_i\partial\theta_j}f_\theta
\right]_{\theta=\theta_0}\\
&=&\left[
\int f_{\theta_0}\frac{1}{f^2_\theta}\frac{\partial f_\theta}{\partial\theta_i}\frac{\partial f_\theta}{\partial\theta_j}
-\frac{\partial^2}{\partial\theta_i\partial\theta_j}\int f_\theta
\right]_{\theta=\theta_0}\\
&=&\left[
\int f_{\theta_0}\frac{1}{f^2_\theta}\frac{\partial f_\theta}{\partial\theta_i}\frac{\partial f_\theta}{\partial\theta_j}
-\frac{\partial^2}{\partial\theta_i\partial\theta_j}1
\right]_{\theta=\theta_0}\\
&=&\left[
\int f_{\theta_0}\frac{1}{f^2_\theta}\frac{\partial f_\theta}{\partial\theta_i}\frac{\partial f_\theta}{\partial\theta_j}
\right]_{\theta=\theta_0}\\
&=&\left[
\int f_{\theta_0}\frac{\partial \log f_\theta}{\partial\theta_i}\frac{\partial \log f_\theta}{\partial\theta_j}
\right]_{\theta=\theta_0}\\
\end{array}$$
$$$$
$$
{\bf H}=E_{x\sim f_{\theta_0}}(\nabla \log f_\theta)_{\theta=\theta_0}(\nabla \log f_\theta)^T_{\theta=\theta_0}\ge 0\quad\mbox{Fisher information metric}
$$
$$$$
$$\begin{array}{lll}
(\theta-\theta_0)^T{\bf H}(\theta-\theta_0)
&=&(\theta-\theta_0)^T\left[E_{x\sim f_{\theta_0}}(\nabla \log f_\theta)_{\theta=\theta_0}(\nabla \log f_\theta)^T_{\theta=\theta_0}\right](\theta-\theta_0)\\
&=&E_{x\sim f_{\theta_0}}(\theta-\theta_0)^T(\nabla \log f_\theta)_{\theta=\theta_0}(\nabla \log f_\theta)^T_{\theta=\theta_0}(\theta-\theta_0)\\
&=&E_{x\sim f_{\theta_0}}\left[(\nabla \log f_\theta)^T_{\theta=\theta_0}(\theta-\theta_0)\right]^2\ge 0\\
\end{array}$$

$$\begin{array}{lll}
KL(N(\mu,\sigma^2)||N(0,1^2))
&=&\int\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\log\frac{\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}}{\frac{1}{\sqrt{2\pi}}e^{-\frac{x^2}{2}}}dx\\
&=&\int\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\log\frac{\frac{1}{\sqrt{\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}}{e^{-\frac{x^2}{2}}}dx\\
&=&\int\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\log\frac{1}{\sqrt{\sigma^2}}dx
+\int\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\log\frac{e^{-\frac{(x-\mu)^2}{2\sigma^2}}}{e^{-\frac{x^2}{2}}}dx\\
&=&\log\frac{1}{\sqrt{\sigma^2}}
+\int\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\left(
-\frac{(x-\mu)^2}{2\sigma^2}+\frac{x^2}{2}
\right)dx\\
&=&\log\frac{1}{\sqrt{\sigma^2}}
+\left(
-\frac{\sigma^2}{2\sigma^2}+\frac{\sigma^2+\mu^2}{2}
\right)\\
&=&-\frac{1}{2}\log\sigma^2
+\left(
-\frac{\sigma^2}{2\sigma^2}+\frac{\sigma^2+\mu^2}{2}
\right)\\
&=&-\frac{1}{2}\left(\log\sigma^2
+
1-\sigma^2-\mu^2
\right)\\
\end{array}$$
$$$$
$$
KL(N(\mu,\Sigma)||N(0,I))=\frac{1}{2}\sum_{i=1}^d\left(\mu_i^2+\sigma_i^2 -1-\log\sigma_i^2
\right)
$$
where
$$
\Sigma=\left(\begin{array}{cccc}
\sigma_1^2&0&\cdots&0\\
0&\sigma_2^2&\cdots&0\\
\vdots&\vdots&\ddots&\vdots\\
0&0&\cdots&\sigma_d^2\\
\end{array}\right)
$$