Essentials.tex

\section*{Essentials}
\subsection*{Matrix/Vector}
\textbf{Multiplication:} $\mathbf{C}=\mathbf{AB} \Leftrightarrow c_{ik} = \sum_{j=1}^m a_{ij}\cdot b_{jk}$ \\
\textbf{Orthogonal Matrix:} (full rank square matrix with orthonormal columns) $\mathbf{A}^{-1} = \mathbf{A}^\top$, $\mathbf{A} \mathbf{A}^\top = \mathbf{A}^\top \mathbf{A} = \mathbf{I}$, $\operatorname{det}(\mathbf{A}) \in \{+1, -1\}$, $\operatorname{det}(\mathbf{A}^\top \mathbf{A}) = 1$,
	preserves: inner product, norm, distance, angle, rank, mat. orthogon. \\
\textbf{Inner Product:} $\langle \mathbf{x}, \mathbf{y} \rangle = \mathbf{x}^\top \mathbf{y} = \sum_{i=1}^{N} \mathbf{x}_i \mathbf{y}_i$. \\
$\langle \mathbf{x} \pm \mathbf{y}, \mathbf{x} \pm \mathbf{y} \rangle = \langle \mathbf{x}, \mathbf{x} \rangle \pm 2 \langle \mathbf{x}, \mathbf{y} \rangle + \langle \mathbf{y}, \mathbf{y} \rangle$ \\
$\langle \mathbf{x}, \mathbf{y} + \mathbf{z} \rangle = \langle \mathbf{x}, \mathbf{y} \rangle + \langle \mathbf{x}, \mathbf{z} \rangle$\\
$\langle \mathbf{x}, \mathbf{y} \rangle = \|\mathbf{x}\|_2 \cdot \|\mathbf{y}\|_2 \cdot \cos(\theta)$\\
If $\mathbf{y}$ is a unit vector then $\langle \mathbf{x}, \mathbf{y} \rangle$ projects $\mathbf{x}$ onto $\mathbf{y}$ \\
$(\mathbf{u}_i^T\mathbf{v}_j)\mathbf{v}_j = (\mathbf{v}_j\mathbf{v}_j^T)\mathbf{u}_i$ \\
\textbf{Outer Product:} $\mathbf{u} \mathbf{v}^\top$, \quad $(\mathbf{u} \mathbf{v}^\top)_{i, j} = \mathbf{u}_i \mathbf{v}_j$ \\
\textbf{Transpose:} $(\mathbf{A}^\top)^{-1} = (\mathbf{A}^{-1})^\top$ \\
\textbf{Determinant:} $|\mathbf{A}|=\sum_i \lambda_i$, \quad $|\mathbf{A}^{-1}| = 1/|\mathbf{A}|$

\subsection*{Norms}
$\|\mathbf{x}\|_0 = |\{i | x_i \neq 0\}|$ \quad
$\|\mathbf{x}\|_2 = \sqrt{\sum_{i=1}^{N} \mathbf{x}_i^2} = \sqrt{\langle \mathbf{x}, \mathbf{x} \rangle}$ \\
$\|\mathbf{x}\|_p = \left( \sum_{i=1}^{N} |x_i|^p \right)^{\frac{1}{p}}$ \quad
$\|\mathbf{M}\|_F =\allowbreak \sqrt{\sum_{i=1}^{m} \sum_{j=1}^{n}\mathbf{m}_{i,j}^2} \allowbreak = \sqrt{\sum_{i=1}^{\min\{m, n\}} \sigma_i^2}$  $\allowbreak = \|\sigma(\mathbf{A})\|_2  = \sqrt{trace(\mathbf{M}^T\mathbf{M})} $\\
$\|\mathbf{M}\|_G=\sqrt{\sum_{ij}{g_{ij}x^2_{ij}}}$ (weighted Frobenius) \\
$\|\mathbf{M}\|_1 = \sum_{i,j} | m_{i,j}|$ \quad
$\|\mathbf{M}\|_p = \max_{\mathbf{v} \neq 0} \frac{\|\mathbf{M}\mathbf{v}\|_p}{\|\mathbf{v}\|_p}$ \\
$\|\mathbf{M}\|_2 = \sigma_{\text{max}}(\mathbf{M}) = \|\sigma(\mathbf(M))\|_\infty$ \\
$\|\mathbf{M}\|_* = \sum_{i=1}^{\min(m, n)} \sigma_i = \|\sigma(\mathbf{A})\|_1$ (nuclear norm) \\
$\text{rank}(\mathbf{B}) \geq ||\mathbf{B}||_* \ \text{for} \ ||B||_2 \leq 1$

\subsection*{Derivatives}
$\frac{\partial}{\partial \mathbf{x}}(\mathbf{b}^\top \mathbf{x}) = \frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{b}) = \mathbf{b}$ \quad
$\frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{x}) = 2\mathbf{x}$\\
$\frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{A}\mathbf{x}) = (\mathbf{A}^\top + \mathbf{A})\mathbf{x}$ \quad
$\frac{\partial}{\partial \mathbf{x}}(\mathbf{b}^\top \mathbf{A}\mathbf{x}) = \mathbf{A}^\top \mathbf{b}$\\
$\frac{\partial}{\partial \mathbf{X}}(\mathbf{c}^\top \mathbf{X} \mathbf{b}) = \mathbf{c}\mathbf{b}^\top$ \quad
$\frac{\partial}{\partial \mathbf{X}}(\mathbf{c}^\top \mathbf{X}^\top \mathbf{b}) = \mathbf{b}\mathbf{c}^\top$\\
$\frac{\partial}{\partial \mathbf{x}}(\| \mathbf{x}-\mathbf{b} \|_2) = \frac{\mathbf{x}-\mathbf{b}}{\|\mathbf{x}-\mathbf{b}\|_2}$ \quad
$\frac{\partial}{\partial \mathbf{x}}\log(x) = \frac{1}{x}$ \\
$\frac{\partial}{\partial \mathbf{x}}(\|\mathbf{Ax - b}\|_2^2) = \mathbf{2(A^\top Ax-A^\top b)}$ \quad
$\frac{\partial}{\partial \mathbf{x}}\frac{1}{f(x)} = \frac{-f'}{f^2}$\\
$\frac{\partial}{\partial \mathbf{X}}(|\mathbf{X}|) = |\mathbf{X}|\cdot \mathbf{X}^{-1}$ \quad $\frac{\partial}{\partial x}(\mathbf{Y}^{-1}) = -\mathbf{Y}^{-1} \frac{\partial\mathbf{Y}}{\partial x} \mathbf{Y}^{-1}$

\subsection*{Eigenvalues \& Eigenvectors}
Eigenvalue problem: $\mathbf{Ax}=\lambda\mathbf{x}$ \\
1. solve $\det(\mathbf{A} - \lambda \mathbf{I}) = 0$ resulting in $\{\lambda_i\}$ \\
2. $\forall \lambda_i$ solve $(\mathbf{A} - \lambda_i \mathbf{I})x_i = \mathbf{0}$ for $\mathbf{x}_i$

\subsection*{Eigendecomposition}
$\mathbf{A} \in \mathbb{R}^{N \times N}$ then $\mathbf{A} = \mathbf{Q} \boldsymbol{\Lambda} \mathbf{Q}^{-1}$ with $\mathbf{Q} \in \mathbb{R}^{N \times N}$\\
if fullrank: $\mathbf{A}^{-1} = \mathbf{Q} \boldsymbol{\Lambda}^{-1} \mathbf{Q}^{-1}$ and $(\boldsymbol{\Lambda}^{-1})_{i,i} = {1}/{\lambda_i}$\\
if $\mathbf{A}$ symmetric: $A = \mathbf{Q} \boldsymbol{\Lambda} \mathbf{Q^\top}$ ($\mathbf{Q}$ orthogonal)

\subsection*{Probability / Statistics}
$P(x) = \sum_{y \in Y} P(x, y)$ \quad
$P(x, y) = P(x|y) P(y)$ \\
$\forall y \in Y: \sum_{x \in X} P(x|y) = 1$ (property for any fixed $y$) \\
$P(x|y) = \frac{P(x,y)}{P(y)},\quad \text{if } P(y) > 0$ \quad
$P(x|y) = \frac{P(y|x)P(x)}{P(y)}$ (Bayes' rule) \quad
$P(x_1, \ldots, x_n) = \prod_{i=1}^n P(x_i)$ (iff i.i.d) \\
$P(x|y) = P(x) \Leftrightarrow P(y|x) = P(y)$ (iff $X$, $Y$ indep.) \\
$E[X]:=\sum_{x \in X}x\cdot P(x)$ \quad
$Var[X]:= E[(X-\mu_x)^2]:=\sum_{x \in X}(x-\mu_x)^2P(x)= E(X^2) - E(X)^2$ \\
standard deviation $\sigma_x := \sqrt{Var[X]}$

\subsection*{Lagrangian Multipliers}
Problem $\min_Q g(Q)$ with constraint $\forall j \ \sum_i Q_{ij}=1$ turn into $L(Q, \alpha) = g(Q)+\sum_j \alpha_j (1-\sum_i Q_{ij})$ and find $\max_\alpha \min_Q L(Q,\alpha)$ (can use constraint form.).

\subsection*{Convex Function}
$\forall x_1, x_2 \in X, \forall t \in [0,1]: $ \quad (also $\text{iff} \ \forall x: \  f''(x) \geq 0$)\\
$f(tx_1 + (1-t)x_2)\leq t f(x_1) + (1-t)f(x_2)$ \\
\textbf{Sum} of convex functions is convex, \textbf{log} is convex
%\vspace{1mm}

\textbf{Exercise:} Show that if $f$ is convex, any local optimum is global. Assume there is a local optimum $\hat x$ that is not the global optimum $x^*$, then if we choose $t$ to be in the ball of the local optimum, so we know that $f(t\hat x + (1-t)x^*) \geq f(\hat x)$. Since $f(x^*) < f(\hat x)$,  we have $t\cdot f(\hat x) + (1-t) f(x^*) < f(\hat x)$. So we get $f(t\hat x + (1-t)x^*) \geq f(\hat x) > t\cdot f(\hat x) + (1-t)f(x^*)$, which contradicts the convexity of $f$.

%vspace{1mm}

\subsection*{Jensen Inequality:} 
for convex $\phi$: $\phi (\sum_{i=1}^n \lambda_i x_i) \leq \sum_{i=1}^n \lambda_i f(x_i)$ if $\sum_{i=1}^n \lambda_i = 1$. Also $\phi(E[X])\leq E[\phi(X)]$.