Updated solutions 2 & 3
Andrew Tulloch committed Apr 3, 2012
1 parent 3447d63 commit ac0e1c5
Showing 9 changed files with 525 additions and 379 deletions.
188 changes: 76 additions & 112 deletions Files/ESL-Chap2Solutions.tex
@@ -1,115 +1,8 @@
% Created by Andrew Tulloch

%!TEX TS-program = xelatex
%!TEX encoding = UTF-8 Unicode


\documentclass[12pt]{amsart}
\usepackage{amsthm, amsmath, amssymb}
\usepackage{geometry, setspace, graphicx, enumerate, fullpage}
\onehalfspacing
\usepackage{fontspec,xltxtra,xunicode}


% AMS Theorems
\theoremstyle{plain}% default
\newtheorem{thm}{Theorem}[section]
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{exer}[thm]{Exercise}

\newtheorem*{cor}{Corollary}


\newcommand{\res}[2]{\text{Res}(#1,#2)}
\theoremstyle{definition}
\newtheorem{defn}[thm]{Definition}
\newtheorem{conj}[thm]{Conjecture}
\newtheorem{exmp}[thm]{Example}

\theoremstyle{remark}
\newtheorem*{rem}{Remark}
\newtheorem*{note}{Note}
\newtheorem{case}{Case}

\newcommand{\expc}[1]{\mathbb{E}\left[#1\right]}
\newcommand{\var}{\text{Var}}
\newcommand{\cov}[1]{\text{Cov}\left(#1\right)}
\newcommand{\prob}[1]{\mathbb{P}(#1)}
\newcommand{\given}{ \, | \,}
\newcommand{\us}{0 \leq u \leq s}
\newcommand{\ts}[1]{\{ #1 \}}

\renewcommand{\phi}{\varphi}
\newcommand{\sigf}{\mathcal{F}}

\newcommand{\dzz}{\, dz}
\newcommand{\bigo}[1]{\mathcal{O}(#1)}

\newcommand{\al}{\alpha}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\C}{\mathbb{C}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\N}{\mathbb{N}}

\newcommand{\I}{\mathbb{I}}

\renewcommand{\P}{\mathbb{P}}

\newcommand{\F}{\mathbb{F}}
\newcommand{\Ga}{\mathbb{G}}

\newcommand{\aut}[1]{\text{Aut}{(#1)}}

\newcommand{\gener}[1]{\langle #1 \rangle}
\newcommand{\charr}[1]{\text{char}(#1)}
\newcommand{\nth}{n\textsuperscript{th}}

\newcommand{\tworow}[2]{\genfrac{}{}{0pt}{}{#1}{#2}}
\newcommand{\xdeg}[2]{[#1 : #2]}
\newcommand{\gal}[2]{\text{Gal}(#1/#2)}
\newcommand{\minpoly}[2]{m_{#1, #2}(x)}

\newcommand{\mapping}[5]{\begin{align*}
#1 : \quad #2 &\rightarrow #3 \\
#4 &\mapsto #5
\end{align*}
}


\def\cip{\,{\buildrel p \over \rightarrow}\,}
\def\cid{\,{\buildrel d \over \rightarrow}\,}
\def\cas{\,{\buildrel a.s. \over \rightarrow}\,}

\def\clp{\,{\buildrel L^p \over \rightarrow}\,}

\def\eqd{\,{\buildrel d \over =}\,}
\def\eqas{\,{\buildrel a.s. \over =}\,}

\newcommand{\sigg}{\mathcal{G}}
\newcommand{\indic}[1]{\mathbf{1}_{\{ #1 \}} }
\newcommand{\itos}{\text{It\^o's\ }}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}



\title{Elements of Statistical Learning - Chapter Solutions} % Document Title
\author{Andrew Tulloch}


\begin{document}
\maketitle
\section{Chapter 1}

No exercises.

\section{Chapter 2: Overview of Supervised Learning}
\begin{exer}
Suppose that each of the $K$ classes has an associated target $t_k$, which is a vector of all zeroes, except a one in the $k$-th position. Show that classifying to the largest element of $\hat y$ amounts to choosing the closest target, $\min_k \| t_k - \hat y \|$, if the elements of $\hat y$ sum to one.
\end{exer}

\begin{proof}
The assertion is equivalent to showing that \[
\argmax_i \hat y_i = \argmin_k \| t_k - \hat y \| = \argmin_k \|\hat y - t_k \|^2
@@ -131,6 +24,7 @@ \section{Chapter 2}
\begin{exer}
Show how to compute the Bayes decision boundary for the simulation example in Figure 2.5.
\end{exer}

\begin{proof}
The Bayes classifier is \[
\hat G(X) = \argmax_{g \in \mathcal G} P(g | X = x ).
@@ -142,6 +36,14 @@ \section{Chapter 2}
P(X = x | g = \textsc{blue}) P(g = \textsc{blue}) = P(X = x | g = \textsc{orange}) P(g = \textsc{orange})
\] Since we know $P(g)$ and $P(X=x|g)$ for each class, the decision boundary can be computed explicitly.
\end{proof}
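
As a concrete illustration, the boundary can be computed numerically. The sketch below assumes the mixture model described in the text for Figure 2.5 (ten means per class, drawn from $N((1,0)^T, \mathbf{I})$ for \textsc{blue} and $N((0,1)^T, \mathbf{I})$ for \textsc{orange}, with each observation drawn from $N(m_k, \mathbf{I}/5)$ and equal class priors); the means are simulated here rather than taken from the book's dataset. With equal priors, the Bayes boundary is the zero contour of the log-ratio of the two class densities.

\begin{lstlisting}
library(mvtnorm)  # rmvnorm(), dmvnorm()

set.seed(1)
# Ten means per class (assumed generating mechanism, as for Figure 2.5)
blue.means   <- rmvnorm(10, mean = c(1, 0), sigma = diag(2))
orange.means <- rmvnorm(10, mean = c(0, 1), sigma = diag(2))

# P(X = x | g): uniform mixture of N(m_k, I/5) over the class means
class.density <- function(x, means) {
  mean(apply(means, 1, function(m) dmvnorm(x, mean = m, sigma = diag(2) / 5)))
}

# Log-ratio of the class densities on a grid; with equal priors,
# the Bayes decision boundary is its zero contour
xs <- seq(-3, 4, length = 100)
grid <- expand.grid(x1 = xs, x2 = xs)
log.ratio <- apply(grid, 1, function(x)
  log(class.density(x, blue.means)) - log(class.density(x, orange.means)))

contour(xs, xs, matrix(log.ratio, nrow = 100),
        levels = 0, drawlabels = FALSE, xlab = "x1", ylab = "x2")
\end{lstlisting}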

\begin{exer}
Derive equation (2.24).
\end{exer}

\begin{proof}
TODO
\end{proof}
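
Although the derivation is left as a TODO, equation (2.24) can at least be checked numerically. Assuming it is the median-distance formula restated in the following exercise, $d(p, N) = \bigl(1 - (\tfrac{1}{2})^{1/N}\bigr)^{1/p}$, the sketch below compares it with a Monte Carlo estimate (uniform points in the unit ball are generated by scaling a random direction by $U^{1/p}$).

\begin{lstlisting}
# Uniform draw from the unit ball in R^p:
# random direction scaled by U^(1/p)
runif.ball <- function(p) {
  x <- rnorm(p)
  runif(1)^(1 / p) * x / sqrt(sum(x^2))
}

# Monte Carlo estimate of the median distance from the origin
# to the closest of N uniform points
median.closest <- function(p, N, nsim = 500) {
  median(replicate(nsim, min(replicate(N, sqrt(sum(runif.ball(p)^2))))))
}

# Formula (2.24)
d <- function(p, N) (1 - (1 / 2)^(1 / N))^(1 / p)

# For p = 10, N = 500 both should be roughly 0.52
c(simulated = median.closest(10, 500), formula = d(10, 500))
\end{lstlisting}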

\begin{exer}
Consider $N$ data points uniformly distributed in a $p$-dimensional unit ball centered at the origin. Show that the median distance from the origin to the closest data point is given by \[
@@ -200,7 +102,7 @@ \section{Chapter 2}
&= x_0^T \text{Var}_{\mathcal T}(\hat \beta) x_0 \\
&= E_{\mathcal T} x_0^T \sigma^2 (\mathbf{X}^T \mathbf{X})^{-1} x_0
\end{align*} by conditioning (3.8) on $\mathcal T$.
\item TODO
\end{enumerate}
\end{proof}
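
The key identity used above, $x_0^T \var_{\mathcal T}(\hat \beta) x_0 = E_{\mathcal T}\, x_0^T \sigma^2 (\mathbf{X}^T \mathbf{X})^{-1} x_0$, can be checked by simulation. The sketch below uses an arbitrary choice of $p$, $N$, $\beta$, $x_0$ and Gaussian inputs (illustrative assumptions only) and compares the empirical variance of $x_0^T \hat \beta$ over repeated training sets with the average of $\sigma^2 x_0^T (\mathbf{X}^T \mathbf{X})^{-1} x_0$.

\begin{lstlisting}
set.seed(1)
p <- 5; N <- 50; sigma <- 1
beta <- rnorm(p)   # arbitrary true coefficients
x0   <- rnorm(p)   # arbitrary query point

one.draw <- function() {
  X <- matrix(rnorm(N * p), N, p)           # random training inputs
  y <- X %*% beta + rnorm(N, sd = sigma)    # linear model, N(0, sigma^2) noise
  beta.hat <- solve(t(X) %*% X, t(X) %*% y)
  c(pred = sum(x0 * beta.hat),                            # x_0^T beta.hat
    cond = sigma^2 * t(x0) %*% solve(t(X) %*% X) %*% x0)  # sigma^2 x_0^T (X^T X)^{-1} x_0
}

draws <- replicate(5000, one.draw())
# Empirical Var(x_0^T beta.hat) vs. E[sigma^2 x_0^T (X^T X)^{-1} x_0]
c(variance = var(draws["pred", ]), expectation = mean(draws["cond", ]))
\end{lstlisting}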

@@ -221,13 +123,25 @@ \section{Chapter 2}
\end{proof}

\begin{exer}
Suppose that we have a sample of $N$ pairs $x_i, y_i$, drawn IID from the distribution such that \begin{align*}
x_i \sim h(x), \\
y_i = f(x_i) + \epsilon_i, \\
E(\epsilon_i) = 0, \\
\text{Var}(\epsilon_i) = \sigma^2.
\end{align*}

We construct an estimator for $f$ linear in the $y_i$, \[
\hat f(x_0) = \sum_{i=1}^N \ell_i(x_0; \mathcal X) y_i
\] where the weights $\ell_i(x_0; \mathcal X)$ do not depend on the $y_i$, but do depend on the entire training sequence of $x_i$, denoted by $\mathcal X$.
\begin{enumerate}[(a)]
\item Show that linear regression and $k$-nearest-neighbour regression are members of this class of estimators. Describe explicitly the weights $\ell_i(x_0; \mathcal X)$ in each of these cases.
\item Decompose the conditional mean-squared error \[
E_{\mathcal Y | \mathcal X} \left( f(x_0) - \hat f(x_0) \right)^2
\] into a conditional squared bias and a conditional variance component. $\mathcal Y$ represents the entire training sequence of $y_i$.
\item Decompose the (unconditional) MSE \[
E_{\mathcal Y, \mathcal X}\left(f(x_0) - \hat f(x_0) \right)^2
\] into a squared bias and a variance component.
\item Establish a relationship between the squared biases and variances in the above two cases.
\end{enumerate}
\end{exer}

@@ -246,6 +160,56 @@ \section{Chapter 2}
\] where $N_k(x_0)$ represents the set of $k$-nearest-neighbours of $x_0$. Clearly, \[
\ell_i(x_0; \mathcal X) = \frac{1}{k} \mathbf{1}_{x_i \in N_k(x_0)}
\]
\item TODO
\item TODO
\item TODO
\end{enumerate}
\end{proof}
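
The weights in part (a) can also be verified numerically: for linear regression, $\sum_i \ell_i(x_0; \mathcal X) y_i$ should reproduce $x_0^T \hat \beta$, and for $k$-nearest neighbours it should reproduce the average of the $y_i$ over $N_k(x_0)$. A minimal sketch with simulated data (the choices of $N$, $p$ and $k$ are arbitrary):

\begin{lstlisting}
set.seed(1)
N <- 100; p <- 3; k <- 7
X  <- matrix(rnorm(N * p), N, p)
y  <- rnorm(N)
x0 <- rnorm(p)

# Linear regression: ell(x0; X) = x0^T (X^T X)^{-1} X^T
ell.lm   <- as.vector(t(x0) %*% solve(t(X) %*% X) %*% t(X))
beta.hat <- solve(t(X) %*% X, t(X) %*% y)
c(weighted = sum(ell.lm * y), direct = sum(x0 * beta.hat))

# k-NN: weight 1/k on the k nearest training points, 0 elsewhere
dists   <- sqrt(colSums((t(X) - x0)^2))
ell.knn <- ifelse(rank(dists, ties.method = "first") <= k, 1 / k, 0)
c(weighted = sum(ell.knn * y), direct = mean(y[order(dists)[1:k]]))
\end{lstlisting}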

\begin{exer}
Compare the classification performance of linear regression and $k$-nearest-neighbour classification on the \texttt{zipcode} data. In particular, consider only the \texttt{2}'s and \texttt{3}'s, and $k = 1, 3, 5, 7, 15$. Show both the training and test error for each choice.
\end{exer}

\begin{proof}
TODO - Plot error rates, etc.

\begin{lstlisting}
library(class)   # knn()
library(mclust)  # classError()

# Load training data and keep only the 2's and 3's
zip.train <- as.data.frame(read.table(file="zip.train", header=FALSE))
colnames(zip.train) <- c("Y", paste("X.", 1:256, sep=""))
zip.train.filtered <- subset(zip.train, Y == 2 | Y == 3)

# Fit a linear regression of the digit label on the 256 pixel intensities
mod <- lm(Y ~ ., data = zip.train.filtered)

# Load testing data and keep only the 2's and 3's
zip.test <- as.data.frame(read.table(file="zip.test", header=FALSE))
colnames(zip.test) <- c("Y", paste("X.", 1:256, sep=""))
zip.test.filtered <- subset(zip.test, Y == 2 | Y == 3)

# Predict on the test set and threshold at 2.5 to recover a class label
zip.test.filtered$Ypred <- predict(mod, zip.test.filtered)
category_f <- function(x) if (x > 2.5) 3 else 2
zip.test.filtered$Yround <- sapply(zip.test.filtered$Ypred, category_f)

##### k-nearest neighbours
knn.train.data <- subset(zip.train, Y == 2 | Y == 3)
knn.test.data  <- subset(zip.test,  Y == 2 | Y == 3)
knn.train.labels <- as.factor(knn.train.data$Y)

# Classify the test set for each k, excluding the label column (column 1)
# from the predictors
knn.results <- lapply(1:15, function(k) {
  knn(train = knn.train.data[, -1], test = knn.test.data[, -1],
      cl = knn.train.labels, k = k)
})
errors <- sapply(knn.results, function(classification) {
  classError(classification, knn.test.data$Y)$errorRate
})
\end{lstlisting}
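
A possible sketch of the remaining bookkeeping, reusing the objects defined above: training error rates for both methods, and a plot of the $k$-nearest-neighbour training and test error rates against $k$, with the linear regression test error as a reference line.

\begin{lstlisting}
# Linear regression training and test error rates
zip.train.filtered$Yround <-
  sapply(predict(mod, zip.train.filtered), category_f)
lm.train.error <- mean(zip.train.filtered$Yround != zip.train.filtered$Y)
lm.test.error  <- mean(zip.test.filtered$Yround  != zip.test.filtered$Y)

# k-NN training error rates (classify the training set against itself)
knn.train.results <- lapply(1:15, function(k) {
  knn(train = knn.train.data[, -1], test = knn.train.data[, -1],
      cl = knn.train.labels, k = k)
})
train.errors <- sapply(knn.train.results, function(classification) {
  classError(classification, knn.train.data$Y)$errorRate
})

# Test and training error against k; dashed line is the linear
# regression test error
plot(1:15, errors, type = "b", col = "red", xlab = "k",
     ylab = "error rate", ylim = range(0, errors, train.errors))
lines(1:15, train.errors, type = "b", col = "blue")
abline(h = lm.test.error, lty = 2)
\end{lstlisting}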

\end{proof}

\begin{exer}
Consider a linear regression model with $p$ parameters, fitted by OLS to a set of training data $(x_i, y_i)_{1 \leq i \leq N}$ drawn at random from a population. Let $\hat \beta$ be the least squares estimate. Suppose we have some test data $(\tilde x_i, \tilde y_i)_{1 \leq i \leq M}$ drawn at random from the same population as the training data.

If $R_{tr}(\beta) = \frac{1}{N} \sum_{i=1}^N \left(y_i - \beta^T x_i \right)^2$ and $R_{te}(\beta) = \frac{1}{M} \sum_{i=1}^M \left( \tilde y_i - \beta^T \tilde x_i \right)^2$, prove that \[
E(R_{tr}(\hat \beta)) \leq E(R_{te}(\hat \beta))
\] where the expectation is over all that is random in each expression.
\end{exer}
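
The claimed inequality is easy to check numerically. The sketch below uses an arbitrary linear model with Gaussian inputs and noise and takes $M = N$ (illustrative assumptions only), averaging the training and test risks of the least squares fit over repeated draws; the training average should come out no larger than the test average.

\begin{lstlisting}
set.seed(1)
p <- 5; N <- 20; M <- 20; sigma <- 1
beta <- rnorm(p)   # arbitrary true coefficients

risks <- replicate(2000, {
  X  <- matrix(rnorm(N * p), N, p)            # training inputs
  y  <- X %*% beta + rnorm(N, sd = sigma)
  Xt <- matrix(rnorm(M * p), M, p)            # test inputs, same population
  yt <- Xt %*% beta + rnorm(M, sd = sigma)
  beta.hat <- solve(t(X) %*% X, t(X) %*% y)
  c(train = mean((y  - X  %*% beta.hat)^2),   # R_tr(beta.hat)
    test  = mean((yt - Xt %*% beta.hat)^2))   # R_te(beta.hat)
})

# Averages over replications: train should not exceed test
rowMeans(risks)
\end{lstlisting}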
