
Commit

Tested
Andrew Tulloch committed May 6, 2012
1 parent 7e232c5 commit e441af1
Showing 15 changed files with 903 additions and 179 deletions.
47 changes: 12 additions & 35 deletions Files/ESL-Chap2Solutions.tex
@@ -170,40 +170,17 @@ \chapter{Overview of Supervised Learning}
Compare the classification performance of linear regression and $k$-nearest neighbour classification on the \texttt{zipcode} data. In particular, consider only the \texttt{2}'s and \texttt{3}'s, and $k = 1, 3, 5, 7, 15$. Show both the training and test error for each choice.
\end{exer}

\begin{proof}
TODO - Plot error rates, etc.

\begin{lstlisting}
# Load training data
zip.train <- as.data.frame(read.table(file="zip.train", header=FALSE))
colnames(zip.train) <- c("Y",paste("X.",1:256,sep=""))
zip.train.filtered <- subset(zip.train, Y == 2 | Y == 3)
# Create linear regression
mod <- lm(Y ~ ., data = zip.train.filtered)

# Load testing data
zip.test <- as.data.frame(read.table(file="zip.test", header=FALSE))
colnames(zip.test) <- c("Y",paste("X.",1:256,sep=""))
zip.test.filtered <- subset(zip.test, Y == 2 | Y == 3)
# Predict categories
zip.test.filtered$Ypred <- predict(mod, zip.test.filtered)

category_f <- function(x) {
if (x > 2.5) 3 else 2
}
# Round predictions
zip.test.filtered$Yround <- sapply(zip.test.filtered$Ypred, category_f)

##### KNN
knn.test.data <- subset(zip.test, Y == 2 | Y == 3)
knn.train.data <- subset(zip.train, Y == 2 | Y == 3)
knn.train.data$Y <- as.factor(knn.train.data$Y)

library(class)    # provides knn()
library(mclust)   # provides classError()

# Run k-NN on the 256 pixel columns only (column 1 holds the label)
knn.results <- lapply(1:15, function(k) {
  knn(train = knn.train.data[, -1], test = knn.test.data[, -1],
      cl = knn.train.data$Y, k = k)
})
errors <- sapply(knn.results, function(classification) {
  classError(classification, as.factor(knn.test.data$Y))$errorRate
})
\end{lstlisting}

\begin{proof}
Our implementation in R and graphs are attached.

\clearpage
\lstinputlisting{./ElemStatLearnCode/src/exercise_2_8.R}

\clearpage
\begin{figure}
\centering\includegraphics[width=\textwidth]{./ElemStatLearnCode/graphs/exercise_2_8.pdf}
\end{figure}

\end{proof}

\begin{exer}
@@ -212,4 +189,4 @@ \chapter{Overview of Supervised Learning}
If $R_{tr}(\beta) = \frac{1}{N} \sum_{i=1}^N \left(y_i - \beta^T x_i \right)^2$ and $R_{te}(\beta) = \frac{1}{M} \sum_{i=1}^M \left( \tilde y_i - \beta^T \tilde x_i \right)^2$, prove that \[
E(R_{tr}(\hat \beta)) \leq E(R_{te}(\hat \beta))
\] where the expectation is over all that is random in each expression.
\end{exer}
\end{exer}
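
One possible line of attack (a sketch only): $\hat \beta$ minimises the empirical training risk and is independent of each test pair $(\tilde x_i, \tilde y_i)$, so
\begin{align*}
E(R_{tr}(\hat \beta)) = E\left( \min_\beta R_{tr}(\beta) \right) \leq \min_\beta E(R_{tr}(\beta)) = \min_\beta E\left( y - \beta^T x \right)^2 \leq E(R_{te}(\hat \beta)),
\end{align*}
where the last inequality follows by conditioning on $\hat \beta$: for any fixed value of $\hat \beta$, the expected squared error on an independent draw from the population is at least $\min_\beta E\left( y - \beta^T x \right)^2$.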
79 changes: 38 additions & 41 deletions Files/ESL-Chap3Solutions.tex
@@ -38,50 +38,14 @@ \chapter{Linear Methods for Regression}
&= \hat \sigma^2 x_0^T (X^T X)^{-1} x_0.
\end{align*} where $\hat \sigma^2$ is the estimated variance of the innovations $\epsilon_i$.
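
As a quick numerical sanity check (a sketch, not part of the attached script; the cubic fit and the point $x = 0.3$ below are arbitrary choices), the standard error reported by \texttt{predict.lm} agrees with $\sqrt{\hat \sigma^2 x_0^T (X^T X)^{-1} x_0}$:

\begin{lstlisting}
# Sketch: predict.lm's pointwise standard error should equal
# sqrt(sigma.hat^2 * x0' (X'X)^{-1} x0).
set.seed(1)
x <- rnorm(30)
y <- 1 + 2 * x - x^2 + 0.5 * x^3 + rnorm(30)
fit <- lm(y ~ x + I(x^2) + I(x^3))

x0 <- c(1, 0.3, 0.3^2, 0.3^3)   # design vector at x = 0.3
X <- model.matrix(fit)
se.formula <- sqrt(sum(x0 * solve(t(X) %*% X, x0)) * summary(fit)$sigma^2)
se.predict <- predict(fit, data.frame(x = 0.3), se.fit = TRUE)$se.fit
all.equal(unname(se.formula), unname(se.predict))   # TRUE
\end{lstlisting}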


R code and graphs of the simulation are attached.
\clearpage
\lstinputlisting{./ElemStatLearnCode/src/exercise_3_2.R}
\clearpage
\begin{figure}
\centering\includegraphics[width=\textwidth]{./RCode/CubicRegression.png}
\centering\includegraphics[width=\textwidth]{ElemStatLearnCode/graphs/exercise_3_2.pdf}
\end{figure}

We can implement this algorithm in R as follows:

\begin{lstlisting}
library("ggplot2")
library("reshape2")

# Raw data
simulation.xs <- c(1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969)
simulation.ys <- c(4835, 4970, 5085, 5160, 5310, 5260, 5235, 5255, 5235, 5210, 5175)
simulation.df <- data.frame(pop = simulation.ys, year = simulation.xs)

# Rescale years
simulation.df$year <- simulation.df$year - 1964

# Generate regression, construct confidence intervals
fit <- lm(pop ~ year + I(year^2) + I(year^3), data=simulation.df)
xs <- seq(-5, 5, 0.1)
fit.confidence <- predict(fit, data.frame(year=xs), interval="confidence", level=0.95)


# Create data frame containing variables of interest
df <- as.data.frame(fit.confidence)
df$year <- xs
df <- melt(df, id.vars="year")

p <- ggplot()
p <- p + geom_line(aes(x=year, y=value, colour=variable),
df)
p <- p + geom_point(aes(x=year, y=pop),
simulation.df)
p <- p + scale_x_continuous('Year')
p <- p + scale_y_continuous('Population')
p <- p + opts(title="Cubic regression with confidence intervals")
p <- p + scale_color_brewer(name="Legend",
labels=c("Fit",
"95% Lower Bound",
"95% Upper Bound"),
palette="Set1")
\end{lstlisting}

TODO: Part 2.
\end{proof}
@@ -224,3 +188,36 @@ \chapter{Linear Methods for Regression}
&= \sum_{i=1}^{N} \left( y_i - \sum_{j=1}^p x_{ij} \beta_j \right)^2 + \sum_{j=1}^p \lambda \beta_j^2
\end{align*} which is the objective function for the ridge regression estimate.
\end{proof}
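
A minimal numerical check of this equivalence (a sketch; the values of \texttt{N}, \texttt{p} and \texttt{lambda} are arbitrary):

\begin{lstlisting}
# Sketch: OLS on the augmented data (X with sqrt(lambda) * I appended, y with
# p zeros) should reproduce the ridge estimate (X'X + lambda I)^{-1} X'y.
set.seed(1)
N <- 50; p <- 5; lambda <- 2
X <- matrix(rnorm(N * p), N, p)
y <- X %*% rnorm(p) + rnorm(N)

X.aug <- rbind(X, sqrt(lambda) * diag(p))
y.aug <- c(y, rep(0, p))

beta.aug <- coef(lm(y.aug ~ X.aug - 1))                         # OLS, no intercept
beta.ridge <- solve(t(X) %*% X + lambda * diag(p), t(X) %*% y)  # direct ridge

all.equal(unname(beta.aug), as.numeric(beta.ridge))             # TRUE
\end{lstlisting}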

\begin{exer}
Derive expression (3.62), and show that $\hat \beta^{\text{pcr}}(p) = \hat \beta^{\text{ls}}$.
\end{exer}

\begin{exer}
Show that in the orthogonal case, PLS stops after $m=1$ steps, because subsequent $\hat \phi_{mj}$ in step 2 in Algorithm 3.3 are zero.
\end{exer}

\begin{exer}
Verify expression (3.64), and hence show that the PLS directions are a compromise between the OLS coefficients and the principal component directions.
\end{exer}

\begin{exer}
Derive the entries in Table 3.4, the explicit forms for estimators in the orthogonal case.
\end{exer}
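
For reference, in the orthonormal case the entries of Table 3.4 take the forms
\begin{align*}
\hat \beta_j^{\text{best subset}} &= \hat \beta_j \cdot I\left( |\hat \beta_j| \geq |\hat \beta_{(M)}| \right), \\
\hat \beta_j^{\text{ridge}} &= \frac{\hat \beta_j}{1 + \lambda}, \\
\hat \beta_j^{\text{lasso}} &= \text{sign}(\hat \beta_j) \left( |\hat \beta_j| - \lambda \right)_+,
\end{align*}
where $\hat \beta_j$ are the least squares estimates and $\hat \beta_{(M)}$ is the $M$-th largest of them in absolute value.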

\begin{exer}
Repeat the analysis of Table 3.3 on the spam data discussed in Chapter 1.
\end{exer}

\begin{proof}
R code implementing this method is attached. We require the \texttt{MASS}, \texttt{lars}, and \texttt{pls} packages.

\clearpage
\lstinputlisting{ElemStatLearnCode/src/exercise_3_17.R}

\begin{figure}
\begin{center}
\includegraphics[width=\textwidth]{ElemStatLearnCode/graphs/exercise_3_17.pdf}
\end{center}
\end{figure}
\end{proof}
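
As a rough indication of how these packages are typically invoked (a self-contained sketch on synthetic data, not the attached \texttt{exercise\_3\_17.R}):

\begin{lstlisting}
# Sketch only: ridge, lasso, PCR and PLS fits on synthetic data.
library(MASS)   # lm.ridge
library(lars)   # lasso path
library(pls)    # pcr and plsr

set.seed(1)
df <- data.frame(matrix(rnorm(100 * 10), 100, 10))
df$y <- rowSums(df[, 1:3]) + rnorm(100)

ridge.fit <- lm.ridge(y ~ ., data = df, lambda = seq(0, 10, 0.5))
lasso.fit <- lars(as.matrix(df[, 1:10]), df$y, type = "lasso")
pcr.fit <- pcr(y ~ ., data = df, validation = "CV")
pls.fit <- plsr(y ~ ., data = df, validation = "CV")
\end{lstlisting}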
2 changes: 1 addition & 1 deletion Files/ESL-Chap4Solutions.tex
@@ -1,7 +1,7 @@
\chapter{Linear Methods for Classification}

\begin{exer}
Show how to solve the geenralised eigenvalue problem $\max a^T B a$ subject to $a^T W a = 1$ by transforming it to a standard eigenvalue problem.
Show how to solve the generalised eigenvalue problem $\max a^T B a$ subject to $a^T W a = 1$ by transforming it to a standard eigenvalue problem.
\end{exer}

\begin{proof}
Binary file modified PDFs/ESL-Solutions.pdf
Binary file not shown.
@@ -32,25 +32,25 @@ See the solutions in [PDF][chap2-pdf] format ([source][chap2-tex]) for a more pl

#### Exercise 2.1

> Suppose that each of $K$-classes has an associated target $t_k$, which is a vector of all zeroes, except a one in the $k$-th position. Show that classifying the largest element of $\hat y$ amounts to choosing the closest target, $\min_k \| t_k - \hat y \|$ if the elements of $\hat y$ sum to one.
> Suppose that each of $K$-classes has an associated target $t_k$, which is a vector of all zeroes, except a one in the $k$-th position. Show that classifying the largest element of $\hat y$ amounts to choosing the closest target, $\min_k \\\| t_k - \hat y \\\|$ if the elements of $\hat y$ sum to one.

#### Proof

The assertion is equivalent to showing that \[
\text{argmax}_i \hat y_i = \text{argmin}_k \| t_k - \hat y \| = \text{argmin}_k \|\hat y - t_k \|^2
\text{argmax}_i \hat y_i = \text{argmin}_k \\\| t_k - \hat y \\\| = \text{argmin}_k \\\|\hat y - t_k \\\|^2
\] by monotonicity of $x \mapsto x^2$ and symmetry of the norm.

WLOG, let $\| \cdot \|$ be the Euclidean norm $\| \cdot \|_2$. Let $k = \text{argmax}_i \hat y_i$, with $\hat y_k = \max y_i$. Note that then $\hat y_k \geq \frac{1}{K}$, since $\sum \hat y_i = 1$.
WLOG, let $\\\| \cdot \\\|$ be the Euclidean norm $\\\| \cdot \\\|_2$. Let $k = \text{argmax}_i \hat y_i$, with $\hat y_k = \max y_i$. Note that then $\hat y_k \geq \frac{1}{K}$, since $\sum \hat y_i = 1$.

Then for any $k' \neq k$ (note that $y_{k'} \leq y_k$), we have \begin{align}
\| y - t_{k'} \|_2^2 - \| y - t_k \|_2^2 &= y_k^2 + \left(y_{k'} - 1 \right)^2 - \left( y_{k'}^2 + \left(y_k - 1 \right)^2 \right) \\\\
\\\| y - t_{k'} \\\|_2^2 - \\\| y - t_k \\\|_2^2 &= y_k^2 + \left(y_{k'} - 1 \right)^2 - \left( y_{k'}^2 + \left(y_k - 1 \right)^2 \right) \\\\
&= 2 \left(y_k - y_{k'}\right) \\\\
&\geq 0
\end{align} since $y_{k'} \leq y_k$ by assumption.

Thus we must have \[
\text{argmin}_k \| t_k - \hat y \| = \text{argmax}_i \hat y_i
\text{argmin}_k \\\| t_k - \hat y \\\| = \text{argmax}_i \hat y_i
\] as required.
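
A quick numerical illustration of this equivalence (a throwaway check, not part of the attached code):

{% highlight R %}
# For a random yhat with entries summing to one, the largest coordinate and the
# nearest one-hot target should coincide.
set.seed(1)
K <- 5
yhat <- runif(K); yhat <- yhat / sum(yhat)
targets <- diag(K)                                  # row k is t_k
dists <- apply(targets, 1, function(t) sum((yhat - t)^2))
which.max(yhat) == which.min(dists)                 # TRUE
{% endhighlight %}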


@@ -63,12 +63,12 @@ Thus we must have \[

The Bayes classifier is \[
\hat G(X) = \text{argmax}_{g \in \mathcal G} P(g | X = x ).
\] In our two-class example $\text{orange}$ and $\text{blue}$, the decision boundary is the set where \[
P(g=\text{blue} | X = x) = P(g =\text{orange} | X = x) = \frac{1}{2}.
\] In our two-class example $\textsc{orange}$ and $\textsc{blue}$, the decision boundary is the set where \[
P(g=\textsc{blue} | X = x) = P(g =\textsc{orange} | X = x) = \frac{1}{2}.
\]

By the Bayes rule, this is equivalent to the set of points where \[
P(X = x | g = \text{blue}) P(g = \text{blue}) = P(X = x | g = \text{orange}) P(g = \text{orange})
P(X = x | g = \textsc{blue}) P(g = \textsc{blue}) = P(X = x | g = \textsc{orange}) P(g = \textsc{orange})
\] And since we know $P(g)$ and $P(X=x|g)$, the decision boundary can be calculated.
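
A sketch of the computation in R (illustrative only; the component means are regenerated here following the simulation recipe described in the text, rather than taken from the book's actual draw):

{% highlight R %}
# Bayes decision boundary for the two-class Gaussian mixture: each class density
# is a 10-component mixture with covariance I/5 per component and equal priors,
# so the boundary is the zero contour of the log density ratio.
set.seed(1)
means.blue <- cbind(rnorm(10, 1), rnorm(10, 0))     # means ~ N((1,0)^T, I)
means.orange <- cbind(rnorm(10, 0), rnorm(10, 1))   # means ~ N((0,1)^T, I)

class.density <- function(x, means) {
  mean(apply(means, 1, function(m) exp(-2.5 * sum((x - m)^2)))) * 5 / (2 * pi)
}
log.ratio <- function(x1, x2) {
  log(class.density(c(x1, x2), means.blue)) -
    log(class.density(c(x1, x2), means.orange))
}

xs <- seq(-3, 4, length.out = 100)
ys <- seq(-3, 4, length.out = 100)
contour(xs, ys, outer(xs, ys, Vectorize(log.ratio)), levels = 0)
{% endhighlight %}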


@@ -95,9 +95,9 @@ P(\text{All $N$ points are further than $r$ from the origin}) = \frac{1}{2}
\] by definition of the median.

Since the points $x_i$ are independently distributed, this implies that \[
\frac{1}{2} = \prod_{i=1}^N P(\|x_i\| > r)
\frac{1}{2} = \prod_{i=1}^N P(\\\|x_i\\\| > r)
\] and as the points $x_i$ are uniformly distributed in the unit ball, we have that \begin{align}
P(\| x_i \| > r) &= 1 - P(\| x_i \| \leq r) \\\\
P(\\\| x_i \\\| > r) &= 1 - P(\\\| x_i \\\| \leq r) \\\\
&= 1 - \frac{Kr^p}{K} \\\\
&= 1 - r^p
\end{align} Putting these together, we obtain that \[
@@ -109,15 +109,15 @@

#### Exercise 2.5

> Consider inputs drawn from a spherical multivariate-normal distribution $X \sim N(0,\mathbf{1}_p)$. The squared distance from any sample point to the origin has a $\chi^2_p$ distribution with mean $p$. Consider a prediction point $x_0$ drawn from this distribution, and let $a = \frac{x_0}{\| x_0\|}$ be an associated unit vector. Let $z_i = a^T x_i$ be the projection of each of the training points on this direction.
> Consider inputs drawn from a spherical multivariate-normal distribution $X \sim N(0,\mathbf{1}_p)$. The squared distance from any sample point to the origin has a $\chi^2_p$ distribution with mean $p$. Consider a prediction point $x_0$ drawn from this distribution, and let $a = \frac{x_0}{\\\| x_0\\\|}$ be an associated unit vector. Let $z_i = a^T x_i$ be the projection of each of the training points on this direction.
> Show that the $z_i$ are distributed $N(0,1)$ with expected squared distance from the origin 1, while the target point has expected squared distance $p$ from the origin.
> Hence for $p = 10$, a randomly drawn test point is about 3.1 standard deviations from the origin, while all the training points are on average one standard deviation along direction a. So most prediction points see themselves as lying on the edge of the training set.

#### Proof

Let $z_i = a^T x_i = \frac{x_0^T}{\| x_0 \|} x_i$. Then $z_i$ is a linear combination of $N(0,1)$ random variables, and hence normal, with expectation zero and variance \[
\text{Var}(z_i) = \| a^T \|^2 \text{Var}(x_i) = \text{Var}(x_i) = 1
Let $z_i = a^T x_i = \frac{x_0^T}{\\\| x_0 \\\|} x_i$. Then $z_i$ is a linear combination of $N(0,1)$ random variables, and hence normal, with expectation zero and variance \[
\text{Var}(z_i) = \\\| a^T \\\|^2 \text{Var}(x_i) = \text{Var}(x_i) = 1
\] as the vector $a$ has unit length and $x_i \sim N(0, 1)$.

For each target point $x_i$, the squared distance from the origin is a $\chi^2_p$ distribution with mean $p$, as required.
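
A small simulation (illustrative only, with $p = 10$) confirms both claims:

{% highlight R %}
# Projections z_i = a^T x_i should have unit variance, while the squared norm
# of a fresh point should have mean p.
set.seed(1)
p <- 10; N <- 10000
x0 <- rnorm(p)
a <- x0 / sqrt(sum(x0^2))
X <- matrix(rnorm(N * p), N, p)
z <- X %*% a
c(var(as.vector(z)), mean(rowSums(X^2)))   # approximately (1, 10)
{% endhighlight %}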
@@ -214,45 +214,89 @@ In the $k$-nearest-neighbour representation, we have \[

#### Proof

TODO - Plot error rates, etc.



# Load training data
zip.train <- as.data.frame(read.table(file="zip.train", header=FALSE))
colnames(zip.train) <- c("Y",paste("X.",1:256,sep=""))
zip.train.filtered <- subset(zip.train, Y == 2 | Y == 3)
# Create linear regression
mod <- lm(Y ~ ., data = zip.train.filtered)

# Load testing data
zip.test <- as.data.frame(read.table(file="zip.test", header=FALSE))
colnames(zip.test) <- c("Y",paste("X.",1:256,sep=""))
zip.test.filtered <- subset(zip.test, Y == 2 | Y == 3)
# Predict categories
zip.test.filtered$Ypred <- predict(mod, zip.test.filtered)

category_f <- function(x) {
if (x > 2.5) 3 else 2
}
# Round predictions
zip.test.filtered$Yround <- sapply(zip.test.filtered$Ypred, category_f)

##### KNN
knn.test.data <- subset(zip.test, Y == 2 | Y == 3)
knn.train.data <- subset(zip.train, Y == 2 | Y == 3)
knn.train.data$Y <- as.factor(knn.train.data$Y)

library(class)    # provides knn()
library(mclust)   # provides classError()

# Run k-NN on the 256 pixel columns only (column 1 holds the label)
knn.results <- lapply(1:15, function(k) {
  knn(train = knn.train.data[, -1], test = knn.test.data[, -1],
      cl = knn.train.data$Y, k = k)
})
errors <- sapply(knn.results, function(classification) {
  classError(classification, as.factor(knn.test.data$Y))$errorRate
})


Our implementation in R and graphs are attached.

{% highlight R %}
library('ProjectTemplate')
load.project()

## Linear Regression
mod <- lm(Y ~ ., data = zip.train.filtered)

# Round predictions
category_f <- function(x) { if (x > 2.5) 3 else 2 }
predictions.lm.test <- as.character(sapply(predict(mod, zip.test.filtered),
category_f))
predictions.lm.train <- as.character(sapply(predict(mod, zip.train.filtered),
category_f))

## KNN
knn.train <- zip.train.filtered[, 2:257]
knn.test <- zip.test.filtered[, 2:257]

knn.train.Y <- as.factor(zip.train.filtered$Y)
knn.test.Y <- as.factor(zip.test.filtered$Y)

# KNN Predictions
predictions.knn.test <- sapply(1:15, function(k) {
knn(train = knn.train,
test = knn.test,
cl = knn.train.Y,
k = k)
})
predictions.knn.train <- sapply(1:15, function(k) {
knn(train = knn.train,
test = knn.train,
cl = knn.train.Y,
k = k)
})

# Compute error rates
errors.xs <- 1:15

errors.knn.test <- apply(predictions.knn.test, 2, function(prediction) {
classError(prediction, as.factor(zip.test.filtered$Y))$errorRate
})
errors.knn.train <- apply(predictions.knn.train, 2, function(prediction) {
classError(prediction, as.factor(zip.train.filtered$Y))$errorRate
})
errors.lm.test <- sapply(errors.xs, function(k) {
classError(predictions.lm.test, as.factor(zip.test.filtered$Y))$errorRate
})
errors.lm.train <- sapply(errors.xs, function(k) {
classError(predictions.lm.train, as.factor(zip.train.filtered$Y))$errorRate
})

errors <- data.frame("K"=errors.xs,
"KNN.Train"=errors.knn.train,
"KNN.Test"=errors.knn.test,
"LR.Train"=errors.lm.train,
"LR.Test"=errors.lm.test)

# Create Plot
plot.data <- melt(errors, id="K")
ggplot(data=plot.data,
aes(x=K, y=value, colour=variable)) +
geom_line() +
xlab("k") +
ylab("Classification Error") +
opts(title="Classification Errors for different methods on zipcode data")
scale_colour_hue(name="Classification Method",
labels=c("k-NN (Train)",
"k-NN (Test)",
"Linear Regression (Train)",
"Linear Regression (Test)")
)
ggsave(file.path('graphs', 'exercise_2_8.pdf'))
ggsave(file.path('graphs', 'exercise_2_8.png'), width=8, height=5, dpi=300)
{% endhighlight %}

<img class="R" src="/images/exercise_2_8.png"/>

#### Exercise 2.10

> Consider a linear regression model with $p$ parameters, fitted by OLS to a set of training data $(x_i, y_i)_{1 \leq i \leq N}$ drawn at random from a population. Let $\hat \beta$ be the least squares estimate. Suppose we have some test data $(\tilde x_i, \tilde y_i)_{1 \leq i \leq M}$ drawn at random from the same population as the training data.
>
> If $R_{tr}(\beta) = \frac{1}{N} \sum_{i=1}^N \left(y_i - \beta^T x_i \right)^2$ and $R_{te}(\beta) = \frac{1}{M} \sum_{i=1}^M \left( \tilde y_i - \beta^T \tilde x_i \right)^2$, prove that \[
> E(R_{tr}(\hat \beta)) \leq E(R_{te}(\hat \beta))
> \] where the expectation is over all that is random in each expression.
> \] where the expectation is over all that is random in each expression.
