The cell below is taken directly from the provided code base.

In [0]:
# Suppress all R warnings for this session (this also hides warnings from H2O).
options(warn = -1)
library(h2o)
# If there is a proxy: proxy.old <- Sys.getenv('http_proxy'); Sys.setenv('http_proxy'='');
# Launch / connect to a local H2O instance on port 54321 with a 6G memory cap.
localH2O <- h2o.init(
  nthreads = -1,
  port = 54321,
  max_mem_size = "6G",
  startH2O = TRUE
)


# Students: Use the "absolute" path to the datasets on your machine (important)
# Load the three comma-separated data sets as H2O frames.
labeled.frame <- h2o.importFile(path = "Task2C_labeled.csv", sep = ",")
unlabeled.frame <- h2o.importFile(path = "Task2C_unlabeled.csv", sep = ",")
test.frame <- h2o.importFile(path = "Task2C_test.csv", sep = ",")

# The first column holds the class label: store it as a factor where it is
# known and blank it out for the unlabeled data.
labeled.frame[, 1] <- as.factor(labeled.frame$label)
unlabeled.frame[, 1] <- NA
# Stack labeled and unlabeled rows with the first (label) column removed.
train.frame <- h2o.rbind(labeled.frame[, -1], unlabeled.frame[, -1])
test.frame[, 1] <- as.factor(test.frame$label)

In [0]:
# libraries
library(reshape2)
library(ggplot2)

The error-rate function below is taken from the provided code base.

In [0]:
####################### GENERAL AUXILIARY FUNCTIONS #######################
## The following structure helps us to have functions with multiple outputs
### credit: https://stat.ethz.ch/pipermail/r-help/2004-June/053343.html

# Fraction of rows where the predicted labels differ from the true labels.
#
# Y1: predicted labels, T1: true labels. Both must be objects for which
# nrow() is defined (e.g. matrix, data.frame, H2OFrame) and must have the
# same number of rows; otherwise an error is raised.
error.rate <- function(Y1, T1){
  n.true <- nrow(T1)
  if (nrow(Y1) != n.true) {
    stop('error.rate: size of true lables and predicted labels mismatch')
  }
  n.wrong <- sum(T1 != Y1)
  n.wrong / n.true
}

The cell below is my own implementation.

In [0]:
# Sweep the hidden-layer width K and record, for every width:
#   1. the mean reconstruction error of an autoencoder trained on the
#      labeled + unlabeled data (train.frame),
#   2. the test error of a 3-layer classifier trained on labeled.frame,
#   3. the test error of the same classifier trained on labeled.frame
#      augmented with the hidden-layer activations of an autoencoder.
# Results are left in K, reconstruction_error, classification_error and
# augmented_error (one entry per width) for the plotting cell.

# Hidden-layer widths to evaluate; result vectors are preallocated instead of
# being grown with append() on every iteration.
K <- seq(20, 400, 20)
reconstruction_error <- numeric(length(K))
classification_error <- numeric(length(K))
augmented_error <- numeric(length(K))

# for each K value
for (i in seq_along(K)) {
  k <- K[i]

  ############ reconstruction error (PART 1) ############################################
  NN.model <- h2o.deeplearning(
    # train.frame was built without the label column, so every column is a
    # feature; starting at column 2 would silently drop the first feature.
    x = seq_len(ncol(train.frame)),
    training_frame = train.frame, # labeled + unlabeled rows
    hidden = c(k), # one hidden layer with k units
    epochs = 50, # maximum number of epochs
    activation = 'Tanh', # activation function
    autoencoder = TRUE, # unsupervised: reconstruct the input
    l2 = 0.1)

  # per-row reconstruction error (per_feature = FALSE gives one value per row)
  anomaly <- h2o.anomaly(NN.model, train.frame, per_feature = FALSE)
  # average reconstruction error over all training rows
  error <- mean(anomaly)
  reconstruction_error[i] <- error


  ############# classification error (PART 2) ###########################################
  NN.model.classification <- h2o.deeplearning(
    x = 2:ncol(labeled.frame), # all feature columns (column 1 is the label)
    y = 1,
    training_frame = labeled.frame,
    hidden = c(k), # one hidden layer with k units
    epochs = 50, # maximum number of epochs
    activation = 'Tanh', # activation function
    autoencoder = FALSE, # supervised classifier
    l2 = 0.1)

  # predicted class for every test row
  test.predict <- h2o.predict(NN.model.classification, test.frame)$predict
  # misclassification rate on the test set
  classification.error <- error.rate(test.predict, test.frame$label)
  classification_error[i] <- classification.error


  ################ augmented (PART 3) ###################################################
  NN.model.labelled <- h2o.deeplearning(
    x = 2:ncol(labeled.frame), # all feature columns (column 1 is the label)
    training_frame = labeled.frame,
    hidden = c(k), # one hidden layer with k units
    epochs = 50, # maximum number of epochs
    activation = 'Tanh', # activation function
    autoencoder = TRUE, # unsupervised: reconstruct the input
    l2 = 0.1)

  # Hidden-layer activations of the autoencoder are the extra features.
  # h2o.deepfeatures already returns an H2OFrame, so it is bound directly
  # instead of being pulled into R with as.matrix() and pushed back.
  labeled.features <- h2o.deepfeatures(NN.model.labelled, labeled.frame, layer = 1)
  extra.feature <- h2o.cbind(labeled.frame, labeled.features)

  # The test set must carry the same extra feature columns as the training
  # frame; otherwise the augmented model is scored with those inputs missing.
  test.features <- h2o.deepfeatures(NN.model.labelled, test.frame, layer = 1)
  test.augmented <- h2o.cbind(test.frame, test.features)

  # 3-layer NN with original + extra feature from autoencoder
  NN.model.augmented <- h2o.deeplearning(
    x = 2:ncol(extra.feature), # original features + autoencoder features
    y = 1,
    training_frame = extra.feature,
    hidden = c(k), # one hidden layer with k units
    epochs = 50, # maximum number of epochs
    activation = 'Tanh', # activation function
    autoencoder = FALSE, # supervised classifier
    l2 = 0.1)

  # predicted class for every (augmented) test row
  predict.test <- h2o.predict(NN.model.augmented, test.augmented)$predict
  # misclassification rate on the test set
  classification.error2 <- error.rate(predict.test, test.frame$label)
  augmented_error[i] <- classification.error2
}

Plotting for II. and VI.

In [0]:
# PLOT 1
######################### reconstruction error vs middle hidden layer error ##################################

# One row per hidden-layer width K, holding all three error measures.
error.data <- data.frame(
  K,
  reconstruction_error,
  classification_error,
  augmented_error
)
# To save the plot to a file, uncomment the pdf()/dev.off() pair around it.
#pdf(file = "reconstruction error vs middle hidden layer Question 3 III")
ggplot(data = error.data, aes(x = K, y = reconstruction_error)) +
  geom_line() +
  ggtitle("reconstruction error vs middle hidden layer error")
#dev.off()



# PLOT 2
######################### 3 layer error vs augmented error #####################################################

# Keep only the two classification errors alongside K.
error.plot <- data.frame(K, classification_error, augmented_error)
# Reshape to long format for ggplot: one row per (K, error type) pair, with
# the columns named K, Type and Error.
error.plot.data <- melt(
  error.plot,
  id.vars = "K",
  variable.name = "Type",
  value.name = "Error"
)
# pdf(file = "3 layer error vs augmented error Question 3 VI")
# One line per error type across the hidden-layer widths.
ggplot(data = error.plot.data, aes(x = K, y = Error, color = Type)) +
  geom_line() +
  ggtitle("3 layer error vs augmented error")
# dev.off()

The objective of an autoencoder is to minimise the reconstruction error between its input and its output, which forces it to learn the key features present in the data. The lower the reconstruction error, the more of the input's information is retained.
As per the plot, if the number of units in the hidden layer is restricted to about 200-250, we retain as much information as possible from the input data, since the reconstruction error rises sharply beyond 200-250 units.