# Lecture 3 - Hyperparameter tuning for the lung dataset CART model

## Again, use the *lung* dataset from the `epi7913A` package

In [9]:
# Take a working copy of the lung dataset shipped with the epi7913A package
# and preview its first six rows.
clung <- epi7913A::lung
head(clung, n = 6L)

Unnamed: 0_level_0,inst,time,status,age,sex,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss,status.category,sex.category
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>
1,3,306,2,74,1,1,90,100,1175.0,,dead,Male
2,3,455,2,68,1,0,90,90,1225.0,15.0,dead,Male
3,3,1010,1,56,1,0,90,90,,15.0,censored,Male
4,5,210,2,57,1,1,90,60,1150.0,11.0,dead,Male
5,1,883,2,60,1,0,100,90,,0.0,dead,Male
6,12,1022,1,74,1,1,50,80,513.0,0.0,censored,Male


### Remove the two categorical columns we added last time!

In [10]:
# Clean up: remove the two extra categorical columns added in the previous
# class. They are dropped by name rather than by position (-11:-12) so that
# re-running this cell, or a change in column order, cannot silently remove a
# different column.
clung <- clung[
  ,
  !(names(clung) %in% c("status.category", "sex.category")),
  drop = FALSE
]
# Keep only the rows that have no NA in any of the remaining columns.
clung <- clung[complete.cases(clung), ]
head(clung)

Unnamed: 0_level_0,inst,time,status,age,sex,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
2,3,455,2,68,1,0,90,90,1225,15
4,5,210,2,57,1,1,90,60,1150,11
6,12,1022,1,74,1,1,50,80,513,0
7,7,310,2,68,2,2,70,60,384,10
8,11,361,2,71,2,2,60,80,538,1
9,1,218,2,53,1,1,70,80,825,16


## Initialize the data-splitting ratio and the number of iterations for randomly repeating the experiment

In [11]:
# Initialize parameters for the experiment.
train_data_split <- 0.6  # fraction of rows used for training; the rest is held out
iterations <- 1000       # number of repeated random runs (1000, as in the notes)

## Initialize the outcome (alive_at_cutoff) as a binary indicator of whether survival time >= survival_cutoff

In [12]:
# Survival cutoff in days: half a year (365 / 2 = 182.5 days).
survival_cutoff <- 365 / 2
# Construct the outcome: 1 for those whose survival time is at or above the
# cutoff value, 0 otherwise. The comparison is vectorised over the whole
# column; as.numeric() turns TRUE/FALSE into the doubles 1/0.
clung$alive_at_cutoff <- as.numeric(clung$time >= survival_cutoff)

head(clung)

Unnamed: 0_level_0,inst,time,status,age,sex,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss,alive_at_cutoff
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>
2,3,455,2,68,1,0,90,90,1225,15,1
4,5,210,2,57,1,1,90,60,1150,11,1
6,12,1022,1,74,1,1,50,80,513,0,1
7,7,310,2,68,2,2,70,60,384,10,1
8,11,361,2,71,2,2,60,80,538,1,1
9,1,218,2,53,1,1,70,80,825,16,1


## Set the random seed for reproducibility

In [17]:
# Fix the random number generator's starting state so that the random steps
# that follow give the same result on every run.
set.seed(seed = 17)

## Define the function that builds the tree and computes logloss

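The function takes the training data, the validation data, and the rpart hyperparameters (`minsplit`, `minbucket`, `maxdepth`, `cp`) as input.
It returns a list with the logloss computed on the validation data and the fitted model (so we can plot it).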

In [18]:
## build tree function
# Fit a classification tree on train.data and score it on valid.data.
#
# Arguments:
#   train.data  data frame used to fit the tree; must contain alive_at_cutoff
#               (coded 0/1) and the predictors age, sex, meal.cal and wt.loss
#   valid.data  data frame with the same columns, used to compute the logloss
#   minsplit, minbucket, maxdepth, cp
#               tree-growing settings, forwarded to rpart::rpart.control()
#
# Returns a list with two elements:
#   logloss  the log loss of the predicted probabilities of class 1 on valid.data
#   model    the fitted rpart object
build.tree <- function(train.data, valid.data, minsplit, minbucket, maxdepth, cp) {
  # Build the control object through rpart.control() so the settings are
  # passed as the list that rpart documents, not as a bare numeric vector.
  tree_control <- rpart::rpart.control(minsplit = minsplit,
                                       minbucket = minbucket,
                                       maxdepth = maxdepth,
                                       cp = cp)

  # construct a decision tree classification model
  tree_model <- rpart::rpart(alive_at_cutoff ~ age + sex + meal.cal + wt.loss,
                             data = train.data, method = "class",
                             control = tree_control)

  # predict the class probabilities on the validation data
  tree_predictions <- predict(tree_model, valid.data, type = "prob")

  # Select the probability of class 1 by column name instead of by position,
  # so the right column is used even if the class columns are not the usual
  # "0", "1" pair.
  if (!("1" %in% colnames(tree_predictions))) {
    stop("predicted probabilities have no column for class 1", call. = FALSE)
  }

  # calculate log loss for the validation data
  logloss <- MLmetrics::LogLoss(
    tree_predictions[, "1"],
    as.numeric(as.character(valid.data$alive_at_cutoff))
  )

  list(logloss = logloss, model = tree_model)
}

## Split the Dataset

In [19]:
# Split the dataset.
# Partition clung into train, validation and test portions, stratified on
# alive_at_cutoff. train_data_split goes to training and the remainder is
# shared equally between validation and test (0.6 / 0.2 / 0.2 for the value
# set above), so changing train_data_split now actually changes the split.
holdout_share <- (1 - train_data_split) / 2
inds <- splitTools::partition(
  clung$alive_at_cutoff,
  p = c(train = train_data_split, valid = holdout_share, test = holdout_share),
  type = "stratified"
)
train <- clung[inds$train, ]  # retrieve the training data
valid <- clung[inds$valid, ]  # retrieve the validation data
test <- clung[inds$test, ]    # retrieve the test data

## Perform grid search hyperparameter tuning

In [20]:
# Define the hyperparameter grid: every combination of the values below
# (2 x 3 x 4 x 4 = 96 rows).
# NOTE(review): minbucket = 200 requires at least 200 rows in each leaf, so no
# split is possible unless the training data has 400+ rows -- confirm whether
# 2 or 20 was intended.
gr <- expand.grid(
  minsplit = c(5, 20),
  minbucket = c(200, 5, 10),
  maxdepth = c(1, 3, 5, 7),
  cp = c(0.001, 0.01, 0.05, 0.1)
)

# print(gr) ### this is an example of how to print the grid

# Validation logloss for every row of the grid. vapply() guarantees a numeric
# vector with one value per row, and the grid columns are passed by name so a
# reordering of the grid cannot mix up the hyperparameters.
eval.results <- vapply(seq_len(nrow(gr)), function(i) {
  build.tree(train, valid,
             minsplit = gr$minsplit[i],
             minbucket = gr$minbucket[i],
             maxdepth = gr$maxdepth[i],
             cp = gr$cp[i])$logloss
}, numeric(1))

# print(eval.results)

# Row with the smallest validation logloss (the first one in case of ties).
best.index <- which.min(eval.results)
best.param <- gr[best.index, ]

cat("Best parameters:\n"); print(best.param)

# Refit with the best settings on train + validation and score on the test set.
test.result <- build.tree(rbind(train, valid), test,
                          minsplit = gr$minsplit[best.index],
                          minbucket = gr$minbucket[best.index],
                          maxdepth = gr$maxdepth[best.index],
                          cp = gr$cp[best.index])

# NOTE(review): the value printed here is the test-set logloss of the refitted
# model, not the minimum validation logloss found during the search.
cat("\nThe lowest logloss is:");
print(test.result$logloss); cat("\n")

Best parameters:
  minsplit minbucket maxdepth    cp
1        5       200        1 0.001

The lowest logloss is:[1] 0.6137554

