# Lecture 4 - Hyperparameter tuning (optimization) for CART model

## Load the CCHS data from the epi7913A package

In [29]:
library(magrittr)

# Draw a random 10% sample of the CCHS rows and preview the first few.
# No seed is set in this cell, so the sampled rows differ between runs.
full_data <- dplyr::slice_sample(epi7913A::cchs, prop = 0.1)
head(full_data)

Unnamed: 0_level_0,age,sex,CANHEARTbin,householdsize,education,maritalstatus,immigration,houseincome
Unnamed: 0_level_1,<fct>,<fct>,<int>,<fct>,<fct>,<fct>,<int>,<fct>
1,4,2,0,1,1,1,0,3
2,7,2,1,1,4,2,0,1
3,2,2,0,2,4,3,1,1
4,3,1,1,2,4,1,0,1
5,7,1,1,2,2,3,0,2
6,6,1,0,5,2,3,0,1


## Split the data into training and testing sets

In [24]:
# Create train and test data (70% / 30%).
# Stratify on the outcome column so both partitions keep a similar outcome
# distribution; a constant vector would leave partition() nothing to
# stratify on.
idx <- splitTools::partition(
  full_data[["CANHEARTbin"]],
  p = c(train = 0.7, test = 0.3),
  type = "stratified"
)
train_data <- full_data[idx$train, ]
test_data <- full_data[idx$test, ]

## Define the outcome variable

In [25]:
# Name of the outcome column; it is passed to the model-fitting calls and used
# to look up the observed values when the predictions are scored.
voutcome <- "CANHEARTbin"

## Train a CART model with optimal hyperparameters

In [26]:
# Train a CART model with optimal hyperparameters on the training data.
# NOTE(review): the tuning procedure lives inside the sdgm helper and is not
# visible here.
best_model <- sdgm::cart.bestmodel.bin(train_data, voutcome)
# Fit the second model on the same training data and outcome column.
# NOTE(review): presumably a logistic regression ("lr") -- confirm against sdgm.
lr_model <- sdgm::lr.bestmodel.bin(train_data, voutcome)


 Best Parameters Found: 
Round = 36	minsplit = 5.0000	minbucket = 5.0000	cp = 0.0010	maxdepth = 4.0000	Value = -0.5982682 


## Predict on the test data using the generic `predict()` function

In [27]:
# Predict on the test data with the tuned CART model. predict() is a generic
# function, so the method that runs depends on the class of `best_model`.
# NOTE(review): `preds` is later passed to LogLoss as the predicted values, so
# it presumably holds probabilities -- confirm against the sdgm predict method.
preds <- predict(best_model, test_data)

## Calculate the log loss of the predictions made on the test data

In [28]:
# Log loss of the test-set predictions against the observed outcome.
# `[[` extracts the outcome as a plain vector whether `test_data` is a
# data.frame or a tibble (`[, col]` on a tibble returns a one-column tibble).
if (!is.null(preds)) {
  test_logloss <- MLmetrics::LogLoss(preds, test_data[[voutcome]])
} else {
  # No predictions available: record NA so the summary print below still runs.
  test_logloss <- NA
  print("Logloss calculation failed because there are no predicted values")
}

print(paste0("Logloss on CCHS Data is: ", test_logloss))

[1] "Logloss on CCHS Data is: 0.598588742692058"
