# Lecture 3 - Build a K-nearest neighbors model from the lung dataset

## Consider the *lung* data set available form the EPI7913A package
### Load the lung data set form last class

In [1]:
clung <- epi7913A::lung
head(clung)

Unnamed: 0_level_0,inst,time,status,age,sex,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss,status.category,sex.category
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>
1,3,306,2,74,1,1,90,100,1175.0,,dead,Male
2,3,455,2,68,1,0,90,90,1225.0,15.0,dead,Male
3,3,1010,1,56,1,0,90,90,,15.0,censored,Male
4,5,210,2,57,1,1,90,60,1150.0,11.0,dead,Male
5,1,883,2,60,1,0,100,90,,0.0,dead,Male
6,12,1022,1,74,1,1,50,80,513.0,0.0,censored,Male


### Remove the two categorical columns we added last time!

In [2]:
# clean up – remove the extra columns we added in the previous class!
# and make a copy of data to clung and remove and rows that contain NA
clung <- clung[,-11:-12]
clung <- clung[complete.cases(clung),]
head(clung)

Unnamed: 0_level_0,inst,time,status,age,sex,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
2,3,455,2,68,1,0,90,90,1225,15
4,5,210,2,57,1,1,90,60,1150,11
6,12,1022,1,74,1,1,50,80,513,0
7,7,310,2,68,2,2,70,60,384,10
8,11,361,2,71,2,2,60,80,538,1
9,1,218,2,53,1,1,70,80,825,16


## Initialize data splitting ratio, the number of iterations to randomly repeat the experiment 

In [3]:
# initialize parameters for the experiment
train_data_split = 0.6   # 0.6 for training and 1-0.6 for testing
iterations = 1000        # 10 repeated runs -- this is set to 1000 in the notes!

## Initialize the outcome (alive_at_cutoff) as binary indicating if survival >= survival_cutoff 

In [4]:
survival_cutoff = 365  # survival cutoff is 1 year = 365 days
# construct the outcome of 1 for those who survive on or above 
# the cutoff value, 0 otherwise
clung <- within(clung, { 
  alive_at_cutoff <- NA; #initialize a new column
  alive_at_cutoff[time >= survival_cutoff] <- 1
  alive_at_cutoff[time < survival_cutoff ] <- 0 
} )

head(clung)

Unnamed: 0_level_0,inst,time,status,age,sex,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss,alive_at_cutoff
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>
2,3,455,2,68,1,0,90,90,1225,15,1
4,5,210,2,57,1,1,90,60,1150,11,0
6,12,1022,1,74,1,1,50,80,513,0,1
7,7,310,2,68,2,2,70,60,384,10,0
8,11,361,2,71,2,2,60,80,538,1,0
9,1,218,2,53,1,1,70,80,825,16,0


## set the random seed for reproducability and allocate the results table

In [5]:
# set the random seed for reproducibility 
set.seed(17) 

## KNN Model Construction:
- ### Initialize local variables to capture results and counters
- ### Start the loop over K-nearest neighbours values 1 to 15
    - ### For the number of iterations:
        - ### Partition *clung* into train and test portions as defined by train_data_split and stratified to the ratio of *Yes:No* in alive_at_cutoff
        - ### Retrieve train and test data
        - ### Construct KNN model from "alive_at_cutoff ~ age + sex"  
        - ### Calculate the log loss of predicted probabilities to actual values in training data (find goodness of fit)
        - ### Using the resulting model, predict the probabilities on the test data
        - ### Calculate how well predicted probabilities fit actual values for test data
        - ### Record the goodness of fit on training and logloss on test

In [6]:
# initialize local variables to capture results and counters
logloss <- NA
ks <- NA
tags <- ""
cnt =1

# start the loop over K values 1 to 15
for (k in 1:15){

  # for the number of iterations:
  for (i in 1:iterations) {
  
    #     partition clung into train and test portions as defined by train_data_split and stratified to the ratio of Yes:No in alive_at_cutoff
    inds <- splitTools::partition(clung$alive_at_cutoff, p = c(train = train_data_split, test = (1 - train_data_split)), type="stratified")
  
    #     retrieve train and test data
    train <- clung[inds$train, ]
    test  <- clung[inds$test, ]
  
    # construct KNN model from "alive_at_cutoff ~ age + sex"  
    knn_model <- class::knn(train[,c(4,5,11)],train[,c(4,5,11)], train$alive_at_cutoff, k, prob=T)
  
    # calculate the log loss of predicted probabilities to actual values in training data (find goodness of fit)
    r1 <- MLmetrics::LogLoss(attributes(knn_model)$prob, train$alive_at_cutoff)
    logloss[cnt] <- r1
    tags[cnt] <- "train"
    ks[cnt] <- k
    cnt <- cnt + 1
  
    # using th resulting model, predict the probabilities on the test data
    knn_model <- class::knn(train[,c(4,5,11)],test[,c(4,5,11)], train$alive_at_cutoff, k, prob=T)
  
    # calculate how well predicted probabilities fit actual values for test data
    r2 <- MLmetrics::LogLoss(attributes(knn_model)$prob, test$alive_at_cutoff)
  
    # record the goodness of fit on training and logloss on test
    logloss[cnt] <- r2
    tags[cnt] <- "test"
    ks[cnt] <- k
    cnt <- cnt + 1
  }
}

head(data.frame(logloss,tags,ks))

Unnamed: 0_level_0,logloss,tags,ks
Unnamed: 0_level_1,<dbl>,<chr>,<int>
1,24.2803,train,1
2,23.62341,test,1
3,24.2803,train,1
4,19.56235,test,1
5,24.42192,train,1
6,20.94561,test,1


## plot the boxplots of log(loss) values for train and test data as collected in the runs

##### graphics::boxplot(logloss ~ ks, sub = tags == "train", at = 1:15 - 0.2, col = "green", boxwex = 0.25, xlab ="K", ylab="logLoss")
graphics::boxplot(logloss ~ ks, sub = tags == "test", at = 1:15 + 0.2, col = "blue", add = T, boxwex = 0.25)
graphics::legend("topright", c("train","test"), fill = c("green","blue"))