In [1]:
library(caret)

Loading required package: lattice
Loading required package: ggplot2


In [2]:
# Resolve the CSV files relative to the current working directory.
script_dir <- getwd()

# Both files are semicolon-delimited and carry a header row.
data_red <- read.csv(file.path(script_dir, "winequality-red.csv"),
                     header = TRUE, sep = ";")

data_white <- read.csv(file.path(script_dir, "winequality-white.csv"),
                       header = TRUE, sep = ";")

# Stack the red and white wine rows into a single data set, as requested.
data <- rbind(data_red, data_white)

In [3]:
# Relative frequency of every quality score in the combined data set.
prop.table(table(data[["quality"]]))


          3           4           5           6           7           8 
0.004617516 0.033246114 0.329074958 0.436509158 0.166076651 0.029706018 
          9 
0.000769586 

In [4]:
# Shuffle the rows reproducibly, then cut the data into three consecutive
# parts: 40% for the base models, 40% for the blender, and the rest for testing.
set.seed(1234)
data <- data[sample(nrow(data)), ]

n_rows <- nrow(data)
split <- floor(n_rows / 5) # size of one fifth of the data

# Build the index vectors with seq_len() instead of `a:b`: R indexing is
# 1-based (a leading 0 is silently dropped), and `a:b` counts backwards when
# b < a, which would select the wrong rows for very small inputs.
ensemble_idx <- seq_len(split * 2)
blender_idx <- split * 2 + seq_len(split * 2)
testing_idx <- split * 4 + seq_len(n_rows - split * 4)

ensembleData <- data[ensemble_idx, ] # first 40% of the rows
blenderData <- data[blender_idx, ]   # next 40% of the rows
testingData <- data[testing_idx, ]   # remaining ~20% of the rows

In [5]:
# Name of the target column; every other column is used as a predictor.
labelName <- "quality"
predictors <- setdiff(names(ensembleData), labelName)

In [6]:
# Resampling scheme shared by all models: 3-fold cross validation,
# without keeping the per-resample performance results.
myControl <- trainControl(
  method = "cv",
  number = 3,
  returnResamp = "none"
)

In [7]:
# Quick benchmark: fit a single plsRglm model on the blender split and
# predict the held-out testing split.
test_model <- train(
  x = blenderData[, predictors],
  y = blenderData[[labelName]],
  method = "plsRglm",
  trControl = myControl
)
preds <- predict(test_model, newdata = testingData[, predictors])


Loading required package: plsRglm


____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Component____ 2 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Component____ 2 ____
____Component____ 3 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****_______________________

In [8]:
# Train the three base (ensemble) models on ensembleData. All of them use
# plsRglm with the shared cross-validation control; they differ only in the
# pre-processing applied to the predictors.
# The argument is spelled out as `preProcess` rather than relying on partial
# matching of `preProc`.

# Model 1: exponential transformation of the predictors.
model_1 <- train(ensembleData[, predictors], ensembleData[, labelName],
                 method = "plsRglm", trControl = myControl,
                 preProcess = "expoTrans")

# Model 2: principal component analysis of the predictors.
# (c("center", "scale") is another possible value for `preProcess`.)
model_2 <- train(ensembleData[, predictors], ensembleData[, labelName],
                 method = "plsRglm", trControl = myControl,
                 preProcess = "pca")

# Model 3: Yeo-Johnson transformation of the predictors.
model_3 <- train(ensembleData[, predictors], ensembleData[, labelName],
                 method = "plsRglm", trControl = myControl,
                 preProcess = "YeoJohnson")


____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Component____ 2 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Component____ 2 ____
____Component____ 3 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****_______________________

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
No more significant predictors (<0.01) found
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
No more significant predictors (<0.01) found
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****________

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Component____ 2 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Component____ 2 ____
____Component____ 3 ____
____Predicting X without NA neither in X nor in Y____
****________________________________________________****

____************************************************____

Family: gaussian 
Link function: identity 

____Component____ 1 ____
____Predicting X without NA neither in X nor in Y____
****_______________________

In [9]:
# Score the blender and testing splits with every base model and store the
# predictions as new columns (q1, q2, q3) on those same data frames.
ensemble_models <- list(q1 = model_1, q2 = model_2, q3 = model_3)

for (pred_col in names(ensemble_models)) {
  fitted_model <- ensemble_models[[pred_col]]
  blenderData[[pred_col]] <- predict(fitted_model,
                                     newdata = blenderData[, predictors])
  testingData[[pred_col]] <- predict(fitted_model,
                                     newdata = testingData[, predictors])
}


In [10]:
# Mean absolute error of each base model on the testing split.
with(
  testingData,
  data.frame(
    MAE1 = caret::MAE(q1, quality),
    MAE2 = caret::MAE(q2, quality),
    MAE3 = caret::MAE(q3, quality)
  )
)


MAE1,MAE2,MAE3
<dbl>,<dbl>,<dbl>
0.6117353,0.5689637,0.5770847


In [11]:
# Blend everything: the predictor set now covers every column of blenderData
# except the label, i.e. the original features plus the q1/q2/q3 predictions.
predictors <- setdiff(names(blenderData), labelName)

# Fit the gbm blender on the blender split with the shared CV control.
final_blender_model <- train(
  x = blenderData[, predictors],
  y = blenderData[[labelName]],
  method = "gbm",
  trControl = myControl
)



Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.7078             nan     0.1000    0.0302
     2        0.6825             nan     0.1000    0.0265
     3        0.6628             nan     0.1000    0.0214
     4        0.6427             nan     0.1000    0.0184
     5        0.6260             nan     0.1000    0.0145
     6        0.6134             nan     0.1000    0.0120
     7        0.5995             nan     0.1000    0.0120
     8        0.5905             nan     0.1000    0.0075
     9        0.5825             nan     0.1000    0.0082
    10        0.5742             nan     0.1000    0.0073
    20        0.5301             nan     0.1000    0.0026
    40        0.5050             nan     0.1000   -0.0005
    60        0.4940             nan     0.1000   -0.0005
    80        0.4857             nan     0.1000   -0.0000
   100        0.4777             nan     0.1000   -0.0006
   120        0.4717             nan     0.1000   -0.0001
   140        

In [12]:
# Predict the testing split with the blended model (scored with MAE below).
preds <- predict(final_blender_model, newdata = testingData[, predictors])

In [13]:
# Wrap the prediction vector in a data frame (single column named `preds`).
preds <- as.data.frame(preds)

In [16]:
# Compare the MAE of each base model with the MAE of the blended model
# on the testing split.
with(
  testingData,
  data.frame(
    MAE1 = caret::MAE(q1, quality),
    MAE2 = caret::MAE(q2, quality),
    MAE3 = caret::MAE(q3, quality),
    MAE_ALL = caret::MAE(preds$preds, quality)
  )
)



MAE1,MAE2,MAE3,MAE_ALL
<dbl>,<dbl>,<dbl>,<dbl>
0.6117353,0.5689637,0.5770847,0.5354631
