## Dummy Vars

In [36]:
source("1-data_sets.R")

library(tibble)

# use unordered levels
tmp <- train.na %>%
    mutate(Income = factor(Income, ordered = FALSE),
           EducationLevel = factor(EducationLevel, ordered = FALSE))

dv <- dummyVars(~ ., data = tmp[, -7])
train.dv <- predict(dv, newdata = tmp[, -7]) %>% as_data_frame()

tmp <- valid.na %>% mutate(Income = factor(Income, ordered = FALSE), EducationLevel = factor(EducationLevel, ordered = FALSE))

valid.dv <- predict(dv, newdata = tmp) %>% as_data_frame()

rm(tmp)

#### Make a data set without highly correlated variables

In [37]:
hcor <- cor(train.dv, use = "na.or.complete")
hc <- findCorrelation(hcor)
train.hc <- train.dv[, -hc]
valid.hc <- valid.dv[, -hc]

In cor(train.dv, use = "na.or.complete"): the standard deviation is zero

In [39]:
str(train.hc)

Classes 'tbl_df', 'tbl' and 'data.frame':	4455 obs. of  1 variable:
 $ NA: NULL


In [None]:
train.dv$Party <- train.party

#### Set seeds

In [4]:
set.seed(123)
seeds <- vector(mode = "list", length = 51)
for(i in 1:50) seeds[[i]] <- sample.int(1000, 22)

seeds[[51]] <- sample.int(1000, 1)

#### Set train control parameters

In [5]:
trCtrl <- trainControl(method = "repeatedcv", repeats = 5, seeds = seeds, classProbs = TRUE, returnResamp = "final")

### Fit Models

In [6]:
library(doParallel)
registerDoParallel()

Loading required package: foreach

Attaching package: 'foreach'

The following objects are masked from 'package:purrr':

    accumulate, when

Loading required package: iterators
Loading required package: parallel


#### GLM

In [8]:
f

In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading

In [9]:
glm1

In gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), : argument 'replacement' has length > 1 and only the first element will be used

Generalized Linear Model 

4455 samples
 224 predictor
   2 classes: 'Democrat', 'Republican' 

Pre-processing: Box-Cox transformation (1), YOB nearest neighbor
 imputation (220), centered (220), scaled (220), remove (4) 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 499, 499, 499, 498, 498, 499, ... 
Resampling results:

  Accuracy   Kappa    
  0.5570649  0.1140379

 

In [10]:
cm.glm1

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        360        234
  Republican      230        289
                                          
               Accuracy : 0.5831          
                 95% CI : (0.5535, 0.6123)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 0.0002131       
                                          
                  Kappa : 0.1628          
 Mcnemar's Test P-Value : 0.8892356       
                                          
            Sensitivity : 0.6102          
            Specificity : 0.5526          
         Pos Pred Value : 0.6061          
         Neg Pred Value : 0.5568          
             Prevalence : 0.5301          
         Detection Rate : 0.3235          
   Detection Prevalence : 0.5337          
      Balanced Accuracy : 0.5814          
                                          
       'Positive' Class : Democrat        
                             

In [11]:
glm2 <- train(Party ~ ., data = train.hc[, -1], method = "glm", trControl = trCtrl, preProcess = c("nzv", "BoxCox", "knnImpute"))
pred.glm2 <- predict(glm2, newdata = valid.dv, na.action = na.pass)
cm.glm2 <- confusionMatrix(pred.glm2, valid.party)

In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading

In [12]:
glm2

In gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), : argument 'replacement' has length > 1 and only the first element will be used

Generalized Linear Model 

4455 samples
 122 predictor
   2 classes: 'Democrat', 'Republican' 

Pre-processing: Box-Cox transformation (1), YOB nearest neighbor
 imputation (118), centered (118), scaled (118), remove (4) 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 499, 499, 499, 498, 499, 498, ... 
Resampling results:

  Accuracy   Kappa    
  0.5621039  0.1238417

 

In [13]:
cm.glm2

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        361        242
  Republican      229        281
                                          
               Accuracy : 0.5768          
                 95% CI : (0.5472, 0.6061)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 0.000967        
                                          
                  Kappa : 0.1494          
 Mcnemar's Test P-Value : 0.580311        
                                          
            Sensitivity : 0.6119          
            Specificity : 0.5373          
         Pos Pred Value : 0.5987          
         Neg Pred Value : 0.5510          
             Prevalence : 0.5301          
         Detection Rate : 0.3243          
   Detection Prevalence : 0.5418          
      Balanced Accuracy : 0.5746          
                                          
       'Positive' Class : Democrat        
                             

#### Random Forest

In [14]:
set.seed(1056)
rf1 <- train(Party ~ ., data = train.dv[, -1], method = "rf", trControl = trCtrl, preProcess = c("nzv", "BoxCox", "knnImpute"))
pred.rf1 <- predict(rf1, newdata = valid.dv, na.action = na.pass)
cm.rf1 <- confusionMatrix(pred.rf1, valid.party)

Loading required package: randomForest
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:ggplot2':

    margin

The following object is masked from 'package:dplyr':

    combine



In [15]:
rf1

In gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), : argument 'replacement' has length > 1 and only the first element will be used

Random Forest 

4455 samples
 224 predictor
   2 classes: 'Democrat', 'Republican' 

Pre-processing: Box-Cox transformation (1), YOB nearest neighbor
 imputation (220), centered (220), scaled (220), remove (4) 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 499, 499, 499, 498, 498, 499, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
    2   0.5855909  0.1737593
  113   0.6082468  0.2187510
  224   0.5992208  0.2006682

Accuracy was used to select the optimal model using  the largest value.
The final value used for the model was mtry = 113. 

In [16]:
cm.rf1

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        285        153
  Republican      305        370
                                          
               Accuracy : 0.5885          
                 95% CI : (0.5589, 0.6176)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 5.088e-05       
                                          
                  Kappa : 0.1874          
 Mcnemar's Test P-Value : 1.717e-12       
                                          
            Sensitivity : 0.4831          
            Specificity : 0.7075          
         Pos Pred Value : 0.6507          
         Neg Pred Value : 0.5481          
             Prevalence : 0.5301          
         Detection Rate : 0.2561          
   Detection Prevalence : 0.3935          
      Balanced Accuracy : 0.5953          
                                          
       'Positive' Class : Democrat        
                             

#### LDA

In [19]:
set.seed(1056)
lda1 <- train(Party ~ ., data = train.dv[, -1], method = "lda", trControl = trCtrl, preProcess = c("nzv", "BoxCox", "knnImpute"))
pred.lda1 <- predict(lda1, newdata = valid.dv, na.action = na.pass)
cm.lda1 <- confusionMatrix(pred.lda1, valid.party)

Loading required package: MASS

Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

In lda.default(x, grouping, ...): variables are collinear

In [20]:
lda1

In gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), : argument 'replacement' has length > 1 and only the first element will be used

Linear Discriminant Analysis 

4455 samples
 224 predictor
   2 classes: 'Democrat', 'Republican' 

Pre-processing: Box-Cox transformation (1), YOB nearest neighbor
 imputation (220), centered (220), scaled (220), remove (4) 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 499, 499, 499, 498, 498, 499, ... 
Resampling results:

  Accuracy   Kappa   
  0.5574416  0.114963

 

In [21]:
cm.lda1

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        347        229
  Republican      243        294
                                          
               Accuracy : 0.5759          
                 95% CI : (0.5463, 0.6052)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 0.001184        
                                          
                  Kappa : 0.15            
 Mcnemar's Test P-Value : 0.549591        
                                          
            Sensitivity : 0.5881          
            Specificity : 0.5621          
         Pos Pred Value : 0.6024          
         Neg Pred Value : 0.5475          
             Prevalence : 0.5301          
         Detection Rate : 0.3118          
   Detection Prevalence : 0.5175          
      Balanced Accuracy : 0.5751          
                                          
       'Positive' Class : Democrat        
                             

In [22]:
set.seed(1056)
lda2 <- train(Party ~ ., data = train.hc[, -1], method = "lda", trControl = trCtrl, preProcess = c("nzv", "BoxCox", "knnImpute"))
pred.lda2 <- predict(lda2, newdata = valid.dv, na.action = na.pass)
cm.lda2 <- confusionMatrix(pred.lda2, valid.party)

In lda.default(x, grouping, ...): variables are collinear

In [23]:
lda2

In gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), : argument 'replacement' has length > 1 and only the first element will be used

Linear Discriminant Analysis 

4455 samples
 122 predictor
   2 classes: 'Democrat', 'Republican' 

Pre-processing: Box-Cox transformation (1), YOB nearest neighbor
 imputation (118), centered (118), scaled (118), remove (4) 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 499, 499, 499, 498, 498, 499, ... 
Resampling results:

  Accuracy   Kappa   
  0.5574416  0.114963

 

In [24]:
cm.lda2

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        344        235
  Republican      246        288
                                          
               Accuracy : 0.5678          
                 95% CI : (0.5381, 0.5972)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 0.006271        
                                          
                  Kappa : 0.1336          
 Mcnemar's Test P-Value : 0.648418        
                                          
            Sensitivity : 0.5831          
            Specificity : 0.5507          
         Pos Pred Value : 0.5941          
         Neg Pred Value : 0.5393          
             Prevalence : 0.5301          
         Detection Rate : 0.3091          
   Detection Prevalence : 0.5202          
      Balanced Accuracy : 0.5669          
                                          
       'Positive' Class : Democrat        
                             

In [26]:
set.seed(1056)
binda1 <- train(Party ~ ., data = train.hc[, -1], method = "binda", trControl = trCtrl, preProcess = c("nzv", "BoxCox", "knnImpute"))
pred.binda1 <- predict(binda1, newdata = valid.dv, na.action = na.pass)
cm.binda1 <- confusionMatrix(pred.binda1, valid.party)

In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled performance measures.

Something is wrong; all the Accuracy metric values are missing:
    Accuracy       Kappa    
 Min.   : NA   Min.   : NA  
 1st Qu.: NA   1st Qu.: NA  
 Median : NA   Median : NA  
 Mean   :NaN   Mean   :NaN  
 3rd Qu.: NA   3rd Qu.: NA  
 Max.   : NA   Max.   : NA  
 NA's   :3     NA's   :3    


ERROR: Error in train.default(x, y, weights = w, ...): Stopping


In [None]:
binda1

In [None]:
cm.binda1

## Model Comparison

In [17]:
resamp <- resamples(list(glm1 = glm1, glm2 = glm2, rf1 = rf1))

In resamples.default(list(glm1 = glm1, glm2 = glm2, rf1 = rf1)): 'rf1' did not have 'returnResamp="final"; the optimal tuning parameters are used

In [18]:
summary(resamp)


Call:
summary.resamples(object = resamp)

Models: glm1, glm2, rf1 
Number of resamples: 50 

Accuracy 
       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
glm1 0.3750  0.5136 0.5586 0.5571  0.5893 0.7321    0
glm2 0.4364  0.5202 0.5675 0.5621  0.6054 0.6909    0
rf1  0.4364  0.5656 0.6182 0.6082  0.6516 0.7455    0

Kappa 
        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
glm1 -0.2500 0.02419 0.1159 0.1140  0.1786 0.4643    0
glm2 -0.1284 0.03909 0.1366 0.1238  0.2117 0.3796    0
rf1  -0.1195 0.13530 0.2381 0.2188  0.3061 0.4907    0
