## Dummy Vars

In [1]:
source("1-data_sets.R")

library(tibble)

# use unordered levels
tmp <- train.data %>%
    mutate(Income = factor(Income, ordered = FALSE),
           EducationLevel = factor(EducationLevel, ordered = FALSE))

dv <- dummyVars(~ ., data = tmp[, -7])
train.dv <- predict(dv, newdata = tmp[, -7]) %>% as_data_frame()
train.dv$Party <- train.party

tmp <- valid.data %>% mutate(Income = factor(Income, ordered = FALSE), EducationLevel = factor(EducationLevel, ordered = FALSE))

valid.dv <- predict(dv, newdata = tmp) %>% as_data_frame()

rm(tmp)


Attaching package: 'dplyr'

The following objects are masked from 'package:plyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading required package: lattice
Loading required package: ggplot2

Attaching package: 'purrr'

The following object is masked from 'package:caret':

    lift

The following object is masked from 'package:dplyr':

    order_by

The following object is masked from 'package:plyr':

    compact


Attaching package: 'tibble'

The following objects are masked from 'package:dplyr':

    as_data_frame, data_frame, data_frame_, frame_data, glimpse,
    knit_print.trunc_mat, tbl_df, tibble, trunc_mat, type_sum



#### Make a data set without highly correlated variables

In [2]:
hcor <- cor(train.dv[, -226], use = "na.or.complete")
hc <- findCorrelation(hcor)
train.hc <- train.dv[, -hc]
valid.hc <- valid.dv[, -hc]

#### Set seeds

In [3]:
set.seed(123)
seeds <- vector(mode = "list", length = 51)
for(i in 1:50) seeds[[i]] <- sample.int(1000, 22)

seeds[[51]] <- sample.int(1000, 1)

#### Set train control parameters

In [4]:
trCtrl <- trainControl(method = "repeatedcv", repeats = 5, seeds = seeds, classProbs = TRUE, returnResamp = "all")

### Fit Models

In [5]:
library(doParallel)
registerDoParallel()

Loading required package: foreach

Attaching package: 'foreach'

The following objects are masked from 'package:purrr':

    accumulate, when

Loading required package: iterators
Loading required package: parallel


#### GLM

In [6]:
set.seed(1056)
glm1 <- train(Party ~ ., data = train.dv[, -1], method = "glm", trControl = trCtrl,            preProcess = c("nzv", "BoxCox", "knnImpute"))
pred <- predict(model, newdata = valid.dv, na.action = na.pass)
cm <- confusionMatrix(pred, valid.party)

In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading

In [7]:
modelGLM

In gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), : argument 'replacement' has length > 1 and only the first element will be used

Generalized Linear Model 

4455 samples
 224 predictor
   2 classes: 'Democrat', 'Republican' 

Pre-processing: Box-Cox transformation (1), YOB nearest neighbor
 imputation (220), centered (220), scaled (220), remove (4) 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 498, 499, 499, 498, 499, 499, ... 
Resampling results:

  Accuracy   Kappa    
  0.5479286  0.0959749

 

In [8]:
cmGLM

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        360        234
  Republican      230        289
                                          
               Accuracy : 0.5831          
                 95% CI : (0.5535, 0.6123)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 0.0002131       
                                          
                  Kappa : 0.1628          
 Mcnemar's Test P-Value : 0.8892356       
                                          
            Sensitivity : 0.6102          
            Specificity : 0.5526          
         Pos Pred Value : 0.6061          
         Neg Pred Value : 0.5568          
             Prevalence : 0.5301          
         Detection Rate : 0.3235          
   Detection Prevalence : 0.5337          
      Balanced Accuracy : 0.5814          
                                          
       'Positive' Class : Democrat        
                             

In [9]:
modelGLM <- train(Party ~ ., data = train.hc[, -1], method = "glm", trControl = trCtrl,
                  preProcess = c("nzv", "BoxCox", "knnImpute"))
predGLM <- predict(modelGLM, newdata = valid.dv, na.action = na.pass)
cmGLM <- confusionMatrix(predGLM, valid.party)

In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading

In [10]:
modelGLM

In gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), : argument 'replacement' has length > 1 and only the first element will be used

Generalized Linear Model 

4455 samples
 122 predictor
   2 classes: 'Democrat', 'Republican' 

Pre-processing: Box-Cox transformation (1), YOB nearest neighbor
 imputation (118), centered (118), scaled (118), remove (4) 
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 499, 499, 499, 498, 499, 498, ... 
Resampling results:

  Accuracy   Kappa    
  0.5621039  0.1238417

 

In [11]:
cmGLM

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        361        242
  Republican      229        281
                                          
               Accuracy : 0.5768          
                 95% CI : (0.5472, 0.6061)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 0.000967        
                                          
                  Kappa : 0.1494          
 Mcnemar's Test P-Value : 0.580311        
                                          
            Sensitivity : 0.6119          
            Specificity : 0.5373          
         Pos Pred Value : 0.5987          
         Neg Pred Value : 0.5510          
             Prevalence : 0.5301          
         Detection Rate : 0.3243          
   Detection Prevalence : 0.5418          
      Balanced Accuracy : 0.5746          
                                          
       'Positive' Class : Democrat        
                             

#### Random Forest

In [12]:
set.seed(1056)
model <- train(Party ~ ., data = train.dv[, -1], method = "rf", trControl = trCtrl, preProcess = c("nzv", "BoxCox", "knnImpute"))
pred <- predict(model, newdata = valid.dv, na.action = na.pass)
cm <- confusionMatrix(pred, valid.party)

Loading required package: randomForest
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:ggplot2':

    margin

The following object is masked from 'package:dplyr':

    combine



In [14]:
cm

Confusion Matrix and Statistics

            Reference
Prediction   Democrat Republican
  Democrat        285        153
  Republican      305        370
                                          
               Accuracy : 0.5885          
                 95% CI : (0.5589, 0.6176)
    No Information Rate : 0.5301          
    P-Value [Acc > NIR] : 5.088e-05       
                                          
                  Kappa : 0.1874          
 Mcnemar's Test P-Value : 1.717e-12       
                                          
            Sensitivity : 0.4831          
            Specificity : 0.7075          
         Pos Pred Value : 0.6507          
         Neg Pred Value : 0.5481          
             Prevalence : 0.5301          
         Detection Rate : 0.2561          
   Detection Prevalence : 0.3935          
      Balanced Accuracy : 0.5953          
                                          
       'Positive' Class : Democrat        
                             

## Model Comparison

In [15]:
resamp <- resamples(list(GLM = modelGLM, RF = model))

In resamples.default(list(GLM = modelGLM, RF = model)): 'RF' did not have 'returnResamp="final"; the optimal tuning parameters are used

In [16]:
summary(resamp)


Call:
summary.resamples(object = resamp)

Models: GLM, RF 
Number of resamples: 50 

Accuracy 
      Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
GLM 0.4364  0.5202 0.5675 0.5621  0.6054 0.6909    0
RF  0.5000  0.5656 0.6000 0.6047  0.6364 0.7455    0

Kappa 
       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
GLM -0.1284 0.03909 0.1366 0.1238  0.2117 0.3796    0
RF   0.0000 0.13350 0.2039 0.2114  0.2751 0.4921    0
