## Mastering Machine Learning in R

## Chapter 1: Cleaning the Data

In [1]:
install.packages("caret")
install.packages("janitor")
install.packages("readr")
install.packages("sjmisc")
install.packages("skimr")
install.packages("tidyverse")
install.packages("vtreat")

Installing package into 'C:/Users/Watson Turbo/Documents/R/win-library/3.6'
(as 'lib' is unspecified)


package 'caret' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Watson Turbo\AppData\Local\Temp\RtmpO85mHO\downloaded_packages


Installing package into 'C:/Users/Watson Turbo/Documents/R/win-library/3.6'
(as 'lib' is unspecified)


package 'janitor' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Watson Turbo\AppData\Local\Temp\RtmpO85mHO\downloaded_packages


Installing package into 'C:/Users/Watson Turbo/Documents/R/win-library/3.6'
(as 'lib' is unspecified)


package 'readr' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Watson Turbo\AppData\Local\Temp\RtmpO85mHO\downloaded_packages


Installing package into 'C:/Users/Watson Turbo/Documents/R/win-library/3.6'
(as 'lib' is unspecified)


package 'sjmisc' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Watson Turbo\AppData\Local\Temp\RtmpO85mHO\downloaded_packages


Installing package into 'C:/Users/Watson Turbo/Documents/R/win-library/3.6'
(as 'lib' is unspecified)


package 'skimr' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Watson Turbo\AppData\Local\Temp\RtmpO85mHO\downloaded_packages


Installing package into 'C:/Users/Watson Turbo/Documents/R/win-library/3.6'
(as 'lib' is unspecified)


package 'tidyverse' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Watson Turbo\AppData\Local\Temp\RtmpO85mHO\downloaded_packages


Installing package into 'C:/Users/Watson Turbo/Documents/R/win-library/3.6'
(as 'lib' is unspecified)


package 'vtreat' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Watson Turbo\AppData\Local\Temp\RtmpO85mHO\downloaded_packages


In [2]:
library(magrittr)

In [5]:
#Raw URL
gettysburg <- readr::read_csv("https://raw.githubusercontent.com/PacktPublishing/Mastering-Machine-Learning-with-R-Third-Edition/master/Data/gettysburg.csv")

Parsed with column specification:
cols(
  .default = col_double(),
  type = col_character(),
  state = col_character(),
  regiment_or_battery = col_character(),
  brigade = col_character(),
  division = col_character(),
  corps = col_character(),
  army = col_character(),
  july1_Commander = col_character(),
  Cdr_casualty = col_character()
)
See spec(...) for full column specifications.


In [6]:
colnames(gettysburg)

In [7]:
dim(gettysburg)

In [13]:
#create vector of logical values true or false
dupes <- duplicated(gettysburg)
table(dupes)
which(dupes == "True")

dupes
FALSE  TRUE 
  587     3 

In [14]:
#Remove Duplicates
gettysburg <- dplyr::distinct(gettysburg, .keep_all = TRUE)

gettysburg %>%
  dplyr::filter(army == "Confederate" & type == "Infantry") %>%
#Produce a more readable printout than "summary(gettysburg)""
  sjmisc::descr() -> descr_stats
#write to CSV
readr::write_csv(descr_stats, 'descr_stats.csv')

In [15]:
skimr::skim(gettysburg)

-- Data Summary ------------------------
                           Values    
Name                       gettysburg
Number of rows             587       
Number of columns          26        
_______________________              
Column type frequency:               
  character                9         
  numeric                  17        
________________________             
Group variables            None      

-- Variable type: character ----------------------------------------------------
# A tibble: 9 x 8
  skim_variable       n_missing complete_rate   min   max empty n_unique
* <chr>                   <int>         <dbl> <int> <int> <int>    <int>
1 type                        0             1     7     9     0        3
2 state                       0             1     2    14     0       30
3 regiment_or_battery         0             1     3    28     0      275
4 brigade                     0             1     3    14     0      124
5 division                    0          

## Exploring Categorical Variables

In [22]:
dplyr::count(gettysburg, dplyr::n_distinct(type))

dplyr::n_distinct(type),n
3,587


In [24]:
# Explore al categorical features using tidyverse
gettysburg_cat <-
  gettysburg[, sapply(gettysburg, class) == 'character']

In [25]:
#Summarize all features and num of levels
gettysburg_cat %>%
  dplyr::summarise_all(dplyr::funs(dplyr::n_distinct(.)))

"funs() is soft deprecated as of dplyr 0.8.0
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

type,state,regiment_or_battery,brigade,division,corps,army,july1_Commander,Cdr_casualty
3,30,275,124,38,14,2,586,6


In [26]:
#Commander Casualities
gettysburg_cat %>% 
  dplyr::group_by(Cdr_casualty) %>%
  dplyr::summarize(num_rows = n())

"Calling `n()` without importing or prefixing it is deprecated, use `dplyr::n()`.

Cdr_casualty,num_rows
captured,6
killed,29
mortally wounded,24
no,405
wounded,104
wounded-captured,19


In [27]:
#Compare Commander casualties by army
gettysburg_cat %>%
  janitor::tabyl(army, Cdr_casualty)

army,captured,killed,mortally wounded,no,wounded,wounded-captured
Confederate,2,15,13,165,44,17
Union,4,14,11,240,60,2


In [38]:
#Missing values
na_count <-
  sapply(gettysburg, function(y)
    sum(length(which(is.na(
      y
    )))))

na_df <- data.frame(na_count)

print(na_df)            

                    na_count
type                       0
state                      0
regiment_or_battery        0
brigade                    0
division                   0
corps                      0
army                       0
july1_Commander            0
Cdr_casualty               0
men                        0
killed                     6
wounded                    6
captured                   7
missing                   17
total_casualties           8
3inch_rifles               0
4.5inch_rifles             0
10lb_parrots               0
12lb_howitzers             0
12lb_napoleons             0
6lb_howitzers              0
24lb_howitzers             0
20lb_parrots               0
12lb_whitworths            0
14lb_rifles                0
total_guns                 0


<b>  17 Missing Observations

In [39]:
#Code dummy feature and change N/A to 0's
gettysburg$missing_isNA <- 
  ifelse(is.na(gettysburg$missing), 1, 0)

gettysburg$missing[is.na(gettysburg$missing)] <- 0

In [41]:
#Low or no variance
feature_variance <- caret::nearZeroVar(gettysburg, saveMetrics = TRUE)
head(feature_variance)

Unnamed: 0,freqRatio,percentUnique,zeroVar,nzv
type,3.186047,0.5110733,False,False
state,1.094118,5.1107325,False,False
regiment_or_battery,1.105263,46.8483816,False,False
brigade,1.111111,21.1243612,False,False
division,1.423077,6.4735945,False,False
corps,1.08,2.3850085,False,False


<b>freqRatio: ratio of the % frequency for the most common value over the second most common value.

percentUnique: number of unique values divided by the total # of samples multiplied by 100.

In [42]:
which(feature_variance$zeroVar == 'TRUE')

row.names(feature_variance[17, ])

gettysburg_fltrd <- gettysburg[, feature_variance$zeroVar == 'FALSE']

In [45]:
my_treatment <- vtreat::designTreatmentsZ(
  dframe = gettysburg_fltrd,
  varlist = colnames(gettysburg_fltrd),
  minFraction = 0.05
)

[1] "vtreat 1.5.2 inspecting inputs Wed Feb 19 17:31:34 2020"
[1] "designing treatments Wed Feb 19 17:31:34 2020"
[1] " have initial level statistics Wed Feb 19 17:31:34 2020"
[1] " scoring treatments Wed Feb 19 17:31:34 2020"
[1] "have treatment plan Wed Feb 19 17:31:34 2020"


In [48]:
gettysburg_treated <- vtreat::prepare(my_treatment, gettysburg_fltrd)

dim(gettysburg_treated)

colnames(gettysburg_treated)

table(gettysburg_treated$type_catP)

gettysburg_treated <- 
  gettysburg_treated %>%
  dplyr::select(-dplyr::contains('_catP'))


0.0800681431005111  0.219761499148211  0.700170357751278 
                47                129                411 

In [50]:
#Change column names
colnames(gettysburg_treated) <-
  sub('_clean', "", colnames(gettysburg_treated))

colnames(gettysburg_treated) <-
  sub('_isBAD', "_isNA", colnames(gettysburg_treated))

In [54]:
df_corr <- cor(gettysburg_treated, method = "spearman")

high_corr <- caret::findCorrelation(df_corr, cutoff = 0.9)

high_corr
colnames(gettysburg_treated)[c(9, 4, 22, 43, 3, 5)]

In [55]:
gettysburg_noHighCorr <- gettysburg_treated[, -high_corr]

df_corr <- data.frame(df_corr)

df_corr$feature1 <- row.names(df_corr)

gettysburg_corr <-
  tidyr::gather(data = df_corr,
                key = "feature2",
                value = "correlation",
                -feature1)

gettysburg_corr <- 
  gettysburg_corr %>%
  dplyr::filter(feature1 != feature2)

In [57]:
print(gettysburg_corr)

                        feature1                     feature2   correlation
1             type_lev_x_Cavalry         type_lev_x_Artillery -0.1565720238
2            type_lev_x_Infantry         type_lev_x_Artillery -0.8110109037
3            state_lev_x_Georgia         type_lev_x_Artillery -0.0994430577
4           state_lev_x_New_York         type_lev_x_Artillery -0.0430194482
5     state_lev_x_North_Carolina         type_lev_x_Artillery -0.0755040931
6       state_lev_x_Pennsylvania         type_lev_x_Artillery -0.1556542772
7                 state_lev_x_US         type_lev_x_Artillery  0.2347837061
8           state_lev_x_Virginia         type_lev_x_Artillery  0.2091387711
9   division_lev_x_Artillery_Bde         type_lev_x_Artillery  0.4887169493
10             corps_lev_x_Ewell         type_lev_x_Artillery  0.0262349886
11           corps_lev_x_Hancock         type_lev_x_Artillery -0.0882522101
12              corps_lev_x_Hill         type_lev_x_Artillery -0.0506973508
13          

In [56]:
# linear combination

linear_combos <- caret::findLinearCombos(gettysburg_noHighCorr)

linear_combos

colnames(gettysburg_noHighCorr)[c(16, 7, 8, 9, 10, 11, 12, 13, 14, 15)]

#remove linear combos
linear_remove <- colnames(gettysburg_noHighCorr[16])

df <- gettysburg_noHighCorr[, !(colnames(gettysburg_noHighCorr) %in% linear_remove)]

dim(df)