# Dimensionality Reduction Series
# Part I: Feature Selection Workshop Code
## Data Prep ----
## Set Seed
set.seed(31415)
## Load Libraries
# install.packages(c("tidyverse", "caret", "haven", "leaps", "glmnet", "corrplot"))
library(tidyverse)  # loads ggplot2, dplyr, readr, etc.
library(caret)
library(haven)
library(leaps)
library(glmnet)
library(corrplot)
## Load & Examine Dataset
# Download from github: https://github.com/ccss-rs
getwd()
list.files()
data <- read_csv("labor_market_discrimination.csv")
summary(data)
#attributes(data)
## Filter Method ----
colnames(data)
numeric_vars <- data %>%
  dplyr::select(call, n_jobs, years_exp, frac_black, frac_white,
                l_med_hh_inc, frac_dropout, frac_colp, l_inc, parent_sales,
                parent_emp, branch_sales, branch_emp, frac_black_emp_zip,
                frac_white_emp_zip, l_med_hh_inc_emp_zip, frac_dropout_emp_zip,
                frac_colp_emp_zip, l_inc_emp_zip) %>%
  drop_na() %>%
  as.data.frame()
# Compute correlation matrix
cormatrix <- stats::cor(numeric_vars)
View(cormatrix)
corrplot(cormatrix, method = "shade", type="full", order = "AOE", insig = "blank")
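# A minimal sketch of acting on the correlation matrix as a filter: caret's
# findCorrelation() suggests predictors to drop so that no remaining pair
# exceeds the cutoff (the 0.9 cutoff here is an arbitrary illustrative choice)
high_cor <- caret::findCorrelation(cormatrix, cutoff = 0.9, names = TRUE)
high_cor  # candidate columns to remove before modeling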
## Can also use caret::filterVarImp() to measure and rank feature importance
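# A minimal sketch of that caret alternative: filterVarImp() scores each
# predictor against the outcome one at a time (for a numeric outcome, caret
# fits a univariate model per predictor and reports an "Overall" score)
imp <- caret::filterVarImp(x = select(numeric_vars, -call), y = numeric_vars$call)
imp[order(-imp$Overall), , drop = FALSE]  # rank features, most to least important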
## Wrapper Methods ----
#### Forward Stepwise Selection ----
#### An asterisk ("*") indicates that a given variable is included in the corresponding model.
fit_fwd <- regsubsets(call ~ ., data = numeric_vars, nvmax = 18, method = "forward")  # 18 predictors once call is the outcome
fit_fwd_sum <- summary(fit_fwd)
fit_fwd_sum
#### Backward Stepwise Selection ----
## Can also use Recursive Feature Elimination via rfe() from the caret package; see the sketch after this block
fit_bwd <- regsubsets(call ~ ., data = numeric_vars, nvmax = 18, method = "backward")
fit_bwd_sum <- summary(fit_bwd)
fit_bwd_sum
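# A minimal sketch of the RFE alternative mentioned above, assuming caret's
# built-in lmFuncs wrapper and 5-fold CV (both are illustrative choices)
rfe_ctrl <- rfeControl(functions = lmFuncs, method = "cv", number = 5)
rfe_fit <- rfe(x = select(numeric_vars, -call), y = numeric_vars$call,
               sizes = 1:18, rfeControl = rfe_ctrl)
rfe_fit  # reports the best subset size and the variables it retains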
#### Exhaustive Selection / Best subset selection ----
#### The regsubsets() function (from the leaps package) performs best subset
#### selection by identifying the best model for each number of predictors,
#### where "best" is quantified using RSS.
fit_ex <- regsubsets(call ~ ., data = numeric_vars, nvmax = 18, method = "exhaustive")
fit_ex_sum <- summary(fit_ex)
fit_ex_sum
#### Sequential Replacement Selection ----
fit_seq <- regsubsets(call ~ ., data = numeric_vars, nvmax = 18, method = "seqrep")
fit_seq_sum <- summary(fit_seq)
fit_seq_sum
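# The four methods can disagree; as a quick check, compare the variables each
# one keeps in its 5-variable model (5 is an arbitrary illustrative size)
coef(fit_fwd, 5)
coef(fit_bwd, 5)
coef(fit_ex, 5)
coef(fit_seq, 5)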
### Evaluating Feature Selection Performance ----
# names() lists the fit statistics computed for each model size
names(fit_ex_sum)
fit_ex_sum$bic
fit_ex_sum$rss
fit_ex_sum$adjr2
fit_ex_sum$cp
par(mfrow = c(2, 2))
# Note: RSS always decreases as variables are added, so its minimum is always
# the full model; Cp, BIC, and adjusted R^2 penalize model size and are
# better guides for selection
plot(fit_ex_sum$rss, xlab = "Number of Variables", ylab = "RSS", type = "b")
best_rss <- which.min(fit_ex_sum$rss)
points(best_rss, fit_ex_sum$rss[best_rss], col = "red", cex = 2, pch = 20)
plot(fit_ex_sum$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "b")
best_adj_r2 <- which.max(fit_ex_sum$adjr2)
points(best_adj_r2, fit_ex_sum$adjr2[best_adj_r2], col = "red", cex = 2, pch = 20)
plot(fit_ex_sum$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
best_cp <- which.min(fit_ex_sum$cp)
points(best_cp, fit_ex_sum$cp[best_cp], col = "red", cex = 2, pch = 20)
plot(fit_ex_sum$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
best_bic <- which.min(fit_ex_sum$bic)
points(best_bic, fit_ex_sum$bic[best_bic], col = "red", cex = 2, pch = 20)
## Embedded Methods ----
#### LASSO ----
# alpha: 1 = LASSO, 0 = ridge regression (glmnet's default is 1)
# nlambda: how many lambda values to try; glmnet picks the specific values automatically
predictors <- as.matrix(select(numeric_vars, -call))  # matrix of predictors (x)
lasso_fit <- glmnet(x = predictors, y = numeric_vars$call, family = "gaussian", alpha = 1)
par(mfrow = c(1, 1))
plot(lasso_fit)
# Get the specific values which glmnet chose for lambda
lasso_fit$lambda
# specifying "lambda" tells R to put log(lambda) on the horizontal axis
plot(lasso_fit,"lambda")
# gets coefs based on specific lambda's
# REVIEW OUTPUT FOR ACCURACY
coef(lasso_fit,s=.05)
coef(lasso_fit,s=.001)
# Use cross-validation to choose an appropriate lambda
# nfolds: how many folds to use for k-fold cross-validation
cv_lasso_fit <- cv.glmnet(x = predictors, y = numeric_vars$call, alpha = 1, nlambda = 20, nfolds = 10)
cv_lasso_fit
# The two vertical lines mark lambda.min (lowest CV error) and lambda.1se
# (largest lambda whose CV error is within 1 SE of the lowest)
plot(cv_lasso_fit)
cv_lasso_fit$lambda.min
# the largest value of lambda which has a CV error within 1SE of the lowest CV error
cv_lasso_fit$lambda.1se
# cv_lasso_fit$lambda.min gives the lambda value with the lowest CV error
coef(cv_lasso_fit, s = cv_lasso_fit$lambda.min)
coef(cv_lasso_fit, s = cv_lasso_fit$lambda.1se)
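# A small helper sketch: list the features LASSO keeps at lambda.1se
# (non-zero coefficients are the selected features; the intercept is never
# penalized, so it will usually appear as well)
sel <- as.matrix(coef(cv_lasso_fit, s = cv_lasso_fit$lambda.1se))
rownames(sel)[sel != 0]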
# The predict function takes the fitted model 'cv_lasso_fit'
# The 's' argument specifies which value of lambda to use: either the lambda
# with the smallest CV error, or the largest lambda within 1 SE of that error
# 'newx' is the covariate matrix to predict for; here we reuse the training
# predictors, since this script never set aside a separate test set
lasso_predicted_min <- predict(cv_lasso_fit, s = cv_lasso_fit$lambda.min, newx = predictors, type = "response")
lasso_predicted_min
lasso_predicted_se <- predict(cv_lasso_fit, s = cv_lasso_fit$lambda.1se, newx = predictors, type = "response")
lasso_predicted_se
# Mean squared prediction error; since we predicted on the same data we fit
# on, this is an in-sample (training) error rather than a true test error
lasso_mse <- mean((numeric_vars$call - lasso_predicted_min)^2)
lasso_mse
# To do the same process with ridge regression instead of LASSO, use alpha = 0 in the glmnet call
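# A minimal ridge sketch following the note above; alpha = 0 shrinks
# coefficients toward zero but never sets them exactly to zero, so ridge
# does not perform feature selection on its own
cv_ridge_fit <- cv.glmnet(x = predictors, y = numeric_vars$call, alpha = 0, nfolds = 10)
coef(cv_ridge_fit, s = cv_ridge_fit$lambda.min)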