-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.R
206 lines (167 loc) · 7.75 KB
/
model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
optimiseRule <- function(data_input,
minimum_hit_rate = .5,
solver = "lpsolve",
error_weights = c(1, 100),
regularise_threshold = 0.1) {
#' Optimizses thresholded rule
#' @param data_input data.frame. The first column is the outcome variable, the remaining columns are the metrics for which the thresholds are calibrated.
#' @param minimum_hit_rate float. Minimum hit rate that the model is constrained to achieve across the input data.
#' @param solver string. Solver that is used to obtain a solution. Options are 'lpsolve'and 'glpk'.
#' @param error_weights float vector of length 2. It specifies how the model weights the hit rate and true negative rate. Default values are c(1, 100), i.e. the true negative rate has a 100 x higher weight than the hit rate.
#' @param regularise_threshold float. If > 0, higher thresholds are penalized. The higher the value the higher the penalty. The default value is small (0.1) to avoid that smaller threshold are traded for accuracy.
#' @return rule model
data_input_normalised <- normalise(data_input) # normalise all variables to values between 0 and 1.
data_input_normalised <- round(data_input_normalised,5)
x <- as.matrix(data_input_normalised)[, -1, drop = F]
n_objects <- nrow(x)
n_features <-ncol(x)
class_label <- data_input_normalised[, 1] * 2 - 1 # for convenience the class label is recoded from {0,1} to {-1,1}
n_objects_pos <- sum(class_label == 1)
n_objects_neg <-n_objects - n_objects_pos
# list variables in the mixed integer program
thresholds <- rep("C", n_features) # thresholds
Z <- rep("B",n_objects * n_features) # indicating for each object whether test (eg. lr < 0.5) is true
epsilon <- rep("B", n_objects) # vector indicating whether prediction is wrong
metrics <- rep("C", 2) # hit rate, true negative rate (1 - false alarm rate)
all_variables <- c(thresholds, Z, epsilon, metrics)
n_variables <- length(all_variables)# number of all variables
# compute constants required for MIP formulation
delta <- rep(NA, n_features)
for(j in 1:n_features){
a <- unique(sort(x[,j]))
a <- a[a <= 1]
delta[j] <- min(a[-1]-head(a,-1))
}
delta_max <- max(delta)
M <- 1 + delta_max # constant
# initialise constraint matrix (mat), right-hand side of the equations (rhs)
# and the comparison operator (comp)
n_constraints <- n_objects * n_features * 2 + 2 *n_objects + 3
mat <- array(NA, dim = c(n_constraints, n_variables))
rhs <- rep(NA, nrow(mat))
comp <- rep(NA, nrow(mat)) # "G" is >=, "L" is <=, and "E" is ==
# Equations A.5 and A.5:
counter <- 1
for(d in 1:n_features){
for(o in 1:n_objects){
temp <- rep(0, n_variables)
temp[d] <- -1
temp[length(thresholds) + n_objects * (d-1) + o] <- M
mat[counter,] <- temp
rhs[counter] <- -1 * (x[o,d])
comp[counter] <- "G"
counter <- counter + 1
temp <- rep(0, n_variables)
temp[d] <- -1
temp[length(thresholds) + n_objects*(d-1) + o] <- M
mat[counter,] <- temp
rhs[counter] <- M - 1*(x[o,d] + delta[d])
comp[counter] <- "L"
counter <- counter + 1
}
}
# Equations A.7 and A.8:
for(o in 1:n_objects){
temp <- rep(0, n_variables)
temp[length(thresholds) + n_objects * (1:n_features-1) + o] <- class_label[o]
temp[length(thresholds) + length(Z) + o] <- n_features
mat[counter,] <- temp
rhs[counter] <- .5 * class_label[o]
comp[counter] <- "G"
counter <- counter + 1
temp <- rep(0, n_variables)
temp[length(thresholds) + n_objects*(1:n_features - 1) + o] <- class_label[o]
temp[length(thresholds) + length(Z) + o] <- n_features
mat[counter,] <- temp
rhs[counter] <- .5 * class_label[o] + n_features
comp[counter] <- "L"
counter <- counter + 1
}
# Equation A.3
temp <- rep(0, n_variables)
temp[length(thresholds) + length(Z) + length(epsilon) + 1] <- n_objects_pos
temp[length(thresholds) + length(Z) + 1:length(epsilon)] <- (class_label + 1) / 2
mat[counter,] <- temp
rhs[counter] <- n_objects_pos
comp[counter] <- "E"
counter <- counter + 1
# Equation A.4
temp <- rep(0, n_variables)
temp[length(thresholds) + length(Z) + length(epsilon) + 2] <- n_objects_neg
temp[length(thresholds) + length(Z) + 1:length(epsilon)] <- 1- (class_label + 1)/2
mat[counter,] <- temp
rhs[counter] <- n_objects_neg
comp[counter] <- "E"
counter <- counter + 1
# Equation A.2
temp <- rep(0, n_variables)
temp[length(thresholds) + length(Z) + length(epsilon) + 1] <- 1
mat[counter,] <- temp
rhs[counter] <- minimum_hit_rate
comp[counter] <- "G"
# Equaion A.1 (objective function)
temp <- rep(0, n_variables)
temp[(n_variables - 1:0)] <- error_weights
temp[1:length(thresholds)] <- -regularise_threshold
objective_function <- temp
# set lower and upperlimits for variables
lower_bounds <- rep(0, n_variables)
upper_bounds <- rep(1, n_variables)
if(solver == "glpk"){
out <- Rglpk::Rglpk_solve_LP(obj = objective_function,
mat = mat,
dir = plyr::mapvalues(comp, from = c("G", "L", "E"), to = c(">=", "<=", "==")),
rhs = rhs,
# bounds = list(upper = upper_bounds, lower = lower_bounds),
types = all_variables,
max = TRUE,
control = list("verbose" = F))
result <- out$solution
}
if(solver == "lpsolve"){
model <- lpSolveAPI::make.lp(n_constraints, n_variables)
lpSolveAPI::lp.control(model, sense='max')
lpSolveAPI::set.constr.type(model,
types= plyr::mapvalues(comp, from = c("G", "L", "E"), to = c(">=", "<=", "=")))
lpSolveAPI::set.rhs(model, b = rhs)
lpSolveAPI::set.bounds(model, lower = lower_bounds, upper = upper_bounds, columns = 1:n_variables)
variable_types <- ifelse(all_variables =="B", "binary", "real")
for(r in 1:n_variables){
lpSolveAPI::set.column(model, r, mat[,r])
lpSolveAPI::set.type(model, r, type = variable_types[r])
}
lpSolveAPI::set.objfn(model, objective_function)
status <- solve(model)
result <- lpSolveAPI::get.variables(model)
}
# extract thresholds from result
thresholds_learned <- round(result[1:n_features], 5)
index_max <- thresholds_learned > 1
index_min <- thresholds_learned < 0
for(t in 1:n_features){ # smooth thresholds such that they lie between observed values.
if(thresholds_learned[t]>0){
unique_values <- sort(unique(c(round(x[,t],5))))
dif <- unique_values - thresholds_learned[t]
index <- which(dif == min(dif[dif>=0]))
thresholds_learned[t] <- (unique_values[index] + (unique_values[index-1]))/2
}
}
thresholds_learned[index_max] <- 1
thresholds_learned[index_min] <- 0
# transform threshold to original value range from data_input
min_x <- apply(data_input[,-1, drop = F],2,min, na.rm = T)
max_x <- apply(data_input[,-1, drop = F],2,max, na.rm = T)
thresholds_transformed <- thresholds_learned*(max_x-min_x) + min_x
# prepare output of model
output <- list()
counter <- 1
for(d in 1:n_features){
output[[counter]] <- paste(colnames(data_input)[-1][d],
"<",
thresholds_transformed[d]
)
counter <- counter + 1
}
attr(output,"thresholds") <- thresholds_transformed
return(output)
}