-
Notifications
You must be signed in to change notification settings - Fork 0
/
outliers.R
226 lines (212 loc) · 8.45 KB
/
outliers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
## Author: Alex Whitworth
## Date: December 2012
#' @title Identify Data Outliers
#' @description
#' For each selected column of a data frame, computes a lower and an upper
#' outlier bound and reports whether any observation lies outside them. The
#' bounds are either the boxplot whisker ends (IQR rule, via
#' \code{boxplot.stats}) or a pair of quantiles (via \code{quantile}).
#' This function only identifies outliers; it does not modify \code{df}.
#' @param df An input data frame.
#' @param cols A numeric vector of column indices, each between 1 and
#'   \code{length(df)}.
#' @param method A list of length two: either \code{list("boxplot", coef = X)}
#'   with \code{X >= 0}, or \code{list("quantile", q = c(min, max))}. For the
#'   quantile method the probabilities are taken from the element named
#'   \code{q} if present, otherwise from the second element. Use \code{list()},
#'   not \code{c()}: \code{c()} would coerce the numbers to character.
#' @return A numeric matrix with one row per entry of \code{cols} (row names
#'   are the column names) and the columns \code{"min bound"},
#'   \code{"max bound"}, \code{"min outliers?"} and \code{"max outliers?"}.
#'   The two flag columns hold 1 when the column minimum is below the lower
#'   bound / the column maximum is above the upper bound, and 0 otherwise.
find.outliers <- function(df, cols, method= list(...)) {
  if (min(cols) < 1 || max(cols) > length(df)) {
    stop("you have entered invalid columns. Please correct your inputs.")
  }

  # The default `list(...)` cannot be evaluated inside this function, so an
  # omitted `method` is reported as an invalid method rather than as an
  # obscure error about `...`.
  method_ok <- !missing(method) && is.list(method) && length(method) >= 1L &&
    is.character(method[[1]]) && length(method[[1]]) == 1L &&
    method[[1]] %in% c("boxplot", "quantile")
  if (!method_ok) {
    stop("You have entered an invalid method for identifying outliers.\n",
         "Please correct your call.")
  }

  if (method[[1]] == "boxplot") {
    whisker_coef <- if (length(method) >= 2L) method[[2]] else NULL
    if (!is.numeric(whisker_coef) || length(whisker_coef) != 1L ||
        is.na(whisker_coef) || whisker_coef < 0) {
      stop("You have entered an invalid coefficient, Please correct your call.")
    }
    # Elements 1 and 5 of `$stats` are the lower and upper whisker ends.
    bounds_of <- function(x) {
      boxplot.stats(x, coef = whisker_coef)$stats[c(1, 5)]
    }
  } else {
    probs <- method$q
    if (is.null(probs) && length(method) >= 2L) {
      probs <- method[[2]]
    }
    if (!is.numeric(probs) || length(probs) != 2L || anyNA(probs) ||
        any(probs < 0 | probs > 1)) {
      stop("You have entered invalid quantiles, Please correct your call.")
    }
    bounds_of <- function(x) unname(quantile(x, probs = probs))
  }

  iqr <- matrix(
    NA_real_,
    nrow = length(cols), ncol = 4,
    dimnames = list(
      names(df[cols]),
      c("min bound", "max bound", "min outliers?", "max outliers?")
    )
  )

  for (k in seq_along(cols)) {
    x <- df[[cols[k]]]
    # min()/max() would return NA here and make the flags undefined, so fail
    # early with a message that names the offending column.
    if (length(x) == 0L || anyNA(x)) {
      stop("column ", cols[k],
           " is empty or contains missing values; cannot identify outliers.")
    }
    # Compute the bounds once per column and reuse them for both flags.
    bnd <- bounds_of(x)
    iqr[k, ] <- c(bnd, min(x) < bnd[1], max(x) > bnd[2])
  }

  iqr
}
#' @title Remove or Cap Data Outliers
#' @description
#' Identifies outliers in the specified columns of a data frame with
#' \code{find.outliers} (IQR / boxplot rule or quantiles) and then either
#' removes the rows holding them or caps the values at the outlier bounds.
#' Removing and capping cannot be combined in one call.
#' @param df An input data frame.
#' @param cols A numeric vector of column indices, each between 1 and
#'   \code{length(df)}.
#' @param method A list of length two, passed on to \code{find.outliers}:
#'   either \code{list("boxplot", coef = X)} or
#'   \code{list("quantile", q = c(min, max))}.
#' @param rm Logical vector of 2 (min, max). Remove rows whose value is below
#'   the lower bound / above the upper bound.
#' @param cap Logical vector of 2 (min, max). Replace values below the lower
#'   bound / above the upper bound with that bound.
#' @param sure Logical. Confirm you want to remove or cap outliers.
#' @return If \code{sure} is not TRUE, an error is raised (reporting how many
#'   observations would be deleted, when any would be). If \code{sure} is
#'   TRUE, a data frame with outliers either removed or capped; the number of
#'   deleted observations, if any, is reported with \code{message()}.
remove.outliers <- function(df, cols, method= list(...),
                            rm= c(min= FALSE, max= FALSE),
                            cap= c(min= FALSE, max= FALSE), sure= FALSE) {
  if (min(cols) < 1 || max(cols) > length(df)) {
    stop("you have entered invalid columns. Please correct your inputs.")
  }

  # Flags are positional: element 1 is the lower side, element 2 the upper.
  rm_min <- isTRUE(as.logical(rm[[1]]))
  rm_max <- isTRUE(as.logical(rm[[2]]))
  cap_min <- isTRUE(as.logical(cap[[1]]))
  cap_max <- isTRUE(as.logical(cap[[2]]))
  removing <- rm_min || rm_max
  capping <- cap_min || cap_max

  if (!removing && !capping) {
    stop("You have not chosen to remove or cap any outliers. ",
         "You're data frame will not change.")
  }
  if (removing && capping) {
    stop("Please only remove or cap outliers. You have selected both. ",
         "Use two steps if needed")
  }

  # Bounds and flags are computed once, on the original data.
  mat <- find.outliers(df, cols, method = method)
  df2 <- df

  for (k in seq_along(cols)) {
    j <- cols[k]
    lower <- mat[k, 1]
    upper <- mat[k, 2]
    has_low <- isTRUE(mat[k, 3] == 1)
    has_high <- isTRUE(mat[k, 4] == 1)
    x <- df2[[j]]

    if (removing) {
      keep <- rep(TRUE, length(x))
      if (rm_min && has_low) {
        keep <- keep & x >= lower
      }
      if (rm_max && has_high) {
        keep <- keep & x <= upper
      }
      # drop = FALSE keeps a one-column data frame from collapsing to a
      # vector, which would break the next iteration and nrow() below.
      df2 <- df2[keep, , drop = FALSE]
    } else {
      if (cap_min && has_low) {
        x[x < lower] <- lower
      }
      if (cap_max && has_high) {
        x[x > upper] <- upper
      }
      df2[[j]] <- x
    }
  }

  n_deleted <- nrow(df) - nrow(df2)
  if (!isTRUE(as.logical(sure))) {
    if (n_deleted > 0) {
      stop("You are about to delete ", n_deleted,
           " observations. Are you sure?")
    }
    stop("You have indicated your are not sure you wish to proceed.")
  }
  if (n_deleted > 0) {
    message("You have deleted ", n_deleted, " observations.")
  }
  df2
}
## needed functions for vectorized version
box.stat <- function(vec, coef2) {
  # Lower and upper whisker ends: elements 1 and 5 of boxplot.stats()$stats.
  whiskers <- boxplot.stats(vec, coef = coef2)[["stats"]][c(1, 5)]
  # Result: the two bounds followed by the two outlier flags, which c()
  # coerces to numeric (1 = the extreme lies outside the whisker).
  c(whiskers, min(vec) < whiskers[1], max(vec) > whiskers[2])
}
quant.stat <- function(vec, probs2= c(.025, .975)) {
  # Lower and upper bounds are the two requested quantiles of `vec`.
  limits <- quantile(vec, probs = probs2)
  # Result: the two bounds followed by the two outlier flags, which c()
  # coerces to numeric (1 = the extreme lies outside the bound).
  c(limits, min(vec) < limits[1], max(vec) > limits[2])
}
#' @title Identify Data Outliers - vectorized
#' @description
#' For each selected column of a data frame, computes a lower and an upper
#' outlier bound and reports whether any observation lies outside them, using
#' either the boxplot (IQR) rule or a pair of quantiles. This version applies
#' the helpers \code{box.stat} / \code{quant.stat} over the columns instead of
#' looping explicitly. However, no improvement in run time shown in tests.
#' @param df An input data frame.
#' @param cols A numeric vector of column indices, each between 1 and
#'   \code{length(df)}.
#' @param method A list of length two: either \code{list("boxplot", coef = X)}
#'   with \code{X >= 0}, or \code{list("quantile", q = c(min, max))}. The
#'   second element is used positionally. Use \code{list()}, not \code{c()}:
#'   \code{c()} would coerce the numbers to character.
#' @return A numeric matrix with one row per entry of \code{cols} (row names
#'   are the column names) and the columns \code{"min bound"},
#'   \code{"max bound"}, \code{"min outliers?"} and \code{"max outliers?"}.
find.outliers2 <- function(df, cols, method= list(...)) {
  if (min(cols) < 1 || max(cols) > length(df)) {
    stop("you have entered invalid columns. Please correct your inputs.")
  }

  # The default `list(...)` cannot be evaluated inside this function, so an
  # omitted `method` is reported as an invalid method rather than as an
  # obscure error about `...`.
  method_ok <- !missing(method) && is.list(method) && length(method) >= 1L &&
    is.character(method[[1]]) && length(method[[1]]) == 1L &&
    method[[1]] %in% c("boxplot", "quantile")
  if (!method_ok) {
    stop("You have entered an invalid method for identifying outliers.\n",
         "Please correct your call.")
  }

  setting <- if (length(method) >= 2L) method[[2]] else NULL
  if (method[[1]] == "boxplot") {
    if (!is.numeric(setting) || length(setting) != 1L ||
        is.na(setting) || setting < 0) {
      stop("You have entered an invalid coefficient, Please correct your call.")
    }
    column_stat <- function(x) box.stat(x, coef2 = setting)
  } else {
    if (!is.numeric(setting) || length(setting) != 2L || anyNA(setting)) {
      stop("You have entered invalid quantiles, Please correct your call.")
    }
    # The quantile probabilities go to quant.stat(), not to t().
    column_stat <- function(x) quant.stat(x, probs2 = setting)
  }

  # Iterate over the columns as a list: unlike apply() on df[, cols], this
  # also works for a single column and does not coerce the frame to a matrix.
  # vapply() yields a 4 x length(cols) matrix, hence the transpose.
  iqr <- t(vapply(df[cols], column_stat, numeric(4), USE.NAMES = FALSE))

  dimnames(iqr) <- list(
    names(df)[cols],
    c("min bound", "max bound", "min outliers?", "max outliers?")
  )
  iqr
}