-
Notifications
You must be signed in to change notification settings - Fork 2
/
benchmarks.Rmd
200 lines (164 loc) · 5.55 KB
/
benchmarks.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
---
title: "Benchmarks"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{benchmarks}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
# Set the default knitr chunk options used throughout this vignette.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)
```
## Introduction
In this short vignette, I show off benchmarks of the zoomerjoin package,
comparing it to the excellent [fuzzyjoin](https://github.com/dgrtwo/fuzzyjoin)
package. The two packages are designed to do different things — the fuzzyjoin
package is *very fast*, and provides more distance functions (as well as other
joining modes) — but it's a useful comparison, as it shows the time that can
be saved by using LSH instead of all pairwise comparisons, as long as you are
okay with using Jaccard similarity.
In the future, I am hoping to expand the package to implement [this LSH method
for the edit
distance](https://academic.oup.com/bioinformatics/article/35/14/i127/5529166),
and will add it to the benchmarks if and when this feature is completed.
```{r setup, include=F}
library(tidyverse)
```
## Benchmarks
Here, I show the time it takes fuzzyjoin and zoomerjoin to fuzzily join two
datasets as the size of each dataset increases. Fuzzyjoin is initially quick,
but its runtime scales with the square of the input size. Zoomerjoin is slower
for small datasets but is less memory-intensive, and its runtime scales with
the total number of rows across the two datasets, so it becomes quicker for
larger datasets.
```{r, echo=F}
# Read the pre-computed benchmark results and plot time and memory usage for
# each package, faceted by join type and measurement.
# Fixes: removed the trailing comma left inside mutate() (fragile across
# dplyr versions) and made the facet labels consistent ("...Join" for both
# join types; the original mixed "Join" and "Joins").
sim_data <- read_csv("sim_data.csv")
sim_data %>%
  mutate(
    # Human-readable labels for the measurement and join-type facets.
    name = ifelse(name == "time", "Time Usage (s)", "Memory Usage (MB)"),
    join_type = ifelse(join_type == "Jaccard Distance",
      "Jaccard Distance Join",
      "Euclidean Distance Join"
    )
  ) %>%
  ggplot(aes(x = as.numeric(n), y = value, col = package, linetype = package)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ join_type + name, scales = "free") +
  scale_y_continuous("Time (s) / memory (MB)")
```
# Benchmarking Code:
Below, I include the code used to generate the benchmarks:
```{r, echo=T, eval=F}
library(zoomerjoin)
library(fuzzyjoin)
library(tidyverse)
library(microbenchmark)
library(profmem)
# Draw one million rows from the DIME dataset for the Jaccard-join benchmarks;
# each copy gets its own id column plus the shared "name" key.
data_1 <- setNames(as.data.frame(sample_n(dime_data, 10^6)), c("id_1", "name"))
data_2 <- setNames(as.data.frame(sample_n(dime_data, 10^6)), c("id_2", "name"))
# Simulate two nearly identical numeric datasets for the Euclidean-join
# benchmarks: the second is the first shifted by a tiny constant so every row
# has a close match.
n_rows <- 10^5
n_cols <- 50
sim_matrix <- matrix(rnorm(n_rows * n_cols), n_rows, n_cols)
X_1 <- as.data.frame(sim_matrix)
X_2 <- as.data.frame(sim_matrix + 1e-9)
# Benchmark fuzzyjoin's Jaccard string-distance join on the first n rows of
# each dataset. Returns a named vector: median runtime (ns, over 10 runs) and
# total memory allocated (bytes) for one run.
fuzzy_jaccard_bench <- function(n) {
  # Single definition of the join under test, shared by both measurements.
  do_join <- function() {
    stringdist_inner_join(
      data_1[1:n, ], data_2[1:n, ],
      method = "jaccard", max_dist = .6, q = 4
    )
  }
  runtimes <- microbenchmark(do_join(), times = 10)$time
  allocated <- total(profmem(do_join()))
  c(time = median(runtimes), memory = allocated)
}
# Benchmark zoomerjoin's LSH-based Jaccard join on the first n rows of each
# dataset. Returns a named vector: median runtime (ns, over 50 runs) and
# total memory allocated (bytes) for one run.
zoomer_jaccard_bench <- function(n) {
  # Single definition of the join under test, shared by both measurements.
  do_join <- function() {
    jaccard_inner_join(
      data_1[1:n, ], data_2[1:n, ],
      by = "name", band_width = 11, n_bands = 350,
      threshold = .7, n_gram_width = 4
    )
  }
  runtimes <- microbenchmark(do_join(), times = 50)$time
  allocated <- total(profmem(do_join()))
  c(time = median(runtimes), memory = allocated)
}
# Benchmark fuzzyjoin's Euclidean distance join on the first n rows of each
# dataset. Returns a named vector: median runtime (ns, over 10 runs) and
# total memory allocated (bytes) for one run.
fuzzy_euclid_bench <- function(n) {
  # Single definition of the join under test, shared by both measurements.
  do_join <- function() {
    distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean")
  }
  runtimes <- microbenchmark(do_join(), times = 10)$time
  allocated <- total(profmem(do_join()))
  c(time = median(runtimes), memory = allocated)
}
# Benchmark zoomerjoin's LSH-based Euclidean join on the first n rows of each
# dataset. Returns a named vector: median runtime (ns, over 50 runs) and
# total memory allocated (bytes) for one run.
zoomer_euclid_bench <- function(n) {
  # Single definition of the join under test, shared by both measurements.
  do_join <- function() {
    euclidean_inner_join(
      X_1[1:n, ], X_2[1:n, ],
      threshold = .1, n_bands = 90, band_width = 2, r = .1
    )
  }
  runtimes <- microbenchmark(do_join(), times = 50)$time
  allocated <- total(profmem(do_join()))
  c(time = median(runtimes), memory = allocated)
}
# Run the Jaccard benchmarks over a grid of dataset sizes and stack the
# results from both packages into one data frame, tagged by package and
# join type.
n <- seq(500, 4000, 250)
names(n) <- n
jaccard_benches <- bind_rows(
  map_df(n, fuzzy_jaccard_bench, .id = "n") %>% mutate(package = "fuzzyjoin"),
  map_df(n, zoomer_jaccard_bench, .id = "n") %>% mutate(package = "zoomerjoin")
) %>%
  mutate(join_type = "Jaccard Distance")
# Run the Euclidean benchmarks over a grid of dataset sizes and stack the
# results from both packages into one data frame, tagged by package and
# join type.
n <- seq(250, 4000, 250)
names(n) <- n
euclid_benches <- bind_rows(
  map_df(n, fuzzy_euclid_bench, .id = "n") %>% mutate(package = "fuzzyjoin"),
  map_df(n, zoomer_euclid_bench, .id = "n") %>% mutate(package = "zoomerjoin")
) %>%
  mutate(join_type = "Euclidean Distance")
# Combine both benchmark grids, pivot to long format for plotting, and rescale
# the raw units (microbenchmark reports nanoseconds; profmem reports bytes).
sim_data <- bind_rows(euclid_benches, jaccard_benches) %>%
pivot_longer(c(time, memory)) %>%
mutate(value = ifelse(name == "time", value / 10^9, value / 10^6)) # convert ns to s and bytes to MB (10^6 bytes, matching the plot labels)
write_csv(sim_data, "sim_data.csv")
```