-
Notifications
You must be signed in to change notification settings - Fork 3.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-16894: [C++] Add Benchmarks for Asof Join Node #13426
Changes from 82 commits
23b8c71
f4b2106
7ab446d
138daee
94a8453
6466b80
4c33452
c6c6093
643e368
0781a16
fc75844
26bc862
5a6afbd
4f7cac7
8773317
22c9941
6b27e6b
775be1d
2dc5691
a9dd980
0f39fce
761e5de
0387e5c
15ba43d
b92a303
f0edd17
58f229d
15e2783
7aa252a
9c332eb
cae5592
8f32500
eead16e
d0aec2f
f783818
1a79d10
83398c8
9f3d5c9
e61b9c1
1b4f26b
7287e84
554f7c3
a221715
0c72a94
d4cb13e
121b3bd
66b6b98
776ff18
b4cd1e5
6154eae
599f408
2009b8a
a33eba0
780e964
4020830
5d73258
4df409a
fd87a66
4a8d67c
16e86bf
b3228f2
aaa2b04
798d2b3
c414e8a
b8fb090
c9a608c
47aeb70
1752a87
a7fc6c6
0a74f7e
89f25cc
6f9bb79
c69336a
9adf631
8a2d0a8
ca37af5
3622d5e
cc17f47
5c6a32c
d9f7c14
ac5bcd1
7512796
ff84700
0111994
df1a566
7816ade
fd33711
cdbf5f2
3658a3a
3a212e5
bb3565a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include <string> | ||
|
||
#include "benchmark/benchmark.h" | ||
|
||
#include "arrow/compute/exec/test_util.h" | ||
#include "arrow/dataset/file_parquet.h" | ||
#include "arrow/table.h" | ||
#include "arrow/testing/future_util.h" | ||
|
||
namespace arrow { | ||
namespace compute { | ||
|
||
// Column names handed to AsofJoinNodeOptions by AsOfJoinOverhead. Declared
// constexpr so that the pointers themselves, not only the characters they point
// to, are immutable.
static constexpr const char* kTimeCol = "time";
static constexpr const char* kKeyCol = "id";
// Default `start` / `end` values placed in the TableGenerationProperties of every
// benchmarked table.
static constexpr int kDefaultStart = 0;
static constexpr int kDefaultEnd = 500;
// Default bounds for the randomly generated value columns.
static constexpr int kDefaultMinColumnVal = -10000;
static constexpr int kDefaultMaxColumnVal = 10000;
|
||
struct TableStats { | ||
std::shared_ptr<Table> table; | ||
size_t total_rows; | ||
size_t total_bytes; | ||
}; | ||
|
||
/// Generates a random time-series table and computes its row / byte totals.
///
/// The byte total is a conceptual size: every row is counted as one int64, one
/// int32 and one double for each remaining field of the schema.
///
/// \param properties description of the table to generate
/// \return the table together with its row count and estimated byte size
static TableStats MakeTable(const TableGenerationProperties& properties) {
  std::shared_ptr<Table> generated = MakeRandomTimeSeriesTable(properties);

  // Every field other than the first two is accounted for as a double.
  const size_t value_column_count = generated->schema()->num_fields() - 2;
  const size_t bytes_per_row =
      sizeof(double) * value_column_count + sizeof(int64_t) + sizeof(int32_t);

  const size_t row_count = generated->num_rows();
  return TableStats{generated, row_count, row_count * bytes_per_row};
}
|
||
/// Adds a "table_source" node to `plan` that emits `table` using `batch_size`.
///
/// \param plan the plan that receives the new node
/// \param table the table the node reads from
/// \param batch_size forwarded to TableSourceNodeOptions
/// \return the node created by MakeExecNode; the Result is dereferenced directly,
///         so no Status is propagated to the caller
static ExecNode* MakeTableSourceNode(std::shared_ptr<arrow::compute::ExecPlan> plan,
                                     std::shared_ptr<Table> table, int batch_size) {
  arrow::compute::TableSourceNodeOptions source_options(table, batch_size);
  auto maybe_source = arrow::compute::MakeExecNode("table_source", plan.get(),
                                                   /*inputs=*/{}, source_options);
  return *maybe_source;
}
iChauster marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/// Benchmarks a join-like node fed by one left table and `num_right_tables`
/// right tables, all produced by MakeTable.
///
/// The tables are generated once, before the timed loop. On every iteration a
/// fresh plan is assembled with timing paused; only StartAndCollect is timed.
///
/// \param state benchmark state; receives the custom counters set at the end
/// \param left_table_properties description of the left table (its column_prefix
///        and seed are overwritten with "lt" and 0)
/// \param left_table_batch_size batch size of the left table source node
/// \param right_table_properties description shared by all right tables (their
///        column_prefix and seed are overwritten per table)
/// \param right_table_batch_size batch size of every right table source node
/// \param num_right_tables number of right-hand inputs to generate
/// \param factory_name name of the exec node factory under test
/// \param options options passed to that factory
static void TableJoinOverhead(benchmark::State& state,
                              TableGenerationProperties left_table_properties,
                              int left_table_batch_size,
                              TableGenerationProperties right_table_properties,
                              int right_table_batch_size, int num_right_tables,
                              std::string factory_name, ExecNodeOptions& options) {
  ExecContext ctx(default_memory_pool(), nullptr);

  left_table_properties.column_prefix = "lt";
  left_table_properties.seed = 0;
  TableStats left_table_stats = MakeTable(left_table_properties);

  size_t right_hand_rows = 0;
  size_t right_hand_bytes = 0;
  std::vector<TableStats> right_input_tables;
  // A negative count would convert to a huge size_t and make reserve() throw.
  if (num_right_tables > 0) {
    right_input_tables.reserve(static_cast<size_t>(num_right_tables));
  }

  for (int i = 0; i < num_right_tables; i++) {
    // Each right table gets its own column prefix and seed.
    right_table_properties.column_prefix = "rt" + std::to_string(i);
    right_table_properties.seed = i + 1;
    TableStats right_table_stats = MakeTable(right_table_properties);
    right_hand_rows += right_table_stats.total_rows;
    right_hand_bytes += right_table_stats.total_bytes;
    right_input_tables.push_back(right_table_stats);
  }

  for (auto _ : state) {
    // Plan construction is excluded from the measurement.
    state.PauseTiming();
    ASSERT_OK_AND_ASSIGN(std::shared_ptr<arrow::compute::ExecPlan> plan,
                         ExecPlan::Make(&ctx));
    std::vector<ExecNode*> input_nodes;
    // Reserve before filling so the vector is allocated exactly once.
    input_nodes.reserve(right_input_tables.size() + 1);
    input_nodes.push_back(
        MakeTableSourceNode(plan, left_table_stats.table, left_table_batch_size));
    // Iterate by const reference: copying a TableStats copies a shared_ptr.
    for (const TableStats& table_stats : right_input_tables) {
      input_nodes.push_back(
          MakeTableSourceNode(plan, table_stats.table, right_table_batch_size));
    }
    ASSERT_OK_AND_ASSIGN(arrow::compute::ExecNode * join_node,
                         MakeExecNode(factory_name, plan.get(), input_nodes, options));
    AsyncGenerator<util::optional<ExecBatch>> sink_gen;
    // Check the sink node too, so a failure here is not silently dropped.
    ASSERT_OK(MakeExecNode("sink", plan.get(), {join_node}, SinkNodeOptions{&sink_gen}));
    state.ResumeTiming();
    ASSERT_FINISHES_OK(StartAndCollect(plan.get(), sink_gen));
  }

  // Totals over all iterations and all inputs, flagged as rates.
  state.counters["total_rows_per_second"] = benchmark::Counter(
      static_cast<double>(state.iterations() *
                          (left_table_stats.total_rows + right_hand_rows)),
      benchmark::Counter::kIsRate);

  state.counters["total_bytes_per_second"] = benchmark::Counter(
      static_cast<double>(state.iterations() *
                          (left_table_stats.total_bytes + right_hand_bytes)),
      benchmark::Counter::kIsRate);

  state.counters["maximum_peak_memory"] =
      benchmark::Counter(static_cast<double>(ctx.memory_pool()->max_memory()));
}
|
||
/// Benchmark entry point for the "asofjoin" node.
///
/// Benchmark arguments, by index: 0-2 are the left table's frequency, column
/// count and id count; 3 is the left batch size; 4 is the number of right
/// tables; 5-7 are the right tables' frequency, column count and id count; 8 is
/// the right batch size.
static void AsOfJoinOverhead(benchmark::State& state) {
  const int64_t tolerance = 0;
  AsofJoinNodeOptions asof_options(kTimeCol, kKeyCol, tolerance);

  // The empty prefix and zero seed are placeholders that TableJoinOverhead
  // overwrites.
  TableGenerationProperties left_properties{static_cast<int>(state.range(0)),
                                            static_cast<int>(state.range(1)),
                                            static_cast<int>(state.range(2)),
                                            "",
                                            kDefaultMinColumnVal,
                                            kDefaultMaxColumnVal,
                                            0,
                                            kDefaultStart,
                                            kDefaultEnd};
  TableGenerationProperties right_properties{static_cast<int>(state.range(5)),
                                             static_cast<int>(state.range(6)),
                                             static_cast<int>(state.range(7)),
                                             "",
                                             kDefaultMinColumnVal,
                                             kDefaultMaxColumnVal,
                                             0,
                                             kDefaultStart,
                                             kDefaultEnd};

  const int left_batch_size = static_cast<int>(state.range(3));
  const int right_table_count = static_cast<int>(state.range(4));
  const int right_batch_size = static_cast<int>(state.range(8));

  TableJoinOverhead(state, left_properties, left_batch_size, right_properties,
                    right_batch_size, right_table_count, "asofjoin", asof_options);
}
|
||
// Registers the argument sets to benchmark. Each group varies one parameter
// (frequency, column count, id count, number of right tables, batch size) while
// the others stay at their defaults; the left and right tables always share the
// same settings.
void SetArgs(benchmark::internal::Benchmark* bench) {
  bench
      ->ArgNames({"left_freq", "left_cols", "left_ids", "left_batch_size",
                  "num_right_tables", "right_freq", "right_cols", "right_ids",
                  "right_batch_size"})
      ->UseRealTime();

  const int base_freq = 5;
  const int base_cols = 20;
  const int base_ids = 500;
  const int base_num_tables = 1;
  const int base_batch_size = 100;

  // Adds one argument set in which both sides use identical table settings.
  auto add_case = [bench](int freq, int cols, int ids, int batch_size,
                          int num_tables) {
    bench->Args({freq, cols, ids, batch_size, num_tables, freq, cols, ids, batch_size});
  };

  for (int freq : {1, 5, 10}) {
    add_case(freq, base_cols, base_ids, base_batch_size, base_num_tables);
  }
  for (int cols : {10, 20, 100}) {
    add_case(base_freq, cols, base_ids, base_batch_size, base_num_tables);
  }
  for (int ids : {100, 500, 1000}) {
    add_case(base_freq, base_cols, ids, base_batch_size, base_num_tables);
  }
  for (int num_tables : {1, 10, 50}) {
    add_case(base_freq, base_cols, base_ids, base_batch_size, num_tables);
  }
  for (int batch_size : {1, 500, 1000}) {
    add_case(base_freq, base_cols, base_ids, batch_size, base_num_tables);
  }
}
|
||
// Registers AsOfJoinOverhead as a benchmark; SetArgs is applied to it to supply
// the argument names and the argument sets to run.
BENCHMARK(AsOfJoinOverhead)->Apply(SetArgs); | ||
|
||
} // namespace compute | ||
} // namespace arrow |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,6 +40,7 @@ | |
#include "arrow/datum.h" | ||
#include "arrow/record_batch.h" | ||
#include "arrow/table.h" | ||
#include "arrow/testing/builder.h" | ||
#include "arrow/testing/gtest_util.h" | ||
#include "arrow/testing/random.h" | ||
#include "arrow/type.h" | ||
|
@@ -459,5 +460,41 @@ void PrintTo(const Declaration& decl, std::ostream* os) { | |
*os << "}"; | ||
} | ||
|
||
std::shared_ptr<Table> MakeRandomTimeSeriesTable( | ||
const TableGenerationProperties& properties) { | ||
int total_columns = properties.num_columns + 2; | ||
std::vector<std::shared_ptr<Array>> columns; | ||
columns.reserve(total_columns); | ||
arrow::FieldVector field_vector; | ||
field_vector.reserve(total_columns); | ||
|
||
field_vector.push_back(field("time", int64())); | ||
field_vector.push_back(field("id", int32())); | ||
|
||
Int64Builder time_column_builder; | ||
Int32Builder id_column_builder; | ||
for (int time = properties.start; time <= properties.end; | ||
time += properties.time_frequency) { | ||
for (int id = 0; id < properties.num_ids; id++) { | ||
time_column_builder.Append(time); | ||
id_column_builder.Append(id); | ||
} | ||
} | ||
|
||
int num_rows = time_column_builder.length(); | ||
columns.push_back(time_column_builder.Finish().ValueOrDie()); | ||
columns.push_back(id_column_builder.Finish().ValueOrDie()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you change the function to return
iChauster marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
for (int i = 0; i < properties.num_columns; i++) { | ||
field_vector.push_back( | ||
field(properties.column_prefix + std::to_string(i), float64())); | ||
random::RandomArrayGenerator rand = random::RandomArrayGenerator(properties.seed + i); | ||
iChauster marked this conversation as resolved.
Show resolved
Hide resolved
|
||
columns.push_back( | ||
rand.Float64(num_rows, properties.min_column_value, properties.max_column_value)); | ||
} | ||
std::shared_ptr<arrow::Schema> schema = arrow::schema(std::move(field_vector)); | ||
return Table::Make(schema, columns, num_rows); | ||
} | ||
|
||
} // namespace compute | ||
} // namespace arrow |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -145,5 +145,39 @@ class Random64Bit { | |
std::uniform_int_distribution<uint64_t> dist_; | ||
}; | ||
|
||
/// Specifies the properties of a table to be generated.
///
/// This is a plain aggregate (no default member initializers) so that it can be
/// brace-initialized in declaration order.
struct TableGenerationProperties {
  /// Indicates the amount of time between data points that lie between
  /// the start and end parameters.
  int time_frequency;
  /// The number of additional random columns in the table.
  int num_columns;
  /// The number of unique keys in the table.
  int num_ids;
  /// Specifies the prefix of each randomly generated column.
  std::string column_prefix;
  /// Specifies the minimum value in the randomly generated column(s).
  int min_column_value;
  /// Specifies the maximum value in the randomly generated column(s).
  int max_column_value;
  /// The random seed the random array generator is given to generate the additional
  /// columns. Spelled `unsigned int` because `uint` is not a standard C++ type
  /// name and is not available on every platform.
  unsigned int seed;
  /// Specifies the beginning of 'time' recorded in the table, inclusive.
  int start;
  /// Specifies the end of 'time' recorded in the table, inclusive.
  int end;
};
|
||
/// The table generated in accordance to the TableGenerationProperties has the following | ||
/// schema: time (int64) id (int32) [properties.column_prefix]0 (float64) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I think this one got caught in the linting / formatter and made it a bit unclear, but each column is numbered from 0 to n - 1 inclusive, so each column name is something like [properties.column_prefix][i] where i = {0...n-1}. Is there a way I can make this clearer through the comments?
iChauster marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// [properties.column_prefix]1 (float64) | ||
/// ... | ||
/// [properties.column_prefix][properties.num_columns] (float64) | ||
/// Each id has rows corresponding to a singular data point in the time range (start, end, | ||
/// time_frequency). The table is sorted by time. | ||
std::shared_ptr<Table> MakeRandomTimeSeriesTable( | ||
const TableGenerationProperties& properties); | ||
|
||
} // namespace compute | ||
} // namespace arrow |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are some utilities you can use in
arrow/util/byte_size.h
too if you want a more accurate measure of the size (e.g. they also report the space used by validity bitmaps). However, this is fine too, I think. It represents a more conceptual data size.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After some testing, it seems these numbers are identical.