-
Notifications
You must be signed in to change notification settings - Fork 648
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add core algorithms for columnar serialization
This adds the core of the columnar serialization code paths. Even though we internally scan in a columnar fashion in the tablet server, sending those columns across the wire isn't straightforward. We have two bits of necessary processing: 1) the selection vector needs to be taken into account so we only send back selected rows. This means we need to copy out the selected cells and also copy out the selected bits from the null bitmap where relevant. Doing the null bitmap portion efficiently with wide platform support makes up a lot of this patch. 2) for the case of null values, we want to make sure we don't send uninitialized memory (which might include secrets!) to the client. So we need to zero out any cells where the corresponding non-null bitmap bit is unset. To keep the review manageable, this just adds some unit tests and all the new code is initially "dead". Later commits will add the parts that construct the full block of columns to be sent on the wire, hook this into the tserver, etc. Change-Id: I16f2993081aac54609aab4d8219ef0bf6c7708c2 Reviewed-on: http://gerrit.cloudera.org:8080/15556 Tested-by: Kudu Jenkins Reviewed-by: Andrew Wong <awong@cloudera.com> Reviewed-by: Alexey Serbin <aserbin@cloudera.com>
- Loading branch information
1 parent
0a46332
commit 0ba6cb8
Showing
7 changed files
with
842 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "kudu/common/columnar_serialization.h" | ||
|
||
#include <cstddef> | ||
#include <cstdint> | ||
#include <ostream> | ||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include <glog/logging.h> | ||
#include <gtest/gtest.h> | ||
|
||
#include "kudu/util/bitmap.h" | ||
#include "kudu/util/faststring.h" | ||
#include "kudu/util/random.h" | ||
#include "kudu/util/scoped_cleanup.h" | ||
#include "kudu/util/test_util.h" | ||
|
||
using std::vector; | ||
|
||
namespace kudu { | ||
|
||
class ColumnarSerializationTest : public KuduTest { | ||
protected: | ||
ColumnarSerializationTest() : rng_(SeedRandom()) { | ||
} | ||
|
||
// TODO(todd): templatize this test for other types once we have specialized | ||
// implementations. | ||
using DataType = uint32_t; | ||
static constexpr int kTypeSize = sizeof(DataType); | ||
|
||
struct RandomCellsAndNulls { | ||
vector<DataType> vals; | ||
faststring non_nulls; | ||
|
||
void VerifyNullsAreZeroed() { | ||
for (int i = 0; i < vals.size(); i++) { | ||
SCOPED_TRACE(i); | ||
if (BitmapTest(non_nulls.data(), i)) { | ||
EXPECT_EQ(0xdeadbeef, vals[i]); | ||
} else { | ||
EXPECT_EQ(0, vals[i]); | ||
} | ||
} | ||
} | ||
}; | ||
|
||
// Generate a random bitmap with the given number of bits. | ||
faststring RandomBitmap(int n_bits) { | ||
faststring bm; | ||
bm.resize(BitmapSize(n_bits)); | ||
|
||
for (int i = 0; i < n_bits; i++) { | ||
BitmapChange(bm.data(), i, rng_.OneIn(3)); | ||
} | ||
return bm; | ||
} | ||
|
||
// Create an array of 0xdeadbeef values and a corresponding | ||
// null bitmap with random entries set to null. | ||
RandomCellsAndNulls CreateDeadBeefsWithRandomNulls() { | ||
auto num_rows = rng_.Uniform(1000) + 1; | ||
vector<uint32_t> vals(num_rows, 0xdeadbeef); | ||
faststring non_nulls = RandomBitmap(num_rows); | ||
return { std::move(vals), std::move(non_nulls) }; | ||
} | ||
|
||
Random rng_; | ||
}; | ||
|
||
|
||
// Runs ZeroNullValues once over an entire array, starting at index 0, and
// then checks that exactly the null cells were zeroed.
TEST_F(ColumnarSerializationTest, TestZeroNullValues) {
  auto data = CreateDeadBeefsWithRandomNulls();

  // Name each argument up front so the call itself reads at a glance.
  const auto n_cells = data.vals.size();
  auto* cell_bytes = reinterpret_cast<uint8_t*>(data.vals.data());
  auto* non_null_bits = data.non_nulls.data();

  internal::ZeroNullValues(kTypeSize, /* dst_idx= */0, n_cells,
                           cell_bytes, non_null_bits);

  ASSERT_NO_FATAL_FAILURE(data.VerifyNullsAreZeroed());
}
|
||
// More complex test of ZeroNullValues which covers the array with a series
// of consecutive, randomly-sized sub-ranges instead of a single call.
TEST_F(ColumnarSerializationTest, TestZeroNullValuesWithOffset) {
  auto data = CreateDeadBeefsWithRandomNulls();
  auto* cell_bytes = reinterpret_cast<uint8_t*>(data.vals.data());
  auto* non_null_bits = data.non_nulls.data();

  // Each iteration handles the next chunk of rng_.Uniform(remaining) + 1
  // rows, beginning where the previous chunk ended.
  for (int dst_idx = 0; dst_idx < data.vals.size(); ) {
    auto remaining = data.vals.size() - dst_idx;
    auto chunk_rows = rng_.Uniform(remaining) + 1;
    internal::ZeroNullValues(kTypeSize, dst_idx, chunk_rows,
                             cell_bytes, non_null_bits);
    dst_idx += chunk_rows;
  }
  ASSERT_NO_FATAL_FAILURE(data.VerifyNullsAreZeroed());
}
|
||
// Runs CopyNonNullBitmap once per available pext method on random non-null
// and selection bitmaps, and checks that the destination bitmap holds the
// non-null bit of each selected row, in selection order.
TEST_F(ColumnarSerializationTest, TestCopyNonNullBitmap) {
  // The loop below overwrites the global pext method; put the original value
  // back when the test scope exits.
  const auto original_method = internal::g_pext_method;
  SCOPED_CLEANUP({ internal::g_pext_method = original_method; });

  // Test using all available methods. Depending on the machine where
  // the test is running we might miss some, but we typically run this
  // test on relatively recent hardware that would support BMI2 (Haswell
  // or later).
  for (auto method : internal::GetAvailablePextMethods()) {
    SCOPED_TRACE(static_cast<int>(method));
    internal::g_pext_method = method;

    // Keep the order of these random draws: row count first, then the
    // non-null bitmap, then the selection bitmap.
    const auto n_rows = 1 + rng_.Uniform(200);
    faststring non_null_bitmap = RandomBitmap(n_rows);
    faststring sel_bitmap = RandomBitmap(n_rows);

    faststring dst_bitmap;
    dst_bitmap.resize(BitmapSize(n_rows));

    internal::CopyNonNullBitmap(non_null_bitmap.data(),
                                sel_bitmap.data(),
                                /*dst_idx=*/0,
                                n_rows,
                                dst_bitmap.data());

    // Build the expected output: for every set bit in the selection bitmap,
    // record the matching bit of the non-null bitmap.
    vector<bool> expected;
    auto record_selected = [&](size_t bit) {
      expected.push_back(BitmapTest(non_null_bitmap.data(), bit));
    };
    ForEachSetBit(sel_bitmap.data(), n_rows, record_selected);

    LOG(INFO) << "non-null: " << BitmapToString(non_null_bitmap.data(), n_rows);
    LOG(INFO) << "selection: " << BitmapToString(sel_bitmap.data(), n_rows);
    LOG(INFO) << "result: " << BitmapToString(dst_bitmap.data(), expected.size());

    for (size_t i = 0; i < expected.size(); i++) {
      EXPECT_EQ(expected[i], BitmapTest(dst_bitmap.data(), i));
    }
  }
}
|
||
// Fills an array with random values, picks a random subset of row indexes,
// and checks that CopySelectedRows emits exactly the selected values in
// index order.
TEST_F(ColumnarSerializationTest, TestCopySelectedRows) {
  const auto num_rows = rng_.Uniform(1000) + 1;

  // Source cells: one rng_.Next() draw per row, taken in row order.
  vector<uint32_t> vals(num_rows);
  for (auto& cell : vals) {
    cell = rng_.Next();
  }

  // Choose rows in a separate pass (one rng_.OneIn(3) draw per row),
  // recording both the chosen index and the value expected in the output.
  vector<uint32_t> expected;
  vector<uint16_t> sel_indexes;
  for (size_t row = 0; row < num_rows; row++) {
    if (!rng_.OneIn(3)) {
      continue;
    }
    sel_indexes.push_back(row);
    expected.push_back(vals[row]);
  }

  vector<uint32_t> ret(expected.size());
  const auto* src_bytes = reinterpret_cast<const uint8_t*>(vals.data());
  auto* dst_bytes = reinterpret_cast<uint8_t*>(ret.data());
  internal::CopySelectedRows(sel_indexes, kTypeSize, src_bytes, dst_bytes);
  ASSERT_EQ(expected, ret);
}
|
||
} // namespace kudu |
Oops, something went wrong.