Skip to content

Commit

Permalink
Add core algorithms for columnar serialization
Browse files Browse the repository at this point in the history
This adds the core of the columnar serialization code paths. Even though
we internally scan in a columnar fashion in the tablet server, sending
those columns across the wire isn't straightforward. We have two bits of
necessary processing:

1) the selection vector needs to be taken into account so we only send
back selected rows. This means we need to copy out the selected cells
and also copy out the selected bits from the null bitmap where relevant.
Doing the null bitmap portion efficiently with wide platform support
makes up a lot of this patch.

2) for the case of null values, we want to make sure we don't send
uninitialized memory (which might include secrets!) to the client. So we
need to zero out any cells where the corresponding non-null bitmap bit
is unset.

To keep the review manageable, this just adds some unit tests and all
the new code is initially "dead". Later commits will add the parts that
construct the full block of columns to be sent on the wire, hook this
into the tserver, etc.

Change-Id: I16f2993081aac54609aab4d8219ef0bf6c7708c2
Reviewed-on: http://gerrit.cloudera.org:8080/15556
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <awong@cloudera.com>
Reviewed-by: Alexey Serbin <aserbin@cloudera.com>
  • Loading branch information
toddlipcon committed Mar 27, 2020
1 parent 0a46332 commit 0ba6cb8
Show file tree
Hide file tree
Showing 7 changed files with 842 additions and 1 deletion.
25 changes: 25 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,31 @@ src/kudu/util/array_view.h: 3-clause BSD license with patent grant
for this implementation of the WebRTC code package shall terminate as
of the date such litigation is filed.

--------------------------------------------------------------------------------

src/kudu/common/zp7.cc: MIT license

ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)

Copyright (c) 2020 Zach Wegner

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------

Expand Down
5 changes: 4 additions & 1 deletion src/kudu/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ ADD_EXPORTABLE_LIBRARY(wire_protocol_proto
set(COMMON_SRCS
columnblock.cc
column_predicate.cc
columnar_serialization.cc
encoded_key.cc
generic_iterators.cc
id_mapping.cc
Expand All @@ -60,7 +61,8 @@ set(COMMON_SRCS
table_util.cc
timestamp.cc
types.cc
wire_protocol.cc)
wire_protocol.cc
zp7.cc)

# Workaround for clang bug https://llvm.org/bugs/show_bug.cgi?id=23757
# in which it incorrectly optimizes key_util.cc and causes incorrect results.
Expand All @@ -80,6 +82,7 @@ ADD_EXPORTABLE_LIBRARY(kudu_common
DEPS ${COMMON_LIBS})

SET_KUDU_TEST_LINK_LIBS(kudu_common)
ADD_KUDU_TEST(columnar_serialization-test)
ADD_KUDU_TEST(columnblock-test)
ADD_KUDU_TEST(column_predicate-test NUM_SHARDS 4)
ADD_KUDU_TEST(encoded_key-test)
Expand Down
179 changes: 179 additions & 0 deletions src/kudu/common/columnar_serialization-test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "kudu/common/columnar_serialization.h"

#include <cstddef>
#include <cstdint>
#include <ostream>
#include <string>
#include <utility>
#include <vector>

#include <glog/logging.h>
#include <gtest/gtest.h>

#include "kudu/util/bitmap.h"
#include "kudu/util/faststring.h"
#include "kudu/util/random.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/test_util.h"

using std::vector;

namespace kudu {

// Test fixture supplying randomized inputs (cell arrays and bitmaps) for the
// columnar serialization helpers exercised in this file.
class ColumnarSerializationTest : public KuduTest {
 protected:
  ColumnarSerializationTest() : rng_(SeedRandom()) {
  }

  // TODO(todd): templatize this test for other types once we have specialized
  // implementations.
  using DataType = uint32_t;
  static constexpr int kTypeSize = sizeof(DataType);

  // An array of cells plus the bitmap describing which of them are non-null
  // (bit i set => cell i is non-null).
  struct RandomCellsAndNulls {
    vector<DataType> vals;
    faststring non_nulls;

    // Expects every non-null cell to still hold the 0xdeadbeef fill pattern
    // and every null cell to have been zeroed.
    void VerifyNullsAreZeroed() const {
      for (size_t i = 0; i < vals.size(); i++) {
        SCOPED_TRACE(i);
        if (BitmapTest(non_nulls.data(), i)) {
          EXPECT_EQ(static_cast<DataType>(0xdeadbeef), vals[i]);
        } else {
          // Compare against a DataType zero so both operands share a type.
          EXPECT_EQ(static_cast<DataType>(0), vals[i]);
        }
      }
    }
  };

  // Generate a random bitmap with the given number of bits. Each of the
  // first 'n_bits' bits is set whenever rng_.OneIn(3) returns true.
  faststring RandomBitmap(int n_bits) {
    faststring bm;
    const size_t n_bytes = BitmapSize(n_bits);
    bm.resize(n_bytes);
    // NOTE(review): faststring::resize() is believed not to clear newly
    // exposed bytes -- confirm. Zero explicitly so that any padding bits past
    // 'n_bits' in the final byte have a deterministic value.
    for (size_t b = 0; b < n_bytes; b++) {
      bm.data()[b] = 0;
    }

    for (int i = 0; i < n_bits; i++) {
      BitmapChange(bm.data(), i, rng_.OneIn(3));
    }
    return bm;
  }

  // Create an array of 0xdeadbeef values and a corresponding
  // null bitmap with random entries set to null. The array holds between
  // 1 and 1000 cells, assuming rng_.Uniform(1000) yields a value in
  // [0, 1000) -- TODO confirm.
  RandomCellsAndNulls CreateDeadBeefsWithRandomNulls() {
    const int num_rows = static_cast<int>(rng_.Uniform(1000)) + 1;
    // Use DataType (not a hard-coded uint32_t) so the cells stay in sync with
    // kTypeSize if the alias above is ever changed.
    const DataType fill = static_cast<DataType>(0xdeadbeef);
    vector<DataType> vals(num_rows, fill);
    faststring non_nulls = RandomBitmap(num_rows);
    return { std::move(vals), std::move(non_nulls) };
  }

  Random rng_;
};


// Basic ZeroNullValues check: a single call covers the entire array, after
// which every null cell must read as zero and every other cell must be
// untouched.
TEST_F(ColumnarSerializationTest, TestZeroNullValues) {
  auto cells = CreateDeadBeefsWithRandomNulls();

  // View the typed cell array as the raw byte buffer the helper operates on.
  uint8_t* raw_values = reinterpret_cast<uint8_t*>(cells.vals.data());
  uint8_t* non_null_bits = cells.non_nulls.data();

  internal::ZeroNullValues(kTypeSize,
                           /* dst_idx= */0,
                           cells.vals.size(),
                           raw_values,
                           non_null_bits);

  ASSERT_NO_FATAL_FAILURE(cells.VerifyNullsAreZeroed());
}

// More complex test of ZeroNullValues which runs on sub-ranges
// of an array: the array is processed as a sequence of consecutive,
// randomly-sized chunks, each call starting at the index where the previous
// one stopped.
TEST_F(ColumnarSerializationTest, TestZeroNullValuesWithOffset) {
  auto data = CreateDeadBeefsWithRandomNulls();
  // Work in 'int' throughout so the loop condition does not mix signed and
  // unsigned operands.
  const int num_rows = static_cast<int>(data.vals.size());
  int dst_idx = 0;
  while (dst_idx < num_rows) {
    const int rem = num_rows - dst_idx;
    // The "+ 1" guarantees forward progress; the chunk stays within the
    // remaining rows assuming Uniform(rem) yields a value in [0, rem)
    // -- TODO confirm.
    const int n = static_cast<int>(rng_.Uniform(static_cast<uint32_t>(rem))) + 1;
    internal::ZeroNullValues(
        kTypeSize, dst_idx, n,
        reinterpret_cast<uint8_t*>(data.vals.data()),
        data.non_nulls.data());
    dst_idx += n;
  }
  ASSERT_NO_FATAL_FAILURE(data.VerifyNullsAreZeroed());
}

// Checks CopyNonNullBitmap against a bit-by-bit reference computation, once
// for each method returned by GetAvailablePextMethods().
TEST_F(ColumnarSerializationTest, TestCopyNonNullBitmap) {
  // The loop below overwrites the global method selection, so remember the
  // current value and put it back when the test scope exits.
  auto original_method = internal::g_pext_method;
  SCOPED_CLEANUP({ internal::g_pext_method = original_method; });

  // Exercise every method available on this machine. Some may be missing
  // depending on the hardware, but this test typically runs on relatively
  // recent machines that would support BMI2 (Haswell or later).
  auto methods_to_test = internal::GetAvailablePextMethods();
  for (auto method : methods_to_test) {
    SCOPED_TRACE(static_cast<int>(method));
    internal::g_pext_method = method;

    auto row_count = 1 + rng_.Uniform(200);
    faststring non_nulls = RandomBitmap(row_count);
    faststring selection = RandomBitmap(row_count);
    faststring copied;
    copied.resize(BitmapSize(row_count));

    internal::CopyNonNullBitmap(non_nulls.data(),
                                selection.data(),
                                /*dst_idx=*/0,
                                row_count,
                                copied.data());

    // Reference result: for each selected row, in order, record whether that
    // row is non-null.
    vector<bool> expected;
    auto record_selected = [&](size_t bit) {
      expected.push_back(BitmapTest(non_nulls.data(), bit));
    };
    ForEachSetBit(selection.data(), row_count, record_selected);

    LOG(INFO) << "non-null: " << BitmapToString(non_nulls.data(), row_count);
    LOG(INFO) << "selection: " << BitmapToString(selection.data(), row_count);
    LOG(INFO) << "result: " << BitmapToString(copied.data(), expected.size());

    // Only the first expected.size() output bits are compared.
    for (size_t pos = 0; pos != expected.size(); ++pos) {
      EXPECT_EQ(expected[pos], BitmapTest(copied.data(), pos));
    }
  }
}

// Verifies that CopySelectedRows writes exactly the cells named by the list
// of selected row indexes, in list order, into the destination buffer.
TEST_F(ColumnarSerializationTest, TestCopySelectedRows) {
  const int num_rows = static_cast<int>(rng_.Uniform(1000)) + 1;

  // Source cells. DataType is used (rather than a hard-coded uint32_t) so the
  // buffers always agree with the kTypeSize passed to CopySelectedRows.
  vector<DataType> vals;
  vals.reserve(num_rows);
  for (int i = 0; i < num_rows; i++) {
    vals.push_back(rng_.Next());
  }

  // Pick a random subset of rows, recording both the selected indexes and
  // the values we expect to be copied out.
  vector<DataType> expected;
  vector<uint16_t> sel_indexes;
  for (int i = 0; i < num_rows; i++) {
    if (rng_.OneIn(3)) {
      // num_rows was drawn as Uniform(1000) + 1 above, so the index is
      // expected to fit in uint16_t; the cast makes the narrowing explicit.
      sel_indexes.push_back(static_cast<uint16_t>(i));
      expected.push_back(vals[i]);
    }
  }

  vector<DataType> ret(expected.size());
  internal::CopySelectedRows(sel_indexes, kTypeSize,
                             reinterpret_cast<const uint8_t*>(vals.data()),
                             reinterpret_cast<uint8_t*>(ret.data()));
  ASSERT_EQ(expected, ret);
}

} // namespace kudu
Loading

0 comments on commit 0ba6cb8

Please sign in to comment.