diff --git a/README.md b/README.md index fbcd4d3ef66..e40bf6ae23e 100755 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ The official docker image for building is [`foundationdb/build`](https://hub.doc To build outside the official docker image you'll need at least these dependencies: 1. Install cmake Version 3.13 or higher [CMake](https://cmake.org/) -1. Install [Mono](http://www.mono-project.com/download/stable/) +1. Install [Mono](https://www.mono-project.com/download/stable/) 1. Install [Ninja](https://ninja-build.org/) (optional, but recommended) If compiling for local development, please set `-DUSE_WERROR=ON` in @@ -177,7 +177,7 @@ Under Windows, only Visual Studio with ClangCl is supported 1. Install [Python](https://www.python.org/downloads/) if is not already installed by Visual Studio 1. (Optional) Install [OpenJDK 11](https://developers.redhat.com/products/openjdk/download) to build Java bindings 1. (Optional) Install [OpenSSL 3.x](https://slproweb.com/products/Win32OpenSSL.html) to build with TLS support -1. (Optional) Install [WIX Toolset](http://wixtoolset.org/) to build Windows installer +1. (Optional) Install [WIX Toolset](https://wixtoolset.org/) to build Windows installer 1. `mkdir build && cd build` 1. `cmake -G "Visual Studio 16 2019" -A x64 -T ClangCl ` 1. `msbuild /p:Configuration=Release foundationdb.sln` diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index 9c178a09d58..d914e9d9dca 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -202,6 +202,7 @@ def __init__(self, args): self.args.types = list(reduce(lambda t1, t2: filter(t1.__contains__, t2), map(lambda tester: tester.types, self.testers))) self.args.no_directory_snapshot_ops = self.args.no_directory_snapshot_ops or any([not tester.directory_snapshot_ops_enabled for tester in self.testers]) + self.args.no_tenants = self.args.no_tenants or any([not tester.tenants_enabled for tester in self.testers]) def print_test(self): test_instructions = self._generate_test() @@ -282,6 +283,17 @@ def _generate_test(self): def _insert_instructions(self, test_instructions): util.get_logger().info('\nInserting test into database...') del self.db[:] + + while True: + tr = self.db.create_transaction() + try: + tr.options.set_special_key_space_enable_writes() + del tr[b'\xff\xff/management/tenant_map/' : b'\xff\xff/management/tenant_map0'] + tr.commit().wait() + break + except fdb.FDBError as e: + tr.on_error(e).wait() + for subspace, thread in test_instructions.items(): thread.insert_operations(self.db, subspace) @@ -445,6 +457,8 @@ def parse_args(argv): parser.add_argument('--no-directory-snapshot-ops', action='store_true', help='Disables snapshot operations for directory instructions.') + parser.add_argument('--no-tenants', action='store_true', help='Disables tenant operations.') + return parser.parse_args(argv) diff --git a/bindings/bindingtester/known_testers.py b/bindings/bindingtester/known_testers.py index 0fe5ad638f7..fbae72d36c3 100644 --- a/bindings/bindingtester/known_testers.py +++ b/bindings/bindingtester/known_testers.py @@ -26,7 +26,7 @@ class Tester: - def __init__(self, name, cmd, max_int_bits=64, min_api_version=0, max_api_version=MAX_API_VERSION, threads_enabled=True, types=COMMON_TYPES, directory_snapshot_ops_enabled=True): + def __init__(self, name, cmd, max_int_bits=64, min_api_version=0, max_api_version=MAX_API_VERSION, threads_enabled=True, types=COMMON_TYPES, directory_snapshot_ops_enabled=True, 
tenants_enabled=False): self.name = name self.cmd = cmd self.max_int_bits = max_int_bits @@ -35,6 +35,7 @@ def __init__(self, name, cmd, max_int_bits=64, min_api_version=0, max_api_versio self.threads_enabled = threads_enabled self.types = types self.directory_snapshot_ops_enabled = directory_snapshot_ops_enabled + self.tenants_enabled = tenants_enabled def supports_api_version(self, api_version): return api_version >= self.min_api_version and api_version <= self.max_api_version @@ -57,8 +58,8 @@ def _absolute_path(path): # We could set min_api_version lower on some of these if the testers were updated to support them testers = { - 'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES), - 'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES), + 'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True), + 'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True), 'ruby': Tester('ruby', _absolute_path('ruby/tests/tester.rb'), 2040, 23, MAX_API_VERSION), 'java': Tester('java', _java_cmd + 'StackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES), 'java_async': Tester('java', _java_cmd + 'AsyncStackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES), diff --git a/bindings/bindingtester/spec/tenantTester.md b/bindings/bindingtester/spec/tenantTester.md new file mode 100644 index 00000000000..2ba54a74c4a --- /dev/null +++ b/bindings/bindingtester/spec/tenantTester.md @@ -0,0 +1,77 @@ +Overview +-------- + +Tenant testing is an optional extension to the core binding tester that enables +testing of the tenant API. This testing is enabled by adding some additional +instructions and modifying the behavior of some existing instructions. + +Additional State and Initialization +----------------------------------- + +Your tester should store an additional piece of state tracking the active tenant +that is to be used to create transactions. This tenant must support an unset +state, in which case transactions will be created directly on the database. + +New Instructions +---------------- + +The tenant API introduces some new operations: + +#### TENANT_CREATE + + Pops the top item off of the stack as TENANT_NAME. Creates a new tenant + in the database with the name TENANT_NAME. May optionally push a future + onto the stack. + +#### TENANT_DELETE + + Pops the top item off of the stack as TENANT_NAME. Deletes the tenant with + the name TENANT_NAME from the database. May optionally push a future onto + the stack. + +#### TENANT_SET_ACTIVE + + Pops the top item off of the stack as TENANT_NAME. Opens the tenant with + name TENANT_NAME and stores it as the active tenant. + +#### TENANT_CLEAR_ACTIVE + + Unsets the active tenant. + +Updates to Existing Instructions +-------------------------------- + +Some existing operations in the binding tester will have slightly modified +behavior when tenants are enabled. + +#### NEW_TRANSACTION + + When creating a new transaction, the active tenant should be used. If no active + tenant is set, then the transaction should be created as normal using the + database. + +#### _TENANT suffix + + Similar to the _DATABASE suffix, an operation with the _TENANT suffix indicates + that the operation should be performed on the current active tenant object. 
If + there is no active tenant, then the operation should be performed on the database + as if _DATABASE was specified. In any case where the operation suffixed with + _DATABASE is allowed to push a future onto the stack, the same operation suffixed + with _TENANT is also allowed to push a future onto the stack. + + If your binding does not support operations directly on a tenant object, you should + simulate it using an anonymous transaction. Remember that set and clear operations + must immediately commit (with appropriate retry behavior!). + + Operations that can include the _TENANT prefix are: + + GET_TENANT + GET_KEY_TENANT + GET_RANGE_TENANT + GET_RANGE_STARTS_WITH_TENANT + GET_RANGE_SELECTOR_TENANT + SET_TENANT + CLEAR_TENANT + CLEAR_RANGE_TENANT + CLEAR_RANGE_STARTS_WITH_TENANT + ATOMIC_OP_TENANT diff --git a/bindings/bindingtester/tests/api.py b/bindings/bindingtester/tests/api.py index df90adf8907..fd495fac763 100644 --- a/bindings/bindingtester/tests/api.py +++ b/bindings/bindingtester/tests/api.py @@ -58,6 +58,7 @@ def setup(self, args): self.outstanding_ops = [] self.random = test_util.RandomGenerator(args.max_int_bits, args.api_version, args.types) self.api_version = args.api_version + self.allocated_tenants = set() def add_stack_items(self, num): self.stack_size += num @@ -137,6 +138,12 @@ def wait_for_reads(self, instructions): test_util.to_front(instructions, self.stack_size - read[0]) instructions.append('WAIT_FUTURE') + def choose_tenant(self, new_tenant_probability): + if len(self.allocated_tenants) == 0 or random.random() < new_tenant_probability: + return self.random.random_string(random.randint(0, 30)) + else: + return random.choice(list(self.allocated_tenants)) + def generate(self, args, thread_number): instructions = InstructionSet() @@ -158,6 +165,7 @@ def generate(self, args, thread_number): write_conflicts = ['WRITE_CONFLICT_RANGE', 'WRITE_CONFLICT_KEY', 'DISABLE_WRITE_CONFLICT'] txn_sizes = ['GET_APPROXIMATE_SIZE'] storage_metrics = ['GET_ESTIMATED_RANGE_SIZE', 'GET_RANGE_SPLIT_POINTS'] + tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE'] op_choices += reads op_choices += mutations @@ -173,6 +181,9 @@ def generate(self, args, thread_number): op_choices += txn_sizes op_choices += storage_metrics + if not args.no_tenants: + op_choices += tenants + idempotent_atomic_ops = ['BIT_AND', 'BIT_OR', 'MAX', 'MIN', 'BYTE_MIN', 'BYTE_MAX'] atomic_ops = idempotent_atomic_ops + ['ADD', 'BIT_XOR', 'APPEND_IF_FITS'] @@ -195,7 +206,7 @@ def generate(self, args, thread_number): # print 'Adding instruction %s at %d' % (op, index) - if args.concurrency == 1 and (op in database_mutations): + if args.concurrency == 1 and (op in database_mutations or op in ['TENANT_CREATE', 'TENANT_DELETE']): self.wait_for_reads(instructions) test_util.blocking_commit(instructions) self.can_get_commit_version = False @@ -570,18 +581,39 @@ def generate(self, args, thread_number): instructions.push_args(key1, key2, chunkSize) instructions.append(op) self.add_strings(1) - + elif op == 'TENANT_CREATE': + tenant_name = self.choose_tenant(0.8) + self.allocated_tenants.add(tenant_name) + instructions.push_args(tenant_name) + instructions.append(op) + self.add_strings(1) + elif op == 'TENANT_DELETE': + tenant_name = self.choose_tenant(0.2) + if tenant_name in self.allocated_tenants: + self.allocated_tenants.remove(tenant_name) + instructions.push_args(tenant_name) + instructions.append(op) + self.add_strings(1) + elif op == 'TENANT_SET_ACTIVE': + tenant_name = 
self.choose_tenant(0.8) + instructions.push_args(tenant_name) + instructions.append(op) + elif op == 'TENANT_CLEAR_ACTIVE': + instructions.append(op) else: assert False, 'Unknown operation: ' + op if read_performed and op not in database_reads: self.outstanding_ops.append((self.stack_size, len(instructions) - 1)) - if args.concurrency == 1 and (op in database_reads or op in database_mutations): + if args.concurrency == 1 and (op in database_reads or op in database_mutations or op in ['TENANT_CREATE', 'TENANT_DELETE']): instructions.append('WAIT_FUTURE') instructions.begin_finalization() + if not args.no_tenants: + instructions.append('TENANT_CLEAR_ACTIVE') + if args.concurrency == 1: self.wait_for_reads(instructions) test_util.blocking_commit(instructions) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 0c2cfde9a38..2e52b439087 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -137,6 +137,7 @@ if(NOT WIN32) add_executable(fdb_c_performance_test test/performance_test.c test/test.h) add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h) add_executable(fdb_c_txn_size_test test/txn_size_test.c test/test.h) + add_executable(fdb_c_client_memory_test test/client_memory_test.cpp test/unit/fdb_api.cpp test/unit/fdb_api.hpp) add_executable(mako ${MAKO_SRCS}) add_executable(fdb_c_setup_tests test/unit/setup_tests.cpp) add_executable(fdb_c_unit_tests ${UNIT_TEST_SRCS}) @@ -147,10 +148,12 @@ if(NOT WIN32) strip_debug_symbols(fdb_c_performance_test) strip_debug_symbols(fdb_c_ryw_benchmark) strip_debug_symbols(fdb_c_txn_size_test) + strip_debug_symbols(fdb_c_client_memory_test) endif() target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads) target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads) target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c Threads::Threads) + target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads) add_dependencies(fdb_c_setup_tests doctest) add_dependencies(fdb_c_unit_tests doctest) diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index 771e066e6cc..b2f21c90490 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -851,9 +851,10 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio context.get_load_f = granule_context.get_load_f; context.free_load_f = granule_context.free_load_f; context.debugNoMaterialize = granule_context.debugNoMaterialize; + context.granuleParallelism = granule_context.granuleParallelism; Optional rv; - if (readVersion != invalidVersion) { rv = readVersion; } + if (readVersion != latestVersion) { rv = readVersion; } return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr());); } diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 6c290f119d1..db13ab6e857 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -176,7 +176,12 @@ typedef struct readgranulecontext { void* userContext; /* Returns a unique id for the load. Asynchronous to support queueing multiple in parallel. */ - int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context); + int64_t (*start_load_f)(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* context); /* Returns data for the load. 
Pass the loadId returned by start_load_f */ uint8_t* (*get_load_f)(int64_t loadId, void* context); @@ -187,6 +192,9 @@ typedef struct readgranulecontext { /* Set this to true for testing if you don't want to read the granule files, just do the request to the blob workers */ fdb_bool_t debugNoMaterialize; + + /* Number of granules to load in parallel */ + int granuleParallelism; } FDBReadBlobGranuleContext; DLLEXPORT void fdb_future_cancel(FDBFuture* f); @@ -432,15 +440,15 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_range_split_points(F int end_key_name_length, int64_t chunk_size); -DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTransaction* db, +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length); -/* InvalidVersion (-1) for readVersion means get read version from transaction +/* LatestVersion (-2) for readVersion means get read version from transaction Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */ -DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* db, +DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, diff --git a/bindings/c/test/client_memory_test.cpp b/bindings/c/test/client_memory_test.cpp new file mode 100644 index 00000000000..4dbfb90479d --- /dev/null +++ b/bindings/c/test/client_memory_test.cpp @@ -0,0 +1,83 @@ +/* + * client_memory_test.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define FDB_API_VERSION 710 +#include + +#include "unit/fdb_api.hpp" + +#include +#include +#include + +void fdb_check(fdb_error_t e) { + if (e) { + std::cerr << fdb_get_error(e) << std::endl; + std::abort(); + } +} + +FDBDatabase* fdb_open_database(const char* clusterFile) { + FDBDatabase* db; + fdb_check(fdb_create_database(clusterFile, &db)); + return db; +} + +int main(int argc, char** argv) { + if (argc != 2) { + printf("Usage: %s ", argv[0]); + } + fdb_check(fdb_select_api_version(710)); + fdb_check(fdb_setup_network()); + std::thread network_thread{ &fdb_run_network }; + + fdb_check( + fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast(""), 0)); + fdb_check(fdb_network_set_option( + FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, reinterpret_cast("json"), 4)); + + // Use a bunch of memory from different client threads + FDBDatabase* db = fdb_open_database(argv[1]); + auto thread_func = [&]() { + fdb::Transaction tr(db); + for (int i = 0; i < 10000; ++i) { + tr.set(std::to_string(i), std::string(i, '\x00')); + } + tr.cancel(); + }; + std::vector threads; + constexpr auto kThreadCount = 64; + for (int i = 0; i < kThreadCount; ++i) { + threads.emplace_back(thread_func); + } + for (auto& thread : threads) { + thread.join(); + } + fdb_database_destroy(db); + db = nullptr; + + // Memory usage should go down now if the allocator is returning memory to the OS. It's expected that something is + // externally monitoring the memory usage of this process during this sleep. + using namespace std::chrono_literals; + std::this_thread::sleep_for(10s); + + fdb_check(fdb_stop_network()); + network_thread.join(); +} \ No newline at end of file diff --git a/bindings/c/test/mako/mako.c b/bindings/c/test/mako/mako.c index fc69ae06b58..46a75ba3874 100644 --- a/bindings/c/test/mako/mako.c +++ b/bindings/c/test/mako/mako.c @@ -585,6 +585,7 @@ int64_t granule_start_load(const char* filename, int filenameLength, int64_t offset, int64_t length, + int64_t fullFileLength, void* userContext) { FILE* fp; char full_fname[PATH_MAX]; @@ -682,6 +683,7 @@ int run_op_read_blob_granules(FDBTransaction* transaction, granuleContext.get_load_f = &granule_get_load; granuleContext.free_load_f = &granule_free_load; granuleContext.debugNoMaterialize = !doMaterialize; + granuleContext.granuleParallelism = 2; // TODO make knob or setting for changing this? r = fdb_transaction_read_blob_granules(transaction, (uint8_t*)keystr, @@ -689,7 +691,7 @@ int run_op_read_blob_granules(FDBTransaction* transaction, (uint8_t*)keystr2, strlen(keystr2), 0 /* beginVersion*/, - -1, /* endVersion. -1 is use txn read version */ + -2, /* endVersion. 
-2 (latestVersion) is use txn read version */ granuleContext); free(fileContext.data_by_id); diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index 4fc715dbc51..b26d7bdf82e 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ b/bindings/c/test/unit/fdb_api.cpp @@ -138,6 +138,12 @@ Tenant::Tenant(FDBDatabase* db, const uint8_t* name, int name_length) { } } +Tenant::~Tenant() { + if (tenant != nullptr) { + fdb_tenant_destroy(tenant); + } +} + // Transaction Transaction::Transaction(FDBDatabase* db) { if (fdb_error_t err = fdb_database_create_transaction(db, &tr_)) { @@ -146,7 +152,7 @@ Transaction::Transaction(FDBDatabase* db) { } } -Transaction::Transaction(Tenant tenant) { +Transaction::Transaction(Tenant& tenant) { if (fdb_error_t err = fdb_tenant_create_transaction(tenant.tenant, &tr_)) { std::cerr << fdb_get_error(err) << std::endl; std::abort(); diff --git a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index 5653d6e7cb5..fcf1c7e5ca1 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -206,6 +206,11 @@ class Database final { class Tenant final { public: Tenant(FDBDatabase* db, const uint8_t* name, int name_length); + ~Tenant(); + Tenant(const Tenant&) = delete; + Tenant& operator=(const Tenant&) = delete; + Tenant(Tenant&&) = delete; + Tenant& operator=(Tenant&&) = delete; private: friend class Transaction; @@ -219,7 +224,7 @@ class Transaction final { public: // Given an FDBDatabase, initializes a new transaction. Transaction(FDBDatabase* db); - Transaction(Tenant tenant); + Transaction(Tenant& tenant); ~Transaction(); // Wrapper around fdb_transaction_reset. diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 9cb2b3ae5aa..78cb2ee2e9e 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -20,6 +20,7 @@ // Unit tests for the FoundationDB C API. 
+#include "fdb_c_options.g.h" #define FDB_API_VERSION 710 #include #include @@ -2430,6 +2431,38 @@ TEST_CASE("Tenant create, access, and delete") { break; } + while (1) { + StringRef begin = "\xff\xff/management/tenant_map/"_sr; + StringRef end = "\xff\xff/management/tenant_map0"_sr; + + fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); + fdb::KeyValueArrayFuture f = tr.get_range(FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(begin.begin(), begin.size()), + FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(end.begin(), end.size()), + /* limit */ 0, + /* target_bytes */ 0, + /* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL, + /* iteration */ 0, + /* snapshot */ false, + /* reverse */ 0); + + fdb_error_t err = wait_future(f); + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + FDBKeyValue const* outKv; + int outCount; + int outMore; + fdb_check(f.get(&outKv, &outCount, &outMore)); + CHECK(outCount == 1); + CHECK(StringRef(outKv->key, outKv->key_length) == StringRef(tenantName).withPrefix(begin)); + + tr.reset(); + break; + } + fdb::Tenant tenant(db, reinterpret_cast(tenantName.c_str()), tenantName.size()); fdb::Transaction tr2(tenant); @@ -2505,6 +2538,152 @@ TEST_CASE("Tenant create, access, and delete") { } } +int64_t granule_start_load_fail(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* userContext) { + CHECK(false); + return -1; +} + +uint8_t* granule_get_load_fail(int64_t loadId, void* userContext) { + CHECK(false); + return nullptr; +} + +void granule_free_load_fail(int64_t loadId, void* userContext) { + CHECK(false); +} + +TEST_CASE("Blob Granule Functions") { + auto confValue = + get_value("\xff/conf/blob_granules_enabled", /* snapshot */ false, { FDB_TR_OPTION_READ_SYSTEM_KEYS }); + if (!confValue.has_value() || confValue.value() != "1") { + return; + } + + // write some data + + insert_data(db, create_data({ { "bg1", "a" }, { "bg2", "b" }, { "bg3", "c" } })); + + // because wiring up files is non-trivial, just test the calls complete with the expected no_materialize error + FDBReadBlobGranuleContext granuleContext; + granuleContext.userContext = nullptr; + granuleContext.start_load_f = &granule_start_load_fail; + granuleContext.get_load_f = &granule_get_load_fail; + granuleContext.free_load_f = &granule_free_load_fail; + granuleContext.debugNoMaterialize = true; + granuleContext.granuleParallelism = 1; + + // dummy values + FDBKeyValue const* out_kv; + int out_count; + int out_more; + + fdb::Transaction tr(db); + int64_t originalReadVersion = -1; + + // test no materialize gets error but completes, save read version + while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); + // -2 is latest version + fdb::KeyValueArrayResult r = tr.read_blob_granules(key("bg"), key("bh"), 0, -2, granuleContext); + fdb_error_t err = r.get(&out_kv, &out_count, &out_more); + if (err && err != 2037 /* blob_granule_not_materialized */) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + CHECK(err == 2037 /* blob_granule_not_materialized */); + + // If read done, save read version. 
Should have already used read version so this shouldn't error + fdb::Int64Future grvFuture = tr.get_read_version(); + fdb_error_t grvErr = wait_future(grvFuture); + CHECK(!grvErr); + CHECK(!grvFuture.get(&originalReadVersion)); + + CHECK(originalReadVersion > 0); + + tr.reset(); + break; + } + + // test with begin version > 0 + while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); + // -2 is latest version, read version should be >= originalReadVersion + fdb::KeyValueArrayResult r = + tr.read_blob_granules(key("bg"), key("bh"), originalReadVersion, -2, granuleContext); + fdb_error_t err = r.get(&out_kv, &out_count, &out_more); + ; + if (err && err != 2037 /* blob_granule_not_materialized */) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + CHECK(err == 2037 /* blob_granule_not_materialized */); + + tr.reset(); + break; + } + + // test with prior read version completes after delay larger than normal MVC window + // TODO: should we not do this? + std::this_thread::sleep_for(std::chrono::milliseconds(6000)); + while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); + fdb::KeyValueArrayResult r = + tr.read_blob_granules(key("bg"), key("bh"), 0, originalReadVersion, granuleContext); + fdb_error_t err = r.get(&out_kv, &out_count, &out_more); + if (err && err != 2037 /* blob_granule_not_materialized */) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + CHECK(err == 2037 /* blob_granule_not_materialized */); + + tr.reset(); + break; + } + + // test ranges + + while (1) { + fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh")); + fdb_error_t err = wait_future(f); + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + const FDBKeyRange* out_kr; + int out_count; + fdb_check(f.get(&out_kr, &out_count)); + + CHECK(out_count >= 1); + // check key ranges are in order + for (int i = 0; i < out_count; i++) { + // key range start < end + CHECK(std::string((const char*)out_kr[i].begin_key, out_kr[i].begin_key_length) < + std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length)); + } + // Ranges themselves are sorted + for (int i = 0; i < out_count - 1; i++) { + CHECK(std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length) <= + std::string((const char*)out_kr[i + 1].begin_key, out_kr[i + 1].begin_key_length)); + } + + tr.reset(); + break; + } +} + int main(int argc, char** argv) { if (argc < 3) { std::cout << "Unit tests for the FoundationDB C API.\n" diff --git a/bindings/python/fdb/__init__.py b/bindings/python/fdb/__init__.py index 0054e728082..413c81249ad 100644 --- a/bindings/python/fdb/__init__.py +++ b/bindings/python/fdb/__init__.py @@ -88,6 +88,7 @@ def api_version(ver): 'predicates', 'Future', 'Database', + 'Tenant', 'Transaction', 'KeyValue', 'KeySelector', diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index 837d3937c63..023e85ae95e 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -34,6 +34,7 @@ import fdb from fdb import six +from fdb.tuple import pack, unpack _network_thread = None _network_thread_reentrant_lock = threading.RLock() @@ -198,9 +199,10 @@ def transactional(*tr_args, **tr_kwargs): one of two actions, depending on the type of the parameter passed to the function at call time. 
- If given a Database, a Transaction will be created and passed into - the wrapped code in place of the Database. After the function is - complete, the newly created transaction will be committed. + If given a Database or Tenant, a Transaction will be created and + passed into the wrapped code in place of the Database or Tenant. + After the function is complete, the newly created transaction + will be committed. It is important to note that the wrapped method may be called multiple times in the event of a commit failure, until the commit @@ -943,128 +945,114 @@ def on_ready(self, callback): except: pass - -class Database(_FDBBase): - def __init__(self, dpointer): - self.dpointer = dpointer - self.options = _DatabaseOptions(self) - - def __del__(self): - # print('Destroying database 0x%x' % self.dpointer) - self.capi.fdb_database_destroy(self.dpointer) - +class _TransactionCreator(_FDBBase): def get(self, key): - return Database.__database_getitem(self, key) + return _TransactionCreator.__creator_getitem(self, key) def __getitem__(self, key): if isinstance(key, slice): return self.get_range(key.start, key.stop, reverse=(key.step == -1)) - return Database.__database_getitem(self, key) + return _TransactionCreator.__creator_getitem(self, key) def get_key(self, key_selector): - return Database.__database_get_key(self, key_selector) + return _TransactionCreator.__creator_get_key(self, key_selector) def get_range(self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.want_all): - return Database.__database_get_range(self, begin, end, limit, reverse, streaming_mode) + return _TransactionCreator.__creator_get_range(self, begin, end, limit, reverse, streaming_mode) def get_range_startswith(self, prefix, *args, **kwargs): - return Database.__database_get_range_startswith(self, prefix, *args, **kwargs) + return _TransactionCreator.__creator_get_range_startswith(self, prefix, *args, **kwargs) def set(self, key, value): - Database.__database_setitem(self, key, value) + _TransactionCreator.__creator_setitem(self, key, value) def __setitem__(self, key, value): - Database.__database_setitem(self, key, value) + _TransactionCreator.__creator_setitem(self, key, value) def clear(self, key): - Database.__database_delitem(self, key) + _TransactionCreator.__creator_delitem(self, key) def clear_range(self, begin, end): - Database.__database_delitem(self, slice(begin, end)) + _TransactionCreator.__creator_delitem(self, slice(begin, end)) def __delitem__(self, key_or_slice): - Database.__database_delitem(self, key_or_slice) + _TransactionCreator.__creator_delitem(self, key_or_slice) def clear_range_startswith(self, prefix): - Database.__database_clear_range_startswith(self, prefix) + _TransactionCreator.__creator_clear_range_startswith(self, prefix) def get_and_watch(self, key): - return Database.__database_get_and_watch(self, key) + return _TransactionCreator.__creator_get_and_watch(self, key) def set_and_watch(self, key, value): - return Database.__database_set_and_watch(self, key, value) + return _TransactionCreator.__creator_set_and_watch(self, key, value) def clear_and_watch(self, key): - return Database.__database_clear_and_watch(self, key) + return _TransactionCreator.__creator_clear_and_watch(self, key) def create_transaction(self): - pointer = ctypes.c_void_p() - self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer)) - return Transaction(pointer.value, self) - - def _set_option(self, option, param, length): - self.capi.fdb_database_set_option(self.dpointer, option, 
param, length) + pass def _atomic_operation(self, opcode, key, param): - Database.__database_atomic_operation(self, opcode, key, param) + _TransactionCreator.__creator_atomic_operation(self, opcode, key, param) #### Transaction implementations #### @staticmethod @transactional - def __database_getitem(tr, key): + def __creator_getitem(tr, key): return tr[key].value @staticmethod @transactional - def __database_get_key(tr, key_selector): + def __creator_get_key(tr, key_selector): return tr.get_key(key_selector).value @staticmethod @transactional - def __database_get_range(tr, begin, end, limit, reverse, streaming_mode): + def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode): return tr.get_range(begin, end, limit, reverse, streaming_mode).to_list() @staticmethod @transactional - def __database_get_range_startswith(tr, prefix, *args, **kwargs): + def __creator_get_range_startswith(tr, prefix, *args, **kwargs): return tr.get_range_startswith(prefix, *args, **kwargs).to_list() @staticmethod @transactional - def __database_setitem(tr, key, value): + def __creator_setitem(tr, key, value): tr[key] = value @staticmethod @transactional - def __database_clear_range_startswith(tr, prefix): + def __creator_clear_range_startswith(tr, prefix): tr.clear_range_startswith(prefix) @staticmethod @transactional - def __database_get_and_watch(tr, key): + def __creator_get_and_watch(tr, key): v = tr.get(key) return v, tr.watch(key) @staticmethod @transactional - def __database_set_and_watch(tr, key, value): + def __creator_set_and_watch(tr, key, value): tr.set(key, value) return tr.watch(key) @staticmethod @transactional - def __database_clear_and_watch(tr, key): + def __creator_clear_and_watch(tr, key): del tr[key] return tr.watch(key) @staticmethod @transactional - def __database_delitem(tr, key_or_slice): + def __creator_delitem(tr, key_or_slice): del tr[key_or_slice] @staticmethod @transactional - def __database_atomic_operation(tr, opcode, key, param): + def __creator_atomic_operation(tr, opcode, key, param): tr._atomic_operation(opcode, key, param) # Asynchronous transactions @@ -1074,11 +1062,11 @@ def declare_asynchronous_transactions(): From = asyncio.From coroutine = asyncio.coroutine - class Database: + class TransactionCreator: @staticmethod @transactional @coroutine - def __database_getitem(tr, key): + def __creator_getitem(tr, key): # raise Return(( yield From( tr[key] ) )) raise Return(tr[key]) yield None @@ -1086,26 +1074,26 @@ def __database_getitem(tr, key): @staticmethod @transactional @coroutine - def __database_get_key(tr, key_selector): + def __creator_get_key(tr, key_selector): raise Return(tr.get_key(key_selector)) yield None @staticmethod @transactional @coroutine - def __database_get_range(tr, begin, end, limit, reverse, streaming_mode): + def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode): raise Return((yield From(tr.get_range(begin, end, limit, reverse, streaming_mode).to_list()))) @staticmethod @transactional @coroutine - def __database_get_range_startswith(tr, prefix, *args, **kwargs): + def __creator_get_range_startswith(tr, prefix, *args, **kwargs): raise Return((yield From(tr.get_range_startswith(prefix, *args, **kwargs).to_list()))) @staticmethod @transactional @coroutine - def __database_setitem(tr, key, value): + def __creator_setitem(tr, key, value): tr[key] = value raise Return() yield None @@ -1113,7 +1101,7 @@ def __database_setitem(tr, key, value): @staticmethod @transactional @coroutine - def __database_clear_range_startswith(tr, 
prefix): + def __creator_clear_range_startswith(tr, prefix): tr.clear_range_startswith(prefix) raise Return() yield None @@ -1121,7 +1109,7 @@ def __database_clear_range_startswith(tr, prefix): @staticmethod @transactional @coroutine - def __database_get_and_watch(tr, key): + def __creator_get_and_watch(tr, key): v = tr.get(key) raise Return(v, tr.watch(key)) yield None @@ -1129,7 +1117,7 @@ def __database_get_and_watch(tr, key): @staticmethod @transactional @coroutine - def __database_set_and_watch(tr, key, value): + def __creator_set_and_watch(tr, key, value): tr.set(key, value) raise Return(tr.watch(key)) yield None @@ -1137,7 +1125,7 @@ def __database_set_and_watch(tr, key, value): @staticmethod @transactional @coroutine - def __database_clear_and_watch(tr, key): + def __creator_clear_and_watch(tr, key): del tr[key] raise Return(tr.watch(key)) yield None @@ -1145,7 +1133,7 @@ def __database_clear_and_watch(tr, key): @staticmethod @transactional @coroutine - def __database_delitem(tr, key_or_slice): + def __creator_delitem(tr, key_or_slice): del tr[key_or_slice] raise Return() yield None @@ -1153,11 +1141,101 @@ def __database_delitem(tr, key_or_slice): @staticmethod @transactional @coroutine - def __database_atomic_operation(tr, opcode, key, param): + def __creator_atomic_operation(tr, opcode, key, param): tr._atomic_operation(opcode, key, param) raise Return() yield None - return Database + return TransactionCreator + +def process_tenant_name(name): + if isinstance(name, tuple): + return pack(name) + elif isinstance(name, bytes): + return name + else: + raise TypeError('Tenant name must be of type ' + bytes.__name__ + ' or of type ' + tuple.__name__) + +class Database(_TransactionCreator): + def __init__(self, dpointer): + self.dpointer = dpointer + self.options = _DatabaseOptions(self) + + def __del__(self): + # print('Destroying database 0x%x' % self.dpointer) + self.capi.fdb_database_destroy(self.dpointer) + + def _set_option(self, option, param, length): + self.capi.fdb_database_set_option(self.dpointer, option, param, length) + + def open_tenant(self, name): + tname = process_tenant_name(name) + pointer = ctypes.c_void_p() + self.capi.fdb_database_open_tenant(self.dpointer, tname, len(tname), ctypes.byref(pointer)) + return Tenant(pointer.value) + + def create_transaction(self): + pointer = ctypes.c_void_p() + self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer)) + return Transaction(pointer.value, self) + + def allocate_tenant(self, name): + Database.__database_allocate_tenant(self, process_tenant_name(name), []) + + def delete_tenant(self, name): + Database.__database_delete_tenant(self, process_tenant_name(name), []) + + # Attempt to allocate a tenant in the cluster. If the tenant already exists, + # this function will return a tenant_already_exists error. If the tenant is created + # concurrently, then this function may return success even if another caller creates + # it. + # + # The existence_check_marker is expected to be an empty list. This function will + # modify the list after completing the existence check to avoid checking for existence + # on retries. This allows the operation to be idempotent. 
+ @staticmethod + @transactional + def __database_allocate_tenant(tr, name, existence_check_marker): + tr.options.set_special_key_space_enable_writes() + key = b'\xff\xff/management/tenant_map/%s' % name + if not existence_check_marker: + existing_tenant = tr[key].wait() + existence_check_marker.append(None) + if existing_tenant != None: + raise fdb.FDBError(2132) # tenant_already_exists + tr[key] = b'' + + # Attempt to remove a tenant in the cluster. If the tenant doesn't exist, this + # function will return a tenant_not_found error. If the tenant is deleted + # concurrently, then this function may return success even if another caller deletes + # it. + # + # The existence_check_marker is expected to be an empty list. This function will + # modify the list after completing the existence check to avoid checking for existence + # on retries. This allows the operation to be idempotent. + @staticmethod + @transactional + def __database_delete_tenant(tr, name, existence_check_marker): + tr.options.set_special_key_space_enable_writes() + key = b'\xff\xff/management/tenant_map/%s' % name + if not existence_check_marker: + existing_tenant = tr[key].wait() + existence_check_marker.append(None) + if existing_tenant == None: + raise fdb.FDBError(2131) # tenant_not_found + del tr[key] + + +class Tenant(_TransactionCreator): + def __init__(self, tpointer): + self.tpointer = tpointer + + def __del__(self): + self.capi.fdb_tenant_destroy(self.tpointer) + + def create_transaction(self): + pointer = ctypes.c_void_p() + self.capi.fdb_tenant_create_transaction(self.tpointer, ctypes.byref(pointer)) + return Transaction(pointer.value, self) fill_operations() @@ -1458,6 +1536,10 @@ def init_c_api(): _capi.fdb_database_destroy.argtypes = [ctypes.c_void_p] _capi.fdb_database_destroy.restype = None + _capi.fdb_database_open_tenant.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p)] + _capi.fdb_database_open_tenant.restype = ctypes.c_int + _capi.fdb_database_open_tenant.errcheck = check_error_code + _capi.fdb_database_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)] _capi.fdb_database_create_transaction.restype = ctypes.c_int _capi.fdb_database_create_transaction.errcheck = check_error_code @@ -1466,6 +1548,13 @@ def init_c_api(): _capi.fdb_database_set_option.restype = ctypes.c_int _capi.fdb_database_set_option.errcheck = check_error_code + _capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p] + _capi.fdb_tenant_destroy.restype = None + + _capi.fdb_tenant_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)] + _capi.fdb_tenant_create_transaction.restype = ctypes.c_int + _capi.fdb_tenant_create_transaction.errcheck = check_error_code + _capi.fdb_transaction_destroy.argtypes = [ctypes.c_void_p] _capi.fdb_transaction_destroy.restype = None @@ -1686,10 +1775,10 @@ def it(): raise asyncio.Return(self) return it() FDBRange.iterate = iterate - AT = Database.declare_asynchronous_transactions() + AT = _TransactionCreator.declare_asynchronous_transactions() for name in dir(AT): - if name.startswith("_Database__database_"): - setattr(Database, name, getattr(AT, name)) + if name.startswith("__TransactionCreator__creator_"): + setattr(_TransactionCreator, name, getattr(AT, name)) def to_list(self): if self._mode == StreamingMode.iterator: diff --git a/bindings/python/tests/tenant_tests.py b/bindings/python/tests/tenant_tests.py new file mode 100755 index 00000000000..9f35620b6aa --- /dev/null +++ 
b/bindings/python/tests/tenant_tests.py @@ -0,0 +1,123 @@ +#!/usr/bin/python +# +# tenant_tests.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import fdb +import sys +import json +from fdb.tuple import pack + +if __name__ == '__main__': + fdb.api_version(710) + +def test_tenant_tuple_name(db): + tuplename=(b'test', b'level', b'hierarchy', 3, 1.24, 'str') + db.allocate_tenant(tuplename) + + tenant=db.open_tenant(tuplename) + tenant[b'foo'] = b'bar' + + assert tenant[b'foo'] == b'bar' + + del tenant[b'foo'] + db.delete_tenant(tuplename) + +def cleanup_tenant(db, tenant_name): + try: + tenant = db.open_tenant(tenant_name) + del tenant[:] + db.delete_tenant(tenant_name) + except fdb.FDBError as e: + if e.code == 2131: # tenant not found + pass + else: + raise + +def test_tenant_operations(db): + cleanup_tenant(db, b'tenant1') + cleanup_tenant(db, b'tenant2') + + db.allocate_tenant(b'tenant1') + db.allocate_tenant(b'tenant2') + + tenant1 = db.open_tenant(b'tenant1') + tenant2 = db.open_tenant(b'tenant2') + + db[b'tenant_test_key'] = b'no_tenant' + tenant1[b'tenant_test_key'] = b'tenant1' + tenant2[b'tenant_test_key'] = b'tenant2' + + tenant1_entry = db[b'\xff\xff/management/tenant_map/tenant1'] + tenant1_json = json.loads(tenant1_entry) + prefix1 = tenant1_json['prefix'].encode('utf8') + + tenant2_entry = db[b'\xff\xff/management/tenant_map/tenant2'] + tenant2_json = json.loads(tenant2_entry) + prefix2 = tenant2_json['prefix'].encode('utf8') + + assert tenant1[b'tenant_test_key'] == b'tenant1' + assert db[prefix1 + b'tenant_test_key'] == b'tenant1' + assert tenant2[b'tenant_test_key'] == b'tenant2' + assert db[prefix2 + b'tenant_test_key'] == b'tenant2' + assert db[b'tenant_test_key'] == b'no_tenant' + + tr1 = tenant1.create_transaction() + try: + del tr1[:] + tr1.commit().wait() + except fdb.FDBError as e: + tr.on_error(e).wait() + + assert tenant1[b'tenant_test_key'] == None + assert db[prefix1 + b'tenant_test_key'] == None + assert tenant2[b'tenant_test_key'] == b'tenant2' + assert db[prefix2 + b'tenant_test_key'] == b'tenant2' + assert db[b'tenant_test_key'] == b'no_tenant' + + db.delete_tenant(b'tenant1') + try: + tenant1[b'tenant_test_key'] + assert False + except fdb.FDBError as e: + assert e.code == 2131 # tenant not found + + del tenant2[:] + db.delete_tenant(b'tenant2') + + assert db[prefix1 + b'tenant_test_key'] == None + assert db[prefix2 + b'tenant_test_key'] == None + assert db[b'tenant_test_key'] == b'no_tenant' + + del db[b'tenant_test_key'] + + assert db[b'tenant_test_key'] == None + +def test_tenants(db): + test_tenant_tuple_name(db) + test_tenant_operations(db) + +# Expect a cluster file as input. This test will write to the FDB cluster, so +# be aware of potential side effects. 
+if __name__ == '__main__': + clusterFile = sys.argv[1] + db = fdb.open(clusterFile) + db.options.set_transaction_timeout(2000) # 2 seconds + db.options.set_transaction_retry_limit(3) + + test_tenants(db) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 6aa41dea4ae..7f8d7942078 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -49,6 +49,7 @@ from cancellation_timeout_tests import test_combinations from size_limit_tests import test_size_limit_option, test_get_approximate_size +from tenant_tests import test_tenants random.seed(0) @@ -112,12 +113,13 @@ def pop(self, count=None, with_idx=False): class Instruction: - def __init__(self, tr, stack, op, index, isDatabase=False, isSnapshot=False): + def __init__(self, tr, stack, op, index, isDatabase=False, isTenant=False, isSnapshot=False): self.tr = tr self.stack = stack self.op = op self.index = index self.isDatabase = isDatabase + self.isTenant = isTenant self.isSnapshot = isSnapshot def pop(self, count=None, with_idx=False): @@ -277,6 +279,7 @@ class Tester: def __init__(self, db, prefix): self.db = db + self.tenant = None self.instructions = self.db[fdb.tuple.range((prefix,))] @@ -317,7 +320,8 @@ def current_transaction(self): def new_transaction(self): with Tester.tr_map_lock: - Tester.tr_map[self.tr_name] = self.db.create_transaction() + tr_source = self.tenant if self.tenant is not None else self.db + Tester.tr_map[self.tr_name] = tr_source.create_transaction() def switch_transaction(self, name): self.tr_name = name @@ -335,18 +339,22 @@ def run(self): # print("%d. Instruction is %s" % (idx, op)) isDatabase = op.endswith(six.u('_DATABASE')) + isTenant = op.endswith(six.u('_TENANT')) isSnapshot = op.endswith(six.u('_SNAPSHOT')) if isDatabase: op = op[:-9] obj = self.db + elif isTenant: + op = op[:-7] + obj = self.tenant if self.tenant else self.db elif isSnapshot: op = op[:-9] obj = self.current_transaction().snapshot else: obj = self.current_transaction() - inst = Instruction(obj, self.stack, op, idx, isDatabase, isSnapshot) + inst = Instruction(obj, self.stack, op, idx, isDatabase, isTenant, isSnapshot) try: if inst.op == six.u("PUSH"): @@ -583,6 +591,19 @@ def run(self): prefix = inst.pop() Tester.wait_empty(self.db, prefix) inst.push(b"WAITED_FOR_EMPTY") + elif inst.op == six.u("TENANT_CREATE"): + name = inst.pop() + self.db.allocate_tenant(name) + inst.push(b"RESULT_NOT_PRESENT") + elif inst.op == six.u("TENANT_DELETE"): + name = inst.pop() + self.db.delete_tenant(name) + inst.push(b"RESULT_NOT_PRESENT") + elif inst.op == six.u("TENANT_SET_ACTIVE"): + name = inst.pop() + self.tenant = self.db.open_tenant(name) + elif inst.op == six.u("TENANT_CLEAR_ACTIVE"): + self.tenant = None elif inst.op == six.u("UNIT_TESTS"): try: test_db_options(db) @@ -600,6 +621,8 @@ def run(self): test_size_limit_option(db) test_get_approximate_size(db) + test_tenants(db) + except fdb.FDBError as e: print("Unit tests failed: %s" % e.description) traceback.print_exc() diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake index a31d3866886..60fbe44a0d7 100644 --- a/cmake/FDBComponents.cmake +++ b/cmake/FDBComponents.cmake @@ -212,6 +212,17 @@ endif() set(COROUTINE_IMPL ${DEFAULT_COROUTINE_IMPL} CACHE STRING "Which coroutine implementation to use. 
Options are boost and libcoro") +################################################################################ +# AWS SDK +################################################################################ + +set(BUILD_AWS_BACKUP OFF CACHE BOOL "Build AWS S3 SDK backup client") +if (BUILD_AWS_BACKUP) + set(WITH_AWS_BACKUP ON) +else() + set(WITH_AWS_BACKUP OFF) +endif() + ################################################################################ file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/packages) @@ -232,6 +243,7 @@ function(print_components) message(STATUS "Build Python sdist (make package): ${WITH_PYTHON_BINDING}") message(STATUS "Configure CTest (depends on Python): ${WITH_PYTHON}") message(STATUS "Build with RocksDB: ${WITH_ROCKSDB_EXPERIMENTAL}") + message(STATUS "Build with AWS SDK: ${WITH_AWS_BACKUP}") message(STATUS "=========================================") endfunction() diff --git a/cmake/awssdk.cmake b/cmake/awssdk.cmake new file mode 100644 index 00000000000..c3b9e95a4e3 --- /dev/null +++ b/cmake/awssdk.cmake @@ -0,0 +1,98 @@ +project(awssdk-download NONE) + +# Compile the sdk with clang and libc++, since otherwise we get libc++ vs libstdc++ link errors when compiling fdb with clang +set(AWSSDK_COMPILER_FLAGS "") +set(AWSSDK_LINK_FLAGS "") +if(APPLE OR CLANG OR USE_LIBCXX) + set(AWSSDK_COMPILER_FLAGS -stdlib=libc++ -nostdlib++) + set(AWSSDK_LINK_FLAGS -stdlib=libc++ -lc++abi) +endif() + +include(ExternalProject) +ExternalProject_Add(awssdk_project + GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git + GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200 + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build" + GIT_CONFIG advice.detachedHead=false + CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs + -DENABLE_TESTING=OFF + -DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries + -DSIMPLE_INSTALL=ON + -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path + -DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own + + + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_EXE_LINKER_FLAGS=${AWSSDK_COMPILER_FLAGS} + -DCMAKE_CXX_FLAGS=${AWSSDK_LINK_FLAGS} + TEST_COMMAND "" + BUILD_ALWAYS TRUE + # the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in + BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a" +) + +add_library(awssdk_core STATIC IMPORTED) 
+add_dependencies(awssdk_core awssdk_project) +set_target_properties(awssdk_core PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a") + +add_library(awssdk_crt STATIC IMPORTED) +add_dependencies(awssdk_crt awssdk_project) +set_target_properties(awssdk_crt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a") + +# TODO: can we remove c_s3? It seems to be a dependency of libaws-crt +add_library(awssdk_c_s3 STATIC IMPORTED) +add_dependencies(awssdk_c_s3 awssdk_project) +set_target_properties(awssdk_c_s3 PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a") + +add_library(awssdk_c_auth STATIC IMPORTED) +add_dependencies(awssdk_c_auth awssdk_project) +set_target_properties(awssdk_c_auth PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a") + +add_library(awssdk_c_eventstream STATIC IMPORTED) +add_dependencies(awssdk_c_eventstream awssdk_project) +set_target_properties(awssdk_c_eventstream PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a") + +add_library(awssdk_c_http STATIC IMPORTED) +add_dependencies(awssdk_c_http awssdk_project) +set_target_properties(awssdk_c_http PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a") + +add_library(awssdk_c_mqtt STATIC IMPORTED) +add_dependencies(awssdk_c_mqtt awssdk_project) +set_target_properties(awssdk_c_mqtt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a") + +add_library(awssdk_c_io STATIC IMPORTED) +add_dependencies(awssdk_c_io awssdk_project) +set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a") + +add_library(awssdk_checksums STATIC IMPORTED) +add_dependencies(awssdk_checksums awssdk_project) +set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a") + +add_library(awssdk_c_compression STATIC IMPORTED) +add_dependencies(awssdk_c_compression awssdk_project) +set_target_properties(awssdk_c_compression PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a") + +add_library(awssdk_c_cal STATIC IMPORTED) +add_dependencies(awssdk_c_cal awssdk_project) +set_target_properties(awssdk_c_cal PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a") + +add_library(awssdk_c_common STATIC IMPORTED) +add_dependencies(awssdk_c_common awssdk_project) +set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a") + +# link them all together in one interface target +add_library(awssdk_target INTERFACE) +target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include) +target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl) \ No newline at end of file diff --git a/contrib/Joshua/scripts/localClusterStart.sh b/contrib/Joshua/scripts/localClusterStart.sh index 7cbca6e41b2..abbf93abc5d 100644 --- a/contrib/Joshua/scripts/localClusterStart.sh +++ 
b/contrib/Joshua/scripts/localClusterStart.sh @@ -346,7 +346,7 @@ function createDatabase # Configure the database. else - "${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory; status' --timeout "${CONFIGUREWAIT}" --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log" + "${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory tenant_mode=optional_experimental; status' --timeout "${CONFIGUREWAIT}" --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log" if ! displayMessage "Checking if config succeeded" then diff --git a/design/encryption-data-at-rest.md b/design/encryption-data-at-rest.md new file mode 100644 index 00000000000..135a5e51054 --- /dev/null +++ b/design/encryption-data-at-rest.md @@ -0,0 +1,237 @@ +# FDB Encryption **data at-rest** + +## Threat Model + +The proposed solution is `able to handle` the following attacks: + +* An attacker, if able to get access to any FDB cluster host or attached disk, would not be able to read the persisted data. Further, for cloud deployments, returning a cloud instance back to the cloud provider will prevent the cloud provider from reading the contents of data stored on the disk. + +* Data stored on a lost or stolen FDB host persistent disk storage device can’t be recovered. + +The proposed solution `will not be able` to handle the following attacks: + +* Encryption is enabled for data at-rest only; generating a memory dump of FDB processes could enable an attacker to read in-memory data contents. +* An FDB cluster host, if compromised, would allow an attacker to read/write data managed by the FDB cluster. + +## Goals + +FoundationDB, being multi-model, easily scalable, and fault-tolerant, with the ability to provide great performance even on commodity hardware, plays a critical role in enabling enterprises to deploy, manage, and run mission-critical applications. + +Data encryption support is a table-stakes feature for modern-day enterprise service offerings in the cloud. Customers expect, and at times warrant, that their data and metadata be fully encrypted using the latest security standards. The goals of this document include: + +* Discuss the detailed design for data at-rest encryption of data stored in FDB clusters. Encrypting data in-transit and/or in-memory caches at various layers in the query execution pipeline (inside and external to FDB) is out of the scope of this feature. + +* Isolation guarantees: the encryption domain matches the `tenant` partition semantics supported by FDB clusters. Tenants are discrete namespaces in FDB that serve as transaction domains. A tenant is an `identifier` that maps to a `prefix` within the data-FDB cluster, and all operations within a tenant are implicitly bound within a `tenant-prefix`. Refer to the `Multi-Tenant FoundationDB API` documentation for more details. However, it is possible to use a single encryption key for the whole cluster, in case `tenant partitioning` isn’t available. + +* Ease of integration with external Key Management Services enabling persisting, caching, and lookup of encryption keys. + +## Config Knobs + +* `ServerKnob::ENABLE_ENCRYPTION` enables or disables the encryption feature. +* `ServerKnob::ENCRYPTION_MODE` controls the encryption mode supported. The current scheme supports `AES-256-CTR` encryption mode. + +## Encryption Mode + +The proposal is to use the strong AES-256-CTR encryption mode.
Salient properties are: + +* HMAC_SHA256 key hashing technique is used to derive encryption keys using a base encryption key and locally generated random number. The formula used is as follows: + +``` + DEK = HMAC SHA256(BEK || UID) + +Where +DEK = Derived Encryption Key +BEK = Base Encryption key +UID = Host local random generated number +``` + +UID is an 8 byte host-local random number. Another option would have been a simple host-local incrementing counter, however, the scheme runs the risk of repeated encryption-key generation on cluster/process restarts. + +* An encryption key derived using the above formula will be cached (in-memory) for a short time interval (10 mins, for instance). The encryption-key is immutable, but, the TTL approach allows refreshing encryption key by reaching out to External Encryption KeyManagement solutions, hence, supporting “restricting lifetime of an encryption” feature if implemented by Encryption Key Management solution. + +* Initialization Vector (IV) selection would be random. + +## Architecture + +The encryption responsibilities are split across multiple modules to ensure data and metadata stored in the cluster is never persisted in plain text on any durable storages (temporary and/or long-term durable storage). + +## Encryption Request Workflow + +### **Write Request** + +* An FDB client initiates a write transaction providing {key, value} in plaintext format. +* An FDB cluster host as part of processing a write transaction would do the following: + 1. Obtain required encryption key based on the transaction request tenant information. + 2. Encrypt mutations before persisting them on Transaction Logs (TLogs). As a background process, the mutations are moved to a long-term durable storage by the Storage Server processes. + +Refer to the sections below for more details. + +### **Read Request** + +* An FDB client initiates a read transaction request. +* An FDB cluster host as part of processing request would do the following: + 1. StorageServer would read desired data blocks from the persistent storage. + 2. Regenerate the encryption key required to decrypt the data. + 3. Decrypt data and pass results as plaintext to the caller. + + +Below diagram depicts the end-to-end encryption workflow detailing various modules involved and their interactions. The following section discusses detailed design for involved components. + +``` + _______________________________________________________ + | FDB CLUSER HOST | + | | + _____________________ | ________________________ _________________ | + | | (proprietary) | | | | | + | |<---------- |--| KMS CONNECTOR | | COMMIT PROXIES | | + | ENCRYPTION KEY | | | | | | | + | MANAGEMENT SOLUTION | | |(non FDB - proprietary) | | | | + | | | |________________________| |_________________| | + | | | ^ | | + |_____________________| | | (REST API) | (Encrypt | + | | V Mutation) | + | _________________________________________ | __________________ + | | | | | | + | | ENCRYPT KEYPROXY SERVER |<------|-----------| | + | |_________________________________________| | | | + | | | | BACKUP FILES | + | | (Encrypt Node) | | | + | V | | | + | _________________________________________ | | (Encrypt file) | + | | |<------|-----------| | + | | REDWOOD STORAGE SERVER | | |__________________| + | |_________________________________________| | + |_______________________________________________________| +``` + +## FDB Encryption + +An FDB client would insert data i.e. plaintext {key, value} in a FDB cluster for persistence. 
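Before drilling into the individual components, here is a minimal sketch of the key-derivation formula above. It uses only the Python standard library and assumes one plausible reading of `HMAC_SHA256(BEK || UID)`, namely that the base encryption key is used as the HMAC key and the 8-byte host-local random UID as the message; it is a sketch, not the actual FDB implementation.

```python
# Illustrative sketch of DEK = HMAC_SHA256(BEK || UID) using the Python standard library.
import hashlib
import hmac
import os

def derive_encryption_key(base_encryption_key: bytes, uid: bytes) -> bytes:
    """Derive a 256-bit DEK from the base encryption key (BEK) and a host-local UID."""
    return hmac.new(base_encryption_key, uid, hashlib.sha256).digest()

bek = os.urandom(32)   # base encryption key, in practice fetched via the KMS-Connector
uid = os.urandom(8)    # 8-byte host-local random number
dek = derive_encryption_key(bek, uid)
assert len(dek) == 32  # usable as an AES-256 key
```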
+ +### KMS-Connector + +A non-FDB process running on FDB cluster hosts enables an FDB cluster to interact with external Encryption Key Management services. Salient features include: + +* An external (non-FDB) standalone process implementing a REST server. + +* Abstracts organization-specific KeyManagementService integration details. The proposed design ensures ease of integration given the limited infrastructure needed to implement a local/remote REST server. + +* Ensures organization-specific code is implemented outside the FDB codebase. + +* The KMS-Connector process is launched and maintained by the FDBMonitor. The process needs to handle the following REST endpoint: + 1. GET - http://localhost/getEncryptionKey + + Define a single interface returning the “encryption key string in plaintext” and accepting a + JSON input which can be customized as needed: + +```json + json_input_payload + { + “Version” : int // version + “KeyId” : keyId // string + } +``` + +A few benefits of the proposed scheme are: +* The JSON input format is extensible (adding new fields is backward compatible). + +* Popular Cloud KMS “getPublicKey” APIs accept “keyId” as a string, hence the API should be easy to integrate with. + + 1. AWS: https://docs.aws.amazon.com/cli/latest/reference/kms/get-public-key.html + 2. GCP: https://cloud.google.com/kms/docs/retrieve-public-key + +`Future improvements`: FDBMonitor at present will launch one KMS-Connector process per FDB cluster host. Though multiple KMS-Connector processes are launched, only one process (co-located with the EncryptKeyServer) would consume cluster resources. In the future, possible enhancements could be: + +* Enable FDBMonitor to launch “N” (configurable) processes per cluster. +* Enable the FDB cluster to manage external processes as well. + +### Encrypt KeyServer + +Salient features include: + +* A new FDB role/process that allows fetching encryption keys from external KeyManagementService interfaces. The process connects to the KMS-Connector REST interface to fetch the desired encryption keys. + +* On an encryption-key fetch from the KMS-Connector, it applies the HMAC derivation function to generate a new encryption key and caches it in-memory. The in-memory cache is used to serve encryption-key fetch requests from other FDB processes. + + +Given that encryption keys will be needed as part of cluster-recovery, this process/role needs to be recruited at the start of the cluster-recovery process (just after the “master/sequencer” process/role recruitment). All other FDB processes will interact with this process to obtain the encryption keys needed to encrypt and/or decrypt the data payload. + +`Note`: An alternative would be to incorporate the functionality into the ClusterController process itself; however, having a clear separation of responsibilities makes the design more flexible and extensible in the future if needed. + +### Commit Proxies (CPs) + +When an FDB client initiates a write transaction to insert/update data stored in an FDB cluster, the transaction is received by a CP, which then resolves the transaction by checking if the transaction is allowed. If allowed, it commits the transaction to TLogs. The proposal is to extend CP responsibilities by encrypting mutations using the desired encryption key before mutations get persisted into TLogs (durable storage).
The encryption key derivation is achieved using the following formula: + +``` + DEK = HMAC SHA256(BEK || UID) + +Where: + +DEK = Derived Encryption Key +BEK = Base Encryption Key +UID = Host local random generated number +``` + +The Transaction State Store (commonly referred to as the TxnStateStore) is a Key-Value datastore used by FDB to store metadata about the database itself for bootstrap purposes. The data stored in this store plays a critical role in guiding the transaction system to persist writes (assigning storage tags to mutations at CPs) and in managing FDB internal data movement. The TxnStateStore data gets encrypted with the desired encryption key before getting persisted on the disk queues. + +As part of encryption, every Mutation would have a plaintext `BlobCipherEncryptHeader` appended to assist in decrypting the information for reads. + +CPs would cache (in-memory) recently used encryption-keys to optimize the network traffic due to encryption-related operations. Further, the caching would improve overall performance, avoiding frequent RPC calls to the EncryptKeyServer, which may eventually become a scalability bottleneck. Each encryption-key in the cache has a short Time-To-Live (10 mins), and on expiry the process will interact with the EncryptKeyServer to fetch the required encryption-keys. The same caching policy is followed by the Redwood Storage Server and the Backup File processes too. + +### **Caveats** + +The encryption is done inline in the transaction path, which will increase the total commit latencies. A few possible ways to minimize this impact are: + +* Overlap encryption operations with the CP::resolution phase, which would minimize the latency penalty per transaction at the cost of spending more CPU cycles. For production deployments, we may need to increase the number of CPs per FDB cluster. +* Implement an external process to offload encryption. If done, encryption would appear no different than the CP::resolution phase, where the process would invoke RPC calls to encrypt the buffer and wait for operation completion. + +### Storage Servers + +The encryption design only supports Redwood Storage Server integration; support for other storage engines is yet to be planned. + +### Redwood Storage Nodes + +Redwood, at heart, is a B+ tree and stores data in two types of nodes: + +* `Non-leaf` nodes: store only keys and not values (prefix compression is applied). +* `Leaf` nodes: store `{key, value}` tuples for a given key-range. + +Both of the above-mentioned node types are converted into one or more fixed-size pages (likely 4K or 8K) before being persisted on durable storage. The encryption will be performed at the node level instead of the “page level”, i.e. all pages constituting a given Redwood node will be encrypted using the same encryption key, generated using the following formula: + +``` + DEK = HMAC SHA256(BEK || UID) + +Where: + +DEK = Derived Encryption Key +BEK = Base Encryption Key +UID = Host local random generated number +``` + +### Backup Files + +Backup Files are designed to pull committed mutations from StorageServers and persist them as “files” stored on cloud-backed BlobStorage such as Amazon S3.
Each persisted file stores mutations for a given key-range and will be encrypted by generating an encryption key using below formula: + +``` + DEK = HMAC SHA256(BEK || FID) + +Where: + +DEK = Derived Encryption Key +BEK = Base Encryption Key +FID = File Identifier (unique) +``` + +## Decryption on Reads + +To assist reads, FDB processes (StorageServers, Backup Files workers) will be modified to read/parse the encryption header. The data decryption will be done as follows: + +* The FDB process will interact with Encrypt KeyServer to fetch the desired base encryption key corresponding to the key-id persisted in the encryption header. +* Reconstruct the encryption key and decrypt the data block. + +## Future Work + +* Extend the TLog API to allow clients to read “plaintext mutations” directly from a TLogServer. In current implementations there are two consumers of TLogs: + + 1. Storage Server: At present the plan is for StorageServer to decrypt the mutations. + 2. BackupWorker (Apple implementation) which is currently not used in the code. diff --git a/documentation/sphinx/requirements.txt b/documentation/sphinx/requirements.txt index 67ca207628a..06e23ea6d30 100644 --- a/documentation/sphinx/requirements.txt +++ b/documentation/sphinx/requirements.txt @@ -3,3 +3,4 @@ setuptools>=20.10.0,<=57.4.0 sphinx==1.5.6 sphinx-bootstrap-theme==0.4.8 docutils==0.16 +Jinja2==3.0.3 diff --git a/documentation/sphinx/source/api-python.rst b/documentation/sphinx/source/api-python.rst index f3af667e0c7..5dab3e49c6d 100644 --- a/documentation/sphinx/source/api-python.rst +++ b/documentation/sphinx/source/api-python.rst @@ -7,7 +7,7 @@ .. |database-type| replace:: ``Database`` .. |database-class| replace:: :class:`Database` .. |database-auto| replace:: the :func:`@fdb.transactional ` decorator -.. |tenant-type| replace:: FIXME +.. |tenant-type| replace:: :class:`Tenant` .. |transaction-class| replace:: :class:`Transaction` .. |get-key-func| replace:: :func:`Transaction.get_key` .. |get-range-func| replace:: :func:`Transaction.get_range` @@ -316,9 +316,29 @@ A |database-blurb1| |database-blurb2| Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional ` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior. +.. method:: Database.open_tenant(tenant_name) + + Opens an existing tenant to be used for running transactions and returns it as a :class`Tenant` object. + + The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name. + .. |sync-read| replace:: This read is fully synchronous. .. |sync-write| replace:: This change will be committed immediately, and is fully synchronous. +.. method:: Database.allocate_tenant(tenant_name): + + Creates a new tenant in the cluster. |sync-write| + + The tenant name can be either a byte string or a tuple and cannot start with the ``\xff`` byte. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name. + +.. method:: Database.delete_tenant(tenant_name): + + Delete a tenant from the cluster. |sync-write| + + The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name. + + It is an error to delete a tenant that still has data. To delete a non-empty tenant, first clear all of the keys in the tenant. + .. 
method:: Database.get(key) Returns the value associated with the specified key in the database (or ``None`` if the key does not exist). |sync-read| @@ -460,6 +480,17 @@ Database options .. method:: Database.options.set_snapshot_ryw_disable() |option-db-snapshot-ryw-disable-blurb| + +Tenant objects +============== + +.. class:: Tenant + +|tenant-blurb1| + +.. method:: Tenant.create_transaction() + + Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional ` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior. .. _api-python-transactional-decorator: @@ -479,9 +510,9 @@ Transactional decoration The ``@fdb.transactional`` decorator makes ``simple_function`` a transactional function. All functions using this decorator must have an argument **named** ``tr``. This specially named argument is passed a transaction that the function can use to do reads and writes. - A caller of a transactionally decorated function can pass a :class:`Database` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits. + A caller of a transactionally decorated function can pass a :class:`Database` or :class:`Tenant` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits. - If ``db`` is a :class:`Database`, a call like :: + If ``db`` is a :class:`Database` or :class:`Tenant`, a call like :: simple_function(db, 'a', 'b') @@ -744,7 +775,7 @@ Committing .. decorator:: transactional() - The ``transactional`` decorator makes it easy to write transactional functions which accept either a :class:`Database` or a :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional ` for explanation and examples. + The ``transactional`` decorator makes it easy to write transactional functions which accept a :class:`Database`, :class`Tenant`, or :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional ` for explanation and examples. .. method :: Transaction.commit() @@ -754,7 +785,7 @@ Committing |commit-outstanding-reads-blurb| - .. note :: Consider using the :func:`@fdb.transactional ` decorator, which not only calls :meth:`Database.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions. + .. note :: Consider using the :func:`@fdb.transactional ` decorator, which not only calls :meth:`Database.create_transaction` or :meth`Tenant.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions. .. warning :: |used-during-commit-blurb| diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index 740da177067..9dcf5ffd8b5 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -155,6 +155,12 @@ Here is a complete list of valid parameters: **Example**: The URL parameter *header=x-amz-storage-class:REDUCED_REDUNDANCY* would send the HTTP header required to use the reduced redundancy storage option in the S3 API. 
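As a usage illustration for the tenant-related additions to `api-python.rst` above, the following sketch strings the documented calls together. It is hedged: it assumes the Python `fdb` bindings at an API version where tenants are available, a cluster configured with tenant support, and placeholder names such as `b'my_tenant'`.

```python
# Hedged usage sketch of the tenant API documented above; names and API version are assumptions.
import fdb

fdb.api_version(710)  # assumed API version with tenant support
db = fdb.open()

db.allocate_tenant(b'my_tenant')        # create the tenant (synchronous write)
tenant = db.open_tenant(b'my_tenant')   # returns a Tenant object

@fdb.transactional
def set_value(tr, key, value):
    tr[key] = value

# A Tenant may be passed wherever @fdb.transactional accepts a Database.
set_value(tenant, b'hello', b'world')

# A tenant must be empty before it can be deleted.
@fdb.transactional
def clear_all(tr):
    del tr[b'':b'\xff']

clear_all(tenant)
db.delete_tenant(b'my_tenant')
```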
+Signing Protocol +================= + +AWS signature version 4 is the default signing protocol choice. This boolean knob ``--knob_http_request_aws_v4_header`` can be used to select between v4 style and v2 style signatures. +If the knob is set to ``true`` then v4 signature will be used and if set to ``false`` then v2 signature will be used. + .. _blob-credential-files: Blob Credential Files diff --git a/documentation/tutorial/tutorial.actor.cpp b/documentation/tutorial/tutorial.actor.cpp index 646a078ed42..67c542b6326 100644 --- a/documentation/tutorial/tutorial.actor.cpp +++ b/documentation/tutorial/tutorial.actor.cpp @@ -238,7 +238,7 @@ ACTOR Future echoClient() { return Void(); } -struct SimpleKeyValueStoreInteface { +struct SimpleKeyValueStoreInterface { constexpr static FileIdentifier file_identifier = 8226647; RequestStream connect; RequestStream get; @@ -253,7 +253,7 @@ struct SimpleKeyValueStoreInteface { struct GetKVInterface { constexpr static FileIdentifier file_identifier = 8062308; - ReplyPromise reply; + ReplyPromise reply; template void serialize(Ar& ar) { @@ -297,7 +297,7 @@ struct ClearRequest { }; ACTOR Future kvStoreServer() { - state SimpleKeyValueStoreInteface inf; + state SimpleKeyValueStoreInterface inf; state std::map store; inf.connect.makeWellKnownEndpoint(WLTOKEN_SIMPLE_KV_SERVER, TaskPriority::DefaultEndpoint); loop { @@ -333,17 +333,17 @@ ACTOR Future kvStoreServer() { } } -ACTOR Future connect() { +ACTOR Future connect() { std::cout << format("%llu: Connect...\n", uint64_t(g_network->now())); - SimpleKeyValueStoreInteface c; + SimpleKeyValueStoreInterface c; c.connect = RequestStream(Endpoint::wellKnown({ serverAddress }, WLTOKEN_SIMPLE_KV_SERVER)); - SimpleKeyValueStoreInteface result = wait(c.connect.getReply(GetKVInterface())); + SimpleKeyValueStoreInterface result = wait(c.connect.getReply(GetKVInterface())); std::cout << format("%llu: done..\n", uint64_t(g_network->now())); return result; } ACTOR Future kvSimpleClient() { - state SimpleKeyValueStoreInteface server = wait(connect()); + state SimpleKeyValueStoreInterface server = wait(connect()); std::cout << format("Set %s -> %s\n", "foo", "bar"); SetRequest setRequest; setRequest.key = "foo"; @@ -356,7 +356,7 @@ ACTOR Future kvSimpleClient() { return Void(); } -ACTOR Future kvClient(SimpleKeyValueStoreInteface server, std::shared_ptr ops) { +ACTOR Future kvClient(SimpleKeyValueStoreInterface server, std::shared_ptr ops) { state Future timeout = delay(20); state int rangeSize = 2 << 12; loop { @@ -397,7 +397,7 @@ ACTOR Future throughputMeasurement(std::shared_ptr operations) { } ACTOR Future multipleClients() { - SimpleKeyValueStoreInteface server = wait(connect()); + SimpleKeyValueStoreInterface server = wait(connect()); auto ops = std::make_shared(0); std::vector> clients(100); for (auto& f : clients) { diff --git a/fdbbackup/FileConverter.actor.cpp b/fdbbackup/FileConverter.actor.cpp index 1e48bd523d0..8aeea5017fa 100644 --- a/fdbbackup/FileConverter.actor.cpp +++ b/fdbbackup/FileConverter.actor.cpp @@ -101,6 +101,7 @@ std::vector getRelevantLogFiles(const std::vector& files, Vers struct ConvertParams { std::string container_url; + Optional proxy; Version begin = invalidVersion; Version end = invalidVersion; bool log_enabled = false; @@ -112,6 +113,10 @@ struct ConvertParams { std::string s; s.append("ContainerURL:"); s.append(container_url); + if (proxy.present()) { + s.append(" Proxy:"); + s.append(proxy.get()); + } s.append(" Begin:"); s.append(format("%" PRId64, begin)); s.append(" End:"); @@ -448,7 
+453,8 @@ struct LogFileWriter { }; ACTOR Future convert(ConvertParams params) { - state Reference container = IBackupContainer::openContainer(params.container_url); + state Reference container = + IBackupContainer::openContainer(params.container_url, params.proxy, {}); state BackupFileList listing = wait(container->dumpFileList()); std::sort(listing.logs.begin(), listing.logs.end()); TraceEvent("Container").detail("URL", params.container_url).detail("Logs", listing.logs.size()); diff --git a/fdbbackup/FileConverter.h b/fdbbackup/FileConverter.h index a33032b183b..5ad5c53b1be 100644 --- a/fdbbackup/FileConverter.h +++ b/fdbbackup/FileConverter.h @@ -46,6 +46,7 @@ enum { OPT_HEX_KEY_PREFIX, OPT_BEGIN_VERSION_FILTER, OPT_END_VERSION_FILTER, + OPT_KNOB, OPT_HELP }; @@ -72,6 +73,7 @@ CSimpleOpt::SOption gConverterOptions[] = { { OPT_CONTAINER, "-r", SO_REQ_SEP }, { OPT_HEX_KEY_PREFIX, "--hex-prefix", SO_REQ_SEP }, { OPT_BEGIN_VERSION_FILTER, "--begin-version-filter", SO_REQ_SEP }, { OPT_END_VERSION_FILTER, "--end-version-filter", SO_REQ_SEP }, + { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_HELP, "-?", SO_NONE }, { OPT_HELP, "-h", SO_NONE }, { OPT_HELP, "--help", SO_NONE }, diff --git a/fdbbackup/FileDecoder.actor.cpp b/fdbbackup/FileDecoder.actor.cpp index 93c6cdc0845..7d9e27dcb17 100644 --- a/fdbbackup/FileDecoder.actor.cpp +++ b/fdbbackup/FileDecoder.actor.cpp @@ -26,17 +26,21 @@ #include #include "fdbbackup/BackupTLSConfig.h" +#include "fdbclient/BuildFlags.h" +#include "fdbbackup/FileConverter.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupContainer.h" -#include "fdbbackup/FileConverter.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/IKnobCollection.h" +#include "fdbclient/Knobs.h" #include "fdbclient/MutationList.h" +#include "flow/ArgParseUtil.h" #include "flow/IRandom.h" #include "flow/Trace.h" #include "flow/flow.h" #include "flow/serialize.h" -#include "fdbclient/BuildFlags.h" + #include "flow/actorcompiler.h" // has to be last include #define SevDecodeInfo SevVerbose @@ -73,11 +77,13 @@ void printDecodeUsage() { " --list-only Print file list and exit.\n" " -k KEY_PREFIX Use the prefix for filtering mutations\n" " --hex-prefix HEX_PREFIX\n" - " The prefix specified in HEX format, e.g., \\x05\\x01.\n" + " The prefix specified in HEX format, e.g., \"\\\\x05\\\\x01\".\n" " --begin-version-filter BEGIN_VERSION\n" " The version range's begin version (inclusive) for filtering.\n" " --end-version-filter END_VERSION\n" " The version range's end version (exclusive) for filtering.\n" + " --knob-KNOBNAME KNOBVALUE\n" + " Changes a knob value. KNOBNAME should be lowercase." 
"\n"; return; } @@ -88,6 +94,7 @@ void printBuildInformation() { struct DecodeParams { std::string container_url; + Optional proxy; std::string fileFilter; // only files match the filter will be decoded bool log_enabled = true; std::string log_dir, trace_format, trace_log_group; @@ -97,6 +104,8 @@ struct DecodeParams { Version beginVersionFilter = 0; Version endVersionFilter = std::numeric_limits::max(); + std::vector> knobs; + // Returns if [begin, end) overlap with the filter range bool overlap(Version begin, Version end) const { // Filter [100, 200), [50,75) [200, 300) @@ -107,6 +116,10 @@ struct DecodeParams { std::string s; s.append("ContainerURL: "); s.append(container_url); + if (proxy.present()) { + s.append(", Proxy: "); + s.append(proxy.get()); + } s.append(", FileFilter: "); s.append(fileFilter); if (log_enabled) { @@ -130,8 +143,39 @@ struct DecodeParams { if (!prefix.empty()) { s.append(", KeyPrefix: ").append(printable(KeyRef(prefix))); } + for (const auto& [knob, value] : knobs) { + s.append(", KNOB-").append(knob).append(" = ").append(value); + } return s; } + + void updateKnobs() { + auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection(); + for (const auto& [knobName, knobValueString] : knobs) { + try { + auto knobValue = g_knobs.parseKnobValue(knobName, knobValueString); + g_knobs.setKnob(knobName, knobValue); + } catch (Error& e) { + if (e.code() == error_code_invalid_option_value) { + std::cerr << "WARNING: Invalid value '" << knobValueString << "' for knob option '" << knobName + << "'\n"; + TraceEvent(SevWarnAlways, "InvalidKnobValue") + .detail("Knob", printable(knobName)) + .detail("Value", printable(knobValueString)); + } else { + std::cerr << "ERROR: Failed to set knob option '" << knobName << "': " << e.what() << "\n"; + TraceEvent(SevError, "FailedToSetKnob") + .errorUnsuppressed(e) + .detail("Knob", printable(knobName)) + .detail("Value", printable(knobValueString)); + throw; + } + } + } + + // Reinitialize knobs in order to update knobs that are dependent on explicitly set knobs + g_knobs.initialize(Randomize::True, IsSimulated::False); + } }; // Decode an ASCII string, e.g., "\x15\x1b\x19\x04\xaf\x0c\x28\x0a", @@ -256,6 +300,16 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) { param->tlsConfig.blobCredentials.push_back(args->OptionArg()); break; + case OPT_KNOB: { + Optional knobName = extractPrefixedArgument("--knob", args->OptionSyntax()); + if (!knobName.present()) { + std::cerr << "ERROR: unable to parse knob option '" << args->OptionSyntax() << "'\n"; + return FDB_EXIT_ERROR; + } + param->knobs.emplace_back(knobName.get(), args->OptionArg()); + break; + } + #ifndef TLS_DISABLED case TLSConfig::OPT_TLS_PLUGIN: args->OptionArg(); @@ -477,7 +531,8 @@ ACTOR Future process_file(Reference container, LogFile f } ACTOR Future decode_logs(DecodeParams params) { - state Reference container = IBackupContainer::openContainer(params.container_url); + state Reference container = + IBackupContainer::openContainer(params.container_url, params.proxy, {}); state UID uid = deterministicRandom()->randomUniqueID(); state BackupFileList listing = wait(container->dumpFileList()); // remove partitioned logs @@ -552,6 +607,9 @@ int main(int argc, char** argv) { StringRef url(param.container_url); setupNetwork(0, UseMetrics::True); + // Must be called after setupNetwork() to be effective + param.updateKnobs(); + TraceEvent::setNetworkThread(); openTraceFile(NetworkAddress(), 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group); 
param.tlsConfig.setupBlobCredentials(); diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 219d9ab820e..a8b42185690 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -130,6 +130,7 @@ enum { OPT_USE_PARTITIONED_LOG, // Backup and Restore constants + OPT_PROXY, OPT_TAGNAME, OPT_BACKUPKEYS, OPT_WAITFORDONE, @@ -234,6 +235,7 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = { { OPT_NOSTOPWHENDONE, "--no-stop-when-done", SO_NONE }, { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, // Enable "-p" option after GA // { OPT_USE_PARTITIONED_LOG, "-p", SO_NONE }, { OPT_USE_PARTITIONED_LOG, "--partitioned-log-experimental", SO_NONE }, @@ -294,6 +296,7 @@ CSimpleOpt::SOption g_rgBackupModifyOptions[] = { { OPT_MOD_VERIFY_UID, "--verify-uid", SO_REQ_SEP }, { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_SNAPSHOTINTERVAL, "-s", SO_REQ_SEP }, { OPT_SNAPSHOTINTERVAL, "--snapshot-interval", SO_REQ_SEP }, { OPT_MOD_ACTIVE_INTERVAL, "--active-snapshot-interval", SO_REQ_SEP }, @@ -482,6 +485,7 @@ CSimpleOpt::SOption g_rgBackupExpireOptions[] = { { OPT_CLUSTERFILE, "--cluster-file", SO_REQ_SEP }, { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP }, @@ -517,6 +521,7 @@ CSimpleOpt::SOption g_rgBackupDeleteOptions[] = { #endif { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP }, @@ -546,6 +551,7 @@ CSimpleOpt::SOption g_rgBackupDescribeOptions[] = { { OPT_CLUSTERFILE, "--cluster-file", SO_REQ_SEP }, { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP }, @@ -578,6 +584,7 @@ CSimpleOpt::SOption g_rgBackupDumpOptions[] = { { OPT_CLUSTERFILE, "--cluster-file", SO_REQ_SEP }, { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, @@ -652,6 +659,7 @@ CSimpleOpt::SOption g_rgBackupQueryOptions[] = { { OPT_RESTORE_TIMESTAMP, "--query-restore-timestamp", SO_REQ_SEP }, { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_RESTORE_VERSION, "-qrv", SO_REQ_SEP }, { OPT_RESTORE_VERSION, "--query-restore-version", SO_REQ_SEP }, { OPT_BACKUPKEYS_FILTER, "-k", SO_REQ_SEP }, @@ -689,6 +697,7 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = { { OPT_RESTORE_TIMESTAMP, "--timestamp", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_RESTORECONTAINER, "-r", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_PREFIX_ADD, "--add-prefix", SO_REQ_SEP }, { OPT_PREFIX_REMOVE, "--remove-prefix", SO_REQ_SEP }, { OPT_TAGNAME, "-t", SO_REQ_SEP }, @@ -1920,6 +1929,7 @@ ACTOR Future submitDBBackup(Database 
src, ACTOR Future submitBackup(Database db, std::string url, + Optional proxy, int initialSnapshotIntervalSeconds, int snapshotIntervalSeconds, Standalone> backupRanges, @@ -1977,6 +1987,7 @@ ACTOR Future submitBackup(Database db, else { wait(backupAgent.submitBackup(db, KeyRef(url), + proxy, initialSnapshotIntervalSeconds, snapshotIntervalSeconds, tagName, @@ -2260,8 +2271,9 @@ ACTOR Future changeDBBackupResumed(Database src, Database dest, bool pause } Reference openBackupContainer(const char* name, - std::string destinationContainer, - Optional const& encryptionKeyFile = {}) { + const std::string& destinationContainer, + const Optional& proxy, + const Optional& encryptionKeyFile) { // Error, if no dest container was specified if (destinationContainer.empty()) { fprintf(stderr, "ERROR: No backup destination was specified.\n"); @@ -2271,7 +2283,7 @@ Reference openBackupContainer(const char* name, Reference c; try { - c = IBackupContainer::openContainer(destinationContainer, encryptionKeyFile); + c = IBackupContainer::openContainer(destinationContainer, proxy, encryptionKeyFile); } catch (Error& e) { std::string msg = format("ERROR: '%s' on URL '%s'", e.what(), destinationContainer.c_str()); if (e.code() == error_code_backup_invalid_url && !IBackupContainer::lastOpenError.empty()) { @@ -2291,6 +2303,7 @@ ACTOR Future runRestore(Database db, std::string originalClusterFile, std::string tagName, std::string container, + Optional proxy, Standalone> ranges, Version beginVersion, Version targetVersion, @@ -2339,7 +2352,7 @@ ACTOR Future runRestore(Database db, state FileBackupAgent backupAgent; state Reference bc = - openBackupContainer(exeRestore.toString().c_str(), container, encryptionKeyFile); + openBackupContainer(exeRestore.toString().c_str(), container, proxy, encryptionKeyFile); // If targetVersion is unset then use the maximum restorable version from the backup description if (targetVersion == invalidVersion) { @@ -2368,6 +2381,7 @@ ACTOR Future runRestore(Database db, origDb, KeyRef(tagName), KeyRef(container), + proxy, ranges, waitForDone, targetVersion, @@ -2411,6 +2425,7 @@ ACTOR Future runRestore(Database db, ACTOR Future runFastRestoreTool(Database db, std::string tagName, std::string container, + Optional proxy, Standalone> ranges, Version dbVersion, bool performRestore, @@ -2440,7 +2455,7 @@ ACTOR Future runFastRestoreTool(Database db, if (performRestore) { if (dbVersion == invalidVersion) { TraceEvent("FastRestoreTool").detail("TargetRestoreVersion", "Largest restorable version"); - BackupDescription desc = wait(IBackupContainer::openContainer(container)->describeBackup()); + BackupDescription desc = wait(IBackupContainer::openContainer(container, proxy, {})->describeBackup()); if (!desc.maxRestorableVersion.present()) { fprintf(stderr, "The specified backup is not restorable to any version.\n"); throw restore_error(); @@ -2457,6 +2472,7 @@ ACTOR Future runFastRestoreTool(Database db, KeyRef(tagName), ranges, KeyRef(container), + proxy, dbVersion, LockDB::True, randomUID, @@ -2478,7 +2494,7 @@ ACTOR Future runFastRestoreTool(Database db, restoreVersion = dbVersion; } else { - state Reference bc = IBackupContainer::openContainer(container); + state Reference bc = IBackupContainer::openContainer(container, proxy, {}); state BackupDescription description = wait(bc->describeBackup()); if (dbVersion <= 0) { @@ -2522,9 +2538,10 @@ ACTOR Future runFastRestoreTool(Database db, ACTOR Future dumpBackupData(const char* name, std::string destinationContainer, + Optional proxy, Version 
beginVersion, Version endVersion) { - state Reference c = openBackupContainer(name, destinationContainer); + state Reference c = openBackupContainer(name, destinationContainer, proxy, {}); if (beginVersion < 0 || endVersion < 0) { BackupDescription desc = wait(c->describeBackup()); @@ -2552,6 +2569,7 @@ ACTOR Future dumpBackupData(const char* name, ACTOR Future expireBackupData(const char* name, std::string destinationContainer, + Optional proxy, Version endVersion, std::string endDatetime, Database db, @@ -2577,7 +2595,7 @@ ACTOR Future expireBackupData(const char* name, } try { - Reference c = openBackupContainer(name, destinationContainer, encryptionKeyFile); + Reference c = openBackupContainer(name, destinationContainer, proxy, encryptionKeyFile); state IBackupContainer::ExpireProgress progress; state std::string lastProgress; @@ -2623,9 +2641,11 @@ ACTOR Future expireBackupData(const char* name, return Void(); } -ACTOR Future deleteBackupContainer(const char* name, std::string destinationContainer) { +ACTOR Future deleteBackupContainer(const char* name, + std::string destinationContainer, + Optional proxy) { try { - state Reference c = openBackupContainer(name, destinationContainer); + state Reference c = openBackupContainer(name, destinationContainer, proxy, {}); state int numDeleted = 0; state Future done = c->deleteContainer(&numDeleted); @@ -2657,12 +2677,13 @@ ACTOR Future deleteBackupContainer(const char* name, std::string destinati ACTOR Future describeBackup(const char* name, std::string destinationContainer, + Optional proxy, bool deep, Optional cx, bool json, Optional encryptionKeyFile) { try { - Reference c = openBackupContainer(name, destinationContainer, encryptionKeyFile); + Reference c = openBackupContainer(name, destinationContainer, proxy, encryptionKeyFile); state BackupDescription desc = wait(c->describeBackup(deep)); if (cx.present()) wait(desc.resolveVersionTimes(cx.get())); @@ -2688,6 +2709,7 @@ static void reportBackupQueryError(UID operationId, JsonBuilderObject& result, s // resolved to that timestamp. 
ACTOR Future queryBackup(const char* name, std::string destinationContainer, + Optional proxy, Standalone> keyRangesFilter, Version restoreVersion, std::string originalClusterFile, @@ -2734,7 +2756,7 @@ ACTOR Future queryBackup(const char* name, } try { - state Reference bc = openBackupContainer(name, destinationContainer); + state Reference bc = openBackupContainer(name, destinationContainer, proxy, {}); if (restoreVersion == invalidVersion) { BackupDescription desc = wait(bc->describeBackup()); if (desc.maxRestorableVersion.present()) { @@ -2814,9 +2836,9 @@ ACTOR Future queryBackup(const char* name, return Void(); } -ACTOR Future listBackup(std::string baseUrl) { +ACTOR Future listBackup(std::string baseUrl, Optional proxy) { try { - std::vector containers = wait(IBackupContainer::listContainers(baseUrl)); + std::vector containers = wait(IBackupContainer::listContainers(baseUrl, proxy)); for (std::string container : containers) { printf("%s\n", container.c_str()); } @@ -2852,6 +2874,7 @@ ACTOR Future listBackupTags(Database cx) { struct BackupModifyOptions { Optional verifyUID; Optional destURL; + Optional proxy; Optional snapshotIntervalSeconds; Optional activeSnapshotIntervalSeconds; bool hasChanges() const { @@ -2869,7 +2892,7 @@ ACTOR Future modifyBackup(Database db, std::string tagName, BackupModifyOp state Reference bc; if (options.destURL.present()) { - bc = openBackupContainer(exeBackup.toString().c_str(), options.destURL.get()); + bc = openBackupContainer(exeBackup.toString().c_str(), options.destURL.get(), options.proxy, {}); try { wait(timeoutError(bc->create(), 30)); } catch (Error& e) { @@ -3342,6 +3365,7 @@ int main(int argc, char* argv[]) { break; } + Optional proxy; std::string destinationContainer; bool describeDeep = false; bool describeTimestamps = false; @@ -3595,6 +3619,14 @@ int main(int argc, char* argv[]) { return FDB_EXIT_ERROR; } break; + case OPT_PROXY: + proxy = args->OptionArg(); + if (!Hostname::isHostname(proxy.get()) && !NetworkAddress::parseOptional(proxy.get()).present()) { + fprintf(stderr, "ERROR: Proxy format should be either IP:port or host:port\n"); + return FDB_EXIT_ERROR; + } + modifyOptions.proxy = proxy; + break; case OPT_DESTCONTAINER: destinationContainer = args->OptionArg(); // If the url starts with '/' then prepend "file://" for backwards compatibility @@ -3962,9 +3994,10 @@ int main(int argc, char* argv[]) { if (!initCluster()) return FDB_EXIT_ERROR; // Test out the backup url to make sure it parses. Doesn't test to make sure it's actually writeable. - openBackupContainer(argv[0], destinationContainer, encryptionKeyFile); + openBackupContainer(argv[0], destinationContainer, proxy, encryptionKeyFile); f = stopAfter(submitBackup(db, destinationContainer, + proxy, initialSnapshotIntervalSeconds, snapshotIntervalSeconds, backupKeys, @@ -4036,6 +4069,7 @@ int main(int argc, char* argv[]) { } f = stopAfter(expireBackupData(argv[0], destinationContainer, + proxy, expireVersion, expireDatetime, db, @@ -4047,7 +4081,7 @@ int main(int argc, char* argv[]) { case BackupType::DELETE_BACKUP: initTraceFile(); - f = stopAfter(deleteBackupContainer(argv[0], destinationContainer)); + f = stopAfter(deleteBackupContainer(argv[0], destinationContainer, proxy)); break; case BackupType::DESCRIBE: @@ -4060,6 +4094,7 @@ int main(int argc, char* argv[]) { // given, but quietly skip them if not. f = stopAfter(describeBackup(argv[0], destinationContainer, + proxy, describeDeep, describeTimestamps ? 
Optional(db) : Optional(), jsonOutput, @@ -4068,7 +4103,7 @@ int main(int argc, char* argv[]) { case BackupType::LIST: initTraceFile(); - f = stopAfter(listBackup(baseUrl)); + f = stopAfter(listBackup(baseUrl, proxy)); break; case BackupType::TAGS: @@ -4081,6 +4116,7 @@ int main(int argc, char* argv[]) { initTraceFile(); f = stopAfter(queryBackup(argv[0], destinationContainer, + proxy, backupKeysFilter, restoreVersion, restoreClusterFileOrig, @@ -4090,7 +4126,7 @@ int main(int argc, char* argv[]) { case BackupType::DUMP: initTraceFile(); - f = stopAfter(dumpBackupData(argv[0], destinationContainer, dumpBegin, dumpEnd)); + f = stopAfter(dumpBackupData(argv[0], destinationContainer, proxy, dumpBegin, dumpEnd)); break; case BackupType::UNDEFINED: @@ -4141,6 +4177,7 @@ int main(int argc, char* argv[]) { restoreClusterFileOrig, tagName, restoreContainer, + proxy, backupKeys, beginVersion, restoreVersion, @@ -4218,6 +4255,7 @@ int main(int argc, char* argv[]) { f = stopAfter(runFastRestoreTool(db, tagName, restoreContainer, + proxy, backupKeys, restoreVersion, !dryRun, diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index c03bb17c882..6660893a1f6 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -51,7 +51,9 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { if (!doneExistenceCheck) { - Optional existingTenant = wait(safeThreadFutureToFuture(tr->get(tenantNameKey))); + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); if (existingTenant.present()) { throw tenant_already_exists(); } @@ -96,7 +98,9 @@ ACTOR Future deleteTenantCommandActor(Reference db, std::vector tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { if (!doneExistenceCheck) { - Optional existingTenant = wait(safeThreadFutureToFuture(tr->get(tenantNameKey))); + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); if (!existingTenant.present()) { throw tenant_not_found(); } @@ -163,8 +167,10 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< loop { try { - RangeResult tenants = wait(safeThreadFutureToFuture( - tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit))); + // Hold the reference to the standalone's memory + state ThreadFuture kvsFuture = + tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); + RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); if (tenants.empty()) { if (tokens.size() == 1) { @@ -213,7 +219,9 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector tenant = wait(safeThreadFutureToFuture(tr->get(tenantNameKey))); + // Hold the reference to the standalone's memory + state ThreadFuture> tenantFuture = tr->get(tenantNameKey); + Optional tenant = wait(safeThreadFutureToFuture(tenantFuture)); if (!tenant.present()) { throw tenant_not_found(); } diff --git a/fdbclient/BackupAgent.actor.h b/fdbclient/BackupAgent.actor.h index 94cb10d290d..a938dcd51fe 100644 --- a/fdbclient/BackupAgent.actor.h +++ b/fdbclient/BackupAgent.actor.h @@ -165,6 +165,7 @@ class FileBackupAgent : public BackupAgentBase { Key backupTag, Standalone> 
backupRanges, Key bcUrl, + Optional proxy, Version targetVersion, LockDB lockDB, UID randomUID, @@ -187,6 +188,7 @@ class FileBackupAgent : public BackupAgentBase { Optional cxOrig, Key tagName, Key url, + Optional proxy, Standalone> ranges, WaitForComplete = WaitForComplete::True, Version targetVersion = ::invalidVersion, @@ -202,6 +204,7 @@ class FileBackupAgent : public BackupAgentBase { Optional cxOrig, Key tagName, Key url, + Optional proxy, WaitForComplete waitForComplete = WaitForComplete::True, Version targetVersion = ::invalidVersion, Verbose verbose = Verbose::True, @@ -219,6 +222,7 @@ class FileBackupAgent : public BackupAgentBase { cxOrig, tagName, url, + proxy, rangeRef, waitForComplete, targetVersion, @@ -263,6 +267,7 @@ class FileBackupAgent : public BackupAgentBase { Future submitBackup(Reference tr, Key outContainer, + Optional proxy, int initialSnapshotIntervalSeconds, int snapshotIntervalSeconds, std::string const& tagName, @@ -273,6 +278,7 @@ class FileBackupAgent : public BackupAgentBase { Optional const& encryptionKeyFileName = {}); Future submitBackup(Database cx, Key outContainer, + Optional proxy, int initialSnapshotIntervalSeconds, int snapshotIntervalSeconds, std::string const& tagName, @@ -284,6 +290,7 @@ class FileBackupAgent : public BackupAgentBase { return runRYWTransactionFailIfLocked(cx, [=](Reference tr) { return submitBackup(tr, outContainer, + proxy, initialSnapshotIntervalSeconds, snapshotIntervalSeconds, tagName, @@ -720,20 +727,31 @@ template <> inline Tuple Codec>::pack(Reference const& bc) { Tuple tuple; tuple.append(StringRef(bc->getURL())); + if (bc->getProxy().present()) { + tuple.append(StringRef(bc->getProxy().get())); + } else { + tuple.append(StringRef()); + } if (bc->getEncryptionKeyFileName().present()) { tuple.append(bc->getEncryptionKeyFileName().get()); + } else { + tuple.append(StringRef()); } return tuple; } template <> inline Reference Codec>::unpack(Tuple const& val) { - ASSERT(val.size() == 1 || val.size() == 2); + ASSERT(val.size() == 3); auto url = val.getString(0).toString(); + Optional proxy; + if (!val.getString(1).empty()) { + proxy = val.getString(1).toString(); + } Optional encryptionKeyFileName; - if (val.size() == 2) { - encryptionKeyFileName = val.getString(1).toString(); + if (!val.getString(2).empty()) { + encryptionKeyFileName = val.getString(2).toString(); } - return IBackupContainer::openContainer(url, encryptionKeyFileName); + return IBackupContainer::openContainer(url, proxy, encryptionKeyFileName); } class BackupConfig : public KeyBackedConfig { diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 37b2eae0151..416d15c5484 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -256,7 +256,8 @@ std::vector IBackupContainer::getURLFormats() { // Get an IBackupContainer based on a container URL string Reference IBackupContainer::openContainer(const std::string& url, - Optional const& encryptionKeyFileName) { + const Optional& proxy, + const Optional& encryptionKeyFileName) { static std::map> m_cache; Reference& r = m_cache[url]; @@ -273,7 +274,7 @@ Reference IBackupContainer::openContainer(const std::string& u // The URL parameters contain blobstore endpoint tunables as well as possible backup-specific options. 
S3BlobStoreEndpoint::ParametersT backupParams; Reference bstore = - S3BlobStoreEndpoint::fromString(url, &resource, &lastOpenError, &backupParams); + S3BlobStoreEndpoint::fromString(url, proxy, &resource, &lastOpenError, &backupParams); if (resource.empty()) throw backup_invalid_url(); @@ -317,7 +318,7 @@ Reference IBackupContainer::openContainer(const std::string& u // Get a list of URLS to backup containers based on some a shorter URL. This function knows about some set of supported // URL types which support this sort of backup discovery. -ACTOR Future> listContainers_impl(std::string baseURL) { +ACTOR Future> listContainers_impl(std::string baseURL, Optional proxy) { try { StringRef u(baseURL); if (u.startsWith("file://"_sr)) { @@ -327,8 +328,8 @@ ACTOR Future> listContainers_impl(std::string baseURL) std::string resource; S3BlobStoreEndpoint::ParametersT backupParams; - Reference bstore = - S3BlobStoreEndpoint::fromString(baseURL, &resource, &IBackupContainer::lastOpenError, &backupParams); + Reference bstore = S3BlobStoreEndpoint::fromString( + baseURL, proxy, &resource, &IBackupContainer::lastOpenError, &backupParams); if (!resource.empty()) { TraceEvent(SevWarn, "BackupContainer") @@ -370,8 +371,9 @@ ACTOR Future> listContainers_impl(std::string baseURL) } } -Future> IBackupContainer::listContainers(const std::string& baseURL) { - return listContainers_impl(baseURL); +Future> IBackupContainer::listContainers(const std::string& baseURL, + const Optional& proxy) { + return listContainers_impl(baseURL, proxy); } ACTOR Future timeKeeperVersionFromDatetime(std::string datetime, Database db) { diff --git a/fdbclient/BackupContainer.h b/fdbclient/BackupContainer.h index 312e3b8ac70..36c9ff7cfab 100644 --- a/fdbclient/BackupContainer.h +++ b/fdbclient/BackupContainer.h @@ -156,6 +156,7 @@ struct BackupFileList { struct BackupDescription { BackupDescription() : snapshotBytes(0) {} std::string url; + Optional proxy; std::vector snapshots; int64_t snapshotBytes; // The version before which everything has been deleted by an expire @@ -294,11 +295,14 @@ class IBackupContainer { // Get an IBackupContainer based on a container spec string static Reference openContainer(const std::string& url, - const Optional& encryptionKeyFileName = {}); + const Optional& proxy, + const Optional& encryptionKeyFileName); static std::vector getURLFormats(); - static Future> listContainers(const std::string& baseURL); + static Future> listContainers(const std::string& baseURL, + const Optional& proxy); std::string const& getURL() const { return URL; } + Optional const& getProxy() const { return proxy; } Optional const& getEncryptionKeyFileName() const { return encryptionKeyFileName; } static std::string lastOpenError; @@ -306,6 +310,7 @@ class IBackupContainer { // TODO: change the following back to `private` once blob obj access is refactored protected: std::string URL; + Optional proxy; Optional encryptionKeyFileName; }; diff --git a/fdbclient/BackupContainerFileSystem.actor.cpp b/fdbclient/BackupContainerFileSystem.actor.cpp index 7acbd227f26..a4778ecc104 100644 --- a/fdbclient/BackupContainerFileSystem.actor.cpp +++ b/fdbclient/BackupContainerFileSystem.actor.cpp @@ -409,6 +409,7 @@ class BackupContainerFileSystemImpl { Version logStartVersionOverride) { state BackupDescription desc; desc.url = bc->getURL(); + desc.proxy = bc->getProxy(); TraceEvent("BackupContainerDescribe1") .detail("URL", bc->getURL()) @@ -1500,7 +1501,8 @@ Future BackupContainerFileSystem::createTestEncryptionKeyFile(std::string // code but 
returning a different template type because you can't cast between them Reference BackupContainerFileSystem::openContainerFS( const std::string& url, - Optional const& encryptionKeyFileName) { + const Optional& proxy, + const Optional& encryptionKeyFileName) { static std::map> m_cache; Reference& r = m_cache[url]; @@ -1517,7 +1519,7 @@ Reference BackupContainerFileSystem::openContainerFS( // The URL parameters contain blobstore endpoint tunables as well as possible backup-specific options. S3BlobStoreEndpoint::ParametersT backupParams; Reference bstore = - S3BlobStoreEndpoint::fromString(url, &resource, &lastOpenError, &backupParams); + S3BlobStoreEndpoint::fromString(url, proxy, &resource, &lastOpenError, &backupParams); if (resource.empty()) throw backup_invalid_url(); @@ -1635,7 +1637,9 @@ ACTOR static Future testWriteSnapshotFile(Reference file, Key return Void(); } -ACTOR Future testBackupContainer(std::string url, Optional encryptionKeyFileName) { +ACTOR Future testBackupContainer(std::string url, + Optional proxy, + Optional encryptionKeyFileName) { state FlowLock lock(100e6); if (encryptionKeyFileName.present()) { @@ -1644,7 +1648,7 @@ ACTOR Future testBackupContainer(std::string url, Optional en printf("BackupContainerTest URL %s\n", url.c_str()); - state Reference c = IBackupContainer::openContainer(url, encryptionKeyFileName); + state Reference c = IBackupContainer::openContainer(url, proxy, encryptionKeyFileName); // Make sure container doesn't exist, then create it. try { @@ -1789,12 +1793,13 @@ ACTOR Future testBackupContainer(std::string url, Optional en } TEST_CASE("/backup/containers/localdir/unencrypted") { - wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {})); + wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {})); return Void(); } TEST_CASE("/backup/containers/localdir/encrypted") { wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), + {}, format("%s/test_encryption_key", params.getDataDir().c_str()))); return Void(); } @@ -1803,7 +1808,7 @@ TEST_CASE("/backup/containers/url") { if (!g_network->isSimulated()) { const char* url = getenv("FDB_TEST_BACKUP_URL"); ASSERT(url != nullptr); - wait(testBackupContainer(url, {})); + wait(testBackupContainer(url, {}, {})); } return Void(); } @@ -1813,7 +1818,7 @@ TEST_CASE("/backup/containers_list") { state const char* url = getenv("FDB_TEST_BACKUP_URL"); ASSERT(url != nullptr); printf("Listing %s\n", url); - std::vector urls = wait(IBackupContainer::listContainers(url)); + std::vector urls = wait(IBackupContainer::listContainers(url, {})); for (auto& u : urls) { printf("%s\n", u.c_str()); } diff --git a/fdbclient/BackupContainerFileSystem.h b/fdbclient/BackupContainerFileSystem.h index 52c5d3fc546..784b1133956 100644 --- a/fdbclient/BackupContainerFileSystem.h +++ b/fdbclient/BackupContainerFileSystem.h @@ -81,9 +81,9 @@ class BackupContainerFileSystem : public IBackupContainer { Future exists() override = 0; // TODO: refactor this to separate out the "deal with blob store" stuff from the backup business logic - static Reference openContainerFS( - const std::string& url, - const Optional& encryptionKeyFileName = {}); + static Reference openContainerFS(const std::string& url, + const Optional& proxy, + const Optional& encryptionKeyFileName); // Get a list of fileNames and their sizes in the container under the given path // Although not required, an 
implementation can avoid traversing unwanted subfolders diff --git a/fdbclient/BlobGranuleCommon.h b/fdbclient/BlobGranuleCommon.h index c76e72342d0..97074326d93 100644 --- a/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/BlobGranuleCommon.h @@ -52,19 +52,20 @@ struct BlobFilePointerRef { StringRef filename; int64_t offset; int64_t length; + int64_t fullFileLength; BlobFilePointerRef() {} - BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, int64_t length) - : filename(to, filename), offset(offset), length(length) {} + BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, int64_t length, int64_t fullFileLength) + : filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) {} template void serialize(Ar& ar) { - serializer(ar, filename, offset, length); + serializer(ar, filename, offset, length, fullFileLength); } std::string toString() const { std::stringstream ss; - ss << filename.toString() << ":" << offset << ":" << length; + ss << filename.toString() << ":" << offset << ":" << length << ":" << fullFileLength; return std::move(ss).str(); } }; diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index a7e5dda5a32..469573e3d95 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -18,9 +18,12 @@ * limitations under the License. */ +#include + #include "contrib/fmt-8.1.1/include/fmt/format.h" #include "flow/serialize.h" #include "fdbclient/BlobGranuleFiles.h" +#include "fdbclient/Knobs.h" #include "fdbclient/SystemData.h" // for allKeys unit test - could remove #include "flow/UnitTest.h" @@ -119,29 +122,43 @@ static void applyDelta(KeyRangeRef keyRange, MutationRef m, std::map& dataMap) { - if (!deltas.empty()) { - // check that consecutive delta file versions are disjoint - ASSERT(lastFileEndVersion < deltas.front().version); + if (deltas.empty()) { + return; + } + // check that consecutive delta file versions are disjoint + ASSERT(lastFileEndVersion < deltas.front().version); + + const MutationsAndVersionRef* mutationIt = deltas.begin(); + // prune beginVersion if necessary + if (beginVersion > deltas.front().version) { + ASSERT(beginVersion <= deltas.back().version); + // binary search for beginVersion + mutationIt = std::lower_bound(deltas.begin(), + deltas.end(), + MutationsAndVersionRef(beginVersion, 0), + MutationsAndVersionRef::OrderByVersion()); } - for (const MutationsAndVersionRef& delta : deltas) { - if (delta.version > readVersion) { + + while (mutationIt != deltas.end()) { + if (mutationIt->version > readVersion) { lastFileEndVersion = readVersion; return; } - for (auto& m : delta.mutations) { + for (auto& m : mutationIt->mutations) { applyDelta(keyRange, m, dataMap); } + mutationIt++; } - if (!deltas.empty()) { - lastFileEndVersion = deltas.back().version; - } + lastFileEndVersion = deltas.back().version; } static Arena loadDeltaFile(StringRef deltaData, KeyRangeRef keyRange, + Version beginVersion, Version readVersion, Version& lastFileEndVersion, std::map& dataMap) { @@ -151,7 +168,7 @@ static Arena loadDeltaFile(StringRef deltaData, reader.deserialize(FileIdentifierFor::value, deltas, parseArena); if (BG_READ_DEBUG) { - fmt::print("Parsed {}} deltas from file\n", deltas.size()); + fmt::print("Parsed {} deltas from file\n", deltas.size()); } // TODO REMOVE sanity check @@ -163,19 +180,18 @@ static Arena loadDeltaFile(StringRef deltaData, ASSERT(deltas[i].version <= deltas[i + 1].version); } - applyDeltas(deltas, keyRange, readVersion, 
lastFileEndVersion, dataMap); + applyDeltas(deltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap); return parseArena; } RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, KeyRangeRef keyRange, + Version beginVersion, Version readVersion, Optional snapshotData, StringRef deltaFileData[]) { - // TODO REMOVE with V2 of protocol + // TODO REMOVE with early replying ASSERT(readVersion == chunk.includedVersion); - ASSERT(chunk.snapshotFile.present()); - ASSERT(snapshotData.present()); // Arena to hold all allocations for applying deltas. Most of it, and the arenas produced by reading the files, // will likely be tossed if there are a significant number of mutations, so we copy at the end instead of doing a @@ -195,13 +211,14 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, fmt::print("Applying {} delta files\n", chunk.deltaFiles.size()); } for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) { - Arena deltaArena = loadDeltaFile(deltaFileData[deltaIdx], keyRange, readVersion, lastFileEndVersion, dataMap); + Arena deltaArena = + loadDeltaFile(deltaFileData[deltaIdx], keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap); arena.dependsOn(deltaArena); } if (BG_READ_DEBUG) { fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size()); } - applyDeltas(chunk.newDeltas, keyRange, readVersion, lastFileEndVersion, dataMap); + applyDeltas(chunk.newDeltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap); RangeResult ret; for (auto& it : dataMap) { @@ -211,50 +228,90 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, return ret; } +struct GranuleLoadIds { + Optional snapshotId; + std::vector deltaIds; +}; + +static void startLoad(const ReadBlobGranuleContext granuleContext, + const BlobGranuleChunkRef& chunk, + GranuleLoadIds& loadIds) { + + // Start load process for all files in chunk + if (chunk.snapshotFile.present()) { + std::string snapshotFname = chunk.snapshotFile.get().filename.toString(); + // FIXME: remove when we implement file multiplexing + ASSERT(chunk.snapshotFile.get().offset == 0); + ASSERT(chunk.snapshotFile.get().length == chunk.snapshotFile.get().fullFileLength); + loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(), + snapshotFname.size(), + chunk.snapshotFile.get().offset, + chunk.snapshotFile.get().length, + chunk.snapshotFile.get().fullFileLength, + granuleContext.userContext); + } + loadIds.deltaIds.reserve(chunk.deltaFiles.size()); + for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) { + std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString(); + // FIXME: remove when we implement file multiplexing + ASSERT(chunk.deltaFiles[deltaFileIdx].offset == 0); + ASSERT(chunk.deltaFiles[deltaFileIdx].length == chunk.deltaFiles[deltaFileIdx].fullFileLength); + int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(), + deltaFName.size(), + chunk.deltaFiles[deltaFileIdx].offset, + chunk.deltaFiles[deltaFileIdx].length, + chunk.deltaFiles[deltaFileIdx].fullFileLength, + granuleContext.userContext); + loadIds.deltaIds.push_back(deltaLoadId); + } +} + ErrorOr loadAndMaterializeBlobGranules(const Standalone>& files, const KeyRangeRef& keyRange, Version beginVersion, Version readVersion, ReadBlobGranuleContext granuleContext) { + int64_t parallelism = granuleContext.granuleParallelism; + if (parallelism < 1) { + parallelism = 1; + } + if (parallelism >= 
CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM) { + parallelism = CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM; + } + + GranuleLoadIds loadIds[files.size()]; + + // Kick off first file reads if parallelism > 1 + for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { + startLoad(granuleContext, files[i], loadIds[i]); + } + try { RangeResult results; - // FIXME: could submit multiple chunks to start_load_f in parallel? - for (const BlobGranuleChunkRef& chunk : files) { - RangeResult chunkRows; - - int64_t snapshotLoadId; - int64_t deltaLoadIds[chunk.deltaFiles.size()]; - - // Start load process for all files in chunk - // In V1 of api snapshot is required, optional is just for forward compatibility - ASSERT(chunk.snapshotFile.present()); - std::string snapshotFname = chunk.snapshotFile.get().filename.toString(); - snapshotLoadId = granuleContext.start_load_f(snapshotFname.c_str(), - snapshotFname.size(), - chunk.snapshotFile.get().offset, - chunk.snapshotFile.get().length, - granuleContext.userContext); - int64_t deltaLoadLengths[chunk.deltaFiles.size()]; - StringRef deltaData[chunk.deltaFiles.size()]; - for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) { - std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString(); - deltaLoadIds[deltaFileIdx] = granuleContext.start_load_f(deltaFName.c_str(), - deltaFName.size(), - chunk.deltaFiles[deltaFileIdx].offset, - chunk.deltaFiles[deltaFileIdx].length, - granuleContext.userContext); - deltaLoadLengths[deltaFileIdx] = chunk.deltaFiles[deltaFileIdx].length; + for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) { + // Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1 + if (chunkIdx + parallelism - 1 < files.size()) { + startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); } + RangeResult chunkRows; + // once all loads kicked off, load data for chunk - StringRef snapshotData(granuleContext.get_load_f(snapshotLoadId, granuleContext.userContext), - chunk.snapshotFile.get().length); - if (!snapshotData.begin()) { - return ErrorOr(blob_granule_file_load_error()); + Optional snapshotData; + if (files[chunkIdx].snapshotFile.present()) { + snapshotData = + StringRef(granuleContext.get_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext), + files[chunkIdx].snapshotFile.get().length); + if (!snapshotData.get().begin()) { + return ErrorOr(blob_granule_file_load_error()); + } } - for (int i = 0; i < chunk.deltaFiles.size(); i++) { - deltaData[i] = StringRef(granuleContext.get_load_f(deltaLoadIds[i], granuleContext.userContext), - chunk.deltaFiles[i].length); + + StringRef deltaData[files[chunkIdx].deltaFiles.size()]; + for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) { + deltaData[i] = + StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext), + files[chunkIdx].deltaFiles[i].length); // null data is error if (!deltaData[i].begin()) { return ErrorOr(blob_granule_file_load_error()); @@ -262,14 +319,17 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone(results); @@ -278,8 +338,7 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone loadAndMaterializeBlobGranules(const Standalone snapshotData, StringRef deltaFileData[]); diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index e69d1ad94ff..2fde30be0da 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ b/fdbclient/BlobGranuleReader.actor.cpp @@ -28,6 +28,7 
@@ #include "fdbclient/BlobGranuleReader.actor.h" #include "fdbclient/BlobWorkerCommon.h" #include "fdbclient/BlobWorkerInterface.h" +#include "fdbclient/FDBTypes.h" #include "flow/actorcompiler.h" // This must be the last #include. // TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other @@ -52,7 +53,6 @@ ACTOR Future> readFile(Reference(dataRef, arena); } catch (Error& e) { - printf("Reading file %s got error %s\n", f.toString().c_str(), e.name()); throw e; } } @@ -64,22 +64,25 @@ ACTOR Future> readFile(Reference readBlobGranule(BlobGranuleChunkRef chunk, KeyRangeRef keyRange, + Version beginVersion, Version readVersion, Reference bstore, Optional stats) { - // TODO REMOVE with V2 of protocol + // TODO REMOVE with early replying ASSERT(readVersion == chunk.includedVersion); - ASSERT(chunk.snapshotFile.present()); state Arena arena; try { - Future> readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get()); - state std::vector>> readDeltaFutures; - if (stats.present()) { - ++stats.get()->s3GetReqs; + Future> readSnapshotFuture; + if (chunk.snapshotFile.present()) { + readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get()); + if (stats.present()) { + ++stats.get()->s3GetReqs; + } } + state std::vector>> readDeltaFutures; readDeltaFutures.reserve(chunk.deltaFiles.size()); for (BlobFilePointerRef deltaFile : chunk.deltaFiles) { @@ -89,8 +92,12 @@ ACTOR Future readBlobGranule(BlobGranuleChunkRef chunk, } } - state Standalone snapshotData = wait(readSnapshotFuture); - arena.dependsOn(snapshotData.arena()); + state Optional snapshotData; // not present if snapshotFile isn't present + if (chunk.snapshotFile.present()) { + state Standalone s = wait(readSnapshotFuture); + arena.dependsOn(s.arena()); + snapshotData = s; + } state int numDeltaFiles = chunk.deltaFiles.size(); state StringRef* deltaData = new (arena) StringRef[numDeltaFiles]; @@ -103,10 +110,9 @@ ACTOR Future readBlobGranule(BlobGranuleChunkRef chunk, arena.dependsOn(data.arena()); } - return materializeBlobGranule(chunk, keyRange, readVersion, snapshotData, deltaData); + return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData); } catch (Error& e) { - printf("Reading blob granule got error %s\n", e.name()); throw e; } } @@ -121,18 +127,12 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, try { state int i; for (i = 0; i < reply.chunks.size(); i++) { - /*printf("ReadBlobGranules processing chunk %d [%s - %s)\n", - i, - reply.chunks[i].keyRange.begin.printable().c_str(), - reply.chunks[i].keyRange.end.printable().c_str());*/ - RangeResult chunkResult = - wait(readBlobGranule(reply.chunks[i], request.keyRange, request.readVersion, bstore)); + RangeResult chunkResult = wait( + readBlobGranule(reply.chunks[i], request.keyRange, request.beginVersion, request.readVersion, bstore)); results.send(std::move(chunkResult)); } - // printf("ReadBlobGranules done, sending EOS\n"); results.sendError(end_of_stream()); } catch (Error& e) { - printf("ReadBlobGranules got error %s\n", e.name()); results.sendError(e); } diff --git a/fdbclient/BlobGranuleReader.actor.h b/fdbclient/BlobGranuleReader.actor.h index 1b168ebc5d7..958dc817e81 100644 --- a/fdbclient/BlobGranuleReader.actor.h +++ b/fdbclient/BlobGranuleReader.actor.h @@ -40,6 +40,7 @@ // the request ACTOR Future readBlobGranule(BlobGranuleChunkRef chunk, KeyRangeRef keyRange, + Version beginVersion, Version readVersion, Reference bstore, Optional stats = Optional()); 
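The rewritten loadAndMaterializeBlobGranules above pipelines file loads across granules: with granuleParallelism = P it starts the loads for the first P-1 chunks up front, then keeps the window full by starting the load for chunk i+P-1 before blocking on chunk i. A minimal standalone sketch of that sliding-window pattern follows (placeholder callback names, not the FDB API):

```cpp
// Sketch only: sliding-window prefetch as used by loadAndMaterializeBlobGranules.
// startLoad/waitAndMaterialize are placeholders for "kick off an async file load"
// and "block on the loaded data and materialize the chunk".
#include <cstdint>
#include <functional>

void processChunksWithPrefetch(int64_t numChunks,
                               int64_t parallelism,
                               const std::function<void(int64_t)>& startLoad,
                               const std::function<void(int64_t)>& waitAndMaterialize) {
    if (parallelism < 1) {
        parallelism = 1;
    }
    // Kick off the first parallelism-1 loads up front; the rest are started in the loop.
    for (int64_t i = 0; i < parallelism - 1 && i < numChunks; i++) {
        startLoad(i);
    }
    for (int64_t i = 0; i < numChunks; i++) {
        // Keep the window full: start the load that is parallelism-1 chunks ahead.
        if (i + parallelism - 1 < numChunks) {
            startLoad(i + parallelism - 1);
        }
        // Block on the data for the current chunk and materialize it.
        waitAndMaterialize(i);
    }
}
```

With parallelism = 1 this degenerates to the previous behavior of loading each chunk's files immediately before materializing it.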
diff --git a/fdbclient/BlobWorkerCommon.h b/fdbclient/BlobWorkerCommon.h index 2898412e730..49aed17985e 100644 --- a/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/BlobWorkerCommon.h @@ -38,6 +38,8 @@ struct BlobWorkerStats { Counter commitVersionChecks; Counter granuleUpdateErrors; Counter granuleRequestTimeouts; + Counter readRequestsWithBegin; + Counter readRequestsCollapsed; int numRangesAssigned; int mutationBytesBuffered; @@ -59,6 +61,7 @@ struct BlobWorkerStats { readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc), granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc), + readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0) { specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; }); specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; }); diff --git a/fdbclient/BlobWorkerInterface.h b/fdbclient/BlobWorkerInterface.h index f69b73e1bc7..5dd36b71281 100644 --- a/fdbclient/BlobWorkerInterface.h +++ b/fdbclient/BlobWorkerInterface.h @@ -86,13 +86,14 @@ struct BlobGranuleFileRequest { KeyRangeRef keyRange; Version beginVersion = 0; Version readVersion; + bool canCollapseBegin = true; ReplyPromise reply; BlobGranuleFileRequest() {} template void serialize(Ar& ar) { - serializer(ar, keyRange, beginVersion, readVersion, reply, arena); + serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, reply, arena); } }; diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 81b41314d18..3cb841dbd7b 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -205,6 +205,17 @@ if(BUILD_AZURE_BACKUP) ) endif() + +if(WITH_AWS_BACKUP) + add_compile_definitions(BUILD_AWS_BACKUP) + + set(FDBCLIENT_SRCS + ${FDBCLIENT_SRCS} + FDBAWSCredentialsProvider.h) + + include(awssdk) +endif() + add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs}) add_dependencies(fdbclient fdboptions) target_link_libraries(fdbclient PUBLIC fdbrpc msgpack) @@ -224,3 +235,8 @@ if(BUILD_AZURE_BACKUP) target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite) target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite) endif() + +if(BUILD_AWS_BACKUP) + target_link_libraries(fdbclient PUBLIC awssdk_target) + target_link_libraries(fdbclient_sampling PUBLIC awssdk_target) +endif() diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 5f098ffa823..29aff4f4e17 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -50,6 +50,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( MAX_GENERATIONS_OVERRIDE, 0 ); init( MAX_GENERATIONS_SIM, 50 ); //Disable network connections after this many generations in simulation, should be less than RECOVERY_DELAY_START_GENERATION + init( COORDINATOR_HOSTNAME_RESOLVE_DELAY, 0.05 ); init( COORDINATOR_RECONNECTION_DELAY, 1.0 ); init( CLIENT_EXAMPLE_AMOUNT, 20 ); init( MAX_CLIENT_STATUS_AGE, 1.0 ); @@ -280,6 +281,9 @@ void ClientKnobs::initialize(Randomize randomize) { init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 ); init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 ); + // Blob granules + init( BG_MAX_GRANULE_PARALLELISM, 10 ); + // clang-format on } diff --git a/fdbclient/ClientKnobs.h b/fdbclient/ClientKnobs.h index 
8d6afa6d7be..de21a60fd0c 100644 --- a/fdbclient/ClientKnobs.h +++ b/fdbclient/ClientKnobs.h @@ -49,6 +49,7 @@ class ClientKnobs : public KnobsImpl { double MAX_GENERATIONS_OVERRIDE; double MAX_GENERATIONS_SIM; + double COORDINATOR_HOSTNAME_RESOLVE_DELAY; double COORDINATOR_RECONNECTION_DELAY; int CLIENT_EXAMPLE_AMOUNT; double MAX_CLIENT_STATUS_AGE; @@ -272,6 +273,9 @@ class ClientKnobs : public KnobsImpl { int MVC_CLIENTLIB_CHUNK_SIZE; int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION; + // Blob Granules + int BG_MAX_GRANULE_PARALLELISM; + ClientKnobs(Randomize randomize); void initialize(Randomize randomize); }; diff --git a/fdbclient/DatabaseConfiguration.cpp b/fdbclient/DatabaseConfiguration.cpp index e0205b8628e..e9159503614 100644 --- a/fdbclient/DatabaseConfiguration.cpp +++ b/fdbclient/DatabaseConfiguration.cpp @@ -23,6 +23,7 @@ #include "flow/ITrace.h" #include "flow/Trace.h" #include "flow/genericactors.actor.h" +#include "flow/UnitTest.h" DatabaseConfiguration::DatabaseConfiguration() { resetInternal(); @@ -490,9 +491,9 @@ void DatabaseConfiguration::overwriteProxiesCount() { Optional optGrvProxies = DatabaseConfiguration::get(grvProxiesKey); Optional optProxies = DatabaseConfiguration::get(proxiesKey); - const int mutableGrvProxyCount = optGrvProxies.present() ? toInt(optGrvProxies.get()) : 0; - const int mutableCommitProxyCount = optCommitProxies.present() ? toInt(optCommitProxies.get()) : 0; - const int mutableProxiesCount = optProxies.present() ? toInt(optProxies.get()) : 0; + const int mutableGrvProxyCount = optGrvProxies.present() ? toInt(optGrvProxies.get()) : -1; + const int mutableCommitProxyCount = optCommitProxies.present() ? toInt(optCommitProxies.get()) : -1; + const int mutableProxiesCount = optProxies.present() ? toInt(optProxies.get()) : -1; if (mutableProxiesCount > 1) { TraceEvent(SevDebug, "OverwriteProxiesCount") @@ -502,23 +503,23 @@ void DatabaseConfiguration::overwriteProxiesCount() { .detail("MutableGrvCPCount", mutableGrvProxyCount) .detail("MutableProxiesCount", mutableProxiesCount); - if (grvProxyCount == -1 && commitProxyCount > 0) { - if (mutableProxiesCount > commitProxyCount) { - grvProxyCount = mutableProxiesCount - commitProxyCount; + if (mutableGrvProxyCount == -1 && mutableCommitProxyCount > 0) { + if (mutableProxiesCount > mutableCommitProxyCount) { + grvProxyCount = mutableProxiesCount - mutableCommitProxyCount; } else { // invalid configuration; provision min GrvProxies grvProxyCount = 1; commitProxyCount = mutableProxiesCount - 1; } - } else if (grvProxyCount > 0 && commitProxyCount == -1) { - if (mutableProxiesCount > grvProxyCount) { + } else if (mutableGrvProxyCount > 0 && mutableCommitProxyCount == -1) { + if (mutableProxiesCount > mutableGrvProxyCount) { commitProxyCount = mutableProxiesCount - grvProxyCount; } else { // invalid configuration; provision min CommitProxies commitProxyCount = 1; grvProxyCount = mutableProxiesCount - 1; } - } else if (grvProxyCount == -1 && commitProxyCount == -1) { + } else if (mutableGrvProxyCount == -1 && mutableCommitProxyCount == -1) { // Use DEFAULT_COMMIT_GRV_PROXIES_RATIO to split proxies between Grv & Commit proxies const int derivedGrvProxyCount = std::max(1, @@ -825,3 +826,21 @@ bool DatabaseConfiguration::isOverridden(std::string key) const { return false; } + +TEST_CASE("/fdbclient/databaseConfiguration/overwriteCommitProxy") { + DatabaseConfiguration conf1; + conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/grv_proxies"_sr, "5"_sr)); + 
conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/proxies"_sr, "10"_sr)); + conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/grv_proxies"_sr, "-1"_sr)); + conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/commit_proxies"_sr, "-1"_sr)); + + DatabaseConfiguration conf2; + conf2.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/proxies"_sr, "10"_sr)); + conf2.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/grv_proxies"_sr, "-1"_sr)); + conf2.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/commit_proxies"_sr, "-1"_sr)); + + ASSERT(conf1 == conf2); + ASSERT(conf1.getDesiredCommitProxies() == conf2.getDesiredCommitProxies()); + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 72d38da77d1..68af8303193 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -515,7 +515,7 @@ class DatabaseContext : public ReferenceCounted, public FastAll Counter transactionGrvTimedOutBatches; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, - bytesPerCommit; + bytesPerCommit, bgLatencies, bgGranulesPerRequest; int outstandingWatches; int maxOutstandingWatches; @@ -544,6 +544,7 @@ class DatabaseContext : public ReferenceCounted, public FastAll bool transactionTracingSample; double verifyCausalReadsProp = 0.0; bool blobGranuleNoMaterialize = false; + bool anyBlobGranuleRequests = false; Future logger; Future throttleExpirer; diff --git a/fdbclient/FDBAWSCredentialsProvider.h b/fdbclient/FDBAWSCredentialsProvider.h new file mode 100644 index 00000000000..f09e2f858e7 --- /dev/null +++ b/fdbclient/FDBAWSCredentialsProvider.h @@ -0,0 +1,47 @@ +/* + * FDBAWSCredentialsProvider.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if (!defined FDB_AWS_CREDENTIALS_PROVIDER_H) && (defined BUILD_AWS_BACKUP) +#define FDB_AWS_CREDENTIALS_PROVIDER_H +#pragma once + +#include "aws/core/Aws.h" +#include "aws/core/auth/AWSCredentialsProviderChain.h" + +// Singleton +namespace FDBAWSCredentialsProvider { +bool doneInit = false; + +// You're supposed to call AWS::ShutdownAPI(options); once done +// But we want this to live for the lifetime of the process, so we don't do that +static Aws::Auth::AWSCredentials getAwsCredentials() { + if (!doneInit) { + doneInit = true; + Aws::SDKOptions options; + Aws::InitAPI(options); + TraceEvent("AWSSDKInitSuccessful"); + } + Aws::Auth::DefaultAWSCredentialsProviderChain credProvider; + Aws::Auth::AWSCredentials creds = credProvider.GetAWSCredentials(); + return creds; +} +} // namespace FDBAWSCredentialsProvider + +#endif diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 9cb6155004b..e183542d89a 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -1360,7 +1360,12 @@ struct ReadBlobGranuleContext { void* userContext; // Returns a unique id for the load. Asynchronous to support queueing multiple in parallel. - int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context); + int64_t (*start_load_f)(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* context); // Returns data for the load. Pass the loadId returned by start_load_f uint8_t* (*get_load_f)(int64_t loadId, void* context); @@ -1371,6 +1376,9 @@ struct ReadBlobGranuleContext { // Set this to true for testing if you don't want to read the granule files, // just do the request to the blob workers bool debugNoMaterialize; + + // number of granules to load in parallel (default 1) + int granuleParallelism = 1; }; // Store metadata associated with each storage server. Now it only contains data be used in perpetual storage wiggle. 
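For illustration, here is a toy, blocking implementation of the start_load_f / get_load_f callbacks declared in ReadBlobGranuleContext above, reading granule files straight from the local filesystem. The names, the error handling, and the free callback's (loadId, context) signature are assumptions for the sketch; a real client would start the read asynchronously and only hand back the bytes once the load completes.

```cpp
// Toy ReadBlobGranuleContext callbacks (illustrative, not FDB code): loads are
// kept in a map keyed by load id, and the pointer returned by the get callback
// stays valid until the free callback erases the entry.
#include <cstdint>
#include <fstream>
#include <map>
#include <string>
#include <vector>

struct ToyLoaderState {
    int64_t nextId = 0;
    std::map<int64_t, std::vector<uint8_t>> loads;
};

int64_t toyStartLoad(const char* filename, int filenameLength, int64_t offset, int64_t length,
                     int64_t /*fullFileLength*/, void* context) {
    auto* state = static_cast<ToyLoaderState*>(context);
    // Blocking read of [offset, offset + length) from the named file; no error handling for brevity.
    std::ifstream in(std::string(filename, filenameLength), std::ios::binary);
    std::vector<uint8_t> data(static_cast<size_t>(length));
    in.seekg(offset);
    in.read(reinterpret_cast<char*>(data.data()), length);
    int64_t id = state->nextId++;
    state->loads[id] = std::move(data);
    return id;
}

uint8_t* toyGetLoad(int64_t loadId, void* context) {
    auto* state = static_cast<ToyLoaderState*>(context);
    auto it = state->loads.find(loadId);
    // Returning nullptr signals a failed load to the caller.
    return it == state->loads.end() ? nullptr : it->second.data();
}

void toyFreeLoad(int64_t loadId, void* context) { // signature assumed to mirror get_load_f
    static_cast<ToyLoaderState*>(context)->loads.erase(loadId);
}
```

Note that granuleParallelism only pays off if start_load_f actually starts the read asynchronously; a blocking implementation like this one serializes the loads regardless of the setting.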
diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index fc1dc558c7f..b451747f08a 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -4363,13 +4363,14 @@ class FileBackupAgentImpl { Key backupTag, Standalone> backupRanges, Key bcUrl, + Optional proxy, Version targetVersion, LockDB lockDB, UID randomUID, Key addPrefix, Key removePrefix) { // Sanity check backup is valid - state Reference bc = IBackupContainer::openContainer(bcUrl.toString()); + state Reference bc = IBackupContainer::openContainer(bcUrl.toString(), proxy, {}); state BackupDescription desc = wait(bc->describeBackup()); wait(desc.resolveVersionTimes(cx)); @@ -4430,6 +4431,7 @@ class FileBackupAgentImpl { struct RestoreRequest restoreRequest(restoreIndex, restoreTag, bcUrl, + proxy, targetVersion, range, deterministicRandom()->randomUniqueID(), @@ -4510,6 +4512,7 @@ class FileBackupAgentImpl { ACTOR static Future submitBackup(FileBackupAgent* backupAgent, Reference tr, Key outContainer, + Optional proxy, int initialSnapshotIntervalSeconds, int snapshotIntervalSeconds, std::string tagName, @@ -4555,7 +4558,8 @@ class FileBackupAgentImpl { backupContainer = joinPath(backupContainer, std::string("backup-") + nowStr.toString()); } - state Reference bc = IBackupContainer::openContainer(backupContainer, encryptionKeyFileName); + state Reference bc = + IBackupContainer::openContainer(backupContainer, proxy, encryptionKeyFileName); try { wait(timeoutError(bc->create(), 30)); } catch (Error& e) { @@ -4642,6 +4646,7 @@ class FileBackupAgentImpl { Reference tr, Key tagName, Key backupURL, + Optional proxy, Standalone> ranges, Version restoreVersion, Key addPrefix, @@ -4710,7 +4715,7 @@ class FileBackupAgentImpl { // Point the tag to the new uid tag.set(tr, { uid, false }); - Reference bc = IBackupContainer::openContainer(backupURL.toString()); + Reference bc = IBackupContainer::openContainer(backupURL.toString(), proxy, {}); // Configure the new restore restore.tag().set(tr, tagName.toString()); @@ -5303,6 +5308,7 @@ class FileBackupAgentImpl { Optional cxOrig, Key tagName, Key url, + Optional proxy, Standalone> ranges, WaitForComplete waitForComplete, Version targetVersion, @@ -5320,7 +5326,7 @@ class FileBackupAgentImpl { throw restore_error(); } - state Reference bc = IBackupContainer::openContainer(url.toString()); + state Reference bc = IBackupContainer::openContainer(url.toString(), proxy, {}); state BackupDescription desc = wait(bc->describeBackup(true)); if (cxOrig.present()) { @@ -5360,6 +5366,7 @@ class FileBackupAgentImpl { tr, tagName, url, + proxy, ranges, targetVersion, addPrefix, @@ -5499,6 +5506,7 @@ class FileBackupAgentImpl { tagName, ranges, KeyRef(bc->getURL()), + bc->getProxy(), targetVersion, LockDB::True, randomUid, @@ -5520,6 +5528,7 @@ class FileBackupAgentImpl { cx, tagName, KeyRef(bc->getURL()), + bc->getProxy(), ranges, WaitForComplete::True, ::invalidVersion, @@ -5561,13 +5570,14 @@ Future FileBackupAgent::submitParallelRestore(Database cx, Key backupTag, Standalone> backupRanges, Key bcUrl, + Optional proxy, Version targetVersion, LockDB lockDB, UID randomUID, Key addPrefix, Key removePrefix) { return FileBackupAgentImpl::submitParallelRestore( - cx, backupTag, backupRanges, bcUrl, targetVersion, lockDB, randomUID, addPrefix, removePrefix); + cx, backupTag, backupRanges, bcUrl, proxy, targetVersion, lockDB, randomUID, addPrefix, removePrefix); } Future FileBackupAgent::atomicParallelRestore(Database cx, @@ -5582,6 +5592,7 @@ Future 
FileBackupAgent::restore(Database cx, Optional cxOrig, Key tagName, Key url, + Optional proxy, Standalone> ranges, WaitForComplete waitForComplete, Version targetVersion, @@ -5598,6 +5609,7 @@ Future FileBackupAgent::restore(Database cx, cxOrig, tagName, url, + proxy, ranges, waitForComplete, targetVersion, @@ -5639,6 +5651,7 @@ Future FileBackupAgent::waitRestore(Database cx, Key tagName, Ver Future FileBackupAgent::submitBackup(Reference tr, Key outContainer, + Optional proxy, int initialSnapshotIntervalSeconds, int snapshotIntervalSeconds, std::string const& tagName, @@ -5650,6 +5663,7 @@ Future FileBackupAgent::submitBackup(Reference return FileBackupAgentImpl::submitBackup(this, tr, outContainer, + proxy, initialSnapshotIntervalSeconds, snapshotIntervalSeconds, tagName, diff --git a/fdbclient/JSONDoc.h b/fdbclient/JSONDoc.h index 2fdeb7ba66c..39a1b388ee0 100644 --- a/fdbclient/JSONDoc.h +++ b/fdbclient/JSONDoc.h @@ -22,6 +22,7 @@ #include "fdbclient/json_spirit/json_spirit_writer_template.h" #include "fdbclient/json_spirit/json_spirit_reader_template.h" +#include "flow/Error.h" // JSONDoc is a convenient reader/writer class for manipulating JSON documents using "paths". // Access is done using a "path", which is a string of dot-separated diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 509953599bc..3440822ec2e 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -169,7 +169,7 @@ void ClusterConnectionString::resolveHostnamesBlocking() { } void ClusterConnectionString::resetToUnresolved() { - if (hostnames.size() > 0) { + if (status == RESOLVED && hostnames.size() > 0) { coords.clear(); hostnames.clear(); networkAddressToHostname.clear(); @@ -558,8 +558,8 @@ ACTOR Future monitorNominee(Key key, .detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname") .detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString()); if (rep.getError().code() == error_code_request_maybe_delivered) { - // 50 milliseconds delay to prevent tight resolving loop due to outdated DNS cache - wait(delay(0.05)); + // Delay to prevent tight resolving loop due to outdated DNS cache + wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY)); throw coordinators_changed(); } else { throw rep.getError(); @@ -589,7 +589,6 @@ ACTOR Future monitorNominee(Key key, if (li.present() && li.get().forward) wait(Future(Never())); - wait(Future(Void())); } } } diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 704654e2e53..484cf5cb8e1 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -282,8 +282,9 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key context.get_load_f = granuleContext.get_load_f; context.free_load_f = granuleContext.free_load_f; context.debugNoMaterialize = granuleContext.debugNoMaterialize; + context.granuleParallelism = granuleContext.granuleParallelism; - int64_t rv = readVersion.present() ? readVersion.get() : invalidVersion; + int64_t rv = readVersion.present() ? 
readVersion.get() : latestVersion; FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr, keyRange.begin.begin(), diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index bb12dfa8abb..5377bf56465 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -95,8 +95,12 @@ struct FdbCApi : public ThreadSafeReferenceCounted { void* userContext; // Returns a unique id for the load. Asynchronous to support queueing multiple in parallel. - int64_t ( - *start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context); + int64_t (*start_load_f)(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* context); // Returns data for the load. Pass the loadId returned by start_load_f uint8_t* (*get_load_f)(int64_t loadId, void* context); @@ -107,6 +111,9 @@ struct FdbCApi : public ThreadSafeReferenceCounted { // set this to true for testing if you don't want to read the granule files, just // do the request to the blob workers fdb_bool_t debugNoMaterialize; + + // number of granules to load in parallel (default 1) + int granuleParallelism; } FDBReadBlobGranuleContext; typedef void (*FDBCallback)(FDBFuture* future, void* callback_parameter); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1e0556a8aff..71b5c2e4f2f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -559,6 +559,14 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { .detail("MedianBytesPerCommit", cx->bytesPerCommit.median()) .detail("MaxBytesPerCommit", cx->bytesPerCommit.max()) .detail("NumLocalityCacheEntries", cx->locationCache.size()); + if (cx->anyBlobGranuleRequests) { + ev.detail("MeanBGLatency", cx->bgLatencies.mean()) + .detail("MedianBGLatency", cx->bgLatencies.median()) + .detail("MaxBGLatency", cx->bgLatencies.max()) + .detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean()) + .detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median()) + .detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max()); + } } cx->latencies.clear(); @@ -567,6 +575,8 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { cx->commitLatencies.clear(); cx->mutationsPerCommit.clear(); cx->bytesPerCommit.clear(); + cx->bgLatencies.clear(); + cx->bgGranulesPerRequest.clear(); lastLogged = now(); } @@ -1379,11 +1389,12 @@ DatabaseContext::DatabaseContext(ReferenceSHARD_STAT_SMOOTH_AMOUNT), + bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), + lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), + lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), + clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0), + healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), + smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { dbId = deterministicRandom()->randomUniqueID(); @@ -1645,7 +1656,8 @@ DatabaseContext::DatabaseContext(const Error& err) transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), transactionGrvFullBatches("NumGrvFullBatches", cc), 
transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), - bytesPerCommit(1000), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), + bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false), + smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {} // Static constructor used by server processes to create a DatabaseContext @@ -7373,6 +7385,7 @@ ACTOR Future>> readBlobGranulesActor( state Version rv; state Standalone> results; + state double startTime = now(); if (read.present()) { rv = read.get(); @@ -7393,6 +7406,7 @@ ACTOR Future>> readBlobGranulesActor( fmt::print( "BG Mapping for [{0} - %{1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable()); } + TraceEvent(SevWarn, "BGMappingTooLarge").detail("Range", range).detail("Max", 1000); throw unsupported_operation(); } ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() < CLIENT_KNOBS->TOO_MANY); @@ -7475,6 +7489,7 @@ ACTOR Future>> readBlobGranulesActor( req.keyRange = KeyRangeRef(StringRef(req.arena, granuleStartKey), StringRef(req.arena, granuleEndKey)); req.beginVersion = begin; req.readVersion = rv; + req.canCollapseBegin = true; // TODO make this a parameter once we support it std::vector>> v; v.push_back( @@ -7547,6 +7562,11 @@ ACTOR Future>> readBlobGranulesActor( throw e; } } + + self->trState->cx->anyBlobGranuleRequests = true; + self->trState->cx->bgGranulesPerRequest.addSample(results.size()); + self->trState->cx->bgLatencies.addSample(now() - startTime); + if (readVersionOut != nullptr) { *readVersionOut = rv; } @@ -8735,11 +8755,24 @@ ACTOR Future singleChangeFeedStreamInternal(KeyRange range, results->lastReturnedVersion.set(feedReply.mutations.back().version); } - if (refresh.canBeSet() && !atLatest && feedReply.atLatestVersion) { + if (!refresh.canBeSet()) { + try { + // refresh is set if and only if this actor is cancelled + wait(Future(Void())); + // Catch any unexpected behavior if the above contract is broken + ASSERT(false); + } catch (Error& e) { + ASSERT(e.code() == error_code_actor_cancelled); + throw; + } + } + + if (!atLatest && feedReply.atLatestVersion) { atLatest = true; results->notAtLatest.set(0); } - if (refresh.canBeSet() && feedReply.minStreamVersion > results->storageData[0]->version.get()) { + + if (feedReply.minStreamVersion > results->storageData[0]->version.get()) { results->storageData[0]->version.set(feedReply.minStreamVersion); } } diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index c7b96291961..a16034963bf 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1791,8 +1791,6 @@ Future>> ReadYourWritesTransaction::re Version begin, Optional readVersion, Version* readVersionOut) { - // Remove in V2 of API - ASSERT(begin == 0); if (!options.readYourWritesDisabled) { return blob_granule_no_ryw(); diff --git a/fdbclient/RestoreInterface.h b/fdbclient/RestoreInterface.h index bdb2499298a..b7f3b04bcc4 100644 --- a/fdbclient/RestoreInterface.h +++ b/fdbclient/RestoreInterface.h @@ -49,6 +49,7 @@ struct RestoreRequest { int index; Key tagName; Key url; + Optional proxy; Version targetVersion; KeyRange range; UID randomUid; @@ -64,27 +65,29 @@ struct RestoreRequest { explicit RestoreRequest(const int index, const Key& 
tagName, const Key& url, + const Optional& proxy, Version targetVersion, const KeyRange& range, const UID& randomUid, Key& addPrefix, Key removePrefix) - : index(index), tagName(tagName), url(url), targetVersion(targetVersion), range(range), randomUid(randomUid), - addPrefix(addPrefix), removePrefix(removePrefix) {} + : index(index), tagName(tagName), url(url), proxy(proxy), targetVersion(targetVersion), range(range), + randomUid(randomUid), addPrefix(addPrefix), removePrefix(removePrefix) {} // To change this serialization, ProtocolVersion::RestoreRequestValue must be updated, and downgrades need to be // considered template void serialize(Ar& ar) { - serializer(ar, index, tagName, url, targetVersion, range, randomUid, addPrefix, removePrefix, reply); + serializer(ar, index, tagName, url, proxy, targetVersion, range, randomUid, addPrefix, removePrefix, reply); } std::string toString() const { std::stringstream ss; ss << "index:" << std::to_string(index) << " tagName:" << tagName.contents().toString() - << " url:" << url.contents().toString() << " targetVersion:" << std::to_string(targetVersion) - << " range:" << range.toString() << " randomUid:" << randomUid.toString() - << " addPrefix:" << addPrefix.toString() << " removePrefix:" << removePrefix.toString(); + << " url:" << url.contents().toString() << " proxy:" << (proxy.present() ? proxy.get() : "") + << " targetVersion:" << std::to_string(targetVersion) << " range:" << range.toString() + << " randomUid:" << randomUid.toString() << " addPrefix:" << addPrefix.toString() + << " removePrefix:" << removePrefix.toString(); return ss.str(); } }; diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp index a9bd360cde2..edfc1d1bc07 100644 --- a/fdbclient/S3BlobStore.actor.cpp +++ b/fdbclient/S3BlobStore.actor.cpp @@ -34,6 +34,8 @@ #include "fdbrpc/IAsyncFile.h" #include "flow/UnitTest.h" #include "fdbclient/rapidxml/rapidxml.hpp" +#include "fdbclient/FDBAWSCredentialsProvider.h" + #include "flow/actorcompiler.h" // has to be last include using namespace rapidxml; @@ -82,6 +84,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() { read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE; max_send_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_SEND_BYTES_PER_SECOND; max_recv_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_RECV_BYTES_PER_SECOND; + sdk_auth = false; } bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) { @@ -118,6 +121,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) { TRY_PARAM(read_cache_blocks_per_file, rcb); TRY_PARAM(max_send_bytes_per_second, sbps); TRY_PARAM(max_recv_bytes_per_second, rbps); + TRY_PARAM(sdk_auth, sa); #undef TRY_PARAM return false; } @@ -158,7 +162,8 @@ std::string S3BlobStoreEndpoint::BlobKnobs::getURLParameters() const { return r; } -Reference S3BlobStoreEndpoint::fromString(std::string const& url, +Reference S3BlobStoreEndpoint::fromString(const std::string& url, + const Optional& proxy, std::string* resourceFromURL, std::string* error, ParametersT* ignored_parameters) { @@ -171,6 +176,17 @@ Reference S3BlobStoreEndpoint::fromString(std::string const if (prefix != LiteralStringRef("blobstore")) throw format("Invalid blobstore URL prefix '%s'", prefix.toString().c_str()); + Optional proxyHost, proxyPort; + if (proxy.present()) { + if (!Hostname::isHostname(proxy.get()) && !NetworkAddress::parseOptional(proxy.get()).present()) { + throw format("'%s' is not a valid value for proxy. 
Format should be either IP:port or host:port.", + proxy.get().c_str()); + } + StringRef p(proxy.get()); + proxyHost = p.eat(":").toString(); + proxyPort = p.eat().toString(); + } + Optional cred; if (url.find("@") != std::string::npos) { cred = t.eat("@"); @@ -257,7 +273,8 @@ Reference S3BlobStoreEndpoint::fromString(std::string const creds = S3BlobStoreEndpoint::Credentials{ key.toString(), secret.toString(), securityToken.toString() }; } - return makeReference(host.toString(), service.toString(), creds, knobs, extraHeaders); + return makeReference( + host.toString(), service.toString(), proxyHost, proxyPort, creds, knobs, extraHeaders); } catch (std::string& err) { if (error != nullptr) @@ -506,7 +523,38 @@ ACTOR Future> tryReadJSONFile(std::string path) { return Optional(); } +// If the credentials expire, the connection will eventually fail and be discarded from the pool, and then a new +// connection will be constructed, which will call this again to get updated credentials +static S3BlobStoreEndpoint::Credentials getSecretSdk() { +#ifdef BUILD_AWS_BACKUP + double elapsed = -timer_monotonic(); + Aws::Auth::AWSCredentials awsCreds = FDBAWSCredentialsProvider::getAwsCredentials(); + elapsed += timer_monotonic(); + + if (awsCreds.IsEmpty()) { + TraceEvent(SevWarn, "S3BlobStoreAWSCredsEmpty"); + throw backup_auth_missing(); + } + + S3BlobStoreEndpoint::Credentials fdbCreds; + fdbCreds.key = awsCreds.GetAWSAccessKeyId(); + fdbCreds.secret = awsCreds.GetAWSSecretKey(); + fdbCreds.securityToken = awsCreds.GetSessionToken(); + + TraceEvent("S3BlobStoreGotSdkCredentials").suppressFor(60).detail("Duration", elapsed); + + return fdbCreds; +#else + TraceEvent(SevError, "S3BlobStoreNoSDK"); + throw backup_auth_missing(); +#endif +} + ACTOR Future updateSecret_impl(Reference b) { + if (b->knobs.sdk_auth) { + b->credentials = getSecretSdk(); + return Void(); + } std::vector* pFiles = (std::vector*)g_network->global(INetwork::enBlobCredentialFiles); if (pFiles == nullptr) return Void(); @@ -538,7 +586,7 @@ ACTOR Future updateSecret_impl(Reference b) { JSONDoc accounts(doc.last().get_obj()); if (accounts.has(credentialsFileKey, false) && accounts.last().type() == json_spirit::obj_type) { JSONDoc account(accounts.last()); - S3BlobStoreEndpoint::Credentials creds; + S3BlobStoreEndpoint::Credentials creds = b->credentials.get(); if (b->lookupKey) { std::string apiKey; if (account.tryGet("api_key", apiKey)) @@ -589,11 +637,11 @@ ACTOR Future connect_impl(Referenceservice; + std::string host = b->host, service = b->service; if (service.empty()) service = b->knobs.secure_connection ? "https" : "http"; state Reference conn = - wait(INetworkConnections::net()->connect(b->host, service, b->knobs.secure_connection ? true : false)); + wait(INetworkConnections::net()->connect(host, service, b->knobs.secure_connection ? 
true : false)); wait(conn->connectHandshake()); TraceEvent("S3BlobStoreEndpointNewConnection") @@ -601,7 +649,7 @@ ACTOR Future connect_impl(ReferencegetPeerAddress()) .detail("ExpiresIn", b->knobs.max_connection_life); - if (b->lookupKey || b->lookupSecret) + if (b->lookupKey || b->lookupSecret || b->knobs.sdk_auth) wait(b->updateSecret()); return S3BlobStoreEndpoint::ReusableConnection({ conn, now() + b->knobs.max_connection_life }); @@ -1574,7 +1622,7 @@ TEST_CASE("/backup/s3/v4headers") { S3BlobStoreEndpoint::Credentials creds{ "AKIAIOSFODNN7EXAMPLE", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "" } // GET without query parameters { - S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", creds); + S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", "proxy", "port", creds); std::string verb("GET"); std::string resource("/test.txt"); HTTP::Headers headers; @@ -1589,7 +1637,7 @@ TEST_CASE("/backup/s3/v4headers") { // GET with query parameters { - S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", creds); + S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", "proxy", "port", creds); std::string verb("GET"); std::string resource("/test/examplebucket?Action=DescribeRegions&Version=2013-10-15"); HTTP::Headers headers; @@ -1604,7 +1652,7 @@ TEST_CASE("/backup/s3/v4headers") { // POST { - S3BlobStoreEndpoint s3("s3.us-west-2.amazonaws.com", "s3", creds); + S3BlobStoreEndpoint s3("s3.us-west-2.amazonaws.com", "s3", "proxy", "port", creds); std::string verb("POST"); std::string resource("/simple.json"); HTTP::Headers headers; diff --git a/fdbclient/S3BlobStore.h b/fdbclient/S3BlobStore.h index c259d6a4da2..bd29675bae9 100644 --- a/fdbclient/S3BlobStore.h +++ b/fdbclient/S3BlobStore.h @@ -59,7 +59,7 @@ class S3BlobStoreEndpoint : public ReferenceCounted { delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests, concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file, read_block_size, read_ahead_blocks, read_cache_blocks_per_file, max_send_bytes_per_second, - max_recv_bytes_per_second; + max_recv_bytes_per_second, sdk_auth; bool set(StringRef name, int value); std::string getURLParameters() const; static std::vector getKnobDescriptions() { @@ -91,17 +91,23 @@ class S3BlobStoreEndpoint : public ReferenceCounted { "read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.", "max_send_bytes_per_second (or sbps) Max send bytes per second for all requests combined.", "max_recv_bytes_per_second (or rbps) Max receive bytes per second for all requests combined (NOT YET " - "USED)." + "USED).", + "sdk_auth (or sa) Use AWS SDK to resolve credentials. Only valid if " + "BUILD_AWS_BACKUP is enabled." 
}; } }; S3BlobStoreEndpoint(std::string const& host, - std::string service, + std::string const& service, + Optional const& proxyHost, + Optional const& proxyPort, Optional const& creds, BlobKnobs const& knobs = BlobKnobs(), HTTP::Headers extraHeaders = HTTP::Headers()) - : host(host), service(service), credentials(creds), lookupKey(creds.present() && creds.get().key.empty()), + : host(host), service(service), proxyHost(proxyHost), proxyPort(proxyPort), + useProxy(proxyHost.present() && proxyPort.present()), credentials(creds), + lookupKey(creds.present() && creds.get().key.empty()), lookupSecret(creds.present() && creds.get().secret.empty()), knobs(knobs), extraHeaders(extraHeaders), requestRate(new SpeedLimit(knobs.requests_per_second, 1)), requestRateList(new SpeedLimit(knobs.list_requests_per_second, 1)), @@ -112,7 +118,7 @@ class S3BlobStoreEndpoint : public ReferenceCounted { recvRate(new SpeedLimit(knobs.max_recv_bytes_per_second, 1)), concurrentRequests(knobs.concurrent_requests), concurrentUploads(knobs.concurrent_uploads), concurrentLists(knobs.concurrent_lists) { - if (host.empty()) + if (host.empty() || (proxyHost.present() != proxyPort.present())) throw connection_string_invalid(); } @@ -130,10 +136,11 @@ class S3BlobStoreEndpoint : public ReferenceCounted { // Parse url and return a S3BlobStoreEndpoint // If the url has parameters that S3BlobStoreEndpoint can't consume then an error will be thrown unless // ignored_parameters is given in which case the unconsumed parameters will be added to it. - static Reference fromString(std::string const& url, - std::string* resourceFromURL = nullptr, - std::string* error = nullptr, - ParametersT* ignored_parameters = nullptr); + static Reference fromString(const std::string& url, + const Optional& proxy, + std::string* resourceFromURL, + std::string* error, + ParametersT* ignored_parameters); // Get a normalized version of this URL with the given resource and any non-default BlobKnob values as URL // parameters in addition to the passed params string @@ -149,6 +156,10 @@ class S3BlobStoreEndpoint : public ReferenceCounted { std::string host; std::string service; + Optional proxyHost; + Optional proxyPort; + bool useProxy; + Optional credentials; bool lookupKey; bool lookupSecret; diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 0951498d524..f53efac7861 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -828,6 +828,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // encrypt key proxy init( ENABLE_ENCRYPTION, false ); + init( ENCRYPTION_MODE, "AES-256-CTR"); // Blob granlues init( BG_URL, isSimulated ? 
"file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index d120ed3986f..de69ef43dc7 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -775,8 +775,9 @@ class ServerKnobs : public KnobsImpl { // Cluster recovery std::string CLUSTER_RECOVERY_EVENT_NAME_PREFIX; - // encrypt key proxy + // Encryption bool ENABLE_ENCRYPTION; + std::string ENCRYPTION_MODE; // blob granule stuff // FIXME: configure url with database configuration instead of knob eventually diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 21797cbd678..fd18e94f20b 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -2704,16 +2704,23 @@ Future> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr } ACTOR Future getTenantList(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) { - KeyRangeRef tenantRange = - kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .removePrefix(TenantMapRangeImpl::submoduleRange.begin); state KeyRef managementPrefix = kr.begin.substr(0, SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.size() + TenantMapRangeImpl::submoduleRange.begin.size()); - std::map tenants = wait(ManagementAPI::listTenantsTransaction( - &ryw->getTransaction(), tenantRange.begin, tenantRange.end, limitsHint.rows)); + kr = kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); + TenantNameRef beginTenant = kr.begin.removePrefix(TenantMapRangeImpl::submoduleRange.begin); + + TenantNameRef endTenant = kr.end; + if (endTenant.startsWith(TenantMapRangeImpl::submoduleRange.begin)) { + endTenant = endTenant.removePrefix(TenantMapRangeImpl::submoduleRange.begin); + } else { + endTenant = "\xff"_sr; + } + + std::map tenants = + wait(ManagementAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, limitsHint.rows)); RangeResult results; for (auto tenant : tenants) { @@ -2783,7 +2790,7 @@ Future> TenantMapRangeImpl::commit(ReadYourWritesTransacti TenantNameRef endTenant = range.end().removePrefix( SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); if (endTenant.startsWith(submoduleRange.begin)) { - endTenant = endTenant.removePrefix(submoduleRange.end); + endTenant = endTenant.removePrefix(submoduleRange.begin); } else { endTenant = "\xff"_sr; } diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 6dce9351cb6..18c2d1044d7 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -924,7 +924,7 @@ struct OverlappingChangeFeedsReply { }; struct OverlappingChangeFeedsRequest { - constexpr static FileIdentifier file_identifier = 10726174; + constexpr static FileIdentifier file_identifier = 7228462; KeyRange range; Version minVersion; ReplyPromise reply; @@ -939,7 +939,7 @@ struct OverlappingChangeFeedsRequest { }; struct ChangeFeedVersionUpdateReply { - constexpr static FileIdentifier file_identifier = 11815134; + constexpr static FileIdentifier file_identifier = 4246160; Version version = 0; ChangeFeedVersionUpdateReply() {} diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 5c24441f324..42e0f83a268 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -1190,23 +1190,26 @@ const KeyRange blobGranuleFileKeyRangeFor(UID granuleID) { return KeyRangeRef(startKey, 
strinc(startKey)); } -const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length) { +const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule())); wr << filename; wr << offset; wr << length; + wr << fullFileLength; return wr.toValue(); } -std::tuple, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value) { +std::tuple, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value) { StringRef filename; int64_t offset; int64_t length; + int64_t fullFileLength; BinaryReader reader(value, IncludeVersion()); reader >> filename; reader >> offset; reader >> length; - return std::tuple(filename, offset, length); + reader >> fullFileLength; + return std::tuple(filename, offset, length, fullFileLength); } const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force) { diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 171130559ed..fcbc20bf97a 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -572,8 +572,8 @@ const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t file std::tuple decodeBlobGranuleFileKey(KeyRef const& key); const KeyRange blobGranuleFileKeyRangeFor(UID granuleID); -const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length); -std::tuple, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value); +const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength); +std::tuple, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value); const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force); std::tuple decodeBlobGranulePruneValue(ValueRef const& value); diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index e72439ec306..f7142623c59 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -329,9 +329,6 @@ ThreadResult ThreadSafeTransaction::readBlobGranules(const KeyRange Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) { - // In V1 of api this is required, field is just for forward compatibility - ASSERT(beginVersion == 0); - // FIXME: prevent from calling this from another main thread! ISingleThreadTransaction* tr = this->tr; diff --git a/fdbclient/WellKnownEndpoints.h b/fdbclient/WellKnownEndpoints.h index 5db3f34cead..bed5c349358 100644 --- a/fdbclient/WellKnownEndpoints.h +++ b/fdbclient/WellKnownEndpoints.h @@ -52,4 +52,7 @@ enum WellKnownEndpoints { WLTOKEN_RESERVED_COUNT // 23 }; +static_assert(WLTOKEN_PROTOCOL_INFO == + 10); // Enforce that the value of this endpoint does not change per comment above. + #endif diff --git a/fdbmonitor/CMakeLists.txt b/fdbmonitor/CMakeLists.txt index 5d624e8191c..622b0ec5941 100644 --- a/fdbmonitor/CMakeLists.txt +++ b/fdbmonitor/CMakeLists.txt @@ -10,6 +10,16 @@ endif() # as soon as we get rid of the old build system target_link_libraries(fdbmonitor PUBLIC Threads::Threads) +# We don't compile fdbmonitor with thread sanitizer instrumentation, since this +# appears to change its behavior (it no longer seems to restart killed +# processes). fdbmonitor is single-threaded anyway. 
+get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS) +list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread") +set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${target_options}) +get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS) +list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread") +set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${target_options}) + if(GENERATE_DEBUG_PACKAGES) fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server) else() diff --git a/fdbrpc/AsyncFileCached.actor.cpp b/fdbrpc/AsyncFileCached.actor.cpp index 1ea7c36aa4d..93bf9cfc70f 100644 --- a/fdbrpc/AsyncFileCached.actor.cpp +++ b/fdbrpc/AsyncFileCached.actor.cpp @@ -29,14 +29,7 @@ static std::map, Referen EvictablePage::~EvictablePage() { if (data) { -#if defined(USE_JEMALLOC) - aligned_free(data); -#else - if (pageCache->pageSize == 4096) - FastAllocator<4096>::release(data); - else - aligned_free(data); -#endif + freeFast4kAligned(pageCache->pageSize, data); } if (EvictablePageCache::RANDOM == pageCache->cacheEvictionType) { if (index > -1) { @@ -173,14 +166,7 @@ void AsyncFileCached::releaseZeroCopy(void* data, int length, int64_t offset) { if (o != orphanedPages.end()) { if (o->second == 1) { if (data) { -#if defined(USE_JEMALLOC) - aligned_free(data); -#else - if (length == 4096) - FastAllocator<4096>::release(data); - else - aligned_free(data); -#endif + freeFast4kAligned(length, data); } } else { --o->second; diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index 8c986db9b52..38236615ea6 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -79,14 +79,9 @@ struct EvictablePageCache : ReferenceCounted { void allocate(EvictablePage* page) { try_evict(); try_evict(); -#if defined(USE_JEMALLOC) - page->data = aligned_alloc(4096, pageSize); -#else - page->data = pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageSize); -#endif - if (page->data == nullptr) { - platform::outOfMemory(); - } + + page->data = allocateFast4kAligned(pageSize); + if (RANDOM == cacheEvictionType) { page->index = pages.size(); pages.push_back(page); @@ -394,14 +389,7 @@ struct AFCPage : public EvictablePage, public FastAllocated { owner->orphanedPages[data] = zeroCopyRefCount; zeroCopyRefCount = 0; notReading = Void(); -#if defined(USE_JEMALLOC) - data = aligned_alloc(4096, pageCache->pageSize); -#else - data = pageCache->pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageCache->pageSize); -#endif - if (data == nullptr) { - platform::outOfMemory(); - } + data = allocateFast4kAligned(pageCache->pageSize); } Future write(void const* data, int length, int offset) { diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 464e473ae01..80f17b39717 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -37,12 +37,6 @@ enum ClogMode { ClogDefault, ClogAll, ClogSend, ClogReceive }; class ISimulator : public INetwork { public: - ISimulator() - : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1), - extraDB(nullptr), usableRegions(1), allowLogSetKills(true), tssMode(TSSMode::Disabled), isStopped(false), - lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), - backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType), allSwapsDisabled(false) {} - // Order matters! 
enum KillType { KillInstantly, @@ -393,7 +387,7 @@ class ISimulator : public INetwork { int listenersPerProcess; std::set protectedAddresses; std::map currentlyRebootingProcesses; - class ClusterConnectionString* extraDB; + std::unique_ptr extraDB; Reference storagePolicy; Reference tLogPolicy; int32_t tLogWriteAntiQuorum; @@ -456,6 +450,9 @@ class ISimulator : public INetwork { return false; } + ISimulator(); + virtual ~ISimulator(); + protected: Mutex mutex; diff --git a/fdbserver/BlobGranuleServerCommon.actor.cpp b/fdbserver/BlobGranuleServerCommon.actor.cpp index c47ed131998..35b8d2e22fd 100644 --- a/fdbserver/BlobGranuleServerCommon.actor.cpp +++ b/fdbserver/BlobGranuleServerCommon.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ +#include "contrib/fmt-8.1.1/include/fmt/format.h" #include "fdbclient/SystemData.h" #include "fdbclient/BlobGranuleCommon.h" #include "fdbserver/BlobGranuleServerCommon.actor.h" @@ -25,6 +26,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/ReadYourWrites.h" #include "flow/Arena.h" +#include "flow/UnitTest.h" #include "flow/actorcompiler.h" // has to be last include // Gets the latest granule history node for range that was persisted @@ -58,13 +60,14 @@ ACTOR Future readGranuleFiles(Transaction* tr, Key* startKey, Key endKey, Standalone filename; int64_t offset; int64_t length; + int64_t fullFileLength; std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(it.key); ASSERT(gid == granuleID); - std::tie(filename, offset, length) = decodeBlobGranuleFileValue(it.value); + std::tie(filename, offset, length, fullFileLength) = decodeBlobGranuleFileValue(it.value); - BlobFileIndex idx(version, filename.toString(), offset, length); + BlobFileIndex idx(version, filename.toString(), offset, length, fullFileLength); if (fileType == 'S') { ASSERT(files->snapshotFiles.empty() || files->snapshotFiles.back().version < idx.version); files->snapshotFiles.push_back(idx); @@ -102,3 +105,255 @@ ACTOR Future loadHistoryFiles(Database cx, UID granuleID) { } } } + +// Normally a beginVersion != 0 means the caller wants all mutations between beginVersion and readVersion, instead of +// the latest snapshot before readVersion + deltas after the snapshot. When canCollapse is set, the beginVersion is +// essentially just an optimization hint. The caller is still concerned with reconstructing rows at readVersion, it just +// knows it doesn't need anything before beginVersion. +// Normally this can eliminate the need for a snapshot and just return a small amount of deltas. But in a highly active +// key range, the granule may have a snapshot file at version X, where beginVersion < X <= readVersion. In this case, if +// the number of bytes in delta files between beginVersion and X is larger than the snapshot file at version X, it is +// strictly more efficient (in terms of files and bytes read) to just use the snapshot file at version X instead. 
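Before the implementation, a quick sketch of how a caller is expected to drive getFiles (this mirrors the blob worker read path further down in this patch; the surrounding variable names are illustrative, not part of the change):

BlobGranuleChunkRef chunk;
Arena replyArena;
int64_t deltaBytes = 0;
// Ask for whatever is needed to reconstruct rows at readVersion, allowing a collapse
// back to a snapshot if that is cheaper than returning every delta since beginVersion.
files.getFiles(beginVersion, readVersion, /*canCollapse=*/true, chunk, replyArena, deltaBytes);
if (chunk.snapshotFile.present()) {
    // Collapsed (or beginVersion == 0): apply the snapshot at chunk.snapshotVersion,
    // then the delta files in chunk.deltaFiles, up through readVersion.
} else {
    // Deltas only: chunk.snapshotVersion is invalidVersion and chunk.deltaFiles alone
    // cover the versions from beginVersion through readVersion.
}
// deltaBytes is what the worker read path adds to its readReqDeltaBytesReturned counter.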
+void GranuleFiles::getFiles(Version beginVersion, + Version readVersion, + bool canCollapse, + BlobGranuleChunkRef& chunk, + Arena& replyArena, + int64_t& deltaBytesCounter) const { + BlobFileIndex dummyIndex; // for searching + + // if beginVersion == 0 or we can collapse, find the latest snapshot <= readVersion + auto snapshotF = snapshotFiles.end(); + if (beginVersion == 0 || canCollapse) { + dummyIndex.version = readVersion; + snapshotF = std::lower_bound(snapshotFiles.begin(), snapshotFiles.end(), dummyIndex); + if (snapshotF == snapshotFiles.end() || snapshotF->version > readVersion) { + ASSERT(snapshotF != snapshotFiles.begin()); + snapshotF--; + } + ASSERT(snapshotF != snapshotFiles.end()); + ASSERT(snapshotF->version <= readVersion); + } + + auto deltaF = deltaFiles.end(); + if (beginVersion > 0) { + dummyIndex.version = beginVersion; + deltaF = std::lower_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex); + if (canCollapse) { + ASSERT(snapshotF != snapshotFiles.end()); + // If we can collapse, see if delta files up to snapshotVersion are smaller or larger than snapshotBytes in + // total + auto deltaFCopy = deltaF; + int64_t snapshotBytes = snapshotF->length; + while (deltaFCopy != deltaFiles.end() && deltaFCopy->version <= snapshotF->version && snapshotBytes > 0) { + snapshotBytes -= deltaFCopy->length; + deltaFCopy++; + } + // if delta files contain the same or more bytes as the snapshot with collapse, do the collapse + if (snapshotBytes > 0) { + // don't collapse, clear snapshotF and just do delta files + snapshotF = snapshotFiles.end(); + } else { + // do snapshot instead of previous deltas + dummyIndex.version = snapshotF->version; + deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex); + ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version); + } + } + } else { + dummyIndex.version = snapshotF->version; + deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex); + ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version); + } + + Version lastIncluded = invalidVersion; + if (snapshotF != snapshotFiles.end()) { + chunk.snapshotVersion = snapshotF->version; + chunk.snapshotFile = BlobFilePointerRef( + replyArena, snapshotF->filename, snapshotF->offset, snapshotF->length, snapshotF->fullFileLength); + lastIncluded = chunk.snapshotVersion; + } else { + chunk.snapshotVersion = invalidVersion; + } + + while (deltaF != deltaFiles.end() && deltaF->version < readVersion) { + chunk.deltaFiles.emplace_back_deep( + replyArena, deltaF->filename, deltaF->offset, deltaF->length, deltaF->fullFileLength); + deltaBytesCounter += deltaF->length; + ASSERT(lastIncluded < deltaF->version); + lastIncluded = deltaF->version; + deltaF++; + } + // include last delta file that passes readVersion, if it exists + if (deltaF != deltaFiles.end() && lastIncluded < readVersion) { + chunk.deltaFiles.emplace_back_deep( + replyArena, deltaF->filename, deltaF->offset, deltaF->length, deltaF->fullFileLength); + deltaBytesCounter += deltaF->length; + lastIncluded = deltaF->version; + } +} + +static std::string makeTestFileName(Version v) { + return "test" + std::to_string(v); +} + +static BlobFileIndex makeTestFile(Version v, int64_t len) { + return BlobFileIndex(v, makeTestFileName(v), 0, len, len); +} + +static void checkFile(int expectedVersion, const BlobFilePointerRef& actualFile) { + ASSERT(makeTestFileName(expectedVersion) == actualFile.filename.toString()); +} + +static void checkFiles(const GranuleFiles& f, + Version 
beginVersion, + Version readVersion, + bool canCollapse, + Optional expectedSnapshotVersion, + std::vector expectedDeltaVersions) { + Arena a; + BlobGranuleChunkRef chunk; + int64_t deltaBytes = 0; + f.getFiles(beginVersion, readVersion, canCollapse, chunk, a, deltaBytes); + fmt::print("results({0}, {1}, {2}):\nEXPECTED:\n snapshot={3}\n deltas ({4}):\n", + beginVersion, + readVersion, + canCollapse ? "T" : "F", + expectedSnapshotVersion.present() ? makeTestFileName(expectedSnapshotVersion.get()).c_str() : "", + expectedDeltaVersions.size()); + for (int d : expectedDeltaVersions) { + fmt::print(" {}\n", makeTestFileName(d)); + } + fmt::print("ACTUAL:\n snapshot={0}\n deltas ({1}):\n", + chunk.snapshotFile.present() ? chunk.snapshotFile.get().filename.toString().c_str() : "", + chunk.deltaFiles.size()); + for (auto& it : chunk.deltaFiles) { + fmt::print(" {}\n", it.filename.toString()); + } + printf("\n\n\n"); + ASSERT(expectedSnapshotVersion.present() == chunk.snapshotFile.present()); + if (expectedSnapshotVersion.present()) { + checkFile(expectedSnapshotVersion.get(), chunk.snapshotFile.get()); + } + ASSERT(expectedDeltaVersions.size() == chunk.deltaFiles.size()); + for (int i = 0; i < expectedDeltaVersions.size(); i++) { + checkFile(expectedDeltaVersions[i], chunk.deltaFiles[i]); + } +} + +/* + * Files: + * S @ 100 (10 bytes) + * D @ 150 (5 bytes) + * D @ 200 (6 bytes) + * S @ 200 (15 bytes) + * D @ 250 (7 bytes) + * D @ 300 (8 bytes) + * S @ 300 (10 bytes) + * D @ 350 (4 bytes) + */ +TEST_CASE("/blobgranule/server/common/granulefiles") { + // simple cases first + + // single snapshot file, no deltas + GranuleFiles files; + files.snapshotFiles.push_back(makeTestFile(100, 10)); + + printf("Just snapshot\n"); + + checkFiles(files, 0, 100, false, 100, {}); + checkFiles(files, 0, 200, false, 100, {}); + + printf("Small test\n"); + // add delta files with re-snapshot at end + files.deltaFiles.push_back(makeTestFile(150, 5)); + files.deltaFiles.push_back(makeTestFile(200, 6)); + files.snapshotFiles.push_back(makeTestFile(200, 15)); + + // check different read versions with beginVersion=0 + checkFiles(files, 0, 100, false, 100, {}); + checkFiles(files, 0, 101, false, 100, { 150 }); + checkFiles(files, 0, 149, false, 100, { 150 }); + checkFiles(files, 0, 150, false, 100, { 150 }); + checkFiles(files, 0, 151, false, 100, { 150, 200 }); + checkFiles(files, 0, 199, false, 100, { 150, 200 }); + checkFiles(files, 0, 200, false, 200, {}); + checkFiles(files, 0, 300, false, 200, {}); + + // Test all cases of beginVersion + readVersion. 
Because delta files are smaller than snapshot at 200, this should + // be the same with and without collapse + checkFiles(files, 100, 200, false, Optional(), { 150, 200 }); + checkFiles(files, 100, 300, false, Optional(), { 150, 200 }); + checkFiles(files, 101, 199, false, Optional(), { 150, 200 }); + checkFiles(files, 149, 151, false, Optional(), { 150, 200 }); + checkFiles(files, 149, 150, false, Optional(), { 150 }); + checkFiles(files, 150, 151, false, Optional(), { 150, 200 }); + checkFiles(files, 151, 200, false, Optional(), { 200 }); + + checkFiles(files, 100, 200, true, Optional(), { 150, 200 }); + checkFiles(files, 100, 300, true, Optional(), { 150, 200 }); + checkFiles(files, 101, 199, true, Optional(), { 150, 200 }); + checkFiles(files, 149, 151, true, Optional(), { 150, 200 }); + checkFiles(files, 149, 150, true, Optional(), { 150 }); + checkFiles(files, 150, 151, true, Optional(), { 150, 200 }); + checkFiles(files, 151, 200, true, Optional(), { 200 }); + + printf("Larger test\n"); + // add more delta files and snapshots to check collapse logic + files.deltaFiles.push_back(makeTestFile(250, 7)); + files.deltaFiles.push_back(makeTestFile(300, 8)); + files.snapshotFiles.push_back(makeTestFile(300, 10)); + files.deltaFiles.push_back(makeTestFile(350, 4)); + + checkFiles(files, 0, 300, false, 300, {}); + checkFiles(files, 0, 301, false, 300, { 350 }); + checkFiles(files, 0, 400, false, 300, { 350 }); + + // check delta files without collapse + + checkFiles(files, 100, 301, false, Optional(), { 150, 200, 250, 300, 350 }); + checkFiles(files, 100, 300, false, Optional(), { 150, 200, 250, 300 }); + checkFiles(files, 100, 251, false, Optional(), { 150, 200, 250, 300 }); + checkFiles(files, 100, 250, false, Optional(), { 150, 200, 250 }); + + checkFiles(files, 151, 300, false, Optional(), { 200, 250, 300 }); + checkFiles(files, 151, 301, false, Optional(), { 200, 250, 300, 350 }); + checkFiles(files, 151, 400, false, Optional(), { 200, 250, 300, 350 }); + + checkFiles(files, 201, 300, false, Optional(), { 250, 300 }); + checkFiles(files, 201, 301, false, Optional(), { 250, 300, 350 }); + checkFiles(files, 201, 400, false, Optional(), { 250, 300, 350 }); + + checkFiles(files, 251, 300, false, Optional(), { 300 }); + checkFiles(files, 251, 301, false, Optional(), { 300, 350 }); + checkFiles(files, 251, 400, false, Optional(), { 300, 350 }); + checkFiles(files, 301, 400, false, Optional(), { 350 }); + checkFiles(files, 351, 400, false, Optional(), {}); + + // check with collapse + // these 2 collapse because the delta files at 150+200+250+300 are larger than the snapshot at 300 + checkFiles(files, 100, 301, true, 300, { 350 }); + checkFiles(files, 100, 300, true, 300, {}); + // these 2 don't collapse because 150+200 delta files are smaller than the snapshot at 200 + checkFiles(files, 100, 251, true, Optional(), { 150, 200, 250, 300 }); + checkFiles(files, 100, 250, true, Optional(), { 150, 200, 250 }); + + // these 3 do collapse because the delta files at 200+250+300 are larger than the snapshot at 300 + checkFiles(files, 151, 300, true, 300, {}); + checkFiles(files, 151, 301, true, 300, { 350 }); + checkFiles(files, 151, 400, true, 300, { 350 }); + + // these 3 do collapse because the delta files at 250+300 are larger than the snapshot at 300 + checkFiles(files, 201, 300, true, 300, {}); + checkFiles(files, 201, 301, true, 300, { 350 }); + checkFiles(files, 201, 400, true, 300, { 350 }); + + // these don't collapse because the delta file at 300 is smaller than the snapshot at 300 + 
checkFiles(files, 251, 300, true, Optional(), { 300 }); + checkFiles(files, 251, 301, true, Optional(), { 300, 350 }); + checkFiles(files, 251, 400, true, Optional(), { 300, 350 }); + checkFiles(files, 301, 400, true, Optional(), { 350 }); + checkFiles(files, 351, 400, true, Optional(), {}); + + return Void(); +} \ No newline at end of file diff --git a/fdbserver/BlobGranuleServerCommon.actor.h b/fdbserver/BlobGranuleServerCommon.actor.h index d48418c951b..ea3f8c1e3ba 100644 --- a/fdbserver/BlobGranuleServerCommon.actor.h +++ b/fdbserver/BlobGranuleServerCommon.actor.h @@ -49,17 +49,29 @@ struct BlobFileIndex { std::string filename; int64_t offset; int64_t length; + int64_t fullFileLength; BlobFileIndex() {} - BlobFileIndex(Version version, std::string filename, int64_t offset, int64_t length) - : version(version), filename(filename), offset(offset), length(length) {} + BlobFileIndex(Version version, std::string filename, int64_t offset, int64_t length, int64_t fullFileLength) + : version(version), filename(filename), offset(offset), length(length), fullFileLength(fullFileLength) {} + + // compare on version + bool operator<(const BlobFileIndex& r) const { return version < r.version; } }; +// FIXME: initialize these to smaller default sizes to save a bit of memory, particularly snapshotFiles // Stores the files that comprise a blob granule struct GranuleFiles { - std::deque snapshotFiles; - std::deque deltaFiles; + std::vector snapshotFiles; + std::vector deltaFiles; + + void getFiles(Version beginVersion, + Version readVersion, + bool canCollapse, + BlobGranuleChunkRef& chunk, + Arena& replyArena, + int64_t& deltaBytesCounter) const; }; class Transaction; diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index 2b6a4da2bf8..192475f4dd3 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -211,6 +211,24 @@ struct SplitEvaluation { : epoch(epoch), seqno(seqno), inProgress(inProgress) {} }; +struct BlobManagerStats { + CounterCollection cc; + + // FIXME: pruning stats + + Counter granuleSplits; + Counter granuleWriteHotSplits; + Future logger; + + // Current stats maintained for a given blob worker process + explicit BlobManagerStats(UID id, double interval, std::unordered_map* workers) + : cc("BlobManagerStats", id.toString()), granuleSplits("GranuleSplits", cc), + granuleWriteHotSplits("GranuleWriteHotSplits", cc) { + specialCounter(cc, "WorkerCount", [workers]() { return workers->size(); }); + logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics"); + } +}; + struct BlobManagerData : NonCopyable, ReferenceCounted { UID id; Database db; @@ -218,6 +236,8 @@ struct BlobManagerData : NonCopyable, ReferenceCounted { PromiseStream> addActor; Promise doLockCheck; + BlobManagerStats stats; + Reference bstore; std::unordered_map workersById; @@ -246,8 +266,9 @@ struct BlobManagerData : NonCopyable, ReferenceCounted { PromiseStream rangesToAssign; BlobManagerData(UID id, Database db, Optional dcId) - : id(id), db(db), dcId(dcId), knownBlobRanges(false, normalKeys.end), - restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), recruitingStream(0) {} + : id(id), db(db), dcId(dcId), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &workersById), + knownBlobRanges(false, normalKeys.end), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), + recruitingStream(0) {} }; ACTOR Future>> splitRange(Reference bmData, @@ -753,6 +774,7 @@ ACTOR Future monitorClientRanges(Reference bmData) { } for (KeyRangeRef 
range : rangesToRemove) { + TraceEvent("ClientBlobRangeRemoved", bmData->id).detail("Range", range); if (BM_DEBUG) { fmt::print( "BM Got range to revoke [{0} - {1})\n", range.begin.printable(), range.end.printable()); @@ -768,6 +790,7 @@ ACTOR Future monitorClientRanges(Reference bmData) { state std::vector>>> splitFutures; // Divide new ranges up into equal chunks by using SS byte sample for (KeyRangeRef range : rangesToAdd) { + TraceEvent("ClientBlobRangeAdded", bmData->id).detail("Range", range); splitFutures.push_back(splitRange(bmData, range, false)); } @@ -1096,6 +1119,11 @@ ACTOR Future maybeSplitRange(Reference bmData, splitVersion); } + ++bmData->stats.granuleSplits; + if (writeHot) { + ++bmData->stats.granuleWriteHotSplits; + } + // transaction committed, send range assignments // range could have been moved since split eval started, so just revoke from whoever has it RangeAssignment raRevoke; @@ -1182,6 +1210,8 @@ ACTOR Future killBlobWorker(Reference bmData, BlobWorkerI // Remove it from workersById also since otherwise that worker addr will remain excluded // when we try to recruit new blob workers. + TraceEvent("KillBlobWorker", bmData->id).detail("WorkerId", bwId); + if (registered) { bmData->deadWorkers.insert(bwId); bmData->workerStats.erase(bwId); @@ -1581,6 +1611,7 @@ static void addAssignment(KeyRangeMap>& map, } ACTOR Future recoverBlobManager(Reference bmData) { + state double recoveryStartTime = now(); state Promise workerListReady; bmData->addActor.send(checkBlobWorkerList(bmData, workerListReady)); wait(workerListReady.getFuture()); @@ -1836,7 +1867,8 @@ ACTOR Future recoverBlobManager(Reference bmData) { TraceEvent("BlobManagerRecovered", bmData->id) .detail("Epoch", bmData->epoch) - .detail("Granules", bmData->workerAssignments.size()) + .detail("Duration", now() - recoveryStartTime) + .detail("Granules", bmData->workerAssignments.size()) // TODO this includes un-set ranges, so it is inaccurate .detail("Assigned", explicitAssignments) .detail("Revoked", outOfDateAssignments.size()); @@ -2087,6 +2119,8 @@ ACTOR Future loadHistoryFiles(Reference bmData, U } } +// FIXME: trace events for pruning + /* * Deletes all files pertaining to the granule with id granuleId and * also removes the history entry for this granule from the system keyspace @@ -2506,7 +2540,7 @@ ACTOR Future monitorPruneKeys(Reference self) { if (BM_DEBUG) { fmt::print("BM constructing backup container from {}\n", SERVER_KNOBS->BG_URL.c_str()); } - self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL); + self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {}); if (BM_DEBUG) { printf("BM constructed backup container\n"); } @@ -2778,7 +2812,7 @@ ACTOR Future blobManager(BlobManagerInterface bmInterf, // DB has [A - B) and [C - D). They should show up in knownBlobRanges, and [B - C) should be in removed. // DB has [B - C). It should show up in knownBlobRanges, [B - C) should be in added, and [A - B) and [C - D) // should be in removed. -TEST_CASE(":/blobmanager/updateranges") { +TEST_CASE("/blobmanager/updateranges") { KeyRangeMap knownBlobRanges(false, normalKeys.end); Arena ar; diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index e939d71ec6d..f44bbf6a2dd 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -43,9 +44,10 @@ #include "flow/Error.h" #include "flow/IRandom.h" #include "flow/Trace.h" -#include "flow/actorcompiler.h" // has to be last include #include "flow/network.h" +#include "flow/actorcompiler.h" // has to be last include + #define BW_DEBUG false #define BW_REQUEST_DEBUG false @@ -204,6 +206,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted { if (BW_DEBUG) { fmt::print("BW {0} found new manager epoch {1}\n", id.toString(), currentManagerEpoch); } + TraceEvent(SevDebug, "BlobWorkerFoundNewManager", id).detail("Epoch", epoch); } return true; @@ -509,7 +512,8 @@ ACTOR Future writeDeltaFile(Reference bwData, numIterations++; Key dfKey = blobGranuleFileKeyFor(granuleID, currentDeltaVersion, 'D'); - Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize); + // TODO change once we support file multiplexing + Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize); tr->set(dfKey, dfValue); if (oldGranuleComplete.present()) { @@ -536,7 +540,8 @@ ACTOR Future writeDeltaFile(Reference bwData, if (BUGGIFY_WITH_PROB(0.01)) { wait(delay(deterministicRandom()->random01())); } - return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize); + // FIXME: change when we implement multiplexing + return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize, serializedSize); } catch (Error& e) { wait(tr->onError(e)); } @@ -646,7 +651,8 @@ ACTOR Future writeSnapshot(Reference bwData, wait(readAndCheckGranuleLock(tr, keyRange, epoch, seqno)); numIterations++; Key snapshotFileKey = blobGranuleFileKeyFor(granuleID, version, 'S'); - Key snapshotFileValue = blobGranuleFileValueFor(fname, 0, serializedSize); + // TODO change once we support file multiplexing + Key snapshotFileValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize); tr->set(snapshotFileKey, snapshotFileValue); // create granule history at version if this is a new granule with the initial dump from FDB if (createGranuleHistory) { @@ -690,7 +696,8 @@ ACTOR Future writeSnapshot(Reference bwData, wait(delay(deterministicRandom()->random01())); } - return BlobFileIndex(version, fname, 0, serializedSize); + // FIXME: change when we implement multiplexing + return BlobFileIndex(version, fname, 0, serializedSize, serializedSize); } ACTOR Future dumpInitialSnapshotFromFDB(Reference bwData, @@ -729,7 +736,7 @@ ACTOR Future dumpInitialSnapshotFromFDB(Reference Future streamFuture = tr->getTransaction().getRangeStream(rowsStream, metadata->keyRange, GetRangeLimits(), Snapshot::True); wait(streamFuture && success(snapshotWriter)); - TraceEvent("BlobGranuleSnapshotFile", bwData->id) + TraceEvent(SevDebug, "BlobGranuleSnapshotFile", bwData->id) .detail("Granule", metadata->keyRange) .detail("Version", readVersion); DEBUG_KEY_RANGE("BlobWorkerFDBSnapshot", readVersion, metadata->keyRange, bwData->id); @@ -753,7 +760,8 @@ ACTOR Future dumpInitialSnapshotFromFDB(Reference wait(tr->onError(e)); retries++; TEST(true); // Granule initial snapshot failed - TraceEvent(SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id) + // FIXME: why can't we supress error event? + TraceEvent(retries < 10 ? 
SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id) .error(err) .detail("Granule", metadata->keyRange) .detail("Count", retries); @@ -795,7 +803,8 @@ ACTOR Future compactFromBlob(Reference bwData, ASSERT(snapshotVersion < version); - chunk.snapshotFile = BlobFilePointerRef(filenameArena, snapshotF.filename, snapshotF.offset, snapshotF.length); + chunk.snapshotFile = BlobFilePointerRef( + filenameArena, snapshotF.filename, snapshotF.offset, snapshotF.length, snapshotF.fullFileLength); compactBytesRead += snapshotF.length; int deltaIdx = files.deltaFiles.size() - 1; while (deltaIdx >= 0 && files.deltaFiles[deltaIdx].version > snapshotVersion) { @@ -805,7 +814,8 @@ ACTOR Future compactFromBlob(Reference bwData, Version lastDeltaVersion = invalidVersion; while (deltaIdx < files.deltaFiles.size() && files.deltaFiles[deltaIdx].version <= version) { BlobFileIndex deltaF = files.deltaFiles[deltaIdx]; - chunk.deltaFiles.emplace_back_deep(filenameArena, deltaF.filename, deltaF.offset, deltaF.length); + chunk.deltaFiles.emplace_back_deep( + filenameArena, deltaF.filename, deltaF.offset, deltaF.length, deltaF.fullFileLength); compactBytesRead += deltaF.length; lastDeltaVersion = files.deltaFiles[deltaIdx].version; deltaIdx++; @@ -832,7 +842,7 @@ ACTOR Future compactFromBlob(Reference bwData, rowsStream, false); RangeResult newGranule = - wait(readBlobGranule(chunk, metadata->keyRange, version, bwData->bstore, &bwData->stats)); + wait(readBlobGranule(chunk, metadata->keyRange, 0, version, bwData->bstore, &bwData->stats)); bwData->stats.bytesReadFromS3ForCompaction += compactBytesRead; rowsStream.send(std::move(newGranule)); @@ -875,7 +885,7 @@ ACTOR Future checkSplitAndReSnapshot(Reference bw metadata->bytesInNewDeltaFiles); } - TraceEvent("BlobGranuleSnapshotCheck", bwData->id) + TraceEvent(SevDebug, "BlobGranuleSnapshotCheck", bwData->id) .detail("Granule", metadata->keyRange) .detail("Version", reSnapshotVersion); @@ -952,7 +962,7 @@ ACTOR Future checkSplitAndReSnapshot(Reference bw metadata->keyRange.end.printable(), bytesInNewDeltaFiles); } - TraceEvent("BlobGranuleSnapshotFile", bwData->id) + TraceEvent(SevDebug, "BlobGranuleSnapshotFile", bwData->id) .detail("Granule", metadata->keyRange) .detail("Version", metadata->durableDeltaVersion.get()); @@ -1532,7 +1542,7 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, bwData->id.toString().substr(0, 5).c_str(), deltas.version, rollbackVersion); - TraceEvent(SevWarn, "GranuleRollback", bwData->id) + TraceEvent(SevDebug, "GranuleRollback", bwData->id) .detail("Granule", metadata->keyRange) .detail("Version", deltas.version) .detail("RollbackVersion", rollbackVersion); @@ -1646,7 +1656,7 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, lastDeltaVersion, oldChangeFeedDataComplete.present() ? ". 
Finalizing " : ""); } - TraceEvent("BlobGranuleDeltaFile", bwData->id) + TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id) .detail("Granule", metadata->keyRange) .detail("Version", lastDeltaVersion); @@ -1823,13 +1833,13 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, } if (e.code() == error_code_granule_assignment_conflict) { - TraceEvent(SevInfo, "GranuleAssignmentConflict", bwData->id) + TraceEvent("GranuleAssignmentConflict", bwData->id) .detail("Granule", metadata->keyRange) .detail("GranuleID", startState.granuleID); return Void(); } if (e.code() == error_code_change_feed_popped) { - TraceEvent(SevInfo, "GranuleGotChangeFeedPopped", bwData->id) + TraceEvent("GranuleChangeFeedPopped", bwData->id) .detail("Granule", metadata->keyRange) .detail("GranuleID", startState.granuleID); return Void(); @@ -2093,16 +2103,25 @@ ACTOR Future waitForVersion(Reference metadata, Version v ACTOR Future doBlobGranuleFileRequest(Reference bwData, BlobGranuleFileRequest req) { if (BW_REQUEST_DEBUG) { - fmt::print("BW {0} processing blobGranuleFileRequest for range [{1} - {2}) @ {3}\n", + fmt::print("BW {0} processing blobGranuleFileRequest for range [{1} - {2}) @ ", bwData->id.toString(), req.keyRange.begin.printable(), req.keyRange.end.printable(), req.readVersion); + if (req.beginVersion > 0) { + fmt::print("{0} - {1}\n", req.beginVersion, req.readVersion); + } else { + fmt::print("{}", req.readVersion); + } } + state bool didCollapse = false; try { - // TODO REMOVE in api V2 - ASSERT(req.beginVersion == 0); + // TODO remove requirement for canCollapseBegin once we implement early replying + ASSERT(req.beginVersion == 0 || req.canCollapseBegin); + if (req.beginVersion != 0) { + ASSERT(req.beginVersion > 0); + } state BlobGranuleFileReply rep; state std::vector> granules; @@ -2150,6 +2169,7 @@ ACTOR Future doBlobGranuleFileRequest(Reference bwData, Bl continue; } state Reference metadata = m; + state Version granuleBeginVersion = req.beginVersion; choose { when(wait(metadata->readable.getFuture())) {} @@ -2290,67 +2310,30 @@ ACTOR Future doBlobGranuleFileRequest(Reference bwData, Bl // granule is up to date, do read ASSERT(metadata->cancelled.canBeSet()); + // Right now we force a collapse if the version range crosses granule boundaries, for simplicity + if (granuleBeginVersion <= chunkFiles.snapshotFiles.front().version) { + TEST(true); // collapsed begin version request because of boundaries + didCollapse = true; + granuleBeginVersion = 0; + } BlobGranuleChunkRef chunk; - // TODO change in V2 + // TODO change with early reply chunk.includedVersion = req.readVersion; chunk.keyRange = KeyRangeRef(StringRef(rep.arena, chunkRange.begin), StringRef(rep.arena, chunkRange.end)); - // handle snapshot files - // TODO refactor the "find snapshot file" logic to GranuleFiles? - // FIXME: binary search instead of linear search, especially when file count is large - int i = chunkFiles.snapshotFiles.size() - 1; - while (i >= 0 && chunkFiles.snapshotFiles[i].version > req.readVersion) { - i--; - } - // because of granule history, we should always be able to find the desired snapshot - // version, and have thrown blob_granule_transaction_too_old earlier if not possible. 
- if (i < 0) { - fmt::print("req @ {0} >= initial snapshot {1} but can't find snapshot in ({2}) files:\n", - req.readVersion, - metadata->initialSnapshotVersion, - chunkFiles.snapshotFiles.size()); - for (auto& f : chunkFiles.snapshotFiles) { - fmt::print(" {0}", f.version); - } - } - ASSERT(i >= 0); - - BlobFileIndex snapshotF = chunkFiles.snapshotFiles[i]; - chunk.snapshotFile = BlobFilePointerRef(rep.arena, snapshotF.filename, snapshotF.offset, snapshotF.length); - Version snapshotVersion = chunkFiles.snapshotFiles[i].version; - chunk.snapshotVersion = snapshotVersion; - - // handle delta files - // cast this to an int so i going to -1 still compares properly - int lastDeltaFileIdx = chunkFiles.deltaFiles.size() - 1; - i = lastDeltaFileIdx; - // skip delta files that are too new - while (i >= 0 && chunkFiles.deltaFiles[i].version > req.readVersion) { - i--; - } - if (i < lastDeltaFileIdx) { - // we skipped one file at the end with a larger read version, this will actually contain - // our query version, so add it back. - i++; - } - // only include delta files after the snapshot file - int j = i; - while (j >= 0 && chunkFiles.deltaFiles[j].version > snapshotVersion) { - j--; - } - j++; - while (j <= i) { - BlobFileIndex deltaF = chunkFiles.deltaFiles[j]; - chunk.deltaFiles.emplace_back_deep(rep.arena, deltaF.filename, deltaF.offset, deltaF.length); - bwData->stats.readReqDeltaBytesReturned += deltaF.length; - j++; + int64_t deltaBytes = 0; + chunkFiles.getFiles( + granuleBeginVersion, req.readVersion, req.canCollapseBegin, chunk, rep.arena, deltaBytes); + bwData->stats.readReqDeltaBytesReturned += deltaBytes; + if (granuleBeginVersion > 0 && chunk.snapshotFile.present()) { + TEST(true); // collapsed begin version request for efficiency + didCollapse = true; } // new deltas (if version is larger than version of last delta file) // FIXME: do trivial key bounds here if key range is not fully contained in request key // range - - if (req.readVersion > metadata->durableDeltaVersion.get()) { + if (req.readVersion > metadata->durableDeltaVersion.get() && !metadata->currentDeltas.empty()) { if (metadata->durableDeltaVersion.get() != metadata->pendingDeltaVersion) { fmt::print("real-time read [{0} - {1}) @ {2} doesn't have mutations!! 
durable={3}, pending={4}\n", metadata->keyRange.begin.printable(), @@ -2359,13 +2342,32 @@ ACTOR Future doBlobGranuleFileRequest(Reference bwData, Bl metadata->durableDeltaVersion.get(), metadata->pendingDeltaVersion); } + + // prune mutations based on begin version, if possible ASSERT(metadata->durableDeltaVersion.get() == metadata->pendingDeltaVersion); + // FIXME: I think we can remove this dependsOn since we are doing push_back_deep rep.arena.dependsOn(metadata->currentDeltas.arena()); - for (auto& delta : metadata->currentDeltas) { - if (delta.version > req.readVersion) { + MutationsAndVersionRef* mutationIt = metadata->currentDeltas.begin(); + if (granuleBeginVersion > metadata->currentDeltas.back().version) { + TEST(true); // beginVersion pruning all in-memory mutations + mutationIt = metadata->currentDeltas.end(); + } else if (granuleBeginVersion > metadata->currentDeltas.front().version) { + // binary search for beginVersion + TEST(true); // beginVersion pruning some in-memory mutations + mutationIt = std::lower_bound(metadata->currentDeltas.begin(), + metadata->currentDeltas.end(), + MutationsAndVersionRef(granuleBeginVersion, 0), + MutationsAndVersionRef::OrderByVersion()); + } + + // add mutations to response + while (mutationIt != metadata->currentDeltas.end()) { + if (mutationIt->version > req.readVersion) { + TEST(true); // readVersion pruning some in-memory mutations break; } - chunk.newDeltas.push_back_deep(rep.arena, delta); + chunk.newDeltas.push_back_deep(rep.arena, *mutationIt); + mutationIt++; } } @@ -2376,11 +2378,17 @@ ACTOR Future doBlobGranuleFileRequest(Reference bwData, Bl wait(yield(TaskPriority::DefaultEndpoint)); } + // do these together to keep them synchronous + if (req.beginVersion != 0) { + ++bwData->stats.readRequestsWithBegin; + } + if (didCollapse) { + ++bwData->stats.readRequestsCollapsed; + } ASSERT(!req.reply.isSet()); req.reply.send(rep); --bwData->stats.activeReadRequests; } catch (Error& e) { - // fmt::print("Error in BGFRequest {0}\n", e.name()); if (e.code() == error_code_operation_cancelled) { req.reply.sendError(wrong_shard_server()); throw; @@ -2573,7 +2581,16 @@ ACTOR Future openGranule(Reference bwData, As info.changeFeedStartVersion = tr.getCommittedVersion(); } - TraceEvent("GranuleOpen", bwData->id).detail("Granule", req.keyRange); + TraceEvent openEv("GranuleOpen", bwData->id); + openEv.detail("GranuleID", info.granuleID) + .detail("Granule", req.keyRange) + .detail("Epoch", req.managerEpoch) + .detail("Seqno", req.managerSeqno) + .detail("CFStartVersion", info.changeFeedStartVersion) + .detail("PreviousDurableVersion", info.previousDurableVersion); + if (info.parentGranule.present()) { + openEv.detail("ParentGranuleID", info.parentGranule.get().second); + } return info; } catch (Error& e) { @@ -2894,6 +2911,7 @@ ACTOR Future handleRangeRevoke(Reference bwData, RevokeBlo ACTOR Future registerBlobWorker(Reference bwData, BlobWorkerInterface interf) { state Reference tr = makeReference(bwData->db); + TraceEvent("BlobWorkerRegister", bwData->id); loop { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -2914,6 +2932,7 @@ ACTOR Future registerBlobWorker(Reference bwData, BlobWork if (BW_DEBUG) { fmt::print("Registered blob worker {}\n", interf.id().toString()); } + TraceEvent("BlobWorkerRegistered", bwData->id); return Void(); } catch (Error& e) { if (BW_DEBUG) { @@ -3021,7 +3040,7 @@ ACTOR Future blobWorker(BlobWorkerInterface bwInterf, if (BW_DEBUG) { 
fmt::print("BW constructing backup container from {0}\n", SERVER_KNOBS->BG_URL); } - self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL); + self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {}); if (BW_DEBUG) { printf("BW constructed backup container\n"); } diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index b51b2182dbe..8d44c257f69 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -2475,11 +2475,12 @@ ACTOR Future workerHealthMonitor(ClusterControllerData* self) { } } -ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, +ACTOR Future clusterControllerCore(Reference connRecord, + ClusterControllerFullInterface interf, Future leaderFail, - ServerCoordinators coordinators, LocalityData locality, ConfigDBType configDBType) { + state ServerCoordinators coordinators(connRecord); state ClusterControllerData self(interf, locality, coordinators); state ConfigBroadcaster configBroadcaster(coordinators, configDBType); state Future coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY); @@ -2612,7 +2613,7 @@ ACTOR Future replaceInterface(ClusterControllerFullInterface interf) { } } -ACTOR Future clusterController(ServerCoordinators coordinators, +ACTOR Future clusterController(Reference connRecord, Reference>> currentCC, bool hasConnected, Reference> asyncPriorityInfo, @@ -2623,9 +2624,10 @@ ACTOR Future clusterController(ServerCoordinators coordinators, state bool inRole = false; cci.initEndpoints(); try { + wait(connRecord->resolveHostnames()); // Register as a possible leader; wait to be elected state Future leaderFail = - tryBecomeLeader(coordinators, cci, currentCC, hasConnected, asyncPriorityInfo); + tryBecomeLeader(connRecord, cci, currentCC, hasConnected, asyncPriorityInfo); state Future shouldReplace = replaceInterface(cci); while (!currentCC->get().present() || currentCC->get().get() != cci) { @@ -2644,7 +2646,7 @@ ACTOR Future clusterController(ServerCoordinators coordinators, startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID()); inRole = true; - wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType)); + wait(clusterControllerCore(connRecord, cci, leaderFail, locality, configDBType)); } } catch (Error& e) { if (inRole) @@ -2673,15 +2675,12 @@ ACTOR Future clusterController(Reference connRec state bool hasConnected = false; loop { try { - wait(connRecord->resolveHostnames()); - ServerCoordinators coordinators(connRecord); - wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType)); + wait(clusterController(connRecord, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType)); + hasConnected = true; } catch (Error& e) { if (e.code() != error_code_coordinators_changed) throw; // Expected to terminate fdbserver } - - hasConnected = true; } } diff --git a/fdbserver/ClusterRecovery.actor.cpp b/fdbserver/ClusterRecovery.actor.cpp index 22b6807a00e..180673f8502 100644 --- a/fdbserver/ClusterRecovery.actor.cpp +++ b/fdbserver/ClusterRecovery.actor.cpp @@ -1477,12 +1477,12 @@ ACTOR Future clusterRecoveryCore(Reference self) { recoverAndEndEpoch.cancel(); - ASSERT(self->commitProxies.size() <= self->configuration.getDesiredCommitProxies()); - ASSERT(self->commitProxies.size() >= 1); - ASSERT(self->grvProxies.size() <= self->configuration.getDesiredGrvProxies()); - ASSERT(self->grvProxies.size() >= 1); - 
ASSERT(self->resolvers.size() <= self->configuration.getDesiredResolvers()); - ASSERT(self->resolvers.size() >= 1); + ASSERT_LE(self->commitProxies.size(), self->configuration.getDesiredCommitProxies()); + ASSERT_GE(self->commitProxies.size(), 1); + ASSERT_LE(self->grvProxies.size(), self->configuration.getDesiredGrvProxies()); + ASSERT_GE(self->grvProxies.size(), 1); + ASSERT_LE(self->resolvers.size(), self->configuration.getDesiredResolvers()); + ASSERT_GE(self->resolvers.size(), 1); self->recoveryState = RecoveryState::RECOVERY_TRANSACTION; TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index d83ded0edcb..bdc6259a98c 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -1381,7 +1381,10 @@ class DDTeamCollectionImpl { bool foundSSToRemove = false; for (auto& server : self->server_info) { - if (!server.second->isCorrectStoreType(self->configuration.storageServerStoreType)) { + // If this server isn't the right storage type and its wrong-type trigger has not yet been set + // then set it if we're in aggressive mode and log its presence either way. + if (!server.second->isCorrectStoreType(self->configuration.storageServerStoreType) && + !server.second->wrongStoreTypeToRemove.get()) { // Server may be removed due to failure while the wrongStoreTypeToRemove is sent to the // storageServerTracker. This race may cause the server to be removed before react to // wrongStoreTypeToRemove @@ -1394,12 +1397,16 @@ class DDTeamCollectionImpl { TraceEvent("WrongStoreTypeRemover", self->distributorId) .detail("Server", server.first) .detail("StoreType", server.second->getStoreType()) - .detail("ConfiguredStoreType", self->configuration.storageServerStoreType); - break; + .detail("ConfiguredStoreType", self->configuration.storageServerStoreType) + .detail("RemovingNow", + self->configuration.storageMigrationType == StorageMigrationType::AGGRESSIVE); } } - if (!foundSSToRemove) { + // Stop if no incorrect storage types were found, or if we're not in aggressive mode and can't act on any + // found. Aggressive mode is checked at this location so that in non-aggressive mode the loop will execute + // once and log any incorrect storage types found. + if (!foundSSToRemove || self->configuration.storageMigrationType != StorageMigrationType::AGGRESSIVE) { break; } } diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 836545fec08..d0d12f0387a 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -50,11 +50,12 @@ struct RelocateData { std::vector completeSources; std::vector completeDests; bool wantsNewServers; + bool cancellable; TraceInterval interval; RelocateData() : priority(-1), boundaryPriority(-1), healthPriority(-1), startTime(-1), workFactor(0), wantsNewServers(false), - interval("QueuedRelocation") {} + cancellable(false), interval("QueuedRelocation") {} explicit RelocateData(RelocateShard const& rs) : keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1), healthPriority(isHealthPriority(rs.priority) ? 
rs.priority : -1), startTime(now()), @@ -63,7 +64,7 @@ struct RelocateData { rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || rs.priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD || rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT), - interval("QueuedRelocation") {} + cancellable(true), interval("QueuedRelocation") {} static bool isHealthPriority(int priority) { return priority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || @@ -610,19 +611,23 @@ struct DDQueueData { .detail( "Problem", "the key range in the inFlight map matches the key range in the RelocateData message"); + } else if (it->value().cancellable) { + TraceEvent(SevError, "DDQueueValidateError13") + .detail("Problem", "key range is cancellable but not in flight!") + .detail("Range", it->range()); } } for (auto it = busymap.begin(); it != busymap.end(); ++it) { for (int i = 0; i < it->second.ledger.size() - 1; i++) { if (it->second.ledger[i] < it->second.ledger[i + 1]) - TraceEvent(SevError, "DDQueueValidateError13") + TraceEvent(SevError, "DDQueueValidateError14") .detail("Problem", "ascending ledger problem") .detail("LedgerLevel", i) .detail("LedgerValueA", it->second.ledger[i]) .detail("LedgerValueB", it->second.ledger[i + 1]); if (it->second.ledger[i] < 0.0) - TraceEvent(SevError, "DDQueueValidateError14") + TraceEvent(SevError, "DDQueueValidateError15") .detail("Problem", "negative ascending problem") .detail("LedgerLevel", i) .detail("LedgerValue", it->second.ledger[i]); @@ -632,13 +637,13 @@ struct DDQueueData { for (auto it = destBusymap.begin(); it != destBusymap.end(); ++it) { for (int i = 0; i < it->second.ledger.size() - 1; i++) { if (it->second.ledger[i] < it->second.ledger[i + 1]) - TraceEvent(SevError, "DDQueueValidateError15") + TraceEvent(SevError, "DDQueueValidateError16") .detail("Problem", "ascending ledger problem") .detail("LedgerLevel", i) .detail("LedgerValueA", it->second.ledger[i]) .detail("LedgerValueB", it->second.ledger[i + 1]); if (it->second.ledger[i] < 0.0) - TraceEvent(SevError, "DDQueueValidateError16") + TraceEvent(SevError, "DDQueueValidateError17") .detail("Problem", "negative ascending problem") .detail("LedgerLevel", i) .detail("LedgerValue", it->second.ledger[i]); @@ -954,7 +959,7 @@ struct DDQueueData { auto containedRanges = inFlight.containedRanges(rd.keys); std::vector cancellableRelocations; for (auto it = containedRanges.begin(); it != containedRanges.end(); ++it) { - if (inFlightActors.liveActorAt(it->range().begin)) { + if (it.value().cancellable) { cancellableRelocations.push_back(it->value()); } } @@ -1180,6 +1185,12 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, // TODO different trace event + knob for overloaded? 
Could wait on an async var for done moves } + // set cancellable to false on inFlight's entry for this key range + auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin); + ASSERT(inFlightRange.range() == rd.keys); + ASSERT(inFlightRange.value().randomId == rd.randomId); + inFlightRange.value().cancellable = false; + destIds.clear(); state std::vector healthyIds; state std::vector extraIds; diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index 7069c9de423..9c4f925854a 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -784,6 +784,8 @@ ACTOR Future rocksDBMetricLogger(std::shared_ptr stat { "EstimateLiveDataSize", rocksdb::DB::Properties::kEstimateLiveDataSize }, { "BaseLevel", rocksdb::DB::Properties::kBaseLevel }, { "EstPendCompactBytes", rocksdb::DB::Properties::kEstimatePendingCompactionBytes }, + { "BlockCacheUsage", rocksdb::DB::Properties::kBlockCacheUsage }, + { "BlockCachePinnedUsage", rocksdb::DB::Properties::kBlockCachePinnedUsage }, }; state std::unordered_map readIteratorPoolStats = { diff --git a/fdbserver/LeaderElection.actor.cpp b/fdbserver/LeaderElection.actor.cpp index 1c968a515c2..9d677b1dacd 100644 --- a/fdbserver/LeaderElection.actor.cpp +++ b/fdbserver/LeaderElection.actor.cpp @@ -25,28 +25,56 @@ #include "fdbclient/MonitorLeader.h" #include "flow/actorcompiler.h" // This must be the last #include. +// Keep trying to become a leader by submitting itself to all coordinators. +// Monitor the health of all coordinators at the same time. +// Note: for coordinators whose NetworkAddress is parsed out of a hostname, a connection failure will cause this actor +// to throw `coordinators_changed()` error ACTOR Future submitCandidacy(Key key, LeaderElectionRegInterface coord, LeaderInfo myInfo, UID prevChangeID, - Reference>>> nominees, - int index) { + AsyncTrigger* nomineeChange, + Optional* nominee, + Optional hostname = Optional()) { loop { - auto const& nom = nominees->get()[index]; - Optional li = wait( - retryBrokenPromise(coord.candidacy, - CandidacyRequest(key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID), - TaskPriority::CoordinationReply)); + state Optional li; - if (li != nominees->get()[index]) { - std::vector> v = nominees->get(); - v[index] = li; - nominees->set(v); + if (coord.candidacy.getEndpoint().getPrimaryAddress().fromHostname) { + state ErrorOr> rep = wait(coord.candidacy.tryGetReply( + CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID), + TaskPriority::CoordinationReply)); + if (rep.isError()) { + // Connecting to nominee failed, most likely due to connection failed. + TraceEvent("SubmitCandadicyError") + .error(rep.getError()) + .detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname") + .detail("OldAddr", coord.candidacy.getEndpoint().getPrimaryAddress().toString()); + if (rep.getError().code() == error_code_request_maybe_delivered) { + // Delay to prevent tight resolving loop due to outdated DNS cache + wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY)); + throw coordinators_changed(); + } else { + throw rep.getError(); + } + } else if (rep.present()) { + li = rep.get(); + } + } else { + Optional tmp = wait(retryBrokenPromise( + coord.candidacy, + CandidacyRequest(key, myInfo, nominee->present() ? 
nominee->get().changeID : UID(), prevChangeID), + TaskPriority::CoordinationReply)); + li = tmp; + } + + wait(Future(Void())); // Make sure we weren't cancelled + + if (li != *nominee) { + *nominee = li; + nomineeChange->trigger(); if (li.present() && li.get().forward) wait(Future(Never())); - - wait(Future(Void())); // Make sure we weren't cancelled } } } @@ -84,13 +112,14 @@ ACTOR Future changeLeaderCoordinators(ServerCoordinators coordinators, Val return Void(); } -ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, +ACTOR Future tryBecomeLeaderInternal(Reference connRecord, Value proposedSerializedInterface, Reference> outSerializedLeader, bool hasConnected, Reference> asyncPriorityInfo) { - state Reference>>> nominees( - new AsyncVar>>()); + state ServerCoordinators coordinators(connRecord); + state AsyncTrigger nomineeChange; + state std::vector> nominees; state LeaderInfo myInfo; state Future candidacies; state bool iAmLeader = false; @@ -105,8 +134,6 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, wait(delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY)); } - nominees->set(std::vector>(coordinators.clientLeaderServers.size())); - myInfo.serializedInfo = proposedSerializedInterface; outSerializedLeader->set(Value()); @@ -114,6 +141,9 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) ? buggifyDelayedAsyncVar(outSerializedLeader) : Void(); while (!iAmLeader) { + wait(connRecord->resolveHostnames()); + coordinators = ServerCoordinators(connRecord); + nominees.resize(coordinators.leaderElectionServers.size()); state Future badCandidateTimeout; myInfo.changeID = deterministicRandom()->randomUniqueID(); @@ -122,13 +152,25 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, std::vector> cand; cand.reserve(coordinators.leaderElectionServers.size()); - for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) - cand.push_back(submitCandidacy( - coordinators.clusterKey, coordinators.leaderElectionServers[i], myInfo, prevChangeID, nominees, i)); + for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) { + Optional hostname; + auto r = connRecord->getConnectionString().networkAddressToHostname.find( + coordinators.leaderElectionServers[i].candidacy.getEndpoint().getPrimaryAddress()); + if (r != connRecord->getConnectionString().networkAddressToHostname.end()) { + hostname = r->second; + } + cand.push_back(submitCandidacy(coordinators.clusterKey, + coordinators.leaderElectionServers[i], + myInfo, + prevChangeID, + &nomineeChange, + &nominees[i], + hostname)); + } candidacies = waitForAll(cand); loop { - state Optional> leader = getLeader(nominees->get()); + state Optional> leader = getLeader(nominees); if (leader.present() && leader.get().first.forward) { // These coordinators are forwarded to another set. But before we change our own cluster file, we need // to make sure that a majority of coordinators know that. SOMEDAY: Wait briefly to see if other @@ -172,22 +214,30 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, // If more than 2*SERVER_KNOBS->POLLING_FREQUENCY elapses while we are nominated by some coordinator but // there is no leader, we might be breaking the leader election process for someone with better // communications but lower ID, so change IDs. 
- if ((!leader.present() || !leader.get().second) && - std::count(nominees->get().begin(), nominees->get().end(), myInfo)) { + if ((!leader.present() || !leader.get().second) && std::count(nominees.begin(), nominees.end(), myInfo)) { if (!badCandidateTimeout.isValid()) badCandidateTimeout = delay(SERVER_KNOBS->POLLING_FREQUENCY * 2, TaskPriority::CoordinationReply); } else badCandidateTimeout = Future(); - choose { - when(wait(nominees->onChange())) {} - when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) { - TEST(true); // Bad candidate timeout - TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log(); + try { + choose { + when(wait(nomineeChange.onTrigger())) {} + when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) { + TEST(true); // Bad candidate timeout + TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log(); + break; + } + when(wait(candidacies)) { ASSERT(false); } + when(wait(asyncPriorityInfo->onChange())) { break; } + } + } catch (Error& e) { + if (e.code() == error_code_coordinators_changed) { + connRecord->getConnectionString().resetToUnresolved(); break; + } else { + throw e; } - when(wait(candidacies)) { ASSERT(false); } - when(wait(asyncPriorityInfo->onChange())) { break; } } } diff --git a/fdbserver/LeaderElection.h b/fdbserver/LeaderElection.h index ad5d959ce56..9639116ec55 100644 --- a/fdbserver/LeaderElection.h +++ b/fdbserver/LeaderElection.h @@ -37,7 +37,7 @@ class ServerCoordinators; // eventually be set. If the return value is cancelled, the candidacy or leadership of the proposedInterface // will eventually end. template -Future tryBecomeLeader(ServerCoordinators const& coordinators, +Future tryBecomeLeader(Reference const& connRecord, LeaderInterface const& proposedInterface, Reference>> const& outKnownLeader, bool hasConnected, @@ -50,20 +50,20 @@ Future changeLeaderCoordinators(ServerCoordinators const& coordinators, Va #pragma region Implementation #endif // __INTEL_COMPILER -Future tryBecomeLeaderInternal(ServerCoordinators const& coordinators, +Future tryBecomeLeaderInternal(Reference const& connRecord, Value const& proposedSerializedInterface, Reference> const& outSerializedLeader, bool const& hasConnected, Reference> const& asyncPriorityInfo); template -Future tryBecomeLeader(ServerCoordinators const& coordinators, +Future tryBecomeLeader(Reference const& connRecord, LeaderInterface const& proposedInterface, Reference>> const& outKnownLeader, bool hasConnected, Reference> const& asyncPriorityInfo) { auto serializedInfo = makeReference>(); - Future m = tryBecomeLeaderInternal(coordinators, + Future m = tryBecomeLeaderInternal(connRecord, ObjectWriter::toValue(proposedInterface, IncludeVersion()), serializedInfo, hasConnected, diff --git a/fdbserver/RestoreController.actor.cpp b/fdbserver/RestoreController.actor.cpp index 64d7d3d785b..8092ebe39a9 100644 --- a/fdbserver/RestoreController.actor.cpp +++ b/fdbserver/RestoreController.actor.cpp @@ -47,7 +47,8 @@ ACTOR static Future collectBackupFiles(Reference bc, RestoreRequest request); ACTOR static Future buildRangeVersions(KeyRangeMap* pRangeVersions, std::vector* pRangeFiles, - Key url); + Key url, + Optional proxy); ACTOR static Future processRestoreRequest(Reference self, Database cx, @@ -317,7 +318,7 @@ ACTOR static Future processRestoreRequest(Reference allFiles; state Version minRangeVersion = MAX_VERSION; - self->initBackupContainer(request.url); + self->initBackupContainer(request.url, request.proxy); // Get all backup files' description and save 
them to files state Version targetVersion = @@ -334,7 +335,7 @@ ACTOR static Future processRestoreRequest(Reference rangeVersions(minRangeVersion, allKeys.end); if (SERVER_KNOBS->FASTRESTORE_GET_RANGE_VERSIONS_EXPENSIVE) { - wait(buildRangeVersions(&rangeVersions, &rangeFiles, request.url)); + wait(buildRangeVersions(&rangeVersions, &rangeFiles, request.url, request.proxy)); } else { // Debug purpose, dump range versions auto ranges = rangeVersions.ranges(); @@ -881,13 +882,14 @@ ACTOR static Future insertRangeVersion(KeyRangeMap* pRangeVersion // Expensive and slow operation that should not run in real prod. ACTOR static Future buildRangeVersions(KeyRangeMap* pRangeVersions, std::vector* pRangeFiles, - Key url) { + Key url, + Optional proxy) { if (!g_network->isSimulated()) { TraceEvent(SevError, "ExpensiveBuildRangeVersions") .detail("Reason", "Parsing all range files is slow and memory intensive"); return Void(); } - Reference bc = IBackupContainer::openContainer(url.toString()); + Reference bc = IBackupContainer::openContainer(url.toString(), proxy, {}); // Key ranges not in range files are empty; // Assign highest version to avoid applying any mutation in these ranges diff --git a/fdbserver/RestoreController.actor.h b/fdbserver/RestoreController.actor.h index 5c9a271f7aa..77aa5e6494a 100644 --- a/fdbserver/RestoreController.actor.h +++ b/fdbserver/RestoreController.actor.h @@ -446,13 +446,15 @@ struct RestoreControllerData : RestoreRoleData, public ReferenceCounted proxy) { if (bcUrl == url && bc.isValid()) { return; } - TraceEvent("FastRestoreControllerInitBackupContainer").detail("URL", url); + TraceEvent("FastRestoreControllerInitBackupContainer") + .detail("URL", url) + .detail("Proxy", proxy.present() ? proxy.get() : ""); bcUrl = url; - bc = IBackupContainer::openContainer(url.toString()); + bc = IBackupContainer::openContainer(url.toString(), proxy, {}); } }; diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 9aa1aadee3b..1afabdcb950 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -262,7 +262,7 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, when(RestoreLoadFileRequest req = waitNext(loaderInterf.loadFile.getFuture())) { requestTypeStr = "loadFile"; hasQueuedRequests = !self->loadingQueue.empty() || !self->sendingQueue.empty(); - self->initBackupContainer(req.param.url); + self->initBackupContainer(req.param.url, req.param.proxy); self->loadingQueue.push(req); if (!hasQueuedRequests) { self->hasPendingRequests->set(true); diff --git a/fdbserver/RestoreLoader.actor.h b/fdbserver/RestoreLoader.actor.h index b16e4c11faf..92b11a5a1cf 100644 --- a/fdbserver/RestoreLoader.actor.h +++ b/fdbserver/RestoreLoader.actor.h @@ -226,12 +226,12 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted proxy) { if (bcUrl == url && bc.isValid()) { return; } bcUrl = url; - bc = IBackupContainer::openContainer(url.toString()); + bc = IBackupContainer::openContainer(url.toString(), proxy, {}); } }; diff --git a/fdbserver/RestoreWorkerInterface.actor.h b/fdbserver/RestoreWorkerInterface.actor.h index 065b22c4682..3c2830514ae 100644 --- a/fdbserver/RestoreWorkerInterface.actor.h +++ b/fdbserver/RestoreWorkerInterface.actor.h @@ -368,6 +368,7 @@ struct LoadingParam { bool isRangeFile; Key url; + Optional proxy; Optional rangeVersion; // range file's version int64_t blockSize; @@ -386,12 +387,13 @@ struct LoadingParam { template void serialize(Ar& ar) { - serializer(ar, isRangeFile, url, 
rangeVersion, blockSize, asset); + serializer(ar, isRangeFile, url, proxy, rangeVersion, blockSize, asset); } std::string toString() const { std::stringstream str; str << "isRangeFile:" << isRangeFile << " url:" << url.toString() + << " proxy:" << (proxy.present() ? proxy.get() : "") << " rangeVersion:" << (rangeVersion.present() ? rangeVersion.get() : -1) << " blockSize:" << blockSize << " RestoreAsset:" << asset.toString(); return str.str(); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index dc3c1595c1a..ef00a0f001c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -53,6 +53,13 @@ extern "C" int g_expect_full_pointermap; extern const char* getSourceVersion(); +ISimulator::ISimulator() + : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1), usableRegions(1), + allowLogSetKills(true), tssMode(TSSMode::Disabled), isStopped(false), lastConnectionFailure(0), + connectionFailuresDisableDuration(0), speedUpSimulation(false), backupAgents(BackupAgentType::WaitForType), + drAgents(BackupAgentType::WaitForType), allSwapsDisabled(false) {} +ISimulator::~ISimulator() = default; + using namespace std::literals; // TODO: Defining these here is just asking for ODR violations. @@ -266,6 +273,9 @@ class TestConfig { configDBType = configDBTypeFromString(value); } } + if (attrib == "randomlyRenameZoneId") { + randomlyRenameZoneId = strcmp(value.c_str(), "true") == 0; + } if (attrib == "blobGranulesEnabled") { blobGranulesEnabled = strcmp(value.c_str(), "true") == 0; } @@ -307,6 +317,7 @@ class TestConfig { stderrSeverity, machineCount, processesPerMachine, coordinators; bool blobGranulesEnabled = false; Optional config; + bool randomlyRenameZoneId = false; bool allowDefaultTenant = true; bool allowDisablingTenants = true; @@ -364,7 +375,8 @@ class TestConfig { .add("extraMachineCountDC", &extraMachineCountDC) .add("blobGranulesEnabled", &blobGranulesEnabled) .add("allowDefaultTenant", &allowDefaultTenant) - .add("allowDisablingTenants", &allowDisablingTenants); + .add("allowDisablingTenants", &allowDisablingTenants) + .add("randomlyRenameZoneId", &randomlyRenameZoneId); try { auto file = toml::parse(testFile); if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { @@ -1039,6 +1051,11 @@ ACTOR Future restartSimulatedSystem(std::vector>* systemActor auto configDBType = testConfig.getConfigDBType(); + // Randomly change data center id names to test that localities + // can be modified on cluster restart + bool renameZoneIds = testConfig.randomlyRenameZoneId ? 
deterministicRandom()->random01() < 0.1 : false; + TEST(renameZoneIds); // Zone ID names altered in restart test + // allows multiple ipAddr entries ini.SetMultiKey(); @@ -1059,7 +1076,7 @@ ACTOR Future restartSimulatedSystem(std::vector>* systemActor bool enableExtraDB = (testConfig.extraDB == 3); ClusterConnectionString conn(ini.GetValue("META", "connectionString")); if (enableExtraDB) { - g_simulator.extraDB = new ClusterConnectionString(ini.GetValue("META", "connectionString")); + g_simulator.extraDB = std::make_unique(ini.GetValue("META", "connectionString")); } if (!testConfig.disableHostname) { auto mockDNSStr = ini.GetValue("META", "mockDNS"); @@ -1087,7 +1104,11 @@ ACTOR Future restartSimulatedSystem(std::vector>* systemActor if (zoneIDini == nullptr) { zoneId = machineId; } else { - zoneId = StringRef(zoneIDini); + auto zoneIdStr = std::string(zoneIDini); + if (renameZoneIds) { + zoneIdStr = "modified/" + zoneIdStr; + } + zoneId = Standalone(zoneIdStr); } ProcessClass::ClassType cType = @@ -1142,7 +1163,7 @@ ACTOR Future restartSimulatedSystem(std::vector>* systemActor } LocalityData localities(Optional>(), zoneId, machineId, dcUID); - localities.set(LiteralStringRef("data_hall"), dcUID); + localities.set("data_hall"_sr, dcUID); // SOMEDAY: parse backup agent from test file systemActors->push_back(reportErrors( @@ -2040,9 +2061,9 @@ void setupSimulatedSystem(std::vector>* systemActors, deterministicRandom()->randomShuffle(coordinatorAddresses); ASSERT_EQ(coordinatorAddresses.size(), coordinatorCount); - ClusterConnectionString conn(coordinatorAddresses, LiteralStringRef("TestCluster:0")); + ClusterConnectionString conn(coordinatorAddresses, "TestCluster:0"_sr); if (useHostname) { - conn = ClusterConnectionString(coordinatorHostnames, LiteralStringRef("TestCluster:0")); + conn = ClusterConnectionString(coordinatorHostnames, "TestCluster:0"_sr); } // If extraDB==0, leave g_simulator.extraDB as null because the test does not use DR. @@ -2050,21 +2071,21 @@ void setupSimulatedSystem(std::vector>* systemActors, // The DR database can be either a new database or itself g_simulator.extraDB = BUGGIFY - ? (useHostname ? new ClusterConnectionString(coordinatorHostnames, LiteralStringRef("TestCluster:0")) - : new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0"))) + ? (useHostname ? std::make_unique(coordinatorHostnames, "TestCluster:0"_sr) + : std::make_unique(coordinatorAddresses, "TestCluster:0"_sr)) : (useHostname - ? new ClusterConnectionString(extraCoordinatorHostnames, LiteralStringRef("ExtraCluster:0")) - : new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0"))); + ? std::make_unique(extraCoordinatorHostnames, "ExtraCluster:0"_sr) + : std::make_unique(extraCoordinatorAddresses, "ExtraCluster:0"_sr)); } else if (testConfig.extraDB == 2) { // The DR database is a new database g_simulator.extraDB = - useHostname ? new ClusterConnectionString(extraCoordinatorHostnames, LiteralStringRef("ExtraCluster:0")) - : new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); + useHostname ? std::make_unique(extraCoordinatorHostnames, "ExtraCluster:0"_sr) + : std::make_unique(extraCoordinatorAddresses, "ExtraCluster:0"_sr); } else if (testConfig.extraDB == 3) { // The DR database is the same database - g_simulator.extraDB = - useHostname ? 
new ClusterConnectionString(coordinatorHostnames, LiteralStringRef("TestCluster:0")) - : new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0")); + g_simulator.extraDB = useHostname + ? std::make_unique(coordinatorHostnames, "TestCluster:0"_sr) + : std::make_unique(coordinatorAddresses, "TestCluster:0"_sr); } *pConnString = conn; @@ -2164,7 +2185,7 @@ void setupSimulatedSystem(std::vector>* systemActors, // check the sslEnablementMap using only one ip LocalityData localities(Optional>(), zoneId, machineId, dcUID); - localities.set(LiteralStringRef("data_hall"), dcUID); + localities.set("data_hall"_sr, dcUID); systemActors->push_back(reportErrors(simulatedMachine(conn, ips, sslEnabled, @@ -2191,7 +2212,7 @@ void setupSimulatedSystem(std::vector>* systemActors, Standalone newMachineId(deterministicRandom()->randomUniqueID().toString()); LocalityData localities(Optional>(), newZoneId, newMachineId, dcUID); - localities.set(LiteralStringRef("data_hall"), dcUID); + localities.set("data_hall"_sr, dcUID); systemActors->push_back(reportErrors(simulatedMachine(*g_simulator.extraDB, extraIps, sslEnabled, @@ -2395,7 +2416,7 @@ ACTOR void setupAndRun(std::string dataFolder, 100.0)); // FIXME: snapshot restore does not support multi-region restore, hence restore it as single region always if (restoring) { - startingConfiguration = LiteralStringRef("usable_regions=1"); + startingConfiguration = "usable_regions=1"_sr; } } else { g_expect_full_pointermap = 1; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index aaf33f3ee76..a56632c1e6c 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -858,7 +858,7 @@ struct StorageServer { CounterCollection cc; Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, getMappedRangeQueries, getRangeStreamQueries, finishedQueries, lowPriorityQueries, rowsQueried, bytesQueried, watchQueries, - emptyQueries, feedRowsQueried, feedBytesQueried; + emptyQueries, feedRowsQueried, feedBytesQueried, feedStreamQueries, feedVersionQueries; // Bytes of the mutations that have been added to the memory of the storage server. When the data is durable // and cleared from the memory, we do not subtract it but add it to bytesDurable. 
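The storage server comment above describes how mutation bytes are accounted: bytes that become durable are added to bytesDurable rather than subtracted from bytesInput, so the difference of the two counters approximates what is still held in memory. A small worked example follows, kept in comment form because the numbers are purely hypothetical and not part of this patch:

// Hypothetical values illustrating the bytesInput/bytesDurable invariant described above:
//   bytesInput   = 10 MB   (mutation bytes ever added to the storage server's memory)
//   bytesDurable =  7 MB   (of those, bytes already made durable and released from memory)
//   mutation bytes still resident in memory ~= bytesInput - bytesDurable = 3 MB
// The FeedStreamQueries / FeedVersionQueries counters added in this hunk follow the same
// Counter("Name", cc) registration pattern and are bumped once per change feed request.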
@@ -930,6 +930,7 @@ struct StorageServer { lowPriorityQueries("LowPriorityQueries", cc), rowsQueried("RowsQueried", cc), bytesQueried("BytesQueried", cc), watchQueries("WatchQueries", cc), emptyQueries("EmptyQueries", cc), feedRowsQueried("FeedRowsQueried", cc), feedBytesQueried("FeedBytesQueried", cc), + feedStreamQueries("FeedStreamQueries", cc), feedVersionQueries("FeedVersionQueries", cc), bytesInput("BytesInput", cc), logicalBytesInput("LogicalBytesInput", cc), logicalBytesMoveInOverhead("LogicalBytesMoveInOverhead", cc), kvCommitLogicalBytes("KVCommitLogicalBytes", cc), kvClearRanges("KVClearRanges", cc), @@ -2436,6 +2437,8 @@ ACTOR Future changeFeedStreamQ(StorageServer* data, ChangeFeedStreamReques req.reply.setByteLimit(std::min((int64_t)req.replyBufferSize, SERVER_KNOBS->CHANGEFEEDSTREAM_LIMIT_BYTES)); } + ++data->counters.feedStreamQueries; + wait(delay(0, TaskPriority::DefaultEndpoint)); try { @@ -2587,6 +2590,7 @@ ACTOR Future changeFeedStreamQ(StorageServer* data, ChangeFeedStreamReques } ACTOR Future changeFeedVersionUpdateQ(StorageServer* data, ChangeFeedVersionUpdateRequest req) { + ++data->counters.feedVersionQueries; wait(data->version.whenAtLeast(req.minVersion)); wait(delay(0)); Version minVersion = data->minFeedVersionForAddress(req.reply.getEndpoint().getPrimaryAddress()); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 6863ec39c61..0cc0faa57d4 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -2726,8 +2726,6 @@ ACTOR Future fdbd(Reference connRecord, actors.push_back(serveProcess()); try { - wait(connRecord->resolveHostnames()); - ServerCoordinators coordinators(connRecord); if (g_network->isSimulated()) { whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,"; } @@ -2745,8 +2743,8 @@ ACTOR Future fdbd(Reference connRecord, if (coordFolder.size()) { // SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up // their files - actors.push_back(fileNotFoundToNever( - coordinationServer(coordFolder, coordinators.ccr, configNode, configBroadcastInterface))); + actors.push_back( + fileNotFoundToNever(coordinationServer(coordFolder, connRecord, configNode, configBroadcastInterface))); } state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder)); diff --git a/fdbserver/workloads/AtomicRestore.actor.cpp b/fdbserver/workloads/AtomicRestore.actor.cpp index 4c3c2703f9e..86d90e10932 100644 --- a/fdbserver/workloads/AtomicRestore.actor.cpp +++ b/fdbserver/workloads/AtomicRestore.actor.cpp @@ -93,6 +93,7 @@ struct AtomicRestoreWorkload : TestWorkload { try { wait(backupAgent.submitBackup(cx, StringRef(backupContainer), + {}, deterministicRandom()->randomInt(0, 60), deterministicRandom()->randomInt(0, 100), BackupAgentBase::getDefaultTagName(), diff --git a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp index 890bdf6a3a6..650ca6f2c69 100644 --- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp @@ -222,6 +222,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { try { wait(backupAgent->submitBackup(cx, StringRef(backupContainer), + {}, deterministicRandom()->randomInt(0, 60), deterministicRandom()->randomInt(0, 100), tag.toString(), @@ -377,6 +378,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { cx, self->backupTag, KeyRef(lastBackupContainer), + {}, 
WaitForComplete::True, ::invalidVersion, Verbose::True, @@ -478,6 +480,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { // the configuration to disable backup workers before restore. extraBackup = backupAgent.submitBackup(cx, LiteralStringRef("file://simfdb/backups/"), + {}, deterministicRandom()->randomInt(0, 60), deterministicRandom()->randomInt(0, 100), self->backupTag.toString(), @@ -523,7 +526,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { .detail("BackupTag", printable(self->backupTag)); // start restoring - auto container = IBackupContainer::openContainer(lastBackupContainer->getURL()); + auto container = + IBackupContainer::openContainer(lastBackupContainer->getURL(), lastBackupContainer->getProxy(), {}); BackupDescription desc = wait(container->describeBackup()); ASSERT(self->usePartitionedLogs == desc.partitioned); ASSERT(desc.minRestorableVersion.present()); // We must have a valid backup now. @@ -566,6 +570,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { self->backupTag, self->backupRanges, KeyRef(lastBackupContainer->getURL()), + lastBackupContainer->getProxy(), targetVersion, self->locked, randomID, diff --git a/fdbserver/workloads/BackupCorrectness.actor.cpp b/fdbserver/workloads/BackupCorrectness.actor.cpp index 92550a23bff..4c827627649 100644 --- a/fdbserver/workloads/BackupCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupCorrectness.actor.cpp @@ -266,6 +266,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { try { wait(backupAgent->submitBackup(cx, StringRef(backupContainer), + {}, deterministicRandom()->randomInt(0, 60), deterministicRandom()->randomInt(0, 100), tag.toString(), @@ -423,6 +424,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { cx, self->backupTag, KeyRef(lastBackupContainer), + {}, WaitForComplete::True, ::invalidVersion, Verbose::True, @@ -523,6 +525,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { try { extraBackup = backupAgent.submitBackup(cx, "file://simfdb/backups/"_sr, + {}, deterministicRandom()->randomInt(0, 60), deterministicRandom()->randomInt(0, 100), self->backupTag.toString(), @@ -557,7 +560,9 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { .detail("RestoreAfter", self->restoreAfter) .detail("BackupTag", printable(self->backupTag)); - auto container = IBackupContainer::openContainer(lastBackupContainer->getURL()); + auto container = IBackupContainer::openContainer(lastBackupContainer->getURL(), + lastBackupContainer->getProxy(), + lastBackupContainer->getEncryptionKeyFileName()); BackupDescription desc = wait(container->describeBackup()); Version targetVersion = -1; @@ -593,6 +598,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { cx, restoreTag, KeyRef(lastBackupContainer->getURL()), + lastBackupContainer->getProxy(), WaitForComplete::True, targetVersion, Verbose::True, @@ -616,6 +622,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { cx, restoreTag, KeyRef(lastBackupContainer->getURL()), + lastBackupContainer->getProxy(), self->restoreRanges, WaitForComplete::True, targetVersion, @@ -646,6 +653,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { cx, restoreTags[restoreIndex], KeyRef(lastBackupContainer->getURL()), + lastBackupContainer->getProxy(), self->restoreRanges, WaitForComplete::True, ::invalidVersion, @@ -675,6 +683,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { cx, restoreTags[restoreIndex], KeyRef(lastBackupContainer->getURL()), + 
lastBackupContainer->getProxy(), WaitForComplete::True, ::invalidVersion, Verbose::True, diff --git a/fdbserver/workloads/BackupToBlob.actor.cpp b/fdbserver/workloads/BackupToBlob.actor.cpp index ee27e1a4808..480ae62466d 100644 --- a/fdbserver/workloads/BackupToBlob.actor.cpp +++ b/fdbserver/workloads/BackupToBlob.actor.cpp @@ -62,6 +62,7 @@ struct BackupToBlobWorkload : TestWorkload { wait(delay(self->backupAfter)); wait(backupAgent.submitBackup(cx, self->backupURL, + {}, self->initSnapshotInterval, self->snapshotInterval, self->backupTag.toString(), diff --git a/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp b/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp index ea43cebbb75..fc6d3035ae6 100644 --- a/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp +++ b/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp @@ -250,13 +250,13 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { if (BGW_DEBUG) { printf("Blob Granule Correctness constructing simulated backup container\n"); } - self->bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/"); + self->bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/", {}, {}); } else { if (BGW_DEBUG) { printf("Blob Granule Correctness constructing backup container from %s\n", SERVER_KNOBS->BG_URL.c_str()); } - self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL); + self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {}); if (BGW_DEBUG) { printf("Blob Granule Correctness constructed backup container\n"); } @@ -272,15 +272,20 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { } // FIXME: typedef this pair type and/or chunk list - ACTOR Future>>> - readFromBlob(Database cx, BlobGranuleCorrectnessWorkload* self, KeyRange range, Version version) { + ACTOR Future>>> readFromBlob( + Database cx, + BlobGranuleCorrectnessWorkload* self, + KeyRange range, + Version beginVersion, + Version readVersion) { state RangeResult out; state Standalone> chunks; state Transaction tr(cx); loop { try { - Standalone> chunks_ = wait(tr.readBlobGranules(range, 0, version)); + Standalone> chunks_ = + wait(tr.readBlobGranules(range, beginVersion, readVersion)); chunks = chunks_; break; } catch (Error& e) { @@ -289,7 +294,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { } for (const BlobGranuleChunkRef& chunk : chunks) { - RangeResult chunkRows = wait(readBlobGranule(chunk, range, version, self->bstore)); + RangeResult chunkRows = wait(readBlobGranule(chunk, range, beginVersion, readVersion, self->bstore)); out.arena().dependsOn(chunkRows.arena()); out.append(out.arena(), chunkRows.begin(), chunkRows.size()); } @@ -321,7 +326,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { Version rv = wait(self->doGrv(&tr)); state Version readVersion = rv; std::pair>> blob = - wait(self->readFromBlob(cx, self, threadData->directoryRange, readVersion)); + wait(self->readFromBlob(cx, self, threadData->directoryRange, 0, readVersion)); fmt::print("Directory {0} got {1} RV {2}\n", threadData->directoryID, doSetup ? 
"initial" : "final", @@ -349,6 +354,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { const Optional& blobValue, uint32_t startKey, uint32_t endKey, + Version beginVersion, Version readVersion, const std::pair>>& blob) { threadData->mismatches++; @@ -360,11 +366,13 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { ev.detail("DirectoryID", format("%08x", threadData->directoryID)) .detail("RangeStart", format("%08x", startKey)) .detail("RangeEnd", format("%08x", endKey)) + .detail("BeginVersion", beginVersion) .detail("Version", readVersion); - fmt::print("Found mismatch! Request for dir {0} [{1} - {2}) @ {3}\n", + fmt::print("Found mismatch! Request for dir {0} [{1} - {2}) @ {3} - {4}\n", format("%08x", threadData->directoryID), format("%08x", startKey), format("%08x", endKey), + beginVersion, readVersion); if (lastMatching.present()) { fmt::print(" last correct: {}\n", lastMatching.get().printable()); @@ -456,6 +464,29 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { readVersion); } + // because each chunk could be separately collapsed or not if we set beginVersion, we have to track it by chunk + KeyRangeMap beginVersionByChunk; + beginVersionByChunk.insert(normalKeys, 0); + int beginCollapsed = 0; + int beginNotCollapsed = 0; + for (auto& chunk : blob.second) { + if (!chunk.snapshotFile.present()) { + ASSERT(beginVersion > 0); + ASSERT(chunk.snapshotVersion == invalidVersion); + beginCollapsed++; + beginVersionByChunk.insert(chunk.keyRange, beginVersion); + } else { + ASSERT(chunk.snapshotVersion != invalidVersion); + if (beginVersion > 0) { + beginNotCollapsed++; + } + } + } + TEST(beginCollapsed > 0); // BGCorrectness got collapsed request with beginVersion > 0 + TEST(beginNotCollapsed > 0); // BGCorrectness got un-collapsed request with beginVersion > 0 + TEST(beginCollapsed > 0 && + beginNotCollapsed > 0); // BGCorrectness got both collapsed and uncollapsed in the same request! 
+ while (checkIt != threadData->keyData.end() && checkIt->first < endKeyExclusive) { uint32_t key = checkIt->first; if (DEBUG_READ_OP(threadData->directoryID, readVersion)) { @@ -475,6 +506,16 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { for (; idIdx < checkIt->second.writes.size() && checkIt->second.writes[idIdx].writeVersion <= readVersion; idIdx++) { Key nextKeyShouldBe = threadData->getKey(key, idIdx); + Version keyBeginVersion = beginVersionByChunk.rangeContaining(nextKeyShouldBe).cvalue(); + if (keyBeginVersion > checkIt->second.writes[idIdx].writeVersion) { + if (DEBUG_READ_OP(threadData->directoryID, readVersion)) { + fmt::print("DBG READ: Skip ID {0} written @ {1} < beginVersion {2}\n", + idIdx, + checkIt->second.writes[idIdx].clearVersion, + keyBeginVersion); + } + continue; + } if (DEBUG_READ_OP(threadData->directoryID, readVersion)) { fmt::print("DBG READ: Checking ID {0} ({1}) written @ {2}\n", format("%08x", idIdx), @@ -491,6 +532,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { Optional(), startKeyInclusive, endKeyExclusive, + beginVersion, readVersion, blob); return false; @@ -509,6 +551,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { Optional(), startKeyInclusive, endKeyExclusive, + beginVersion, readVersion, blob); return false; @@ -523,6 +566,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { blob.first[resultIdx].value, startKeyInclusive, endKeyExclusive, + beginVersion, readVersion, blob); return false; @@ -545,6 +589,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { Optional(), startKeyInclusive, endKeyExclusive, + beginVersion, readVersion, blob); return false; @@ -565,6 +610,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { state double targetReadBytesPerSec = threadData->targetByteRate * 4; ASSERT(targetReadBytesPerSec > 0); + state Version beginVersion; state Version readVersion; TraceEvent("BlobGranuleCorrectnessReaderStart").log(); @@ -610,26 +656,42 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { state KeyRange range = KeyRangeRef(threadData->getKey(startKey, 0), threadData->getKey(endKey, 0)); // pick read version - // TODO could also pick begin version here ASSERT(threadData->writeVersions.back() >= threadData->minSuccessfulReadVersion); + size_t readVersionIdx; // randomly choose up to date vs time travel read if (deterministicRandom()->random01() < 0.5) { threadData->reads++; + readVersionIdx = threadData->writeVersions.size() - 1; readVersion = threadData->writeVersions.back(); } else { threadData->timeTravelReads++; + size_t startIdx = 0; loop { - int readVersionIdx = deterministicRandom()->randomInt(0, threadData->writeVersions.size()); + readVersionIdx = deterministicRandom()->randomInt(startIdx, threadData->writeVersions.size()); readVersion = threadData->writeVersions[readVersionIdx]; if (readVersion >= threadData->minSuccessfulReadVersion) { break; + } else { + startIdx = readVersionIdx + 1; } } } + // randomly choose begin version or not + beginVersion = 0; + if (deterministicRandom()->random01() < 0.5) { + int startIdx = 0; + int endIdxExclusive = readVersionIdx + 1; + // Choose skewed towards later versions. 
It's ok if beginVersion isn't readable though because it + // will collapse + size_t beginVersionIdx = (size_t)std::sqrt( + deterministicRandom()->randomInt(startIdx * startIdx, endIdxExclusive * endIdxExclusive)); + beginVersion = threadData->writeVersions[beginVersionIdx]; + } + std::pair>> blob = - wait(self->readFromBlob(cx, self, range, readVersion)); - self->validateResult(threadData, blob, startKey, endKey, 0, readVersion); + wait(self->readFromBlob(cx, self, range, beginVersion, readVersion)); + self->validateResult(threadData, blob, startKey, endKey, beginVersion, readVersion); int resultBytes = blob.first.expectedSize(); threadData->rowsRead += blob.first.size(); @@ -822,7 +884,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload { fmt::print("Directory {0} doing final data check @ {1}\n", threadData->directoryID, readVersion); } std::pair>> blob = - wait(self->readFromBlob(cx, self, threadData->directoryRange, readVersion)); + wait(self->readFromBlob(cx, self, threadData->directoryRange, 0, readVersion)); result = self->validateResult(threadData, blob, 0, std::numeric_limits::max(), 0, readVersion); finalRowsValidated = blob.first.size(); diff --git a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp index d4264058ca2..ba49923bf1f 100644 --- a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp +++ b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp @@ -90,13 +90,13 @@ struct BlobGranuleVerifierWorkload : TestWorkload { if (BGV_DEBUG) { printf("Blob Granule Verifier constructing simulated backup container\n"); } - bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/"); + bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/", {}, {}); } else { if (BGV_DEBUG) { printf("Blob Granule Verifier constructing backup container from %s\n", SERVER_KNOBS->BG_URL.c_str()); } - bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL); + bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {}); if (BGV_DEBUG) { printf("Blob Granule Verifier constructed backup container\n"); } @@ -225,7 +225,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload { } for (const BlobGranuleChunkRef& chunk : chunks) { - RangeResult chunkRows = wait(readBlobGranule(chunk, range, version, self->bstore)); + RangeResult chunkRows = wait(readBlobGranule(chunk, range, 0, version, self->bstore)); out.arena().dependsOn(chunkRows.arena()); out.append(out.arena(), chunkRows.begin(), chunkRows.size()); } diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 27a8e6f4cf6..16d39a7f1e6 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -2378,9 +2378,9 @@ struct ConsistencyCheckWorkload : TestWorkload { (!nonExcludedWorkerProcessMap.count(db.encryptKeyProxy.get().address()) || nonExcludedWorkerProcessMap[db.encryptKeyProxy.get().address()].processClass.machineClassFitness( ProcessClass::EncryptKeyProxy) > fitnessLowerBound)) { - TraceEvent("ConsistencyCheck_EncyrptKeyProxyNotBest") + TraceEvent("ConsistencyCheck_EncryptKeyProxyNotBest") .detail("BestEncryptKeyProxyFitness", fitnessLowerBound) - .detail("ExistingEncyrptKeyProxyFitness", + .detail("ExistingEncryptKeyProxyFitness", nonExcludedWorkerProcessMap.count(db.encryptKeyProxy.get().address()) ? 
nonExcludedWorkerProcessMap[db.encryptKeyProxy.get().address()] .processClass.machineClassFitness(ProcessClass::EncryptKeyProxy) diff --git a/fdbserver/workloads/EncryptionOps.actor.cpp b/fdbserver/workloads/EncryptionOps.actor.cpp index cad228dfa03..487491048b4 100644 --- a/fdbserver/workloads/EncryptionOps.actor.cpp +++ b/fdbserver/workloads/EncryptionOps.actor.cpp @@ -21,16 +21,19 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/NativeAPI.actor.h" #include "flow/IRandom.h" -#include "flow/StreamCipher.h" +#include "flow/BlobCipher.h" #include "fdbserver/workloads/workloads.actor.h" +#include "flow/ITrace.h" #include "flow/Trace.h" -#include "flow/actorcompiler.h" // This must be the last #include. - -#if ENCRYPTION_ENABLED #include #include #include +#include + +#include "flow/actorcompiler.h" // This must be the last #include. + +#if ENCRYPTION_ENABLED #define MEGA_BYTES (1024 * 1024) #define NANO_SECOND (1000 * 1000 * 1000) @@ -78,44 +81,68 @@ struct WorkloadMetrics { } }; +// Workload generator for encryption/decryption operations. +// 1. For every client run, it generates a unique random encryptionDomainId range and simulates encryption of +// either fixed-size or variable-size payloads. +// 2. For each encryption run, it interacts with BlobCipherKeyCache to fetch the desired encryption key, +// which is then used for encrypting the plaintext payload. +// 3. The encryption operation generates an 'encryption header', which is leveraged to decrypt the ciphertext obtained from +// step#2 (simulating the real-world scenario). +// +// Correctness validations: +// ----------------------- +// Correctness invariants are validated at various steps: +// 1. Encryption key correctness: as part of performing decryption, a BlobCipherKeyCache lookup is done to procure +// the desired encryption key based on: {encryptionDomainId, baseCipherId}; the obtained key is validated against +// the encryption key used for encrypting the data. +// 2. After encryption, the generated 'encryption header' fields are validated, and the encrypted buffer size and contents are +// validated. +// 3. After decryption, the obtained decrypted text is validated against the original plaintext payload. +// +// Performance metrics: +// ------------------- +// The workload generator profiles the operations below across the iterations and logs the details at the end: +// 1. Time spent in encryption key fetch (and derivation) operations. +// 2. Time spent encrypting the buffer (doesn't include key lookup time); also records the throughput in MB/sec. +// 3. Time spent decrypting the buffer (doesn't include key lookup time); also records the throughput in MB/sec. 
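Putting the workflow described above together, here is a minimal sketch of one encrypt/decrypt round trip against the cipher key cache, written against the API as it appears in this patch (BlobCipherKeyCache, EncryptBlobCipherAes265Ctr, DecryptBlobCipherAes256Ctr). The domain id, base cipher id, and payload size are made-up illustration values, return types are left to auto, and the fragment is not a standalone program:

// Illustrative only; mirrors the per-iteration steps the workload performs.
uint8_t baseCipher[AES_256_KEY_LENGTH];
generateRandomData(baseCipher, AES_256_KEY_LENGTH);

auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
cipherKeyCache.insertCipherKey(/*domainId*/ 1, /*baseCipherId*/ 100, baseCipher, AES_256_KEY_LENGTH);

// Step-1: fetch (and implicitly derive) the latest cipher key for the encryption domain.
auto cipherKey = cipherKeyCache.getLatestCipherKey(/*domainId*/ 1);

// Step-2: encrypt a payload; the header records {domainId, baseCipherId, salt, iv, checksum}.
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);
uint8_t payload[4096];
generateRandomData(&payload[0], sizeof(payload));

Arena arena;
BlobCipherEncryptHeader header;
EncryptBlobCipherAes265Ctr encryptor(cipherKey, &iv[0], AES_256_IV_LENGTH);
auto encrypted = encryptor.encrypt(&payload[0], sizeof(payload), &header, arena);

// Step-3: decrypt using only the header to re-locate the cipher key (what a later reader would do).
auto keyFromHeader = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
DecryptBlobCipherAes256Ctr decryptor(keyFromHeader, &header.iv[0]);
auto decrypted = decryptor.decrypt(encrypted->begin(), sizeof(payload), header, arena);
ASSERT(memcmp(decrypted->begin(), &payload[0], sizeof(payload)) == 0);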
+ struct EncryptionOpsWorkload : TestWorkload { int mode; int64_t numIterations; int pageSize; int maxBufSize; std::unique_ptr buff; - std::unique_ptr validationBuff; - StreamCipher::IV iv; - std::unique_ptr hmacGenerator; - std::unique_ptr parentKey; Arena arena; std::unique_ptr metrics; + BlobCipherDomainId minDomainId; + BlobCipherDomainId maxDomainId; + BlobCipherBaseKeyId minBaseCipherId; + EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { mode = getOption(options, LiteralStringRef("fixedSize"), 1); numIterations = getOption(options, LiteralStringRef("numIterations"), 10); pageSize = getOption(options, LiteralStringRef("pageSize"), 4096); maxBufSize = getOption(options, LiteralStringRef("maxBufSize"), 512 * 1024); buff = std::make_unique(maxBufSize); - validationBuff = std::make_unique(maxBufSize); - iv = getRandomIV(); - hmacGenerator = std::make_unique(); - parentKey = std::make_unique(AES_256_KEY_LENGTH); - generateRandomData(parentKey.get(), AES_256_KEY_LENGTH); + // assign unique encryptionDomainId range per workload clients + minDomainId = wcx.clientId * 100 + mode * 30 + 1; + maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5; + minBaseCipherId = 100; metrics = std::make_unique(); - TraceEvent("EncryptionOpsWorkload").detail("Mode", getModeStr()); + TraceEvent("EncryptionOpsWorkload") + .detail("Mode", getModeStr()) + .detail("MinDomainId", minDomainId) + .detail("MaxDomainId", maxDomainId); } - bool isFixedSizePayload() { return mode == 1; } + ~EncryptionOpsWorkload() { TraceEvent("EncryptionOpsWorkload_Done").log(); } - StreamCipher::IV getRandomIV() { - generateRandomData(iv.data(), iv.size()); - return iv; - } + bool isFixedSizePayload() { return mode == 1; } std::string getModeStr() const { if (mode == 1) { @@ -127,47 +154,97 @@ struct EncryptionOpsWorkload : TestWorkload { throw internal_error(); } - void updateEncryptionKey(StreamCipherKey* cipherKey) { - auto start = std::chrono::high_resolution_clock::now(); - applyHmacKeyDerivationFunc(cipherKey, hmacGenerator.get(), arena); - auto end = std::chrono::high_resolution_clock::now(); + void generateRandomBaseCipher(const int maxLen, uint8_t* buff, int* retLen) { + memset(buff, 0, maxLen); + *retLen = deterministicRandom()->randomInt(maxLen / 2, maxLen); + generateRandomData(buff, *retLen); + } + + void setupCipherEssentials() { + auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); + + TraceEvent("SetupCipherEssentials_Start").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId); + + uint8_t buff[AES_256_KEY_LENGTH]; + std::vector> cipherKeys; + for (BlobCipherDomainId id = minDomainId; id <= maxDomainId; id++) { + int cipherLen = 0; + generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen); + cipherKeyCache.insertCipherKey(id, minBaseCipherId, buff, cipherLen); + + ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH); - metrics->updateKeyDerivationTime(std::chrono::duration(end - start).count()); + cipherKeys = cipherKeyCache.getAllCiphers(id); + ASSERT(cipherKeys.size() == 1); + } + + TraceEvent("SetupCipherEssentials_Done").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId); + } + + void resetCipherEssentials() { + auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); + cipherKeyCache.cleanup(); + + TraceEvent("ResetCipherEssentials_Done").log(); } - StringRef doEncryption(const StreamCipherKey* key, uint8_t* payload, int len) { - EncryptionStreamCipher encryptor(key, iv); + void updateLatestBaseCipher(const 
BlobCipherDomainId encryptDomainId, + uint8_t* baseCipher, + int* baseCipherLen, + BlobCipherBaseKeyId* nextBaseCipherId) { + auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); + Reference cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId); + *nextBaseCipherId = cipherKey->getBaseCipherId() + 1; + + generateRandomBaseCipher(AES_256_KEY_LENGTH, baseCipher, baseCipherLen); + + ASSERT(*baseCipherLen > 0 && *baseCipherLen <= AES_256_KEY_LENGTH); + TraceEvent("UpdateBaseCipher").detail("DomainId", encryptDomainId).detail("BaseCipherId", *nextBaseCipherId); + } + + Reference doEncryption(Reference key, + uint8_t* payload, + int len, + BlobCipherEncryptHeader* header) { + uint8_t iv[AES_256_IV_LENGTH]; + generateRandomData(&iv[0], AES_256_IV_LENGTH); + EncryptBlobCipherAes265Ctr encryptor(key, &iv[0], AES_256_IV_LENGTH); auto start = std::chrono::high_resolution_clock::now(); - auto encrypted = encryptor.encrypt(buff.get(), len, arena); - encryptor.finish(arena); + Reference encrypted = encryptor.encrypt(payload, len, header, arena); auto end = std::chrono::high_resolution_clock::now(); // validate encrypted buffer size and contents (not matching with plaintext) - ASSERT(encrypted.size() == len); - std::copy(encrypted.begin(), encrypted.end(), validationBuff.get()); - ASSERT(memcmp(validationBuff.get(), buff.get(), len) != 0); + ASSERT(encrypted->getLogicalSize() == len); + ASSERT(memcmp(encrypted->begin(), payload, len) != 0); + ASSERT(header->flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); metrics->updateEncryptionTime(std::chrono::duration(end - start).count()); return encrypted; } - void doDecryption(const StreamCipherKey* key, - const StringRef& encrypted, + void doDecryption(Reference encrypted, int len, + const BlobCipherEncryptHeader& header, uint8_t* originalPayload, - uint8_t* validationBuff) { - DecryptionStreamCipher decryptor(key, iv); + Reference orgCipherKey) { + ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR); + + auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); + Reference cipherKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId); + ASSERT(cipherKey.isValid()); + ASSERT(cipherKey->isEqual(orgCipherKey)); + + DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]); auto start = std::chrono::high_resolution_clock::now(); - Standalone decrypted = decryptor.decrypt(encrypted.begin(), len, arena); - decryptor.finish(arena); + Reference decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena); auto end = std::chrono::high_resolution_clock::now(); // validate decrypted buffer size and contents (matching with original plaintext) - ASSERT(decrypted.size() == len); - std::copy(decrypted.begin(), decrypted.end(), validationBuff); - ASSERT(memcmp(validationBuff, originalPayload, len) == 0); + ASSERT(decrypted->getLogicalSize() == len); + ASSERT(memcmp(decrypted->begin(), originalPayload, len) == 0); metrics->updateDecryptionTime(std::chrono::duration(end - start).count()); } @@ -177,22 +254,64 @@ struct EncryptionOpsWorkload : TestWorkload { std::string description() const override { return "EncryptionOps"; } Future start(Database const& cx) override { + uint8_t baseCipher[AES_256_KEY_LENGTH]; + int baseCipherLen = 0; + BlobCipherBaseKeyId nextBaseCipherId; + + // Setup encryptDomainIds and corresponding baseCipher details + setupCipherEssentials(); + for (int i = 0; i < 
numIterations; i++) { - StreamCipherKey key(AES_256_KEY_LENGTH); - // derive the encryption key - updateEncryptionKey(&key); + bool updateBaseCipher = deterministicRandom()->randomInt(1, 100) < 5; + + // Step-1: Encryption key derivation, caching the cipher for later use + auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); + + // randomly select a domainId + const BlobCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId); + ASSERT(encryptDomainId >= minDomainId && encryptDomainId <= maxDomainId); + + if (updateBaseCipher) { + // simulate baseCipherId getting refreshed/updated + updateLatestBaseCipher(encryptDomainId, &baseCipher[0], &baseCipherLen, &nextBaseCipherId); + cipherKeyCache.insertCipherKey(encryptDomainId, nextBaseCipherId, &baseCipher[0], baseCipherLen); + } + + auto start = std::chrono::high_resolution_clock::now(); + Reference cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId); + auto end = std::chrono::high_resolution_clock::now(); + metrics->updateKeyDerivationTime(std::chrono::duration(end - start).count()); + + // Validate sanity of "getLatestCipher", especially when baseCipher gets updated + if (updateBaseCipher) { + ASSERT(cipherKey->getBaseCipherId() == nextBaseCipherId); + ASSERT(cipherKey->getBaseCipherLen() == baseCipherLen); + ASSERT(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0); + } int dataLen = isFixedSizePayload() ? pageSize : deterministicRandom()->randomInt(100, maxBufSize); generateRandomData(buff.get(), dataLen); - // encrypt the payload - const auto& encrypted = doEncryption(&key, buff.get(), dataLen); - - // decrypt the payload - doDecryption(&key, encrypted, dataLen, buff.get(), validationBuff.get()); + // Encrypt the payload - generates BlobCipherEncryptHeader to assist decryption later + BlobCipherEncryptHeader header; + try { + Reference encrypted = doEncryption(cipherKey, buff.get(), dataLen, &header); + + // Decrypt the payload - parses the BlobCipherEncryptHeader, fetch corresponding cipherKey and + // decrypt + doDecryption(encrypted, dataLen, header, buff.get(), cipherKey); + } catch (Error& e) { + TraceEvent("Failed") + .detail("DomainId", encryptDomainId) + .detail("BaseCipherId", cipherKey->getBaseCipherId()); + throw; + } metrics->updateBytes(dataLen); } + + // Cleanup cipherKeys + resetCipherEssentials(); return Void(); } diff --git a/fdbserver/workloads/IncrementalBackup.actor.cpp b/fdbserver/workloads/IncrementalBackup.actor.cpp index 688387023e4..e40133ffd0f 100644 --- a/fdbserver/workloads/IncrementalBackup.actor.cpp +++ b/fdbserver/workloads/IncrementalBackup.actor.cpp @@ -98,12 +98,12 @@ struct IncrementalBackupWorkload : TestWorkload { if (!backupContainer.isValid()) { TraceEvent("IBackupCheckListContainersAttempt").log(); state std::vector containers = - wait(IBackupContainer::listContainers(self->backupDir.toString())); + wait(IBackupContainer::listContainers(self->backupDir.toString(), {})); TraceEvent("IBackupCheckListContainersSuccess") .detail("Size", containers.size()) .detail("First", containers.front()); if (containers.size()) { - backupContainer = IBackupContainer::openContainer(containers.front()); + backupContainer = IBackupContainer::openContainer(containers.front(), {}, {}); } } state bool e = wait(backupContainer->exists()); @@ -152,6 +152,7 @@ struct IncrementalBackupWorkload : TestWorkload { try { wait(self->backupAgent.submitBackup(cx, self->backupDir, + {}, 0, 1e8, self->tag.toString(), @@ -219,7 +220,7 @@ struct IncrementalBackupWorkload 
: TestWorkload { } TraceEvent("IBackupStartListContainersAttempt").log(); state std::vector containers = - wait(IBackupContainer::listContainers(self->backupDir.toString())); + wait(IBackupContainer::listContainers(self->backupDir.toString(), {})); TraceEvent("IBackupStartListContainersSuccess") .detail("Size", containers.size()) .detail("First", containers.front()); @@ -229,6 +230,7 @@ struct IncrementalBackupWorkload : TestWorkload { cx, Key(self->tag.toString()), backupURL, + {}, WaitForComplete::True, invalidVersion, Verbose::True, diff --git a/fdbserver/workloads/RestoreBackup.actor.cpp b/fdbserver/workloads/RestoreBackup.actor.cpp index c7122bc107f..c08fc7de703 100644 --- a/fdbserver/workloads/RestoreBackup.actor.cpp +++ b/fdbserver/workloads/RestoreBackup.actor.cpp @@ -114,6 +114,7 @@ struct RestoreBackupWorkload final : TestWorkload { cx, self->tag, Key(self->backupContainer->getURL()), + self->backupContainer->getProxy(), WaitForComplete::True, ::invalidVersion, Verbose::True))); diff --git a/fdbserver/workloads/RestoreFromBlob.actor.cpp b/fdbserver/workloads/RestoreFromBlob.actor.cpp index 482f22ded45..9d072bb731a 100644 --- a/fdbserver/workloads/RestoreFromBlob.actor.cpp +++ b/fdbserver/workloads/RestoreFromBlob.actor.cpp @@ -61,8 +61,8 @@ struct RestoreFromBlobWorkload : TestWorkload { restoreRanges.push_back_deep(restoreRanges.arena(), normalKeys); wait(delay(self->restoreAfter)); - Version v = - wait(backupAgent.restore(cx, {}, self->backupTag, self->backupURL, restoreRanges, self->waitForComplete)); + Version v = wait( + backupAgent.restore(cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete)); return Void(); } diff --git a/fdbserver/workloads/SaveAndKill.actor.cpp b/fdbserver/workloads/SaveAndKill.actor.cpp index 7488db55978..316d9b13c92 100644 --- a/fdbserver/workloads/SaveAndKill.actor.cpp +++ b/fdbserver/workloads/SaveAndKill.actor.cpp @@ -37,11 +37,9 @@ struct SaveAndKillWorkload : TestWorkload { int isRestoring; SaveAndKillWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { - restartInfo = - getOption(options, LiteralStringRef("restartInfoLocation"), LiteralStringRef("simfdb/restartInfo.ini")) - .toString(); - testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0); - isRestoring = getOption(options, LiteralStringRef("isRestoring"), 0); + restartInfo = getOption(options, "restartInfoLocation"_sr, "simfdb/restartInfo.ini"_sr).toString(); + testDuration = getOption(options, "testDuration"_sr, 10.0); + isRestoring = getOption(options, "isRestoring"_sr, 0); } std::string description() const override { return "SaveAndKillWorkload"; } @@ -70,23 +68,22 @@ struct SaveAndKillWorkload : TestWorkload { std::vector processes = g_simulator.getAllProcesses(); std::map rebootingProcesses = g_simulator.currentlyRebootingProcesses; - std::map allProcessesMap = - std::map(); - for (auto it = rebootingProcesses.begin(); it != rebootingProcesses.end(); it++) { - if (allProcessesMap.find(it->second->dataFolder) == allProcessesMap.end()) - allProcessesMap[it->second->dataFolder] = it->second; + std::map allProcessesMap; + for (const auto& [_, process] : rebootingProcesses) { + if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end()) { + allProcessesMap[process->dataFolder] = process; + } } - for (auto it = processes.begin(); it != processes.end(); it++) { - if (allProcessesMap.find((*it)->dataFolder) == allProcessesMap.end()) - allProcessesMap[(*it)->dataFolder] = *it; + for (const auto& process : processes) { + if 
(allProcessesMap.find(process->dataFolder) == allProcessesMap.end()) { + allProcessesMap[process->dataFolder] = process; + } } ini.SetValue("META", "processCount", format("%d", allProcessesMap.size() - 1).c_str()); std::map machines; int j = 0; - for (auto processIterator = allProcessesMap.begin(); processIterator != allProcessesMap.end(); - processIterator++) { - ISimulator::ProcessInfo* process = processIterator->second; + for (const auto& [_, process] : allProcessesMap) { std::string machineId = printable(process->locality.machineId()); const char* machineIdString = machineId.c_str(); if (strcmp(process->name, "TestSystem") != 0) { diff --git a/fdbserver/workloads/SubmitBackup.actor.cpp b/fdbserver/workloads/SubmitBackup.actor.cpp index 50759bf014a..aa4dd13d9bd 100644 --- a/fdbserver/workloads/SubmitBackup.actor.cpp +++ b/fdbserver/workloads/SubmitBackup.actor.cpp @@ -57,6 +57,7 @@ struct SubmitBackupWorkload final : TestWorkload { try { wait(self->backupAgent.submitBackup(cx, self->backupDir, + {}, self->initSnapshotInterval, self->snapshotInterval, self->tag.toString(), diff --git a/fdbserver/workloads/UnitTests.actor.cpp b/fdbserver/workloads/UnitTests.actor.cpp index f337b965144..3b2bf42fd76 100644 --- a/fdbserver/workloads/UnitTests.actor.cpp +++ b/fdbserver/workloads/UnitTests.actor.cpp @@ -30,6 +30,7 @@ void forceLinkMemcpyTests(); void forceLinkMemcpyPerfTests(); #if (!defined(TLS_DISABLED) && !defined(_WIN32)) void forceLinkStreamCipherTests(); +void forceLinkBlobCipherTests(); #endif void forceLinkParallelStreamTests(); void forceLinkSimExternalConnectionTests(); @@ -76,6 +77,7 @@ struct UnitTestWorkload : TestWorkload { forceLinkMemcpyPerfTests(); #if (!defined(TLS_DISABLED) && !defined(_WIN32)) forceLinkStreamCipherTests(); + forceLinkBlobCipherTests(); #endif forceLinkParallelStreamTests(); forceLinkSimExternalConnectionTests(); diff --git a/flow/Arena.cpp b/flow/Arena.cpp index 2ab44eab858..4fb563c721c 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -342,23 +342,23 @@ ArenaBlock* ArenaBlock::create(int dataSize, Reference& next) { b->bigSize = 256; INSTRUMENT_ALLOCATE("Arena256"); } else if (reqSize <= 512) { - b = (ArenaBlock*)FastAllocator<512>::allocate(); + b = (ArenaBlock*)new uint8_t[512]; b->bigSize = 512; INSTRUMENT_ALLOCATE("Arena512"); } else if (reqSize <= 1024) { - b = (ArenaBlock*)FastAllocator<1024>::allocate(); + b = (ArenaBlock*)new uint8_t[1024]; b->bigSize = 1024; INSTRUMENT_ALLOCATE("Arena1024"); } else if (reqSize <= 2048) { - b = (ArenaBlock*)FastAllocator<2048>::allocate(); + b = (ArenaBlock*)new uint8_t[2048]; b->bigSize = 2048; INSTRUMENT_ALLOCATE("Arena2048"); } else if (reqSize <= 4096) { - b = (ArenaBlock*)FastAllocator<4096>::allocate(); + b = (ArenaBlock*)new uint8_t[4096]; b->bigSize = 4096; INSTRUMENT_ALLOCATE("Arena4096"); } else { - b = (ArenaBlock*)FastAllocator<8192>::allocate(); + b = (ArenaBlock*)new uint8_t[8192]; b->bigSize = 8192; INSTRUMENT_ALLOCATE("Arena8192"); } @@ -460,26 +460,26 @@ void ArenaBlock::destroyLeaf() { FastAllocator<256>::release(this); INSTRUMENT_RELEASE("Arena256"); } else if (bigSize <= 512) { - FastAllocator<512>::release(this); + delete[] reinterpret_cast(this); INSTRUMENT_RELEASE("Arena512"); } else if (bigSize <= 1024) { - FastAllocator<1024>::release(this); + delete[] reinterpret_cast(this); INSTRUMENT_RELEASE("Arena1024"); } else if (bigSize <= 2048) { - FastAllocator<2048>::release(this); + delete[] reinterpret_cast(this); INSTRUMENT_RELEASE("Arena2048"); } else if (bigSize <= 4096) { - 
FastAllocator<4096>::release(this); + delete[] reinterpret_cast(this); INSTRUMENT_RELEASE("Arena4096"); } else if (bigSize <= 8192) { - FastAllocator<8192>::release(this); + delete[] reinterpret_cast(this); INSTRUMENT_RELEASE("Arena8192"); } else { #ifdef ALLOC_INSTRUMENTATION allocInstr["ArenaHugeKB"].dealloc((bigSize + 1023) >> 10); #endif g_hugeArenaMemory.fetch_sub(bigSize); - delete[](uint8_t*) this; + delete[] reinterpret_cast(this); } } } diff --git a/flow/BlobCipher.cpp b/flow/BlobCipher.cpp new file mode 100644 index 00000000000..a909b783a6a --- /dev/null +++ b/flow/BlobCipher.cpp @@ -0,0 +1,652 @@ +/* + * BlobCipher.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/BlobCipher.h" +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/IRandom.h" +#include "flow/ITrace.h" +#include "flow/network.h" +#include "flow/Trace.h" +#include "flow/UnitTest.h" + +#include +#include + +#if ENCRYPTION_ENABLED + +// BlobCipherEncryptHeader +BlobCipherEncryptHeader::BlobCipherEncryptHeader() { + flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_NONE; +} + +// BlobCipherKey class methods + +BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId, + const BlobCipherBaseKeyId& baseCiphId, + const uint8_t* baseCiph, + int baseCiphLen) { + BlobCipherRandomSalt salt; + if (g_network->isSimulated()) { + salt = deterministicRandom()->randomUInt64(); + } else { + salt = nondeterministicRandom()->randomUInt64(); + } + initKey(domainId, baseCiph, baseCiphLen, baseCiphId, salt); + /*TraceEvent("BlobCipherKey") + .detail("DomainId", domainId) + .detail("BaseCipherId", baseCipherId) + .detail("BaseCipherLen", baseCipherLen) + .detail("RandomSalt", randomSalt) + .detail("CreationTime", creationTime);*/ +} + +void BlobCipherKey::initKey(const BlobCipherDomainId& domainId, + const uint8_t* baseCiph, + int baseCiphLen, + const BlobCipherBaseKeyId& baseCiphId, + const BlobCipherRandomSalt& salt) { + // Set the base encryption key properties + baseCipher = std::make_unique(AES_256_KEY_LENGTH); + memset(baseCipher.get(), 0, AES_256_KEY_LENGTH); + memcpy(baseCipher.get(), baseCiph, std::min(baseCiphLen, AES_256_KEY_LENGTH)); + baseCipherLen = baseCiphLen; + baseCipherId = baseCiphId; + // Set the encryption domain for the base encryption key + encryptDomainId = domainId; + randomSalt = salt; + // derive the encryption key + cipher = std::make_unique(AES_256_KEY_LENGTH); + memset(cipher.get(), 0, AES_256_KEY_LENGTH); + applyHmacSha256Derivation(); + // update the key creation time + creationTime = now(); +} + +void BlobCipherKey::applyHmacSha256Derivation() { + Arena arena; + uint8_t buf[baseCipherLen + sizeof(BlobCipherRandomSalt)]; + memcpy(&buf[0], baseCipher.get(), baseCipherLen); + memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(BlobCipherRandomSalt)); + HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen); 
+ StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(BlobCipherRandomSalt), arena); + std::copy(digest.begin(), digest.end(), cipher.get()); + if (digest.size() < AES_256_KEY_LENGTH) { + memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size()); + } +} + +void BlobCipherKey::reset() { + memset(baseCipher.get(), 0, baseCipherLen); + memset(cipher.get(), 0, AES_256_KEY_LENGTH); +} + +// BlobKeyIdCache class methods + +BlobCipherKeyIdCache::BlobCipherKeyIdCache() + : domainId(INVALID_DOMAIN_ID), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {} + +BlobCipherKeyIdCache::BlobCipherKeyIdCache(BlobCipherDomainId dId) + : domainId(dId), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) { + TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId); +} + +Reference BlobCipherKeyIdCache::getLatestCipherKey() { + return getCipherByBaseCipherId(latestBaseCipherKeyId); +} + +Reference BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId) { + BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherKeyId); + if (itr == keyIdCache.end()) { + throw encrypt_key_not_found(); + } + return itr->second; +} + +void BlobCipherKeyIdCache::insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen) { + ASSERT(baseCipherId > INVALID_CIPHER_KEY_ID); + + // BaseCipherKeys are immutable, ensure that cached value doesn't get updated. + BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherId); + if (itr != keyIdCache.end()) { + if (memcmp(itr->second->rawBaseCipher(), baseCipher, baseCipherLen) == 0) { + TraceEvent("InsertBaseCipherKey_AlreadyPresent") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); + // Key is already present; nothing more to do. 
+ return; + } else { + TraceEvent("InsertBaseCipherKey_UpdateCipher") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); + throw encrypt_update_cipher(); + } + } + + keyIdCache.emplace(baseCipherId, makeReference(domainId, baseCipherId, baseCipher, baseCipherLen)); + // Update the latest BaseCipherKeyId for the given encryption domain + latestBaseCipherKeyId = baseCipherId; +} + +void BlobCipherKeyIdCache::cleanup() { + for (auto& keyItr : keyIdCache) { + keyItr.second->reset(); + } + + keyIdCache.clear(); +} + +std::vector> BlobCipherKeyIdCache::getAllCipherKeys() { + std::vector> cipherKeys; + for (auto& keyItr : keyIdCache) { + cipherKeys.push_back(keyItr.second); + } + return cipherKeys; +} + +// BlobCipherKeyCache class methods + +void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId, + const BlobCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen) { + if (domainId == INVALID_DOMAIN_ID || baseCipherId == INVALID_CIPHER_KEY_ID) { + throw encrypt_invalid_id(); + } + + try { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + // Add mapping to track new encryption domain + Reference keyIdCache = makeReference(domainId); + keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen); + domainCacheMap.emplace(domainId, keyIdCache); + } else { + // Track new baseCipher keys + Reference keyIdCache = domainItr->second; + keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen); + } + + TraceEvent("InsertCipherKey").detail("DomainId", domainId).detail("BaseCipherKeyId", baseCipherId); + } catch (Error& e) { + TraceEvent("InsertCipherKey_Failed").detail("BaseCipherKeyId", baseCipherId).detail("DomainId", domainId); + throw; + } +} + +Reference BlobCipherKeyCache::getLatestCipherKey(const BlobCipherDomainId& domainId) { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId); + throw encrypt_key_not_found(); + } + + Reference keyIdCache = domainItr->second; + Reference cipherKey = keyIdCache->getLatestCipherKey(); + if ((now() - cipherKey->getCreationTime()) > BlobCipherKeyCache::CIPHER_KEY_CACHE_TTL_SEC) { + TraceEvent("GetLatestCipherKey_ExpiredTTL") + .detail("DomainId", domainId) + .detail("BaseCipherId", cipherKey->getBaseCipherId()); + throw encrypt_key_ttl_expired(); + } + + return cipherKey; +} + +Reference BlobCipherKeyCache::getCipherKey(const BlobCipherDomainId& domainId, + const BlobCipherBaseKeyId& baseCipherId) { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + throw encrypt_key_not_found(); + } + + Reference keyIdCache = domainItr->second; + return keyIdCache->getCipherByBaseCipherId(baseCipherId); +} + +void BlobCipherKeyCache::resetEncyrptDomainId(const BlobCipherDomainId domainId) { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + throw encrypt_key_not_found(); + } + + Reference keyIdCache = domainItr->second; + keyIdCache->cleanup(); + TraceEvent("ResetEncryptDomainId").detail("DomainId", domainId); +} + +void BlobCipherKeyCache::cleanup() noexcept { + BlobCipherKeyCache& instance = BlobCipherKeyCache::getInstance(); + for (auto& domainItr : instance.domainCacheMap) { + Reference keyIdCache = domainItr.second; + keyIdCache->cleanup(); + TraceEvent("BlobCipherKeyCache_Cleanup").detail("DomainId", domainItr.first); + } + + 
instance.domainCacheMap.clear(); +} + +std::vector> BlobCipherKeyCache::getAllCiphers(const BlobCipherDomainId& domainId) { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + return {}; + } + + Reference keyIdCache = domainItr->second; + return keyIdCache->getAllCipherKeys(); +} + +// EncryptBlobCipher class methods + +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference key, + const uint8_t* cipherIV, + const int ivLen) + : ctx(EVP_CIPHER_CTX_new()), cipherKey(key) { + ASSERT(ivLen == AES_256_IV_LENGTH); + memcpy(&iv[0], cipherIV, ivLen); + + if (ctx == nullptr) { + throw encrypt_ops_error(); + } + if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) { + throw encrypt_ops_error(); + } + if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), cipherIV) != 1) { + throw encrypt_ops_error(); + } +} + +Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plaintext, + const int plaintextLen, + BlobCipherEncryptHeader* header, + Arena& arena) { + TEST(true); // Encrypting data with BlobCipher + + Reference encryptBuf = makeReference(plaintextLen + AES_BLOCK_SIZE, arena); + uint8_t* ciphertext = encryptBuf->begin(); + int bytes{ 0 }; + if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) { + TraceEvent("Encrypt_UpdateFailed") + .detail("BaseCipherId", cipherKey->getBaseCipherId()) + .detail("EncryptDomainId", cipherKey->getDomainId()); + throw encrypt_ops_error(); + } + + int finalBytes{ 0 }; + if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) { + TraceEvent("Encrypt_FinalFailed") + .detail("BaseCipherId", cipherKey->getBaseCipherId()) + .detail("EncryptDomainId", cipherKey->getDomainId()); + throw encrypt_ops_error(); + } + + if ((bytes + finalBytes) != plaintextLen) { + TraceEvent("Encrypt_UnexpectedCipherLen") + .detail("PlaintextLen", plaintextLen) + .detail("EncryptedBufLen", bytes + finalBytes); + throw encrypt_ops_error(); + } + + // populate header details for the encrypted blob. + header->flags.size = sizeof(BlobCipherEncryptHeader); + header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION; + header->flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR; + header->baseCipherId = cipherKey->getBaseCipherId(); + header->encryptDomainId = cipherKey->getDomainId(); + header->salt = cipherKey->getSalt(); + memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH); + + // Preserve checksum of encrypted bytes in the header; approach protects against disk induced bit-rot/flip + // scenarios. AES CTR mode doesn't generate 'tag' by default as with schemes such as: AES 256 GCM. 
+ + header->ciphertextChecksum = computeEncryptChecksum(ciphertext, bytes + finalBytes, cipherKey->getSalt(), arena); + + encryptBuf->setLogicalSize(plaintextLen); + return encryptBuf; +} + +EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() { + if (ctx != nullptr) { + EVP_CIPHER_CTX_free(ctx); + } +} + +// DecryptBlobCipher class methods + +DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference key, const uint8_t* iv) + : ctx(EVP_CIPHER_CTX_new()) { + if (ctx == nullptr) { + throw encrypt_ops_error(); + } + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) { + throw encrypt_ops_error(); + } + if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), iv)) { + throw encrypt_ops_error(); + } +} + +void DecryptBlobCipherAes256Ctr::verifyEncryptBlobHeader(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena& arena) { + // validate header flag sanity + if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION || + header.flags.encryptMode != BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR) { + TraceEvent("VerifyEncryptBlobHeader") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderMode", header.flags.encryptMode) + .detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION) + .detail("ExpectedMode", BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR); + throw encrypt_header_metadata_mismatch(); + } + + // encrypted byte checksum sanity; protection against data bit-rot/flip. + BlobCipherChecksum computed = computeEncryptChecksum(ciphertext, ciphertextLen, header.salt, arena); + if (computed != header.ciphertextChecksum) { + TraceEvent("VerifyEncryptBlobHeader_ChecksumMismatch") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderMode", header.flags.encryptMode) + .detail("CiphertextChecksum", header.ciphertextChecksum) + .detail("ComputedCiphertextChecksum", computed); + throw encrypt_header_checksum_mismatch(); + } +} + +Reference DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena& arena) { + TEST(true); // Decrypting data with BlobCipher + + verifyEncryptBlobHeader(ciphertext, ciphertextLen, header, arena); + + Reference decrypted = makeReference(ciphertextLen + AES_BLOCK_SIZE, arena); + uint8_t* plaintext = decrypted->begin(); + int bytesDecrypted{ 0 }; + if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) { + TraceEvent("Decrypt_UpdateFailed") + .detail("BaseCipherId", header.baseCipherId) + .detail("EncryptDomainId", header.encryptDomainId); + throw encrypt_ops_error(); + } + + int finalBlobBytes{ 0 }; + if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) { + TraceEvent("Decrypt_FinalFailed") + .detail("BaseCipherId", header.baseCipherId) + .detail("EncryptDomainId", header.encryptDomainId); + throw encrypt_ops_error(); + } + + if ((bytesDecrypted + finalBlobBytes) != ciphertextLen) { + TraceEvent("Encrypt_UnexpectedPlaintextLen") + .detail("CiphertextLen", ciphertextLen) + .detail("DecryptedBufLen", bytesDecrypted + finalBlobBytes); + throw encrypt_ops_error(); + } + + decrypted->setLogicalSize(ciphertextLen); + return decrypted; +} + +DecryptBlobCipherAes256Ctr::~DecryptBlobCipherAes256Ctr() { + if (ctx != nullptr) { + EVP_CIPHER_CTX_free(ctx); + } +} + +// HmacSha256DigestGen class methods + +HmacSha256DigestGen::HmacSha256DigestGen(const unsigned char* key, size_t len) : 
ctx(HMAC_CTX_new()) { + if (!HMAC_Init_ex(ctx, key, len, EVP_sha256(), nullptr)) { + throw encrypt_ops_error(); + } +} + +HmacSha256DigestGen::~HmacSha256DigestGen() { + if (ctx != nullptr) { + HMAC_CTX_free(ctx); + } +} + +StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Arena& arena) { + TEST(true); // Digest generation + unsigned int digestLen = HMAC_size(ctx); + auto digest = new (arena) unsigned char[digestLen]; + if (HMAC_Update(ctx, data, len) != 1) { + throw encrypt_ops_error(); + } + + if (HMAC_Final(ctx, digest, &digestLen) != 1) { + throw encrypt_ops_error(); + } + return StringRef(digest, digestLen); +} + +// Only used to link unit tests +void forceLinkBlobCipherTests() {} + +// Tests cases includes: +// 1. Populate cache by inserting 'baseCipher' details for new encryptionDomainIds +// 2. Random lookup for cipherKeys and content validation +// 3. Inserting of 'identical' cipherKey (already cached) more than once works as desired. +// 4. Inserting of 'non-identical' cipherKey (already cached) more than once works as desired. +// 5. Validation encryption ops (correctness): +// 5.1. Encyrpt a buffer followed by decryption of the buffer, validate the contents. +// 5.2. Simulate anomolies such as: EncyrptionHeader corruption, checkSum mismatch / encryptionMode mismatch etc. +// 6. Cache cleanup +// 6.1 cleanup cipherKeys by given encryptDomainId +// 6.2. Cleanup all cached cipherKeys +TEST_CASE("flow/BlobCipher") { + TraceEvent("BlobCipherTest_Start").log(); + // Construct a dummy External Key Manager representation and populate with some keys + class BaseCipher : public ReferenceCounted, NonCopyable { + public: + BlobCipherDomainId domainId; + int len; + BlobCipherBaseKeyId keyId; + std::unique_ptr key; + + BaseCipher(const BlobCipherDomainId& dId, const BlobCipherBaseKeyId& kId) + : domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)), + keyId(kId), key(std::make_unique(len)) { + generateRandomData(key.get(), len); + } + }; + + using BaseKeyMap = std::unordered_map>; + using DomainKeyMap = std::unordered_map; + DomainKeyMap domainKeyMap; + const BlobCipherDomainId minDomainId = 1; + const BlobCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5; + const BlobCipherBaseKeyId minBaseCipherKeyId = 100; + const BlobCipherBaseKeyId maxBaseCipherKeyId = + deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15; + for (int dId = minDomainId; dId <= maxDomainId; dId++) { + for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) { + domainKeyMap[dId].emplace(kId, makeReference(dId, kId)); + } + } + ASSERT(domainKeyMap.size() == maxDomainId); + + // insert BlobCipher keys into BlobCipherKeyCache map and validate + TraceEvent("BlobCipherTest_InsertKeys").log(); + BlobCipherKeyCache& cipherKeyCache = BlobCipherKeyCache::getInstance(); + for (auto& domainItr : domainKeyMap) { + for (auto& baseKeyItr : domainItr.second) { + Reference baseCipher = baseKeyItr.second; + + cipherKeyCache.insertCipherKey( + baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len); + } + } + TraceEvent("BlobCipherTest_InsertKeysDone").log(); + + // validate the cipherKey lookups work as desired + for (auto& domainItr : domainKeyMap) { + for (auto& baseKeyItr : domainItr.second) { + Reference baseCipher = baseKeyItr.second; + Reference cipherKey = cipherKeyCache.getCipherKey(baseCipher->domainId, baseCipher->keyId); + 
ASSERT(cipherKey.isValid()); + // validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher + ASSERT(cipherKey->getBaseCipherId() == baseCipher->keyId); + ASSERT(cipherKey->getDomainId() == baseCipher->domainId); + ASSERT(cipherKey->getBaseCipherLen() == baseCipher->len); + // ensure that baseCipher matches with the cached information + ASSERT(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) == 0); + // validate the encryption derivation + ASSERT(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) != 0); + } + } + TraceEvent("BlobCipherTest_LooksupDone").log(); + + // Ensure attemtping to insert existing cipherKey (identical) more than once is treated as a NOP + try { + Reference baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId]; + cipherKeyCache.insertCipherKey(baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len); + } catch (Error& e) { + throw; + } + TraceEvent("BlobCipherTest_ReinsertIdempotentKeyDone").log(); + + // Ensure attemtping to insert an existing cipherKey (modified) fails with appropriate error + try { + Reference baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId]; + uint8_t rawCipher[baseCipher->len]; + memcpy(rawCipher, baseCipher->key.get(), baseCipher->len); + // modify few bytes in the cipherKey + for (int i = 2; i < 5; i++) { + rawCipher[i]++; + } + cipherKeyCache.insertCipherKey(baseCipher->domainId, baseCipher->keyId, &rawCipher[0], baseCipher->len); + } catch (Error& e) { + if (e.code() != error_code_encrypt_update_cipher) { + throw; + } + } + TraceEvent("BlobCipherTest_ReinsertNonIdempotentKeyDone").log(); + + // Validate Encyrption ops + Reference cipherKey = cipherKeyCache.getLatestCipherKey(minDomainId); + const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512; + uint8_t orgData[bufLen]; + generateRandomData(&orgData[0], bufLen); + + Arena arena; + uint8_t iv[AES_256_IV_LENGTH]; + generateRandomData(&iv[0], AES_256_IV_LENGTH); + + // validate basic encrypt followed by decrypt operation + EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT(encrypted->getLogicalSize() == bufLen); + ASSERT(memcmp(&orgData[0], encrypted->begin(), bufLen) != 0); + ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR); + + TraceEvent("BlobCipherTest_EncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("DomainId", header.encryptDomainId) + .detail("BaseCipherId", header.baseCipherId) + .detail("HeaderChecksum", header.ciphertextChecksum); + + Reference encyrptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId); + ASSERT(encyrptKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(encyrptKey, &header.iv[0]); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT(decrypted->getLogicalSize() == bufLen); + ASSERT(memcmp(decrypted->begin(), &orgData[0], bufLen) == 0); + + TraceEvent("BlobCipherTest_DecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + header.flags.headerVersion += 1; + try { + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + } 
catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + header.flags.headerVersion -= 1; + } + + // induce encryption header corruption - encryptionMode corrupted + header.flags.encryptMode += 1; + try { + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + header.flags.encryptMode -= 1; + } + + // induce encryption header corruption - checksum mismatch + header.ciphertextChecksum += 1; + try { + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_checksum_mismatch) { + throw; + } + header.ciphertextChecksum -= 1; + } + + // Validate dropping encyrptDomainId cached keys + const BlobCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId); + cipherKeyCache.resetEncyrptDomainId(candidate); + std::vector> cachedKeys = cipherKeyCache.getAllCiphers(candidate); + ASSERT(cachedKeys.empty()); + + // Validate dropping all cached cipherKeys + cipherKeyCache.cleanup(); + for (int dId = minDomainId; dId < maxDomainId; dId++) { + std::vector> cachedKeys = cipherKeyCache.getAllCiphers(dId); + ASSERT(cachedKeys.empty()); + } + + TraceEvent("BlobCipherTest_Done").log(); + return Void(); +} + +BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload, + const int payloadLen, + const BlobCipherRandomSalt& salt, + Arena& arena) { + // FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate checksum + // Leverage HMAC_SHA256 using header.randomSalt as the initialization 'key' for the hmac digest. + + HmacSha256DigestGen hmacGenerator((const uint8_t*)&salt, sizeof(salt)); + StringRef digest = hmacGenerator.digest(payload, payloadLen, arena); + ASSERT(digest.size() >= sizeof(BlobCipherChecksum)); + + BlobCipherChecksum checksum; + memcpy((uint8_t*)&checksum, digest.begin(), sizeof(BlobCipherChecksum)); + return checksum; +} + +#endif // ENCRYPTION_ENABLED diff --git a/flow/BlobCipher.h b/flow/BlobCipher.h new file mode 100644 index 00000000000..151e60efd06 --- /dev/null +++ b/flow/BlobCipher.h @@ -0,0 +1,321 @@ +/* + * BlobCipher.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include + +#if (!defined(TLS_DISABLED) && !defined(_WIN32)) +#define ENCRYPTION_ENABLED 1 +#else +#define ENCRYPTION_ENABLED 0 +#endif + +#if ENCRYPTION_ENABLED + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/flow.h" +#include "flow/xxhash.h" + +#include +#include +#include +#include +#include + +#define AES_256_KEY_LENGTH 32 +#define AES_256_IV_LENGTH 16 +#define INVALID_DOMAIN_ID 0 +#define INVALID_CIPHER_KEY_ID 0 + +using BlobCipherDomainId = uint64_t; +using BlobCipherRandomSalt = uint64_t; +using BlobCipherBaseKeyId = uint64_t; +using BlobCipherChecksum = uint64_t; + +typedef enum { BLOB_CIPHER_ENCRYPT_MODE_NONE = 0, BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR = 1 } BlockCipherEncryptMode; + +// Encryption operations buffer management +// Approach limits number of copies needed during encryption or decryption operations. +// For encryption EncryptBuf is allocated using client supplied Arena and provided to AES library to capture +// the ciphertext. Similarly, on decryption EncryptBuf is allocated using client supplied Arena and provided +// to the AES library to capture decipher text and passed back to the clients. Given the object passed around +// is reference-counted, it gets freed once refrenceCount goes to 0. + +class EncryptBuf : public ReferenceCounted, NonCopyable { +public: + EncryptBuf(int size, Arena& arena) : allocSize(size), logicalSize(size) { + if (size > 0) { + buffer = new (arena) uint8_t[size]; + } else { + buffer = nullptr; + } + } + + int getLogicalSize() { return logicalSize; } + void setLogicalSize(int value) { + ASSERT(value <= allocSize); + logicalSize = value; + } + uint8_t* begin() { return buffer; } + +private: + int allocSize; + int logicalSize; + uint8_t* buffer; +}; + +// BlobCipher Encryption header format +// This header is persisted along with encrypted buffer, it contains information necessary +// to assist decrypting the buffers to serve read requests. +// +// The total space overhead is 56 bytes. + +#pragma pack(push, 1) // exact fit - no padding +typedef struct BlobCipherEncryptHeader { + union { + struct { + uint8_t size; // reading first byte is sufficient to determine header + // length. ALWAYS THE FIRST HEADER ELEMENT. + uint8_t headerVersion{}; + uint8_t encryptMode{}; + uint8_t _reserved[5]{}; + } flags; + uint64_t _padding{}; + }; + // Encyrption domain boundary identifier. + BlobCipherDomainId encryptDomainId{}; + // BaseCipher encryption key identifier + BlobCipherBaseKeyId baseCipherId{}; + // Random salt + BlobCipherRandomSalt salt{}; + // Checksum of the encrypted buffer. It protects against 'tampering' of ciphertext as well 'bit rots/flips'. + BlobCipherChecksum ciphertextChecksum{}; + // Initialization vector used to encrypt the payload. + uint8_t iv[AES_256_IV_LENGTH]; + + BlobCipherEncryptHeader(); +} BlobCipherEncryptHeader; +#pragma pack(pop) + +// This interface is in-memory representation of CipherKey used for encryption/decryption information. +// It caches base encryption key properties as well as caches the 'derived encryption' key obtained by applying +// HMAC-SHA-256 derivation technique. 
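// Editorial sketch (not part of the patch): because BlobCipherEncryptHeader above is packed and
// flags.size is always its first byte, a reader of a persisted blob can peek a single byte to
// learn how much header to copy before interpreting the remaining fields. The function name and
// the 'persisted' argument are hypothetical and for illustration only; it assumes <cstring> is
// available via the includes above. The BlobCipherKey class described in the preceding comment
// follows immediately below.

inline BlobCipherEncryptHeader peekEncryptHeader(const uint8_t* persisted) {
	const uint8_t headerSize = persisted[0]; // flags.size - ALWAYS THE FIRST HEADER ELEMENT
	// For ENCRYPT_HEADER_VERSION 1, encrypt() writes sizeof(BlobCipherEncryptHeader) here.
	ASSERT(headerSize == sizeof(BlobCipherEncryptHeader));
	BlobCipherEncryptHeader header;
	memcpy(&header, persisted, headerSize);
	return header;
}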
+ +class BlobCipherKey : public ReferenceCounted, NonCopyable { +public: + BlobCipherKey(const BlobCipherDomainId& domainId, + const BlobCipherBaseKeyId& baseCiphId, + const uint8_t* baseCiph, + int baseCiphLen); + + uint8_t* data() const { return cipher.get(); } + uint64_t getCreationTime() const { return creationTime; } + BlobCipherDomainId getDomainId() const { return encryptDomainId; } + BlobCipherRandomSalt getSalt() const { return randomSalt; } + BlobCipherBaseKeyId getBaseCipherId() const { return baseCipherId; } + int getBaseCipherLen() const { return baseCipherLen; } + uint8_t* rawCipher() const { return cipher.get(); } + uint8_t* rawBaseCipher() const { return baseCipher.get(); } + bool isEqual(const Reference toCompare) { + return encryptDomainId == toCompare->getDomainId() && baseCipherId == toCompare->getBaseCipherId() && + randomSalt == toCompare->getSalt() && baseCipherLen == toCompare->getBaseCipherLen() && + memcmp(cipher.get(), toCompare->rawCipher(), AES_256_KEY_LENGTH) == 0 && + memcmp(baseCipher.get(), toCompare->rawBaseCipher(), baseCipherLen) == 0; + } + void reset(); + +private: + // Encryption domain boundary identifier + BlobCipherDomainId encryptDomainId; + // Base encryption cipher key properties + std::unique_ptr baseCipher; + int baseCipherLen; + BlobCipherBaseKeyId baseCipherId; + // Random salt used for encryption cipher key derivation + BlobCipherRandomSalt randomSalt; + // Creation timestamp for the derived encryption cipher key + uint64_t creationTime; + // Derived encryption cipher key + std::unique_ptr cipher; + + void initKey(const BlobCipherDomainId& domainId, + const uint8_t* baseCiph, + int baseCiphLen, + const BlobCipherBaseKeyId& baseCiphId, + const BlobCipherRandomSalt& salt); + void applyHmacSha256Derivation(); +}; + +// This interface allows FDB processes participating in encryption to store and +// index recently used encyption cipher keys. FDB encryption has two dimensions: +// 1. Mapping on cipher encryption keys per "encryption domains" +// 2. Per encryption domain, the cipher keys are index using "baseCipherKeyId". +// +// The design supports NIST recommendation of limiting lifetime of an encryption +// key. For details refer to: +// https://csrc.nist.gov/publications/detail/sp/800-57-part-1/rev-3/archive/2012-07-10 +// +// Below gives a pictoral representation of in-memory datastructure implemented +// to index encryption keys: +// { encryptionDomain -> { baseCipherId -> cipherKey } } +// +// Supported cache lookups schemes: +// 1. Lookup cipher based on { encryptionDomainId, baseCipherKeyId } tuple. +// 2. Lookup latest cipher key for a given encryptionDomainId. +// +// Client is responsible to handle cache-miss usecase, the corrective operation +// might vary based on the calling process, for instance: EncryptKeyServer +// cache-miss shall invoke RPC to external Encryption Key Manager to fetch the +// required encryption key, however, CPs/SSs cache-miss would result in RPC to +// EncryptKeyServer to refresh the desired encryption key. + +using BlobCipherKeyIdCacheMap = std::unordered_map>; +using BlobCipherKeyIdCacheMapCItr = std::unordered_map>::const_iterator; + +struct BlobCipherKeyIdCache : ReferenceCounted { +public: + BlobCipherKeyIdCache(); + explicit BlobCipherKeyIdCache(BlobCipherDomainId dId); + + // API returns the last inserted cipherKey. + // If none exists, 'encrypt_key_not_found' is thrown. + Reference getLatestCipherKey(); + // API returns cipherKey corresponding to input 'baseCipherKeyId'. 
+ // If none exists, 'encrypt_key_not_found' is thrown. + Reference getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId); + // API enables inserting base encryption cipher details to the BlobCipherKeyIdCache. + // Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey + // is treated as a NOP (success), however, an attempt to update cipherKey would throw + // 'encrypt_update_cipher' exception. + void insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen); + // API cleanup the cache by dropping all cached cipherKeys + void cleanup(); + // API returns list of all 'cached' cipherKeys + std::vector> getAllCipherKeys(); + +private: + BlobCipherDomainId domainId; + BlobCipherKeyIdCacheMap keyIdCache; + BlobCipherBaseKeyId latestBaseCipherKeyId; +}; + +using BlobCipherDomainCacheMap = std::unordered_map>; + +class BlobCipherKeyCache : NonCopyable { +public: + // Enable clients to insert base encryption cipher details to the BlobCipherKeyCache. + // The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable, + // attempting to re-insert same 'identical' cipherKey is treated as a NOP (success), + // however, an attempt to update cipherKey would throw 'encrypt_update_cipher' exception. + void insertCipherKey(const BlobCipherDomainId& domainId, + const BlobCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen); + // API returns the last insert cipherKey for a given encyryption domain Id. + // If none exists, it would throw 'encrypt_key_not_found' exception. + Reference getLatestCipherKey(const BlobCipherDomainId& domainId); + // API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple. + // If none exists, it would throw 'encrypt_key_not_found' exception. + Reference getCipherKey(const BlobCipherDomainId& domainId, const BlobCipherBaseKeyId& baseCipherId); + // API returns point in time list of all 'cached' cipherKeys for a given encryption domainId. + std::vector> getAllCiphers(const BlobCipherDomainId& domainId); + // API enables dropping all 'cached' cipherKeys for a given encryption domain Id. + // Useful to cleanup cache if an encryption domain gets removed/destroyed etc. + void resetEncyrptDomainId(const BlobCipherDomainId domainId); + + static BlobCipherKeyCache& getInstance() { + static BlobCipherKeyCache instance; + return instance; + } + // Ensures cached encryption key(s) (plaintext) never gets persisted as part + // of FDB process/core dump. + static void cleanup() noexcept; + +private: + BlobCipherDomainCacheMap domainCacheMap; + static constexpr uint64_t CIPHER_KEY_CACHE_TTL_SEC = 10 * 60L; + + BlobCipherKeyCache() {} +}; + +// This interface enables data block encryption. An invocation to encrypt() will +// do two things: +// 1) generate encrypted ciphertext for given plaintext input. +// 2) generate BlobCipherEncryptHeader (including the 'header checksum') and persit for decryption on reads. + +class EncryptBlobCipherAes265Ctr final : NonCopyable, public ReferenceCounted { +public: + static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1; + + EncryptBlobCipherAes265Ctr(Reference key, const uint8_t* iv, const int ivLen); + ~EncryptBlobCipherAes265Ctr(); + Reference encrypt(const uint8_t* plaintext, + const int plaintextLen, + BlobCipherEncryptHeader* header, + Arena&); + +private: + EVP_CIPHER_CTX* ctx; + Reference cipherKey; + uint8_t iv[AES_256_IV_LENGTH]; +}; + +// This interface enable data block decryption. 
An invocation to decrypt() would generate +// 'plaintext' for a given 'ciphertext' input, the caller needs to supply BlobCipherEncryptHeader. + +class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted { +public: + DecryptBlobCipherAes256Ctr(Reference key, const uint8_t* iv); + ~DecryptBlobCipherAes256Ctr(); + Reference decrypt(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena&); + +private: + EVP_CIPHER_CTX* ctx; + + void verifyEncryptBlobHeader(const uint8_t* cipherText, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena& arena); +}; + +class HmacSha256DigestGen final : NonCopyable { +public: + HmacSha256DigestGen(const unsigned char* key, size_t len); + ~HmacSha256DigestGen(); + HMAC_CTX* getCtx() const { return ctx; } + StringRef digest(unsigned char const* data, size_t len, Arena&); + +private: + HMAC_CTX* ctx; +}; + +BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload, + const int payloadLen, + const BlobCipherRandomSalt& salt, + Arena& arena); + +#endif // ENCRYPTION_ENABLED diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 96a8842bbf7..6884e5af783 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -8,6 +8,8 @@ set(FLOW_SRCS ArgParseUtil.h AsioReactor.h BooleanParam.h + BlobCipher.h + BlobCipher.cpp CompressedInt.actor.cpp CompressedInt.h Deque.cpp diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index 935f00e3589..993b0036030 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -210,13 +210,24 @@ class FastAllocated { if (s != sizeof(Object)) abort(); INSTRUMENT_ALLOCATE(typeid(Object).name()); - void* p = FastAllocator < sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object)) > ::allocate(); - return p; + + if constexpr (sizeof(Object) <= 256) { + void* p = FastAllocator < sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object)) > ::allocate(); + return p; + } else { + void* p = new uint8_t[nextFastAllocatedSize(sizeof(Object))]; + return p; + } } static void operator delete(void* s) { INSTRUMENT_RELEASE(typeid(Object).name()); - FastAllocator::release(s); + + if constexpr (sizeof(Object) <= 256) { + FastAllocator::release(s); + } else { + delete[] reinterpret_cast(s); + } } // Redefine placement new so you can still use it static void* operator new(size_t, void* p) { return p; } @@ -236,18 +247,6 @@ class FastAllocated { return FastAllocator<128>::allocate(); if (size <= 256) return FastAllocator<256>::allocate(); - if (size <= 512) - return FastAllocator<512>::allocate(); - if (size <= 1024) - return FastAllocator<1024>::allocate(); - if (size <= 2048) - return FastAllocator<2048>::allocate(); - if (size <= 4096) - return FastAllocator<4096>::allocate(); - if (size <= 8192) - return FastAllocator<8192>::allocate(); - if (size <= 16384) - return FastAllocator<16384>::allocate(); return new uint8_t[size]; } @@ -264,21 +263,11 @@ inline void freeFast(int size, void* ptr) { return FastAllocator<128>::release(ptr); if (size <= 256) return FastAllocator<256>::release(ptr); - if (size <= 512) - return FastAllocator<512>::release(ptr); - if (size <= 1024) - return FastAllocator<1024>::release(ptr); - if (size <= 2048) - return FastAllocator<2048>::release(ptr); - if (size <= 4096) - return FastAllocator<4096>::release(ptr); - if (size <= 8192) - return FastAllocator<8192>::release(ptr); - if (size <= 16384) - return FastAllocator<16384>::release(ptr); delete[](uint8_t*) ptr; } +// Allocate a block of memory aligned to 4096 bytes. 
Size must be a multiple of +// 4096. Guaranteed not to return null. Use freeFast4kAligned to free. [[nodiscard]] inline void* allocateFast4kAligned(int size) { #if !defined(USE_JEMALLOC) // Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc @@ -296,6 +285,7 @@ inline void freeFast(int size, void* ptr) { return result; } +// Free a pointer returned from allocateFast4kAligned(size) inline void freeFast4kAligned(int size, void* ptr) { #if !defined(USE_JEMALLOC) // Sizes supported by FastAllocator must be release via FastAllocator diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 9602fc431e4..20a13ac8c72 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -33,6 +33,7 @@ #if (!defined(TLS_DISABLED) && !defined(_WIN32)) #include "flow/StreamCipher.h" +#include "flow/BlobCipher.h" #endif #include "flow/Trace.h" #include "flow/Error.h" @@ -3501,6 +3502,7 @@ void crashHandler(int sig) { #if (!defined(TLS_DISABLED) && !defined(_WIN32)) StreamCipherKey::cleanup(); StreamCipher::cleanup(); + BlobCipherKeyCache::cleanup(); #endif fflush(stdout); diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 1f82b0be462..7710bca9cab 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -284,6 +284,15 @@ ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported") ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length hex string") +// 3XXX - Encryption operations errors +ERROR( encrypt_ops_error, 3000, "Encryption operation error") +ERROR( encrypt_header_metadata_mismatch, 3001, "Encryption header metadata mismatch") +ERROR( encrypt_key_not_found, 3002, "Expected encryption key is missing") +ERROR( encrypt_key_ttl_expired, 3003, "Expected encryption key TTL has expired") +ERROR( encrypt_header_checksum_mismatch, 3004, "Encryption header checksum mismatch") +ERROR( encrypt_update_cipher, 3005, "Attempt to update encryption cipher key") +ERROR( encrypt_invalid_id, 3006, "Invalid encryption domainId or encryption cipher key id") + // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error ERROR( internal_error, 4100, "An internal error occurred" ) diff --git a/flow/flat_buffers.cpp b/flow/flat_buffers.cpp index eb6daad7870..ea1c1c2dce0 100644 --- a/flow/flat_buffers.cpp +++ b/flow/flat_buffers.cpp @@ -19,6 +19,7 @@ */ #include "flow/flat_buffers.h" +#include "flow/FileIdentifier.h" #include "flow/UnitTest.h" #include "flow/Arena.h" #include "flow/serialize.h" @@ -26,6 +27,7 @@ #include #include +#include #include namespace detail { @@ -361,6 +363,7 @@ struct string_serialized_traits : std::true_type { namespace unit_tests { struct Y1 { + constexpr static FileIdentifier file_identifier = 338229; int a; template @@ -369,6 +372,14 @@ struct Y1 { } }; +struct Y1Hasher { + std::size_t operator()(const Y1& y) const noexcept { return std::hash()(y.a); } +}; + +struct Y1Equal { + bool operator()(const Y1& l, const Y1& r) const { return l.a == r.a; } +}; + struct Y2 { int a; std::variant b; @@ -563,4 +574,43 @@ TEST_CASE("/flow/FlatBuffers/EmptyPreSerVectorRefs") { return Void(); } +TEST_CASE("/flow/FlatBuffers/EmptyUnorderedSet") { + int kSize = deterministicRandom()->randomInt(0, 100); + Standalone msg = + 
ObjectWriter::toValue(std::vector>(kSize), Unversioned()); + ObjectReader rd(msg.begin(), Unversioned()); + std::vector> xs; + rd.deserialize(xs); + ASSERT(xs.size() == kSize); + for (const auto& x : xs) { + ASSERT(x.size() == 0); + } + return Void(); +} + +TEST_CASE("/flow/FlatBuffers/NonEmptyUnorderedSet") { + int kSize = deterministicRandom()->randomInt(0, 100); + std::vector> src; + std::unordered_set s; + for (int i = 0; i < kSize; i++) { + Y1 y; + y.a = i; + s.insert(y); + } + src.push_back(s); + + Standalone msg = ObjectWriter::toValue(src, Unversioned()); + ObjectReader rd(msg.begin(), Unversioned()); + std::vector> xs; + rd.deserialize(xs); + ASSERT(xs.size() == 1); + ASSERT(xs[0].size() == kSize); + for (int i = 0; i < kSize; i++) { + Y1 y; + y.a = i; + ASSERT(xs[0].find(y) != xs[0].end()); + } + return Void(); +} + } // namespace unit_tests diff --git a/flow/flat_buffers.h b/flow/flat_buffers.h index 04f3a881bdb..5b98f745368 100644 --- a/flow/flat_buffers.h +++ b/flow/flat_buffers.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "flow/FileIdentifier.h" #include "flow/ObjectSerializerTraits.h" @@ -250,6 +251,31 @@ struct vector_like_traits> : std::true_type { return v.begin(); } }; +template +struct vector_like_traits> : std::true_type { + using Vec = std::unordered_set; + using value_type = Key; + using iterator = typename Vec::const_iterator; + using insert_iterator = std::insert_iterator; + + template + static size_t num_entries(const Vec& v, Context&) { + return v.size(); + } + template + static void reserve(Vec& v, size_t size, Context&) { + v.reserve(size); + } + + template + static insert_iterator insert(Vec& v, Context&) { + return std::inserter(v, v.end()); + } + template + static iterator begin(const Vec& v, Context&) { + return v.begin(); + } +}; template <> struct dynamic_size_traits : std::true_type { diff --git a/flow/serialize.h b/flow/serialize.h index 7609ccf6787..3a405d185db 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -20,6 +20,7 @@ #ifndef FLOW_SERIALIZE_H #define FLOW_SERIALIZE_H +#include #pragma once #include @@ -172,6 +173,13 @@ template struct CompositionDepthFor> : std::integral_constant::value + 1> { }; +template +struct FileIdentifierFor> : ComposedIdentifierExternal {}; + +template +struct CompositionDepthFor> + : std::integral_constant::value + 1> {}; + template inline void save(Archive& ar, const std::vector& value) { ar << (int)value.size(); @@ -762,9 +770,6 @@ struct PacketBuffer : SendBuffer { public: static PacketBuffer* create(size_t size = 0) { size = std::max(size, PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD); - if (size == PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD) { - return new (FastAllocator::allocate()) PacketBuffer{ size }; - } uint8_t* mem = new uint8_t[size + PACKET_BUFFER_OVERHEAD]; return new (mem) PacketBuffer{ size }; } @@ -772,11 +777,7 @@ struct PacketBuffer : SendBuffer { void addref() { ++reference_count; } void delref() { if (!--reference_count) { - if (size_ == PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD) { - FastAllocator::release(this); - } else { - delete[] this; - } + delete[] reinterpret_cast(this); } } int bytes_unwritten() const { return size_ - bytes_written; } diff --git a/tests/BGServerCommonUnit.toml b/tests/BGServerCommonUnit.toml new file mode 100644 index 00000000000..6d37b51e6b1 --- /dev/null +++ b/tests/BGServerCommonUnit.toml @@ -0,0 +1,9 @@ +[[test]] +testTitle = 'BlobGranuleServerCommonUnit' +useDB = false +startDelay = 0 + + [[test.workload]] + testName = 
'UnitTests' + maxTestCases = 0 + testsMatching = /blobgranule/server/common/ diff --git a/tests/BlobGranuleFileUnit.toml b/tests/BlobGranuleFileUnit.toml new file mode 100644 index 00000000000..d725be4d676 --- /dev/null +++ b/tests/BlobGranuleFileUnit.toml @@ -0,0 +1,10 @@ +[[test]] +testTitle = 'BlobGranuleFileUnit' +useDB = false +startDelay = 0 + + [[test.workload]] + testName = 'UnitTests' + maxTestCases = 0 + testsMatching = /blobgranule/files/ + diff --git a/tests/BlobGranuleFileUnit.txt b/tests/BlobGranuleFileUnit.txt deleted file mode 100644 index efdeffe0858..00000000000 --- a/tests/BlobGranuleFileUnit.txt +++ /dev/null @@ -1,7 +0,0 @@ -testTitle=UnitTests -startDelay=0 -useDB=false - - testName=UnitTests - maxTestCases=0 - testsMatching=/blobgranule/ diff --git a/tests/BlobManagerUnit.toml b/tests/BlobManagerUnit.toml new file mode 100644 index 00000000000..9e151c94e89 --- /dev/null +++ b/tests/BlobManagerUnit.toml @@ -0,0 +1,9 @@ +[[test]] +testTitle = 'BlobManagerUnit' +useDB = false +startDelay = 0 + + [[test.workload]] + testName = 'UnitTests' + maxTestCases = 0 + testsMatching = /blobmanager/ diff --git a/tests/BlobManagerUnit.txt b/tests/BlobManagerUnit.txt deleted file mode 100644 index c0fc3da21d8..00000000000 --- a/tests/BlobManagerUnit.txt +++ /dev/null @@ -1,7 +0,0 @@ -testTitle=UnitTests -startDelay=0 -useDB=false - - testName=UnitTests - maxTestCases=0 - testsMatching=/blobmanager/ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 42713ad6cba..44f0cf4d4e8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -50,8 +50,9 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES s3VersionHeaders.txt IGNORE) add_fdb_test(TEST_FILES BandwidthThrottle.txt IGNORE) add_fdb_test(TEST_FILES BigInsert.txt IGNORE) - add_fdb_test(TEST_FILES BlobGranuleFileUnit.txt) - add_fdb_test(TEST_FILES BlobManagerUnit.txt) + add_fdb_test(TEST_FILES BGServerCommonUnit.toml) + add_fdb_test(TEST_FILES BlobGranuleFileUnit.toml) + add_fdb_test(TEST_FILES BlobManagerUnit.toml) add_fdb_test(TEST_FILES ConsistencyCheck.txt IGNORE) add_fdb_test(TEST_FILES DDMetricsExclude.txt IGNORE) add_fdb_test(TEST_FILES DataDistributionMetrics.txt IGNORE) diff --git a/tests/fast/BlobGranuleVerifyAtomicOps.toml b/tests/fast/BlobGranuleVerifyAtomicOps.toml index 4831d8b9856..a7c7cf27bcc 100644 --- a/tests/fast/BlobGranuleVerifyAtomicOps.toml +++ b/tests/fast/BlobGranuleVerifyAtomicOps.toml @@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleVerifyAtomicOps' diff --git a/tests/fast/BlobGranuleVerifyCycle.toml b/tests/fast/BlobGranuleVerifyCycle.toml index b15bc34a858..588f2475e0d 100644 --- a/tests/fast/BlobGranuleVerifyCycle.toml +++ b/tests/fast/BlobGranuleVerifyCycle.toml @@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleVerifyCycle' diff --git a/tests/fast/BlobGranuleVerifySmall.toml b/tests/fast/BlobGranuleVerifySmall.toml index 22a4b15ae69..ea463963522 100644 --- a/tests/fast/BlobGranuleVerifySmall.toml +++ b/tests/fast/BlobGranuleVerifySmall.toml @@ -1,6 +1,8 @@ [configuration] blobGranulesEnabled = true -storageEngineExcludeTypes = [3] # FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet +# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk 
yet +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [3, 4] [[test]] testTitle = 'BlobGranuleVerifySmall' diff --git a/tests/fast/BlobGranuleVerifySmallClean.toml b/tests/fast/BlobGranuleVerifySmallClean.toml index 0a7d2a95d6a..e9262a86fa4 100644 --- a/tests/fast/BlobGranuleVerifySmallClean.toml +++ b/tests/fast/BlobGranuleVerifySmallClean.toml @@ -1,6 +1,8 @@ [configuration] blobGranulesEnabled = true -storageEngineExcludeTypes = [3] # FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet +# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [3, 4] [[test]] testTitle = 'BlobGranuleVerifySmallClean' diff --git a/tests/restarting/from_7.1.0/ConfigureTestRestart-2.toml b/tests/restarting/from_7.1.0/ConfigureTestRestart-2.toml index a181ce821e1..8230bec4793 100644 --- a/tests/restarting/from_7.1.0/ConfigureTestRestart-2.toml +++ b/tests/restarting/from_7.1.0/ConfigureTestRestart-2.toml @@ -1,3 +1,6 @@ +[[configuration]] +randomlyRenameZoneId=true + [[test]] testTitle='CloggedConfigureDatabaseTest' runSetup=false diff --git a/tests/slow/BlobGranuleCorrectness.toml b/tests/slow/BlobGranuleCorrectness.toml index 10c03ab63bf..700093d1d9b 100644 --- a/tests/slow/BlobGranuleCorrectness.toml +++ b/tests/slow/BlobGranuleCorrectness.toml @@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleCorrectness' diff --git a/tests/slow/BlobGranuleCorrectnessClean.toml b/tests/slow/BlobGranuleCorrectnessClean.toml index a538e7203b2..4886f8cb8e5 100644 --- a/tests/slow/BlobGranuleCorrectnessClean.toml +++ b/tests/slow/BlobGranuleCorrectnessClean.toml @@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleCorrectness' diff --git a/tests/slow/BlobGranuleVerifyBalance.toml b/tests/slow/BlobGranuleVerifyBalance.toml index 385b88ff69b..96638227a76 100644 --- a/tests/slow/BlobGranuleVerifyBalance.toml +++ b/tests/slow/BlobGranuleVerifyBalance.toml @@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleVerifyBalance' diff --git a/tests/slow/BlobGranuleVerifyBalanceClean.toml b/tests/slow/BlobGranuleVerifyBalanceClean.toml index 65bb8ad15cd..24888e28a1e 100644 --- a/tests/slow/BlobGranuleVerifyBalanceClean.toml +++ b/tests/slow/BlobGranuleVerifyBalanceClean.toml @@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleVerifyBalanceClean' diff --git a/tests/slow/BlobGranuleVerifyLarge.toml b/tests/slow/BlobGranuleVerifyLarge.toml index de55422d89b..66458efcbf9 100644 --- a/tests/slow/BlobGranuleVerifyLarge.toml +++ b/tests/slow/BlobGranuleVerifyLarge.toml @@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleVerifyLarge' diff --git a/tests/slow/BlobGranuleVerifyLargeClean.toml b/tests/slow/BlobGranuleVerifyLargeClean.toml index 782935a68bd..c8e8dbed1fe 100644 --- a/tests/slow/BlobGranuleVerifyLargeClean.toml +++ b/tests/slow/BlobGranuleVerifyLargeClean.toml 
@@ -1,5 +1,7 @@ [configuration] blobGranulesEnabled = true +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleVerifyLargeClean'
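
Editorial note on the FastAlloc.h change earlier in this patch: with the new if constexpr branch, a type deriving from FastAllocated keeps using the slab-based FastAllocator only when the object is 256 bytes or smaller; larger objects now fall back to plain new uint8_t[] / delete[], which matches the removal of the 512..16384 byte sizes from allocateFast()/freeFast() and the PacketBuffer change in serialize.h that always uses new[]/delete[]. A minimal sketch of the dispatch, with hypothetical type names used only for illustration:

#include "flow/FastAlloc.h"

struct SmallThing : public FastAllocated<SmallThing> {
	uint8_t payload[48]; // sizeof(SmallThing) <= 256: operator new dispatches to FastAllocator<64>
};

struct BigThing : public FastAllocated<BigThing> {
	uint8_t payload[1024]; // sizeof(BigThing) > 256: operator new falls back to new uint8_t[...]
};

void allocationSketch() {
	auto* s = new SmallThing(); // slab allocation
	auto* b = new BigThing();   // heap allocation via new uint8_t[]
	delete s;                   // released through FastAllocator<64>::release
	delete b;                   // released through delete[] reinterpret_cast<uint8_t*>(b)
}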