
Commit f96eb80

neunhoef and jsteemann authored
Read from followers: APM-296 (#16335)

Add read-from-followers for clusters. This currently covers the following APIs:
- single document reads
- batch document reads
- standalone AQL queries
- edge reads
- exists reads (HTTP HEAD)
- streaming transactions and operations in their context

So far, it does not cover:
- JavaScript transactions
- the graph API

Co-authored-by: Jan <jsteemann@users.noreply.github.com>
1 parent 8651f07 commit f96eb80

26 files changed: +628 / -57 lines changed

CHANGELOG

Lines changed: 12 additions & 0 deletions
```diff
@@ -1,6 +1,18 @@
 devel
 -----
 
+* Introduce reading from followers in clusters. This works by offering
+  an additional HTTP header "x-arango-allow-dirty-read" for certain
+  read-only APIs. This header has already been used for active failover
+  deployments to allow reading from followers. Using this header means
+  that coordinators may read from follower shards instead of only from
+  leader shards, which can help to spread the read load better across
+  the cluster. Note that using this header can result in "dirty reads",
+  i.e. read results that return stale data or even data that has not yet
+  been officially committed. Use at your own risk if performance is more
+  important than correctness, or if you know that the data does not
+  change.
+
 * Changed HTTP response code for error number 1521 from 500 to 400.
 
   Error 1521 (query collection lock failed) is nowadays only emitted by
```
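For illustration only (not part of this commit): the new behavior is opt-in per request, by sending the header on a read-only call. A minimal libcurl sketch might look like the following; the endpoint, database, collection and document key are placeholders, and the header value `true` is an assumption based on its use for active failover dirty reads.

```cpp
// Sketch only: sends "x-arango-allow-dirty-read: true" with a read-only
// document request. URL, database, collection and key are hypothetical.
#include <curl/curl.h>

int main() {
  curl_global_init(CURL_GLOBAL_DEFAULT);
  CURL* curl = curl_easy_init();
  if (curl != nullptr) {
    struct curl_slist* headers = nullptr;
    // Allow the coordinator to answer from a follower shard (dirty read):
    headers = curl_slist_append(headers, "x-arango-allow-dirty-read: true");

    curl_easy_setopt(curl, CURLOPT_URL,
                     "http://localhost:8529/_db/_system/_api/document/coll/key");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_perform(curl);  // the response may contain stale data

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
  }
  curl_global_cleanup();
  return 0;
}
```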

arangod/Aql/ShardLocking.cpp

Lines changed: 13 additions & 4 deletions
```diff
@@ -35,6 +35,7 @@
 #include "Aql/Query.h"
 #include "Cluster/ClusterFeature.h"
 #include "Logger/LogMacros.h"
+#include "StorageEngine/TransactionState.h"
 #include "Utilities/NameValidator.h"
 
 using namespace arangodb;
@@ -378,16 +379,24 @@ ShardLocking::getShardMapping() {
       }
     }
   }
+  TRI_ASSERT(!shardIds.empty());
   auto& server = _query.vocbase().server();
   if (!server.hasFeature<ClusterFeature>()) {
     THROW_ARANGO_EXCEPTION(TRI_ERROR_SHUTTING_DOWN);
   }
   auto& ci = server.getFeature<ClusterFeature>().clusterInfo();
-  // We have at least one shard, otherwise we would not have snippets!
-  TRI_ASSERT(!shardIds.empty());
-  _shardMapping = ci.getResponsibleServers(shardIds);
-
+#ifdef USE_ENTERPRISE
+  auto& trx = _query.trxForOptimization();
+  if (trx.state()->options().allowDirtyReads) {
+    _shardMapping = trx.state()->whichReplicas(shardIds);
+  } else
+#endif
+  {
+    // We have at least one shard, otherwise we would not have snippets!
+    _shardMapping = ci.getResponsibleServers(shardIds);
+  }
   TRI_ASSERT(_shardMapping.size() == shardIds.size());
+
   for (auto const& lockInfo : _collectionLocking) {
     for (auto const& sid : lockInfo.second.allShards) {
       auto mapped = _shardMapping.find(sid);
```
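To picture what the enterprise-only branch above changes, here is a simplified, standalone sketch (hypothetical names and std containers, not the ArangoDB implementation): when dirty reads are allowed, any replica of a shard may be chosen instead of always taking the leader. The real `whichReplicas` call presumably also keeps the choice consistent with `distributeShardsLike` dependencies, as documented for `getResponsibleServersReadFromFollower` further below.

```cpp
// Sketch only: per-shard server choice, leader by default, any replica
// when dirty reads are allowed. `ShardMap` and `pickServerFor` are
// hypothetical names for illustration.
#include <cstddef>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

using ShardID = std::string;
using ServerID = std::string;
// shard => list of replicas, leader at index 0
using ShardMap = std::unordered_map<ShardID, std::vector<ServerID>>;

std::unordered_map<ShardID, ServerID> pickServerFor(ShardMap const& replicas,
                                                    bool allowDirtyReads) {
  std::mt19937 rng{std::random_device{}()};
  std::unordered_map<ShardID, ServerID> result;
  for (auto const& [shard, servers] : replicas) {
    if (servers.empty()) continue;   // no known replica, skip
    std::size_t idx = 0;             // leader by default
    if (allowDirtyReads) {
      // Dirty reads: any replica (leader or follower) is acceptable.
      idx = std::uniform_int_distribution<std::size_t>(0, servers.size() - 1)(rng);
    }
    result.emplace(shard, servers[idx]);
  }
  return result;
}
```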

arangod/Aql/ShardLocking.h

Lines changed: 3 additions & 0 deletions
```diff
@@ -130,6 +130,9 @@ class ShardLocking {
   // Get a full mapping of ShardID => LeaderID.
   // This will stay constant during this query, and a query could be aborted in
   // case of failovers.
+  // For ReadFromFollower situations in read-only queries, this map maps
+  // each ShardID to the actual leader or follower which has been chosen
+  // for the query.
   containers::FlatHashMap<ShardID, ServerID> const& getShardMapping();
 
   // Get the shards of the given collection within the given snippet.
```

arangod/Cluster/ClusterInfo.cpp

Lines changed: 108 additions & 3 deletions
```diff
@@ -1025,6 +1025,8 @@ void ClusterInfo::loadPlan() {
   decltype(_plannedCollections) newCollections;
   decltype(_shards) newShards;
   decltype(_shardServers) newShardServers;
+  decltype(_shardToShardGroupLeader) newShardToShardGroupLeader;
+  decltype(_shardGroups) newShardGroups;
   decltype(_shardToName) newShardToName;
   decltype(_dbAnalyzersRevision) newDbAnalyzersRevision;
   decltype(_newStuffByDatabase) newStuffByDatabase;
@@ -1041,6 +1043,8 @@
     newCollections = _plannedCollections;
     newShards = _shards;
     newShardServers = _shardServers;
+    newShardToShardGroupLeader = _shardToShardGroupLeader;
+    newShardGroups = _shardGroups;
     newShardToName = _shardToName;
     newDbAnalyzersRevision = _dbAnalyzersRevision;
     newStuffByDatabase = _newStuffByDatabase;
@@ -1097,6 +1101,8 @@
         newShards.erase(shardName);
         newShardServers.erase(shardName);
         newShardToName.erase(shardName);
+        newShardToShardGroupLeader.erase(shardName);
+        newShardGroups.erase(shardName);
       }
     }
   }
@@ -1449,6 +1455,10 @@
           newShards.erase(shardId);
           newShardServers.erase(shardId);
           newShardToName.erase(shardId);
+          // We try to erase the shard ID anyway, no problem if it is
+          // not in there, should it be a shard group leader!
+          newShardToShardGroupLeader.erase(shardId);
+          newShardGroups.erase(shardId);
         }
         collectionsPath.pop_back();
       }
@@ -1549,6 +1559,59 @@
            continue;
          }
        }
+        // Now that the loop is completed, we have to run through it one more
+        // time to get the shard groups done:
+        for (auto const& colPair : *databaseCollections) {
+          if (colPair.first == colPair.second.collection->name()) {
+            // Every collection shows up once with its ID and once with its name.
+            // We only want it once, so we only take it when we see the ID, not
+            // the name as key:
+            continue;
+          }
+          auto const& groupLeader =
+              colPair.second.collection->distributeShardsLike();
+          if (!groupLeader.empty()) {
+            auto groupLeaderCol = newShards.find(groupLeader);
+            if (groupLeaderCol != newShards.end()) {
+              auto col = newShards.find(
+                  std::to_string(colPair.second.collection->id().id()));
+              if (col != newShards.end()) {
+                if (col->second->size() == 0) {
+                  // Can happen for smart edge collections. But in this case we
+                  // can ignore the collection.
+                  continue;
+                }
+                TRI_ASSERT(groupLeaderCol->second->size() == col->second->size());
+                for (size_t i = 0; i < col->second->size(); ++i) {
+                  newShardToShardGroupLeader.try_emplace(
+                      col->second->at(i), groupLeaderCol->second->at(i));
+                  auto it = newShardGroups.find(groupLeaderCol->second->at(i));
+                  if (it == newShardGroups.end()) {
+                    // Need to create a new list:
+                    auto list = std::make_shared<std::vector<ShardID>>();
+                    list->reserve(2);
+                    // group leader as well as member:
+                    list->emplace_back(groupLeaderCol->second->at(i));
+                    list->emplace_back(col->second->at(i));
+                    newShardGroups.try_emplace(groupLeaderCol->second->at(i),
+                                               std::move(list));
+                  } else {
+                    // Need to add us to the list:
+                    it->second->push_back(col->second->at(i));
+                  }
+                }
+              } else {
+                LOG_TOPIC("12f32", WARN, Logger::CLUSTER)
+                    << "loadPlan: Strange, could not find collection: "
+                    << colPair.second.collection->name();
+              }
+            } else {
+              LOG_TOPIC("22312", WARN, Logger::CLUSTER)
+                  << "loadPlan: Strange, could not find proto collection: "
+                  << groupLeader;
+            }
+          }
+        }
        newCollections.insert_or_assign(databaseName,
                                        std::move(databaseCollections));
      }
@@ -1649,6 +1712,8 @@
     _plannedCollections.swap(newCollections);
     _shards.swap(newShards);
     _shardServers.swap(newShardServers);
+    _shardToShardGroupLeader.swap(newShardToShardGroupLeader);
+    _shardGroups.swap(newShardGroups);
     _shardToName.swap(newShardToName);
   }
 
@@ -6064,21 +6129,41 @@ void ClusterInfo::setFailedServers(
 #ifdef ARANGODB_USE_GOOGLE_TESTS
 void ClusterInfo::setServers(
     containers::FlatHashMap<ServerID, std::string> servers) {
-  WRITE_LOCKER(readLocker, _serversProt.lock);
+  WRITE_LOCKER(writeLocker, _serversProt.lock);
   _servers = std::move(servers);
 }
 
 void ClusterInfo::setServerAliases(
     containers::FlatHashMap<ServerID, std::string> aliases) {
-  WRITE_LOCKER(readLocker, _serversProt.lock);
+  WRITE_LOCKER(writeLocker, _serversProt.lock);
   _serverAliases = std::move(aliases);
 }
 
 void ClusterInfo::setServerAdvertisedEndpoints(
     containers::FlatHashMap<ServerID, std::string> advertisedEndpoints) {
-  WRITE_LOCKER(readLocker, _serversProt.lock);
+  WRITE_LOCKER(writeLocker, _serversProt.lock);
   _serverAdvertisedEndpoints = std::move(advertisedEndpoints);
 }
+
+void ClusterInfo::setShardToShardGroupLeader(
+    containers::FlatHashMap<ShardID, ShardID> shardToShardGroupLeader) {
+  WRITE_LOCKER(writeLocker, _planProt.lock);
+  _shardToShardGroupLeader = std::move(shardToShardGroupLeader);
+}
+
+void ClusterInfo::setShardGroups(
+    containers::FlatHashMap<ShardID, std::shared_ptr<std::vector<ShardID>>>
+        shardGroups) {
+  WRITE_LOCKER(writeLocker, _planProt.lock);
+  _shardGroups = std::move(shardGroups);
+}
+
+void ClusterInfo::setShardIds(
+    containers::FlatHashMap<ShardID, std::shared_ptr<std::vector<ServerID>>>
+        shardIds) {
+  WRITE_LOCKER(writeLocker, _currentProt.lock);
+  _shardIds = std::move(shardIds);
+}
 #endif
 
 bool ClusterInfo::serverExists(std::string_view serverId) const noexcept {
@@ -6945,6 +7030,26 @@ VPackBuilder ClusterInfo::toVelocyPack() {
       }
     }
   }
+  dump.add(VPackValue("shardToShardGroupLeader"));
+  {
+    VPackObjectBuilder d(&dump);
+    for (auto const& s : _shardToShardGroupLeader) {
+      dump.add(s.first, VPackValue(s.second));
+    }
+  }
+  dump.add(VPackValue("shardGroups"));
+  {
+    VPackObjectBuilder d(&dump);
+    for (auto const& s : _shardGroups) {
+      dump.add(VPackValue(s.first));
+      {
+        VPackArrayBuilder d2(&dump);
+        for (auto const& ss : *s.second) {
+          dump.add(VPackValue(ss));
+        }
+      }
+    }
+  }
   dump.add(VPackValue("shards"));
   {
     VPackObjectBuilder d(&dump);
```
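The core of the new `loadPlan` bookkeeping is deriving two maps from `distributeShardsLike`. A simplified, standalone sketch of that derivation (std containers and hypothetical inputs, not the ArangoDB code): the i-th shard of a dependent collection is mapped to the i-th shard of its prototype, and the prototype's shard collects the whole group.

```cpp
// Sketch only: builds shardToShardGroupLeader and shardGroups from one
// prototype ("group leader") collection and one dependent collection.
// ShardID is a plain string here; the inputs are hypothetical.
#include <cassert>
#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

using ShardID = std::string;

void addToShardGroups(
    std::vector<ShardID> const& protoShards,      // shards of collection A
    std::vector<ShardID> const& dependentShards,  // shards of B (distributeShardsLike: A)
    std::unordered_map<ShardID, ShardID>& shardToShardGroupLeader,
    std::unordered_map<ShardID, std::shared_ptr<std::vector<ShardID>>>& shardGroups) {
  assert(protoShards.size() == dependentShards.size());
  for (std::size_t i = 0; i < dependentShards.size(); ++i) {
    // The i-th shard of B belongs to the group led by the i-th shard of A:
    shardToShardGroupLeader.emplace(dependentShards[i], protoShards[i]);
    auto it = shardGroups.find(protoShards[i]);
    if (it == shardGroups.end()) {
      // First member besides the leader: start the group with both.
      auto list = std::make_shared<std::vector<ShardID>>();
      list->push_back(protoShards[i]);
      list->push_back(dependentShards[i]);
      shardGroups.emplace(protoShards[i], std::move(list));
    } else {
      // Group already exists: just append this member.
      it->second->push_back(dependentShards[i]);
    }
  }
}
```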

arangod/Cluster/ClusterInfo.h

Lines changed: 70 additions & 0 deletions
```diff
@@ -897,6 +897,28 @@ class ClusterInfo final {
   containers::FlatHashMap<ShardID, ServerID> getResponsibleServers(
       containers::FlatHashSet<ShardID> const&);
 
+  //////////////////////////////////////////////////////////////////////////////
+  /// @brief atomically find all servers who are responsible for the given
+  /// shards (choose either the leader or some follower for each, but
+  /// make the choice consistent with `distributeShardsLike` dependencies).
+  /// Will throw an exception if no leader can be found for any
+  /// of the shards. Will return an empty result if the shards couldn't be
+  /// determined after a while - it is the responsibility of the caller to
+  /// check for an empty result!
+  /// The map `result` can already contain a partial choice, this method
+  /// ensures that all the shards in `list` are in the end set in the
+  /// `result` map. Additional shards can be added to `result` as needed,
+  /// in particular the shard prototypes of the shards in `list` will be added.
+  /// It is not allowed that `result` contains a setting for a shard but
+  /// no setting (or a different one) for its shard prototype!
+  //////////////////////////////////////////////////////////////////////////////
+
+#ifdef USE_ENTERPRISE
+  void getResponsibleServersReadFromFollower(
+      containers::FlatHashSet<ShardID> const& list,
+      containers::FlatHashMap<ShardID, ServerID>& result);
+#endif
+
   //////////////////////////////////////////////////////////////////////////////
   /// @brief find the shard list of a collection, sorted numerically
   //////////////////////////////////////////////////////////////////////////////
@@ -946,6 +968,16 @@
 
   void setServerAdvertisedEndpoints(
       containers::FlatHashMap<ServerID, std::string> advertisedEndpoints);
+
+  void setShardToShardGroupLeader(
+      containers::FlatHashMap<ShardID, ShardID> shardToShardGroupLeader);
+
+  void setShardGroups(
+      containers::FlatHashMap<ShardID, std::shared_ptr<std::vector<ShardID>>>);
+
+  void setShardIds(
+      containers::FlatHashMap<ShardID, std::shared_ptr<std::vector<ServerID>>>
+          shardIds);
 #endif
 
   bool serverExists(std::string_view serverID) const noexcept;
@@ -1232,6 +1264,44 @@
   // planned shard ID => collection name
   containers::FlatHashMap<ShardID, CollectionID> _shardToName;
 
+  // planned shard ID => shard ID of shard group leader
+  // This deserves an explanation. If collection B has `distributeShardsLike`
+  // collection A, then A and B have the same number of shards. We say that
+  // the k-th shard of A and the k-th shard of B are in the same "shard group".
+  // This can be true for multiple collections, but they must then always
+  // have the same collection A under `distributeShardsLike`. The shard of
+  // collection A is then called the "shard group leader". It is guaranteed that
+  // the shards of a shard group are always planned to be on the same
+  // dbserver, and the leader is always the same for all shards in the group.
+  // If a shard is a shard group leader, it does not appear in this map.
+  // Example:
+  //   Collection:     A    B    C
+  //   Shard index 0:  s1   s5   s9
+  //   Shard index 1:  s2   s6   s10
+  //   Shard index 2:  s3   s7   s11
+  //   Shard index 3:  s4   s8   s12
+  // Here, collection B has "distributeShardsLike" set to "A",
+  // collection C has "distributeShardsLike" set to "B",
+  // and `numberOfShards` is 4 for all three collections.
+  // Shard groups are: s1, s5, s9
+  //              and: s2, s6, s10
+  //              and: s3, s7, s11
+  //              and: s4, s8, s12
+  // Shard group leaders are s1, s2, s3 and s4.
+  // That is, "shard group" is across collections, "shard index" is
+  // within a collection.
+  // All three collections must have the same `replicationFactor`, and
+  // it is guaranteed that all shards in a group always have the same
+  // leader and the same list of followers.
+  // Note, however, that a follower for a shard group can be in sync with
+  // its leader for some of the shards in the group and not for others!
+  // Note that shard group leaders themselves do not appear in this map:
+  containers::FlatHashMap<ShardID, ShardID> _shardToShardGroupLeader;
+  // In the following map we store for each shard group leader the list
+  // of shards in the group, including the leader.
+  containers::FlatHashMap<ShardID, std::shared_ptr<std::vector<ShardID>>>
+      _shardGroups;
+
   AllViews _plannedViews;     // from Plan/Views/
   AllViews _newPlannedViews;  // views that have been created during `loadPlan`
                               // execution
```
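To make the member documentation above concrete, here is a small self-contained sketch (std containers instead of `containers::FlatHashMap`) of what the two maps would hold for the documented A/B/C example:

```cpp
// Sketch only: the contents of the two maps for the documented example
// (collections A, B, C with 4 shards each; B and C distributeShardsLike A).
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  using ShardID = std::string;
  // Shard group leaders (the shards of A) do not appear as keys here:
  std::map<ShardID, ShardID> shardToShardGroupLeader = {
      {"s5", "s1"}, {"s9", "s1"},  {"s6", "s2"}, {"s10", "s2"},
      {"s7", "s3"}, {"s11", "s3"}, {"s8", "s4"}, {"s12", "s4"}};
  // For each group leader, the full group including the leader itself:
  std::map<ShardID, std::vector<ShardID>> shardGroups = {
      {"s1", {"s1", "s5", "s9"}},
      {"s2", {"s2", "s6", "s10"}},
      {"s3", {"s3", "s7", "s11"}},
      {"s4", {"s4", "s8", "s12"}}};
  assert(shardToShardGroupLeader.count("s1") == 0);  // leaders are not keys
  assert(shardToShardGroupLeader.at("s10") == "s2");
  assert(shardGroups.at("s2").size() == 3);
  return 0;
}
```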

0 commit comments