Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[forge] Add a test for validation set changes #4279

Merged
merged 1 commit into from Sep 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/aptos/Cargo.toml
Expand Up @@ -72,6 +72,7 @@ default = []
fuzzing = []
no-upload-proposal = []
indexer = ["aptos-node/indexer"]
cli-framework-test-move = []

[build-dependencies]
shadow-rs = "0.16.2"
7 changes: 7 additions & 0 deletions crates/aptos/src/test/mod.rs
Expand Up @@ -16,7 +16,10 @@ use crate::common::types::{
MovePackageDir, OptionalPoolAddressArgs, PrivateKeyInputOptions, PromptOptions,
PublicKeyInputOptions, RestOptions, RngArgs, SaveFile, TransactionOptions, TransactionSummary,
};

#[cfg(feature = "cli-framework-test-move")]
use crate::common::utils::write_to_file;

use crate::move_tool::{
ArgWithType, CompilePackage, DownloadPackage, FrameworkPackageArgs, IncludedArtifacts,
InitPackage, MemberId, PublishPackage, RunFunction, TestPackage,
Expand Down Expand Up @@ -52,7 +55,10 @@ use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;
use std::{collections::BTreeMap, mem, path::PathBuf, str::FromStr, time::Duration};

#[cfg(feature = "cli-framework-test-move")]
use thiserror::private::PathAsDisplay;

use tokio::time::{sleep, Instant};

#[cfg(test)]
Expand Down Expand Up @@ -722,6 +728,7 @@ impl CliTestFramework {
self.move_dir = Some(move_dir.path().to_path_buf());
}

#[cfg(feature = "cli-framework-test-move")]
pub fn add_move_files(&self) {
let move_dir = self.move_dir();
let sources_dir = move_dir.join("sources");
Expand Down
29 changes: 29 additions & 0 deletions testsuite/forge-cli/src/main.rs
Expand Up @@ -21,6 +21,7 @@ use testcases::performance_with_fullnode_test::PerformanceBenchmarkWithFN;
use testcases::state_sync_performance::StateSyncValidatorPerformance;
use testcases::three_region_simulation_test::ThreeRegionSimulationTest;
use testcases::twin_validator_test::TwinValidatorTest;
use testcases::validator_join_leave_test::ValidatorJoinLeaveTest;
use testcases::validator_reboot_stress_test::ValidatorRebootStressTest;
use testcases::{
compatibility_test::SimpleValidatorUpgrade, forge_setup_test::ForgeSetupTest, generate_traffic,
Expand Down Expand Up @@ -437,6 +438,7 @@ fn single_test_suite(test_name: &str) -> Result<ForgeConfig<'static>> {
state_sync_perf_fullnodes_execute_transactions(config)
}
"state_sync_perf_validators" => state_sync_perf_validators(config),
"validators_join_and_leave" => validators_join_and_leave(config),
zcchahaha marked this conversation as resolved.
Show resolved Hide resolved
"compat" => config
.with_initial_validator_count(NonZeroUsize::new(5).unwrap())
.with_network_tests(vec![&SimpleValidatorUpgrade])
Expand Down Expand Up @@ -776,6 +778,33 @@ fn state_sync_perf_validators(forge_config: ForgeConfig<'static>) -> ForgeConfig
.with_success_criteria(SuccessCriteria::new(5000, 10000, false, None, None, None))
}

/// Builds the Forge configuration used to run [`ValidatorJoinLeaveTest`].
///
/// Starts from `forge_config` and layers on: 20 initial validators, a genesis
/// Helm override (60-second epochs, new validators allowed), the join/leave
/// network test itself, and the success criteria assembled below.
fn validators_join_and_leave(forge_config: ForgeConfig<'static>) -> ForgeConfig<'static> {
    let initial_validators = NonZeroUsize::new(20).unwrap();

    let system_metrics = SystemMetricsThreshold::new(
        // Check that we don't use more than 12 CPU cores for 30% of the time.
        MetricsThreshold::new(12, 30),
        // Check that we don't use more than 10 GB of memory for 30% of the time.
        MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
    );
    let state_progress = StateProgressThreshold {
        max_no_progress_secs: 10.0,
        max_round_gap: 4,
    };
    // NOTE(review): the meaning of the first three positional arguments is
    // defined by `SuccessCriteria::new`, which is not visible here — confirm there.
    let success_criteria = SuccessCriteria::new(
        5000,
        10000,
        true,
        Some(Duration::from_secs(240)),
        Some(system_metrics),
        Some(state_progress),
    );

    forge_config
        .with_initial_validator_count(initial_validators)
        .with_genesis_helm_config_fn(Arc::new(|helm_values| {
            helm_values["chain"]["epoch_duration_secs"] = 60.into();
            helm_values["chain"]["allow_new_validators"] = true.into();
        }))
        .with_network_tests(vec![&ValidatorJoinLeaveTest])
        .with_success_criteria(success_criteria)
}

fn land_blocking_test_suite(duration: Duration) -> ForgeConfig<'static> {
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
Expand Down
1 change: 1 addition & 0 deletions testsuite/smoke-test/src/aptos_cli/mod.rs
Expand Up @@ -2,5 +2,6 @@
// SPDX-License-Identifier: Apache-2.0

mod account;
#[cfg(feature = "cli-framework-test-move")]
mod r#move;
mod validator;
17 changes: 11 additions & 6 deletions testsuite/testcases/Cargo.toml
Expand Up @@ -11,19 +11,24 @@ edition = "2021"

[dependencies]
anyhow = "1.0.57"
futures = "0.3.21"
rand = "0.7.3"
reqwest = { version = "0.11.10", features = ["json", "cookies", "blocking"] }
serde_json = "1.0.81"
tokio = { version = "1.21.0", features = ["full"] }
aptos = { path = "../../crates/aptos", features = ["fuzzing"] }

aptos = { path = "../../crates/aptos" }
aptos-genesis = { path = "../../crates/aptos-genesis", features = ["testing"] }
aptos-global-constants = { path = "../../config/global-constants" }
aptos-keygen = { path = "../../crates/aptos-keygen" }
aptos-logger = { path = "../../crates/aptos-logger" }
aptos-rest-client = { path = "../../crates/aptos-rest-client" }
aptos-sdk = { path = "../../sdk" }
aptos-types = { path = "../../types" }

forge = { path = "../forge" }
futures = "0.3.21"
hex = "0.4.3"
move-examples = { path = "../../aptos-move/move-examples" }
rand = "0.7.3"
reqwest = { version = "0.11.10", features = ["json", "cookies", "blocking"] }
serde_json = "1.0.81"
tokio = { version = "1.21.0", features = ["full"] }

[[test]]
name = "forge-local-compatibility"
Expand Down
3 changes: 2 additions & 1 deletion testsuite/testcases/src/lib.rs
Expand Up @@ -17,6 +17,7 @@ pub mod reconfiguration_test;
pub mod state_sync_performance;
pub mod three_region_simulation_test;
pub mod twin_validator_test;
pub mod validator_join_leave_test;
pub mod validator_reboot_stress_test;

use anyhow::{anyhow, ensure};
Expand Down Expand Up @@ -116,7 +117,7 @@ pub trait NetworkLoadTest: Test {
fn setup(&self, _ctx: &mut NetworkContext) -> Result<LoadDestination> {
Ok(LoadDestination::AllNodes)
}
// Load is started before this funciton is called, and stops after this function returns.
// Load is started before this function is called, and stops after this function returns.
// Expected duration is passed into this function, expecting this function to take that much
// time to finish. How long this function takes will dictate how long the actual test lasts.
fn test(&self, _swarm: &mut dyn Swarm, duration: Duration) -> Result<()> {
Expand Down
237 changes: 237 additions & 0 deletions testsuite/testcases/src/validator_join_leave_test.rs
@@ -0,0 +1,237 @@
// Copyright (c) Aptos
// SPDX-License-Identifier: Apache-2.0

use crate::{LoadDestination, NetworkLoadTest};
use aptos::account::create::DEFAULT_FUNDED_COINS;
use aptos_logger::info;
use aptos_sdk::crypto::ed25519::Ed25519PrivateKey;
use aptos_sdk::crypto::PrivateKey;
use forge::{
reconfig, NetworkContext, NetworkTest, NodeExt, Result, Swarm, SwarmExt, Test, FORGE_KEY_SEED,
};

use aptos_keygen::KeyGen;

use aptos::test::CliTestFramework;
use aptos_types::account_address::AccountAddress;
use aptos_types::transaction::authenticator::AuthenticationKey;
use std::time::Duration;
use tokio::runtime::Runtime;

/// Number of seconds passed (as a `Duration`) to `wait_for_all_nodes_to_catchup`
/// when the test waits for the swarm to synchronize.
const MAX_NODE_LAG_SECS: u64 = 360;

/// Forge network test in which the last third of the validators leave the
/// validator set and later rejoin it while load is running.
pub struct ValidatorJoinLeaveTest;

impl Test for ValidatorJoinLeaveTest {
/// Returns the fixed name of this test; the same value is included in the
/// test's setup log line.
fn name(&self) -> &'static str {
"validator join and leave sets"
}
}

impl NetworkLoadTest for ValidatorJoinLeaveTest {
fn setup(&self, _ctx: &mut NetworkContext) -> Result<LoadDestination> {
Ok(LoadDestination::AllValidators)
}

fn test(&self, swarm: &mut dyn Swarm, duration: Duration) -> Result<()> {
// Verify we have at least 7 validators (i.e., 3f+1, where f is 2)
zcchahaha marked this conversation as resolved.
Show resolved Hide resolved
// so we can lose 2 validators but still make progress.
let all_validators = swarm.validators().map(|v| v.peer_id()).collect::<Vec<_>>();
let num_validators = all_validators.len();
if num_validators < 7 {
return Err(anyhow::format_err!(
"ValidatorSet leaving and rejoining test require at least 7 validators! Given: {:?}.",
num_validators
));
}

let faucet_endpoint: reqwest::Url = "http://localhost:8081".parse().unwrap();
// Connect the operator tool to the node's JSON RPC API
let rest_client = swarm.validators().next().unwrap().rest_client();
let transaction_factory = swarm.chain_info().transaction_factory();
let runtime = Runtime::new().unwrap();

let mut cli = runtime.block_on(async {
CliTestFramework::new(
swarm.validators().next().unwrap().rest_api_endpoint(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use rest_client.clone() here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parameter is a URL, not a rest client.

faucet_endpoint,
/*num_cli_accounts=*/ 0,
)
.await
});

let mut public_info = swarm.chain_info().into_aptos_public_info();

let mut validator_cli_indices = Vec::new();

let starting_seed_in_decimal = i64::from_str_radix(FORGE_KEY_SEED, 16)?;

for i in 0..num_validators {
// Initialize KeyGen to get validator private keys. We use the same seed in the test
// driver as in the genesis script so that the validator keys are deterministic.
let mut seed_slice = [0u8; 32];
let seed_in_decimal = starting_seed_in_decimal + (i as i64);
let seed_in_hex_string = format!("{seed_in_decimal:0>64x}");

hex::decode_to_slice(seed_in_hex_string, &mut seed_slice)?;

let mut keygen = KeyGen::from_seed(seed_slice);

let (validator_cli_index, _keys, account_balance) = runtime.block_on(async {
let (validator_cli_index, keys) =
init_validator_account(&mut cli, &mut keygen).await;

let auth_key = AuthenticationKey::ed25519(&keys.account_private_key.public_key());
let validator_account_address = AccountAddress::new(*auth_key.derived_address());

public_info
.mint(validator_account_address, DEFAULT_FUNDED_COINS)
.await
.unwrap();

let account_balance = public_info
.get_balance(validator_account_address)
.await
.unwrap();

(validator_cli_index, keys, account_balance)
});
assert_eq!(account_balance, DEFAULT_FUNDED_COINS);
validator_cli_indices.push(validator_cli_index);

assert_eq!(
runtime.block_on(get_validator_state(&cli, validator_cli_index)),
ValidatorState::ACTIVE
);
}

// Log the test setup
info!(
"Running validator join and leave test {:?} with {:?} validators.",
self.name(),
num_validators,
);

// Wait for all nodes to synchronize and stabilize.
info!("Waiting for the validators to be synchronized.");
runtime.block_on(async {
swarm
.wait_for_all_nodes_to_catchup(Duration::from_secs(MAX_NODE_LAG_SECS))
.await
})?;

// Wait for 1/3 of the test duration.
std::thread::sleep(duration / 3);

runtime.block_on(async {
// 1/3 validators leave the validator set.
info!("Make the last 1/3 validators leave the validator set!");
for operator_index in validator_cli_indices.iter().rev().take(num_validators / 3) {
cli.leave_validator_set(*operator_index, None)
.await
.unwrap();

reconfig(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, you should probably do reconfig once at the end; this will create a bunch of epochs.

we also want to test that they can all ask to leave simultaneously

in the follow-up randomized test you are mentioning below, you can randomly select how many validators to leave simultaneously, before reconfig.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a reconfig after the loop.

&rest_client,
&transaction_factory,
swarm.chain_info().root_account(),
)
.await;
}

reconfig(
&rest_client,
&transaction_factory,
swarm.chain_info().root_account(),
)
.await;
});

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would add a meaningful wait time here. i.e. you can split test_duration into 3 , instead of 2, and have this order:

  • 1/3 of duration
  • leave validator set
  • 1/3 of duration
  • join validator set
  • 1/3 of duration

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to this, it would also be good to do some sort of stress testing on nodes joining/leaving the validator set. How about we do the above operation in a loop several times, and at the end ensure that the validators are able to catch up with the network?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Split the test into 3 durations.
With regards to the stress testing, can we do it in another PR and a separate test?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A separate PR sounds good to me.

// Wait for 1/3 of the test duration.
std::thread::sleep(duration / 3);

runtime.block_on(async {
// Rejoining validator set.
info!("Make the last 1/3 validators rejoin the validator set!");
for operator_index in validator_cli_indices.iter().rev().take(num_validators / 3) {
cli.join_validator_set(*operator_index, None).await.unwrap();

reconfig(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same, to reconfig once , after the loop

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

&rest_client,
&transaction_factory,
swarm.chain_info().root_account(),
)
.await;
}

reconfig(
&rest_client,
&transaction_factory,
swarm.chain_info().root_account(),
)
.await;
});

// Wait for all nodes to synchronize and stabilize.
info!("Waiting for the validators to be synchronized.");
runtime.block_on(async {
swarm
.wait_for_all_nodes_to_catchup(Duration::from_secs(MAX_NODE_LAG_SECS))
.await
})?;

Ok(())
}
}

impl NetworkTest for ValidatorJoinLeaveTest {
/// Runs the test by delegating to the `run` function of `NetworkLoadTest`,
/// passing `self` and the network context through unchanged.
fn run<'t>(&self, ctx: &mut NetworkContext<'t>) -> Result<()> {
<dyn NetworkLoadTest>::run(self, ctx)
}
}

/// Membership status of a validator, as classified by `get_validator_state`
/// from the lists in the validator set returned by the CLI.
#[derive(Debug, PartialEq, Eq)]
enum ValidatorState {
/// The address appears in the set's `active_validators` list.
ACTIVE,
/// The address appears in the set's `pending_active` list.
JOINING,
/// The address appears in the set's `pending_inactive` list.
LEAVING,
/// The address appears in none of the three lists above.
NONE,
}

/// Key material generated for one validator in this test.
struct ValidatorNodeKeys {
// Ed25519 private key of the validator's account; registered with the CLI
// test framework and used to derive the account address that gets funded.
account_private_key: Ed25519PrivateKey,
}

impl ValidatorNodeKeys {
    /// Generates the validator's account key by drawing one Ed25519 private
    /// key from `keygen`.
    pub fn new(keygen: &mut KeyGen) -> Self {
        let account_private_key = keygen.generate_ed25519_private_key();
        Self {
            account_private_key,
        }
    }
}

/// Generates keys for one validator and registers its account key with the CLI.
///
/// Returns the index the CLI test framework assigned to the account, together
/// with the freshly generated keys.
async fn init_validator_account(
    cli: &mut CliTestFramework,
    keygen: &mut KeyGen,
) -> (usize, ValidatorNodeKeys) {
    let keys = ValidatorNodeKeys::new(keygen);
    // The CLI takes its own copy of the key; the original is handed back to the caller.
    let cli_index = cli.add_account_to_cli(keys.account_private_key.clone());
    (cli_index, keys)
}

/// Classifies the CLI account at `pool_index` by which validator-set list
/// contains its address.
///
/// The lists are checked in the order active, pending-active, pending-inactive;
/// the first match wins. Returns `ValidatorState::NONE` when the address is in
/// none of them. Panics if fetching the validator set fails.
async fn get_validator_state(cli: &CliTestFramework, pool_index: usize) -> ValidatorState {
    let validator_set = cli.show_validator_set().await.unwrap();
    let pool_address = cli.account_id(pool_index);

    [
        (ValidatorState::ACTIVE, &validator_set.active_validators),
        (ValidatorState::JOINING, &validator_set.pending_active),
        (ValidatorState::LEAVING, &validator_set.pending_inactive),
    ]
    .into_iter()
    .find(|(_, members)| {
        members
            .iter()
            .any(|validator| validator.account_address == pool_address)
    })
    .map_or(ValidatorState::NONE, |(state, _)| state)
}