shell: extend the `nodes` command with [-r|--resolve_ip] and [-u|--resource_usage], and stop
tracking rdb.block_cache.memory_usage per app.

* info_collector.cpp, command_helper.h: remove the rdb_block_cache_mem_usage field of row_data,
  its accumulation, and the matching app-level counter.
* commands.h (ls_nodes): fetch the node map through ddl_client->list_nodes(s, nodes) and print it
  locally with table_printer. With -d, count primaries and secondaries per node from the
  partition configuration of every available app. With -u, send the "perf-counters" remote
  command to the "replica-server" nodes and show resident memory, block cache, memtable,
  index/filter memory and disk-available ratios per node. With -r, show host names obtained from
  hostname_from_ip_port.
* commands.h (app_stat): print mem_tbl_mb and mem_idx_mb before hit_rate; the trailing
  rdb_mem_mb column is removed.
* main.cpp: document the two new options in the usage string of `nodes`.

NOTE(review): this copy of the patch had its whitespace collapsed; leading indentation of
context and removed lines is reconstructed. If strict matching fails, apply with
`git apply --ignore-whitespace`.

diff --git a/src/server/info_collector.cpp b/src/server/info_collector.cpp
index 258f851bf7..49536e533f 100644
--- a/src/server/info_collector.cpp
+++ b/src/server/info_collector.cpp
@@ -102,7 +102,6 @@ void info_collector::on_app_stat()
         all.storage_count += row.storage_count;
         all.rdb_block_cache_hit_count += row.rdb_block_cache_hit_count;
         all.rdb_block_cache_total_count += row.rdb_block_cache_total_count;
-        all.rdb_block_cache_mem_usage += row.rdb_block_cache_mem_usage;
         all.rdb_index_and_filter_blocks_mem_usage += row.rdb_index_and_filter_blocks_mem_usage;
         all.rdb_memtable_mem_usage += row.rdb_memtable_mem_usage;
         read_qps[i] = row.get_qps + row.multi_get_qps + row.scan_qps;
@@ -139,7 +138,6 @@ void info_collector::on_app_stat()
             std::abs(row.rdb_block_cache_total_count) < 1e-6
                 ? 0
                 : row.rdb_block_cache_hit_count / row.rdb_block_cache_total_count * 1000000);
-        counters->rdb_block_cache_mem_usage->set(row.rdb_block_cache_mem_usage);
         counters->rdb_index_and_filter_blocks_mem_usage->set(
             row.rdb_index_and_filter_blocks_mem_usage);
         counters->rdb_memtable_mem_usage->set(row.rdb_memtable_mem_usage);
@@ -192,7 +190,6 @@ info_collector::AppStatCounters *info_collector::get_app_counters(const std::str
     INIT_COUNTER(storage_mb);
     INIT_COUNTER(storage_count);
     INIT_COUNTER(rdb_block_cache_hit_rate);
-    INIT_COUNTER(rdb_block_cache_mem_usage);
     INIT_COUNTER(rdb_index_and_filter_blocks_mem_usage);
     INIT_COUNTER(rdb_memtable_mem_usage);
     INIT_COUNTER(read_qps);
diff --git a/src/shell/command_helper.h b/src/shell/command_helper.h
index 1034d738c6..7420c47b75 100644
--- a/src/shell/command_helper.h
+++ b/src/shell/command_helper.h
@@ -485,7 +485,6 @@ struct row_data
     double storage_count = 0;
     double rdb_block_cache_hit_count = 0;
     double rdb_block_cache_total_count = 0;
-    double rdb_block_cache_mem_usage = 0;
     double rdb_index_and_filter_blocks_mem_usage = 0;
     double rdb_memtable_mem_usage = 0;
 };
@@ -531,8 +530,6 @@ update_app_pegasus_perf_counter(row_data &row, const std::string &counter_name,
         row.rdb_block_cache_hit_count += value;
     else if (counter_name == "rdb.block_cache.total_count")
         row.rdb_block_cache_total_count += value;
-    else if (counter_name == "rdb.block_cache.memory_usage")
-        row.rdb_block_cache_mem_usage += value;
     else if (counter_name == "rdb.index_and_filter_blocks.memory_usage")
         row.rdb_index_and_filter_blocks_mem_usage += value;
     else if (counter_name == "rdb.memtable.memory_usage")
diff --git a/src/shell/commands.h b/src/shell/commands.h
index 92fa00e5d3..0f56cba836 100644
--- a/src/shell/commands.h
+++ b/src/shell/commands.h
@@ -164,9 +164,37 @@ inline bool ls_apps(command_executor *e, shell_context *sc, arguments args)
     return true;
 }
 
+struct list_nodes_helper
+{
+    std::string node_name;
+    std::string node_status;
+    int primary_count;
+    int secondary_count;
+    int64_t memused_res_mb;
+    int64_t block_cache_bytes;
+    int64_t mem_tbl_bytes;
+    int64_t mem_idx_bytes;
+    int64_t disk_available_total_ratio;
+    int64_t disk_available_min_ratio;
+    list_nodes_helper(const std::string &n, const std::string &s)
+        : node_name(n),
+          node_status(s),
+          primary_count(0),
+          secondary_count(0),
+          memused_res_mb(0),
+          block_cache_bytes(0),
+          mem_tbl_bytes(0),
+          mem_idx_bytes(0),
+          disk_available_total_ratio(0),
+          disk_available_min_ratio(0)
+    {
+    }
+};
 inline bool ls_nodes(command_executor *e, shell_context *sc, arguments args)
 {
     static struct option long_options[] = {{"detailed", no_argument, 0, 'd'},
+                                           {"resolve_ip", no_argument, 0, 'r'},
+                                           {"resource_usage", no_argument, 0, 'u'},
                                            {"status", required_argument, 0, 's'},
                                            {"output", required_argument, 0, 'o'},
                                            {0, 0, 0, 0}};
@@ -174,17 +202,25 @@ inline bool ls_nodes(command_executor *e, shell_context *sc, arguments args)
     std::string status;
     std::string output_file;
     bool detailed = false;
+    bool resolve_ip = false;
+    bool resource_usage = false;
     optind = 0;
     while (true) {
         int option_index = 0;
         int c;
-        c = getopt_long(args.argc, args.argv, "ds:o:", long_options, &option_index);
+        c = getopt_long(args.argc, args.argv, "drus:o:", long_options, &option_index);
         if (c == -1)
             break;
         switch (c) {
         case 'd':
             detailed = true;
             break;
+        case 'r':
+            resolve_ip = true;
+            break;
+        case 'u':
+            resource_usage = true;
+            break;
         case 's':
             status = optarg;
             break;
@@ -217,9 +253,184 @@ inline bool ls_nodes(command_executor *e, shell_context *sc, arguments args)
                       status.c_str());
     }
 
-    ::dsn::error_code err = sc->ddl_client->list_nodes(s, detailed, output_file);
-    if (err != ::dsn::ERR_OK)
-        std::cout << "list nodes failed, error=" << err.to_string() << std::endl;
+    std::map<dsn::rpc_address, dsn::replication::node_status::type> nodes;
+    auto r = sc->ddl_client->list_nodes(s, nodes);
+    if (r != dsn::ERR_OK) {
+        std::cout << "list nodes failed, error=" << r.to_string() << std::endl;
+        return true;
+    }
+
+    std::map<dsn::rpc_address, list_nodes_helper> tmp_map;
+    int alive_node_count = 0;
+    for (auto &kv : nodes) {
+        if (kv.second == dsn::replication::node_status::NS_ALIVE)
+            alive_node_count++;
+        std::string status_str = dsn::enum_to_string(kv.second);
+        status_str = status_str.substr(status_str.find("NS_") + 3);
+        std::string node_name = kv.first.to_std_string();
+        if (resolve_ip) {
+            // TODO: put hostname_from_ip_port into common utils
+            node_name = sc->ddl_client->hostname_from_ip_port(node_name.c_str());
+        }
+        tmp_map.emplace(kv.first, list_nodes_helper(node_name, status_str));
+    }
+
+    if (detailed) {
+        std::vector<::dsn::app_info> apps;
+        r = sc->ddl_client->list_apps(dsn::app_status::AS_AVAILABLE, apps);
+        if (r != dsn::ERR_OK) {
+            std::cout << "list apps failed, error=" << r.to_string() << std::endl;
+            return true;
+        }
+
+        for (auto &app : apps) {
+            int32_t app_id;
+            int32_t partition_count;
+            std::vector<dsn::partition_configuration> partitions;
+            r = sc->ddl_client->list_app(app.app_name, app_id, partition_count, partitions);
+            if (r != dsn::ERR_OK) {
+                std::cout << "list app " << app.app_name << " failed, error=" << r.to_string()
+                          << std::endl;
+                return true;
+            }
+
+            for (const dsn::partition_configuration &p : partitions) {
+                if (!p.primary.is_invalid()) {
+                    auto find = tmp_map.find(p.primary);
+                    if (find != tmp_map.end()) {
+                        find->second.primary_count++;
+                    }
+                }
+                for (const dsn::rpc_address &addr : p.secondaries) {
+                    auto find = tmp_map.find(addr);
+                    if (find != tmp_map.end()) {
+                        find->second.secondary_count++;
+                    }
+                }
+            }
+        }
+    }
+
+    if (resource_usage) {
+        std::vector<node_desc> nodes;
+        if (!fill_nodes(sc, "replica-server", nodes)) {
+            derror("get replica server node list failed");
+            return true;
+        }
+
+        ::dsn::command command;
+        command.cmd = "perf-counters";
+        command.arguments.push_back(".*memused.res(MB)");
+        command.arguments.push_back(".*rdb.block_cache.memory_usage");
+        command.arguments.push_back(".*disk.available.total.ratio");
+        command.arguments.push_back(".*disk.available.min.ratio");
+        command.arguments.push_back(".*@.*");
+        std::vector<std::pair<bool, std::string>> results;
+        call_remote_command(sc, nodes, command, results);
+
+        for (int i = 0; i < nodes.size(); ++i) {
+            dsn::rpc_address node_addr = nodes[i].address;
+            auto tmp_it = tmp_map.find(node_addr);
+            if (tmp_it == tmp_map.end())
+                continue;
+            if (!results[i].first) {
+                derror("query perf counter info from node %s failed", node_addr.to_string());
+                return true;
+            }
+            dsn::perf_counter_info info;
+            dsn::blob bb(results[i].second.data(), 0, results[i].second.size());
+            if (!dsn::json::json_forwarder<dsn::perf_counter_info>::decode(bb, info)) {
+                derror("decode perf counter info from node %s failed, result = %s",
+                       node_addr.to_string(),
+                       results[i].second.c_str());
+                return true;
+            }
+            if (info.result != "OK") {
+                derror("query perf counter info from node %s returns error, error = %s",
+                       node_addr.to_string(),
+                       info.result.c_str());
+                return true;
+            }
+            list_nodes_helper &h = tmp_it->second;
+            for (dsn::perf_counter_metric &m : info.counters) {
+                if (m.name == "replica*server*memused.res(MB)")
+                    h.memused_res_mb = m.value;
+                else if (m.name == "replica*app.pegasus*rdb.block_cache.memory_usage")
+                    h.block_cache_bytes = m.value;
+                else if (m.name == "replica*eon.replica_stub*disk.available.total.ratio")
+                    h.disk_available_total_ratio = m.value;
+                else if (m.name == "replica*eon.replica_stub*disk.available.min.ratio")
+                    h.disk_available_min_ratio = m.value;
+                else {
+                    int32_t app_id_x, partition_index_x;
+                    std::string counter_name;
+                    bool parse_ret = parse_app_pegasus_perf_counter_name(
+                        m.name, app_id_x, partition_index_x, counter_name);
+                    dassert(parse_ret, "name = %s", m.name.c_str());
+                    if (counter_name == "rdb.memtable.memory_usage")
+                        h.mem_tbl_bytes += m.value;
+                    else if (counter_name == "rdb.index_and_filter_blocks.memory_usage")
+                        h.mem_idx_bytes += m.value;
+                }
+            }
+        }
+    }
+
+    // print configuration_list_nodes_response
+    std::streambuf *buf;
+    std::ofstream of;
+
+    if (!output_file.empty()) {
+        of.open(output_file);
+        buf = of.rdbuf();
+    } else {
+        buf = std::cout.rdbuf();
+    }
+    std::ostream out(buf);
+
+    dsn::utils::table_printer tp;
+    tp.add_title("address");
+    tp.add_column("status");
+    if (detailed) {
+        tp.add_column("replica_count", tp_alignment::kRight);
+        tp.add_column("primary_count", tp_alignment::kRight);
+        tp.add_column("secondary_count", tp_alignment::kRight);
+    }
+    if (resource_usage) {
+        tp.add_column("memused_res_mb", tp_alignment::kRight);
+        tp.add_column("block_cache_mb", tp_alignment::kRight);
+        tp.add_column("mem_tbl_mb", tp_alignment::kRight);
+        tp.add_column("mem_idx_mb", tp_alignment::kRight);
+        tp.add_column("disk_avl_total_ratio", tp_alignment::kRight);
+        tp.add_column("disk_avl_min_ratio", tp_alignment::kRight);
+    }
+    for (auto &kv : tmp_map) {
+        tp.add_row(kv.second.node_name);
+        tp.append_data(kv.second.node_status);
+        if (detailed) {
+            tp.append_data(kv.second.primary_count + kv.second.secondary_count);
+            tp.append_data(kv.second.primary_count);
+            tp.append_data(kv.second.secondary_count);
+        }
+        if (resource_usage) {
+            tp.append_data(kv.second.memused_res_mb);
+            tp.append_data(kv.second.block_cache_bytes / (1 << 20U));
+            tp.append_data(kv.second.mem_tbl_bytes / (1 << 20U));
+            tp.append_data(kv.second.mem_idx_bytes / (1 << 20U));
+            tp.append_data(kv.second.disk_available_total_ratio);
+            tp.append_data(kv.second.disk_available_min_ratio);
+        }
+    }
+    tp.output(out);
+    out << std::endl;
+
+    dsn::utils::table_printer tp_count;
+    tp_count.add_row_name_and_data("total_node_count", nodes.size());
+    tp_count.add_row_name_and_data("alive_node_count", alive_node_count);
+    tp_count.add_row_name_and_data("unalive_node_count", nodes.size() - alive_node_count);
+    tp_count.output(out, ": ");
+    out << std::endl;
+
     return true;
 }
 
@@ -3772,7 +3983,6 @@ inline bool app_stat(command_executor *e, shell_context *sc, arguments args)
         sum.storage_count += row.storage_count;
         sum.rdb_block_cache_hit_count += row.rdb_block_cache_hit_count;
         sum.rdb_block_cache_total_count += row.rdb_block_cache_total_count;
-        sum.rdb_block_cache_mem_usage += row.rdb_block_cache_mem_usage;
         sum.rdb_index_and_filter_blocks_mem_usage += row.rdb_index_and_filter_blocks_mem_usage;
         sum.rdb_memtable_mem_usage += row.rdb_memtable_mem_usage;
     }
@@ -3808,8 +4018,9 @@ inline bool app_stat(command_executor *e, shell_context *sc, arguments args)
         tp.add_column("rejected", tp_alignment::kRight);
         tp.add_column("file_mb", tp_alignment::kRight);
         tp.add_column("file_num", tp_alignment::kRight);
+        tp.add_column("mem_tbl_mb", tp_alignment::kRight);
+        tp.add_column("mem_idx_mb", tp_alignment::kRight);
         tp.add_column("hit_rate", tp_alignment::kRight);
-        tp.add_column("rdb_mem_mb", tp_alignment::kRight);
     }
 
     for (row_data &row : rows) {
@@ -3832,15 +4043,13 @@ inline bool app_stat(command_executor *e, shell_context *sc, arguments args)
             tp.append_data(row.recent_write_throttling_reject_count);
             tp.append_data(row.storage_mb);
             tp.append_data((uint64_t)row.storage_count);
+            tp.append_data(row.rdb_memtable_mem_usage / (1 << 20U));
+            tp.append_data(row.rdb_index_and_filter_blocks_mem_usage / (1 << 20U));
             double block_cache_hit_rate =
                 std::abs(row.rdb_block_cache_total_count) < 1e-6
                     ? 0.0
                     : row.rdb_block_cache_hit_count / row.rdb_block_cache_total_count;
             tp.append_data(block_cache_hit_rate);
-            tp.append_data((row.rdb_block_cache_mem_usage +
-                            row.rdb_index_and_filter_blocks_mem_usage +
-                            row.rdb_memtable_mem_usage) /
-                           (1 << 20U));
         }
     }
     tp.output(out);
diff --git a/src/shell/main.cpp b/src/shell/main.cpp
index 9ddd4c7dfe..e215d105a5 100644
--- a/src/shell/main.cpp
+++ b/src/shell/main.cpp
@@ -56,7 +56,8 @@ static command_executor commands[] = {
     {
         "nodes",
         "get the node status for this cluster",
-        "[-d|--detailed] [-o|--output file_name] [-s|--status all|alive|unalive]",
+        "[-d|--detailed] [-r|--resolve_ip] [-u|--resource_usage] "
+        "[-o|--output file_name] [-s|--status all|alive|unalive]",
         ls_nodes,
     },
     {