From 5ee93df51e3399c9428e4c0e8ea608bc3b29494a Mon Sep 17 00:00:00 2001 From: vahe1994 Date: Sun, 8 Jan 2023 14:19:31 +0400 Subject: [PATCH 01/10] 1. Added relay options to servers 2. Enabled relay options by default 3. Changed hivemind version to 1.1.5 --- setup.cfg | 2 +- src/petals/cli/run_server.py | 8 +++++++- src/petals/client/remote_model.py | 2 ++ src/petals/server/server.py | 5 ++++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 11513bd45..3ba993ea0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = huggingface-hub==0.11.1 transformers==4.25.1 speedtest-cli==2.1.3 - hivemind==1.1.3 + hivemind==1.1.5 tensor_parallel==1.0.23 humanfriendly async-timeout>=4.0.2 diff --git a/src/petals/cli/run_server.py b/src/petals/cli/run_server.py index e08993769..7ae16288c 100644 --- a/src/petals/cli/run_server.py +++ b/src/petals/cli/run_server.py @@ -127,7 +127,10 @@ def main(): parser.add_argument("--mean_balance_check_period", type=float, default=60, help="Check the swarm's balance every N seconds (and rebalance it if necessary)") + parser.add_argument("--auto_relay", action='store_true', help="Enabling relay for NAT traversal") + parser.add_argument('--no-auto_relay', dest='auto_relay', action='store_false') parser.add_argument("--use_auth_token", action='store_true', help="auth token for from_pretrained") + parser.add_argument('--load_in_8bit', type=str, default=None, help="Convert the loaded transformer blocks into mixed-8bit quantized model. " "Default: True if GPU is available. Use `--load_in_8bit False` to disable this") @@ -140,7 +143,7 @@ def main(): help="Skip checking this server's reachability via health.petals.ml " "when connecting to the public swarm. If you connect to a private swarm, " "the check is skipped by default. Use this option only if you know what you are doing") - + parser.set_defaults(auto_relay=True) # fmt:on args = vars(parser.parse_args()) args.pop("config", None) @@ -158,6 +161,8 @@ def main(): announce_maddrs = args.pop("announce_maddrs") public_ip = args.pop("public_ip") + use_auto_relay = args.pop("auto_relay") + if public_ip is not None: assert announce_maddrs is None, "You can't use --public_ip and --announce_maddrs at the same time" assert port != 0, "Please specify a fixed non-zero --port when you use --public_ip (e.g., --port 31337)" @@ -197,6 +202,7 @@ def main(): compression=compression, max_disk_space=max_disk_space, attn_cache_size=attn_cache_size, + use_auto_relay=use_auto_relay, ) try: server.run() diff --git a/src/petals/client/remote_model.py b/src/petals/client/remote_model.py index 3e52e40f6..5d22bfd52 100644 --- a/src/petals/client/remote_model.py +++ b/src/petals/client/remote_model.py @@ -107,6 +107,8 @@ def __init__(self, config: DistributedBloomConfig): num_workers=n_layer, startup_timeout=config.daemon_startup_timeout, start=True, + use_relay=True, + use_auto_relay=True, ) ) assert isinstance(dht, hivemind.DHT) and dht.is_alive(), "dht must be a running hivemind.DHT instance" diff --git a/src/petals/server/server.py b/src/petals/server/server.py index a8927aa3b..f00b0c5b8 100644 --- a/src/petals/server/server.py +++ b/src/petals/server/server.py @@ -78,6 +78,8 @@ def __init__( load_in_8bit: Optional[bool] = None, tensor_parallel_devices: Optional[Sequence[torch.device]] = None, skip_reachability_check: bool = False, + use_relay: bool = True, + use_auto_relay: bool = True, **kwargs, ): """Create a server with one or more bloom blocks. See run_server.py for documentation.""" @@ -117,7 +119,8 @@ def __init__( ) self.module_uids = [f"{self.prefix}.{block_index}" for block_index in range(self.block_config.n_layer)] - self.dht = DHT(initial_peers=initial_peers, start=True, num_workers=self.block_config.n_layer, **kwargs) + self.dht = DHT(initial_peers=initial_peers, start=True, num_workers=self.block_config.n_layer, + use_relay=use_relay, use_auto_relay=use_auto_relay, **kwargs) visible_maddrs_str = [str(a) for a in self.dht.get_visible_maddrs()] if initial_peers == PUBLIC_INITIAL_PEERS: logger.info(f"Connecting to the public swarm, peer_id = {self.dht.peer_id}") From d641b9b5544c6e542b7cee9bca7bd7f51f8e2833 Mon Sep 17 00:00:00 2001 From: vahe1994 Date: Sun, 8 Jan 2023 15:43:32 +0400 Subject: [PATCH 02/10] - style reformatting --- src/petals/server/server.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/petals/server/server.py b/src/petals/server/server.py index f00b0c5b8..84810c316 100644 --- a/src/petals/server/server.py +++ b/src/petals/server/server.py @@ -119,8 +119,14 @@ def __init__( ) self.module_uids = [f"{self.prefix}.{block_index}" for block_index in range(self.block_config.n_layer)] - self.dht = DHT(initial_peers=initial_peers, start=True, num_workers=self.block_config.n_layer, - use_relay=use_relay, use_auto_relay=use_auto_relay, **kwargs) + self.dht = DHT( + initial_peers=initial_peers, + start=True, + num_workers=self.block_config.n_layer, + use_relay=use_relay, + use_auto_relay=use_auto_relay, + **kwargs, + ) visible_maddrs_str = [str(a) for a in self.dht.get_visible_maddrs()] if initial_peers == PUBLIC_INITIAL_PEERS: logger.info(f"Connecting to the public swarm, peer_id = {self.dht.peer_id}") From d99d7107e157796f4208c64e023da2fb6a0d8f1f Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Sun, 8 Jan 2023 13:51:07 +0000 Subject: [PATCH 03/10] Refactor CLI arg to look as --use_auto_relay False/True --- src/petals/cli/run_server.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/petals/cli/run_server.py b/src/petals/cli/run_server.py index 7ae16288c..f7643b4c8 100644 --- a/src/petals/cli/run_server.py +++ b/src/petals/cli/run_server.py @@ -12,6 +12,9 @@ logger = get_logger(__file__) +TRUE_CONSTANTS = ["TRUE", "1"] + + def main(): # fmt:off parser = configargparse.ArgParser(default_config_files=["config.yml"], @@ -127,8 +130,9 @@ def main(): parser.add_argument("--mean_balance_check_period", type=float, default=60, help="Check the swarm's balance every N seconds (and rebalance it if necessary)") - parser.add_argument("--auto_relay", action='store_true', help="Enabling relay for NAT traversal") - parser.add_argument('--no-auto_relay', dest='auto_relay', action='store_false') + parser.add_argument("--use_auto_relay", type=str, default="True", + help="Look for libp2p relays for NAT traversal. " + "Use `--use_auto_relay False/True` to disable/enable this") parser.add_argument("--use_auth_token", action='store_true', help="auth token for from_pretrained") parser.add_argument('--load_in_8bit', type=str, default=None, @@ -143,7 +147,7 @@ def main(): help="Skip checking this server's reachability via health.petals.ml " "when connecting to the public swarm. If you connect to a private swarm, " "the check is skipped by default. Use this option only if you know what you are doing") - parser.set_defaults(auto_relay=True) + # fmt:on args = vars(parser.parse_args()) args.pop("config", None) @@ -161,8 +165,6 @@ def main(): announce_maddrs = args.pop("announce_maddrs") public_ip = args.pop("public_ip") - use_auto_relay = args.pop("auto_relay") - if public_ip is not None: assert announce_maddrs is None, "You can't use --public_ip and --announce_maddrs at the same time" assert port != 0, "Please specify a fixed non-zero --port when you use --public_ip (e.g., --port 31337)" @@ -191,9 +193,11 @@ def main(): if args.pop("new_swarm"): args["initial_peers"] = [] + use_auto_relay = args.pop("use_auto_relay").upper() in TRUE_CONSTANTS + load_in_8bit = args.pop("load_in_8bit") if load_in_8bit is not None: - args["load_in_8bit"] = load_in_8bit.lower() in ["true", "1"] + load_in_8bit = load_in_8bit.upper() in TRUE_CONSTANTS server = Server( **args, @@ -203,6 +207,7 @@ def main(): max_disk_space=max_disk_space, attn_cache_size=attn_cache_size, use_auto_relay=use_auto_relay, + load_in_8bit=load_in_8bit, ) try: server.run() From 5e1c9fc9d15b623e8562c85bcf34af329dc1e402 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Mon, 9 Jan 2023 06:58:54 +0000 Subject: [PATCH 04/10] Delay reachability check, add retries to it --- src/petals/server/server.py | 60 +++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/src/petals/server/server.py b/src/petals/server/server.py index 84810c316..b8a14ae07 100644 --- a/src/petals/server/server.py +++ b/src/petals/server/server.py @@ -130,8 +130,6 @@ def __init__( visible_maddrs_str = [str(a) for a in self.dht.get_visible_maddrs()] if initial_peers == PUBLIC_INITIAL_PEERS: logger.info(f"Connecting to the public swarm, peer_id = {self.dht.peer_id}") - if not skip_reachability_check: - self._check_reachability() else: logger.info(f"Running DHT node on {visible_maddrs_str}, initial peers = {initial_peers}") @@ -203,31 +201,43 @@ def __init__( self.mean_balance_check_period = mean_balance_check_period self.mean_block_selection_delay = mean_block_selection_delay - self.stop = threading.Event() + # We delay the reachability check to the end of init, so the server has time to join libp2p relays + if not skip_reachability_check and initial_peers == PUBLIC_INITIAL_PEERS: + self._check_reachability() - def _check_reachability(self): - try: - r = requests.get(f"http://health.petals.ml/api/v1/is_reachable/{self.dht.peer_id}", timeout=10) - r.raise_for_status() - response = r.json() - except Exception as e: - logger.warning(f"Skipping reachability check because health.petals.ml is down: {repr(e)}") - return - - if not response["success"]: - # This happens only if health.petals.ml is up and explicitly told us that we are unreachable - raise RuntimeError( - f"Server is not reachable from the Internet:\n\n" - f"{response['message']}\n\n" - f"You need to fix your port forwarding and/or firewall settings. How to do that:\n\n" - f" 1. Choose a specific port for the Petals server, for example, 31337.\n" - f" 2. Ensure that this port is accessible from the Internet and not blocked by your firewall.\n" - f" 3. Add these arguments to explicitly announce your IP address and port to other peers:\n" - f" python -m petals.cli.run_server ... --public_ip {response['your_ip']} --port 31337\n" - f" 4. If it does not help, ask for help in our Discord: https://discord.gg/Wuk8BnrEPH\n" - ) + self.stop = threading.Event() - logger.info("Server is reachable from the Internet, it will appear at http://health.petals.ml soon") + def _check_reachability(self, n_retries=10, retry_delay=30): + for i in range(n_retries): + try: + r = requests.get(f"http://health.petals.ml/api/v1/is_reachable/{self.dht.peer_id}", timeout=10) + r.raise_for_status() + response = r.json() + + if response["success"]: + logger.info( + f"Server is reachable from the Internet. " + f"It will appear at http://health.petals.ml soon, peer_id = {self.dht.peer_id}" + ) + return + + if i < n_retries - 1: + logger.info(f"Server is not reachable from the Internet yet. Retrying in {retry_delay} sec") + time.sleep(retry_delay) + except Exception as e: + logger.warning(f"Skipping reachability check because health.petals.ml is down: {repr(e)}") + return + + raise RuntimeError( + f"Server has not become reachable from the Internet:\n\n" + f"{response['message']}\n\n" + f"You need to fix your port forwarding and/or firewall settings. How to do that:\n\n" + f" 1. Choose a specific port for the Petals server, for example, 31337.\n" + f" 2. Ensure that this port is accessible from the Internet and not blocked by your firewall.\n" + f" 3. Add these arguments to explicitly announce your IP address and port to other peers:\n" + f" python -m petals.cli.run_server ... --public_ip {response['your_ip']} --port 31337\n" + f" 4. If it does not help, ask for help in our Discord: https://discord.gg/Wuk8BnrEPH\n" + ) def _choose_num_blocks(self) -> int: assert ( From a39e5442faeac05ccef033a7dc55e9775a53b477 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Mon, 9 Jan 2023 13:23:35 +0000 Subject: [PATCH 05/10] Leave --no_auto_relay argument only --- src/petals/cli/run_server.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/petals/cli/run_server.py b/src/petals/cli/run_server.py index f7643b4c8..fc0771d27 100644 --- a/src/petals/cli/run_server.py +++ b/src/petals/cli/run_server.py @@ -12,9 +12,6 @@ logger = get_logger(__file__) -TRUE_CONSTANTS = ["TRUE", "1"] - - def main(): # fmt:off parser = configargparse.ArgParser(default_config_files=["config.yml"], @@ -41,6 +38,9 @@ def main(): 'This is a simplified way to set the --announce_maddrs option (see below).' 'Default: server announces IPv4/IPv6 addresses of your network interfaces') + parser.add_argument("--no_auto_relay", action="store_false", dest="use_auto_relay", + help="Do not look for libp2p relays to reach peers behind NATs/firewalls") + parser.add_argument('--host_maddrs', nargs='+', required=False, help='Multiaddrs to listen for external connections from other peers') parser.add_argument('--announce_maddrs', nargs='+', required=False, @@ -130,11 +130,7 @@ def main(): parser.add_argument("--mean_balance_check_period", type=float, default=60, help="Check the swarm's balance every N seconds (and rebalance it if necessary)") - parser.add_argument("--use_auto_relay", type=str, default="True", - help="Look for libp2p relays for NAT traversal. " - "Use `--use_auto_relay False/True` to disable/enable this") parser.add_argument("--use_auth_token", action='store_true', help="auth token for from_pretrained") - parser.add_argument('--load_in_8bit', type=str, default=None, help="Convert the loaded transformer blocks into mixed-8bit quantized model. " "Default: True if GPU is available. Use `--load_in_8bit False` to disable this") @@ -193,11 +189,9 @@ def main(): if args.pop("new_swarm"): args["initial_peers"] = [] - use_auto_relay = args.pop("use_auto_relay").upper() in TRUE_CONSTANTS - load_in_8bit = args.pop("load_in_8bit") if load_in_8bit is not None: - load_in_8bit = load_in_8bit.upper() in TRUE_CONSTANTS + args["load_in_8bit"] = load_in_8bit.lower() in ["true", "1"] server = Server( **args, @@ -206,8 +200,6 @@ def main(): compression=compression, max_disk_space=max_disk_space, attn_cache_size=attn_cache_size, - use_auto_relay=use_auto_relay, - load_in_8bit=load_in_8bit, ) try: server.run() From 59f465fd4356a2f676bfcc0471d883eb1c88dbe7 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Mon, 9 Jan 2023 13:59:55 +0000 Subject: [PATCH 06/10] Shorten info message --- src/petals/server/server.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/petals/server/server.py b/src/petals/server/server.py index b8a14ae07..82666415c 100644 --- a/src/petals/server/server.py +++ b/src/petals/server/server.py @@ -215,10 +215,7 @@ def _check_reachability(self, n_retries=10, retry_delay=30): response = r.json() if response["success"]: - logger.info( - f"Server is reachable from the Internet. " - f"It will appear at http://health.petals.ml soon, peer_id = {self.dht.peer_id}" - ) + logger.info("Server is reachable from the Internet. It will appear at http://health.petals.ml soon") return if i < n_retries - 1: From 6d8322e9eb456f5c096b4d85f04eb9125fd3ce6d Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Mon, 9 Jan 2023 14:13:59 +0000 Subject: [PATCH 07/10] Improve "GPU is not available" message --- src/petals/server/server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/petals/server/server.py b/src/petals/server/server.py index 82666415c..1523a44a7 100644 --- a/src/petals/server/server.py +++ b/src/petals/server/server.py @@ -240,7 +240,9 @@ def _choose_num_blocks(self) -> int: assert ( self.converted_model_name_or_path == "bigscience/bloom-petals" ), "If you use a model other than bigscience/bloom-petals, please specify --num_blocks manually" - assert self.device.type == "cuda", "If you run a non-GPU server, please specify --num_blocks manually" + assert self.device.type == "cuda", \ + "GPU is not available. If you want to run a CPU-only server, please specify --num_blocks. " \ + "CPU-only servers in the public swarm are discouraged since they are much slower" num_devices = len(self.tensor_parallel_devices) if self.tensor_parallel_devices else 1 if num_devices > 1: From 43b19976a3bf88814656f271247534623455b9fa Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Mon, 9 Jan 2023 15:22:26 +0000 Subject: [PATCH 08/10] Perform reachability check once blocks are loaded to avoid delays --- src/petals/server/reachability.py | 37 +++++++++++++++++++++++++ src/petals/server/server.py | 46 +++++++------------------------ 2 files changed, 47 insertions(+), 36 deletions(-) create mode 100644 src/petals/server/reachability.py diff --git a/src/petals/server/reachability.py b/src/petals/server/reachability.py new file mode 100644 index 000000000..440bc87c9 --- /dev/null +++ b/src/petals/server/reachability.py @@ -0,0 +1,37 @@ +import time + +import requests +from hivemind.utils.logging import get_logger + +logger = get_logger(__file__) + + +def check_reachability(peer_id, wait_time: float = 600, retry_delay: float = 15) -> None: + for attempt_no in range(math.floor(wait_time / retry_delay) + 1): + try: + r = requests.get(f"http://health.petals.ml/api/v1/is_reachable/{peer_id}", timeout=10) + r.raise_for_status() + response = r.json() + + if response["success"]: + logger.info("Server is reachable from the Internet. It will appear at http://health.petals.ml soon") + return + + if attempt_no == 0: + # If health.petals.ml didn't manage to connect right away, we need to wait for libp2p to set up relays + logger.info("Detected a NAT or a firewall, connecting to libp2p relays. This takes a few minutes") + time.sleep(retry_delay) + except Exception as e: + logger.warning(f"Skipping reachability check because health.petals.ml is down: {repr(e)}") + return + + raise RuntimeError( + f"Server has not become reachable from the Internet:\n\n" + f"{response['message']}\n\n" + f"You need to fix your port forwarding and/or firewall settings. How to do that:\n\n" + f" 1. Choose a specific port for the Petals server, for example, 31337.\n" + f" 2. Ensure that this port is accessible from the Internet and not blocked by your firewall.\n" + f" 3. Add these arguments to explicitly announce your IP address and port to other peers:\n" + f" python -m petals.cli.run_server ... --public_ip {response['your_ip']} --port 31337\n" + f" 4. If it does not help, ask for help in our Discord: https://discord.gg/Wuk8BnrEPH\n" + ) diff --git a/src/petals/server/server.py b/src/petals/server/server.py index 1523a44a7..e1a22930a 100644 --- a/src/petals/server/server.py +++ b/src/petals/server/server.py @@ -10,7 +10,6 @@ import numpy as np import psutil -import requests import torch from hivemind import DHT, MAX_DHT_TIME_DISCREPANCY_SECONDS, BatchTensorDescriptor, get_dht_time from hivemind.moe.server.layers import add_custom_models_from_file @@ -28,6 +27,7 @@ from petals.server.block_utils import get_block_size from petals.server.handler import TransformerConnectionHandler from petals.server.memory_cache import MemoryCache +from petals.server.reachability import check_reachability from petals.server.throughput import get_host_throughput from petals.utils.convert_block import check_device_balance, convert_block from petals.utils.disk_cache import DEFAULT_CACHE_DIR @@ -132,6 +132,7 @@ def __init__( logger.info(f"Connecting to the public swarm, peer_id = {self.dht.peer_id}") else: logger.info(f"Running DHT node on {visible_maddrs_str}, initial peers = {initial_peers}") + self.need_reachability_check = not skip_reachability_check and initial_peers == PUBLIC_INITIAL_PEERS if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" @@ -201,48 +202,16 @@ def __init__( self.mean_balance_check_period = mean_balance_check_period self.mean_block_selection_delay = mean_block_selection_delay - # We delay the reachability check to the end of init, so the server has time to join libp2p relays - if not skip_reachability_check and initial_peers == PUBLIC_INITIAL_PEERS: - self._check_reachability() - self.stop = threading.Event() - def _check_reachability(self, n_retries=10, retry_delay=30): - for i in range(n_retries): - try: - r = requests.get(f"http://health.petals.ml/api/v1/is_reachable/{self.dht.peer_id}", timeout=10) - r.raise_for_status() - response = r.json() - - if response["success"]: - logger.info("Server is reachable from the Internet. It will appear at http://health.petals.ml soon") - return - - if i < n_retries - 1: - logger.info(f"Server is not reachable from the Internet yet. Retrying in {retry_delay} sec") - time.sleep(retry_delay) - except Exception as e: - logger.warning(f"Skipping reachability check because health.petals.ml is down: {repr(e)}") - return - - raise RuntimeError( - f"Server has not become reachable from the Internet:\n\n" - f"{response['message']}\n\n" - f"You need to fix your port forwarding and/or firewall settings. How to do that:\n\n" - f" 1. Choose a specific port for the Petals server, for example, 31337.\n" - f" 2. Ensure that this port is accessible from the Internet and not blocked by your firewall.\n" - f" 3. Add these arguments to explicitly announce your IP address and port to other peers:\n" - f" python -m petals.cli.run_server ... --public_ip {response['your_ip']} --port 31337\n" - f" 4. If it does not help, ask for help in our Discord: https://discord.gg/Wuk8BnrEPH\n" - ) - def _choose_num_blocks(self) -> int: assert ( self.converted_model_name_or_path == "bigscience/bloom-petals" ), "If you use a model other than bigscience/bloom-petals, please specify --num_blocks manually" - assert self.device.type == "cuda", \ - "GPU is not available. If you want to run a CPU-only server, please specify --num_blocks. " \ + assert self.device.type == "cuda", ( + "GPU is not available. If you want to run a CPU-only server, please specify --num_blocks. " "CPU-only servers in the public swarm are discouraged since they are much slower" + ) num_devices = len(self.tensor_parallel_devices) if self.tensor_parallel_devices else 1 if num_devices > 1: @@ -305,6 +274,7 @@ def run(self): use_auth_token=self.use_auth_token, load_in_8bit=self.load_in_8bit, tensor_parallel_devices=self.tensor_parallel_devices, + need_reachability_check=self.need_reachability_check, start=True, ) try: @@ -398,6 +368,7 @@ def create( use_auth_token: Optional[str], load_in_8bit: bool, tensor_parallel_devices: Sequence[torch.device], + need_reachability_check: bool, **kwargs, ) -> ModuleContainer: module_uids = [f"{prefix}.{block_index}" for block_index in block_indices] @@ -451,6 +422,9 @@ def create( min_batch_size=min_batch_size, max_batch_size=max_batch_size, ) + + if need_reachability_check: + check_reachability(dht.peer_id) except: logger.debug("Shutting down backends") for backend in blocks.values(): From 434b630527db282bd15dac997dad1e24d52efc61 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Mon, 9 Jan 2023 15:29:59 +0000 Subject: [PATCH 09/10] Update constant and comment --- src/petals/server/reachability.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/petals/server/reachability.py b/src/petals/server/reachability.py index 440bc87c9..6dcb3f28f 100644 --- a/src/petals/server/reachability.py +++ b/src/petals/server/reachability.py @@ -6,7 +6,7 @@ logger = get_logger(__file__) -def check_reachability(peer_id, wait_time: float = 600, retry_delay: float = 15) -> None: +def check_reachability(peer_id, wait_time: float = 7 * 60, retry_delay: float = 15) -> None: for attempt_no in range(math.floor(wait_time / retry_delay) + 1): try: r = requests.get(f"http://health.petals.ml/api/v1/is_reachable/{peer_id}", timeout=10) @@ -18,7 +18,8 @@ def check_reachability(peer_id, wait_time: float = 600, retry_delay: float = 15) return if attempt_no == 0: - # If health.petals.ml didn't manage to connect right away, we need to wait for libp2p to set up relays + # Usually, libp2p manages to set up relays before we finish loading blocks. + # In other cases, we may need to wait for up to `wait_time` seconds before it's done. logger.info("Detected a NAT or a firewall, connecting to libp2p relays. This takes a few minutes") time.sleep(retry_delay) except Exception as e: From dbf504bf18b6b5392553c269070be58e90517bf0 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Mon, 9 Jan 2023 15:45:05 +0000 Subject: [PATCH 10/10] Fix imports --- src/petals/server/reachability.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/petals/server/reachability.py b/src/petals/server/reachability.py index 6dcb3f28f..d8b5fba25 100644 --- a/src/petals/server/reachability.py +++ b/src/petals/server/reachability.py @@ -1,3 +1,4 @@ +import math import time import requests