In [1]:
from dask.distributed import WorkerPlugin, get_worker, LocalCluster, Client, Variable
import time
from datetime import datetime, timedelta

In [2]:
cluster = LocalCluster(
    host="0.0.0.0",
    scheduler_kwargs={"external_address": "localhost"},
)



RuntimeError: Cluster failed to start: Server.__init__() got an unexpected keyword argument 'external_address'

In [2]:
cluster = LocalCluster(
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        # memory_limit="2GB",
    )
client = Client(cluster)
client.forward_logging()

{'tcp://127.0.0.1:60732': {'status': 'OK'}}

In [39]:
def dummy_task(n, var_st):
    print(f"task {n} begins...")
    st_dict = var_st.get()
    st_dict["start_time"] = datetime.now()
    var_st.set(st_dict)
    time.sleep(300)
    print(f"task {n} finished")

In [40]:
vars = []
future_dict = {}

symbol = 1
var_st = Variable()
var_st.set({"symbol":symbol})
vars.append(var_st)
future_dict[symbol] = client.submit(dummy_task, symbol, var_st)

In [41]:
symbol = 2
var_st = Variable()
var_st.set({"symbol": symbol})
vars.append(var_st)
future_dict[symbol] = client.submit(dummy_task, symbol, var_st)

In [42]:
symbol = 3
var_st = Variable()
var_st.set({"symbol": symbol})
vars.append(var_st)
future_dict[symbol] = client.submit(dummy_task, symbol, var_st)

In [79]:
for v in vars:
    try:
        vd = v.get("200ms")
        print(vd)
        if not "start_time" in vd:
            continue
        if vd["start_time"] + timedelta(seconds=60) <= datetime.now():
            ## the task is timeout. check future status. if it's still processing, cancel it.
            symbol = vd["symbol"]
            future = future_dict[symbol]
            if not future.done():
                print(f"cancelling task {symbol}")
                future.cancel()
            v.delete()
            vars.remove(v)
    except TimeoutError as e:
        print('normal timeout')
        vars.remove(v)

task 3 finished


2024-04-23 19:58:19,695 - distributed.core - INFO - Connection to tcp://127.0.0.1:60727 has been closed.


In [1]:
# Sample list of tuples
tuples_list = [
    (
        {
            "ar_layer_spec": [4, 15],
            "ar_reg": 49.16784,
            "batch_size": 300,
            "growth": "discontinuous",
            "lagged_reg_layer_spec": [256, 3],
            "n_lags": 19,
            "normalize": "soft1",
            "optimizer": "SGD",
            "seasonality_mode": "additive",
            "seasonality_reg": 47.37235,
            "topk_covar": 295,
            "trend_reg": 63.65796,
            "yearly_seasonality": 21,
        },
        1.2,
    ),
    (
        {
            "ar_layer_spec": [32, 34],
            "ar_reg": 52.31495,
            "batch_size": 400,
            "growth": "discontinuous",
            "lagged_reg_layer_spec": [512, 42],
            "n_lags": 29,
            "normalize": "soft",
            "optimizer": "SGD",
            "seasonality_mode": "multiplicative",
            "seasonality_reg": 25.50586,
            "topk_covar": 537,
            "trend_reg": 75.50941,
            "yearly_seasonality": 29,
        },
        2.3,
    ),
]

# Unpack the list of tuples and use zip to transpose
dicts, nones = zip(*tuples_list)

# Convert the resulting tuples to lists
list_dicts = list(dicts)
list_nones = list(nones)

# Output the results
print("List of dictionaries:", list_dicts)
print("List of None elements:", list_nones)

List of dictionaries: [{'ar_layer_spec': [4, 15], 'ar_reg': 49.16784, 'batch_size': 300, 'growth': 'discontinuous', 'lagged_reg_layer_spec': [256, 3], 'n_lags': 19, 'normalize': 'soft1', 'optimizer': 'SGD', 'seasonality_mode': 'additive', 'seasonality_reg': 47.37235, 'topk_covar': 295, 'trend_reg': 63.65796, 'yearly_seasonality': 21}, {'ar_layer_spec': [32, 34], 'ar_reg': 52.31495, 'batch_size': 400, 'growth': 'discontinuous', 'lagged_reg_layer_spec': [512, 42], 'n_lags': 29, 'normalize': 'soft', 'optimizer': 'SGD', 'seasonality_mode': 'multiplicative', 'seasonality_reg': 25.50586, 'topk_covar': 537, 'trend_reg': 75.50941, 'yearly_seasonality': 29}]
List of None elements: [1.2, 2.3]


# Reproducer for Lock issue

In [1]:
import os

os.environ["PYTHONUNBUFFERED"] = "1"

import time
import random
from datetime import datetime

from dask.distributed import (
    Client,
    LocalCluster,
    get_worker,
    wait,
    Lock,
)

# NOTE: adjust the number of workers as needed. The more the sooner of crash.
n_workers = 6

def dummy_task(i):
    lock = Lock("shared_lock")
    while not lock.acquire("3s"):
        time.sleep(1)
    print(
        f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S")} worker#{get_worker().name} acquired lock and completed task #{i}'
    )
    time.sleep(random.uniform(1,3))
    lock.release()
    return None


def main():
    cluster = LocalCluster(
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    )
    client = Client(cluster)

    futures = []
    i = 0
    while True:
        futures.append(
            client.submit(
                dummy_task,
                i,
            )
        )
        if len(futures) > n_workers * 2:
            _, undone = wait(futures, return_when="FIRST_COMPLETED")
            futures = list(undone)
        i += 1


if __name__ == "__main__":
    main()

2025-02-18 16:06:18 worker#0 acquired lock and completed task #1
2025-02-18 16:06:19 worker#2 acquired lock and completed task #0
2025-02-18 16:06:20 worker#1 acquired lock and completed task #3
2025-02-18 16:06:22 worker#3 acquired lock and completed task #2
2025-02-18 16:06:23 worker#0 acquired lock and completed task #6
2025-02-18 16:06:25 worker#2 acquired lock and completed task #4
2025-02-18 16:06:26 worker#1 acquired lock and completed task #7
2025-02-18 16:06:28 worker#3 acquired lock and completed task #5
2025-02-18 16:06:30 worker#0 acquired lock and completed task #8
2025-02-18 16:06:32 worker#2 acquired lock and completed task #9
2025-02-18 16:06:33 worker#1 acquired lock and completed task #10
2025-02-18 16:06:36 worker#3 acquired lock and completed task #11
2025-02-18 16:06:37 worker#0 acquired lock and completed task #12
2025-02-18 16:06:39 worker#2 acquired lock and completed task #13
2025-02-18 16:06:41 worker#1 acquired lock and completed task #14
2025-02-18 16:06:43 

2025-02-18 16:57:55,280 - tornado.application - ERROR - Exception in callback <bound method Semaphore._refresh_leases of <distributed.lock.Lock object at 0x111c901d0>>
Traceback (most recent call last):
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/tornado/ioloop.py", line 939, in _run
    await val
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/semaphore.py", line 436, in _refresh_leases
    await retry_operation(
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/utils_comm.py", line 416, in retry_operation
    return await retry(
           ^^^^^^^^^^^^
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/utils_comm.py", line 395, in retry
    return await coro()
           ^^^^^^^^^^^^
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/core.py", line 

KeyboardInterrupt: 

2025-02-18 16:57:56,648 - distributed.semaphore - ERROR - Release failed for id=a4c9ebd3adcb4890b81b421b4ec1ddda, lease_id=465c14a3682b446bb5ac7b593ecda511, name=shared_lock. Cluster network might be unstable?
Traceback (most recent call last):
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/semaphore.py", line 486, in _release
    await retry_operation(
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/utils_comm.py", line 416, in retry_operation
    return await retry(
           ^^^^^^^^^^^^
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/utils_comm.py", line 395, in retry
    return await coro()
           ^^^^^^^^^^^^
  File "/Users/jx/.pyenv/versions/3.12.2/envs/venv_3.12.2/lib/python3.12/site-packages/distributed/core.py", line 1256, in send_recv_from_rpc
    comm = await self.pool.connect(self.addr)
           ^^^^^^^^^^^^^^^^^^^