**使用dask加速科学计算**

@author: Rui Zhu  
@data: 2024-05-19  
@follow: https://examples.dask.org/applications/embarrassingly-parallel.html

In [18]:
import time
import random
import pandas as pd
import numpy as np

from dask.distributed import Client

启动dask客户端的dashboard

In [2]:
client = Client(threads_per_worker=4, n_workers=5)
client  # 需要安装依赖'pip install bokeh'

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 5
Total threads: 20,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:56540,Workers: 5
Dashboard: http://127.0.0.1:8787/status,Total threads: 20
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:56553,Total threads: 4
Dashboard: http://127.0.0.1:56554/status,Memory: 3.20 GiB
Nanny: tcp://127.0.0.1:56543,
Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-k3vuw27k,Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-k3vuw27k

0,1
Comm: tcp://127.0.0.1:56556,Total threads: 4
Dashboard: http://127.0.0.1:56557/status,Memory: 3.20 GiB
Nanny: tcp://127.0.0.1:56545,
Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-g87i_5dj,Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-g87i_5dj

0,1
Comm: tcp://127.0.0.1:56559,Total threads: 4
Dashboard: http://127.0.0.1:56560/status,Memory: 3.20 GiB
Nanny: tcp://127.0.0.1:56547,
Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-w5nqjmb8,Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-w5nqjmb8

0,1
Comm: tcp://127.0.0.1:56562,Total threads: 4
Dashboard: http://127.0.0.1:56563/status,Memory: 3.20 GiB
Nanny: tcp://127.0.0.1:56549,
Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-rx9rh3ev,Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-rx9rh3ev

0,1
Comm: tcp://127.0.0.1:56565,Total threads: 4
Dashboard: http://127.0.0.1:56566/status,Memory: 3.20 GiB
Nanny: tcp://127.0.0.1:56551,
Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-y4_rv45s,Local directory: /var/folders/fz/b935cghx795_zycx9njnpf3m0000gn/T/dask-scratch-space/worker-y4_rv45s


定义科学计算的计算模块

In [3]:
def costly_simulation(list_param):
    """
    假设有一个耗时的模拟计算
    """
    time.sleep(random.random())
    return sum(list_param)


%time costly_simulation([1, 2, 3, 4])

CPU times: user 7.54 ms, sys: 3.76 ms, total: 11.3 ms
Wall time: 282 ms


10

定义数据表

In [4]:
input_params = pd.DataFrame(np.random.random(size=(500, 4)),
                            columns=['param_a', 'param_b', 'param_c', 'param_d'])
input_params.head()

Unnamed: 0,param_a,param_b,param_c,param_d
0,0.923154,0.36534,0.47699,0.35746
1,0.259822,0.368204,0.662306,0.74679
2,0.344211,0.245044,0.960873,0.064148
3,0.635141,0.867804,0.453658,0.197505
4,0.632023,0.716136,0.863443,0.996077


传统的for循环方法

In [5]:
results = []

for parameters in input_params.values[:10]:
    result = costly_simulation(parameters)
    results.append(result)

results

[2.1229447469126335,
 2.0371214924760235,
 1.6142769762222497,
 2.1541072218539634,
 3.2076791753341283,
 3.067674038883048,
 2.224111676611256,
 1.1449684725310574,
 2.013908420963451,
 2.0591488317825193]

将计算任务添加到列表里

In [9]:
import dask
lazy_results = []

for parameters in input_params.values[:10]:
    lazy_result = dask.delayed(costly_simulation)(parameters)
    lazy_results.append(lazy_result)
lazy_results[0]

Delayed('costly_simulation-e31a5c60-4b23-46f7-87d8-e920bfa105a5')

并行计算列表中的任务

In [11]:
dask.compute(*lazy_results)

(2.1229447469126335,
 2.0371214924760235,
 1.6142769762222497,
 2.1541072218539634,
 3.2076791753341283,
 3.067674038883048,
 2.224111676611256,
 1.1449684725310574,
 2.013908420963451,
 2.0591488317825193)

In [17]:
import dask
lazy_results = []

for parameters in input_params.values:
    lazy_result = dask.delayed(costly_simulation)(parameters)
    lazy_results.append(lazy_result)

# 将任务添加到内存中准备计算
futures = dask.persist(*lazy_results)  # trigger computation in the background

# client.cluster.scale(8)  # adjust the number of workers
# results = dask.compute(*futures)