In [2]:
%run ../../python-seminar/DataFiles_and_Notebooks/talktools.py

<img src="https://www.evernote.com/l/AUV1r1xvhBdPF6lX-2SJLkO-vkkmCXEDrMwB/image.png">
http://www.slideshare.net/ManojitNandi/parallel-programming-in-python-speeding-up-your-analysis

Remember, you can create (fork) many processes, which are copies of the original parent process (memory, data, state) and act independently of each other. To share data between them you have to explicitly do that within each process. The Pythonic way we do multiprocessing (creation of new processes, communication between processes) is with `multiprocessing`.

*"effectively side-stepping the Global Interpreter Lock by using subprocesses instead of threads. Due to this, the multiprocessing module allows the programmer to fully leverage multiple processors on a given machine. It runs on both Unix and Windows."*

- https://docs.python.org/3/library/multiprocessing.html

The analog to `threading.Thread` is `multiprocessing.Process`. You should be able to do a drop-in replacement. Instead of `current_thread()` you'd use `os.getpid()`.

In [4]:
%%writefile multip.py

import multiprocessing
import time
import os
import sys
import logging
import random


root = logging.getLogger()
root.handlers = []
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout,
                    format='(%(threadName)-9s-pid %(process)d) %(message)s',)


def worker(num):
    """thread worker function"""
    
    sleep_time = random.randint(1, 8)
    logging.debug(f'worker: {num} sleeping for {sleep_time} s, pid: {os.getpid()}')
    time.sleep(sleep_time)
    logging.debug(f'done: worker: {num}')
    return

if __name__ == "__main__":
    procs = []
    for i in range(2):
        p = multiprocessing.Process(target=worker, args=(i,))
        procs.append(p)
        p.start()

Writing multip.py


In [9]:
%%time
!python multip.py

(MainThread-pid 66646) worker: 1 sleeping for 6 s, pid: 66646
(MainThread-pid 66645) worker: 0 sleeping for 4 s, pid: 66645
(MainThread-pid 66645) done: worker: 0
(MainThread-pid 66646) done: worker: 1
CPU times: user 258 ms, sys: 67.2 ms, total: 325 ms
Wall time: 6.31 s


If your machine has multiple cores, these two processes may get run on those two separate cores, independently.

You may need to share info between processes. You can do this, just like with Threads with `Queues`. You can also use the (UNIX-like) Pipe to have  two processes communicate with each other:

In [10]:
%%writefile multip1.py
# https://docs.python.org/3/library/multiprocessing.html#exchanging-objects-between-processes
from multiprocessing import Process, Pipe

def f(conn):
    conn.send([42, None, 'hello'])
    conn.close()

if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    p = Process(target=f, args=(child_conn,))
    p.start()
    print(parent_conn.recv())   # prints "[42, None, 'hello']"
    p.join()

Writing multip1.py


In [12]:
%run multip1.py

[42, None, 'hello']


Using pools of workers (in separate processes) with multiprocessing `Pool.`

https://docs.python.org/3.6/library/multiprocessing.html#using-a-pool-of-workers

In [13]:
%%writefile g.py

import time

def g(x):
    # domain specific stuff here!
    time.sleep(0.2)
    return x*x

Writing g.py


In [32]:
from multiprocessing import Pool 
from g import g

pool = Pool(processes=10)     # start 4 worker processes 

In [33]:
pool

<multiprocessing.pool.Pool state=RUN pool_size=10>

In [34]:
%time pool.map(g, range(10))

CPU times: user 3.18 ms, sys: 2.29 ms, total: 5.46 ms
Wall time: 219 ms


[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [35]:
import time
def g1(x):
    # domain specific stuff here!
    time.sleep(0.2)
    return x*x

In [36]:
%time list(map(g1, range(10)))

CPU times: user 2.34 ms, sys: 2.94 ms, total: 5.28 ms
Wall time: 2.05 s


[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [37]:
# print same numbers in arbitrary order
for i in pool.imap_unordered(g, range(10)):
    print(i, sep=" ", end=" ")

0 9 81 1 4 16 25 36 64 49 

Python is packaging (pickling) up your functions and sending them to different processes.

In [None]:
# This will fail...
pool.map(lambda x: x**3, range(10))

In [42]:
# run only one process "g(10)" asynchronously 
result = pool.apply_async(g, [10])

# prints "100" unless you timeout
print(result.get(timeout=0.25)) 

100


In [43]:
result

<multiprocessing.pool.ApplyResult at 0x7fca000b8be0>

In [44]:
del pool

In [45]:
%%writefile f.py
def f(x): 
    return x*x


Writing f.py


`It can take a long time to fork the processes so unless you're running something longer than it takes to fork the processes, it doesn't make sense to use much more processes than # of cores`

In [48]:
from multiprocessing import Pool 
import time

from f import f

for i in [1, 2, 3, 4, 8, 16, 32, 64]:
    print(i,"*"*5, flush=True)
    pool = Pool(processes=i)               # start 4 worker processes 
    start = time.time()
    pool.map(f, range(10000000))
    print("{0:0.4f} sec".format(time.time() - start))
    pool.terminate()
    del pool

1 *****
1.8884 sec
2 *****
1.2854 sec
3 *****
1.1321 sec
4 *****
1.1263 sec
8 *****
1.0422 sec
16 *****
0.9532 sec
32 *****
0.9983 sec
64 *****
0.9792 sec


In [49]:
!ulimit -a

-t: cpu time (seconds)              unlimited
-f: file size (blocks)              unlimited
-d: data seg size (kbytes)          unlimited
-s: stack size (kbytes)             8176
-c: core file size (blocks)         0
-v: address space (kbytes)          unlimited
-l: locked-in-memory size (kbytes)  unlimited
-u: processes                       5333
-n: file descriptors                4096


# Launching parallel tasks with `concurrent.futures`

Built-in, create different pools for executing **maps** (single loop over data). Local resources.

<i>"The `concurrent.futures` module provides a high-level interface for asynchronously executing callables.

The asynchronous execution can be performed with threads, using `ThreadPoolExecutor`, or separate processes, using `ProcessPoolExecutor`. Both implement the same interface, which is defined by the abstract Executor class."</i>

https://docs.python.org/3/library/concurrent.futures.html

In [50]:
from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor(2)  # can also use a threadpool

In [51]:
%%time 
from time import sleep

results = []
for i in range(8):
    sleep(1)
    results.append(i + 1)

CPU times: user 3.16 ms, sys: 3.26 ms, total: 6.41 ms
Wall time: 8.03 s


In [52]:
results

[1, 2, 3, 4, 5, 6, 7, 8]

In [55]:
%%writefile slow.py
from time import sleep, time

from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor() 

def slowfunc(x):
    sleep(1)
    return(x+1)

if __name__ == "__main__":
    s = time()
    results = list(e.map(slowfunc, range(8)))
    print(f"Finished in {time() - s:0.3f} sec")
    print(results)
    
    e.shutdown()

Overwriting slow.py


In [56]:
!python slow.py

Finished in 1.099 sec
[1, 2, 3, 4, 5, 6, 7, 8]


Figured out I have 4 cores and ran it in 4 separate processes.

## Breakout

Convert the sequential code to parallel using `concurrent.futures`

In [81]:
%%writefile fast.py

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ProcessPoolExecutor

url = "https://en.wikipedia.org/wiki/Special:Random"

def run():
    """worker function"""
    
    resp = requests.get(url).text
    title = BeautifulSoup(resp, 'html.parser').title.string.split('- Wikipedia')[0]
    return len(title)


e = ProcessPoolExecutor()
lens = list(e.map(run,range(10)))
print(lens)

Overwriting fast.py


In [82]:
%%time
!python fast.py

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/alizabeverage/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/alizabeverage/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 125, in _main
    prepare(preparation_data)
  File "/Users/alizabeverage/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 236, in prepare
    _fixup_main_from_path(data['init_main_from_path'])
  File "/Users/alizabeverage/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 287, in _fixup_main_from_path
    main_content = runpy.run_path(main_path,
  File "/Users/alizabeverage/opt/anaconda3/lib/python3.9/runpy.py", line 268, in run_path
    return _run_module_code(code, init_globals, run_name,
  File "/Users/alizabeverage/opt/anaconda3/lib/python3.9/runpy.py", line 97, in _run_module_code
    _run_code(code, mod_globals, init_globals,
  File "/User

CPU times: user 12.6 ms, sys: 28.4 ms, total: 41 ms
Wall time: 631 ms


### Executor.submit

`submit` starts an execution in a separate thread or process and immediately returns a `Future` object that points back to the result. Until the function completes, the future is pending. We get the result of a task with `.result()`, which blocks until the computation is complete.

In [67]:
%%writefile slowfunc.py
from time import sleep

def slowfunc(x, y, delay=1):
    sleep(delay)
    return(x + y)

Writing slowfunc.py


In [68]:
%%time 

from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor() 

from slowfunc import slowfunc

future = e.submit(slowfunc, 1, 2)

CPU times: user 7.76 ms, sys: 17 ms, total: 24.8 ms
Wall time: 27.8 ms


In [69]:
future

<Future at 0x7fca496cbbb0 state=running>

In [70]:
future.result()

3

In [71]:
%%time 
futures = [e.submit(slowfunc,1,2, delay=1) for _ in range(10)]
results = [f.result() for f in futures]

CPU times: user 23.6 ms, sys: 99 ms, total: 123 ms
Wall time: 1.18 s


In [72]:
print(results)

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


## Joblib

http://pythonhosted.org/joblib/

Running Python functions as pipeline jobs. The *vision is to provide tools to easily achieve better performance and reproducibility when working with long running jobs.* Specifically meant to work well with large data (ie. numpy arrays).

  - **Avoid computing twice the same thing**: code is rerun over an over, for instance when prototyping computational-heavy jobs (as in scientific development), but hand-crafted solution to alleviate this issue is error-prone and often leads to unreproducible results
  - **Persist to disk transparently**: persisting in an efficient way arbitrary objects containing large data is hard. Using joblib’s caching mechanism avoids hand-written persistence and implicitly links the file on disk to the execution context of the original Python object. As a result, joblib’s persistence is good for resuming an application status or computational job, eg after a crash.

Joblib strives to address these problems while leaving your code and your flow control as unmodified as possible (no framework, no new paradigms).

In [None]:
#!conda install joblib -y

In [None]:
from math import sqrt
[sqrt(i ** 2) for i in range(10)]

### Parallel Helpers

Joblib provides a simple helper class to write parallel for loops using multiprocessing. The core idea is to write the code to be executed as a generator expression, and convert it to parallel computing.

In [None]:
from math import sqrt
from joblib import Parallel, delayed

By default Parallel uses the Python multiprocessing module to fork separate Python worker processes to execute tasks concurrently on separate CPUs. This is a reasonable default for generic Python programs but it induces some overhead as the input and output data need to be serialized in a queue for communication with the worker processes. 

In [None]:
Parallel(n_jobs=2,backend="threading") \
  (delayed(sqrt)(i ** 2) for i in range(10))

In [None]:
import time
start = time.time()
Parallel(n_jobs=5,verbose=5) \
  (delayed(time.sleep)(1) for _ in range(10))
print(time.time()-start)

### On demand recomputing: the `Memory` class

Caching long running results so it can be reused. Let's try to cache to disk:

In [None]:
from joblib import Memory
memory = Memory(location="/tmp/", verbose=0)  # try a higher verbosity

In [None]:
@memory.cache
def f(x):
    print('Running f(%s)' % x)
    return x

In [None]:
print(f(1))

In [None]:
print(f(1))

In [None]:
print(f(10))

In [None]:
print(f(20))

In [None]:
!ls -lat  /tmp/joblib/

In [None]:
memory = Memory(location="/tmp/",verbose=0, mmap_mode="r+")

In [None]:
@memory.cache
def josh(x,blah=True):
    print('Running josh(%s)' % x)
    return x

In [None]:
print(josh(1))

In [None]:
print(josh(1))

In [None]:
print(josh(1, blah=False))

Ignoring variables:

In [None]:
@memory.cache(ignore=['blah'])
def h(x,blah=True):
    print('Running h(%s)' % x)
    return x

In [None]:
print(h(1))

In [None]:
print(h(1,blah=False))

Note: joblib also gives (for persistence) `joblib.dump()` and `joblib.load()` provide a replacement for pickle to work efficiently on Python objects containing large data, in particular large numpy arrays.