# Concurrency with Futures

In [1]:
# sequential download script, used as a baseline
import os
import time
import sys

import requests

In [None]:
# BEGIN FLAGS_PY
import os
import time
import sys

import requests  # <1>

# Twenty two-letter country codes whose flags are downloaded
# (presumably the 20 most populous countries, going by the name).
POP20_CC = str.split(
    'CN IN US ID BR PK NG BD RU JP MX PH VN ET EG DE IR TR CD FR'
)

# Root URL; each flag is fetched from <BASE_URL>/<cc>/<cc>.gif.
BASE_URL = 'http://flupy.org/data/flags'

# Local directory that the downloaded images are written into.
DEST_DIR = 'downloads/'


def save_flag(img, filename):
    """Write the raw image bytes to ``filename`` inside ``DEST_DIR``.

    Args:
        img: bytes-like payload to store; the file is opened in binary mode.
        filename: name of the file to create (or overwrite) under DEST_DIR.

    Raises:
        OSError: if the directory cannot be created or the file cannot
            be written.
    """
    # Create the target directory on first use so a fresh checkout does not
    # fail with FileNotFoundError; exist_ok=True makes the call a no-op when
    # the directory is already there.
    os.makedirs(DEST_DIR, exist_ok=True)
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)


def get_flag(cc):
    """Download the flag image for one country code and return its bytes.

    Args:
        cc: country code string; it is lower-cased to build the URL
            ``<BASE_URL>/<cc>/<cc>.gif``.

    Returns:
        The body of the HTTP response as bytes.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc=cc.lower())
    resp = requests.get(url)
    # Fail loudly on HTTP errors; otherwise the body of an error page would
    # be returned to the caller as if it were image data.
    resp.raise_for_status()
    return resp.content


def show(text):
    """Print ``text`` followed by a single space and flush stdout at once.

    No newline is emitted, so successive calls build up one line of
    space-separated progress markers that appear immediately.
    """
    print(text, end=' ', flush=True)


def download_many(cc_list):
    """Fetch and store every flag in ``cc_list``, one after another.

    Codes are processed in sorted order. For each code the image is
    fetched, the code is echoed as a progress marker, and the image is
    saved as ``<cc>.gif`` using the lower-cased code.

    Returns:
        The number of entries in ``cc_list``.
    """
    for code in sorted(cc_list):
        data = get_flag(code)
        show(code)
        save_flag(data, '{}.gif'.format(code.lower()))
    return len(cc_list)


def main(download_many):
    """Run ``download_many`` over ``POP20_CC`` and report count and timing.

    Args:
        download_many: callable that takes the list of country codes and
            returns how many flags it downloaded.
    """
    started = time.time()
    n_flags = download_many(POP20_CC)
    duration = time.time() - started
    # The leading newline ends the line of progress markers printed so far.
    print('\n{} flags downloaded in {:.2f}s'.format(n_flags, duration))


# Run the timed download only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main(download_many)  # time the sequential download_many defined above
# END FLAGS_PY

## Downloading with Concurrent futures

The main features of the `concurrent.futures` package are the ThreadPoolExecutor and ProcessPoolExecutor classes.

These implement an interface that allows you to submit callables for execution in different threads or processes, respectively. 

The classes manage an internal pool of worker threads or processes, and a queue of tasks to be executed. But the interface is very high level, and we don't need to know about any of those details, at least for a simple case like this.


In [2]:
from concurrent import futures

In [3]:
# reuse some functions from before, as well

In [None]:
# Upper bound on the number of threads allowed in the thread pool.
MAX_WORKERS = 20

# function to download one image.
# this is what each thread will execute
def download_one(cc):
    """Download, announce and save the flag for a single country code.

    This is the unit of work handed to each worker thread.

    Returns:
        The country code that was passed in, unchanged.
    """
    payload = get_flag(cc)
    show(cc)
    target_name = '{}.gif'.format(cc.lower())
    save_flag(payload, target_name)
    return cc

def download_many(cc_list):
    """Download the flags for ``cc_list`` concurrently using a thread pool.

    Args:
        cc_list: sized collection of country codes.

    Returns:
        The number of results produced by the workers (one per code),
        or 0 when ``cc_list`` is empty.

    Raises:
        Any exception raised inside a worker call is re-raised here while
        the results are being collected.
    """
    if not cc_list:
        # ThreadPoolExecutor rejects max_workers=0, and there is nothing
        # to download anyway.
        return 0
    # Don't use more workers than there are images to fetch.
    workers = min(MAX_WORKERS, len(cc_list))
    # Leaving the with-block calls executor.shutdown(wait=True), which
    # blocks until ALL threads are done.
    with futures.ThreadPoolExecutor(workers) as executor:
        # executor.map works like the map builtin, except that download_one
        # is called concurrently from multiple threads. It returns an
        # iterator over the value returned by each call.
        res = executor.map(download_one, sorted(cc_list))
        # Turning the iterator into a list is where an exception raised by
        # any of the threaded calls would surface.
        return len(list(res))

if __name__ == '__main__':
    # Call the previously defined main() timing function, but pass it the
    # thread-pool version of download_many.
    main(download_many)

## Where are the Futures?
There are two classes named Future in the standard library:

- concurrent.futures.Future
- asyncio.Future

Both serve the same purpose: an instance of either class represents a deferred computation that may or may not have completed.

We shouldn't create these ourselves. They are meant to be instantiated exclusively by the concurrency framework.

Example of how they are used.

In [4]:
def download_many(cc_list):
    """Download a few flags through explicit futures, printing each step.

    Only the first five entries of ``cc_list`` are used. Each download is
    submitted individually so the scheduled futures and their results can
    be printed as they complete.

    Args:
        cc_list: sliceable sequence of country codes.

    Returns:
        The number of results collected from the completed futures.

    Raises:
        Any exception raised inside a worker call is re-raised by
        ``future.result()``.
    """
    cc_list = cc_list[:5]  # use only 5 countries for this example
    with futures.ThreadPoolExecutor(max_workers=3) as executor:
        to_do = []
        # Sorted, to make it clear that results arrive out of order.
        for cc in sorted(cc_list):
            # submit() schedules the callable to be executed and returns a
            # future representing this pending operation.
            future = executor.submit(download_one, cc)
            # Store each future so it can be retrieved later with
            # as_completed.
            to_do.append(future)
            msg = 'Scheduled for {}: {}'
            print(msg.format(cc, future))

        results = []
        # as_completed yields the futures as they are completed.
        for future in futures.as_completed(to_do):
            res = future.result()
            msg = '{} result {!r}'
            print(msg.format(future, res))
            results.append(res)

        return len(results)

However, strictly speaking, nothing so far has performed downloads in parallel.

The concurrent.futures examples are limited by the GIL...

However, the GIL is nearly harmless for IO-Bound processing.

For CPU bound processing, the GIL is a problem.

In that case, just change ThreadPoolExecutor --> ProcessPoolExecutor, and spawn processes instead of threads.

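For CPU-bound work there is no benefit in spawning more worker processes than there are CPUs on your machine (by default, ProcessPoolExecutor sizes its pool to the number of processors). Within that limit, this is an option to speed up tasks.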