import numpy as np
# ^^^ pyforest auto-imports - don't write above this line
# Parallelism

In [1]:
import pandas as pd
import numpy as np

# Silly example

In [2]:
import time

def my_sleep(x):
    '''
    Pause for x seconds, then give x back to the caller.

    One message is printed right before the pause starts and another one
    right before the function returns.
    '''
    start_message = f'Sleeping for {x} seconds.'
    print(start_message)

    time.sleep(x)

    end_message = f'Returning {x}'
    print(end_message)
    return x

In [3]:
my_sleep(5)

Sleeping for 5 seconds.
Returning 5


5

In [4]:
my_list = [1,2,3,4,5,6]

In [5]:
sum(my_list)

21

In [6]:
from tqdm.auto import tqdm

## Serial code

In [7]:
for item in tqdm(my_list):
    my_sleep(item)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6



In [21]:
map(my_sleep, my_list)

<map at 0x1a0edd0f080>

In [23]:
%%time

list(map(my_sleep, my_list))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Wall time: 10 s


[1, 2, 3, 4]

## Parallel code

In [8]:
from multiprocessing import Pool, cpu_count

cpu_count()

6

## You have to create a pool of `n` processes.

In [9]:
pool = Pool(processes=cpu_count())

### We'll use the `%%time` magic here to measure how long this code takes to run in parallel.

However, if you run this code, watch what happens:

In [None]:
%%time

result = pool.map(my_sleep, my_list)
pool.terminate()

## This happens because `multiprocessing` does not always work in Jupyter notebooks.

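_Some Linux and macOS setups may handle it well (yay Unix)_, but it certainly doesn't work on Windows: there, each worker process has to import the function it runs, and a function defined in a notebook cell cannot be imported by the workers.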

### What should we do then? Two solutions.

1. We have to write our functions inside a `.py` file.

2. Install `multiprocess` (note it is different from Python's `multiprocessing` module)

In [10]:
!pwd

/c/Users/andreaguiar/Desktop/usr/dev/ironhack/paralellism


In [16]:
from sleeper import my_sleep_py

In [20]:
pool = Pool(processes=cpu_count())

In [21]:
%%time

result = pool.map(my_sleep_py, my_list)
pool.terminate()

Wall time: 6.08 s


In [22]:
!pip install multiprocess

Collecting multiprocess
  Downloading multiprocess-0.70.9.tar.gz (1.6 MB)
Building wheels for collected packages: multiprocess
  Building wheel for multiprocess (setup.py): started
  Building wheel for multiprocess (setup.py): finished with status 'done'
  Created wheel for multiprocess: filename=multiprocess-0.70.9-py3-none-any.whl size=108035 sha256=6eaaf9f84d06f3f602b5dcb3934d8b4d689092f3496ded39e64bdbb232fb1225
  Stored in directory: c:\users\andreaguiar\appdata\local\pip\cache\wheels\83\2b\b4\50d7cae5b9069434454fd36da009832592af4fff58b51db8d9
Successfully built multiprocess
Installing collected packages: multiprocess
Successfully installed multiprocess-0.70.9


## using multiprocess


In [23]:
# using multiprocess instead of multiprocessing
from multiprocess import Pool

In [31]:
pool = Pool(processes=6)

In [4]:
def my_sleep_jupyter(x):
    '''
    Pause for x seconds, then give x back to the caller.

    One message is printed right before the pause starts and another one
    right before the function returns.
    '''
    # The import stays inside the function body -- presumably so the function
    # is self-contained when it is shipped to a worker process.
    from time import sleep

    start_message = f'Sleeping for {x} seconds.'
    print(start_message)

    sleep(x)

    end_message = f'Returning {x}'
    print(end_message)
    return x


In [33]:
%%time

result = pool.map(my_sleep_jupyter, my_list)
pool.terminate()

Wall time: 6.04 s


# Running Asynchronous code

## What is asynchrony?

- `result.ready()`
- `result.wait()`
- `result.get()`

In [1]:
from multiprocess import Pool

In [2]:
pool = Pool(processes=6)

In [39]:
result = pool.map_async(my_sleep_jupyter, [10, 12, 8, 13, 4, 3])

In [40]:
# The map_async call in the previous cell has already handed the work to the
# pool, so these statements run without waiting for it.
print('Do something that doesn"t depend on result')
print('...')
print('Now the time came when the result is needed.')
# Block here until the asynchronous map has finished.
result.wait()

# Collect the values returned by the workers (a list, one entry per input).
result_list = result.get()
# The pool is not needed any more; stop its worker processes.
pool.terminate()
print(f'Now go on and use the results obtained - {result_list}')

Do something that doesn"t depend on result
...
Now the time came when the result is needed.
Now go on and use the results obtained - [10, 12, 8, 13, 4, 3]


# CPU intensive computations

In [41]:
def square(x):
    '''
    Returns x raised to the power of two.
    '''
    squared = x ** 2
    return squared

In [42]:
n = 1000000

In [44]:
random_numbers = np.random.random(size=n)

<IPython.core.display.Javascript object>

In [46]:
%%timeit
    
result = [square(item) for item in random_numbers]

1 loop, best of 5: 334 ms per loop


In [52]:
pool = Pool(processes=2)

In [53]:
random_numbers = np.random.random(size=n)

<IPython.core.display.Javascript object>

In [54]:
%%time

result = pool.map(square, random_numbers)

Wall time: 18.5 s


In [55]:
pool.terminate()

In [None]:
# GIL - global interpreter lock

## profiling tools

In [56]:
%%prun

result = [square(item) for item in np.random.random(size=n)]

<IPython.core.display.Javascript object>

 

## Usually, for CPU intensive computations, Pool.map won't speed up your code.

Why? It will spend more time managing processes, replicating data and sending data to other processes than actually doing the computation.



In [None]:
## Cython - CPython

In [None]:
!pip install cython

In [59]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [60]:
%%cython -a
def square_c(x):
    '''
    Returns x raised to the power of two.

    The body is the same as the pure-Python version: no C types are declared
    for the argument or for the return value.
    '''
    return x ** 2

In [61]:
random_numbers = np.random.random(size=n)

<IPython.core.display.Javascript object>

In [62]:
%%timeit

result = [square_c(item) for item in np.random.random(size=n)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

1 loop, best of 5: 356 ms per loop


# When is multiprocess useful then? 


## I/O bound computations

In [6]:
import pandas as pd

In [64]:
import requests

In [5]:
n_max = 51852

In [67]:
from tqdm.auto import tqdm

In [68]:
import io
import os

colnames = ['team_a','score_a','score_b','team_b','event','stars']
my_range = range(int(np.ceil(n_max/100)))

# The CSV files are written into tmp/; create it up front so a fresh run
# does not fail on the first to_csv call.
os.makedirs('tmp', exist_ok=True)

# Download the result pages one after another (100 results per page).
for i in tqdm(my_range):
    # A timeout keeps a stalled connection from blocking the loop forever.
    response = requests.get(f'https://www.hltv.org/results?offset={i * 100}', timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # read_html wants a file-like object; passing the HTML text directly is
    # deprecated in recent pandas versions.
    df = pd.concat(pd.read_html(io.StringIO(response.text)))
    df.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

<IPython.core.display.Javascript object>

HBox(children=(IntProgress(value=0, max=519), HTML(value='')))

KeyboardInterrupt: 

In [1]:
from multiprocess import Pool

In [8]:
colnames = ['team_a','score_a','score_b','team_b','event','stars']
my_range = range(int(np.ceil(n_max/100)))

<IPython.core.display.Javascript object>

In [97]:
def download_parallel(i):
    '''
    Downloads one results page, stores it as a CSV file and returns it.

    Requests https://www.hltv.org/results with offset i * 100, concatenates
    every HTML table found in the response into a single DataFrame, writes
    it to tmp/results_{i}.csv (without the index) and returns the DataFrame.

    Parameters
    ----------
    i : page index; the request offset is i * 100.

    Returns
    -------
    pandas.DataFrame with the concatenated tables of the page.

    Raises
    ------
    requests.exceptions.HTTPError if the server answers with an error status.
    requests.exceptions.Timeout if the server does not answer in time.
    '''
    # All imports are kept inside the function -- presumably so that it is
    # self-contained when it runs in a worker process.
    import io
    import os

    import requests
    import pandas as pd

    # A timeout keeps a stalled connection from blocking a worker forever.
    response = requests.get(f'https://www.hltv.org/results?offset={i * 100}', timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    # read_html wants a file-like object; passing the HTML text directly is
    # deprecated in recent pandas versions.
    df = pd.concat(pd.read_html(io.StringIO(response.text)))

    # Make sure the output directory exists before writing into it.
    os.makedirs('tmp', exist_ok=True)
    df.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

    return df

In [3]:
pool = Pool(processes=6)

In [98]:
results = pool.map_async(download_parallel, [1,2,3,4,5])

In [99]:
import os

In [105]:
results.get()[4]

Unnamed: 0,0,1,2,3,4
0,D13,2 - 0,Updraft,Asia Minor East Asia Closed Qualifier - ESL On...,bo3
0,EHOME,2 - 0,Invictus,Huya Pro League - Spring 2020,bo3
0,Airborne,4 - 16,AVANT,ESEA MDL Season 33 Australia,inf
0,Paradox,2 - 1,MARKandLARRY,LPL Pro League Season 4,bo3
0,Skyfire,0 - 2,Rooster,ESL Australia & NZ Championship Season 10,bo3
...,...,...,...,...,...
0,LevelZero,16 - 19,Umumba,ILG Cup Season 3,nuke
0,Paradox,2 - 1,Lucid,ESL Australia & NZ Championship Season 10 Qual...,bo3
0,Global,16 - 13,Umumba,ILG Cup Season 3,inf
0,Wings,16 - 3,LevelZero,ILG Cup Season 3,ovp


In [86]:
len(os.listdir('tmp'))

207