In [1]:
#!pip install celery

In [2]:
#!sudo rabbitmqctl add_user myguest myguestpwd
#!sudo rabbitmqctl set_permissions -p / myguest ".*" ".*" ".*"

In [3]:
!sudo rabbitmqctl list_users

Listing users ...
guest	[administrator]
myguest	[]


This code will run on the **server** machine. It will **ask its worker machines** to complete some sorting tasks and send the results back to the server.

In [4]:
import random
import time
from celery import group
from mergesort import sort, merge


In [5]:
# Build the benchmark input: the integers 0 .. NUM_ELEMENTS - 1 in random order.
NUM_ELEMENTS = 1000000
sequence = list(range(NUM_ELEMENTS))
random.shuffle(sequence)  # shuffles the list in place


In [6]:
# Cut the sequence into n chunks that can be sorted independently of each other.
n = 4
l = len(sequence) // n  # size of every chunk except possibly the last one
# The first n - 1 chunks hold exactly l elements each; the final chunk takes
# everything that is left, so nothing is dropped when the length is not a
# multiple of n.
subseqs = [sequence[k * l:(k + 1) * l] for k in range(n - 1)] + [sequence[(n - 1) * l:]]

In [7]:
# Sanity check: display how many chunks the split produced.
len(subseqs)

4

In [8]:
# Report the size of every chunk produced by the split.
for idx, chunk in enumerate(subseqs):
    print('Lentght of sequence {}: {}'.format(idx, len(chunk)))

Lentght of sequence 0: 250000
Lentght of sequence 1: 250000
Lentght of sequence 2: 250000
Lentght of sequence 3: 250000


Before you run the next cell, you will need to run the code on a worker machine with

**"celery -A mergesort worker --loglevel=info"**.

Then that machine will become a worker, and will be able to run the app task, i.e. the sort function, whenever the broker requests it.



In [9]:

# Measure elapsed time with time.perf_counter(): it is a monotonic,
# high-resolution clock, so the intervals below cannot be skewed by system
# clock adjustments the way differences of time.time() can.
t0 = time.perf_counter()

# Ask the Celery workers to sort each sub-sequence.
# Use a group to run the individual independent tasks as a unit of work.

# celery.group creates a group of tasks to be executed in parallel.
# 'sort.s' is the signature of the sort function: a description of the call
# that is sent to the worker machines instead of being executed here.

# Calling the group dispatches the tasks and hands back a lazy result handle.
lazy_partials = group(sort.s(seq) for seq in subseqs)()
t1 = time.perf_counter() - t0

# Block until the results have come back from all of the workers.
partials = lazy_partials.get()
t2 = time.perf_counter() - t0

# Merge all the individual sorted sub-lists into our final result.
# This merge runs locally, on the results returned by the workers.
result = partials[0]
for partial in partials[1:]:
    result = merge(result, partial)

t3 = time.perf_counter() - t0

print('Tasks sent to workers in %.02fs' % (t1))
print('Results from all the workers came back in %.02fs' % (t2))
print('Distributed mergesort took %.02fs' % (t3))



Tasks sent to workers in 1.14s
Results from all the workers came back in 11.98s
Distributed mergesort took 12.58s


In [10]:
# Do the same thing locally and compare the times.
# time.perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments.
t0 = time.perf_counter()

# Calling 'sort' directly -- rather than through its signature 'sort.s' --
# runs it right here in this process instead of sending it to a remote worker.
truth = sort(sequence)
dt = time.perf_counter() - t0
print('Local mergesort took %.02fs' % (dt))



Local mergesort took 22.74s


**In this case the local sort took longer than the parallel sort using the workers!**

In [11]:
# Final sanity checks: the distributed result must match both the local
# mergesort and Python's built-in sort of the same input. The messages make
# it clear which comparison failed.
assert result == truth, 'distributed result differs from the local mergesort result'
assert result == sorted(sequence), 'distributed result is not the sorted input sequence'

# Yayyy sorting was successful

**Let us see some more tests**

In [12]:
# Hand the sort tasks to the workers and ask them to run in parallel.
# This only dispatches the work.
lazy_partials = group(sort.s(chunk) for chunk in subseqs)()

# The returned handle has one entry per dispatched task.
print(len(lazy_partials))
for pending in lazy_partials:
    print(pending)

# What we hold at this point is lazy: the results have not been calculated yet!

4
53f40593-3c0c-4ce3-8fa9-44069e70bf7c
1fee9843-c3c4-416b-b0e2-d38653a7257e
a6b1b1dd-6f0b-4c3f-a136-ab83a39d2268
ae7b78cc-4794-4da9-9627-898d06eb6902


In [13]:
# The final results from all the workers only become available through .get().
partials = lazy_partials.get()
print(len(partials))
for idx, returned_chunk in enumerate(partials):
    print('length of chunk {}: {}'.format(idx, len(returned_chunk)))

4
length of chunk 0: 250000
length of chunk 1: 250000
length of chunk 2: 250000
length of chunk 3: 250000


**Let us check the running time again!**

In [14]:
# Time only the dispatch step, i.e. handing the sort tasks to the workers.
# time.perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments.
t0 = time.perf_counter()
lazy_partials = group(sort.s(seq) for seq in subseqs)()  # call remote workers to run the sort task
dt = time.perf_counter() - t0
print(' took %.02fs' % (dt))


 took 1.00s


It took this much time to communicate with the workers, but the results are not calculated yet.
In the background, the calculation continues...

In [15]:
# Time how long we have to wait for the already-dispatched tasks to return.
# time.perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments.
t0 = time.perf_counter()
partials = lazy_partials.get()  # will wait for the tasks to return
dt = time.perf_counter() - t0
print('took %.02fs' % (dt))


took 10.27s


We needed this much additional time to get all the calculated results back from the workers.