#### Parallelizing pandas dataframe

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample data: 10,000 rows of random integers in [0, 100) across columns A-D.
# NOTE: no random seed is set, so the generated values change on every run.
df = pd.DataFrame(np.random.randint(0,100,size=(10000, 4)), columns=list('ABCD'))
df.head()

Unnamed: 0,A,B,C,D
0,4,98,68,59
1,1,2,23,62
2,49,66,23,22
3,11,21,89,67
4,80,34,84,80


In [3]:
# Sanity check: the sample frame was built with 10000 rows and 4 columns.
df.shape

(10000, 4)

In [4]:
# sample operation
def mul_row(a: int, b: int, c: int, d: int):
    """Return the product of the four given values.

    The factors are multiplied left to right: ((a * b) * c) * d.
    """
    product = a * b
    product = product * c
    product = product * d
    return product

In [5]:
%%time

# Baseline: plain row-wise `DataFrame.apply` (axis=1), without any parallelism.
# Each row is handed to the lambda, which forwards columns A-D to `mul_row`;
# the products are stored in a new `mult_result` column on `df`.
df['mult_result'] = df.apply(lambda row: mul_row(row["A"], row["B"], row["C"], row["D"]), axis=1)
# Show the first 10 rows, now including `mult_result`.
df.head(10)

CPU times: user 130 ms, sys: 3.42 ms, total: 134 ms
Wall time: 132 ms


Unnamed: 0,A,B,C,D,mult_result
0,4,98,68,59,1572704
1,1,2,23,62,2852
2,49,66,23,22,1636404
3,11,21,89,67,1377453
4,80,34,84,80,18278400
5,0,37,7,69,0
6,8,6,73,13,45552
7,23,33,75,51,2903175
8,37,96,25,60,5328000
9,16,98,47,87,6411552


In [6]:
# Swifter

# Official doc: https://github.com/jmcarpenter2/swifter
# examples: https://github.com/jmcarpenter2/swifter/blob/master/examples/swifter_apply_examples.ipynb

In [7]:
#!pip install swifter

In [8]:
import swifter

# Same row-wise product, but through the `.swifter.apply` accessor on the four
# input columns; the result is stored in `mult_result_sw` so it can be compared
# with the baseline `mult_result` column.
# NOTE(review): swifter may vectorize this function instead of running it in
# parallel, which would explain a large speed-up over the baseline -- confirm
# against the swifter docs.
%time df['mult_result_sw'] = df[['A', 'B', 'C', 'D']].swifter.apply(lambda row: mul_row(row['A'], row['B'], row['C'], row['D']), axis=1)

df.head(10)

CPU times: user 7.2 ms, sys: 530 µs, total: 7.73 ms
Wall time: 7.74 ms


Unnamed: 0,A,B,C,D,mult_result,mult_result_sw
0,4,98,68,59,1572704,1572704
1,1,2,23,62,2852,2852
2,49,66,23,22,1636404,1636404
3,11,21,89,67,1377453,1377453
4,80,34,84,80,18278400,18278400
5,0,37,7,69,0,0
6,8,6,73,13,45552,45552
7,23,33,75,51,2903175,2903175
8,37,96,25,60,5328000,5328000
9,16,98,47,87,6411552,6411552


In [9]:
# Pandarallel

# Official doc: https://nalepae.github.io/pandarallel/
# https://github.com/nalepae/pandarallel

In [10]:
#!pip install pandarallel 

In [11]:
from pandarallel import pandarallel

# Initialize pandarallel (with a progress bar) before `parallel_apply` is used on `df`.
pandarallel.initialize(progress_bar=True)

# sample function
def mul_row_2(row):
    """Return the product of the 'A', 'B', 'C' and 'D' entries of ``row``.

    ``row`` only needs to support lookup by those four keys (for example a
    DataFrame row). The values are multiplied left to right.
    """
    result = row['A']
    for column in ('B', 'C', 'D'):
        result = result * row[column]
    return result

# Row-wise product again, this time through `parallel_apply` with `mul_row_2`;
# the result goes into `mult_result_pl` for comparison with the other result columns.
%time df['mult_result_pl'] = df.parallel_apply(mul_row_2, axis=1)

df.head(10)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1250), Label(value='0 / 1250'))), …

CPU times: user 104 ms, sys: 66.1 ms, total: 171 ms
Wall time: 196 ms


Unnamed: 0,A,B,C,D,mult_result,mult_result_sw,mult_result_pl
0,4,98,68,59,1572704,1572704,1572704
1,1,2,23,62,2852,2852,2852
2,49,66,23,22,1636404,1636404,1636404
3,11,21,89,67,1377453,1377453,1377453
4,80,34,84,80,18278400,18278400,18278400
5,0,37,7,69,0,0,0
6,8,6,73,13,45552,45552,45552
7,23,33,75,51,2903175,2903175,2903175
8,37,96,25,60,5328000,5328000,5328000
9,16,98,47,87,6411552,6411552,6411552


In [None]:
# Multiprocessing

# Official doc: https://docs.python.org/3/library/multiprocessing.html

# ref: https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply


import multiprocessing as mp
from func_file import parallelize_mul

def parallelize_dataframe(df, func, num_processes=4):
    """Apply ``func`` to row-wise chunks of ``df`` and concatenate the results.

    The frame is cut by row position into contiguous chunks, each chunk is
    passed to ``func`` in a worker process, and the per-chunk results are
    combined with ``pd.concat`` in the original row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Receives one chunk (a DataFrame) and returns an object that
        ``pd.concat`` can combine. When more than one chunk is used it is sent
        to the workers by pickling, so it must be importable from a module.
    num_processes : int, default 4
        Upper bound on the number of worker processes (and chunks). Pass
        ``mp.cpu_count()`` to use one worker per CPU core.

    Returns
    -------
    The concatenation of ``func(chunk)`` over all chunks.

    Raises
    ------
    ValueError
        If ``num_processes`` is smaller than 1.
    """
    if num_processes < 1:
        raise ValueError("num_processes must be at least 1")

    # Never create more chunks than rows: empty chunks only add overhead and
    # many chunk functions (e.g. a row-wise apply) fail on an empty frame.
    num_rows = len(df)
    num_chunks = max(1, min(num_processes, num_rows))

    # Slice by position instead of calling np.array_split on the frame, which
    # relies on the deprecated DataFrame.swapaxes. Chunk sizes match
    # np.array_split: the first `extra` chunks get one additional row.
    base, extra = divmod(num_rows, num_chunks)
    chunks = []
    start = 0
    for i in range(num_chunks):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    if num_chunks == 1:
        # A pool would only add start-up cost here. Work on a copy so `func`
        # cannot mutate the caller's frame, as with pickled chunks.
        results = [func(chunks[0].copy())]
    else:
        with mp.Pool(num_chunks) as pool:
            results = pool.map(func, chunks)

    return pd.concat(results)


# NOTE:
# Created a py file with the required functions because, otherwise, an error is thrown --> AttributeError: Can't get attribute 'parallelize_mul' on <module '__main__' (built-in)>
# REF: https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3
'''
Functions in func_file

def parallelize_mul(df):
    df['output'] = df.apply(mul_row_2, axis=1)
    return df

def mul_row_2(row):
    return row['A'] * row['B'] * row['C'] * row['D']
'''

In [13]:
# Time the multiprocessing variant: `parallelize_mul` (imported from func_file)
# is applied to chunks of `df` by `parallelize_dataframe`.
%time df_output = parallelize_dataframe(df, parallelize_mul)

CPU times: user 17.7 ms, sys: 55.1 ms, total: 72.8 ms
Wall time: 627 ms


In [14]:
# Peek at the combined result; it should now carry the `output` column added by `parallelize_mul`.
df_output.head()

Unnamed: 0,A,B,C,D,mult_result,mult_result_sw,mult_result_pl,output
0,4,98,68,59,1572704,1572704,1572704,1572704
1,1,2,23,62,2852,2852,2852,2852
2,49,66,23,22,1636404,1636404,1636404,1636404
3,11,21,89,67,1377453,1377453,1377453,1377453
4,80,34,84,80,18278400,18278400,18278400,18278400
