In [1]:
from pathlib import Path
import concurrent.futures

import numpy as np
import pandas as pd
from joblib import Parallel,delayed
import dask

%load_ext memory_profiler
%load_ext line_profiler

In [2]:
dataPath = Path('../data')
rawData = dataPath / 'raw'
rawData.exists()

True

In [10]:
N = 250000
df = pd.DataFrame(np.random.standard_normal((N,50)))
filePath = rawData / "sampleExcel.xlsx"
# df.to_excel(filePath,engine="openpyxl")

In [13]:
%timeit -n 1 -r 1 df.mean()

156 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [4]:
repeatN = 2

In [5]:
def coreFunc(filePath):
    df = pd.read_excel(filePath,engine="openpyxl")
    for i in range(5):
        df = df ** 2
    return df

In [6]:
%lprun -f coreFunc coreFunc(filePath)

Timer unit: 1e-06 s

Total time: 0.933781 s
File: /tmp/ipykernel_4175/1174551119.py
Function: coreFunc at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def coreFunc(filePath):
     2         1     931197.0 931197.0     99.7      df = pd.read_excel(filePath,engine="openpyxl")
     3         6         48.0      8.0      0.0      for i in range(5):
     4         5       2529.0    505.8      0.3          df = df ** 2
     5         1          7.0      7.0      0.0      return df

In [7]:
def serialPandas(repeatN):
    return [coreFunc(filePath) for _ in range(repeatN)]
serial = serialPandas(repeatN = repeatN)
assert isinstance(serial,list)
assert all([isinstance(el,pd.DataFrame) for el in serial])

In [8]:
%timeit -n 1 -r 1 serialPandas(repeatN=repeatN)

419 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
# n_jobs = 2
# def jlPandas(repeatN:int,n_jobs:int,backend:str):
#     return Parallel(n_jobs=n_jobs, backend=backend)(delayed(coreFunc) for _ in range(repeatN))
# for backend in ["loky","threading","multiprocessing"]:
#     print(backend)
#     loky = jlPandas(repeatN = repeatN,backend=backend,n_jobs=2)
#     assert isinstance(loky,list)
#     assert all([isinstance(el,pd.DataFrame) for el in loky])
#     assert len(loky) == len(serial)
#     assert all([pd.testing.assert_frame_equal(lk,sr)==None for lk,sr in zip(loky,serial)]),f"{[pd.testing.assert_frame_equal(lk,sr) for lk,sr in zip(loky,serial)]}"


loky


In [9]:
def example(filePath):
    # This is a static table with basic information like distributions
    return coreFunc(filePath)

def concurrentPandas(repeatN):
    results = []
    with concurrent.futures.ProcessPoolExecutor() as pool:
        futr_results = [pool.submit(example,filePath) for _ in range(repeatN)]
        done_results = concurrent.futures.as_completed(futr_results)
        for _ in futr_results: 
            results.append(next(done_results).result())
    return results
repeatN = 2
conc = concurrentPandas(repeatN=repeatN)
assert isinstance(conc,list)
assert all([isinstance(el,pd.DataFrame) for el in conc])
assert len(conc) == len(serial)
assert all([pd.testing.assert_frame_equal(lk,sr)==None for lk,sr in zip(conc,serial)]),f"{[pd.testing.assert_frame_equal(lk,sr) for lk,sr in zip(conc,serial)]}"


In [None]:
repeatN = 5
n_jobs = 4
print("serial")
%timeit -n 1 -r 1 serialPandas(repeatN=repeatN)
print("concurrent")
%timeit -n 1 -r 1 concurrentPandas(repeatN=repeatN)

print("loky")
%timeit -n 1  -r 1 jlPandas(repeatN = repeatN,backend="loky",n_jobs=n_jobs)
print("threading")
%timeit -n 1  -r 1 jlPandas(repeatN = repeatN,backend="threading",n_jobs=n_jobs)

print("multiprocessing")
%timeit -n 1  -r 1 jlPandas(repeatN = repeatN,backend="multiprocessing",n_jobs=n_jobs)


serial
1min 46s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
concurrent
36.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
loky
