# Dask Examples

In [1]:
import os
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [2]:
data_dir = 'data'
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

## 1. Working with large data sets in multiple files

## Setup

In [3]:
%%time

# Create n large csv files (could be too big to fit all in memory)
n = 5
shape = (10000, 1000)
index_start = 0

index_end = index_start
for i in range(n):
    filepath = os.path.join(data_dir, f'datafile_{i:02d}.csv')
    index_start, index_end = index_end, index_end + shape[0]
    if not os.path.exists(filepath):
        data = (i + 1) * np.random.randn(shape[0], shape[1])
        print(f"Array {i} size in memory: {data.nbytes*1e-6:.2f} MB")
        index = pd.RangeIndex(index_start, index_end, name='Index')
        pd.DataFrame(data, index=index).to_csv(filepath)

Array 0 size in memory: 80.00 MB
Array 1 size in memory: 80.00 MB
Array 2 size in memory: 80.00 MB
Array 3 size in memory: 80.00 MB
Array 4 size in memory: 80.00 MB
CPU times: user 1min 33s, sys: 4.57 s, total: 1min 37s
Wall time: 1min 43s


## Work with data using Dask

In [4]:
from dask.distributed import Client, progress

client = Client(n_workers=2, threads_per_worker=2, memory_limit='1GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:59933  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 2.00 GB


In [5]:
%%time

# Define Dask computations
dataframes = [
    dd.read_csv(os.path.join(data_dir, filename)).set_index('Index')
    for filename in os.listdir(data_dir) if filename.endswith('.csv')
]

combined_df = dd.concat(dataframes)

CPU times: user 8.18 s, sys: 259 ms, total: 8.44 s
Wall time: 17.1 s


In [6]:
combined_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.400181,0.610649,-1.505594,-0.722887,0.556303,-0.348722,-0.364021,0.731849,0.748032,-0.223252,...,0.346338,-1.747218,0.634405,-0.395872,0.010671,0.539237,-1.692777,-0.227991,0.790736,-1.417924
1,1.875394,-2.148834,-0.148973,-0.333544,0.999116,1.184814,0.642828,1.042833,1.039625,-0.640682,...,0.530851,0.641269,0.556786,0.528313,-0.085312,1.674977,0.106694,-0.958992,-0.472368,0.030656
2,1.172176,0.423629,-3.305103,0.319064,0.154128,-0.537401,-0.418881,-0.487734,0.685797,-0.066923,...,0.497662,-0.195739,1.2184,1.180031,1.854949,0.055486,-1.333175,-0.777586,-0.363799,0.664064
3,-0.532357,2.093363,-0.373611,-0.488999,-0.752565,-2.075056,-1.100282,0.505867,2.278174,-0.464888,...,-0.601322,0.081838,0.028054,-0.746534,0.037045,0.731593,0.353677,-1.825305,-2.086533,1.330815
4,0.689753,-0.446567,2.682292,1.336689,0.998821,-0.537471,-0.517583,-0.781442,-1.702376,-0.74831,...,-0.695017,-0.040847,-0.529393,-0.722727,0.544039,-0.235467,0.145447,0.682038,-0.405109,-0.90755


In [7]:
combined_df.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49995,1.998952,-1.373204,-5.674616,-1.858813,4.397917,8.142478,-0.553167,0.136201,6.037382,-10.171684,...,-7.362298,0.460649,-1.593336,-5.511917,3.121796,-3.322765,-2.019599,-2.921468,-0.24602,3.070247
49996,-1.229766,2.649333,-5.808332,-1.262984,6.945975,-4.941374,0.800986,6.167685,5.543103,-6.911818,...,4.472874,7.47633,-9.853683,-6.145771,-0.100017,3.250495,-1.059554,5.195828,10.35607,0.290861
49997,-3.543714,-1.389486,-4.557311,-7.266622,0.464583,-2.006719,3.110085,3.813746,-7.970505,0.100407,...,6.1191,-1.736862,-4.450691,5.149278,-5.062946,4.990604,6.01038,-0.449093,3.068553,4.885985
49998,-2.109118,-1.675563,0.216072,7.618959,-3.620683,-11.645267,-4.712474,1.508366,-1.903519,1.538641,...,-0.212659,-1.371856,-0.589846,-4.825625,5.797191,-6.006743,-12.203543,0.640505,-4.079706,-0.33972
49999,-2.80174,3.730997,7.3814,4.195546,3.277159,-4.685313,-3.000238,-2.019227,-9.338714,-5.452893,...,5.139748,1.417912,0.129172,-4.022613,2.007765,-4.702833,-0.226432,-3.716302,4.862043,6.231629


In [8]:
%%time

# Check the index values are not duplicated
assert np.all(combined_df.index.compute().duplicated() == False)

CPU times: user 946 ms, sys: 68.4 ms, total: 1.01 s
Wall time: 8.1 s


In [9]:
%%time

# Define whatever calcs you need here
summary = {
    'Mean': combined_df.mean(),
    'Min': combined_df.min(),
    'Max': combined_df.max()
}

CPU times: user 659 ms, sys: 12.5 ms, total: 671 ms
Wall time: 664 ms


In [10]:
%%time

# Execute computations
results = dd.compute(summary)
type(results)

CPU times: user 1.17 s, sys: 109 ms, total: 1.28 s
Wall time: 17.2 s


tuple

In [11]:
pd.DataFrame(results[0])

Unnamed: 0,Mean,Min,Max
0,-0.006559,-19.179415,18.751679
1,0.004683,-19.625493,17.669995
2,-0.007000,-19.705957,18.007073
3,0.020988,-20.205763,17.571911
4,0.008356,-19.157591,18.266343
...,...,...,...
995,-0.021518,-18.641149,21.199286
996,-0.021955,-22.531171,18.191493
997,-0.007056,-20.054675,18.313921
998,0.030749,-19.856643,17.387779


In [12]:
%%time

# Go back and fetch one data record
row = combined_df.loc[25000].compute()
row[0:10]

CPU times: user 967 ms, sys: 69.4 ms, total: 1.04 s
Wall time: 9.73 s


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25000,3.598747,-2.700632,-0.620862,1.233367,-0.852075,-0.682858,2.711008,-0.753854,-1.43202,-3.327781,...,1.546905,0.037293,-4.618509,-0.066837,3.290556,-3.396892,-3.204845,-1.67425,-1.43782,-1.178303


In [13]:
# Clean up
client.close()

for filename in os.listdir(data_dir):
    os.remove(os.path.join(data_dir, filename))

del summary, combined_df, dataframes