# Dask Examples

In [1]:
import os
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [2]:
data_dir = 'data'
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

## 1. Working with large data sets in multiple files

## Setup

In [3]:
%%time

# Create 5 large csv files (could be too big to fit all in memory)
shape = (10000, 1000)

for i in range(5):
    filepath = os.path.join(data_dir, f'datafile_{i:02d}.csv')
    if not os.path.exists(filepath):
        data = (i + 1) * np.random.randn(shape[0], shape[1])
        print(f"Array {i} size in memory: {data.nbytes*1e-6:.2f} MB")
        pd.DataFrame(data).to_csv(filepath)

Array 0 size in memory: 80.00 MB
Array 1 size in memory: 80.00 MB
Array 2 size in memory: 80.00 MB
Array 3 size in memory: 80.00 MB
Array 4 size in memory: 80.00 MB
CPU times: user 1min 33s, sys: 4.51 s, total: 1min 37s
Wall time: 1min 38s


## Work with data using Dask

In [4]:
from dask.distributed import Client, progress

client = Client(n_workers=2, threads_per_worker=2, memory_limit='1GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:56225  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 2.00 GB


In [5]:
%%time

# Define Dask computations
dataframes = [
    dd.read_csv(os.path.join(data_dir, filename)).set_index('Unnamed: 0')
    for filename in os.listdir(data_dir)
]

combined_df = dd.concat(dataframes).reset_index(drop=True)

CPU times: user 8.23 s, sys: 245 ms, total: 8.47 s
Wall time: 17.3 s


In [6]:
%%time

# Define whatever calcs you need here
summary = combined_df.describe()

CPU times: user 7.9 s, sys: 318 ms, total: 8.22 s
Wall time: 8.01 s


In [7]:
%%time

# Execute computations
summary.compute()

CPU times: user 1min 37s, sys: 4.83 s, total: 1min 41s
Wall time: 3min 35s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.016292,0.005187,-0.005829,0.007926,0.00026,0.017051,-0.015583,-0.016277,0.01478,-0.003238,...,0.00875,0.017138,0.04188,-0.024165,-0.002877,-0.013523,-0.003742,0.004362,0.012684,-0.009275
std,3.304547,3.318474,3.337107,3.325068,3.31547,3.315709,3.316358,3.32353,3.308416,3.313375,...,3.342689,3.334616,3.307999,3.311561,3.310151,3.301174,3.312251,3.321849,3.335768,3.318291
min,-19.035765,-20.381286,-19.992876,-18.836203,-18.76999,-20.629614,-21.942701,-19.553089,-21.193616,-17.959556,...,-18.108314,-20.346939,-22.628082,-18.535893,-18.858015,-20.014767,-18.020325,-18.425364,-18.622013,-18.198762
25%,-0.663147,-0.653778,-0.6609,-0.625495,-0.654016,-0.532954,-0.556047,-0.626029,-0.621095,-0.652785,...,-0.656257,-0.636233,-0.651321,-0.575609,-0.650657,-0.654585,-0.658362,-0.630469,-0.68456,-0.625501
50%,0.119989,0.289878,0.08338,0.164652,0.245312,0.182461,0.184561,0.251316,0.169028,0.119771,...,0.292113,0.368796,0.231273,0.180168,0.205766,0.144189,0.087745,0.0812,0.224281,0.191066
75%,3.524226,3.498102,3.426767,3.519426,3.368119,3.600682,3.459851,3.420165,3.442486,3.473244,...,3.545111,3.570854,3.427513,3.363331,3.530686,3.51164,3.560842,3.528468,3.733729,3.552553
max,19.126532,19.613366,19.029658,19.991366,18.927045,18.918707,18.540806,19.042571,20.195461,17.98055,...,19.378494,20.455148,19.142178,18.386409,20.15114,19.138382,17.678503,19.277931,18.847879,18.087793


In [8]:
%%time

# Go back and fetch one data record
row = combined_df.loc[25000].compute()
row[0:10]

CPU times: user 2.77 s, sys: 155 ms, total: 2.92 s
Wall time: 12 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999


In [9]:
# Clean up
client.close()

for filename in os.listdir(data_dir):
    os.remove(os.path.join(data_dir, filename))

del summary, combined_df, dataframes