In [1]:
import ast
import os
import time
from itertools import cycle

import dask
import dask.array as da
import numpy as np
from dask.distributed import Client, get_worker, wait
from sklearn import datasets
from sklearn.metrics import confusion_matrix

In [2]:
# Number of points generated per fragment; also the chunk length of the dask arrays built below.
POINTS_PER_FRAGMENT = 10
# Number of fragments (chunks) that are generated and concatenated into the full dataset.
NUMBER_OF_FRAGMENTS = 5
# Number of features of each generated point.
DIMENSIONS = 3
# Number of blob centers (i.e. distinct labels) the points are drawn around.
NUMBER_OF_CENTERS = 3
# NOTE(review): not referenced in the cells visible here; presumably the number of
# cascade-SVM iterations used later in the notebook -- confirm.
NUMBER_OF_CSVM_ITERATIONS = 3

# Base random seed; fragment i is generated with SEED + i.
SEED = 42


In [3]:
# Connect a distributed client. With no arguments, dask.distributed starts a local
# cluster in the background and makes it the default scheduler for later dask work.
client = Client()

In [4]:
def generate_points(num_points, num_centers, dim, seed):
    """Generate one fragment of labelled points clustered around fixed centers.

    The blob centers are drawn from a generator with a fixed seed, so every call
    with the same ``num_centers`` and ``dim`` uses the same centers; only the
    points scattered around them vary with ``seed``.

    Parameters
    ----------
    num_points : int
        Number of points to generate (``n_samples`` of ``make_blobs``).
    num_centers : int
        Number of blob centers.
    dim : int
        Number of features of each point.
    seed : int
        ``random_state`` forwarded to ``datasets.make_blobs``.

    Returns
    -------
    tuple
        ``(points, labels)`` exactly as returned by ``datasets.make_blobs``.
    """
    # We are mimicking the center generation that datasets.make_blobs does, but
    # with a fixed seed so the centers are equal for all generate_points calls.
    # A private RandomState is used instead of np.random.seed(777): it yields the
    # same centers without reseeding the process-wide NumPy generator, which
    # would silently affect any other code running in the same process.
    center_rng = np.random.RandomState(777)
    centers = center_rng.uniform(-1, 1, (num_centers, dim)) * 10

    points, labels = datasets.make_blobs(
        n_samples=num_points, n_features=dim, centers=centers, shuffle=True, random_state=seed)

    return points, labels


In [5]:
# Build the dataset lazily: each fragment is a delayed generate_points call whose
# two outputs are wrapped as dask array blocks with the shapes/dtypes declared here.
dataset_blocks = []
labels_blocks = []

delayed_generate_points = dask.delayed(generate_points, nout=2)

for fragment_index in range(NUMBER_OF_FRAGMENTS):
    # Every fragment gets its own seed so the fragments contain different points.
    delayed_points, delayed_labels = delayed_generate_points(
        POINTS_PER_FRAGMENT, NUMBER_OF_CENTERS, DIMENSIONS, SEED + fragment_index)

    points_block = da.from_delayed(
        delayed_points, shape=(POINTS_PER_FRAGMENT, DIMENSIONS), dtype=np.float64)
    dataset_blocks.append(points_block)

    labels_block = da.from_delayed(
        delayed_labels, shape=(POINTS_PER_FRAGMENT,), dtype=np.int64)
    labels_blocks.append(labels_block)

# Stack the fragments along the first axis and start computing them on the cluster.
dataset = da.concatenate(dataset_blocks).persist()
labels = da.concatenate(labels_blocks).persist()

# Block until every chunk of both arrays has been computed.
wait(dataset)
wait(labels)

DoneAndNotDoneFutures(done={<Future: finished, type: numpy.ndarray, key: ('concatenate-64698f87b352a0adae036b2632ed1f89', 1)>, <Future: finished, type: numpy.ndarray, key: ('concatenate-64698f87b352a0adae036b2632ed1f89', 4)>, <Future: finished, type: numpy.ndarray, key: ('concatenate-64698f87b352a0adae036b2632ed1f89', 2)>, <Future: finished, type: numpy.ndarray, key: ('concatenate-64698f87b352a0adae036b2632ed1f89', 3)>, <Future: finished, type: numpy.ndarray, key: ('concatenate-64698f87b352a0adae036b2632ed1f89', 0)>}, not_done=set())

In [6]:
# Show the summary of the persisted points array (shape, chunking, dtype).
dataset

Unnamed: 0,Array,Chunk
Bytes,1.17 kiB,240 B
Shape,"(50, 3)","(10, 3)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.17 kiB 240 B Shape (50, 3) (10, 3) Dask graph 5 chunks in 1 graph layer Data type float64 numpy.ndarray",3  50,

Unnamed: 0,Array,Chunk
Bytes,1.17 kiB,240 B
Shape,"(50, 3)","(10, 3)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [7]:
# Show the summary of the persisted labels array (shape, chunking, dtype).
labels

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50,)","(10,)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 400 B 80 B Shape (50,) (10,) Dask graph 5 chunks in 1 graph layer Data type int64 numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50,)","(10,)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [8]:
# One integer id in [0, 2**16) per point, chunked like the dataset (one chunk per fragment).
# The ids are drawn from a RandomState seeded with SEED so that a fresh
# "Restart & Run All" reproduces the same ids; the module-level
# da.random.randint is unseeded and gives different values on every run.
# NOTE(review): random draws are not guaranteed to be unique -- confirm that
# occasional duplicate ids are acceptable for the code that consumes them.
ids_rng = da.random.RandomState(SEED)
ids = ids_rng.randint(
    2**16, size=(POINTS_PER_FRAGMENT * NUMBER_OF_FRAGMENTS,), chunks=(POINTS_PER_FRAGMENT,))
ids

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50,)","(10,)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 400 B 80 B Shape (50,) (10,) Dask graph 5 chunks in 1 graph layer Data type int64 numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50,)","(10,)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [13]:
# Inspect the key of the first chunk the scheduler reports for `dataset`.
# who_has maps task keys to the workers holding them; here the keys come back
# as the string form of a tuple, so they are parsed back into a tuple.
# ast.literal_eval only accepts Python literals, unlike eval, which would
# execute arbitrary code contained in the string.
first_key = next(iter(client.who_has(dataset)))
# Some distributed versions may return the key as a tuple already; pass it through.
ast.literal_eval(first_key) if isinstance(first_key, str) else first_key

('concatenate-75f088bfcb8e68ff8fd181f4440bd727', 0, 0)