****************************************************************

In [1]:
# Importing the root of this bootcamp
import os.path as osp
import sys

sys.path.append(osp.abspath('..'))

# Computing basic Stats with the CPU

In [9]:
import dask
import dask.array as da
import numpy as np
import os.path as osp
import torch
from typing import Dict, Text

import config
import utils

step = config.config['timestep']
num_workers = config.config['num_workers']

feats_path = osp.join(config.processed_data_path, f'features-{step}')

x = da.from_npy_stack(osp.join(feats_path, 'x'))
y = da.from_npy_stack(osp.join(feats_path, 'y'))
edge = da.from_npy_stack(osp.join(feats_path, 'edge'))

In [10]:
x

Unnamed: 0,Array,Chunk
Bytes,11.16 GiB,13.48 MiB
Shape,"(1085440, 138, 20)","(1280, 138, 20)"
Count,848 Tasks,848 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 11.16 GiB 13.48 MiB Shape (1085440, 138, 20) (1280, 138, 20) Count 848 Tasks 848 Chunks Type float32 numpy.ndarray",20  138  1085440,

Unnamed: 0,Array,Chunk
Bytes,11.16 GiB,13.48 MiB
Shape,"(1085440, 138, 20)","(1280, 138, 20)"
Count,848 Tasks,848 Chunks
Type,float32,numpy.ndarray


## Loading in a single thread with pure NumPy

Open an `htop` in a side terminal, and watch the memory grow

In [19]:
@utils.timing
def compute_stats_mono(
    x: da.Array, 
    y: da.Array, 
    edge: da.Array) -> Dict[Text, np.ndarray]:
    
    # Load the data into memory
    # Simulate pure NumPy
    num_workers = 1
    x = x.compute(num_workers=num_workers)
    y = y.compute(num_workers=num_workers)
    edge = edge.compute(num_workers=num_workers)
    
    # Compute basic stats using NumPy, on axis 0
    x_mean = np.mean(x, axis=0)
    y_mean = np.mean(y, axis=0)
    edge_mean = np.mean(edge, axis=0)
    
    x_std = np.std(x, axis=0)
    y_std = np.std(y, axis=0)
    edge_std = np.std(edge, axis=0)
    
    return {
        'x_mean': x_mean,
        'y_mean': y_mean,
        'edge_mean': edge_mean,
        'x_std': x_std,
        'y_std': y_std,
        'edge_std': edge_std
    }

In [20]:
stats = compute_stats_mono(x, y, edge)

518427.51 ms


## Multithreaded loading with Dask

Again, most of the process in *Dask* is handled in lazy evaluation mode. Dask builds the graph and executes the command only if needed, proceeding with optimizations along the way

In [21]:
@utils.timing
def compute_stats_multi(
    x: da.Array, 
    y: da.Array, 
    edge: da.Array) -> Dict[Text, np.ndarray]:
    
    # Lazy evaluation
    x_mean = da.mean(x, axis=0)
    y_mean = da.mean(y, axis=0)
    edge_mean = da.mean(edge, axis=0)

    x_std = da.std(x, axis=0)
    y_std = da.std(y, axis=0)
    edge_std = da.std(edge, axis=0)
    
    # Scaling computation by increasing default number of workers
    num_workers = 16
    
    x_mean.compute(num_workers=num_workers)
    y_mean.compute(num_workers=num_workers)
    edge_mean.compute(num_workers=num_workers)
    
    x_std.compute(num_workers=num_workers)
    y_std.compute(num_workers=num_workers)
    edge_std.compute(num_workers=num_workers)
    
    return {
        'x_mean': x_mean,
        'y_mean': y_mean,
        'edge_mean': edge_mean,
        'x_std': x_std,
        'y_std': y_std,
        'edge_std': edge_std
    }

In [22]:
stats = compute_stats_multi(x, y, edge)

171877.10 ms


You should observe a 5x gain in computation time.

## Saving the Stats for later use

`torch.save` uses the Python Pickle format to save data. You can save anything pickable, which is not exactly a limitation since many pure Python code is pickle-serializable.

In [23]:
stats_path = osp.join(config.processed_data_path, f"stats-{step}.pt")
torch.save(stats, stats_path)