# Computing basic Stats with the CPU

In [1]:
from kosmoss import CONFIG, DATA_PATH, PROCESSED_DATA_PATH
from kosmoss.utils import timing

In [2]:
import dask
import dask.array as da
import numpy as np
import os.path as osp
import torch
from typing import Dict, List, Text

step = CONFIG['timestep']
num_workers = CONFIG['num_workers']

features_path = osp.join(PROCESSED_DATA_PATH, f'features-{step}')

x = da.from_npy_stack(osp.join(features_path, 'x'))
y = da.from_npy_stack(osp.join(features_path, 'y'))
edge = da.from_npy_stack(osp.join(features_path, 'edge'))

In [3]:
x

Unnamed: 0,Array,Chunk
Bytes,11.16 GiB,13.48 MiB
Shape,"(1085440, 138, 20)","(1280, 138, 20)"
Count,848 Tasks,848 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 11.16 GiB 13.48 MiB Shape (1085440, 138, 20) (1280, 138, 20) Count 848 Tasks 848 Chunks Type float32 numpy.ndarray",20  138  1085440,

Unnamed: 0,Array,Chunk
Bytes,11.16 GiB,13.48 MiB
Shape,"(1085440, 138, 20)","(1280, 138, 20)"
Count,848 Tasks,848 Chunks
Type,float32,numpy.ndarray


## Loading in a single thread with pure NumPy

Several cons to using this method:

* Slow, monothreaded data loading
* Requires to load the entire file content at once into memory
* Can be limited depending on the amount of available RAM

In [4]:
@timing
def compute_stats_mono(arrays: List[da.Array]) -> Dict[Text, np.ndarray]:
    
    # Simulate pure NumPy
    num_workers = 1
    
    stats = {}
    for a in arrays:
        
        # Load data into memory
        a_ = a.compute(num_workers=num_workers)
        
        # Compute mean and standard-deviation for array
        a_mean = np.mean(a_, axis=0)
        a_std = np.std(a_, axis=0)
        
        name = a.name.split("/")[-1]
        stats.update({
            f'{name}_mean': torch.tensor(a_mean),
            f'{name}_std': torch.tensor(a_std)
        })
        
    return stats

Open an `htop` in a side terminal, and watch the memory grow

In [5]:
stats = compute_stats_mono([x, y, edge])

280975.03 ms


## Multithreaded loading with Dask

Again, most of the process in *Dask* is handled in lazy evaluation mode. Dask builds a computational graph called a *Directed Acyclic Graph* (DAG) and executes the command only if needed, proceeding with optimizations along the way, if any.

Moreover, the `compute()` method executes the DAG on each data chunk by using Math formula to distribute computations when possible.

In [6]:
@timing
def compute_stats_multi(arrays: List[da.Array]) -> Dict[Text, np.ndarray]:
    
    # Scaling computation by increasing default number of workers
    num_workers = 16
    
    stats = {}
    for a in arrays:
        
        # Lazy evaluation
        a_mean = da.mean(a, axis=0)
        a_std = da.std(a, axis=0)
        
        # Compute mean and standard-deviation for current array
        m = a_mean.compute(num_workers=num_workers)
        s = a_std.compute(num_workers=num_workers)
        
        name = a.name.split("/")[-1]
        stats.update({
            f'{name}_mean': torch.tensor(m),
            f'{name}_std': torch.tensor(s)
        })
        
    return stats

In [7]:
stats = compute_stats_multi([x, y, edge])

18083.31 ms


You should observe a substantial gain in computational time.

## Saving the Stats for later use

We will use this data to perform on-the-fly input normalization within the model itself with a Normalization layer.

`torch.save` uses the Python Pickle format to save data. You can save anything pickable, which is not exactly a limitation since many pure Python code is pickle-serializable.

In [8]:
stats_path = osp.join(DATA_PATH, f"stats-features-{step}.pt")
torch.save(stats, stats_path)

# Same for the Flattened dataset

In [10]:
flattened_path = osp.join(PROCESSED_DATA_PATH, f'flattened-{step}')

x = da.from_npy_stack(osp.join(flattened_path, 'x'))
y = da.from_npy_stack(osp.join(flattened_path, 'y'))

stats = compute_stats_multi([x, y])

stats_path = osp.join(DATA_PATH, f"stats-flattened-{step}.pt")
torch.save(stats, stats_path)

49732.58 ms
