In [24]:
from arviz.data.base import generate_dims_coords
from arviz.data.converters import convert_to_inference_data
from line_profiler import LineProfiler
import numpy as np
from numpy import array, average, dot
import numba
from copy import deepcopy
import datetime
import warnings
from arviz.plots.kdeplot import _fast_kde_2d as f2
import numpy as np
import pkg_resources
import xarray as xr
import timeit
from scipy.signal import gaussian, convolve, convolve2d  # pylint: disable=no-name-in-module
from scipy.sparse import coo_matrix

In [2]:
# Line-profile the generate_dims_coords imported from arviz.data.base on a
# three-dimensional shape, then print the per-line timing report.
lp = LineProfiler()
wrapper = lp(generate_dims_coords)
wrapper((500,600,80), 'x')
lp.print_stats()

Timer unit: 1e-06 s

Total time: 0.000288 s
File: /home/banzee/Desktop/arviz/arviz/data/base.py
Function: generate_dims_coords at line 30

Line #      Hits         Time  Per Hit   % Time  Line Contents
    30                                           def generate_dims_coords(shape, var_name, dims=None, coords=None, default_dims=None):
    31                                               """Generate default dimensions and coordinates for a variable.
    32                                           
    33                                               Parameters
    34                                               ----------
    35                                               shape : tuple[int]
    36                                                   Shape of the variable
    37                                               var_name : str
    38                                                   Name of the variable. Used in the default name, if necessary
    39                          

In [3]:
@numba.njit
def range_(x):
    """Return ``np.arange(x)`` from a function compiled with ``numba.njit``.

    Despite lacking a ``_jit`` suffix, this is the numba-decorated variant;
    the ``*_jit`` functions in this notebook call it to build coordinates.
    """
    indices = np.arange(x)
    return indices


def range_jit(x):
    """Return ``np.arange(x)`` using plain NumPy.

    NOTE(review): the name suggests a JIT-compiled helper, but this function
    carries no numba decorator; ``range_`` is the ``@numba.njit`` version.
    """
    indices = np.arange(x)
    return indices

In [4]:
# Time the numba-decorated helper `range_`.
%timeit range_(100)

The slowest run took 6.99 times longer than the fastest. This could mean that an intermediate result is being cached.
6.34 µs ± 6.31 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Time `range_jit`, which (despite its name) is the undecorated NumPy helper.
%timeit range_jit(100)

1.33 µs ± 72.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [6]:
def generate_dims_coords(shape, var_name, dims=None, coords=None, default_dims=None):
    """Generate default dimension names and coordinates for a variable.

    Parameters
    ----------
    shape : sequence of int
        Length of each dimension that needs a name and coordinates.
    var_name : str
        Name of the variable. Used in default dimension names, which have
        the form ``"{var_name}_dim_{idx}"``.
    dims : sequence of (str or None), optional
        Dimension names matched positionally against ``shape``. Entries that
        are missing or ``None`` get a default name. Lists, tuples and other
        sequences are accepted; the argument itself is never modified.
    coords : dict, optional
        Maps dimension names to coordinate values. Dimensions without an
        entry get ``np.arange(dim_len)``. The argument is deep-copied, so it
        is never modified.
    default_dims : list of str, optional
        Names that are ignored when checking whether more ``dims`` were
        given than ``shape`` has entries.

    Returns
    -------
    dims : list
        One name per entry of ``shape`` (plus any surplus names passed in).
        Always a new list, whatever sequence type was passed.
    coords : dict
        Coordinates restricted to the names present in ``dims``.

    Warns
    -----
    SyntaxWarning
        If ``dims``, not counting ``default_dims``, has more entries than
        ``shape``.
    """
    if default_dims is None:
        default_dims = []
    # Work on a fresh list: this protects the caller's object and lets tuple
    # input be extended/overwritten below (a tuple would raise on
    # ``append``/item assignment).
    dims = [] if dims is None else list(dims)

    n_named = sum(1 for dim in dims if dim not in default_dims)
    if n_named > len(shape):
        warnings.warn(
            (
                "In variable {var_name}, there are "
                "more dims ({dims_len}) given than exist ({shape_len}). "
                "Passed array should have shape (chains, draws, *shape)"
            ).format(var_name=var_name, dims_len=len(dims), shape_len=len(shape)),
            SyntaxWarning,
        )

    coords = {} if coords is None else deepcopy(coords)

    for idx, dim_len in enumerate(shape):
        if idx >= len(dims):
            # No name supplied for this position: append the default one.
            dims.append("{var_name}_dim_{idx}".format(var_name=var_name, idx=idx))
        elif dims[idx] is None:
            # Explicit placeholder: fill in the default name in place.
            dims[idx] = "{var_name}_dim_{idx}".format(var_name=var_name, idx=idx)
        dim_name = dims[idx]
        if dim_name not in coords:
            coords[dim_name] = np.arange(dim_len)

    # Drop coordinates that do not belong to any of the final dimensions.
    coords = {key: coord for key, coord in coords.items() if key in dims}
    return dims, coords



def generate_dims_coords_jit(shape, var_name, dims=None, coords=None, default_dims=None):
    """Generate default dimension names and coordinates for a variable.

    Same contract as ``generate_dims_coords``, except that default
    coordinates are produced by the numba-compiled ``range_`` helper instead
    of calling ``np.arange`` directly.

    Parameters
    ----------
    shape : sequence of int
        Length of each dimension that needs a name and coordinates.
    var_name : str
        Name of the variable. Used in default dimension names, which have
        the form ``"{var_name}_dim_{idx}"``.
    dims : sequence of (str or None), optional
        Dimension names matched positionally against ``shape``. Entries that
        are missing or ``None`` get a default name. Lists, tuples and other
        sequences are accepted; the argument itself is never modified.
    coords : dict, optional
        Maps dimension names to coordinate values. Dimensions without an
        entry get ``range_(dim_len)``. The argument is deep-copied, so it is
        never modified.
    default_dims : list of str, optional
        Names that are ignored when checking whether more ``dims`` were
        given than ``shape`` has entries.

    Returns
    -------
    dims : list
        One name per entry of ``shape`` (plus any surplus names passed in).
        Always a new list, whatever sequence type was passed.
    coords : dict
        Coordinates restricted to the names present in ``dims``.

    Warns
    -----
    SyntaxWarning
        If ``dims``, not counting ``default_dims``, has more entries than
        ``shape``.
    """
    if default_dims is None:
        default_dims = []
    # Work on a fresh list: this protects the caller's object and lets tuple
    # input be extended/overwritten below (a tuple would raise on
    # ``append``/item assignment).
    dims = [] if dims is None else list(dims)

    n_named = sum(1 for dim in dims if dim not in default_dims)
    if n_named > len(shape):
        warnings.warn(
            (
                "In variable {var_name}, there are "
                "more dims ({dims_len}) given than exist ({shape_len}). "
                "Passed array should have shape (chains, draws, *shape)"
            ).format(var_name=var_name, dims_len=len(dims), shape_len=len(shape)),
            SyntaxWarning,
        )

    coords = {} if coords is None else deepcopy(coords)

    for idx, dim_len in enumerate(shape):
        if idx >= len(dims):
            # No name supplied for this position: append the default one.
            dims.append("{var_name}_dim_{idx}".format(var_name=var_name, idx=idx))
        elif dims[idx] is None:
            # Explicit placeholder: fill in the default name in place.
            dims[idx] = "{var_name}_dim_{idx}".format(var_name=var_name, idx=idx)
        dim_name = dims[idx]
        if dim_name not in coords:
            coords[dim_name] = range_(dim_len)

    # Drop coordinates that do not belong to any of the final dimensions.
    coords = {key: coord for key, coord in coords.items() if key in dims}
    return dims, coords


In [7]:
# Large shape, notebook-local generate_dims_coords (coords via np.arange).
%timeit generate_dims_coords((10000,10000), 'x')

54.7 µs ± 3.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [8]:
# Same large shape, variant whose coords come from the numba-compiled range_.
%timeit generate_dims_coords_jit((10000,10000), 'x')

31.5 µs ± 718 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [9]:
# Small shape, notebook-local generate_dims_coords (coords via np.arange).
%timeit generate_dims_coords((10,190), 'x')

17.6 µs ± 155 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [10]:
# Same small shape, variant whose coords come from the numba-compiled range_.
%timeit generate_dims_coords_jit((10,190), 'x')

18.9 µs ± 1.98 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [11]:
def numpy_to_data_array(ary, *, var_name="data", coords=None, dims=None):
    """Wrap an array in an ``xr.DataArray`` with "chain" and "draw" dimensions.

    The input is passed through ``np.atleast_2d``; its first two axes are
    treated as chains and draws, and any remaining axes are named by
    ``generate_dims_coords``.

    Parameters
    ----------
    ary : array-like
        Values to wrap.
    var_name : str, keyword-only
        Forwarded to ``generate_dims_coords`` for default dimension names.
    coords : dict, optional, keyword-only
        Coordinate values keyed by dimension name.
    dims : list, optional, keyword-only
        Names for the axes after the first two.

    Returns
    -------
    xr.DataArray

    Warns
    -----
    SyntaxWarning
        If the first axis is longer than the second.
    """
    default_dims = ["chain", "draw"]
    ary = np.atleast_2d(ary)
    n_chains, n_samples = ary.shape[:2]
    shape = list(ary.shape[2:])

    if n_chains > n_samples:
        message = (
            "More chains ({n_chains}) than draws ({n_samples}). "
            "Passed array should have shape (chains, draws, *shape)"
        ).format(n_chains=n_chains, n_samples=n_samples)
        warnings.warn(message, SyntaxWarning)

    dims, coords = generate_dims_coords(
        shape, var_name, dims=dims, coords=coords, default_dims=default_dims
    )

    # Prepend "draw" first and "chain" second so that "chain" ends up leading.
    for name in ("draw", "chain"):
        if name not in dims:
            dims = [name] + dims

    # Supply integer coordinates for the default dimensions when absent.
    for name, length in (("chain", n_chains), ("draw", n_samples)):
        if name not in coords:
            coords[name] = np.arange(length)

    # Keep only the coordinates that belong to one of the dimensions.
    index_coords = {}
    for key in dims:
        index_coords[key] = xr.IndexVariable((key,), data=coords[key])
    return xr.DataArray(ary, coords=index_coords, dims=dims)


def numpy_to_data_array_jit(ary, *, var_name="data", coords=None, dims=None):
    """Wrap an array in an ``xr.DataArray`` with "chain" and "draw" dimensions.

    Counterpart of ``numpy_to_data_array`` that builds default coordinates
    with the numba-compiled ``range_`` and names the trailing axes through
    ``generate_dims_coords_jit``.

    Parameters
    ----------
    ary : array-like
        Values to wrap; passed through ``np.atleast_2d``.
    var_name : str, keyword-only
        Forwarded to ``generate_dims_coords_jit`` for default dimension names.
    coords : dict, optional, keyword-only
        Coordinate values keyed by dimension name.
    dims : list, optional, keyword-only
        Names for the axes after the first two.

    Returns
    -------
    xr.DataArray

    Warns
    -----
    SyntaxWarning
        If the first axis is longer than the second.
    """
    default_dims = ["chain", "draw"]
    ary = np.atleast_2d(ary)
    n_chains, n_samples = ary.shape[:2]
    shape = list(ary.shape[2:])

    if n_chains > n_samples:
        message = (
            "More chains ({n_chains}) than draws ({n_samples}). "
            "Passed array should have shape (chains, draws, *shape)"
        ).format(n_chains=n_chains, n_samples=n_samples)
        warnings.warn(message, SyntaxWarning)

    dims, coords = generate_dims_coords_jit(
        shape, var_name, dims=dims, coords=coords, default_dims=default_dims
    )

    # Prepend "draw" first and "chain" second so that "chain" ends up leading.
    for name in ("draw", "chain"):
        if name not in dims:
            dims = [name] + dims

    # Supply integer coordinates for the default dimensions when absent.
    for name, length in (("chain", n_chains), ("draw", n_samples)):
        if name not in coords:
            coords[name] = range_(length)

    # Keep only the coordinates that belong to one of the dimensions.
    index_coords = {}
    for key in dims:
        index_coords[key] = xr.IndexVariable((key,), data=coords[key])
    return xr.DataArray(ary, coords=index_coords, dims=dims)

In [12]:
# Benchmark inputs for numpy_to_data_array / numpy_to_data_array_jit.
# NOTE(review): no random seed is set, so the values differ between runs.
data = np.random.randn(10000,100)  # 2-D, first axis longer than the second
linear = np.random.randn(1000000)  # 1-D
small = np.random.randn(100,100)  # 2-D, square

In [13]:
# NumPy-based conversion of `data`; its first axis (10000) exceeds the second
# (100), so the "More chains ... than draws" warning branch is taken.
%timeit numpy_to_data_array(data)

  if sys.path[0] == '':


360 µs ± 27.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
# Same input through the variant that builds coords with the numba range_.
%timeit numpy_to_data_array_jit(data)



355 µs ± 24 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# NumPy-based conversion of the 1-D input (made 2-D by np.atleast_2d inside).
%timeit numpy_to_data_array(linear)

2.31 ms ± 149 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# Same 1-D input through the variant that builds coords with the numba range_.
%timeit numpy_to_data_array_jit(linear)

2.2 ms ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# NumPy-based conversion of the small square input.
%timeit numpy_to_data_array(small)

314 µs ± 7.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
# Same small input through the variant that builds coords with the numba range_.
%timeit numpy_to_data_array_jit(small)

311 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [19]:
'Very Similar Performance. Up for reconsideration'

'Very Similar Performance. Up for reconsideration'

In [None]:
# Dict to dataset bottleneck ---> numpy_to_data_array

In [23]:
""""""""""""""""""""""""""""""""""""""""""""""Converters"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

'"Converters'

In [25]:
# Input for profiling convert_to_inference_data; rebinds `data` with a fresh
# random array of shape (10000, 100).
data = np.random.randn(10000,100)

In [26]:
# Line-profile convert_to_inference_data on `data` and print the per-line
# report; this rebinds the notebook-level names `lp` and `wrapper`.
lp = LineProfiler()
wrapper = lp(convert_to_inference_data)
wrapper(data)
lp.print_stats()

Timer unit: 1e-06 s

Total time: 0.006499 s
File: /home/banzee/Desktop/arviz/arviz/data/converters.py
Function: convert_to_inference_data at line 16

Line #      Hits         Time  Per Hit   % Time  Line Contents
    16                                           def convert_to_inference_data(obj, *, group="posterior", coords=None, dims=None, **kwargs):
    17                                               r"""Convert a supported object to an InferenceData object.
    18                                           
    19                                               This function sends `obj` to the right conversion function. It is idempotent,
    20                                               in that it will return arviz.InferenceData objects unchanged.
    21                                           
    22                                               Parameters
    23                                               ----------
    24                                               obj : d



In [None]:
# Bottleneck is dict to dataset. Refer above

In [28]:
""""""""""""""""""""""""""""""""""""""""""""""""""""DATASETS.PY"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

'"DATASETS.PY'

In [29]:
# Nothing to improve here

In [30]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""io_dict"""""""""""""""""""""""""""""""""""""""""""""""""""""""""'""'

'""io_dict""'

In [31]:
# Bottleneck is dict to dataset. Refer above

In [38]:
"""""""""""""""""""""""""""""""""""""""""io_netcdf"""""""""""""""""""""""""""""""""""""""""""""""""""""""""

'""io_netcdf'

In [39]:
# Bottlenecks ----> Inference data and convert_to_inference_data