**Conclusion:** 
* In wall time, writing the whole block of data at once is always slower than writing the data in a loop, time step by time step
* This is also true when setting the chunksize to [1, 900, 900] (which also seems to be the default when no chunksize is specified...)
* This is also true when not using compression (which speeds things up considerably)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import xarray as xr
import radolan_to_netcdf as rtn
import numpy as np
import netCDF4

In [3]:
# Scratch file that the benchmark cells below create and write to.
fn = '/tmp/test.nc'
# 100 time steps of 900 x 900 standard-normal float samples (unseeded, so the
# values differ between runs).
# NOTE(review): random noise may compress differently from real rainfall
# fields, so the compressed timings might not be representative — confirm.
test_data = np.random.randn(100, 900, 900)

# Write time step by time step

In [4]:
# Create the NetCDF file at `fn` for product 'RW' ahead of the timed loop write.
# NOTE(review): no product_config_dict is passed, so the variable layout
# presumably comes from the package's built-in 'RW' settings — confirm.
rtn.create_empty_netcdf(fn=fn, product_name='RW')

In [5]:
%%time

# Timed: open the file in append mode and assign one slice per iteration, so
# index i along the first axis of test_data goes to index i of
# 'rainfall_amount'.
with netCDF4.Dataset(fn, 'a') as nc_fh: 
    for i in range(test_data.shape[0]):
        nc_fh['rainfall_amount'][i, :, :] = test_data[i, :, :]

CPU times: user 35.9 s, sys: 394 ms, total: 36.3 s
Wall time: 9.23 s


# Write whole block of data at once

In [6]:
# Call create_empty_netcdf again on the same `fn` before the whole-block write.
# NOTE(review): presumably this replaces the file filled by the loop above so
# both timings start from an empty file — confirm.
rtn.create_empty_netcdf(fn=fn, product_name='RW')

In [7]:
%%time

# Timed: assign all 100 time steps to 'rainfall_amount' in a single sliced
# assignment instead of looping over the first axis.
with netCDF4.Dataset(fn, 'a') as nc_fh: 
    nc_fh['rainfall_amount'][0:100, :, :] = test_data[0:100, :, :]

CPU times: user 10.5 s, sys: 1.19 s, total: 11.7 s
Wall time: 9.84 s


In [8]:
ls -lh /tmp/test.nc

-rw-r--r--  1 chwala-c  wheel    79M  7 Mai 09:42 /tmp/test.nc


# Check whether it is faster with a defined chunksize

In [9]:
# Per-time-stamp metadata variables; each one uses only the 'time' dimension.
metadata_per_timestamp = {
    'maxrange': {
        'variable_parameters': {
            'datatype': 'i2',
            # The trailing comma makes this a 1-tuple; ('time') without it is
            # just the string 'time'.
            'dimensions': ('time',),
        },
        'attributes': {
            'units': 'km',
        },
    },
    'radarlocations': {
        'variable_parameters': {
            'datatype': str,
            'dimensions': ('time',),
        },
        'attributes': {
            'long_name': 'List of radar locations available at time stamp'
        },
    },
}

# Product configuration with an explicit chunk shape: one full 900 x 900 grid
# per time step, zlib compression enabled at level 5.
# NOTE(review): presumably 'variable_parameters' is forwarded to netCDF4's
# createVariable by rtn.create_empty_netcdf — confirm against the package.
config = {
    'variables': {
        'rainfall_amount': {
            'variable_parameters': {
                'datatype': 'i2',
                'dimensions': ('time', 'y', 'x'),
                'chunksizes': [1, 900, 900],
                'fill_value': -9999,
                'zlib': True,
                'complevel': 5,
            },
            'attributes': {
                'long_name': 'Hourly rainfall',
                'standard_name': 'rainfall_amount',
                'units': 'kg',
                'scale_factor': 0.1,
                'add_offset': 0,
                'coordinates': 'longitudes latitudes',
                'grid_mapping': 'RADOLAN_grid',
            },
        },
    },
    'metadata_per_timestamp': metadata_per_timestamp,
    'metadata_fixed': {
        'n_lats': 900,
        'n_lons': 900,
    }
}

In [10]:
# Create the file at `fn` again, this time passing the explicit `config`
# (chunksizes [1, 900, 900], zlib level 5) as product_config_dict.
rtn.create_empty_netcdf(fn=fn, product_config_dict=config, product_name='RW')

In [11]:
%%time

# Timed: per-time-step loop write into the file created with the explicit
# chunksize configuration.
with netCDF4.Dataset(fn, 'a') as nc_fh: 
    for i in range(test_data.shape[0]):
        nc_fh['rainfall_amount'][i, :, :] = test_data[i, :, :]

CPU times: user 36.9 s, sys: 451 ms, total: 37.3 s
Wall time: 9.64 s


In [12]:
ls -lh /tmp/test.nc

-rw-r--r--  1 chwala-c  wheel    79M  7 Mai 09:42 /tmp/test.nc


In [13]:
# Call create_empty_netcdf with the same explicit-chunksize `config` before
# the whole-block write.
# NOTE(review): presumably this replaces the file filled by the loop — confirm.
rtn.create_empty_netcdf(fn=fn, product_config_dict=config, product_name='RW')

In [14]:
%%time

# Timed: single sliced assignment of all 100 time steps into the file created
# with the explicit chunksize configuration.
with netCDF4.Dataset(fn, 'a') as nc_fh: 
    nc_fh['rainfall_amount'][0:100, :, :] = test_data[0:100, :, :]  

CPU times: user 10.6 s, sys: 1.09 s, total: 11.7 s
Wall time: 9.91 s


In [15]:
ls -lh /tmp/test.nc

-rw-r--r--  1 chwala-c  wheel    79M  7 Mai 09:43 /tmp/test.nc


# Try without compression

In [16]:
# Per-time-stamp metadata variables; each one uses only the 'time' dimension.
metadata_per_timestamp = {
    'maxrange': {
        'variable_parameters': {
            'datatype': 'i2',
            # The trailing comma makes this a 1-tuple; ('time') without it is
            # just the string 'time'.
            'dimensions': ('time',),
        },
        'attributes': {
            'units': 'km',
        },
    },
    'radarlocations': {
        'variable_parameters': {
            'datatype': str,
            'dimensions': ('time',),
        },
        'attributes': {
            'long_name': 'List of radar locations available at time stamp'
        },
    },
}

# Product configuration with compression switched off (zlib False,
# complevel 0) and no explicit 'chunksizes' entry.
# NOTE(review): presumably 'variable_parameters' is forwarded to netCDF4's
# createVariable by rtn.create_empty_netcdf — confirm against the package.
config = {
    'variables': {
        'rainfall_amount': {
            'variable_parameters': {
                'datatype': 'i2',
                'dimensions': ('time', 'y', 'x'),
                'fill_value': -9999,
                'zlib': False,
                'complevel': 0,
            },
            'attributes': {
                'long_name': 'Hourly rainfall',
                'standard_name': 'rainfall_amount',
                'units': 'kg',
                'scale_factor': 0.1,
                'add_offset': 0,
                'coordinates': 'longitudes latitudes',
                'grid_mapping': 'RADOLAN_grid',
            },
        },
    },
    'metadata_per_timestamp': metadata_per_timestamp,
    'metadata_fixed': {
        'n_lats': 900,
        'n_lons': 900,
    }
}

In [17]:
# Create the file at `fn` with the uncompressed `config` (zlib False) ahead
# of the timed loop write.
rtn.create_empty_netcdf(fn=fn, product_config_dict=config, product_name='RW')

In [18]:
%%time

# Timed: per-time-step loop write into the file created without compression.
with netCDF4.Dataset(fn, 'a') as nc_fh: 
    for i in range(test_data.shape[0]):
        nc_fh['rainfall_amount'][i, :, :] = test_data[i, :, :]

CPU times: user 2.44 s, sys: 255 ms, total: 2.69 s
Wall time: 700 ms


In [19]:
ls -lh /tmp/test.nc

-rw-r--r--  1 chwala-c  wheel   167M  7 Mai 09:43 /tmp/test.nc


In [20]:
# Call create_empty_netcdf with the same uncompressed `config` before the
# whole-block write.
# NOTE(review): presumably this replaces the file filled by the loop — confirm.
rtn.create_empty_netcdf(fn=fn, product_config_dict=config, product_name='RW')

In [21]:
%%time

# Timed: single sliced assignment of all 100 time steps into the file created
# without compression.
with netCDF4.Dataset(fn, 'a') as nc_fh: 
    nc_fh['rainfall_amount'][0:100, :, :] = test_data[0:100, :, :]  

CPU times: user 1.74 s, sys: 1.05 s, total: 2.78 s
Wall time: 1.05 s


# Trying different cache sizes

C documentation

https://www.unidata.ucar.edu/software/netcdf/docs/group__variables.html#ga2788cbfc6880ec70c304292af2bc7546

Python documentation

http://unidata.github.io/netcdf4-python/netCDF4/index.html#netCDF4.Variable.set_var_chunk_cache

In [22]:
# Per-time-stamp metadata variables; each one uses only the 'time' dimension.
metadata_per_timestamp = {
    'maxrange': {
        'variable_parameters': {
            'datatype': 'i2',
            # The trailing comma makes this a 1-tuple; ('time') without it is
            # just the string 'time'.
            'dimensions': ('time',),
        },
        'attributes': {
            'units': 'km',
        },
    },
    'radarlocations': {
        'variable_parameters': {
            'datatype': str,
            'dimensions': ('time',),
        },
        'attributes': {
            'long_name': 'List of radar locations available at time stamp'
        },
    },
}

# Product configuration for the chunk-cache experiment: explicit chunk shape
# of one full 900 x 900 grid per time step, zlib compression at level 5.
# NOTE(review): presumably 'variable_parameters' is forwarded to netCDF4's
# createVariable by rtn.create_empty_netcdf — confirm against the package.
config = {
    'variables': {
        'rainfall_amount': {
            'variable_parameters': {
                'datatype': 'i2',
                'dimensions': ('time', 'y', 'x'),
                'chunksizes': [1, 900, 900],
                'fill_value': -9999,
                'zlib': True,
                'complevel': 5,
            },
            'attributes': {
                'long_name': 'Hourly rainfall',
                'standard_name': 'rainfall_amount',
                'units': 'kg',
                'scale_factor': 0.1,
                'add_offset': 0,
                'coordinates': 'longitudes latitudes',
                'grid_mapping': 'RADOLAN_grid',
            },
        },
    },
    'metadata_per_timestamp': metadata_per_timestamp,
    'metadata_fixed': {
        'n_lats': 900,
        'n_lons': 900,
    }
}

# Create the file at `fn` from `config` ahead of the timed chunk-cache
# experiment in the next cell.
rtn.create_empty_netcdf(fn=fn, product_config_dict=config, product_name='RW')

In [23]:
%%time

with netCDF4.Dataset(fn, 'a') as nc_fh: 
    nc_fh['rainfall_amount'].set_var_chunk_cache(size=100e6, nelems=1, preemption=1)
    nc_fh['rainfall_amount'][0:100, :, :] = test_data[0:100, :, :]  

CPU times: user 10.6 s, sys: 1.04 s, total: 11.6 s
Wall time: 9.83 s


I tried many different combinations of `size`, `nelems` and `preemption`, but the effect was marginal.