Skip to content

Commit

Permalink
Add performance benchmarks (#180)
Browse files Browse the repository at this point in the history
Adds benchmarks using the pytest-benchmark package. Benchmarking is disabled when running pytest normally; benchmarks can be run with `pytest --benchmark-enable --benchmark-only`.
  • Loading branch information
adamreeve committed Apr 9, 2020
1 parent 5ef9394 commit 53d7cbc
Show file tree
Hide file tree
Showing 5 changed files with 347 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*.swp
.tox
.hypothesis
.benchmark

# Wercker directories
_builds
Expand Down
316 changes: 316 additions & 0 deletions nptdms/test/test_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
import os
import numpy as np
import pytest

from nptdms import TdmsFile
from nptdms.test.util import (
GeneratedFile,
hexlify_value,
string_hexlify,
segment_objects_metadata,
channel_metadata,
channel_metadata_with_no_data,
channel_metadata_with_repeated_structure)
from nptdms.test.scenarios import TDS_TYPE_INT32


@pytest.mark.benchmark(group='read-all-data')
def test_read_contiguous_data(benchmark):
    """ Time a full read of a file containing several channels of contiguous data
    """
    tdms_file = benchmark(read_from_start, get_contiguous_file().get_bytes_io_file())

    # Each channel holds 10000 copies of its channel number
    for value, name in enumerate(
            ['channel1', 'channel2', 'channel3', 'channel4'], start=1):
        np.testing.assert_equal(
            tdms_file['group'][name][:], np.repeat([value], 10000))


@pytest.mark.benchmark(group='read-all-data')
def test_read_interleaved_data(benchmark):
    """ Time a full read of a file containing interleaved data
    """
    tdms_file = benchmark(read_from_start, get_interleaved_file().get_bytes_io_file())

    # Each channel holds 10000 copies of its channel number
    for value, name in enumerate(
            ['channel1', 'channel2', 'channel3', 'channel4'], start=1):
        np.testing.assert_equal(
            tdms_file['group'][name][:], np.repeat([value], 10000))


@pytest.mark.benchmark(group='read-all-channel')
def test_read_contiguous_data_channel(benchmark):
    """ Time reading all of one channel's data from a contiguous data file
    """
    source_file = get_contiguous_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        channel = tdms_file['group']['channel3']
        channel_data = benchmark(read_channel_data, channel)

    # channel3 contains 10000 copies of the value 3
    np.testing.assert_equal(channel_data, np.repeat([3], 10000))


@pytest.mark.benchmark(group='read-all-channel')
def test_read_interleaved_data_channel(benchmark):
    """ Time reading all of one channel's data from an interleaved data file
    """
    source_file = get_interleaved_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        channel = tdms_file['group']['channel3']
        channel_data = benchmark(read_channel_data, channel)

    # channel3 contains 10000 copies of the value 3
    np.testing.assert_equal(channel_data, np.repeat([3], 10000))


@pytest.mark.benchmark(group='read-all-channel')
def test_stream_contiguous_data_channel(benchmark):
    """ Time streaming a channel's data chunk by chunk from a contiguous data file
    """
    source_file = get_contiguous_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        chunks = benchmark(stream_chunks, tdms_file['group']['channel3'])

    # Concatenated chunks should equal the full channel data
    np.testing.assert_equal(np.concatenate(chunks), np.repeat([3], 10000))


@pytest.mark.benchmark(group='read-all-channel')
def test_stream_interleaved_data_channel(benchmark):
    """ Time streaming a channel's data chunk by chunk from an interleaved data file
    """
    source_file = get_interleaved_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        chunks = benchmark(stream_chunks, tdms_file['group']['channel3'])

    # Concatenated chunks should equal the full channel data
    np.testing.assert_equal(np.concatenate(chunks), np.repeat([3], 10000))


@pytest.mark.benchmark(group='slice-channel')
def test_slice_contiguous_data_channel(benchmark):
    """ Time reading a 1000 element slice of channel data from a contiguous data file
    """
    source_file = get_contiguous_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        channel = tdms_file['group']['channel3']
        sliced_data = benchmark(get_slice, channel, 5555, 6555)

    np.testing.assert_equal(sliced_data, np.repeat([3], 1000))


@pytest.mark.benchmark(group='slice-channel')
def test_slice_interleaved_data_channel(benchmark):
    """ Time reading a 1000 element slice of channel data from an interleaved data file
    """
    source_file = get_interleaved_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        channel = tdms_file['group']['channel3']
        sliced_data = benchmark(get_slice, channel, 5555, 6555)

    np.testing.assert_equal(sliced_data, np.repeat([3], 1000))


@pytest.mark.benchmark(group='read-all-channel')
def test_index_contiguous_data_channel(benchmark):
    """ Time reading data one value at a time using integer indices
        from a contiguous data file
    """
    source_file = get_contiguous_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        channel = tdms_file['group']['channel3']
        # Preallocate the output so the benchmark measures only index reads
        output = np.zeros(10000, dtype=channel.dtype)
        benchmark(index_values, channel, output)

    np.testing.assert_equal(output, np.repeat([3], 10000))


@pytest.mark.benchmark(group='read-all-channel')
def test_index_interleaved_data_channel(benchmark):
    """ Time reading data one value at a time using integer indices
        from an interleaved data file
    """
    source_file = get_interleaved_file().get_bytes_io_file()
    with TdmsFile.open(source_file) as tdms_file:
        channel = tdms_file['group']['channel3']
        # Preallocate the output so the benchmark measures only index reads
        output = np.zeros(10000, dtype=channel.dtype)
        benchmark(index_values, channel, output)

    np.testing.assert_equal(output, np.repeat([3], 10000))


@pytest.mark.benchmark(group='read-scaled-channel')
def test_stream_scaled_data_chunks(benchmark):
    """ Benchmark streaming channel data when the data is scaled
    """
    # Channel properties describing a single linear scale so that
    # scaled = 2.0 * raw + 10.0.  Each value is a (TDMS type id,
    # hex-encoded property bytes) pair consumed by GeneratedFile.
    properties = {
        "NI_Number_Of_Scales":
            (3, "01 00 00 00"),
        "NI_Scale[0]_Scale_Type":
            (0x20, hexlify_value("<I", len("Linear")) + string_hexlify("Linear")),
        "NI_Scale[0]_Linear_Slope":
            (10, hexlify_value("<d", 2.0)),
        "NI_Scale[0]_Linear_Y_Intercept":
            (10, hexlify_value("<d", 10.0))
    }
    test_file = GeneratedFile()
    data_array = np.arange(0, 1000, dtype=np.dtype('int32'))
    data = data_array.tobytes()
    # First segment carries the metadata (including the scaling properties)
    # plus one chunk of raw data
    test_file.add_segment(
        ("kTocMetaData", "kTocRawData", "kTocNewObjList"),
        segment_objects_metadata(
            channel_metadata("/'group'/'channel1'", TDS_TYPE_INT32, 100, properties),
        ),
        data, binary_data=True
    )
    # Nine further raw-data-only segments reuse the same metadata,
    # giving ten copies of data_array in total
    for _ in range(0, 9):
        test_file.add_segment(
            ("kTocRawData", ), "", data, binary_data=True)

    with TdmsFile.open(test_file.get_bytes_io_file()) as tdms_file:
        channel = tdms_file['group']['channel1']
        channel_data = benchmark(stream_chunks, channel)

    # Streamed chunks concatenated should equal the linearly scaled
    # data repeated once per segment
    channel_data = np.concatenate(channel_data)
    expected_data = np.tile(10.0 + 2.0 * data_array, 10)
    np.testing.assert_equal(channel_data, expected_data)


@pytest.mark.benchmark(group='read-metadata')
def test_complex_metadata_reading(benchmark):
    """ Benchmark reading metadata for a file with many channels and segments with alternating sets of objects
    """
    def channel_path(index):
        # Build the TDMS object path for a numbered channel in 'group'
        return "/'group'/'channel{0}'".format(index)

    test_file = GeneratedFile()
    data = np.array([0] * 5, dtype=np.dtype('int32')).tobytes()

    # Two lead segments each introduce five new channels with full metadata:
    # channels 0-4 first, then channels 5-9
    for introduced in (range(0, 5), range(5, 10)):
        test_file.add_segment(
            ("kTocMetaData", "kTocRawData", "kTocNewObjList"),
            segment_objects_metadata(
                *[channel_metadata(channel_path(i), TDS_TYPE_INT32, 1)
                  for i in introduced]),
            data, binary_data=True)

    # Nine pairs of segments then alternate which half of the channels has
    # data: in one segment channels 0-4 have no data while 5-9 repeat their
    # previously declared structure, and in the next the roles are swapped
    for _ in range(9):
        for first_half_empty in (True, False):
            objects = []
            for i in range(10):
                if (i < 5) == first_half_empty:
                    objects.append(channel_metadata_with_no_data(channel_path(i)))
                else:
                    objects.append(
                        channel_metadata_with_repeated_structure(channel_path(i)))
            test_file.add_segment(
                ("kTocMetaData", "kTocRawData", "kTocNewObjList"),
                segment_objects_metadata(*objects),
                data, binary_data=True)

    tdms_file = benchmark(read_metadata_from_start, test_file.get_bytes_io_file())

    # Each channel gets one value from its lead segment and one from each of
    # the nine segments where it repeats its structure
    assert len(tdms_file) == 1
    assert len(tdms_file['group']) == 10
    for channel_num in range(10):
        assert len(tdms_file['group']['channel{0}'.format(channel_num)]) == 10


def get_contiguous_file():
    """ Build a test file with four int32 channels of contiguous data.

    One metadata segment is followed by nine raw-data-only segments; each
    segment holds 100 values per channel repeated 10 times, so every channel
    ends up with 10000 values equal to its channel number.
    """
    test_file = GeneratedFile()
    chunk = np.repeat(np.array([1, 2, 3, 4], dtype=np.dtype('int32')), 100)
    segment_data = np.tile(chunk, 10).tobytes()
    metadata = segment_objects_metadata(
        *[channel_metadata(
            "/'group'/'channel{0}'".format(i), TDS_TYPE_INT32, 100)
          for i in range(1, 5)])
    test_file.add_segment(
        ("kTocMetaData", "kTocRawData", "kTocNewObjList"),
        metadata, segment_data, binary_data=True)
    for _ in range(9):
        test_file.add_segment(
            ("kTocRawData", ), "", segment_data, binary_data=True)
    return test_file


def get_interleaved_file():
    """ Build a test file with four int32 channels of interleaved data.

    One metadata segment is followed by nine raw-data-only segments, all
    flagged with kTocInterleavedData; every channel ends up with 10000
    values equal to its channel number.
    """
    test_file = GeneratedFile()
    segment_data = np.tile(
        np.array([1, 2, 3, 4], dtype=np.dtype('int32')), 1000).tobytes()
    metadata = segment_objects_metadata(
        *[channel_metadata(
            "/'group'/'channel{0}'".format(i), TDS_TYPE_INT32, 100)
          for i in range(1, 5)])
    test_file.add_segment(
        ("kTocMetaData", "kTocRawData", "kTocNewObjList", "kTocInterleavedData"),
        metadata, segment_data, binary_data=True)
    for _ in range(9):
        test_file.add_segment(
            ("kTocRawData", "kTocInterleavedData"), "", segment_data,
            binary_data=True)
    return test_file


def read_from_start(file):
    """ Seek to the beginning of a file object and read it fully as a TDMS file.

    This is the benchmarked callable: rewinding first means repeated
    benchmark iterations re-read the same in-memory file.

    :param file: A seekable binary file object containing TDMS data.
    :return: The TdmsFile read from the file.
    """
    file.seek(0, os.SEEK_SET)
    return TdmsFile.read(file)


def read_metadata_from_start(file):
    """ Seek to the beginning of a file object and read only the TDMS metadata.

    This is the benchmarked callable: rewinding first means repeated
    benchmark iterations re-read the same in-memory file.

    :param file: A seekable binary file object containing TDMS data.
    :return: The TdmsFile with metadata only.
    """
    file.seek(0, os.SEEK_SET)
    return TdmsFile.read_metadata(file)


def read_channel_data(chan):
    """ Read a channel's full data with a single slice.

    Kept as a separate function so the slice read is the only
    operation measured by the benchmark.
    """
    return chan[:]


def stream_chunks(chan):
    """ Stream all chunks of a channel's data and return them as a list.

    Each chunk is materialised with a full slice so the benchmark
    measures reading the streamed values, not just iterating chunk
    objects.
    """
    all_data = []
    for chunk in chan.data_chunks():
        all_data.append(chunk[:])
    return all_data


def get_slice(chan, start, stop):
    """ Read a [start:stop) slice of a channel's data.

    Kept as a separate function so the slice read is the only
    operation measured by the benchmark.
    """
    return chan[start:stop]


def index_values(chan, target):
    """ Read every channel value into a preallocated array via integer indexing.

    Deliberately reads one element at a time (rather than slicing) so the
    benchmark measures per-element index access; target must be at least
    len(chan) long.
    """
    for i in range(len(chan)):
        target[i] = chan[i]
41 changes: 24 additions & 17 deletions nptdms/test/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,24 +205,27 @@ class GeneratedFile(object):
def __init__(self):
self._content = []

def add_segment(self, toc, metadata, data, incomplete=False):
def add_segment(self, toc, metadata, data, incomplete=False, binary_data=False):
metadata_bytes = _hex_to_bytes(metadata)
data_bytes = _hex_to_bytes(data)
data_bytes = data if binary_data else _hex_to_bytes(data)
if toc is not None:
lead_in = b'TDSm'
toc_mask = long(0)
if "kTocMetaData" in toc:
toc_mask = toc_mask | long(1) << 1
if "kTocRawData" in toc:
toc_mask = toc_mask | long(1) << 3
if "kTocDAQmxRawData" in toc:
toc_mask = toc_mask | long(1) << 7
if "kTocInterleavedData" in toc:
toc_mask = toc_mask | long(1) << 5
if "kTocBigEndian" in toc:
toc_mask = toc_mask | long(1) << 6
if "kTocNewObjList" in toc:
toc_mask = toc_mask | long(1) << 2
for toc_item in toc:
if toc_item == "kTocMetaData":
toc_mask = toc_mask | long(1) << 1
elif toc_item == "kTocRawData":
toc_mask = toc_mask | long(1) << 3
elif toc_item == "kTocDAQmxRawData":
toc_mask = toc_mask | long(1) << 7
elif toc_item == "kTocInterleavedData":
toc_mask = toc_mask | long(1) << 5
elif toc_item == "kTocBigEndian":
toc_mask = toc_mask | long(1) << 6
elif toc_item == "kTocNewObjList":
toc_mask = toc_mask | long(1) << 2
else:
raise ValueError("Unrecognised TOC value: %s" % toc_item)
lead_in += struct.pack('<i', toc_mask)
lead_in += _hex_to_bytes("69 12 00 00")
next_segment_offset = len(metadata_bytes) + len(data_bytes)
Expand Down Expand Up @@ -266,6 +269,12 @@ def load(self, *args, **kwargs):
file.seek(0)
return tdms.TdmsFile(file, *args, **kwargs)

def get_bytes_io_file(self):
    """ Return the generated file contents as an in-memory BytesIO object,
        rewound to the start ready for reading.
    """
    file = BytesIO()
    file.write(self._get_contents())
    file.seek(0)
    return file

def _get_contents(self):
contents = b''
for segment in self._content:
Expand All @@ -287,9 +296,7 @@ def _get_index_contents(self):

class BytesIoTestFile(GeneratedFile):
def load(self, *args, **kwargs):
file = BytesIO()
file.write(self._get_contents())
file.seek(0)
file = self.get_bytes_io_file()
return tdms.TdmsFile(file, *args, **kwargs)


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def read_version():
],
install_requires = ['numpy'],
extras_require = {
'test': ['pytest>=3.1.0', 'hypothesis', 'mock<4.0;python_version<"3.4"'],
'test': ['pytest>=3.1.0', 'hypothesis', 'pytest-benchmark', 'mock<4.0;python_version<"3.4"'],
'pandas': ['pandas'],
'hdf': ['h5py>=2.10.0'],
'thermocouple_scaling': ['thermocouples_reference', 'scipy'],
Expand Down

0 comments on commit 53d7cbc

Please sign in to comment.