Various small performance optimisations (#233)
Small optimisations to improve metadata read performance and reduce memory use
adamreeve committed Mar 15, 2021
1 parent 69710c3 commit a8079b7
Showing 5 changed files with 135 additions and 127 deletions.
7 changes: 3 additions & 4 deletions nptdms/base_segment.py
@@ -13,17 +13,16 @@ class BaseSegmentObject(object):
 
     __slots__ = [
         'path', 'number_values', 'data_size',
-        'has_data', 'data_type', 'endianness']
+        'has_data', 'data_type']
 
-    def __init__(self, path, endianness):
+    def __init__(self, path):
         self.path = path
         self.number_values = 0
         self.data_size = 0
         self.has_data = False
         self.data_type = None
-        self.endianness = endianness
 
-    def read_raw_data_index(self, file, raw_data_index_header):
+    def read_raw_data_index(self, file, raw_data_index_header, endianness):
         """ Read the raw data index for a single object in a segment
         """
         raise NotImplementedError("Segment metadata reading must be implemented in base classes")
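
The net effect of this file's change: endianness is no longer stored on every segment object but is passed into read_raw_data_index instead, so each instance carries one less slot. A minimal sketch of the memory effect (hypothetical class names, not code from the commit; exact sizes vary across CPython versions):

    import sys

    class WithEndianness(object):
        __slots__ = ['path', 'number_values', 'data_size',
                     'has_data', 'data_type', 'endianness']

    class WithoutEndianness(object):
        __slots__ = ['path', 'number_values', 'data_size',
                     'has_data', 'data_type']

    # Each __slots__ entry reserves one pointer per instance, so dropping
    # a slot saves about 8 bytes per object on a 64-bit build.
    print(sys.getsizeof(WithEndianness()))     # e.g. 64 bytes (varies by version)
    print(sys.getsizeof(WithoutEndianness()))  # e.g. 56 bytes

The saving is tiny per object, but metadata reading creates one such object per channel per segment, so it compounds on files with many segments.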
56 changes: 29 additions & 27 deletions nptdms/daqmx.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 import numpy as np
+import struct
 
 from nptdms import types
 from nptdms.base_segment import (
@@ -8,6 +9,7 @@
 
 
 log = log_manager.get_logger(__name__)
+_struct_unpack = struct.unpack
 
 FORMAT_CHANGING_SCALER = 0x00001269
 DIGITAL_LINE_SCALER = 0x0000126A
@@ -106,11 +108,11 @@ class DaqmxSegmentObject(BaseSegmentObject):
 
     __slots__ = ['daqmx_metadata']
 
-    def __init__(self, path, endianness):
-        super(DaqmxSegmentObject, self).__init__(path, endianness)
+    def __init__(self, path):
+        super(DaqmxSegmentObject, self).__init__(path)
         self.daqmx_metadata = None
 
-    def read_raw_data_index(self, f, raw_data_index_header):
+    def read_raw_data_index(self, f, raw_data_index_header, endianness):
         if raw_data_index_header not in (FORMAT_CHANGING_SCALER, DIGITAL_LINE_SCALER):
             raise ValueError(
                 "Unexpected raw data index for DAQmx data: 0x%08X" %
@@ -122,13 +124,13 @@ def read_raw_data_index(self, f, raw_data_index_header):
         # has 0x00001369, which appears to be incorrect
 
         # Read the data type
-        data_type_val = types.Uint32.read(f, self.endianness)
+        data_type_val = types.Uint32.read(f, endianness)
         try:
             self.data_type = types.tds_data_types[data_type_val]
         except KeyError:
             raise KeyError("Unrecognised data type: %s" % data_type_val)
 
-        daqmx_metadata = DaqMxMetadata(f, self.endianness, raw_data_index_header, self.data_type)
+        daqmx_metadata = DaqMxMetadata(f, endianness, raw_data_index_header, self.data_type)
         log.debug("DAQmx metadata: %r", daqmx_metadata)
 
         # DAQmx format has special chunking
@@ -149,8 +151,6 @@ class DaqMxMetadata(object):
     """
 
     __slots__ = [
-        'scaler_type',
-        'dimension',
         'chunk_size',
         'raw_data_widths',
         'scalers',
@@ -161,15 +161,15 @@ def __init__(self, f, endianness, scaler_type, channel_data_type):
         Read the metadata for a DAQmx raw segment. This is the raw
         DAQmx-specific portion of the raw data index.
         """
-        self.scaler_type = scaler_type
-        self.dimension = types.Uint32.read(f, endianness)
+        metadata_bytes = f.read(16)
+        (dimension,
+         self.chunk_size,
+         scaler_vector_length) = _struct_unpack(endianness + 'LQL', metadata_bytes)
 
         # In TDMS format version 2.0, 1 is the only valid value for dimension
-        if self.dimension != 1:
+        if dimension != 1:
             raise ValueError("Data dimension is not 1")
-        self.chunk_size = types.Uint64.read(f, endianness)
 
-        # size of vector of format changing scalers
-        scaler_vector_length = types.Uint32.read(f, endianness)
         scaler_class = _scaler_classes[scaler_type]
         self.scalers = [
             scaler_class(f, endianness)
@@ -217,14 +217,15 @@ class DaqMxScaler(object):
     ]
 
     def __init__(self, open_file, endianness):
-        data_type_code = types.Uint32.read(open_file, endianness)
-        self.data_type = DAQMX_TYPES[data_type_code]
+        scaler_bytes = open_file.read(20)
 
-        # more info for format changing scaler
-        self.raw_buffer_index = types.Uint32.read(open_file, endianness)
-        self.raw_byte_offset = types.Uint32.read(open_file, endianness)
-        self.sample_format_bitmap = types.Uint32.read(open_file, endianness)
-        self.scale_id = types.Uint32.read(open_file, endianness)
+        (data_type_code,
+         self.raw_buffer_index,
+         self.raw_byte_offset,
+         self.sample_format_bitmap,
+         self.scale_id) = _struct_unpack(endianness + 'LLLLL', scaler_bytes)
+
+        self.data_type = DAQMX_TYPES[data_type_code]
 
     def byte_offset(self):
         return self.raw_byte_offset
@@ -254,14 +255,15 @@ class DigitalLineScaler(object):
     ]
 
     def __init__(self, open_file, endianness):
-        data_type_code = types.Uint32.read(open_file, endianness)
-        self.data_type = DAQMX_TYPES[data_type_code]
+        scaler_bytes = open_file.read(17)
 
+        (data_type_code,
+         self.raw_buffer_index,
+         self.raw_bit_offset,
+         self.sample_format_bitmap,
+         self.scale_id) = _struct_unpack(endianness + 'LLLBL', scaler_bytes)
+
-        # more info for digital line scaler
-        self.raw_buffer_index = types.Uint32.read(open_file, endianness)
-        self.raw_bit_offset = types.Uint32.read(open_file, endianness)
-        self.sample_format_bitmap = types.Uint8.read(open_file, endianness)
-        self.scale_id = types.Uint32.read(open_file, endianness)
+        self.data_type = DAQMX_TYPES[data_type_code]
 
     def byte_offset(self):
         return self.raw_bit_offset // 8
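
Both scaler constructors now issue one buffered read and a single multi-field struct.unpack instead of five types.Uint32.read-style calls, and the module-level _struct_unpack alias skips repeating the struct.unpack attribute lookup. A rough standalone benchmark of the idea (a sketch with made-up field values, not code from the commit):

    import io
    import struct
    import timeit

    # Five unsigned 32-bit fields per record, as in the format changing scaler
    record = struct.pack('<LLLLL', 2, 16, 0, 1, 4096)

    def per_field(f):
        # One read + one unpack per field, like types.Uint32.read
        return [struct.unpack('<L', f.read(4))[0] for _ in range(5)]

    _struct_unpack = struct.unpack  # local alias, as in the commit

    def batched(f):
        # One read + one unpack for all five fields
        return _struct_unpack('<LLLLL', f.read(20))

    print(timeit.timeit(lambda: per_field(io.BytesIO(record)), number=100_000))
    print(timeit.timeit(lambda: batched(io.BytesIO(record)), number=100_000))

The batched version avoids four extra Python-level calls and four small file reads per scaler, which adds up when a file contains many scalers.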
47 changes: 22 additions & 25 deletions nptdms/reader.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import numpy as np
+import struct
 
 from nptdms import types
 from nptdms.common import ObjectPath, toc_properties
@@ -14,6 +15,7 @@
 
 
 log = log_manager.get_logger(__name__)
+_struct_unpack = struct.unpack
 
 
 class TdmsReader(object):
@@ -82,21 +84,20 @@ def read_metadata(self, require_segment_indexes=False):
             while True:
                 start_position = file.tell()
                 try:
-                    segment = self._read_segment_metadata(
+                    segment, properties = self._read_segment_metadata(
                         file, segment_position, index_cache, previous_segment, reading_index_file)
                 except EOFError:
                     # We've finished reading the file
                     break
 
                 self._update_object_metadata(segment)
-                self._update_object_properties(segment)
+                self._update_object_properties(properties)
                 self._segments.append(segment)
                 previous_segment = segment
 
                 segment_position = segment.next_segment_pos
                 if reading_index_file:
-                    lead_size = 7 * 4
-                    file.seek(start_position + lead_size + segment.raw_data_offset, os.SEEK_SET)
+                    file.seek(start_position + segment.data_position - segment.position, os.SEEK_SET)
                 else:
                     file.seek(segment.next_segment_pos, os.SEEK_SET)
         finally:
@@ -228,29 +229,30 @@ def read_channel_chunk_for_index(self, channel_path, index):
         return chunk_data, chunk_offset
 
     def _read_segment_metadata(
-            self, file, segment_position, index_cache, previous_segment=None, is_index_file=False):
-        (position, toc_mask, endianness, data_position, raw_data_offset,
-         next_segment_offset, next_segment_pos) = self._read_lead_in(file, segment_position, is_index_file)
+            self, file, segment_position, index_cache, previous_segment, is_index_file):
+        (position, toc_mask, data_position, next_segment_pos) = self._read_lead_in(
+            file, segment_position, is_index_file)
 
         segment = TdmsSegment(
-            position, toc_mask, endianness, next_segment_offset,
-            next_segment_pos, raw_data_offset, data_position)
+            position, toc_mask, next_segment_pos, data_position)
 
-        segment.read_segment_objects(
+        properties = segment.read_segment_objects(
             file, self._prev_segment_objects, index_cache, previous_segment)
-        return segment
+        return segment, properties
 
     def _read_lead_in(self, file, segment_position, is_index_file=False):
+        lead_in_bytes = file.read(28)
+
         expected_tag = b'TDSh' if is_index_file else b'TDSm'
-        tag = file.read(4)
+        tag = lead_in_bytes[:4]
         if tag == b'':
             raise EOFError
         if tag != expected_tag:
             raise ValueError(
                 "Segment does not start with %r, but with %r" % (expected_tag, tag))
 
         # Next four bytes are table of contents mask
-        toc_mask = types.Int32.read(file)
+        toc_mask = _struct_unpack('<l', lead_in_bytes[4:8])[0]
 
         if log.isEnabledFor(logging.DEBUG):
             log.debug("Reading segment at %d", segment_position)
@@ -260,15 +262,12 @@ def _read_lead_in(self, file, segment_position, is_index_file=False):
 
         endianness = '>' if (toc_mask & toc_properties['kTocBigEndian']) else '<'
 
-        # Next four bytes are version number
-        version = types.Int32.read(file, endianness)
+        # Next four bytes are version number, then 8 bytes each for the offset values
+        (version, next_segment_offset, raw_data_offset) = _struct_unpack(endianness + 'lQQ', lead_in_bytes[8:28])
 
         if version not in (4712, 4713):
             log.warning("Unrecognised version number.")
 
-        # Now 8 bytes each for the offset values
-        next_segment_offset = types.Uint64.read(file, endianness)
-        raw_data_offset = types.Uint64.read(file, endianness)
-
         # Calculate data and next segment position
         lead_size = 7 * 4
         data_position = segment_position + lead_size + raw_data_offset
@@ -279,15 +278,13 @@ def _read_lead_in(self, file, segment_position, is_index_file=False):
                 "Last segment of file has unknown size, "
                 "will attempt to read to the end of the file")
             next_segment_pos = self._get_data_file_size()
-            next_segment_offset = next_segment_pos - segment_position - lead_size
         else:
             log.debug("Next segment offset = %d, raw data offset = %d, data size = %d b",
                       next_segment_offset, raw_data_offset, next_segment_offset - raw_data_offset)
             next_segment_pos = (
                 segment_position + next_segment_offset + lead_size)
 
-        return (segment_position, toc_mask, endianness, data_position, raw_data_offset,
-                next_segment_offset, next_segment_pos)
+        return segment_position, toc_mask, data_position, next_segment_pos
 
     def _verify_segment_start(self, segment):
         """ When reading data for a segment, check for the TDSm tag at the start of the segment in an attempt
@@ -323,11 +320,11 @@ def _update_object_metadata(self, segment):
             if segment_object.scaler_data_types is not None:
                 _update_object_scaler_data_types(path, object_metadata, segment_object)
 
-    def _update_object_properties(self, segment):
+    def _update_object_properties(self, segment_object_properties):
         """ Update object properties using any properties in a segment
         """
-        if segment.object_properties is not None:
-            for path, properties in segment.object_properties.items():
+        if segment_object_properties is not None:
+            for path, properties in segment_object_properties.items():
                 object_metadata = self._get_or_create_object(path)
                 for prop, val in properties:
                     object_metadata.properties[prop] = val
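
_read_lead_in now fetches the whole 28-byte lead-in with a single read and parses it by slicing, and it returns computed positions rather than raw offsets, which is what lets read_metadata above seek straight past an index segment's metadata. A self-contained sketch of parsing such a buffer (a hypothetical helper, not the library's API; assumes the standard ToC flag layout where kTocBigEndian is bit 6):

    import struct

    def parse_lead_in(lead_in_bytes):
        # Bytes 0-3: tag (b'TDSm' for data files, b'TDSh' for index files)
        tag = lead_in_bytes[:4]
        if tag not in (b'TDSm', b'TDSh'):
            raise ValueError("Unexpected lead-in tag: %r" % tag)
        # Bytes 4-7: ToC mask, always little-endian; bit 6 flags big-endian data
        toc_mask = struct.unpack('<l', lead_in_bytes[4:8])[0]
        endianness = '>' if toc_mask & (1 << 6) else '<'
        # Bytes 8-27: version (int32) and two uint64 offsets, in one unpack
        version, next_segment_offset, raw_data_offset = struct.unpack(
            endianness + 'lQQ', lead_in_bytes[8:28])
        return tag, toc_mask, version, next_segment_offset, raw_data_offset

For example, parse_lead_in(f.read(28)) recovers everything needed to locate a segment's metadata, raw data, and the following segment.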
