Skip to content

Commit

Permalink
Use tdms_index files if available when reading metadata (#178)
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve committed Apr 3, 2020
1 parent d5e2a0c commit 2d3feab
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 108 deletions.
140 changes: 121 additions & 19 deletions nptdms/reader.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
""" Lower level TDMS reader API that allows finer grained reading of data
"""

import logging
import os
import numpy as np
from nptdms.common import ObjectPath

from nptdms import types
from nptdms.common import ObjectPath, toc_properties
from nptdms.utils import Timer, OrderedDict
from nptdms.tdms_segment import read_segment_metadata
from nptdms.base_segment import RawChannelDataChunk
from nptdms.tdms_segment import ContiguousDataSegment, InterleavedDataSegment
from nptdms.daqmx import DaqmxSegment
from nptdms.log import log_manager


log = log_manager.get_logger(__name__)


Expand All @@ -27,6 +33,7 @@ def __init__(self, tdms_file):
self._prev_segment_objects = {}
self.object_metadata = OrderedDict()
self._file_path = None
self._index_file_path = None

self._segment_channel_offsets = None
self._segment_chunk_sizes = None
Expand All @@ -38,6 +45,9 @@ def __init__(self, tdms_file):
# Is path to a file
self._file = open(tdms_file, 'rb')
self._file_path = tdms_file
index_file_path = tdms_file + '_index'
if os.path.isfile(index_file_path):
self._index_file_path = index_file_path

def close(self):
if self._file_path is not None:
Expand All @@ -51,24 +61,38 @@ def read_metadata(self):
""" Read all metadata and structure information from a TdmsFile
"""

if self._index_file_path is not None:
reading_index_file = True
file = open(self._index_file_path, 'rb')
else:
reading_index_file = False
file = self._file

self._segments = []
with Timer(log, "Read metadata"):
# Read metadata first to work out how much space we need
previous_segment = None
while True:
try:
segment = read_segment_metadata(
self._file, self._prev_segment_objects, previous_segment)
except EOFError:
# We've finished reading the file
break

self._update_object_metadata(segment)
self._update_object_properties(segment)
self._segments.append(segment)
previous_segment = segment

self._file.seek(segment.next_segment_pos)
segment_position = 0
try:
with Timer(log, "Read metadata"):
# Read metadata first to work out how much space we need
previous_segment = None
while True:
try:
segment = self._read_segment_metadata(
file, segment_position, previous_segment, reading_index_file)
except EOFError:
# We've finished reading the file
break

self._update_object_metadata(segment)
self._update_object_properties(segment)
self._segments.append(segment)
previous_segment = segment

segment_position = segment.next_segment_pos
if not reading_index_file:
file.seek(segment.next_segment_pos)
finally:
if reading_index_file:
file.close()

def read_raw_data(self):
""" Read raw data from all segments, chunk by chunk
Expand Down Expand Up @@ -177,6 +201,84 @@ def read_channel_chunk_for_index(self, channel_path, index):
chunk_offset = segment_start_index + chunk_index * chunk_size
return chunk_data, chunk_offset

def _read_segment_metadata(
self, file, segment_position, previous_segment=None, is_index_file=False):
(position, toc_mask, endianness, data_position, raw_data_offset,
next_segment_offset, next_segment_pos) = self._read_lead_in(file, segment_position, is_index_file)

segment_args = (
position, toc_mask, endianness, next_segment_offset,
next_segment_pos, raw_data_offset, data_position)
if toc_mask & toc_properties['kTocDAQmxRawData']:
segment = DaqmxSegment(*segment_args)
elif toc_mask & toc_properties['kTocInterleavedData']:
segment = InterleavedDataSegment(*segment_args)
else:
segment = ContiguousDataSegment(*segment_args)

segment.read_segment_objects(
file, self._prev_segment_objects, previous_segment)
return segment

    def _read_lead_in(self, file, segment_position, is_index_file=False):
        """ Read the 28 byte lead-in at the start of a segment

        :param file: File object to read from (data file or index file).
        :param segment_position: Position of this segment in the data file,
            used to compute absolute data and next-segment positions.
        :param is_index_file: Whether file is a tdms_index file, which uses
            the 'TDSh' tag instead of 'TDSm'.
        :return: Tuple of (segment_position, toc_mask, endianness,
            data_position, raw_data_offset, next_segment_offset,
            next_segment_pos).
        :raises EOFError: When there are no more segments to read.
        :raises ValueError: When the segment tag is not the expected value.
        """
        expected_tag = b'TDSh' if is_index_file else b'TDSm'
        tag = file.read(4)
        if tag == b'':
            # Nothing left to read, so we've reached the end of the file
            raise EOFError
        if tag != expected_tag:
            raise ValueError(
                "Segment does not start with %r, but with %r" % (expected_tag, tag))

        log.debug("Reading segment at %d", segment_position)

        # Next four bytes are table of contents mask
        toc_mask = types.Int32.read(file)

        if log.isEnabledFor(logging.DEBUG):
            for prop_name, prop_mask in toc_properties.items():
                prop_is_set = (toc_mask & prop_mask) != 0
                log.debug("Property %s is %s", prop_name, prop_is_set)

        # The big endian flag applies to all remaining values in the lead-in
        # and to the segment's metadata and raw data
        endianness = '>' if (toc_mask & toc_properties['kTocBigEndian']) else '<'

        # Next four bytes are version number
        version = types.Int32.read(file, endianness)
        if version not in (4712, 4713):
            log.warning("Unrecognised version number.")

        # Now 8 bytes each for the offset values.
        # Offsets are relative to the end of the lead-in.
        next_segment_offset = types.Uint64.read(file, endianness)
        raw_data_offset = types.Uint64.read(file, endianness)

        # Calculate data and next segment position
        lead_size = 7 * 4
        data_position = segment_position + lead_size + raw_data_offset
        if next_segment_offset == 0xFFFFFFFFFFFFFFFF:
            # Segment size is unknown. This can happen if LabVIEW crashes.
            # Try to read until the end of the file.
            log.warning(
                "Last segment of file has unknown size, "
                "will attempt to read to the end of the file")
            next_segment_pos = self._get_data_file_size()
            next_segment_offset = next_segment_pos - segment_position - lead_size
        else:
            log.debug("Next segment offset = %d, raw data offset = %d",
                      next_segment_offset, raw_data_offset)
            log.debug("Data size = %d b",
                      next_segment_offset - raw_data_offset)
            next_segment_pos = (
                segment_position + next_segment_offset + lead_size)

        return (segment_position, toc_mask, endianness, data_position, raw_data_offset,
                next_segment_offset, next_segment_pos)

def _get_data_file_size(self):
current_pos = self._file.tell()
self._file.seek(0, os.SEEK_END)
end_pos = self._file.tell()
self._file.seek(current_pos, os.SEEK_SET)
return end_pos

def _update_object_metadata(self, segment):
""" Update object metadata using the metadata read from a single segment
"""
Expand Down
84 changes: 0 additions & 84 deletions nptdms/tdms_segment.py
Original file line number Diff line number Diff line change
@@ -1,104 +1,20 @@
import logging
import os
import numpy as np

from nptdms import types
from nptdms.common import toc_properties
from nptdms.base_segment import (
BaseSegment,
BaseSegmentObject,
RawChannelDataChunk,
RawDataChunk,
read_interleaved_segment_bytes,
fromfile)
from nptdms.daqmx import DaqmxSegment
from nptdms.log import log_manager


log = log_manager.get_logger(__name__)


def read_segment_metadata(file, previous_segment_objects, previous_segment=None):
    """ Read the lead-in and object metadata for the next segment in a file

    :param file: Open TDMS file positioned at the start of a segment.
    :param previous_segment_objects: Map from object path to the most
        recently read segment object for that path.
    :param previous_segment: The previously read segment, if any.
    :return: The segment object with its metadata populated.
    """
    lead_in = read_lead_in(file)
    (position, toc_mask, endianness, data_position, raw_data_offset,
     next_segment_offset, next_segment_pos) = lead_in

    # The ToC flags determine how raw data in this segment is laid out
    if toc_mask & toc_properties['kTocDAQmxRawData']:
        segment_type = DaqmxSegment
    elif toc_mask & toc_properties['kTocInterleavedData']:
        segment_type = InterleavedDataSegment
    else:
        segment_type = ContiguousDataSegment

    segment = segment_type(
        position, toc_mask, endianness, next_segment_offset,
        next_segment_pos, raw_data_offset, data_position)
    segment.read_segment_objects(
        file, previous_segment_objects, previous_segment)
    return segment


def read_lead_in(file):
    """ Read the 28 byte lead-in at the start of a segment

    :param file: Open TDMS file positioned at the start of a segment.
    :return: Tuple of (position, toc_mask, endianness, data_position,
        raw_data_offset, next_segment_offset, next_segment_pos).
    :raises EOFError: When there are no more segments to read.
    :raises ValueError: When the segment does not start with the TDSm tag.
    """
    position = file.tell()
    # First four bytes should be TDSm.
    # Compare raw bytes rather than decoding as utf-8: arbitrary binary
    # data can fail to decode, and the end-of-file check must come first.
    tag = file.read(4)
    if tag == b'':
        raise EOFError
    if tag != b'TDSm':
        raise ValueError(
            "Segment does not start with TDSm, but with %s" % tag)

    log.debug("Reading segment at %d", position)

    # Next four bytes are table of contents mask
    toc_mask = types.Int32.read(file)

    if log.isEnabledFor(logging.DEBUG):
        for prop_name, prop_mask in toc_properties.items():
            prop_is_set = (toc_mask & prop_mask) != 0
            log.debug("Property %s is %s", prop_name, prop_is_set)

    # The big endian flag applies to the rest of the lead-in and the
    # segment's metadata and raw data
    endianness = '>' if (toc_mask & toc_properties['kTocBigEndian']) else '<'

    # Next four bytes are version number
    version = types.Int32.read(file, endianness)
    if version not in (4712, 4713):
        log.warning("Unrecognised version number.")

    # Now 8 bytes each for the offset values.
    # Offsets are relative to the end of the lead-in.
    next_segment_offset = types.Uint64.read(file, endianness)
    raw_data_offset = types.Uint64.read(file, endianness)

    # Calculate data and next segment position
    lead_size = 7 * 4
    data_position = position + lead_size + raw_data_offset
    if next_segment_offset == 0xFFFFFFFFFFFFFFFF:
        # Segment size is unknown. This can happen if LabVIEW crashes.
        # Try to read until the end of the file.
        log.warning(
            "Last segment of file has unknown size, "
            "will attempt to read to the end of the file")
        current_pos = file.tell()
        file.seek(0, os.SEEK_END)
        end_pos = file.tell()
        file.seek(current_pos, os.SEEK_SET)

        next_segment_pos = end_pos
        next_segment_offset = end_pos - position - lead_size
    else:
        log.debug("Next segment offset = %d, raw data offset = %d",
                  next_segment_offset, raw_data_offset)
        log.debug("Data size = %d b",
                  next_segment_offset - raw_data_offset)
        next_segment_pos = (
            position + next_segment_offset + lead_size)

    return (position, toc_mask, endianness, data_position, raw_data_offset,
            next_segment_offset, next_segment_pos)


class InterleavedDataSegment(BaseSegment):
""" A TDMS segment with interleaved data
"""
Expand Down
12 changes: 12 additions & 0 deletions nptdms/test/test_tdms_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,18 @@ def test_access_data_property_after_opening_throws():
assert "Channel data has not been read" in str(exc_info.value)


@pytest.mark.parametrize("test_file,expected_data", scenarios.get_scenarios())
def test_read_with_index_file(test_file, expected_data):
    """ Test reading a file with an associated tdms_index file
    """
    with test_file.get_tempfile_with_index() as tdms_file_path:
        tdms_file = TdmsFile.read(tdms_file_path)

    # Every channel's data should match the expected scenario data
    for (channel_key, expected_values) in expected_data.items():
        (group_name, channel_name) = channel_key
        channel = tdms_file[group_name][channel_name]
        compare_arrays(channel.data, expected_values)


@pytest.mark.filterwarnings('ignore:.* is deprecated')
def test_get_objects():
"""Test reading data"""
Expand Down
46 changes: 41 additions & 5 deletions nptdms/test/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
"""

import binascii
from contextlib import contextmanager
import os
from io import BytesIO
import struct
import tempfile
Expand Down Expand Up @@ -201,7 +203,7 @@ class GeneratedFile(object):
"""Generate a TDMS file for testing"""

def __init__(self):
self._content = b''
self._content = []

def add_segment(self, toc, metadata, data, incomplete=False):
metadata_bytes = _hex_to_bytes(metadata)
Expand Down Expand Up @@ -232,27 +234,61 @@ def add_segment(self, toc, metadata, data, incomplete=False):
lead_in += struct.pack('<Q', raw_data_offset)
else:
lead_in = b''
self._content += lead_in + metadata_bytes + data_bytes
self._content.append((lead_in, metadata_bytes, data_bytes))

def get_tempfile(self, **kwargs):
named_file = tempfile.NamedTemporaryFile(suffix=".tdms", **kwargs)
file = named_file.file
file.write(self._content)
file.write(self._get_contents())
file.seek(0)
return named_file

@contextmanager
def get_tempfile_with_index(self):
directory = tempfile.mkdtemp()
tdms_path = os.path.join(directory, 'test_file.tdms')
tdms_index_path = os.path.join(directory, 'test_file.tdms_index')
with open(tdms_path, 'wb') as file:
file.write(self._get_contents())
with open(tdms_index_path, 'wb') as file:
file.write(self._get_index_contents())
try:
yield tdms_path
finally:
os.unlink(tdms_path)
os.unlink(tdms_index_path)
os.rmdir(directory)

def load(self, *args, **kwargs):
with tempfile.NamedTemporaryFile(suffix=".tdms") as named_file:
file = named_file.file
file.write(self._content)
file.write(self._get_contents())
file.seek(0)
return tdms.TdmsFile(file, *args, **kwargs)

def _get_contents(self):
contents = b''
for segment in self._content:
contents += segment[0]
contents += segment[1]
contents += segment[2]
return contents

def _get_index_contents(self):
contents = b''
for segment in self._content:
lead_in = segment[0]
if len(lead_in) >= 4:
lead_in = b'TDSh' + lead_in[4:]
contents += lead_in
contents += segment[1]
return contents


class BytesIoTestFile(GeneratedFile):
def load(self, *args, **kwargs):
file = BytesIO()
file.write(self._content)
file.write(self._get_contents())
file.seek(0)
return tdms.TdmsFile(file, *args, **kwargs)

Expand Down

0 comments on commit 2d3feab

Please sign in to comment.