Skip to content

Commit

Permalink
Handle invalid UTF-8 encoding in strings (#295)
Browse files Browse the repository at this point in the history
Log a warning and then retry with the replacement error handling
approach.
  • Loading branch information
adamreeve committed Jan 17, 2023
1 parent 9b6db0f commit 1c756a0
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 2 deletions.
31 changes: 31 additions & 0 deletions nptdms/test/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import date, datetime
import io
import numpy as np
import struct
import pytest

from nptdms import types
Expand Down Expand Up @@ -54,3 +55,33 @@ def test_timestamp_from_date():
read_datetime = types.TimeStamp.read(data_file)

assert expected_datetime == read_datetime.as_datetime64()


def test_invalid_utf8_string_read(caplog):
    """Read one length-prefixed string whose bytes are not valid UTF-8.

    The undecodable byte should come back as the Unicode replacement
    character, and a warning that includes the raw bytes should be logged.
    """
    # Four-byte little-endian length (3) followed by the three raw bytes;
    # 0xb0 on its own is not a valid UTF-8 sequence.
    length_prefix = struct.pack("<L", 3)
    stream = io.BytesIO(length_prefix + b'0 \xb0')

    decoded = types.String.read(stream)

    assert decoded == "0 �"
    captured_log = caplog.text
    assert "WARNING" in captured_log
    assert "0 \\xb0" in captured_log


def test_invalid_utf8_strings_read(caplog):
    """Read a batch of strings in which exactly one is not valid UTF-8.

    The valid strings must decode unchanged, the invalid byte must be
    replaced, and a warning that includes the raw bytes must be logged.
    """
    raw_values = [b'hello', b'0 \xb0', b'world']

    # Offset table written ahead of the data: one little-endian unsigned
    # 32-bit integer per string, each holding the running total of string
    # lengths up to and including that string.
    offset_table = b''.join(
        struct.pack("<L", sum(map(len, raw_values[:count])))
        for count in range(1, len(raw_values) + 1))
    stream = io.BytesIO(offset_table + b''.join(raw_values))

    decoded_values = types.String.read_values(stream, len(raw_values))

    assert decoded_values == ["hello", "0 �", "world"]
    captured_log = caplog.text
    assert "WARNING" in captured_log
    assert "0 \\xb0" in captured_log
18 changes: 16 additions & 2 deletions nptdms/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
import numpy as np
import struct
from nptdms.timestamp import TdmsTimestamp, TimestampArray
from nptdms.log import log_manager


log = log_manager.get_logger(__name__)


__all__ = [
Expand Down Expand Up @@ -205,7 +209,7 @@ def __init__(self, value):
def read(file, endianness="<"):
size_bytes = file.read(4)
size = _struct_unpack(endianness + 'L', size_bytes)[0]
return file.read(size).decode('utf-8')
return String._decode(file.read(size))

@classmethod
def read_values(cls, file, number_values, endianness="<"):
Expand All @@ -220,9 +224,19 @@ def read_values(cls, file, number_values, endianness="<"):
strings = []
for i in range(number_values):
s = file.read(offsets[i + 1] - offsets[i])
strings.append(s.decode('utf-8'))
strings.append(String._decode(s))
return strings

@staticmethod
def _decode(string_bytes):
    """Decode bytes as UTF-8, tolerating invalid byte sequences.

    A strict decode is attempted first. If it raises UnicodeDecodeError,
    a warning containing the raw bytes and the error is logged, and the
    bytes are decoded again with the 'replace' error handler.

    :param string_bytes: Raw bytes to decode.
    :return: The decoded string.
    """
    try:
        decoded = string_bytes.decode('utf-8')
    except UnicodeDecodeError as exc:
        # Warn rather than fail, then fall back to a lossy decode.
        log.warning(
            "Error decoding string from bytes %s, retrying with replace handler: %s",
            string_bytes, exc)
        decoded = string_bytes.decode('utf-8', errors='replace')
    return decoded


@tds_data_type(0x21, np.bool_)
class Boolean(StructType):
Expand Down

0 comments on commit 1c756a0

Please sign in to comment.