Skip to content

Commit

Permalink
Seek end behavior (#21) (#22)
Browse files Browse the repository at this point in the history
* IdzipReader: support seeking from SEEK_END.


* fix EOF calculation when reading



---------

Co-authored-by: Benjamin Moody <benjaminmoody@gmail.com>
  • Loading branch information
bauman and Benjamin Moody committed Feb 15, 2023
1 parent 6ea5e2c commit 2a26db0
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 7 deletions.
14 changes: 11 additions & 3 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ jobs:
os: [ubuntu-latest, windows-latest]
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11' ]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand All @@ -26,4 +26,12 @@ jobs:
- name: Test Testable Components
run: |
# All other tests depend upon fixture data not stored with the repository
cd test && pytest -l -s -v test_high_level_api.py --cov-report term
echo "generating a data file sufficient to extend past 1 member"
curl -o test/data/sample.txt http://textfiles.com/stories/bureau.txt
seq 30000 | xargs -P1 -n1 -I@ cat test/data/sample.txt >> test/data/large.txt
python idzip/command.py test/data/large.txt
cd test
pytest -l -s -v test_high_level_api.py --cov-report term
cd ..
pytest -l -s -v test/test_seek_read_behavior.py --cov-report term
18 changes: 15 additions & 3 deletions idzip/decompressor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

import os
from math import inf
import struct
import zlib
import itertools
Expand Down Expand Up @@ -99,8 +100,16 @@ def read(self, size=-1):
chunk_index += 1

except EOFError:
# The read data will be returned.
pass
# PR#16/18 - support identifying EOF
# use a read() as a sync from desired position to actual position
# read(0) can be used as a synchronization call
dec_eof_position = self._members[-1].start_pos + self._members[-1].isize
self._pos = dec_eof_position
if prefixed_buffer:
# subtracting the data in the EOF Case so the normal path will add it back
# before the function return to avoid changing the path
# adding up lengths rather than concatenating here to avoid creating new buffers
self._pos -= sum([len(x) for x in prefixed_buffer]) - prefix_size
prefixed_buffer = b"".join(prefixed_buffer)
result = prefixed_buffer[prefix_size:]
self._pos += len(result)
Expand Down Expand Up @@ -230,7 +239,10 @@ def seek(self, offset, whence=os.SEEK_SET):
elif whence == os.SEEK_CUR:
new_pos = self._pos + offset
elif whence == os.SEEK_END:
raise ValueError("Seek from the end not supported")
member = self._select_member(inf)
new_pos = member.start_pos + member.isize
if offset < 0: # gzip will not seek past the end of the file
new_pos += offset # idzip must not seek past the end of the file
else:
raise ValueError("Unknown whence: %r" % whence)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
from setuptools import setup, find_packages

VERSION = "0.3.8"
VERSION = "0.3.9"

setup(
name = "python-idzip",
Expand Down
85 changes: 85 additions & 0 deletions test/test_seek_read_behavior.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import idzip
import gzip
import io
from time import time


def test_seeking(report_time=False):
    """Check that idzip seek/read/tell behaviour matches gzip on one file.

    Both readers open the same ``test/data/large.txt.dz`` fixture and are
    driven through the same sequence of calls; after every step the reported
    positions (and, where data is read, the bytes) must be identical.

    Scenarios covered:
      1. The fixture is expected to span more than one member, so the
         multi-member path is exercised.
      2. Seek timings for both readers are printed when requested; the
         timings are informational only and are never asserted on.
      3. Seeking to the end (offset 0 from SEEK_END).
      4. Seeking to before the end (negative offset from SEEK_END).
      5. Seeking to after the end (positive offset from SEEK_END).

    :param report_time: when true, print per-step timings and the final
        positions of both readers.
    """
    # Function-scope import so this block does not depend on the module's
    # import list; closing() only needs the readers to provide close().
    from contextlib import closing

    path = "test/data/large.txt.dz"
    with closing(idzip.open(path)) as f, closing(gzip.open(path)) as g:
        # --- seek to EOF ---------------------------------------------------
        f_s_s = time()  # file_seek_start time
        f_pos = f.seek(0, io.SEEK_END)
        f_s_e = time()  # file_seek_end time
        if report_time:
            print(f"idzip seek time: {f_s_e - f_s_s}")
        g_s_s = time()
        g_pos = g.seek(0, io.SEEK_END)
        g_s_e = time()
        if report_time:
            print(f"gzip seek time: {g_s_e - g_s_s}")
        # gzip finds the exact EOF position during any seek;
        # idzip is only expected to do so when seeking with SEEK_END.
        assert g_pos == f_pos

        # --- zero-length read as a position sync ---------------------------
        f_r_s = time()  # file_read_start
        f.read(0)
        f_r_e = time()  # file_read_end
        if report_time:
            print(f"idzip read time: {f_r_e - f_r_s}")
        g_r_s = time()
        g.read(0)
        g_r_e = time()
        if report_time:
            print(f"gzip read time: {g_r_e - g_r_s}")

        # the file position should now be synchronized
        f_pos = f.tell()
        g_pos = g.tell()
        if report_time:
            print(f"idzip END: {f_pos}")
            print(f"gzip END: {g_pos}")
        assert f_pos == g_pos

        # --- seek to before EOF --------------------------------------------
        g_pos = g.seek(-50, io.SEEK_END)
        f_pos = f.seek(-50, io.SEEK_END)
        assert f_pos == g_pos

        # read past EOF, data should match and position should match
        g_data = g.read(200)
        f_data = f.read(200)
        assert g_data == f_data

        g_pos = g.tell()
        f_pos = f.tell()
        assert g_pos == f_pos

        # Seek back again. The positions returned by *these* seeks must be
        # captured, otherwise the assertion only re-checks the stale values
        # from the tell() calls above.
        g_pos = g.seek(-50, io.SEEK_END)
        f_pos = f.seek(-50, io.SEEK_END)
        assert f_pos == g_pos

        # unbounded read to EOF must leave both readers at the same position
        _ = g.read()
        _ = f.read()
        g_pos = g.tell()
        f_pos = f.tell()
        assert g_pos == f_pos

        # --- seek to after EOF ---------------------------------------------
        g_pos = g.seek(50, io.SEEK_END)
        f_pos = f.seek(50, io.SEEK_END)
        assert f_pos == g_pos
        _ = g.read()
        _ = f.read()
        g_pos = g.tell()
        f_pos = f.tell()
        assert g_pos == f_pos


# Allow running this file directly as a script; in that mode the per-step
# timings and final positions are printed (report_time=True).
if __name__ == "__main__":
    test_seeking(report_time=True)

0 comments on commit 2a26db0

Please sign in to comment.