In [9]:
import collections
from collections import Iterable, ByteString
from contextlib import ExitStack, AbstractContextManager
from enum import Enum
import glob
from io import BytesIO
from itertools import groupby, islice
import os

import py7zlib

def namedtuple_with_defaults(typename, field_names, defaults=()):
    """Create a namedtuple class whose fields are all optional.

    Every field defaults to ``None`` unless overridden through ``defaults``.

    Args:
        typename: name of the generated tuple class.
        field_names: field names, in any form accepted by
            ``collections.namedtuple`` (sequence or space/comma separated string).
        defaults: either a mapping ``{field_name: default}`` or a sequence of
            defaults assigned to the leading fields in order.

    Returns:
        The new namedtuple class.

    Raises:
        TypeError: if ``defaults`` names an unknown field or contains more
            values than there are fields.
    """
    # The ABC aliases in `collections` (e.g. `collections.Mapping`) were removed
    # in Python 3.10; `collections.abc` is where they actually live.
    from collections.abc import Mapping

    T = collections.namedtuple(typename, field_names)
    # First make every field optional, so that a partial set of defaults can be
    # expanded into a full prototype instance.
    T.__new__.__defaults__ = (None,) * len(T._fields)
    if isinstance(defaults, Mapping):
        prototype = T(**defaults)
    else:
        prototype = T(*defaults)
    # The prototype now holds one default per field, in field order.
    T.__new__.__defaults__ = tuple(prototype)
    return T

# Within this notebook `namedtuple` refers to the defaults-aware factory
# `namedtuple_with_defaults`, not to `collections.namedtuple`.
namedtuple = namedtuple_with_defaults

class ABCModality(Enum):
    """Modalities an ABC archive can hold.

    Each value is the short token used for the ``{modality}`` part of an
    archive file name (see ``ABC_7Z_FILEMASK``, e.g. ``abc_0000_feat_v00.7z``).
    """
    META = 'meta'
    PARA = 'para'
    STEP = 'step'
    STL2 = 'stl2'
    OBJ = 'obj'
    FEAT = 'feat'
    STAT = 'stat'



# Modality tokens as plain strings, in ABCModality declaration order.
ALL_ABC_MODALITIES = [modality.value for modality in ABCModality]
# Naming pattern of a .7z archive on disk; each archive holds one chunk of one modality.
ABC_7Z_FILEMASK = 'abc_{chunk}_{modality}_v{version}.7z'  # abc_0000_feat_v00.7z
# Naming pattern of a member inside an archive. Note that `modalityex` is a longer
# name than the archive-level modality token (e.g. 'features' inside a 'feat' archive).
ABC_INAR_FILEMASK = '{dirname}/{dirname}_{hash}_{modalityex}_{number}.{ext}'  # '00000002/00000002_1ffb81a71e5b402e966b9341_features_001.yml'


# def _compose_filemask(chunks, modalities, version):

#     chunk_mask = '*'
#     if None is not chunks:
#         assert isinstance(chunks, Iterable)
#         assert all(isinstance(s, ByteString) for s in chunks)

#         chunk_mask = '[' + '|'.join(chunks) + ']'

#     modalities

#     version

#     return ABC_7Z_FILEMASK.format(
#         chunk=chunk_mask, modality='x', version=0,
#     )


def _extract_modality(filename):
    name = os.path.basename(filename)
    name, ext = os.path.splitext(name)
    abc, chunk, modality, version = name.split('_')
    return modality


def _extract_inar_id(pathname):
    # assume name matches ABC_INAR_FILEMASK
    name = os.path.basename(pathname)
    name, ext = os.path.splitext(name)
    dirname, hash, modalityex, number = name.split('_')
    return '_'.join([dirname, hash, number])


# ABCItem is the primary data instance in ABC. It records where the item came
# from (`pathname` of the archive, `archive_pathname` of the member inside it,
# and the derived `item_id`) and has one field per modality. A modality field
# holds an in-memory byte stream supporting the file-like interface
# (e.g. `.read()`); fields that are not supplied default to None.
ABCItem = namedtuple(
    'ABCItem',
    'pathname archive_pathname item_id ' + ' '.join(ALL_ABC_MODALITIES))


class ABC7ZFile(Iterable, AbstractContextManager):
    """Reader for a single ABC ``.7z`` archive holding one modality.

    The modality is taken from the archive file name. The archive itself is
    only opened on entering the ``with`` block and is closed on leaving it::

        with ABC7ZFile(path) as archive:
            first = archive[0]
            for item in archive[10:20]:
                ...

    Iteration and indexing yield ``ABCItem`` tuples in which only the field
    named after this archive's modality is populated (with a ``BytesIO``).
    """

    def __init__(self, filename):
        """Remember ``filename`` and validate the modality encoded in it.

        Raises:
            ValueError: if the base name does not have the expected number of
                underscore-separated parts (raised by ``_extract_modality``).
            AssertionError: if the modality token is not a known one.
        """
        self.filename = filename
        self.modality = _extract_modality(filename)
        assert self.modality in ALL_ABC_MODALITIES, 'unknown modality: "{}"'.format(self.modality)
        self._reset_handles()

    def _reset_handles(self):
        """Put the object into the 'closed' state (no handles, no name list)."""
        self.file_handle = None
        self.archive_handle = None
        self._names_list = None

    def _open(self):
        """Open the file and the archive, and cache the member names."""
        self.file_handle = open(self.filename, 'rb')
        try:
            self.archive_handle = py7zlib.Archive7z(self.file_handle)
            self._names_list = self.archive_handle.getnames()
        except Exception:
            # __exit__ is never called when __enter__ fails, so release the
            # already opened file here instead of leaking it.
            self._close()
            raise

    def _close(self):
        """Close the underlying file, if any; safe to call more than once."""
        if self.file_handle is not None:
            self.file_handle.close()
        self._reset_handles()

    def __enter__(self):
        self._open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so exceptions raised inside the `with` body propagate.
        self._close()

    def _get_item_by_name(self, name):
        """Read the member ``name`` fully into memory and wrap it in an ABCItem."""
        bytes_io = BytesIO(self.archive_handle.getmember(name).read())
        item_id = _extract_inar_id(name)
        return ABCItem(self.filename, name, item_id, **{self.modality: bytes_io})

    def __iter__(self):
        """Yield one ABCItem per archive member, in archive order.

        The archive must be open (checked when iteration starts).
        """
        assert self._isopen(), 'archive is not open; use ABC7ZFile as a context manager'
        for name in self._names_list:
            yield self._get_item_by_name(name)

    def _isopen(self):
        """Return True when the file, the archive and the name list are all set."""
        return (self.file_handle is not None and
                self.archive_handle is not None and
                self._names_list is not None)

    def __getitem__(self, key):
        """Return the item at an integer index, or a lazy generator for a slice.

        Integer indexes and slices follow list semantics over the member names,
        including negative values. The generator returned for a slice reads
        members on demand, so it must be consumed while the archive is open.

        Raises:
            AssertionError: if the archive is not open.
            IndexError: if an integer index is out of range.
            TypeError: if ``key`` is neither an int nor a slice.
        """
        assert self._isopen(), 'archive is not open; use ABC7ZFile as a context manager'
        if isinstance(key, int):
            return self._get_item_by_name(self._names_list[key])
        if isinstance(key, slice):
            # Slice the name list eagerly (cheap), read the members lazily.
            names = self._names_list[key]
            return (self._get_item_by_name(name) for name in names)
        raise TypeError(
            'ABC7ZFile indices must be integers or slices, not {}'.format(type(key).__name__))



# class ABCChunk(Iterable):
#     """A chunk is a collection of files (with different modalities),
#     that iterates over all files simultaneously."""

#     def __init__(self, filenames):
#         self.filenames = filenames
#         self.file_handles = []

#     def __iter__(self):
#         with ExitStack() as stack:
#             self.file_handles = [stack.enter_context(ABC7ZFile(filename))
#                                  for filename in self.filenames]


#         for filename, handle in zip(self.filenames, self.file_handles):


#         item = ABCItem()

#         yield item



# class ABCData(Iterable):
#     def __init__(self, data_dir, modalities=ALL_ABC_MODALITIES, chunks=None,
#                  shape_representation='trimesh', version='00'):
#         self.data_dir = data_dir
#         self.modalities = modalities
#         self.chunks = chunks
#         self.version = version
#         self.shape_representation = shape_representation

#         filemask = _compose_filemask(self.chunks, self.modalities, self.version)

#         self.data_files = glob.glob(os.path.join(self.data_dir, filemask))

#         chunk_getter = lambda s: ABC_7Z_FILEMASK.

#         self.data_files = {chunk: chunk_files
#                            for chunk, chunk_files in groupby(self.data_files, key=chunk_getter)}

#     def __iter__(self):
#         for chunk in self.chunks:
#             filenames_by_chunk = self.data_files[chunk]
#             chunk = ABCChunk(**filenames_by_chunk)
#             for item in chunk:
#                 yield item


In [10]:
# Smoke test: open one `obj` archive and print how many members it contains.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# re-running this notebook elsewhere.
with ABC7ZFile('/home/artonson/tmp/abc/abc_0000_obj_v00.7z') as abc_7z_file:
#     x = abc_7z_file[0:3]
    
    print(len(abc_7z_file.archive_handle.getnames()))

7168


In [12]:
from joblib import Parallel, delayed

In [25]:
def read_slice(filename, slice_params):
    """Read a contiguous range of items from an `obj` archive and report sizes.

    Args:
        filename: path of the .7z archive to open.
        slice_params: ``(start, end)`` pair applied as ``archive[start:end]``.

    For every item in the range, prints the archive path, the item's index and
    the byte length of its ``obj`` payload. The function has no ``yield`` or
    ``return``: it is not a generator and always returns None.
    """
    first, last = slice_params
    with ABC7ZFile(filename) as archive:
        for offset, entry in enumerate(archive[first:last]):
            payload = entry.obj.getvalue()
            print(filename, first + offset, len(payload))
            # Items are deliberately not yielded here (a `yield entry` would
            # turn this function into a generator).

In [20]:
# NOTE(review): this cell's execution count (In [20]) is lower than that of the
# `read_slice` definition shown in this notebook (In [25]). That definition has no
# `yield`/`return`, so it returns None and `list(None)` raises TypeError; the
# recorded output presumably came from an earlier generator version of
# `read_slice` -- confirm before relying on a Restart & Run All.
x = list(read_slice('/home/artonson/tmp/abc/abc_0000_obj_v00.7z', (0, 2)))

/home/artonson/tmp/abc/abc_0000_obj_v00.7z 0 4523731
/home/artonson/tmp/abc/abc_0000_obj_v00.7z 1 4678073


In [22]:
# Display the first collected item; in the recorded output only the `obj`
# modality field is populated, all other modality fields are None.
x[0]

ABCItem(pathname='/home/artonson/tmp/abc/abc_0000_obj_v00.7z', archive_pathname='00000002/00000002_1ffb81a71e5b402e966b9341_trimesh_001.obj', item_id='00000002_1ffb81a71e5b402e966b9341_001', meta=None, para=None, step=None, stl2=None, obj=<_io.BytesIO object at 0x7fdd7c333b48>, feat=None, stat=None)

In [26]:
# Partition the first NUM_SLICES * SLICE_SIZE archive members into consecutive,
# non-overlapping [start, end) index ranges, one range per parallel task.
SLICE_SIZE = 100
NUM_SLICES = 70

slice_start = list(range(0, NUM_SLICES * SLICE_SIZE, SLICE_SIZE))
# `read_slice` applies each pair as `archive[start:end]`, where `end` is
# exclusive. Each end must therefore equal the next start (100, 200, ...);
# ends of 99, 199, ... would silently skip members 99, 199, ...
slice_end = [start + SLICE_SIZE for start in slice_start]
# Materialise the pairs: a bare `zip` is exhausted after one pass, so re-running
# the cell that consumes it would submit no work.
slice_params = list(zip(slice_start, slice_end))

filename = '/home/artonson/tmp/abc/abc_0000_obj_v00.7z'

In [27]:
# Run one `read_slice` task per (start, end) pair using joblib; every task opens
# the archive on its own.
# NOTE(review): the worker count (40) is hard-coded, and the cell's displayed
# value is the list of task return values -- the recorded output shows one
# `None` per task, so this run is only useful for its timing log.
parallel = Parallel(n_jobs=40, verbose=10)
delayed_read_slice = delayed(read_slice)

parallel(
    delayed_read_slice(filename, sp) for sp in slice_params
)

[Parallel(n_jobs=40)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done   7 out of  70 | elapsed:  7.4min remaining: 66.9min
[Parallel(n_jobs=40)]: Done  15 out of  70 | elapsed: 10.4min remaining: 38.3min
[Parallel(n_jobs=40)]: Done  23 out of  70 | elapsed: 12.8min remaining: 26.1min
[Parallel(n_jobs=40)]: Done  31 out of  70 | elapsed: 13.3min remaining: 16.7min
[Parallel(n_jobs=40)]: Done  39 out of  70 | elapsed: 15.5min remaining: 12.3min
[Parallel(n_jobs=40)]: Done  47 out of  70 | elapsed: 17.0min remaining:  8.3min
[Parallel(n_jobs=40)]: Done  55 out of  70 | elapsed: 18.6min remaining:  5.1min
[Parallel(n_jobs=40)]: Done  63 out of  70 | elapsed: 19.8min remaining:  2.2min
[Parallel(n_jobs=40)]: Done  70 out of  70 | elapsed: 21.2min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]