activeloopai · farizrahman4u · Apr 6, 2022 · Feb 9, 2022 · Feb 10, 2022 · Feb 10, 2022
diff --git a/hub/api/read.py b/hub/api/read.py
@@ -46,6 +46,7 @@ def read(
         Image: "bmp", "dib", "gif", "ico", "jpeg", "jpeg2000", "pcx", "png", "ppm", "sgi", "tga", "tiff", "webp", "wmf", "xbm"
         Audio: "flac", "mp3", "wav"
         Video: "mp4", "mkv", "avi"
+        Dicom: "dcm"
 
     Args:
         path (str): Path to a supported file.

diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py
@@ -862,6 +862,7 @@ def test_compressions_list():
         "apng",
         "avi",
         "bmp",
+        "dcm",
         "dib",
         "flac",
         "gif",
@@ -892,6 +893,7 @@ def test_htypes_list():
         "bbox",
         "binary_mask",
         "class_label",
+        "dicom",
         "generic",
         "image",
         "json",

diff --git a/hub/api/tests/test_dicom.py b/hub/api/tests/test_dicom.py
@@ -0,0 +1,37 @@
+import hub
+import pytest
+from pydicom.data import get_testdata_file
+from pydicom import dcmread
+
+
+def test_dicom_basic(memory_ds):
+    """Appends MR_small.dcm twice to a dicom tensor; checks dtype, shape, sample_info."""
+    dcm_path = get_testdata_file("MR_small.dcm")
+    with memory_ds:
+        memory_ds.create_tensor("x", htype="dicom")
+        sample = hub.read(dcm_path)
+        assert sample.dtype == "int16"
+        assert sample.shape == (64, 64, 1)
+        for _ in range(2):  # the same sample is stored twice
+            memory_ds.x.append(sample)
+    assert memory_ds.x.dtype == "int16"
+    stacked = memory_ds.x.numpy()
+    assert stacked.dtype == "int16"
+    assert stacked.shape == (2, 64, 64, 1)
+    # Each element whose value is not raw bytes must appear in the first sample's info.
+    for elem in (e for e in dcmread(dcm_path) if not isinstance(e.value, bytes)):
+        assert elem.keyword in memory_ds.x[0].sample_info
+
+
+def test_dicom_mixed_dtype(memory_ds):
+    """Appends an int16 file and then a uint8 file; the sliced read-back must be int16."""
+    cases = (("MR_small.dcm", "int16"), ("ExplVR_BigEnd.dcm", "uint8"))
+    with memory_ds:
+        memory_ds.create_tensor("x", htype="dicom")
+        for fname, expected_dtype in cases:
+            sample = hub.read(get_testdata_file(fname))
+            assert sample.dtype == expected_dtype
+            memory_ds.x.append(sample)
+    # Every sample is sliced to at most 10x10x1 before the conversion to numpy.
+    cropped = memory_ds.x[:, :10, :10, :1].numpy()
+    assert cropped.dtype == "int16"
diff --git a/hub/api/tests/test_sample_info.py b/hub/api/tests/test_sample_info.py
@@ -1,12 +1,11 @@
-from miniaudio import mp3_get_file_info  # type: ignore
 from PIL import Image  # type: ignore
 from PIL.ExifTags import TAGS  # type: ignore
-
-import hub
-import pytest
+from miniaudio import mp3_get_file_info  # type: ignore
 import numpy as np
+import pytest
 import os
 import sys
+import hub
 
 
 def get_exif_helper(path):

diff --git a/hub/compression.py b/hub/compression.py
@@ -67,7 +67,9 @@
     c for c in IMAGE_COMPRESSIONS if c.upper() in Image.SAVE and c.upper() in Image.OPEN
 ]
 
+
 IMAGE_COMPRESSIONS.insert(0, "apng")
+IMAGE_COMPRESSIONS.insert(2, "dcm")
 
 SUPPORTED_COMPRESSIONS = [
     *BYTE_COMPRESSIONS,

diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py
@@ -978,7 +978,7 @@ def _numpy(
         length = self.num_samples
         last_shape = None
         enc = self.chunk_id_encoder
-
+        htype = self.tensor_meta.htype
         if use_data_cache and self.is_data_cachable:
             samples = self.numpy_from_data_cache(index, length, aslist)
         else:
@@ -999,9 +999,9 @@ def _numpy(
                         )[tuple(entry.value for entry in index.values[2:])]
                     else:
                         chunk = self.get_chunk_from_chunk_id(chunk_ids[0])
-                        sample = chunk.read_sample(local_sample_index)[
-                            tuple(entry.value for entry in index.values[1:])
-                        ]
+                        sample = chunk.read_sample(
+                            local_sample_index, cast=htype != "dicom"
+                        )[tuple(entry.value for entry in index.values[1:])]
                 elif len(index.values) == 1:
                     # Tiled sample, all chunks required
                     chunks = self.get_chunks_for_sample(global_sample_index)

diff --git a/hub/core/compression.py b/hub/core/compression.py
@@ -304,6 +304,8 @@ def decompress_array(
 
     if compression == "apng":
         return _decompress_apng(buffer)  # type: ignore
+    if compression == "dcm":
+        return _decompress_dicom(buffer)  # type: ignore
     try:
         if shape is not None and 0 in shape:
             return np.zeros(shape, dtype=dtype)
@@ -420,6 +422,8 @@ def verify_compressed_file(
         elif compression in ("mp4", "mkv", "avi"):
             if isinstance(file, (bytes, memoryview, str)):
                 return _read_video_shape(file), "|u1"  # type: ignore
+        elif compression == "dcm":
+            return _read_dicom_shape_and_dtype(file)
         else:
             return _fast_decompress(file)
     except Exception as e:
@@ -434,10 +438,11 @@ def verify_compressed_file(
 def get_compression(header=None, path=None):
     if path:
         # These formats are recognized by file extension for now
-        file_formats = ["mp3", "flac", "wav", "mp4", "mkv", "avi"]
+        file_formats = [".mp3", ".flac", ".wav", ".mp4", ".mkv", ".avi", ".dcm"]
+        path = str(path).lower()
         for fmt in file_formats:
-            if str(path).lower().endswith("." + fmt):
-                return fmt
+            if path.endswith(fmt):
+                return fmt[1:]
     if header:
         if not Image.OPEN:
             Image.init()
@@ -606,6 +611,8 @@ def read_meta_from_compressed_file(
                 shape, typestr = _read_png_shape_and_dtype(f)
             except Exception:
                 raise CorruptedSampleError("png")
+        elif compression == "dcm":
+            shape, typestr = _read_dicom_shape_and_dtype(f)
         elif get_compression_type(compression) == AUDIO_COMPRESSION:
             try:
                 shape, typestr = _read_audio_shape(file, compression), "<f4"
@@ -696,6 +703,41 @@ def _read_jpeg_shape_from_buffer(buf: bytes) -> Tuple[int, ...]:
     return shape
 
 
+def _read_dicom_shape_and_dtype(
+    f: Union[bytes, BinaryIO]
+) -> Tuple[Tuple[int, ...], str]:
+    """Reads the (rows, columns, channels) shape and dtype string of a dicom file."""
+    try:
+        from pydicom import dcmread
+        from pydicom.pixel_data_handlers.util import pixel_dtype
+    except ImportError:
+        raise ModuleNotFoundError(
+            "Pydicom not found. Install using `pip install pydicom`"
+        )
+    stream = f if hasattr(f, "read") else BytesIO(f)  # type: ignore
+    dataset = dcmread(stream)
+    samples = dataset[0x0028, 0x0002].value  # channel count, looked up by tag
+    dims = (dataset.Rows, dataset.Columns, samples)
+    has_float = any(kw in dataset for kw in ("FloatPixelData", "DoubleFloatPixelData"))
+    typestr = pixel_dtype(dataset, has_float).str
+    return dims, typestr
+
+
+def _decompress_dicom(f: Union[str, bytes, BinaryIO]):
+    """Decodes dicom pixel data to an array; 2D results gain a trailing axis of size 1."""
+    source = BytesIO(f) if isinstance(f, (bytes, memoryview, bytearray)) else f
+    try:
+        from pydicom import dcmread
+    except ImportError:
+        raise ModuleNotFoundError(
+            "Pydicom not found. Install using `pip install pydicom`"
+        )
+    pixels = dcmread(source).pixel_array
+    # Two-dimensional output becomes (d0, d1, 1); any other rank is returned unchanged.
+    needs_channel_axis = pixels.ndim == 2
+    return np.expand_dims(pixels, -1) if needs_channel_axis else pixels
+
+
 def _read_png_shape_and_dtype(f: Union[bytes, BinaryIO]) -> Tuple[Tuple[int, ...], str]:
     """Reads shape and dtype of a png file from a file like object or file contents.
     If a file like object is provided, all of its contents are NOT loaded into memory."""

diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py
@@ -410,7 +410,7 @@ def create_tensor(
         if info_kwargs:
             tensor.info.update(info_kwargs)
         self.storage.maybe_flush()
-        if create_sample_info_tensor and htype in ("image", "audio", "video"):
+        if create_sample_info_tensor and htype in ("image", "audio", "video", "dicom"):
             self._create_sample_info_tensor(name)
         if create_shape_tensor and htype not in ("text", "json"):
             self._create_sample_shape_tensor(name, htype=htype)
@@ -426,6 +426,7 @@ def _create_sample_shape_tensor(self, tensor: str, htype: str):
             create_id_tensor=False,
             create_sample_info_tensor=False,
             create_shape_tensor=False,
+            max_chunk_size=SAMPLE_INFO_TENSOR_MAX_CHUNK_SIZE,
         )
         f = "append_len" if htype == "list" else "append_shape"
         self._link_tensors(

diff --git a/hub/core/sample.py b/hub/core/sample.py
@@ -1,3 +1,4 @@
+from ast import Bytes
 from hub.core.compression import (
     compress_array,
     decompress_array,
@@ -114,10 +115,10 @@ def buffer(self):
 
     @property
     def dtype(self):
-        if self._dtype:
-            return self._dtype
-        self._read_meta()
-        return np.dtype(self._typestr).name
+        if self._dtype is None:  # dtype name not resolved yet
+            self._read_meta()  # presumably populates self._typestr; confirm in _read_meta
+            self._dtype = np.dtype(self._typestr).name  # stored for later reads
+        return self._dtype
 
     @property
     def shape(self):
@@ -130,6 +131,23 @@ def compression(self):
             self._read_meta()
         return self._compression
 
+    def _load_dicom(self):
+        """Decodes the pixel data once, caching the array with its shape and typestr."""
+        if self._array is not None:
+            return
+        try:
+            from pydicom import dcmread
+        except ImportError as e:
+            raise ModuleNotFoundError(
+                "Pydicom not found. Install using `pip install pydicom`"
+            ) from e
+        is_local = self.path and get_path_type(self.path) == "local"
+        arr = dcmread(self.path if is_local else BytesIO(self.buffer)).pixel_array
+        # 2-D data gets a trailing channel axis, matching _decompress_dicom's output.
+        self._array = np.expand_dims(arr, -1) if arr.ndim == 2 else arr
+        self._shape = self._array.shape
+        self._typestr = self._array.__array_interface__["typestr"]
+
     def _read_meta(self, f=None):
         if self._shape is not None:
             return
@@ -152,6 +170,33 @@ def _read_meta(self, f=None):
         if store:
             self._compressed_bytes[self._compression] = f
 
+    def _get_dicom_meta(self) -> dict:
+        """Returns the file's dicom data elements as a dict keyed by element keyword.
+
+        Elements whose value is raw bytes are skipped. Every other element maps to
+        a dict holding its "name", "tag" (as a string), "value" and "vr".
+        """
+        try:
+            from pydicom import dcmread
+        except ImportError as e:
+            raise ModuleNotFoundError(
+                "Pydicom not found. Install using `pip install pydicom`"
+            ) from e
+        is_local = self.path and get_path_type(self.path) == "local"
+        dcm = dcmread(self.path if is_local else BytesIO(self.buffer))
+
+        def to_value(elem):
+            # str/int/float values are kept as-is; others use the JSON "Value" (or "").
+            if isinstance(elem.value, (str, int, float)):
+                return elem.value
+            return elem.to_json_dict(None, None).get("Value", "")  # type: ignore
+
+        return {
+            x.keyword: dict(name=x.name, tag=str(x.tag), value=to_value(x), vr=x.VR)
+            for x in dcm
+            if not isinstance(x.value, bytes)
+        }
+
     def _get_video_meta(self) -> dict:
         if self.path and get_path_type(self.path) == "local":
             container, vstream = _open_video(self.path)
@@ -244,13 +289,20 @@ def uncompressed_bytes(self) -> bytes:
         """Returns uncompressed bytes."""
 
         if self._uncompressed_bytes is None:
+            if self._array is not None:
+                self._uncompressed_bytes = self._array.tobytes()
+                return self._uncompressed_bytes
             if self.path is not None:
                 compr = self._compression
                 if compr is None:
                     compr = get_compression(path=self.path)
-                if get_compression_type(compr) in (
-                    AUDIO_COMPRESSION,
-                    VIDEO_COMPRESSION,
+                if (
+                    get_compression_type(compr)
+                    in (
+                        AUDIO_COMPRESSION,
+                        VIDEO_COMPRESSION,
+                    )
+                    or compr == "dcm"
                 ):
                     self._compression = compr
                     if self._array is None:
@@ -393,12 +445,15 @@ def _getexif(self) -> dict:
     @property
     def meta(self) -> dict:
         meta: Dict[str, Union[Dict, str]] = {}
-        compression_type = get_compression_type(self.compression)
-        if compression_type == IMAGE_COMPRESSION:
+        compression = self.compression
+        compression_type = get_compression_type(compression)
+        if compression == "dcm":
+            meta.update(self._get_dicom_meta())
+        elif compression_type == IMAGE_COMPRESSION:
             meta["exif"] = self._getexif()
-        if compression_type == VIDEO_COMPRESSION:
+        elif compression_type == VIDEO_COMPRESSION:
             meta.update(self._get_video_meta())
-        if compression_type == AUDIO_COMPRESSION:
+        elif compression_type == AUDIO_COMPRESSION:
             meta.update(self._get_audio_meta())
         meta["shape"] = self.shape
         meta["format"] = self.compression

diff --git a/hub/core/tests/test_compression.py b/hub/core/tests/test_compression.py
@@ -27,13 +27,10 @@
 from PIL import Image  # type: ignore
 
 
-compressions = SUPPORTED_COMPRESSIONS[:]
-compressions.remove(None)  # type: ignore
-compressions.remove("wmf")  # driver has to be provided by user for wmf write support
-
 image_compressions = IMAGE_COMPRESSIONS[:]
 image_compressions.remove("wmf")
 image_compressions.remove("apng")
+image_compressions.remove("dcm")
 
 
 @pytest.mark.parametrize("compression", image_compressions + BYTE_COMPRESSIONS)

diff --git a/hub/htype.py b/hub/htype.py
@@ -67,6 +67,7 @@
     },
     "list": {"dtype": "List"},
     "text": {"dtype": "str"},
+    "dicom": {"sample_compression": "dcm"},
 }
 
 HTYPE_VERIFICATIONS: Dict[str, Dict] = {

diff --git a/hub/requirements/common.txt b/hub/requirements/common.txt
@@ -10,4 +10,5 @@ humbug>=0.2.6
 tqdm
 numcodecs
 miniaudio~=1.44
-av>=8.1.0; python_version >= '3.7' or sys_platform != 'win32'
+av>=8.1.0; python_version >= '3.7' or sys_platform != 'win32'
+pydicom
diff --git a/setup.py b/setup.py
@@ -31,6 +31,7 @@
     "audio": ["miniaudio"],
     "gcp": ["google-cloud-storage", "google-auth", "google-auth-oauthlib"],
     "video": ["av"],
+    "dicom": ["pydicom"],
 }
 
 all_extras = {r for v in extras.values() for r in v}