From 88857bf2aea946a5c033088dbbd6545e6523b8d2 Mon Sep 17 00:00:00 2001
From: Sejin Kim <40668167+skim2257@users.noreply.github.com>
Date: Mon, 5 Feb 2024 15:59:56 -0500
Subject: [PATCH] Detached branch (#94) (#95)

* Detached branch (#94)
* v1.2.0 for lighter install.
* update main-ci.yml
* fixed test_dataset
* added zipfile and request
---
 .github/workflows/main-ci.yml |   4 +-
 README.md                     |   5 +-
 imgtools/autopipeline.py      |   1 -
 requirements.txt              |   3 -
 setup.py                      |   3 +-
 tests/test_components.py      | 107 ++-----------------------
 tests/test_dataset.py         | 145 ++++++++++++++++++++++++++++++++++
 7 files changed, 157 insertions(+), 111 deletions(-)
 create mode 100644 tests/test_dataset.py

diff --git a/.github/workflows/main-ci.yml b/.github/workflows/main-ci.yml
index 46df10a..04e529f 100644
--- a/.github/workflows/main-ci.yml
+++ b/.github/workflows/main-ci.yml
@@ -29,7 +29,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install flake8 pytest setuptools wheel twine
+          python -m pip install flake8 pytest[torch] setuptools wheel twine
           pip install -e .
           pip install -r requirements.txt
       - name: Import checking
@@ -37,5 +37,5 @@
           python -c "import imgtools"
       - name: Run pytest
         run: |
-          pytest tests -s
+          pytest tests
diff --git a/README.md b/README.md
index 1c9a9ec..c7f1477 100644
--- a/README.md
+++ b/README.md
@@ -7,10 +7,9 @@
 [![Documentation Status](https://readthedocs.org/projects/med-imagetools/badge/?version=documentation)](https://med-imagetools.readthedocs.io/en/documentation/?badge=documentation)
 ![DOI Status](https://zenodo.org/badge/243786996.svg)
 
-## Latest Updates (v1.0.3) - Oct 13th, 2022
+## Latest Updates (v1.2.0) - Feb 5th, 2024
 * Documentation is now available at: https://med-imagetools.readthedocs.io
-* Fixed relative path handling issue #53 and extra patient folder issue #57
-* Subseries crawl feature added, but not yet integrated into AutoPipeline. Will collect user data with prototypes first.
+* Dependencies have been reduced for a lighter install. `torch` and `torchio` dependencies have been moved to an extra pip install flag. Use `pip install med-imagetools[torch]` to use the Dataset feature.
 
 #### Med-ImageTools core features
 * AutoPipeline CLI
diff --git a/imgtools/autopipeline.py b/imgtools/autopipeline.py
index 8893ba1..6b02223 100644
--- a/imgtools/autopipeline.py
+++ b/imgtools/autopipeline.py
@@ -23,7 +23,6 @@ from imgtools.utils.args import parser
 from joblib import Parallel, delayed
 from imgtools.modules import Segmentation
-from torch import sparse_coo_tensor
 from sklearn.model_selection import train_test_split
 import matplotlib.pyplot as plt
 import pandas as pd
diff --git a/requirements.txt b/requirements.txt
index 75f86e3..1c67570 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,9 +8,6 @@ pynrrd
 scikit-image
 SimpleITK
 tqdm
-torch
-torchio
-scikit-learn
 pyyaml
 dill
 attrs
diff --git a/setup.py b/setup.py
index dde01a0..2bf6b73 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ setup(
     name="med-imagetools",
-    version="1.1.7",
+    version="1.2.0",
     author="Sejin Kim, Michal Kazmierski, Kevin Qu, Vishwesh Ramanathan, Benjamin Haibe-Kains",
     author_email="benjamin.haibe.kains@utoronto.ca",
     description="Transparent and reproducible image processing pipelines in Python.",
@@ -19,6 +19,7 @@ packages=find_packages(),
     extras_require={
         'debug': ['pyvis'],
+        'torch': ['torch', 'torchio']
     },
     entry_points={'console_scripts': ['autopipeline = imgtools.autopipeline:main', 'betapipeline = imgtools.autopipeline_refactored:main']},
     classifiers=[
diff --git a/tests/test_components.py b/tests/test_components.py
index 0abd280..c63700d 100644
--- a/tests/test_components.py
+++ b/tests/test_components.py
@@ -1,17 +1,11 @@
 import os, pathlib
-import shutil
 import urllib.request as request
 from zipfile import ZipFile
-import torchio as tio
-from torch.utils.data import DataLoader
+
 import pytest
+import SimpleITK as sitk
 import pandas as pd
-import nrrd
-import torch
-from typing import List
-import re
 from imgtools.autopipeline import AutoPipeline
-from imgtools.io import Dataset
 import ast
 
 @pytest.fixture(scope="session")
@@ -44,71 +38,21 @@ def dataset_path():
     edge_path = pathlib.Path(imgtools_path, f"imgtools_{dataset_name}_edges.csv").as_posix()
     yield quebec_path, output_path, crawl_path, edge_path
 
-#Defining for test_dataset method in Test_components class
-def collate_fn(data):
-    """
-    data: is a tio.subject with multiple columns
-    Need to return required data
-    """
-    mod_names = [items for items in data[0].keys() if items.split("_")[0]=="mod"]
-    temp_stack = {}
-    for names in mod_names:
-        temp_stack[names] = torch.stack(tuple(items[names].data for items in data))
-    return temp_stack
-
-class select_roi_names(tio.LabelTransform):
-    """
-    Based on the given roi names, selects from the given set
-    """
-    def __init__(
-            self,
-            roi_names: List[str] = None,
-            **kwargs
-            ) -> None:
-        super().__init__(**kwargs)
-        self.kwargs = kwargs
-        self.roi_names = roi_names
-
-    def apply_transform(self, subject):
-        #list of roi_names
-        for image in self.get_images(subject):
-            #For only applying to labelmaps
-            metadata = subject["metadata_RTSTRUCT_CT"]
-            patterns = self.roi_names
-            mask = torch.empty_like(image.data)[:len(patterns)]
-            for j,pat in enumerate(patterns):
-                k = []
-                for i,col in enumerate(metadata):
-                    if re.match(pat,col,flags=re.IGNORECASE):
-                        k.append(i)
-                if len(k)==0:
-                    mask[j] = mask[j]*0
-                else:
-                    mask[j] = (image.data[k].sum(axis=0)>0)*1
-            image.set_data(mask)
-        return subject
-
-    def is_invertible(self):
-        return False
 
 # @pytest.mark.parametrize("modalities",["PT", "CT,RTSTRUCT", "CT,RTDOSE", "CT,PT,RTDOSE", "CT,RTSTRUCT,RTDOSE", "CT,RTSTRUCT,RTDOSE,PT"])
 @pytest.mark.parametrize("modalities", ["CT", "CT,RTSTRUCT", "CT,RTSTRUCT,RTDOSE"])#, "CT,RTDOSE,PT"])
 class TestComponents:
     """
-    For testing the autopipeline and dataset components of the med-imagetools package
+    For testing the autopipeline components of the med-imagetools package
     It has two methods:
     test_pipeline:
         1) Checks if there is any crawler and edge table output generated by autopipeline
         2) Checks if for the test data, the lengths of the crawler and edge table matches the actual length of what should be ideally created
         3) Checks if the length of component table(dataset.csv) is correct or not
         4) Checks for every component, the shape of all different modalities matches or not
-    test_dataset:
-        1) Checks if the length of the dataset matches
-        2) Checks if the items in the subject object is correct and present
-        3) Checks if you are able to load it via load_nrrd and load_directly, and checks if the subjects generated matches
-        4) Checks if torch data loader can load the formed dataset and get atleast 1 iteration
-        5) Checks if the transforms are happening by checking the size
+    
     """
     @pytest.fixture(autouse=True)
     def _get_path(self, dataset_path):
@@ -154,48 +98,9 @@ def test_pipeline(self, modalities):
             print(subject_id, col, filename)
             path_mod = pathlib.Path(output_path_mod, subject_id, col, f"{filename}.nii.gz").as_posix()
             # All modalities except RTSTRUCT should be of type torchIO.ScalarImage
-            temp_dicom = tio.ScalarImage(path_mod).data
+            temp_dicom = sitk.GetArrayFromImage(sitk.ReadImage(path_mod))
             shapes.append(temp_dicom.shape)
         A = [item == shapes[0] for item in shapes]
         print(shapes)
         assert all(A)
-
-    def test_dataset(self, modalities):
-        """
-        Testing the Dataset class
-        """
-        output_path_mod = pathlib.Path(self.output_path, str("temp_folder_" + ("_").join(modalities.split(",")))).as_posix()
-        comp_path = pathlib.Path(output_path_mod).resolve().joinpath('dataset.csv').as_posix()
-        comp_table = pd.read_csv(comp_path, index_col=0)
-        print(comp_path, comp_table)
-
-        #Loading from nrrd files
-        subjects_nrrd = Dataset.load_image(output_path_mod, ignore_multi=True)
-        #Loading files directly
-        # subjects_direct = Dataset.load_directly(self.input_path,modalities=modalities,ignore_multi=True)
-
-        #The number of subjects is equal to the number of components which is 2 for this dataset
-        # assert len(subjects_nrrd) == len(subjects_direct) == 2, "There was some error in generation of subject object"
-        # assert subjects_nrrd[0].keys() == subjects_direct[0].keys()
-
-        # del subjects_direct
-        # To check if all metadata items present in the keys
-        # temp_nrrd = subjects_nrrd[0]
-        # columns_shdbe_present = set([col if col.split("_")[0]=="metadata" else "mod_"+("_").join(col.split("_")[1:]) for col in list(comp_table.columns) if col.split("_")[0] in ["folder","metadata"]])
-        # print(columns_shdbe_present)
-        # assert set(temp_nrrd.keys()).issubset(columns_shdbe_present), "Not all items present in dictionary, some fault in going through the different columns in a single component"
-
-        transforms = tio.Compose([tio.Resample(4), tio.CropOrPad((96,96,40)), select_roi_names(["larynx"]), tio.OneHot()])
-
-        #Forming dataset and dataloader
-        test_set = tio.SubjectsDataset(subjects_nrrd, transform=transforms)
-        test_loader = torch.utils.data.DataLoader(test_set,batch_size=2,shuffle=True,collate_fn = collate_fn)
-
-        #Check test_set is correct
-        assert len(test_set)==2
-
-        #Get items from test loader
-        #If this function fails , there is some error in formation of test
-        data = next(iter(test_loader))
-        A = [item[1].shape == (2,1,96,96,40) if not "RTSTRUCT" in item[0] else item[1].shape == (2,2,96,96,40) for item in data.items()]
-        assert all(A), "There is some problem in the transformation/the formation of subject object"
+    
\ No newline at end of file
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
new file mode 100644
index 0000000..a9a0471
--- /dev/null
+++ b/tests/test_dataset.py
@@ -0,0 +1,145 @@
+import pathlib, os
+import re
+import pandas as pd
+import torchio as tio
+import pytest
+import torch
+import urllib.request as request
+from zipfile import ZipFile
+
+from typing import List
+from torch.utils.data import DataLoader
+from imgtools.io import Dataset
+
+@pytest.fixture(scope="session")
+def dataset_path():
+    curr_path = pathlib.Path(__file__).parent.parent.resolve()
+    quebec_path = pathlib.Path(pathlib.Path(curr_path, "data", "Head-Neck-PET-CT").as_posix())
+
+    if not os.path.exists(quebec_path):
+        pathlib.Path(quebec_path).mkdir(parents=True, exist_ok=True)
+        # Download QC dataset
+        print("Downloading the test dataset...")
+        quebec_data_url = "https://github.com/bhklab/tcia_samples/blob/main/Head-Neck-PET-CT.zip?raw=true"
+        quebec_zip_path = pathlib.Path(quebec_path, "Head-Neck-PET-CT.zip").as_posix()
+        request.urlretrieve(quebec_data_url, quebec_zip_path)
+        with ZipFile(quebec_zip_path, 'r') as zipfile:
+            zipfile.extractall(quebec_path)
+        os.remove(quebec_zip_path)
+    else:
+        print("Data already downloaded...")
+    output_path = pathlib.Path(curr_path, 'tests','temp').as_posix()
+    quebec_path = quebec_path.as_posix()
+
+    #Dataset name
+    dataset_name = os.path.basename(quebec_path)
+    imgtools_path = pathlib.Path(os.path.dirname(quebec_path), '.imgtools')
+
+    #Defining paths for autopipeline and dataset component
+    crawl_path = pathlib.Path(imgtools_path, f"imgtools_{dataset_name}.csv").as_posix()
+    json_path = pathlib.Path(imgtools_path, f"imgtools_{dataset_name}.json").as_posix()
+    edge_path = pathlib.Path(imgtools_path, f"imgtools_{dataset_name}_edges.csv").as_posix()
+    yield quebec_path, output_path, crawl_path, edge_path
+
+class select_roi_names(tio.LabelTransform):
+    """
+    Based on the given roi names, selects from the given set
+    """
+    def __init__(
+            self,
+            roi_names: List[str] = None,
+            **kwargs
+            ) -> None:
+        super().__init__(**kwargs)
+        self.kwargs = kwargs
+        self.roi_names = roi_names
+
+    def apply_transform(self, subject):
+        #list of roi_names
+        for image in self.get_images(subject):
+            #For only applying to labelmaps
+            metadata = subject["metadata_RTSTRUCT_CT"]
+            patterns = self.roi_names
+            mask = torch.empty_like(image.data)[:len(patterns)]
+            for j,pat in enumerate(patterns):
+                k = []
+                for i,col in enumerate(metadata):
+                    if re.match(pat,col,flags=re.IGNORECASE):
+                        k.append(i)
+                if len(k)==0:
+                    mask[j] = mask[j]*0
+                else:
+                    mask[j] = (image.data[k].sum(axis=0)>0)*1
+            image.set_data(mask)
+        return subject
+
+    def is_invertible(self):
+        return False
+
+#Defining collate_fn for the test_dataset method in the TestDataset class
+def collate_fn(data):
+    """
+    data: a list of tio.Subject objects with multiple columns
+    Need to return the required data
+    """
+    mod_names = [items for items in data[0].keys() if items.split("_")[0]=="mod"]
+    temp_stack = {}
+    for names in mod_names:
+        temp_stack[names] = torch.stack(tuple(items[names].data for items in data))
+    return temp_stack
+
+@pytest.mark.parametrize("modalities", ["CT", "CT,RTSTRUCT", "CT,RTSTRUCT,RTDOSE"])
+class TestDataset:
+    """
+    For testing the dataset components of the med-imagetools package
+    test_dataset:
+        1) Checks if the length of the dataset matches the expected number of components
+        2) Checks if the items in the subject object are correct and present
+        3) Checks if you are able to load it via load_nrrd and load_directly, and checks if the subjects generated match
+        4) Checks if the torch DataLoader can load the formed dataset and get at least 1 iteration
+        5) Checks if the transforms are applied by checking the size
+    """
+    @pytest.fixture(autouse=True)
+    def _get_path(self, dataset_path):
+        self.input_path, self.output_path, self.crawl_path, self.edge_path = dataset_path
+        print(dataset_path)
+
+    def test_dataset(self, modalities):
+        """
+        Testing the Dataset class
+        """
+        output_path_mod = pathlib.Path(self.output_path, str("temp_folder_" + ("_").join(modalities.split(",")))).as_posix()
+        comp_path = pathlib.Path(output_path_mod).resolve().joinpath('dataset.csv').as_posix()
+        comp_table = pd.read_csv(comp_path, index_col=0)
+        print(comp_path, comp_table)
+
+        #Loading from nrrd files
+        subjects_nrrd = Dataset.load_image(output_path_mod, ignore_multi=True)
+        #Loading files directly
+        # subjects_direct = Dataset.load_directly(self.input_path,modalities=modalities,ignore_multi=True)
+
+        #The number of subjects is equal to the number of components which is 2 for this dataset
+        # assert len(subjects_nrrd) == len(subjects_direct) == 2, "There was some error in generation of subject object"
+        # assert subjects_nrrd[0].keys() == subjects_direct[0].keys()
+
+        # del subjects_direct
+        # To check if all metadata items present in the keys
+        # temp_nrrd = subjects_nrrd[0]
+        # columns_shdbe_present = set([col if col.split("_")[0]=="metadata" else "mod_"+("_").join(col.split("_")[1:]) for col in list(comp_table.columns) if col.split("_")[0] in ["folder","metadata"]])
+        # print(columns_shdbe_present)
+        # assert set(temp_nrrd.keys()).issubset(columns_shdbe_present), "Not all items present in dictionary, some fault in going through the different columns in a single component"
+
+        transforms = tio.Compose([tio.Resample(4), tio.CropOrPad((96,96,40)), select_roi_names(["larynx"]), tio.OneHot()])
+
+        #Forming dataset and dataloader
+        test_set = tio.SubjectsDataset(subjects_nrrd, transform=transforms)
+        test_loader = torch.utils.data.DataLoader(test_set,batch_size=2,shuffle=True,collate_fn = collate_fn)
+
+        #Check test_set is correct
+        assert len(test_set)==2
+
+        #Get items from test loader
+        #If this function fails, there is some error in the formation of the test set
+        data = next(iter(test_loader))
+        A = [item[1].shape == (2,1,96,96,40) if not "RTSTRUCT" in item[0] else item[1].shape == (2,2,96,96,40) for item in data.items()]
+        assert all(A), "There is some problem in the transformation or the formation of the subject object"
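
A rough usage sketch (not part of the patch): with the optional extra added in setup.py installed via `pip install med-imagetools[torch]`, the `Dataset` loader exercised in tests/test_dataset.py can be wrapped in a torchio dataset. The output folder below is a placeholder for a directory AutoPipeline has already populated (dataset.csv plus per-subject NIfTI files); the `Dataset.load_image` call and the need for a custom collate function mirror the test code above rather than separately documented behaviour.

```python
# Minimal sketch, assuming `pip install med-imagetools[torch]` (the extra added in setup.py)
# and an output folder already produced by AutoPipeline, as in tests/test_dataset.py.
import torch
import torchio as tio

from imgtools.io import Dataset  # importable only when the torch extra is installed


def build_dataset(output_dir: str) -> tio.SubjectsDataset:
    """Wrap an AutoPipeline output folder (dataset.csv + NIfTI files) as a torchio dataset."""
    subjects = Dataset.load_image(output_dir, ignore_multi=True)
    transforms = tio.Compose([tio.Resample(4), tio.CropOrPad((96, 96, 40))])
    return tio.SubjectsDataset(subjects, transform=transforms)


if __name__ == "__main__":
    # Placeholder path, mirroring the "temp_folder_<modalities>" layout used by the tests.
    dataset = build_dataset("tests/temp/temp_folder_CT_RTSTRUCT")
    # Each item is a tio.Subject, so batching needs a custom collate_fn
    # (see collate_fn in tests/test_dataset.py); returning the raw list keeps this runnable.
    loader = torch.utils.data.DataLoader(dataset, batch_size=2, collate_fn=lambda batch: batch)
    subjects_batch = next(iter(loader))
    print(len(dataset), type(subjects_batch[0]))
```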