Subseries (#61)
* Subseries implementation, step 1
* Fixes for issues #57 and #53
* Updated README
* Fixed pytest error
skim2257 committed Oct 14, 2022
1 parent 4adf148 commit f8f3606
Showing 7 changed files with 73 additions and 48 deletions.
README.md (4 additions, 3 deletions)
@@ -6,9 +6,10 @@
 ![GitHub forks](https://img.shields.io/github/forks/bhklab/med-imagetools?style=social)
 [![Documentation Status](https://readthedocs.org/projects/med-imagetools/badge/?version=documentation)](https://med-imagetools.readthedocs.io/en/documentation/?badge=documentation)
 
-### Latest Updates (v1.0.2) - Oct 12th, 2022
-Documentation is now available at: https://med-imagetools.readthedocs.io
+## Latest Updates (v1.0.3) - Oct 13th, 2022
+* Documentation is now available at: https://med-imagetools.readthedocs.io
+* Fixed relative path handling (issue #53) and the extra patient folder bug (issue #57)
+* Subseries crawl feature added, but not yet integrated into AutoPipeline; user data will be collected with prototypes first.
 
 #### Med-ImageTools core features
 * AutoPipeline CLI
imgtools/autopipeline.py (29 additions, 22 deletions)
@@ -61,7 +61,8 @@ def __init__(self,
                  is_nnunet_inference=False,
                  dataset_json_path="",
                  continue_processing=False,
-                 dry_run=False):
+                 dry_run=False,
+                 verbose=False):
"""Initialize the pipeline.
        Parameters
@@ -114,6 +115,7 @@ def __init__(self,

         self.continue_processing = continue_processing
         self.dry_run = dry_run
+        self.v = verbose
 
         if dry_run:
             is_nnunet = False
@@ -127,13 +129,20 @@
         # pipeline configuration
         if not os.path.isabs(input_directory):
             input_directory = pathlib.Path(os.getcwd(), input_directory).as_posix()
+        else:
+            input_directory = pathlib.Path(input_directory).as_posix()  # consistent parsing; ensures the last child directory doesn't end with a slash
 
         if not os.path.isabs(output_directory):
             output_directory = pathlib.Path(os.getcwd(), output_directory).as_posix()
+        else:
+            output_directory = pathlib.Path(output_directory).as_posix()  # consistent parsing; ensures the last child directory doesn't end with a slash
 
         if not os.path.exists(output_directory):
             # raise FileNotFoundError(f"Output directory {output_directory} does not exist")
             os.makedirs(output_directory)
+        if not os.path.exists(input_directory):
+            raise FileNotFoundError(f"Input directory {input_directory} does not exist")
 
         self.input_directory = pathlib.Path(input_directory).as_posix()
         self.output_directory = pathlib.Path(output_directory).as_posix()
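The two new `else` branches route even absolute paths through `pathlib`, so a trailing separator never leaks into the stored directory name. A minimal sketch of the `as_posix()` behavior being relied on (paths are made-up placeholders):

```python
from pathlib import Path, PureWindowsPath

# A trailing slash is dropped, so the last directory name survives a
# later os.path.split() instead of coming back as an empty string.
assert Path("/data/dicoms/").as_posix() == "/data/dicoms"

# Backslash separators are normalized to forward slashes as well.
assert PureWindowsPath(r"C:\data\dicoms").as_posix() == "C:/data/dicoms"
```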

Expand Down Expand Up @@ -399,7 +408,8 @@ def process_one_subject(self, subject_id):
             mult_conn = colname.split("_")[-1].isnumeric()
             num = colname.split("_")[-1]
 
-            # print(output_stream) #could include for verbose
+            if self.v:
+                print("output_stream:", output_stream)

             if read_results[i] is None:
                 print("The subject id: {} has no {}".format(subject_id, colname))
@@ -413,7 +423,8 @@
                 extractor.SetIndex([0, 0, 0, 0])
 
                 image = extractor.Execute(image)
-                # print(image.GetSize()) #could include with verbose
+                if self.v:
+                    print("image.GetSize():", image.GetSize())
                 try:
                     image = self.resample(image)
                 except Exception as e:
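For context, the code above collapses an accidental 4D volume down to 3D before resampling. A standalone sketch of that SimpleITK pattern, using a synthetic placeholder image:

```python
import SimpleITK as sitk

# Synthetic single-frame 4D image standing in for a real DICOM series.
img4d = sitk.Image([64, 64, 32, 1], sitk.sitkInt16)

size = list(img4d.GetSize())
size[3] = 0  # a zero extent tells ExtractImageFilter to drop that axis
extractor = sitk.ExtractImageFilter()
extractor.SetSize(size)
extractor.SetIndex([0, 0, 0, 0])  # start at the first (and only) frame

img3d = extractor.Execute(img4d)
assert img3d.GetDimension() == 3
```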
@@ -464,10 +475,7 @@ def process_one_subject(self, subject_id):
                 doses = read_results[i]
 
                 # save output
-                if not mult_conn:
-                    self.output(subject_id, doses, output_stream)
-                else:
-                    self.output(f"{subject_id}_{num}", doses, output_stream)
+                self.output(subject_id, doses, output_stream)
                 metadata[f"size_{output_stream}"] = str(doses.GetSize())
                 metadata[f"metadata_{colname}"] = [read_results[i].get_metadata()]

@@ -515,10 +523,10 @@ def process_one_subject(self, subject_id):
                     if name not in self.existing_roi_names.keys():
                         self.existing_roi_names[name] = len(self.existing_roi_names)
                 mask.existing_roi_names = self.existing_roi_names
-                # print(self.existing_roi_names,"alskdfj")
 
                 # save output
-                # print(mask.GetSize()) #could include with verbose
+                if self.v:
+                    print("mask.GetSize():", mask.GetSize())
                 mask_arr = np.transpose(sitk.GetArrayFromImage(mask))
 
                 if self.is_nnunet:
@@ -534,18 +542,18 @@ def process_one_subject(self, subject_id):
                 if len(mask_arr.shape) == 3:
                     mask_arr = mask_arr.reshape(1, mask_arr.shape[0], mask_arr.shape[1], mask_arr.shape[2])
 
-                # print(mask_arr.shape) #could include with verbose
+                if self.v:
+                    print(mask_arr.shape)
 
                 roi_names_list = list(mask.roi_names.keys())
                 for i in range(mask_arr.shape[0]):
                     new_mask = sitk.GetImageFromArray(np.transpose(mask_arr[i]))
                     new_mask.CopyInformation(mask)
                     new_mask = Segmentation(new_mask)
                     mask_to_process = new_mask
-                    if not mult_conn:
-                        # self.output(roi_names_list[i], mask_to_process, output_stream)
-                        self.output(subject_id, mask_to_process, output_stream, True, roi_names_list[i])
-                    else:
-                        self.output(f"{subject_id}_{num}", mask_to_process, output_stream, True, roi_names_list[i])
 
+                    # output
+                    self.output(subject_id, mask_to_process, output_stream, True, roi_names_list[i])

                 if hasattr(structure_set, "metadata") and structure_set.metadata is not None:
                     metadata.update(structure_set.metadata)
@@ -561,10 +569,8 @@ def process_one_subject(self, subject_id):
                     Warning("No CT image present. Returning PT/PET image without resampling.")
                     pet = read_results[i]
 
-                if not mult_conn:
-                    self.output(subject_id, pet, output_stream)
-                else:
-                    self.output(f"{subject_id}_{num}", pet, output_stream)
+                # output
+                self.output(subject_id, pet, output_stream)
                 metadata[f"size_{output_stream}"] = str(pet.GetSize())
                 metadata[f"metadata_{colname}"] = [read_results[i].get_metadata()]

@@ -665,7 +671,7 @@ def run(self):
"""Execute the pipeline, possibly in parallel.
"""
         # Joblib prints progress to stdout if verbose > 50
-        verbose = 51 if self.show_progress else 0
+        verbose = 51 if self.v or self.show_progress else 0

         subject_ids = self._get_loader_subject_ids()
         patient_ids = []
@@ -733,7 +739,8 @@ def main():
                             is_nnunet_inference=args.is_nnunet_inference,
                             dataset_json_path=args.dataset_json_path,
                             continue_processing=args.continue_processing,
-                            dry_run=args.dry_run)
+                            dry_run=args.dry_run,
+                            verbose=args.verbose)
     if not args.dry_run:
         print(f'starting AutoPipeline...')
         pipeline.run()
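End to end, the new flag threads from `--verbose` on the command line into the size/stream prints above. A hedged usage sketch (the paths are placeholders; the positional arguments and `spacing` mirror the test suite below):

```python
from imgtools.autopipeline import AutoPipeline

pipeline = AutoPipeline("data/dicoms", "data/processed", "CT,RTSTRUCT",
                        spacing=(5, 5, 5),
                        verbose=True)  # print output streams and image sizes
pipeline.run()
```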
imgtools/modules/datagraph.py (0 additions, 4 deletions)
@@ -337,7 +337,6 @@ def parser(self, query_string: str) -> pd.DataFrame:
         else:
             raise ValueError("Please enter the correct query")
 
-        print(self.mods, final_df)
         final_df.reset_index(drop=True, inplace=True)
         final_df["index_chng"] = final_df.index.astype(str) + "_" + final_df["patient_ID"].astype(str)
         final_df.set_index("index_chng", inplace=True)
@@ -398,8 +397,6 @@ def graph_query(self,
         col_ids = [cols for cols in list(final_df.columns)[1:] if bad != cols.split("_")[1]]
         final_df = final_df[[*list(final_df.columns)[:1], *col_ids]]
 
-        final_df.to_csv("final_df.csv")
 
         if return_components:
             return self.final_dict
         else:
@@ -529,7 +526,6 @@ def _get_df(self,
         for j in range(len(CT_series)):
             # Check that a component has at least as many nodes as the query; if not, remove it
             mods_present = set([items.split("_")[1] for items in save_folder_comp[j].keys() if items.split("_")[0] == "folder"])
-            print('\nmods_present', mods_present, mods_wanted)
             # Check whether all the queried modalities are present in the component
             if mods_wanted.issubset(mods_present) == True:
                 remove_index.append(j)
imgtools/utils/args.py (3 additions, 0 deletions)
@@ -64,6 +64,9 @@ def parser():
     parser.add_argument("--dry_run", default=False, action="store_true",
                         help="Make a dry run of the pipeline, only producing the edge table and dataset.csv.")
 
+    parser.add_argument("--verbose", default=False, action="store_true",
+                        help="Verbose output flag.")
 
     # parser.add_argument("--custom_train_test_split_path", type=str,
     #                     help="Path to the YAML file defining the custom train-test-split.")

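A self-contained sketch of the new flag's semantics, using plain argparse rather than the project's full parser:

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument("--verbose", default=False, action="store_true",
               help="Verbose output flag.")

assert p.parse_args([]).verbose is False            # omitted -> False
assert p.parse_args(["--verbose"]).verbose is True  # present -> True
```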
imgtools/utils/crawl.py (35 additions, 17 deletions)
@@ -59,23 +59,33 @@ def crawl_one(folder):
             except:
                 series_description = ""
 
+            try:
+                subseries = str(meta.AcquisitionNumber)
+            except:
+                subseries = "default"
 
             if patient not in database:
                 database[patient] = {}
             if study not in database[patient]:
                 database[patient][study] = {'description': study_description}
             if series not in database[patient][study]:
                 parent, _ = os.path.split(folder)
                 rel_path = pathlib.Path(os.path.split(parent)[1], os.path.relpath(path, parent)).as_posix()
-                database[patient][study][series] = {'instances': [],
-                                                    'instance_uid': instance,
-                                                    'modality': meta.Modality,
-                                                    'description': series_description,
-                                                    'reference_ct': reference_ct,
-                                                    'reference_rs': reference_rs,
-                                                    'reference_pl': reference_pl,
-                                                    'reference_frame': reference_frame,
-                                                    'folder': rel_path}
-            database[patient][study][series]['instances'].append(instance)
+                database[patient][study][series] = {'description': series_description}
+            if subseries not in database[patient][study][series]:
+                database[patient][study][series][subseries] = {'instances': [],
+                                                               'instance_uid': instance,
+                                                               'modality': meta.Modality,
+                                                               'reference_ct': reference_ct,
+                                                               'reference_rs': reference_rs,
+                                                               'reference_pl': reference_pl,
+                                                               'reference_frame': reference_frame,
+                                                               'folder': rel_path}
+            database[patient][study][series][subseries]['instances'].append(instance)
         except:
             pass
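The new `subseries` level is keyed on `str(AcquisitionNumber)`, falling back to `"default"` when the tag is absent. A sketch of the nested structure `crawl_one` now builds, with made-up UIDs and paths:

```python
database = {
    "PatientA": {
        "study-uid-1": {
            "description": "HEAD CT",        # study_description
            "series-uid-1": {
                "description": "Axial 2mm",  # series_description
                "1": {                       # subseries = str(meta.AcquisitionNumber)
                    "instances": ["sop-uid-1", "sop-uid-2"],
                    "instance_uid": "sop-uid-1",
                    "modality": "CT",
                    "reference_ct": "",
                    "reference_rs": "",
                    "reference_pl": "",
                    "reference_frame": "frame-uid-1",
                    "folder": "dicoms/PatientA/CT",
                },
            },
        },
    },
}
```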

@@ -86,13 +96,21 @@ def to_df(database_dict):
     for pat in database_dict:
         for study in database_dict[pat]:
             for series in database_dict[pat][study]:
-                if series != 'description':
-                    columns = ['patient_ID', 'study', 'study_description', 'series', 'series_description', 'modality', 'instances', 'instance_uid', 'reference_ct', 'reference_rs', 'reference_pl', 'reference_frame', 'folder']
-                    values = [pat, study, database_dict[pat][study]['description'], series, database_dict[pat][study][series]['description'], database_dict[pat][study][series]['modality'], len(database_dict[pat][study][series]['instances']),
-                              database_dict[pat][study][series]['instance_uid'], database_dict[pat][study][series]['reference_ct'], database_dict[pat][study][series]['reference_rs'], database_dict[pat][study][series]['reference_pl'],
-                              database_dict[pat][study][series]['reference_frame'], database_dict[pat][study][series]['folder']]
-                    df_add = pd.DataFrame([values], columns=columns)
-                    df = pd.concat([df, df_add], ignore_index=True)
+                if series != 'description':  # skip the description key
+                    for subseries in database_dict[pat][study][series]:
+                        if subseries != 'description':  # skip the description key
+                            columns = ['patient_ID', 'study', 'study_description',
+                                       'series', 'series_description', 'subseries', 'modality',
+                                       'instances', 'instance_uid',
+                                       'reference_ct', 'reference_rs', 'reference_pl', 'reference_frame', 'folder']
+                            values = [pat, study, database_dict[pat][study]['description'],
+                                      series, database_dict[pat][study][series]['description'],
+                                      subseries, database_dict[pat][study][series][subseries]['modality'],
+                                      len(database_dict[pat][study][series][subseries]['instances']), database_dict[pat][study][series][subseries]['instance_uid'],
+                                      database_dict[pat][study][series][subseries]['reference_ct'], database_dict[pat][study][series][subseries]['reference_rs'],
+                                      database_dict[pat][study][series][subseries]['reference_pl'], database_dict[pat][study][series][subseries]['reference_frame'], database_dict[pat][study][series][subseries]['folder']]
+                            df_add = pd.DataFrame([values], columns=columns)
+                            df = pd.concat([df, df_add], ignore_index=True)
     return df

 def crawl(top,
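A hedged sketch of producing the flattened index from a single crawl, one row per subseries (the input path is a placeholder):

```python
from imgtools.utils.crawl import crawl_one, to_df

db = crawl_one("data/dicoms/PatientA")  # nested patient/study/series/subseries dict
df = to_df(db)
print(df[["patient_ID", "series", "subseries", "modality", "instances"]])
```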
setup.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@

 setup(
     name="med-imagetools",
-    version="1.0.2",
+    version="1.0.3",
     author="Sejin Kim, Michal Kazmierski, Kevin Qu, Vishwesh Ramanathan, Benjamin Haibe-Kains",
     author_email="benjamin.haibe.kains@utoronto.ca",
     description="Transparent and reproducible image processing pipelines in Python.",
tests/test_components.py (1 addition, 1 deletion)
@@ -122,7 +122,7 @@ def test_pipeline(self, modalities):
         n_jobs = 2
         output_path_mod = pathlib.Path(self.output_path, str("temp_folder_" + ("_").join(modalities.split(",")))).as_posix()
         # Initialize pipeline for the current setting
-        pipeline = AutoPipeline(self.input_path, output_path_mod, modalities, n_jobs=n_jobs, spacing=(5,5,5))
+        pipeline = AutoPipeline(self.input_path, output_path_mod, modalities, n_jobs=n_jobs, spacing=(5,5,5), overwrite=True)
         # Run for different modalities
         comp_path = pathlib.Path(output_path_mod, "dataset.csv").as_posix()
         pipeline.run()
