From 9efa5d6d082b11d17b9115e3ee831fe70f8d0ee2 Mon Sep 17 00:00:00 2001 From: Elina Thibeau Sutre Date: Tue, 27 Apr 2021 16:29:39 +0200 Subject: [PATCH 01/37] Add new arguments for ROI in random-search (#150) --- .../tools/deep_learning/models/random.py | 55 +++++++++++-------- clinicadl/clinicadl/train/random_search.py | 3 + 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/clinicadl/clinicadl/tools/deep_learning/models/random.py b/clinicadl/clinicadl/tools/deep_learning/models/random.py index 292fac6da..84c26f291 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/random.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/random.py @@ -11,7 +11,9 @@ def sampling_fn(value, sampling_type): if isinstance(value, (tuple, list)): - if sampling_type is "choice": + if sampling_type is "fixed": + return value + elif sampling_type is "choice": return random.choice(value) elif sampling_type is "exponent": exponent = random.uniform(*value) @@ -43,51 +45,56 @@ def random_sampling(rs_options, options): """ sampling_dict = { + "accumulation_steps": "randint", + "baseline": "choice", + "caps_dir": "fixed", + "channels_limit": "fixed", + "data_augmentation": "fixed", + "diagnoses": "fixed", + "dropout": "uniform", + "epochs": "fixed", + "learning_rate": "exponent", + "loss": "choice", "mode": "choice", + "multi_cohort": "fixed", + "n_fcblocks": "randint", "network_type": "choice", "network_normalization": "choice", - "n_fcblocks": "randint", + "optimizer": "choice", + "patience": "fixed", "preprocessing": "choice", - "baseline": "choice", + "sampler": "choice", + "tolerance": "fixed", + "transfer_learning_path": "choice", + "transfer_learning_selection": "choice", + "tsv_path": "fixed", "unnormalize": "choice", - "learning_rate": "exponent", - "dropout": "uniform", - "accumulation_steps": "randint", - "loss": "choice", - "optimizer": "choice", "wd_bool": "choice", "weight_decay": "exponent", - "sampler": "choice", - "transfer_learning_path": "choice", - 
"transfer_learning_selection": "choice" } - fixed_values = ["tsv_path", "caps_dir", - "epochs", "patience", "tolerance", - "diagnoses", "data_augmentation", - "multi_cohort", - "channels_limit", - "use_extracted_patches", - "use_extracted_slices"] + additional_mode_dict = { "image": {}, "patch": { "patch_size": "randint", "selection_threshold": "uniform", - "stride_size": "randint" + "stride_size": "randint", + "use_extracted_patches": "fixed", }, "roi": { - "selection_threshold": "uniform" + "selection_threshold": "uniform", + "roi_list": "fixed", + "use_extracted_roi": "fixed", + "uncropped_roi": "fixed", }, "slice": { "discarded_slices": "randint", "selection_threshold": "uniform", - "slice_direction": "choice" + "slice_direction": "choice", + "use_extracted_slices": "fixed", } } - for name in fixed_values: - setattr(options, name, getattr(rs_options, name)) - for name, sampling_type in sampling_dict.items(): sampled_value = sampling_fn(getattr(rs_options, name), sampling_type) setattr(options, name, sampled_value) diff --git a/clinicadl/clinicadl/train/random_search.py b/clinicadl/clinicadl/train/random_search.py index ce276247c..eb0937a14 100755 --- a/clinicadl/clinicadl/train/random_search.py +++ b/clinicadl/clinicadl/train/random_search.py @@ -39,14 +39,17 @@ def check_and_complete(rs_options): "unnormalize": False, "patch_size": 50, "patience": 0, + "roi_list": None, "selection_threshold": 0, "slice_direction": 0, "stride_size": 50, "tolerance": 0.0, "transfer_learning_path": None, "transfer_learning_selection": "best_loss", + "uncropped_roi": False, "use_extracted_patches": False, "use_extracted_slices": False, + "use_extracted_roi": False, "wd_bool": True, "weight_decay": 4, "sampler": "random" From f00a12c4f58695a635690925c922b9ee983e6d42 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Thu, 11 Feb 2021 16:06:38 +0100 Subject: [PATCH 02/37] Adapt to adni-to-bids changes --- clinicadl/clinicadl/tools/tsv/data_formatting.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/clinicadl/clinicadl/tools/tsv/data_formatting.py b/clinicadl/clinicadl/tools/tsv/data_formatting.py index 944f42300..d2d92d1d4 100644 --- a/clinicadl/clinicadl/tools/tsv/data_formatting.py +++ b/clinicadl/clinicadl/tools/tsv/data_formatting.py @@ -406,7 +406,7 @@ def get_labels(merged_tsv, missing_mods, results_path, # Remove SMC patients if remove_smc: - if "diagnosis_bl" in bids_df.columns.values: # Retro-compatibility + if "diagnosis_bl" in bids_df.columns.values: # Retro-compatibility bids_df = bids_df[~(bids_df.diagnosis_bl == "SMC")] if "diagnosis_sc" in bids_df.columns.values: bids_df = bids_df[~(bids_df.diagnosis_sc == "SMC")] From e6bc98285199bf077c74a0fdd38a9d92d36e3b75 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Tue, 16 Mar 2021 10:51:43 +0100 Subject: [PATCH 03/37] Predict GM intensities from atlases --- clinicadl/clinicadl/cli.py | 16 ++ .../tools/deep_learning/cnn_utils.py | 29 ++- .../clinicadl/tools/deep_learning/data.py | 171 ++++++++++++++++-- .../clinicadl/tools/deep_learning/iotools.py | 10 + .../tools/deep_learning/models/__init__.py | 22 ++- .../tools/deep_learning/models/image_level.py | 16 +- .../tools/deep_learning/models/patch_level.py | 1 + .../tools/deep_learning/models/random.py | 3 + .../tools/deep_learning/models/slice_level.py | 2 +- clinicadl/clinicadl/train/random_search.py | 3 + clinicadl/clinicadl/train/train_multiCNN.py | 2 +- clinicadl/clinicadl/train/train_singleCNN.py | 2 +- 12 files changed, 242 insertions(+), 35 deletions(-) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index 0623955f2..765798338 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -1394,6 +1394,11 @@ def return_train_parent_parser(retrain=False): default=0, type=int, help='Fix the number of iterations to perform before computing an evaluations. 
Default will only ' 'perform one evaluation at the end of each epoch.') + train_comput_group.add_argument( + '--merged_tsv_path', + default=None, type=str, + help="Path to the output of clinica iotools merged-tsv (concatenation for multi-cohort). " + ) train_data_group = train_parent_parser.add_argument_group( TRAIN_CATEGORIES["DATA"]) @@ -1435,6 +1440,17 @@ def return_train_parent_parser(retrain=False): '--sampler', '-s', help="Sampler choice (random, or weighted for imbalanced datasets)", default="random", type=str, choices=["random", "weighted"]) + train_data_group.add_argument( + "--predict_atlas_intensities", + help="Atlases used in t1-volume pipeline to make intensities prediction.", + default=None, type=str, + choices=["AAL2", "AICHA", "Hammers", "LPBA40", "Neuromorphometrics"] + ) + train_data_group.add_argument( + "--atlas_weight", + help="Weight to put on the MSE loss used to compute the error on atlas intensities.", + default=1, type=float, + ) train_cv_group = train_parent_parser.add_argument_group( TRAIN_CATEGORIES["CROSS-VALIDATION"]) diff --git a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py index b6f3aed6d..05a0616e8 100644 --- a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py +++ b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py @@ -103,10 +103,23 @@ def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_d if hasattr(model, "variational") and model.variational: z, mu, std, train_output = model(imgs) kl_loss = kl_divergence(z, mu, std) - loss = criterion(train_output, labels) + kl_loss + loss = kl_loss else: train_output = model(imgs) - loss = criterion(train_output, labels) + loss = 0 + + if "atlas" in data: + if options.gpu: + atlas_data = data["atlas"].cuda() + else: + atlas_data = data["atlas"] + atlas_output = train_output[:, -atlas_data.size(1)::] + classif_output = train_output[:, :-atlas_data.size(1):] + loss += criterion(classif_output, labels) + 
loss += options.atlas_weight * torch.nn.MSELoss(reduction="sum")(atlas_output, atlas_data) + + else: + loss += criterion(train_output, labels) # Back propagation loss.backward() @@ -306,6 +319,7 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): results_df = pd.DataFrame(columns=columns) total_loss = 0 total_kl_loss = 0 + total_atlas_loss = 0 total_time = 0 tend = time() with torch.no_grad(): @@ -323,6 +337,16 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): total_kl_loss += kl_loss.item() else: outputs = model(inputs) + + if "atlas" in data: + if use_cuda: + atlas_data = data["atlas"].cuda() + else: + atlas_data = data["atlas"] + atlas_output = outputs[:, -atlas_data.size(1)::] + outputs = outputs[:, :-atlas_data.size(1):] + total_atlas_loss += torch.nn.MSELoss(reduction="sum")(atlas_output, atlas_data) + if use_labels: loss = criterion(outputs, labels) total_loss += loss.item() @@ -353,6 +377,7 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): results_df.predicted_label.values.astype(int)) metrics_dict['total_loss'] = total_loss metrics_dict['total_kl_loss'] = total_kl_loss + metrics_dict['total_atlas_loss'] = total_atlas_loss torch.cuda.empty_cache() return results_df, metrics_dict diff --git a/clinicadl/clinicadl/tools/deep_learning/data.py b/clinicadl/clinicadl/tools/deep_learning/data.py index a70c5e512..43c58e519 100644 --- a/clinicadl/clinicadl/tools/deep_learning/data.py +++ b/clinicadl/clinicadl/tools/deep_learning/data.py @@ -3,7 +3,7 @@ import torch import pandas as pd import numpy as np -from os import path +from os import path, listdir from torch.utils.data import Dataset, sampler import torchvision.transforms as transforms import abc @@ -22,7 +22,8 @@ class MRIDataset(Dataset): def __init__(self, caps_directory, data_file, preprocessing, transformations, labels, - augmentation_transformations=None, multi_cohort=False): + 
augmentation_transformations=None, multi_cohort=False, + atlas=None, group=None, merged_df=None): self.caps_dict = self.create_caps_dict(caps_directory, multi_cohort) self.transformations = transformations self.augmentation_transformations = augmentation_transformations @@ -77,6 +78,19 @@ def __init__(self, caps_directory, data_file, "If you want to run a binary classification please change the labels involved." % (unique_diagnoses, unique_codes)) + self.merged_df = merged_df + if merged_df is not None and "participant_id" in merged_df.columns.values: + self.merged_df.set_index(["participant_id", "session_id"], inplace=True) + + self.atlas = atlas + self.group_dict = None + if self.atlas is not None and group is None: + self.group_dict = self.create_group_dict(caps_directory, multi_cohort, group) + if self.atlas is not None and self.merged_df is not None: + filtered_columns = [col for col in merged_df.columns.values + if "t1-volume" in col and atlas in col] + self.merged_df = self.merged_df[filtered_columns] + self.elem_per_image = self.num_elem_per_image() self.size = self[0]['image'].size() @@ -106,6 +120,40 @@ def create_caps_dict(caps_directory, multi_cohort): return caps_dict + @staticmethod + def create_group_dict(caps_directory, multi_cohort, group): + if multi_cohort: + if not caps_directory.endswith('.tsv'): + raise ValueError('If multi_cohort is given, the caps_dir argument should be a path to a TSV file.') + else: + caps_df = pd.read_csv(caps_directory, sep="\t") + check_multi_cohort_tsv(caps_df, 'CAPS') + if "group_label" not in caps_df.columns and group is None: + raise ValueError('When atlas intensities are involved the group_label column must be filled ' + 'in the CAPS TSV file.') + group_dict = dict() + for idx in range(len(caps_df)): + cohort = caps_df.loc[idx, 'cohort'] + if group is None: + group_label = f"group-{caps_df.loc[idx, 'group_label']}" + else: + group_label = f"group-{group}" + group_dict[cohort] = group_label + else: + if group is 
None: + groups_list = listdir(path.join(caps_directory, "groups")) + if len(groups_list) == 0: + raise ValueError("A commun group could not be found for the CAPS folder wanted.") + elif len(groups_list) > 1: + raise ValueError(f"Several groups were found for the CAPS folder wanted {groups_list}. " + "Please precise which group should be used.") + else: + group_dict = {'single': groups_list[0]} + else: + group_dict = {'single': f"group-{group}"} + + return group_dict + def _get_path(self, participant, session, cohort, mode="image"): if cohort not in self.caps_dict.keys(): @@ -135,6 +183,33 @@ def _get_path(self, participant, session, cohort, mode="image"): return image_path + def _get_statistics_df(self, participant, session, cohort): + if cohort not in self.caps_dict.keys(): + raise ValueError('Cohort names in labels and CAPS definitions do not match.') + + if self.merged_df is None: + + statistics_path = path.join( + self.caps_dict[cohort], + 'subjects', + participant, + session, + "t1", + "spm", + "dartel", + self.group_dict[cohort], + "atlas_statistics", + f"{participant}_{session}_T1w_segm-graymatter_space-Ixi549Space_modulated-on_probability_space-{self.atlas}_map-graymatter_statistics.tsv" + ) + + if not path.exists(statistics_path): + raise ValueError(f"Last step of t1-volume with {self.group_dict[cohort]} was not run on {participant} | {session}") + + return pd.read_csv(statistics_path, sep="\t", usecols=["mean_scalar"], dtype=np.float32, squeeze=True) + + else: + return self.merged_df.loc[(participant, session)] + def _get_meta_data(self, idx): image_idx = idx // self.elem_per_image participant = self.df.loc[image_idx, 'participant_id'] @@ -180,6 +255,13 @@ def _get_full_image(self): return image + def len_atlas(self): + example_data = self[0] + if "atlas" in example_data: + return len(example_data["atlas"]) + else: + return 0 + @abc.abstractmethod def __getitem__(self, idx): pass @@ -202,7 +284,8 @@ class MRIDatasetImage(MRIDataset): def __init__(self, 
caps_directory, data_file, preprocessing='t1-linear', train_transformations=None, - labels=True, all_transformations=None, multi_cohort=False): + labels=True, all_transformations=None, multi_cohort=False, + atlas=None, merged_df=None): """ Args: caps_directory (string): Directory of all the images. @@ -212,13 +295,16 @@ def __init__(self, caps_directory, data_file, labels (bool): If True the diagnosis will be extracted from the given DataFrame. all_transformations (callable, options): Optional transform to be applied during training and evaluation. multi_cohort (bool): If True caps_directory is the path to a TSV file linking cohort names and paths. + atlas (str): name of an atlas if predicting the regional intensities. + merged_df (DataFrame): DataFrame of all TSV files needed for atlas intensities prediction. """ self.elem_index = None self.mode = "image" super().__init__(caps_directory, data_file, preprocessing, augmentation_transformations=train_transformations, labels=labels, - transformations=all_transformations, multi_cohort=multi_cohort) + transformations=all_transformations, multi_cohort=multi_cohort, + atlas=atlas, merged_df=merged_df) def __getitem__(self, idx): participant, session, cohort, _, label = self._get_meta_data(idx) @@ -235,6 +321,11 @@ def __getitem__(self, idx): sample = {'image': image, 'label': label, 'participant_id': participant, 'session_id': session, 'image_path': image_path} + if self.atlas is not None: + atlas_df = self._get_statistics_df(participant, session, cohort) + atlas_pt = torch.from_numpy(atlas_df.values).float() + sample['atlas'] = atlas_pt + return sample def num_elem_per_image(self): @@ -245,7 +336,7 @@ class MRIDatasetPatch(MRIDataset): def __init__(self, caps_directory, data_file, patch_size, stride_size, train_transformations=None, prepare_dl=False, patch_index=None, preprocessing="t1-linear", labels=True, all_transformations=None, - multi_cohort=False): + multi_cohort=False, atlas=None, merged_df=None): """ Args: 
caps_directory (string): Directory of all the images. @@ -260,6 +351,8 @@ def __init__(self, caps_directory, data_file, patch_size, stride_size, train_tra labels (bool): If True the diagnosis will be extracted from the given DataFrame. all_transformations (callable, options): Optional transform to be applied during training and evaluation. multi_cohort (bool): If True caps_directory is the path to a TSV file linking cohort names and paths. + atlas (str): name of an atlas if predicting the regional intensities. + merged_df (DataFrame): DataFrame of all TSV files needed for atlas intensities prediction. """ if preprocessing == "shepplogan": @@ -271,7 +364,8 @@ def __init__(self, caps_directory, data_file, patch_size, stride_size, train_tra self.prepare_dl = prepare_dl super().__init__(caps_directory, data_file, preprocessing, augmentation_transformations=train_transformations, labels=labels, - transformations=all_transformations, multi_cohort=multi_cohort) + transformations=all_transformations, multi_cohort=multi_cohort, + atlas=atlas, merged_df=merged_df) def __getitem__(self, idx): participant, session, cohort, patch_idx, label = self._get_meta_data(idx) @@ -297,6 +391,11 @@ def __getitem__(self, idx): sample = {'image': image, 'label': label, 'participant_id': participant, 'session_id': session, 'patch_id': patch_idx} + if self.atlas is not None: + atlas_df = self._get_statistics_df(participant, session, cohort) + atlas_pt = torch.from_numpy(atlas_df.mean_scalar.values).float() + sample['atlas'] = atlas_pt + return sample def num_elem_per_image(self): @@ -334,7 +433,7 @@ class MRIDatasetRoi(MRIDataset): def __init__(self, caps_directory, data_file, roi_list=None, cropped_roi=True, roi_index=None, preprocessing="t1-linear", train_transformations=None, prepare_dl=False, labels=True, - all_transformations=None, multi_cohort=False): + all_transformations=None, multi_cohort=False, atlas=None, merged_df=None): """ Args: caps_directory (string): Directory of all the 
images. @@ -349,6 +448,8 @@ def __init__(self, caps_directory, data_file, roi_list=None, cropped_roi=True, r labels (bool): If True the diagnosis will be extracted from the given DataFrame. all_transformations (callable, options): Optional transform to be applied during training and evaluation. multi_cohort (bool): If True caps_directory is the path to a TSV file linking cohort names and paths. + atlas (str): name of an atlas if predicting the regional intensities. + merged_df (DataFrame): DataFrame of all TSV files needed for atlas intensities prediction. """ if preprocessing == "shepplogan": @@ -360,10 +461,17 @@ def __init__(self, caps_directory, data_file, roi_list=None, cropped_roi=True, r self.prepare_dl = prepare_dl self.mask_list = self.find_masks(caps_directory, preprocessing) super().__init__(caps_directory, data_file, preprocessing, augmentation_transformations=train_transformations, - labels=labels, transformations=all_transformations, multi_cohort=multi_cohort) + labels=labels, transformations=all_transformations, multi_cohort=multi_cohort, + atlas=atlas, merged_df=merged_df) def __getitem__(self, idx): + from time import time + + t0 = time() participant, session, cohort, roi_idx, label = self._get_meta_data(idx) + t1 = time( + ) + print(f"get meta data {t1 - t0}") if self.prepare_dl: if self.roi_list is None: @@ -380,6 +488,8 @@ def __getitem__(self, idx): image_path = self._get_path(participant, session, cohort, "image") image = torch.load(image_path) patch = self.extract_roi_from_mri(image, roi_idx) + t2 = time() + print(f"get roi {t2 - t1}") if self.transformations: patch = self.transformations(patch) @@ -387,10 +497,22 @@ def __getitem__(self, idx): if self.augmentation_transformations and not self.eval_mode: patch = self.augmentation_transformations(patch) + t3 = time() + print(f"transformations {t3 - t2}") + sample = {'image': patch, 'label': label, 'participant_id': participant, 'session_id': session, 'roi_id': roi_idx} + t4 = time() + 
print(f"sample {t4 - t3}") + if self.atlas is not None: + atlas_df = self._get_statistics_df(participant, session, cohort) + atlas_pt = torch.from_numpy(atlas_df.values).float() + sample['atlas'] = atlas_pt + t5 = time() + print(f"get atlas {t5 - t4}") + return sample def num_elem_per_image(self): @@ -513,7 +635,7 @@ class MRIDatasetSlice(MRIDataset): def __init__(self, caps_directory, data_file, slice_index=None, preprocessing="t1-linear", train_transformations=None, mri_plane=0, prepare_dl=False, discarded_slices=20, mixed=False, labels=True, all_transformations=None, - multi_cohort=False): + multi_cohort=False, atlas=None, merged_df=None): """ Args: caps_directory (string): Directory of all the images. @@ -531,6 +653,8 @@ def __init__(self, caps_directory, data_file, slice_index=None, preprocessing="t labels (bool): If True the diagnosis will be extracted from the given DataFrame. all_transformations (callable, options): Optional transform to be applied during training and evaluation. multi_cohort (bool): If True caps_directory is the path to a TSV file linking cohort names and paths. + atlas (str): name of an atlas if predicting the regional intensities. + merged_df (DataFrame): DataFrame of all TSV files needed for atlas intensities prediction. 
""" # Rename MRI plane if preprocessing == "shepplogan": @@ -560,7 +684,8 @@ def __init__(self, caps_directory, data_file, slice_index=None, preprocessing="t self.prepare_dl = prepare_dl super().__init__(caps_directory, data_file, preprocessing, augmentation_transformations=train_transformations, labels=labels, - transformations=all_transformations, multi_cohort=multi_cohort) + transformations=all_transformations, multi_cohort=multi_cohort, + atlas=atlas, merged_df=merged_df) def __getitem__(self, idx): participant, session, cohort, slice_idx, label = self._get_meta_data(idx) @@ -587,6 +712,11 @@ def __getitem__(self, idx): 'participant_id': participant, 'session_id': session, 'slice_id': slice_idx} + if self.atlas is not None: + atlas_df = self._get_statistics_df(participant, session, cohort) + atlas_pt = torch.from_numpy(atlas_df.mean_scalar.values).float() + sample['atlas'] = atlas_pt + return sample def num_elem_per_image(self): @@ -642,6 +772,11 @@ def return_dataset(mode, input_dir, data_df, preprocessing, if cnn_index is not None and mode in ["image"]: raise ValueError("Multi-CNN is not implemented for %s mode." 
% mode) + if params.merged_tsv_path is not None: + merged_df = pd.read_csv(params.merged_tsv_path, sep="\t") + else: + merged_df = None + if mode == "image": return MRIDatasetImage( input_dir, @@ -650,7 +785,9 @@ def return_dataset(mode, input_dir, data_df, preprocessing, train_transformations=train_transformations, all_transformations=all_transformations, labels=labels, - multi_cohort=multi_cohort + multi_cohort=multi_cohort, + atlas=params.predict_atlas_intensities, + merged_df=merged_df ) elif mode == "patch": return MRIDatasetPatch( @@ -664,7 +801,9 @@ def return_dataset(mode, input_dir, data_df, preprocessing, prepare_dl=prepare_dl, patch_index=cnn_index, labels=labels, - multi_cohort=multi_cohort + multi_cohort=multi_cohort, + atlas=params.predict_atlas_intensities, + merged_df=merged_df ) elif mode == "roi": return MRIDatasetRoi( @@ -678,7 +817,9 @@ def return_dataset(mode, input_dir, data_df, preprocessing, prepare_dl=prepare_dl, roi_index=cnn_index, labels=labels, - multi_cohort=multi_cohort + multi_cohort=multi_cohort, + atlas=params.predict_atlas_intensities, + merged_df=merged_df ) elif mode == "slice": return MRIDatasetSlice( @@ -692,7 +833,9 @@ def return_dataset(mode, input_dir, data_df, preprocessing, discarded_slices=params.discarded_slices, slice_index=cnn_index, labels=labels, - multi_cohort=multi_cohort + multi_cohort=multi_cohort, + atlas=params.predict_atlas_intensities, + merged_df=merged_df ) else: raise ValueError("Mode %s is not implemented." 
% mode) diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index 3648acbaa..2f78c3505 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -62,6 +62,7 @@ def translate_parameters(args): args.num_workers = args.nproc args.optimizer = "Adam" args.loss = "default" + args.atlas = args.predict_atlas_intensities if hasattr(args, "caps_dir"): args.input_dir = args.caps_dir @@ -252,6 +253,15 @@ def read_json(options, json_path=None, test=False): if not hasattr(options, 'multi_cohort'): options.multi_cohort = False + if not hasattr(options, "predict_atlas_intensities"): + options.predict_atlas_intensities = None + + if not hasattr(options, "merged_tsv_path"): + options.merged_tsv_path = None + + if not hasattr(options, "atlas_weight"): + options.atlas_weight = 1 + return options diff --git a/clinicadl/clinicadl/tools/deep_learning/models/__init__.py b/clinicadl/clinicadl/tools/deep_learning/models/__init__.py index dc9d94fa8..59a4b79df 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/__init__.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/__init__.py @@ -6,21 +6,27 @@ from .random import RandomArchitecture -def create_model(options, initial_shape): +def create_model(options, initial_shape, len_atlas=0): """ Creates model object from the model_name. - :param options: (Namespace) arguments needed to create the model. - :param initial_shape: (array-like) shape of the input data. - :return: (Module) the model object + Args: + options: (Namespace) arguments needed to create the model. + initial_shape: (array-like) shape of the input data. 
+ len_atlas: (int) length of the atlas in case of double prediction + + Returns: + (Module) the model object """ if not hasattr(options, "model"): model = RandomArchitecture(options.convolutions, options.n_fcblocks, initial_shape, - options.dropout, options.network_normalization, n_classes=2) + options.dropout, options.network_normalization, + n_classes=2 + len_atlas) else: try: - model = eval(options.model)(dropout=options.dropout) + model = eval(options.model)(dropout=options.dropout, + n_classes=2 + len_atlas) except NameError: raise NotImplementedError( 'The model wanted %s has not been implemented.' % options.model) @@ -55,9 +61,9 @@ def create_autoencoder(options, initial_shape, difference=0): return decoder -def init_model(options, initial_shape, autoencoder=False): +def init_model(options, initial_shape, autoencoder=False, len_atlas=0): - model = create_model(options, initial_shape) + model = create_model(options, initial_shape, len_atlas=len_atlas) if autoencoder: model = AutoEncoder(model) diff --git a/clinicadl/clinicadl/tools/deep_learning/models/image_level.py b/clinicadl/clinicadl/tools/deep_learning/models/image_level.py index bc05f25d9..0ba14f082 100755 --- a/clinicadl/clinicadl/tools/deep_learning/models/image_level.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/image_level.py @@ -15,7 +15,7 @@ class Conv5_FC3(nn.Module): Image level architecture used on Minimal preprocessing """ - def __init__(self, dropout=0.5): + def __init__(self, dropout=0.5, n_classes=2): super(Conv5_FC3, self).__init__() self.features = nn.Sequential( @@ -56,7 +56,7 @@ def __init__(self, dropout=0.5): nn.Linear(1300, 50), nn.ReLU(), - nn.Linear(50, 2) + nn.Linear(50, n_classes) ) @@ -75,7 +75,7 @@ class VConv5_FC3(nn.Module): Image level architecture used on Minimal preprocessing """ - def __init__(self, dropout=0.5): + def __init__(self, dropout=0.5, n_classes=2): super(VConv5_FC3, self).__init__() self.features = nn.Sequential( @@ -120,7 +120,7 @@ def 
__init__(self, dropout=0.5): nn.Linear(1300, 50), nn.ReLU(), - nn.Linear(50, 2) + nn.Linear(50, n_classes) ) @@ -148,7 +148,7 @@ class Conv5_FC3_mni(nn.Module): Image level architecture used on Extensive preprocessing """ - def __init__(self, dropout=0.5): + def __init__(self, dropout=0.5, n_classes=2): super(Conv5_FC3_mni, self).__init__() self.features = nn.Sequential( @@ -189,7 +189,7 @@ def __init__(self, dropout=0.5): nn.Linear(1300, 50), nn.ReLU(), - nn.Linear(50, 2) + nn.Linear(50, n_classes) ) @@ -208,7 +208,7 @@ class Conv6_FC3(nn.Module): Image level architecture used on Minimal preprocessing """ - def __init__(self, dropout=0.5): + def __init__(self, dropout=0.5, n_classes=2): super(Conv6_FC3, self).__init__() self.features = nn.Sequential( @@ -253,7 +253,7 @@ def __init__(self, dropout=0.5): nn.Linear(1000, 50), nn.ReLU(), - nn.Linear(50, 2) + nn.Linear(50, n_classes) ) diff --git a/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py b/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py index 0fd804951..42ca659f9 100755 --- a/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py @@ -6,6 +6,7 @@ from torch import nn from .modules import PadMaxPool3d, Flatten + class Conv4_FC3(nn.Module): """ Classifier for a binary classification task diff --git a/clinicadl/clinicadl/tools/deep_learning/models/random.py b/clinicadl/clinicadl/tools/deep_learning/models/random.py index 84c26f291..b6da1aa08 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/random.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/random.py @@ -46,6 +46,7 @@ def random_sampling(rs_options, options): sampling_dict = { "accumulation_steps": "randint", + "atlas_weight": "uniform", "baseline": "choice", "caps_dir": "fixed", "channels_limit": "fixed", @@ -55,6 +56,7 @@ def random_sampling(rs_options, options): "epochs": "fixed", "learning_rate": "exponent", "loss": "choice", + 
"merged_tsv_path": "fixed", "mode": "choice", "multi_cohort": "fixed", "n_fcblocks": "randint", @@ -63,6 +65,7 @@ def random_sampling(rs_options, options): "optimizer": "choice", "patience": "fixed", "preprocessing": "choice", + "predict_atlas_intensities": "fixed", "sampler": "choice", "tolerance": "fixed", "transfer_learning_path": "choice", diff --git a/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py b/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py index d9b5ce9f6..f953d795e 100755 --- a/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py @@ -38,7 +38,7 @@ def resnet18(**kwargs): # add a fc layer on top of the transfer_learning model and a softmax classifier model.add_module('drop_out', nn.Dropout(p=kwargs["dropout"])) - model.add_module('fc_out', nn.Linear(1000, 2)) + model.add_module('fc_out', nn.Linear(1000, kwargs["n_classes"])) return model diff --git a/clinicadl/clinicadl/train/random_search.py b/clinicadl/clinicadl/train/random_search.py index eb0937a14..68ca7c3a6 100755 --- a/clinicadl/clinicadl/train/random_search.py +++ b/clinicadl/clinicadl/train/random_search.py @@ -24,6 +24,7 @@ def check_and_complete(rs_options): default_values = { "accumulation_steps": 1, + "atlas_weight": 1, "baseline": False, "channels_limit": 512, "d_reduction": "MaxPooling", @@ -32,6 +33,7 @@ def check_and_complete(rs_options): "dropout": 0, "learning_rate": 4, "loss": "default", + "merged_tsv_path": None, "multi_cohort": False, "n_conv": 1, "network_normalization": "BatchNorm", @@ -39,6 +41,7 @@ def check_and_complete(rs_options): "unnormalize": False, "patch_size": 50, "patience": 0, + "predict_atlas_intensities": None, "roi_list": None, "selection_threshold": 0, "slice_direction": 0, diff --git a/clinicadl/clinicadl/train/train_multiCNN.py b/clinicadl/clinicadl/train/train_multiCNN.py index 142104b4d..78b1ecf8c 100755 --- a/clinicadl/clinicadl/train/train_multiCNN.py 
+++ b/clinicadl/clinicadl/train/train_multiCNN.py @@ -97,7 +97,7 @@ def train_multi_cnn(params): # Initialize the model main_logger.info('Initialization of the model %i' % cnn_index) - model = create_model(params, initial_shape=data_train.size) + model = create_model(params, initial_shape=data_train.size, len_atlas=data_train.len_atlas()) model = transfer_learning(model, fi, source_path=params.transfer_learning_path, gpu=params.gpu, selection=params.transfer_learning_selection, logger=main_logger) diff --git a/clinicadl/clinicadl/train/train_singleCNN.py b/clinicadl/clinicadl/train/train_singleCNN.py index 327f09a97..957785572 100755 --- a/clinicadl/clinicadl/train/train_singleCNN.py +++ b/clinicadl/clinicadl/train/train_singleCNN.py @@ -89,7 +89,7 @@ def train_single_cnn(params): # Initialize the model main_logger.info('Initialization of the model') - model = init_model(params, initial_shape=data_train.size) + model = init_model(params, initial_shape=data_train.size, len_atlas=data_train.len_atlas()) model = transfer_learning(model, fi, source_path=params.transfer_learning_path, gpu=params.gpu, selection=params.transfer_learning_selection, logger=main_logger) From d2dcb3e54b23e074b2384caafb8515de507382a8 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Tue, 16 Mar 2021 12:02:11 +0100 Subject: [PATCH 04/37] Add random search analysis file --- .../classify/random_search_analysis.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 clinicadl/clinicadl/classify/random_search_analysis.py diff --git a/clinicadl/clinicadl/classify/random_search_analysis.py b/clinicadl/clinicadl/classify/random_search_analysis.py new file mode 100644 index 000000000..11c01851d --- /dev/null +++ b/clinicadl/clinicadl/classify/random_search_analysis.py @@ -0,0 +1,48 @@ +""" +Produces a tsv file to analyze the performance of one launch of the random search. 
+""" +import os +from os import path +import pandas as pd +import numpy as np +from warnings import warn + + +def random_search_analysis(launch_dir, splits): + + jobs_list = os.listdir(launch_dir) + jobs_list = [job for job in jobs_list if job[0:4] == 'job_'] + + for selection in ['balanced_accuracy', 'loss']: + + columns = ['run', '>0.5', '>0.55', '>0.6', '>0.65', '>0.7', '>0.75', '>0.8', '>0.85', '>0.9', '>0.95', 'folds'] + output_df = pd.DataFrame(columns=columns) + thresholds = np.arange(0.5, 1, 0.05) + thresholds = np.insert(thresholds, 0, 0) + + for job in jobs_list: + + valid_accuracies = [] + for fold in splits: + performance_path = path.join(launch_dir, job, f'fold-{fold}', 'cnn_classification', f'best_{selection}') + if path.exists(performance_path): + valid_df = pd.read_csv(path.join(performance_path, 'validation_image_level_metrics.tsv'), sep='\t') + valid_accuracies.append(valid_df.loc[0, 'balanced_accuracy'].astype(float)) + else: + warn(f"The fold {fold} doesn't exist for job {job}") + + # Find the mean value of all existing folds + if len(valid_accuracies) > 0: + bac_valid = np.mean(valid_accuracies) + row = (bac_valid > thresholds).astype(int) + else: + row = np.zeros(len(thresholds), dtype=int) + row = np.concatenate([row, [len(valid_accuracies)]]) + row_df = pd.DataFrame(index=[job], data=row.reshape(1, -1), columns=columns) + output_df = pd.concat([output_df, row_df]) + + total_df = pd.DataFrame(np.array(output_df.sum()).reshape(1, -1), columns=columns, index=['total']) + output_df = pd.concat([output_df, total_df]) + output_df.sort_index(inplace=True) + + output_df.to_csv(path.join(launch_dir, "analysis_" + selection + '.tsv'), sep='\t') \ No newline at end of file From 38e1015afa7f15a5de26c0bd9b6a3040acf49beb Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Wed, 17 Mar 2021 12:00:05 +0100 Subject: [PATCH 05/37] Add random search tools --- .../classify/random_search_analysis.py | 9 +- clinicadl/clinicadl/cli.py | 130 ++++++++++++++---- 
clinicadl/clinicadl/resume/__init__.py | 0 .../clinicadl/resume/automatic_resume.py | 81 +++++++++++ .../clinicadl/resume/resume_autoencoder.py | 97 +++++++++++++ .../clinicadl/resume/resume_single_CNN.py | 87 ++++++++++++ .../tools/deep_learning/autoencoder_utils.py | 13 +- .../tools/deep_learning/cnn_utils.py | 13 +- .../tools/deep_learning/models/iotools.py | 4 +- clinicadl/clinicadl/train/resume_CNN.py | 99 ------------- .../clinicadl/train/resume_autoencoder.py | 109 --------------- .../clinicadl/train/train_autoencoder.py | 5 +- clinicadl/clinicadl/train/train_multiCNN.py | 5 +- clinicadl/clinicadl/train/train_singleCNN.py | 5 +- 14 files changed, 404 insertions(+), 253 deletions(-) create mode 100644 clinicadl/clinicadl/resume/__init__.py create mode 100644 clinicadl/clinicadl/resume/automatic_resume.py create mode 100644 clinicadl/clinicadl/resume/resume_autoencoder.py create mode 100644 clinicadl/clinicadl/resume/resume_single_CNN.py delete mode 100644 clinicadl/clinicadl/train/resume_CNN.py delete mode 100644 clinicadl/clinicadl/train/resume_autoencoder.py diff --git a/clinicadl/clinicadl/classify/random_search_analysis.py b/clinicadl/clinicadl/classify/random_search_analysis.py index 11c01851d..d9abbd76a 100644 --- a/clinicadl/clinicadl/classify/random_search_analysis.py +++ b/clinicadl/clinicadl/classify/random_search_analysis.py @@ -10,8 +10,13 @@ def random_search_analysis(launch_dir, splits): - jobs_list = os.listdir(launch_dir) - jobs_list = [job for job in jobs_list if job[0:4] == 'job_'] + if splits is None: + splits = [0] + + jobs_list = [job for job in os.listdir(launch_dir) + if path.exists(path.join(launch_dir, job, "commandline.json"))] + + print(jobs_list) for selection in ['balanced_accuracy', 'loss']: diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index 765798338..da71ae0d9 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -104,8 +104,33 @@ def generate_data_func(args): def rs_func(args): 
from .train.random_search import launch_search + from .train.train_from_model import retrain + from .classify.random_search_analysis import random_search_analysis + + if args.random_task == "generate": + launch_search(args) + elif args.random_task == "analysis": + random_search_analysis( + args.launch_dir, + args.splits + ) + elif args.random_task == "retrain": + retrain(args) + else: + raise ValueError('This task was not implemented in random-search.') + - launch_search(args) +def resume_func(args): + from .resume.automatic_resume import automatic_resume + + automatic_resume( + model_path=args.model_path, + gpu=not args.use_cpu, + batch_size=args.batch_size, + num_workers=args.nproc, + evaluation_steps=args.evaluation_steps, + verbose=args.verbose + ) # Function to dispatch training to corresponding function @@ -570,27 +595,28 @@ def preprocessing_help(args): qc_volume_parser.set_defaults(func=qc_func) # random search parsers - rs_generate_parser = subparser.add_parser( + rs_parser = subparser.add_parser( 'random-search', parents=[parent_parser], help='Generate random networks to explore hyper parameters space.', formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - # rs_subparsers = rs_parser.add_subparsers( - # title='''Possibilities for random network training''', - # description='''You can generate and train a new random network, - # or relaunch a previous random job with some alterations.''', - # dest='random_task', - # help='''****** Possible tasks ******''' - # ) - # rs_subparsers.required = True - - # rs_generate_parser = rs_subparsers.add_parser( - # 'generate', - # parents=[parent_parser], - # help='Sample a new network and train it.', - # formatter_class=argparse.ArgumentDefaultsHelpFormatter - # ) + rs_subparsers = rs_parser.add_subparsers( + title='''Possibilities for random network training''', + description='''You can generate and train a new random network, + or relaunch a previous random job with some alterations.''', + dest='random_task', + 
help='''****** Possible tasks ******''' + ) + + rs_subparsers.required = True + + rs_generate_parser = rs_subparsers.add_parser( + 'generate', + parents=[parent_parser], + help='Sample a new network and train it.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) rs_pos_group = rs_generate_parser.add_argument_group( TRAIN_CATEGORIES["POSITIONAL"] ) @@ -631,13 +657,69 @@ def preprocessing_help(args): rs_generate_parser.set_defaults(func=rs_func) - # retrain_parent_parser = return_train_parent_parser(retrain=True) - # rs_retrain_parser = rs_subparsers.add_parser( - # 'retrain', - # parents=[parent_parser, retrain_parent_parser], - # help='Train a network previously created by generate.', - # formatter_class=argparse.ArgumentDefaultsHelpFormatter - # ) + rs_analysis_parser = rs_subparsers.add_parser( + 'analysis', + help="Performs the analysis of all jobs in launch_dir", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + rs_analysis_parser.add_argument( + "launch_dir", + type=str, + help="Directory containing the random_search.json file." + ) + + rs_analysis_parser.add_argument( + "--splits", + type=int, nargs="+", default=None, + help="List of the folds used for the analysis. Default will perform only the first fold." + ) + + rs_analysis_parser.set_defaults(func=rs_func) + + retrain_parent_parser = return_train_parent_parser(retrain=True) + rs_retrain_parser = rs_subparsers.add_parser( + 'retrain', + parents=[parent_parser, retrain_parent_parser], + help='Train a network previously created by generate.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + rs_retrain_parser.set_defaults(func=rs_func) + + resume_parser = subparser.add_parser( + 'resume', + parents=[parent_parser], + help='Resume all jobs prematurely ended in launch_dir.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + resume_parser.add_argument( + "model_path", + type=str, + help="Directory containing the random_search.json file." 
+ ) + resume_parser.add_argument( + "-np", "--nproc", + help='Number of cores used the quality check. (default=2)', + type=int, default=2 + ) + resume_parser.add_argument( + '-cpu', '--use_cpu', action='store_true', + help='If provided, will use CPU instead of GPU.', + ) + resume_parser.add_argument( + '--batch_size', + default=2, type=int, + help='Batch size for data loading. (default=2)') + resume_parser.add_argument( + '--evaluation_steps', '-esteps', + default=0, type=int, + help='Fix the number of iterations to perform before computing an evaluations. Default will only ' + 'perform one evaluation at the end of each epoch.' + ) + + resume_parser.set_defaults(func=resume_func) train_parser = subparser.add_parser( 'train', diff --git a/clinicadl/clinicadl/resume/__init__.py b/clinicadl/clinicadl/resume/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/clinicadl/clinicadl/resume/automatic_resume.py b/clinicadl/clinicadl/resume/automatic_resume.py new file mode 100644 index 000000000..42ee4ff5a --- /dev/null +++ b/clinicadl/clinicadl/resume/automatic_resume.py @@ -0,0 +1,81 @@ +""" +Automatic relaunch of jobs that were stopped before the end of training. 
+Unfinished folds are detected as they do not contain a "performances" sub-folder +""" + +import argparse +import os +from os import path + + +def automatic_resume(model_path, + gpu, + batch_size, + num_workers, + evaluation_steps, + verbose=0): + from ..tools.deep_learning.iotools import return_logger, read_json + from ..train.train_singleCNN import train_single_cnn + from ..train.train_multiCNN import train_multi_cnn + from ..train.train_autoencoder import train_autoencoder + + from .resume_single_CNN import resume_single_cnn + from .resume_autoencoder import resume_autoencoder + + logger = return_logger(verbose=verbose, name_fn="automatic resume") + + options = argparse.Namespace() + options.model_path = model_path + logger.info(f"Job being resumed: {model_path}") + + options = read_json(options) + + # Set computational parameters + options.gpu = gpu + options.batch_size = batch_size + options.num_workers = num_workers + options.evaluation_steps = evaluation_steps + + # Set verbose + options.verbose = verbose + + fold_list = sorted([int(fold.split("-")[1]) for fold in os.listdir(options.model_path) if fold[:4:] == "fold"]) + finished_folds = [fold for fold in fold_list + if "cnn_classification" in os.listdir(path.join(options.model_path, f"fold-{fold}"))] + stopped_folds = [fold for fold in fold_list if fold not in finished_folds and + "checkpoint.pth.tar" in os.listdir(path.join(options.model_path, f"fold-{fold}", "models"))] + + if options.split is None: + if options.n_splits is None: + fold_iterator = range(1) + else: + fold_iterator = range(options.n_splits) + else: + fold_iterator = options.split + + absent_folds = [fold for fold in fold_iterator if fold not in finished_folds and fold not in stopped_folds] + logger.info(f"Finished folds {finished_folds}") + logger.info(f"Stopped folds {stopped_folds}") + logger.info(f"Missing folds {absent_folds}") + + # To ensure retro-compatibility with random search + options.output_dir = options.model_path + + for fold 
in stopped_folds: + if options.network_type == "cnn": + resume_single_cnn(options, fold) + elif options.network_type == "autoencoder": + resume_autoencoder(options, fold) + else: + raise NotImplementedError(f'Resume function is not implemented for network type {options.network_type}') + + if len(absent_folds) != 0: + options.split = absent_folds + if options.network_type == "cnn": + train_single_cnn(options, erase_existing=False) + elif options.network_type == "multicnn": + train_multi_cnn(options, erase_existing=False) + elif options.network_type == "autoencoder": + train_autoencoder(options, erase_existing=False) + else: + raise NotImplementedError(f'Resume function is not implemented for network type {options.network_type}') diff --git a/clinicadl/clinicadl/resume/resume_autoencoder.py b/clinicadl/clinicadl/resume/resume_autoencoder.py new file mode 100644 index 000000000..806e44885 --- /dev/null +++ b/clinicadl/clinicadl/resume/resume_autoencoder.py @@ -0,0 +1,97 @@ +# coding: utf8 + +from os import path +import torch +from torch.utils.data import DataLoader + +from ..tools.deep_learning.data import load_data, get_transforms, return_dataset, generate_sampler +from ..tools.deep_learning.models import init_model, load_model, load_optimizer +from ..tools.deep_learning.iotools import return_logger, \ + commandline_to_json, write_requirements_version, translate_parameters +from ..tools.deep_learning.autoencoder_utils import train, get_criterion, visualize_image + + +def resume_autoencoder(params, resumed_split): + main_logger = return_logger(params.verbose, "main process") + train_logger = return_logger(params.verbose, "train") + + commandline_to_json(params, logger=main_logger) + write_requirements_version(params.output_dir) + params = translate_parameters(params) + train_transforms, all_transforms = get_transforms(params.mode, + minmaxnormalization=params.minmaxnormalization, + data_augmentation=params.data_augmentation) + + training_df, valid_df = load_data( + 
params.tsv_path, + params.diagnoses, + resumed_split, + n_splits=params.n_splits, + baseline=params.baseline, + logger=main_logger, + multi_cohort=params.multi_cohort + ) + + data_train = return_dataset(params.mode, params.input_dir, training_df, params.preprocessing, + train_transformations=train_transforms, all_transformations=all_transforms, + params=params) + data_valid = return_dataset(params.mode, params.input_dir, valid_df, params.preprocessing, + train_transformations=train_transforms, all_transformations=all_transforms, + params=params) + + train_sampler = generate_sampler(data_train, params.sampler) + + train_loader = DataLoader( + data_train, + batch_size=params.batch_size, + sampler=train_sampler, + num_workers=params.num_workers, + pin_memory=True + ) + + valid_loader = DataLoader( + data_valid, + batch_size=params.batch_size, + shuffle=False, + num_workers=params.num_workers, + pin_memory=True + ) + + # Initialize the model + main_logger.info('Initialization of the model') + decoder = init_model(params, initial_shape=data_train.size, autoencoder=True) + model_dir = path.join(params.output_dir, f"fold-{resumed_split}", "models") + decoder, current_epoch = load_model(decoder, model_dir, params.gpu, 'checkpoint.pth.tar') + + params.beginning_epoch = current_epoch + 1 + + # Define criterion and optimizer + criterion = get_criterion(params.loss) + optimizer_path = path.join(params.output_dir, f"fold-{resumed_split}", "models", "optimizer.pth.tar") + optimizer = load_optimizer(optimizer_path, decoder) + + # Define output directories + log_dir = path.join( + params.output_dir, f'fold-{resumed_split}', 'tensorboard_logs') + model_dir = path.join( + params.output_dir, f'fold-{resumed_split}', 'models') + visualization_dir = path.join( + params.output_dir, f'fold-{resumed_split}', 'autoencoder_reconstruction') + + main_logger.debug('Beginning the training task') + train(decoder, train_loader, valid_loader, criterion, optimizer, False, + log_dir, model_dir, 
params, train_logger) + + if params.visualization: + best_decoder, _ = load_model(decoder, path.join(model_dir, "best_loss"), + params.gpu, filename='model_best.pth.tar') + nb_images = data_train.size.elem_per_image + if nb_images <= 2: + nb_images *= 3 + visualize_image(best_decoder, valid_loader, path.join(visualization_dir, "validation"), + nb_images=nb_images) + visualize_image(best_decoder, train_loader, path.join(visualization_dir, "train"), + nb_images=nb_images) + del decoder + torch.cuda.empty_cache() + diff --git a/clinicadl/clinicadl/resume/resume_single_CNN.py b/clinicadl/clinicadl/resume/resume_single_CNN.py new file mode 100644 index 000000000..43bfa6ecb --- /dev/null +++ b/clinicadl/clinicadl/resume/resume_single_CNN.py @@ -0,0 +1,87 @@ +# coding: utf8 + +from os import path +from torch.utils.data import DataLoader + +from ..train.train_singleCNN import test_single_cnn +from ..tools.deep_learning.data import load_data, get_transforms, return_dataset, generate_sampler +from ..tools.deep_learning.models import init_model, load_model, load_optimizer +from ..tools.deep_learning.iotools import return_logger, \ + commandline_to_json, write_requirements_version, translate_parameters +from ..tools.deep_learning.cnn_utils import train, get_criterion + + +def resume_single_cnn(params, resumed_split): + main_logger = return_logger(params.verbose, "main process") + train_logger = return_logger(params.verbose, "train") + eval_logger = return_logger(params.verbose, "final evaluation") + + commandline_to_json(params, logger=main_logger) + write_requirements_version(params.output_dir) + params = translate_parameters(params) + train_transforms, all_transforms = get_transforms(params.mode, + minmaxnormalization=params.minmaxnormalization, + data_augmentation=params.data_augmentation) + + training_df, valid_df = load_data( + params.tsv_path, + params.diagnoses, + resumed_split, + n_splits=params.n_splits, + baseline=params.baseline, + logger=main_logger, + 
multi_cohort=params.multi_cohort + ) + + data_train = return_dataset(params.mode, params.input_dir, training_df, params.preprocessing, + train_transformations=train_transforms, all_transformations=all_transforms, + params=params) + data_valid = return_dataset(params.mode, params.input_dir, valid_df, params.preprocessing, + train_transformations=train_transforms, all_transformations=all_transforms, + params=params) + + train_sampler = generate_sampler(data_train, params.sampler) + + train_loader = DataLoader( + data_train, + batch_size=params.batch_size, + sampler=train_sampler, + num_workers=params.num_workers, + pin_memory=True + ) + + valid_loader = DataLoader( + data_valid, + batch_size=params.batch_size, + shuffle=False, + num_workers=params.num_workers, + pin_memory=True + ) + + # Initialize the model + main_logger.info('Initialization of the model') + model = init_model(params, initial_shape=data_train.size, len_atlas=data_train.len_atlas()) + model_dir = path.join(params.output_dir, f"fold-{resumed_split}", "models") + model, current_epoch = load_model(model, model_dir, params.gpu, 'checkpoint.pth.tar') + + params.beginning_epoch = current_epoch + 1 + + # Define criterion and optimizer + criterion = get_criterion(params.loss) + optimizer_path = path.join(params.output_dir, f"fold-{resumed_split}", "models", "optimizer.pth.tar") + optimizer = load_optimizer(optimizer_path, model) + + # Define output directories + log_dir = path.join( + params.output_dir, f'fold-{resumed_split}', 'tensorboard_logs') + model_dir = path.join( + params.output_dir, f'fold-{resumed_split}', 'models') + + main_logger.debug('Beginning the training task') + train(model, train_loader, valid_loader, criterion, + optimizer, True, log_dir, model_dir, params, train_logger) + + test_single_cnn(model, params.output_dir, train_loader, "train", + resumed_split, criterion, params.mode, eval_logger, params.selection_threshold, gpu=params.gpu) + test_single_cnn(model, params.output_dir, 
valid_loader, "validation", + resumed_split, criterion, params.mode, eval_logger, params.selection_threshold, gpu=params.gpu) diff --git a/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py b/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py index d6f4e5665..c364d07fa 100644 --- a/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py +++ b/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py @@ -34,9 +34,6 @@ def train(decoder, train_loader, valid_loader, criterion, optimizer, resume, """ from tensorboardX import SummaryWriter - columns = ['epoch', 'iteration', 'time', 'loss_train', 'loss_valid'] - filename = os.path.join(os.path.dirname(log_dir), 'training.tsv') - if logger is None: logger = logging @@ -56,10 +53,12 @@ def train(decoder, train_loader, valid_loader, criterion, optimizer, resume, if not os.path.exists(filename): raise ValueError( 'The training.tsv file of the resumed experiment does not exist.') - truncated_tsv = pd.read_csv(filename, sep='\t') - truncated_tsv.set_index(['epoch', 'iteration'], inplace=True) - truncated_tsv.drop(options.beginning_epoch, level=0, inplace=True) - truncated_tsv.to_csv(filename, index=True, sep='\t') + truncated_df = pd.read_csv(filename, sep='\t') + truncated_df.set_index(['epoch', 'iteration'], inplace=True) + epochs = [epoch for epoch, _ in truncated_df.index.values] + if options.beginning_epoch in epochs: + truncated_df.drop(options.beginning_epoch, level=0, inplace=True) + truncated_df.to_csv(filename, index=True, sep='\t') # Create writers writer_train = SummaryWriter(os.path.join(log_dir, 'train')) diff --git a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py index 05a0616e8..bf842b655 100644 --- a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py +++ b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py @@ -62,10 +62,13 @@ def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_d else: if not 
os.path.exists(filename): raise ValueError('The training.tsv file of the resumed experiment does not exist.') - truncated_tsv = pd.read_csv(filename, sep='\t') - truncated_tsv.set_index(['epoch', 'iteration'], inplace=True) - truncated_tsv.drop(options.beginning_epoch, level=0, inplace=True) - truncated_tsv.to_csv(filename, index=True, sep='\t') + truncated_df = pd.read_csv(filename, sep='\t') + truncated_df.set_index(['epoch', 'iteration'], inplace=True, drop=True) + epochs = [epoch for epoch, _ in truncated_df.index.values] + if options.beginning_epoch in epochs: + truncated_df.drop(options.beginning_epoch, level=0, inplace=True) + truncated_df.to_csv(filename, index=True, sep='\t') + assert hasattr(options, "beginning_epoch") # Create writers writer_train = SummaryWriter(os.path.join(log_dir, 'train')) @@ -345,7 +348,7 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): atlas_data = data["atlas"] atlas_output = outputs[:, -atlas_data.size(1)::] outputs = outputs[:, :-atlas_data.size(1):] - total_atlas_loss += torch.nn.MSELoss(reduction="sum")(atlas_output, atlas_data) + total_atlas_loss += torch.nn.MSELoss(reduction="sum")(atlas_output, atlas_data).item() if use_labels: loss = criterion(outputs, labels) diff --git a/clinicadl/clinicadl/tools/deep_learning/models/iotools.py b/clinicadl/clinicadl/tools/deep_learning/models/iotools.py index bc3ff5d8c..75df1fbe8 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/iotools.py @@ -66,5 +66,7 @@ def load_optimizer(optimizer_path, model): print('Loading optimizer') optimizer_dict = torch.load(optimizer_path) name = optimizer_dict["name"] - optimizer = eval("torch.optim." 
+ name)(filter(lambda x: x.requires_grad, model.parameters())) + optimizer = getattr(torch.optim, name)(filter(lambda x: x.requires_grad, model.parameters())) optimizer.load_state_dict(optimizer_dict["optimizer"]) + + return optimizer diff --git a/clinicadl/clinicadl/train/resume_CNN.py b/clinicadl/clinicadl/train/resume_CNN.py deleted file mode 100644 index 7b2e5b4e6..000000000 --- a/clinicadl/clinicadl/train/resume_CNN.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding: utf8 - -import argparse -from os import path -from time import time -import torch -from torch.utils.data import DataLoader - -from .train_singleCNN import test_single_cnn -from clinicadl.tools.deep_learning.data import MRIDataset, MinMaxNormalization, load_data -from clinicadl.tools.deep_learning import create_model, load_model, load_optimizer, read_json -from clinicadl.tools.deep_learning.cnn_utils import train - -parser = argparse.ArgumentParser(description="Argparser for Pytorch 3D CNN") - -# Mandatory arguments -parser.add_argument("model_path", type=str, - help="model selected") -parser.add_argument("split", type=int, - help="Will load the specific split wanted.") - -# Computational argument -parser.add_argument('--gpu', action='store_true', default=False, - help='Uses gpu instead of cpu if cuda is available') -parser.add_argument("--num_workers", '-w', default=1, type=int, - help='the number of batch being loaded in parallel') - - -def main(options): - - options = read_json(options) - - if options.evaluation_steps % options.accumulation_steps != 0 and options.evaluation_steps != 1: - raise Exception('Evaluation steps %d must be a multiple of accumulation steps %d' % - (options.evaluation_steps, options.accumulation_steps)) - - if options.minmaxnormalization: - transformations = MinMaxNormalization() - else: - transformations = None - - total_time = time() - - # Get the data. 
- training_tsv, valid_tsv = load_data(options.diagnosis_path, options.diagnoses, - options.split, options.n_splits, options.baseline) - - data_train = MRIDataset(options.input_dir, training_tsv, transform=transformations, preprocessing=options.preprocessing) - data_valid = MRIDataset(options.input_dir, valid_tsv, transform=transformations, preprocessing=options.preprocessing) - - # Use argument load to distinguish training and testing - train_loader = DataLoader(data_train, - batch_size=options.batch_size, - shuffle=True, - num_workers=options.num_workers, - pin_memory=True - ) - - valid_loader = DataLoader(data_valid, - batch_size=options.batch_size, - shuffle=False, - num_workers=options.num_workers, - pin_memory=True - ) - - # Initialize the model - print('Initialization of the model') - model = create_model(options, data_train.size) - model_dir = path.join(options.model_path, "best_model_dir", "CNN", "fold_" + str(options.split)) - model, current_epoch = load_model(model, model_dir, options.gpu, 'checkpoint.pth.tar') - - options.beginning_epoch = current_epoch + 1 - - # Define criterion and optimizer - criterion = torch.nn.CrossEntropyLoss() - optimizer_path = path.join(options.model_path, 'optimizer.pth.tar') - optimizer = load_optimizer(optimizer_path, model) - - # Define output directories - log_dir = path.join(options.output_dir, 'log_dir', 'fold_%i' % options.split, 'CNN') - model_dir = path.join(options.output_dir, 'best_model_dir', 'fold_%i' % options.split, 'CNN') - - print('Resuming the training task') - train(model, train_loader, valid_loader, criterion, optimizer, True, log_dir, model_dir, options) - - options.model_path = options.output_dir - test_single_cnn(train_loader, "train", options.split, criterion, options) - test_single_cnn(valid_loader, "validation", options.split, criterion, options) - - total_time = time() - total_time - print("Total time of computation: %d s" % total_time) - - -if __name__ == "__main__": - commandline = 
parser.parse_known_args() - options = commandline[0] - if commandline[1]: - print("unknown arguments: %s" % parser.parse_known_args()[1]) - main(options) diff --git a/clinicadl/clinicadl/train/resume_autoencoder.py b/clinicadl/clinicadl/train/resume_autoencoder.py deleted file mode 100644 index 424fd70ad..000000000 --- a/clinicadl/clinicadl/train/resume_autoencoder.py +++ /dev/null @@ -1,109 +0,0 @@ -# coding: utf8 - -import argparse -import torch -from os import path -from time import time -from torch.utils.data import DataLoader - -from clinicadl.tools.deep_learning.data import MRIDataset, MinMaxNormalization, load_data -from clinicadl.tools.deep_learning import load_model, create_autoencoder, load_optimizer, read_json -from clinicadl.tools.deep_learning.autoencoder_utils import train, visualize_image - -parser = argparse.ArgumentParser(description="Argparser for Pytorch 3D CNN") - -# Mandatory arguments -parser.add_argument("model_path", type=str, - help="model selected") -parser.add_argument("split", type=int, - help="Will load the specific split wanted.") - -# Optimizer arguments -parser.add_argument('--gpu', action='store_true', default=False, - help='Uses gpu instead of cpu if cuda is available') -parser.add_argument("--num_workers", '-w', default=1, type=int, - help='the number of batch being loaded in parallel') - - -def main(options): - - options = read_json(options) - - if options.evaluation_steps % options.accumulation_steps != 0 and options.evaluation_steps != 1: - raise Exception('Evaluation steps %d must be a multiple of accumulation steps %d' % - (options.evaluation_steps, options.accumulation_steps)) - - if options.minmaxnormalization: - transformations = MinMaxNormalization() - else: - transformations = None - - total_time = time() - - # Get the data. 
- training_tsv, valid_tsv = load_data(options.diagnosis_path, options.diagnoses, - options.split, options.n_splits, options.baseline) - - data_train = MRIDataset(options.input_dir, training_tsv, transform=transformations, - preprocessing=options.preprocessing) - data_valid = MRIDataset(options.input_dir, valid_tsv, transform=transformations, - preprocessing=options.preprocessing) - - # Use argument load to distinguish training and testing - train_loader = DataLoader(data_train, - batch_size=options.batch_size, - shuffle=True, - num_workers=options.num_workers, - pin_memory=True - ) - - valid_loader = DataLoader(data_valid, - batch_size=options.batch_size, - shuffle=False, - num_workers=options.num_workers, - pin_memory=True - ) - - # Initialize the model - print('Initialization of the model') - decoder = create_autoencoder(options, data_train.size) - - decoder, current_epoch = load_model(decoder, options.model_path, options.gpu, 'checkpoint.pth.tar') - if options.gpu: - decoder = decoder.cuda() - - options.beginning_epoch = current_epoch + 1 - - # Define criterion and optimizer - criterion = torch.nn.MSELoss() - optimizer_path = path.join(options.model_path, 'optimizer.pth.tar') - optimizer = load_optimizer(optimizer_path, decoder) - - # Define output directories - log_dir = path.join(options.output_dir, 'log_dir', 'fold_%i' % options.split, 'ConvAutoencoder') - visualization_dir = path.join(options.output_dir, 'visualize', 'fold_%i' % options.split) - model_dir = path.join(options.output_dir, 'best_model_dir', 'fold_%i' % options.split, 'ConvAutoencoder') - - print('Resuming the training task') - train(decoder, train_loader, valid_loader, criterion, optimizer, False, - log_dir, model_dir, options) - - if options.visualization: - print("Visualization of autoencoder reconstruction") - best_decoder, _ = load_model(decoder, path.join(model_dir, "best_loss"), - options.gpu, filename='model_best.pth.tar') - visualize_image(best_decoder, valid_loader, 
path.join(visualization_dir, "validation"), nb_images=3) - visualize_image(best_decoder, train_loader, path.join(visualization_dir, "train"), nb_images=3) - del decoder - torch.cuda.empty_cache() - - total_time = time() - total_time - print("Total time of computation: %d s" % total_time) - - -if __name__ == "__main__": - ret = parser.parse_known_args() - options = ret[0] - if ret[1]: - print("unknown arguments: %s" % parser.parse_known_args()[1]) - main(options) diff --git a/clinicadl/clinicadl/train/train_autoencoder.py b/clinicadl/clinicadl/train/train_autoencoder.py index ad38f43fe..afee4faa0 100755 --- a/clinicadl/clinicadl/train/train_autoencoder.py +++ b/clinicadl/clinicadl/train/train_autoencoder.py @@ -14,7 +14,7 @@ from ..tools.deep_learning.iotools import commandline_to_json, write_requirements_version, translate_parameters -def train_autoencoder(params): +def train_autoencoder(params, erase_existing=True): """ Trains an autoencoder and writes: - logs obtained with Tensorboard during training, @@ -28,7 +28,8 @@ def train_autoencoder(params): """ main_logger = return_logger(params.verbose, "main process") train_logger = return_logger(params.verbose, "train") - check_and_clean(params.output_dir) + if erase_existing: + check_and_clean(params.output_dir) commandline_to_json(params, logger=main_logger) write_requirements_version(params.output_dir) diff --git a/clinicadl/clinicadl/train/train_multiCNN.py b/clinicadl/clinicadl/train/train_multiCNN.py index 78b1ecf8c..43c22bc1a 100755 --- a/clinicadl/clinicadl/train/train_multiCNN.py +++ b/clinicadl/clinicadl/train/train_multiCNN.py @@ -15,7 +15,7 @@ from ..tools.deep_learning.iotools import commandline_to_json, write_requirements_version, translate_parameters -def train_multi_cnn(params): +def train_multi_cnn(params, erase_existing=True): """ Trains one CNN per patch and writes for each CNN: - logs obtained with Tensorboard during training, @@ -31,7 +31,8 @@ def train_multi_cnn(params): main_logger = 
return_logger(params.verbose, "main process") train_logger = return_logger(params.verbose, "train") eval_logger = return_logger(params.verbose, "final evaluation") - check_and_clean(params.output_dir) + if erase_existing: + check_and_clean(params.output_dir) commandline_to_json(params, logger=main_logger) write_requirements_version(params.output_dir) diff --git a/clinicadl/clinicadl/train/train_singleCNN.py b/clinicadl/clinicadl/train/train_singleCNN.py index 957785572..6b6f5e3e6 100755 --- a/clinicadl/clinicadl/train/train_singleCNN.py +++ b/clinicadl/clinicadl/train/train_singleCNN.py @@ -15,7 +15,7 @@ from ..tools.deep_learning.iotools import commandline_to_json, write_requirements_version, translate_parameters -def train_single_cnn(params): +def train_single_cnn(params, erase_existing=True): """ Trains a single CNN and writes: - logs obtained with Tensorboard during training, @@ -30,7 +30,8 @@ def train_single_cnn(params): main_logger = return_logger(params.verbose, "main process") train_logger = return_logger(params.verbose, "train") eval_logger = return_logger(params.verbose, "final evaluation") - check_and_clean(params.output_dir) + if erase_existing: + check_and_clean(params.output_dir) commandline_to_json(params, logger=main_logger) write_requirements_version(params.output_dir) From fffafa63e71b146d04d554776a456162fa1f6802 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Wed, 17 Mar 2021 17:57:07 +0100 Subject: [PATCH 06/37] Add doc on random-search analysis --- docs/RandomSearch.md | 70 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/docs/RandomSearch.md b/docs/RandomSearch.md index a1aafd1b8..eee134baa 100644 --- a/docs/RandomSearch.md +++ b/docs/RandomSearch.md @@ -1,4 +1,6 @@ -# `clinicadl random-search` - Train random models sampled from a defined hyperparameter space +# `clinicadl random-search` - Launch and analyse random search results + +## `clinicadl random-search generate`- 
Train random models sampled from a defined hyperparameter space This functionality trains a random model with hyperparameters sampled from a predefined space. The hyperparameter space is defined in a `random_search.json` file that must be manually filled @@ -21,18 +23,18 @@ the options of the [train function](./Train/Introduction.md) except for the that are defined with the commandline arguments. Some variables were also added to sample the architecture of the network. -## Prerequisites +### Prerequisites You need to execute the [`clinicadl tsvtool getlabels`](TSVTools.md#getlabels---extract-labels-specific-to-alzheimers-disease) and [`clinicadl tsvtool {split|kfold}`](TSVTools.md#split---single-split-observing-similar-age-and-sex-distributions) commands prior to running this task to have the correct TSV file organization. Moreover, there should be a CAPS, obtained running the `t1-linear` pipeline of ClinicaDL. -## Running the task +### Running the task This task can be run with the following command line: ```Text -clinicadl random-search +clinicadl random-search generate ``` where: @@ -52,7 +54,7 @@ Optional arguments: - `--n_splits` (int) is a number of splits k to load in the case of a k-fold cross-validation. Default will load a single-split. - `--split` (list of int) is a subset of folds that will be used for training. By default all splits available are used. -## Content of `random_search.json` +### Content of `random_search.json` `random_search.json` must be present in `launch_dir` before running the command. An example of this file can be found in @@ -169,7 +171,7 @@ Mode-dependent variables: !!! note "Sampling different modes" The mode-dependent variables are used only if the corresponding mode is sampled. 
-## Outputs +### Outputs Results are stored in the results folder given by `launch_dir`, according to the following file system: @@ -179,12 +181,12 @@ the following file system: └── ``` -## Example of setting +### Example of setting In the following we give an example of a `random_search.json` file and two possible sets of options that can be sampled from it. -### `random_search.json` +#### `random_search.json` ``` {"mode": ["patch", "image"], @@ -211,7 +213,7 @@ two possible sets of options that can be sampled from it. "n_fcblocks": [1, 3]} ``` -### Options #1 +#### Options #1 ``` {"mode": "image", @@ -269,7 +271,7 @@ The scheme of the corresponding architecture is the following: ![Illustration of the CNN corresponding to options #1](images/random1.png) -### Options #2 +#### Options #2 ``` {"mode": "patch", @@ -320,3 +322,51 @@ number of layers for each convolutional block, described in the `conv` dictionna The scheme of the corresponding architecture is the following: ![Illustration of the CNN corresponding to options #2](images/random2.png) + + +## `clinicadl random-search analysis` - Find best performing jobs + +This tool allows to parse all the jobs trained with ClinicaDL and +produces a TSV files that indicates how many jobs have a validation balanced accuracy +higher than a threshold (from 0.50 to 0.95). + +### Prerequisites + +A random search must have run in `launch_directory`, so +`clinicadl random-search generate ` must have been executed at least one time. + +### Running the task + +This task can be run with the following command line: +```Text +clinicadl random-search analysis + +``` +where `launch_directory` (str) is the parent directory of output folder containing the file `random_search.json`. + +The list of the folds that can be included in the analysis can be specified in `splits` option. +If nothing is specified only the first split is included for all jobs. 
+ +### Outputs + +Two TSV files are produced in `launch_directory`: +- `analysis_balanced_accuracy.tsv` which gives the results of the best models according to validation balanced accuracy, +- `analysis_loss.tsv` which gives the results of the best models according to validation loss. + +The content of these TSV files is as follows: + +``` + run >0.5 >0.55 ... >0.85 >0.9 >0.95 folds +job-0 1 1 1 0 0 0 4 +job-1 1 1 1 0 0 0 3 +... +job-10 1 1 1 1 0 0 3 +total 9 8 8 2 1 0 32 +``` + +where: +- the column `run` indicates if the job has run or not (it can crash at the beginning because the +architecture chosen is too large for the GPU). +- the columns `>XX` indicate if the job has a validation balanced accuracy higher than `XX`. +- the column `folds` indicates how many folds were found for this job, +- the last row `total` is the sum of all the previous rows. From ce4313b144861cf52e0b1e26ccd944d38ba91bfa Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Wed, 17 Mar 2021 17:59:01 +0100 Subject: [PATCH 07/37] detach retrain from random-search --- clinicadl/clinicadl/cli.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index da71ae0d9..b41f0ee77 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -104,7 +104,6 @@ def generate_data_func(args): def rs_func(args): from .train.random_search import launch_search - from .train.train_from_model import retrain from .classify.random_search_analysis import random_search_analysis if args.random_task == "generate": @@ -114,12 +113,16 @@ def rs_func(args): args.launch_dir, args.splits ) - elif args.random_task == "retrain": - retrain(args) else: raise ValueError('This task was not implemented in random-search.') +def retrain_func(args): + from .train.train_from_model import retrain + + retrain(args) + + def resume_func(args): from .resume.automatic_resume import automatic_resume @@ -678,14 +681,14 @@
def preprocessing_help(args): rs_analysis_parser.set_defaults(func=rs_func) retrain_parent_parser = return_train_parent_parser(retrain=True) - rs_retrain_parser = rs_subparsers.add_parser( + retrain_parser = subparser.add_parser( 'retrain', parents=[parent_parser, retrain_parent_parser], help='Train a network previously created by generate.', formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - rs_retrain_parser.set_defaults(func=rs_func) + retrain_parser.set_defaults(func=retrain_func) resume_parser = subparser.add_parser( 'resume', From e502673421584561eb2b6a53ed3f90406b006fd0 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Thu, 18 Mar 2021 18:02:41 +0100 Subject: [PATCH 08/37] Add resume file --- docs/Resume.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/Resume.md diff --git a/docs/Resume.md b/docs/Resume.md new file mode 100644 index 000000000..e69de29bb From 860923bc63affd500578f1daf0e40aef7b1cccf4 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Fri, 19 Mar 2021 14:42:50 +0100 Subject: [PATCH 09/37] Add retrain + resume info --- clinicadl/clinicadl/cli.py | 81 +++++++++++++------ clinicadl/clinicadl/main.py | 5 +- .../clinicadl/resume/automatic_resume.py | 15 ++-- .../clinicadl/tools/deep_learning/data.py | 2 +- .../clinicadl/tools/deep_learning/iotools.py | 5 +- clinicadl/clinicadl/train/train_from_model.py | 36 +-------- clinicadl/tests/test_random_search.py | 26 ++++-- docs/Classify.md | 5 +- docs/Resume.md | 73 +++++++++++++++++ docs/Retrain.md | 42 ++++++++++ 10 files changed, 215 insertions(+), 75 deletions(-) create mode 100644 docs/Retrain.md diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index b41f0ee77..0731a4003 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -120,15 +120,35 @@ def rs_func(args): def retrain_func(args): from .train.train_from_model import retrain + if args.model_path == args.output_dir: + raise ValueError(f"The output directory 
path {args.output_dir} cannot be the same " + f"than the path to the reference model {args.model_path}.") + + if args.use_cpu and args.use_gpu: + raise ValueError("The flags --use_cpu and --use_gpu cannot be specified at the same time.") + elif args.use_gpu: + args.use_cpu = False + else: + args.use_cpu = None + retrain(args) def resume_func(args): from .resume.automatic_resume import automatic_resume + if args.use_cpu and args.use_gpu: + raise ValueError("The flags --use_cpu and --use_gpu cannot be specified at the same time.") + elif args.use_cpu: + gpu = False + elif args.use_gpu: + gpu = True + else: + gpu = None + automatic_resume( model_path=args.model_path, - gpu=not args.use_cpu, + gpu=gpu, batch_size=args.batch_size, num_workers=args.nproc, evaluation_steps=args.evaluation_steps, @@ -655,7 +675,7 @@ def preprocessing_help(args): rs_comp_group.add_argument( '--evaluation_steps', '-esteps', default=0, type=int, - help='Fix the number of iterations to perform before computing an evaluations. Default will only ' + help='Fix the number of iterations to perform before computing an evaluation. Default will only ' 'perform one evaluation at the end of each epoch.') rs_generate_parser.set_defaults(func=rs_func) @@ -693,8 +713,7 @@ def preprocessing_help(args): resume_parser = subparser.add_parser( 'resume', parents=[parent_parser], - help='Resume all jobs prematurely ended in launch_dir.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter + help='Resume all jobs prematurely ended in launch_dir.' ) resume_parser.add_argument( @@ -702,24 +721,34 @@ def preprocessing_help(args): type=str, help="Directory containing the random_search.json file." ) - resume_parser.add_argument( + + resume_comp_group = resume_parser.add_argument_group( + TRAIN_CATEGORIES["COMPUTATIONAL"] + ) + resume_comp_group.add_argument( "-np", "--nproc", - help='Number of cores used the quality check. (default=2)', - type=int, default=2 + help='Number of cores used the quality check. 
' + 'Default will reuse the same value than in training.', + type=int, default=None ) - resume_parser.add_argument( - '-cpu', '--use_cpu', action='store_true', - help='If provided, will use CPU instead of GPU.', + resume_comp_group.add_argument( + '-cpu', '--use_cpu', action='store_true', default=False, + help='Override the previous command line to use CPU.', ) - resume_parser.add_argument( + resume_comp_group.add_argument( + '-gpu', '--use_gpu', action='store_true', default=False, + help='Override the previous command line to use GPU.', + ) + resume_comp_group.add_argument( '--batch_size', - default=2, type=int, - help='Batch size for data loading. (default=2)') - resume_parser.add_argument( + default=None, type=int, + help='Batch size for data loading. ' + 'Default will reuse the same value than in training.') + resume_comp_group.add_argument( '--evaluation_steps', '-esteps', - default=0, type=int, - help='Fix the number of iterations to perform before computing an evaluations. Default will only ' - 'perform one evaluation at the end of each epoch.' + default=None, type=int, + help='Fix the number of iterations to perform before computing an evaluation. ' + 'Default will reuse the same value than in training.' ) resume_parser.set_defaults(func=resume_func) @@ -1465,23 +1494,29 @@ def return_train_parent_parser(retrain=False): train_comput_group.add_argument( '-cpu', '--use_cpu', action='store_true', help='If provided, will use CPU instead of GPU.', - default=False) + default=None if retrain else False) + if retrain: + train_comput_group.add_argument( + '-gpu', '--use_gpu', action='store_true', + help='If provided, will use GPU instead of CPU.', + default=None if retrain else False + ) train_comput_group.add_argument( '-np', '--nproc', help='Number of cores used during the training. 
(default=2)', - type=int, default=2) + type=int, default=None if retrain else 2) train_comput_group.add_argument( '--batch_size', - default=2, type=int, + default=None if retrain else 2, type=int, help='Batch size for training. (default=2)') train_comput_group.add_argument( '--evaluation_steps', '-esteps', - default=0, type=int, - help='Fix the number of iterations to perform before computing an evaluations. Default will only ' + default=None if retrain else 0, type=int, + help='Fix the number of iterations to perform before computing an evaluation. Default will only ' 'perform one evaluation at the end of each epoch.') train_comput_group.add_argument( '--merged_tsv_path', - default=None, type=str, + default=None if retrain else "", type=str, help="Path to the output of clinica iotools merged-tsv (concatenation for multi-cohort). " ) diff --git a/clinicadl/clinicadl/main.py b/clinicadl/clinicadl/main.py index 94e6ead23..cf40bbcb1 100644 --- a/clinicadl/clinicadl/main.py +++ b/clinicadl/clinicadl/main.py @@ -14,8 +14,11 @@ def main(): print(f"ClinicaDL version is: {clinicadl.__version__}") exit(0) if hasattr(args, 'use_cpu'): - if not args.use_cpu and not torch.cuda.is_available(): + if args.use_cpu is not None and not args.use_cpu and not torch.cuda.is_available(): raise ValueError("No GPU is available. Please add the -cpu flag to run on CPU.") + if hasattr(args, 'use_gpu'): + if args.use_gpu and torch.cuda.is_available(): + raise ValueError("No GPU is available. 
Please disable -gpu flag to run on CPU.") if not args.task: parser.print_help() diff --git a/clinicadl/clinicadl/resume/automatic_resume.py b/clinicadl/clinicadl/resume/automatic_resume.py index 42ee4ff5a..410f4e94a 100644 --- a/clinicadl/clinicadl/resume/automatic_resume.py +++ b/clinicadl/clinicadl/resume/automatic_resume.py @@ -8,6 +8,11 @@ from os import path +def replace_arg(options, key_name, value): + if value is not None: + setattr(options, key_name, value) + + def automatic_resume(model_path, gpu, batch_size, @@ -28,13 +33,13 @@ def automatic_resume(model_path, options.model_path = model_path logger.info(f"Job being resumed: {model_path}") - options = read_json(options) + options = read_json(options, read_computational=True) # Set computational parameters - options.gpu = gpu - options.batch_size = batch_size - options.num_workers = num_workers - options.evaluation_steps = evaluation_steps + replace_arg(options, "gpu", gpu) + replace_arg(options, "batch_size", batch_size) + replace_arg(options, "num_workers", num_workers) + replace_arg(options, "evaluation_steps", evaluation_steps) # Set verbose options.verbose = verbose diff --git a/clinicadl/clinicadl/tools/deep_learning/data.py b/clinicadl/clinicadl/tools/deep_learning/data.py index 43c58e519..6a8918c96 100644 --- a/clinicadl/clinicadl/tools/deep_learning/data.py +++ b/clinicadl/clinicadl/tools/deep_learning/data.py @@ -772,7 +772,7 @@ def return_dataset(mode, input_dir, data_df, preprocessing, if cnn_index is not None and mode in ["image"]: raise ValueError("Multi-CNN is not implemented for %s mode." 
% mode) - if params.merged_tsv_path is not None: + if params.merged_tsv_path is not "": merged_df = pd.read_csv(params.merged_tsv_path, sep="\t") else: merged_df = None diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index 2f78c3505..1d3f5d649 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -149,7 +149,7 @@ def commandline_to_json(commandline, logger=None, filename="commandline.json"): f.close() -def read_json(options, json_path=None, test=False): +def read_json(options, json_path=None, test=False, read_computational=False): """ Read a json file to update python argparse Namespace. Ensures retro-compatibility with previous namings in clinicadl. @@ -158,6 +158,7 @@ def read_json(options, json_path=None, test=False): options: (argparse.Namespace) options of the model. json_path: (str) If given path to the json file, else found with options.model_path. test: (bool) If given the reader will ignore some options specific to data. + read_computational: (bool) if set to True, the computational arguments are also read. 
Returns: options (args.Namespace) options of the model updated """ @@ -174,7 +175,7 @@ def read_json(options, json_path=None, test=False): for key, item in json_data.items(): # We do not change computational options - if key in computational_list: + if key in computational_list and not read_computational: pass # If used for evaluation, some parameters were already given if test and key in evaluation_parameters: diff --git a/clinicadl/clinicadl/train/train_from_model.py b/clinicadl/clinicadl/train/train_from_model.py index 5714da68b..7edb1c265 100644 --- a/clinicadl/clinicadl/train/train_from_model.py +++ b/clinicadl/clinicadl/train/train_from_model.py @@ -38,41 +38,7 @@ def set_options(options, new_options): def retrain(new_options): options = deepcopy(new_options) - delattr(options, 'batch_size') - options = read_json(options) - - # Adapt batch size with accumulation steps to match previous one - if new_options.batch_size < options.batch_size: - ratio = options.batch_size / new_options.batch_size - if not ratio.is_integer(): - warnings.warn("The new batch size %i value is not a divisor of the previous one %i." - "The batch size for the training will be %i." % - (new_options.batch_size * int(options.accumulation_steps), - options.batch_size * int(options.accumulation_steps), - new_options.batch_size * int(ratio) * int(options.accumulation_steps))) - options.accumulation_steps *= int(ratio) - options.batch_size = new_options.batch_size - - elif new_options.batch_size > options.batch_size: - if new_options.batch_size < options.batch_size * options.accumulation_steps: - ratio = options.batch_size * options.accumulation_steps // new_options.batch_size - warnings.warn("The previous batch size value was %i. " - "The new batch size value is %i." 
% - (options.batch_size * int(options.accumulation_steps), - new_options.batch_size * int(ratio))) - options.accumulation_steps = ratio - options.batch_size = new_options.batch_size - - else: - warnings.warn("The previous batch size value was %i. " - "The new batch size value is %i." % - (options.batch_size * int(options.accumulation_steps), - new_options.batch_size)) - options.accumulation_steps = 1 - options.batch_size = new_options.batch_size - - # Update evaluation steps to match new accumulation steps value - options.evaluation_steps = find_evaluation_steps(options.accumulation_steps, options.evaluation_steps) + options = read_json(options, read_computational=True) # Default behaviour reuse the same dataset as before options = set_options(options, new_options) diff --git a/clinicadl/tests/test_random_search.py b/clinicadl/tests/test_random_search.py index cb2086d5c..605315487 100644 --- a/clinicadl/tests/test_random_search.py +++ b/clinicadl/tests/test_random_search.py @@ -31,24 +31,30 @@ def cli_commands(request): 'first_conv_width': [1, 3], 'n_fcblocks': [1, 2] } - test_input = [ + generate_input = [ 'random-search', + 'generate', launch_dir, name_dir, '--n_splits', '2', '--split', '0', - '-cpu' + ] + log_input = [ + 'random-search', + 'analysis', + launch_dir, + '--n_splits', '2' ] else: raise NotImplementedError( "Test %s is not implemented." 
% request.param) - return arg_dict, test_input + return arg_dict, generate_input, log_input def test_random_search(cli_commands): - arg_dict, test_input = cli_commands + arg_dict, generate_input, log_input = cli_commands # Write random_search.json file os.makedirs(launch_dir, exist_ok=True) @@ -57,9 +63,17 @@ def test_random_search(cli_commands): f.write(json_file) f.close() - flag_error = not os.system("clinicadl " + " ".join(test_input)) + flag_error_generate = not os.system("clinicadl " + " ".join(generate_input)) performances_flag = os.path.exists( os.path.join(launch_dir, name_dir, "fold-0", "cnn_classification")) - assert flag_error + flag_error_log = not os.system("clinicadl " + " ".join(log_input)) + analysis_flag = True + for metric in ["loss", "balanced_accuracy"]: + analysis_flag = analysis_flag and os.path.exists( + os.path.join(launch_dir, f"analysis_{metric}.tsv") + ) + assert flag_error_generate assert performances_flag + assert flag_error_log + assert analysis_flag shutil.rmtree(launch_dir) diff --git a/docs/Classify.md b/docs/Classify.md index 62c65aeee..9e9fb535f 100644 --- a/docs/Classify.md +++ b/docs/Classify.md @@ -1,7 +1,8 @@ # `clinicadl classify` - Inference using pretrained models This functionality performs image classification using models trained with -[`clinicadl train`](./Train/Introduction.md) task. It can also use pretrained +[`clinicadl train`](./Train/Introduction.md) of [`clinicadl random-search generate`](./RandomSearch.md) +tasks. It can also use pretrained models if their folder structure is similar to the structure created by the command `clinicadl train`. At the top level of each model folder there are two files: @@ -25,7 +26,7 @@ performance on the validation set according to one or several metrics (.pth.tar In order to execute this task, the input images must be listed in a `tsv_file` formatted using the CAPS definition. 
Please check which preprocessing needs to -be performed in the `commandline.json` file in the results folder. If it has +be performed in the `commandline.json` file in the results folder. If it has not been performed, execute the preprocessing pipeline as well as `clinicadl extract` to obtain the tensor versions of the images. diff --git a/docs/Resume.md b/docs/Resume.md index e69de29bb..f03ec4165 100644 --- a/docs/Resume.md +++ b/docs/Resume.md @@ -0,0 +1,73 @@ +# `clinicadl resume` - Resume a prematurely stopped job + +This functionality allows to resume a prematurely stopped job trained with +[`clinicadl train`](./Train/Introduction.md) of [`clinicadl random-search generate`](./RandomSearch.md) tasks. +It can also use pretrained +models if their folder structure is similar to the structure created by the +command `clinicadl train`. The files that are used by this function are the following: + +- `commandline.json` describes the training parameters used to create the + model, +- `checkpoint.pth.tar` contains the last version of the weights of the network, +- `optimizer.pth.tar` contains the last version of the parameters of the optimizer, +- `training.tsv` contains the successive values of the metrics during training. + +These files are organized in `model_path` as follows: + +``` + +├── commandline.json +└── fold- + ├── models + │   ├── best_balanced_accuracy + │   │   └── model_best.pth.tar + │   ├── best_loss + │   │ └── model_best.pth.tar + │   ├── checkpoint.pth.tar + │   └── optimizer.pth.tar + ├── tensorboard_logs + │   ├── train + │   │   └── events.out.tfevents.1616090758.r7i7n7 + │   └── validation + │   └── events.out.tfevents.1616090758.r7i7n7 + └── training.tsv +``` + +You should also ensure that the data at `tsv_path` and `caps_dir` in `commandline.json` +is still present and correspond to the ones used during training. 
+ +## Prerequisites + +Please check which preprocessing needs to +be performed in the `commandline.json` file in the results folder. If it has +not been performed, execute the preprocessing pipeline as well as `clinicadl +extract` to obtain the tensor versions of the images. + +## Running the task +This task can be run with the following command line: +```Text +clinicadl resume + +``` +where `model_path` (str) is a path to the folder where the model and the json file +are stored. + +By default the arguments corresponding to computational resources will be the same +as the ones defined in `commandline.json`. However it is possible to change them +by using the following options: + +- `--nproc` (int) changes the number of workers used by the DataLoader. +- `--use_cpu` (bool) forces to use CPU. +- `--use_gpu` (bool) forces to use GPU. +- `--batch_size` (int) changes the size of the batch used in the DataLoader. +- `--evaluation_steps` (int) changes the number of iterations to perform before +computing an evaluation. + +## Outputs + +The outputs correspond to the ones obtained using [`clinicadl train`](./Train/Introduction.md#outputs) + +!!! note + The files `checkpoint.pth.tar` and `optimizer.pth.tar` are automatically removed as soon + as the [stopping criterion](./Train/Details.md#stopping-criterion) is reached and the + performances of the models are evaluated on the training and validation datasets. diff --git a/docs/Retrain.md b/docs/Retrain.md new file mode 100644 index 000000000..73b3cb1e7 --- /dev/null +++ b/docs/Retrain.md @@ -0,0 +1,42 @@ +# `clinicadl retrain` - Launch a new network training based on a previously trained network + +This command allows to train a network defined in a `commandline.json` generated by +[`clinicadl train`](./Train/Introduction.md) or +[`clinicadl random-search generate`](RandomSearch.md#clinicadl-random-search-generate--train-random-models-sampled-from-a-defined-hyperparameter-space).
+ +This is particularly useful for outputs of random search, as the random architecture +produced may not be straightforward to implement. + +!!! note "Training from json file" + You don't actually need to have the complete folder of the network used as a reference. + The only file used in this function is `commandline.json` so you can also manually create this file + and store it in `model_path`. + +## Prerequisites + +Please check which preprocessing needs to +be performed in the `commandline.json` file in the results folder. If it has +not been performed, execute the preprocessing pipeline as well as `clinicadl +extract` to obtain the tensor versions of the images. + +## Running the task +This task can be run with the following command line: +```Text +clinicadl retrain + +``` +where +- `model_path` (str) is a path to the folder where the reference model and the json file +are stored. +- `output_dir` (str) is a path to the folder of the new model that will be trained. + +By default all the arguments will correspond to the ones used to train the reference model. +However you can change any of these by setting a new value to the optional parameters. + +!!! note "Optional parameters" + For more information on the optional parameters, please check [`clinicadl train` + description](./Train/Introduction.md#running-the-task) + +## Outputs + +The outputs correspond to the ones obtained using [`clinicadl train`](./Train/Introduction.md#outputs).
From e1637151df1c5d44df388771ff35194c42517339 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Fri, 19 Mar 2021 14:52:53 +0100 Subject: [PATCH 10/37] Add doc on atlas intensity prediction --- clinicadl/clinicadl/cli.py | 10 +++++----- docs/Train/Introduction.md | 18 ++++++++++++++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index 0731a4003..cfeb5eafa 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -1514,11 +1514,6 @@ def return_train_parent_parser(retrain=False): default=None if retrain else 0, type=int, help='Fix the number of iterations to perform before computing an evaluation. Default will only ' 'perform one evaluation at the end of each epoch.') - train_comput_group.add_argument( - '--merged_tsv_path', - default=None if retrain else "", type=str, - help="Path to the output of clinica iotools merged-tsv (concatenation for multi-cohort). " - ) train_data_group = train_parent_parser.add_argument_group( TRAIN_CATEGORIES["DATA"]) @@ -1571,6 +1566,11 @@ def return_train_parent_parser(retrain=False): help="Weight to put on the MSE loss used to compute the error on atlas intensities.", default=1, type=float, ) + train_data_group.add_argument( + '--merged_tsv_path', + default=None if retrain else "", type=str, + help="Path to the output of clinica iotools merged-tsv (concatenation for multi-cohort). 
" + ) train_cv_group = train_parent_parser.add_argument_group( TRAIN_CATEGORIES["CROSS-VALIDATION"]) diff --git a/docs/Train/Introduction.md b/docs/Train/Introduction.md index 671034e32..b02743eda 100644 --- a/docs/Train/Introduction.md +++ b/docs/Train/Introduction.md @@ -9,7 +9,13 @@ It mainly relies on the PyTorch deep learning library You need to execute the [`clinicadl tsvtool getlabels`](../TSVTools.md#getlabels---extract-labels-specific-to-alzheimers-disease) and [`clinicadl tsvtool {split|kfold}`](../TSVTools.md#split---single-split-observing-similar-age-and-sex-distributions) commands prior to running this task to have the correct TSV file organization. -Moreover, there should be a CAPS, obtained running the `t1-linear` pipeline of ClinicaDL. +Moreover, there should be a CAPS, obtained running the preprocessing pipeline wanted. + +!!! note "Parcellation prediction" + It is now possible to predict the intensities in different regions of a neuronatomical atlas + with the binary classification task. In this case the outputs of the preprocessing pipeline + `clinica run t1-volume` must exist in the CAPS. + ## Running the task The training task can be run with the following command line: @@ -25,7 +31,6 @@ The options depend on the type of input used, but at most it can be chosen betwe - `caps_directory` (str) is the input folder containing the neuroimaging data in a [CAPS](https://aramislab.paris.inria.fr/clinica/docs/public/latest/CAPS/Introduction/) hierarchy. In case of [multi-cohort training](Details.md#multi-cohort), must be a path to a TSV file. - `preprocessing` (str) corresponds to the preprocessing pipeline whose outputs will be used for training. -The current version only supports `t1-linear`, but `t1-extensive` will be implemented in next versions of `clinicadl`. - `tsv_path` (str) is the input folder of a TSV file tree generated by `clinicadl tsvtool {split|kfold}`. 
In case of [multi-cohort training](Details.md#multi-cohort), must be a path to a TSV file. - `output_directory` (str) is the folder where the results are stored. @@ -51,6 +56,15 @@ Options shared for all values of `mode` are organized in groups: `weighted` will give a stronger weight to underrepresented classes. Default: `random`. - `--multi_cohort` (bool) is a flag indicated that [multi-cohort training](Details.md#multi-cohort) is performed. In this case, `caps_directory` and `tsv_path` must be paths to TSV files. + - `--predict_atlas_intensities` (str) corresponds to a neuroanatomical atlas. + If given the network learns to compute the grey matter intensity + according to this atlas. The number of output nodes will be 2 + number of regions in the atlas. + Must be chosen in the following list: AAL2, AICHA, Hammers, LPBA40, "Neuromorphometrics. + - `--atlas_weight` (float) is the weight put on the MSE loss computed to learn the grey matter intensities + when added to the cross-entropy loss. Default: `1`. + - `--merged_tsv_path` (str) is the path to the output of `clinica iotools merged-tsv` (or the concatenation + of the outputs in case of multi-cohort framework). Giving this file can accelerate the computation when + computing the grey matter intensities. - **Cross-validation arguments** - `--n_splits` (int) is a number of splits k to load in the case of a k-fold cross-validation. Default will load a single-split. - `--split` (list of int) is a subset of folds that will be used for training. By default all splits available are used. 
From 9fb96ec5ba20676608b9aae25458946a7dc81564 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Fri, 19 Mar 2021 15:52:15 +0100 Subject: [PATCH 11/37] Fix style --- clinicadl/clinicadl/classify/random_search_analysis.py | 4 +--- clinicadl/clinicadl/resume/resume_autoencoder.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/clinicadl/clinicadl/classify/random_search_analysis.py b/clinicadl/clinicadl/classify/random_search_analysis.py index d9abbd76a..b1f0ea107 100644 --- a/clinicadl/clinicadl/classify/random_search_analysis.py +++ b/clinicadl/clinicadl/classify/random_search_analysis.py @@ -16,8 +16,6 @@ def random_search_analysis(launch_dir, splits): jobs_list = [job for job in os.listdir(launch_dir) if path.exists(path.join(launch_dir, job, "commandline.json"))] - print(jobs_list) - for selection in ['balanced_accuracy', 'loss']: columns = ['run', '>0.5', '>0.55', '>0.6', '>0.65', '>0.7', '>0.75', '>0.8', '>0.85', '>0.9', '>0.95', 'folds'] @@ -50,4 +48,4 @@ def random_search_analysis(launch_dir, splits): output_df = pd.concat([output_df, total_df]) output_df.sort_index(inplace=True) - output_df.to_csv(path.join(launch_dir, "analysis_" + selection + '.tsv'), sep='\t') \ No newline at end of file + output_df.to_csv(path.join(launch_dir, "analysis_" + selection + '.tsv'), sep='\t') diff --git a/clinicadl/clinicadl/resume/resume_autoencoder.py b/clinicadl/clinicadl/resume/resume_autoencoder.py index 806e44885..46bf52bef 100644 --- a/clinicadl/clinicadl/resume/resume_autoencoder.py +++ b/clinicadl/clinicadl/resume/resume_autoencoder.py @@ -94,4 +94,3 @@ def resume_autoencoder(params, resumed_split): nb_images=nb_images) del decoder torch.cuda.empty_cache() - From db3d35a9ef3cb20681adc5e87bba4c527f93587e Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Fri, 19 Mar 2021 15:52:23 +0100 Subject: [PATCH 12/37] Fix tests --- clinicadl/clinicadl/interpret/group_backprop.py | 2 ++ clinicadl/clinicadl/interpret/individual_backprop.py | 2 ++ 
clinicadl/clinicadl/tools/deep_learning/iotools.py | 5 +++-- clinicadl/tests/test_random_search.py | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/clinicadl/clinicadl/interpret/group_backprop.py b/clinicadl/clinicadl/interpret/group_backprop.py index 2904c4f07..46fcb9091 100644 --- a/clinicadl/clinicadl/interpret/group_backprop.py +++ b/clinicadl/clinicadl/interpret/group_backprop.py @@ -39,6 +39,8 @@ def group_backprop(options): options.input_dir = model_options.input_dir if options.target_diagnosis is None: options.target_diagnosis = options.diagnosis + options.merged_tsv_path = model_options.merged_tsv_path + options.predict_atlas_intensities = model_options.predict_atlas_intensities for fold in fold_list: main_logger.info(fold) diff --git a/clinicadl/clinicadl/interpret/individual_backprop.py b/clinicadl/clinicadl/interpret/individual_backprop.py index 1e3d35369..a97c14fb7 100644 --- a/clinicadl/clinicadl/interpret/individual_backprop.py +++ b/clinicadl/clinicadl/interpret/individual_backprop.py @@ -39,6 +39,8 @@ def individual_backprop(options): options.input_dir = model_options.input_dir if options.target_diagnosis is None: options.target_diagnosis = options.diagnosis + options.merged_tsv_path = model_options.merged_tsv_path + options.predict_atlas_intensities = model_options.predict_atlas_intensities for fold in fold_list: main_logger.info(fold) diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index 1d3f5d649..6aad23b75 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -62,8 +62,9 @@ def translate_parameters(args): args.num_workers = args.nproc args.optimizer = "Adam" args.loss = "default" - args.atlas = args.predict_atlas_intensities + if hasattr(args, "predict_atlas_intensities"): + args.atlas = args.predict_atlas_intensities if hasattr(args, "caps_dir"): args.input_dir = args.caps_dir if 
hasattr(args, "unnormalize"): @@ -258,7 +259,7 @@ def read_json(options, json_path=None, test=False, read_computational=False): options.predict_atlas_intensities = None if not hasattr(options, "merged_tsv_path"): - options.merged_tsv_path = None + options.merged_tsv_path = "" if not hasattr(options, "atlas_weight"): options.atlas_weight = 1 diff --git a/clinicadl/tests/test_random_search.py b/clinicadl/tests/test_random_search.py index 605315487..2f4174ce9 100644 --- a/clinicadl/tests/test_random_search.py +++ b/clinicadl/tests/test_random_search.py @@ -43,7 +43,7 @@ def cli_commands(request): 'random-search', 'analysis', launch_dir, - '--n_splits', '2' + '--splits', '2' ] else: raise NotImplementedError( From eb85bc806c32e0f3e3e582d52195618c6329ffd6 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Tue, 23 Mar 2021 10:45:22 +0100 Subject: [PATCH 13/37] Set default value for n_splits to 0 instead of None --- clinicadl/clinicadl/cli.py | 10 ++++++---- clinicadl/clinicadl/tools/deep_learning/data.py | 6 +++--- clinicadl/clinicadl/tools/deep_learning/iotools.py | 3 +++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index cfeb5eafa..b9e3b14cd 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -652,8 +652,9 @@ def preprocessing_help(args): rs_data_group = rs_generate_parser.add_argument_group( TRAIN_CATEGORIES["CROSS-VALIDATION"] ) - rs_data_group.add_argument("--n_splits", type=int, default=None, - help="If a value is given will load data of a k-fold CV") + rs_data_group.add_argument("--n_splits", type=int, default=0, + help="If a value is given for k will load data of a k-fold CV. 
" + "Default value (0) will load a single split.") rs_data_group.add_argument("--split", type=int, default=None, nargs="+", help="Will load the specific split wanted.") @@ -1576,8 +1577,9 @@ def return_train_parent_parser(retrain=False): TRAIN_CATEGORIES["CROSS-VALIDATION"]) train_cv_group.add_argument( '--n_splits', - help='If a value is given will load data of a k-fold CV. Else will load a single split.', - type=int, default=None) + help='If a value is given for k will load data of a k-fold CV. ' + 'Default value (0) will load a single split.', + type=int, default=None if retrain else 0) train_cv_group.add_argument( '--split', help='Train the list of given folds. By default train all folds.', diff --git a/clinicadl/clinicadl/tools/deep_learning/data.py b/clinicadl/clinicadl/tools/deep_learning/data.py index 6a8918c96..c9a8c4312 100644 --- a/clinicadl/clinicadl/tools/deep_learning/data.py +++ b/clinicadl/clinicadl/tools/deep_learning/data.py @@ -984,7 +984,7 @@ def get_transforms(mode, minmaxnormalization=True, data_augmentation=None): ################################ def load_data(tsv_path, diagnoses_list, - split, n_splits=None, baseline=True, + split, n_splits=0, baseline=True, logger=None, multi_cohort=False): if logger is None: @@ -1035,7 +1035,7 @@ def load_data(tsv_path, diagnoses_list, def load_data_single(train_val_path, diagnoses_list, - split, n_splits=None, baseline=True, + split, n_splits=0, baseline=True, logger=None): if logger is None: @@ -1044,7 +1044,7 @@ def load_data_single(train_val_path, diagnoses_list, train_df = pd.DataFrame() valid_df = pd.DataFrame() - if n_splits is None: + if n_splits == 0: train_path = path.join(train_val_path, 'train') valid_path = path.join(train_val_path, 'validation') diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index 6aad23b75..0aba845ed 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ 
b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -264,6 +264,9 @@ def read_json(options, json_path=None, test=False, read_computational=False): if not hasattr(options, "atlas_weight"): options.atlas_weight = 1 + if hasattr(options, "n_splits") and options.n_splits is None: + options.n_splits = 0 + return options From ff86c4e5c45443b2be88f9a768eda21a06f6ef3d Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Thu, 8 Apr 2021 15:54:40 +0200 Subject: [PATCH 14/37] Update CHANGELOG --- CHANGELOG | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index ff675ea33..2623dc485 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -26,6 +26,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Security +## ClinicaDL 0.2.2 + +### Added + +- New functionality `clinicadl random-search analysis` to obtain the histogram of the balanced accuracy +over a random search folder. +- New functionality `clinicadl train from_json` to train a model with parameters defined in a JSON file. +- New functionality `clinicadl resume` to resume a prematurely stopped training task. +- Possibility to learn the grey matter intensities with the binary classification during training, +based on `t1-volume` outputs. + +### Changed + +- Previous `clinicadl random-search` is now `clinicadl random-search generate`. + ## ClinicaDL 0.2.1 ### Added @@ -47,7 +62,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- New functionality `clinica interpret` to generate saliency maps linked +- New functionality `clinicadl interpret` to generate saliency maps linked to pretrained models based on groups of individual images. - New functionality `clinicadl random-search` to sample random networks from a predefined hyperparameter space.
From f5ce6b8953c7e9e92b248eb1438b2aafa453c050 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Thu, 8 Apr 2021 15:54:51 +0200 Subject: [PATCH 15/37] Fix style --- clinicadl/clinicadl/tools/tsv/data_formatting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clinicadl/clinicadl/tools/tsv/data_formatting.py b/clinicadl/clinicadl/tools/tsv/data_formatting.py index d2d92d1d4..944f42300 100644 --- a/clinicadl/clinicadl/tools/tsv/data_formatting.py +++ b/clinicadl/clinicadl/tools/tsv/data_formatting.py @@ -406,7 +406,7 @@ def get_labels(merged_tsv, missing_mods, results_path, # Remove SMC patients if remove_smc: - if "diagnosis_bl" in bids_df.columns.values: # Retro-compatibility + if "diagnosis_bl" in bids_df.columns.values: # Retro-compatibility bids_df = bids_df[~(bids_df.diagnosis_bl == "SMC")] if "diagnosis_sc" in bids_df.columns.values: bids_df = bids_df[~(bids_df.diagnosis_sc == "SMC")] From 3275c1d7a3326b954b8ce1adb2f2a003bfc5bfce Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Tue, 20 Apr 2021 15:05:54 +0200 Subject: [PATCH 16/37] Change retrain in train from_json --- clinicadl/clinicadl/cli.py | 156 ++++++++---------- .../clinicadl/tools/deep_learning/__init__.py | 2 +- .../clinicadl/tools/deep_learning/iotools.py | 62 +++++++ clinicadl/clinicadl/train/random_search.py | 62 +------ clinicadl/clinicadl/train/train_from_json.py | 24 +++ clinicadl/clinicadl/train/train_from_model.py | 51 ------ clinicadl/tests/test_train_from_json.py | 117 +++++++++++++ docs/{ => Train}/Retrain.md | 0 8 files changed, 272 insertions(+), 202 deletions(-) create mode 100644 clinicadl/clinicadl/train/train_from_json.py delete mode 100644 clinicadl/clinicadl/train/train_from_model.py create mode 100644 clinicadl/tests/test_train_from_json.py rename docs/{ => Train}/Retrain.md (100%) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index b9e3b14cd..51d8c5516 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py 
@@ -118,20 +118,12 @@ def rs_func(args): def retrain_func(args): - from .train.train_from_model import retrain + from .train.train_from_json import retrain - if args.model_path == args.output_dir: - raise ValueError(f"The output directory path {args.output_dir} cannot be the same " - f"than the path to the reference model {args.model_path}.") - - if args.use_cpu and args.use_gpu: - raise ValueError("The flags --use_cpu and --use_gpu cannot be specified at the same time.") - elif args.use_gpu: - args.use_cpu = False - else: - args.use_cpu = None - - retrain(args) + retrain( + args.json_path, + args.output_dir + ) def resume_func(args): @@ -291,8 +283,7 @@ def parse_command_line(): subparser = parser.add_subparsers( title='''Task to execute with clinicadl:''', - description='''What kind of task do you want to use with clinicadl? - (tsvtool, preprocessing, generate, train, classify).''', + description='''What kind of task do you want to use with clinicadl?''', dest='task', help='''****** Tasks proposed by clinicadl ******''') @@ -701,16 +692,6 @@ def preprocessing_help(args): rs_analysis_parser.set_defaults(func=rs_func) - retrain_parent_parser = return_train_parent_parser(retrain=True) - retrain_parser = subparser.add_parser( - 'retrain', - parents=[parent_parser, retrain_parent_parser], - help='Train a network previously created by generate.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - retrain_parser.set_defaults(func=retrain_func) - resume_parser = subparser.add_parser( 'resume', parents=[parent_parser], @@ -799,7 +780,7 @@ def preprocessing_help(args): dest='network_type', help='''****** Choose a type of network ******''') - train_parent_parser = return_train_parent_parser(retrain=False) + train_parent_parser = return_train_parent_parser() train_image_ae_parser = train_image_subparser.add_parser( "autoencoder", parents=[ @@ -1015,7 +996,7 @@ def preprocessing_help(args): ######################### train_slice_parser = 
train_subparser.add_parser( "slice", - help="Train a 2D slice-level CNN.") + help="Train a 2D slice-level network.") train_slice_subparser = train_slice_parser.add_subparsers( title='''Task to be performed''', @@ -1096,6 +1077,24 @@ def preprocessing_help(args): train_slice_multicnn_parser.set_defaults(func=train_func) + ######################### + # FROM JSON + ######################### + train_json_parser = train_subparser.add_parser( + "from_json", + parents=[parent_parser], + help="Train a network as defined in a JSON file.") + train_json_group = train_json_parser.add_argument_group( + TRAIN_CATEGORIES["POSITIONAL"]) + train_json_group.add_argument( + "json_path", type=str, + help="Path to the JSON file.") + train_json_group.add_argument( + "output_dir", type=str, + help="Directory in which the new job is stored.") + + train_json_parser.set_defaults(func=retrain_func) + # Classify - Classify a subject or a list of tsv files with the CNN # provided as argument. # classify_parser: get command line arguments and options @@ -1456,100 +1455,77 @@ def preprocessing_help(args): return parser -def return_train_parent_parser(retrain=False): +def return_train_parent_parser(): # Main train parent parser common to train and random search train_parent_parser = argparse.ArgumentParser(add_help=False) train_pos_group = train_parent_parser.add_argument_group( TRAIN_CATEGORIES["POSITIONAL"]) - if retrain: - train_pos_group.add_argument( - "model_path", type=str, - help="Path to the trained model folder.") - train_pos_group.add_argument( - "output_dir", type=str, - help="Directory in which the new job is stored.") - else: - train_pos_group.add_argument( - 'caps_dir', - help='Data using CAPS structure.', - default=None) - train_pos_group.add_argument( - 'preprocessing', - help='Defines the type of preprocessing of CAPS data.', - choices=['t1-linear', 't1-extensive', 't1-volume'], type=str) - train_pos_group.add_argument( - 'tsv_path', - help='TSV path with subjects/sessions to 
process.', - default=None) - train_pos_group.add_argument( - 'output_dir', - help='Folder containing results of the training.', - default=None) - train_pos_group.add_argument( - 'model', - help='CNN Model to be used during the training.', - default='Conv5_FC3') + train_pos_group.add_argument( + 'caps_dir', + help='Data using CAPS structure.', + default=None) + train_pos_group.add_argument( + 'preprocessing', + help='Defines the type of preprocessing of CAPS data.', + choices=['t1-linear', 't1-extensive', 't1-volume'], type=str) + train_pos_group.add_argument( + 'tsv_path', + help='TSV path with subjects/sessions to process.', + default=None) + train_pos_group.add_argument( + 'output_dir', + help='Folder containing results of the training.', + default=None) + train_pos_group.add_argument( + 'model', + help='CNN Model to be used during the training.', + default='Conv5_FC3') train_comput_group = train_parent_parser.add_argument_group( TRAIN_CATEGORIES["COMPUTATIONAL"]) train_comput_group.add_argument( '-cpu', '--use_cpu', action='store_true', help='If provided, will use CPU instead of GPU.', - default=None if retrain else False) - if retrain: - train_comput_group.add_argument( - '-gpu', '--use_gpu', action='store_true', - help='If provided, will use GPU instead of CPU.', - default=None if retrain else False - ) + default=False) train_comput_group.add_argument( '-np', '--nproc', help='Number of cores used during the training. (default=2)', - type=int, default=None if retrain else 2) + type=int, default=2) train_comput_group.add_argument( '--batch_size', - default=None if retrain else 2, type=int, + default=2, type=int, help='Batch size for training. (default=2)') train_comput_group.add_argument( '--evaluation_steps', '-esteps', - default=None if retrain else 0, type=int, + default=0, type=int, help='Fix the number of iterations to perform before computing an evaluation. 
Default will only ' 'perform one evaluation at the end of each epoch.') train_data_group = train_parent_parser.add_argument_group( TRAIN_CATEGORIES["DATA"]) - - if retrain: - train_data_group.add_argument( - "--caps_dir", type=str, default=None, - help="Data using CAPS structure.") - train_data_group.add_argument( - "--tsv_path", type=str, default=None, - help="TSV path with subjects/sessions to process.") - train_data_group.add_argument( "--multi_cohort", help="Performs multi-cohort training. In this case, caps_dir and tsv_path must be paths to TSV files.", action="store_true", - default=None if retrain else False + default=False ) train_data_group.add_argument( '--diagnoses', '-d', help='List of diagnoses that will be selected for training.', - default=None if retrain else ['AD', 'CN'], nargs='+', type=str, + default=['AD', 'CN'], nargs='+', type=str, choices=['AD', 'BV', 'CN', 'MCI', 'sMCI', 'pMCI']) train_data_group.add_argument( '--baseline', help='If provided, only the baseline sessions are used for training.', action="store_true", - default=None if retrain else False) + default=False) train_data_group.add_argument( '--unnormalize', '-un', help='Disable default MinMaxNormalization.', action="store_true", - default=None if retrain else False) + default=False) train_data_group.add_argument( - "--data_augmentation", nargs="+", default=None if retrain else False, + "--data_augmentation", nargs="+", default=False, choices=["None", "Noise", "Erasing", "CropPad", "Smoothing"], help="Randomly applies transforms on the training set.") train_data_group.add_argument( @@ -1569,7 +1545,7 @@ def return_train_parent_parser(retrain=False): ) train_data_group.add_argument( '--merged_tsv_path', - default=None if retrain else "", type=str, + default="", type=str, help="Path to the output of clinica iotools merged-tsv (concatenation for multi-cohort). 
" ) @@ -1579,7 +1555,7 @@ def return_train_parent_parser(retrain=False): '--n_splits', help='If a value is given for k will load data of a k-fold CV. ' 'Default value (0) will load a single split.', - type=int, default=None if retrain else 0) + type=int, default=0) train_cv_group.add_argument( '--split', help='Train the list of given folds. By default train all folds.', @@ -1590,36 +1566,36 @@ def return_train_parent_parser(retrain=False): train_optim_group.add_argument( '--epochs', help='Maximum number of epochs.', - default=None if retrain else 20, type=int) + default=20, type=int) train_optim_group.add_argument( '--learning_rate', '-lr', help='Learning rate of the optimization.', - default=None if retrain else 1e-4, type=float) + default=1e-4, type=float) train_optim_group.add_argument( '--weight_decay', '-wd', help='Weight decay value used in optimization.', - default=None if retrain else 1e-4, type=float) + default=1e-4, type=float) train_optim_group.add_argument( '--dropout', help='rate of dropout that will be applied to dropout layers in CNN.', - default=None if retrain else 0, type=float) + default=0, type=float) train_optim_group.add_argument( '--patience', help='Number of epochs for early stopping patience.', - type=int, default=None if retrain else 0) + type=int, default=0) train_optim_group.add_argument( '--tolerance', help='Value for the early stopping tolerance.', - type=float, default=None if retrain else 0.0) + type=float, default=0.0) train_optim_group.add_argument( '--accumulation_steps', '-asteps', help='Accumulates gradients during the given number of iterations before performing the weight update ' 'in order to virtually increase the size of the batch.', - default=None if retrain else 1, type=int) + default=1, type=int) # train_optim_group.add_argument( # "--loss", # help="Replaces default losses: cross-entropy for CNN and MSE for autoencoders.", - # type=str, default=None if retrain else "default", + # type=str, default="default", # 
choices=["default", "L1", "L1Norm", "SmoothL1", "SmoothL1Norm"]) return train_parent_parser diff --git a/clinicadl/clinicadl/tools/deep_learning/__init__.py b/clinicadl/clinicadl/tools/deep_learning/__init__.py index f572d13bc..4598e8c29 100644 --- a/clinicadl/clinicadl/tools/deep_learning/__init__.py +++ b/clinicadl/clinicadl/tools/deep_learning/__init__.py @@ -1,5 +1,5 @@ from .models import create_autoencoder, create_model, load_model, load_optimizer, save_checkpoint -from .iotools import read_json, commandline_to_json, write_requirements_version +from .iotools import read_json, commandline_to_json, write_requirements_version, check_and_complete class EarlyStopping(object): diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index 0aba845ed..daa11cdc7 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -270,6 +270,68 @@ def read_json(options, json_path=None, test=False, read_computational=False): return options +def check_and_complete(options, random_search=False): + """ + This function initializes missing fields with missing values. + Some fields are mandatory and cannot be initialized by default; this will raise an issue if they are missing. + + Args: + options: (Namespace) the options used for training. + random_search: (bool) If True the options are looking for mandatory values of random-search. 
+ """ + filename = 'random_search.json' + + default_values = { + "accumulation_steps": 1, + "atlas_weight": 1, + "baseline": False, + "channels_limit": 512, + "data_augmentation": False, + "diagnoses": ['AD', 'CN'], + "discarded_slices": 20, + "dropout": 0, + "epochs": 20, + "learning_rate": 4, + "loss": "default", + "merged_tsv_path": None, + "multi_cohort": False, + "n_conv": 1, + "optimizer": "Adam", + "unnormalize": False, + "patch_size": 50, + "patience": 0, + "predict_atlas_intensities": None, + "selection_threshold": 0, + "slice_direction": 0, + "stride_size": 50, + "tolerance": 0.0, + "transfer_learning_path": None, + "transfer_learning_selection": "best_loss", + "use_extracted_patches": False, + "use_extracted_slices": False, + "use_extracted_roi": False, + "wd_bool": True, + "weight_decay": 4, + "sampler": "random" + } + if random_search: + default_values["d_reduction"] = "MaxPooling" + default_values["network_normalization"] = "BatchNorm" + + for name, default_value in default_values.items(): + if not hasattr(options, name): + setattr(options, name, default_value) + + mandatory_arguments = ['network_type', 'mode', + 'tsv_path', 'caps_dir', 'preprocessing'] + if random_search: + mandatory_arguments += ['n_convblocks', 'first_conv_width', 'n_fcblocks'] + + for argument in mandatory_arguments: + if not hasattr(options, argument): + raise ValueError(f"The argument {argument} must be specified in {filename}.") + + def set_default_dropout(args): if args.dropout is None: if args.mode == 'image': diff --git a/clinicadl/clinicadl/train/random_search.py b/clinicadl/clinicadl/train/random_search.py index 68ca7c3a6..0daee2c17 100755 --- a/clinicadl/clinicadl/train/random_search.py +++ b/clinicadl/clinicadl/train/random_search.py @@ -5,76 +5,18 @@ import argparse from os import path -from ..tools.deep_learning import read_json +from ..tools.deep_learning import read_json, check_and_complete from ..tools.deep_learning.models.random import random_sampling from 
.train_multiCNN import train_multi_cnn from .train_singleCNN import train_single_cnn from .train_autoencoder import train_autoencoder -def check_and_complete(rs_options): - """ - This function initializes fields so a random model can be sampled. - Some fields are mandatory and cannot be initialized by default; this will raise an issue if they are missing. - - Args: - rs_options: (Namespace) the random search options - """ - filename = 'random_search.json' - - default_values = { - "accumulation_steps": 1, - "atlas_weight": 1, - "baseline": False, - "channels_limit": 512, - "d_reduction": "MaxPooling", - "data_augmentation": False, - "discarded_slices": 20, - "dropout": 0, - "learning_rate": 4, - "loss": "default", - "merged_tsv_path": None, - "multi_cohort": False, - "n_conv": 1, - "network_normalization": "BatchNorm", - "optimizer": "Adam", - "unnormalize": False, - "patch_size": 50, - "patience": 0, - "predict_atlas_intensities": None, - "roi_list": None, - "selection_threshold": 0, - "slice_direction": 0, - "stride_size": 50, - "tolerance": 0.0, - "transfer_learning_path": None, - "transfer_learning_selection": "best_loss", - "uncropped_roi": False, - "use_extracted_patches": False, - "use_extracted_slices": False, - "use_extracted_roi": False, - "wd_bool": True, - "weight_decay": 4, - "sampler": "random" - } - for name, default_value in default_values.items(): - if not hasattr(rs_options, name): - setattr(rs_options, name, default_value) - - mandatory_arguments = ['epochs', 'network_type', 'mode', - 'tsv_path', 'caps_dir', 'diagnoses', 'preprocessing', - 'n_convblocks', 'first_conv_width', 'n_fcblocks'] - - for argument in mandatory_arguments: - if not hasattr(rs_options, argument): - raise ValueError(f"The argument {argument} must be specified in {filename}.") - - def launch_search(options): rs_options = argparse.Namespace() rs_options = read_json(rs_options, path.join(options.launch_dir, 'random_search.json')) - check_and_complete(rs_options) + 
check_and_complete(rs_options, random_search=True) random_sampling(rs_options, options) options.output_dir = path.join(options.launch_dir, options.name) diff --git a/clinicadl/clinicadl/train/train_from_json.py b/clinicadl/clinicadl/train/train_from_json.py new file mode 100644 index 000000000..1b05080f5 --- /dev/null +++ b/clinicadl/clinicadl/train/train_from_json.py @@ -0,0 +1,24 @@ +""" +Retrain a model defined by a commandline.json file +""" + +import argparse + +from ..tools.deep_learning import read_json, check_and_complete +from .train_autoencoder import train_autoencoder +from .train_singleCNN import train_single_cnn +from .train_multiCNN import train_multi_cnn + + +def retrain(json_path, output_dir): + options = argparse.Namespace() + options = read_json(options, json_path=json_path, read_computational=True) + check_and_complete(options) + options.output_dir = output_dir + + if options.network_type == "autoencoder": + train_autoencoder(options) + elif options.network_type == "cnn": + train_single_cnn(options) + elif options.network_type == "multicnn": + train_multi_cnn(options) diff --git a/clinicadl/clinicadl/train/train_from_model.py b/clinicadl/clinicadl/train/train_from_model.py deleted file mode 100644 index 7edb1c265..000000000 --- a/clinicadl/clinicadl/train/train_from_model.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Retrain a model defined by a commandline.json file -""" - -from copy import deepcopy -import warnings - -from ..tools.deep_learning.iotools import read_json -from ..tools.deep_learning.models.random import find_evaluation_steps -from .train_autoencoder import train_autoencoder -from .train_singleCNN import train_single_cnn -from .train_multiCNN import train_multi_cnn - - -def set_options(options, new_options): - from ..tools.deep_learning.iotools import computational_list - arg_list = list(vars(new_options).keys()) - - try: - arg_list.remove("same_init") - if new_options.same_init is not None: - if new_options.same_init == "False": - 
options.same_init = False - else: - options.same_init = new_options.same_init - - except ValueError: - pass - - for arg in arg_list: - new_value = getattr(new_options, arg) - if new_value is not None or arg in computational_list: - setattr(options, arg, new_value) - - return options - - -def retrain(new_options): - - options = deepcopy(new_options) - options = read_json(options, read_computational=True) - - # Default behaviour reuse the same dataset as before - options = set_options(options, new_options) - - if options.network_type == "autoencoder": - train_autoencoder(options) - elif options.network_type == "cnn": - train_single_cnn(options) - elif options.network_type == "multicnn": - train_multi_cnn(options) diff --git a/clinicadl/tests/test_train_from_json.py b/clinicadl/tests/test_train_from_json.py new file mode 100644 index 000000000..b3b6d0e3c --- /dev/null +++ b/clinicadl/tests/test_train_from_json.py @@ -0,0 +1,117 @@ +# coding: utf8 + +import pytest +import os +import shutil + + +@pytest.fixture(params=[ + 'train_slice_cnn', + 'train_image_cnn', + 'train_patch_cnn', + 'train_patch_multicnn', + 'train_roi_cnn', + 'train_roi_multicnn' +]) +def cli_commands(request): + + if request.param == 'train_slice_cnn': + test_input = [ + 'train', + 'slice', + 'cnn', + 'data/dataset/random_example', + 't1-linear', + 'data/labels_list', + 'results', + 'resnet18', + '--epochs', '1', + '--n_splits', '2', + '--split', '0' + ] + elif request.param == 'train_image_cnn': + test_input = [ + 'train', + 'image', + 'cnn', + 'data/dataset/random_example', + 't1-linear', + 'data/labels_list', + 'results', + 'Conv5_FC3', + '--epochs', '1', + '--n_splits', '2', + '--split', '0' + ] + elif request.param == 'train_patch_cnn': + test_input = [ + 'train', + 'patch', + 'cnn', + 'data/dataset/random_example', + 't1-linear', + 'data/labels_list', + 'results', + 'Conv4_FC3', + '--epochs', '1', + '--n_splits', '2', + '--split', '0' + ] + elif request.param == 'train_patch_multicnn': + 
test_input = [ + 'train', + 'patch', + 'multicnn', + 'data/dataset/random_example', + 't1-linear', + 'data/labels_list', + 'results', + 'Conv4_FC3', + '--epochs', '1', + '--n_splits', '2', + '--split', '0' + ] + elif request.param == 'train_roi_cnn': + test_input = [ + 'train', + 'roi', + 'cnn', + 'data/dataset/random_example', + 't1-linear', + 'data/labels_list', + 'results', + 'Conv4_FC3', + '--epochs', '1', + '--n_splits', '2', + '--split', '0' + ] + elif request.param == 'train_roi_multicnn': + test_input = [ + 'train', + 'roi', + 'multicnn', + 'data/dataset/random_example', + 't1-linear', + 'data/labels_list', + 'results', + 'Conv4_FC3', + '--epochs', '1', + '--n_splits', '2', + '--split', '0' + ] + else: + raise NotImplementedError( + "Test %s is not implemented." % + request.param) + + return test_input + + +def test_train(cli_commands): + test_input = cli_commands + flag_error = not os.system("clinicadl " + " ".join(test_input)) + performances_flag = os.path.exists( + os.path.join("results", "fold-0", "cnn_classification")) + assert flag_error + assert performances_flag + shutil.rmtree("results") diff --git a/docs/Retrain.md b/docs/Train/Retrain.md similarity index 100% rename from docs/Retrain.md rename to docs/Train/Retrain.md From e7f569dc448c4955e37862477b1d5c81e54fd71b Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Tue, 20 Apr 2021 15:06:07 +0200 Subject: [PATCH 17/37] Add corresponding test --- clinicadl/tests/test_train_from_json.py | 117 +++++------------------- 1 file changed, 24 insertions(+), 93 deletions(-) diff --git a/clinicadl/tests/test_train_from_json.py b/clinicadl/tests/test_train_from_json.py index b3b6d0e3c..7d8684059 100644 --- a/clinicadl/tests/test_train_from_json.py +++ b/clinicadl/tests/test_train_from_json.py @@ -6,112 +6,43 @@ @pytest.fixture(params=[ - 'train_slice_cnn', - 'train_image_cnn', - 'train_patch_cnn', - 'train_patch_multicnn', - 'train_roi_cnn', - 'train_roi_multicnn' + 'train_roi_cnn' ]) def 
cli_commands(request): - if request.param == 'train_slice_cnn': - test_input = [ - 'train', - 'slice', - 'cnn', - 'data/dataset/random_example', - 't1-linear', - 'data/labels_list', - 'results', - 'resnet18', - '--epochs', '1', - '--n_splits', '2', - '--split', '0' - ] - elif request.param == 'train_image_cnn': - test_input = [ - 'train', - 'image', - 'cnn', - 'data/dataset/random_example', - 't1-linear', - 'data/labels_list', - 'results', - 'Conv5_FC3', - '--epochs', '1', - '--n_splits', '2', - '--split', '0' - ] - elif request.param == 'train_patch_cnn': - test_input = [ - 'train', - 'patch', - 'cnn', - 'data/dataset/random_example', - 't1-linear', - 'data/labels_list', - 'results', - 'Conv4_FC3', - '--epochs', '1', - '--n_splits', '2', - '--split', '0' - ] - elif request.param == 'train_patch_multicnn': - test_input = [ - 'train', - 'patch', - 'multicnn', - 'data/dataset/random_example', - 't1-linear', - 'data/labels_list', - 'results', - 'Conv4_FC3', - '--epochs', '1', - '--n_splits', '2', - '--split', '0' - ] - elif request.param == 'train_roi_cnn': - test_input = [ - 'train', - 'roi', - 'cnn', - 'data/dataset/random_example', - 't1-linear', - 'data/labels_list', - 'results', - 'Conv4_FC3', - '--epochs', '1', - '--n_splits', '2', - '--split', '0' - ] - elif request.param == 'train_roi_multicnn': - test_input = [ - 'train', - 'roi', - 'multicnn', - 'data/dataset/random_example', - 't1-linear', - 'data/labels_list', - 'results', - 'Conv4_FC3', - '--epochs', '1', - '--n_splits', '2', - '--split', '0' - ] + if request.param == 'train_roi_cnn': + command_dict = { + "mode": "roi", + "network_type": "cnn", + "caps_dir": "data/dataset/random_example", + "preprocessing": "t1-linear", + "tsv_path": "data/labels_list", + "model": "Conv4_FC3", + + "epochs": 1, + "n_splits": 2, + "split": 0, + "gpu": True + } else: raise NotImplementedError( "Test %s is not implemented." 
% request.param) - return test_input + return command_dict def test_train(cli_commands): - test_input = cli_commands - flag_error = not os.system("clinicadl " + " ".join(test_input)) + import json + + json = json.dumps(cli_commands, skipkeys=True, indent=4) + with open(os.path.join("commandline.json"), "w") as f: + f.write(json) + + flag_error = not os.system("clinicadl train from_json commandline.json results") performances_flag = os.path.exists( os.path.join("results", "fold-0", "cnn_classification")) assert flag_error assert performances_flag shutil.rmtree("results") + os.remove("commandline.json") From 5dd42ceaf5e1c7cbf4d93b4e2a03b7afd938c928 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Tue, 20 Apr 2021 15:06:30 +0200 Subject: [PATCH 18/37] Change retrain in train from_json --- docs/Train/Introduction.md | 9 +++++++-- docs/Train/Retrain.md | 37 +++++++++++++++---------------------- mkdocs.yml | 1 + 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/docs/Train/Introduction.md b/docs/Train/Introduction.md index b02743eda..847f28de1 100644 --- a/docs/Train/Introduction.md +++ b/docs/Train/Introduction.md @@ -1,7 +1,8 @@ # `train` - Train deep learning networks for neuroimaging classification -This task enables the training of a convolutional neural network (CNN) classifier using different formats of inputs -(whole 3D images, 3D patches or 2D slices), as defined in [[Wen et al., 2020](https://doi.org/10.1016/j.media.2020.101694)]. +This task enables the training of a convolutional neural network (CNN) classifier or an autoencoder using +different formats of inputs (whole 3D images, 3D patches or 2D slices), as defined in +[[Wen et al., 2020](https://doi.org/10.1016/j.media.2020.101694)]. It mainly relies on the PyTorch deep learning library [[Paszke et al., 2019](https://papers.nips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library)]. 
@@ -37,6 +38,10 @@ In case of [multi-cohort training](Details.md#multi-cohort), must be a path to a - `architecture` (str) is the name of the architecture used (e.g. `Conv5_FC3`). It must correspond to a class that inherits from `nn.Module` imported in `tools/deep_learning/models/__init__.py`. +!!! tip "`from_json` mode" + To shorten the command line, the option [`from_json`](Retrain.md) was added to `clinicadl train` in order + to define all the arguments in a JSON file. + Options shared for all values of `mode` are organized in groups: - **Computational resources** diff --git a/docs/Train/Retrain.md b/docs/Train/Retrain.md index 73b3cb1e7..7e6707d0a 100644 --- a/docs/Train/Retrain.md +++ b/docs/Train/Retrain.md @@ -1,42 +1,35 @@ -# `clinicadl retrain` - Launch a new network training based on a previously trained network +# `clinicadl train from_json` - Launch a new network training based on a JSON file This command allows to train a network defined in a `commandline.json` generated by -[`clinicadl train`](./Train/Introduction.md) or -[`clinicadl random-search generate`](RandomSearch.md#clinicadl-random-search-generate--train-random-models-sampled-from-a-defined-hyperparameter-space). +[`clinicadl train`](Introduction.md) or +[`clinicadl random-search generate`](../RandomSearch.md#clinicadl-random-search-generate--train-random-models-sampled-from-a-defined-hyperparameter-space). This is particularly useful for outputs of random search, as the random architecture produced my be not straightforward to implement. -!!! note "Training from json file" - You don't actually need to have the complete folder of the network used as a reference. - The only file used in this function is `commandline.json` so you can also manually create this file - and store it in `model_path`. +!!! note "JSON file generation" + JSON files that may be used by `clinicadl train from_json` can be generated + by filling the options of [`clinicadl train`](Introduction.md#running-the-task). 
+ The undefined values of optional parameters will be automatically replaced by their default value. + You can also reuse (and modify) the `commandline.json` files automatically generated by other `clinicadl train` + modes! ## Prerequisites -Please check which preprocessing needs to -be performed in the `commandline.json` file in the results folder. If it has -not been performed, execute the preprocessing pipeline as well as `clinicadl -extract` to obtain the tensor versions of the images. +Please check which preprocessing needs to be performed in the JSON file used as input. +If it has not been performed, execute the preprocessing pipeline as well as `clinicadl +preprocessing extract-tensor` to obtain the tensor versions of the images. ## Running the task This task can be run with the following command line: ```Text -clinicadl retrain +clinicadl train from_json ``` where -- `model_path` (str) is a path to the folder where the reference model and the json file -are stored. +- `json_path` (str) is a path to the JSON file used to build the model. - `output_dir` (str) is a path to the folder of the new model that will be trained. -By default all the arguments will correspond to the ones used to train the reference model. -However you can change any of these by setting a new value to the optional parameters. - -!!! note "Optional parameters" - For more information on the optional parameters, please check [`clinicadl train` - description](./Train/Introduction.md#running-the-task) - ## Outputs -The outputs correspond to the ones obtained using [`clinicadl train`](./Train/Introduction.md#outputs). +The outputs correspond to the ones obtained using [`clinicadl train`](Introduction.md#outputs). 
diff --git a/mkdocs.yml b/mkdocs.yml index 01cbad2b8..a7a2016b1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,6 +76,7 @@ nav: - Patch-based: Train/Patch.md - Slice-based: Train/Slice.md - ROI-based: Train/ROI.md + - JSON defined: Train/Retrain.md - Custom experiment: Train/Custom.md - Implementation details: Train/Details.md - Classify: Classify.md From 56c9ae111fa3c147eda18ccb94807879db99b91f Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Mon, 26 Apr 2021 10:50:34 +0200 Subject: [PATCH 19/37] Add verbose --- clinicadl/clinicadl/cli.py | 3 ++- clinicadl/clinicadl/train/train_from_json.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index 51d8c5516..a4c929a31 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -122,7 +122,8 @@ def retrain_func(args): retrain( args.json_path, - args.output_dir + args.output_dir, + verbose=args.verbose ) diff --git a/clinicadl/clinicadl/train/train_from_json.py b/clinicadl/clinicadl/train/train_from_json.py index 1b05080f5..ed00366d0 100644 --- a/clinicadl/clinicadl/train/train_from_json.py +++ b/clinicadl/clinicadl/train/train_from_json.py @@ -10,11 +10,12 @@ from .train_multiCNN import train_multi_cnn -def retrain(json_path, output_dir): +def retrain(json_path, output_dir, verbose=0): options = argparse.Namespace() options = read_json(options, json_path=json_path, read_computational=True) check_and_complete(options) options.output_dir = output_dir + options.verbose = verbose if options.network_type == "autoencoder": train_autoencoder(options) From 896315bca903a9003b6525620ba5c2a96640819f Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Mon, 26 Apr 2021 18:31:59 +0200 Subject: [PATCH 20/37] Separate default values per mode --- clinicadl/clinicadl/cli.py | 10 --- .../clinicadl/tools/deep_learning/iotools.py | 76 ++++++++++++------- .../tools/deep_learning/models/random.py | 6 ++ 
clinicadl/tests/test_train_from_json.py | 3 +- 4 files changed, 56 insertions(+), 39 deletions(-) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index a4c929a31..c0764aaa7 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -640,16 +640,6 @@ def preprocessing_help(args): rs_pos_group.add_argument("name", type=str, help="Name of the job.") - # Data management - rs_data_group = rs_generate_parser.add_argument_group( - TRAIN_CATEGORIES["CROSS-VALIDATION"] - ) - rs_data_group.add_argument("--n_splits", type=int, default=0, - help="If a value is given for k will load data of a k-fold CV. " - "Default value (0) will load a single split.") - rs_data_group.add_argument("--split", type=int, default=None, nargs="+", - help="Will load the specific split wanted.") - rs_comp_group = rs_generate_parser.add_argument_group( TRAIN_CATEGORIES["COMPUTATIONAL"] ) diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index daa11cdc7..aeadbdf1e 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -127,20 +127,10 @@ def commandline_to_json(commandline, logger=None, filename="commandline.json"): os.makedirs(output_dir, exist_ok=True) # remove these entries from the commandline log file - if 'func' in commandline_arg_dict: - del commandline_arg_dict['func'] - - if 'output_dir' in commandline_arg_dict: - del commandline_arg_dict['output_dir'] - - if 'launch_dir' in commandline_arg_dict: - del commandline_arg_dict['launch_dir'] - - if 'name' in commandline_arg_dict: - del commandline_arg_dict['name'] - - if 'verbose' in commandline_arg_dict: - del commandline_arg_dict['verbose'] + remove_list = ['func', 'output_dir', 'launch_dir', 'name', 'verbose', 'logname'] + for variable in remove_list: + if variable in commandline_arg_dict: + del commandline_arg_dict[variable] # save to json file json = 
json.dumps(commandline_arg_dict, skipkeys=True, indent=4) @@ -279,48 +269,71 @@ def check_and_complete(options, random_search=False): options: (Namespace) the options used for training. random_search: (bool) If True the options are looking for mandatory values of random-search. """ + + def set_default(namespace, default_dict): + for name, default_value in default_dict.items(): + if not hasattr(namespace, name): + setattr(namespace, name, default_value) + filename = 'random_search.json' default_values = { "accumulation_steps": 1, "atlas_weight": 1, "baseline": False, - "channels_limit": 512, + "batch_size": 2, "data_augmentation": False, "diagnoses": ['AD', 'CN'], - "discarded_slices": 20, "dropout": 0, "epochs": 20, + "evaluation_steps": 0, "learning_rate": 4, "loss": "default", "merged_tsv_path": None, "multi_cohort": False, - "n_conv": 1, + "n_splits": 0, + "nproc": 2, "optimizer": "Adam", "unnormalize": False, - "patch_size": 50, "patience": 0, "predict_atlas_intensities": None, - "selection_threshold": 0, - "slice_direction": 0, - "stride_size": 50, + "split": None, "tolerance": 0.0, "transfer_learning_path": None, "transfer_learning_selection": "best_loss", - "use_extracted_patches": False, - "use_extracted_slices": False, - "use_extracted_roi": False, + "use_cpu": False, "wd_bool": True, "weight_decay": 4, "sampler": "random" } + mode_default_values = { + "patch": { + "patch_size": 50, + "stride_size": 50, + "selection_threshold": 0, + "use_extracted_patches": False + }, + "roi": { + "roi_list": None, + "selection_threshold": 0, + "uncropped_roi": False, + "use_extracted_roi": False + }, + "slice": { + "discarded_slices": 20, + "selection_threshold": 0, + "slice_direction": 0, + "use_extracted_slices": False + }, + "image": {} + } if random_search: default_values["d_reduction"] = "MaxPooling" default_values["network_normalization"] = "BatchNorm" + default_values["channels_limit"] = 512 + default_values["n_conv"] = 1 - for name, default_value in 
default_values.items(): - if not hasattr(options, name): - setattr(options, name, default_value) + set_default(options, default_values) mandatory_arguments = ['network_type', 'mode', 'tsv_path', 'caps_dir', 'preprocessing'] @@ -331,6 +344,15 @@ def check_and_complete(options, random_search=False): if not hasattr(options, argument): raise ValueError(f"The argument {argument} must be specified in {filename}.") + if random_search: + for mode, mode_dict in mode_default_values.items(): + set_default(options, mode_dict) + else: + if options.mode not in mode_default_values: + raise NotImplementedError(f"The mode optional arguments corresponding to mode {options.mode}") + mode_dict = mode_default_values[options.mode] + set_default(options, mode_dict) + def set_default_dropout(args): if args.dropout is None: diff --git a/clinicadl/clinicadl/tools/deep_learning/models/random.py b/clinicadl/clinicadl/tools/deep_learning/models/random.py index b6da1aa08..111c947ed 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/random.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/random.py @@ -48,18 +48,22 @@ def random_sampling(rs_options, options): "accumulation_steps": "randint", "atlas_weight": "uniform", "baseline": "choice", + "batch_size": "fixed", "caps_dir": "fixed", "channels_limit": "fixed", "data_augmentation": "fixed", "diagnoses": "fixed", "dropout": "uniform", "epochs": "fixed", + "evaluation_steps": "fixed", "learning_rate": "exponent", "loss": "choice", "merged_tsv_path": "fixed", "mode": "choice", "multi_cohort": "fixed", "n_fcblocks": "randint", + "n_splits": "fixed", + "nproc": "fixed", "network_type": "choice", "network_normalization": "choice", "optimizer": "choice", @@ -67,11 +71,13 @@ def random_sampling(rs_options, options): "preprocessing": "choice", "predict_atlas_intensities": "fixed", "sampler": "choice", + "split": "fixed", "tolerance": "fixed", "transfer_learning_path": "choice", "transfer_learning_selection": "choice", "tsv_path": "fixed", 
"unnormalize": "choice", + "use_cpu": "fixed", "wd_bool": "choice", "weight_decay": "exponent", } diff --git a/clinicadl/tests/test_train_from_json.py b/clinicadl/tests/test_train_from_json.py index 7d8684059..ab698c48a 100644 --- a/clinicadl/tests/test_train_from_json.py +++ b/clinicadl/tests/test_train_from_json.py @@ -21,8 +21,7 @@ def cli_commands(request): "epochs": 1, "n_splits": 2, - "split": 0, - "gpu": True + "split": [0], } else: raise NotImplementedError( From 720e4db773cbb34ca676bd52c35de1976bc64306 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Mon, 26 Apr 2021 18:32:20 +0200 Subject: [PATCH 21/37] remove optional parameters --- docs/RandomSearch.md | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/docs/RandomSearch.md b/docs/RandomSearch.md index eee134baa..86cd24f38 100644 --- a/docs/RandomSearch.md +++ b/docs/RandomSearch.md @@ -42,18 +42,6 @@ where: - `launch_directory` (str) is the parent directory of output folder containing the file `random_search.json`. - `name` (str) is the name of the output folder containing the experiment. -Optional arguments: - -- **Computational resources** - - `--use_cpu` (bool) forces to use CPU. Default behaviour is to try to use a GPU and to raise an error if it is not found. - - `--nproc` (int) is the number of workers used by the DataLoader. Default value: `2`. - - `--batch_size` (int) is the size of the batch used in the DataLoader. Default value: `2`. - - `--evaluation_steps` (int) gives the number of iterations to perform an [evaluation internal to an epoch](Train/Details.md#evaluation). - Default will only perform an evaluation at the end of each epoch. -- **Cross-validation arguments** - - `--n_splits` (int) is a number of splits k to load in the case of a k-fold cross-validation. Default will load a single-split. - - `--split` (list of int) is a subset of folds that will be used for training. By default all splits available are used. 
- ### Content of `random_search.json` `random_search.json` must be present in `launch_dir` before running the command. @@ -99,6 +87,12 @@ Optional variables: - `network_normalization` (str) is the type of normalization performed after convolutions. Must include only `BatchNorm`, `InstanceNorm` or `None`. Sampling function: `choice`. Default: `BatchNorm`. +- **Computational resources** + - `use_cpu` (bool) forces to use CPU. Default behaviour is to try to use a GPU and to raise an error if it is not found. + - `nproc` (int) is the number of workers used by the DataLoader. Default value: `2`. + - `batch_size` (int) is the size of the batch used in the DataLoader. Default value: `2`. + - `evaluation_steps` (int) gives the number of iterations to perform an [evaluation internal to an epoch](Train/Details.md#evaluation). + Default will only perform an evaluation at the end of each epoch. - **Data management** - `baseline` (bool) allows to only load `_baseline.tsv` files when set to `True`. Sampling function: `choice`. Default: `False`. @@ -109,6 +103,9 @@ Optional variables: Sampling function: `choice`. Default: `False`. - `sampler` (str) is the sampler used on the training set. It must be chosen in [`random`, `weighted`]. Sampling function: `choice`. Default: `random`. +- **Cross-validation arguments** + - `n_splits` (int) is a number of splits k to load in the case of a k-fold cross-validation. Default will load a single-split. + - `split` (list of int) is a subset of folds that will be used for training. By default all splits available are used. - **Optimization parameters** - `learning_rate` (float) is the learning rate used to perform weight update. Sampling function: `exponent`. Default: `4` (leading to a value of `1e-4`). 
From c4a3dbedf1d502144172134fc56b95bfbe7260f2 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Mon, 26 Apr 2021 19:56:56 +0200 Subject: [PATCH 22/37] Remove analysis optional argument --- CHANGELOG | 2 ++ .../classify/random_search_analysis.py | 17 +++++++++++++---- clinicadl/clinicadl/cli.py | 7 ------- .../clinicadl/tools/deep_learning/iotools.py | 6 +++++- clinicadl/tests/test_random_search.py | 10 +++++----- docs/RandomSearch.md | 3 --- 6 files changed, 25 insertions(+), 20 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 2623dc485..2cd4663e9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -40,6 +40,8 @@ based on `t1-volume` outputs. ### Changed - Previous `clinicadl random-search` is now `clinicadl random-search generate` +- cross-validation and computational arguments of `clinicadl random-search generate` are + now defined in `random_search.json`. ## ClinicaDL 0.2.1 diff --git a/clinicadl/clinicadl/classify/random_search_analysis.py b/clinicadl/clinicadl/classify/random_search_analysis.py index b1f0ea107..1d9c40685 100644 --- a/clinicadl/clinicadl/classify/random_search_analysis.py +++ b/clinicadl/clinicadl/classify/random_search_analysis.py @@ -7,11 +7,20 @@ import numpy as np from warnings import warn +from clinicadl.tools.deep_learning import read_json -def random_search_analysis(launch_dir, splits): - if splits is None: - splits = [0] +def random_search_analysis(launch_dir): + + rs_options = read_json(json_path=path.join(launch_dir, "random_search.json")) + + if rs_options.split is None: + if rs_options.n_splits is None: + fold_iterator = range(1) + else: + fold_iterator = range(rs_options.n_splits) + else: + fold_iterator = rs_options.split jobs_list = [job for job in os.listdir(launch_dir) if path.exists(path.join(launch_dir, job, "commandline.json"))] @@ -26,7 +35,7 @@ def random_search_analysis(launch_dir, splits): for job in jobs_list: valid_accuracies = [] - for fold in splits: + for fold in fold_iterator: performance_path = 
path.join(launch_dir, job, f'fold-{fold}', 'cnn_classification', f'best_{selection}') if path.exists(performance_path): valid_df = pd.read_csv(path.join(performance_path, 'validation_image_level_metrics.tsv'), sep='\t') diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index c0764aaa7..a99486e31 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -111,7 +111,6 @@ def rs_func(args): elif args.random_task == "analysis": random_search_analysis( args.launch_dir, - args.splits ) else: raise ValueError('This task was not implemented in random-search.') @@ -675,12 +674,6 @@ def preprocessing_help(args): help="Directory containing the random_search.json file." ) - rs_analysis_parser.add_argument( - "--splits", - type=int, nargs="+", default=None, - help="List of the folds used for the analysis. Default will perform only the first fold." - ) - rs_analysis_parser.set_defaults(func=rs_func) resume_parser = subparser.add_parser( diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index aeadbdf1e..a630277b7 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -140,7 +140,7 @@ def commandline_to_json(commandline, logger=None, filename="commandline.json"): f.close() -def read_json(options, json_path=None, test=False, read_computational=False): +def read_json(options=None, json_path=None, test=False, read_computational=False): """ Read a json file to update python argparse Namespace. Ensures retro-compatibility with previous namings in clinicadl. 
@@ -155,6 +155,10 @@ def read_json(options, json_path=None, test=False, read_computational=False): """ import json from os import path + from argparse import Namespace + + if options is None: + options = Namespace() evaluation_parameters = ["diagnosis_path", "input_dir", "diagnoses"] prep_compatibility_dict = {"mni": "t1-extensive", "linear": "t1-linear"} diff --git a/clinicadl/tests/test_random_search.py b/clinicadl/tests/test_random_search.py index 2f4174ce9..ab8532276 100644 --- a/clinicadl/tests/test_random_search.py +++ b/clinicadl/tests/test_random_search.py @@ -27,6 +27,9 @@ def cli_commands(request): 'patience': 0, 'tolerance': 0.0, + 'n_splits': 2, + 'split': [0], + 'n_convblocks': [3, 5], 'first_conv_width': [1, 3], 'n_fcblocks': [1, 2] @@ -35,15 +38,12 @@ def cli_commands(request): 'random-search', 'generate', launch_dir, - name_dir, - '--n_splits', '2', - '--split', '0', + name_dir ] log_input = [ 'random-search', 'analysis', - launch_dir, - '--splits', '2' + launch_dir ] else: raise NotImplementedError( diff --git a/docs/RandomSearch.md b/docs/RandomSearch.md index 86cd24f38..741edca94 100644 --- a/docs/RandomSearch.md +++ b/docs/RandomSearch.md @@ -341,9 +341,6 @@ clinicadl random-search analysis ``` where `launch_directory` (str) is the parent directory of output folder containing the file `random_search.json`. -The list of the folds that can be included in the analysis can be specified in `splits` option. -If nothing is specified only the first split is included for all jobs. 
- ### Outputs Two TSV files are produced in `launch_directory`: From 53f34d8b923b877d61cf4293a072cfcb62daf0b1 Mon Sep 17 00:00:00 2001 From: Elina Thibeau Sutre Date: Thu, 6 May 2021 09:57:50 +0200 Subject: [PATCH 23/37] Update CHANGELOG Co-authored-by: mdiazmel --- CHANGELOG | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 2cd4663e9..01a5874b9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -34,7 +34,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 over a random search folder. - New functionality `clinicadl train from_json` to train a model with parameters defined in a JSON file. - New functionality `clinicadl resume` to resume a prematurely stopped training task. -- possibility to learn the grey matter intensities with the binary classification during training, +- Possibility to learn the grey matter intensities with the binary classification during training, based on `t1-volume` outputs. ### Changed @@ -85,4 +85,3 @@ based on `t1-volume` outputs. - Fix broken file when running preprocessing in t1-extensive. 
- From f2c0e957e50258ecda3a09ef36af48d6b3a3e884 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Thu, 6 May 2021 10:02:19 +0200 Subject: [PATCH 24/37] Add review suggestions --- clinicadl/clinicadl/cli.py | 4 ++-- docs/Classify.md | 2 +- docs/Resume.md | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index a99486e31..7e76023ad 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -282,7 +282,7 @@ def parse_command_line(): parent_parser.add_argument('--verbose', '-v', action='count', default=0) subparser = parser.add_subparsers( - title='''Task to execute with clinicadl:''', + title='''Task to execute with clinicadl''', description='''What kind of task do you want to use with clinicadl?''', dest='task', help='''****** Tasks proposed by clinicadl ******''') @@ -423,7 +423,7 @@ def parse_command_line(): ) preprocessing_subparser = preprocessing_parser.add_subparsers( - title='''Preprocessing task to execute with clinicadl:''', + title='''Preprocessing task to execute with clinicadl''', description='''What kind of task do you want to perform with clinicadl? (run, quality-check, extract-tensor).''', dest='preprocessing_task', diff --git a/docs/Classify.md b/docs/Classify.md index 9e9fb535f..59a927555 100644 --- a/docs/Classify.md +++ b/docs/Classify.md @@ -1,7 +1,7 @@ # `clinicadl classify` - Inference using pretrained models This functionality performs image classification using models trained with -[`clinicadl train`](./Train/Introduction.md) of [`clinicadl random-search generate`](./RandomSearch.md) +[`clinicadl train`](./Train/Introduction.md) or [`clinicadl random-search generate`](./RandomSearch.md) tasks. It can also use pretrained models if their folder structure is similar to the structure created by the command `clinicadl train`. 
At the top level of each model folder there are two diff --git a/docs/Resume.md b/docs/Resume.md index f03ec4165..c2ac95b77 100644 --- a/docs/Resume.md +++ b/docs/Resume.md @@ -2,9 +2,7 @@ This functionality allows to resume a prematurely stopped job trained with [`clinicadl train`](./Train/Introduction.md) of [`clinicadl random-search generate`](./RandomSearch.md) tasks. -It can also use pretrained -models if their folder structure is similar to the structure created by the -command `clinicadl train`. The files that are used by this function are the following: +The files that are used by this function are the following: - `commandline.json` describes the training parameters used to create the model, From a63c2504d74777202b91f43d2efbad4765205598 Mon Sep 17 00:00:00 2001 From: Elina Thibeau-Sutre Date: Thu, 6 May 2021 17:16:25 +0200 Subject: [PATCH 25/37] Change resume position in command line --- CHANGELOG | 2 +- clinicadl/clinicadl/cli.py | 89 ++++++++++++++++++++------------------ docs/{ => Train}/Resume.md | 10 ++--- mkdocs.yml | 1 + 4 files changed, 53 insertions(+), 49 deletions(-) rename docs/{ => Train}/Resume.md (88%) diff --git a/CHANGELOG b/CHANGELOG index 01a5874b9..fb13da9a4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -33,7 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New functionality `clinicadl random-search analysis` to obtain the histogram of the balanced accuracy over a random search folder. - New functionality `clinicadl train from_json` to train a model with parameters defined in a JSON file. -- New functionality `clinicadl resume` to resume a prematurely stopped training task. +- New functionality `clinicadl train resume` to resume a prematurely stopped training task. - Possibility to learn the grey matter intensities with the binary classification during training, based on `t1-volume` outputs. 
diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index 7e76023ad..4e6e8b4fc 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -676,49 +676,6 @@ def preprocessing_help(args): rs_analysis_parser.set_defaults(func=rs_func) - resume_parser = subparser.add_parser( - 'resume', - parents=[parent_parser], - help='Resume all jobs prematurely ended in launch_dir.' - ) - - resume_parser.add_argument( - "model_path", - type=str, - help="Directory containing the random_search.json file." - ) - - resume_comp_group = resume_parser.add_argument_group( - TRAIN_CATEGORIES["COMPUTATIONAL"] - ) - resume_comp_group.add_argument( - "-np", "--nproc", - help='Number of cores used the quality check. ' - 'Default will reuse the same value than in training.', - type=int, default=None - ) - resume_comp_group.add_argument( - '-cpu', '--use_cpu', action='store_true', default=False, - help='Override the previous command line to use CPU.', - ) - resume_comp_group.add_argument( - '-gpu', '--use_gpu', action='store_true', default=False, - help='Override the previous command line to use GPU.', - ) - resume_comp_group.add_argument( - '--batch_size', - default=None, type=int, - help='Batch size for data loading. ' - 'Default will reuse the same value than in training.') - resume_comp_group.add_argument( - '--evaluation_steps', '-esteps', - default=None, type=int, - help='Fix the number of iterations to perform before computing an evaluation. ' - 'Default will reuse the same value than in training.' - ) - - resume_parser.set_defaults(func=resume_func) - train_parser = subparser.add_parser( 'train', help='Train with your data and create a model.') @@ -1079,6 +1036,52 @@ def preprocessing_help(args): train_json_parser.set_defaults(func=retrain_func) + ######################### + # RESUME + ######################### + resume_parser = train_subparser.add_parser( + 'resume', + parents=[parent_parser], + help='Resume all jobs prematurely ended in launch_dir.' 
+ ) + + resume_parser.add_argument( + "model_path", + type=str, + help="Directory containing the random_search.json file." + ) + + resume_comp_group = resume_parser.add_argument_group( + TRAIN_CATEGORIES["COMPUTATIONAL"] + ) + resume_comp_group.add_argument( + "-np", "--nproc", + help='Number of cores used the quality check. ' + 'Default will reuse the same value than in training.', + type=int, default=None + ) + resume_comp_group.add_argument( + '-cpu', '--use_cpu', action='store_true', default=False, + help='Override the previous command line to use CPU.', + ) + resume_comp_group.add_argument( + '-gpu', '--use_gpu', action='store_true', default=False, + help='Override the previous command line to use GPU.', + ) + resume_comp_group.add_argument( + '--batch_size', + default=None, type=int, + help='Batch size for data loading. ' + 'Default will reuse the same value than in training.') + resume_comp_group.add_argument( + '--evaluation_steps', '-esteps', + default=None, type=int, + help='Fix the number of iterations to perform before computing an evaluation. ' + 'Default will reuse the same value than in training.' + ) + + resume_parser.set_defaults(func=resume_func) + # Classify - Classify a subject or a list of tsv files with the CNN # provided as argument. # classify_parser: get command line arguments and options diff --git a/docs/Resume.md b/docs/Train/Resume.md similarity index 88% rename from docs/Resume.md rename to docs/Train/Resume.md index c2ac95b77..6d4a579f7 100644 --- a/docs/Resume.md +++ b/docs/Train/Resume.md @@ -1,7 +1,7 @@ -# `clinicadl resume` - Resume a prematurely stopped job +# `clinicadl train resume` - Resume a prematurely stopped job This functionality allows to resume a prematurely stopped job trained with -[`clinicadl train`](./Train/Introduction.md) of [`clinicadl random-search generate`](./RandomSearch.md) tasks. +[`clinicadl train`](Introduction.md) of [`clinicadl random-search generate`](../RandomSearch.md) tasks. 
The files that are used by this function are the following: - `commandline.json` describes the training parameters used to create the @@ -44,7 +44,7 @@ extract` to obtain the tensor versions of the images. ## Running the task This task can be run with the following command line: ```Text -clinicadl resume +clinicadl train resume ``` where `model_path` (str) is a path to the folder where the model and the json file @@ -63,9 +63,9 @@ computing an evaluation. ## Outputs -The outputs correspond to the ones obtained using [`clinicadl train`](./Train/Introduction.md#outputs) +The outputs correspond to the ones obtained using [`clinicadl train`](Introduction.md#outputs) !!! note The files `checkpoint.pth.tar` and `optimizer.pth.tar` are automatically removed as soon - as the [stopping criterion](./Train/Details.md#stopping-criterion) is reached and the + as the [stopping criterion](Details.md#stopping-criterion) is reached and the performances of the models are evaluated on the training and validation datasets. 
diff --git a/mkdocs.yml b/mkdocs.yml index a7a2016b1..3cf68ccd2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,6 +77,7 @@ nav: - Slice-based: Train/Slice.md - ROI-based: Train/ROI.md - JSON defined: Train/Retrain.md + - Resume: Train/Resume.md - Custom experiment: Train/Custom.md - Implementation details: Train/Details.md - Classify: Classify.md From 135b981b1fd8e8ddead077fc71e72ebc8d832fe5 Mon Sep 17 00:00:00 2001 From: Elina Thibeau Sutre Date: Tue, 18 May 2021 15:08:40 +0200 Subject: [PATCH 26/37] Downsampling input (#147) * Predict GM intensities from atlases * Change retrain in train from_json * Adapt to adni-to-bids changes * Add multi-cohort * Update docs * update info.json * Reverse bad merge * update gradients for variational CNN * Improve diagnosis inference to avoid losing sessions * Remove shepplogan generation * Fix minor bug in qc * Add multi-cohort * Fix test * Ensure retro-compatibility * Implement first dragt of ROI reader * Fix custom ROI extraction * Add tests on custom ROI * Add t1-volume extraction * Deal with different level of cropping * Fix test * Fix multicnn for ROI and slice parsers * Read correctly outputs of clinica * Add t1-linear-downsampled * Predict GM intensities from atlases * Update Conv5_FC3_down * Fix single ROI case * Fix style * Remove double merged_tsv_path * Fix parameters translation Co-authored-by: Mauricio DIAZ --- clinicadl/clinicadl/cli.py | 1 + .../clinicadl/interpret/group_backprop.py | 2 + .../interpret/individual_backprop.py | 3 + .../preprocessing/t1_extensive/info.json | 2 +- .../tools/deep_learning/cnn_utils.py | 3 +- .../clinicadl/tools/deep_learning/data.py | 25 ++------ .../clinicadl/tools/deep_learning/iotools.py | 2 +- .../tools/deep_learning/models/__init__.py | 2 +- .../tools/deep_learning/models/image_level.py | 60 +++++++++++++++++++ .../clinicadl/tools/inputs/filename_types.py | 1 + docs/Train/Details.md | 1 - 11 files changed, 78 insertions(+), 24 deletions(-) diff --git a/clinicadl/clinicadl/cli.py 
b/clinicadl/clinicadl/cli.py index 4e6e8b4fc..e71571c26 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -1534,6 +1534,7 @@ def return_train_parent_parser(): '--merged_tsv_path', default="", type=str, help="Path to the output of clinica iotools merged-tsv (concatenation for multi-cohort). " + "Can accelerate training if atlas intensities are predicted." ) train_cv_group = train_parent_parser.add_argument_group( diff --git a/clinicadl/clinicadl/interpret/group_backprop.py b/clinicadl/clinicadl/interpret/group_backprop.py index 46fcb9091..936aa5051 100644 --- a/clinicadl/clinicadl/interpret/group_backprop.py +++ b/clinicadl/clinicadl/interpret/group_backprop.py @@ -33,6 +33,8 @@ def group_backprop(options): if options.tsv_path is None and options.input_dir is None: options.multi_cohort = model_options.multi_cohort + else: + options.multi_cohort = False if options.tsv_path is None: options.tsv_path = model_options.tsv_path if options.input_dir is None: diff --git a/clinicadl/clinicadl/interpret/individual_backprop.py b/clinicadl/clinicadl/interpret/individual_backprop.py index a97c14fb7..dc93f264c 100644 --- a/clinicadl/clinicadl/interpret/individual_backprop.py +++ b/clinicadl/clinicadl/interpret/individual_backprop.py @@ -24,6 +24,7 @@ def individual_backprop(options): raise ValueError("No folds were found at path %s" % options.model_path) model_options = argparse.Namespace() + model_options = read_json(model_options, path.join(options.model_path, 'commandline.json')) model_options = translate_parameters(model_options) model_options.gpu = options.gpu @@ -33,6 +34,8 @@ def individual_backprop(options): if options.tsv_path is None and options.input_dir is None: options.multi_cohort = model_options.multi_cohort + else: + options.multi_cohort = False if options.tsv_path is None: options.tsv_path = model_options.tsv_path if options.input_dir is None: diff --git a/clinicadl/clinicadl/preprocessing/t1_extensive/info.json 
b/clinicadl/clinicadl/preprocessing/t1_extensive/info.json index bc80b5e7e..df4dd2b4b 100644 --- a/clinicadl/clinicadl/preprocessing/t1_extensive/info.json +++ b/clinicadl/clinicadl/preprocessing/t1_extensive/info.json @@ -4,5 +4,5 @@ "version": "0.1.0", "space_caps": "3M", "space_wd": "3M", - "dependencies": [] + "dependencies": [], } diff --git a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py index bf842b655..551c4f109 100644 --- a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py +++ b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py @@ -614,7 +614,8 @@ def soft_voting(performance_df, validation_df, mode, selection_threshold=None, u def mode_to_image_tsvs(output_dir, fold, selection, mode, dataset="test"): """ - Copy mode-level tsvs to name them as image-level tsvs + Copy mode-level tsvs to name them as image-level TSV files + Args: output_dir: (str) path to the output directory. fold: (int) Fold number of the cross-validation. 
diff --git a/clinicadl/clinicadl/tools/deep_learning/data.py b/clinicadl/clinicadl/tools/deep_learning/data.py index c9a8c4312..612d45496 100644 --- a/clinicadl/clinicadl/tools/deep_learning/data.py +++ b/clinicadl/clinicadl/tools/deep_learning/data.py @@ -164,6 +164,11 @@ def _get_path(self, participant, session, cohort, mode="image"): 'deeplearning_prepare_data', '%s_based' % mode, 't1_linear', participant + '_' + session + FILENAME_TYPE['cropped'] + '.pt') + elif self.preprocessing == "t1-linear-downsampled": + image_path = path.join(self.caps_dict[cohort], 'subjects', participant, session, + 'deeplearning_prepare_data', '%s_based' % mode, 't1_linear', + participant + '_' + session + + FILENAME_TYPE['downsampled'] + '.pt') elif self.preprocessing == "t1-extensive": image_path = path.join(self.caps_dict[cohort], 'subjects', participant, session, 'deeplearning_prepare_data', '%s_based' % mode, 't1_extensive', @@ -465,13 +470,7 @@ def __init__(self, caps_directory, data_file, roi_list=None, cropped_roi=True, r atlas=atlas, merged_df=merged_df) def __getitem__(self, idx): - from time import time - - t0 = time() participant, session, cohort, roi_idx, label = self._get_meta_data(idx) - t1 = time( - ) - print(f"get meta data {t1 - t0}") if self.prepare_dl: if self.roi_list is None: @@ -488,8 +487,6 @@ def __getitem__(self, idx): image_path = self._get_path(participant, session, cohort, "image") image = torch.load(image_path) patch = self.extract_roi_from_mri(image, roi_idx) - t2 = time() - print(f"get roi {t2 - t1}") if self.transformations: patch = self.transformations(patch) @@ -497,21 +494,14 @@ def __getitem__(self, idx): if self.augmentation_transformations and not self.eval_mode: patch = self.augmentation_transformations(patch) - t3 = time() - print(f"transformations {t3 - t2}") - sample = {'image': patch, 'label': label, 'participant_id': participant, 'session_id': session, 'roi_id': roi_idx} - t4 = time() - print(f"sample {t4 - t3}") if self.atlas is not None: 
atlas_df = self._get_statistics_df(participant, session, cohort) atlas_pt = torch.from_numpy(atlas_df.values).float() sample['atlas'] = atlas_pt - t5 = time() - print(f"get atlas {t5 - t4}") return sample @@ -525,7 +515,6 @@ def num_elem_per_image(self): def extract_roi_from_mri(self, image_tensor, roi_idx): """ - :param image_tensor: (Tensor) the tensor of the image. :param roi_idx: (int) Region index. :return: Tensor of the extracted region. @@ -751,7 +740,6 @@ def return_dataset(mode, input_dir, data_df, preprocessing, prepare_dl=False): """ Return appropriate Dataset according to given options. - Args: mode: (str) input used by the network. Chosen from ['image', 'patch', 'roi', 'slice']. input_dir: (str) path to a directory containing a CAPS structure. @@ -772,7 +760,7 @@ def return_dataset(mode, input_dir, data_df, preprocessing, if cnn_index is not None and mode in ["image"]: raise ValueError("Multi-CNN is not implemented for %s mode." % mode) - if params.merged_tsv_path is not "": + if params.merged_tsv_path is not "" and params.merged_tsv_path is not None: merged_df = pd.read_csv(params.merged_tsv_path, sep="\t") else: merged_df = None @@ -944,7 +932,6 @@ def __call__(self, image): def get_transforms(mode, minmaxnormalization=True, data_augmentation=None): """ Outputs the transformations that will be applied to the dataset - :param mode: (str) input used by the network. Chosen from ['image', 'patch', 'roi', 'slice']. :param minmaxnormalization: (bool) if True will perform MinMaxNormalization :param data_augmentation: (list[str]) list of data augmentation performed on the training set. 
diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index a630277b7..aa3295b94 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -253,7 +253,7 @@ def read_json(options=None, json_path=None, test=False, read_computational=False options.predict_atlas_intensities = None if not hasattr(options, "merged_tsv_path"): - options.merged_tsv_path = "" + options.merged_tsv_path = None if not hasattr(options, "atlas_weight"): options.atlas_weight = 1 diff --git a/clinicadl/clinicadl/tools/deep_learning/models/__init__.py b/clinicadl/clinicadl/tools/deep_learning/models/__init__.py index 59a4b79df..9131fc259 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/__init__.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/__init__.py @@ -1,6 +1,6 @@ from .autoencoder import AutoEncoder, initialize_other_autoencoder, transfer_learning from .iotools import load_model, load_optimizer, save_checkpoint -from .image_level import Conv5_FC3, Conv5_FC3_mni, Conv6_FC3, VConv5_FC3 +from .image_level import Conv5_FC3, Conv5_FC3_mni, Conv6_FC3, VConv5_FC3, Conv5_FC3_down from .patch_level import Conv4_FC3 from .slice_level import resnet18, ConvNet from .random import RandomArchitecture diff --git a/clinicadl/clinicadl/tools/deep_learning/models/image_level.py b/clinicadl/clinicadl/tools/deep_learning/models/image_level.py index 0ba14f082..6d0836f20 100755 --- a/clinicadl/clinicadl/tools/deep_learning/models/image_level.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/image_level.py @@ -264,3 +264,63 @@ def forward(self, x): x = self.classifier(x) return x + + +class Conv5_FC3_down(nn.Module): + """ + Classifier for a binary classification task + + Image level architecture used on Minimal preprocessing + """ + def __init__(self, dropout=0.5, n_classes=2): + super(Conv5_FC3_down, self).__init__() + + self.features = nn.Sequential( + 
nn.Conv3d(1, 8, 3, padding=1), + nn.BatchNorm3d(8), + nn.ReLU(), + PadMaxPool3d(2, 2), + + nn.Conv3d(8, 16, 3, padding=1), + nn.BatchNorm3d(16), + nn.ReLU(), + PadMaxPool3d(2, 2), + + nn.Conv3d(16, 32, 3, padding=1), + nn.BatchNorm3d(32), + nn.ReLU(), + PadMaxPool3d(2, 2), + + nn.Conv3d(32, 64, 3, padding=1), + nn.BatchNorm3d(64), + nn.ReLU(), + PadMaxPool3d(2, 2), + + nn.Conv3d(64, 128, 3, padding=1), + nn.BatchNorm3d(128), + nn.ReLU(), + PadMaxPool3d(2, 2), + + ) + + self.classifier = nn.Sequential( + Flatten(), + nn.Dropout(p=dropout), + + nn.Linear(128 * 3 * 4 * 3, 350), + nn.ReLU(), + + nn.Linear(350, 25), + nn.ReLU(), + + nn.Linear(25, n_classes) + + ) + + self.flattened_shape = [-1, 128, 3, 4, 3] + + def forward(self, x): + x = self.features(x) + x = self.classifier(x) + + return x diff --git a/clinicadl/clinicadl/tools/inputs/filename_types.py b/clinicadl/clinicadl/tools/inputs/filename_types.py index 5775aacc5..99b3ebe03 100644 --- a/clinicadl/clinicadl/tools/inputs/filename_types.py +++ b/clinicadl/clinicadl/tools/inputs/filename_types.py @@ -2,6 +2,7 @@ FILENAME_TYPE = {'full': '_T1w_space-MNI152NLin2009cSym_res-1x1x1_T1w', 'cropped': '_T1w_space-MNI152NLin2009cSym_desc-Crop_res-1x1x1_T1w', + 'downsampled': '_T1w_space-MNI152NLin2009cSym_desc-Crop_res-2x2x2_T1w', 'skull_stripped': '_space-Ixi549Space_desc-skullstripped_T1w', 'gm_maps': '_T1w_segm-graymatter_space-Ixi549Space_modulated-off_probability', 'shepplogan': '_phantom-SheppLogan'} diff --git a/docs/Train/Details.md b/docs/Train/Details.md index 96547a453..8730f535b 100644 --- a/docs/Train/Details.md +++ b/docs/Train/Details.md @@ -143,4 +143,3 @@ or [`kfold`](../TSVTools.md#kfold---k-fold-split) methods. - `diagnoses` the diagnoses that will be used in the cohort. Must correspond to a single string with labels accepted by `clinicadl train` (`AD`, `BV`, `CN`, `MCI`, `sMCI` or `pMCI`) separated by commas. See the [dedicated section](./Custom.md#custom-labels) to use custom labels. 
- From d6b49d9b430b4133b3dd1c2eff63738c6bdd7fd7 Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Thu, 20 May 2021 01:38:12 +0200 Subject: [PATCH 27/37] Add magic word to avoid reformat code --- .../preprocessing/t1_extensive/t1_extensive_pipeline.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py index 5ca38dd14..f10e259ca 100644 --- a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py +++ b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py @@ -143,6 +143,7 @@ def build_output_node(self): write_node.inputs.base_directory = self.caps_directory write_node.inputs.parameterization = False + # fmt: off self.connect( [ (self.input_node, container_path, [("norm_t1w", "bids_or_caps_filename")]), @@ -150,6 +151,7 @@ def build_output_node(self): (container_path, write_node, [(("container", fix_join, ""), "container")]), ] ) + # fmt: on def build_core_nodes(self): """Build and connect the core nodes of the pipeline.""" @@ -196,6 +198,7 @@ def build_core_nodes(self): # Connection # ========== + # fmt: off self.connect( [ (self.input_node, caps_filename, [("norm_t1w", "norm_t1w")]), @@ -204,3 +207,4 @@ def build_core_nodes(self): (skull_stripping, self.output_node, [("masked_image_path", "skull_stripped_t1w")]), ] ) + # fmt: on From 5e46aef09897ceb8276daa73b0b8f9e0ea353311 Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Thu, 20 May 2021 02:06:12 +0200 Subject: [PATCH 28/37] Add pre-commit and parameters for project --- .pre-commit-config.yaml | 20 +++++++++++++++++++ clinicadl/pyproject.toml | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 .pre-commit-config.yaml create mode 100644 clinicadl/pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..04d3bb2f6 --- /dev/null +++ 
b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml +- repo: https://github.com/psf/black + rev: 20.8b0 + hooks: + - id: black +- repo: https://github.com/pycqa/isort + rev: 5.8.0 + hooks: + - id: isort + name: isort (python) + - id: isort + name: isort (cython) + types: [cython] + - id: isort + name: isort (pyi) + types: [pyi] diff --git a/clinicadl/pyproject.toml b/clinicadl/pyproject.toml new file mode 100644 index 000000000..0fc0e1df8 --- /dev/null +++ b/clinicadl/pyproject.toml @@ -0,0 +1,42 @@ +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +line_length = 88 +skip_gitignore = true + +[tool.black] +line-length = 88 +target-version = ['py36', 'py37', 'py38'] +include = '\.pyi?$' +force-exclude = ''' +/( + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | \.pytest_cache + | _build + | buck-out + | build + | dist + | docs + | README.md + | MANIFEST.in + | LICENSE.txt + | clinicadl/VERSION + | clinica/pipelines/dwi_connectome +)/ +''' +exclude = ''' +/( + README.md + | MANIFEST.in + | LICENSE.txt + | clinicadl/VERSION +)/ +''' From 1645bbf45b6aad3b8ed74975907373fc98bde5fa Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Thu, 20 May 2021 02:08:14 +0200 Subject: [PATCH 29/37] Refactoring all files with black --- .gitignore | 2 +- clinicadl/clinicadl/__init__.py | 7 +- clinicadl/clinicadl/classify/inference.py | 271 ++- .../classify/random_search_analysis.py | 56 +- clinicadl/clinicadl/cli.py | 1660 ++++++++++------- clinicadl/clinicadl/interpret/gradients.py | 3 +- .../clinicadl/interpret/group_backprop.py | 137 +- .../interpret/individual_backprop.py | 143 +- clinicadl/clinicadl/main.py | 26 +- .../t1_extensive/t1_extensive_cli.py | 3 +- .../t1_extensive/t1_extensive_pipeline.py | 18 +- .../t1_extensive/t1_extensive_utils.py | 4 +- 
.../quality_check/t1_linear/quality_check.py | 52 +- .../quality_check/t1_linear/utils.py | 130 +- .../quality_check/t1_volume/quality_check.py | 39 +- .../quality_check/t1_volume/utils.py | 97 +- .../clinicadl/resume/automatic_resume.py | 57 +- .../clinicadl/resume/resume_autoencoder.py | 119 +- .../clinicadl/resume/resume_single_CNN.py | 126 +- .../clinicadl/tools/data/generate_data.py | 303 +-- clinicadl/clinicadl/tools/data/utils.py | 199 +- .../clinicadl/tools/deep_learning/__init__.py | 25 +- .../tools/deep_learning/autoencoder_utils.py | 196 +- .../tools/deep_learning/cnn_utils.py | 630 +++++-- .../clinicadl/tools/deep_learning/data.py | 887 ++++++--- .../clinicadl/tools/deep_learning/iotools.py | 89 +- .../tools/deep_learning/models/__init__.py | 34 +- .../tools/deep_learning/models/autoencoder.py | 132 +- .../tools/deep_learning/models/image_level.py | 75 +- .../tools/deep_learning/models/iotools.py | 42 +- .../tools/deep_learning/models/modules.py | 12 +- .../tools/deep_learning/models/patch_level.py | 14 +- .../tools/deep_learning/models/random.py | 194 +- .../tools/deep_learning/models/slice_level.py | 39 +- .../clinicadl/tools/inputs/filename_types.py | 26 +- .../clinicadl/tools/tsv/data_formatting.py | 373 ++-- clinicadl/clinicadl/tools/tsv/data_split.py | 235 ++- .../tools/tsv/demographics_analysis.py | 184 +- clinicadl/clinicadl/tools/tsv/kfold_split.py | 162 +- clinicadl/clinicadl/tools/tsv/restriction.py | 17 +- clinicadl/clinicadl/tools/tsv/test.py | 91 +- clinicadl/clinicadl/tools/tsv/tsv_utils.py | 95 +- clinicadl/clinicadl/train/__init__.py | 4 +- clinicadl/clinicadl/train/random_search.py | 8 +- .../clinicadl/train/train_autoencoder.py | 153 +- clinicadl/clinicadl/train/train_from_json.py | 4 +- clinicadl/clinicadl/train/train_multiCNN.py | 232 ++- clinicadl/clinicadl/train/train_singleCNN.py | 200 +- clinicadl/setup.py | 42 +- clinicadl/tests/test_classify.py | 74 +- clinicadl/tests/test_cli.py | 366 ++-- clinicadl/tests/test_generate.py | 
124 +- clinicadl/tests/test_interpret.py | 80 +- clinicadl/tests/test_random_search.py | 61 +- clinicadl/tests/test_train_ae.py | 97 +- clinicadl/tests/test_train_cnn.py | 187 +- clinicadl/tests/test_train_custom_roi.py | 89 +- clinicadl/tests/test_train_from_json.py | 14 +- clinicadl/tests/test_transfer_learning.py | 352 ++-- clinicadl/tests/test_tsvtool.py | 31 +- 60 files changed, 5727 insertions(+), 3395 deletions(-) diff --git a/.gitignore b/.gitignore index da1191812..380cf233c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ build/ dist/ examples/external-data -Clinica.egg-info +clinicadl.egg-info *egg-info *.log *.lock diff --git a/clinicadl/clinicadl/__init__.py b/clinicadl/clinicadl/__init__.py index f287d9437..9200e1204 100644 --- a/clinicadl/clinicadl/__init__.py +++ b/clinicadl/clinicadl/__init__.py @@ -1,9 +1,10 @@ -__all__ = ['__version__'] +__all__ = ["__version__"] # Load the Clinica package version -import sys import pkgutil -__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip() +import sys + +__version__ = pkgutil.get_data(__package__, "VERSION").decode("ascii").strip() version = __version__ # import pkg_resources diff --git a/clinicadl/clinicadl/classify/inference.py b/clinicadl/clinicadl/classify/inference.py index 1fa2b183a..62ca91da9 100644 --- a/clinicadl/clinicadl/classify/inference.py +++ b/clinicadl/clinicadl/classify/inference.py @@ -1,29 +1,48 @@ # coding: utf8 -from os.path import join, exists -from os import strerror, makedirs, listdir import errno import pathlib -from clinicadl.tools.deep_learning import create_model, load_model, read_json, commandline_to_json -from clinicadl.tools.deep_learning.iotools import return_logger, translate_parameters -from clinicadl.tools.deep_learning.data import return_dataset, get_transforms, compute_num_cnn, load_data_test -from clinicadl.tools.deep_learning.cnn_utils import test, soft_voting_to_tsvs, mode_level_to_tsvs, get_criterion +from os import listdir, makedirs, 
strerror +from os.path import exists, join + from torch.utils.data import DataLoader +from clinicadl.tools.deep_learning import ( + commandline_to_json, + create_model, + load_model, + read_json, +) +from clinicadl.tools.deep_learning.cnn_utils import ( + get_criterion, + mode_level_to_tsvs, + soft_voting_to_tsvs, + test, +) +from clinicadl.tools.deep_learning.data import ( + compute_num_cnn, + get_transforms, + load_data_test, + return_dataset, +) +from clinicadl.tools.deep_learning.iotools import return_logger, translate_parameters + -def classify(caps_dir, - tsv_path, - model_path, - prefix_output, - labels=True, - gpu=True, - num_workers=0, - batch_size=1, - prepare_dl=True, - selection_metrics=None, - diagnoses=None, - verbose=0, - multi_cohort=False): +def classify( + caps_dir, + tsv_path, + model_path, + prefix_output, + labels=True, + gpu=True, + num_workers=0, + batch_size=1, + prepare_dl=True, + selection_metrics=None, + diagnoses=None, + verbose=0, + multi_cohort=False, +): """ This function verifies the input folders, and the existence of the json file then it launch the inference stage from a specific model. 
@@ -51,12 +70,11 @@ def classify(caps_dir, # Infer json file from model_path (suppose that json file is at the same # folder) - json_file = join(model_path, 'commandline.json') + json_file = join(model_path, "commandline.json") if not exists(json_file): logger.error("Json file doesn't exist") - raise FileNotFoundError( - errno.ENOENT, strerror(errno.ENOENT), json_file) + raise FileNotFoundError(errno.ENOENT, strerror(errno.ENOENT), json_file) inference_from_model( caps_dir, @@ -72,24 +90,26 @@ def classify(caps_dir, selection_metrics, diagnoses, logger, - multi_cohort + multi_cohort, ) -def inference_from_model(caps_dir, - tsv_path, - model_path=None, - json_file=None, - prefix=None, - labels=True, - gpu=True, - num_workers=0, - batch_size=1, - prepare_dl=False, - selection_metrics=None, - diagnoses=None, - logger=None, - multi_cohort=False): +def inference_from_model( + caps_dir, + tsv_path, + model_path=None, + json_file=None, + prefix=None, + labels=True, + gpu=True, + num_workers=0, + batch_size=1, + prepare_dl=False, + selection_metrics=None, + diagnoses=None, + logger=None, + multi_cohort=False, +): """ Inference from previously trained model. @@ -134,8 +154,9 @@ def inference_from_model(caps_dir, logger = logging parser = argparse.ArgumentParser() - parser.add_argument("model_path", type=str, - help="Path to the trained model folder.") + parser.add_argument( + "model_path", type=str, help="Path to the trained model folder." 
+ ) options = parser.parse_args([model_path]) options = read_json(options, json_path=json_file) @@ -163,41 +184,56 @@ def inference_from_model(caps_dir, # loop depending the number of folds found in the model folder for fold_dir in currentDirectory.glob(currentPattern): fold = int(str(fold_dir).split("-")[-1]) - out_path = join(fold_dir, 'models') + out_path = join(fold_dir, "models") for selection_metric in selection_metrics: - if options.mode_task == 'multicnn': + if options.mode_task == "multicnn": for cnn_dir in listdir(out_path): - if not exists(join(out_path, cnn_dir, "best_%s" % selection_metric, 'model_best.pth.tar')): + if not exists( + join( + out_path, + cnn_dir, + "best_%s" % selection_metric, + "model_best.pth.tar", + ) + ): raise FileNotFoundError( errno.ENOENT, strerror(errno.ENOENT), - join(out_path, - cnn_dir, - "best_%s" % selection_metric, - 'model_best.pth.tar') + join( + out_path, + cnn_dir, + "best_%s" % selection_metric, + "model_best.pth.tar", + ), ) else: full_model_path = join(out_path, "best_%s" % selection_metric) - if not exists(join(full_model_path, 'model_best.pth.tar')): + if not exists(join(full_model_path, "model_best.pth.tar")): raise FileNotFoundError( errno.ENOENT, strerror(errno.ENOENT), - join(full_model_path, 'model_best.pth.tar')) + join(full_model_path, "model_best.pth.tar"), + ) - performance_dir = join(fold_dir, 'cnn_classification', 'best_%s' % selection_metric) + performance_dir = join( + fold_dir, "cnn_classification", "best_%s" % selection_metric + ) makedirs(performance_dir, exist_ok=True) - commandline_to_json({ - "output_dir": model_path, - "caps_dir": caps_dir, - "tsv_path": tsv_path, - "prefix": prefix, - "labels": labels - }, filename=f"commandline_classify-{prefix}") + commandline_to_json( + { + "output_dir": model_path, + "caps_dir": caps_dir, + "tsv_path": tsv_path, + "prefix": prefix, + "labels": labels, + }, + filename=f"commandline_classify-{prefix}", + ) # It launch the corresponding function, depending 
on the mode. inference_from_model_generic( @@ -213,11 +249,11 @@ def inference_from_model(caps_dir, num_cnn=num_cnn, logger=logger, multi_cohort=multi_cohort, - prepare_dl=prepare_dl + prepare_dl=prepare_dl, ) # Soft voting - if hasattr(options, 'selection_threshold'): + if hasattr(options, "selection_threshold"): selection_thresh = options.selection_threshold else: selection_thresh = 0.8 @@ -225,34 +261,59 @@ def inference_from_model(caps_dir, # Write files at the image level (for patch, roi and slice). # It assumes the existance of validation files to perform soft-voting if options.mode in ["patch", "roi", "slice"]: - soft_voting_to_tsvs(currentDirectory, fold, "best_%s" % selection_metric, options.mode, - prefix, num_cnn=num_cnn, selection_threshold=selection_thresh, - use_labels=labels, logger=logger) - - logger.info("Prediction results and metrics are written in the " - "following folder: %s" % performance_dir) + soft_voting_to_tsvs( + currentDirectory, + fold, + "best_%s" % selection_metric, + options.mode, + prefix, + num_cnn=num_cnn, + selection_threshold=selection_thresh, + use_labels=labels, + logger=logger, + ) + + logger.info( + "Prediction results and metrics are written in the " + "following folder: %s" % performance_dir + ) -def inference_from_model_generic(caps_dir, tsv_path, model_path, model_options, - prefix, output_dir, fold, selection, - labels=True, num_cnn=None, logger=None, - multi_cohort=False, prepare_dl=True): - from os.path import join +def inference_from_model_generic( + caps_dir, + tsv_path, + model_path, + model_options, + prefix, + output_dir, + fold, + selection, + labels=True, + num_cnn=None, + logger=None, + multi_cohort=False, + prepare_dl=True, +): import logging + from os.path import join if logger is None: logger = logging gpu = not model_options.use_cpu - _, all_transforms = get_transforms(model_options.mode, model_options.minmaxnormalization) + _, all_transforms = get_transforms( + model_options.mode, 
model_options.minmaxnormalization + ) - test_df = load_data_test(tsv_path, model_options.diagnoses, multi_cohort=multi_cohort) + test_df = load_data_test( + tsv_path, model_options.diagnoses, multi_cohort=multi_cohort + ) # Define loss and optimizer criterion = get_criterion(model_options.loss) - if model_options.mode_task == 'multicnn': + if model_options.mode_task == "multicnn": for n in range(num_cnn): @@ -267,7 +328,7 @@ def inference_from_model_generic(caps_dir, tsv_path, model_path, model_options, cnn_index=n, labels=labels, prepare_dl=prepare_dl, - multi_cohort=multi_cohort + multi_cohort=multi_cohort, ) test_loader = DataLoader( @@ -275,15 +336,17 @@ def inference_from_model_generic(caps_dir, tsv_path, model_path, model_options, batch_size=model_options.batch_size, shuffle=False, num_workers=model_options.nproc, - pin_memory=True) + pin_memory=True, + ) # load the best trained model during the training model = create_model(model_options, test_dataset.size) model, best_epoch = load_model( model, - join(model_path, 'cnn-%i' % n, selection), + join(model_path, "cnn-%i" % n, selection), gpu, - filename='model_best.pth.tar') + filename="model_best.pth.tar", + ) cnn_df, cnn_metrics = test( model, @@ -291,15 +354,31 @@ def inference_from_model_generic(caps_dir, tsv_path, model_path, model_options, gpu, criterion, mode=model_options.mode, - use_labels=labels + use_labels=labels, ) if labels: - logger.info("%s balanced accuracy is %f for %s %i and model selected on %s" - % (prefix, cnn_metrics["balanced_accuracy"], model_options.mode, n, selection)) - - mode_level_to_tsvs(output_dir, cnn_df, cnn_metrics, fold, selection, model_options.mode, - dataset=prefix, cnn_index=n) + logger.info( + "%s balanced accuracy is %f for %s %i and model selected on %s" + % ( + prefix, + cnn_metrics["balanced_accuracy"], + model_options.mode, + n, + selection, + ) + ) + + mode_level_to_tsvs( + output_dir, + cnn_df, + cnn_metrics, + fold, + selection, + model_options.mode, + 
dataset=prefix, + cnn_index=n, + ) else: @@ -314,7 +393,7 @@ def inference_from_model_generic(caps_dir, tsv_path, model_path, model_options, params=model_options, labels=labels, prepare_dl=prepare_dl, - multi_cohort=multi_cohort + multi_cohort=multi_cohort, ) # Load the data @@ -323,13 +402,14 @@ def inference_from_model_generic(caps_dir, tsv_path, model_path, model_options, batch_size=model_options.batch_size, shuffle=False, num_workers=model_options.nproc, - pin_memory=True) + pin_memory=True, + ) # Load model from path model = create_model(model_options, test_dataset.size) best_model, best_epoch = load_model( - model, join(model_path, selection), - gpu, filename='model_best.pth.tar') + model, join(model_path, selection), gpu, filename="model_best.pth.tar" + ) # Run the model on the data predictions_df, metrics = test( @@ -338,12 +418,21 @@ def inference_from_model_generic(caps_dir, tsv_path, model_path, model_options, gpu, criterion, mode=model_options.mode, - use_labels=labels + use_labels=labels, ) if labels: - logger.info("%s level %s balanced accuracy is %f for model selected on %s" - % (model_options.mode, prefix, metrics["balanced_accuracy"], selection)) + logger.info( + "%s level %s balanced accuracy is %f for model selected on %s" + % (model_options.mode, prefix, metrics["balanced_accuracy"], selection) + ) - mode_level_to_tsvs(output_dir, predictions_df, metrics, fold, selection, model_options.mode, - dataset=prefix) + mode_level_to_tsvs( + output_dir, + predictions_df, + metrics, + fold, + selection, + model_options.mode, + dataset=prefix, + ) diff --git a/clinicadl/clinicadl/classify/random_search_analysis.py b/clinicadl/clinicadl/classify/random_search_analysis.py index 1d9c40685..c49b6fa17 100644 --- a/clinicadl/clinicadl/classify/random_search_analysis.py +++ b/clinicadl/clinicadl/classify/random_search_analysis.py @@ -3,10 +3,11 @@ """ import os from os import path -import pandas as pd -import numpy as np from warnings import warn +import numpy as 
np +import pandas as pd + from clinicadl.tools.deep_learning import read_json @@ -22,12 +23,28 @@ def random_search_analysis(launch_dir): else: fold_iterator = rs_options.split - jobs_list = [job for job in os.listdir(launch_dir) - if path.exists(path.join(launch_dir, job, "commandline.json"))] + jobs_list = [ + job + for job in os.listdir(launch_dir) + if path.exists(path.join(launch_dir, job, "commandline.json")) + ] - for selection in ['balanced_accuracy', 'loss']: + for selection in ["balanced_accuracy", "loss"]: - columns = ['run', '>0.5', '>0.55', '>0.6', '>0.65', '>0.7', '>0.75', '>0.8', '>0.85', '>0.9', '>0.95', 'folds'] + columns = [ + "run", + ">0.5", + ">0.55", + ">0.6", + ">0.65", + ">0.7", + ">0.75", + ">0.8", + ">0.85", + ">0.9", + ">0.95", + "folds", + ] output_df = pd.DataFrame(columns=columns) thresholds = np.arange(0.5, 1, 0.05) thresholds = np.insert(thresholds, 0, 0) @@ -36,10 +53,23 @@ def random_search_analysis(launch_dir): valid_accuracies = [] for fold in fold_iterator: - performance_path = path.join(launch_dir, job, f'fold-{fold}', 'cnn_classification', f'best_{selection}') + performance_path = path.join( + launch_dir, + job, + f"fold-{fold}", + "cnn_classification", + f"best_{selection}", + ) if path.exists(performance_path): - valid_df = pd.read_csv(path.join(performance_path, 'validation_image_level_metrics.tsv'), sep='\t') - valid_accuracies.append(valid_df.loc[0, 'balanced_accuracy'].astype(float)) + valid_df = pd.read_csv( + path.join( + performance_path, "validation_image_level_metrics.tsv" + ), + sep="\t", + ) + valid_accuracies.append( + valid_df.loc[0, "balanced_accuracy"].astype(float) + ) else: warn(f"The fold {fold} doesn't exist for job {job}") @@ -53,8 +83,12 @@ def random_search_analysis(launch_dir): row_df = pd.DataFrame(index=[job], data=row.reshape(1, -1), columns=columns) output_df = pd.concat([output_df, row_df]) - total_df = pd.DataFrame(np.array(output_df.sum()).reshape(1, -1), columns=columns, index=['total']) + 
total_df = pd.DataFrame( + np.array(output_df.sum()).reshape(1, -1), columns=columns, index=["total"] + ) output_df = pd.concat([output_df, total_df]) output_df.sort_index(inplace=True) - output_df.to_csv(path.join(launch_dir, "analysis_" + selection + '.tsv'), sep='\t') + output_df.to_csv( + path.join(launch_dir, "analysis_" + selection + ".tsv"), sep="\t" + ) diff --git a/clinicadl/clinicadl/cli.py b/clinicadl/clinicadl/cli.py index e71571c26..dff5521e2 100644 --- a/clinicadl/clinicadl/cli.py +++ b/clinicadl/clinicadl/cli.py @@ -5,39 +5,41 @@ from colorama import Fore - TRAIN_CATEGORIES = { # General parent group - 'POSITIONAL': '%sPositional arguments%s' % (Fore.BLUE, Fore.RESET), - 'COMPUTATIONAL': '%sComputational resources%s' % (Fore.BLUE, Fore.RESET), - 'DATA': '%sData management%s' % (Fore.BLUE, Fore.RESET), - 'CROSS-VALIDATION': '%sCross-validation arguments%s' % (Fore.BLUE, Fore.RESET), - 'OPTIMIZATION': '%sOptimization parameters%s' % (Fore.BLUE, Fore.RESET), + "POSITIONAL": "%sPositional arguments%s" % (Fore.BLUE, Fore.RESET), + "COMPUTATIONAL": "%sComputational resources%s" % (Fore.BLUE, Fore.RESET), + "DATA": "%sData management%s" % (Fore.BLUE, Fore.RESET), + "CROSS-VALIDATION": "%sCross-validation arguments%s" % (Fore.BLUE, Fore.RESET), + "OPTIMIZATION": "%sOptimization parameters%s" % (Fore.BLUE, Fore.RESET), # Other parent groups - 'TRANSFER LEARNING': '%sTransfer learning%s' % (Fore.BLUE, Fore.RESET), - 'AUTOENCODER': '%sAutoencoder specific%s' % (Fore.BLUE, Fore.RESET), + "TRANSFER LEARNING": "%sTransfer learning%s" % (Fore.BLUE, Fore.RESET), + "AUTOENCODER": "%sAutoencoder specific%s" % (Fore.BLUE, Fore.RESET), # Slice-level - 'SLICE': '%sSlice-level parameters%s' % (Fore.BLUE, Fore.RESET), - 'SLICE CNN': '%sSlice-level CNN parameters%s' % (Fore.BLUE, Fore.RESET), + "SLICE": "%sSlice-level parameters%s" % (Fore.BLUE, Fore.RESET), + "SLICE CNN": "%sSlice-level CNN parameters%s" % (Fore.BLUE, Fore.RESET), # Patch arguments - 'PATCH': 
'%sPatch-level parameters%s' % (Fore.BLUE, Fore.RESET), - 'PATCH CNN': '%sPatch-level CNN parameters%s' % (Fore.BLUE, Fore.RESET), + "PATCH": "%sPatch-level parameters%s" % (Fore.BLUE, Fore.RESET), + "PATCH CNN": "%sPatch-level CNN parameters%s" % (Fore.BLUE, Fore.RESET), # ROI-based arguments - 'ROI': '%sROI-based parameters%s' % (Fore.BLUE, Fore.RESET), - 'ROI CNN': '%sROI-based CNN parameters%s' % (Fore.BLUE, Fore.RESET), + "ROI": "%sROI-based parameters%s" % (Fore.BLUE, Fore.RESET), + "ROI CNN": "%sROI-based CNN parameters%s" % (Fore.BLUE, Fore.RESET), # Other optional arguments - 'OPTIONAL': '%sOther options%s' % (Fore.BLUE, Fore.RESET), + "OPTIONAL": "%sOther options%s" % (Fore.BLUE, Fore.RESET), # Model selection - 'MODEL': '%sModel selection%s' % (Fore.BLUE, Fore.RESET), + "MODEL": "%sModel selection%s" % (Fore.BLUE, Fore.RESET), # Display - 'DISPLAY': '%sResults display%s' % (Fore.BLUE, Fore.RESET), + "DISPLAY": "%sResults display%s" % (Fore.BLUE, Fore.RESET), } def extract_tensors(args): import sys + + from clinica.pipelines.deeplearning_prepare_data.deeplearning_prepare_data_cli import ( + DeepLearningPrepareDataCLI, + ) from clinica.utils.stream import FilterOut - from clinica.pipelines.deeplearning_prepare_data.deeplearning_prepare_data_cli import DeepLearningPrepareDataCLI sys.stdout = FilterOut(sys.stdout) @@ -57,20 +59,18 @@ def qc_func(args): threshold=args.threshold, batch_size=args.batch_size, num_workers=args.nproc, - gpu=not args.use_cpu + gpu=not args.use_cpu, ) elif args.preprocessing == "t1-volume": - volume_qc( - args.caps_dir, - args.output_dir, - args.group_label - ) + volume_qc(args.caps_dir, args.output_dir, args.group_label) def generate_data_func(args): - from .tools.data.generate_data import (generate_random_dataset, - generate_trivial_dataset, - generate_shepplogan_dataset) + from .tools.data.generate_data import ( + generate_random_dataset, + generate_shepplogan_dataset, + generate_trivial_dataset, + ) if args.mode == "random": 
generate_random_dataset( @@ -80,7 +80,8 @@ def generate_data_func(args): n_subjects=args.n_subjects, mean=args.mean, sigma=args.sigma, - preprocessing=args.preprocessing) + preprocessing=args.preprocessing, + ) elif args.mode == "trivial": generate_trivial_dataset( caps_dir=args.caps_dir, @@ -92,19 +93,22 @@ def generate_data_func(args): atrophy_percent=args.atrophy_percent, ) else: - labels_distribution = {"AD": args.AD_subtypes_distribution, "CN": args.CN_subtypes_distribution} + labels_distribution = { + "AD": args.AD_subtypes_distribution, + "CN": args.CN_subtypes_distribution, + } generate_shepplogan_dataset( output_dir=args.output_dir, img_size=args.image_size, labels_distribution=labels_distribution, samples=args.n_subjects, - smoothing=args.smoothing + smoothing=args.smoothing, ) def rs_func(args): - from .train.random_search import launch_search from .classify.random_search_analysis import random_search_analysis + from .train.random_search import launch_search if args.random_task == "generate": launch_search(args) @@ -113,24 +117,22 @@ def rs_func(args): args.launch_dir, ) else: - raise ValueError('This task was not implemented in random-search.') + raise ValueError("This task was not implemented in random-search.") def retrain_func(args): from .train.train_from_json import retrain - retrain( - args.json_path, - args.output_dir, - verbose=args.verbose - ) + retrain(args.json_path, args.output_dir, verbose=args.verbose) def resume_func(args): from .resume.automatic_resume import automatic_resume if args.use_cpu and args.use_gpu: - raise ValueError("The flags --use_cpu and --use_gpu cannot be specified at the same time.") + raise ValueError( + "The flags --use_cpu and --use_gpu cannot be specified at the same time." 
+ ) elif args.use_cpu: gpu = False elif args.use_gpu: @@ -144,7 +146,7 @@ def resume_func(args): batch_size=args.batch_size, num_workers=args.nproc, evaluation_steps=args.evaluation_steps, - verbose=args.verbose + verbose=args.verbose, ) @@ -160,7 +162,9 @@ def train_func(args): elif args.network_type == "multicnn": train_multi_cnn(args) else: - raise NotImplementedError('Framework %s not implemented in clinicadl' % args.network_type) + raise NotImplementedError( + "Framework %s not implemented in clinicadl" % args.network_type + ) # Function to dispatch command line options from classify to corresponding @@ -179,7 +183,7 @@ def classify_func(args): selection_metrics=args.selection_metrics, diagnoses=args.diagnoses, verbose=args.verbose, - multi_cohort=args.multi_cohort + multi_cohort=args.multi_cohort, ) @@ -207,7 +211,7 @@ def tsv_getlabels_func(args): time_horizon=args.time_horizon, variables_of_interest=args.variables_of_interest, remove_smc=not args.keep_smc, - verbose=args.verbose + verbose=args.verbose, ) @@ -223,7 +227,7 @@ def tsv_split_func(args): p_sex_threshold=args.p_sex_threshold, ignore_demographics=args.ignore_demographics, verbose=args.verbose, - categorical_split_variable=args.categorical_split_variable + categorical_split_variable=args.categorical_split_variable, ) @@ -236,7 +240,7 @@ def tsv_kfold_func(args): subset_name=args.subset_name, MCI_sub_categories=args.MCI_sub_categories, stratification=args.stratification, - verbose=args.verbose + verbose=args.verbose, ) @@ -247,7 +251,7 @@ def tsv_analysis_func(args): args.merged_tsv, args.formatted_data_path, args.results_path, - diagnoses=args.diagnoses + diagnoses=args.diagnoses, ) @@ -265,461 +269,531 @@ def interpret_func(args): def parse_command_line(): parser = argparse.ArgumentParser( - prog='clinicadl', - description='Deep learning software for neuroimaging datasets') - - parser.add_argument('-l', '--logname', - dest='logname', - default="clinicaDL.log", - metavar=('file.log'), - 
help='Define the log file name (default: clinicaDL.log)') - parser.add_argument("-V", "--version", - dest='version', - action='store_true', default=False, - help="ClinicaDL's installed version") + prog="clinicadl", description="Deep learning software for neuroimaging datasets" + ) + + parser.add_argument( + "-l", + "--logname", + dest="logname", + default="clinicaDL.log", + metavar=("file.log"), + help="Define the log file name (default: clinicaDL.log)", + ) + parser.add_argument( + "-V", + "--version", + dest="version", + action="store_true", + default=False, + help="ClinicaDL's installed version", + ) parent_parser = argparse.ArgumentParser(add_help=False) - parent_parser.add_argument('--verbose', '-v', action='count', default=0) + parent_parser.add_argument("--verbose", "-v", action="count", default=0) subparser = parser.add_subparsers( - title='''Task to execute with clinicadl''', - description='''What kind of task do you want to use with clinicadl?''', - dest='task', - help='''****** Tasks proposed by clinicadl ******''') + title="""Task to execute with clinicadl""", + description="""What kind of task do you want to use with clinicadl?""", + dest="task", + help="""****** Tasks proposed by clinicadl ******""", + ) # subparser.required = True # Generate synthetic data generate_parser = subparser.add_parser( - 'generate', - help='Generate synthetic data for functional tests.' + "generate", help="Generate synthetic data for functional tests." ) generate_subparser = generate_parser.add_subparsers( - title='''Type of synthetic data generated''', - description='''What type of synthetic data do you want to generate? - (random, shepplogan, trivial).''', - dest='mode', - help='''****** Synthetic datasets proposed by clinicadl ******''') + title="""Type of synthetic data generated""", + description="""What type of synthetic data do you want to generate? 
+ (random, shepplogan, trivial).""", + dest="mode", + help="""****** Synthetic datasets proposed by clinicadl ******""", + ) # generate_subparser.required = True # Positional arguments generate_rs_parent_parser = argparse.ArgumentParser(add_help=False) generate_rs_parent_parser.add_argument( - 'caps_dir', - help='Data using CAPS structure.', - default=None + "caps_dir", help="Data using CAPS structure.", default=None ) generate_rs_parent_parser.add_argument( - 'preprocessing', + "preprocessing", type=str, - choices=['t1-linear', 't1-extensive'], - help="Preprocessing used to generate synthetic data." + choices=["t1-linear", "t1-extensive"], + help="Preprocessing used to generate synthetic data.", ) generate_rs_parent_parser.add_argument( - 'output_dir', - help='Folder containing the synthetic dataset.', + "output_dir", + help="Folder containing the synthetic dataset.", ) generate_rs_parent_parser.add_argument( - "--subjects_sessions_tsv", "-tsv", - help='TSV file containing a list of subjects with their sessions.', - type=str, default=None + "--subjects_sessions_tsv", + "-tsv", + help="TSV file containing a list of subjects with their sessions.", + type=str, + default=None, ) generate_rs_parent_parser.add_argument( - '--n_subjects', + "--n_subjects", type=int, default=300, - help="Number of subjects in each class of the synthetic dataset." + help="Number of subjects in each class of the synthetic dataset.", ) generate_random_parser = generate_subparser.add_parser( "random", - parents=[ - parent_parser, - generate_rs_parent_parser], - help="Generate a random dataset in which gaussian noise is added to brain images.") + parents=[parent_parser, generate_rs_parent_parser], + help="Generate a random dataset in which gaussian noise is added to brain images.", + ) generate_random_parser.add_argument( - '--mean', + "--mean", type=float, default=0, - help="Mean value of the noise added for the random dataset." 
+ help="Mean value of the noise added for the random dataset.", ) generate_random_parser.add_argument( - '--sigma', + "--sigma", type=float, default=0.5, - help="Standard deviation of the noise added for the random dataset." + help="Standard deviation of the noise added for the random dataset.", ) generate_random_parser.set_defaults(func=generate_data_func) generate_trivial_parser = generate_subparser.add_parser( "trivial", - parents=[ - parent_parser, - generate_rs_parent_parser], - help="Generate a trivial dataset in which gaussian half of the brain is atrophied.") + parents=[parent_parser, generate_rs_parent_parser], + help="Generate a trivial dataset in which gaussian half of the brain is atrophied.", + ) generate_trivial_parser.add_argument( - '--mask_path', + "--mask_path", type=str, - help='path to the extracted masks to generate the two labels.', - default=None + help="path to the extracted masks to generate the two labels.", + default=None, ) generate_trivial_parser.add_argument( - '--atrophy_percent', + "--atrophy_percent", type=float, default=60, - help='percentage of atrophy applied' + help="percentage of atrophy applied", ) generate_trivial_parser.set_defaults(func=generate_data_func) generate_shepplogan_parser = generate_subparser.add_parser( "shepplogan", - help="Generate a dataset of 2D images including 3 subtypes based on Shepp Logan phantom." + help="Generate a dataset of 2D images including 3 subtypes based on Shepp Logan phantom.", ) generate_shepplogan_parser.add_argument( - 'output_dir', - help='Folder containing the synthetic dataset.', + "output_dir", + help="Folder containing the synthetic dataset.", ) generate_shepplogan_parser.add_argument( - '--n_subjects', + "--n_subjects", type=int, default=300, - help="Number of subjects in each class of the synthetic dataset." 
+ help="Number of subjects in each class of the synthetic dataset.", ) generate_shepplogan_parser.add_argument( - '--image_size', + "--image_size", type=int, default=128, - help="Size in pixels of the squared images." + help="Size in pixels of the squared images.", ) generate_shepplogan_parser.add_argument( - '--CN_subtypes_distribution', '-Csd', - type=float, nargs='+', + "--CN_subtypes_distribution", + "-Csd", + type=float, + nargs="+", default=[1.0, 0.0, 0.0], - help="Probability of each subtype to be drawn in CN label." + help="Probability of each subtype to be drawn in CN label.", ) generate_shepplogan_parser.add_argument( - '--AD_subtypes_distribution', '-Asd', - type=float, nargs='+', + "--AD_subtypes_distribution", + "-Asd", + type=float, + nargs="+", default=[0.05, 0.85, 0.10], - help="Probability of each subtype to be drawn in AD label." + help="Probability of each subtype to be drawn in AD label.", ) generate_shepplogan_parser.add_argument( - '--smoothing', - action='store_true', + "--smoothing", + action="store_true", default=False, - help='Adds random smoothing to generated data.' + help="Adds random smoothing to generated data.", ) generate_shepplogan_parser.set_defaults(func=generate_data_func) # Preprocessing - from clinica.pipelines.t1_linear.t1_linear_cli import T1LinearCLI from clinica.engine.cmdparser import init_cmdparser_objects + from clinica.pipelines.t1_linear.t1_linear_cli import T1LinearCLI + from .preprocessing.t1_extensive.t1_extensive_cli import T1ExtensiveCli + preprocessing_parser = subparser.add_parser( - 'preprocessing', - help='Preprocess T1w-weighted images with t1-linear or t1-extensive pipelines.' + "preprocessing", + help="Preprocess T1w-weighted images with t1-linear or t1-extensive pipelines.", ) preprocessing_subparser = preprocessing_parser.add_subparsers( - title='''Preprocessing task to execute with clinicadl''', - description='''What kind of task do you want to perform with clinicadl? 
- (run, quality-check, extract-tensor).''', - dest='preprocessing_task', - help='''****** Tasks proposed by clinicadl ******''') + title="""Preprocessing task to execute with clinicadl""", + description="""What kind of task do you want to perform with clinicadl? + (run, quality-check, extract-tensor).""", + dest="preprocessing_task", + help="""****** Tasks proposed by clinicadl ******""", + ) preprocessing_subparser.required = True run_parser = preprocessing_subparser.add_parser( - 'run', - help='Preprocess T1w-weighted images with t1-linear or t1-extensive pipelines.' + "run", + help="Preprocess T1w-weighted images with t1-linear or t1-extensive pipelines.", + ) + run_parser._positionals.title = ( + "%sclinicadl preprocessing expects one of the following pipelines%s" + % (Fore.GREEN, Fore.RESET) ) - run_parser._positionals.title = ('%sclinicadl preprocessing expects one of the following pipelines%s' - % (Fore.GREEN, Fore.RESET)) def preprocessing_help(args): - print('%sNo pipeline was specified. Type clinica preprocessing -h for details%s' % - (Fore.RED, Fore.RESET)) + print( + "%sNo pipeline was specified. Type clinica preprocessing -h for details%s" + % (Fore.RED, Fore.RESET) + ) run_parser.set_defaults(func=preprocessing_help) init_cmdparser_objects( parser, - run_parser.add_subparsers(dest='preprocessing'), + run_parser.add_subparsers(dest="preprocessing"), [ T1LinearCLI(), T1ExtensiveCli(), - ] + ], ) extract_parser = preprocessing_subparser.add_parser( - 'extract-tensor', - help='Prepare data generated Clinica for PyTorch with Tensor extraction (image, patches or slices).' + "extract-tensor", + help="Prepare data generated Clinica for PyTorch with Tensor extraction (image, patches or slices).", ) - clinica_comp = extract_parser.add_argument_group('%sClinica mandatory arguments%s' % (Fore.BLUE, Fore.RESET)) - clinica_comp.add_argument( - "caps_directory", - help='Path to the CAPS directory.' 
+ clinica_comp = extract_parser.add_argument_group( + "%sClinica mandatory arguments%s" % (Fore.BLUE, Fore.RESET) ) - clinica_comp.add_argument("modality", - help='''For which modality the tensor will be extracted. + clinica_comp.add_argument("caps_directory", help="Path to the CAPS directory.") + clinica_comp.add_argument( + "modality", + help="""For which modality the tensor will be extracted. 't1-linear': images prepocessed with t1-linear pipeline. 't1-extensive': images preprocessed with t1-extensive pipeline. 'custom': find images with a custom suffix in their filename and - transform them to tensor format.''', - choices=['t1-linear', 't1-extensive', 'custom'], default='t1-linear' - ) + transform them to tensor format.""", + choices=["t1-linear", "t1-extensive", "custom"], + default="t1-linear", + ) clinica_comp.add_argument( "extract_method", - help='''Format of the extracted features. Three options: + help="""Format of the extracted features. Three options: 'image' to convert to PyTorch tensor the complete 3D image, 'patch' to extract 3D volumetric patches and 'slice' to extract 2D slices from the image. 
- By default the features are extracted from the cropped image.''', - choices=['image', 'slice', 'patch'], - default='image' + By default the features are extracted from the cropped image.""", + choices=["image", "slice", "patch"], + default="image", ) - optional = extract_parser.add_argument_group('%sPipeline options%s' % (Fore.BLUE, Fore.RESET)) + optional = extract_parser.add_argument_group( + "%sPipeline options%s" % (Fore.BLUE, Fore.RESET) + ) optional.add_argument( - '-uui', '--use_uncropped_image', - help='''Use the uncropped image instead of the - cropped image generated by t1-linear.''', - default=False, action="store_true" + "-uui", + "--use_uncropped_image", + help="""Use the uncropped image instead of the + cropped image generated by t1-linear.""", + default=False, + action="store_true", ) optional_patch = extract_parser.add_argument_group( "%sPipeline options if you chose ‘patch’ extraction%s" % (Fore.BLUE, Fore.RESET) ) optional_patch.add_argument( - '-ps', '--patch_size', - help='''Patch size (default: --patch_size 50).''', - type=int, default=50 + "-ps", + "--patch_size", + help="""Patch size (default: --patch_size 50).""", + type=int, + default=50, ) optional_patch.add_argument( - '-ss', '--stride_size', - help='''Stride size (default: --stride_size 50).''', - type=int, default=50 + "-ss", + "--stride_size", + help="""Stride size (default: --stride_size 50).""", + type=int, + default=50, ) optional_slice = extract_parser.add_argument_group( "%sPipeline options if you chose ‘slice’ extraction%s" % (Fore.BLUE, Fore.RESET) ) optional_slice.add_argument( - '-sd', '--slice_direction', - help='''Slice direction. Three options: + "-sd", + "--slice_direction", + help="""Slice direction. Three options: '0' -> Sagittal plane, '1' -> Coronal plane or '2' -> Axial plane - (default: sagittal plane i.e. --slice_direction 0)''', - type=int, default=0 + (default: sagittal plane i.e. 
--slice_direction 0)""", + type=int, + default=0, ) optional_slice.add_argument( - '-sm', '--slice_mode', - help='''Slice mode. Two options: 'rgb' to save the slice in + "-sm", + "--slice_mode", + help="""Slice mode. Two options: 'rgb' to save the slice in three identical channels, ‘single’ to save the slice in a - single channel (default: --slice_mode rgb).''', - choices=['rgb', 'single'], default='rgb' + single channel (default: --slice_mode rgb).""", + choices=["rgb", "single"], + default="rgb", ) optional_custom = extract_parser.add_argument_group( "%sPipeline options if you chose ‘custom’ modality%s" % (Fore.BLUE, Fore.RESET) ) optional_custom.add_argument( - '-cn', '--custom_suffix', - help='''Custom suffix filename, e.g.: + "-cn", + "--custom_suffix", + help="""Custom suffix filename, e.g.: 'graymatter_space-Ixi549Space_modulated-off_probability.nii.gz', or 'segm-whitematter_probability.nii.gz' - ''', - type=str, default='' + """, + type=str, + default="", ) # Clinica standard arguments (e.g. --n_procs) - clinica_standard_options = extract_parser.add_argument_group('%sClinica standard options%s' % (Fore.BLUE, Fore.RESET)) + clinica_standard_options = extract_parser.add_argument_group( + "%sClinica standard options%s" % (Fore.BLUE, Fore.RESET) + ) clinica_standard_options.add_argument( - "-tsv", "--subjects_sessions_tsv", - help='TSV file containing a list of subjects with their sessions.' + "-tsv", + "--subjects_sessions_tsv", + help="TSV file containing a list of subjects with their sessions.", ) clinica_standard_options.add_argument( - "-wd", "--working_directory", - help='Temporary directory to store pipelines intermediate results.' + "-wd", + "--working_directory", + help="Temporary directory to store pipelines intermediate results.", ) clinica_standard_options.add_argument( - "-np", "--n_procs", - metavar='N', type=int, - help='Number of cores used to run in parallel.' 
+ "-np", + "--n_procs", + metavar="N", + type=int, + help="Number of cores used to run in parallel.", ) extract_parser.set_defaults(func=extract_tensors) qc_parser = preprocessing_subparser.add_parser( - 'quality-check', - help='Performs quality check procedure for t1-linear pipeline.' - 'Original code can be found at https://github.com/vfonov/deep-qc' + "quality-check", + help="Performs quality check procedure for t1-linear pipeline." + "Original code can be found at https://github.com/vfonov/deep-qc", ) qc_subparsers = qc_parser.add_subparsers( - title='''Preprocessing pipelines available''', - description='''Which preprocessing pipeline do you want to check?''', - dest='preprocessing', - help='''****** Preprocessing pipelines ******''') + title="""Preprocessing pipelines available""", + description="""Which preprocessing pipeline do you want to check?""", + dest="preprocessing", + help="""****** Preprocessing pipelines ******""", + ) qc_subparsers.required = True qc_linear_parser = qc_subparsers.add_parser( - 't1-linear', - help='Performs quality check on t1-linear pipeline.' - ) - qc_linear_parser.add_argument("caps_dir", - help='Data using CAPS structure.', - type=str) - qc_linear_parser.add_argument("output_path", - help="Path to the output tsv file (filename included).", - type=str) - - qc_linear_parser.add_argument("--subjects_sessions_tsv", "-tsv", - help='TSV file containing a list of subjects with their sessions.', - type=str, default=None) - qc_linear_parser.add_argument("--threshold", - help='The threshold on the output probability to decide if the image ' - 'passed or failed. (default=0.5)', - type=float, default=0.5) - qc_linear_parser.add_argument('--batch_size', - help='Batch size used in DataLoader (default=1).', - default=1, type=int) - qc_linear_parser.add_argument("-np", "--nproc", - help='Number of cores used the quality check. 
(default=2)', - type=int, default=2) - qc_linear_parser.add_argument('-cpu', '--use_cpu', action='store_true', - help='If provided, will use CPU instead of GPU.', - default=False) + "t1-linear", help="Performs quality check on t1-linear pipeline." + ) + qc_linear_parser.add_argument( + "caps_dir", help="Data using CAPS structure.", type=str + ) + qc_linear_parser.add_argument( + "output_path", help="Path to the output tsv file (filename included).", type=str + ) + + qc_linear_parser.add_argument( + "--subjects_sessions_tsv", + "-tsv", + help="TSV file containing a list of subjects with their sessions.", + type=str, + default=None, + ) + qc_linear_parser.add_argument( + "--threshold", + help="The threshold on the output probability to decide if the image " + "passed or failed. (default=0.5)", + type=float, + default=0.5, + ) + qc_linear_parser.add_argument( + "--batch_size", + help="Batch size used in DataLoader (default=1).", + default=1, + type=int, + ) + qc_linear_parser.add_argument( + "-np", + "--nproc", + help="Number of cores used the quality check. (default=2)", + type=int, + default=2, + ) + qc_linear_parser.add_argument( + "-cpu", + "--use_cpu", + action="store_true", + help="If provided, will use CPU instead of GPU.", + default=False, + ) qc_linear_parser.set_defaults(func=qc_func) qc_volume_parser = qc_subparsers.add_parser( - 't1-volume', - help='Performs quality check on t1-volume pipeline.' - ) - qc_volume_parser.add_argument("caps_dir", - help='Data using CAPS structure.', - type=str) - qc_volume_parser.add_argument("output_dir", - help="Path to the output directory containing TSV files.", - type=str) - qc_volume_parser.add_argument("group_label", - help="Identifier for the group of subjects used to create the DARTEL template.", - type=str) + "t1-volume", help="Performs quality check on t1-volume pipeline." 
+ ) + qc_volume_parser.add_argument( + "caps_dir", help="Data using CAPS structure.", type=str + ) + qc_volume_parser.add_argument( + "output_dir", + help="Path to the output directory containing TSV files.", + type=str, + ) + qc_volume_parser.add_argument( + "group_label", + help="Identifier for the group of subjects used to create the DARTEL template.", + type=str, + ) qc_volume_parser.set_defaults(func=qc_func) # random search parsers rs_parser = subparser.add_parser( - 'random-search', + "random-search", parents=[parent_parser], - help='Generate random networks to explore hyper parameters space.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter + help="Generate random networks to explore hyper parameters space.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) rs_subparsers = rs_parser.add_subparsers( - title='''Possibilities for random network training''', - description='''You can generate and train a new random network, - or relaunch a previous random job with some alterations.''', - dest='random_task', - help='''****** Possible tasks ******''' + title="""Possibilities for random network training""", + description="""You can generate and train a new random network, + or relaunch a previous random job with some alterations.""", + dest="random_task", + help="""****** Possible tasks ******""", ) rs_subparsers.required = True rs_generate_parser = rs_subparsers.add_parser( - 'generate', + "generate", parents=[parent_parser], - help='Sample a new network and train it.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter + help="Sample a new network and train it.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - rs_pos_group = rs_generate_parser.add_argument_group( - TRAIN_CATEGORIES["POSITIONAL"] + rs_pos_group = rs_generate_parser.add_argument_group(TRAIN_CATEGORIES["POSITIONAL"]) + rs_pos_group.add_argument( + "launch_dir", type=str, help="Directory containing the random_search.json file." 
) - rs_pos_group.add_argument("launch_dir", type=str, - help="Directory containing the random_search.json file.") - rs_pos_group.add_argument("name", type=str, - help="Name of the job.") + rs_pos_group.add_argument("name", type=str, help="Name of the job.") rs_comp_group = rs_generate_parser.add_argument_group( TRAIN_CATEGORIES["COMPUTATIONAL"] ) rs_comp_group.add_argument( - '-cpu', '--use_cpu', action='store_true', - help='If provided, will use CPU instead of GPU.', - default=False) + "-cpu", + "--use_cpu", + action="store_true", + help="If provided, will use CPU instead of GPU.", + default=False, + ) rs_comp_group.add_argument( - '-np', '--nproc', - help='Number of cores used during the training.', - type=int, default=2) + "-np", + "--nproc", + help="Number of cores used during the training.", + type=int, + default=2, + ) rs_comp_group.add_argument( - '--batch_size', - default=2, type=int, - help='Batch size for training.') + "--batch_size", default=2, type=int, help="Batch size for training." + ) rs_comp_group.add_argument( - '--evaluation_steps', '-esteps', - default=0, type=int, - help='Fix the number of iterations to perform before computing an evaluation. Default will only ' - 'perform one evaluation at the end of each epoch.') + "--evaluation_steps", + "-esteps", + default=0, + type=int, + help="Fix the number of iterations to perform before computing an evaluation. Default will only " + "perform one evaluation at the end of each epoch.", + ) rs_generate_parser.set_defaults(func=rs_func) rs_analysis_parser = rs_subparsers.add_parser( - 'analysis', + "analysis", help="Performs the analysis of all jobs in launch_dir", - formatter_class=argparse.ArgumentDefaultsHelpFormatter + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) rs_analysis_parser.add_argument( - "launch_dir", - type=str, - help="Directory containing the random_search.json file." + "launch_dir", type=str, help="Directory containing the random_search.json file." 
) rs_analysis_parser.set_defaults(func=rs_func) train_parser = subparser.add_parser( - 'train', - help='Train with your data and create a model.') + "train", help="Train with your data and create a model." + ) train_subparser = train_parser.add_subparsers( - title='''Inputs types implemented in clinicadl''', - description='''What type of input do you want to use? - (image, patch, roi, slice).''', - dest='mode', - help='''****** Input types proposed by clinicadl ******''') + title="""Inputs types implemented in clinicadl""", + description="""What type of input do you want to use? + (image, patch, roi, slice).""", + dest="mode", + help="""****** Input types proposed by clinicadl ******""", + ) train_subparser.required = True # Transfer learning transfer_learning_parent = argparse.ArgumentParser(add_help=False) transfer_learning_group = transfer_learning_parent.add_argument_group( - TRAIN_CATEGORIES["TRANSFER LEARNING"]) + TRAIN_CATEGORIES["TRANSFER LEARNING"] + ) transfer_learning_group.add_argument( - '--transfer_learning_path', + "--transfer_learning_path", help="If an existing path is given, a pretrained model is used.", - type=str, default=None) + type=str, + default=None, + ) # Autoencoder autoencoder_parent = argparse.ArgumentParser(add_help=False) autoencoder_group = autoencoder_parent.add_argument_group( - TRAIN_CATEGORIES["AUTOENCODER"]) + TRAIN_CATEGORIES["AUTOENCODER"] + ) autoencoder_group.add_argument( - '--visualization', - help='Save examples of image reconstructions.', + "--visualization", + help="Save examples of image reconstructions.", action="store_true", - default=False) + default=False, + ) ###################### # IMAGE ###################### train_image_parser = train_subparser.add_parser( - "image", - help="Train a 3D image-level network.") + "image", help="Train a 3D image-level network." 
+ ) train_image_subparser = train_image_parser.add_subparsers( - title='''Task to be performed''', - description='''Autoencoder reconstruction or cnn classification ?''', - dest='network_type', - help='''****** Choose a type of network ******''') + title="""Task to be performed""", + description="""Autoencoder reconstruction or cnn classification ?""", + dest="network_type", + help="""****** Choose a type of network ******""", + ) train_parent_parser = return_train_parent_parser() train_image_ae_parser = train_image_subparser.add_parser( @@ -728,23 +802,26 @@ def preprocessing_help(args): parent_parser, train_parent_parser, autoencoder_parent, - transfer_learning_parent], - help="Train an image-level autoencoder.") + transfer_learning_parent, + ], + help="Train an image-level autoencoder.", + ) train_image_ae_parser.set_defaults(func=train_func) train_image_cnn_parser = train_image_subparser.add_parser( "cnn", - parents=[ - parent_parser, - train_parent_parser, - transfer_learning_parent], - help="Train an image-level CNN.") + parents=[parent_parser, train_parent_parser, transfer_learning_parent], + help="Train an image-level CNN.", + ) # /!\ If parents list is changed the arguments won't be in the right group anymore ! train_image_cnn_parser._action_groups[-1].add_argument( - '--transfer_learning_selection', + "--transfer_learning_selection", help="If transfer_learning from CNN, chooses which best transfer model is selected.", - type=str, default="best_balanced_accuracy", choices=["best_loss", "best_balanced_accuracy"]) + type=str, + default="best_balanced_accuracy", + choices=["best_loss", "best_balanced_accuracy"], + ) train_image_cnn_parser.set_defaults(func=train_func) @@ -752,37 +829,44 @@ def preprocessing_help(args): # PATCH ######################### train_patch_parser = train_subparser.add_parser( - "patch", - help="Train a 3D patch-level network.") + "patch", help="Train a 3D patch-level network." 
+ ) train_patch_parent = argparse.ArgumentParser(add_help=False) - train_patch_group = train_patch_parent.add_argument_group( - TRAIN_CATEGORIES["PATCH"]) + train_patch_group = train_patch_parent.add_argument_group(TRAIN_CATEGORIES["PATCH"]) train_patch_group.add_argument( - '-ps', '--patch_size', - help='Patch size', - type=int, default=50) + "-ps", "--patch_size", help="Patch size", type=int, default=50 + ) train_patch_group.add_argument( - '-ss', '--stride_size', - help='Stride size', - type=int, default=50) + "-ss", "--stride_size", help="Stride size", type=int, default=50 + ) train_patch_group.add_argument( - '--use_extracted_patches', - help='''If provided the outputs of extract preprocessing are used, else the whole - MRI is loaded.''', - default=False, action="store_true") + "--use_extracted_patches", + help="""If provided the outputs of extract preprocessing are used, else the whole + MRI is loaded.""", + default=False, + action="store_true", + ) train_patch_subparser = train_patch_parser.add_subparsers( - title='''Task to be performed''', - description='''Autoencoder reconstruction or (multi)cnn classification ?''', - dest='network_type', - help='''****** Choose a type of network ******''') + title="""Task to be performed""", + description="""Autoencoder reconstruction or (multi)cnn classification ?""", + dest="network_type", + help="""****** Choose a type of network ******""", + ) train_patch_subparser.required = True train_patch_ae_parser = train_patch_subparser.add_parser( "autoencoder", - parents=[parent_parser, train_parent_parser, train_patch_parent, autoencoder_parent, transfer_learning_parent], - help="Train a 3D patch-level autoencoder.") + parents=[ + parent_parser, + train_parent_parser, + train_patch_parent, + autoencoder_parent, + transfer_learning_parent, + ], + help="Train a 3D patch-level autoencoder.", + ) train_patch_ae_parser.set_defaults(func=train_func) @@ -792,22 +876,30 @@ def preprocessing_help(args): parent_parser, 
train_parent_parser, train_patch_parent, - transfer_learning_parent], - help="Train a 3D patch-level CNN.") + transfer_learning_parent, + ], + help="Train a 3D patch-level CNN.", + ) # /!\ If parents list is changed the arguments won't be in the right group anymore ! train_patch_cnn_parser._action_groups[-1].add_argument( - '--transfer_learning_selection', + "--transfer_learning_selection", help="If transfer_learning from CNN, chooses which best transfer model is selected.", - type=str, default="best_balanced_accuracy", choices=["best_loss", "best_balanced_accuracy"]) + type=str, + default="best_balanced_accuracy", + choices=["best_loss", "best_balanced_accuracy"], + ) train_patch_cnn_group = train_patch_cnn_parser.add_argument_group( - TRAIN_CATEGORIES["PATCH CNN"]) + TRAIN_CATEGORIES["PATCH CNN"] + ) train_patch_cnn_group.add_argument( - '--selection_threshold', - help='''Threshold on the balanced accuracies to compute the + "--selection_threshold", + help="""Threshold on the balanced accuracies to compute the subject-level performance. Patches are selected if their balanced - accuracy > threshold. Default corresponds to no selection.''', - type=float, default=0.0) + accuracy > threshold. Default corresponds to no selection.""", + type=float, + default=0.0, + ) train_patch_cnn_parser.set_defaults(func=train_func) @@ -817,22 +909,30 @@ def preprocessing_help(args): parent_parser, train_parent_parser, train_patch_parent, - transfer_learning_parent], - help="Train a 3D patch-level multi-CNN (one CNN is trained per patch location).") + transfer_learning_parent, + ], + help="Train a 3D patch-level multi-CNN (one CNN is trained per patch location).", + ) # /!\ If parents list is changed the arguments won't be in the right group anymore ! 
train_patch_multicnn_parser._action_groups[-1].add_argument( - '--transfer_learning_selection', + "--transfer_learning_selection", help="If transfer_learning from CNN, chooses which best transfer model is selected.", - type=str, default="best_balanced_accuracy", choices=["best_loss", "best_balanced_accuracy"]) + type=str, + default="best_balanced_accuracy", + choices=["best_loss", "best_balanced_accuracy"], + ) train_patch_multicnn_group = train_patch_multicnn_parser.add_argument_group( - TRAIN_CATEGORIES["PATCH CNN"]) + TRAIN_CATEGORIES["PATCH CNN"] + ) train_patch_multicnn_group.add_argument( - '--selection_threshold', - help='''Threshold on the balanced accuracies to compute the + "--selection_threshold", + help="""Threshold on the balanced accuracies to compute the subject-level performance. Patches are selected if their balanced - accuracy > threshold. Default corresponds to no selection.''', - type=float, default=0.0) + accuracy > threshold. Default corresponds to no selection.""", + type=float, + default=0.0, + ) train_patch_multicnn_parser.set_defaults(func=train_func) @@ -840,33 +940,41 @@ def preprocessing_help(args): # ROI ######################### train_roi_parser = train_subparser.add_parser( - "roi", - help="Train a ROI-based level network.") + "roi", help="Train a ROI-based level network." + ) train_roi_parent = argparse.ArgumentParser(add_help=False) - train_roi_group = train_roi_parent.add_argument_group( - TRAIN_CATEGORIES["ROI"]) + train_roi_group = train_roi_parent.add_argument_group(TRAIN_CATEGORIES["ROI"]) train_roi_group.add_argument( - '-rl', '--roi_list', - help='Names of the regions used for the classification task.' - 'Default will use the hippocampi as described in (Wen et al, 2019).', - type=str, nargs="+", default=None) + "-rl", + "--roi_list", + help="Names of the regions used for the classification task." 
+ "Default will use the hippocampi as described in (Wen et al, 2019).", + type=str, + nargs="+", + default=None, + ) train_roi_group.add_argument( - '--uncropped_roi', - help='If given the image is as large as the whole image. Default will crop the image' - 'with the smallest bounding box possible.', - action='store_true', default=False) + "--uncropped_roi", + help="If given the image is as large as the whole image. Default will crop the image" + "with the smallest bounding box possible.", + action="store_true", + default=False, + ) train_roi_group.add_argument( - '--use_extracted_roi', - help='''If provided the outputs of extract preprocessing are used, else the whole - MRI is loaded.''', - default=False, action="store_true") + "--use_extracted_roi", + help="""If provided the outputs of extract preprocessing are used, else the whole + MRI is loaded.""", + default=False, + action="store_true", + ) train_roi_subparser = train_roi_parser.add_subparsers( - title='''Task to be performed''', - description='''Autoencoder reconstruction or cnn classification ?''', - dest='network_type', - help='''****** Choose a type of network ******''') + title="""Task to be performed""", + description="""Autoencoder reconstruction or cnn classification ?""", + dest="network_type", + help="""****** Choose a type of network ******""", + ) train_roi_subparser.required = True train_roi_ae_parser = train_roi_subparser.add_parser( @@ -876,9 +984,10 @@ def preprocessing_help(args): train_parent_parser, train_roi_parent, autoencoder_parent, - transfer_learning_parent + transfer_learning_parent, ], - help="Train a ROI-based autoencoder.") + help="Train a ROI-based autoencoder.", + ) train_roi_ae_parser.set_defaults(func=train_func) @@ -888,22 +997,30 @@ def preprocessing_help(args): parent_parser, train_parent_parser, train_roi_parent, - transfer_learning_parent], - help="Train a ROI-based CNN.") + transfer_learning_parent, + ], + help="Train a ROI-based CNN.", + ) # /!\ If parents list is 
changed the arguments won't be in the right group anymore ! train_roi_cnn_parser._action_groups[-1].add_argument( - '--transfer_learning_selection', + "--transfer_learning_selection", help="If transfer_learning from CNN, chooses which best transfer model is selected.", - type=str, default="best_balanced_accuracy", choices=["best_loss", "best_balanced_accuracy"]) + type=str, + default="best_balanced_accuracy", + choices=["best_loss", "best_balanced_accuracy"], + ) train_roi_cnn_group = train_roi_cnn_parser.add_argument_group( - TRAIN_CATEGORIES["ROI CNN"]) + TRAIN_CATEGORIES["ROI CNN"] + ) train_roi_cnn_group.add_argument( - '--selection_threshold', - help='''Threshold on the balanced accuracies to compute the + "--selection_threshold", + help="""Threshold on the balanced accuracies to compute the subject-level performance. ROIs are selected if their balanced - accuracy > threshold. Default corresponds to no selection.''', - type=float, default=0.0) + accuracy > threshold. Default corresponds to no selection.""", + type=float, + default=0.0, + ) train_roi_cnn_parser.set_defaults(func=train_func) @@ -913,22 +1030,30 @@ def preprocessing_help(args): parent_parser, train_parent_parser, train_roi_parent, - transfer_learning_parent], - help="Train a ROI-based multi-CNN (one CNN is trained per patch location).") + transfer_learning_parent, + ], + help="Train a ROI-based multi-CNN (one CNN is trained per patch location).", + ) # /!\ If parents list is changed the arguments won't be in the right group anymore ! 
train_roi_multicnn_parser._action_groups[-1].add_argument( - '--transfer_learning_selection', + "--transfer_learning_selection", help="If transfer_learning from CNN, chooses which best transfer model is selected.", - type=str, default="best_balanced_accuracy", choices=["best_loss", "best_balanced_accuracy"]) + type=str, + default="best_balanced_accuracy", + choices=["best_loss", "best_balanced_accuracy"], + ) train_roi_multicnn_group = train_roi_multicnn_parser.add_argument_group( - TRAIN_CATEGORIES["ROI CNN"]) + TRAIN_CATEGORIES["ROI CNN"] + ) train_roi_multicnn_group.add_argument( - '--selection_threshold', - help='''Threshold on the balanced accuracies to compute the + "--selection_threshold", + help="""Threshold on the balanced accuracies to compute the subject-level performance. Patches are selected if their balanced - accuracy > threshold. Default corresponds to no selection.''', - type=float, default=0.0) + accuracy > threshold. Default corresponds to no selection.""", + type=float, + default=0.0, + ) train_roi_multicnn_parser.set_defaults(func=train_func) @@ -936,85 +1061,122 @@ def preprocessing_help(args): # SLICE ######################### train_slice_parser = train_subparser.add_parser( - "slice", - help="Train a 2D slice-level network.") + "slice", help="Train a 2D slice-level network." 
+ ) train_slice_subparser = train_slice_parser.add_subparsers( - title='''Task to be performed''', - description='''Autoencoder reconstruction or cnn classification ?''', - dest='network_type', - help='''****** Choose a type of network ******''') + title="""Task to be performed""", + description="""Autoencoder reconstruction or cnn classification ?""", + dest="network_type", + help="""****** Choose a type of network ******""", + ) train_slice_subparser.required = True train_slice_parent = argparse.ArgumentParser(add_help=False) - train_slice_group = train_slice_parent.add_argument_group( - TRAIN_CATEGORIES["SLICE"]) + train_slice_group = train_slice_parent.add_argument_group(TRAIN_CATEGORIES["SLICE"]) train_slice_group.add_argument( - '--slice_direction', '-sd', - help='''Which coordinate axis to take for slicing the MRI. + "--slice_direction", + "-sd", + help="""Which coordinate axis to take for slicing the MRI. 0 for sagittal 1 for coronal - 2 for axial direction.''', - default=0, type=int) + 2 for axial direction.""", + default=0, + type=int, + ) train_slice_group.add_argument( - '--discarded_slices', - help='''Number of slices discarded from respectively the beginning and + "--discarded_slices", + help="""Number of slices discarded from respectively the beginning and the end of the MRI volume. 
If only one argument is given, it will be - used for both sides.''', - default=20, type=int, nargs='+' + used for both sides.""", + default=20, + type=int, + nargs="+", ) train_slice_group.add_argument( - '--use_extracted_slices', - help='''If provided the outputs of extract preprocessing are used, else the whole - MRI is loaded.''', - default=False, action="store_true") + "--use_extracted_slices", + help="""If provided the outputs of extract preprocessing are used, else the whole + MRI is loaded.""", + default=False, + action="store_true", + ) train_slice_ae_parser = train_slice_subparser.add_parser( "autoencoder", - parents=[parent_parser, train_parent_parser, train_slice_parent, transfer_learning_parent], - help="Train a 2D slice-level autoencoder.") + parents=[ + parent_parser, + train_parent_parser, + train_slice_parent, + transfer_learning_parent, + ], + help="Train a 2D slice-level autoencoder.", + ) train_slice_ae_parser.set_defaults(func=train_func) train_slice_cnn_parser = train_slice_subparser.add_parser( "cnn", - parents=[parent_parser, train_parent_parser, train_slice_parent, transfer_learning_parent], - help="Train a 2D slice-level CNN.") + parents=[ + parent_parser, + train_parent_parser, + train_slice_parent, + transfer_learning_parent, + ], + help="Train a 2D slice-level CNN.", + ) # /!\ If parents list is changed the arguments won't be in the right group anymore ! 
train_slice_cnn_parser._action_groups[-1].add_argument( - '--transfer_learning_selection', + "--transfer_learning_selection", help="If transfer_learning from CNN, chooses which best transfer model is selected.", - type=str, default="best_balanced_accuracy", choices=["best_loss", "best_balanced_accuracy"]) + type=str, + default="best_balanced_accuracy", + choices=["best_loss", "best_balanced_accuracy"], + ) train_slice_cnn_group = train_slice_cnn_parser.add_argument_group( - TRAIN_CATEGORIES["SLICE CNN"]) + TRAIN_CATEGORIES["SLICE CNN"] + ) train_slice_cnn_group.add_argument( - '--selection_threshold', - help='''Threshold on the balanced accuracies to compute the + "--selection_threshold", + help="""Threshold on the balanced accuracies to compute the subject-level performance. Slices are selected if their balanced - accuracy > threshold. Default corresponds to no selection.''', - type=float, default=0.0) + accuracy > threshold. Default corresponds to no selection.""", + type=float, + default=0.0, + ) train_slice_cnn_parser.set_defaults(func=train_func) train_slice_multicnn_parser = train_slice_subparser.add_parser( "multicnn", - parents=[parent_parser, train_parent_parser, train_slice_parent, transfer_learning_parent], - help="Train a 2D slice-level multi-CNN.") + parents=[ + parent_parser, + train_parent_parser, + train_slice_parent, + transfer_learning_parent, + ], + help="Train a 2D slice-level multi-CNN.", + ) # /!\ If parents list is changed the arguments won't be in the right group anymore ! 
train_slice_multicnn_parser._action_groups[-1].add_argument( - '--transfer_learning_selection', + "--transfer_learning_selection", help="If transfer_learning from CNN, chooses which best transfer model is selected.", - type=str, default="best_balanced_accuracy", choices=["best_loss", "best_balanced_accuracy"]) + type=str, + default="best_balanced_accuracy", + choices=["best_loss", "best_balanced_accuracy"], + ) train_slice_multicnn_group = train_slice_multicnn_parser.add_argument_group( - TRAIN_CATEGORIES["SLICE CNN"]) + TRAIN_CATEGORIES["SLICE CNN"] + ) train_slice_multicnn_group.add_argument( - '--selection_threshold', - help='''Threshold on the balanced accuracies to compute the + "--selection_threshold", + help="""Threshold on the balanced accuracies to compute the subject-level performance. Slices are selected if their balanced - accuracy > threshold. Default corresponds to no selection.''', - type=float, default=0.0) + accuracy > threshold. Default corresponds to no selection.""", + type=float, + default=0.0, + ) train_slice_multicnn_parser.set_defaults(func=train_func) @@ -1024,15 +1186,15 @@ def preprocessing_help(args): train_json_parser = train_subparser.add_parser( "from_json", parents=[parent_parser], - help="Train a network as defined in a JSON file.") + help="Train a network as defined in a JSON file.", + ) train_json_group = train_json_parser.add_argument_group( - TRAIN_CATEGORIES["POSITIONAL"]) - train_json_group.add_argument( - "json_path", type=str, - help="Path to the JSON file.") + TRAIN_CATEGORIES["POSITIONAL"] + ) + train_json_group.add_argument("json_path", type=str, help="Path to the JSON file.") train_json_group.add_argument( - "output_dir", type=str, - help="Directory in which the new job is stored.") + "output_dir", type=str, help="Directory in which the new job is stored." 
+ ) train_json_parser.set_defaults(func=retrain_func) @@ -1040,44 +1202,54 @@ def preprocessing_help(args): # RESUME ######################### resume_parser = train_subparser.add_parser( - 'resume', + "resume", parents=[parent_parser], - help='Resume all jobs prematurely ended in launch_dir.' + help="Resume all jobs prematurely ended in launch_dir.", ) resume_parser.add_argument( - "model_path", - type=str, - help="Directory containing the random_search.json file." + "model_path", type=str, help="Directory containing the random_search.json file." ) resume_comp_group = resume_parser.add_argument_group( TRAIN_CATEGORIES["COMPUTATIONAL"] ) resume_comp_group.add_argument( - "-np", "--nproc", - help='Number of cores used the quality check. ' - 'Default will reuse the same value than in training.', - type=int, default=None + "-np", + "--nproc", + help="Number of cores used the quality check. " + "Default will reuse the same value than in training.", + type=int, + default=None, ) resume_comp_group.add_argument( - '-cpu', '--use_cpu', action='store_true', default=False, - help='Override the previous command line to use CPU.', + "-cpu", + "--use_cpu", + action="store_true", + default=False, + help="Override the previous command line to use CPU.", ) resume_comp_group.add_argument( - '-gpu', '--use_gpu', action='store_true', default=False, - help='Override the previous command line to use GPU.', + "-gpu", + "--use_gpu", + action="store_true", + default=False, + help="Override the previous command line to use GPU.", ) resume_comp_group.add_argument( - '--batch_size', - default=None, type=int, - help='Batch size for data loading. ' - 'Default will reuse the same value than in training.') + "--batch_size", + default=None, + type=int, + help="Batch size for data loading. 
" + "Default will reuse the same value than in training.", + ) resume_comp_group.add_argument( - '--evaluation_steps', '-esteps', - default=None, type=int, - help='Fix the number of iterations to perform before computing an evaluation. ' - 'Default will reuse the same value than in training.' + "--evaluation_steps", + "-esteps", + default=None, + type=int, + help="Fix the number of iterations to perform before computing an evaluation. " + "Default will reuse the same value than in training.", ) resume_parser.set_defaults(func=resume_func) @@ -1087,346 +1259,471 @@ def preprocessing_help(args): # classify_parser: get command line arguments and options classify_parser = subparser.add_parser( - 'classify', + "classify", parents=[parent_parser], - help='''Classify one image or a list of images with your previously - trained model.''') + help="""Classify one image or a list of images with your previously + trained model.""", + ) classify_pos_group = classify_parser.add_argument_group( - TRAIN_CATEGORIES["POSITIONAL"]) + TRAIN_CATEGORIES["POSITIONAL"] + ) classify_pos_group.add_argument( - 'caps_directory', - help='Data using CAPS structure.', - default=None) + "caps_directory", help="Data using CAPS structure.", default=None + ) classify_pos_group.add_argument( - 'tsv_path', - help='''Path to the file with subjects/sessions to process. + "tsv_path", + help="""Path to the file with subjects/sessions to process. If it includes the filename will load the tsv file directly. - Else will load the baseline tsv files of wanted diagnoses produced by tsvtool.''', - default=None) + Else will load the baseline tsv files of wanted diagnoses produced by tsvtool.""", + default=None, + ) classify_pos_group.add_argument( - 'model_path', - help='''Path to the folder where the model is stored. Folder structure - should be the same obtained during the training.''', - default=None) + "model_path", + help="""Path to the folder where the model is stored. 
Folder structure + should be the same obtained during the training.""", + default=None, + ) classify_pos_group.add_argument( - 'prefix_output', - help='Prefix to name the files resulting from the classify task.', - type=str) + "prefix_output", + help="Prefix to name the files resulting from the classify task.", + type=str, + ) # Computational resources classify_comput_group = classify_parser.add_argument_group( - TRAIN_CATEGORIES["COMPUTATIONAL"]) + TRAIN_CATEGORIES["COMPUTATIONAL"] + ) classify_comput_group.add_argument( - '-cpu', '--use_cpu', action='store_true', - help='Uses CPU instead of GPU.', - default=False) + "-cpu", + "--use_cpu", + action="store_true", + help="Uses CPU instead of GPU.", + default=False, + ) classify_comput_group.add_argument( - '-np', '--nproc', - help='Number of cores used during the task.', - type=int, default=2) + "-np", + "--nproc", + help="Number of cores used during the task.", + type=int, + default=2, + ) classify_comput_group.add_argument( - '--batch_size', - default=2, type=int, - help='Batch size for data loading. (default=2)') + "--batch_size", + default=2, + type=int, + help="Batch size for data loading. 
(default=2)", + ) # Specific classification arguments classify_specific_group = classify_parser.add_argument_group( TRAIN_CATEGORIES["OPTIONAL"] ) classify_specific_group.add_argument( - '-nl', '--no_labels', action='store_true', - help='Add this flag if your dataset does not contain a ground truth.', - default=False) + "-nl", + "--no_labels", + action="store_true", + help="Add this flag if your dataset does not contain a ground truth.", + default=False, + ) classify_specific_group.add_argument( - '--use_extracted_features', - help='''If True the extract slices or patche are used, otherwise the they - will be extracted on the fly (if necessary).''', - default=False, action="store_true") + "--use_extracted_features", + help="""If True the extract slices or patche are used, otherwise the they + will be extracted on the fly (if necessary).""", + default=False, + action="store_true", + ) classify_specific_group.add_argument( - '--selection_metrics', - help='''List of metrics to find the best models to evaluate. Default will - classify best model based on balanced accuracy.''', - choices=['loss', 'balanced_accuracy'], - default=['balanced_accuracy'], - nargs='+' + "--selection_metrics", + help="""List of metrics to find the best models to evaluate. Default will + classify best model based on balanced accuracy.""", + choices=["loss", "balanced_accuracy"], + default=["balanced_accuracy"], + nargs="+", ) classify_specific_group.add_argument( "--diagnoses", help="List of participants that will be classified.", - nargs="+", type=str, choices=['AD', 'CN', 'MCI', 'sMCI', 'pMCI'], default=None) + nargs="+", + type=str, + choices=["AD", "CN", "MCI", "sMCI", "pMCI"], + default=None, + ) classify_specific_group.add_argument( "--multi_cohort", help="Performs multi-cohort classification. 
In this case, caps_dir and tsv_path must be paths to TSV files.", action="store_true", - default=False + default=False, ) classify_parser.set_defaults(func=classify_func) tsv_parser = subparser.add_parser( - 'tsvtool', - help='''Handle tsv files for metadata processing and data splits.''') + "tsvtool", help="""Handle tsv files for metadata processing and data splits.""" + ) tsv_subparser = tsv_parser.add_subparsers( - title='''Task to execute with tsv tool:''', - description='''What kind of task do you want to use with tsv tool? - (restrict, getlabels, split, kfold, analysis).''', - dest='tsv_task', - help='''****** Tasks proposed by clinicadl tsv tool ******''') + title="""Task to execute with tsv tool:""", + description="""What kind of task do you want to use with tsv tool? + (restrict, getlabels, split, kfold, analysis).""", + dest="tsv_task", + help="""****** Tasks proposed by clinicadl tsv tool ******""", + ) tsv_subparser.required = True tsv_restrict_subparser = tsv_subparser.add_parser( - 'restrict', - help='Reproduce restrictions applied to AIBL and OASIS datasets') + "restrict", help="Reproduce restrictions applied to AIBL and OASIS datasets" + ) tsv_restrict_subparser.add_argument( "dataset", help="dataset on which the restriction is performed.", choices=["AIBL", "OASIS"], - type=str) + type=str, + ) tsv_restrict_subparser.add_argument( "merged_tsv", help="Path to the file obtained by the command clinica iotools merge-tsv.", - type=str) + type=str, + ) tsv_restrict_subparser.add_argument( "results_path", help="Path to the output tsv file (filename included).", - type=str) + type=str, + ) tsv_restrict_subparser.set_defaults(func=tsv_restrict_func) tsv_getlabels_subparser = tsv_subparser.add_parser( - 'getlabels', - parents=[parent_parser], - help='Get labels in separate tsv files.') + "getlabels", parents=[parent_parser], help="Get labels in separate tsv files." 
+ ) tsv_getlabels_subparser.add_argument( "merged_tsv", help="Path to the file obtained by the command clinica iotools merge-tsv.", - type=str) + type=str, + ) tsv_getlabels_subparser.add_argument( "missing_mods", help="Path to the folder where the outputs of clinica iotools missing-mods are.", - type=str) + type=str, + ) tsv_getlabels_subparser.add_argument( "results_path", type=str, - help="Path to the folder where tsv files are extracted.") + help="Path to the folder where tsv files are extracted.", + ) # Optional arguments tsv_getlabels_subparser.add_argument( - "--modality", "-mod", + "--modality", + "-mod", help="Modality to select sessions. Sessions which do not include the modality will be excluded.", - default="t1w", type=str) + default="t1w", + type=str, + ) tsv_getlabels_subparser.add_argument( "--diagnoses", help="Labels that must be extracted from merged_tsv.", - nargs="+", type=str, choices=['AD', 'BV', 'CN', 'MCI', 'sMCI', 'pMCI'], default=['AD', 'CN']) + nargs="+", + type=str, + choices=["AD", "BV", "CN", "MCI", "sMCI", "pMCI"], + default=["AD", "CN"], + ) tsv_getlabels_subparser.add_argument( "--time_horizon", help="Time horizon to analyse stability of MCI subjects.", - default=36, type=int) + default=36, + type=int, + ) tsv_getlabels_subparser.add_argument( "--restriction_path", help="Path to a tsv containing the sessions that can be included.", - type=str, default=None) + type=str, + default=None, + ) tsv_getlabels_subparser.add_argument( "--variables_of_interest", help="Variables of interest that will be kept in the final lists." 
- "Default will keep the diagnosis, age and the sex needed for the split procedure.", - type=str, nargs="+", default=None) + "Default will keep the diagnosis, age and the sex needed for the split procedure.", + type=str, + nargs="+", + default=None, + ) tsv_getlabels_subparser.add_argument( "--keep_smc", help="This flag allows to keep SMC participants, else they are removed.", - default=False, action="store_true" + default=False, + action="store_true", ) tsv_getlabels_subparser.set_defaults(func=tsv_getlabels_func) tsv_split_subparser = tsv_subparser.add_parser( - 'split', + "split", parents=[parent_parser], - help='Performs one stratified shuffle split on participant level.') + help="Performs one stratified shuffle split on participant level.", + ) tsv_split_subparser.add_argument( "formatted_data_path", help="Path to the folder containing data extracted by clinicadl tsvtool getlabels.", - type=str) + type=str, + ) # Optional arguments tsv_split_subparser.add_argument( "--n_test", help="If >= 1, number of subjects to put in set with name 'subset_name'. " - "If < 1, proportion of subjects to put set with name 'subset_name'. " - "If 0, no training set is created and the whole dataset is considered as one set with name 'subset_name.", - type=float, default=100.) + "If < 1, proportion of subjects to put set with name 'subset_name'. 
" + "If 0, no training set is created and the whole dataset is considered as one set with name 'subset_name.", + type=float, + default=100.0, + ) tsv_split_subparser.add_argument( "--MCI_sub_categories", help="Deactivate default managing of MCI sub-categories to avoid data leakage.", - action="store_false", default=True) + action="store_false", + default=True, + ) tsv_split_subparser.add_argument( - "--p_sex_threshold", "-ps", + "--p_sex_threshold", + "-ps", help="The threshold used for the chi2 test on sex distributions.", - default=0.80, type=float) + default=0.80, + type=float, + ) tsv_split_subparser.add_argument( - "--p_age_threshold", "-pa", + "--p_age_threshold", + "-pa", help="The threshold used for the T-test on age distributions.", - default=0.80, type=float) + default=0.80, + type=float, + ) tsv_split_subparser.add_argument( "--subset_name", help="Name of the subset that is complementary to train.", - type=str, default="test") + type=str, + default="test", + ) tsv_split_subparser.add_argument( "--ignore_demographics", help="If True do not use age and sex to create the splits.", - default=False, action="store_true" + default=False, + action="store_true", ) tsv_split_subparser.add_argument( "--categorical_split_variable", help="Name of a categorical variable used for a stratified shuffle split " - "(in addition to age and sex selection).", - default=None, type=str + "(in addition to age and sex selection).", + default=None, + type=str, ) tsv_split_subparser.set_defaults(func=tsv_split_func) tsv_kfold_subparser = tsv_subparser.add_parser( - 'kfold', + "kfold", parents=[parent_parser], - help='Performs a k-fold split on participant level.') + help="Performs a k-fold split on participant level.", + ) tsv_kfold_subparser.add_argument( "formatted_data_path", help="Path to the folder containing data extracted by clinicadl tsvtool getlabels.", - type=str) + type=str, + ) # Optional arguments tsv_kfold_subparser.add_argument( "--n_splits", help="Number of folds in 
the k-fold split." - "If 0, there is no training set and the whole dataset is considered as a test set.", - type=int, default=5) + "If 0, there is no training set and the whole dataset is considered as a test set.", + type=int, + default=5, + ) tsv_kfold_subparser.add_argument( "--MCI_sub_categories", help="Deactivate default managing of MCI sub-categories to avoid data leakage.", - action="store_false", default=True) + action="store_false", + default=True, + ) tsv_kfold_subparser.add_argument( "--subset_name", help="Name of the subset that is complementary to train.", - type=str, default="validation") + type=str, + default="validation", + ) tsv_kfold_subparser.add_argument( "--stratification", help="Name of a variable used to stratify the k-fold split.", - type=str, default=None) + type=str, + default=None, + ) tsv_kfold_subparser.set_defaults(func=tsv_kfold_func) tsv_analysis_subparser = tsv_subparser.add_parser( - 'analysis', - help='Produces a demographic analysis of the extracted labels.') + "analysis", help="Produces a demographic analysis of the extracted labels." 
+ ) tsv_analysis_subparser.add_argument( "merged_tsv", help="Path to the file obtained by the command clinica iotools merge-tsv.", - type=str) + type=str, + ) tsv_analysis_subparser.add_argument( "formatted_data_path", help="Path to the folder containing data extracted by clinicadl tsvtool getlabels.", - type=str) + type=str, + ) tsv_analysis_subparser.add_argument( "results_path", help="Path to the output tsv file (filename included).", - type=str) + type=str, + ) # Modality selection tsv_analysis_subparser.add_argument( "--diagnoses", help="Labels selected for the demographic analysis.", - default=['AD', 'CN'], nargs="+", type=str, choices=['AD', 'BV', 'CN', 'MCI', 'sMCI', 'pMCI']) + default=["AD", "CN"], + nargs="+", + type=str, + choices=["AD", "BV", "CN", "MCI", "sMCI", "pMCI"], + ) tsv_analysis_subparser.set_defaults(func=tsv_analysis_func) interpret_parser = subparser.add_parser( - 'interpret', - help='''Interpret classification performed by a CNN with saliency maps.''') + "interpret", + help="""Interpret classification performed by a CNN with saliency maps.""", + ) interpret_subparser = interpret_parser.add_subparsers( - title='''Type of saliency map to perform:''', - description='''Do you want to perform a group saliency map or individual ones?''', - dest='task', - help='''****** Saliency maps proposed by clinicadl ******''' + title="""Type of saliency map to perform:""", + description="""Do you want to perform a group saliency map or individual ones?""", + dest="task", + help="""****** Saliency maps proposed by clinicadl ******""", ) interpret_subparser.required = True interpret_parent_parser = argparse.ArgumentParser(add_help=False) - interpret_pos_group = interpret_parent_parser.add_argument_group(TRAIN_CATEGORIES["POSITIONAL"]) + interpret_pos_group = interpret_parent_parser.add_argument_group( + TRAIN_CATEGORIES["POSITIONAL"] + ) interpret_pos_group.add_argument( - "model_path", type=str, - help="Path to the model output directory.") + "model_path", 
type=str, help="Path to the model output directory." + ) interpret_pos_group.add_argument( - "name", type=str, - help="Name of the interpretation map.") + "name", type=str, help="Name of the interpretation map." + ) - interpret_comput_group = interpret_parent_parser.add_argument_group(TRAIN_CATEGORIES["COMPUTATIONAL"]) + interpret_comput_group = interpret_parent_parser.add_argument_group( + TRAIN_CATEGORIES["COMPUTATIONAL"] + ) interpret_comput_group.add_argument( - "--batch_size", default=1, type=int, - help="Batch size for selection of images (keep_true).") + "--batch_size", + default=1, + type=int, + help="Batch size for selection of images (keep_true).", + ) interpret_comput_group.add_argument( - '-cpu', '--use_cpu', - action='store_true', default=False, - help='Uses gpu instead of cpu if cuda is available.') + "-cpu", + "--use_cpu", + action="store_true", + default=False, + help="Uses gpu instead of cpu if cuda is available.", + ) interpret_comput_group.add_argument( - '-np', '--nproc', - default=2, type=int, - help='the number of batches being loaded in parallel.') + "-np", + "--nproc", + default=2, + type=int, + help="the number of batches being loaded in parallel.", + ) - interpret_model_group = interpret_parent_parser.add_argument_group(TRAIN_CATEGORIES["MODEL"]) + interpret_model_group = interpret_parent_parser.add_argument_group( + TRAIN_CATEGORIES["MODEL"] + ) interpret_model_group.add_argument( - "--selection", default=['best_loss'], type=str, nargs="+", - choices=['best_loss', 'best_balanced_accuracy'], - help="Loads the model selected on minimal loss or maximum accuracy on validation.") + "--selection", + default=["best_loss"], + type=str, + nargs="+", + choices=["best_loss", "best_balanced_accuracy"], + help="Loads the model selected on minimal loss or maximum accuracy on validation.", + ) - interpret_data_group = interpret_parent_parser.add_argument_group(TRAIN_CATEGORIES["DATA"]) + interpret_data_group = interpret_parent_parser.add_argument_group( 
+ TRAIN_CATEGORIES["DATA"] + ) interpret_data_group.add_argument( - "--tsv_path", type=str, default=None, - help="TSV path with subjects/sessions to process, if different from classification task.") + "--tsv_path", + type=str, + default=None, + help="TSV path with subjects/sessions to process, if different from classification task.", + ) interpret_data_group.add_argument( - "--caps_dir", type=str, default=None, - help="Path to input dir of the MRI (preprocessed CAPS_dir), if different from classification task") + "--caps_dir", + type=str, + default=None, + help="Path to input dir of the MRI (preprocessed CAPS_dir), if different from classification task", + ) interpret_data_group.add_argument( "--multi_cohort", help="Performs multi-cohort interpretation. In this case, caps_dir and tsv_path must be paths to TSV files.", action="store_true", - default=False + default=False, ) interpret_data_group.add_argument( - "--diagnosis", "-d", default='AD', type=str, - help="The images corresponding to this diagnosis only will be loaded.") + "--diagnosis", + "-d", + default="AD", + type=str, + help="The images corresponding to this diagnosis only will be loaded.", + ) interpret_data_group.add_argument( - "--target_diagnosis", default=None, type=str, - help="Which class the gradients explain. If None is given will be equal to diagnosis.") + "--target_diagnosis", + default=None, + type=str, + help="Which class the gradients explain. If None is given will be equal to diagnosis.", + ) interpret_data_group.add_argument( - "--baseline", action="store_true", default=False, - help="If provided, only the baseline sessions are used for training.") + "--baseline", + action="store_true", + default=False, + help="If provided, only the baseline sessions are used for training.", + ) interpret_data_group.add_argument( - "--keep_true", type=lambda x: bool(strtobool(x)), default=None, - help="Chooses false or true positive values of the classification. 
No selection by default") + "--keep_true", + type=lambda x: bool(strtobool(x)), + default=None, + help="Chooses false or true positive values of the classification. No selection by default", + ) interpret_data_group.add_argument( - "--nifti_template_path", type=str, default=None, - help="Path to a nifti template to retrieve affine values.") + "--nifti_template_path", + type=str, + default=None, + help="Path to a nifti template to retrieve affine values.", + ) - interpret_display_group = interpret_parent_parser.add_argument_group(TRAIN_CATEGORIES["DISPLAY"]) + interpret_display_group = interpret_parent_parser.add_argument_group( + TRAIN_CATEGORIES["DISPLAY"] + ) interpret_display_group.add_argument( - "--vmax", type=float, default=0.5, - help="Maximum value used in 2D image display.") + "--vmax", + type=float, + default=0.5, + help="Maximum value used in 2D image display.", + ) interpret_group_parser = interpret_subparser.add_parser( "group", parents=[parent_parser, interpret_parent_parser], - help="Mean saliency map over a list of sessions" + help="Mean saliency map over a list of sessions", ) interpret_group_parser.set_defaults(func=interpret_func) @@ -1434,7 +1731,7 @@ def preprocessing_help(args): interpret_individual_parser = interpret_subparser.add_parser( "individual", parents=[parent_parser, interpret_parent_parser], - help="Individual saliency maps for each session in the input TSV file." 
+ help="Individual saliency maps for each session in the input TSV file.", ) interpret_individual_parser.set_defaults(func=interpret_func) @@ -1446,140 +1743,185 @@ def return_train_parent_parser(): # Main train parent parser common to train and random search train_parent_parser = argparse.ArgumentParser(add_help=False) train_pos_group = train_parent_parser.add_argument_group( - TRAIN_CATEGORIES["POSITIONAL"]) + TRAIN_CATEGORIES["POSITIONAL"] + ) train_pos_group.add_argument( - 'caps_dir', - help='Data using CAPS structure.', - default=None) + "caps_dir", help="Data using CAPS structure.", default=None + ) train_pos_group.add_argument( - 'preprocessing', - help='Defines the type of preprocessing of CAPS data.', - choices=['t1-linear', 't1-extensive', 't1-volume'], type=str) + "preprocessing", + help="Defines the type of preprocessing of CAPS data.", + choices=["t1-linear", "t1-extensive", "t1-volume"], + type=str, + ) train_pos_group.add_argument( - 'tsv_path', - help='TSV path with subjects/sessions to process.', - default=None) + "tsv_path", help="TSV path with subjects/sessions to process.", default=None + ) train_pos_group.add_argument( - 'output_dir', - help='Folder containing results of the training.', - default=None) + "output_dir", help="Folder containing results of the training.", default=None + ) train_pos_group.add_argument( - 'model', - help='CNN Model to be used during the training.', - default='Conv5_FC3') + "model", help="CNN Model to be used during the training.", default="Conv5_FC3" + ) train_comput_group = train_parent_parser.add_argument_group( - TRAIN_CATEGORIES["COMPUTATIONAL"]) + TRAIN_CATEGORIES["COMPUTATIONAL"] + ) train_comput_group.add_argument( - '-cpu', '--use_cpu', action='store_true', - help='If provided, will use CPU instead of GPU.', - default=False) + "-cpu", + "--use_cpu", + action="store_true", + help="If provided, will use CPU instead of GPU.", + default=False, + ) train_comput_group.add_argument( - '-np', '--nproc', - 
help='Number of cores used during the training. (default=2)', - type=int, default=2) + "-np", + "--nproc", + help="Number of cores used during the training. (default=2)", + type=int, + default=2, + ) train_comput_group.add_argument( - '--batch_size', - default=2, type=int, - help='Batch size for training. (default=2)') + "--batch_size", default=2, type=int, help="Batch size for training. (default=2)" + ) train_comput_group.add_argument( - '--evaluation_steps', '-esteps', - default=0, type=int, - help='Fix the number of iterations to perform before computing an evaluation. Default will only ' - 'perform one evaluation at the end of each epoch.') + "--evaluation_steps", + "-esteps", + default=0, + type=int, + help="Fix the number of iterations to perform before computing an evaluation. Default will only " + "perform one evaluation at the end of each epoch.", + ) - train_data_group = train_parent_parser.add_argument_group( - TRAIN_CATEGORIES["DATA"]) + train_data_group = train_parent_parser.add_argument_group(TRAIN_CATEGORIES["DATA"]) train_data_group.add_argument( "--multi_cohort", help="Performs multi-cohort training. 
In this case, caps_dir and tsv_path must be paths to TSV files.", action="store_true", - default=False + default=False, ) train_data_group.add_argument( - '--diagnoses', '-d', - help='List of diagnoses that will be selected for training.', - default=['AD', 'CN'], nargs='+', type=str, - choices=['AD', 'BV', 'CN', 'MCI', 'sMCI', 'pMCI']) + "--diagnoses", + "-d", + help="List of diagnoses that will be selected for training.", + default=["AD", "CN"], + nargs="+", + type=str, + choices=["AD", "BV", "CN", "MCI", "sMCI", "pMCI"], + ) train_data_group.add_argument( - '--baseline', - help='If provided, only the baseline sessions are used for training.', + "--baseline", + help="If provided, only the baseline sessions are used for training.", action="store_true", - default=False) + default=False, + ) train_data_group.add_argument( - '--unnormalize', '-un', - help='Disable default MinMaxNormalization.', + "--unnormalize", + "-un", + help="Disable default MinMaxNormalization.", action="store_true", - default=False) + default=False, + ) train_data_group.add_argument( - "--data_augmentation", nargs="+", default=False, + "--data_augmentation", + nargs="+", + default=False, choices=["None", "Noise", "Erasing", "CropPad", "Smoothing"], - help="Randomly applies transforms on the training set.") + help="Randomly applies transforms on the training set.", + ) train_data_group.add_argument( - '--sampler', '-s', + "--sampler", + "-s", help="Sampler choice (random, or weighted for imbalanced datasets)", - default="random", type=str, choices=["random", "weighted"]) + default="random", + type=str, + choices=["random", "weighted"], + ) train_data_group.add_argument( "--predict_atlas_intensities", help="Atlases used in t1-volume pipeline to make intensities prediction.", - default=None, type=str, - choices=["AAL2", "AICHA", "Hammers", "LPBA40", "Neuromorphometrics"] + default=None, + type=str, + choices=["AAL2", "AICHA", "Hammers", "LPBA40", "Neuromorphometrics"], ) 
train_data_group.add_argument( "--atlas_weight", help="Weight to put on the MSE loss used to compute the error on atlas intensities.", - default=1, type=float, + default=1, + type=float, ) train_data_group.add_argument( - '--merged_tsv_path', - default="", type=str, + "--merged_tsv_path", + default="", + type=str, help="Path to the output of clinica iotools merged-tsv (concatenation for multi-cohort). " - "Can accelerate training if atlas intensities are predicted." + "Can accelerate training if atlas intensities are predicted.", ) train_cv_group = train_parent_parser.add_argument_group( - TRAIN_CATEGORIES["CROSS-VALIDATION"]) + TRAIN_CATEGORIES["CROSS-VALIDATION"] + ) train_cv_group.add_argument( - '--n_splits', - help='If a value is given for k will load data of a k-fold CV. ' - 'Default value (0) will load a single split.', - type=int, default=0) + "--n_splits", + help="If a value is given for k will load data of a k-fold CV. " + "Default value (0) will load a single split.", + type=int, + default=0, + ) train_cv_group.add_argument( - '--split', - help='Train the list of given folds. By default train all folds.', - type=int, default=None, nargs='+') + "--split", + help="Train the list of given folds. 
By default train all folds.", + type=int, + default=None, + nargs="+", + ) train_optim_group = train_parent_parser.add_argument_group( - TRAIN_CATEGORIES["OPTIMIZATION"]) + TRAIN_CATEGORIES["OPTIMIZATION"] + ) train_optim_group.add_argument( - '--epochs', - help='Maximum number of epochs.', - default=20, type=int) + "--epochs", help="Maximum number of epochs.", default=20, type=int + ) train_optim_group.add_argument( - '--learning_rate', '-lr', - help='Learning rate of the optimization.', - default=1e-4, type=float) + "--learning_rate", + "-lr", + help="Learning rate of the optimization.", + default=1e-4, + type=float, + ) train_optim_group.add_argument( - '--weight_decay', '-wd', - help='Weight decay value used in optimization.', - default=1e-4, type=float) + "--weight_decay", + "-wd", + help="Weight decay value used in optimization.", + default=1e-4, + type=float, + ) train_optim_group.add_argument( - '--dropout', - help='rate of dropout that will be applied to dropout layers in CNN.', - default=0, type=float) + "--dropout", + help="rate of dropout that will be applied to dropout layers in CNN.", + default=0, + type=float, + ) train_optim_group.add_argument( - '--patience', - help='Number of epochs for early stopping patience.', - type=int, default=0) + "--patience", + help="Number of epochs for early stopping patience.", + type=int, + default=0, + ) train_optim_group.add_argument( - '--tolerance', - help='Value for the early stopping tolerance.', - type=float, default=0.0) + "--tolerance", + help="Value for the early stopping tolerance.", + type=float, + default=0.0, + ) train_optim_group.add_argument( - '--accumulation_steps', '-asteps', - help='Accumulates gradients during the given number of iterations before performing the weight update ' - 'in order to virtually increase the size of the batch.', - default=1, type=int) + "--accumulation_steps", + "-asteps", + help="Accumulates gradients during the given number of iterations before performing the weight 
update " + "in order to virtually increase the size of the batch.", + default=1, + type=int, + ) # train_optim_group.add_argument( # "--loss", # help="Replaces default losses: cross-entropy for CNN and MSE for autoencoders.", diff --git a/clinicadl/clinicadl/interpret/gradients.py b/clinicadl/clinicadl/interpret/gradients.py index 8ed224a4c..70b6eaba6 100644 --- a/clinicadl/clinicadl/interpret/gradients.py +++ b/clinicadl/clinicadl/interpret/gradients.py @@ -3,8 +3,9 @@ class VanillaBackProp: """ - Produces gradients generated with vanilla back propagation from the image + Produces gradients generated with vanilla back propagation from the image """ + def __init__(self, model, gpu=False): self.model = model self.gradients = None diff --git a/clinicadl/clinicadl/interpret/group_backprop.py b/clinicadl/clinicadl/interpret/group_backprop.py index 936aa5051..0ef8a4a1f 100644 --- a/clinicadl/clinicadl/interpret/group_backprop.py +++ b/clinicadl/clinicadl/interpret/group_backprop.py @@ -1,16 +1,27 @@ -from os import path +import argparse import os -import numpy as np +import warnings +from os import path + +import matplotlib.pyplot as plt import nibabel as nib +import numpy as np from torch.utils.data import DataLoader -import argparse -import matplotlib.pyplot as plt -import warnings -from clinicadl.tools.deep_learning.iotools import read_json, commandline_to_json, translate_parameters, return_logger from clinicadl.tools.deep_learning.cnn_utils import get_criterion, sort_predicted +from clinicadl.tools.deep_learning.data import ( + get_transforms, + load_data_test, + return_dataset, +) +from clinicadl.tools.deep_learning.iotools import ( + commandline_to_json, + read_json, + return_logger, + translate_parameters, +) from clinicadl.tools.deep_learning.models import create_model, load_model -from clinicadl.tools.deep_learning.data import load_data_test, return_dataset, get_transforms + from .gradients import VanillaBackProp @@ -19,12 +30,16 @@ def group_backprop(options): 
main_logger = return_logger(options.verbose, "main process") options = translate_parameters(options) - fold_list = [fold for fold in os.listdir(options.model_path) if fold[:5:] == "fold-"] + fold_list = [ + fold for fold in os.listdir(options.model_path) if fold[:5:] == "fold-" + ] if len(fold_list) == 0: raise ValueError("No folds were found at path %s" % options.model_path) model_options = argparse.Namespace() - model_options = read_json(model_options, path.join(options.model_path, 'commandline.json')) + model_options = read_json( + model_options, path.join(options.model_path, "commandline.json") + ) model_options = translate_parameters(model_options) model_options.gpu = options.gpu @@ -47,69 +62,100 @@ def group_backprop(options): for fold in fold_list: main_logger.info(fold) for selection in options.selection: - results_path = path.join(options.model_path, fold, 'gradients', - selection, options.name) + results_path = path.join( + options.model_path, fold, "gradients", selection, options.name + ) criterion = get_criterion(model_options.loss) # Data management (remove data not well predicted by the CNN) - training_df = load_data_test(options.tsv_path, [options.diagnosis], baseline=options.baseline, - multi_cohort=options.multi_cohort) + training_df = load_data_test( + options.tsv_path, + [options.diagnosis], + baseline=options.baseline, + multi_cohort=options.multi_cohort, + ) training_df.reset_index(drop=True, inplace=True) # Model creation - _, all_transforms = get_transforms(model_options.mode, - minmaxnormalization=model_options.minmaxnormalization) + _, all_transforms = get_transforms( + model_options.mode, + minmaxnormalization=model_options.minmaxnormalization, + ) with warnings.catch_warnings(): warnings.simplefilter("ignore") - data_example = return_dataset(model_options.mode, options.input_dir, - training_df, model_options.preprocessing, - train_transformations=None, all_transformations=all_transforms, - prepare_dl=options.prepare_dl, 
multi_cohort=options.multi_cohort, - params=model_options) + data_example = return_dataset( + model_options.mode, + options.input_dir, + training_df, + model_options.preprocessing, + train_transformations=None, + all_transformations=all_transforms, + prepare_dl=options.prepare_dl, + multi_cohort=options.multi_cohort, + params=model_options, + ) model = create_model(model_options, data_example.size) - model_dir = os.path.join(options.model_path, fold, 'models', selection) - model, best_epoch = load_model(model, model_dir, gpu=options.gpu, filename='model_best.pth.tar') + model_dir = os.path.join(options.model_path, fold, "models", selection) + model, best_epoch = load_model( + model, model_dir, gpu=options.gpu, filename="model_best.pth.tar" + ) options.output_dir = results_path commandline_to_json(options, logger=main_logger) # Keep only subjects who were correctly / wrongly predicted by the network - training_df = sort_predicted(model, training_df, options.input_dir, model_options, - criterion, options.keep_true, - batch_size=options.batch_size, num_workers=options.num_workers, - gpu=options.gpu) + training_df = sort_predicted( + model, + training_df, + options.input_dir, + model_options, + criterion, + options.keep_true, + batch_size=options.batch_size, + num_workers=options.num_workers, + gpu=options.gpu, + ) if len(training_df) > 0: # Save the tsv files used for the saliency maps - training_df.to_csv(path.join('data.tsv'), sep='\t', index=False) + training_df.to_csv(path.join("data.tsv"), sep="\t", index=False) with warnings.catch_warnings(): warnings.simplefilter("ignore") - data_train = return_dataset(model_options.mode, options.input_dir, - training_df, model_options.preprocessing, - train_transformations=None, all_transformations=all_transforms, - prepare_dl=options.prepare_dl, multi_cohort=options.multi_cohort, - params=model_options) - - train_loader = DataLoader(data_train, - batch_size=options.batch_size, - shuffle=True, - 
num_workers=options.num_workers, - pin_memory=True) + data_train = return_dataset( + model_options.mode, + options.input_dir, + training_df, + model_options.preprocessing, + train_transformations=None, + all_transformations=all_transforms, + prepare_dl=options.prepare_dl, + multi_cohort=options.multi_cohort, + params=model_options, + ) + + train_loader = DataLoader( + data_train, + batch_size=options.batch_size, + shuffle=True, + num_workers=options.num_workers, + pin_memory=True, + ) interpreter = VanillaBackProp(model, gpu=options.gpu) cum_map = 0 for data in train_loader: if options.gpu: - input_batch = data['image'].cuda() + input_batch = data["image"].cuda() else: - input_batch = data['image'] + input_batch = data["image"] - maps = interpreter.generate_gradients(input_batch, - data_train.diagnosis_code[options.target_diagnosis]) + maps = interpreter.generate_gradients( + input_batch, data_train.diagnosis_code[options.target_diagnosis] + ) cum_map += maps.sum(axis=0) mean_map = cum_map / len(data_train) @@ -125,7 +171,12 @@ def group_backprop(options): nib.save(mean_map_nii, path.join(results_path, "map.nii.gz")) else: jpg_path = path.join(results_path, "map.jpg") - plt.imshow(mean_map[0], cmap="coolwarm", vmin=-options.vmax, vmax=options.vmax) + plt.imshow( + mean_map[0], + cmap="coolwarm", + vmin=-options.vmax, + vmax=options.vmax, + ) plt.colorbar() plt.savefig(jpg_path) plt.close() diff --git a/clinicadl/clinicadl/interpret/individual_backprop.py b/clinicadl/clinicadl/interpret/individual_backprop.py index dc93f264c..c79f2697d 100644 --- a/clinicadl/clinicadl/interpret/individual_backprop.py +++ b/clinicadl/clinicadl/interpret/individual_backprop.py @@ -1,16 +1,27 @@ -from os import path +import argparse import os -import numpy as np +import warnings +from os import path + +import matplotlib.pyplot as plt import nibabel as nib +import numpy as np from torch.utils.data import DataLoader -import argparse -import matplotlib.pyplot as plt -import warnings -from 
clinicadl.tools.deep_learning.iotools import read_json, commandline_to_json, translate_parameters, return_logger from clinicadl.tools.deep_learning.cnn_utils import get_criterion, sort_predicted +from clinicadl.tools.deep_learning.data import ( + get_transforms, + load_data_test, + return_dataset, +) +from clinicadl.tools.deep_learning.iotools import ( + commandline_to_json, + read_json, + return_logger, + translate_parameters, +) from clinicadl.tools.deep_learning.models import create_model, load_model -from clinicadl.tools.deep_learning.data import load_data_test, return_dataset, get_transforms + from .gradients import VanillaBackProp @@ -19,13 +30,17 @@ def individual_backprop(options): main_logger = return_logger(options.verbose, "main process") options = translate_parameters(options) - fold_list = [fold for fold in os.listdir(options.model_path) if fold[:5:] == "fold-"] + fold_list = [ + fold for fold in os.listdir(options.model_path) if fold[:5:] == "fold-" + ] if len(fold_list) == 0: raise ValueError("No folds were found at path %s" % options.model_path) model_options = argparse.Namespace() - model_options = read_json(model_options, path.join(options.model_path, 'commandline.json')) + model_options = read_json( + model_options, path.join(options.model_path, "commandline.json") + ) model_options = translate_parameters(model_options) model_options.gpu = options.gpu @@ -48,70 +63,105 @@ def individual_backprop(options): for fold in fold_list: main_logger.info(fold) for selection in options.selection: - results_path = path.join(options.model_path, fold, 'gradients', - selection, options.name) + results_path = path.join( + options.model_path, fold, "gradients", selection, options.name + ) criterion = get_criterion(model_options.loss) # Data management (remove data not well predicted by the CNN) - training_df = load_data_test(options.tsv_path, [options.diagnosis], baseline=options.baseline, - multi_cohort=options.multi_cohort) + training_df = load_data_test( + 
options.tsv_path, + [options.diagnosis], + baseline=options.baseline, + multi_cohort=options.multi_cohort, + ) training_df.reset_index(drop=True, inplace=True) # Model creation - _, all_transforms = get_transforms(model_options.mode, - minmaxnormalization=model_options.minmaxnormalization) + _, all_transforms = get_transforms( + model_options.mode, + minmaxnormalization=model_options.minmaxnormalization, + ) with warnings.catch_warnings(): warnings.simplefilter("ignore") - data_example = return_dataset(model_options.mode, options.input_dir, - training_df, model_options.preprocessing, - train_transformations=None, all_transformations=all_transforms, - prepare_dl=options.prepare_dl, multi_cohort=options.multi_cohort, - params=model_options) + data_example = return_dataset( + model_options.mode, + options.input_dir, + training_df, + model_options.preprocessing, + train_transformations=None, + all_transformations=all_transforms, + prepare_dl=options.prepare_dl, + multi_cohort=options.multi_cohort, + params=model_options, + ) model = create_model(model_options, data_example.size) - model_dir = os.path.join(options.model_path, fold, 'models', selection) - model, best_epoch = load_model(model, model_dir, gpu=options.gpu, filename='model_best.pth.tar') + model_dir = os.path.join(options.model_path, fold, "models", selection) + model, best_epoch = load_model( + model, model_dir, gpu=options.gpu, filename="model_best.pth.tar" + ) options.output_dir = results_path commandline_to_json(options, logger=main_logger) # Keep only subjects who were correctly / wrongly predicted by the network - training_df = sort_predicted(model, training_df, options.input_dir, model_options, - criterion, options.keep_true, - batch_size=options.batch_size, num_workers=options.num_workers, - gpu=options.gpu) + training_df = sort_predicted( + model, + training_df, + options.input_dir, + model_options, + criterion, + options.keep_true, + batch_size=options.batch_size, + num_workers=options.num_workers, 
+ gpu=options.gpu, + ) if len(training_df) > 0: # Save the tsv files used for the saliency maps - training_df.to_csv(path.join('data.tsv'), sep='\t', index=False) + training_df.to_csv(path.join("data.tsv"), sep="\t", index=False) with warnings.catch_warnings(): warnings.simplefilter("ignore") - data_train = return_dataset(model_options.mode, options.input_dir, - training_df, model_options.preprocessing, - train_transformations=None, all_transformations=all_transforms, - prepare_dl=options.prepare_dl, multi_cohort=options.multi_cohort, - params=model_options) - - train_loader = DataLoader(data_train, - batch_size=options.batch_size, - shuffle=True, - num_workers=options.num_workers, - pin_memory=True) + data_train = return_dataset( + model_options.mode, + options.input_dir, + training_df, + model_options.preprocessing, + train_transformations=None, + all_transformations=all_transforms, + prepare_dl=options.prepare_dl, + multi_cohort=options.multi_cohort, + params=model_options, + ) + + train_loader = DataLoader( + data_train, + batch_size=options.batch_size, + shuffle=True, + num_workers=options.num_workers, + pin_memory=True, + ) interpreter = VanillaBackProp(model, gpu=options.gpu) for data in train_loader: if options.gpu: - input_batch = data['image'].cuda() + input_batch = data["image"].cuda() else: - input_batch = data['image'] + input_batch = data["image"] - map_np = interpreter.generate_gradients(input_batch, - data_train.diagnosis_code[options.target_diagnosis]) + map_np = interpreter.generate_gradients( + input_batch, data_train.diagnosis_code[options.target_diagnosis] + ) for i in range(options.batch_size): - single_path = path.join(results_path, data['participant_id'][i], data['session_id'][i]) + single_path = path.join( + results_path, + data["participant_id"][i], + data["session_id"][i], + ) os.makedirs(single_path, exist_ok=True) if len(data_train.size) == 4: @@ -125,7 +175,12 @@ def individual_backprop(options): nib.save(map_nii, 
path.join(single_path, "map.nii.gz")) else: jpg_path = path.join(single_path, "map.jpg") - plt.imshow(map_np[i, 0, :, :], cmap="coolwarm", vmin=-options.vmax, vmax=options.vmax) + plt.imshow( + map_np[i, 0, :, :], + cmap="coolwarm", + vmin=-options.vmax, + vmax=options.vmax, + ) plt.colorbar() plt.savefig(jpg_path) plt.close() diff --git a/clinicadl/clinicadl/main.py b/clinicadl/clinicadl/main.py index cf40bbcb1..2c3fd5178 100644 --- a/clinicadl/clinicadl/main.py +++ b/clinicadl/clinicadl/main.py @@ -1,24 +1,34 @@ # coding: utf8 -from . import cli import torch +from . import cli + def main(): parser = cli.parse_command_line() args = parser.parse_args() - if (args.version): + if args.version: import clinicadl + print(f"ClinicaDL version is: {clinicadl.__version__}") exit(0) - if hasattr(args, 'use_cpu'): - if args.use_cpu is not None and not args.use_cpu and not torch.cuda.is_available(): - raise ValueError("No GPU is available. Please add the -cpu flag to run on CPU.") - if hasattr(args, 'use_gpu'): + if hasattr(args, "use_cpu"): + if ( + args.use_cpu is not None + and not args.use_cpu + and not torch.cuda.is_available() + ): + raise ValueError( + "No GPU is available. Please add the -cpu flag to run on CPU." + ) + if hasattr(args, "use_gpu"): if args.use_gpu and torch.cuda.is_available(): - raise ValueError("No GPU is available. Please disable -gpu flag to run on CPU.") + raise ValueError( + "No GPU is available. Please disable -gpu flag to run on CPU." 
+ ) if not args.task: parser.print_help() @@ -26,5 +36,5 @@ def main(): args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_cli.py b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_cli.py index e80382228..d0ac810f7 100644 --- a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_cli.py +++ b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_cli.py @@ -32,9 +32,10 @@ def define_options(self): def run_command(self, args): """Run the pipeline with defined args.""" + from clinica.utils.ux import print_crash_files_and_exit, print_end_pipeline from networkx import Graph + from .t1_extensive_pipeline import T1Extensive - from clinica.utils.ux import print_end_pipeline, print_crash_files_and_exit pipeline = T1Extensive( caps_directory=self.absolute_path(args.caps_directory), diff --git a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py index f10e259ca..7814b2819 100644 --- a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py +++ b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_pipeline.py @@ -19,6 +19,7 @@ class T1Extensive(cpe.Pipeline): @staticmethod def get_processed_images(caps_directory, subjects, sessions): import os + from clinica.utils.filemanip import extract_image_ids from clinica.utils.inputs import clinica_file_reader @@ -65,10 +66,11 @@ def get_output_fields(self): def build_input_node(self): """Build and connect an input node to the pipeline.""" import os - import nipype.pipeline.engine as npe + import nipype.interfaces.utility as nutil - from clinica.utils.inputs import clinica_file_reader + import nipype.pipeline.engine as npe from clinica.utils.exceptions import ClinicaException + from clinica.utils.inputs import clinica_file_reader from clinica.utils.stream import cprint from clinica.utils.ux import 
print_images_to_process @@ -121,10 +123,10 @@ def build_input_node(self): def build_output_node(self): """Build and connect an output node to the pipeline.""" + import nipype.interfaces.io as nio import nipype.interfaces.utility as nutil import nipype.pipeline.engine as npe - import nipype.interfaces.io as nio - from clinica.utils.nipype import fix_join, container_from_filename + from clinica.utils.nipype import container_from_filename, fix_join # Find container path from filename # ================================= @@ -156,13 +158,15 @@ def build_output_node(self): def build_core_nodes(self): """Build and connect the core nodes of the pipeline.""" import os - import nipype.pipeline.engine as npe + import nipype.interfaces.utility as nutil + import nipype.pipeline.engine as npe from clinica.utils.inputs import RemoteFileStructure + from .t1_extensive_utils import ( - get_caps_filename, apply_binary_mask, - get_file_from_server + get_caps_filename, + get_file_from_server, ) # Get CAPS Filename diff --git a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_utils.py b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_utils.py index ce054d903..ef6456bf3 100644 --- a/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_utils.py +++ b/clinicadl/clinicadl/preprocessing/t1_extensive/t1_extensive_utils.py @@ -37,6 +37,7 @@ def apply_binary_mask(input_img, binary_img, output_filename): input_img*binary_img """ import os + import nibabel as nib original_image = nib.load(input_img) @@ -68,8 +69,9 @@ def get_file_from_server(remote_file, cache_path=None): """ import os from pathlib import Path - from clinica.utils.stream import cprint + from clinica.utils.inputs import fetch_file + from clinica.utils.stream import cprint home = str(Path.home()) if cache_path: diff --git a/clinicadl/clinicadl/quality_check/t1_linear/quality_check.py b/clinicadl/clinicadl/quality_check/t1_linear/quality_check.py index 851a8312e..0dbbb5292 100755 --- 
a/clinicadl/clinicadl/quality_check/t1_linear/quality_check.py +++ b/clinicadl/clinicadl/quality_check/t1_linear/quality_check.py @@ -2,45 +2,51 @@ This file contains all methods needed to perform the quality check procedure after t1-linear preprocessing. """ from os import makedirs -from os.path import dirname, join, exists, splitext, abspath +from os.path import abspath, dirname, exists, join, splitext from pathlib import Path import pandas as pd import torch +from clinica.utils.inputs import RemoteFileStructure, fetch_file from torch.utils.data import DataLoader -from .utils import QCDataset, resnet_qc_18 -from clinica.utils.inputs import fetch_file, RemoteFileStructure from ...tools.data.utils import load_and_check_tsv from ...tools.deep_learning.data import MRIDataset +from .utils import QCDataset, resnet_qc_18 -def quality_check(caps_dir, output_path, - tsv_path=None, threshold=0.5, - batch_size=1, num_workers=0, gpu=True): +def quality_check( + caps_dir, + output_path, + tsv_path=None, + threshold=0.5, + batch_size=1, + num_workers=0, + gpu=True, +): if splitext(output_path)[1] != ".tsv": raise ValueError("Please provide an output path to a tsv file") # Fetch QC model home = str(Path.home()) - cache_clinicadl = join(home, '.cache', 'clinicadl', 'models') - url_aramis = 'https://aramislab.paris.inria.fr/files/data/models/dl/qc/' + cache_clinicadl = join(home, ".cache", "clinicadl", "models") + url_aramis = "https://aramislab.paris.inria.fr/files/data/models/dl/qc/" FILE1 = RemoteFileStructure( - filename='resnet18.pth.tar', + filename="resnet18.pth.tar", url=url_aramis, - checksum='a97a781be3820b06424fe891ec405c78b87ad51a27b6b81614dbdb996ce60104' + checksum="a97a781be3820b06424fe891ec405c78b87ad51a27b6b81614dbdb996ce60104", ) makedirs(cache_clinicadl, exist_ok=True) model_file = join(cache_clinicadl, FILE1.filename) - if not(exists(model_file)): + if not (exists(model_file)): try: model_file = fetch_file(FILE1, cache_clinicadl) except IOError as err: - 
print('Unable to download required model for QC process:', err) + print("Unable to download required model for QC process:", err) # Load QC model model = resnet_qc_18() @@ -57,27 +63,31 @@ def quality_check(caps_dir, output_path, dataset = QCDataset(caps_dir, df) dataloader = DataLoader( - dataset, - num_workers=num_workers, - batch_size=batch_size, - pin_memory=True + dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True ) - columns = ['participant_id', 'session_id', 'pass_probability', 'pass'] + columns = ["participant_id", "session_id", "pass_probability", "pass"] qc_df = pd.DataFrame(columns=columns) softmax = torch.nn.Softmax(dim=1) for data in dataloader: - inputs = data['image'] + inputs = data["image"] if gpu: inputs = inputs.cuda() outputs = softmax.forward(model(inputs)) - for idx, sub in enumerate(data['participant_id']): + for idx, sub in enumerate(data["participant_id"]): pass_probability = outputs[idx, 1].item() - row = [[sub, data['session_id'][idx], pass_probability, pass_probability > threshold]] + row = [ + [ + sub, + data["session_id"][idx], + pass_probability, + pass_probability > threshold, + ] + ] row_df = pd.DataFrame(row, columns=columns) qc_df = qc_df.append(row_df) qc_df.sort_values("pass_probability", ascending=False, inplace=True) - qc_df.to_csv(output_path, sep='\t', index=False) + qc_df.to_csv(output_path, sep="\t", index=False) diff --git a/clinicadl/clinicadl/quality_check/t1_linear/utils.py b/clinicadl/clinicadl/quality_check/t1_linear/utils.py index c191463a7..8f6db6494 100644 --- a/clinicadl/clinicadl/quality_check/t1_linear/utils.py +++ b/clinicadl/clinicadl/quality_check/t1_linear/utils.py @@ -2,19 +2,21 @@ Copied from https://github.com/vfonov/deep-qc/blob/master/python/model/resnet_qc.py """ -import torch.nn as nn -from torch.utils.data import Dataset -import nibabel as nib from os import path + +import nibabel as nib import torch +import torch.nn as nn +from torch.utils.data import Dataset from 
clinicadl.tools.deep_learning.data import FILENAME_TYPE def conv3x3(in_planes, out_planes, stride=1): """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=1, bias=False) + return nn.Conv2d( + in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False + ) def conv1x1(in_planes, out_planes, stride=1): @@ -55,15 +57,17 @@ def forward(self, x): class ResNetQC(nn.Module): - - def __init__(self, block, layers, num_classes=2, use_ref=False, zero_init_residual=False): + def __init__( + self, block, layers, num_classes=2, use_ref=False, zero_init_residual=False + ): super(ResNetQC, self).__init__() self.inplanes = 64 self.use_ref = use_ref self.feat = 3 self.expansion = block.expansion - self.conv1 = nn.Conv2d(2 if self.use_ref else 1, 64, kernel_size=7, stride=2, padding=3, - bias=False) + self.conv1 = nn.Conv2d( + 2 if self.use_ref else 1, 64, kernel_size=7, stride=2, padding=3, bias=False + ) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) @@ -74,11 +78,19 @@ def __init__(self, block, layers, num_classes=2, use_ref=False, zero_init_residu # for merging multiple features self.addon = nn.Sequential( - nn.Conv2d(self.feat * 512 * block.expansion, 512 * block.expansion, kernel_size=1, stride=1, padding=0, - bias=True), + nn.Conv2d( + self.feat * 512 * block.expansion, + 512 * block.expansion, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ), nn.BatchNorm2d(512 * block.expansion), nn.ReLU(inplace=True), - nn.Conv2d(512 * block.expansion, 32, kernel_size=1, stride=1, padding=0, bias=True), + nn.Conv2d( + 512 * block.expansion, 32, kernel_size=1, stride=1, padding=0, bias=True + ), nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.Conv2d(32, 32, kernel_size=7, stride=1, padding=0, bias=True), @@ -88,12 +100,12 @@ def __init__(self, block, layers, num_classes=2, use_ref=False, zero_init_residu nn.BatchNorm2d(32), 
nn.ReLU(inplace=False), nn.Dropout2d(p=0.5, inplace=True), - nn.Conv2d(32, num_classes, kernel_size=1, stride=1, padding=0, bias=True) + nn.Conv2d(32, num_classes, kernel_size=1, stride=1, padding=0, bias=True), ) # initialization for m in self.modules(): if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) @@ -165,9 +177,13 @@ def __init__(self, img_dir, data_df, use_extracted_tensors=False): self.df = data_df self.use_extracted_tensors = use_extracted_tensors - if ('session_id' not in list(self.df.columns.values)) or ('participant_id' not in list(self.df.columns.values)): - raise Exception("the data file is not in the correct format." - "Columns should include ['participant_id', 'session_id']") + if ("session_id" not in list(self.df.columns.values)) or ( + "participant_id" not in list(self.df.columns.values) + ): + raise Exception( + "the data file is not in the correct format." 
+ "Columns should include ['participant_id', 'session_id']" + ) self.normalization = MinMaxNormalization() @@ -175,24 +191,37 @@ def __len__(self): return len(self.df) def __getitem__(self, idx): - subject = self.df.loc[idx, 'participant_id'] - session = self.df.loc[idx, 'session_id'] + subject = self.df.loc[idx, "participant_id"] + session = self.df.loc[idx, "session_id"] if self.use_extracted_tensors: - image_path = path.join(self.img_dir, 'subjects', subject, session, 'deeplearning_prepare_data', - 'image_based', 't1_linear', - '%s_%s%s.pt' % (subject, session, FILENAME_TYPE["full"])) + image_path = path.join( + self.img_dir, + "subjects", + subject, + session, + "deeplearning_prepare_data", + "image_based", + "t1_linear", + "%s_%s%s.pt" % (subject, session, FILENAME_TYPE["full"]), + ) image = torch.load(image_path) image = self.pt_transform(image) else: - image_path = path.join(self.img_dir, 'subjects', subject, session, 't1_linear', - '%s_%s%s.nii.gz' % (subject, session, FILENAME_TYPE["full"])) + image_path = path.join( + self.img_dir, + "subjects", + subject, + session, + "t1_linear", + "%s_%s%s.nii.gz" % (subject, session, FILENAME_TYPE["full"]), + ) image = nib.load(image_path) image = self.nii_transform(image) - sample = {'image': image, 'participant_id': subject, 'session_id': session} + sample = {"image": image, "participant_id": subject, "session_id": session} return sample @@ -212,13 +241,15 @@ def nii_transform(image): input_images = [ sample[:, :, int(sz[2] / 2)], sample[int(sz[0] / 2), :, :], - sample[:, int(sz[1] / 2), :] + sample[:, int(sz[1] / 2), :], ] output_images = [ - np.zeros((224, 224),), + np.zeros( + (224, 224), + ), + np.zeros((224, 224)), np.zeros((224, 224)), - np.zeros((224, 224)) ] # flip, resize and crop @@ -230,28 +261,36 @@ def nii_transform(image): if len(input_images[i].shape) == 3: slice = np.reshape( - input_images[i], (input_images[i].shape[0], input_images[i].shape[1])) + input_images[i], + (input_images[i].shape[0], 
input_images[i].shape[1]), + ) else: slice = input_images[i] _scale = min(256.0 / slice.shape[0], 256.0 / slice.shape[1]) # slice[::-1, :] is to flip the first axis of image slice = transform.rescale( - slice[::-1, :], _scale, mode='constant', clip=False) + slice[::-1, :], _scale, mode="constant", clip=False + ) sz = slice.shape # pad image - dummy = np.zeros((256, 256),) - dummy[int((256 - sz[0]) / 2): int((256 - sz[0]) / 2) + sz[0], - int((256 - sz[1]) / 2): int((256 - sz[1]) / 2) + sz[1]] = slice + dummy = np.zeros( + (256, 256), + ) + dummy[ + int((256 - sz[0]) / 2) : int((256 - sz[0]) / 2) + sz[0], + int((256 - sz[1]) / 2) : int((256 - sz[1]) / 2) + sz[1], + ] = slice # rotate and flip the image back to the right direction for each view, if the MRI was read by nibabel # it seems that this will rotate the image 90 degree with # counter-clockwise direction and then flip it horizontally - output_images[i] = np.flip( - np.rot90(dummy[16:240, 16:240]), axis=1).copy() + output_images[i] = np.flip(np.rot90(dummy[16:240, 16:240]), axis=1).copy() - return torch.cat([torch.from_numpy(i).float().unsqueeze_(0) for i in output_images]).unsqueeze_(0) + return torch.cat( + [torch.from_numpy(i).float().unsqueeze_(0) for i in output_images] + ).unsqueeze_(0) def pt_transform(self, image): from torch.nn.functional import interpolate, pad @@ -262,7 +301,7 @@ def pt_transform(self, image): input_images = [ image[:, :, int(sz[2] / 2)], image[int(sz[0] / 2), :, :], - image[:, int(sz[1] / 2), :] + image[:, int(sz[1] / 2), :], ] output_images = list() @@ -272,7 +311,9 @@ def pt_transform(self, image): scale = min(256.0 / slice.shape[0], 256.0 / slice.shape[1]) # slice[::-1, :] is to flip the first axis of image - slice = interpolate(torch.flip(slice, (0,)).unsqueeze(0).unsqueeze(0), scale_factor=scale) + slice = interpolate( + torch.flip(slice, (0,)).unsqueeze(0).unsqueeze(0), scale_factor=scale + ) slice = slice[0, 0, :, :] padding = self.get_padding(slice) @@ -281,9 +322,18 @@ def 
pt_transform(self, image): # rotate and flip the image back to the right direction for each view, if the MRI was read by nibabel # it seems that this will rotate the image 90 degree with # counter-clockwise direction and then flip it horizontally - output_images.append(torch.flip(torch.rot90(slice[16:240, 16:240], 1, [0, 1]), [1, ]).clone()) + output_images.append( + torch.flip( + torch.rot90(slice[16:240, 16:240], 1, [0, 1]), + [ + 1, + ], + ).clone() + ) - return torch.cat([image.float().unsqueeze_(0) for image in output_images]).unsqueeze_(0) + return torch.cat( + [image.float().unsqueeze_(0) for image in output_images] + ).unsqueeze_(0) @staticmethod def get_padding(image): diff --git a/clinicadl/clinicadl/quality_check/t1_volume/quality_check.py b/clinicadl/clinicadl/quality_check/t1_volume/quality_check.py index b11a78818..33949ec1f 100644 --- a/clinicadl/clinicadl/quality_check/t1_volume/quality_check.py +++ b/clinicadl/clinicadl/quality_check/t1_volume/quality_check.py @@ -4,30 +4,43 @@ 2) percentage of non zero values < 15 % or > 50 % 3) frontal similarity of T1 volume with the template < 0.40 """ -import pandas as pd from os import path + +import pandas as pd + from .utils import extract_metrics def quality_check(caps_dir, output_dir, group_label): extract_metrics(caps_dir=caps_dir, output_dir=output_dir, group_label=group_label) - qc_df = pd.read_csv(path.join(output_dir, 'QC_metrics.tsv'), sep='\t') + qc_df = pd.read_csv(path.join(output_dir, "QC_metrics.tsv"), sep="\t") rejection1_df = qc_df[qc_df.max_intensity > 0.95] rejection1_df.to_csv( - path.join(output_dir, 'pass_step-1.tsv'), sep='\t', index=False) - print("Number of sessions removed based on max intensity: %i" - % (len(qc_df) - len(rejection1_df))) + path.join(output_dir, "pass_step-1.tsv"), sep="\t", index=False + ) + print( + "Number of sessions removed based on max intensity: %i" + % (len(qc_df) - len(rejection1_df)) + ) - rejection2_df = rejection1_df[(rejection1_df.non_zero_percentage < 
0.5) & - (rejection1_df.non_zero_percentage > 0.15)] + rejection2_df = rejection1_df[ + (rejection1_df.non_zero_percentage < 0.5) + & (rejection1_df.non_zero_percentage > 0.15) + ] rejection2_df.to_csv( - path.join(output_dir, 'pass_step-2.tsv'), sep='\t', index=False) - print("Number of sessions removed based on non-zero voxels: %i" - % (len(rejection1_df) - len(rejection2_df))) + path.join(output_dir, "pass_step-2.tsv"), sep="\t", index=False + ) + print( + "Number of sessions removed based on non-zero voxels: %i" + % (len(rejection1_df) - len(rejection2_df)) + ) rejection3_df = rejection2_df[rejection2_df.frontal_similarity > 0.10] rejection3_df.to_csv( - path.join(output_dir, 'pass_step-3.tsv'), sep='\t', index=False) - print("Number of sessions removed based on frontal similarity with DARTEL template: %i" - % (len(rejection2_df) - len(rejection3_df))) + path.join(output_dir, "pass_step-3.tsv"), sep="\t", index=False + ) + print( + "Number of sessions removed based on frontal similarity with DARTEL template: %i" + % (len(rejection2_df) - len(rejection3_df)) + ) diff --git a/clinicadl/clinicadl/quality_check/t1_volume/utils.py b/clinicadl/clinicadl/quality_check/t1_volume/utils.py index ecc13047f..4576f3be8 100644 --- a/clinicadl/clinicadl/quality_check/t1_volume/utils.py +++ b/clinicadl/clinicadl/quality_check/t1_volume/utils.py @@ -1,14 +1,14 @@ """ Produces a tsv file to study all the nii files and perform the quality check. 
""" -from os import path -import numpy as np -import pandas as pd -import nibabel as nib import os +from os import path from pathlib import Path -from clinica.utils.inputs import fetch_file, RemoteFileStructure +import nibabel as nib +import numpy as np +import pandas as pd +from clinica.utils.inputs import RemoteFileStructure, fetch_file def extract_metrics(caps_dir, output_dir, group_label): @@ -17,52 +17,71 @@ def extract_metrics(caps_dir, output_dir, group_label): # Load eyes segmentation home = str(Path.home()) - cache_clinicadl = path.join(home, '.cache', 'clinicadl', 'segmentation') - url_aramis = 'https://aramislab.paris.inria.fr/files/data/template/' + cache_clinicadl = path.join(home, ".cache", "clinicadl", "segmentation") + url_aramis = "https://aramislab.paris.inria.fr/files/data/template/" FILE1 = RemoteFileStructure( - filename='eyes_segmentation.nii.gz', + filename="eyes_segmentation.nii.gz", url=url_aramis, - checksum='56f699c06cafc62ad8bb5b41b188c7c412d684d810a11d6f4cbb441c0ce944ee' + checksum="56f699c06cafc62ad8bb5b41b188c7c412d684d810a11d6f4cbb441c0ce944ee", ) - if not(path.exists(cache_clinicadl)): + if not (path.exists(cache_clinicadl)): os.makedirs(cache_clinicadl) segmentation_file = path.join(cache_clinicadl, FILE1.filename) - if not(path.exists(segmentation_file)): + if not (path.exists(segmentation_file)): try: segmentation_file = fetch_file(FILE1, cache_clinicadl) except IOError as err: - raise IOError( - 'Unable to download required eyes segmentation for QC:', err) + raise IOError("Unable to download required eyes segmentation for QC:", err) segmentation_nii = nib.load(segmentation_file) segmentation_np = segmentation_nii.get_fdata() # Get the GM template template_path = path.join( - caps_dir, 'groups', f'group-{group_label}', 't1', f'group-{group_label}_template.nii.gz') + caps_dir, + "groups", + f"group-{group_label}", + "t1", + f"group-{group_label}_template.nii.gz", + ) template_nii = nib.load(template_path) template_np = 
template_nii.get_fdata() template_np = np.sum(template_np, axis=3) template_segmentation_np = template_np * segmentation_np # Get the data - filename = path.join(output_dir, 'QC_metrics.tsv') - columns = ['participant_id', 'session_id', 'max_intensity', - 'non_zero_percentage', 'frontal_similarity'] + filename = path.join(output_dir, "QC_metrics.tsv") + columns = [ + "participant_id", + "session_id", + "max_intensity", + "non_zero_percentage", + "frontal_similarity", + ] results_df = pd.DataFrame() - subjects = os.listdir(path.join(caps_dir, 'subjects')) + subjects = os.listdir(path.join(caps_dir, "subjects")) subjects = [subject for subject in subjects if subject[:4:] == "sub-"] for subject in subjects: - subject_path = path.join(caps_dir, 'subjects', subject) + subject_path = path.join(caps_dir, "subjects", subject) sessions = os.listdir(subject_path) sessions = [session for session in sessions if session[:4:] == "ses-"] for session in sessions: - image_path = path.join(subject_path, session, 't1', 'spm', 'segmentation', 'normalized_space', - subject + '_' + session + '_T1w_segm-graymatter_space-Ixi549Space_modulated-off_probability.nii.gz') + image_path = path.join( + subject_path, + session, + "t1", + "spm", + "segmentation", + "normalized_space", + subject + + "_" + + session + + "_T1w_segm-graymatter_space-Ixi549Space_modulated-off_probability.nii.gz", + ) if path.exists(image_path): # GM analysis @@ -70,30 +89,40 @@ def extract_metrics(caps_dir, output_dir, group_label): image_np = image_nii.get_fdata() image_segmentation_np = image_np * segmentation_np eyes_nmi_value = nmi( - occlusion1=template_segmentation_np, occlusion2=image_segmentation_np) - - non_zero_percentage = np.count_nonzero( - image_np) / image_np.size - - row = [[subject, session, np.max( - image_np), non_zero_percentage, eyes_nmi_value]] + occlusion1=template_segmentation_np, + occlusion2=image_segmentation_np, + ) + + non_zero_percentage = np.count_nonzero(image_np) / image_np.size + + row 
= [ + [ + subject, + session, + np.max(image_np), + non_zero_percentage, + eyes_nmi_value, + ] + ] row_df = pd.DataFrame(row, columns=columns) results_df = pd.concat([results_df, row_df]) - results_df.sort_values('max_intensity', inplace=True, ascending=True) - results_df.to_csv(filename, sep='\t', index=False) + results_df.sort_values("max_intensity", inplace=True, ascending=True) + results_df.to_csv(filename, sep="\t", index=False) def nmi(occlusion1, occlusion2): - """ Mutual information for joint histogram - """ + """Mutual information for joint histogram""" # Convert bins counts to probability values hist_inter, _, _ = np.histogram2d(occlusion1.ravel(), occlusion2.ravel()) hist1, _, _ = np.histogram2d(occlusion1.ravel(), occlusion1.ravel()) hist2, _, _ = np.histogram2d(occlusion2.ravel(), occlusion2.ravel()) - return 2 * _mutual_information(hist_inter) / ( - _mutual_information(hist1) + _mutual_information(hist2)) + return ( + 2 + * _mutual_information(hist_inter) + / (_mutual_information(hist1) + _mutual_information(hist2)) + ) def _mutual_information(hgram): diff --git a/clinicadl/clinicadl/resume/automatic_resume.py b/clinicadl/clinicadl/resume/automatic_resume.py index 410f4e94a..f5f0c917a 100644 --- a/clinicadl/clinicadl/resume/automatic_resume.py +++ b/clinicadl/clinicadl/resume/automatic_resume.py @@ -13,19 +13,15 @@ def replace_arg(options, key_name, value): setattr(options, key_name, value) -def automatic_resume(model_path, - gpu, - batch_size, - num_workers, - evaluation_steps, - verbose=0): - from ..tools.deep_learning.iotools import return_logger, read_json - from ..train.train_singleCNN import train_single_cnn - from ..train.train_multiCNN import train_multi_cnn +def automatic_resume( + model_path, gpu, batch_size, num_workers, evaluation_steps, verbose=0 +): + from ..tools.deep_learning.iotools import read_json, return_logger from ..train.train_autoencoder import train_autoencoder - - from .resume_single_CNN import resume_single_cnn + from 
..train.train_multiCNN import train_multi_cnn + from ..train.train_singleCNN import train_single_cnn from .resume_autoencoder import resume_autoencoder + from .resume_single_CNN import resume_single_cnn logger = return_logger(verbose=verbose, name_fn="automatic resume") @@ -44,11 +40,26 @@ def automatic_resume(model_path, # Set verbose options.verbose = verbose - fold_list = sorted([int(fold.split("-")[1]) for fold in os.listdir(options.model_path) if fold[:4:] == "fold"]) - finished_folds = [fold for fold in fold_list - if "cnn_classification" in os.listdir(path.join(options.model_path, f"fold-{fold}"))] - stopped_folds = [fold for fold in fold_list if fold not in finished_folds and - "checkpoint.pth.tar" in os.listdir(path.join(options.model_path, f"fold-{fold}", "models"))] + fold_list = sorted( + [ + int(fold.split("-")[1]) + for fold in os.listdir(options.model_path) + if fold[:4:] == "fold" + ] + ) + finished_folds = [ + fold + for fold in fold_list + if "cnn_classification" + in os.listdir(path.join(options.model_path, f"fold-{fold}")) + ] + stopped_folds = [ + fold + for fold in fold_list + if fold not in finished_folds + and "checkpoint.pth.tar" + in os.listdir(path.join(options.model_path, f"fold-{fold}", "models")) + ] if options.split is None: if options.n_splits is None: @@ -58,7 +69,11 @@ def automatic_resume(model_path, else: fold_iterator = options.split - absent_folds = [fold for fold in fold_iterator if fold not in finished_folds and fold not in stopped_folds] + absent_folds = [ + fold + for fold in fold_iterator + if fold not in finished_folds and fold not in stopped_folds + ] logger.info(f"Finished folds {finished_folds}") logger.info(f"Stopped folds {stopped_folds}") logger.info(f"Missing folds {absent_folds}") @@ -72,7 +87,9 @@ def automatic_resume(model_path, elif options.network_type == "autoencoder": resume_autoencoder(options, fold) else: - raise NotImplementedError(f'Resume function is not implemented for network type 
{options.network_type}') + raise NotImplementedError( + f"Resume function is not implemented for network type {options.network_type}" + ) if len(absent_folds) != 0: options.split = absent_folds @@ -83,4 +100,6 @@ def automatic_resume(model_path, elif options.network_type == "autoencoder": train_autoencoder(options, erase_existing=False) else: - raise NotImplementedError(f'Resume function is not implemented for network type {options.network_type}') + raise NotImplementedError( + f"Resume function is not implemented for network type {options.network_type}" + ) diff --git a/clinicadl/clinicadl/resume/resume_autoencoder.py b/clinicadl/clinicadl/resume/resume_autoencoder.py index 46bf52bef..6ce453691 100644 --- a/clinicadl/clinicadl/resume/resume_autoencoder.py +++ b/clinicadl/clinicadl/resume/resume_autoencoder.py @@ -1,14 +1,28 @@ # coding: utf8 from os import path + import torch from torch.utils.data import DataLoader -from ..tools.deep_learning.data import load_data, get_transforms, return_dataset, generate_sampler +from ..tools.deep_learning.autoencoder_utils import ( + get_criterion, + train, + visualize_image, +) +from ..tools.deep_learning.data import ( + generate_sampler, + get_transforms, + load_data, + return_dataset, +) +from ..tools.deep_learning.iotools import ( + commandline_to_json, + return_logger, + translate_parameters, + write_requirements_version, +) from ..tools.deep_learning.models import init_model, load_model, load_optimizer -from ..tools.deep_learning.iotools import return_logger, \ - commandline_to_json, write_requirements_version, translate_parameters -from ..tools.deep_learning.autoencoder_utils import train, get_criterion, visualize_image def resume_autoencoder(params, resumed_split): @@ -18,9 +32,11 @@ def resume_autoencoder(params, resumed_split): commandline_to_json(params, logger=main_logger) write_requirements_version(params.output_dir) params = translate_parameters(params) - train_transforms, all_transforms = 
get_transforms(params.mode, - minmaxnormalization=params.minmaxnormalization, - data_augmentation=params.data_augmentation) + train_transforms, all_transforms = get_transforms( + params.mode, + minmaxnormalization=params.minmaxnormalization, + data_augmentation=params.data_augmentation, + ) training_df, valid_df = load_data( params.tsv_path, @@ -29,15 +45,27 @@ def resume_autoencoder(params, resumed_split): n_splits=params.n_splits, baseline=params.baseline, logger=main_logger, - multi_cohort=params.multi_cohort + multi_cohort=params.multi_cohort, ) - data_train = return_dataset(params.mode, params.input_dir, training_df, params.preprocessing, - train_transformations=train_transforms, all_transformations=all_transforms, - params=params) - data_valid = return_dataset(params.mode, params.input_dir, valid_df, params.preprocessing, - train_transformations=train_transforms, all_transformations=all_transforms, - params=params) + data_train = return_dataset( + params.mode, + params.input_dir, + training_df, + params.preprocessing, + train_transformations=train_transforms, + all_transformations=all_transforms, + params=params, + ) + data_valid = return_dataset( + params.mode, + params.input_dir, + valid_df, + params.preprocessing, + train_transformations=train_transforms, + all_transformations=all_transforms, + params=params, + ) train_sampler = generate_sampler(data_train, params.sampler) @@ -46,7 +74,7 @@ def resume_autoencoder(params, resumed_split): batch_size=params.batch_size, sampler=train_sampler, num_workers=params.num_workers, - pin_memory=True + pin_memory=True, ) valid_loader = DataLoader( @@ -54,43 +82,68 @@ def resume_autoencoder(params, resumed_split): batch_size=params.batch_size, shuffle=False, num_workers=params.num_workers, - pin_memory=True + pin_memory=True, ) # Initialize the model - main_logger.info('Initialization of the model') + main_logger.info("Initialization of the model") decoder = init_model(params, initial_shape=data_train.size, 
autoencoder=True) model_dir = path.join(params.output_dir, f"fold-{resumed_split}", "models") - decoder, current_epoch = load_model(decoder, model_dir, params.gpu, 'checkpoint.pth.tar') + decoder, current_epoch = load_model( + decoder, model_dir, params.gpu, "checkpoint.pth.tar" + ) params.beginning_epoch = current_epoch + 1 # Define criterion and optimizer criterion = get_criterion(params.loss) - optimizer_path = path.join(params.output_dir, f"fold-{resumed_split}", "models", "optimizer.pth.tar") + optimizer_path = path.join( + params.output_dir, f"fold-{resumed_split}", "models", "optimizer.pth.tar" + ) optimizer = load_optimizer(optimizer_path, decoder) # Define output directories - log_dir = path.join( - params.output_dir, f'fold-{resumed_split}', 'tensorboard_logs') - model_dir = path.join( - params.output_dir, f'fold-{resumed_split}', 'models') + log_dir = path.join(params.output_dir, f"fold-{resumed_split}", "tensorboard_logs") + model_dir = path.join(params.output_dir, f"fold-{resumed_split}", "models") visualization_dir = path.join( - params.output_dir, f'fold-{resumed_split}', 'autoencoder_reconstruction') + params.output_dir, f"fold-{resumed_split}", "autoencoder_reconstruction" + ) - main_logger.debug('Beginning the training task') - train(decoder, train_loader, valid_loader, criterion, optimizer, False, - log_dir, model_dir, params, train_logger) + main_logger.debug("Beginning the training task") + train( + decoder, + train_loader, + valid_loader, + criterion, + optimizer, + False, + log_dir, + model_dir, + params, + train_logger, + ) if params.visualization: - best_decoder, _ = load_model(decoder, path.join(model_dir, "best_loss"), - params.gpu, filename='model_best.pth.tar') + best_decoder, _ = load_model( + decoder, + path.join(model_dir, "best_loss"), + params.gpu, + filename="model_best.pth.tar", + ) nb_images = data_train.size.elem_per_image if nb_images <= 2: nb_images *= 3 - visualize_image(best_decoder, valid_loader, 
path.join(visualization_dir, "validation"), - nb_images=nb_images) - visualize_image(best_decoder, train_loader, path.join(visualization_dir, "train"), - nb_images=nb_images) + visualize_image( + best_decoder, + valid_loader, + path.join(visualization_dir, "validation"), + nb_images=nb_images, + ) + visualize_image( + best_decoder, + train_loader, + path.join(visualization_dir, "train"), + nb_images=nb_images, + ) del decoder torch.cuda.empty_cache() diff --git a/clinicadl/clinicadl/resume/resume_single_CNN.py b/clinicadl/clinicadl/resume/resume_single_CNN.py index 43bfa6ecb..1d148a6b7 100644 --- a/clinicadl/clinicadl/resume/resume_single_CNN.py +++ b/clinicadl/clinicadl/resume/resume_single_CNN.py @@ -1,14 +1,24 @@ # coding: utf8 from os import path + from torch.utils.data import DataLoader -from ..train.train_singleCNN import test_single_cnn -from ..tools.deep_learning.data import load_data, get_transforms, return_dataset, generate_sampler +from ..tools.deep_learning.cnn_utils import get_criterion, train +from ..tools.deep_learning.data import ( + generate_sampler, + get_transforms, + load_data, + return_dataset, +) +from ..tools.deep_learning.iotools import ( + commandline_to_json, + return_logger, + translate_parameters, + write_requirements_version, +) from ..tools.deep_learning.models import init_model, load_model, load_optimizer -from ..tools.deep_learning.iotools import return_logger, \ - commandline_to_json, write_requirements_version, translate_parameters -from ..tools.deep_learning.cnn_utils import train, get_criterion +from ..train.train_singleCNN import test_single_cnn def resume_single_cnn(params, resumed_split): @@ -19,9 +29,11 @@ def resume_single_cnn(params, resumed_split): commandline_to_json(params, logger=main_logger) write_requirements_version(params.output_dir) params = translate_parameters(params) - train_transforms, all_transforms = get_transforms(params.mode, - minmaxnormalization=params.minmaxnormalization, - 
data_augmentation=params.data_augmentation) + train_transforms, all_transforms = get_transforms( + params.mode, + minmaxnormalization=params.minmaxnormalization, + data_augmentation=params.data_augmentation, + ) training_df, valid_df = load_data( params.tsv_path, @@ -30,15 +42,27 @@ def resume_single_cnn(params, resumed_split): n_splits=params.n_splits, baseline=params.baseline, logger=main_logger, - multi_cohort=params.multi_cohort + multi_cohort=params.multi_cohort, ) - data_train = return_dataset(params.mode, params.input_dir, training_df, params.preprocessing, - train_transformations=train_transforms, all_transformations=all_transforms, - params=params) - data_valid = return_dataset(params.mode, params.input_dir, valid_df, params.preprocessing, - train_transformations=train_transforms, all_transformations=all_transforms, - params=params) + data_train = return_dataset( + params.mode, + params.input_dir, + training_df, + params.preprocessing, + train_transformations=train_transforms, + all_transformations=all_transforms, + params=params, + ) + data_valid = return_dataset( + params.mode, + params.input_dir, + valid_df, + params.preprocessing, + train_transformations=train_transforms, + all_transformations=all_transforms, + params=params, + ) train_sampler = generate_sampler(data_train, params.sampler) @@ -47,7 +71,7 @@ def resume_single_cnn(params, resumed_split): batch_size=params.batch_size, sampler=train_sampler, num_workers=params.num_workers, - pin_memory=True + pin_memory=True, ) valid_loader = DataLoader( @@ -55,33 +79,67 @@ def resume_single_cnn(params, resumed_split): batch_size=params.batch_size, shuffle=False, num_workers=params.num_workers, - pin_memory=True + pin_memory=True, ) # Initialize the model - main_logger.info('Initialization of the model') - model = init_model(params, initial_shape=data_train.size, len_atlas=data_train.len_atlas()) + main_logger.info("Initialization of the model") + model = init_model( + params, 
initial_shape=data_train.size, len_atlas=data_train.len_atlas() + ) model_dir = path.join(params.output_dir, f"fold-{resumed_split}", "models") - model, current_epoch = load_model(model, model_dir, params.gpu, 'checkpoint.pth.tar') + model, current_epoch = load_model( + model, model_dir, params.gpu, "checkpoint.pth.tar" + ) params.beginning_epoch = current_epoch + 1 # Define criterion and optimizer criterion = get_criterion(params.loss) - optimizer_path = path.join(params.output_dir, f"fold-{resumed_split}", "models", "optimizer.pth.tar") + optimizer_path = path.join( + params.output_dir, f"fold-{resumed_split}", "models", "optimizer.pth.tar" + ) optimizer = load_optimizer(optimizer_path, model) # Define output directories - log_dir = path.join( - params.output_dir, f'fold-{resumed_split}', 'tensorboard_logs') - model_dir = path.join( - params.output_dir, f'fold-{resumed_split}', 'models') - - main_logger.debug('Beginning the training task') - train(model, train_loader, valid_loader, criterion, - optimizer, True, log_dir, model_dir, params, train_logger) - - test_single_cnn(model, params.output_dir, train_loader, "train", - resumed_split, criterion, params.mode, eval_logger, params.selection_threshold, gpu=params.gpu) - test_single_cnn(model, params.output_dir, valid_loader, "validation", - resumed_split, criterion, params.mode, eval_logger, params.selection_threshold, gpu=params.gpu) + log_dir = path.join(params.output_dir, f"fold-{resumed_split}", "tensorboard_logs") + model_dir = path.join(params.output_dir, f"fold-{resumed_split}", "models") + + main_logger.debug("Beginning the training task") + train( + model, + train_loader, + valid_loader, + criterion, + optimizer, + True, + log_dir, + model_dir, + params, + train_logger, + ) + + test_single_cnn( + model, + params.output_dir, + train_loader, + "train", + resumed_split, + criterion, + params.mode, + eval_logger, + params.selection_threshold, + gpu=params.gpu, + ) + test_single_cnn( + model, + 
params.output_dir, + valid_loader, + "validation", + resumed_split, + criterion, + params.mode, + eval_logger, + params.selection_threshold, + gpu=params.gpu, + ) diff --git a/clinicadl/clinicadl/tools/data/generate_data.py b/clinicadl/clinicadl/tools/data/generate_data.py index 7322e38d8..75a6c29f3 100644 --- a/clinicadl/clinicadl/tools/data/generate_data.py +++ b/clinicadl/clinicadl/tools/data/generate_data.py @@ -3,25 +3,40 @@ """ This file generates data for trivial or intractable (random) data for binary classification. """ -import pandas as pd -import numpy as np +import tarfile +from copy import copy +from os import makedirs +from os.path import exists, join + import nibabel as nib +import numpy as np +import pandas as pd import torch -from os.path import join, exists -from os import makedirs -from copy import copy +from clinica.utils.inputs import RemoteFileStructure, fetch_file -from clinica.utils.inputs import fetch_file, RemoteFileStructure -from .utils import im_loss_roi_gaussian_distribution, find_image_path, load_and_check_tsv, generate_shepplogan_phantom -from ..tsv.tsv_utils import extract_baseline -from ..deep_learning.data import MRIDataset -from clinicadl.tools.inputs.filename_types import FILENAME_TYPE from clinicadl.tools.deep_learning.iotools import check_and_clean, commandline_to_json -import tarfile - +from clinicadl.tools.inputs.filename_types import FILENAME_TYPE -def generate_random_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, mean=0, - sigma=0.5, preprocessing="t1-linear", multi_cohort=False): +from ..deep_learning.data import MRIDataset +from ..tsv.tsv_utils import extract_baseline +from .utils import ( + find_image_path, + generate_shepplogan_phantom, + im_loss_roi_gaussian_distribution, + load_and_check_tsv, +) + + +def generate_random_dataset( + caps_dir, + output_dir, + n_subjects, + tsv_path=None, + mean=0, + sigma=0.5, + preprocessing="t1-linear", + multi_cohort=False, +): """ Generates a random dataset. 
@@ -46,14 +61,16 @@ def generate_random_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, mea tsv file describing this output """ - commandline_to_json({ - "output_dir": output_dir, - "caps_dir": caps_dir, - "preprocessing": preprocessing, - "n_subjects": n_subjects, - "mean": mean, - "sigma": sigma - }) + commandline_to_json( + { + "output_dir": output_dir, + "caps_dir": caps_dir, + "preprocessing": preprocessing, + "n_subjects": n_subjects, + "mean": mean, + "sigma": sigma, + } + ) # Transform caps_dir in dict caps_dict = MRIDataset.create_caps_dict(caps_dir, multi_cohort=multi_cohort) @@ -61,35 +78,45 @@ def generate_random_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, mea data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir) # Create subjects dir - makedirs(join(output_dir, 'subjects'), exist_ok=True) + makedirs(join(output_dir, "subjects"), exist_ok=True) # Retrieve image of first subject - participant_id = data_df.loc[0, 'participant_id'] - session_id = data_df.loc[0, 'session_id'] - cohort = data_df.loc[0, 'cohort'] + participant_id = data_df.loc[0, "participant_id"] + session_id = data_df.loc[0, "session_id"] + cohort = data_df.loc[0, "cohort"] - image_path = find_image_path(caps_dict, participant_id, session_id, cohort, preprocessing) + image_path = find_image_path( + caps_dict, participant_id, session_id, cohort, preprocessing + ) image_nii = nib.load(image_path) image = image_nii.get_data() # Create output tsv file - participant_id_list = [f'sub-RAND{i}' for i in range(2 * n_subjects)] - session_id_list = ['ses-M00'] * 2 * n_subjects - diagnosis_list = ['AD'] * n_subjects + ['CN'] * n_subjects + participant_id_list = [f"sub-RAND{i}" for i in range(2 * n_subjects)] + session_id_list = ["ses-M00"] * 2 * n_subjects + diagnosis_list = ["AD"] * n_subjects + ["CN"] * n_subjects data = np.array([participant_id_list, session_id_list, diagnosis_list]) data = data.T - output_df = pd.DataFrame(data, columns=['participant_id', 'session_id', 
'diagnosis']) - output_df['age_bl'] = 60 - output_df['sex'] = 'F' - output_df.to_csv(join(output_dir, 'data.tsv'), sep='\t', index=False) + output_df = pd.DataFrame( + data, columns=["participant_id", "session_id", "diagnosis"] + ) + output_df["age_bl"] = 60 + output_df["sex"] = "F" + output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False) for i in range(2 * n_subjects): gauss = np.random.normal(mean, sigma, image.shape) - participant_id = f'sub-RAND{i}' + participant_id = f"sub-RAND{i}" noisy_image = image + gauss - noisy_image_nii = nib.Nifti1Image(noisy_image, header=image_nii.header, affine=image_nii.affine) - noisy_image_nii_path = join(output_dir, 'subjects', participant_id, 'ses-M00', 't1_linear') - noisy_image_nii_filename = participant_id + '_ses-M00' + FILENAME_TYPE['cropped'] + '.nii.gz' + noisy_image_nii = nib.Nifti1Image( + noisy_image, header=image_nii.header, affine=image_nii.affine + ) + noisy_image_nii_path = join( + output_dir, "subjects", participant_id, "ses-M00", "t1_linear" + ) + noisy_image_nii_filename = ( + participant_id + "_ses-M00" + FILENAME_TYPE["cropped"] + ".nii.gz" + ) makedirs(noisy_image_nii_path, exist_ok=True) nib.save(noisy_image_nii, join(noisy_image_nii_path, noisy_image_nii_filename)) @@ -101,11 +128,21 @@ def generate_random_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, mea session_df = output_df[output_df.session_id == session] out_df = copy(session_df[["participant_id"]]) out_df["synthetic"] = [1] * len(out_df) - out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"), sep="\t", index=False) - - -def generate_trivial_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, preprocessing="linear", - mask_path=None, atrophy_percent=60, multi_cohort=False): + out_df.to_csv( + join(missing_path, f"missing_mods_{session}.tsv"), sep="\t", index=False + ) + + +def generate_trivial_dataset( + caps_dir, + output_dir, + n_subjects, + tsv_path=None, + preprocessing="linear", + mask_path=None, + 
atrophy_percent=60, + multi_cohort=False, +): """ Generates a fully separable dataset. @@ -133,13 +170,15 @@ def generate_trivial_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, pr """ from pathlib import Path - commandline_to_json({ - "output_dir": output_dir, - "caps_dir": caps_dir, - "preprocessing": preprocessing, - "n_subjects": n_subjects, - "atrophy_percent": atrophy_percent - }) + commandline_to_json( + { + "output_dir": output_dir, + "caps_dir": caps_dir, + "preprocessing": preprocessing, + "n_subjects": n_subjects, + "atrophy_percent": atrophy_percent, + } + ) # Transform caps_dir in dict caps_dict = MRIDataset.create_caps_dict(caps_dir, multi_cohort=multi_cohort) @@ -149,44 +188,49 @@ def generate_trivial_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, pr data_df = extract_baseline(data_df, "None") home = str(Path.home()) - cache_clinicadl = join(home, '.cache', 'clinicadl', 'ressources', 'masks') - url_aramis = 'https://aramislab.paris.inria.fr/files/data/masks/' - FILE1 = RemoteFileStructure(filename='AAL2.tar.gz', - url=url_aramis, - checksum='89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471' - ) + cache_clinicadl = join(home, ".cache", "clinicadl", "ressources", "masks") + url_aramis = "https://aramislab.paris.inria.fr/files/data/masks/" + FILE1 = RemoteFileStructure( + filename="AAL2.tar.gz", + url=url_aramis, + checksum="89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471", + ) makedirs(cache_clinicadl, exist_ok=True) if n_subjects > len(data_df): - raise ValueError(f"The number of subjects {n_subjects} cannot be higher " - f"than the number of subjects in the baseline dataset of size {len(data_df)}") + raise ValueError( + f"The number of subjects {n_subjects} cannot be higher " + f"than the number of subjects in the baseline dataset of size {len(data_df)}" + ) if mask_path is None: - if not exists(join(cache_clinicadl, 'AAL2')): + if not exists(join(cache_clinicadl, "AAL2")): try: - print('Try to 
download AAL2 masks') + print("Try to download AAL2 masks") mask_path_tar = fetch_file(FILE1, cache_clinicadl) tar_file = tarfile.open(mask_path_tar) - print('File: ' + mask_path_tar) + print("File: " + mask_path_tar) try: tar_file.extractall(cache_clinicadl) tar_file.close() - mask_path = join(cache_clinicadl, 'AAL2') + mask_path = join(cache_clinicadl, "AAL2") except RuntimeError: - print('Unable to extract downloaded files') + print("Unable to extract downloaded files") except IOError as err: - print('Unable to download required templates:', err) - raise ValueError('''Unable to download masks, please download them + print("Unable to download required templates:", err) + raise ValueError( + """Unable to download masks, please download them manually at https://aramislab.paris.inria.fr/files/data/masks/ - and provide a valid path.''') + and provide a valid path.""" + ) else: - mask_path = join(cache_clinicadl, 'AAL2') + mask_path = join(cache_clinicadl, "AAL2") # Create subjects dir - makedirs(join(output_dir, 'subjects'), exist_ok=True) + makedirs(join(output_dir, "subjects"), exist_ok=True) # Output tsv file - columns = ['participant_id', 'session_id', 'diagnosis', 'age_bl', 'sex'] + columns = ["participant_id", "session_id", "diagnosis", "age_bl", "sex"] output_df = pd.DataFrame(columns=columns) diagnosis_list = ["AD", "CN"] @@ -197,28 +241,34 @@ def generate_trivial_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, pr participant_id = data_df.loc[data_idx, "participant_id"] session_id = data_df.loc[data_idx, "session_id"] cohort = data_df.loc[data_idx, "cohort"] - filename = f'sub-TRIV{i}_ses-M00' + FILENAME_TYPE['cropped'] + '.nii.gz' - path_image = join(output_dir, 'subjects', f'sub-TRIV{i}', 'ses-M00', 't1_linear') + filename = f"sub-TRIV{i}_ses-M00" + FILENAME_TYPE["cropped"] + ".nii.gz" + path_image = join( + output_dir, "subjects", f"sub-TRIV{i}", "ses-M00", "t1_linear" + ) makedirs(path_image, exist_ok=True) - image_path = 
find_image_path(caps_dict, participant_id, session_id, cohort, preprocessing) + image_path = find_image_path( + caps_dict, participant_id, session_id, cohort, preprocessing + ) image_nii = nib.load(image_path) image = image_nii.get_data() - atlas_to_mask = nib.load(join(mask_path, f'mask-{label + 1}.nii')).get_data() + atlas_to_mask = nib.load(join(mask_path, f"mask-{label + 1}.nii")).get_data() # Create atrophied image - trivial_image = im_loss_roi_gaussian_distribution(image, atlas_to_mask, atrophy_percent) + trivial_image = im_loss_roi_gaussian_distribution( + image, atlas_to_mask, atrophy_percent + ) trivial_image_nii = nib.Nifti1Image(trivial_image, affine=image_nii.affine) trivial_image_nii.to_filename(join(path_image, filename)) # Append row to output tsv - row = [f'sub-TRIV{i}', 'ses-M00', diagnosis_list[label], 60, 'F'] + row = [f"sub-TRIV{i}", "ses-M00", diagnosis_list[label], 60, "F"] row_df = pd.DataFrame([row], columns=columns) output_df = output_df.append(row_df) - output_df.to_csv(join(output_dir, 'data.tsv'), sep='\t', index=False) + output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False) missing_path = join(output_dir, "missing_mods") makedirs(missing_path, exist_ok=True) @@ -228,20 +278,25 @@ def generate_trivial_dataset(caps_dir, output_dir, n_subjects, tsv_path=None, pr session_df = output_df[output_df.session_id == session] out_df = copy(session_df[["participant_id"]]) out_df["synthetic"] = [1] * len(out_df) - out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"), sep="\t", index=False) + out_df.to_csv( + join(missing_path, f"missing_mods_{session}.tsv"), sep="\t", index=False + ) -def generate_shepplogan_dataset(output_dir, img_size, labels_distribution, - samples=100, smoothing=True): +def generate_shepplogan_dataset( + output_dir, img_size, labels_distribution, samples=100, smoothing=True +): check_and_clean(join(output_dir, "subjects")) - commandline_to_json({ - "output_dir": output_dir, - "img_size": img_size, - 
"labels_distribution": labels_distribution, - "samples": samples, - "smoothing": smoothing - }) + commandline_to_json( + { + "output_dir": output_dir, + "img_size": img_size, + "labels_distribution": labels_distribution, + "samples": samples, + "smoothing": smoothing, + } + ) columns = ["participant_id", "session_id", "diagnosis", "subtype"] data_df = pd.DataFrame(columns=columns) @@ -249,20 +304,30 @@ def generate_shepplogan_dataset(output_dir, img_size, labels_distribution, samples_per_subtype = np.array(labels_distribution[label]) * samples for subtype in range(len(samples_per_subtype)): for j in range(int(samples_per_subtype[subtype])): - participant_id = "sub-CLNC%i%04d" % (i, j + np.sum(samples_per_subtype[:subtype:]).astype(int)) + participant_id = "sub-CLNC%i%04d" % ( + i, + j + np.sum(samples_per_subtype[:subtype:]).astype(int), + ) session_id = "ses-M00" - row_df = pd.DataFrame([[participant_id, session_id, label, subtype]], - columns=columns) + row_df = pd.DataFrame( + [[participant_id, session_id, label, subtype]], columns=columns + ) data_df = data_df.append(row_df) # Image generation - path_out = join(output_dir, "subjects", "%s_%s%s.pt" - % (participant_id, session_id, FILENAME_TYPE["shepplogan"])) - img = generate_shepplogan_phantom(img_size, label=subtype, smoothing=smoothing) + path_out = join( + output_dir, + "subjects", + "%s_%s%s.pt" + % (participant_id, session_id, FILENAME_TYPE["shepplogan"]), + ) + img = generate_shepplogan_phantom( + img_size, label=subtype, smoothing=smoothing + ) torch_img = torch.from_numpy(img).float().unsqueeze(0) torch.save(torch_img, path_out) - data_df.to_csv(join(output_dir, 'data.tsv'), sep="\t", index=False) + data_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False) missing_path = join(output_dir, "missing_mods") if not exists(missing_path): @@ -273,20 +338,25 @@ def generate_shepplogan_dataset(output_dir, img_size, labels_distribution, session_df = data_df[data_df.session_id == session] out_df = 
copy(session_df[["participant_id"]]) out_df["t1w"] = [1] * len(out_df) - out_df.to_csv(join(missing_path, "missing_mods_%s.tsv" % session), sep="\t", index=False) + out_df.to_csv( + join(missing_path, "missing_mods_%s.tsv" % session), sep="\t", index=False + ) -def generate_shepplogan_dataset(output_dir, img_size, labels_distribution, - samples=100, smoothing=True): +def generate_shepplogan_dataset( + output_dir, img_size, labels_distribution, samples=100, smoothing=True +): check_and_clean(join(output_dir, "subjects")) - commandline_to_json({ - "output_dir": output_dir, - "img_size": img_size, - "labels_distribution": labels_distribution, - "samples": samples, - "smoothing": smoothing - }) + commandline_to_json( + { + "output_dir": output_dir, + "img_size": img_size, + "labels_distribution": labels_distribution, + "samples": samples, + "smoothing": smoothing, + } + ) columns = ["participant_id", "session_id", "diagnosis", "subtype"] data_df = pd.DataFrame(columns=columns) @@ -294,19 +364,28 @@ def generate_shepplogan_dataset(output_dir, img_size, labels_distribution, for j in range(samples): participant_id = "sub-CLNC%i%04d" % (i, j) session_id = "ses-M00" - subtype = np.random.choice(np.arange(len(labels_distribution[label])), p=labels_distribution[label]) - row_df = pd.DataFrame([[participant_id, session_id, label, subtype]], - columns=columns) + subtype = np.random.choice( + np.arange(len(labels_distribution[label])), p=labels_distribution[label] + ) + row_df = pd.DataFrame( + [[participant_id, session_id, label, subtype]], columns=columns + ) data_df = data_df.append(row_df) # Image generation - path_out = join(output_dir, "subjects", "%s_%s%s.pt" - % (participant_id, session_id, FILENAME_TYPE["shepplogan"])) - img = generate_shepplogan_phantom(img_size, label=subtype, smoothing=smoothing) + path_out = join( + output_dir, + "subjects", + "%s_%s%s.pt" + % (participant_id, session_id, FILENAME_TYPE["shepplogan"]), + ) + img = generate_shepplogan_phantom( + 
img_size, label=subtype, smoothing=smoothing + ) torch_img = torch.from_numpy(img).float().unsqueeze(0) torch.save(torch_img, path_out) - data_df.to_csv(join(output_dir, 'data.tsv'), sep="\t", index=False) + data_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False) missing_path = join(output_dir, "missing_mods") if not exists(missing_path): @@ -317,4 +396,6 @@ def generate_shepplogan_dataset(output_dir, img_size, labels_distribution, session_df = data_df[data_df.session_id == session] out_df = copy(session_df[["participant_id"]]) out_df["t1w"] = [1] * len(out_df) - out_df.to_csv(join(missing_path, "missing_mods_%s.tsv" % session), sep="\t", index=False) + out_df.to_csv( + join(missing_path, "missing_mods_%s.tsv" % session), sep="\t", index=False + ) diff --git a/clinicadl/clinicadl/tools/data/utils.py b/clinicadl/clinicadl/tools/data/utils.py index 1ba37d436..2f48ca331 100755 --- a/clinicadl/clinicadl/tools/data/utils.py +++ b/clinicadl/clinicadl/tools/data/utils.py @@ -1,43 +1,52 @@ # coding: utf8 -import numpy as np -from clinicadl.tools.inputs.filename_types import FILENAME_TYPE import random -from skimage.draw import ellipse + +import numpy as np from scipy.ndimage import gaussian_filter +from skimage.draw import ellipse + +from clinicadl.tools.inputs.filename_types import FILENAME_TYPE def load_and_check_tsv(tsv_path, caps_dict, output_path): - import pandas as pd from os.path import join + + import pandas as pd from clinica.iotools.utils.data_handling import create_subs_sess_list + from ..deep_learning.data import check_multi_cohort_tsv if tsv_path is not None: if len(caps_dict) == 1: - df = pd.read_csv(tsv_path, sep='\t') - if ('session_id' not in list(df.columns.values)) or ( - 'participant_id' not in list(df.columns.values)): - raise Exception("the data file is not in the correct format." 
- "Columns should include ['participant_id', 'session_id']") + df = pd.read_csv(tsv_path, sep="\t") + if ("session_id" not in list(df.columns.values)) or ( + "participant_id" not in list(df.columns.values) + ): + raise Exception( + "the data file is not in the correct format." + "Columns should include ['participant_id', 'session_id']" + ) else: - tsv_df = pd.read_csv(tsv_path, sep='\t') + tsv_df = pd.read_csv(tsv_path, sep="\t") check_multi_cohort_tsv(tsv_df, "labels") df = pd.DataFrame() for idx in range(len(tsv_df)): - cohort_name = tsv_df.loc[idx, 'cohort'] - cohort_path = tsv_df.loc[idx, 'path'] + cohort_name = tsv_df.loc[idx, "cohort"] + cohort_path = tsv_df.loc[idx, "path"] cohort_df = pd.read_csv(cohort_path, sep="\t") - cohort_df['cohort'] = cohort_name + cohort_df["cohort"] = cohort_name df = pd.concat([df, cohort_df]) else: df = pd.DataFrame() for cohort, caps_path in caps_dict.items(): - create_subs_sess_list(caps_path, output_path, - is_bids_dir=False, use_session_tsv=False) + create_subs_sess_list( + caps_path, output_path, is_bids_dir=False, use_session_tsv=False + ) cohort_df = pd.read_csv( - join(output_path, 'subjects_sessions_list.tsv'), sep="\t") - cohort_df['cohort'] = cohort + join(output_path, "subjects_sessions_list.tsv"), sep="\t" + ) + cohort_df["cohort"] = cohort df = pd.concat([df, cohort_df]) return df @@ -47,22 +56,37 @@ def find_image_path(caps_dict, participant_id, session_id, cohort, preprocessing from os import path if cohort not in caps_dict.keys(): - raise ValueError('Cohort names in labels and CAPS definitions do not match.') + raise ValueError("Cohort names in labels and CAPS definitions do not match.") if preprocessing == "t1-linear": - image_path = path.join(caps_dict[cohort], 'subjects', participant_id, session_id, - 't1_linear', - participant_id + '_' + session_id + - FILENAME_TYPE['cropped'] + '.nii.gz') + image_path = path.join( + caps_dict[cohort], + "subjects", + participant_id, + session_id, + "t1_linear", + 
participant_id + "_" + session_id + FILENAME_TYPE["cropped"] + ".nii.gz", + ) elif preprocessing == "t1-extensive": - image_path = path.join(caps_dict[cohort], 'subjects', participant_id, session_id, - 't1', 'spm', 'segmentation', 'normalized_space', - participant_id + '_' + session_id + - FILENAME_TYPE['skull_stripped'] + '.nii.gz') + image_path = path.join( + caps_dict[cohort], + "subjects", + participant_id, + session_id, + "t1", + "spm", + "segmentation", + "normalized_space", + participant_id + + "_" + + session_id + + FILENAME_TYPE["skull_stripped"] + + ".nii.gz", + ) else: raise ValueError( - "Preprocessing %s must be in ['t1-linear', 't1-extensive']." % - preprocessing) + "Preprocessing %s must be in ['t1-linear', 't1-extensive']." % preprocessing + ) return image_path @@ -73,7 +97,7 @@ def binary_t1_pgm(im_data): :return: binarized probability gray maps """ m = im_data > 0.0 - m = m.astype('float32') + m = m.astype("float32") return m @@ -96,8 +120,7 @@ def im_loss_roi_gaussian_distribution(im_data, atlas_to_mask, min_value): n_diff = n_new * 10 + min_value for i, coord in enumerate(coordinates): x, y, z = coord - gm_loss[x, y, z] = gm_masked[x, y, z] - \ - n_diff[i] * (gm_masked[x, y, z]) / 100 + gm_loss[x, y, z] = gm_masked[x, y, z] - n_diff[i] * (gm_masked[x, y, z]) / 100 normal_region = np.array(im_data, copy=True) normal_region[atlas_to_mask > 0] = 0 @@ -115,28 +138,23 @@ def find_borders_of_one_roi(output_path, i, tsv_atlas): :param tsv_atlas: tsv atlas :return: mask with 1 where there are the borders of roi 'i' with the other regions """ - import nibabel as nib import os + import nibabel as nib + total_mask = [] border_1 = nib.load( - os.path.join( - output_path, - 'borders-' + - str(i) + - '.nii')).get_data() + os.path.join(output_path, "borders-" + str(i) + ".nii") + ).get_data() for j in set(tsv_atlas.label): if j not in [0, i]: border_2 = nib.load( - os.path.join( - output_path, - 'borders-' + - str(j) + - '.nii')).get_data() + 
os.path.join(output_path, "borders-" + str(j) + ".nii") + ).get_data() image_sum = border_1 + border_2 mask = image_sum > 1 - mask = mask.astype('float32') + mask = mask.astype("float32") total_mask.append(mask) first_mask = total_mask[0] @@ -158,10 +176,11 @@ def find_border_of_prob_gm_and_atlas(output_path, i, tsv_atlas, im_data): """ mask_of_ROI_AAL2 = find_borders_of_one_roi(output_path, i, tsv_atlas) gm = binary_t1_pgm(im_data) - final_mask_for_gaussian = mask_of_ROI_AAL2 + \ - gm # AAL2 borders between the ROIS + T1w GM + final_mask_for_gaussian = ( + mask_of_ROI_AAL2 + gm + ) # AAL2 borders between the ROIS + T1w GM f_g = final_mask_for_gaussian > 1 - f_g = f_g.astype('float32') + f_g = f_g.astype("float32") idx = np.nonzero(f_g) coordinates = [] @@ -177,12 +196,13 @@ def generate_scales(size): return random.uniform(0.8, 0.9), random.uniform(0.8, 0.9) else: raise NotImplementedError( - "Size %s was not implemented for variable sizes." % size) + "Size %s was not implemented for variable sizes." % size + ) def generate_shepplogan_phantom(img_size, label=0, smoothing=True): img = np.zeros((img_size, img_size)) - center = (img_size + 1.) 
/ 2.0 + center = (img_size + 1.0) / 2.0 a = center - 2 b = center * 2 / 3 - 2 @@ -203,8 +223,9 @@ def generate_shepplogan_phantom(img_size, label=0, smoothing=True): # Brain offset = random.uniform(1, img_size / 32) - rr, cc = ellipse(center + offset / 2, center, a - offset, - b - offset, (img_size, img_size)) + rr, cc = ellipse( + center + offset / 2, center, a - offset, b - offset, (img_size, img_size) + ) img[rr, cc] = 0.2 # Central @@ -212,9 +233,14 @@ def generate_shepplogan_phantom(img_size, label=0, smoothing=True): offset2 = random.uniform(1, img_size / 32) scale1, scale2 = generate_scales("large") phi = random.uniform(-np.pi, np.pi) - rr, cc = ellipse(center + offset1, center + offset2, - b / 6 * scale1, b / 6 * scale2, - (img_size, img_size), rotation=phi) + rr, cc = ellipse( + center + offset1, + center + offset2, + b / 6 * scale1, + b / 6 * scale2, + (img_size, img_size), + rotation=phi, + ) img[rr, cc] = color # ROI 1 @@ -222,9 +248,14 @@ def generate_shepplogan_phantom(img_size, label=0, smoothing=True): offset2 = random.uniform(1, img_size / 32) scale1, scale2 = generate_scales(roi1) phi = random.uniform(-np.pi, np.pi) - rr, cc = ellipse(center * 0.6 + offset1, center + offset2, - b / 3 * scale1, b / 4 * scale2, - (img_size, img_size), rotation=phi) + rr, cc = ellipse( + center * 0.6 + offset1, + center + offset2, + b / 3 * scale1, + b / 4 * scale2, + (img_size, img_size), + rotation=phi, + ) img[rr, cc] = color # ROI 2 @@ -232,48 +263,74 @@ def generate_shepplogan_phantom(img_size, label=0, smoothing=True): offset2 = random.uniform(1, img_size / 32) scale1, scale2 = generate_scales(roi2) phi = random.uniform(-np.pi, np.pi) - rr, cc = ellipse(center * 1.5 + offset1, center + offset2, - b / 10 * scale1, b / 10 * scale2, - (img_size, img_size), rotation=phi) + rr, cc = ellipse( + center * 1.5 + offset1, + center + offset2, + b / 10 * scale1, + b / 10 * scale2, + (img_size, img_size), + rotation=phi, + ) img[rr, cc] = color offset1 = random.uniform(1, 
img_size / 32) offset2 = random.uniform(1, img_size / 32) scale1, scale2 = generate_scales(roi2) phi = random.uniform(-np.pi, np.pi) - rr, cc = ellipse(center * 1.5 + offset1, center * 1.1 + offset2, - b / 10 * scale1, b / 10 * scale2, - (img_size, img_size), rotation=phi) + rr, cc = ellipse( + center * 1.5 + offset1, + center * 1.1 + offset2, + b / 10 * scale1, + b / 10 * scale2, + (img_size, img_size), + rotation=phi, + ) img[rr, cc] = color offset1 = random.uniform(1, img_size / 32) offset2 = random.uniform(1, img_size / 32) scale1, scale2 = generate_scales(roi2) phi = random.uniform(-np.pi, np.pi) - rr, cc = ellipse(center * 1.5 + offset1, center * 0.9 + offset2, - b / 10 * scale1, b / 10 * scale2, - (img_size, img_size), rotation=phi) + rr, cc = ellipse( + center * 1.5 + offset1, + center * 0.9 + offset2, + b / 10 * scale1, + b / 10 * scale2, + (img_size, img_size), + rotation=phi, + ) img[rr, cc] = color # Ventricle 1 a_roi = a * random.uniform(0.8, 1.2) phi = np.random.uniform(-np.pi / 16, np.pi / 16) - rr, cc = ellipse(center, center * 0.75, a_roi / 3, a_roi / 6, - (img_size, img_size), rotation=np.pi / 8 + phi) + rr, cc = ellipse( + center, + center * 0.75, + a_roi / 3, + a_roi / 6, + (img_size, img_size), + rotation=np.pi / 8 + phi, + ) img[rr, cc] = 0.0 # Ventricle 2 a_roi = a * random.uniform(0.8, 1.2) phi = np.random.uniform(-np.pi / 16, np.pi / 16) - rr, cc = ellipse(center, center * 1.25, a_roi / 3, a_roi / 6, - (img_size, img_size), rotation=-np.pi / 8 + phi) + rr, cc = ellipse( + center, + center * 1.25, + a_roi / 3, + a_roi / 6, + (img_size, img_size), + rotation=-np.pi / 8 + phi, + ) img[rr, cc] = 0.0 # Random smoothing if smoothing: sigma = random.uniform(0, 1) - img = gaussian_filter(img, sigma * img_size / - 100.) 
# smoothing of data + img = gaussian_filter(img, sigma * img_size / 100.0) # smoothing of data img.clip(0, 1) diff --git a/clinicadl/clinicadl/tools/deep_learning/__init__.py b/clinicadl/clinicadl/tools/deep_learning/__init__.py index 4598e8c29..a2a5b8776 100644 --- a/clinicadl/clinicadl/tools/deep_learning/__init__.py +++ b/clinicadl/clinicadl/tools/deep_learning/__init__.py @@ -1,9 +1,20 @@ -from .models import create_autoencoder, create_model, load_model, load_optimizer, save_checkpoint -from .iotools import read_json, commandline_to_json, write_requirements_version, check_and_complete +from .iotools import ( + check_and_complete, + commandline_to_json, + read_json, + write_requirements_version, +) +from .models import ( + create_autoencoder, + create_model, + load_model, + load_optimizer, + save_checkpoint, +) class EarlyStopping(object): - def __init__(self, mode='min', min_delta=0, patience=10): + def __init__(self, mode="min", min_delta=0, patience=10): self.mode = mode self.min_delta = min_delta self.patience = patience @@ -38,10 +49,10 @@ def step(self, metrics): return False def _init_is_better(self, mode, min_delta): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') + if mode not in {"min", "max"}: + raise ValueError("mode " + mode + " is unknown!") - if mode == 'min': + if mode == "min": self.is_better = lambda a, best: a < best - best * min_delta - if mode == 'max': + if mode == "max": self.is_better = lambda a, best: a > best + best * min_delta diff --git a/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py b/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py index c364d07fa..4aeefb2dd 100644 --- a/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py +++ b/clinicadl/clinicadl/tools/deep_learning/autoencoder_utils.py @@ -1,21 +1,32 @@ -import numpy as np -import os import logging +import os import warnings -from torch import nn -import pandas as pd from time import time -from 
clinicadl.tools.deep_learning.iotools import check_and_clean -from clinicadl.tools.deep_learning import EarlyStopping, save_checkpoint +import numpy as np +import pandas as pd +from torch import nn +from clinicadl.tools.deep_learning import EarlyStopping, save_checkpoint +from clinicadl.tools.deep_learning.iotools import check_and_clean ############################# # AutoEncoder train / test # ############################# -def train(decoder, train_loader, valid_loader, criterion, optimizer, resume, - log_dir, model_dir, options, logger=None): + +def train( + decoder, + train_loader, + valid_loader, + criterion, + optimizer, + resume, + log_dir, + model_dir, + options, + logger=None, +): """ Function used to train an autoencoder. The best autoencoder will be found in the 'best_model_dir' of options.output_dir. @@ -37,32 +48,33 @@ def train(decoder, train_loader, valid_loader, criterion, optimizer, resume, if logger is None: logger = logging - columns = ['epoch', 'iteration', 'time', 'loss_train', 'loss_valid'] - filename = os.path.join(os.path.dirname(log_dir), 'training.tsv') + columns = ["epoch", "iteration", "time", "loss_train", "loss_valid"] + filename = os.path.join(os.path.dirname(log_dir), "training.tsv") if not resume: check_and_clean(model_dir) check_and_clean(log_dir) results_df = pd.DataFrame(columns=columns) - with open(filename, 'w') as f: - results_df.to_csv(f, index=False, sep='\t') + with open(filename, "w") as f: + results_df.to_csv(f, index=False, sep="\t") options.beginning_epoch = 0 else: if not os.path.exists(filename): raise ValueError( - 'The training.tsv file of the resumed experiment does not exist.') - truncated_df = pd.read_csv(filename, sep='\t') - truncated_df.set_index(['epoch', 'iteration'], inplace=True) + "The training.tsv file of the resumed experiment does not exist." 
+ ) + truncated_df = pd.read_csv(filename, sep="\t") + truncated_df.set_index(["epoch", "iteration"], inplace=True) epochs = [epoch for epoch, _ in truncated_df.index.values] if options.beginning_epoch in epochs: truncated_df.drop(options.beginning_epoch, level=0, inplace=True) - truncated_df.to_csv(filename, index=True, sep='\t') + truncated_df.to_csv(filename, index=True, sep="\t") # Create writers - writer_train = SummaryWriter(os.path.join(log_dir, 'train')) - writer_valid = SummaryWriter(os.path.join(log_dir, 'validation')) + writer_train = SummaryWriter(os.path.join(log_dir, "train")) + writer_valid = SummaryWriter(os.path.join(log_dir, "validation")) decoder.train() train_loader.dataset.train() @@ -76,7 +88,8 @@ def train(decoder, train_loader, valid_loader, criterion, optimizer, resume, epoch = options.beginning_epoch early_stopping = EarlyStopping( - 'min', min_delta=options.tolerance, patience=options.patience) + "min", min_delta=options.tolerance, patience=options.patience + ) loss_valid = None t_beginning = time() @@ -89,9 +102,9 @@ def train(decoder, train_loader, valid_loader, criterion, optimizer, resume, step_flag = True for i, data in enumerate(train_loader): if options.gpu: - imgs = data['image'].cuda() + imgs = data["image"].cuda() else: - imgs = data['image'] + imgs = data["image"] train_output = decoder(imgs) loss = criterion(train_output, imgs) @@ -105,90 +118,106 @@ def train(decoder, train_loader, valid_loader, criterion, optimizer, resume, optimizer.zero_grad() # Evaluate the decoder only when no gradients are accumulated - if options.evaluation_steps != 0 and (i + 1) % options.evaluation_steps == 0: + if ( + options.evaluation_steps != 0 + and (i + 1) % options.evaluation_steps == 0 + ): evaluation_flag = False - loss_train = test_ae( - decoder, train_loader, options.gpu, criterion) - mean_loss_train = loss_train / \ - (len(train_loader) * train_loader.batch_size) - - loss_valid = test_ae( - decoder, valid_loader, options.gpu, criterion) - 
mean_loss_valid = loss_valid / \ - (len(valid_loader) * valid_loader.batch_size) + loss_train = test_ae(decoder, train_loader, options.gpu, criterion) + mean_loss_train = loss_train / ( + len(train_loader) * train_loader.batch_size + ) + + loss_valid = test_ae(decoder, valid_loader, options.gpu, criterion) + mean_loss_valid = loss_valid / ( + len(valid_loader) * valid_loader.batch_size + ) decoder.train() train_loader.dataset.train() writer_train.add_scalar( - 'loss', mean_loss_train, i + epoch * len(train_loader)) + "loss", mean_loss_train, i + epoch * len(train_loader) + ) writer_valid.add_scalar( - 'loss', mean_loss_valid, i + epoch * len(train_loader)) - logger.info("%s level training loss is %f at the end of iteration %d" - % (options.mode, mean_loss_train, i)) - logger.info("%s level validation loss is %f at the end of iteration %d" - % (options.mode, mean_loss_valid, i)) + "loss", mean_loss_valid, i + epoch * len(train_loader) + ) + logger.info( + "%s level training loss is %f at the end of iteration %d" + % (options.mode, mean_loss_train, i) + ) + logger.info( + "%s level validation loss is %f at the end of iteration %d" + % (options.mode, mean_loss_valid, i) + ) t_current = time() - t_beginning - row = [epoch, i, t_current, - mean_loss_train, mean_loss_valid] + row = [epoch, i, t_current, mean_loss_train, mean_loss_valid] row_df = pd.DataFrame([row], columns=columns) - with open(filename, 'a') as f: - row_df.to_csv(f, header=False, index=False, sep='\t') + with open(filename, "a") as f: + row_df.to_csv(f, header=False, index=False, sep="\t") # If no step has been performed, raise Exception if step_flag: raise Exception( - 'The model has not been updated once in the epoch. The accumulation step may be too large.') + "The model has not been updated once in the epoch. The accumulation step may be too large." 
+ ) # If no evaluation has been performed, warn the user if evaluation_flag and options.evaluation_steps != 0: - logger.warning('Your evaluation steps are too big compared to the size of the dataset.' - 'The model is evaluated only once at the end of the epoch') + logger.warning( + "Your evaluation steps are too big compared to the size of the dataset." + "The model is evaluated only once at the end of the epoch" + ) # Always test the results and save them once at the end of the epoch - logger.debug('Last checkpoint at the end of the epoch %d' % epoch) + logger.debug("Last checkpoint at the end of the epoch %d" % epoch) loss_train = test_ae(decoder, train_loader, options.gpu, criterion) - mean_loss_train = loss_train / \ - (len(train_loader) * train_loader.batch_size) + mean_loss_train = loss_train / (len(train_loader) * train_loader.batch_size) loss_valid = test_ae(decoder, valid_loader, options.gpu, criterion) - mean_loss_valid = loss_valid / \ - (len(valid_loader) * valid_loader.batch_size) + mean_loss_valid = loss_valid / (len(valid_loader) * valid_loader.batch_size) decoder.train() train_loader.dataset.train() - writer_train.add_scalar('loss', mean_loss_train, - i + epoch * len(train_loader)) - writer_valid.add_scalar('loss', mean_loss_valid, - i + epoch * len(train_loader)) - logger.info("%s level training loss is %f at the end of iteration %d" - % (options.mode, mean_loss_train, i)) - logger.info("%s level validation loss is %f at the end of iteration %d" - % (options.mode, mean_loss_valid, i)) + writer_train.add_scalar("loss", mean_loss_train, i + epoch * len(train_loader)) + writer_valid.add_scalar("loss", mean_loss_valid, i + epoch * len(train_loader)) + logger.info( + "%s level training loss is %f at the end of iteration %d" + % (options.mode, mean_loss_train, i) + ) + logger.info( + "%s level validation loss is %f at the end of iteration %d" + % (options.mode, mean_loss_valid, i) + ) t_current = time() - t_beginning row = [epoch, i, t_current, 
mean_loss_train, mean_loss_valid] row_df = pd.DataFrame([row], columns=columns) - with open(filename, 'a') as f: - row_df.to_csv(f, header=False, index=False, sep='\t') + with open(filename, "a") as f: + row_df.to_csv(f, header=False, index=False, sep="\t") is_best = loss_valid < best_loss_valid best_loss_valid = min(best_loss_valid, loss_valid) # Always save the model at the end of the epoch and update best model - save_checkpoint({'model': decoder.state_dict(), - 'epoch': epoch, - 'valid_loss': loss_valid}, - False, is_best, - model_dir) + save_checkpoint( + {"model": decoder.state_dict(), "epoch": epoch, "valid_loss": loss_valid}, + False, + is_best, + model_dir, + ) # Save optimizer state_dict to be able to reload - save_checkpoint({'optimizer': optimizer.state_dict(), - 'epoch': epoch, - 'name': options.optimizer, - }, - False, False, - model_dir, - filename='optimizer.pth.tar') + save_checkpoint( + { + "optimizer": optimizer.state_dict(), + "epoch": epoch, + "name": options.optimizer, + }, + False, + False, + model_dir, + filename="optimizer.pth.tar", + ) epoch += 1 @@ -215,9 +244,9 @@ def test_ae(decoder, dataloader, use_cuda, criterion): total_loss = 0 for i, data in enumerate(dataloader, 0): if use_cuda: - inputs = data['image'].cuda() + inputs = data["image"].cuda() else: - inputs = data['image'] + inputs = data["image"] outputs = decoder(inputs) loss = criterion(outputs, inputs) @@ -240,6 +269,7 @@ def visualize_image(decoder, dataloader, visualization_path, nb_images=1): """ import nibabel as nib import numpy as np + from .iotools import check_and_clean check_and_clean(visualization_path) @@ -257,10 +287,13 @@ def visualize_image(decoder, dataloader, visualization_path, nb_images=1): input_np = image.squeeze(0).squeeze(0).cpu().detach().numpy() output_nii = nib.Nifti1Image(output_np, np.eye(4)) input_nii = nib.Nifti1Image(input_np, np.eye(4)) - nib.save(output_nii, os.path.join( - visualization_path, 'output-%i.nii.gz' % image_index)) - 
nib.save(input_nii, os.path.join( - visualization_path, 'input-%i.nii.gz' % image_index)) + nib.save( + output_nii, + os.path.join(visualization_path, "output-%i.nii.gz" % image_index), + ) + nib.save( + input_nii, os.path.join(visualization_path, "input-%i.nii.gz" % image_index) + ) ########### @@ -273,13 +306,14 @@ def get_criterion(option): elif option == "L1Norm" or option == "L1": if option == "L1Norm": warnings.warn( - "Normalization refers to SoftMax and cannot be applied for autoencoder training.") + "Normalization refers to SoftMax and cannot be applied for autoencoder training." + ) return nn.L1Loss() elif option == "SmoothL1Norm" or option == "SmoothL1": if option == "SmoothL1Norm": warnings.warn( - "Normalization refers to SoftMax and cannot be applied for autoencoder training.") + "Normalization refers to SoftMax and cannot be applied for autoencoder training." + ) return nn.SmoothL1Loss() else: - raise ValueError( - "The option %s is unknown for criterion selection" % option) + raise ValueError("The option %s is unknown for criterion selection" % option) diff --git a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py index 551c4f109..3e4368208 100644 --- a/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py +++ b/clinicadl/clinicadl/tools/deep_learning/cnn_utils.py @@ -1,26 +1,38 @@ # coding: utf8 -import torch -import numpy as np +import logging import os import warnings -import pandas as pd from time import time -import logging -from torch.nn.modules.loss import _Loss + +import numpy as np +import pandas as pd +import scipy.sparse as sp +import torch import torch.nn.functional as F from sklearn.utils import column_or_1d -import scipy.sparse as sp +from torch.nn.modules.loss import _Loss -from clinicadl.tools.deep_learning.iotools import check_and_clean from clinicadl.tools.deep_learning import EarlyStopping, save_checkpoint - +from clinicadl.tools.deep_learning.iotools import check_and_clean 
##################### # CNN train / test # ##################### -def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_dir, model_dir, options, logger=None): + +def train( + model, + train_loader, + valid_loader, + criterion, + optimizer, + resume, + log_dir, + model_dir, + options, + logger=None, +): """ Function used to train a CNN. The best model and checkpoint will be found in the 'best_model_dir' of options.output_dir. @@ -37,42 +49,51 @@ def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_d options: (Namespace) ensemble of other options given to the main script. logger: (logging object) writer to stdout and stderr """ - from tensorboardX import SummaryWriter from time import time + from tensorboardX import SummaryWriter + if logger is None: logger = logging - columns = ['epoch', 'iteration', 'time', - 'balanced_accuracy_train', 'loss_train', - 'balanced_accuracy_valid', 'loss_valid'] + columns = [ + "epoch", + "iteration", + "time", + "balanced_accuracy_train", + "loss_train", + "balanced_accuracy_valid", + "loss_valid", + ] if hasattr(model, "variational") and model.variational: columns += ["kl_loss_train", "kl_loss_valid"] - filename = os.path.join(os.path.dirname(log_dir), 'training.tsv') + filename = os.path.join(os.path.dirname(log_dir), "training.tsv") if not resume: check_and_clean(model_dir) check_and_clean(log_dir) results_df = pd.DataFrame(columns=columns) - with open(filename, 'w') as f: - results_df.to_csv(f, index=False, sep='\t') + with open(filename, "w") as f: + results_df.to_csv(f, index=False, sep="\t") options.beginning_epoch = 0 else: if not os.path.exists(filename): - raise ValueError('The training.tsv file of the resumed experiment does not exist.') - truncated_df = pd.read_csv(filename, sep='\t') - truncated_df.set_index(['epoch', 'iteration'], inplace=True, drop=True) + raise ValueError( + "The training.tsv file of the resumed experiment does not exist." 
+ ) + truncated_df = pd.read_csv(filename, sep="\t") + truncated_df.set_index(["epoch", "iteration"], inplace=True, drop=True) epochs = [epoch for epoch, _ in truncated_df.index.values] if options.beginning_epoch in epochs: truncated_df.drop(options.beginning_epoch, level=0, inplace=True) - truncated_df.to_csv(filename, index=True, sep='\t') + truncated_df.to_csv(filename, index=True, sep="\t") assert hasattr(options, "beginning_epoch") # Create writers - writer_train = SummaryWriter(os.path.join(log_dir, 'train')) - writer_valid = SummaryWriter(os.path.join(log_dir, 'validation')) + writer_train = SummaryWriter(os.path.join(log_dir, "train")) + writer_valid = SummaryWriter(os.path.join(log_dir, "validation")) # Initialize variables best_valid_accuracy = -1.0 @@ -82,7 +103,9 @@ def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_d model.train() # set the model to training mode train_loader.dataset.train() - early_stopping = EarlyStopping('min', min_delta=options.tolerance, patience=options.patience) + early_stopping = EarlyStopping( + "min", min_delta=options.tolerance, patience=options.patience + ) mean_loss_valid = None t_beginning = time() @@ -99,9 +122,9 @@ def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_d t0 = time() total_time = total_time + t0 - tend if options.gpu: - imgs, labels = data['image'].cuda(), data['label'].cuda() + imgs, labels = data["image"].cuda(), data["label"].cuda() else: - imgs, labels = data['image'], data['label'] + imgs, labels = data["image"], data["label"] if hasattr(model, "variational") and model.variational: z, mu, std, train_output = model(imgs) @@ -116,10 +139,12 @@ def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_d atlas_data = data["atlas"].cuda() else: atlas_data = data["atlas"] - atlas_output = train_output[:, -atlas_data.size(1)::] - classif_output = train_output[:, :-atlas_data.size(1):] + atlas_output = train_output[:, 
-atlas_data.size(1) : :] + classif_output = train_output[:, : -atlas_data.size(1) :] loss += criterion(classif_output, labels) - loss += options.atlas_weight * torch.nn.MSELoss(reduction="sum")(atlas_output, atlas_data) + loss += options.atlas_weight * torch.nn.MSELoss(reduction="sum")( + atlas_output, atlas_data + ) else: loss += criterion(train_output, labels) @@ -137,103 +162,171 @@ def train(model, train_loader, valid_loader, criterion, optimizer, resume, log_d del loss # Evaluate the model only when no gradients are accumulated - if options.evaluation_steps != 0 and (i + 1) % options.evaluation_steps == 0: + if ( + options.evaluation_steps != 0 + and (i + 1) % options.evaluation_steps == 0 + ): evaluation_flag = False _, results_train = test(model, train_loader, options.gpu, criterion) - mean_loss_train = results_train["total_loss"] / (len(train_loader) * train_loader.batch_size) + mean_loss_train = results_train["total_loss"] / ( + len(train_loader) * train_loader.batch_size + ) _, results_valid = test(model, valid_loader, options.gpu, criterion) - mean_loss_valid = results_valid["total_loss"] / (len(valid_loader) * valid_loader.batch_size) + mean_loss_valid = results_valid["total_loss"] / ( + len(valid_loader) * valid_loader.batch_size + ) model.train() train_loader.dataset.train() global_step = i + epoch * len(train_loader) - writer_train.add_scalar('balanced_accuracy', results_train["balanced_accuracy"], global_step) - writer_train.add_scalar('loss', mean_loss_train, global_step) - writer_valid.add_scalar('balanced_accuracy', results_valid["balanced_accuracy"], global_step) - writer_valid.add_scalar('loss', mean_loss_valid, global_step) - logger.info("%s level training accuracy is %f at the end of iteration %d" - % (options.mode, results_train["balanced_accuracy"], i)) - logger.info("%s level validation accuracy is %f at the end of iteration %d" - % (options.mode, results_valid["balanced_accuracy"], i)) + writer_train.add_scalar( + "balanced_accuracy", + 
results_train["balanced_accuracy"], + global_step, + ) + writer_train.add_scalar("loss", mean_loss_train, global_step) + writer_valid.add_scalar( + "balanced_accuracy", + results_valid["balanced_accuracy"], + global_step, + ) + writer_valid.add_scalar("loss", mean_loss_valid, global_step) + logger.info( + "%s level training accuracy is %f at the end of iteration %d" + % (options.mode, results_train["balanced_accuracy"], i) + ) + logger.info( + "%s level validation accuracy is %f at the end of iteration %d" + % (options.mode, results_valid["balanced_accuracy"], i) + ) t_current = time() - t_beginning - row = [epoch, i, t_current, - results_train["balanced_accuracy"], mean_loss_train, - results_valid["balanced_accuracy"], mean_loss_valid] + row = [ + epoch, + i, + t_current, + results_train["balanced_accuracy"], + mean_loss_train, + results_valid["balanced_accuracy"], + mean_loss_valid, + ] if hasattr(model, "variational") and model.variational: - row += [results_train["total_kl_loss"] / (len(train_loader) * train_loader.batch_size), - results_valid["total_kl_loss"] / (len(valid_loader) * valid_loader.batch_size)] + row += [ + results_train["total_kl_loss"] + / (len(train_loader) * train_loader.batch_size), + results_valid["total_kl_loss"] + / (len(valid_loader) * valid_loader.batch_size), + ] row_df = pd.DataFrame([row], columns=columns) - with open(filename, 'a') as f: - row_df.to_csv(f, header=False, index=False, sep='\t') + with open(filename, "a") as f: + row_df.to_csv(f, header=False, index=False, sep="\t") tend = time() - logger.debug('Mean time per batch loading: %.10f s' - % (total_time / len(train_loader) * train_loader.batch_size)) + logger.debug( + "Mean time per batch loading: %.10f s" + % (total_time / len(train_loader) * train_loader.batch_size) + ) # If no step has been performed, raise Exception if step_flag: - raise Exception('The model has not been updated once in the epoch. 
The accumulation step may be too large.') + raise Exception( + "The model has not been updated once in the epoch. The accumulation step may be too large." + ) # If no evaluation has been performed, warn the user elif evaluation_flag and options.evaluation_steps != 0: - warnings.warn('Your evaluation steps are too big compared to the size of the dataset.' - 'The model is evaluated only once at the end of the epoch') + warnings.warn( + "Your evaluation steps are too big compared to the size of the dataset." + "The model is evaluated only once at the end of the epoch" + ) # Always test the results and save them once at the end of the epoch model.zero_grad() - logger.debug('Last checkpoint at the end of the epoch %d' % epoch) + logger.debug("Last checkpoint at the end of the epoch %d" % epoch) _, results_train = test(model, train_loader, options.gpu, criterion) - mean_loss_train = results_train["total_loss"] / (len(train_loader) * train_loader.batch_size) + mean_loss_train = results_train["total_loss"] / ( + len(train_loader) * train_loader.batch_size + ) _, results_valid = test(model, valid_loader, options.gpu, criterion) - mean_loss_valid = results_valid["total_loss"] / (len(valid_loader) * valid_loader.batch_size) + mean_loss_valid = results_valid["total_loss"] / ( + len(valid_loader) * valid_loader.batch_size + ) model.train() train_loader.dataset.train() global_step = (epoch + 1) * len(train_loader) - writer_train.add_scalar('balanced_accuracy', results_train["balanced_accuracy"], global_step) - writer_train.add_scalar('loss', mean_loss_train, global_step) - writer_valid.add_scalar('balanced_accuracy', results_valid["balanced_accuracy"], global_step) - writer_valid.add_scalar('loss', mean_loss_valid, global_step) - logger.info("%s level training accuracy is %f at the end of iteration %d" - % (options.mode, results_train["balanced_accuracy"], len(train_loader))) - logger.info("%s level validation accuracy is %f at the end of iteration %d" - % (options.mode, 
results_valid["balanced_accuracy"], len(train_loader))) + writer_train.add_scalar( + "balanced_accuracy", results_train["balanced_accuracy"], global_step + ) + writer_train.add_scalar("loss", mean_loss_train, global_step) + writer_valid.add_scalar( + "balanced_accuracy", results_valid["balanced_accuracy"], global_step + ) + writer_valid.add_scalar("loss", mean_loss_valid, global_step) + logger.info( + "%s level training accuracy is %f at the end of iteration %d" + % (options.mode, results_train["balanced_accuracy"], len(train_loader)) + ) + logger.info( + "%s level validation accuracy is %f at the end of iteration %d" + % (options.mode, results_valid["balanced_accuracy"], len(train_loader)) + ) t_current = time() - t_beginning - row = [epoch, i, t_current, - results_train["balanced_accuracy"], mean_loss_train, - results_valid["balanced_accuracy"], mean_loss_valid] + row = [ + epoch, + i, + t_current, + results_train["balanced_accuracy"], + mean_loss_train, + results_valid["balanced_accuracy"], + mean_loss_valid, + ] if hasattr(model, "variational") and model.variational: - row += [results_train["total_kl_loss"] / (len(train_loader) * train_loader.batch_size), - results_valid["total_kl_loss"] / (len(valid_loader) * valid_loader.batch_size)] + row += [ + results_train["total_kl_loss"] + / (len(train_loader) * train_loader.batch_size), + results_valid["total_kl_loss"] + / (len(valid_loader) * valid_loader.batch_size), + ] row_df = pd.DataFrame([row], columns=columns) - with open(filename, 'a') as f: - row_df.to_csv(f, header=False, index=False, sep='\t') + with open(filename, "a") as f: + row_df.to_csv(f, header=False, index=False, sep="\t") accuracy_is_best = results_valid["balanced_accuracy"] > best_valid_accuracy loss_is_best = mean_loss_valid < best_valid_loss - best_valid_accuracy = max(results_valid["balanced_accuracy"], best_valid_accuracy) + best_valid_accuracy = max( + results_valid["balanced_accuracy"], best_valid_accuracy + ) best_valid_loss = 
min(mean_loss_valid, best_valid_loss) - save_checkpoint({'model': model.state_dict(), - 'epoch': epoch, - 'valid_loss': mean_loss_valid, - 'valid_acc': results_valid["balanced_accuracy"]}, - accuracy_is_best, loss_is_best, - model_dir) + save_checkpoint( + { + "model": model.state_dict(), + "epoch": epoch, + "valid_loss": mean_loss_valid, + "valid_acc": results_valid["balanced_accuracy"], + }, + accuracy_is_best, + loss_is_best, + model_dir, + ) # Save optimizer state_dict to be able to reload - save_checkpoint({'optimizer': optimizer.state_dict(), - 'epoch': epoch, - 'name': options.optimizer, - }, - False, False, - model_dir, - filename='optimizer.pth.tar') + save_checkpoint( + { + "optimizer": optimizer.state_dict(), + "epoch": epoch, + "name": options.optimizer, + }, + False, + False, + model_dir, + filename="optimizer.pth.tar", + ) epoch += 1 @@ -258,7 +351,9 @@ def evaluate_prediction(y, y_pred): false_positive = np.sum((y_pred == 1) & (y == 0)) false_negative = np.sum((y_pred == 0) & (y == 1)) - accuracy = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative) + accuracy = (true_positive + true_negative) / ( + true_positive + true_negative + false_positive + false_negative + ) if (true_positive + false_negative) != 0: sensitivity = true_positive / (true_positive + false_negative) @@ -282,13 +377,14 @@ def evaluate_prediction(y, y_pred): balanced_accuracy = (sensitivity + specificity) / 2 - results = {'accuracy': accuracy, - 'balanced_accuracy': balanced_accuracy, - 'sensitivity': sensitivity, - 'specificity': specificity, - 'ppv': ppv, - 'npv': npv, - } + results = { + "accuracy": accuracy, + "balanced_accuracy": balanced_accuracy, + "sensitivity": sensitivity, + "specificity": specificity, + "ppv": ppv, + "npv": npv, + } return results @@ -314,7 +410,15 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): if mode == "image": columns = ["participant_id", "session_id", "true_label", 
"predicted_label"] elif mode in ["patch", "roi", "slice"]: - columns = ['participant_id', 'session_id', '%s_id' % mode, 'true_label', 'predicted_label', 'proba0', 'proba1'] + columns = [ + "participant_id", + "session_id", + "%s_id" % mode, + "true_label", + "predicted_label", + "proba0", + "proba1", + ] else: raise ValueError("The mode %s is invalid." % mode) @@ -330,9 +434,9 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): t0 = time() total_time = total_time + t0 - tend if use_cuda: - inputs, labels = data['image'].cuda(), data['label'].cuda() + inputs, labels = data["image"].cuda(), data["label"].cuda() else: - inputs, labels = data['image'], data['label'] + inputs, labels = data["image"], data["label"] if hasattr(model, "variational") and model.variational: z, mu, std, outputs = model(inputs) @@ -346,9 +450,11 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): atlas_data = data["atlas"].cuda() else: atlas_data = data["atlas"] - atlas_output = outputs[:, -atlas_data.size(1)::] - outputs = outputs[:, :-atlas_data.size(1):] - total_atlas_loss += torch.nn.MSELoss(reduction="sum")(atlas_output, atlas_data).item() + atlas_output = outputs[:, -atlas_data.size(1) : :] + outputs = outputs[:, : -atlas_data.size(1) :] + total_atlas_loss += torch.nn.MSELoss(reduction="sum")( + atlas_output, atlas_data + ).item() if use_labels: loss = criterion(outputs, labels) @@ -356,14 +462,29 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): _, predicted = torch.max(outputs.data, 1) # Generate detailed DataFrame - for idx, sub in enumerate(data['participant_id']): + for idx, sub in enumerate(data["participant_id"]): if mode == "image": - row = [[sub, data['session_id'][idx], labels[idx].item(), predicted[idx].item()]] + row = [ + [ + sub, + data["session_id"][idx], + labels[idx].item(), + predicted[idx].item(), + ] + ] else: normalized_output = softmax(outputs) - row = [[sub, 
data['session_id'][idx], data['%s_id' % mode][idx].item(), - labels[idx].item(), predicted[idx].item(), - normalized_output[idx, 0].item(), normalized_output[idx, 1].item()]] + row = [ + [ + sub, + data["session_id"][idx], + data["%s_id" % mode][idx].item(), + labels[idx].item(), + predicted[idx].item(), + normalized_output[idx, 0].item(), + normalized_output[idx, 1].item(), + ] + ] row_df = pd.DataFrame(row, columns=columns) results_df = pd.concat([results_df, row_df]) @@ -376,55 +497,97 @@ def test(model, dataloader, use_cuda, criterion, mode="image", use_labels=True): results_df = results_df.drop("true_label", axis=1) metrics_dict = None else: - metrics_dict = evaluate_prediction(results_df.true_label.values.astype(int), - results_df.predicted_label.values.astype(int)) - metrics_dict['total_loss'] = total_loss - metrics_dict['total_kl_loss'] = total_kl_loss - metrics_dict['total_atlas_loss'] = total_atlas_loss + metrics_dict = evaluate_prediction( + results_df.true_label.values.astype(int), + results_df.predicted_label.values.astype(int), + ) + metrics_dict["total_loss"] = total_loss + metrics_dict["total_kl_loss"] = total_kl_loss + metrics_dict["total_atlas_loss"] = total_atlas_loss torch.cuda.empty_cache() return results_df, metrics_dict -def sort_predicted(model, data_df, input_dir, model_options, criterion, keep_true, - batch_size=1, num_workers=0, gpu=False): - from .data import return_dataset, get_transforms - from torch.utils.data import DataLoader +def sort_predicted( + model, + data_df, + input_dir, + model_options, + criterion, + keep_true, + batch_size=1, + num_workers=0, + gpu=False, +): from copy import copy + from torch.utils.data import DataLoader + + from .data import get_transforms, return_dataset + if keep_true is None: return data_df - _, all_transforms = get_transforms(model_options.mode, model_options.minmaxnormalization) - dataset = return_dataset(mode=model_options.mode, input_dir=input_dir, - data_df=data_df, 
preprocessing=model_options.preprocessing, - train_transformations=None, all_transformations=all_transforms, - params=model_options) - dataloader = DataLoader(dataset, - batch_size=batch_size, - shuffle=False, - num_workers=num_workers, - pin_memory=True) + _, all_transforms = get_transforms( + model_options.mode, model_options.minmaxnormalization + ) + dataset = return_dataset( + mode=model_options.mode, + input_dir=input_dir, + data_df=data_df, + preprocessing=model_options.preprocessing, + train_transformations=None, + all_transformations=all_transforms, + params=model_options, + ) + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + pin_memory=True, + ) test_options = copy(model_options) test_options.gpu = gpu - results_df, _ = test(model, dataloader, gpu, criterion, model_options.mode, use_labels=True) + results_df, _ = test( + model, dataloader, gpu, criterion, model_options.mode, use_labels=True + ) - sorted_df = data_df.sort_values(['participant_id', 'session_id']).reset_index(drop=True) - results_df = results_df.sort_values(['participant_id', 'session_id']).reset_index(drop=True) + sorted_df = data_df.sort_values(["participant_id", "session_id"]).reset_index( + drop=True + ) + results_df = results_df.sort_values(["participant_id", "session_id"]).reset_index( + drop=True + ) if keep_true: - return sorted_df[results_df.true_label == results_df.predicted_label].reset_index(drop=True) + return sorted_df[ + results_df.true_label == results_df.predicted_label + ].reset_index(drop=True) else: - return sorted_df[results_df.true_label != results_df.predicted_label].reset_index(drop=True) + return sorted_df[ + results_df.true_label != results_df.predicted_label + ].reset_index(drop=True) ################################# # Voting systems ################################# -def mode_level_to_tsvs(output_dir, results_df, metrics, fold, selection, mode, dataset='train', cnn_index=None): + +def mode_level_to_tsvs( + 
output_dir, + results_df, + metrics, + fold, + selection, + mode, + dataset="train", + cnn_index=None, +): """ Writes the outputs of the test function in tsv files. @@ -439,26 +602,49 @@ def mode_level_to_tsvs(output_dir, results_df, metrics, fold, selection, mode, d cnn_index: (int) provide the cnn_index only for a multi-cnn framework. """ if cnn_index is None: - performance_dir = os.path.join(output_dir, 'fold-%i' % fold, 'cnn_classification', selection) + performance_dir = os.path.join( + output_dir, "fold-%i" % fold, "cnn_classification", selection + ) else: - performance_dir = os.path.join(output_dir, 'fold-%i' % fold, 'cnn_classification', 'cnn-%i' % cnn_index, - selection) + performance_dir = os.path.join( + output_dir, + "fold-%i" % fold, + "cnn_classification", + "cnn-%i" % cnn_index, + selection, + ) os.makedirs(performance_dir, exist_ok=True) - results_df.to_csv(os.path.join(performance_dir, '%s_%s_level_prediction.tsv' % (dataset, mode)), index=False, - sep='\t') + results_df.to_csv( + os.path.join(performance_dir, "%s_%s_level_prediction.tsv" % (dataset, mode)), + index=False, + sep="\t", + ) if metrics is not None: metrics["%s_id" % mode] = cnn_index if isinstance(metrics, dict): - pd.DataFrame(metrics, index=[0]).to_csv(os.path.join(performance_dir, '%s_%s_level_metrics.tsv' % (dataset, mode)), - index=False, sep='\t') + pd.DataFrame(metrics, index=[0]).to_csv( + os.path.join( + performance_dir, "%s_%s_level_metrics.tsv" % (dataset, mode) + ), + index=False, + sep="\t", + ) elif isinstance(metrics, pd.DataFrame): - metrics.to_csv(os.path.join(performance_dir, '%s_%s_level_metrics.tsv' % (dataset, mode)), - index=False, sep='\t') + metrics.to_csv( + os.path.join( + performance_dir, "%s_%s_level_metrics.tsv" % (dataset, mode) + ), + index=False, + sep="\t", + ) else: - raise ValueError("Bad type for metrics: %s. Must be dict or DataFrame." % type(metrics).__name__) + raise ValueError( + "Bad type for metrics: %s. Must be dict or DataFrame." 
+ % type(metrics).__name__ + ) def concat_multi_cnn_results(output_dir, fold, selection, mode, dataset, num_cnn): @@ -466,17 +652,23 @@ def concat_multi_cnn_results(output_dir, fold, selection, mode, dataset, num_cnn prediction_df = pd.DataFrame() metrics_df = pd.DataFrame() for cnn_index in range(num_cnn): - cnn_dir = os.path.join(output_dir, 'fold-%i' % fold, 'cnn_classification', 'cnn-%i' % cnn_index) + cnn_dir = os.path.join( + output_dir, "fold-%i" % fold, "cnn_classification", "cnn-%i" % cnn_index + ) performance_dir = os.path.join(cnn_dir, selection) - cnn_pred_path = os.path.join(performance_dir, '%s_%s_level_prediction.tsv' % (dataset, mode)) - cnn_metrics_path = os.path.join(performance_dir, '%s_%s_level_metrics.tsv' % (dataset, mode)) - - cnn_pred_df = pd.read_csv(cnn_pred_path, sep='\t') + cnn_pred_path = os.path.join( + performance_dir, "%s_%s_level_prediction.tsv" % (dataset, mode) + ) + cnn_metrics_path = os.path.join( + performance_dir, "%s_%s_level_metrics.tsv" % (dataset, mode) + ) + + cnn_pred_df = pd.read_csv(cnn_pred_path, sep="\t") prediction_df = pd.concat([prediction_df, cnn_pred_df]) os.remove(cnn_pred_path) if os.path.exists(cnn_metrics_path): - cnn_metrics_df = pd.read_csv(cnn_metrics_path, sep='\t') + cnn_metrics_df = pd.read_csv(cnn_metrics_path, sep="\t") metrics_df = pd.concat([metrics_df, cnn_metrics_df]) os.remove(cnn_metrics_path) @@ -491,26 +683,42 @@ def concat_multi_cnn_results(output_dir, fold, selection, mode, dataset, num_cnn metrics_df = None else: metrics_df.reset_index(drop=True, inplace=True) - mode_level_to_tsvs(output_dir, prediction_df, metrics_df, fold, selection, mode, dataset) + mode_level_to_tsvs( + output_dir, prediction_df, metrics_df, fold, selection, mode, dataset + ) def retrieve_sub_level_results(output_dir, fold, selection, mode, dataset, num_cnn): """Retrieve performance_df for single or multi-CNN framework. 
If the results of the multi-CNN were not concatenated it will be done here.""" - result_tsv = os.path.join(output_dir, 'fold-%i' % fold, 'cnn_classification', selection, - '%s_%s_level_prediction.tsv' % (dataset, mode)) + result_tsv = os.path.join( + output_dir, + "fold-%i" % fold, + "cnn_classification", + selection, + "%s_%s_level_prediction.tsv" % (dataset, mode), + ) if os.path.exists(result_tsv): - performance_df = pd.read_csv(result_tsv, sep='\t') + performance_df = pd.read_csv(result_tsv, sep="\t") else: concat_multi_cnn_results(output_dir, fold, selection, mode, dataset, num_cnn) - performance_df = pd.read_csv(result_tsv, sep='\t') + performance_df = pd.read_csv(result_tsv, sep="\t") return performance_df -def soft_voting_to_tsvs(output_dir, fold, selection, mode, dataset='test', num_cnn=None, - selection_threshold=None, logger=None, use_labels=True): +def soft_voting_to_tsvs( + output_dir, + fold, + selection, + mode, + dataset="test", + num_cnn=None, + selection_threshold=None, + logger=None, + use_labels=True, +): """ Writes soft voting results in tsv files. @@ -531,29 +739,52 @@ def soft_voting_to_tsvs(output_dir, fold, selection, mode, dataset='test', num_c logger = logging # Choose which dataset is used to compute the weights of soft voting. 
- if dataset in ['train', 'validation']: + if dataset in ["train", "validation"]: validation_dataset = dataset else: - validation_dataset = 'validation' - test_df = retrieve_sub_level_results(output_dir, fold, selection, mode, dataset, num_cnn) - validation_df = retrieve_sub_level_results(output_dir, fold, selection, mode, validation_dataset, num_cnn) - - performance_path = os.path.join(output_dir, 'fold-%i' % fold, 'cnn_classification', selection) + validation_dataset = "validation" + test_df = retrieve_sub_level_results( + output_dir, fold, selection, mode, dataset, num_cnn + ) + validation_df = retrieve_sub_level_results( + output_dir, fold, selection, mode, validation_dataset, num_cnn + ) + + performance_path = os.path.join( + output_dir, "fold-%i" % fold, "cnn_classification", selection + ) os.makedirs(performance_path, exist_ok=True) - df_final, metrics = soft_voting(test_df, validation_df, mode, selection_threshold=selection_threshold, - use_labels=use_labels) - - df_final.to_csv(os.path.join(os.path.join(performance_path, '%s_image_level_prediction.tsv' % dataset)), - index=False, sep='\t') + df_final, metrics = soft_voting( + test_df, + validation_df, + mode, + selection_threshold=selection_threshold, + use_labels=use_labels, + ) + + df_final.to_csv( + os.path.join( + os.path.join(performance_path, "%s_image_level_prediction.tsv" % dataset) + ), + index=False, + sep="\t", + ) if use_labels: - pd.DataFrame(metrics, index=[0]).to_csv(os.path.join(performance_path, '%s_image_level_metrics.tsv' % dataset), - index=False, sep='\t') - logger.info("image level %s balanced accuracy is %f for model selected on %s" - % (dataset, metrics["balanced_accuracy"], selection)) - - -def soft_voting(performance_df, validation_df, mode, selection_threshold=None, use_labels=True): + pd.DataFrame(metrics, index=[0]).to_csv( + os.path.join(performance_path, "%s_image_level_metrics.tsv" % dataset), + index=False, + sep="\t", + ) + logger.info( + "image level %s balanced accuracy 
is %f for model selected on %s" + % (dataset, metrics["balanced_accuracy"], selection) + ) + + +def soft_voting( + performance_df, validation_df, mode, selection_threshold=None, use_labels=True +): """ Computes soft voting based on the probabilities in performance_df. Weights are computed based on the accuracies of validation_df. @@ -573,23 +804,31 @@ def soft_voting(performance_df, validation_df, mode, selection_threshold=None, u """ # Compute the sub-level accuracies on the validation set: - validation_df["accurate_prediction"] = validation_df.apply(lambda x: check_prediction(x), axis=1) - sub_level_accuracies = validation_df.groupby("%s_id" % mode)["accurate_prediction"].sum() + validation_df["accurate_prediction"] = validation_df.apply( + lambda x: check_prediction(x), axis=1 + ) + sub_level_accuracies = validation_df.groupby("%s_id" % mode)[ + "accurate_prediction" + ].sum() if selection_threshold is not None: sub_level_accuracies[sub_level_accuracies < selection_threshold] = 0 weight_series = sub_level_accuracies / sub_level_accuracies.sum() # Sort to allow weighted average computation - performance_df.sort_values(['participant_id', 'session_id', '%s_id' % mode], inplace=True) + performance_df.sort_values( + ["participant_id", "session_id", "%s_id" % mode], inplace=True + ) weight_series.sort_index(inplace=True) # Soft majority vote if use_labels: - columns = ['participant_id', 'session_id', 'true_label', 'predicted_label'] + columns = ["participant_id", "session_id", "true_label", "predicted_label"] else: - columns = ['participant_id', 'session_id', 'predicted_label'] + columns = ["participant_id", "session_id", "predicted_label"] df_final = pd.DataFrame(columns=columns) - for (subject, session), subject_df in performance_df.groupby(['participant_id', 'session_id']): + for (subject, session), subject_df in performance_df.groupby( + ["participant_id", "session_id"] + ): proba0 = np.average(subject_df["proba0"], weights=weight_series) proba1 = 
np.average(subject_df["proba1"], weights=weight_series) proba_list = [proba0, proba1] @@ -604,8 +843,10 @@ def soft_voting(performance_df, validation_df, mode, selection_threshold=None, u df_final = df_final.append(row_df) if use_labels: - results = evaluate_prediction(df_final.true_label.values.astype(int), - df_final.predicted_label.values.astype(int)) + results = evaluate_prediction( + df_final.true_label.values.astype(int), + df_final.predicted_label.values.astype(int), + ) else: results = None @@ -625,15 +866,25 @@ def mode_to_image_tsvs(output_dir, fold, selection, mode, dataset="test"): validation, the weights of soft voting will be computed on validation accuracies. """ sub_df = retrieve_sub_level_results(output_dir, fold, selection, mode, dataset, 1) - sub_df.drop([f'{mode}_id', 'proba0', 'proba1'], axis=1, inplace=True) - - performance_path = os.path.join(output_dir, f'fold-{fold}', 'cnn_classification', selection) - sub_df.to_csv(os.path.join(performance_path, f'{dataset}_image_level_prediction.tsv'), - index=False, sep='\t') - metrics_df = pd.read_csv(os.path.join(performance_path, f'{dataset}_{mode}_level_metrics.tsv'), sep="\t") - metrics_df.drop([f'{mode}_id'], axis=1, inplace=True) - metrics_df.to_csv(os.path.join(performance_path, f'{dataset}_image_level_metrics.tsv'), - index=False, sep='\t') + sub_df.drop([f"{mode}_id", "proba0", "proba1"], axis=1, inplace=True) + + performance_path = os.path.join( + output_dir, f"fold-{fold}", "cnn_classification", selection + ) + sub_df.to_csv( + os.path.join(performance_path, f"{dataset}_image_level_prediction.tsv"), + index=False, + sep="\t", + ) + metrics_df = pd.read_csv( + os.path.join(performance_path, f"{dataset}_{mode}_level_metrics.tsv"), sep="\t" + ) + metrics_df.drop([f"{mode}_id"], axis=1, inplace=True) + metrics_df.to_csv( + os.path.join(performance_path, f"{dataset}_image_level_metrics.tsv"), + index=False, + sep="\t", + ) def check_prediction(row): @@ -681,7 +932,9 @@ def get_criterion(option): 
elif option == "L1Norm" or option == "L1": return L1ClassificationLoss(reduction="sum", normalization=(option == "L1Norm")) elif option == "SmoothL1Norm" or option == "SmoothL1": - return SmoothL1ClassificationLoss(reduction="sum", normalization=(option == "SmoothL1Norm")) + return SmoothL1ClassificationLoss( + reduction="sum", normalization=(option == "SmoothL1Norm") + ) else: raise ValueError("The option %s is unknown for criterion selection" % option) @@ -703,8 +956,7 @@ def binarize_label(y, classes, pos_label=1, neg_label=0): data = np.empty_like(indices) data.fill(pos_label) - Y = sp.csr_matrix((data, indices, indptr), - shape=(n_samples, n_classes)) + Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) Y = Y.toarray() Y = Y.astype(int, copy=False) if neg_label != 0: @@ -734,7 +986,7 @@ def kl_divergence(z, mu, std): log_pz = p.log_prob(z) # kl - kl = (log_qzx - log_pz) + kl = log_qzx - log_pz # go from single dim distribution to multi-dim kl = kl.mean(-1).sum() diff --git a/clinicadl/clinicadl/tools/deep_learning/data.py b/clinicadl/clinicadl/tools/deep_learning/data.py index 612d45496..7140df9c5 100644 --- a/clinicadl/clinicadl/tools/deep_learning/data.py +++ b/clinicadl/clinicadl/tools/deep_learning/data.py @@ -1,16 +1,17 @@ # coding: utf8 -import torch -import pandas as pd -import numpy as np -from os import path, listdir -from torch.utils.data import Dataset, sampler -import torchvision.transforms as transforms import abc import logging import warnings -from clinicadl.tools.inputs.filename_types import FILENAME_TYPE, MASK_PATTERN +from os import listdir, path +import numpy as np +import pandas as pd +import torch +import torchvision.transforms as transforms +from torch.utils.data import Dataset, sampler + +from clinicadl.tools.inputs.filename_types import FILENAME_TYPE, MASK_PATTERN ################################# # Datasets loaders @@ -20,39 +21,47 @@ class MRIDataset(Dataset): """Abstract class for all derived MRIDatasets.""" - 
def __init__(self, caps_directory, data_file, - preprocessing, transformations, labels, - augmentation_transformations=None, multi_cohort=False, - atlas=None, group=None, merged_df=None): + def __init__( + self, + caps_directory, + data_file, + preprocessing, + transformations, + labels, + augmentation_transformations=None, + multi_cohort=False, + atlas=None, + group=None, + merged_df=None, + ): self.caps_dict = self.create_caps_dict(caps_directory, multi_cohort) self.transformations = transformations self.augmentation_transformations = augmentation_transformations self.eval_mode = False self.labels = labels self.diagnosis_code = { - 'CN': 0, - 'BV': 1, - 'AD': 1, - 'sMCI': 0, - 'pMCI': 1, - 'MCI': 1, - 'unlabeled': -1} + "CN": 0, + "BV": 1, + "AD": 1, + "sMCI": 0, + "pMCI": 1, + "MCI": 1, + "unlabeled": -1, + } self.preprocessing = preprocessing - if not hasattr(self, 'elem_index'): - raise ValueError( - "Child class of MRIDataset must set elem_index attribute.") - if not hasattr(self, 'mode'): - raise ValueError( - "Child class of MRIDataset must set mode attribute.") + if not hasattr(self, "elem_index"): + raise ValueError("Child class of MRIDataset must set elem_index attribute.") + if not hasattr(self, "mode"): + raise ValueError("Child class of MRIDataset must set mode attribute.") # Check the format of the tsv file here if isinstance(data_file, str): - self.df = pd.read_csv(data_file, sep='\t') + self.df = pd.read_csv(data_file, sep="\t") elif isinstance(data_file, pd.DataFrame): self.df = data_file else: - raise Exception('The argument data_file is not of correct type.') + raise Exception("The argument data_file is not of correct type.") if not multi_cohort: self.df["cohort"] = "single" @@ -66,17 +75,21 @@ def __init__(self, caps_directory, data_file, mandatory_col.add("%s_id" % self.mode) if not mandatory_col.issubset(set(self.df.columns.values)): - raise Exception("the data file is not in the correct format." 
- "Columns should include %s" % mandatory_col) + raise Exception( + "the data file is not in the correct format." + "Columns should include %s" % mandatory_col + ) unique_diagnoses = set(self.df.diagnosis) unique_codes = set() for diagnosis in unique_diagnoses: unique_codes.add(self.diagnosis_code[diagnosis]) if len(unique_codes) == 1: - warnings.warn("The diagnoses found in the DataFrame %s only corresponds to one class %s. " - "If you want to run a binary classification please change the labels involved." - % (unique_diagnoses, unique_codes)) + warnings.warn( + "The diagnoses found in the DataFrame %s only corresponds to one class %s. " + "If you want to run a binary classification please change the labels involved." + % (unique_diagnoses, unique_codes) + ) self.merged_df = merged_df if merged_df is not None and "participant_id" in merged_df.columns.values: @@ -85,14 +98,19 @@ def __init__(self, caps_directory, data_file, self.atlas = atlas self.group_dict = None if self.atlas is not None and group is None: - self.group_dict = self.create_group_dict(caps_directory, multi_cohort, group) + self.group_dict = self.create_group_dict( + caps_directory, multi_cohort, group + ) if self.atlas is not None and self.merged_df is not None: - filtered_columns = [col for col in merged_df.columns.values - if "t1-volume" in col and atlas in col] + filtered_columns = [ + col + for col in merged_df.columns.values + if "t1-volume" in col and atlas in col + ] self.merged_df = self.merged_df[filtered_columns] self.elem_per_image = self.num_elem_per_image() - self.size = self[0]['image'].size() + self.size = self[0]["image"].size() def __len__(self): return len(self.df) * self.elem_per_image @@ -103,37 +121,43 @@ def create_caps_dict(caps_directory, multi_cohort): from clinica.utils.inputs import check_caps_folder if multi_cohort: - if not caps_directory.endswith('.tsv'): - raise ValueError('If multi_cohort is given, the caps_dir argument should be a path to a TSV file.') + if not 
caps_directory.endswith(".tsv"): + raise ValueError( + "If multi_cohort is given, the caps_dir argument should be a path to a TSV file." + ) else: caps_df = pd.read_csv(caps_directory, sep="\t") - check_multi_cohort_tsv(caps_df, 'CAPS') + check_multi_cohort_tsv(caps_df, "CAPS") caps_dict = dict() for idx in range(len(caps_df)): - cohort = caps_df.loc[idx, 'cohort'] - caps_path = caps_df.loc[idx, 'path'] + cohort = caps_df.loc[idx, "cohort"] + caps_path = caps_df.loc[idx, "path"] check_caps_folder(caps_path) caps_dict[cohort] = caps_path else: check_caps_folder(caps_directory) - caps_dict = {'single': caps_directory} + caps_dict = {"single": caps_directory} return caps_dict @staticmethod def create_group_dict(caps_directory, multi_cohort, group): if multi_cohort: - if not caps_directory.endswith('.tsv'): - raise ValueError('If multi_cohort is given, the caps_dir argument should be a path to a TSV file.') + if not caps_directory.endswith(".tsv"): + raise ValueError( + "If multi_cohort is given, the caps_dir argument should be a path to a TSV file." + ) else: caps_df = pd.read_csv(caps_directory, sep="\t") - check_multi_cohort_tsv(caps_df, 'CAPS') + check_multi_cohort_tsv(caps_df, "CAPS") if "group_label" not in caps_df.columns and group is None: - raise ValueError('When atlas intensities are involved the group_label column must be filled ' - 'in the CAPS TSV file.') + raise ValueError( + "When atlas intensities are involved the group_label column must be filled " + "in the CAPS TSV file." 
+ ) group_dict = dict() for idx in range(len(caps_df)): - cohort = caps_df.loc[idx, 'cohort'] + cohort = caps_df.loc[idx, "cohort"] if group is None: group_label = f"group-{caps_df.loc[idx, 'group_label']}" else: @@ -143,60 +167,96 @@ def create_group_dict(caps_directory, multi_cohort, group): if group is None: groups_list = listdir(path.join(caps_directory, "groups")) if len(groups_list) == 0: - raise ValueError("A commun group could not be found for the CAPS folder wanted.") + raise ValueError( + "A commun group could not be found for the CAPS folder wanted." + ) elif len(groups_list) > 1: - raise ValueError(f"Several groups were found for the CAPS folder wanted {groups_list}. " - "Please precise which group should be used.") + raise ValueError( + f"Several groups were found for the CAPS folder wanted {groups_list}. " + "Please precise which group should be used." + ) else: - group_dict = {'single': groups_list[0]} + group_dict = {"single": groups_list[0]} else: - group_dict = {'single': f"group-{group}"} + group_dict = {"single": f"group-{group}"} return group_dict def _get_path(self, participant, session, cohort, mode="image"): if cohort not in self.caps_dict.keys(): - raise ValueError('Cohort names in labels and CAPS definitions do not match.') + raise ValueError( + "Cohort names in labels and CAPS definitions do not match." 
+ ) if self.preprocessing == "t1-linear": - image_path = path.join(self.caps_dict[cohort], 'subjects', participant, session, - 'deeplearning_prepare_data', '%s_based' % mode, 't1_linear', - participant + '_' + session - + FILENAME_TYPE['cropped'] + '.pt') + image_path = path.join( + self.caps_dict[cohort], + "subjects", + participant, + session, + "deeplearning_prepare_data", + "%s_based" % mode, + "t1_linear", + participant + "_" + session + FILENAME_TYPE["cropped"] + ".pt", + ) elif self.preprocessing == "t1-linear-downsampled": - image_path = path.join(self.caps_dict[cohort], 'subjects', participant, session, - 'deeplearning_prepare_data', '%s_based' % mode, 't1_linear', - participant + '_' + session - + FILENAME_TYPE['downsampled'] + '.pt') + image_path = path.join( + self.caps_dict[cohort], + "subjects", + participant, + session, + "deeplearning_prepare_data", + "%s_based" % mode, + "t1_linear", + participant + "_" + session + FILENAME_TYPE["downsampled"] + ".pt", + ) elif self.preprocessing == "t1-extensive": - image_path = path.join(self.caps_dict[cohort], 'subjects', participant, session, - 'deeplearning_prepare_data', '%s_based' % mode, 't1_extensive', - participant + '_' + session - + FILENAME_TYPE['skull_stripped'] + '.pt') + image_path = path.join( + self.caps_dict[cohort], + "subjects", + participant, + session, + "deeplearning_prepare_data", + "%s_based" % mode, + "t1_extensive", + participant + "_" + session + FILENAME_TYPE["skull_stripped"] + ".pt", + ) elif self.preprocessing == "t1-volume": - image_path = path.join(self.caps_dict[cohort], 'subjects', participant, session, - 'deeplearning_prepare_data', '%s_based' % mode, 'custom', - participant + '_' + session - + FILENAME_TYPE['gm_maps'] + '.pt') + image_path = path.join( + self.caps_dict[cohort], + "subjects", + participant, + session, + "deeplearning_prepare_data", + "%s_based" % mode, + "custom", + participant + "_" + session + FILENAME_TYPE["gm_maps"] + ".pt", + ) elif self.preprocessing == 
"shepplogan": - image_path = path.join(self.caps_dict[cohort], 'subjects', - '%s_%s%s.pt' % (participant, session, FILENAME_TYPE['shepplogan'])) + image_path = path.join( + self.caps_dict[cohort], + "subjects", + "%s_%s%s.pt" % (participant, session, FILENAME_TYPE["shepplogan"]), + ) else: raise NotImplementedError( - "The path to preprocessing %s is not implemented" % self.preprocessing) + "The path to preprocessing %s is not implemented" % self.preprocessing + ) return image_path def _get_statistics_df(self, participant, session, cohort): if cohort not in self.caps_dict.keys(): - raise ValueError('Cohort names in labels and CAPS definitions do not match.') + raise ValueError( + "Cohort names in labels and CAPS definitions do not match." + ) if self.merged_df is None: statistics_path = path.join( self.caps_dict[cohort], - 'subjects', + "subjects", participant, session, "t1", @@ -204,48 +264,59 @@ def _get_statistics_df(self, participant, session, cohort): "dartel", self.group_dict[cohort], "atlas_statistics", - f"{participant}_{session}_T1w_segm-graymatter_space-Ixi549Space_modulated-on_probability_space-{self.atlas}_map-graymatter_statistics.tsv" + f"{participant}_{session}_T1w_segm-graymatter_space-Ixi549Space_modulated-on_probability_space-{self.atlas}_map-graymatter_statistics.tsv", ) if not path.exists(statistics_path): - raise ValueError(f"Last step of t1-volume with {self.group_dict[cohort]} was not run on {participant} | {session}") - - return pd.read_csv(statistics_path, sep="\t", usecols=["mean_scalar"], dtype=np.float32, squeeze=True) + raise ValueError( + f"Last step of t1-volume with {self.group_dict[cohort]} was not run on {participant} | {session}" + ) + + return pd.read_csv( + statistics_path, + sep="\t", + usecols=["mean_scalar"], + dtype=np.float32, + squeeze=True, + ) else: return self.merged_df.loc[(participant, session)] def _get_meta_data(self, idx): image_idx = idx // self.elem_per_image - participant = self.df.loc[image_idx, 
'participant_id'] - session = self.df.loc[image_idx, 'session_id'] - cohort = self.df.loc[image_idx, 'cohort'] + participant = self.df.loc[image_idx, "participant_id"] + session = self.df.loc[image_idx, "session_id"] + cohort = self.df.loc[image_idx, "cohort"] if self.elem_index is None: elem_idx = idx % self.elem_per_image elif self.elem_index == "mixed": - elem_idx = self.df.loc[image_idx, '%s_id' % self.mode] + elem_idx = self.df.loc[image_idx, "%s_id" % self.mode] else: elem_idx = self.elem_index if self.labels: - diagnosis = self.df.loc[image_idx, 'diagnosis'] + diagnosis = self.df.loc[image_idx, "diagnosis"] label = self.diagnosis_code[diagnosis] else: - label = self.diagnosis_code['unlabeled'] + label = self.diagnosis_code["unlabeled"] return participant, session, cohort, elem_idx, label def _get_full_image(self): - from ..data.utils import find_image_path as get_nii_path import nibabel as nib - participant_id = self.df.loc[0, 'participant_id'] - session_id = self.df.loc[0, 'session_id'] - cohort = self.df.loc[0, 'cohort'] + from ..data.utils import find_image_path as get_nii_path + + participant_id = self.df.loc[0, "participant_id"] + session_id = self.df.loc[0, "session_id"] + cohort = self.df.loc[0, "cohort"] try: - image_path = self._get_path(participant_id, session_id, cohort, mode="image") + image_path = self._get_path( + participant_id, session_id, cohort, mode="image" + ) image = torch.load(image_path) except FileNotFoundError: image_path = get_nii_path( @@ -253,7 +324,8 @@ def _get_full_image(self): participant_id, session_id, cohort=cohort, - preprocessing=self.preprocessing) + preprocessing=self.preprocessing, + ) image_nii = nib.load(image_path) image_np = image_nii.get_fdata() image = ToTensor()(image_np) @@ -287,10 +359,18 @@ def train(self): class MRIDatasetImage(MRIDataset): """Dataset of MRI organized in a CAPS folder.""" - def __init__(self, caps_directory, data_file, - preprocessing='t1-linear', train_transformations=None, - labels=True, 
all_transformations=None, multi_cohort=False, - atlas=None, merged_df=None): + def __init__( + self, + caps_directory, + data_file, + preprocessing="t1-linear", + train_transformations=None, + labels=True, + all_transformations=None, + multi_cohort=False, + atlas=None, + merged_df=None, + ): """ Args: caps_directory (string): Directory of all the images. @@ -306,10 +386,17 @@ def __init__(self, caps_directory, data_file, """ self.elem_index = None self.mode = "image" - super().__init__(caps_directory, data_file, preprocessing, - augmentation_transformations=train_transformations, labels=labels, - transformations=all_transformations, multi_cohort=multi_cohort, - atlas=atlas, merged_df=merged_df) + super().__init__( + caps_directory, + data_file, + preprocessing, + augmentation_transformations=train_transformations, + labels=labels, + transformations=all_transformations, + multi_cohort=multi_cohort, + atlas=atlas, + merged_df=merged_df, + ) def __getitem__(self, idx): participant, session, cohort, _, label = self._get_meta_data(idx) @@ -323,13 +410,18 @@ def __getitem__(self, idx): if self.augmentation_transformations and not self.eval_mode: image = self.augmentation_transformations(image) - sample = {'image': image, 'label': label, 'participant_id': participant, 'session_id': session, - 'image_path': image_path} + sample = { + "image": image, + "label": label, + "participant_id": participant, + "session_id": session, + "image_path": image_path, + } if self.atlas is not None: atlas_df = self._get_statistics_df(participant, session, cohort) atlas_pt = torch.from_numpy(atlas_df.values).float() - sample['atlas'] = atlas_pt + sample["atlas"] = atlas_pt return sample @@ -338,10 +430,22 @@ def num_elem_per_image(self): class MRIDatasetPatch(MRIDataset): - - def __init__(self, caps_directory, data_file, patch_size, stride_size, train_transformations=None, prepare_dl=False, - patch_index=None, preprocessing="t1-linear", labels=True, all_transformations=None, - 
multi_cohort=False, atlas=None, merged_df=None): + def __init__( + self, + caps_directory, + data_file, + patch_size, + stride_size, + train_transformations=None, + prepare_dl=False, + patch_index=None, + preprocessing="t1-linear", + labels=True, + all_transformations=None, + multi_cohort=False, + atlas=None, + merged_df=None, + ): """ Args: caps_directory (string): Directory of all the images. @@ -361,25 +465,40 @@ def __init__(self, caps_directory, data_file, patch_size, stride_size, train_tra """ if preprocessing == "shepplogan": - raise ValueError("Patch mode is not available for preprocessing %s" % preprocessing) + raise ValueError( + "Patch mode is not available for preprocessing %s" % preprocessing + ) self.patch_size = patch_size self.stride_size = stride_size self.elem_index = patch_index self.mode = "patch" self.prepare_dl = prepare_dl - super().__init__(caps_directory, data_file, preprocessing, - augmentation_transformations=train_transformations, labels=labels, - transformations=all_transformations, multi_cohort=multi_cohort, - atlas=atlas, merged_df=merged_df) + super().__init__( + caps_directory, + data_file, + preprocessing, + augmentation_transformations=train_transformations, + labels=labels, + transformations=all_transformations, + multi_cohort=multi_cohort, + atlas=atlas, + merged_df=merged_df, + ) def __getitem__(self, idx): participant, session, cohort, patch_idx, label = self._get_meta_data(idx) if self.prepare_dl: - patch_path = path.join(self._get_path(participant, session, cohort, "patch")[0:-7] - + '_patchsize-' + str(self.patch_size) - + '_stride-' + str(self.stride_size) - + '_patch-' + str(patch_idx) + '_T1w.pt') + patch_path = path.join( + self._get_path(participant, session, cohort, "patch")[0:-7] + + "_patchsize-" + + str(self.patch_size) + + "_stride-" + + str(self.stride_size) + + "_patch-" + + str(patch_idx) + + "_T1w.pt" + ) image = torch.load(patch_path) else: @@ -393,13 +512,18 @@ def __getitem__(self, idx): if 
self.augmentation_transformations and not self.eval_mode: image = self.augmentation_transformations(image) - sample = {'image': image, 'label': label, - 'participant_id': participant, 'session_id': session, 'patch_id': patch_idx} + sample = { + "image": image, + "label": label, + "participant_id": participant, + "session_id": session, + "patch_id": patch_idx, + } if self.atlas is not None: atlas_df = self._get_statistics_df(participant, session, cohort) atlas_pt = torch.from_numpy(atlas_df.mean_scalar.values).float() - sample['atlas'] = atlas_pt + sample["atlas"] = atlas_pt return sample @@ -409,36 +533,51 @@ def num_elem_per_image(self): image = self._get_full_image() - patches_tensor = image.unfold(1, self.patch_size, self.stride_size - ).unfold(2, self.patch_size, self.stride_size - ).unfold(3, self.patch_size, self.stride_size).contiguous() - patches_tensor = patches_tensor.view(-1, - self.patch_size, - self.patch_size, - self.patch_size) + patches_tensor = ( + image.unfold(1, self.patch_size, self.stride_size) + .unfold(2, self.patch_size, self.stride_size) + .unfold(3, self.patch_size, self.stride_size) + .contiguous() + ) + patches_tensor = patches_tensor.view( + -1, self.patch_size, self.patch_size, self.patch_size + ) num_patches = patches_tensor.shape[0] return num_patches def extract_patch_from_mri(self, image_tensor, index_patch): - patches_tensor = image_tensor.unfold(1, self.patch_size, self.stride_size - ).unfold(2, self.patch_size, self.stride_size - ).unfold(3, self.patch_size, self.stride_size).contiguous() - patches_tensor = patches_tensor.view(-1, - self.patch_size, - self.patch_size, - self.patch_size) - extracted_patch = patches_tensor[index_patch, ...].unsqueeze_( - 0).clone() + patches_tensor = ( + image_tensor.unfold(1, self.patch_size, self.stride_size) + .unfold(2, self.patch_size, self.stride_size) + .unfold(3, self.patch_size, self.stride_size) + .contiguous() + ) + patches_tensor = patches_tensor.view( + -1, self.patch_size, 
self.patch_size, self.patch_size + ) + extracted_patch = patches_tensor[index_patch, ...].unsqueeze_(0).clone() return extracted_patch class MRIDatasetRoi(MRIDataset): - - def __init__(self, caps_directory, data_file, roi_list=None, cropped_roi=True, roi_index=None, - preprocessing="t1-linear", train_transformations=None, prepare_dl=False, labels=True, - all_transformations=None, multi_cohort=False, atlas=None, merged_df=None): + def __init__( + self, + caps_directory, + data_file, + roi_list=None, + cropped_roi=True, + roi_index=None, + preprocessing="t1-linear", + train_transformations=None, + prepare_dl=False, + labels=True, + all_transformations=None, + multi_cohort=False, + atlas=None, + merged_df=None, + ): """ Args: caps_directory (string): Directory of all the images. @@ -458,16 +597,26 @@ def __init__(self, caps_directory, data_file, roi_list=None, cropped_roi=True, r """ if preprocessing == "shepplogan": - raise ValueError("ROI mode is not available for preprocessing %s" % preprocessing) + raise ValueError( + "ROI mode is not available for preprocessing %s" % preprocessing + ) self.elem_index = roi_index self.mode = "roi" self.roi_list = roi_list self.cropped_roi = cropped_roi self.prepare_dl = prepare_dl self.mask_list = self.find_masks(caps_directory, preprocessing) - super().__init__(caps_directory, data_file, preprocessing, augmentation_transformations=train_transformations, - labels=labels, transformations=all_transformations, multi_cohort=multi_cohort, - atlas=atlas, merged_df=merged_df) + super().__init__( + caps_directory, + data_file, + preprocessing, + augmentation_transformations=train_transformations, + labels=labels, + transformations=all_transformations, + multi_cohort=multi_cohort, + atlas=atlas, + merged_df=merged_df, + ) def __getitem__(self, idx): participant, session, cohort, roi_idx, label = self._get_meta_data(idx) @@ -475,8 +624,9 @@ def __getitem__(self, idx): if self.prepare_dl: if self.roi_list is None: raise NotImplementedError( 
- 'The extraction of ROIs prior to training is not implemented for default ROIs.' - 'Please disable --use_extracted_rois or precise the regions in --roi_names.') + "The extraction of ROIs prior to training is not implemented for default ROIs." + "Please disable --use_extracted_rois or precise the regions in --roi_names." + ) # read the regions directly roi_path = self._get_path(participant, session, cohort, "roi") @@ -494,14 +644,18 @@ def __getitem__(self, idx): if self.augmentation_transformations and not self.eval_mode: patch = self.augmentation_transformations(patch) - sample = {'image': patch, 'label': label, - 'participant_id': participant, 'session_id': session, - 'roi_id': roi_idx} + sample = { + "image": patch, + "label": label, + "participant_id": participant, + "session_id": session, + "roi_id": roi_idx, + } if self.atlas is not None: atlas_df = self._get_statistics_df(participant, session, cohort) atlas_pt = torch.from_numpy(atlas_df.values).float() - sample['atlas'] = atlas_pt + sample["atlas"] = atlas_pt return sample @@ -530,30 +684,44 @@ def extract_roi_from_mri(self, image_tensor, roi_idx): # the center of the right hippocampus crop_center = (109, 96, 68) else: - raise NotImplementedError("The extraction of hippocampi was not implemented for " - "preprocessing %s" % self.preprocessing) + raise NotImplementedError( + "The extraction of hippocampi was not implemented for " + "preprocessing %s" % self.preprocessing + ) crop_size = (50, 50, 50) # the output cropped hippocampus size if self.cropped_roi: extracted_roi = image_tensor[ :, - crop_center[0] - crop_size[0] // 2: crop_center[0] + crop_size[0] // 2:, - crop_center[1] - crop_size[1] // 2: crop_center[1] + crop_size[1] // 2:, - crop_center[2] - crop_size[2] // 2: crop_center[2] + crop_size[2] // 2: + crop_center[0] + - crop_size[0] // 2 : crop_center[0] + + crop_size[0] // 2 :, + crop_center[1] + - crop_size[1] // 2 : crop_center[1] + + crop_size[1] // 2 :, + crop_center[2] + - crop_size[2] // 2 
: crop_center[2] + + crop_size[2] // 2 :, ].clone() else: - raise NotImplementedError("The uncropped option for the default ROI was not implemented.") + raise NotImplementedError( + "The uncropped option for the default ROI was not implemented." + ) else: roi_mask = self.mask_list[roi_idx] extracted_roi = image_tensor * roi_mask if self.cropped_roi: - extracted_roi = extracted_roi[np.ix_(roi_mask.any((1, 2, 3)), - roi_mask.any((0, 2, 3)), - roi_mask.any((0, 1, 3)), - roi_mask.any((0, 1, 2)))] + extracted_roi = extracted_roi[ + np.ix_( + roi_mask.any((1, 2, 3)), + roi_mask.any((0, 2, 3)), + roi_mask.any((0, 1, 3)), + roi_mask.any((0, 1, 2)), + ) + ] return extracted_roi.float() @@ -565,7 +733,7 @@ def find_masks(self, caps_directory, preprocessing): templates_dict = { "t1-linear": "MNI152NLin2009cSym", "t1-volume": "Ixi549Space", - "t1-extensive": "Ixi549Space" + "t1-extensive": "Ixi549Space", } if self.prepare_dl or self.roi_list is None: @@ -575,17 +743,24 @@ def find_masks(self, caps_directory, preprocessing): for roi in self.roi_list: template = templates_dict[preprocessing] if preprocessing == "t1-linear": - mask_pattern = MASK_PATTERN['cropped'] + mask_pattern = MASK_PATTERN["cropped"] elif preprocessing == "t1-volume": - mask_pattern = MASK_PATTERN['gm_maps'] + mask_pattern = MASK_PATTERN["gm_maps"] elif preprocessing == "t1-extensive": - mask_pattern = MASK_PATTERN['skull_stripped'] + mask_pattern = MASK_PATTERN["skull_stripped"] else: - raise NotImplementedError("Roi extraction for %s preprocessing was not implemented." - % preprocessing) - - mask_path = path.join(caps_directory, "masks", "roi_based", "tpl-%s" % template, - "tpl-%s%s_roi-%s_mask.nii.gz" % (template, mask_pattern, roi)) + raise NotImplementedError( + "Roi extraction for %s preprocessing was not implemented." 
+ % preprocessing + ) + + mask_path = path.join( + caps_directory, + "masks", + "roi_based", + "tpl-%s" % template, + "tpl-%s%s_roi-%s_mask.nii.gz" % (template, mask_pattern, roi), + ) mask_nii = nib.load(mask_path) mask_list.append(mask_nii.get_fdata()) @@ -598,16 +773,29 @@ def compute_roi_filename(self, image_path, roi_index): image_filename = path.basename(image_path) image_descriptors = image_filename.split("_") if "desc-Crop" not in image_descriptors and self.cropped_roi: - image_descriptors = self.insert_descriptor(image_descriptors, "desc-CropRoi", "space") + image_descriptors = self.insert_descriptor( + image_descriptors, "desc-CropRoi", "space" + ) elif "desc-Crop" in image_descriptors: - image_descriptors = [descriptor for descriptor in image_descriptors if descriptor != "desc-Crop"] + image_descriptors = [ + descriptor + for descriptor in image_descriptors + if descriptor != "desc-Crop" + ] if self.cropped_roi: - image_descriptors = self.insert_descriptor(image_descriptors, "desc-CropRoi", "space") + image_descriptors = self.insert_descriptor( + image_descriptors, "desc-CropRoi", "space" + ) else: - image_descriptors = self.insert_descriptor(image_descriptors, "desc-CropImage", "space") + image_descriptors = self.insert_descriptor( + image_descriptors, "desc-CropImage", "space" + ) - return path.join(image_dir, "_".join(image_descriptors))[0:-7] + f"_roi-{self.roi_list[roi_index]}_T1w.pt" + return ( + path.join(image_dir, "_".join(image_descriptors))[0:-7] + + f"_roi-{self.roi_list[roi_index]}_T1w.pt" + ) @staticmethod def insert_descriptor(image_descriptors, descriptor_to_add, key_to_follow): @@ -620,11 +808,23 @@ def insert_descriptor(image_descriptors, descriptor_to_add, key_to_follow): class MRIDatasetSlice(MRIDataset): - - def __init__(self, caps_directory, data_file, slice_index=None, preprocessing="t1-linear", - train_transformations=None, mri_plane=0, prepare_dl=False, - discarded_slices=20, mixed=False, labels=True, all_transformations=None, - 
multi_cohort=False, atlas=None, merged_df=None): + def __init__( + self, + caps_directory, + data_file, + slice_index=None, + preprocessing="t1-linear", + train_transformations=None, + mri_plane=0, + prepare_dl=False, + discarded_slices=20, + mixed=False, + labels=True, + all_transformations=None, + multi_cohort=False, + atlas=None, + merged_df=None, + ): """ Args: caps_directory (string): Directory of all the images. @@ -647,15 +847,16 @@ def __init__(self, caps_directory, data_file, slice_index=None, preprocessing="t """ # Rename MRI plane if preprocessing == "shepplogan": - raise ValueError("Slice mode is not available for preprocessing %s" % preprocessing) + raise ValueError( + "Slice mode is not available for preprocessing %s" % preprocessing + ) self.elem_index = slice_index self.mri_plane = mri_plane - self.direction_list = ['sag', 'cor', 'axi'] + self.direction_list = ["sag", "cor", "axi"] if self.mri_plane >= len(self.direction_list): raise ValueError( - "mri_plane value %i > %i" % - (self.mri_plane, len( - self.direction_list))) + "mri_plane value %i > %i" % (self.mri_plane, len(self.direction_list)) + ) # Manage discarded_slices if isinstance(discarded_slices, int): @@ -671,10 +872,17 @@ def __init__(self, caps_directory, data_file, slice_index=None, preprocessing="t self.mode = "slice" self.prepare_dl = prepare_dl - super().__init__(caps_directory, data_file, preprocessing, - augmentation_transformations=train_transformations, labels=labels, - transformations=all_transformations, multi_cohort=multi_cohort, - atlas=atlas, merged_df=merged_df) + super().__init__( + caps_directory, + data_file, + preprocessing, + augmentation_transformations=train_transformations, + labels=labels, + transformations=all_transformations, + multi_cohort=multi_cohort, + atlas=atlas, + merged_df=merged_df, + ) def __getitem__(self, idx): participant, session, cohort, slice_idx, label = self._get_meta_data(idx) @@ -682,9 +890,11 @@ def __getitem__(self, idx): if self.prepare_dl: 
# read the slices directly - slice_path = path.join(self._get_path(participant, session, cohort, "slice")[0:-7] - + '_axis-%s' % self.direction_list[self.mri_plane] - + '_channel-rgb_slice-%i_T1w.pt' % slice_idx) + slice_path = path.join( + self._get_path(participant, session, cohort, "slice")[0:-7] + + "_axis-%s" % self.direction_list[self.mri_plane] + + "_channel-rgb_slice-%i_T1w.pt" % slice_idx + ) image = torch.load(slice_path) else: image_path = self._get_path(participant, session, cohort, "image") @@ -697,14 +907,18 @@ def __getitem__(self, idx): if self.augmentation_transformations and not self.eval_mode: image = self.augmentation_transformations(image) - sample = {'image': image, 'label': label, - 'participant_id': participant, 'session_id': session, - 'slice_id': slice_idx} + sample = { + "image": image, + "label": label, + "participant_id": participant, + "session_id": session, + "slice_id": slice_idx, + } if self.atlas is not None: atlas_df = self._get_statistics_df(participant, session, cohort) atlas_pt = torch.from_numpy(atlas_df.mean_scalar.values).float() - sample['atlas'] = atlas_pt + sample["atlas"] = atlas_pt return sample @@ -713,8 +927,11 @@ def num_elem_per_image(self): return 1 image = self._get_full_image() - return image.size(self.mri_plane + 1) - \ - self.discarded_slices[0] - self.discarded_slices[1] + return ( + image.size(self.mri_plane + 1) + - self.discarded_slices[0] + - self.discarded_slices[1] + ) def extract_slice_from_mri(self, image, index_slice): """ @@ -734,10 +951,19 @@ def extract_slice_from_mri(self, image, index_slice): return triple_slice -def return_dataset(mode, input_dir, data_df, preprocessing, - all_transformations, params, train_transformations=None, - cnn_index=None, labels=True, multi_cohort=False, - prepare_dl=False): +def return_dataset( + mode, + input_dir, + data_df, + preprocessing, + all_transformations, + params, + train_transformations=None, + cnn_index=None, + labels=True, + multi_cohort=False, + 
prepare_dl=False, +): """ Return appropriate Dataset according to given options. Args: @@ -775,7 +1001,7 @@ def return_dataset(mode, input_dir, data_df, preprocessing, labels=labels, multi_cohort=multi_cohort, atlas=params.predict_atlas_intensities, - merged_df=merged_df + merged_df=merged_df, ) elif mode == "patch": return MRIDatasetPatch( @@ -791,7 +1017,7 @@ def return_dataset(mode, input_dir, data_df, preprocessing, labels=labels, multi_cohort=multi_cohort, atlas=params.predict_atlas_intensities, - merged_df=merged_df + merged_df=merged_df, ) elif mode == "roi": return MRIDatasetRoi( @@ -807,7 +1033,7 @@ def return_dataset(mode, input_dir, data_df, preprocessing, labels=labels, multi_cohort=multi_cohort, atlas=params.predict_atlas_intensities, - merged_df=merged_df + merged_df=merged_df, ) elif mode == "slice": return MRIDatasetSlice( @@ -823,7 +1049,7 @@ def return_dataset(mode, input_dir, data_df, preprocessing, labels=labels, multi_cohort=multi_cohort, atlas=params.predict_atlas_intensities, - merged_df=merged_df + merged_df=merged_df, ) else: raise ValueError("Mode %s is not implemented." 
% mode) @@ -834,14 +1060,28 @@ def compute_num_cnn(input_dir, tsv_path, options, data="train"): _, transformations = get_transforms(options.mode, options.minmaxnormalization) if data == "train": - example_df, _ = load_data(tsv_path, options.diagnoses, 0, options.n_splits, options.baseline, - multi_cohort=options.multi_cohort) + example_df, _ = load_data( + tsv_path, + options.diagnoses, + 0, + options.n_splits, + options.baseline, + multi_cohort=options.multi_cohort, + ) else: - example_df = load_data_test(tsv_path, options.diagnoses, multi_cohort=options.multi_cohort) + example_df = load_data_test( + tsv_path, options.diagnoses, multi_cohort=options.multi_cohort + ) - full_dataset = return_dataset(options.mode, input_dir, example_df, - options.preprocessing, train_transformations=None, - all_transformations=transformations, params=options) + full_dataset = return_dataset( + options.mode, + input_dir, + example_df, + options.preprocessing, + train_transformations=None, + all_transformations=transformations, + params=options, + ) return full_dataset.elem_per_image @@ -850,6 +1090,7 @@ def compute_num_cnn(input_dir, tsv_path, options, data="train"): # Transformations ################################## + class RandomNoising(object): """Applies a random zoom to a tensor""" @@ -872,6 +1113,7 @@ def __init__(self, sigma=1): def __call__(self, image): import random + from scipy.ndimage import gaussian_filter sigma = random.uniform(0, self.sigma) @@ -888,26 +1130,29 @@ def __call__(self, image): dimensions = len(image.shape) - 1 crop = np.random.randint(-self.length, self.length, dimensions) if dimensions == 2: - output = torch.nn.functional.pad(image, (-crop[0], crop[0], -crop[1], crop[1])) + output = torch.nn.functional.pad( + image, (-crop[0], crop[0], -crop[1], crop[1]) + ) elif dimensions == 3: - output = torch.nn.functional.pad(image, (-crop[0], crop[0], -crop[1], crop[1], -crop[2], crop[2])) + output = torch.nn.functional.pad( + image, (-crop[0], crop[0], -crop[1], 
crop[1], -crop[2], crop[2]) + ) else: raise ValueError("RandomCropPad is only available for 2D or 3D data.") return output class GaussianSmoothing(object): - def __init__(self, sigma): self.sigma = sigma def __call__(self, sample): from scipy.ndimage.filters import gaussian_filter - image = sample['image'] + image = sample["image"] np.nan_to_num(image, copy=False) smoothed_image = gaussian_filter(image, sigma=self.sigma) - sample['image'] = smoothed_image + sample["image"] = smoothed_image return sample @@ -939,13 +1184,17 @@ def get_transforms(mode, minmaxnormalization=True, data_augmentation=None): - container transforms.Compose including transforms to apply in train and evaluation mode. - container transforms.Compose including transforms to apply in evaluation mode only. """ - augmentation_dict = {"Noise": RandomNoising(sigma=0.1), - "Erasing": transforms.RandomErasing(), - "CropPad": RandomCropPad(10), - "Smoothing": RandomSmoothing(), - "None": None} + augmentation_dict = { + "Noise": RandomNoising(sigma=0.1), + "Erasing": transforms.RandomErasing(), + "CropPad": RandomCropPad(10), + "Smoothing": RandomSmoothing(), + "None": None, + } if data_augmentation: - augmentation_list = [augmentation_dict[augmentation] for augmentation in data_augmentation] + augmentation_list = [ + augmentation_dict[augmentation] for augmentation in data_augmentation + ] else: augmentation_list = [] @@ -956,9 +1205,11 @@ def get_transforms(mode, minmaxnormalization=True, data_augmentation=None): if mode == "slice": trg_size = (224, 224) - transformations_list += [transforms.ToPILImage(), - transforms.Resize(trg_size), - transforms.ToTensor()] + transformations_list += [ + transforms.ToPILImage(), + transforms.Resize(trg_size), + transforms.ToTensor(), + ] all_transformations = transforms.Compose(transformations_list) train_transformations = transforms.Compose(augmentation_list) @@ -970,16 +1221,25 @@ def get_transforms(mode, minmaxnormalization=True, data_augmentation=None): # tsv 
files loaders ################################ -def load_data(tsv_path, diagnoses_list, - split, n_splits=0, baseline=True, - logger=None, multi_cohort=False): + +def load_data( + tsv_path, + diagnoses_list, + split, + n_splits=0, + baseline=True, + logger=None, + multi_cohort=False, +): if logger is None: logger = logging if multi_cohort: if not tsv_path.endswith(".tsv"): - raise ValueError('If multi_cohort is given, the tsv_path argument should be a path to a TSV file.') + raise ValueError( + "If multi_cohort is given, the tsv_path argument should be a path to a TSV file." + ) else: tsv_df = pd.read_csv(tsv_path, sep="\t") check_multi_cohort_tsv(tsv_df, "labels") @@ -987,43 +1247,59 @@ def load_data(tsv_path, diagnoses_list, valid_df = pd.DataFrame() found_diagnoses = set() for idx in range(len(tsv_df)): - cohort_name = tsv_df.loc[idx, 'cohort'] - cohort_path = tsv_df.loc[idx, 'path'] - cohort_diagnoses = tsv_df.loc[idx, 'diagnoses'].replace(' ', '').split(",") + cohort_name = tsv_df.loc[idx, "cohort"] + cohort_path = tsv_df.loc[idx, "path"] + cohort_diagnoses = ( + tsv_df.loc[idx, "diagnoses"].replace(" ", "").split(",") + ) if bool(set(cohort_diagnoses) & set(diagnoses_list)): target_diagnoses = list(set(cohort_diagnoses) & set(diagnoses_list)) - cohort_train_df, cohort_valid_df = load_data_single(cohort_path, target_diagnoses, split, - n_splits=n_splits, - baseline=baseline, - logger=logger) + cohort_train_df, cohort_valid_df = load_data_single( + cohort_path, + target_diagnoses, + split, + n_splits=n_splits, + baseline=baseline, + logger=logger, + ) cohort_train_df["cohort"] = cohort_name cohort_valid_df["cohort"] = cohort_name train_df = pd.concat([train_df, cohort_train_df]) valid_df = pd.concat([valid_df, cohort_valid_df]) - found_diagnoses = found_diagnoses | (set(cohort_diagnoses) & set(diagnoses_list)) + found_diagnoses = found_diagnoses | ( + set(cohort_diagnoses) & set(diagnoses_list) + ) if found_diagnoses != set(diagnoses_list): - raise 
ValueError(f"The diagnoses found in the multi cohort dataset {found_diagnoses} " - f"do not correspond to the diagnoses wanted {set(diagnoses_list)}.") + raise ValueError( + f"The diagnoses found in the multi cohort dataset {found_diagnoses} " + f"do not correspond to the diagnoses wanted {set(diagnoses_list)}." + ) train_df.reset_index(inplace=True, drop=True) valid_df.reset_index(inplace=True, drop=True) else: if tsv_path.endswith(".tsv"): - raise ValueError('To use multi-cohort framework, please add --multi_cohort flag.') + raise ValueError( + "To use multi-cohort framework, please add --multi_cohort flag." + ) else: - train_df, valid_df = load_data_single(tsv_path, diagnoses_list, split, - n_splits=n_splits, - baseline=baseline, - logger=logger) + train_df, valid_df = load_data_single( + tsv_path, + diagnoses_list, + split, + n_splits=n_splits, + baseline=baseline, + logger=logger, + ) train_df["cohort"] = "single" valid_df["cohort"] = "single" return train_df, valid_df -def load_data_single(train_val_path, diagnoses_list, - split, n_splits=0, baseline=True, - logger=None): +def load_data_single( + train_val_path, diagnoses_list, split, n_splits=0, baseline=True, logger=None +): if logger is None: logger = logging @@ -1032,12 +1308,16 @@ def load_data_single(train_val_path, diagnoses_list, valid_df = pd.DataFrame() if n_splits == 0: - train_path = path.join(train_val_path, 'train') - valid_path = path.join(train_val_path, 'validation') + train_path = path.join(train_val_path, "train") + valid_path = path.join(train_val_path, "validation") else: - train_path = path.join(train_val_path, f'train_splits-{n_splits}', f'split-{split}') - valid_path = path.join(train_val_path, f'validation_splits-{n_splits}', f'split-{split}') + train_path = path.join( + train_val_path, f"train_splits-{n_splits}", f"split-{split}" + ) + valid_path = path.join( + train_val_path, f"validation_splits-{n_splits}", f"split-{split}" + ) logger.debug("Train path %s" % train_path) 
logger.debug("Valid path %s" % valid_path) @@ -1045,16 +1325,14 @@ def load_data_single(train_val_path, diagnoses_list, for diagnosis in diagnoses_list: if baseline: - train_diagnosis_path = path.join( - train_path, diagnosis + '_baseline.tsv') + train_diagnosis_path = path.join(train_path, diagnosis + "_baseline.tsv") else: - train_diagnosis_path = path.join(train_path, diagnosis + '.tsv') + train_diagnosis_path = path.join(train_path, diagnosis + ".tsv") - valid_diagnosis_path = path.join( - valid_path, diagnosis + '_baseline.tsv') + valid_diagnosis_path = path.join(valid_path, diagnosis + "_baseline.tsv") - train_diagnosis_df = pd.read_csv(train_diagnosis_path, sep='\t') - valid_diagnosis_df = pd.read_csv(valid_diagnosis_path, sep='\t') + train_diagnosis_df = pd.read_csv(train_diagnosis_path, sep="\t") + valid_diagnosis_df = pd.read_csv(valid_diagnosis_path, sep="\t") train_df = pd.concat([train_df, train_diagnosis_df]) valid_df = pd.concat([valid_df, valid_diagnosis_df]) @@ -1069,33 +1347,45 @@ def load_data_test(test_path, diagnoses_list, baseline=True, multi_cohort=False) if multi_cohort: if not test_path.endswith(".tsv"): - raise ValueError('If multi_cohort is given, the tsv_path argument should be a path to a TSV file.') + raise ValueError( + "If multi_cohort is given, the tsv_path argument should be a path to a TSV file." 
+ ) else: tsv_df = pd.read_csv(test_path, sep="\t") check_multi_cohort_tsv(tsv_df, "labels") test_df = pd.DataFrame() found_diagnoses = set() for idx in range(len(tsv_df)): - cohort_name = tsv_df.loc[idx, 'cohort'] - cohort_path = tsv_df.loc[idx, 'path'] - cohort_diagnoses = tsv_df.loc[idx, 'diagnoses'].replace(' ', '').split(",") + cohort_name = tsv_df.loc[idx, "cohort"] + cohort_path = tsv_df.loc[idx, "path"] + cohort_diagnoses = ( + tsv_df.loc[idx, "diagnoses"].replace(" ", "").split(",") + ) if bool(set(cohort_diagnoses) & set(diagnoses_list)): target_diagnoses = list(set(cohort_diagnoses) & set(diagnoses_list)) - cohort_test_df = load_data_test_single(cohort_path, target_diagnoses, baseline=baseline) + cohort_test_df = load_data_test_single( + cohort_path, target_diagnoses, baseline=baseline + ) cohort_test_df["cohort"] = cohort_name test_df = pd.concat([test_df, cohort_test_df]) - found_diagnoses = found_diagnoses | (set(cohort_diagnoses) & set(diagnoses_list)) + found_diagnoses = found_diagnoses | ( + set(cohort_diagnoses) & set(diagnoses_list) + ) if found_diagnoses != set(diagnoses_list): - raise ValueError(f"The diagnoses found in the multi cohort dataset {found_diagnoses} " - f"do not correspond to the diagnoses wanted {set(diagnoses_list)}.") + raise ValueError( + f"The diagnoses found in the multi cohort dataset {found_diagnoses} " + f"do not correspond to the diagnoses wanted {set(diagnoses_list)}." + ) test_df.reset_index(inplace=True, drop=True) else: if test_path.endswith(".tsv"): - tsv_df = pd.read_csv(test_path, sep='\t') + tsv_df = pd.read_csv(test_path, sep="\t") multi_col = {"cohort", "path"} if multi_col.issubset(tsv_df.columns.values): - raise ValueError('To use multi-cohort framework, please add --multi_cohort flag.') + raise ValueError( + "To use multi-cohort framework, please add --multi_cohort flag." 
+ ) test_df = load_data_test_single(test_path, diagnoses_list, baseline=baseline) test_df["cohort"] = "single" @@ -1104,13 +1394,17 @@ def load_data_test(test_path, diagnoses_list, baseline=True, multi_cohort=False) def load_data_test_single(test_path, diagnoses_list, baseline=True): - if test_path.endswith('.tsv'): - test_df = pd.read_csv(test_path, sep='\t') + if test_path.endswith(".tsv"): + test_df = pd.read_csv(test_path, sep="\t") if "diagnosis" not in test_df.columns.values: - raise ValueError(f"'diagnosis' column must be present in TSV file {test_path}.") + raise ValueError( + f"'diagnosis' column must be present in TSV file {test_path}." + ) test_df = test_df[test_df.diagnosis.isin(diagnoses_list)] if len(test_df) == 0: - raise ValueError(f"Diagnoses wanted {diagnoses_list} were not found in TSV file {test_path}.") + raise ValueError( + f"Diagnoses wanted {diagnoses_list} were not found in TSV file {test_path}." + ) return test_df test_df = pd.DataFrame() @@ -1118,11 +1412,11 @@ def load_data_test_single(test_path, diagnoses_list, baseline=True): for diagnosis in diagnoses_list: if baseline: - test_diagnosis_path = path.join(test_path, diagnosis + '_baseline.tsv') + test_diagnosis_path = path.join(test_path, diagnosis + "_baseline.tsv") else: - test_diagnosis_path = path.join(test_path, diagnosis + '.tsv') + test_diagnosis_path = path.join(test_path, diagnosis + ".tsv") - test_diagnosis_df = pd.read_csv(test_diagnosis_path, sep='\t') + test_diagnosis_df = pd.read_csv(test_diagnosis_path, sep="\t") test_df = pd.concat([test_df, test_diagnosis_df]) test_df.reset_index(inplace=True, drop=True) @@ -1152,35 +1446,30 @@ def mix_slices(df_training, df_validation, mri_plane=0, val_size=0.15): slices_per_patient = 179 - 40 slice_index = list(np.arange(20, 179 - 20)) - participant_list = list(df_all['participant_id']) - session_list = list(df_all['session_id']) - label_list = list(df_all['diagnosis']) + participant_list = list(df_all["participant_id"]) + 
session_list = list(df_all["session_id"]) + label_list = list(df_all["diagnosis"]) slice_participant_list = [ - ele for ele in participant_list for _ in range(slices_per_patient)] + ele for ele in participant_list for _ in range(slices_per_patient) + ] slice_session_list = [ - ele for ele in session_list for _ in range(slices_per_patient)] - slice_label_list = [ - ele for ele in label_list for _ in range(slices_per_patient)] + ele for ele in session_list for _ in range(slices_per_patient) + ] + slice_label_list = [ele for ele in label_list for _ in range(slices_per_patient)] slice_index_list = slice_index * len(label_list) df_final = pd.DataFrame( - columns=[ - 'participant_id', - 'session_id', - 'slice_id', - 'diagnosis']) - df_final['participant_id'] = np.array(slice_participant_list) - df_final['session_id'] = np.array(slice_session_list) - df_final['slice_id'] = np.array(slice_index_list) - df_final['diagnosis'] = np.array(slice_label_list) + columns=["participant_id", "session_id", "slice_id", "diagnosis"] + ) + df_final["participant_id"] = np.array(slice_participant_list) + df_final["session_id"] = np.array(slice_session_list) + df_final["slice_id"] = np.array(slice_index_list) + df_final["diagnosis"] = np.array(slice_label_list) y = np.array(slice_label_list) # split the train data into training and validation set - skf_2 = StratifiedShuffleSplit( - n_splits=1, - test_size=val_size, - random_state=10000) + skf_2 = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=10000) indices = next(skf_2.split(np.zeros(len(y)), y)) train_ind, valid_ind = indices @@ -1193,7 +1482,7 @@ def mix_slices(df_training, df_validation, mri_plane=0, val_size=0.15): return df_sub_train, df_sub_valid -def generate_sampler(dataset, sampler_option='random'): +def generate_sampler(dataset, sampler_option="random"): """ Returns sampler according to the wanted options @@ -1217,12 +1506,14 @@ def generate_sampler(dataset, sampler_option='random'): key = 
dataset.diagnosis_code[label] weights += [weight_per_class[key]] * dataset.elem_per_image - if sampler_option == 'random': + if sampler_option == "random": return sampler.RandomSampler(weights) - elif sampler_option == 'weighted': + elif sampler_option == "weighted": return sampler.WeightedRandomSampler(weights, len(weights)) else: - raise NotImplementedError(f"The option {sampler_option} for sampler is not implemented") + raise NotImplementedError( + f"The option {sampler_option} for sampler is not implemented" + ) def check_multi_cohort_tsv(tsv_df, purpose): @@ -1231,4 +1522,6 @@ def check_multi_cohort_tsv(tsv_df, purpose): else: mandatory_col = {"cohort", "path", "diagnoses"} if not mandatory_col.issubset(tsv_df.columns.values): - raise ValueError(f'Columns of the TSV file used for {purpose} location must include {mandatory_col}') + raise ValueError( + f"Columns of the TSV file used for {purpose} location must include {mandatory_col}" + ) diff --git a/clinicadl/clinicadl/tools/deep_learning/iotools.py b/clinicadl/clinicadl/tools/deep_learning/iotools.py index aa3295b94..1ae1848f8 100644 --- a/clinicadl/clinicadl/tools/deep_learning/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/iotools.py @@ -38,7 +38,7 @@ def return_logger(verbose, name_fn): return logger -computational_list = ['gpu', 'batch_size', 'num_workers', 'evaluation_steps'] +computational_list = ["gpu", "batch_size", "num_workers", "evaluation_steps"] def write_requirements_version(output_path): @@ -47,11 +47,15 @@ def write_requirements_version(output_path): from warnings import warn try: - env_variables = subprocess.check_output("pip freeze", shell=True).decode("utf-8") + env_variables = subprocess.check_output("pip freeze", shell=True).decode( + "utf-8" + ) with open(path.join(output_path, "environment.txt"), "w") as file: file.write(env_variables) except subprocess.CalledProcessError: - warn("You do not have the right to execute pip freeze. 
Your environment will not be written") + warn( + "You do not have the right to execute pip freeze. Your environment will not be written" + ) def translate_parameters(args): @@ -93,8 +97,8 @@ def translate_parameters(args): def check_and_clean(d): - import shutil import os + import shutil if os.path.exists(d): shutil.rmtree(d) @@ -123,11 +127,11 @@ def commandline_to_json(commandline, logger=None, filename="commandline.json"): commandline_arg_dict = copy(commandline) else: commandline_arg_dict = copy(vars(commandline)) - output_dir = commandline_arg_dict['output_dir'] + output_dir = commandline_arg_dict["output_dir"] os.makedirs(output_dir, exist_ok=True) # remove these entries from the commandline log file - remove_list = ['func', 'output_dir', 'launch_dir', 'name', 'verbose', 'logname'] + remove_list = ["func", "output_dir", "launch_dir", "name", "verbose", "logname"] for variable in remove_list: if variable in commandline_arg_dict: del commandline_arg_dict[variable] @@ -154,8 +158,8 @@ def read_json(options=None, json_path=None, test=False, read_computational=False options (args.Namespace) options of the model updated """ import json - from os import path from argparse import Namespace + from os import path if options is None: options = Namespace() @@ -163,7 +167,7 @@ def read_json(options=None, json_path=None, test=False, read_computational=False evaluation_parameters = ["diagnosis_path", "input_dir", "diagnoses"] prep_compatibility_dict = {"mni": "t1-extensive", "linear": "t1-linear"} if json_path is None: - json_path = path.join(options.model_path, 'commandline.json') + json_path = path.join(options.model_path, "commandline.json") with open(json_path, "r") as f: json_data = json.load(f) @@ -183,14 +187,14 @@ def read_json(options=None, json_path=None, test=False, read_computational=False options.model = options.network del options.network - if not hasattr(options, 'discarded_sliced'): + if not hasattr(options, "discarded_sliced"): options.discarded_slices = 20 
if isinstance(options.preprocessing, str): if options.preprocessing in prep_compatibility_dict.keys(): options.preprocessing = prep_compatibility_dict[options.preprocessing] - if hasattr(options, 'mri_plane'): + if hasattr(options, "mri_plane"): options.slice_direction = options.mri_plane del options.mri_plane @@ -207,13 +211,13 @@ def read_json(options=None, json_path=None, test=False, read_computational=False options.transfer_learning_difference = options.pretrained_difference del options.pretrained_difference - if hasattr(options, 'patch_stride'): + if hasattr(options, "patch_stride"): options.stride_size = options.patch_stride - if hasattr(options, 'use_gpu'): + if hasattr(options, "use_gpu"): options.use_cpu = not options.use_gpu - if hasattr(options, 'mode'): + if hasattr(options, "mode"): if options.mode == "subject": options.mode = "image" if options.mode == "slice" and not hasattr(options, "network_type"): @@ -236,17 +240,17 @@ def read_json(options=None, json_path=None, test=False, read_computational=False if not hasattr(options, "loss"): options.loss = "default" - if not hasattr(options, 'dropout') or options.dropout is None: + if not hasattr(options, "dropout") or options.dropout is None: options.dropout = None set_default_dropout(options) - if not hasattr(options, 'uncropped_roi'): + if not hasattr(options, "uncropped_roi"): options.uncropped_roi = False - if not hasattr(options, 'roi_list'): + if not hasattr(options, "roi_list"): options.roi_list = None - if not hasattr(options, 'multi_cohort'): + if not hasattr(options, "multi_cohort"): options.multi_cohort = False if not hasattr(options, "predict_atlas_intensities"): @@ -279,7 +283,7 @@ def set_default(namespace, default_dict): if not hasattr(namespace, name): setattr(namespace, name, default_value) - filename = 'random_search.json' + filename = "random_search.json" default_values = { "accumulation_steps": 1, @@ -287,7 +291,7 @@ def set_default(namespace, default_dict): "baseline": False, 
"batch_size": 2, "data_augmentation": False, - "diagnoses": ['AD', 'CN'], + "diagnoses": ["AD", "CN"], "dropout": 0, "epochs": 20, "evaluation_steps": 0, @@ -308,28 +312,28 @@ def set_default(namespace, default_dict): "use_cpu": False, "wd_bool": True, "weight_decay": 4, - "sampler": "random" + "sampler": "random", } mode_default_values = { "patch": { "patch_size": 50, "stride_size": 50, "selection_threshold": 0, - "use_extracted_patches": False + "use_extracted_patches": False, }, "roi": { "roi_list": None, "selection_threshold": 0, "uncropped_roi": False, - "use_extracted_roi": False + "use_extracted_roi": False, }, "slice": { "discarded_slices": 20, "selection_threshold": 0, "slice_direction": 0, - "use_extracted_slices": False + "use_extracted_slices": False, }, - "image": {} + "image": {}, } if random_search: default_values["d_reduction"] = "MaxPooling" @@ -339,30 +343,39 @@ def set_default(namespace, default_dict): set_default(options, default_values) - mandatory_arguments = ['network_type', 'mode', - 'tsv_path', 'caps_dir', 'preprocessing'] + mandatory_arguments = [ + "network_type", + "mode", + "tsv_path", + "caps_dir", + "preprocessing", + ] if random_search: - mandatory_arguments += ['n_convblocks', 'first_conv_width', 'n_fcblocks'] + mandatory_arguments += ["n_convblocks", "first_conv_width", "n_fcblocks"] for argument in mandatory_arguments: if not hasattr(options, argument): - raise ValueError(f"The argument {argument} must be specified in {filename}.") + raise ValueError( + f"The argument {argument} must be specified in {filename}." 
+ ) if random_search: for mode, mode_dict in mode_default_values.items(): set_default(options, mode_dict) else: if options.mode not in mode_default_values: - raise NotImplementedError(f"The mode optional arguments corresponding to mode {options.mode}") + raise NotImplementedError( + f"The mode optional arguments corresponding to mode {options.mode}" + ) mode_dict = mode_default_values[options.mode] set_default(options, mode_dict) def set_default_dropout(args): if args.dropout is None: - if args.mode == 'image': + if args.mode == "image": args.dropout = 0.5 - elif args.mode == 'slice': + elif args.mode == "slice": args.dropout = 0.8 else: args.dropout = 0 @@ -370,25 +383,29 @@ def set_default_dropout(args): def memReport(): import gc + import torch cnt_tensor = 0 for obj in gc.get_objects(): - if torch.is_tensor(obj) and (hasattr(obj, 'data') and torch.is_tensor(obj.data)): + if torch.is_tensor(obj) and ( + hasattr(obj, "data") and torch.is_tensor(obj.data) + ): print(type(obj), obj.size(), obj.is_cuda) cnt_tensor += 1 - print('Count: ', cnt_tensor) + print("Count: ", cnt_tensor) def cpuStats(): + import os import sys + import psutil - import os print(sys.version) print(psutil.cpu_percent()) print(psutil.virtual_memory()) # physical memory usage pid = os.getpid() py = psutil.Process(pid) - memoryUse = py.memory_info()[0] / 2. 
** 30 # memory use in GB...I think - print('memory GB:', memoryUse) + memoryUse = py.memory_info()[0] / 2.0 ** 30 # memory use in GB...I think + print("memory GB:", memoryUse) diff --git a/clinicadl/clinicadl/tools/deep_learning/models/__init__.py b/clinicadl/clinicadl/tools/deep_learning/models/__init__.py index 9131fc259..581b6b6cd 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/__init__.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/__init__.py @@ -1,9 +1,9 @@ from .autoencoder import AutoEncoder, initialize_other_autoencoder, transfer_learning +from .image_level import Conv5_FC3, Conv5_FC3_down, Conv5_FC3_mni, Conv6_FC3, VConv5_FC3 from .iotools import load_model, load_optimizer, save_checkpoint -from .image_level import Conv5_FC3, Conv5_FC3_mni, Conv6_FC3, VConv5_FC3, Conv5_FC3_down from .patch_level import Conv4_FC3 -from .slice_level import resnet18, ConvNet from .random import RandomArchitecture +from .slice_level import ConvNet, resnet18 def create_model(options, initial_shape, len_atlas=0): @@ -20,16 +20,23 @@ def create_model(options, initial_shape, len_atlas=0): """ if not hasattr(options, "model"): - model = RandomArchitecture(options.convolutions, options.n_fcblocks, initial_shape, - options.dropout, options.network_normalization, - n_classes=2 + len_atlas) + model = RandomArchitecture( + options.convolutions, + options.n_fcblocks, + initial_shape, + options.dropout, + options.network_normalization, + n_classes=2 + len_atlas, + ) else: try: - model = eval(options.model)(dropout=options.dropout, - n_classes=2 + len_atlas) + model = eval(options.model)( + dropout=options.dropout, n_classes=2 + len_atlas + ) except NameError: raise NotImplementedError( - 'The model wanted %s has not been implemented.' % options.model) + "The model wanted %s has not been implemented." 
% options.model + ) if options.gpu: model.cuda() @@ -47,16 +54,21 @@ def create_autoencoder(options, initial_shape, difference=0): :param initial_shape: (array-like) shape of the input data. :param difference: (int) difference of depth between the pretrained encoder and the new one. :return: (Module) the model object """ - from .autoencoder import AutoEncoder, initialize_other_autoencoder from os import path + from .autoencoder import AutoEncoder, initialize_other_autoencoder + model = create_model(options, initial_shape) decoder = AutoEncoder(model) if options.transfer_learning_path is not None: if path.splitext(options.transfer_learning_path) != ".pth.tar": - raise ValueError("The full path to the model must be given (filename included).") - decoder = initialize_other_autoencoder(decoder, options.transfer_learning_path, difference) + raise ValueError( + "The full path to the model must be given (filename included)." + ) + decoder = initialize_other_autoencoder( + decoder, options.transfer_learning_path, difference + ) return decoder diff --git a/clinicadl/clinicadl/tools/deep_learning/models/autoencoder.py b/clinicadl/clinicadl/tools/deep_learning/models/autoencoder.py index 7e800d447..1a13413c3 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/autoencoder.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/autoencoder.py @@ -1,14 +1,21 @@ # coding: utf8 -from torch import nn -import torch from copy import deepcopy -from .modules import PadMaxPool3d, PadMaxPool2d, CropMaxUnpool3d, CropMaxUnpool2d, Flatten, Reshape +import torch +from torch import nn +from .modules import ( + CropMaxUnpool2d, + CropMaxUnpool3d, + Flatten, + PadMaxPool2d, + PadMaxPool3d, + Reshape, +) -class AutoEncoder(nn.Module): +class AutoEncoder(nn.Module): def __init__(self, model=None): """ Construct an autoencoder from a given CNN. The encoder part corresponds to the convolutional part of the CNN. @@ -16,6 +23,7 @@ def __init__(self, model=None): :param model: (Module) a CNN. 
The convolutional part must be comprised in a 'features' class variable. """ from copy import deepcopy + super(AutoEncoder, self).__init__() self.level = 0 @@ -73,13 +81,24 @@ def construct_inv_layers(self, model): inv_layers = [] for i, layer in enumerate(self.encoder): if isinstance(layer, nn.Conv3d): - inv_layers.append(nn.ConvTranspose3d(layer.out_channels, layer.in_channels, layer.kernel_size, - stride=layer.stride, padding=layer.padding)) + inv_layers.append( + nn.ConvTranspose3d( + layer.out_channels, + layer.in_channels, + layer.kernel_size, + stride=layer.stride, + padding=layer.padding, + ) + ) self.level += 1 elif isinstance(layer, PadMaxPool3d): - inv_layers.append(CropMaxUnpool3d(layer.kernel_size, stride=layer.stride)) + inv_layers.append( + CropMaxUnpool3d(layer.kernel_size, stride=layer.stride) + ) elif isinstance(layer, PadMaxPool2d): - inv_layers.append(CropMaxUnpool2d(layer.kernel_size, stride=layer.stride)) + inv_layers.append( + CropMaxUnpool2d(layer.kernel_size, stride=layer.stride) + ) elif isinstance(layer, nn.Linear): inv_layers.append(nn.Linear(layer.out_features, layer.in_features)) elif isinstance(layer, Flatten): @@ -108,7 +127,10 @@ def replace_relu(inv_layers): idx_relu = idx if idx_conv != -1 and idx_relu != -1: - inv_layers[idx_relu], inv_layers[idx_conv] = inv_layers[idx_conv], inv_layers[idx_relu] + inv_layers[idx_relu], inv_layers[idx_conv] = ( + inv_layers[idx_conv], + inv_layers[idx_relu], + ) idx_conv, idx_relu = -1, -1 # Check if number of features of batch normalization layers is still correct @@ -120,9 +142,15 @@ def replace_relu(inv_layers): return inv_layers -def transfer_learning(model, split, source_path=None, gpu=False, - selection="best_balanced_accuracy", cnn_index=None, - logger=None): +def transfer_learning( + model, + split, + source_path=None, + gpu=False, + selection="best_balanced_accuracy", + cnn_index=None, + logger=None, +): """ Allows transfer learning from a CNN or an autoencoder to a CNN @@ -135,16 
+163,19 @@ def transfer_learning(model, split, source_path=None, gpu=False, :return: (nn.Module) the model after transfer learning. """ import argparse + import logging from os import path + from ..iotools import read_json, translate_parameters - import logging if logger is None: logger = logging if source_path is not None: source_commandline = argparse.Namespace() - source_commandline = read_json(source_commandline, json_path=path.join(source_path, "commandline.json")) + source_commandline = read_json( + source_commandline, json_path=path.join(source_path, "commandline.json") + ) source_commandline = translate_parameters(source_commandline) if source_commandline.mode_task == "autoencoder": logger.info("A pretrained autoencoder is loaded at path %s" % source_path) @@ -152,7 +183,9 @@ def transfer_learning(model, split, source_path=None, gpu=False, else: logger.info("A pretrained CNN is loaded at path %s" % source_path) - model = transfer_cnn_weights(model, source_path, split, selection=selection, cnn_index=cnn_index) + model = transfer_cnn_weights( + model, source_path, split, selection=selection, cnn_index=cnn_index + ) else: logger.info("The model is trained from scratch.") @@ -175,15 +208,17 @@ def transfer_autoencoder_weights(model, source_path, split): :param split: (int) split number to load :return: (str) path to the written weights ready to be loaded """ - from copy import deepcopy import os + from copy import deepcopy if not isinstance(model, AutoEncoder): decoder = AutoEncoder(model) else: decoder = model - model_path = os.path.join(source_path, 'fold-%i' % split, 'models', "best_loss", "model_best.pth.tar") + model_path = os.path.join( + source_path, "fold-%i" % split, "models", "best_loss", "model_best.pth.tar" + ) source_dict = torch.load(model_path) initialize_other_autoencoder(decoder, source_dict) @@ -197,7 +232,9 @@ def transfer_autoencoder_weights(model, source_path, split): return model -def transfer_cnn_weights(model, source_path, split, 
selection="best_balanced_accuracy", cnn_index=None): +def transfer_cnn_weights( + model, source_path, split, selection="best_balanced_accuracy", cnn_index=None +): """ Set the weights of the model according to the CNN at source path. :param model: (Module) the model which must be initialized @@ -209,18 +246,29 @@ def transfer_cnn_weights(model, source_path, split, selection="best_balanced_acc """ import os + import torch if isinstance(model, AutoEncoder): - raise ValueError('Transfer learning from CNN to autoencoder was not implemented.') + raise ValueError( + "Transfer learning from CNN to autoencoder was not implemented." + ) - model_path = os.path.join(source_path, "fold-%i" % split, "models", selection, "model_best.pth.tar") + model_path = os.path.join( + source_path, "fold-%i" % split, "models", selection, "model_best.pth.tar" + ) if cnn_index is not None and not os.path.exists(model_path): print("Transfer learning from multi-CNN, cnn-%i" % cnn_index) - model_path = os.path.join(source_path, "fold_%i" % split, "models", "cnn-%i" % cnn_index, - selection, "model_best.pth.tar") + model_path = os.path.join( + source_path, + "fold_%i" % split, + "models", + "cnn-%i" % cnn_index, + selection, + "model_best.pth.tar", + ) results = torch.load(model_path) - model.load_state_dict(results['model']) + model.load_state_dict(results["model"]) return model @@ -234,33 +282,45 @@ def initialize_other_autoencoder(decoder, source_dict): """ try: - decoder.load_state_dict(source_dict['model']) + decoder.load_state_dict(source_dict["model"]) except RuntimeError: - print("The source and target autoencoders do not have the same size." - "The transfer learning task may not work correctly for custom models.") + print( + "The source and target autoencoders do not have the same size." + "The transfer learning task may not work correctly for custom models." 
+ ) - parameters_dict = source_dict['model'] - difference = find_maximum_layer(decoder.state_dict()) - find_maximum_layer(parameters_dict) + parameters_dict = source_dict["model"] + difference = find_maximum_layer(decoder.state_dict()) - find_maximum_layer( + parameters_dict + ) for key in parameters_dict.keys(): - section, number, spec = key.split('.') + section, number, spec = key.split(".") number = int(number) - if section == 'encoder' and number < len(decoder.encoder): + if section == "encoder" and number < len(decoder.encoder): data = getattr(getattr(decoder, section)[number], spec).data assert data.shape == parameters_dict[key].shape - getattr(getattr(decoder, section)[number], spec).data = parameters_dict[key] - elif section == 'decoder': + getattr(getattr(decoder, section)[number], spec).data = parameters_dict[ + key + ] + elif section == "decoder": # Deeper target autoencoder if difference >= 0: - data = getattr(getattr(decoder, section)[number + difference], spec).data + data = getattr( + getattr(decoder, section)[number + difference], spec + ).data assert data.shape == parameters_dict[key].shape - getattr(getattr(decoder, section)[number + difference], spec).data = parameters_dict[key] + getattr( + getattr(decoder, section)[number + difference], spec + ).data = parameters_dict[key] # More shallow target autoencoder elif difference < 0 and number < len(decoder.decoder): data = getattr(getattr(decoder, section)[number], spec).data - new_key = '.'.join(['decoder', str(number + abs(difference)), spec]) + new_key = ".".join(["decoder", str(number + abs(difference)), spec]) assert data.shape == parameters_dict[new_key].shape - getattr(getattr(decoder, section)[number], spec).data = parameters_dict[new_key] + getattr( + getattr(decoder, section)[number], spec + ).data = parameters_dict[new_key] return decoder diff --git a/clinicadl/clinicadl/tools/deep_learning/models/image_level.py b/clinicadl/clinicadl/tools/deep_learning/models/image_level.py index 
6d0836f20..4a00af373 100755 --- a/clinicadl/clinicadl/tools/deep_learning/models/image_level.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/image_level.py @@ -1,8 +1,9 @@ # coding: utf8 -from .modules import PadMaxPool3d, Flatten -import torch.nn as nn import torch +import torch.nn as nn + +from .modules import Flatten, PadMaxPool3d """ All the architectures are built here @@ -15,6 +16,7 @@ class Conv5_FC3(nn.Module): Image level architecture used on Minimal preprocessing """ + def __init__(self, dropout=0.5, n_classes=2): super(Conv5_FC3, self).__init__() @@ -23,41 +25,32 @@ def __init__(self, dropout=0.5, n_classes=2): nn.BatchNorm3d(8), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(8, 16, 3, padding=1), nn.BatchNorm3d(16), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(16, 32, 3, padding=1), nn.BatchNorm3d(32), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(32, 64, 3, padding=1), nn.BatchNorm3d(64), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(64, 128, 3, padding=1), nn.BatchNorm3d(128), nn.ReLU(), PadMaxPool3d(2, 2), - ) self.classifier = nn.Sequential( Flatten(), nn.Dropout(p=dropout), - nn.Linear(128 * 6 * 7 * 6, 1300), nn.ReLU(), - nn.Linear(1300, 50), nn.ReLU(), - - nn.Linear(50, n_classes) - + nn.Linear(50, n_classes), ) self.flattened_shape = [-1, 128, 6, 7, 6] @@ -75,6 +68,7 @@ class VConv5_FC3(nn.Module): Image level architecture used on Minimal preprocessing """ + def __init__(self, dropout=0.5, n_classes=2): super(VConv5_FC3, self).__init__() @@ -83,45 +77,30 @@ def __init__(self, dropout=0.5, n_classes=2): nn.BatchNorm3d(8), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(8, 16, 3, padding=1), nn.BatchNorm3d(16), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(16, 32, 3, padding=1), nn.BatchNorm3d(32), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(32, 64, 3, padding=1), nn.BatchNorm3d(64), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(64, 128, 3, padding=1), nn.BatchNorm3d(128), nn.ReLU(), PadMaxPool3d(2, 2), - ) - self.fc_mu = nn.Sequential( - Flatten(), - 
nn.Linear(128 * 6 * 7 * 6, 1300) - ) + self.fc_mu = nn.Sequential(Flatten(), nn.Linear(128 * 6 * 7 * 6, 1300)) - self.fc_var = nn.Sequential( - Flatten(), - nn.Linear(128 * 6 * 7 * 6, 1300) - ) + self.fc_var = nn.Sequential(Flatten(), nn.Linear(128 * 6 * 7 * 6, 1300)) self.classifier = nn.Sequential( - nn.Linear(1300, 50), - nn.ReLU(), - - nn.Linear(50, n_classes) - + nn.Linear(1300, 50), nn.ReLU(), nn.Linear(50, n_classes) ) self.flattened_shape = [-1, 128, 6, 7, 6] @@ -148,6 +127,7 @@ class Conv5_FC3_mni(nn.Module): Image level architecture used on Extensive preprocessing """ + def __init__(self, dropout=0.5, n_classes=2): super(Conv5_FC3_mni, self).__init__() @@ -156,41 +136,32 @@ def __init__(self, dropout=0.5, n_classes=2): nn.BatchNorm3d(8), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(8, 16, 3, padding=1), nn.BatchNorm3d(16), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(16, 32, 3, padding=1), nn.BatchNorm3d(32), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(32, 64, 3, padding=1), nn.BatchNorm3d(64), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(64, 128, 3, padding=1), nn.BatchNorm3d(128), nn.ReLU(), PadMaxPool3d(2, 2), - ) self.classifier = nn.Sequential( Flatten(), nn.Dropout(p=dropout), - nn.Linear(128 * 4 * 5 * 4, 1300), nn.ReLU(), - nn.Linear(1300, 50), nn.ReLU(), - - nn.Linear(50, n_classes) - + nn.Linear(50, n_classes), ) self.flattened_shape = [-1, 128, 4, 5, 4] @@ -208,6 +179,7 @@ class Conv6_FC3(nn.Module): Image level architecture used on Minimal preprocessing """ + def __init__(self, dropout=0.5, n_classes=2): super(Conv6_FC3, self).__init__() @@ -216,27 +188,22 @@ def __init__(self, dropout=0.5, n_classes=2): nn.BatchNorm3d(8), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(8, 16, 3, padding=1), nn.BatchNorm3d(16), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(16, 32, 3, padding=1), nn.BatchNorm3d(32), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(32, 64, 3, padding=1), nn.BatchNorm3d(64), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(64, 128, 3, padding=1), 
nn.BatchNorm3d(128), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(128, 256, 3, padding=1), nn.BatchNorm3d(256), nn.ReLU(), @@ -246,15 +213,11 @@ def __init__(self, dropout=0.5, n_classes=2): self.classifier = nn.Sequential( Flatten(), nn.Dropout(p=dropout), - nn.Linear(256 * 3 * 4 * 3, 1000), nn.ReLU(), - nn.Linear(1000, 50), nn.ReLU(), - - nn.Linear(50, n_classes) - + nn.Linear(50, n_classes), ) self.flattened_shape = [-1, 256, 3, 4, 3] @@ -272,6 +235,7 @@ class Conv5_FC3_down(nn.Module): Image level architecture used on Minimal preprocessing """ + def __init__(self, dropout=0.5, n_classes=2): super(Conv5_FC3_down, self).__init__() @@ -280,41 +244,32 @@ def __init__(self, dropout=0.5, n_classes=2): nn.BatchNorm3d(8), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(8, 16, 3, padding=1), nn.BatchNorm3d(16), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(16, 32, 3, padding=1), nn.BatchNorm3d(32), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(32, 64, 3, padding=1), nn.BatchNorm3d(64), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(64, 128, 3, padding=1), nn.BatchNorm3d(128), nn.ReLU(), PadMaxPool3d(2, 2), - ) self.classifier = nn.Sequential( Flatten(), nn.Dropout(p=dropout), - nn.Linear(128 * 3 * 4 * 3, 350), nn.ReLU(), - nn.Linear(350, 25), nn.ReLU(), - - nn.Linear(25, n_classes) - + nn.Linear(25, n_classes), ) self.flattened_shape = [-1, 128, 3, 4, 3] diff --git a/clinicadl/clinicadl/tools/deep_learning/models/iotools.py b/clinicadl/clinicadl/tools/deep_learning/models/iotools.py index 75df1fbe8..0ac2a6930 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/iotools.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/iotools.py @@ -5,12 +5,20 @@ """ -def save_checkpoint(state, accuracy_is_best, loss_is_best, checkpoint_dir, filename='checkpoint.pth.tar', - best_accuracy='best_balanced_accuracy', best_loss='best_loss'): - import torch +def save_checkpoint( + state, + accuracy_is_best, + loss_is_best, + checkpoint_dir, + filename="checkpoint.pth.tar", + 
best_accuracy="best_balanced_accuracy", + best_loss="best_loss", +): import os import shutil + import torch + os.makedirs(checkpoint_dir, exist_ok=True) torch.save(state, os.path.join(checkpoint_dir, filename)) @@ -18,15 +26,21 @@ def save_checkpoint(state, accuracy_is_best, loss_is_best, checkpoint_dir, filen best_accuracy_path = os.path.join(checkpoint_dir, best_accuracy) if not os.path.exists(best_accuracy_path): os.makedirs(best_accuracy_path) - shutil.copyfile(os.path.join(checkpoint_dir, filename), os.path.join(best_accuracy_path, 'model_best.pth.tar')) + shutil.copyfile( + os.path.join(checkpoint_dir, filename), + os.path.join(best_accuracy_path, "model_best.pth.tar"), + ) if loss_is_best: best_loss_path = os.path.join(checkpoint_dir, best_loss) os.makedirs(best_loss_path, exist_ok=True) - shutil.copyfile(os.path.join(checkpoint_dir, filename), os.path.join(best_loss_path, 'model_best.pth.tar')) + shutil.copyfile( + os.path.join(checkpoint_dir, filename), + os.path.join(best_loss_path, "model_best.pth.tar"), + ) -def load_model(model, checkpoint_dir, gpu, filename='model_best.pth.tar'): +def load_model(model, checkpoint_dir, gpu, filename="model_best.pth.tar"): """ Load the weights written in checkpoint_dir in the model object. @@ -36,18 +50,19 @@ def load_model(model, checkpoint_dir, gpu, filename='model_best.pth.tar'): :param filename: (str) Name of the file containing the parameters to loaded. :return: (Module) the update model. 
""" + import os from copy import deepcopy + import torch - import os best_model = deepcopy(model) param_dict = torch.load(os.path.join(checkpoint_dir, filename), map_location="cpu") - best_model.load_state_dict(param_dict['model']) + best_model.load_state_dict(param_dict["model"]) if gpu: best_model = best_model.cuda() - return best_model, param_dict['epoch'] + return best_model, param_dict["epoch"] def load_optimizer(optimizer_path, model): @@ -59,14 +74,17 @@ def load_optimizer(optimizer_path, model): :return: optimizer initialized with specific state and linked to model parameters. """ from os import path + import torch if not path.exists(optimizer_path): - raise ValueError('The optimizer was not found at path %s' % optimizer_path) - print('Loading optimizer') + raise ValueError("The optimizer was not found at path %s" % optimizer_path) + print("Loading optimizer") optimizer_dict = torch.load(optimizer_path) name = optimizer_dict["name"] - optimizer = getattr(torch.optim, name)(filter(lambda x: x.requires_grad, model.parameters())) + optimizer = getattr(torch.optim, name)( + filter(lambda x: x.requires_grad, model.parameters()) + ) optimizer.load_state_dict(optimizer_dict["optimizer"]) return optimizer diff --git a/clinicadl/clinicadl/tools/deep_learning/models/modules.py b/clinicadl/clinicadl/tools/deep_learning/models/modules.py index 135604bc3..d905f9e5a 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/modules.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/modules.py @@ -26,8 +26,7 @@ def __init__(self, kernel_size, stride, return_indices=False, return_pad=False): super(PadMaxPool3d, self).__init__() self.kernel_size = kernel_size self.stride = stride - self.pool = nn.MaxPool3d( - kernel_size, stride, return_indices=return_indices) + self.pool = nn.MaxPool3d(kernel_size, stride, return_indices=return_indices) self.pad = nn.ConstantPad3d(padding=0, value=0) self.return_indices = return_indices self.return_pad = return_pad @@ -38,8 +37,7 @@ 
def set_new_return(self, return_indices=True, return_pad=True): self.pool.return_indices = return_indices def forward(self, f_maps): - coords = [self.stride - - f_maps.size(i + 2) % self.stride for i in range(3)] + coords = [self.stride - f_maps.size(i + 2) % self.stride for i in range(3)] for i, coord in enumerate(coords): if coord == self.stride: coords[i] = 0 @@ -68,8 +66,7 @@ def __init__(self, kernel_size, stride, return_indices=False, return_pad=False): super(PadMaxPool2d, self).__init__() self.kernel_size = kernel_size self.stride = stride - self.pool = nn.MaxPool2d( - kernel_size, stride, return_indices=return_indices) + self.pool = nn.MaxPool2d(kernel_size, stride, return_indices=return_indices) self.pad = nn.ConstantPad2d(padding=0, value=0) self.return_indices = return_indices self.return_pad = return_pad @@ -80,8 +77,7 @@ def set_new_return(self, return_indices=True, return_pad=True): self.pool.return_indices = return_indices def forward(self, f_maps): - coords = [self.stride - - f_maps.size(i + 2) % self.stride for i in range(2)] + coords = [self.stride - f_maps.size(i + 2) % self.stride for i in range(2)] for i, coord in enumerate(coords): if coord == self.stride: coords[i] = 0 diff --git a/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py b/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py index 42ca659f9..c23209459 100755 --- a/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/patch_level.py @@ -4,7 +4,8 @@ Script containing the models for the patch level experiments. 
""" from torch import nn -from .modules import PadMaxPool3d, Flatten + +from .modules import Flatten, PadMaxPool3d class Conv4_FC3(nn.Module): @@ -26,36 +27,29 @@ def __init__(self, dropout=0, n_classes=2): nn.BatchNorm3d(15), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(15, 25, 3), nn.BatchNorm3d(25), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(25, 50, 3), nn.BatchNorm3d(50), nn.ReLU(), PadMaxPool3d(2, 2), - nn.Conv3d(50, 50, 3), nn.BatchNorm3d(50), nn.ReLU(), - PadMaxPool3d(2, 2) - + PadMaxPool3d(2, 2), ) self.classifier = nn.Sequential( # Fully connected layers Flatten(), - nn.Dropout(p=dropout), nn.Linear(50 * 2 * 2 * 2, 50), nn.ReLU(), - nn.Dropout(p=dropout), nn.Linear(50, 40), nn.ReLU(), - - nn.Linear(40, n_classes) + nn.Linear(40, n_classes), ) self.flattened_shape = [-1, 50, 2, 2, 2] diff --git a/clinicadl/clinicadl/tools/deep_learning/models/random.py b/clinicadl/clinicadl/tools/deep_learning/models/random.py index 111c947ed..2063a8256 100644 --- a/clinicadl/clinicadl/tools/deep_learning/models/random.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/random.py @@ -1,9 +1,11 @@ -from .modules import * -import torch.nn as nn -import numpy as np import random from copy import deepcopy +import numpy as np +import torch.nn as nn + +from .modules import * + """ All the architectures are built here """ @@ -23,8 +25,7 @@ def sampling_fn(value, sampling_type): elif sampling_type is "uniform": return random.uniform(*value) else: - raise ValueError( - "Sampling type %s is not implemented" % sampling_type) + raise ValueError("Sampling type %s is not implemented" % sampling_type) else: if sampling_type is "exponent": return 10 ** -value @@ -101,7 +102,7 @@ def random_sampling(rs_options, options): "selection_threshold": "uniform", "slice_direction": "choice", "use_extracted_slices": "fixed", - } + }, } for name, sampling_type in sampling_dict.items(): @@ -110,7 +111,8 @@ def random_sampling(rs_options, options): if options.mode not in additional_mode_dict.keys(): 
raise NotImplementedError( - "Mode %s was not correctly implemented for random search" % options.mode) + "Mode %s was not correctly implemented for random search" % options.mode + ) additional_dict = additional_mode_dict[options.mode] for name, sampling_type in additional_dict.items(): @@ -122,7 +124,8 @@ def random_sampling(rs_options, options): options.weight_decay = 0 options.evaluation_steps = find_evaluation_steps( - options.accumulation_steps, goal=options.evaluation_steps) + options.accumulation_steps, goal=options.evaluation_steps + ) options.convolutions = random_conv_sampling(rs_options) return options @@ -163,14 +166,15 @@ def random_conv_sampling(rs_options): current_out_channels = first_conv_width for i in range(n_convblocks): conv_dict = dict() - conv_dict['in_channels'] = current_in_channels - conv_dict['out_channels'] = current_out_channels + conv_dict["in_channels"] = current_in_channels + conv_dict["out_channels"] = current_out_channels current_in_channels, current_out_channels = update_channels( - current_out_channels, rs_options.channels_limit) - conv_dict['n_conv'] = sampling_fn(rs_options.n_conv, "choice") - conv_dict['d_reduction'] = d_reduction - convolutions['conv' + str(i)] = conv_dict + current_out_channels, rs_options.channels_limit + ) + conv_dict["n_conv"] = sampling_fn(rs_options.n_conv, "choice") + conv_dict["d_reduction"] = d_reduction + convolutions["conv" + str(i)] = conv_dict return convolutions @@ -185,13 +189,20 @@ def update_channels(out_channels, channels_limit=512): class RandomArchitecture(nn.Module): """ - Classifier for a multi-class classification task + Classifier for a multi-class classification task - Initially named Initial_architecture - """ + Initially named Initial_architecture + """ - def __init__(self, convolutions, n_fcblocks, initial_shape, dropout=0.5, network_normalization="BatchNorm", - n_classes=2): + def __init__( + self, + convolutions, + n_fcblocks, + initial_shape, + dropout=0.5, + 
network_normalization="BatchNorm", + n_classes=2, + ): """ Construct the Architecture randomly chosen for Random Search. @@ -214,12 +225,11 @@ def __init__(self, convolutions, n_fcblocks, initial_shape, dropout=0.5, network convolutional_block = self.define_convolutional_block(item) self.features.add_module(key, convolutional_block) - self.classifier = nn.Sequential( - Flatten(), - nn.Dropout(p=dropout)) + self.classifier = nn.Sequential(Flatten(), nn.Dropout(p=dropout)) fc, flattened_shape = self.fc_dict_design( - n_fcblocks, convolutions, initial_shape, n_classes) + n_fcblocks, convolutions, initial_shape, n_classes + ) for key, item in fc.items(): n_fc = int(key[2::]) if n_fc == len(fc) - 1: @@ -231,10 +241,10 @@ def __init__(self, convolutions, n_fcblocks, initial_shape, dropout=0.5, network self.flattened_shape = flattened_shape def __len__(self): - fc_list = [('classifier', 'FC' + str(i)) - for i in range(len(self.classifier) - 2)] - conv_list = [('features', 'conv' + str(i)) - for i in range(len(self.features))] + fc_list = [ + ("classifier", "FC" + str(i)) for i in range(len(self.classifier) - 2) + ] + conv_list = [("features", "conv" + str(i)) for i in range(len(self.features))] return len(conv_list) + len(fc_list) def forward(self, x): @@ -256,32 +266,45 @@ def define_convolutional_block(self, conv_dict): Returns: (nn.Module) a list of modules in a nn.Sequential list """ - in_channels = conv_dict['in_channels'] if conv_dict['in_channels'] is not None else self.first_in_channels - out_channels = conv_dict['out_channels'] + in_channels = ( + conv_dict["in_channels"] + if conv_dict["in_channels"] is not None + else self.first_in_channels + ) + out_channels = conv_dict["out_channels"] conv_block = [] - for i in range(conv_dict['n_conv'] - 1): - conv_block.append(self.layers_dict["Conv"]( - in_channels, in_channels, 3, stride=1, padding=1)) - conv_block = self.append_normalization_layer( - conv_block, in_channels) + for i in range(conv_dict["n_conv"] - 1): + 
conv_block.append( + self.layers_dict["Conv"]( + in_channels, in_channels, 3, stride=1, padding=1 + ) + ) + conv_block = self.append_normalization_layer(conv_block, in_channels) conv_block.append(nn.LeakyReLU()) - if conv_dict['d_reduction'] == "MaxPooling": - conv_block.append(self.layers_dict["Conv"]( - in_channels, out_channels, 3, stride=1, padding=1)) - conv_block = self.append_normalization_layer( - conv_block, out_channels) + if conv_dict["d_reduction"] == "MaxPooling": + conv_block.append( + self.layers_dict["Conv"]( + in_channels, out_channels, 3, stride=1, padding=1 + ) + ) + conv_block = self.append_normalization_layer(conv_block, out_channels) conv_block.append(nn.LeakyReLU()) conv_block.append(self.layers_dict["Pool"](2, 2)) - elif conv_dict['d_reduction'] == "stride": - conv_block.append(self.layers_dict["Conv"]( - in_channels, out_channels, 3, stride=2, padding=1)) - conv_block = self.append_normalization_layer( - conv_block, out_channels) + elif conv_dict["d_reduction"] == "stride": + conv_block.append( + self.layers_dict["Conv"]( + in_channels, out_channels, 3, stride=2, padding=1 + ) + ) + conv_block = self.append_normalization_layer(conv_block, out_channels) conv_block.append(nn.LeakyReLU()) else: - raise ValueError("Dimension reduction %s is not supported. Please only include" - "'MaxPooling' or 'stride' in your sampling options." % conv_dict['d_reduction']) + raise ValueError( + "Dimension reduction %s is not supported. Please only include" + "'MaxPooling' or 'stride' in your sampling options." 
+ % conv_dict["d_reduction"] + ) return nn.Sequential(*conv_block) @@ -298,26 +321,34 @@ def append_normalization_layer(self, conv_block, num_features): if self.network_normalization in ["BatchNorm", "InstanceNorm"]: conv_block.append( - self.layers_dict[self.network_normalization](num_features)) + self.layers_dict[self.network_normalization](num_features) + ) elif self.network_normalization is not None: - raise ValueError("The network normalization %s value must be in ['BatchNorm', 'InstanceNorm', None]" - % self.network_normalization) + raise ValueError( + "The network normalization %s value must be in ['BatchNorm', 'InstanceNorm', None]" + % self.network_normalization + ) return conv_block def return_layers_dict(self): if self.dimension == 3: - layers = {"Conv": nn.Conv3d, - "Pool": PadMaxPool3d, - "InstanceNorm": nn.InstanceNorm3d, - "BatchNorm": nn.BatchNorm3d} + layers = { + "Conv": nn.Conv3d, + "Pool": PadMaxPool3d, + "InstanceNorm": nn.InstanceNorm3d, + "BatchNorm": nn.BatchNorm3d, + } elif self.dimension == 2: - layers = {"Conv": nn.Conv2d, - "Pool": PadMaxPool2d, - "InstanceNorm": nn.InstanceNorm2d, - "BatchNorm": nn.BatchNorm2d} + layers = { + "Conv": nn.Conv2d, + "Pool": PadMaxPool2d, + "InstanceNorm": nn.InstanceNorm2d, + "BatchNorm": nn.BatchNorm2d, + } else: raise ValueError( - "Cannot construct random network in dimension %i" % self.dimension) + "Cannot construct random network in dimension %i" % self.dimension + ) return layers @staticmethod @@ -339,10 +370,7 @@ def define_fc_layer(fc_dict, last_block=False): if last_block: fc_block = [nn.Linear(in_features, out_features)] else: - fc_block = [ - nn.Linear(in_features, out_features), - nn.LeakyReLU() - ] + fc_block = [nn.Linear(in_features, out_features), nn.LeakyReLU()] return nn.Sequential(*fc_block) @@ -357,14 +385,16 @@ def cascading_randomization(self, n, random_model=None): Returns: self """ - fc_list = [('classifier', 'FC' + str(i)) - for i in range(len(self.classifier) - 2)] - conv_list = 
[('features', 'conv' + str(i)) - for i in range(len(self.features))] + fc_list = [ + ("classifier", "FC" + str(i)) for i in range(len(self.classifier) - 2) + ] + conv_list = [("features", "conv" + str(i)) for i in range(len(self.features))] layers_list = conv_list + fc_list if n > len(layers_list): - raise ValueError('The number of randomized layers %i cannot exceed the number of layers of the network %i' - % (n, len(layers_list))) + raise ValueError( + "The number of randomized layers %i cannot exceed the number of layers of the network %i" + % (n, len(layers_list)) + ) for i in range(-n, 0): block, name = layers_list[i] print(block, name) @@ -402,14 +432,16 @@ def fix_first_layers(self, n): Returns: self """ - fc_list = [('classifier', 'FC' + str(i)) - for i in range(len(self.classifier) - 2)] - conv_list = [('features', 'conv' + str(i)) - for i in range(len(self.features))] + fc_list = [ + ("classifier", "FC" + str(i)) for i in range(len(self.classifier) - 2) + ] + conv_list = [("features", "conv" + str(i)) for i in range(len(self.features))] layers_list = conv_list + fc_list if n > len(layers_list): - raise ValueError('The number of randomized layers %i cannot exceed the number of layers of the network %i' - % (n, len(layers_list))) + raise ValueError( + "The number of randomized layers %i cannot exceed the number of layers of the network %i" + % (n, len(layers_list)) + ) for i in range(n): block, name = layers_list[i] print(block, name) @@ -419,7 +451,7 @@ def fix_first_layers(self, n): return self - @ staticmethod + @staticmethod def fc_dict_design(n_fcblocks, convolutions, initial_shape, n_classes=2): """ Sample parameters for a random architecture (FC part). 
@@ -434,9 +466,9 @@ def fc_dict_design(n_fcblocks, convolutions, initial_shape, n_classes=2): (list) the shape of the flattened layer """ n_conv = len(convolutions) - last_conv = convolutions['conv%i' % (len(convolutions) - 1)] - out_channels = last_conv['out_channels'] - flattened_shape = np.ceil(np.array(initial_shape) / 2**n_conv) + last_conv = convolutions["conv%i" % (len(convolutions) - 1)] + out_channels = last_conv["out_channels"] + flattened_shape = np.ceil(np.array(initial_shape) / 2 ** n_conv) flattened_shape[0] = out_channels in_features = np.product(flattened_shape) @@ -448,10 +480,10 @@ def fc_dict_design(n_fcblocks, convolutions, initial_shape, n_classes=2): for i in range(n_fcblocks): fc_dict = dict() out_features = in_features / ratio - fc_dict['in_features'] = int(np.round(in_features)) - fc_dict['out_features'] = int(np.round(out_features)) + fc_dict["in_features"] = int(np.round(in_features)) + fc_dict["out_features"] = int(np.round(out_features)) in_features = out_features - fc['FC' + str(i)] = fc_dict + fc["FC" + str(i)] = fc_dict return fc, flattened_shape diff --git a/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py b/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py index f953d795e..09a62ceb1 100755 --- a/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py +++ b/clinicadl/clinicadl/tools/deep_learning/models/slice_level.py @@ -1,14 +1,14 @@ # coding: utf8 +import math + import torch.utils.model_zoo as model_zoo -from torchvision.models.resnet import BasicBlock from torch import nn -import math +from torchvision.models.resnet import BasicBlock + from .modules import Flatten -model_urls = { - 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth' -} +model_urls = {"resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth"} def resnet18(**kwargs): @@ -20,7 +20,7 @@ def resnet18(**kwargs): """ model = ResNetDesigner(BasicBlock, [2, 2, 2, 2], **kwargs) try: - 
model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) + model.load_state_dict(model_zoo.load_url(model_urls["resnet18"])) except Exception as err: print("Error is:", err) # raise ConnectionError('The URL %s may not be functional anymore. Check if it still exists or ' @@ -37,19 +37,17 @@ def resnet18(**kwargs): p.requires_grad = True # add a fc layer on top of the transfer_learning model and a softmax classifier - model.add_module('drop_out', nn.Dropout(p=kwargs["dropout"])) - model.add_module('fc_out', nn.Linear(1000, kwargs["n_classes"])) + model.add_module("drop_out", nn.Dropout(p=kwargs["dropout"])) + model.add_module("fc_out", nn.Linear(1000, kwargs["n_classes"])) return model class ResNetDesigner(nn.Module): - def __init__(self, block, layers, num_classes=1000, **kwargs): self.inplanes = 64 super(ResNetDesigner, self).__init__() - self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, - bias=False) + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) @@ -63,7 +61,7 @@ def __init__(self, block, layers, num_classes=1000, **kwargs): for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + m.weight.data.normal_(0, math.sqrt(2.0 / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() @@ -72,8 +70,13 @@ def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + ), nn.BatchNorm2d(planes * block.expansion), ) @@ -117,28 +120,24 @@ def __init__(self, dropout=0.5, n_classes=2): nn.LeakyReLU(), nn.MaxPool2d(2, 2), # feature_map : (8@64x64) - nn.Conv2d(8, 16, 3, padding=1), nn.Conv2d(16, 16, 3, padding=1), nn.BatchNorm2d(16), nn.LeakyReLU(), nn.MaxPool2d(2, 2), # feature_map : (16@32x32) - nn.Conv2d(16, 32, 3, padding=1), nn.Conv2d(32, 32, 3, padding=1), nn.BatchNorm2d(32), nn.LeakyReLU(), nn.MaxPool2d(2, 2), # feature_map : (32@16x16) - nn.Conv2d(32, 64, 3, padding=1), nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.LeakyReLU(), nn.MaxPool2d(2, 2), # feature_map : (64@8x8) - nn.Conv2d(64, 64, 3, padding=1), nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), @@ -148,9 +147,7 @@ def __init__(self, dropout=0.5, n_classes=2): ) self.classifier = nn.Sequential( - Flatten(), - nn.Dropout(dropout), - nn.Linear(64 * 4 * 4, n_classes) + Flatten(), nn.Dropout(dropout), nn.Linear(64 * 4 * 4, n_classes) ) def forward(self, x): diff --git a/clinicadl/clinicadl/tools/inputs/filename_types.py b/clinicadl/clinicadl/tools/inputs/filename_types.py index 99b3ebe03..b47bba690 100644 --- a/clinicadl/clinicadl/tools/inputs/filename_types.py +++ b/clinicadl/clinicadl/tools/inputs/filename_types.py @@ -1,14 +1,18 @@ # coding: utf8 -FILENAME_TYPE = {'full': '_T1w_space-MNI152NLin2009cSym_res-1x1x1_T1w', - 'cropped': '_T1w_space-MNI152NLin2009cSym_desc-Crop_res-1x1x1_T1w', - 'downsampled': 
'_T1w_space-MNI152NLin2009cSym_desc-Crop_res-2x2x2_T1w', - 'skull_stripped': '_space-Ixi549Space_desc-skullstripped_T1w', - 'gm_maps': '_T1w_segm-graymatter_space-Ixi549Space_modulated-off_probability', - 'shepplogan': '_phantom-SheppLogan'} +FILENAME_TYPE = { + "full": "_T1w_space-MNI152NLin2009cSym_res-1x1x1_T1w", + "cropped": "_T1w_space-MNI152NLin2009cSym_desc-Crop_res-1x1x1_T1w", + "downsampled": "_T1w_space-MNI152NLin2009cSym_desc-Crop_res-2x2x2_T1w", + "skull_stripped": "_space-Ixi549Space_desc-skullstripped_T1w", + "gm_maps": "_T1w_segm-graymatter_space-Ixi549Space_modulated-off_probability", + "shepplogan": "_phantom-SheppLogan", +} -MASK_PATTERN = {'full': '_res-1x1x1', - 'cropped': '_desc-Crop_res-1x1x1', - 'skull_stripped': '', - 'gm_maps': '', - 'shepplogan': ''} +MASK_PATTERN = { + "full": "_res-1x1x1", + "cropped": "_desc-Crop_res-1x1x1", + "skull_stripped": "", + "gm_maps": "", + "shepplogan": "", +} diff --git a/clinicadl/clinicadl/tools/tsv/data_formatting.py b/clinicadl/clinicadl/tools/tsv/data_formatting.py index 944f42300..a246600d1 100644 --- a/clinicadl/clinicadl/tools/tsv/data_formatting.py +++ b/clinicadl/clinicadl/tools/tsv/data_formatting.py @@ -10,14 +10,22 @@ in the OASIS dataset is not done in this script. Moreover a quality check may be needed at the end of preprocessing pipelines, leading to the removal of some subjects. 
""" -from ..deep_learning.iotools import return_logger, commandline_to_json -from .tsv_utils import neighbour_session, last_session, after_end_screening, find_label, first_session -import pandas as pd -from os import path -from copy import copy -import numpy as np import logging import os +from copy import copy +from os import path + +import numpy as np +import pandas as pd + +from ..deep_learning.iotools import commandline_to_json, return_logger +from .tsv_utils import ( + after_end_screening, + find_label, + first_session, + last_session, + neighbour_session, +) def cleaning_nan_diagnoses(bids_df, logger): @@ -34,32 +42,45 @@ def cleaning_nan_diagnoses(bids_df, logger): bids_copy_df = copy(bids_df) # Look for the diagnosis in another column in ADNI - if 'adni_diagnosis_change' in bids_df.columns: - change_dict = {1: 'CN', 2: 'MCI', 3: 'AD', 4: 'MCI', 5: 'AD', 6: 'AD', 7: 'CN', 8: 'MCI', 9: 'CN', -1: np.nan} + if "adni_diagnosis_change" in bids_df.columns: + change_dict = { + 1: "CN", + 2: "MCI", + 3: "AD", + 4: "MCI", + 5: "AD", + 6: "AD", + 7: "CN", + 8: "MCI", + 9: "CN", + -1: np.nan, + } missing_diag = 0 found_diag = 0 for subject, session in bids_df.index.values: - diagnosis = bids_df.loc[(subject, session), 'diagnosis'] + diagnosis = bids_df.loc[(subject, session), "diagnosis"] if isinstance(diagnosis, float): missing_diag += 1 - change = bids_df.loc[(subject, session), 'adni_diagnosis_change'] + change = bids_df.loc[(subject, session), "adni_diagnosis_change"] if not np.isnan(change) and change != -1: found_diag += 1 - bids_copy_df.loc[(subject, session), 'diagnosis'] = change_dict[change] + bids_copy_df.loc[(subject, session), "diagnosis"] = change_dict[ + change + ] else: missing_diag = 0 found_diag = 0 for subject, session in bids_df.index.values: - diagnosis = bids_df.loc[(subject, session), 'diagnosis'] + diagnosis = bids_df.loc[(subject, session), "diagnosis"] if isinstance(diagnosis, float): missing_diag += 1 - logger.debug('Missing diagnoses: %i' % 
missing_diag) - logger.debug('Missing diagnoses not found: %i' % (missing_diag - found_diag)) + logger.debug("Missing diagnoses: %i" % missing_diag) + logger.debug("Missing diagnoses not found: %i" % (missing_diag - found_diag)) return bids_copy_df @@ -83,7 +104,7 @@ def infer_or_drop_diagnosis(bids_df, logger): session_list = [int(session[5::]) for _, session in subject_df.index.values] for _, session in subject_df.index.values: - diagnosis = subject_df.loc[(subject, session), 'diagnosis'] + diagnosis = subject_df.loc[(subject, session), "diagnosis"] session_nb = int(session[5::]) if isinstance(diagnosis, float): @@ -91,27 +112,41 @@ def infer_or_drop_diagnosis(bids_df, logger): bids_copy_df.drop((subject, session), inplace=True) else: prev_session = neighbour_session(session_nb, session_list, -1) - prev_diagnosis = bids_df.loc[(subject, prev_session), 'diagnosis'] - while isinstance(prev_diagnosis, float) and prev_session != first_session(subject_df): - prev_session = neighbour_session(int(prev_session[5::]), session_list, -1) - prev_diagnosis = bids_df.loc[(subject, prev_session), 'diagnosis'] + prev_diagnosis = bids_df.loc[(subject, prev_session), "diagnosis"] + while isinstance( + prev_diagnosis, float + ) and prev_session != first_session(subject_df): + prev_session = neighbour_session( + int(prev_session[5::]), session_list, -1 + ) + prev_diagnosis = bids_df.loc[ + (subject, prev_session), "diagnosis" + ] post_session = neighbour_session(session_nb, session_list, +1) - post_diagnosis = bids_df.loc[(subject, post_session), 'diagnosis'] - while isinstance(post_diagnosis, float) and post_session != last_session(session_list): - post_session = neighbour_session(int(post_session[5::]), session_list, +1) - post_diagnosis = bids_df.loc[(subject, post_session), 'diagnosis'] + post_diagnosis = bids_df.loc[(subject, post_session), "diagnosis"] + while isinstance( + post_diagnosis, float + ) and post_session != last_session(session_list): + post_session = 
neighbour_session( + int(post_session[5::]), session_list, +1 + ) + post_diagnosis = bids_df.loc[ + (subject, post_session), "diagnosis" + ] if prev_diagnosis == post_diagnosis: found_diag_interpol += 1 - bids_copy_df.loc[(subject, session), 'diagnosis'] = prev_diagnosis + bids_copy_df.loc[ + (subject, session), "diagnosis" + ] = prev_diagnosis else: bids_copy_df.drop((subject, session), inplace=True) - logger.debug('Inferred diagnosis: %i' % found_diag_interpol) + logger.debug("Inferred diagnosis: %i" % found_diag_interpol) return bids_copy_df -def mod_selection(bids_df, missing_mods_dict, mod='t1w'): +def mod_selection(bids_df, missing_mods_dict, mod="t1w"): """ Select only sessions for which the modality is present @@ -136,7 +171,7 @@ def mod_selection(bids_df, missing_mods_dict, mod='t1w'): return bids_copy_df -def stable_selection(bids_df, diagnosis='AD', logger=None): +def stable_selection(bids_df, diagnosis="AD", logger=None): """ Select only subjects whom diagnosis is identical during the whole follow-up. @@ -162,10 +197,12 @@ def stable_selection(bids_df, diagnosis='AD', logger=None): for subject, subject_df in bids_df.groupby(level=0): subject_drop = False try: - diagnosis_bl = subject_df.loc[(subject, 'ses-M00'), 'baseline_diagnosis'] + diagnosis_bl = subject_df.loc[(subject, "ses-M00"), "baseline_diagnosis"] except KeyError: - raise KeyError("The baseline session is necessary for labels selection. It is missing for subject %s" - % subject) + raise KeyError( + "The baseline session is necessary for labels selection. 
It is missing for subject %s" + % subject + ) diagnosis_values = subject_df.diagnosis.values for diagnosis in diagnosis_values: if not isinstance(diagnosis, float): @@ -176,7 +213,7 @@ def stable_selection(bids_df, diagnosis='AD', logger=None): if subject_drop: bids_copy_df.drop(subject, inplace=True) bids_df = copy(bids_copy_df) - logger.debug('Number of unstable subjects dropped: %i' % n_subjects) + logger.debug("Number of unstable subjects dropped: %i" % n_subjects) bids_df = infer_or_drop_diagnosis(bids_df, logger) return bids_df @@ -198,7 +235,7 @@ def mci_stability(bids_df, horizon_time=36, logger=None): logger = logging logger.basicConfig(level=logging.DEBUG) - diagnosis_list = ['MCI', 'EMCI', 'LMCI'] + diagnosis_list = ["MCI", "EMCI", "LMCI"] bids_df = bids_df[(bids_df.baseline_diagnosis.isin(diagnosis_list))] bids_df = cleaning_nan_diagnoses(bids_df, logger) bids_df = infer_or_drop_diagnosis(bids_df, logger) @@ -212,9 +249,13 @@ def mci_stability(bids_df, horizon_time=36, logger=None): diagnosis_list = [] for session in session_list: if session < 10: - diagnosis_list.append(bids_df.loc[(subject, 'ses-M0' + str(session)), 'diagnosis']) + diagnosis_list.append( + bids_df.loc[(subject, "ses-M0" + str(session)), "diagnosis"] + ) else: - diagnosis_list.append(bids_df.loc[(subject, 'ses-M' + str(session)), 'diagnosis']) + diagnosis_list.append( + bids_df.loc[(subject, "ses-M" + str(session)), "diagnosis"] + ) new_diagnosis = diagnosis_list[0] nb_change = 0 @@ -227,63 +268,83 @@ def mci_stability(bids_df, horizon_time=36, logger=None): nb_subjects += 1 bids_copy_df.drop(subject, inplace=True) - logger.debug('Dropped subjects: %i' % nb_subjects) + logger.debug("Dropped subjects: %i" % nb_subjects) bids_df = copy(bids_copy_df) # Stability of sessions - stability_dict = {'CN': 'r', 'MCI': 's', 'AD': 'p'} # Do not take into account the case of missing diag = nan + stability_dict = { + "CN": "r", + "MCI": "s", + "AD": "p", + } # Do not take into account the case of 
missing diag = nan bids_copy_df = copy(bids_df) for subject, subject_df in bids_df.groupby(level=0): session_list = [int(session[5::]) for _, session in subject_df.index.values] # print(subject_df.diagnosis) for _, session in subject_df.index.values: - diagnosis = subject_df.loc[(subject, session), 'diagnosis'] + diagnosis = subject_df.loc[(subject, session), "diagnosis"] # If the diagnosis is not MCI we remove the time point - if diagnosis != 'MCI': + if diagnosis != "MCI": bids_copy_df.drop((subject, session), inplace=True) else: session_nb = int(session[5::]) horizon_session_nb = session_nb + horizon_time - horizon_session = 'ses-M' + str(horizon_session_nb) + horizon_session = "ses-M" + str(horizon_session_nb) # print(session, '-->', horizon_session) if horizon_session_nb in session_list: - horizon_diagnosis = subject_df.loc[(subject, horizon_session), 'diagnosis'] - update_diagnosis = stability_dict[horizon_diagnosis] + 'MCI' + horizon_diagnosis = subject_df.loc[ + (subject, horizon_session), "diagnosis" + ] + update_diagnosis = stability_dict[horizon_diagnosis] + "MCI" # print(horizon_diagnosis, update_diagnosis) - bids_copy_df.loc[(subject, session), 'diagnosis'] = update_diagnosis + bids_copy_df.loc[(subject, session), "diagnosis"] = update_diagnosis else: if after_end_screening(horizon_session_nb, session_list): # Two situations, change in last session AD or CN --> pMCI or rMCI # Last session MCI --> uMCI - last_diagnosis = subject_df.loc[(subject, last_session(session_list)), 'diagnosis'] + last_diagnosis = subject_df.loc[ + (subject, last_session(session_list)), "diagnosis" + ] # This section must be discussed --> removed in Jorge's paper - if last_diagnosis != 'MCI': - update_diagnosis = stability_dict[last_diagnosis] + 'MCI' + if last_diagnosis != "MCI": + update_diagnosis = stability_dict[last_diagnosis] + "MCI" else: - update_diagnosis = 'uMCI' + update_diagnosis = "uMCI" # print(update_diagnosis) - bids_copy_df.loc[(subject, session), 'diagnosis'] = 
update_diagnosis + bids_copy_df.loc[ + (subject, session), "diagnosis" + ] = update_diagnosis else: - prev_session = neighbour_session(horizon_session_nb, session_list, -1) - post_session = neighbour_session(horizon_session_nb, session_list, +1) + prev_session = neighbour_session( + horizon_session_nb, session_list, -1 + ) + post_session = neighbour_session( + horizon_session_nb, session_list, +1 + ) # print('prev_session', prev_session) # print('post_session', post_session) - prev_diagnosis = subject_df.loc[(subject, prev_session), 'diagnosis'] - if prev_diagnosis != 'MCI': - update_diagnosis = stability_dict[prev_diagnosis] + 'MCI' + prev_diagnosis = subject_df.loc[ + (subject, prev_session), "diagnosis" + ] + if prev_diagnosis != "MCI": + update_diagnosis = stability_dict[prev_diagnosis] + "MCI" else: - post_diagnosis = subject_df.loc[(subject, post_session), 'diagnosis'] - if post_diagnosis != 'MCI': - update_diagnosis = 'uMCI' + post_diagnosis = subject_df.loc[ + (subject, post_session), "diagnosis" + ] + if post_diagnosis != "MCI": + update_diagnosis = "uMCI" else: - update_diagnosis = 'sMCI' + update_diagnosis = "sMCI" # print(update_diagnosis) - bids_copy_df.loc[(subject, session), 'diagnosis'] = update_diagnosis + bids_copy_df.loc[ + (subject, session), "diagnosis" + ] = update_diagnosis return bids_copy_df @@ -306,7 +367,7 @@ def diagnosis_removal(MCI_df, diagnosis_list): for subject, subject_df in MCI_df.groupby(level=0): session_list = [int(session[5::]) for _, session in subject_df.index.values] last_session_id = last_session(session_list) - last_diagnosis = subject_df.loc[(subject, last_session_id), 'diagnosis'] + last_diagnosis = subject_df.loc[(subject, last_session_id), "diagnosis"] if last_diagnosis in diagnosis_list: output_df.drop(subject, inplace=True) @@ -328,20 +389,31 @@ def apply_restriction(bids_df, restriction_path): bids_copy_df = copy(bids_df) if restriction_path is not None: - restriction_df = pd.read_csv(restriction_path, sep='\t') + 
restriction_df = pd.read_csv(restriction_path, sep="\t") for subject, session in bids_df.index.values: - subject_qc_df = restriction_df[(restriction_df.participant_id == subject) & (restriction_df.session_id == session)] + subject_qc_df = restriction_df[ + (restriction_df.participant_id == subject) + & (restriction_df.session_id == session) + ] if len(subject_qc_df) != 1: bids_copy_df.drop((subject, session), inplace=True) return bids_copy_df -def get_labels(merged_tsv, missing_mods, results_path, - diagnoses, modality="t1w", restriction_path=None, - time_horizon=36, variables_of_interest=None, - remove_smc=True, verbose=0): +def get_labels( + merged_tsv, + missing_mods, + results_path, + diagnoses, + modality="t1w", + restriction_path=None, + time_horizon=36, + variables_of_interest=None, + remove_smc=True, + verbose=0, +): """ Writes one tsv file per label in diagnoses argument based on merged_tsv and missing_mods. @@ -362,25 +434,28 @@ def get_labels(merged_tsv, missing_mods, results_path, """ logger = return_logger(verbose, "getlabels") - commandline_to_json({ - "output_dir": results_path, - "merged_tsv": merged_tsv, - "missing_mods": missing_mods, - "diagnoses": diagnoses, - "modality": modality, - "restriction_path": restriction_path, - "time_horizon": time_horizon, - "variables_of_interest": variables_of_interest, - "remove_smc": remove_smc - }, filename="getlabels.json") + commandline_to_json( + { + "output_dir": results_path, + "merged_tsv": merged_tsv, + "missing_mods": missing_mods, + "diagnoses": diagnoses, + "modality": modality, + "restriction_path": restriction_path, + "time_horizon": time_horizon, + "variables_of_interest": variables_of_interest, + "remove_smc": remove_smc, + }, + filename="getlabels.json", + ) # Reading files - bids_df = pd.read_csv(merged_tsv, sep='\t') - bids_df.set_index(['participant_id', 'session_id'], inplace=True) + bids_df = pd.read_csv(merged_tsv, sep="\t") + bids_df.set_index(["participant_id", "session_id"], 
inplace=True) variables_list = ["diagnosis"] try: - variables_list.append(find_label(bids_df.columns.values, 'age')) - variables_list.append(find_label(bids_df.columns.values, 'sex')) + variables_list.append(find_label(bids_df.columns.values, "age")) + variables_list.append(find_label(bids_df.columns.values, "sex")) except ValueError: logger.warn("The age or sex values were not found in the dataset.") if variables_of_interest is not None: @@ -392,13 +467,15 @@ def get_labels(merged_tsv, missing_mods, results_path, for file in list_files: filename, fileext = path.splitext(file) - if fileext == '.tsv': - session = filename.split('_')[-1] - missing_mods_df = pd.read_csv(path.join(missing_mods, file), sep='\t') + if fileext == ".tsv": + session = filename.split("_")[-1] + missing_mods_df = pd.read_csv(path.join(missing_mods, file), sep="\t") if len(missing_mods_df) == 0: - raise ValueError("Empty DataFrame at path %s" % path.join(missing_mods, file)) + raise ValueError( + "Empty DataFrame at path %s" % path.join(missing_mods, file) + ) - missing_mods_df.set_index('participant_id', drop=True, inplace=True) + missing_mods_df.set_index("participant_id", drop=True, inplace=True) missing_mods_dict[session] = missing_mods_df # Creating results path @@ -413,84 +490,118 @@ def get_labels(merged_tsv, missing_mods, results_path, # Adding the field baseline_diagnosis bids_copy_df = copy(bids_df) - bids_copy_df['baseline_diagnosis'] = pd.Series(np.zeros(len(bids_df)), index=bids_df.index) + bids_copy_df["baseline_diagnosis"] = pd.Series( + np.zeros(len(bids_df)), index=bids_df.index + ) for subject, subject_df in bids_df.groupby(level=0): - baseline_diagnosis = subject_df.loc[(subject, 'ses-M00'), 'diagnosis'] - bids_copy_df.loc[subject, 'baseline_diagnosis'] = baseline_diagnosis + baseline_diagnosis = subject_df.loc[(subject, "ses-M00"), "diagnosis"] + bids_copy_df.loc[subject, "baseline_diagnosis"] = baseline_diagnosis bids_df = copy(bids_copy_df) time_MCI_df = None - if 'AD' in 
diagnoses: - logger.info('Beginning the selection of AD label') - output_df = stable_selection(bids_df, diagnosis='AD', logger=logger) + if "AD" in diagnoses: + logger.info("Beginning the selection of AD label") + output_df = stable_selection(bids_df, diagnosis="AD", logger=logger) output_df = mod_selection(output_df, missing_mods_dict, modality) output_df = apply_restriction(output_df, restriction_path) diagnosis_df = output_df[variables_list] - diagnosis_df.to_csv(path.join(results_path, 'AD.tsv'), sep='\t') - sub_df = diagnosis_df.reset_index().groupby('participant_id')['session_id'].nunique() - logger.info('Found %s AD subjects for a total of %s sessions\n' % (len(sub_df), len(diagnosis_df))) - - if 'BV' in diagnoses: - logger.info('Beginning the selection of BV label') - output_df = stable_selection(bids_df, diagnosis='BV', logger=logger) + diagnosis_df.to_csv(path.join(results_path, "AD.tsv"), sep="\t") + sub_df = ( + diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique() + ) + logger.info( + "Found %s AD subjects for a total of %s sessions\n" + % (len(sub_df), len(diagnosis_df)) + ) + + if "BV" in diagnoses: + logger.info("Beginning the selection of BV label") + output_df = stable_selection(bids_df, diagnosis="BV", logger=logger) output_df = mod_selection(output_df, missing_mods_dict, modality) output_df = apply_restriction(output_df, restriction_path) diagnosis_df = output_df[variables_list] - diagnosis_df.to_csv(path.join(results_path, 'BV.tsv'), sep='\t') - sub_df = diagnosis_df.reset_index().groupby('participant_id')['session_id'].nunique() - logger.info('Found %s BV subjects for a total of %s sessions\n' % (len(sub_df), len(diagnosis_df))) - - if 'CN' in diagnoses: - logger.info('Beginning the selection of CN label') - output_df = stable_selection(bids_df, diagnosis='CN', logger=logger) + diagnosis_df.to_csv(path.join(results_path, "BV.tsv"), sep="\t") + sub_df = ( + 
diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique() + ) + logger.info( + "Found %s BV subjects for a total of %s sessions\n" + % (len(sub_df), len(diagnosis_df)) + ) + + if "CN" in diagnoses: + logger.info("Beginning the selection of CN label") + output_df = stable_selection(bids_df, diagnosis="CN", logger=logger) output_df = mod_selection(output_df, missing_mods_dict, modality) output_df = apply_restriction(output_df, restriction_path) diagnosis_df = output_df[variables_list] - diagnosis_df.to_csv(path.join(results_path, 'CN.tsv'), sep='\t') - sub_df = diagnosis_df.reset_index().groupby('participant_id')['session_id'].nunique() - logger.info('Found %s CN subjects for a total of %s sessions\n' % (len(sub_df), len(diagnosis_df))) - - if 'MCI' in diagnoses: - logger.info('Beginning of the selection of MCI label') - MCI_df = mci_stability(bids_df, 10 ** 4, logger=logger) # Remove rMCI independently from time horizon - output_df = diagnosis_removal(MCI_df, diagnosis_list=['rMCI']) + diagnosis_df.to_csv(path.join(results_path, "CN.tsv"), sep="\t") + sub_df = ( + diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique() + ) + logger.info( + "Found %s CN subjects for a total of %s sessions\n" + % (len(sub_df), len(diagnosis_df)) + ) + + if "MCI" in diagnoses: + logger.info("Beginning of the selection of MCI label") + MCI_df = mci_stability( + bids_df, 10 ** 4, logger=logger + ) # Remove rMCI independently from time horizon + output_df = diagnosis_removal(MCI_df, diagnosis_list=["rMCI"]) output_df = mod_selection(output_df, missing_mods_dict, modality) output_df = apply_restriction(output_df, restriction_path) # Relabelling everything as MCI - output_df.diagnosis = ['MCI'] * len(output_df) + output_df.diagnosis = ["MCI"] * len(output_df) diagnosis_df = output_df[variables_list] - diagnosis_df.to_csv(path.join(results_path, 'MCI.tsv'), sep='\t') - sub_df = 
diagnosis_df.reset_index().groupby('participant_id')['session_id'].nunique() - logger.info('Found %s MCI subjects for a total of %s sessions\n' % (len(sub_df), len(diagnosis_df))) - - if 'sMCI' in diagnoses: - logger.info('Beginning of the selection of sMCI label') + diagnosis_df.to_csv(path.join(results_path, "MCI.tsv"), sep="\t") + sub_df = ( + diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique() + ) + logger.info( + "Found %s MCI subjects for a total of %s sessions\n" + % (len(sub_df), len(diagnosis_df)) + ) + + if "sMCI" in diagnoses: + logger.info("Beginning of the selection of sMCI label") time_MCI_df = mci_stability(bids_df, time_horizon, logger=logger) - output_df = diagnosis_removal(time_MCI_df, diagnosis_list=['rMCI', 'pMCI']) - output_df = output_df[output_df.diagnosis == 'sMCI'] + output_df = diagnosis_removal(time_MCI_df, diagnosis_list=["rMCI", "pMCI"]) + output_df = output_df[output_df.diagnosis == "sMCI"] output_df = mod_selection(output_df, missing_mods_dict, modality) output_df = apply_restriction(output_df, restriction_path) diagnosis_df = output_df[variables_list] - diagnosis_df.to_csv(path.join(results_path, 'sMCI.tsv'), sep='\t') - sub_df = diagnosis_df.reset_index().groupby('participant_id')['session_id'].nunique() - logger.info('Found %s sMCI subjects for a total of %s sessions\n' % (len(sub_df), len(diagnosis_df))) - - if 'pMCI' in diagnoses: - logger.info('Beginning of the selection of pMCI label') + diagnosis_df.to_csv(path.join(results_path, "sMCI.tsv"), sep="\t") + sub_df = ( + diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique() + ) + logger.info( + "Found %s sMCI subjects for a total of %s sessions\n" + % (len(sub_df), len(diagnosis_df)) + ) + + if "pMCI" in diagnoses: + logger.info("Beginning of the selection of pMCI label") if time_MCI_df is None: time_MCI_df = mci_stability(bids_df, time_horizon) - output_df = time_MCI_df[time_MCI_df.diagnosis == 'pMCI'] + output_df = 
time_MCI_df[time_MCI_df.diagnosis == "pMCI"] output_df = mod_selection(output_df, missing_mods_dict, modality) output_df = apply_restriction(output_df, restriction_path) diagnosis_df = output_df[variables_list] - diagnosis_df.to_csv(path.join(results_path, 'pMCI.tsv'), sep='\t') - sub_df = diagnosis_df.reset_index().groupby('participant_id')['session_id'].nunique() - logger.info('Found %s pMCI subjects for a total of %s sessions\n' % (len(sub_df), len(diagnosis_df))) + diagnosis_df.to_csv(path.join(results_path, "pMCI.tsv"), sep="\t") + sub_df = ( + diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique() + ) + logger.info( + "Found %s pMCI subjects for a total of %s sessions\n" + % (len(sub_df), len(diagnosis_df)) + ) diff --git a/clinicadl/clinicadl/tools/tsv/data_split.py b/clinicadl/clinicadl/tools/tsv/data_split.py index 47baf812d..2e6924a72 100644 --- a/clinicadl/clinicadl/tools/tsv/data_split.py +++ b/clinicadl/clinicadl/tools/tsv/data_split.py @@ -1,25 +1,41 @@ # coding: utf8 -from .tsv_utils import complementary_list, extract_baseline, chi2, category_conversion, remove_unicity, find_label, \ - retrieve_longitudinal, remove_sub_labels -from ..deep_learning.iotools import return_logger, commandline_to_json -from scipy.stats import ttest_ind +import logging +import os import shutil -import pandas as pd from os import path + import numpy as np -import os -import logging +import pandas as pd +from scipy.stats import ttest_ind from sklearn.model_selection import StratifiedShuffleSplit - -sex_dict = {'M': 0, 'F': 1} - - -def create_split(diagnosis, diagnosis_df, split_label, n_test, - p_age_threshold=0.80, p_sex_threshold=0.80, - supplementary_train_df=None, - ignore_demographics=False, logger=None): +from ..deep_learning.iotools import commandline_to_json, return_logger +from .tsv_utils import ( + category_conversion, + chi2, + complementary_list, + extract_baseline, + find_label, + remove_sub_labels, + remove_unicity, + 
retrieve_longitudinal, +) + +sex_dict = {"M": 0, "F": 1} + + +def create_split( + diagnosis, + diagnosis_df, + split_label, + n_test, + p_age_threshold=0.80, + p_sex_threshold=0.80, + supplementary_train_df=None, + ignore_demographics=False, + logger=None, +): """ Split data at the subject-level in training and test set with equivalent age, sex and split_label distributions @@ -61,17 +77,21 @@ def create_split(diagnosis, diagnosis_df, split_label, n_test, n_test = int(n_test * len(baseline_df)) if not {split_label}.issubset(set(baseline_df.columns.values)): - raise ValueError(f"The column {split_label} is missing." - f"Please add it using the --variables_of_interest flag in getlabels.") + raise ValueError( + f"The column {split_label} is missing." + f"Please add it using the --variables_of_interest flag in getlabels." + ) if not ignore_demographics: try: sex_label = find_label(baseline_df.columns.values, "sex") age_label = find_label(baseline_df.columns.values, "age") except ValueError: - raise ValueError("This dataset do not have age or sex values. " - "Please add the flag --ignore_demographics to split " - "without trying to balance age or sex distributions.") + raise ValueError( + "This dataset do not have age or sex values. " + "Please add the flag --ignore_demographics to split " + "without trying to balance age or sex distributions." 
+ ) sex = list(baseline_df[sex_label].values) age = list(baseline_df[age_label].values) @@ -97,7 +117,9 @@ def create_split(diagnosis, diagnosis_df, split_label, n_test, if len(set(sex)) != 1: sex_test = [sex_dict[sex[idx]] for idx in test_index] - sex_train = [sex_dict[sex[idx]] for idx in train_index] + sup_train_sex + sex_train = [ + sex_dict[sex[idx]] for idx in train_index + ] + sup_train_sex _, p_sex = chi2(sex_test, sex_train) else: p_sex = 1 @@ -113,7 +135,9 @@ def create_split(diagnosis, diagnosis_df, split_label, n_test, train_df.reset_index(drop=True, inplace=True) n_try += 1 - logger.info("Split for diagnosis %s was found after %i trials" % (diagnosis, n_try)) + logger.info( + "Split for diagnosis %s was found after %i trials" % (diagnosis, n_try) + ) else: idx = np.arange(len(baseline_df)) @@ -126,9 +150,17 @@ def create_split(diagnosis, diagnosis_df, split_label, n_test, return train_df, test_df -def split_diagnoses(formatted_data_path, n_test=100, subset_name="test", MCI_sub_categories=True, - p_age_threshold=0.80, p_sex_threshold=0.80, categorical_split_variable=None, - ignore_demographics=False, verbose=0): +def split_diagnoses( + formatted_data_path, + n_test=100, + subset_name="test", + MCI_sub_categories=True, + p_age_threshold=0.80, + p_sex_threshold=0.80, + categorical_split_variable=None, + ignore_demographics=False, + verbose=0, +): """ Performs a single split for each label independently on the subject level. 
The train folder will contain two lists per diagnosis (baseline and longitudinal), @@ -159,21 +191,24 @@ def split_diagnoses(formatted_data_path, n_test=100, subset_name="test", MCI_sub """ logger = return_logger(verbose, "split") - commandline_to_json({ - "output_dir": formatted_data_path, - "n_test": n_test, - "subset_name": subset_name, - "MCI_sub_categories": MCI_sub_categories, - "p_age_threshold": p_age_threshold, - "p_sex_threshold": p_sex_threshold, - "categorical_split_variable": categorical_split_variable, - "ignore_demographics": ignore_demographics - }, filename="split.json") + commandline_to_json( + { + "output_dir": formatted_data_path, + "n_test": n_test, + "subset_name": subset_name, + "MCI_sub_categories": MCI_sub_categories, + "p_age_threshold": p_age_threshold, + "p_sex_threshold": p_sex_threshold, + "categorical_split_variable": categorical_split_variable, + "ignore_demographics": ignore_demographics, + }, + filename="split.json", + ) # Read files results_path = formatted_data_path - train_path = path.join(results_path, 'train') + train_path = path.join(results_path, "train") if path.exists(train_path): shutil.rmtree(train_path) if n_test > 0: @@ -188,53 +223,75 @@ def split_diagnoses(formatted_data_path, n_test=100, subset_name="test", MCI_sub os.makedirs(test_path) diagnosis_df_paths = os.listdir(results_path) - diagnosis_df_paths = [x for x in diagnosis_df_paths if x.endswith('.tsv')] - diagnosis_df_paths = [x for x in diagnosis_df_paths if not x.endswith('_baseline.tsv')] + diagnosis_df_paths = [x for x in diagnosis_df_paths if x.endswith(".tsv")] + diagnosis_df_paths = [ + x for x in diagnosis_df_paths if not x.endswith("_baseline.tsv") + ] MCI_special_treatment = False - if 'MCI.tsv' in diagnosis_df_paths and n_test > 0: + if "MCI.tsv" in diagnosis_df_paths and n_test > 0: if MCI_sub_categories: - diagnosis_df_paths.remove('MCI.tsv') + diagnosis_df_paths.remove("MCI.tsv") MCI_special_treatment = True - elif 'sMCI.tsv' in diagnosis_df_paths 
or 'pMCI.tsv' in diagnosis_df_paths: - logger.warning("MCI special treatment was deactivated though MCI subgroups were found." - "Be aware that it may cause data leakage in transfer learning tasks.") + elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths: + logger.warning( + "MCI special treatment was deactivated though MCI subgroups were found." + "Be aware that it may cause data leakage in transfer learning tasks." + ) # The baseline session must be kept before or we are taking all the sessions to mix them for diagnosis_df_path in diagnosis_df_paths: - diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path), - sep='\t') + diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path), sep="\t") interest_columns = diagnosis_df.columns.values - diagnosis = diagnosis_df_path.split('.')[0] + diagnosis = diagnosis_df_path.split(".")[0] logger.info(f"Running split for diagnosis {diagnosis}") if n_test > 0: - train_df, test_df = create_split(diagnosis, diagnosis_df, categorical_split_variable, n_test=n_test, - p_age_threshold=p_age_threshold, - p_sex_threshold=p_sex_threshold, - ignore_demographics=ignore_demographics, - logger=logger) + train_df, test_df = create_split( + diagnosis, + diagnosis_df, + categorical_split_variable, + n_test=n_test, + p_age_threshold=p_age_threshold, + p_sex_threshold=p_sex_threshold, + ignore_demographics=ignore_demographics, + logger=logger, + ) # Save baseline splits - train_df.to_csv(path.join(train_path, f'{diagnosis}_baseline.tsv'), sep='\t', index=False) - test_df.to_csv(path.join(test_path, f'{diagnosis}_baseline.tsv'), sep='\t', index=False) + train_df.to_csv( + path.join(train_path, f"{diagnosis}_baseline.tsv"), + sep="\t", + index=False, + ) + test_df.to_csv( + path.join(test_path, f"{diagnosis}_baseline.tsv"), sep="\t", index=False + ) long_train_df = retrieve_longitudinal(train_df, diagnosis_df) - long_train_df.to_csv(path.join(train_path, f'{diagnosis}.tsv'), sep='\t', index=False) + 
long_train_df.to_csv( + path.join(train_path, f"{diagnosis}.tsv"), sep="\t", index=False + ) long_test_df = retrieve_longitudinal(test_df, diagnosis_df) - long_test_df.to_csv(path.join(test_path, f'{diagnosis}.tsv'), sep='\t', index=False) + long_test_df.to_csv( + path.join(test_path, f"{diagnosis}.tsv"), sep="\t", index=False + ) else: baseline_df = extract_baseline(diagnosis_df, diagnosis) test_df = baseline_df[interest_columns] - test_df.to_csv(path.join(test_path, f'{diagnosis}_baseline.tsv'), sep='\t', index=False) + test_df.to_csv( + path.join(test_path, f"{diagnosis}_baseline.tsv"), sep="\t", index=False + ) long_test_df = retrieve_longitudinal(test_df, diagnosis_df) - long_test_df.to_csv(path.join(test_path, f'{diagnosis}.tsv'), sep='\t', index=False) + long_test_df.to_csv( + path.join(test_path, f"{diagnosis}.tsv"), sep="\t", index=False + ) if MCI_special_treatment: # Extraction of MCI subjects without intersection with the sMCI / pMCI train - diagnosis_df = pd.read_csv(path.join(results_path, 'MCI.tsv'), sep='\t') - MCI_df = diagnosis_df.set_index(['participant_id', 'session_id']) + diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"), sep="\t") + MCI_df = diagnosis_df.set_index(["participant_id", "session_id"]) baseline_df = extract_baseline(MCI_df, set_index=False, diagnosis="MCI") if n_test > 1: @@ -242,38 +299,58 @@ def split_diagnoses(formatted_data_path, n_test=100, subset_name="test", MCI_sub else: n_test = int(n_test * len(baseline_df)) - MCI_df, supplementary_diagnoses = remove_sub_labels(MCI_df, ["sMCI", "pMCI"], - diagnosis_df_paths, results_path, - logger=logger) + MCI_df, supplementary_diagnoses = remove_sub_labels( + MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path, logger=logger + ) if len(supplementary_diagnoses) == 0: - raise ValueError('The MCI_sub_categories flag is not needed as there are no intersections with' - 'MCI subcategories.') + raise ValueError( + "The MCI_sub_categories flag is not needed as there are no 
intersections with" + "MCI subcategories." + ) # Construction of supplementary train supplementary_train_df = pd.DataFrame() for diagnosis in supplementary_diagnoses: - sup_baseline_train_df = pd.read_csv(path.join(train_path, f'{diagnosis}_baseline.tsv'), sep='\t') - supplementary_train_df = pd.concat([supplementary_train_df, sup_baseline_train_df]) - sub_df = supplementary_train_df.reset_index().groupby('participant_id')['session_id'].nunique() - logger.debug(f'supplementary_train_df {len(sub_df)} subjects, {len(supplementary_diagnoses)} scans') + sup_baseline_train_df = pd.read_csv( + path.join(train_path, f"{diagnosis}_baseline.tsv"), sep="\t" + ) + supplementary_train_df = pd.concat( + [supplementary_train_df, sup_baseline_train_df] + ) + sub_df = ( + supplementary_train_df.reset_index() + .groupby("participant_id")["session_id"] + .nunique() + ) + logger.debug( + f"supplementary_train_df {len(sub_df)} subjects, {len(supplementary_diagnoses)} scans" + ) supplementary_train_df.reset_index(drop=True, inplace=True) # MCI selection MCI_df.reset_index(inplace=True) - baseline_df = extract_baseline(MCI_df, 'MCI') - - train_df, test_df = create_split('MCI', baseline_df, categorical_split_variable, - n_test=n_test, p_age_threshold=p_age_threshold, - p_sex_threshold=p_sex_threshold, ignore_demographics=ignore_demographics, - logger=logger, - supplementary_train_df=supplementary_train_df) + baseline_df = extract_baseline(MCI_df, "MCI") + + train_df, test_df = create_split( + "MCI", + baseline_df, + categorical_split_variable, + n_test=n_test, + p_age_threshold=p_age_threshold, + p_sex_threshold=p_sex_threshold, + ignore_demographics=ignore_demographics, + logger=logger, + supplementary_train_df=supplementary_train_df, + ) # Write selection of MCI - train_df.to_csv(path.join(train_path, 'MCI_baseline.tsv'), sep='\t', index=False) - test_df.to_csv(path.join(test_path, 'MCI_baseline.tsv'), sep='\t', index=False) + train_df.to_csv( + path.join(train_path, 
"MCI_baseline.tsv"), sep="\t", index=False + ) + test_df.to_csv(path.join(test_path, "MCI_baseline.tsv"), sep="\t", index=False) long_train_df = retrieve_longitudinal(train_df, diagnosis_df) - long_train_df.to_csv(path.join(train_path, 'MCI.tsv'), sep='\t', index=False) + long_train_df.to_csv(path.join(train_path, "MCI.tsv"), sep="\t", index=False) long_test_df = retrieve_longitudinal(test_df, diagnosis_df) - long_test_df.to_csv(path.join(test_path, 'MCI.tsv'), sep='\t', index=False) + long_test_df.to_csv(path.join(test_path, "MCI.tsv"), sep="\t", index=False) diff --git a/clinicadl/clinicadl/tools/tsv/demographics_analysis.py b/clinicadl/clinicadl/tools/tsv/demographics_analysis.py index 16741b425..eb909a62a 100644 --- a/clinicadl/clinicadl/tools/tsv/demographics_analysis.py +++ b/clinicadl/clinicadl/tools/tsv/demographics_analysis.py @@ -1,12 +1,14 @@ # coding: utf-8 -import pandas as pd -from .tsv_utils import first_session, next_session, add_demographics, find_label import os from os import path -import numpy as np from warnings import warn +import numpy as np +import pandas as pd + +from .tsv_utils import add_demographics, find_label, first_session, next_session + def demographics_analysis(merged_tsv, formatted_data_path, results_path, diagnoses): """ @@ -24,94 +26,158 @@ def demographics_analysis(merged_tsv, formatted_data_path, results_path, diagnos demographic analysis of the tsv files in formatted_data_path. 
""" - merged_df = pd.read_csv(merged_tsv, sep='\t') - merged_df.set_index(['participant_id', 'session_id'], inplace=True) + merged_df = pd.read_csv(merged_tsv, sep="\t") + merged_df.set_index(["participant_id", "session_id"], inplace=True) parent_directory = path.abspath(path.join(results_path, os.pardir)) os.makedirs(parent_directory, exist_ok=True) - fields_dict = {'age': find_label(merged_df.columns.values, 'age'), - 'sex': find_label(merged_df.columns.values, 'sex'), - 'MMSE': find_label(merged_df.columns.values, 'mms'), - 'CDR': 'cdr_global'} - - columns = ['n_subjects', 'mean_age', 'std_age', - 'min_age', 'max_age', 'sexF', - 'sexM', 'mean_MMSE', 'std_MMSE', - 'min_MMSE', 'max_MMSE', 'CDR_0', - 'CDR_0.5', 'CDR_1', 'CDR_2', - 'CDR_3', 'mean_scans', 'std_scans', - 'n_scans'] - results_df = pd.DataFrame(index=diagnoses, columns=columns, data=np.zeros((len(diagnoses), len(columns)))) + fields_dict = { + "age": find_label(merged_df.columns.values, "age"), + "sex": find_label(merged_df.columns.values, "sex"), + "MMSE": find_label(merged_df.columns.values, "mms"), + "CDR": "cdr_global", + } + + columns = [ + "n_subjects", + "mean_age", + "std_age", + "min_age", + "max_age", + "sexF", + "sexM", + "mean_MMSE", + "std_MMSE", + "min_MMSE", + "max_MMSE", + "CDR_0", + "CDR_0.5", + "CDR_1", + "CDR_2", + "CDR_3", + "mean_scans", + "std_scans", + "n_scans", + ] + results_df = pd.DataFrame( + index=diagnoses, columns=columns, data=np.zeros((len(diagnoses), len(columns))) + ) # Need all values for mean and variance (age, MMSE and scans) diagnosis_dict = dict.fromkeys(diagnoses) for diagnosis in diagnoses: - diagnosis_dict[diagnosis] = {'age': [], 'MMSE': [], 'scans': []} - diagnosis_path = path.join(formatted_data_path, diagnosis + '.tsv') + diagnosis_dict[diagnosis] = {"age": [], "MMSE": [], "scans": []} + diagnosis_path = path.join(formatted_data_path, diagnosis + ".tsv") if not path.exists(diagnosis_path): - print("TSV file with all sessions was not found for diagnosis %s. 
" - "Loads baseline version instead." % diagnosis) - diagnosis_path = path.join(formatted_data_path, diagnosis + '_baseline.tsv') - diagnosis_df = pd.read_csv(diagnosis_path, sep='\t') + print( + "TSV file with all sessions was not found for diagnosis %s. " + "Loads baseline version instead." % diagnosis + ) + diagnosis_path = path.join(formatted_data_path, diagnosis + "_baseline.tsv") + diagnosis_df = pd.read_csv(diagnosis_path, sep="\t") diagnosis_demographics_df = add_demographics(diagnosis_df, merged_df, diagnosis) - diagnosis_demographics_df.set_index(['participant_id', 'session_id'], inplace=True) - diagnosis_df.set_index(['participant_id', 'session_id'], inplace=True) + diagnosis_demographics_df.set_index( + ["participant_id", "session_id"], inplace=True + ) + diagnosis_df.set_index(["participant_id", "session_id"], inplace=True) for subject, subject_df in diagnosis_df.groupby(level=0): first_session_id = first_session(subject_df) - feature_absence = isinstance(merged_df.loc[(subject, first_session_id), 'diagnosis'], float) + feature_absence = isinstance( + merged_df.loc[(subject, first_session_id), "diagnosis"], float + ) while feature_absence: first_session_id = next_session(subject_df, first_session_id) - feature_absence = isinstance(merged_df.loc[(subject, first_session_id), 'diagnosis'], float) + feature_absence = isinstance( + merged_df.loc[(subject, first_session_id), "diagnosis"], float + ) demographics_subject_df = merged_df.loc[subject] # Extract features - results_df.loc[diagnosis, 'n_subjects'] += 1 - results_df.loc[diagnosis, 'n_scans'] += len(subject_df) - diagnosis_dict[diagnosis]['age'].append( - merged_df.loc[(subject, first_session_id), fields_dict['age']]) - diagnosis_dict[diagnosis]['MMSE'].append( - merged_df.loc[(subject, first_session_id), fields_dict['MMSE']]) - diagnosis_dict[diagnosis]['scans'].append(len(subject_df)) - sexF = len(demographics_subject_df[(demographics_subject_df[fields_dict['sex']].isin(['F']))]) > 0 - sexM = 
len(demographics_subject_df[(demographics_subject_df[fields_dict['sex']].isin(['M']))]) > 0 + results_df.loc[diagnosis, "n_subjects"] += 1 + results_df.loc[diagnosis, "n_scans"] += len(subject_df) + diagnosis_dict[diagnosis]["age"].append( + merged_df.loc[(subject, first_session_id), fields_dict["age"]] + ) + diagnosis_dict[diagnosis]["MMSE"].append( + merged_df.loc[(subject, first_session_id), fields_dict["MMSE"]] + ) + diagnosis_dict[diagnosis]["scans"].append(len(subject_df)) + sexF = ( + len( + demographics_subject_df[ + (demographics_subject_df[fields_dict["sex"]].isin(["F"])) + ] + ) + > 0 + ) + sexM = ( + len( + demographics_subject_df[ + (demographics_subject_df[fields_dict["sex"]].isin(["M"])) + ] + ) + > 0 + ) if sexF: - results_df.loc[diagnosis, 'sexF'] += 1 + results_df.loc[diagnosis, "sexF"] += 1 elif sexM: - results_df.loc[diagnosis, 'sexM'] += 1 + results_df.loc[diagnosis, "sexM"] += 1 else: - raise ValueError('Patient %s has no sex' % subject) + raise ValueError("Patient %s has no sex" % subject) - cdr = merged_df.at[(subject, first_session_id), fields_dict['CDR']] + cdr = merged_df.at[(subject, first_session_id), fields_dict["CDR"]] if cdr == 0: - results_df.loc[diagnosis, 'CDR_0'] += 1 + results_df.loc[diagnosis, "CDR_0"] += 1 elif cdr == 0.5: - results_df.loc[diagnosis, 'CDR_0.5'] += 1 + results_df.loc[diagnosis, "CDR_0.5"] += 1 elif cdr == 1: - results_df.loc[diagnosis, 'CDR_1'] += 1 + results_df.loc[diagnosis, "CDR_1"] += 1 elif cdr == 2: - results_df.loc[diagnosis, 'CDR_2'] += 1 + results_df.loc[diagnosis, "CDR_2"] += 1 elif cdr == 3: - results_df.loc[diagnosis, 'CDR_3'] += 1 + results_df.loc[diagnosis, "CDR_3"] += 1 else: - warn(f'Patient {subject} has CDR {cdr}') + warn(f"Patient {subject} has CDR {cdr}") for diagnosis in diagnoses: - results_df.loc[diagnosis, 'mean_age'] = np.nanmean(diagnosis_dict[diagnosis]['age']) - results_df.loc[diagnosis, 'std_age'] = np.nanstd(diagnosis_dict[diagnosis]['age']) - results_df.loc[diagnosis, 'min_age'] = 
np.nanmin(diagnosis_dict[diagnosis]['age']) - results_df.loc[diagnosis, 'max_age'] = np.nanmax(diagnosis_dict[diagnosis]['age']) - results_df.loc[diagnosis, 'mean_MMSE'] = np.nanmean(diagnosis_dict[diagnosis]['MMSE']) - results_df.loc[diagnosis, 'std_MMSE'] = np.nanstd(diagnosis_dict[diagnosis]['MMSE']) - results_df.loc[diagnosis, 'min_MMSE'] = np.nanmin(diagnosis_dict[diagnosis]['MMSE']) - results_df.loc[diagnosis, 'max_MMSE'] = np.nanmax(diagnosis_dict[diagnosis]['MMSE']) - results_df.loc[diagnosis, 'mean_scans'] = np.nanmean(diagnosis_dict[diagnosis]['scans']) - results_df.loc[diagnosis, 'std_scans'] = np.nanstd(diagnosis_dict[diagnosis]['scans']) + results_df.loc[diagnosis, "mean_age"] = np.nanmean( + diagnosis_dict[diagnosis]["age"] + ) + results_df.loc[diagnosis, "std_age"] = np.nanstd( + diagnosis_dict[diagnosis]["age"] + ) + results_df.loc[diagnosis, "min_age"] = np.nanmin( + diagnosis_dict[diagnosis]["age"] + ) + results_df.loc[diagnosis, "max_age"] = np.nanmax( + diagnosis_dict[diagnosis]["age"] + ) + results_df.loc[diagnosis, "mean_MMSE"] = np.nanmean( + diagnosis_dict[diagnosis]["MMSE"] + ) + results_df.loc[diagnosis, "std_MMSE"] = np.nanstd( + diagnosis_dict[diagnosis]["MMSE"] + ) + results_df.loc[diagnosis, "min_MMSE"] = np.nanmin( + diagnosis_dict[diagnosis]["MMSE"] + ) + results_df.loc[diagnosis, "max_MMSE"] = np.nanmax( + diagnosis_dict[diagnosis]["MMSE"] + ) + results_df.loc[diagnosis, "mean_scans"] = np.nanmean( + diagnosis_dict[diagnosis]["scans"] + ) + results_df.loc[diagnosis, "std_scans"] = np.nanstd( + diagnosis_dict[diagnosis]["scans"] + ) for key in diagnosis_dict[diagnosis]: if np.isnan(diagnosis_dict[diagnosis][key]).any(): - warn(f"NaN values were found for {key} values associated to diagnosis {diagnosis}") + warn( + f"NaN values were found for {key} values associated to diagnosis {diagnosis}" + ) results_df.index.name = "diagnosis" - results_df.to_csv(results_path, sep='\t') + results_df.to_csv(results_path, sep="\t") diff --git 
a/clinicadl/clinicadl/tools/tsv/kfold_split.py b/clinicadl/clinicadl/tools/tsv/kfold_split.py index 28229b381..0f5a6d3b4 100644 --- a/clinicadl/clinicadl/tools/tsv/kfold_split.py +++ b/clinicadl/clinicadl/tools/tsv/kfold_split.py @@ -1,19 +1,28 @@ # coding: utf8 -from .tsv_utils import extract_baseline, retrieve_longitudinal, remove_sub_labels -from ..deep_learning.iotools import return_logger, commandline_to_json +import os import shutil -from sklearn.model_selection import StratifiedKFold from os import path -import os -import pandas as pd + import numpy as np +import pandas as pd +from sklearn.model_selection import StratifiedKFold + +from ..deep_learning.iotools import commandline_to_json, return_logger +from .tsv_utils import extract_baseline, remove_sub_labels, retrieve_longitudinal -sex_dict = {'M': 0, 'F': 1} +sex_dict = {"M": 0, "F": 1} -def write_splits(diagnosis, diagnosis_df, split_label, n_splits, - train_path, test_path, supplementary_diagnoses=None): +def write_splits( + diagnosis, + diagnosis_df, + split_label, + n_splits, + train_path, + test_path, + supplementary_diagnoses=None, +): """ Split data at the subject-level in training and test to have equivalent distributions in split_label. 
@@ -44,7 +53,7 @@ def write_splits(diagnosis, diagnosis_df, split_label, n_splits, splits = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2) for i, indices in enumerate(splits.split(np.zeros(len(y)), y)): - print(f'Split {i}') + print(f"Split {i}") train_index, test_index = indices test_df = baseline_df.iloc[test_index] @@ -52,26 +61,55 @@ def write_splits(diagnosis, diagnosis_df, split_label, n_splits, if supplementary_diagnoses is not None: for supplementary_diagnosis in supplementary_diagnoses: - sup_train_df = pd.read_csv(path.join(train_path, f'split-{i}', - f'{supplementary_diagnosis}_baseline.tsv'), sep='\t') + sup_train_df = pd.read_csv( + path.join( + train_path, + f"split-{i}", + f"{supplementary_diagnosis}_baseline.tsv", + ), + sep="\t", + ) train_df = pd.concat([train_df, sup_train_df]) - sup_test_df = pd.read_csv(path.join(test_path, f'split-{i}', - f'{supplementary_diagnosis}_baseline.tsv'), sep='\t') + sup_test_df = pd.read_csv( + path.join( + test_path, + f"split-{i}", + f"{supplementary_diagnosis}_baseline.tsv", + ), + sep="\t", + ) test_df = pd.concat([test_df, sup_test_df]) train_df.reset_index(inplace=True, drop=True) test_df.reset_index(inplace=True, drop=True) - train_df.to_csv(path.join(train_path, f'split-{i}', f'{diagnosis}_baseline.tsv'), sep='\t', index=False) - test_df.to_csv(path.join(test_path, f'split-{i}', f'{diagnosis}_baseline.tsv'), sep='\t', index=False) + train_df.to_csv( + path.join(train_path, f"split-{i}", f"{diagnosis}_baseline.tsv"), + sep="\t", + index=False, + ) + test_df.to_csv( + path.join(test_path, f"split-{i}", f"{diagnosis}_baseline.tsv"), + sep="\t", + index=False, + ) long_train_df = retrieve_longitudinal(train_df, diagnosis_df) - long_train_df.to_csv(path.join(train_path, f'split-{i}', f'{diagnosis}.tsv'), sep='\t', index=False) - - -def split_diagnoses(formatted_data_path, - n_splits=5, subset_name="validation", MCI_sub_categories=True, - stratification=None, verbose=0): + long_train_df.to_csv( + 
path.join(train_path, f"split-{i}", f"{diagnosis}.tsv"), + sep="\t", + index=False, + ) + + +def split_diagnoses( + formatted_data_path, + n_splits=5, + subset_name="validation", + MCI_sub_categories=True, + stratification=None, + verbose=0, +): """ Performs a k-fold split for each label independently on the subject level. The train folder will contain two lists per fold per diagnosis (baseline and longitudinal), @@ -91,71 +129,89 @@ def split_diagnoses(formatted_data_path, - formatted_data_path/train_splits-/split-/