diff --git a/deepobs/abstract_runner/abstract_runner.py b/deepobs/abstract_runner/abstract_runner.py
index 867b3d12..b4aa08cf 100644
--- a/deepobs/abstract_runner/abstract_runner.py
+++ b/deepobs/abstract_runner/abstract_runner.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-
 """Module implementing the abstract Runner."""
 import os
 import json
@@ -11,6 +10,7 @@
 import warnings
 from copy import deepcopy
 from deepobs import config as global_config
+import glob
 
 
 class Runner(abc.ABC):
@@ -31,7 +31,6 @@ class Runner(abc.ABC):
     create_output_directory: Creates the output folder of the run.
     write_output: Writes the output of the run to the output directory.
     """
-
     def __init__(self, optimizer_class, hyperparameter_names):
         """ Creates a new Runner instance
 
@@ -56,22 +55,21 @@ def __init__(self, optimizer_class, hyperparameter_names):
         self._hyperparameter_names = hyperparameter_names
 
     def run(self,
-            testproblem = None,
-            hyperparams = None,
-            batch_size = None,
-            num_epochs = None,
+            testproblem=None,
+            hyperparams=None,
+            batch_size=None,
+            num_epochs=None,
             random_seed=None,
             data_dir=None,
             output_dir=None,
             weight_decay=None,
             no_logs=None,
-            train_log_interval = None,
-            print_train_iter = None,
-            tb_log = None,
-            tb_log_dir = None,
-            **training_params
-            ):
-
+            train_log_interval=None,
+            print_train_iter=None,
+            tb_log=None,
+            tb_log_dir=None,
+            skip_if_exists=False,
+            **training_params):
         """Runs a testproblem with the optimizer_class. Has the following tasks:
             1. setup testproblem
             2. run the training (must be implemented by subclass)
@@ -91,6 +89,7 @@ def run(self,
             print_train_iter (bool): Whether to print the training progress at each train_log_interval.
             tb_log (bool): Whether to use tensorboard logging or not
             tb_log_dir (str): The path where to save tensorboard events.
+            skip_if_exists (bool): Skip training if the output already exists.
             training_params (dict): Kwargs for the training method.
 
         Returns:
@@ -105,85 +104,95 @@ def run(self,
             where <...meta data...> stores the run args.
 
         """
-        args = self.parse_args(testproblem,
-                               hyperparams,
-                               batch_size,
-                               num_epochs,
-                               random_seed,
-                               data_dir,
-                               output_dir,
-                               weight_decay,
-                               no_logs,
-                               train_log_interval,
-                               print_train_iter,
-                               tb_log,
-                               tb_log_dir,
-                               training_params)
-
-        return self._run(**args)
+        exists, matches = self.run_exists(
+            testproblem=testproblem,
+            hyperparams=hyperparams,
+            batch_size=batch_size,
+            num_epochs=num_epochs,
+            random_seed=random_seed,
+            data_dir=data_dir,
+            output_dir=output_dir,
+            weight_decay=weight_decay,
+            no_logs=no_logs,
+            train_log_interval=train_log_interval,
+            print_train_iter=print_train_iter,
+            tb_log=tb_log,
+            tb_log_dir=tb_log_dir,
+            **training_params)
+
+        require_run = not (exists and skip_if_exists)
+
+        if require_run:
+            args = self.parse_args(
+                testproblem,
+                hyperparams,
+                batch_size,
+                num_epochs,
+                random_seed,
+                data_dir,
+                output_dir,
+                weight_decay,
+                no_logs,
+                train_log_interval,
+                print_train_iter,
+                tb_log,
+                tb_log_dir,
+                training_params,
+            )
+
+            return self._run(**args)
+        else:
+            print("Found output file(s): {}\nSkipping run.".format(matches))
 
     def _run(self,
-             testproblem = None,
-             hyperparams = None,
-             batch_size = None,
-             num_epochs = None,
-             random_seed = None,
-             data_dir = None,
-             output_dir = None,
-             weight_decay = None,
-             no_logs = None,
-             train_log_interval = None,
-             print_train_iter = None,
-             tb_log = None,
-             tb_log_dir = None,
+             testproblem=None,
+             hyperparams=None,
+             batch_size=None,
+             num_epochs=None,
+             random_seed=None,
+             data_dir=None,
+             output_dir=None,
+             weight_decay=None,
+             no_logs=None,
+             train_log_interval=None,
+             print_train_iter=None,
+             tb_log=None,
+             tb_log_dir=None,
              **training_params):
 
         # Creates a backup copy of the initial parameters. Users might change the dicts during training.
         hyperparams_before_training = deepcopy(hyperparams)
         training_params_before_training = deepcopy(training_params)
 
-        if batch_size is None:
-            batch_size = global_config.get_testproblem_default_setting(testproblem)['batch_size']
-        if num_epochs is None:
-            num_epochs = global_config.get_testproblem_default_setting(testproblem)['num_epochs']
+        batch_size = self._use_default_batch_size_if_missing(
+            testproblem, batch_size)
+        num_epochs = self._use_default_num_epochs_if_missing(
+            testproblem, num_epochs)
 
         if data_dir is not None:
             global_config.set_data_dir(data_dir)
 
-        run_directory, file_name = self.generate_output_directory_name(testproblem,
-                                                                       batch_size,
-                                                                       num_epochs,
-                                                                       weight_decay,
-                                                                       random_seed,
-                                                                       output_dir,
-                                                                       hyperparams,
-                                                                       **training_params)
+        run_directory, file_name = self.generate_output_directory_name(
+            testproblem, batch_size, num_epochs, weight_decay, random_seed,
+            output_dir, hyperparams, **training_params)
 
         if tb_log:
             if tb_log_dir == 'none':
-                print('Tensorboard logging: No tb_log_dir specified, using settings folder {0:s} as default.'.format(run_directory))
+                print(
+                    'Tensorboard logging: No tb_log_dir specified, using settings folder {0:s} as default.'
+                    .format(run_directory))
                 os.makedirs(run_directory, exist_ok=True)
                 tb_log_dir = run_directory
 
-        tproblem = self.create_testproblem(testproblem,
-                                           batch_size,
-                                           weight_decay,
-                                           random_seed)
-
-        output = self.training(tproblem,
-                               hyperparams,
-                               num_epochs,
-                               print_train_iter,
-                               train_log_interval,
-                               tb_log,
-                               tb_log_dir,
-                               **training_params)
-
-        output = self._post_process_output(output,
-                                           testproblem,
-                                           batch_size,
-                                           num_epochs,
-                                           random_seed,
+        tproblem = self.create_testproblem(testproblem, batch_size,
+                                           weight_decay, random_seed)
+
+        output = self.training(tproblem, hyperparams, num_epochs,
+                               print_train_iter, train_log_interval, tb_log,
+                               tb_log_dir, **training_params)
+
+        output = self._post_process_output(output, testproblem, batch_size,
+                                           num_epochs, random_seed,
                                            weight_decay,
                                            hyperparams_before_training,
                                            **training_params_before_training)
@@ -193,16 +202,99 @@ def _run(self,
 
         return output
 
+    def run_exists(self,
+                   testproblem=None,
+                   hyperparams=None,
+                   batch_size=None,
+                   num_epochs=None,
+                   random_seed=None,
+                   data_dir=None,
+                   output_dir=None,
+                   weight_decay=None,
+                   no_logs=None,
+                   train_log_interval=None,
+                   print_train_iter=None,
+                   tb_log=None,
+                   tb_log_dir=None,
+                   **training_params):
+        """Return whether output file(s) for this run already exist.
+
+        Args:
+            See `run` method; the arguments are passed through `parse_args`.
+
+        Returns:
+            bool, list(str): The first value is `True` if at least one \
+                matching `.json` output file exists, else `False`. The list \
+                contains the paths to all files that match the run.
+        """
+        args = self.parse_args(
+            testproblem,
+            hyperparams,
+            batch_size,
+            num_epochs,
+            random_seed,
+            data_dir,
+            output_dir,
+            weight_decay,
+            no_logs,
+            train_log_interval,
+            print_train_iter,
+            tb_log,
+            tb_log_dir,
+            training_params,
+        )
+        return self._run_exists(**args)
+
+    def _run_exists(self,
+                    testproblem=None,
+                    hyperparams=None,
+                    batch_size=None,
+                    num_epochs=None,
+                    random_seed=None,
+                    data_dir=None,
+                    output_dir=None,
+                    weight_decay=None,
+                    no_logs=None,
+                    train_log_interval=None,
+                    print_train_iter=None,
+                    tb_log=None,
+                    tb_log_dir=None,
+                    **training_params):
+        """Return (exists, matches) for this run's ``.json`` output files."""
+        batch_size = self._use_default_batch_size_if_missing(
+            testproblem, batch_size)
+        num_epochs = self._use_default_num_epochs_if_missing(
+            testproblem, num_epochs)
+
+        run_directory, _ = self.generate_output_directory_name(
+            testproblem, batch_size, num_epochs, weight_decay, random_seed,
+            output_dir, hyperparams, **training_params)
+        # Escape the directory: output_dir/hyperparams may hold glob characters.
+        file_glob = "{}*.json".format(self._filename_no_date(random_seed))
+        pattern = os.path.join(glob.escape(run_directory), file_glob)
+        matches = glob.glob(pattern)
+
+        return bool(matches), matches
+
+    def _use_default_batch_size_if_missing(self, testproblem, batch_size):
+        """Fall back to the testproblem's default if ``batch_size`` is None."""
+        if batch_size is not None:
+            return batch_size
+        return self._use_default(testproblem, 'batch_size')
+
+    def _use_default_num_epochs_if_missing(self, testproblem, num_epochs):
+        """Fall back to the testproblem's default if ``num_epochs`` is None."""
+        if num_epochs is not None:
+            return num_epochs
+        return self._use_default(testproblem, 'num_epochs')
+
+    @staticmethod
+    def _use_default(testproblem, key):  # default setting for `key`
+        return global_config.get_testproblem_default_setting(testproblem)[key]
 
     @abc.abstractmethod
-    def training(self,
-                 tproblem,
-                 hyperparams,
-                 num_epochs,
-                 print_train_iter,
-                 train_log_interval,
-                 tb_log, tb_log_dir,
-                 **training_params):
+    def training(self, tproblem, hyperparams, num_epochs, print_train_iter,
+                 train_log_interval, tb_log, tb_log_dir, **training_params):
         """Performs the training and stores the metrices.
 
             Args:
@@ -259,18 +351,23 @@ def _add_hyperparams_to_argparse(self, parser, args, hyperparams):
             hyperparams (dict): Hyperparameters that are to read in.
 
         """
-        if hyperparams is None:    # if no hyperparams dict is passed to run()
-            for hp_name, hp_specification in self._hyperparameter_names.items():
-                _add_hp_to_argparse(parser, self._optimizer_name, hp_specification, hp_name)
+        if hyperparams is None:  # if no hyperparams dict is passed to run()
+            for hp_name, hp_specification in self._hyperparameter_names.items(
+            ):
+                _add_hp_to_argparse(parser, self._optimizer_name,
+                                    hp_specification, hp_name)
 
-        else:     # if there is one, fill the missing params from command line
-            for hp_name, hp_specification in self._hyperparameter_names.items():
+        else:  # if there is one, fill the missing params from command line
+            for hp_name, hp_specification in self._hyperparameter_names.items(
+            ):
                 if hp_name in hyperparams:
                     args[hp_name] = hyperparams[hp_name]
                 else:
-                    _add_hp_to_argparse(parser, self._optimizer_name, hp_specification, hp_name)
+                    _add_hp_to_argparse(parser, self._optimizer_name,
+                                        hp_specification, hp_name)
 
-    def _add_training_params_to_output_dir_name(self, training_params, run_folder_name):
+    def _add_training_params_to_output_dir_name(self, training_params,
+                                                run_folder_name):
         """Overwrite this method to specify how your
         runner should format additional training_parameters in the run folder name.
 
@@ -285,10 +382,12 @@ def _add_training_params_to_output_dir_name(self, training_params, run_folder_na
             if tp_value is not None:
                 run_folder_name += "__{0:s}".format(tp_name)
                 run_folder_name += "__{0:s}".format(
-                    float2str(tp_value) if isinstance(tp_value, float) else str(tp_value))
+                    float2str(tp_value) if isinstance(tp_value, float
+                                                      ) else str(tp_value))
         return run_folder_name
 
-    def _add_hyperparams_to_output_dir_name(self, optimizer_hyperparams, run_folder_name):
+    def _add_hyperparams_to_output_dir_name(self, optimizer_hyperparams,
+                                            run_folder_name):
         """Overwrite this method to specify how your
         runner should format optimizer hyper_parameters in the run folder name.
 
@@ -304,25 +403,14 @@ def _add_hyperparams_to_output_dir_name(self, optimizer_hyperparams, run_folder_
         for hp_name, hp_value in sorted(optimizer_hyperparams.items()):
             run_folder_name += "__{0:s}".format(hp_name)
             run_folder_name += "__{0:s}".format(
-                float2str(hp_value) if isinstance(hp_value, float) else str(hp_value))
+                float2str(hp_value) if isinstance(hp_value, float
+                                                  ) else str(hp_value))
         return run_folder_name
 
-    def parse_args(self,
-                   testproblem,
-                   hyperparams,
-                   batch_size,
-                   num_epochs,
-                   random_seed,
-                   data_dir,
-                   output_dir,
-                   weight_decay,
-                   no_logs,
-                   train_log_interval,
-                   print_train_iter,
-                   tb_log,
-                   tb_log_dir,
+    def parse_args(self, testproblem, hyperparams, batch_size, num_epochs,
+                   random_seed, data_dir, output_dir, weight_decay, no_logs,
+                   train_log_interval, print_train_iter, tb_log, tb_log_dir,
                    training_params):
-
         """Constructs an argparse.ArgumentParser and parses the arguments from command line.
 
         Args:
@@ -345,7 +433,8 @@ def parse_args(self,
             dict: A dicionary of all arguments.
             """
         args = {}
-        parser = argparse.ArgumentParser(description='Arguments for running optimizer script.')
+        parser = argparse.ArgumentParser(
+            description='Arguments for running optimizer script.')
 
         if testproblem is None:
             parser.add_argument('testproblem')
@@ -353,11 +442,10 @@ def parse_args(self,
             args['testproblem'] = testproblem
 
         if weight_decay is None:
-            parser.add_argument(
-                "--weight_decay",
-                "--wd",
-                type=float,
-                help="""Factor
+            parser.add_argument("--weight_decay",
+                                "--wd",
+                                type=float,
+                                help="""Factor
           used for the weight_deacy. If not given, the default weight decay for
           this model is used. Note that not all models use weight decay and this
           value will be ignored in such a case.""")
@@ -365,20 +453,18 @@ def parse_args(self,
             args['weight_decay'] = weight_decay
 
         if batch_size is None:
-            parser.add_argument(
-                "--batch_size",
-                "--bs",
-                type=int,
-                help="The batch size (positive integer).")
+            parser.add_argument("--batch_size",
+                                "--bs",
+                                type=int,
+                                help="The batch size (positive integer).")
         else:
             args['batch_size'] = batch_size
 
         if num_epochs is None:
-            parser.add_argument(
-                "-N",
-                "--num_epochs",
-                type=int,
-                help="Total number of training epochs.")
+            parser.add_argument("-N",
+                                "--num_epochs",
+                                type=int,
+                                help="Total number of training epochs.")
         else:
             args['num_epochs'] = num_epochs
 
@@ -393,9 +479,8 @@ def parse_args(self,
             args['random_seed'] = random_seed
 
         if data_dir is None:
-            parser.add_argument(
-                "--data_dir",
-                help="""Path to the base data dir. If
+            parser.add_argument("--data_dir",
+                                help="""Path to the base data dir. If
       not specified, DeepOBS uses its default.""")
         else:
             args['data_dir'] = data_dir
@@ -424,7 +509,7 @@ def parse_args(self,
         if train_log_interval is None:
             parser.add_argument(
                 "--train_log_interval",
-                type = int,
+                type=int,
                 default=10,
                 help="""Interval of steps at which to log training loss.""")
         else:
@@ -436,7 +521,9 @@ def parse_args(self,
                 action="store_const",
                 const=True,
                 default=False,
-                help="""Add this flag to print the mini-batch-loss at the train_log_interval.""")
+                help=
+                """Add this flag to print the mini-batch-loss at the train_log_interval."""
+            )
         else:
             args['print_train_iter'] = print_train_iter
 
@@ -455,7 +542,9 @@ def parse_args(self,
                 "--tb_log_dir",
                 type=str,
                 default="none",
-                help="""Path to the directory where the tensorboard logs are saved.""")
+                help=
+                """Path to the directory where the tensorboard logs are saved."""
+            )
         else:
             args['tb_log_dir'] = tb_log_dir
 
@@ -474,16 +563,10 @@ def parse_args(self,
 
         return args
 
-    def generate_output_directory_name(self,
-                                       testproblem,
-                                       batch_size,
-                                       num_epochs,
-                                       weight_decay,
-                                       random_seed,
-                                       output_dir,
-                                       optimizer_hyperparams,
-                                       **training_params
-                                       ):
+    def generate_output_directory_name(self, testproblem, batch_size,
+                                       num_epochs, weight_decay, random_seed,
+                                       output_dir, optimizer_hyperparams,
+                                       **training_params):
         # add everything mandatory to the name
         run_folder_name = "num_epochs__" + str(
             num_epochs) + "__batch_size__" + str(batch_size)
@@ -492,22 +575,30 @@ def generate_output_directory_name(self,
                 float2str(weight_decay))
 
         # Add all hyperparameters to the name.
-        run_folder_name = self._add_hyperparams_to_output_dir_name(optimizer_hyperparams, run_folder_name)
+        run_folder_name = self._add_hyperparams_to_output_dir_name(
+            optimizer_hyperparams, run_folder_name)
 
         # Add training parameters to the name.
-        run_folder_name = self._add_training_params_to_output_dir_name(training_params, run_folder_name)
+        run_folder_name = self._add_training_params_to_output_dir_name(
+            training_params, run_folder_name)
 
-        file_name = "random_seed__{0:d}__".format(random_seed)
+        file_name = self._filename_no_date(random_seed)
         file_name += time.strftime("%Y-%m-%d-%H-%M-%S")
 
-        run_directory = os.path.join(output_dir, testproblem, self._optimizer_name,
-                                     run_folder_name)
+        run_directory = os.path.join(output_dir, testproblem,
+                                     self._optimizer_name, run_folder_name)
 
         return run_directory, file_name
-    
-    def _post_process_output(self, output, testproblem, batch_size, num_epochs, random_seed, weight_decay, hyperparams, **training_params):
+
+    @staticmethod
+    def _filename_no_date(random_seed):  # file name prefix without timestamp
+        return "random_seed__{0:d}__".format(random_seed)
+
+    def _post_process_output(self, output, testproblem, batch_size, num_epochs,
+                             random_seed, weight_decay, hyperparams,
+                             **training_params):
         """Ensures that for both frameworks the structure of the output is the same"""
-        
+
         # remove test accuracy if it is not available
         if 'test_accuracies' in output:
             if all(output['test_accuracies']) == 0:
@@ -517,20 +608,22 @@ def _post_process_output(self, output, testproblem, batch_size, num_epochs, rand
                     del output['valid_accuracies']
                 except KeyError:
                     pass
-        
+
         # merge meta data to output dict
-        output = {'testproblem': testproblem,
-                  'batch_size': batch_size,
-                  'num_epochs': num_epochs,
-                  'random_seed': random_seed,
-                  'weight_decay': weight_decay,
-                  'optimizer_name': self._optimizer_name,
-                  'optimizer_hyperparams': hyperparams,
-                  'training_params': training_params,
-                  **output}
-        
+        output = {
+            'testproblem': testproblem,
+            'batch_size': batch_size,
+            'num_epochs': num_epochs,
+            'random_seed': random_seed,
+            'weight_decay': weight_decay,
+            'optimizer_name': self._optimizer_name,
+            'optimizer_hyperparams': hyperparams,
+            'training_params': training_params,
+            **output
+        }
+
         return output
-            
+
     @staticmethod
     def write_output(output, run_folder_name, file_name):
         """Writes the JSON output.
@@ -540,17 +633,20 @@ def write_output(output, run_folder_name, file_name):
             run_folder_name (str): The name of the output folder.
             file_name (str): The file name where the output is written to.
         """
-        with open(os.path.join(run_folder_name, file_name + ".json"), "w") as f:
+        with open(os.path.join(run_folder_name, file_name + ".json"),
+                  "w") as f:
             json.dump(output, f, indent=4)
 
     @staticmethod
-    def _abort_routine(epoch_count, num_epochs, train_losses, valid_losses, test_losses, train_accuracies,
-                       valid_accuracies, test_accuracies, minibatch_train_losses):
+    def _abort_routine(epoch_count, num_epochs, train_losses, valid_losses,
+                       test_losses, train_accuracies, valid_accuracies,
+                       test_accuracies, minibatch_train_losses):
         """A routine that is executed if a training run is aborted (loss is NaN or Inf)."""
 
-        warnings.warn('Breaking from run after epoch ' + str(
-            epoch_count) + 'due to wrongly calibrated optimization (Loss is Nan or Inf). The metrices for the remaining epochs will be filled with the initial performance values.',
-                      RuntimeWarning)
+        warnings.warn(
+            'Breaking from run after epoch ' + str(epoch_count) +
+            'due to wrongly calibrated optimization (Loss is Nan or Inf). The metrices for the remaining epochs will be filled with the initial performance values.',
+            RuntimeWarning)
 
         # fill the rest of the metrices with initial observations
         for i in range(epoch_count, num_epochs):
diff --git a/deepobs/analyzer/analyze.py b/deepobs/analyzer/analyze.py
index 13ab8fcc..3b984919 100644
--- a/deepobs/analyzer/analyze.py
+++ b/deepobs/analyzer/analyze.py
@@ -1,14 +1,21 @@
-
 from __future__ import print_function
+
 import os
+import time
+from collections import Counter
+
 import numpy as np
+import pandas as pd
+
 from matplotlib import pyplot as plt
-from .shared_utils import create_setting_analyzer_ranking, _determine_available_metric, _get_optimizer_name_and_testproblem_from_path, _check_output_structure, _check_setting_folder_is_not_empty
+
 from ..tuner.tuner_utils import generate_tuning_summary
-from .analyze_utils import _rescale_ax, _preprocess_path
-import pandas as pd
-import time
-from collections import Counter
+from .analyze_utils import _preprocess_path, _rescale_ax
+from .shared_utils import (_check_output_structure,
+                           _check_setting_folder_is_not_empty,
+                           _determine_available_metric,
+                           _get_optimizer_name_and_testproblem_from_path,
+                           create_setting_analyzer_ranking)
 
 
 def check_output(results_path):
@@ -26,32 +33,38 @@ def check_output(results_path):
         optimizers = os.listdir(testproblem_path)
         for optimizer in optimizers:
             optimizer_path = os.path.join(testproblem_path, optimizer)
-            settings = [setting for setting in os.listdir(optimizer_path) if os.path.isdir(os.path.join(optimizer_path, setting)) and 'num_epochs' in setting]
+            settings = [
+                setting for setting in os.listdir(optimizer_path)
+                if os.path.isdir(os.path.join(optimizer_path, setting))
+                and 'num_epochs' in setting
+            ]
             n_runs_list = []
             for setting in settings:
                 setting_path = os.path.join(optimizer_path, setting)
                 _check_setting_folder_is_not_empty(setting_path)
-                jsons_files = [file for file in os.listdir(setting_path) if 'json' in file]
+                jsons_files = [
+                    file for file in os.listdir(setting_path) if 'json' in file
+                ]
                 n_runs_list.append(len(jsons_files))
                 for json_file in jsons_files:
                     json_path = os.path.join(setting_path, json_file)
                     _check_output_structure(setting_path, json_file)
             counter = Counter(n_runs_list)
             for n_runs, count in counter.items():
-                print('{0:s} | {1:s}: {2:d} setting(s) with {3:d} seed(s).'.format(testproblem, optimizer, count, n_runs))
+                print('{0:s} | {1:s}: {2:d} setting(s) with {3:d} seed(s).'.
+                      format(testproblem, optimizer, count, n_runs))
 
 
 def estimate_runtime(framework,
                      runner_cls,
                      optimizer_cls,
                      optimizer_hp,
-                     n_runs = 5,
+                     n_runs=5,
                      sgd_lr=0.01,
                      testproblem='mnist_mlp',
-                     num_epochs = 5,
-                     batch_size = 128,
+                     num_epochs=5,
+                     batch_size=128,
                      **kwargs):
-
     """Can be used to estimates the runtime overhead of a new optimizer compared to SGD. Runs the new optimizer and
     SGD seperately and calculates the fraction of wall clock overhead.
 
@@ -98,13 +111,12 @@ def estimate_runtime(framework,
         # SGD
         print("Running SGD")
         start_sgd = time.time()
-        runner.run(
-            testproblem=testproblem,
-            hyperparams=hyperparams,
-            batch_size=batch_size,
-            num_epochs=num_epochs,
-            no_logs=True,
-            **kwargs)
+        runner.run(testproblem=testproblem,
+                   hyperparams=hyperparams,
+                   batch_size=batch_size,
+                   num_epochs=num_epochs,
+                   no_logs=True,
+                   **kwargs)
         end_sgd = time.time()
 
         sgd_times.append(end_sgd - start_sgd)
@@ -114,13 +126,12 @@ def estimate_runtime(framework,
         runner = runner_cls(optimizer_cls, optimizer_hp)
         print("Running...", optimizer_class.__name__)
         start_script = time.time()
-        runner.run(
-            testproblem=testproblem,
-            hyperparams=hyperparams,
-            batch_size=batch_size,
-            num_epochs=num_epochs,
-            no_logs=True,
-            **kwargs)
+        runner.run(testproblem=testproblem,
+                   hyperparams=hyperparams,
+                   batch_size=batch_size,
+                   num_epochs=num_epochs,
+                   no_logs=True,
+                   **kwargs)
         end_script = time.time()
 
         new_opt_times.append(end_script - start_script)
@@ -139,8 +150,10 @@ def estimate_runtime(framework,
     return output
 
 
-def plot_results_table(results_path, mode='most', metric='valid_accuracies', conv_perf_file=None):
-    
+def plot_results_table(results_path,
+                       mode='most',
+                       metric='valid_accuracies',
+                       conv_perf_file=None):
     """Summarizes the performance of the optimizer and prints it to a pandas data frame.
 
             Args:
@@ -154,7 +167,9 @@ def plot_results_table(results_path, mode='most', metric='valid_accuracies', con
                 """
     table_dic = {}
     testproblems = os.listdir(results_path)
-    metric_keys = ['Hyperparameters', 'Performance', 'Speed', 'Training Parameters']
+    metric_keys = [
+        'Hyperparameters', 'Performance', 'Speed', 'Training Parameters'
+    ]
     for testproblem in testproblems:
         # init new subdict for testproblem
         for metric_key in metric_keys:
@@ -164,11 +179,14 @@ def plot_results_table(results_path, mode='most', metric='valid_accuracies', con
         optimizers = os.listdir(testproblem_path)
         for optimizer in optimizers:
             optimizer_path = os.path.join(testproblem_path, optimizer)
-            optimizer_performance_dic = get_performance_dictionary(optimizer_path, mode, metric, conv_perf_file)
+            optimizer_performance_dic = get_performance_dictionary(
+                optimizer_path, mode, metric, conv_perf_file)
 
             # invert inner dics for multiindexing
             for metric_key in metric_keys:
-                table_dic[(testproblem, metric_key)][optimizer] = optimizer_performance_dic[metric_key]
+                table_dic[(
+                    testproblem, metric_key
+                )][optimizer] = optimizer_performance_dic[metric_key]
 
     # correct multiindexing
     table = pd.DataFrame.from_dict(table_dic, orient='index')
@@ -176,7 +194,11 @@ def plot_results_table(results_path, mode='most', metric='valid_accuracies', con
     return table
 
 
-def plot_testset_performances(results_path, mode = 'most', metric = 'valid_accuracies', reference_path = None):
+def plot_testset_performances(results_path,
+                              mode='most',
+                              metric='valid_accuracies',
+                              reference_path=None,
+                              which='mean_and_std'):
     """Plots all optimizer performances for all testproblems.
 
     Args:
@@ -184,27 +206,46 @@ def plot_testset_performances(results_path, mode = 'most', metric = 'valid_accur
         mode (str): The mode by which to decide the best setting.
         metric (str): The metric by which to decide the best setting.
         reference_path(str): Path to the reference results folder. For each available reference testproblem, all optimizers are plotted as reference.
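+        which (str): ['mean_and_std', 'median_and_quartiles', 'mean_and_std_log'] Solid line shows the mean, the median, or (for losses) the exponentiated mean of the logs; the shaded area shows the standard deviation, the lower/upper quartiles, or (for losses) the exponentiated std of the logs.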
 
     Returns:
         matplotlib.axes.Axes: The axes with the plots.
 
         """
-    testproblems = [path for path in os.listdir(results_path) if os.path.isdir(os.path.join(results_path, path))]
+    testproblems = [
+        path for path in os.listdir(results_path)
+        if os.path.isdir(os.path.join(results_path, path))
+    ]
     if reference_path is not None:
         reference_path = os.path.join(reference_path)
-        reference_testproblems = [path for path in os.listdir(results_path) if os.path.isdir(os.path.join(reference_path, path))]
+        reference_testproblems = [
+            path for path in os.listdir(results_path)
+            if os.path.isdir(os.path.join(reference_path, path))
+        ]
     else:
         reference_testproblems = []
     n_testproblems = len(testproblems)
     __, ax = plt.subplots(4, n_testproblems, sharex='col')
     for idx, testproblem in enumerate(testproblems):
         testproblem_path = os.path.join(results_path, testproblem)
-        ax[:, idx] = _plot_optimizer_performance(testproblem_path, ax[:, idx], mode, metric)
+        ax[:, idx] = _plot_optimizer_performance(testproblem_path,
+                                                 ax[:, idx],
+                                                 mode,
+                                                 metric,
+                                                 which=which)
         if testproblem in reference_testproblems:
-            reference_testproblem_path = os.path.join(reference_path, testproblem)
-            ax[:, idx] = _plot_optimizer_performance(reference_testproblem_path, ax[:, idx], mode, metric)
-
-    metrices = ['test_losses', 'train_losses', 'test_accuracies', 'train_accuracies']
+            reference_testproblem_path = os.path.join(reference_path,
+                                                      testproblem)
+            ax[:, idx] = _plot_optimizer_performance(
+                reference_testproblem_path,
+                ax[:, idx],
+                mode,
+                metric,
+                which=which)
+
+    metrices = [
+        'test_losses', 'train_losses', 'test_accuracies', 'train_accuracies'
+    ]
     for idx, _metric in enumerate(metrices):
         # label y axes
         ax[idx, 0].set_ylabel(_metric)
@@ -215,16 +256,21 @@ def plot_testset_performances(results_path, mode = 'most', metric = 'valid_accur
     # show legend of optimizers
     ax[0, 0].legend()
     plt.tight_layout()
-    plt.show()
     return ax
 
 
-def plot_hyperparameter_sensitivity_2d(optimizer_path, hyperparams, mode='final', metric = 'valid_accuracies', xscale='linear', yscale = 'linear'):
+def plot_hyperparameter_sensitivity_2d(optimizer_path,
+                                       hyperparams,
+                                       mode='final',
+                                       metric='valid_accuracies',
+                                       xscale='linear',
+                                       yscale='linear'):
     param1, param2 = hyperparams
     metric = _determine_available_metric(optimizer_path, metric)
     tuning_summary = generate_tuning_summary(optimizer_path, mode, metric)
 
-    optimizer_name, testproblem = _get_optimizer_name_and_testproblem_from_path(optimizer_path)
+    optimizer_name, testproblem = _get_optimizer_name_and_testproblem_from_path(
+        optimizer_path)
 
     param_values1 = np.array([d['params'][param1] for d in tuning_summary])
     param_values2 = np.array([d['params'][param2] for d in tuning_summary])
@@ -234,7 +280,11 @@ def plot_hyperparameter_sensitivity_2d(optimizer_path, hyperparams, mode='final'
 
     _, ax = plt.subplots()
 
-    con = ax.tricontourf(param_values1, param_values2, target_means, cmap = 'CMRmap', levels=len(target_means))
+    con = ax.tricontourf(param_values1,
+                         param_values2,
+                         target_means,
+                         cmap='CMRmap',
+                         levels=len(target_means))
     ax.scatter(param_values1, param_values2)
     ax.set_xscale(xscale)
     ax.set_yscale(yscale)
@@ -242,25 +292,30 @@ def plot_hyperparameter_sensitivity_2d(optimizer_path, hyperparams, mode='final'
     ax.set_ylabel(param2)
     cbar = plt.colorbar(con)
     cbar.set_label(metric)
-    plt.show()
     return ax
 
 
-def _plot_hyperparameter_sensitivity(optimizer_path, hyperparam, ax, mode='final', metric = 'valid_accuracies',
-                                    plot_std=False):
+def _plot_hyperparameter_sensitivity(optimizer_path,
+                                     hyperparam,
+                                     ax,
+                                     mode='final',
+                                     metric='valid_accuracies',
+                                     plot_std=False):
 
     metric = _determine_available_metric(optimizer_path, metric)
     tuning_summary = generate_tuning_summary(optimizer_path, mode, metric)
 
-    optimizer_name, testproblem = _get_optimizer_name_and_testproblem_from_path(optimizer_path)
+    optimizer_name, testproblem = _get_optimizer_name_and_testproblem_from_path(
+        optimizer_path)
 
     # create array for plotting
     param_values = [d['params'][hyperparam] for d in tuning_summary]
-    target_means = [d[metric +'_mean'] for d in tuning_summary]
-    target_stds = [d[metric +'_mean'] for d in tuning_summary]
+    target_means = [d[metric + '_mean'] for d in tuning_summary]
+    target_stds = [d[metric + '_std'] for d in tuning_summary]  # spread per setting; previously re-read '_mean' by mistake
 
-    param_values, target_means, target_stds = (list(t) for t in
-                                               zip(*sorted(zip(param_values, target_means, target_stds))))
+    param_values, target_means, target_stds = (
+        list(t)
+        for t in zip(*sorted(zip(param_values, target_means, target_stds))))
 
     param_values = np.array(param_values)
     target_means = np.array(target_means)
@@ -272,16 +327,20 @@ def _plot_hyperparameter_sensitivity(optimizer_path, hyperparam, ax, mode='final
             param_value = rank.aggregate['optimizer_hyperparams'][hyperparam]
             for value in values:
                 ax.scatter(param_value, value, marker='x', color='b')
-            ax.plot((param_value, param_value), (min(values), max(values)), color='grey', linestyle='--')
+            ax.plot((param_value, param_value), (min(values), max(values)),
+                    color='grey',
+                    linestyle='--')
     ax.set_title(testproblem, fontsize=20)
     return ax
 
 
-def plot_hyperparameter_sensitivity(path, hyperparam, mode='final', metric = 'valid_accuracies',
+def plot_hyperparameter_sensitivity(path,
+                                    hyperparam,
+                                    mode='final',
+                                    metric='valid_accuracies',
                                     xscale='linear',
                                     plot_std=True,
-                                    reference_path = None):
-
+                                    reference_path=None):
     """Plots the hyperparameter sensitivtiy of the optimizer.
 
     Args:
@@ -300,25 +359,31 @@ def plot_hyperparameter_sensitivity(path, hyperparam, mode='final', metric = 'va
     pathes = _preprocess_path(path)
     for optimizer_path in pathes:
         metric = _determine_available_metric(optimizer_path, metric)
-        ax = _plot_hyperparameter_sensitivity(optimizer_path, hyperparam, ax, mode, metric, plot_std)
+        ax = _plot_hyperparameter_sensitivity(optimizer_path, hyperparam, ax,
+                                              mode, metric, plot_std)
     if reference_path is not None:
         pathes = _preprocess_path(reference_path)
         for reference_optimizer_path in pathes:
-            metric = _determine_available_metric(reference_optimizer_path, metric)
-            ax = _plot_hyperparameter_sensitivity(reference_optimizer_path, hyperparam, ax, mode, metric, plot_std)
+            metric = _determine_available_metric(reference_optimizer_path,
+                                                 metric)
+            ax = _plot_hyperparameter_sensitivity(reference_optimizer_path,
+                                                  hyperparam, ax, mode, metric,
+                                                  plot_std)
 
     plt.xscale(xscale)
     plt.xlabel(hyperparam, fontsize=16)
     plt.ylabel(metric, fontsize=16)
     ax.tick_params(labelsize=14)
     ax.legend()
-    plt.show()
     return ax
 
 
-def plot_final_metric_vs_tuning_rank(optimizer_path, metric='valid_accuracies'):
+def plot_final_metric_vs_tuning_rank(optimizer_path,
+                                     metric='valid_accuracies'):
     metric = _determine_available_metric(optimizer_path, metric)
-    ranks = create_setting_analyzer_ranking(optimizer_path, mode='final', metric=metric)
+    ranks = create_setting_analyzer_ranking(optimizer_path,
+                                            mode='final',
+                                            metric=metric)
     means = []
     fig, ax = plt.subplots()
     for idx, rank in enumerate(ranks):
@@ -326,17 +391,22 @@ def plot_final_metric_vs_tuning_rank(optimizer_path, metric='valid_accuracies'):
         values = rank.get_all_final_values(metric)
         for value in values:
             ax.scatter(idx, value, marker='x', color='b')
-        ax.plot((idx, idx), (min(values), max(values)), color= 'grey', linestyle='--')
+        ax.plot((idx, idx), (min(values), max(values)),
+                color='grey',
+                linestyle='--')
     ax.plot(range(len(ranks)), means)
-    optimizer, testproblem = _get_optimizer_name_and_testproblem_from_path(optimizer_path)
+    optimizer, testproblem = _get_optimizer_name_and_testproblem_from_path(
+        optimizer_path)
     ax.set_title(optimizer + ' on ' + testproblem)
     ax.set_xlabel('tuning rank')
     ax.set_ylabel(metric)
-    plt.show()
     return fig, ax
 
 
-def get_performance_dictionary(optimizer_path, mode = 'most', metric = 'valid_accuracies', conv_perf_file = None):
+def get_performance_dictionary(optimizer_path,
+                               mode='most',
+                               metric='valid_accuracies',
+                               conv_perf_file=None):
     """Summarizes the performance of the optimizer.
 
     Args:
@@ -349,7 +419,8 @@ def get_performance_dictionary(optimizer_path, mode = 'most', metric = 'valid_ac
         dict: A dictionary that holds the best setting and it's performance on the test set.
         """
     metric = _determine_available_metric(optimizer_path, metric)
-    setting_analyzers_ranking = create_setting_analyzer_ranking(optimizer_path, mode, metric)
+    setting_analyzers_ranking = create_setting_analyzer_ranking(
+        optimizer_path, mode, metric)
     sett = setting_analyzers_ranking[0]
 
     perf_dict = dict()
@@ -374,40 +445,86 @@ def get_performance_dictionary(optimizer_path, mode = 'most', metric = 'valid_ac
     return perf_dict
 
 
-def _plot_optimizer_performance(path, ax = None, mode = 'most', metric = 'valid_accuracies'):
+def _plot_optimizer_performance(path,
+                                ax=None,
+                                mode='most',
+                                metric='valid_accuracies',
+                                which='mean_and_std'):
     """Plots the training curve of an optimizer.
 
     Args:
         path (str): Path to the optimizer or to a whole testproblem (in this case all optimizers in the testproblem folder are plotted).
-        ax (matplotlib.axes.Axes): The axes to plot the trainig curves for all metrices. Must have 4 subaxes.
+        ax (matplotlib.axes.Axes): The axes to plot the training curves for all metrics. Must have 4 subaxes.
         mode (str): The mode by which to decide the best setting.
         metric (str): The metric by which to decide the best setting.
+        which (str): ['mean_and_std', 'median_and_quartiles', 'mean_and_std_log']
+            - Solid plot mean or median or exponentiated mean of log
+            - Shaded plots standard deviation or lower/upper quartiles or exponentiated std of log
+
     Returns:
         matplotlib.axes.Axes: The axes with the plots.
-
         """
-    metrices = ['test_losses', 'train_losses', 'test_accuracies', 'train_accuracies']
-    if ax is None:  # create default axis for all 4 metrices
+    loss_metrics = [
+        'test_losses',
+        'train_losses',
+    ]
+    accuracy_metrics = [
+        'test_accuracies',
+        'train_accuracies',
+    ]
+    metrics = loss_metrics + accuracy_metrics
+
+    def is_loss(metric):
+        return metric in loss_metrics
+
+    if ax is None:  # create default axis for all 4 metrics
         _, ax = plt.subplots(4, 1, sharex='col')
 
     pathes = _preprocess_path(path)
     for optimizer_path in pathes:
-        setting_analyzer_ranking = create_setting_analyzer_ranking(optimizer_path, mode, metric)
+        setting_analyzer_ranking = create_setting_analyzer_ranking(
+            optimizer_path, mode, metric)
         setting = setting_analyzer_ranking[0]
 
         optimizer_name = os.path.basename(optimizer_path)
-        for idx, _metric in enumerate(metrices):
+        for idx, _metric in enumerate(metrics):
             if _metric in setting.aggregate:
-                mean = setting.aggregate[_metric]['mean']
-                std = setting.aggregate[_metric]['std']
-                ax[idx].plot(mean, label=optimizer_name)
-                ax[idx].fill_between(range(len(mean)), mean - std, mean + std, alpha=0.3)
-    _, testproblem = _get_optimizer_name_and_testproblem_from_path(optimizer_path)
+                if which == 'mean_and_std_log':
+                    if is_loss(_metric):
+                        ax[idx].set_yscale('log')
+                        center = setting.aggregate[_metric]['mean_log']
+                        std = setting.aggregate[_metric]['std_log']
+                        low, high = center - std, center + std
+                    else:
+                        center = setting.aggregate[_metric]['mean']
+                        std = setting.aggregate[_metric]['std']
+                        low, high = center - std, center + std
+                elif which == 'mean_and_std':
+                    center = setting.aggregate[_metric]['mean']
+                    std = setting.aggregate[_metric]['std']
+                    low, high = center - std, center + std
+                elif which == 'median_and_quartiles':
+                    center = setting.aggregate[_metric]['median']
+                    low = setting.aggregate[_metric]['lower_quartile']
+                    high = setting.aggregate[_metric]['upper_quartile']
+                else:
+                    raise ValueError("Unknown value which={}".format(which))
+
+                ax[idx].plot(center, label=optimizer_name)
+                ax[idx].fill_between(range(len(center)), low, high, alpha=0.3)
+
+    _, testproblem = _get_optimizer_name_and_testproblem_from_path(
+        optimizer_path)
     ax[0].set_title(testproblem, fontsize=18)
     return ax
 
 
-def plot_optimizer_performance(path, ax = None, mode = 'most', metric = 'valid_accuracies', reference_path = None):
+def plot_optimizer_performance(path,
+                               ax=None,
+                               mode='most',
+                               metric='valid_accuracies',
+                               reference_path=None,
+                               which='mean_and_std'):
     """Plots the training curve of optimizers and addionally plots reference results from the ``reference_path``
 
     Args:
@@ -416,20 +533,29 @@ def plot_optimizer_performance(path, ax = None, mode = 'most', metric = 'valid_a
         mode (str): The mode by which to decide the best setting.
         metric (str): The metric by which to decide the best setting.
         reference_path (str): Path to the reference optimizer or to a whole testproblem (in this case all optimizers in the testproblem folder are taken as reference).
+        which (str): ['mean_and_std', 'median_and_quartiles', 'mean_and_std_log']
+            - Solid plot mean or median or exponentiated mean of log
+            - Shaded plots standard deviation or lower/upper quartiles or exponentiated std of log
 
     Returns:
         matplotlib.axes.Axes: The axes with the plots.
 
         """
 
-    ax = _plot_optimizer_performance(path, ax, mode, metric)
+    ax = _plot_optimizer_performance(path, ax, mode, metric, which=which)
     if reference_path is not None:
-        ax = _plot_optimizer_performance(reference_path, ax, mode, metric)
-
-    metrices = ['test_losses', 'train_losses', 'test_accuracies', 'train_accuracies']
+        ax = _plot_optimizer_performance(reference_path,
+                                         ax,
+                                         mode,
+                                         metric,
+                                         which=which)
+
+    metrices = [
+        'test_losses', 'train_losses', 'test_accuracies', 'train_accuracies'
+    ]
     for idx, _metric in enumerate(metrices):
         # set y labels
-        ax[idx].set_ylabel(_metric, fontsize = 14)
+        ax[idx].set_ylabel(_metric, fontsize=14)
         # rescale plots
         # ax[idx] = _rescale_ax(ax[idx])
         ax[idx].tick_params(labelsize=12)
@@ -437,8 +563,6 @@ def plot_optimizer_performance(path, ax = None, mode = 'most', metric = 'valid_a
     # show optimizer legends
     ax[0].legend(fontsize=12)
 
-    ax[3].set_xlabel('epochs', fontsize = 14)
+    ax[3].set_xlabel('epochs', fontsize=14)
 
-    plt.show()
     return ax
-
diff --git a/deepobs/analyzer/shared_utils.py b/deepobs/analyzer/shared_utils.py
index 30ba2e75..845d5c6f 100644
--- a/deepobs/analyzer/shared_utils.py
+++ b/deepobs/analyzer/shared_utils.py
@@ -1,15 +1,17 @@
 import json
 import os
-import numpy as np
 import warnings
 
+import numpy as np
+
 
 def _check_setting_folder_is_not_empty(setting_path):
     runs = [run for run in os.listdir(setting_path) if 'json' in run]
     try:
         assert len(runs) > 0
     except AssertionError:
-        print('Found a setting folder with no runs inside: {0:s}'.format(setting_path))
+        print('Found a setting folder with no runs inside: {0:s}'.format(
+            setting_path))
 
 
 def _check_output_structure(path, file_name):
@@ -30,59 +32,86 @@ def _check_output_structure(path, file_name):
         assert 'test_losses' in json_data
 
         # all must have the same length
-        assert len(json_data['train_losses']) == len(json_data['test_losses']) == len(json_data['valid_losses']) == json_data['num_epochs']+1
+        assert len(json_data['train_losses']) == len(
+            json_data['test_losses']) == len(
+                json_data['valid_losses']) == json_data['num_epochs'] + 1
     except AssertionError as e:
-        print('Found corrupted output file: {0:s} in path: {1:s}'.format(file_name, path))
+        print('Found corrupted output file: {0:s} in path: {1:s}'.format(
+            file_name, path))
 
 
-def aggregate_runs(setting_folder):
+def aggregate_runs(setting_folder, custom_metrics=None):
     """Aggregates all seed runs for a setting.
     Args:
         setting_folder (str): The path to the setting folder.
+        custom_metrics (list(str)): Additional metrics that will be extracted if available
     Returns:
         A dictionary that contains the aggregated mean and std of all metrices, as well as the meta data.
         """
+    dobs_metrics = [
+        'train_losses', 'valid_losses', 'test_losses', 'train_accuracies',
+        'valid_accuracies', 'test_accuracies'
+    ]
+    if custom_metrics is None:
+        custom_metrics = []
+
     runs = [run for run in os.listdir(setting_folder) if run.endswith(".json")]
-    # metrices
-    train_losses = []
-    valid_losses = []
-    test_losses = []
-    train_accuracies = []
-    valid_accuracies = []
-    test_accuracies = []
 
-    for run in runs:
-        json_data = _load_json(setting_folder, run)
-        train_losses.append(json_data['train_losses'])
+    runs = [run for run in os.listdir(setting_folder) if run.endswith(".json")]
+    if not runs:
+        raise RuntimeError(f"No .json file in {setting_folder}")
 
-        # TODO remove try-except once validation metrices are available for the baselines
-        try:
-            valid_losses.append(json_data['valid_losses'])
-        except KeyError:
-            pass
+    def no_data():
+        return []
 
-        test_losses.append(json_data['test_losses'])
-        # just add accuracies to the aggregate if they are available
-        if 'train_accuracies' in json_data :
-            train_accuracies.append(json_data['train_accuracies'])
+    all_metrics = dobs_metrics + custom_metrics
+    all_metrics_data = {m: no_data() for m in all_metrics}
 
-            # TODO remove try-except once validation metrices are available for the baselines
+    for run in runs:
+        json_data = _load_json(setting_folder, run)
+        for metric in all_metrics:
             try:
-                valid_accuracies.append(json_data['valid_accuracies'])
+                run_data = json_data[metric]
             except KeyError:
-                pass
-
-            test_accuracies.append(json_data['test_accuracies'])
+                run_data = no_data()
+            all_metrics_data[metric].append(run_data)
+
+    # custom metrics: fill with nans if run quit earlier
+    metrics_require_nans = set()
+    nans_inserted = 0
+    for metric in custom_metrics:
+        max_num_points = max(
+            len(run_data) for run_data in all_metrics_data[metric])
+        # fill up with nans
+        for run_data in all_metrics_data[metric]:
+            while len(run_data) < max_num_points:
+                metrics_require_nans.add(metric)
+                nans_inserted += 1
+                run_data.append(float('nan'))
+    if nans_inserted > 0:
+        print(
+            "[CUSTOM METRICS]: Needed to insert {} NaNs".format(nans_inserted))
+        print("[CUSTOM METRICS]: Affected metrics {}".format(
+            metrics_require_nans))
 
     aggregate = dict()
-    for metrics in ['train_losses', 'valid_losses', 'test_losses', 'train_accuracies', 'valid_accuracies', 'test_accuracies']:
+    for metric in all_metrics:
+        data = np.array(all_metrics_data[metric])
         # only add the metric if available
-        if len(eval(metrics)) != 0:
-            aggregate[metrics] = {
-                    'mean': np.mean(eval(metrics), axis=0),
-                    'std': np.std(eval(metrics), axis=0),
-                    'all_final_values': [met[-1] for met in eval(metrics)]
-                }
+        is_empty = data.shape[1] == 0
+        if not is_empty:
+            aggregate[metric] = {
+                'mean': np.mean(data, axis=0),
+                'std': np.std(data, axis=0),
+                'all_final_values': [met[-1] for met in data],
+                'lower_quartile': np.quantile(data, 0.25, axis=0),
+                'median': np.median(data, axis=0),
+                'upper_quartile': np.quantile(data, 0.75, axis=0),
+                'mean_log': np.power(10, np.mean(np.log10(data), axis=0)),
+                'std_log': np.power(10, np.std(np.log10(data), axis=0)),
+                'min': np.min(data, axis=0),
+                'max': np.max(data, axis=0),
+            }
     # merge meta data
     aggregate['optimizer_hyperparams'] = json_data['optimizer_hyperparams']
     aggregate['training_params'] = json_data['training_params']
@@ -95,7 +124,10 @@ def aggregate_runs(setting_folder):
 def _read_all_settings_folders(optimizer_path):
     """Returns a list of all setting folders in ``optimizer_path``"""
     optimizer_path = os.path.join(optimizer_path)
-    return [f for f in os.listdir(optimizer_path) if os.path.isdir(os.path.join(optimizer_path, f)) and 'num_epochs' in f]
+    return [
+        f for f in os.listdir(optimizer_path)
+        if os.path.isdir(os.path.join(optimizer_path, f)) and 'num_epochs' in f
+    ]
 
 
 def _check_if_metric_is_available(optimizer_path, metric):
@@ -111,22 +143,29 @@ def _check_if_metric_is_available(optimizer_path, metric):
         return False
 
 
-def _determine_available_metric(optimizer_path, metric, default_metric = 'valid_losses'):
+def _determine_available_metric(optimizer_path,
+                                metric,
+                                default_metric='valid_losses'):
     """Checks if the metric ``metric`` is availabe for the runs in ``optimizer_path``.
     If not, it returns the fallback metric ``default_metric``."""
-    optimizer_name, testproblem_name = _get_optimizer_name_and_testproblem_from_path(optimizer_path)
+    optimizer_name, testproblem_name = _get_optimizer_name_and_testproblem_from_path(
+        optimizer_path)
     if _check_if_metric_is_available(optimizer_path, metric):
         return metric
     else:
 
         # TODO remove if-else once validation metrics are available for the baselines
         if _check_if_metric_is_available(optimizer_path, default_metric):
-            warnings.warn('Metric {0:s} does not exist for testproblem {1:s}. We now use fallback metric {2:s}'.format(
-                metric, testproblem_name, default_metric), RuntimeWarning)
+            warnings.warn(
+                'Metric {0:s} does not exist for testproblem {1:s}. We now use fallback metric {2:s}'
+                .format(metric, testproblem_name,
+                        default_metric), RuntimeWarning)
             return default_metric
         else:
-            warnings.warn('Cannot fallback to metric {0:s} for optimizer {1:s} on testproblem {2:s}. Will now fallback to metric test_losses'.format(
-                default_metric, optimizer_name, testproblem_name), RuntimeWarning)
+            warnings.warn(
+                'Cannot fallback to metric {0:s} for optimizer {1:s} on testproblem {2:s}. Will now fallback to metric test_losses'
+                .format(default_metric, optimizer_name,
+                        testproblem_name), RuntimeWarning)
             return 'test_losses'
 
 
@@ -149,18 +188,22 @@ def _clear_json(path, file):
 
 def _load_json(path, file_name):
     with open(os.path.join(path, file_name), "r") as f:
-         json_data = json.load(f)
+        json_data = json.load(f)
     return json_data
 
 
-def _get_all_setting_analyzer(optimizer_path):
+def _get_all_setting_analyzer(optimizer_path, custom_metrics=None):
     """Creates a list of SettingAnalyzers (one for each setting in ``optimizer_path``)"""
+    if custom_metrics is None:
+        custom_metrics = []
+
     optimizer_path = os.path.join(optimizer_path)
     setting_folders = _read_all_settings_folders(optimizer_path)
     setting_analyzers = []
     for sett in setting_folders:
         sett_path = os.path.join(optimizer_path, sett)
-        setting_analyzers.append(SettingAnalyzer(sett_path))
+        setting_analyzers.append(
+            SettingAnalyzer(sett_path, custom_metrics=custom_metrics))
     return setting_analyzers
 
 
@@ -170,7 +213,10 @@ def _get_optimizer_name_and_testproblem_from_path(optimizer_path):
     return optimizer_name, testproblem
 
 
-def create_setting_analyzer_ranking(optimizer_path, mode = 'final', metric = 'valid_accuracies'):
+def create_setting_analyzer_ranking(optimizer_path,
+                                    mode='final',
+                                    metric='valid_accuracies',
+                                    custom_metrics=None):
     """Reads in all settings in ``optimizer_path`` and sets up a ranking by returning an ordered list of SettingAnalyzers.
     Args:
         optimizer_path (str): The path to the optimizer to analyse.
@@ -179,8 +225,12 @@ def create_setting_analyzer_ranking(optimizer_path, mode = 'final', metric = 'va
     Returns:
         An ordered list of SettingAnalyzers. I.e. the first item is considered 'the best one' etc.
     """
+    if custom_metrics is None:
+        custom_metrics = []
+
     metric = _determine_available_metric(optimizer_path, metric)
-    setting_analyzers = _get_all_setting_analyzer(optimizer_path)
+    setting_analyzers = _get_all_setting_analyzer(
+        optimizer_path, custom_metrics=custom_metrics)
 
     if 'acc' in metric:
         sgn = -1
@@ -188,17 +238,29 @@ def create_setting_analyzer_ranking(optimizer_path, mode = 'final', metric = 'va
         sgn = 1
 
     if mode == 'final':
-        setting_analyzers_ordered = sorted(setting_analyzers, key=lambda idx: sgn * idx.get_final_value(metric))
+        setting_analyzers_ordered = sorted(
+            setting_analyzers,
+            key=lambda idx: sgn * idx.get_final_value(metric))
     elif mode == 'best':
-        setting_analyzers_ordered = sorted(setting_analyzers, key=lambda idx: sgn * idx.get_best_value(metric))
+        setting_analyzers_ordered = sorted(
+            setting_analyzers,
+            key=lambda idx: sgn * idx.get_best_value(metric))
     elif mode == 'most':
-        # if all have the same amount of runs, i.e. no 'most' avalaible, fall back to 'final'
+        # if all have the same amount of runs, i.e. no 'most' available, fall back to 'final'
-        if all(x.n_runs == setting_analyzers[0].n_runs for x in setting_analyzers):
-            optimizer_name, testproblem_name = _get_optimizer_name_and_testproblem_from_path(optimizer_path)
-            warnings.warn('All settings for {0:s} on test problem {1:s} have the same number of seeds runs. Mode \'most\' does not make sense and we use the fallback mode \'final\''.format(optimizer_path, testproblem_name), RuntimeWarning)
-            setting_analyzers_ordered = sorted(setting_analyzers, key=lambda idx: sgn * idx.get_final_value(metric))
+        if all(x.n_runs == setting_analyzers[0].n_runs
+               for x in setting_analyzers):
+            optimizer_name, testproblem_name = _get_optimizer_name_and_testproblem_from_path(
+                optimizer_path)
+            warnings.warn(
+                'All settings for {0:s} on test problem {1:s} have the same number of seeds runs. Mode \'most\' does not make sense and we use the fallback mode \'final\''
+                .format(optimizer_path, testproblem_name), RuntimeWarning)
+            setting_analyzers_ordered = sorted(
+                setting_analyzers,
+                key=lambda idx: sgn * idx.get_final_value(metric))
         else:
-            setting_analyzers_ordered = sorted(setting_analyzers, key=lambda idx: idx.n_runs, reverse=True)
+            setting_analyzers_ordered = sorted(setting_analyzers,
+                                               key=lambda idx: idx.n_runs,
+                                               reverse=True)
     else:
         raise RuntimeError('Mode not implemented')
 
@@ -213,28 +275,32 @@ class SettingAnalyzer:
         aggregate (dictionary): Contains the mean and std of the runs as well as the meta data.
         n_runs (int): The number of seed runs that were performed for this setting.
     """
-
-    def __init__(self, path):
+    def __init__(self, path, custom_metrics=None):
         """Initializes a new SettingAnalyzer instance.
 
         Args:
             path (str): String to the setting folder.
         """
+        if custom_metrics is None:
+            custom_metrics = []
 
         self.path = path
         self.n_runs = self.__get_number_of_runs()
-        self.aggregate = aggregate_runs(path)
+        self.aggregate = aggregate_runs(path, custom_metrics=custom_metrics)
 
     def __get_number_of_runs(self):
         """Calculates the total number of seed runs."""
-        return len([run for run in os.listdir(self.path) if run.endswith(".json")])
+        return len(
+            [run for run in os.listdir(self.path) if run.endswith(".json")])
 
     def get_final_value(self, metric):
         """Get the final (mean) value of the metric."""
         try:
             return self.aggregate[metric]['mean'][-1]
         except KeyError:
-            raise KeyError('Metric {0:s} not available for testproblem {1:s} of this setting'.format(metric, self.aggregate['testproblem']))
+            raise KeyError(
+                'Metric {0:s} not available for testproblem {1:s} of this setting'
+                .format(metric, self.aggregate['testproblem']))
 
     def get_best_value(self, metric):
         """Get the best (mean) value of the metric."""
@@ -246,7 +312,9 @@ def get_best_value(self, metric):
             else:
                 raise NotImplementedError
         except KeyError:
-            raise KeyError('Metric {0:s} not available for testproblem {1:s} of this setting'.format(metric, self.aggregate['testproblem']))
+            raise KeyError(
+                'Metric {0:s} not available for testproblem {1:s} of this setting'
+                .format(metric, self.aggregate['testproblem']))
 
     def calculate_speed(self, conv_perf_file):
         """Calculates the speed of the setting."""
@@ -283,4 +351,6 @@ def get_all_final_values(self, metric):
         try:
             return self.aggregate[metric]['all_final_values']
         except KeyError:
-            raise KeyError('Metric {0:s} not available for testproblem {1:s} of this setting'.format(metric, self.aggregate['testproblem']))
+            raise KeyError(
+                'Metric {0:s} not available for testproblem {1:s} of this setting'
+                .format(metric, self.aggregate['testproblem']))
diff --git a/deepobs/pytorch/testproblems/testproblem.py b/deepobs/pytorch/testproblems/testproblem.py
index 78130b3a..ca120319 100644
--- a/deepobs/pytorch/testproblems/testproblem.py
+++ b/deepobs/pytorch/testproblems/testproblem.py
@@ -48,6 +48,8 @@ def __init__(self, batch_size, weight_decay=None):
         self._weight_decay = weight_decay
         self._device = torch.device(config.get_default_device())
 
+        self._batch_count = 0
+
         # Public attributes by which to interact with test problems. These have to
         # be created by the set_up function of sub-classes.
         self.data = None
@@ -89,10 +91,12 @@ def test_init_op(self):
 
     def _get_next_batch(self):
         """Returns the next batch from the iterator."""
+        self._batch_count += 1
         return next(self._iterator)
 
     def get_batch_loss_and_accuracy(self,
                                     return_forward_func = False,
+                                    evaluate_forward_func = True,
                                     reduction = 'mean',
                                     add_regularization_if_available = True):
 
@@ -141,7 +145,10 @@ def _get_batch_loss_and_accuracy():
             return loss + regularizer_loss, accuracy
 
         if return_forward_func:
-            return _get_batch_loss_and_accuracy(), _get_batch_loss_and_accuracy
+            if evaluate_forward_func is True:
+                return _get_batch_loss_and_accuracy(), _get_batch_loss_and_accuracy
+            else:
+                return _get_batch_loss_and_accuracy
         else:
             return _get_batch_loss_and_accuracy()