diff --git a/.gitignore b/.gitignore index d4bf4b13aa..61ace37664 100755 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ docs/build/* *.py[cod] # C extensions +*.c *.so # Packages @@ -46,3 +47,5 @@ download *.pkl num_run number_submission +.pypirc +dmypy.json diff --git a/.travis.yml b/.travis.yml index f1031eb382..76240caa38 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,7 +33,7 @@ matrix: - os: linux env: DISTRIB="conda" COVERAGE="true" DOCPUSH="true" PYTHON="3.6" - os: linux - env: DISTRIB="conda" $TEST_DIST="true" PYTHON="3.7" + env: DISTRIB="conda" TEST_DIST="true" PYTHON="3.7" - os: linux env: DISTRIB="conda" EXAMPLES="true" PYTHON=3.7" - os: linux diff --git a/autosklearn/__version__.py b/autosklearn/__version__.py index 41db4b01f8..c7ccaa718a 100644 --- a/autosklearn/__version__.py +++ b/autosklearn/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.5.1" +__version__ = "0.5.2" diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 84aa583525..3b1067e2e2 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1057,6 +1057,9 @@ def predict_proba(self, X, batch_size=None, n_jobs=1): class AutoMLRegressor(BaseAutoML): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def fit( self, X: np.ndarray, diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index a719c42362..a2d90dc601 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -257,13 +257,15 @@ def read_ensemble_preds(self): if self.shared_mode is False: pred_path = os.path.join( - self.dir_ensemble, - 'predictions_ensemble_%s_*.npy' % self.seed) + glob.escape(self.dir_ensemble), + 'predictions_ensemble_%s_*.npy' % self.seed, + ) # pSMAC else: pred_path = os.path.join( - self.dir_ensemble, - 'predictions_ensemble_*_*.npy') + glob.escape(self.dir_ensemble), + 'predictions_ensemble_*_*.npy', + ) y_ens_files = glob.glob(pred_path) # no validation predictions so far -- no files @@ -453,13 +455,21 @@ def get_valid_test_preds(self, selected_keys: list): for k in selected_keys: valid_fn = glob.glob( - os.path.join(self.dir_valid, 'predictions_valid_%d_%d.npy' - % (self.read_preds[k]["seed"], - self.read_preds[k]["num_run"]))) + os.path.join( + glob.escape(self.dir_valid), + 'predictions_valid_%d_%d.npy' % ( + self.read_preds[k]["seed"], + self.read_preds[k]["num_run"]) + ) + ) test_fn = glob.glob( - os.path.join(self.dir_test, 'predictions_test_%d_%d.npy' % - (self.read_preds[k]["seed"], - self.read_preds[k]["num_run"]))) + os.path.join( + glob.escape(self.dir_test), + 'predictions_test_%d_%d.npy' % ( + self.read_preds[k]["seed"], + self.read_preds[k]["num_run"]) + ) + ) # TODO don't read valid and test if not changed if len(valid_fn) == 0: @@ -636,11 +646,11 @@ def predict(self, set_: str, def _read_np_fn(self, fp): if self.precision is "16": - predictions = np.load(fp).astype(dtype=np.float16) + predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float16) elif self.precision is "32": - predictions = np.load(fp).astype(dtype=np.float32) + predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float32) elif self.precision is "64": - predictions = np.load(fp).astype(dtype=np.float64) + predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float64) else: - predictions = np.load(fp) + predictions = np.load(fp, allow_pickle=True) return predictions diff --git a/autosklearn/util/backend.py 
b/autosklearn/util/backend.py index 86da1610e6..03dfafac98 100644 --- a/autosklearn/util/backend.py +++ b/autosklearn/util/backend.py @@ -244,7 +244,7 @@ def get_smac_output_directory_for_run(self, seed): def get_smac_output_glob(self, smac_run_id: Union[str, int] = 1) -> str: return os.path.join( - self.temporary_directory, + glob.escape(self.temporary_directory), 'smac3-output', 'run_%s' % str(smac_run_id), ) @@ -265,7 +265,7 @@ def save_targets_ensemble(self, targets): # number of times where we erronously keep a lock on the ensemble # targets file although the process already was killed try: - existing_targets = np.load(filepath) + existing_targets = np.load(filepath, allow_pickle=True) if existing_targets.shape[0] > targets.shape[0] or \ (existing_targets.shape == targets.shape and np.allclose(existing_targets, targets)): @@ -278,7 +278,7 @@ def save_targets_ensemble(self, targets): with lockfile.LockFile(lock_path): if os.path.exists(filepath): with open(filepath, 'rb') as fh: - existing_targets = np.load(fh) + existing_targets = np.load(fh, allow_pickle=True) if existing_targets.shape[0] > targets.shape[0] or \ (existing_targets.shape == targets.shape and np.allclose(existing_targets, targets)): @@ -299,7 +299,7 @@ def load_targets_ensemble(self): lock_path = filepath + '.lock' with lockfile.LockFile(lock_path): with open(filepath, 'rb') as fh: - targets = np.load(fh) + targets = np.load(fh, allow_pickle=True) return targets @@ -346,8 +346,9 @@ def save_model(self, model, idx, seed): def list_all_models(self, seed): model_directory = self.get_model_dir() if seed >= 0: - model_files = glob.glob(os.path.join(model_directory, - '%s.*.model' % seed)) + model_files = glob.glob( + os.path.join(glob.escape(model_directory), '%s.*.model' % seed) + ) else: model_files = os.listdir(model_directory) model_files = [os.path.join(model_directory, mf) @@ -408,9 +409,11 @@ def load_ensemble(self, seed): self.logger.warning('Directory %s does not exist' % ensemble_dir) return None + print(seed) if seed >= 0: - indices_files = glob.glob(os.path.join(ensemble_dir, - '%s.*.ensemble' % seed)) + indices_files = glob.glob( + os.path.join(glob.escape(ensemble_dir), '%s.*.ensemble' % seed) + ) indices_files.sort() else: indices_files = os.listdir(ensemble_dir) @@ -419,6 +422,7 @@ def load_ensemble(self, seed): with open(indices_files[-1], 'rb') as fh: ensemble_members_run_numbers = pickle.load(fh) + print(indices_files) return ensemble_members_run_numbers diff --git a/doc/releases.rst b/doc/releases.rst index 52f01e4247..20cc286526 100644 --- a/doc/releases.rst +++ b/doc/releases.rst @@ -11,6 +11,22 @@ Releases ======== +Version 0.5.2 +============= + +* FIX #669: Correctly handle arguments to the ``AutoMLRegressor`` +* FIX #667: Auto-sklearn works with numpy 1.16.3 again. +* ADD #676: Allow brackets [ ] inside the temporary and output directory paths. +* ADD #424: (Experimental) scripts to reproduce the results from the original Auto-sklearn paper. + +Contributors +************ + +* Jin Woo Ahn +* Herilalaina Rakotoarison +* Matthias Feurer +* yazanobeidi + Version 0.5.1 ============= diff --git a/scripts/2015_nips_paper/Readme.md b/scripts/2015_nips_paper/Readme.md new file mode 100644 index 0000000000..fd57c78f43 --- /dev/null +++ b/scripts/2015_nips_paper/Readme.md @@ -0,0 +1,34 @@ +## Reproduce results of Efficient and Robust Automated Machine Learning (Feurer et al.) 
+This folder contains all scripts necessary to reproduce the results shown in +Figure 3 of Efficient and Robust Automated Machine Learning (Feurer et al.). The scripts +can be modified to include different datasets, change the runtime, etc. The scripts +only handle classification tasks, and balanced accuracy is used as the score metric. + +### 1. Creating commands.txt +To run the experiment, first create commands.txt by running: +```bash +cd setup +bash create_commands.sh +``` +The script can be modified to run experiments with different settings, e.g. +a different runtime and/or different tasks. + +### 2. Executing commands.txt +Run each command in commands.txt: +```bash +cd run +bash run_commands.sh +``` +Each command line in commands.txt first executes model fitting, and then creates the +single best and ensemble trajectories. Therefore, the commands can be run in parallel +on a cluster by modifying run_commands.sh. + +### 3. Plotting the results +To plot the results, run: +```bash +cd plot +python plot_ranks.py +``` + + + diff --git a/scripts/2015_nips_paper/plot/plot_ranks.py b/scripts/2015_nips_paper/plot/plot_ranks.py new file mode 100644 index 0000000000..5be095389c --- /dev/null +++ b/scripts/2015_nips_paper/plot/plot_ranks.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 + +import csv +import sys +import os + +import numpy as np + +import pandas as pd +import matplotlib.pyplot as plt + + +def read_csv(fn, has_header=True, data_type=str): + """ + Function which reads the csv files containing trajectories + of the auto-sklearn runs. + """ + data = list() + header = None + with open(fn, 'r') as csvfile: + csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|') + for row in csv_reader: + if header is None and has_header: + header = row + continue + data.append(list(map(data_type, [i.strip() for i in row]))) + return header, data + + +def fill_trajectory(performance_list, time_list): + # Create one Series per seed. + series_list = [] + for n in range(len(time_list)): + series_list.append(pd.Series(data=performance_list[n], index=time_list[n])) + + # Concatenate into one DataFrame with NaN values. + series = pd.concat(series_list, axis=1) + + # Fill missing performance values (NaNs) with the last non-NaN value. + series = series.fillna(method='ffill') + + # Return the trajectories over all seeds. + return series + + +def main(): + # name of the file where the plot is stored + saveto = "../plot.png" + # runtime of each experiment + max_runtime = 3600 + # folder where all trajectories are stored. + working_directory = "../log_output" + + # list of models + model_list = ['vanilla', 'ensemble', 'metalearning', 'meta_ensemble'] + + # list of seeds + seed_dir = os.path.join(working_directory, 'vanilla') + seed_list = [seed for seed in os.listdir(seed_dir)] + + # list of tasks + vanilla_task_dir = os.path.join(seed_dir, seed_list[0]) + task_list = [task_id for task_id in os.listdir(vanilla_task_dir)] + + # Step 1. Merge all trajectories into one DataFrame object. + ##################################################################################### + all_trajectories = [] + + for model in model_list: + trajectories = [] + for task_id in task_list: + csv_files = [] + + for seed in seed_list: + # collect all csv files of different seeds for the current model and + # the current task.
+ if model in ['vanilla', 'ensemble']: + csv_file = os.path.join(working_directory, + 'vanilla', + seed, + task_id, + "score_{}.csv".format(model) + ) + + elif model in ['metalearning', 'meta_ensemble']: + csv_file = os.path.join(working_directory, + 'metalearning', + seed, + task_id, + "score_{}.csv".format(model), + ) + csv_files.append(csv_file) + + performance_list = [] + time_list = [] + + # Get data from csv + for fl in csv_files: + _, csv_data = read_csv(fl, has_header=True) + csv_data = np.array(csv_data) + # Replace too high values with args.maxsize + data = [min([sys.maxsize, float(i.strip())]) for i in + csv_data[:, 2]] # test trajectories are stored in third column + + time_steps = [float(i.strip()) for i in csv_data[:, 0]] + assert time_steps[0] == 0 + + performance_list.append(data) + time_list.append(time_steps) + + # trajectory is the pd.Series object containing all seed runs of the + # current model and current task. + trajectory = fill_trajectory(performance_list, time_list) + trajectories.append(trajectory) + + # list[list[pd.Series]] + all_trajectories.append(trajectories) + + # Step 2. Compute average ranks of the trajectories. + ##################################################################################### + all_rankings = [] + n_iter = 500 # number of bootstrap samples to use for estimating the ranks. + n_tasks = len(task_list) + + for i in range(n_iter): + pick = np.random.choice(all_trajectories[0][0].shape[1], + size=(len(model_list))) + + for j in range(n_tasks): + all_trajectories_tmp = pd.DataFrame( + {model_list[k]: at[j].iloc[:, pick[k]] for + k, at in enumerate(all_trajectories)} + ) + all_trajectories_tmp = all_trajectories_tmp.fillna(method='ffill', axis=0) + r_tmp = all_trajectories_tmp.rank(axis=1) + all_rankings.append(r_tmp) + + final_ranks = [] + for i, model in enumerate(model_list): + ranks_for_model = [] + for ranking in all_rankings: + ranks_for_model.append(ranking.loc[:, model]) + ranks_for_model = pd.DataFrame(ranks_for_model) + ranks_for_model = ranks_for_model.fillna(method='ffill', axis=1) + final_ranks.append(ranks_for_model.mean(skipna=True)) + + # Step 3. Plot the average ranks over time. 
+ ##################################################################################### + for i, model in enumerate(model_list): + X_data = [] + y_data = [] + for x, y in final_ranks[i].iteritems(): + X_data.append(x) + y_data.append(y) + X_data.append(max_runtime) + y_data.append(y) + plt.plot(X_data, y_data, label=model) + plt.xlabel('time [sec]') + plt.ylabel('average rank') + plt.legend() + plt.savefig(saveto) + + +if __name__ == "__main__": + main() diff --git a/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py b/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py new file mode 100644 index 0000000000..f31e16e65f --- /dev/null +++ b/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py @@ -0,0 +1,64 @@ +import os + +import arff +from shutil import copyfile + + +def remove_dataset_from_aslib_arff(input_file, + output_file, + id, + ): + with open(input_file) as fh: + arff_object = arff.load(fh) + for i in range(len(arff_object['data']) - 1, -1, -1): + if str(arff_object['data'][i][0]) == str(id): + del arff_object['data'][i] + + with open(output_file, "w") as fh: + arff.dump(arff_object, fh) + del arff_object + + +def remove_dataset(metadata_directory, + output_directory, + id, + ): + metadata_sub_directories = os.listdir(metadata_directory) + + for metadata_sub_directory in metadata_sub_directories: + subdir = os.path.join(metadata_directory, metadata_sub_directory) + output_subdir = os.path.join(output_directory, metadata_sub_directory) + try: + os.makedirs(output_subdir) + except OSError: + pass + + arf = "algorithm_runs.arff" + algorithm_runs_file = os.path.join(subdir, arf) + output_file = os.path.join(output_subdir, arf) + remove_dataset_from_aslib_arff(algorithm_runs_file, output_file, id) + + fcf = "feature_costs.arff" + feature_costs_file = os.path.join(subdir, fcf) + output_file = os.path.join(output_subdir, fcf) + remove_dataset_from_aslib_arff(feature_costs_file, output_file, id) + + frf = "feature_runstatus.arff" + feature_runstatus_file = os.path.join(subdir, frf) + output_file = os.path.join(output_subdir, frf) + remove_dataset_from_aslib_arff(feature_runstatus_file, output_file, id) + + fvf = "feature_values.arff" + features_values_file = os.path.join(subdir, fvf) + output_file = os.path.join(output_subdir, fvf) + remove_dataset_from_aslib_arff(features_values_file, output_file, id) + + desc = "description.txt" + description_file = os.path.join(subdir, desc) + output_file = os.path.join(output_subdir, desc) + copyfile(description_file, output_file) + + configs = "configurations.csv" + configs_file = os.path.join(subdir, configs) + output_file = os.path.join(output_subdir, configs) + copyfile(configs_file, output_file) diff --git a/scripts/2015_nips_paper/run/run_auto_sklearn.py b/scripts/2015_nips_paper/run/run_auto_sklearn.py new file mode 100644 index 0000000000..279fee7806 --- /dev/null +++ b/scripts/2015_nips_paper/run/run_auto_sklearn.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 + +import os +import argparse +import numpy as np +import openml + +from autosklearn.classification import AutoSklearnClassifier +from autosklearn.metrics import balanced_accuracy +from remove_dataset_from_metadata import remove_dataset +import score_ensemble + + +def load_task(task_id): + """Function used for loading data.""" + task = openml.tasks.get_task(task_id) + X, y = task.get_X_and_y() + train_indices, test_indices = task.get_train_test_split_indices() + X_train = X[train_indices] + y_train = y[train_indices] + X_test = X[test_indices] + y_test = 
y[test_indices] + dataset = openml.datasets.get_dataset(task.dataset_id) + _, _, cat = dataset.get_data(return_categorical_indicator=True, + target=task.target_name) + del _ + del dataset + cat = ['categorical' if c else 'numerical' for c in cat] + + unique = np.unique(y_train) + mapping = {unique_value: i for i, unique_value in enumerate(unique)} + y_train = np.array([mapping[value] for value in y_train]) + y_test = np.array([mapping[value] for value in y_test]) + + return X_train, y_train, X_test, y_test, cat + + +def run_experiment(working_directory, + time_limit, + per_run_time_limit, + task_id, + seed, + use_metalearning, + ): + # set this to local dataset cache + # openml.config.cache_directory = os.path.join(working_directory, "../cache") + + seed_dir = os.path.join(working_directory, str(seed)) + try: + os.makedirs(seed_dir) + except Exception: + print("Directory {0} aleardy created.".format(seed_dir)) + + tmp_dir = os.path.join(seed_dir, str(task_id)) + + # With metalearning + if use_metalearning is True: + # path to the original metadata directory. + metadata_directory = os.path.abspath(os.path.dirname(__file__)) + metadata_directory = os.path.join(metadata_directory, + "../../../autosklearn/metalearning/files/") + + # Create new metadata directory not containing task_id. + new_metadata_directory = os.path.abspath(os.path.join(working_directory, + "metadata_%i" % task_id)) + + try: + os.makedirs(new_metadata_directory) + except OSError: + pass # pass because new metadata is created for this task. + + # remove the given task id from metadata directory. + remove_dataset(metadata_directory, new_metadata_directory, task_id) + + automl_arguments = { + 'time_left_for_this_task': time_limit, + 'per_run_time_limit': per_run_time_limit, + 'initial_configurations_via_metalearning': 25, + 'ensemble_size': 0, + 'seed': seed, + 'ml_memory_limit': 3072, + 'resampling_strategy': 'holdout', + 'resampling_strategy_arguments': {'train_size': 0.67}, + 'tmp_folder': tmp_dir, + 'delete_tmp_folder_after_terminate': False, + 'disable_evaluator_output': False, + 'metadata_directory': new_metadata_directory + } + + # Without metalearning + else: + automl_arguments = { + 'time_left_for_this_task': time_limit, + 'per_run_time_limit': per_run_time_limit, + 'initial_configurations_via_metalearning': 0, + 'ensemble_size': 0, + 'seed': seed, + 'ml_memory_limit': 3072, + 'resampling_strategy': 'holdout', + 'resampling_strategy_arguments': {'train_size': 0.67}, + 'tmp_folder': tmp_dir, + 'delete_tmp_folder_after_terminate': False, + 'disable_evaluator_output': False, + } + + automl = AutoSklearnClassifier(**automl_arguments) + + X_train, y_train, X_test, y_test, cat = load_task(task_id) + + automl.fit(X_train, y_train, + dataset_name=str(task_id), + X_test=X_test, y_test=y_test, + metric=balanced_accuracy) + + +def main(working_directory, + output_file, + task_id, + seed, + model, + time_limit, + per_run_time_limit): + # vanilla and metalearning must be called first before ensemble and + # meta_ensemble can be called, respectively. 
+ if model == "vanilla": + run_experiment(working_directory, + time_limit, + per_run_time_limit, + task_id, + seed, + use_metalearning=False, + ) + score_ensemble.main(working_directory, + output_file, + task_id, + seed, + ensemble_size=1, + ) + elif model == "metalearning": + run_experiment(working_directory, + time_limit, + per_run_time_limit, + task_id, + seed, + use_metalearning=True, + ) + score_ensemble.main(working_directory, + output_file, + task_id, + seed, + ensemble_size=1, + ) + else: + score_ensemble.main(working_directory, + output_file, + task_id, + seed, + ensemble_size=50, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--working-directory', type=str, required=True) + parser.add_argument("--output-file", type=str, required=True) + parser.add_argument("--time-limit", type=int, required=True) + parser.add_argument("--per-runtime-limit", type=int, required=True) + parser.add_argument('--task-id', type=int, required=True) + parser.add_argument('-s', '--seed', type=int) + parser.add_argument("--model", type=str, required=True) + + args = parser.parse_args() + working_directory = args.working_directory # logdir/vanilla or logdir/metalearning + output_file = args.output_file + task_id = args.task_id + seed = args.seed + model = args.model + time_limit = args.time_limit + per_run_time_limit = args.per_runtime_limit + + main(working_directory, + output_file, + task_id, + seed, + model, + time_limit, + per_run_time_limit, + ) diff --git a/scripts/2015_nips_paper/run/run_commands.sh b/scripts/2015_nips_paper/run/run_commands.sh new file mode 100644 index 0000000000..18963109f7 --- /dev/null +++ b/scripts/2015_nips_paper/run/run_commands.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# Run all commands in commands.txt. Each command line executes first the model fitting, +# and then creates the trajectory of a single model and the ensemble. Therefore, each +# line can be executed separately and in parallel, for example, on a cluster environment. 
+cat "../commands.txt" | while read line; do eval "$line"; done \ No newline at end of file diff --git a/scripts/2015_nips_paper/run/score_ensemble.py b/scripts/2015_nips_paper/run/score_ensemble.py new file mode 100644 index 0000000000..54d51c4e9c --- /dev/null +++ b/scripts/2015_nips_paper/run/score_ensemble.py @@ -0,0 +1,235 @@ +from argparse import ArgumentParser +import csv +import glob +import os +import time + +import numpy as np +import sklearn.externals.joblib as joblib + +from autosklearn.ensembles.ensemble_selection import EnsembleSelection +from autosklearn.metrics import balanced_accuracy + +from autosklearn.util.backend import create + + +def _load_file(f): + split = f.split('_') + as_seed = int(split[-2]) + ta_seed = int(split[-1].split('.')[0]) + np_array = np.load(f) + return np_array, (as_seed, ta_seed), os.path.getmtime(f) + + +def read_files(directory, seed=None, n_jobs=1): + + seed_pattern = '*' if seed is None else str(seed) + glob_pattern = os.path.join(directory, "predictions_*_%s_*.npy" % + seed_pattern) + files = sorted(glob.glob(glob_pattern)) + files = joblib.Parallel(n_jobs=n_jobs, verbose=10)( + joblib.delayed(_load_file)(f=f) for f in files) + + return files + + +def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1): + seed = None if seed is None or seed < 0 else int(seed) + + if isinstance(input_directories, str): + # add seed and task id directories + input_directories += '/%i/%i' % (seed, task_id) + input_directories = [input_directories] + + else: + new_directories = [] + for dir in input_directories: + dir += '/%i/%i' % (seed, task_id) + new_directories.append(dir) + input_directories = new_directories + + validation_files = [] + test_files = [] + starttime = np.inf + + # Get the prediction files. + for input_directory in input_directories: + print('Loading files from input directory:', input_directory) + validation_files_ = read_files( + os.path.join(input_directory, + '.auto-sklearn/predictions_ensemble'), + n_jobs=n_jobs) + validation_files.extend(validation_files_) + test_files_ = read_files( + os.path.join(input_directory, + '.auto-sklearn/predictions_test'), + n_jobs=n_jobs) + test_files.extend(test_files_) + + assert len(validation_files_) > 0 + assert len(validation_files_) == len(test_files_) + + print('Loaded %d files!' % len(validation_files_)) + + # if not specified, we get all files. + seed_pattern = '*' if seed is None else str(seed) + glob_pattern = os.path.join(input_directory, + ".auto-sklearn", + "start_time_%s" % seed_pattern) + start_time_files = glob.glob(glob_pattern) + + # find the earliest startime. 
+ for start_time_file in start_time_files: + with open(start_time_file, "r") as fh: + starttime_candidate = float(fh.read()) + if starttime_candidate < starttime: + starttime = starttime_candidate + + del validation_files_ + del test_files_ + + validation_files.sort(key=lambda t: t[-1]) + + keys_to_test_files = {test_file[1]: test_file + for test_file in test_files} + # Resort such that both files have the same order + test_files = [keys_to_test_files[validation_file[1]] + for validation_file in validation_files] + + assert [validation_file[1] for validation_file in validation_files] == [ + test_file[1] for test_file in test_files] + + losses = [] + top_models_at_step = dict() + + backend = create(input_directory, input_directory + "_output", + delete_tmp_folder_after_terminate=False, + delete_output_folder_after_terminate=True, + shared_mode=True) + valid_labels = backend.load_targets_ensemble() + score = balanced_accuracy + + # Compute losses and remember best model at each time step. + for i in range(len(validation_files)): + loss = 1 - score(valid_labels, validation_files[i][0]) + losses.append(loss) + sorted_losses = np.argsort(losses)[:200] + top_models_at_step[i] = sorted_losses + + models_to_remove = set(list(range(len(validation_files)))) + for top_models_at_i in top_models_at_step: + for top_model in top_models_at_step[top_models_at_i]: + if top_model in models_to_remove: + models_to_remove.remove(top_model) + + print("Removing the following %d models from the library: %s" + % (len(models_to_remove), models_to_remove)) + for model_id in models_to_remove: + validation_files[model_id] = None + test_files[model_id] = None + + print('Starting ensemble building!') + output = joblib.Parallel(n_jobs=n_jobs, verbose=20)( + joblib.delayed( + evaluate)(input_directory=input_directories[0], + validation_files=[validation_files[j] for + j in range(len(validation_files)) + if j in top_models_at_step[i]], + test_files=[test_files[j] for + j in range(len(test_files)) + if j in top_models_at_step[i]], + ensemble_size=ensemble_size) + for i in range(len(test_files))) + + # Create output csv file + file_path = os.path.abspath("%s/%s" % (input_directory, output_file)) + with open(file_path, "w") as csv_file: + fieldnames = ['Time', 'Training (Empirical) Performance', + 'Test Set Performance'] + csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames) + csv_writer.writeheader() + + # First time step + csv_writer.writerow({'Time': 0, + 'Training (Empirical) Performance': 1.0, + 'Test Set Performance': 1.0}) + + for i, o in enumerate(output): + csv_writer.writerow({'Time': o['ensemble_time'] + + o['time_function_evaluation'] + - starttime, + 'Training (Empirical) Performance': + o['ensemble_error'], + 'Test Set Performance': + o['ensemble_test_error']}) + + +def evaluate(input_directory, validation_files, test_files, ensemble_size=50): + + backend = create(input_directory, input_directory + "_output", + delete_tmp_folder_after_terminate=False, + delete_output_folder_after_terminate=False, + shared_mode=True) + + valid_labels = backend.load_targets_ensemble() + D = backend.load_datamanager() + test_labels = D.data["Y_test"] + + score = balanced_accuracy + + # Read the modification time of the predictions file and + # compute the interval to the first prediction file. 
+ # This interval will be add to the time we needed to build the ensemble + time_function_evaluation = validation_files[-1][-1] + + # Build the ensemble + start = time.time() + ensemble_selection = EnsembleSelection(ensemble_size=ensemble_size, + task_type=D.info['task'], + metric=score, + random_state=np.random.RandomState()) + + validation_predictions = np.array([v[0] for v in validation_files]) + test_predictions = np.array([t[0] for t in test_files]) + + ensemble_selection.fit(validation_predictions, valid_labels, + identifiers=None) + y_hat_ensemble = ensemble_selection.predict(np.array( + validation_predictions)) + y_hat_test = ensemble_selection.predict(np.array(test_predictions)) + + # Compute validation error + ensemble_error = 1 - score(valid_labels, y_hat_ensemble) + + # Compute test error + ensemble_test_error = 1 - score(test_labels, y_hat_test) + + ensemble_time = time.time() - start + + rval = {'ensemble_time': ensemble_time, + 'time_function_evaluation': time_function_evaluation, + 'ensemble_error': ensemble_error, + 'ensemble_test_error': ensemble_test_error} + + return rval + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--input-directory', type=str, + required=True, nargs='+') + parser.add_argument('--task-id', type=int, required=True) + parser.add_argument('-s', '--seed', type=int) + parser.add_argument("--output-file", type=str, default='score_ensemble.csv') + parser.add_argument("--ensemble-size", type=int, default=50) + parser.add_argument("--n-jobs", type=int, default=1) + args = parser.parse_args() + + input_directory = args.input_directory # logdir/vanilla or logdir/metalearning + output_file = args.output_file + task_id = args.task_id + seed = args.seed + ensemble_size = args.ensemble_size + n_jobs = args.n_jobs + + main(input_directory, output_file, task_id, seed, ensemble_size) diff --git a/scripts/2015_nips_paper/setup/create_commands.sh b/scripts/2015_nips_paper/setup/create_commands.sh new file mode 100755 index 0000000000..23c409b2c5 --- /dev/null +++ b/scripts/2015_nips_paper/setup/create_commands.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +dir=../log_output # working directory +#task_ids=$(python get_tasks.py) +task_ids="233 236 242 244 246 248 251 252 253 254 256 258 260 261 262 266 273 275 288 2117 2118 2119 2120 2122 2123 2350 3043 3044 75090 75092 75093 75098 75099 75100 75103 75104 75105 75106 75107 75108 75111 75112 75113 75114 75115 75116 75117 75119 75120 75121 75122 75125 75126 75129 75131 75133 75136 75137 75138 75139 75140 75142 75143 75146 75147 75148 75149 75150 75151 75152 75153 75155 75157 75159 75160 75161 75162 75163 75164 75165 75166 75168 75169 75170 75171 75172 75173 75174 75175 75176 75179 75180 75182 75183 75184 75185 75186 75188 75189 75190 75191 75192 75194 75195 75196 75197 75198 75199 75200 75201 75202 75203 75204 75205 75206 75207 75208 75209 75210 75212 75213 75216 75218 75220 75222 75224 75226 75228 75229 75233 75238 75240 75244 75245 75246 75247 75248 75249 75251 " +seeds="0 1 2 3 4 5 6 7 8 9 " +time_limit=3600 +per_runtime_limit=360 + + +# Create commands. Each command line first executes model fitting with auto-sklearn and +# creates the trajectory of the single best performance. Then, it creates the trajectory +# of the ensemble performance. +echo "creating commands.txt..." +for seed in $seeds; do + for task_id in $task_ids; do + for model in vanilla metalearning; do + # store vanilla and ensemble in one folder and meta and meta_ens in another. 
+ if [ "$model" == "vanilla" ]; then + cmd="python run_auto_sklearn.py --working-directory $dir/$model --task-id $task_id \ + -s $seed --output-file score_vanilla.csv --model vanilla --time-limit $time_limit \ + --per-runtime-limit $per_runtime_limit " + cmd+="; python run_auto_sklearn.py --working-directory $dir/$model --task-id $task_id \ + -s $seed --output-file score_ensemble.csv --model ensemble --time-limit $time_limit \ + --per-runtime-limit $per_runtime_limit " + elif [ "$model" == "metalearning" ]; then + cmd="python run_auto_sklearn.py --working-directory $dir/$model --task-id $task_id \ + -s $seed --output-file score_metalearning.csv --model metalearning --time-limit $time_limit \ + --per-runtime-limit $per_runtime_limit " + cmd+="; python run_auto_sklearn.py --working-directory $dir/$model --task-id $task_id \ + -s $seed --output-file score_meta_ensemble.csv --model meta_ensemble --time-limit $time_limit \ + --per-runtime-limit $per_runtime_limit " + fi + # Create commands.txt in the 2015_nips_paper folder and not in setup folder. + echo $cmd >> ../commands.txt + done + done +done +echo "creating commands.txt done" diff --git a/scripts/2015_nips_paper/setup/get_tasks.py b/scripts/2015_nips_paper/setup/get_tasks.py new file mode 100644 index 0000000000..09f06a0a64 --- /dev/null +++ b/scripts/2015_nips_paper/setup/get_tasks.py @@ -0,0 +1,57 @@ +# get tasks from openml +import openml +import pandas as pd + + +# List of dataset IDs used for the NIPS experiment. +dataset_ids = [1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053, + 1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130, + 1134, 1138, 1139, 1142, 1146, 1161, 1166, 12, 14, 16, 179, 180, 181, 182, + 184, 185, 18, 21, 22, 23, 24, 26, 273, 28, 293, 300, 30, 31, 32, 351, 354, + 357, 36, 389, 38, 390, 391, 392, 393, 395, 396, 398, 399, 3, 401, 44, 46, + 554, 57, 60, 679, 6, 715, 718, 720, 722, 723, 727, 728, 734, 735, 737, + 740, 741, 743, 751, 752, 761, 772, 797, 799, 803, 806, 807, 813, 816, 819, + 821, 822, 823, 833, 837, 843, 845, 846, 847, 849, 866, 871, 881, 897, 901, + 903, 904, 910, 912, 913, 914, 917, 923, 930, 934, 953, 958, 959, 962, 966, + 971, 976, 977, 978, 979, 980, 991, 993, 995] + + +def get_task_ids(dataset_ids): + # return task ids of corresponding datset ids. + + # active tasks + tasks_a = openml.tasks.list_tasks(task_type_id=1, status='active') + tasks_a = pd.DataFrame.from_dict(tasks_a, orient="index") + + # query only those with holdout as the resampling startegy. + tasks_a = tasks_a[(tasks_a.estimation_procedure == "33% Holdout set")] + + # deactivated tasks + tasks_d = openml.tasks.list_tasks(task_type_id=1, status='deactivated') + tasks_d = pd.DataFrame.from_dict(tasks_d, orient="index") + + tasks_d = tasks_d[(tasks_d.estimation_procedure == "33% Holdout set")] + + task_ids = [] + for did in dataset_ids: + task_a = list(tasks_a.query("did == {}".format(did)).tid) + if len(task_a) > 1: # if there are more than one task, take the lowest one. + task_a = [min(task_a)] + task_d = list(tasks_d.query("did == {}".format(did)).tid) + if len(task_d) > 1: + task_d = [min(task_d)] + task_ids += list(task_a + task_d) + + return task_ids # return list of all task ids. + + +def main(): + task_ids = sorted(get_task_ids(dataset_ids)) + string_to_print = '' + for tid in task_ids: + string_to_print += str(tid) + ' ' + print(string_to_print) # print the task ids for bash script. 
+ + +if __name__ == "__main__": + main() diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index e5454dd8db..c143bcaa9c 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -271,7 +271,7 @@ def test_fit_pSMAC(self): true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn', 'true_targets_ensemble.npy') with open(true_targets_ensemble_path, 'rb') as fh: - true_targets_ensemble = np.load(fh) + true_targets_ensemble = np.load(fh, allow_pickle=True) true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0 true_targets_ensemble = true_targets_ensemble.astype(int) probas = np.zeros((len(true_targets_ensemble), 2), dtype=float) diff --git a/test/test_ensemble_builder/test_ensemble.py b/test/test_ensemble_builder/test_ensemble.py index 7515f75e6d..b9dfa778fb 100644 --- a/test/test_ensemble_builder/test_ensemble.py +++ b/test/test_ensemble_builder/test_ensemble.py @@ -16,7 +16,7 @@ class BackendMock(object): - + def __init__(self): this_directory = os.path.abspath( os.path.dirname(__file__) @@ -24,7 +24,7 @@ def __init__(self): self.temporary_directory = os.path.join( this_directory, 'data', ) - + def load_targets_ensemble(self): with open(os.path.join( self.temporary_directory, @@ -32,11 +32,11 @@ def load_targets_ensemble(self): "predictions_ensemble", "predictions_ensemble_true.npy" ),"rb") as fp: - y = np.load(fp) + y = np.load(fp, allow_pickle=True) return y - + class EnsembleBuilderMemMock(EnsembleBuilder): - + def fit_ensemble(self,selected_keys): np.ones([10000000,1000000]) @@ -49,7 +49,7 @@ def tearDown(self): pass def testRead(self): - + ensbuilder = EnsembleBuilder( backend=self.backend, dataset_name="TEST", @@ -74,9 +74,9 @@ def testRead(self): ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy" ) self.assertEqual(ensbuilder.read_preds[filename]["ens_score"], 1.0) - + def testNBest(self): - + ensbuilder = EnsembleBuilder( backend=self.backend, dataset_name="TEST", @@ -86,7 +86,7 @@ def testNBest(self): seed=0, # important to find the test files ensemble_nbest=1, ) - + ensbuilder.read_ensemble_preds() sel_keys = ensbuilder.get_n_best_preds() @@ -97,10 +97,10 @@ def testNBest(self): ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy" ) self.assertEquals(sel_keys[0], fixture) - + def testFallBackNBest(self): - - ensbuilder = EnsembleBuilder(backend=self.backend, + + ensbuilder = EnsembleBuilder(backend=self.backend, dataset_name="TEST", task_type=1, #Binary Classification metric=roc_auc, @@ -108,7 +108,7 @@ def testFallBackNBest(self): seed=0, # important to find the test files ensemble_nbest=1 ) - + ensbuilder.read_ensemble_preds() filename = os.path.join( @@ -122,7 +122,7 @@ def testFallBackNBest(self): ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy" ) ensbuilder.read_preds[filename]["ens_score"] = -1 - + sel_keys = ensbuilder.get_n_best_preds() fixture = os.path.join( @@ -130,10 +130,10 @@ def testFallBackNBest(self): ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy" ) self.assertEquals(sel_keys[0], fixture) - + def testGetValidTestPreds(self): - - ensbuilder = EnsembleBuilder(backend=self.backend, + + ensbuilder = EnsembleBuilder(backend=self.backend, dataset_name="TEST", task_type=1, #Binary Classification metric=roc_auc, @@ -141,9 +141,9 @@ def testGetValidTestPreds(self): seed=0, # important to find the test files ensemble_nbest=1 ) - + ensbuilder.read_ensemble_preds() - + d2 = os.path.join( self.backend.temporary_directory, 
".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy" @@ -152,21 +152,21 @@ def testGetValidTestPreds(self): self.backend.temporary_directory, ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy" ) - + sel_keys = ensbuilder.get_n_best_preds() - + ensbuilder.get_valid_test_preds(selected_keys=sel_keys) - + # selected --> read valid and test predictions self.assertIsNotNone(ensbuilder.read_preds[d2][Y_VALID]) self.assertIsNotNone(ensbuilder.read_preds[d2][Y_TEST]) - + # not selected --> should still be None self.assertIsNone(ensbuilder.read_preds[d1][Y_VALID]) self.assertIsNone(ensbuilder.read_preds[d1][Y_TEST]) - + def testEntireEnsembleBuilder(self): - + ensbuilder = EnsembleBuilder( backend=self.backend, dataset_name="TEST", @@ -177,7 +177,7 @@ def testEntireEnsembleBuilder(self): ensemble_nbest=2, ) ensbuilder.SAVE2DISC = False - + ensbuilder.read_ensemble_preds() d2 = os.path.join( @@ -187,12 +187,12 @@ def testEntireEnsembleBuilder(self): sel_keys = ensbuilder.get_n_best_preds() self.assertGreater(len(sel_keys), 0) - + ensemble = ensbuilder.fit_ensemble(selected_keys=sel_keys) print(ensemble, sel_keys) - + n_sel_valid, n_sel_test = ensbuilder.get_valid_test_preds(selected_keys=sel_keys) - + # both valid and test prediction files are available self.assertGreater(len(n_sel_valid), 0) self.assertEqual(n_sel_valid, n_sel_test) @@ -221,10 +221,10 @@ def testEntireEnsembleBuilder(self): # so that y_valid should be exactly y_valid_d2 y_valid_d2 = ensbuilder.read_preds[d2][Y_VALID][:, 1] np.testing.assert_array_almost_equal(y_valid, y_valid_d2) - + def testMain(self): - - ensbuilder = EnsembleBuilder(backend=self.backend, + + ensbuilder = EnsembleBuilder(backend=self.backend, dataset_name="TEST", task_type=1, #Binary Classification metric=roc_auc, @@ -234,17 +234,17 @@ def testMain(self): max_iterations=1 # prevents infinite loop ) ensbuilder.SAVE2DISC = False - + ensbuilder.main() - + self.assertEqual(len(ensbuilder.read_preds), 2) self.assertIsNotNone(ensbuilder.last_hash) self.assertIsNotNone(ensbuilder.y_true_ensemble) - + def testLimit(self): - - - ensbuilder = EnsembleBuilderMemMock(backend=self.backend, + + + ensbuilder = EnsembleBuilderMemMock(backend=self.backend, dataset_name="TEST", task_type=1, #Binary Classification metric=roc_auc, @@ -255,9 +255,9 @@ def testLimit(self): memory_limit=10 # small memory limit to trigger MemoryException ) ensbuilder.SAVE2DISC = False - + ensbuilder.run() - + # it should try to reduce ensemble_nbest until it also failed at 2 self.assertEqual(ensbuilder.ensemble_nbest,1)