Skip to content

Commit

Permalink
prepares for Command and Conquer style browser based experiment manag…
Browse files Browse the repository at this point in the history
…ement

- `experiment_name` is now compulsory
- Addresses #207 and prepare for browser based "command center"
- Added 'trees' reduction strategy
- Added 'forrest' reduction strategy
- added `scan_utils.py` as a home for helper functions for /scan
- fixed tests to handle the new changes
  • Loading branch information
mikkokotila committed Aug 5, 2019
1 parent b682a5e commit 240ff85
Show file tree
Hide file tree
Showing 22 changed files with 209 additions and 45 deletions.
6 changes: 4 additions & 2 deletions docs/Monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Epoch-by-epoch training data is available during the experiment using the `Exper

```python
model.fit(...
callbacks=[talos.utils.ExperimentLogCallback('filename_or_path', params)])
callbacks=[talos.utils.ExperimentLogCallback('experiment_name', params)])
```
NOTE: `params` is the params dictionary in the `Scan()` input model.
Here `params` is the params dictionary in the `Scan()` input model. Both
`experiment_name` and `experiment_id` should match with the current experiment,
as otherwise the epoch-by-epoch logs will be written into the wrong
experiment's folder.
2 changes: 1 addition & 1 deletion docs/Scan.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Argument | Input | Description
`y` | array or list of arrays | prediction outcome variable
`params` | dict | the parameter dictionary
`model` | function | the Keras model as a function
`experiment_name` | str | Used for experiment log
`experiment_name` | str | Used for creating the experiment logging folder
`x_val` | array or list of arrays | validation data for x
`y_val` | array or list of arrays | validation data for y
`val_split` | float | validation data split ratio
Expand Down
8 changes: 7 additions & 1 deletion talos/autom8/automodel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
class AutoModel:

def __init__(self, task, metric=None):
def __init__(self, task, experiment_name, metric=None):

'''
Expand All @@ -19,12 +19,16 @@ def __init__(self, task, metric=None):
If 'continuous' then mae is used for metric, if 'binary',
'multiclass', or 'multilabel', f1score is used. Accuracy is always
used.
experiment_name | str | Must be same as in `Scan()`
metric : None or list
You can also input a list with one or more custom metrics or names
of Keras or Talos metrics.
'''

from talos.utils.experiment_log_callback import ExperimentLogCallback

self.task = task
self.experiment_name = experiment_name
self.metric = metric

if self.task is not None:
Expand All @@ -36,6 +40,7 @@ def __init__(self, task, metric=None):

# create the model
self.model = self._create_input_model
self.callback = ExperimentLogCallback

def _set_metric(self):

Expand Down Expand Up @@ -112,6 +117,7 @@ def _create_input_model(self, x_train, y_train, x_val, y_val, params):
batch_size=params['batch_size'],
epochs=params['epochs'],
verbose=0,
callbacks=[self.callback(self.experiment_name, params)],
validation_data=[x_val, y_val])

# pass the output to Talos
Expand Down
16 changes: 13 additions & 3 deletions talos/autom8/autoscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ class AutoScan:

def __init__(self,
task,
max_param_values):
max_param_values,
experiment_name):

'''Configure the `AutoScan()` experiment and then use
the property `start` in the returned class object to start
Expand All @@ -16,6 +17,7 @@ def __init__(self,

self.task = task
self.max_param_values = max_param_values
self.experiment_name = experiment_name

def start(self, x, y, **kwargs):

Expand All @@ -34,11 +36,19 @@ def start(self, x, y, **kwargs):

try:
kwargs['params']
scan_object = talos.Scan(x, y, model=m, **kwargs)
scan_object = talos.Scan(x, y,
model=m,
experiment_name=self.experiment_name,
**kwargs)
except KeyError:
p = talos.autom8.AutoParams(task=self.task)
p.resample_params(self.max_param_values)
params = p.params
scan_object = talos.Scan(x, y, params, m, **kwargs)
scan_object = talos.Scan(x=x,
y=y,
params=params,
model=m,
experiment_name=self.experiment_name,
**kwargs)

return scan_object
2 changes: 1 addition & 1 deletion talos/logging/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def save_result(self):

import numpy as np

np.savetxt(self.experiment_name + '.csv',
np.savetxt(self._experiment_log,
self.result,
fmt='%s',
delimiter=',')
Expand Down
2 changes: 1 addition & 1 deletion talos/reducers/correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def correlation(self, method):
# convert things back to their original dtype
value = np.array([value]).astype(dtype)[0]

# this is where modify the parameter space accordingly
# this is where we modify the parameter space accordingly
self.param_object.remove_is(label, value)

return self
44 changes: 44 additions & 0 deletions talos/reducers/forrest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
def forrest(self):

    '''Random Forest based reduction strategy. Somewhat more
    aggressive than for example 'spearman' because there are no
    negative values; instead the highest positive correlation
    is subtracted from all the values so that the max value is 0,
    and then the values are turned positive. The one with the
    highest positive score in the end will be dropped. This means
    that anything originally 0 is a candidate for dropping. Because
    there are often multiple zeroes, there is an element of
    randomness in which one is dropped.

    self | Scan object | must expose `reduction_metric` and
                         `param_object`

    Returns `self` with the parameter space reduced in place
    (consistent with the other reduction strategies, e.g.
    `correlation`, whose result is assigned back to `self`
    by `reduce_run`).
    '''

    import wrangle
    import numpy as np

    # handle conversion to multi_labels
    from .reduce_utils import cols_to_multilabel
    data = cols_to_multilabel(self)

    # get the correlations
    corr_values = wrangle.df_corr_randomforest(data, self.reduction_metric)

    # drop labels where value is NaN
    corr_values.dropna(inplace=True)

    # nothing left to act on; leave the parameter space untouched
    if len(corr_values) == 0:
        return self

    # handle the turning around of values (see docstring for more info)
    # NOTE(review): assumes corr_values is sorted so that [0] is the
    # highest correlation — confirm against wrangle's contract
    corr_values -= corr_values[0]
    corr_values = corr_values.abs()

    # get the strongest correlation
    corr_values = corr_values.index[-1]

    # get the label, value, and dtype from the column header
    # (headers are encoded as 'label~dtype~value' by cols_to_multilabel)
    label, dtype, value = corr_values.split('~')

    # convert things back to their original dtype
    value = np.array([value]).astype(dtype)[0]

    # this is where we modify the parameter space accordingly
    self.param_object.remove_is(label, value)

    return self
11 changes: 7 additions & 4 deletions talos/reducers/reduce_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def reduce_run(self):
'''

from .correlation import correlation
from .forrest import forrest
from .trees import trees

from .local_strategy import local_strategy
from .limit_by_metric import limit_by_metric

Expand Down Expand Up @@ -53,12 +56,12 @@ def reduce_run(self):
self = correlation(self, 'spearman')

# check if random forrest can do something
if self.reduction_method == 'random_forrest':
pass
if self.reduction_method == 'forrest':
self = forrest(self)

# check if random forrest can do something
if self.reduction_method == 'extra_trees':
pass
if self.reduction_method == 'trees':
self = trees(self)

# check if monte carlo can do something
if self.reduction_method == 'monte_carlo':
Expand Down
2 changes: 1 addition & 1 deletion talos/reducers/reduce_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def cols_to_multilabel(self):
import pandas as pd

# read in the experiment log
data = pd.read_csv(self.experiment_name + '.csv')
data = pd.read_csv(self._experiment_log)

# apply reduction window
data = data.tail(self.reduction_window)
Expand Down
48 changes: 48 additions & 0 deletions talos/reducers/trees.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
def trees(self, quantile=.8):

    '''Extra Trees based reduction strategy. Like 'forrest', somewhat
    more aggressive than for example 'spearman' because there are no
    negative values; instead the highest positive correlation is
    subtracted from all the values so that the max value is 0, and
    then the values are turned positive. The one with the highest
    positive score in the end will be dropped. This means that
    anything originally 0 is a candidate for dropping. Because there
    are often multiple zeroes, there is an element of randomness in
    which one is dropped.

    self     | Scan object | must expose `reduction_metric` and
                             `param_object`
    quantile | float       | cut-off for binarizing the metric
                             (Extra Trees wants a label as 'y')

    Returns `self` with the parameter space reduced in place
    (consistent with the other reduction strategies, e.g.
    `correlation`, whose result is assigned back to `self`
    by `reduce_run`).
    '''

    import wrangle
    import numpy as np

    # handle conversion to multi_labels
    from .reduce_utils import cols_to_multilabel
    data = cols_to_multilabel(self)

    # because extra trees wants label as 'y' we first transform with quantile
    quantile_value = data[self.reduction_metric].quantile(quantile)
    data[self.reduction_metric] = data[self.reduction_metric] > quantile_value

    # get the correlations
    corr_values = wrangle.df_corr_extratrees(data, self.reduction_metric)

    # drop labels where value is NaN
    corr_values.dropna(inplace=True)

    # nothing left to act on; leave the parameter space untouched
    if len(corr_values) == 0:
        return self

    # handle the turning around of values (see docstring for more info)
    # NOTE(review): assumes corr_values is sorted so that [0] is the
    # highest correlation — confirm against wrangle's contract
    corr_values -= corr_values[0]
    corr_values = corr_values.abs()

    # get the strongest correlation
    corr_values = corr_values.index[-1]

    # get the label, value, and dtype from the column header
    # (headers are encoded as 'label~dtype~value' by cols_to_multilabel)
    label, dtype, value = corr_values.split('~')

    # convert things back to their original dtype
    value = np.array([value]).astype(dtype)[0]

    # this is where we modify the parameter space accordingly
    self.param_object.remove_is(label, value)

    return self
15 changes: 10 additions & 5 deletions talos/scan/Scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ def model():
model : keras model
Any Keras model with relevant declrations like params['first_neuron']
experiment_name : str
Experiment name will be used to produce the file name for the
results saved in the local directory. Make sure to change it between
experiments to avoid log of previous experiment from being overwritten.
Experiment name will be used to produce a folder (unless one is
already there from previous iterations of the experiment). Logs of
the experiment are saved in the folder with the timestamp of the
start time as filenames.
x_val : ndarray
User specified cross-validation data. (Default is None).
y_val : ndarray
Expand Down Expand Up @@ -124,8 +125,12 @@ def model():

global self

def __init__(self, x, y, params, model,
experiment_name=None,
def __init__(self,
x,
y,
params,
model,
experiment_name,
x_val=None,
y_val=None,
val_split=.3,
Expand Down
6 changes: 2 additions & 4 deletions talos/scan/scan_prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@ def scan_prepare(self):
'''Includes all preparation procedures up until starting the first scan
through scan_run()'''

import time as ti
from .scan_utils import initialize_log

# create the name for the experiment
if self.experiment_name is None:
self.experiment_name = ti.strftime('%D%H%M%S').replace('/', '')
self._experiment_log = initialize_log(self)

# for the case where x_val or y_val is missing when other is present
self.custom_val_split = False
Expand Down
6 changes: 3 additions & 3 deletions talos/scan/scan_round.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@ def scan_round(self):
'''The main operational function that manages the experiment
on the level of execution of each round.'''

import time as ti
import time
import gc

# print round params
if self.print_params is True:
print(self.round_params)

# set start time
round_start = ti.strftime('%D-%H%M%S')
start = ti.time()
round_start = time.strftime('%D-%H%M%S')
start = time.time()

# fit the model
from ..model.ingest_model import ingest_model
Expand Down
21 changes: 21 additions & 0 deletions talos/scan/scan_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
def initialize_log(self):

    '''Create the experiment folder (unless one is already there)
    and an empty, timestamped log file inside it.

    self | object | must have the attribute `experiment_name`

    Returns the relative path to the created log file as a string,
    e.g. './experiment_name/MMDDYYHHMMSS.csv'.
    '''

    import time
    import os

    # create the experiment folder (unless one is already there)
    os.makedirs(self.experiment_name, exist_ok=True)

    # '%D' yields MM/DD/YY; slashes are stripped to keep the
    # filename valid on all platforms
    _experiment_id = time.strftime('%D%H%M%S').replace('/', '')
    _file_name = _experiment_id + '.csv'
    _experiment_log = './' + self.experiment_name + '/' + _file_name

    # create an empty log file; `with` guarantees the handle is closed
    with open(_experiment_log, 'w'):
        pass

    return _experiment_log
4 changes: 4 additions & 0 deletions talos/templates/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ def breast_cancer(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.breast_cancer()[1],
ta.templates.params.breast_cancer(),
ta.templates.models.breast_cancer,
'test',
round_limit=round_limit)

return scan_object
Expand All @@ -19,6 +20,7 @@ def cervical_cancer(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.cervical_cancer()[1],
ta.templates.params.cervical_cancer(),
ta.templates.models.cervical_cancer,
'test',
round_limit=round_limit)

return scan_object
Expand All @@ -32,6 +34,7 @@ def iris(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.iris()[1],
ta.templates.params.iris(),
ta.templates.models.iris,
'test',
round_limit=round_limit)

return scan_object
Expand All @@ -45,6 +48,7 @@ def titanic(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.titanic()[1][:50],
ta.templates.params.titanic(),
ta.templates.models.titanic,
'test',
round_limit=round_limit)

return scan_object
Loading

0 comments on commit 240ff85

Please sign in to comment.