Skip to content

Commit

Permalink
prepares for Command and Conquer style browser based experiment manag…
Browse files Browse the repository at this point in the history
…ement

- `experiment_name` is now compulsory
- Addresses #207 and prepare for browser based "command center"
- Added 'trees' reduction strategy
- Added 'forrest' reduction strategy
- added `scan_utils.py` as a home for helper functions for /scan
- fixed tests to handle the new changes
  • Loading branch information
mikkokotila committed Aug 5, 2019
1 parent b682a5e commit 240ff85
Show file tree
Hide file tree
Showing 22 changed files with 209 additions and 45 deletions.
6 changes: 4 additions & 2 deletions docs/Monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Epoch-by-epoch training data is available during the experiment using the `Exper

```python
model.fit(...
callbacks=[talos.utils.ExperimentLogCallback('filename_or_path', params)])
callbacks=[talos.utils.ExperimentLogCallback('experiment_name', params)])
```
NOTE: `params` is the params dictionary in the `Scan()` input model.
Here `params` is the params dictionary in the `Scan()` input model. Both
`experiment_name` and `experiment_id` should match with the current experiment,
as otherwise the epoch-by-epoch logs will be written into the wrong
experiment's folder.
2 changes: 1 addition & 1 deletion docs/Scan.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Argument | Input | Description
`y` | array or list of arrays | prediction outcome variable
`params` | dict | the parameter dictionary
`model` | function | the Keras model as a function
`experiment_name` | str | Used for experiment log
`experiment_name` | str | Used for creating the experiment logging folder
`x_val` | array or list of arrays | validation data for x
`y_val` | array or list of arrays | validation data for y
`val_split` | float | validation data split ratio
Expand Down
8 changes: 7 additions & 1 deletion talos/autom8/automodel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
class AutoModel:

def __init__(self, task, metric=None):
def __init__(self, task, experiment_name, metric=None):

'''
Expand All @@ -19,12 +19,16 @@ def __init__(self, task, metric=None):
If 'continuous' then mae is used for metric, if 'binary',
'multiclass', or 'multilabel', f1score is used. Accuracy is always
used.
experiment_name | str | Must be same as in `Scan()`
metric : None or list
You can also input a list with one or more custom metrics or names
of Keras or Talos metrics.
'''

from talos.utils.experiment_log_callback import ExperimentLogCallback

self.task = task
self.experiment_name = experiment_name
self.metric = metric

if self.task is not None:
Expand All @@ -36,6 +40,7 @@ def __init__(self, task, metric=None):

# create the model
self.model = self._create_input_model
self.callback = ExperimentLogCallback

def _set_metric(self):

Expand Down Expand Up @@ -112,6 +117,7 @@ def _create_input_model(self, x_train, y_train, x_val, y_val, params):
batch_size=params['batch_size'],
epochs=params['epochs'],
verbose=0,
callbacks=[self.callback(self.experiment_name, params)],
validation_data=[x_val, y_val])

# pass the output to Talos
Expand Down
16 changes: 13 additions & 3 deletions talos/autom8/autoscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ class AutoScan:

def __init__(self,
task,
max_param_values):
max_param_values,
experiment_name):

'''Configure the `AutoScan()` experiment and then use
the property `start` in the returned class object to start
Expand All @@ -16,6 +17,7 @@ def __init__(self,

self.task = task
self.max_param_values = max_param_values
self.experiment_name = experiment_name

def start(self, x, y, **kwargs):

Expand All @@ -34,11 +36,19 @@ def start(self, x, y, **kwargs):

try:
kwargs['params']
scan_object = talos.Scan(x, y, model=m, **kwargs)
scan_object = talos.Scan(x, y,
model=m,
experiment_name=self.experiment_name,
**kwargs)
except KeyError:
p = talos.autom8.AutoParams(task=self.task)
p.resample_params(self.max_param_values)
params = p.params
scan_object = talos.Scan(x, y, params, m, **kwargs)
scan_object = talos.Scan(x=x,
y=y,
params=params,
model=m,
experiment_name=self.experiment_name,
**kwargs)

return scan_object
2 changes: 1 addition & 1 deletion talos/logging/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def save_result(self):

import numpy as np

np.savetxt(self.experiment_name + '.csv',
np.savetxt(self._experiment_log,
self.result,
fmt='%s',
delimiter=',')
Expand Down
2 changes: 1 addition & 1 deletion talos/reducers/correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def correlation(self, method):
# convert things back to their original dtype
value = np.array([value]).astype(dtype)[0]

# this is where modify the parameter space accordingly
# this is where we modify the parameter space accordingly
self.param_object.remove_is(label, value)

return self
44 changes: 44 additions & 0 deletions talos/reducers/forrest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
def forrest(self):

    '''Random Forest based reduction strategy. Somewhat more
    aggressive than for example 'spearman' because there are no
    negative values; instead the highest positive correlation
    is subtracted from all the values so that the max value is 0,
    and then the values are turned positive. The one with the
    highest positive score in the end will be dropped. This means
    that anything originally 0 is a candidate for dropping. Because
    there are often multiple zeroes, there is an element of
    randomness in which one is dropped.

    self | Scan object | must expose `reduction_metric` and
                         `param_object`

    Returns `self` with the parameter space reduced in place
    (consistent with the other reduction strategies, e.g.
    `correlation`, whose result is assigned back to `self`
    by `reduce_run`).
    '''

    import wrangle
    import numpy as np

    # handle conversion to multi_labels
    from .reduce_utils import cols_to_multilabel
    data = cols_to_multilabel(self)

    # get the correlations
    corr_values = wrangle.df_corr_randomforest(data, self.reduction_metric)

    # drop labels where value is NaN
    corr_values.dropna(inplace=True)

    # nothing left to act on; leave the parameter space untouched
    if len(corr_values) == 0:
        return self

    # handle the turning around of values (see docstring for more info)
    # NOTE(review): assumes corr_values is sorted so that [0] is the
    # highest correlation — confirm against wrangle's contract
    corr_values -= corr_values[0]
    corr_values = corr_values.abs()

    # get the strongest correlation
    corr_values = corr_values.index[-1]

    # get the label, value, and dtype from the column header
    # (headers are encoded as 'label~dtype~value' by cols_to_multilabel)
    label, dtype, value = corr_values.split('~')

    # convert things back to their original dtype
    value = np.array([value]).astype(dtype)[0]

    # this is where we modify the parameter space accordingly
    self.param_object.remove_is(label, value)

    return self
11 changes: 7 additions & 4 deletions talos/reducers/reduce_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def reduce_run(self):
'''

from .correlation import correlation
from .forrest import forrest
from .trees import trees

from .local_strategy import local_strategy
from .limit_by_metric import limit_by_metric

Expand Down Expand Up @@ -53,12 +56,12 @@ def reduce_run(self):
self = correlation(self, 'spearman')

# check if random forrest can do something
if self.reduction_method == 'random_forrest':
pass
if self.reduction_method == 'forrest':
self = forrest(self)

# check if random forrest can do something
if self.reduction_method == 'extra_trees':
pass
if self.reduction_method == 'trees':
self = trees(self)

# check if monte carlo can do something
if self.reduction_method == 'monte_carlo':
Expand Down
2 changes: 1 addition & 1 deletion talos/reducers/reduce_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def cols_to_multilabel(self):
import pandas as pd

# read in the experiment log
data = pd.read_csv(self.experiment_name + '.csv')
data = pd.read_csv(self._experiment_log)

# apply reduction window
data = data.tail(self.reduction_window)
Expand Down
48 changes: 48 additions & 0 deletions talos/reducers/trees.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
def trees(self, quantile=.8):

    '''Extra Trees based reduction strategy. Like 'forrest', somewhat
    more aggressive than for example 'spearman' because there are no
    negative values; instead the highest positive correlation is
    subtracted from all the values so that the max value is 0, and
    then the values are turned positive. The one with the highest
    positive score in the end will be dropped. This means that
    anything originally 0 is a candidate for dropping. Because there
    are often multiple zeroes, there is an element of randomness in
    which one is dropped.

    self     | Scan object | must expose `reduction_metric` and
                             `param_object`
    quantile | float       | cut-off for binarizing the metric
                             (Extra Trees wants a label as 'y')

    Returns `self` with the parameter space reduced in place
    (consistent with the other reduction strategies, e.g.
    `correlation`, whose result is assigned back to `self`
    by `reduce_run`).
    '''

    import wrangle
    import numpy as np

    # handle conversion to multi_labels
    from .reduce_utils import cols_to_multilabel
    data = cols_to_multilabel(self)

    # because extra trees wants label as 'y' we first transform with quantile
    quantile_value = data[self.reduction_metric].quantile(quantile)
    data[self.reduction_metric] = data[self.reduction_metric] > quantile_value

    # get the correlations
    corr_values = wrangle.df_corr_extratrees(data, self.reduction_metric)

    # drop labels where value is NaN
    corr_values.dropna(inplace=True)

    # nothing left to act on; leave the parameter space untouched
    if len(corr_values) == 0:
        return self

    # handle the turning around of values (see docstring for more info)
    # NOTE(review): assumes corr_values is sorted so that [0] is the
    # highest correlation — confirm against wrangle's contract
    corr_values -= corr_values[0]
    corr_values = corr_values.abs()

    # get the strongest correlation
    corr_values = corr_values.index[-1]

    # get the label, value, and dtype from the column header
    # (headers are encoded as 'label~dtype~value' by cols_to_multilabel)
    label, dtype, value = corr_values.split('~')

    # convert things back to their original dtype
    value = np.array([value]).astype(dtype)[0]

    # this is where we modify the parameter space accordingly
    self.param_object.remove_is(label, value)

    return self
15 changes: 10 additions & 5 deletions talos/scan/Scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ def model():
model : keras model
Any Keras model with relevant declrations like params['first_neuron']
experiment_name : str
Experiment name will be used to produce the file name for the
results saved in the local directory. Make sure to change it between
experiments to avoid log of previous experiment from being overwritten.
Experiment name will be used to produce a folder (unless one is
already there from previous iterations of the experiment). Logs of
the experiment are saved in the folder with the timestamp of the
start time as filenames.
x_val : ndarray
User specified cross-validation data. (Default is None).
y_val : ndarray
Expand Down Expand Up @@ -124,8 +125,12 @@ def model():

global self

def __init__(self, x, y, params, model,
experiment_name=None,
def __init__(self,
x,
y,
params,
model,
experiment_name,
x_val=None,
y_val=None,
val_split=.3,
Expand Down
6 changes: 2 additions & 4 deletions talos/scan/scan_prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@ def scan_prepare(self):
'''Includes all preparation procedures up until starting the first scan
through scan_run()'''

import time as ti
from .scan_utils import initialize_log

# create the name for the experiment
if self.experiment_name is None:
self.experiment_name = ti.strftime('%D%H%M%S').replace('/', '')
self._experiment_log = initialize_log(self)

# for the case where x_val or y_val is missing when other is present
self.custom_val_split = False
Expand Down
6 changes: 3 additions & 3 deletions talos/scan/scan_round.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@ def scan_round(self):
'''The main operational function that manages the experiment
on the level of execution of each round.'''

import time as ti
import time
import gc

# print round params
if self.print_params is True:
print(self.round_params)

# set start time
round_start = ti.strftime('%D-%H%M%S')
start = ti.time()
round_start = time.strftime('%D-%H%M%S')
start = time.time()

# fit the model
from ..model.ingest_model import ingest_model
Expand Down
21 changes: 21 additions & 0 deletions talos/scan/scan_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
def initialize_log(self):

    '''Create the experiment folder (unless one is already there)
    and an empty, timestamped log file inside it.

    self | object | must have the attribute `experiment_name`

    Returns the relative path to the created log file as a string,
    e.g. './experiment_name/MMDDYYHHMMSS.csv'.
    '''

    import time
    import os

    # create the experiment folder (unless one is already there)
    os.makedirs(self.experiment_name, exist_ok=True)

    # '%D' yields MM/DD/YY; slashes are stripped to keep the
    # filename valid on all platforms
    _experiment_id = time.strftime('%D%H%M%S').replace('/', '')
    _file_name = _experiment_id + '.csv'
    _experiment_log = './' + self.experiment_name + '/' + _file_name

    # create an empty log file; `with` guarantees the handle is closed
    with open(_experiment_log, 'w'):
        pass

    return _experiment_log
4 changes: 4 additions & 0 deletions talos/templates/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ def breast_cancer(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.breast_cancer()[1],
ta.templates.params.breast_cancer(),
ta.templates.models.breast_cancer,
'test',
round_limit=round_limit)

return scan_object
Expand All @@ -19,6 +20,7 @@ def cervical_cancer(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.cervical_cancer()[1],
ta.templates.params.cervical_cancer(),
ta.templates.models.cervical_cancer,
'test',
round_limit=round_limit)

return scan_object
Expand All @@ -32,6 +34,7 @@ def iris(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.iris()[1],
ta.templates.params.iris(),
ta.templates.models.iris,
'test',
round_limit=round_limit)

return scan_object
Expand All @@ -45,6 +48,7 @@ def titanic(round_limit=2, random_method='uniform_mersenne'):
ta.templates.datasets.titanic()[1][:50],
ta.templates.params.titanic(),
ta.templates.models.titanic,
'test',
round_limit=round_limit)

return scan_object
Loading

0 comments on commit 240ff85

Please sign in to comment.