From 35562ae3fe748fc7dc934b236eb5d38058f5fd38 Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Mon, 23 Jul 2018 00:05:51 -0400
Subject: [PATCH 01/13] Add docstrings, implement seed generator

Add docstrings to validation_split() and random_shuffle(). Also, add the
capability to seed the random shuffle generator so that results may be
reproducible, at least in terms of the random shuffling of the
train/cross-validation data.
---
 talos/scan.py                   |  3 ++-
 talos/utils/validation_split.py | 21 ++++++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/talos/scan.py b/talos/scan.py
index 18a5d198..8b576a70 100755
--- a/talos/scan.py
+++ b/talos/scan.py
@@ -26,7 +26,7 @@ def __init__(self, x, y, params, dataset_name, experiment_no, model,
                  reduction_method=None, reduction_interval=100,
                  reduction_window=None, grid_downsample=None,
                  reduction_metric='val_acc', round_limit=None,
-                 talos_log_name='talos.log', debug=False):
+                 talos_log_name='talos.log', debug=False, seed=None):
 
         self.dataset_name = dataset_name
         self.experiment_no = experiment_no
@@ -49,6 +49,7 @@ def __init__(self, x, y, params, dataset_name, experiment_no, model,
         self.grid_downsample = grid_downsample
         self.val_split = val_split
         self.shuffle = shuffle
+        self.seed = seed
 
         self.p = param_format(self)
         self.combinations = param_space(self)
diff --git a/talos/utils/validation_split.py b/talos/utils/validation_split.py
index 731bc8c2..16aeda0f 100644
--- a/talos/utils/validation_split.py
+++ b/talos/utils/validation_split.py
@@ -2,18 +2,15 @@
 
 def validation_split(self):
+    """Defines the attributes `x_train`, `y_train`, `x_val` and `y_val`.
+    The validation (cross-validation, aka development) sets are determined
+    by the attribute val_split, which is a number in (0, 1) which determines
+    the proportion of the input data to be allocated for cross-validation."""
 
-    '''VALIDATION SPLIT OF X AND Y
-    Based on the Scan() parameter val_split
-    both 'x' and 'y' are split.
-
-    '''
-
-    if self.shuffle == True:
+    if self.shuffle:
         random_shuffle(self)
 
-    len_x = len(self.x)
-    limit = int(len_x * (1 - self.val_split))
+    limit = int(len(self.x) * (1 - self.val_split))
 
     self.x_train = self.x[:limit]
     self.y_train = self.y[:limit]
@@ -25,8 +22,14 @@
 
 def random_shuffle(self):
+    """Randomly shuffles the datasets. If self.seed is set, seed the generator
+    to ensure that the results are reproducible."""
 
     random_index = np.arange(len(self.x))
+
+    if self.seed is not None:
+        np.random.seed(self.seed)
+
     np.random.shuffle(random_index)
 
     self.x = self.x[random_index]

From 2a7e9a0412c94ad36867ee1b7f35fd974f2f6a66 Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Wed, 25 Jul 2018 14:52:56 -0400
Subject: [PATCH 02/13] Implement user specified x/y_val datasets

To allow a user to potentially augment their training data but not their
validation data (and it should be noted that nobody should ever augment
first, then randomize, then split, as that causes major bias problems),
implement an option to explicitly specify the validation dataset. Also
make the necessary changes to validation_split to account for this, and
make a few QOL edits to the testing suite.
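To make the intent concrete, here is a rough usage sketch (hypothetical
variable names; `augment` stands in for any user-supplied augmentation
routine and is not part of Talos):

    import talos as ta
    from sklearn.model_selection import train_test_split

    # split first, so the validation rows are never touched by augmentation
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3)
    x_train, y_train = augment(x_train, y_train)  # training data only

    ta.Scan(x_train, y_train, params=p, model=iris_model,
            dataset_name='example', experiment_no='000',
            x_val=x_val, y_val=y_val)

Splitting before augmenting means the validation set stays untouched,
which is the whole point of the option.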
---
 talos/scan.py                   | 43 ++++++++++++++++++++++++++++++---
 talos/utils/validation_split.py | 20 +++++++++------
 test_script.py                  |  5 ++++
 3 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/talos/scan.py b/talos/scan.py
index 6506d955..c6c30c7e 100755
--- a/talos/scan.py
+++ b/talos/scan.py
@@ -18,6 +18,12 @@ from .metrics.entropy import epoch_entropy
 
 
+TRAIN_VAL_RUNTIME_ERROR_MSG = """
+If setting a custom train/val split, both x_val and y_val must be input data
+and not None.
+"""
+
+
 class Scan:
     """Suite of operations for training and evaluating Keras neural
     networks.
 
@@ -30,14 +36,31 @@
     and the dictionary
 
     d = {
-        'fcc_layer_1_N': [50, 100, 200]
-        'fcc_layer_1_act': ['relu', 'tanh']
+        'fcc_layer_1_N': [50, 100, 200],
+        'fcc_layer_1_act': ['relu', 'tanh'],
         'fcc_layer_1_dropout': (0, 0.1, 5)  # 5 points between 0 and 0.1
     }
 
     The dictionary is parsed for every run and only one entry per parameter
     is fed into the neural network at a time.
 
+    Important note: the user has two options when specifying input data.
+
+    Option 1:
+    Specify x, y and val_split. The training and validation data mixture
+    (x, y) will be randomly split into the training and validation datasets
+    as per the split specified in val_split.
+
+    Option 2:
+    Specify x, y and x_val, y_val. This would allow the user to specify
+    their own validation datasets. Keras by default shuffles data during
+    training, so the user need only be sure that the split specified is
+    correct. This allows not only for reproducibility, but also for
+    randomizing the data on the user's own terms. This is critical if the
+    user wishes to augment their training data without augmenting their
+    validation data (which is the only acceptable practice!).
+
+
     Parameters
     ----------
     x : ndarray
@@ -90,8 +113,14 @@
         The name of the saved Talos log. (Default is 'talos.log').
     debug : bool
         Implements debugging feedback. (Default is False).
+    x_val : ndarray
+        User specified cross-validation data. (Default is None).
+    y_val : ndarray
+        User specified cross-validation labels. (Default is None).
+ """ + # TODO: refactor this so that we don't initialize global variables global self def __init__(self, x, y, params, dataset_name, experiment_no, model, @@ -99,12 +128,20 @@ def __init__(self, x, y, params, dataset_name, experiment_no, model, reduction_method=None, reduction_interval=100, reduction_window=None, grid_downsample=None, reduction_metric='val_acc', round_limit=None, - talos_log_name='talos.log', debug=False, seed=None): + talos_log_name='talos.log', debug=False, seed=None, + x_val=None, y_val=None): self.dataset_name = dataset_name self.experiment_no = experiment_no self.experiment_name = dataset_name + '_' + experiment_no + self.custom_val_split = False + if (x_val is not None and y_val is None) or \ + (x_val is None and y_val is not None): + raise RuntimeError(TRAIN_VAL_RUNTIME_ERROR_MSG) + elif (x_val is not None and y_val is not None): + self.custom_val_split = True + if debug: self.logfile = open('talos.debug.log', 'a') else: diff --git a/talos/utils/validation_split.py b/talos/utils/validation_split.py index 16aeda0f..21976603 100644 --- a/talos/utils/validation_split.py +++ b/talos/utils/validation_split.py @@ -7,16 +7,22 @@ def validation_split(self): by the attribute val_split, which is a number in (0, 1) which determines the proportion of the input data to be allocated for cross-validation.""" - if self.shuffle: - random_shuffle(self) + if self.custom_val_split: + self.x_train = self.x + self.y_train = self.y + # self.x/y_val are already set - limit = int(len(self.x) * (1 - self.val_split)) + else: + if self.shuffle: + random_shuffle(self) - self.x_train = self.x[:limit] - self.y_train = self.y[:limit] + limit = int(len(self.x) * (1 - self.val_split)) - self.x_val = self.x[limit:] - self.y_val = self.y[limit:] + self.x_train = self.x[:limit] + self.y_train = self.y[:limit] + + self.x_val = self.x[limit:] + self.y_val = self.y[limit:] return self diff --git a/test_script.py b/test_script.py index a500fcbb..a440f84a 100644 --- a/test_script.py +++ b/test_script.py @@ -96,14 +96,19 @@ def cervix_model(x_train, y_train, x_val, y_val, params): 'weight_regulizer': [None], 'emb_output_dims': [None]} + +print("Running Iris test 1...") x, y = ta.datasets.iris() +print("Scanning...") +print(x.shape, y.shape) h = ta.Scan(x, y, params=p, dataset_name='testing', experiment_no='000', model=iris_model) +print("Running Iris test 2...") p = {'lr': [1], 'first_neuron': [4], 'hidden_layers': [2], From 86611e28b4b6e0235a699951d21744532cb1b06d Mon Sep 17 00:00:00 2001 From: Mikko Kotila Date: Wed, 25 Jul 2018 23:18:21 +0300 Subject: [PATCH 03/13] Create ISSUE_TEMPLATE.md --- ISSUE_TEMPLATE.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 ISSUE_TEMPLATE.md diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md new file mode 100644 index 00000000..e965047a --- /dev/null +++ b/ISSUE_TEMPLATE.md @@ -0,0 +1 @@ +Hello From bb6803c05ed3307ef25ae392bfec766b0f512bc8 Mon Sep 17 00:00:00 2001 From: Mikko Kotila Date: Wed, 25 Jul 2018 23:26:08 +0300 Subject: [PATCH 04/13] Update ISSUE_TEMPLATE.md --- ISSUE_TEMPLATE.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index e965047a..8f60a5ac 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -1 +1,18 @@ -Hello +Thanks so much for coming here to raise an issue. Please take a moment to 'check' the below boxes: + +- [ ] I'm up-to-date with the latest release: + + pip install -U talos + +- [ ] I've confirmed that my Keras model works outside of Talos. 
+
+If you still have an error, please submit a **complete trace** and code with:
+
+- output of shape in numpy for x and y
+- Talos params dictionary
+- The Keras model wired for Talos
+- Description of extra variables in the model
+
+You can provide the code in pastebin / gist or any other format you like.
+
+-------------------------------------------------------------------------

From 1f097e06ed477687a53e90a594b1cfc07db861d6 Mon Sep 17 00:00:00 2001
From: Mikko Kotila
Date: Wed, 25 Jul 2018 23:27:10 +0300
Subject: [PATCH 05/13] Update ISSUE_TEMPLATE.md

---
 ISSUE_TEMPLATE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 8f60a5ac..6a68e32a 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -8,7 +8,7 @@ Thanks so much for coming here to raise an issue. Please take a moment to 'check
 
 If you still have an error, please submit a **complete trace** and code with:
 
-- output of shape in numpy for x and y
+- output of shape for x and y e.g. (212,12)
 - Talos params dictionary
 - The Keras model wired for Talos
 - Description of extra variables in the model

From 64a59821108085f4a188fec06de47000d0dba7e4 Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Wed, 25 Jul 2018 16:32:05 -0400
Subject: [PATCH 06/13] Minor linting

---
 test_script.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test_script.py b/test_script.py
index a440f84a..3ba576fb 100644
--- a/test_script.py
+++ b/test_script.py
@@ -6,10 +6,13 @@
 
 from keras.models import Sequential
 from keras.layers import Dropout, Dense
-from keras.optimizers import SGD, Adam, Adadelta, Adagrad, Adamax, RMSprop, Nadam
+from keras.optimizers import SGD, Adam, Adadelta, Adagrad
+from keras.optimizers import Adamax, RMSprop, Nadam
 from keras.activations import softmax, relu, elu, sigmoid
-from keras.losses import categorical_crossentropy, logcosh, binary_crossentropy
-from talos.metrics.keras_metrics import matthews_correlation, precision, recall, fmeasure
+from keras.losses import categorical_crossentropy, logcosh
+from keras.losses import binary_crossentropy
+from talos.metrics.keras_metrics import matthews_correlation, precision
+from talos.metrics.keras_metrics recall, fmeasure
 
 
 def iris_model(x_train, y_train, x_val, y_val, params):

From 895bc7cb5eabd8eefcf3b240cfc8bc15bd202447 Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Wed, 25 Jul 2018 16:34:45 -0400
Subject: [PATCH 07/13] Bugfix

---
 test_script.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_script.py b/test_script.py
index 3ba576fb..e7fb383b 100644
--- a/test_script.py
+++ b/test_script.py
@@ -12,7 +12,7 @@
 from keras.losses import categorical_crossentropy, logcosh
 from keras.losses import binary_crossentropy
 from talos.metrics.keras_metrics import matthews_correlation, precision
-from talos.metrics.keras_metrics recall, fmeasure
+from talos.metrics.keras_metrics import recall, fmeasure
 
 
 def iris_model(x_train, y_train, x_val, y_val, params):

From 93d0e8fb67acdaac59656019526458c4ecf7a361 Mon Sep 17 00:00:00 2001
From: Mikko Kotila
Date: Thu, 26 Jul 2018 13:41:14 +0300
Subject: [PATCH 08/13] Create CONTRIBUTE.md

---
 CONTRIBUTE.md | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 CONTRIBUTE.md

diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md
new file mode 100644
index 00000000..fd986ee0
--- /dev/null
+++ b/CONTRIBUTE.md
@@ -0,0 +1,188 @@
+# Contributing to Talos
+
+Thank you very much for taking the effort to contribute to Talos.
+Below you will find some simple and mostly obvious guidelines on how to do
+it in the most valuable way.
+
+1. [Ways to Contribute](#ways-to-contribute)
+
+   1.1. [Code](#code)
+
+   1.2. [Ideas](#ideas)
+
+   1.3. [Testing](#testing)
+
+   1.4. [Something Else](#something)
+
+   1.5. [Documentation](#documentation)
+
+   1.6. [Examples](#examples)
+
+2. [Important Precautions for Code Contributions](#precautions)
+
+   2.1. [Planning](#planning)
+
+   2.2. [Testing](#testing-the-change)
+
+   2.3. [Documentation](#docs_for_review)
+
+3. [Reviewing Pull Requests](#review)
+
+4. [Specific Guidelines for Github](#github)
+
+## 1. Ways to Contribute
+
+There are several ways programmers, data scientists and others can
+contribute to Autonomio.
+
+### 1.1. Contributing Code
+
+#### 1.1.0. Note on Philosophy and Style
+
+**AUTONOMIO DEV PHILOSOPHY**
+
+- Doing is more interesting than achieving
+- Having fun is more important than being productive
+- Code coverage can, and needs to, be 100%
+- User docs are more important than new features
+- Testing is more important than building
+- Creating great stuff takes a long time
+
+**CODING STYLE GUIDELINES**
+
+We follow pep8. Because [reading docs](http://legacy.python.org/dev/peps/pep-0008/)
+and particularly [style guides](http://legacy.python.org/dev/peps/pep-0008/)
+more or less suck, one way to stay compliant is to use Atom and the amazing
+Linter plugin.
+
+**MORE STYLE GUIDELINES**
+
+We also make the best effort to move towards following pep20:
+
+- Beautiful is better than ugly
+- Explicit is better than implicit
+- Simple is better than complex
+- Complex is better than complicated
+- Flat is better than nested
+- Sparse is better than dense
+- Readability counts
+- Special cases aren't special enough to break the rules
+- Although practicality beats purity
+- Errors should never pass silently
+- Unless explicitly silenced
+- In the face of ambiguity, refuse the temptation to guess
+- There should be one-- and preferably only one --obvious way to do it
+- Although that way may not be obvious at first unless you're Dutch
+- Now is better than never
+- Although never is often better than right now
+- If the implementation is hard to explain, it's a bad idea
+- If the implementation is easy to explain, it may be a good idea
+- Namespaces are one honking great idea -- let's do more of those
+
+#### 1.1.1. Contribute to Open Issues
+
+It will be great if you can contribute towards open issues. To do this,
+the best way is to:
+
+1) check out the [open issues](https://github.com/autonomio/talos/issues)
+2) join the conversation and share your willingness to contribute
+3) somebody will help you get started / provide more details if needed
+4) fork [the current dev](https://github.com/autonomio/talos/issues#fork-destination-box) branch
+5) make your changes to your own fork/repo
+6) test, test, test
+7) if it's a new feature, make changes to test_script.py accordingly
+8) make sure that the Travis build passes
+9) come back and make a pull request
+
+What we really try to avoid is being this guy...
+
+[image: Drawing]
+
+#### 1.1.2. Contribute to a New Idea
+
+Same as above, but start by [creating a new issue](https://github.com/autonomio/core-module/issues/new)
+to open a discussion on the idea you have for contribution.
+
+### 1.2. Contributing Ideas
+
+In case you don't want to contribute code, but have a feature request or
+some other idea, that is a great contribution as well and will be much
+appreciated.
+You can do it by [creating a new issue](https://github.com/autonomio/core-module/issues/new).
+
+### 1.3. Contributing Testing
+
+Another great way to contribute is testing, which really just means using
+Talos and [reporting issues](https://github.com/autonomio/talos/issues/new)
+as they might arise.
+
+**Testing comes in two forms:**
+
+#### 1.3.1 actual testing
+
+Just use Autonomio for any open challenge you are working on. Or pick one
+from [Kaggle](https://www.kaggle.com/competitions).
+
+1) Work with Autonomio in data science challenges
+2) Try a lot of different things
+3) [Report issues](https://github.com/autonomio/talos/issues/new) as you find them
+
+#### 1.3.2 improving code coverage
+
+We're using [Coveralls](https://coveralls.io) for code coverage testing,
+and even the smallest contributions to this end help a great deal.
+
+1) Follow the instructions in sections 1.1 and 1.3.1
+2) Use your own fork to see how the results improve in comparison to
+[current Master](https://coveralls.io/github/autonomio/core-module)
+
+### 1.4. Contributing Something Else
+
+The best way to get started might be
+[starting a discussion](https://github.com/autonomio/talos/issues/new).
+
+### 1.5. Contributing to Manual / Documentation
+
+At the moment there is no manual / documentation, so contributions here
+would be wonderful. Generally it's better to do something very simple and
+clear. It seems that [RTD](http://readthedocs.io) is a good option as it
+can read INDEX.rst in /docs, and a slightly more complex but much better
+looking option would be slate.
+
+### 1.6. Contributing Examples
+
+One of the most useful ways to contribute is when you use Talos for an
+actual project / challenge, and then write a blog post about your
+experience with code examples.
+
+## 2. Important Precautions for Code Contributions
+
+### 2.1. Planning the Change
+
+Before even thinking about making any changes to actual code:
+
+1) Define what is happening now (what needs to be changed)
+2) Define what is happening differently (once the code is changed)
+3) Use text search to find which files / functions are affected
+4) Make sure that you understand what each function is doing in relation
+to the change
+
+### 2.2. Testing the Change
+
+Never ever, under any circumstances, commit code that is not thoroughly
+tested:
+
+1) Run through the code changes and ask yourself if they make sense
+2) Create a clean environment and install from your fork:
+
+       pip install git+http://your-fork-repo-address.git
+
+3) Perform all the commands where your changes are involved and note them down
+4) Change the test_script.py in the repo root with the commands from step 3
+5) Make sure that code coverage is not becoming lower*
+6) Make sure that the Travis build passes
+
+*In terms of code coverage, 100% coverage for your changes would be ideal.
+If you can't do that, then at least explain the possible caveats in the
+commit details and also in the comments section of the pull request you
+are making.
+
+Once you've gone through all these steps, take a short break, come back
+and ask yourself the question:
+
+"WHAT COULD GO WRONG?"
+
+## 3. Reviewing Pull Requests
+
+If you've been assigned as a reviewer of a given pull request, unless
+you've been explicitly asked to do so, **DON'T MERGE**; just approve the
+review and share in the comments what you think. If you don't have any
+comments, just confirm with a comment that you don't have any.
+While this is kind of obvious, don't start reviewing before you can see
+that all the tests have passed ;)
+
+## 4. General points on using Github
+
+1) First things first, make sure you understand [this](https://guides.github.com/introduction/flow/index.html) 100%
+2) Also make sure that you clearly understand everything that is said [here](https://blog.hartleybrody.com/git-small-teams/)
+3) Working on your local machine, only have one folder (the git remote)
+4) Load it as a module with:
+
+       import sys
+       sys.path.insert(0, '/home/dev/talos')
+
+5) Frequently fetch origin to make sure you get the latest changes from other people
+6) Don’t work in separate forks, but in branches
+7) Keep commits as small as possible
+8) Make clear commit messages (explain what you are actually changing)
+
+For Mac users Github Desktop is pretty fantastic. For Linux users the GUIs
+are not so fantastic. Atom looks like a good cross-platform option.

From 99761dbd8f2708ae654181825ba07aa6b1f656ed Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Thu, 26 Jul 2018 19:53:44 -0400
Subject: [PATCH 09/13] Completely refactor the testing suite

Clean up completely. Make the following changes:

- Move all tests into the test/ directory
- test_script.py still calls all the same unit checks as before
- Move the test models (iris and cervical cancer) to talos/model/examples.py
---
 talos/model/examples.py      |  80 ++++++++++++++++
 test/__init__.py             |   0
 test/__main__.py             |   3 +
 test/core_tests/__init__.py  |   1 +
 test/core_tests/test_scan.py | 103 ++++++++++++++++++++
 test_script.py               | 180 ++---------------------------------
 6 files changed, 193 insertions(+), 174 deletions(-)
 create mode 100644 talos/model/examples.py
 create mode 100644 test/__init__.py
 create mode 100644 test/__main__.py
 create mode 100644 test/core_tests/__init__.py
 create mode 100644 test/core_tests/test_scan.py

diff --git a/talos/model/examples.py b/talos/model/examples.py
new file mode 100644
index 00000000..d4673d8a
--- /dev/null
+++ b/talos/model/examples.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+
+from talos.model import lr_normalizer, early_stopper, hidden_layers
+
+from keras.models import Sequential
+from keras.layers import Dropout, Dense
+
+from talos.metrics.keras_metrics import matthews_correlation, precision
+from talos.metrics.keras_metrics import recall, fmeasure
+
+
+def iris_model(x_train, y_train, x_val, y_val, params):
+
+    # note how instead of passing the value, we pass a dictionary entry
+    model = Sequential()
+    model.add(Dense(params['first_neuron'],
+                    input_dim=x_train.shape[1],
+                    activation='relu'))
+
+    # same here, just passing a dictionary entry
+    model.add(Dropout(params['dropout']))
+
+    # with this call we can create any number of hidden layers
+    hidden_layers(model, params, y_train.shape[1])
+
+    # again, instead of the activation name, we have a dictionary entry
+    model.add(Dense(y_train.shape[1],
+                    activation=params['last_activation']))
+
+    # here we are using a learning rate boundary
+    model.compile(optimizer=params['optimizer']
+                  (lr=lr_normalizer(params['lr'],
+                                    params['optimizer'])),
+                  loss=params['losses'],
+                  metrics=['acc'])
+
+    # here we are also using the early_stopper function for a callback
+    out = model.fit(x_train, y_train,
+                    batch_size=params['batch_size'],
+                    epochs=params['epochs'],
+                    verbose=0,
+                    validation_data=[x_val, y_val],
+                    callbacks=early_stopper(params['epochs'], mode=[1, 1]))
+
+    return out, model
+
+
+def cervix_model(x_train, y_train, x_val, y_val, params):
+
+    model = Sequential()
+    model.add(Dense(params['first_neuron'],
+                    input_dim=x_train.shape[1],
+                    activation='relu'))
+
+    model.add(Dropout(params['dropout']))
+
+    hidden_layers(model, params, 1)
+
+    model.add(Dense(1, activation=params['last_activation']))
+
+    model.compile(optimizer=params['optimizer']
+                  (lr=lr_normalizer(params['lr'],
+                                    params['optimizer'])),
+                  loss=params['loss'],
+                  metrics=['acc',
+                           fmeasure,
+                           recall,
+                           precision,
+                           matthews_correlation])
+
+    results = model.fit(x_train, y_train,
+                        batch_size=params['batch_size'],
+                        epochs=params['epochs'],
+                        verbose=0,
+                        validation_data=[x_val, y_val],
+                        callbacks=early_stopper(params['epochs'],
+                                                mode='moderate',
+                                                monitor='val_fmeasure'))
+
+    return results, model
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/test/__main__.py b/test/__main__.py
new file mode 100644
index 00000000..c3167fc2
--- /dev/null
+++ b/test/__main__.py
@@ -0,0 +1,3 @@
+import sys
+
+sys.path.append('../talos')
diff --git a/test/core_tests/__init__.py b/test/core_tests/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/test/core_tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/test/core_tests/test_scan.py b/test/core_tests/test_scan.py
new file mode 100644
index 00000000..62e4987d
--- /dev/null
+++ b/test/core_tests/test_scan.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+from keras.losses import categorical_crossentropy, logcosh
+from keras.losses import binary_crossentropy
+from keras.optimizers import SGD, Adam, Adadelta, Adagrad
+from keras.optimizers import Adamax, RMSprop, Nadam
+from keras.activations import softmax, relu, sigmoid
+
+import talos as ta
+
+from talos.model.examples import iris_model, cervix_model
+
+
+p1 = {'lr': [1],
+      'first_neuron': [4],
+      'hidden_layers': [2],
+      'batch_size': [50],
+      'epochs': [1],
+      'dropout': [0],
+      'shapes': ['stairs', 'triangle', 'hexagon', 'diamond',
+                 'brick', 'long_funnel', 'rhombus', 'funnel'],
+      'optimizer': [Adam],
+      'losses': [categorical_crossentropy],
+      'activation': [relu],
+      'last_activation': [softmax],
+      'weight_regulizer': [None],
+      'emb_output_dims': [None]}
+
+p2 = {'lr': [1],
+      'first_neuron': [4],
+      'hidden_layers': [2],
+      'batch_size': [50],
+      'epochs': [1],
+      'dropout': [0],
+      'shapes': ['stairs'],
+      'optimizer': [Adam, Adagrad, Adamax, RMSprop, Adadelta, Nadam, SGD],
+      'losses': [categorical_crossentropy],
+      'activation': [relu],
+      'last_activation': [softmax],
+      'weight_regulizer': [None],
+      'emb_output_dims': [None]}
+
+p3 = {'lr': (0.5, 5, 10),
+      'first_neuron': [4, 8, 16, 32, 64],
+      'hidden_layers': [2, 3, 4, 5],
+      'batch_size': (2, 30, 10),
+      'epochs': [3],
+      'dropout': (0, 0.5, 5),
+      'weight_regulizer': [None],
+      'shapes': ['stairs'],
+      'emb_output_dims': [None],
+      'optimizer': [Nadam],
+      'loss': [logcosh, binary_crossentropy],
+      'activation': [relu],
+      'last_activation': [sigmoid]}
+
+
+class TestIris:
+
+    def __init__(self):
+        self.x, self.y = ta.datasets.iris()
+
+    def test_scan_iris_1(self):
+        print("Running Iris dataset test 1...")
+        ta.Scan(self.x, self.y, params=p1, dataset_name='iris_test_1',
+                experiment_no='000', model=iris_model)
+
+    def test_scan_iris_2(self):
+        print("Running Iris dataset test 2...")
+        ta.Scan(self.x, self.y, params=p2, dataset_name='iris_test_2',
+                experiment_no='000', model=iris_model)
+        ta.Reporting('iris_test_2_000.csv')
+
+
+class TestCancer:
+
+    def __init__(self):
+        self.x, self.y = ta.datasets.cervical_cancer()
+
+    def test_scan_cancer(self):
print("Running Cervical Cancer dataset test...") + ta.Scan(self.x, self.y, grid_downsample=0.001, params=p3, + dataset_name='cervical_cancer_test', experiment_no='a', + model=cervix_model, + reduction_method='spear', reduction_interval=5) + ta.Reporting('cervical_cancer_test_a.csv') + + +class TestLoadDatasets: + + def __init__(self): + print("Testing Load Datasets...") + x = ta.datasets.icu_mortality() + x = ta.datasets.icu_mortality(100) + x = ta.datasets.titanic() + x = ta.datasets.iris() + x = ta.datasets.cervical_cancer() + x = ta.datasets.breast_cancer() + + x = ta.params.iris() + x = ta.params.breast_cancer() # noqa diff --git a/test_script.py b/test_script.py index e7fb383b..ade76f54 100644 --- a/test_script.py +++ b/test_script.py @@ -1,178 +1,10 @@ #!/usr/bin/env python -import talos as ta -from talos.model import lr_normalizer, early_stopper, hidden_layers +from test.core_tests.test_scan import TestIris, TestCancer, TestLoadDatasets -from keras.models import Sequential -from keras.layers import Dropout, Dense -from keras.optimizers import SGD, Adam, Adadelta, Adagrad -from keras.optimizers import Adamax, RMSprop, Nadam -from keras.activations import softmax, relu, elu, sigmoid -from keras.losses import categorical_crossentropy, logcosh -from keras.losses import binary_crossentropy -from talos.metrics.keras_metrics import matthews_correlation, precision -from talos.metrics.keras_metrics import recall, fmeasure - - -def iris_model(x_train, y_train, x_val, y_val, params): - - # note how instead of passing the value, we pass a dictionary entry - model = Sequential() - model.add(Dense(params['first_neuron'], - input_dim=x_train.shape[1], - activation='relu')) - - # same here, just passing a dictionary entry - model.add(Dropout(params['dropout'])) - - # with this call we can create any number of hidden layers - hidden_layers(model, params, y_train.shape[1]) - - # again, instead of the activation name, we have a dictionary entry - model.add(Dense(y_train.shape[1], - activation=params['last_activation'])) - - # here are using a learning rate boundary - model.compile(optimizer=params['optimizer'](lr=lr_normalizer(params['lr'], - params['optimizer'])), - loss=params['losses'], - metrics=['acc']) - - # here we are also using the early_stopper function for a callback - out = model.fit(x_train, y_train, - batch_size=params['batch_size'], - epochs=params['epochs'], - verbose=0, - validation_data=[x_val, y_val], - callbacks=early_stopper(params['epochs'], mode=[1,1])) - - return out, model - - -def cervix_model(x_train, y_train, x_val, y_val, params): - - model = Sequential() - model.add(Dense(params['first_neuron'], - input_dim=x_train.shape[1], - activation='relu')) - - model.add(Dropout(params['dropout'])) - - hidden_layers(model, params, 1) - - model.add(Dense(1, activation=params['last_activation'])) - - model.compile(optimizer=params['optimizer'](lr=lr_normalizer(params['lr'], params['optimizer'])), - loss=params['loss'], - metrics=['acc', - fmeasure, - recall, - precision, - matthews_correlation]) - - results = model.fit(x_train, y_train, - batch_size=params['batch_size'], - epochs=params['epochs'], - verbose=0, - validation_data=[x_val, y_val], - callbacks=early_stopper(params['epochs'], mode='moderate', monitor='val_fmeasure')) - - return results, model - -# PROGRAM STARTS HERE -# =================== - - -# here use a standard 2d dictionary for inputting the param boundaries -p = {'lr': [1], - 'first_neuron': [4], - 'hidden_layers': [2], - 'batch_size': [50], - 'epochs': [1], - 
-     'dropout': [0],
-     'shapes': ['stairs', 'triangle', 'hexagon', 'diamond', 'brick', 'long_funnel', 'rhombus', 'funnel'],
-     'optimizer': [Adam],
-     'losses': [categorical_crossentropy],
-     'activation': [relu],
-     'last_activation': [softmax],
-     'weight_regulizer': [None],
-     'emb_output_dims': [None]}
-
-
-print("Running Iris test 1...")
-x, y = ta.datasets.iris()
-
-print("Scanning...")
-print(x.shape, y.shape)
-h = ta.Scan(x, y,
-            params=p,
-            dataset_name='testing',
-            experiment_no='000',
-            model=iris_model)
-
-print("Running Iris test 2...")
-p = {'lr': [1],
-     'first_neuron': [4],
-     'hidden_layers': [2],
-     'batch_size': [50],
-     'epochs': [1],
-     'dropout': [0],
-     'shapes': ['stairs'],
-     'optimizer': [Adam, Adagrad, Adamax, RMSprop, Adadelta, Nadam, SGD],
-     'losses': [categorical_crossentropy],
-     'activation': [relu],
-     'last_activation': [softmax],
-     'weight_regulizer': [None],
-     'emb_output_dims': [None]}
-
-x, y = ta.datasets.iris()
-
-h = ta.Scan(x, y,
-            params=p,
-            dataset_name='testing',
-            experiment_no='000',
-            model=iris_model)
-
-
-
-
-
-r = ta.Reporting('testing_000.csv')
-
-# here use a standard 2d dictionary for inputting the param boundaries
-
-x, y = ta.datasets.cervical_cancer()
-p = {'lr': (0.5, 5, 10),
-     'first_neuron': [4, 8, 16, 32, 64],
-     'hidden_layers': [2, 3, 4, 5],
-     'batch_size': (2, 30, 10),
-     'epochs': [3],
-     'dropout': (0, 0.5, 5),
-     'weight_regulizer': [None],
-     'shapes': ['stairs'],
-     'emb_output_dims': [None],
-     'optimizer': [Nadam],
-     'loss': [logcosh, binary_crossentropy],
-     'activation': [relu],
-     'last_activation': [sigmoid]}
-
-ta.Scan(x, y,
-        grid_downsample=0.001,
-        params=p,
-        dataset_name='cervix',
-        experiment_no='a',
-        model=cervix_model, reduction_method='spear', reduction_interval=5)
-
-ta.Reporting('cervix_a.csv')
-
-
-x = ta.datasets.icu_mortality()
-x = ta.datasets.icu_mortality(100)
-x = ta.datasets.titanic()
-x = ta.datasets.iris()
-x = ta.datasets.cervical_cancer()
-x = ta.datasets.breast_cancer()
-
-x = ta.params.iris()
-x = ta.params.breast_cancer()
+from test.core_tests.test_scan import TestIris, TestCancer, TestLoadDatasets
+
+
+if __name__ == '__main__':
+    TestIris().test_scan_iris_1()
+    TestIris().test_scan_iris_2()
+    TestCancer().test_scan_cancer()
+    TestLoadDatasets()

From 9bc1bf6f22f502a78421226fb67478303b422ec0 Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Thu, 26 Jul 2018 20:16:03 -0400
Subject: [PATCH 10/13] Finalize testing of Scan explicit val dataset

Notably, fix a bug in Scan where the attributes x_val and y_val were not
being defined.
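The new explicit-validation tests mirror the intended user workflow;
roughly (a sketch using the same names as the test below):

    from sklearn.model_selection import train_test_split
    from talos.scan import Scan

    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.2)

    # both x_val and y_val must be given; passing only one of them
    # makes Scan raise RuntimeError
    Scan(x_train, y_train, params=p2, model=iris_model,
         dataset_name='testing', experiment_no='000',
         x_val=x_dev, y_val=y_dev)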
---
 talos/scan.py                |  2 ++
 test/core_tests/test_scan.py | 67 ++++++++++++++++++++++++------------
 test_script.py               |  5 +++
 3 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/talos/scan.py b/talos/scan.py
index c6c30c7e..88dcce43 100755
--- a/talos/scan.py
+++ b/talos/scan.py
@@ -141,6 +141,8 @@ def __init__(self, x, y, params, dataset_name, experiment_no, model,
             raise RuntimeError(TRAIN_VAL_RUNTIME_ERROR_MSG)
         elif (x_val is not None and y_val is not None):
             self.custom_val_split = True
+            self.x_val = x_val
+            self.y_val = y_val
 
         if debug:
             self.logfile = open('talos.debug.log', 'a')
diff --git a/test/core_tests/test_scan.py b/test/core_tests/test_scan.py
index 62e4987d..9dc5fdef 100644
--- a/test/core_tests/test_scan.py
+++ b/test/core_tests/test_scan.py
@@ -8,10 +8,15 @@
 from keras.optimizers import Adamax, RMSprop, Nadam
 from keras.activations import softmax, relu, sigmoid
 
-import talos as ta
+from sklearn.model_selection import train_test_split
+
+from talos.scan import Scan
+from talos.reporting import Reporting
 
 from talos.model.examples import iris_model, cervix_model
 
+# remotely hosted datasets
+from talos import datasets, params
 
 p1 = {'lr': [1],
@@ -60,44 +65,62 @@
 class TestIris:
 
     def __init__(self):
-        self.x, self.y = ta.datasets.iris()
+        self.x, self.y = datasets.iris()
+        self.x_train, self.x_dev, self.y_train, self.y_dev \
+            = train_test_split(self.x, self.y, test_size=0.2)
 
     def test_scan_iris_1(self):
         print("Running Iris dataset test 1...")
-        ta.Scan(self.x, self.y, params=p1, dataset_name='iris_test_1',
-                experiment_no='000', model=iris_model)
+        Scan(self.x, self.y, params=p1, dataset_name='testing',
+             experiment_no='000', model=iris_model)
 
     def test_scan_iris_2(self):
         print("Running Iris dataset test 2...")
-        ta.Scan(self.x, self.y, params=p2, dataset_name='iris_test_2',
-                experiment_no='000', model=iris_model)
-        ta.Reporting('iris_test_2_000.csv')
+        Scan(self.x, self.y, params=p2, dataset_name='testing',
+             experiment_no='000', model=iris_model)
+        Reporting('testing_000.csv')
+
+    def test_scan_iris_explicit_validation_set(self):
+        print("Running explicit validation dataset test 1...")
+        Scan(self.x_train, self.y_train, params=p2,
+             dataset_name='testing',
+             experiment_no='000', model=iris_model,
+             x_val=self.x_dev, y_val=self.y_dev)
+
+    def test_scan_iris_explicit_validation_set_force_fail(self):
+        print("Running explicit validation dataset test 2...")
+        try:
+            Scan(self.x_train, self.y_train, params=p2,
+                 dataset_name='testing',
+                 experiment_no='000', model=iris_model,
+                 y_val=self.y_dev)
+        except RuntimeError:
+            pass
 
 
 class TestCancer:
 
     def __init__(self):
-        self.x, self.y = ta.datasets.cervical_cancer()
+        self.x, self.y = datasets.cervical_cancer()
 
     def test_scan_cancer(self):
         print("Running Cervical Cancer dataset test...")
-        ta.Scan(self.x, self.y, grid_downsample=0.001, params=p3,
-                dataset_name='cervical_cancer_test', experiment_no='a',
-                model=cervix_model,
-                reduction_method='spear', reduction_interval=5)
-        ta.Reporting('cervical_cancer_test_a.csv')
+        Scan(self.x, self.y, grid_downsample=0.001, params=p3,
+             dataset_name='testing', experiment_no='a',
+             model=cervix_model,
+             reduction_method='spear', reduction_interval=5)
+        Reporting('testing_a.csv')
 
 
 class TestLoadDatasets:
 
     def __init__(self):
         print("Testing Load Datasets...")
-        x = ta.datasets.icu_mortality()
-        x = ta.datasets.icu_mortality(100)
-        x = ta.datasets.titanic()
-        x = ta.datasets.iris()
-        x = ta.datasets.cervical_cancer()
-        x = ta.datasets.breast_cancer()
-
-        x = ta.params.iris()
-        x = ta.params.breast_cancer()  # noqa
+        x = datasets.icu_mortality()
+        x = datasets.icu_mortality(100)
+        x = datasets.titanic()
+        x = datasets.iris()
+        x = datasets.cervical_cancer()
+        x = datasets.breast_cancer()
+
+        x = params.iris()
+        x = params.breast_cancer()  # noqa
diff --git a/test_script.py b/test_script.py
index ade76f54..6030a193 100644
--- a/test_script.py
+++ b/test_script.py
@@ -4,6 +4,11 @@
 
 
 if __name__ == '__main__':
+
+    # TODO describe what all this does
+
+    TestIris().test_scan_iris_explicit_validation_set()
+    TestIris().test_scan_iris_explicit_validation_set_force_fail()
     TestIris().test_scan_iris_1()
     TestIris().test_scan_iris_2()
     TestCancer().test_scan_cancer()
     TestLoadDatasets()

From eb8cf297f937f749a64fd7392b4bafeae0e6142b Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Thu, 26 Jul 2018 20:24:39 -0400
Subject: [PATCH 11/13] Add sklearn to required packages

---
 setup.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/setup.py b/setup.py
index 45e5171b..c364fe36 100755
--- a/setup.py
+++ b/setup.py
@@ -24,6 +24,7 @@
 except ImportError:
     from distutils.core import setup
 
+
 def check_dependencies():
 
     install_requires = []
@@ -48,10 +49,14 @@ def check_dependencies():
         import astetik
     except ImportError:
         install_requires.append('astetik')
-
+    try:
+        import sklearn
+    except ImportError:
+        install_requires.append('sklearn')
 
     return install_requires
 
+
 if __name__ == "__main__":
 
     install_requires = check_dependencies()
@@ -77,13 +82,13 @@ def check_dependencies():
           'talos.metrics'],
 
           classifiers=[
-              'Intended Audience :: Science/Research',
-              'Programming Language :: Python :: 3.6',
-              'License :: OSI Approved :: MIT License',
-              'Topic :: Scientific/Engineering :: Human Machine Interfaces',
-              'Topic :: Scientific/Engineering :: Artificial Intelligence',
-              'Topic :: Scientific/Engineering :: Mathematics',
-              'Operating System :: POSIX',
-              'Operating System :: Unix',
-              'Operating System :: MacOS'],
+          'Intended Audience :: Science/Research',
+          'Programming Language :: Python :: 3.6',
+          'License :: OSI Approved :: MIT License',
+          'Topic :: Scientific/Engineering :: Human Machine Interfaces',
+          'Topic :: Scientific/Engineering :: Artificial Intelligence',
+          'Topic :: Scientific/Engineering :: Mathematics',
+          'Operating System :: POSIX',
+          'Operating System :: Unix',
+          'Operating System :: MacOS'],
 )

From 3c50fd8ca18a9596fe83c8888603e4170ce3a674 Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Thu, 26 Jul 2018 20:29:13 -0400
Subject: [PATCH 12/13] Fix import error from metrics re fmeasure_acc

Appears that fmeasure was changed to fmeasure_acc in a previous commit.
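For reference, metrics of this family are conventionally defined along
these lines (a sketch of the classic batch-wise F1 score in Keras backend
terms; the canonical definition lives in talos/metrics/keras_metrics.py,
and only the name fmeasure_acc is taken from there):

    from keras import backend as K

    def fmeasure_acc(y_true, y_pred):
        # batch-wise F1: harmonic mean of precision and recall
        true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
        poss_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
        precision = true_pos / (pred_pos + K.epsilon())
        recall = true_pos / (poss_pos + K.epsilon())
        return 2 * (precision * recall) / (precision + recall + K.epsilon())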
---
 talos/model/examples.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/talos/model/examples.py b/talos/model/examples.py
index d4673d8a..8e9ef8b5 100644
--- a/talos/model/examples.py
+++ b/talos/model/examples.py
@@ -6,7 +6,7 @@
 from keras.layers import Dropout, Dense
 
 from talos.metrics.keras_metrics import matthews_correlation, precision
-from talos.metrics.keras_metrics import recall, fmeasure
+from talos.metrics.keras_metrics import recall, fmeasure_acc
 
 
 def iris_model(x_train, y_train, x_val, y_val, params):
@@ -63,7 +63,7 @@ def cervix_model(x_train, y_train, x_val, y_val, params):
                                     params['optimizer'])),
                   loss=params['loss'],
                   metrics=['acc',
-                           fmeasure,
+                           fmeasure_acc,
                            recall,
                            precision,
                            matthews_correlation])

From 8a619a7cd77ce05ca40c4f0165aee9c9917bc148 Mon Sep 17 00:00:00 2001
From: Matthew Carbone
Date: Thu, 26 Jul 2018 20:35:39 -0400
Subject: [PATCH 13/13] Final checks

---
 test_script.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test_script.py b/test_script.py
index 6030a193..2e2321cd 100644
--- a/test_script.py
+++ b/test_script.py
@@ -6,7 +6,6 @@ if __name__ == '__main__':
 
     # TODO describe what all this does
-
     TestIris().test_scan_iris_explicit_validation_set()
     TestIris().test_scan_iris_explicit_validation_set_force_fail()
     TestIris().test_scan_iris_1()
     TestIris().test_scan_iris_2()