# Callables in Research

The main purpose of Research is to run pipelines with different configs in parallel, but you can also add callables and build very flexible experiment plans, even without pipelines.

In [1]:
import sys
import os
import shutil

import warnings
warnings.filterwarnings('ignore')

from tensorflow import logging
logging.set_verbosity(logging.ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import matplotlib
%matplotlib inline

import numpy as np

In [2]:
sys.path.append('../../..')

from batchflow import Pipeline, B, C, V, D, L
from batchflow.opensets import MNIST
from batchflow.models.tf import VGG7, VGG16
from batchflow.research import Research, Option, Results, RP, RR, RD, REP, RID, RI

In [3]:
def clear_previous_results(res_name):
    """Remove the directory tree at `res_name`, if anything exists at that path."""
    if not os.path.exists(res_name):
        # Nothing left over from a previous run.
        return
    shutil.rmtree(res_name)

## Simple example

To add your callable to a Research, use the `add_callable` method:

In [4]:
res_name = 'sample_callable_research'
clear_previous_results(res_name)

def randn_std():
    """Return one sample drawn by `np.random.randn()` (standard normal)."""
    sample = np.random.randn()
    return sample

# Register `randn_std` as a research unit named 'randn_std'; its return value
# is reported in the 'random' column of the results.
research = Research().add_callable(randn_std, returns='random', name='randn_std')

# Run 5 iterations of the research called `res_name`.
research.run(5, name=res_name)

# Load the collected results and show them as a table.
research.load_results().df

Research sample_callable_research is starting...


Unnamed: 0,name,random,iteration,sample_index,repetition,update
0,randn_std,0.592352,0,4056567556,0,0
1,randn_std,-0.994278,1,4056567556,0,0
2,randn_std,0.584242,2,4056567556,0,0
3,randn_std,-0.752847,3,4056567556,0,0
4,randn_std,-0.666386,4,4056567556,0,0


You can also pass `args` and `kwargs` to your callables: just add them to the `add_callable` call.

In [5]:
clear_previous_results(res_name)

def randn(mean=0, std=1):
    """Return one `np.random.randn()` sample scaled by `std` and shifted by `mean`."""
    draw = np.random.randn()
    scaled = draw * std
    return scaled + mean

# `mean` and `std` are extra keyword arguments handed over to `randn`;
# `returns` and `name` work as in the previous example.
research = Research().add_callable(randn, mean=2, std=5, returns='random', name='randn')

# Run 5 iterations of the research called `res_name`.
research.run(5, name=res_name)

# Load the collected results and show them as a table.
research.load_results().df

Research sample_callable_research is starting...


Unnamed: 0,name,random,iteration,sample_index,repetition,update
0,randn,-2.867201,0,536019212,0,0
1,randn,4.830893,1,536019212,0,0
2,randn,6.040271,2,536019212,0,0
3,randn,5.427344,3,536019212,0,0
4,randn,6.148175,4,536019212,0,0


## Named Expressions

Obviously, such usage of `args` and `kwargs` is not very useful, because the same can be achieved with `functools.partial`. However, you can also use named expressions to pass into a function objects that depend on the research itself. For example, you can access the results already produced by the current research with the `RR` named expression, which corresponds to `Results(path=res_name)`.

In [6]:
res_name = 'max_research'

clear_previous_results(res_name)

def stat(results):
    """Return the `(min, max)` pair of the `random` column of `results`."""
    values = results.random
    lowest = values.min()
    highest = values.max()
    return lowest, highest

research = (Research()
    # Draw a random number on each iteration.
    # NOTE(review): `dump=1` presumably writes the results to disk on every
    # iteration so that `RR` below can read them — confirm against the Research docs.
    .add_callable(randn, mean=2, std=5, returns='random', name='randn', dump=1)
    # `RR().df` is a named expression: `stat` receives the results of the current
    # research and returns the min and max of the 'random' values collected so far.
    .add_callable(stat, results=RR().df, returns=['min_value', 'max_value'], name='stat')
)

# Run 5 iterations of the research called `res_name`.
research.run(5, name=res_name)

# Load the collected results and show them as a table.
research.load_results().df

Research max_research is starting...


Unnamed: 0,name,random,min_value,max_value,iteration,sample_index,repetition,update
0,randn,4.96176,,,0,3260983596,0,0
1,randn,-2.971391,,,1,3260983596,0,0
2,randn,4.921212,,,2,3260983596,0,0
3,randn,-1.764236,,,3,3260983596,0,0
4,randn,-1.33193,,,4,3260983596,0,0
5,stat,,4.96176,4.96176,0,3260983596,0,0
6,stat,,-2.971391,4.96176,1,3260983596,0,0
7,stat,,-2.971391,4.96176,2,3260983596,0,0
8,stat,,-2.971391,4.96176,3,3260983596,0,0
9,stat,,-2.971391,4.96176,4,3260983596,0,0


## Save only the best model

One can use callables to save only the best (in some sense) model, for example, the model with the highest accuracy on the test set.

First, define the pipelines as usual:

In [7]:
BATCH_SIZE = 64
mnist = MNIST()
# Product of two options: each (layout, bias) combination is a separate config,
# 4 configs in total.
domain = Option('layout', ['cna', 'can']) * Option('bias', [True, False])

# Model configuration shared by all experiments.
# `C('layout')` and `C('bias')` refer by name to the options of `domain`;
# `B('image_shape')` refers to a batch attribute.
# NOTE(review): presumably these named expressions are resolved per experiment /
# per batch when the model is built — confirm.
model_config={
    'inputs/images/shape': B('image_shape'),
    'inputs/labels/classes': 10,
    'inputs/labels/name': 'targets',
    'initial_block/inputs': 'images',
    'body/block/layout': C('layout'),
    'common/conv/use_bias': C('bias'),
}

In [8]:
# Training pipeline: creates a VGG7 model named 'conv' from `model_config`,
# trains it on batch images/labels and saves the fetched loss to the pipeline
# variable 'loss'.
# NOTE(review): `mode='w'` presumably overwrites the variable on each step — confirm.
train_ppl = (Pipeline()
            .init_variable('loss')
            .init_model('dynamic', VGG7, 'conv', config=model_config)
            .to_array()
            .train_model('conv', 
                         images=B('images'), labels=B('labels'),
                         fetches='loss', save_to=V('loss', mode='w'))           
)

# Root pipeline over the MNIST train part: shuffled batches of BATCH_SIZE items.
# NOTE(review): `n_epochs=None` presumably means batches are generated without an
# epoch limit — confirm.
train_root = mnist.train.p.run_later(BATCH_SIZE, shuffle=True, n_epochs=None)

In [9]:
# Test pipeline: imports the model 'conv' from the source given by the config
# item 'import_from', makes predictions and accumulates classification metrics
# in the pipeline variable 'metrics'.
# NOTE(review): `mode='a'` presumably appends the metrics of every batch — confirm.
test_ppl = (Pipeline()
                 .init_variable('predictions')
                 .init_variable('metrics')
                 .import_model('conv', C('import_from'))
                 .to_array()
                 .predict_model('conv', 
                                images=B('images'), labels=B('labels'),
                                fetches='predictions', save_to=V('predictions'))
                 .gather_metrics('class', targets=B('labels'), predictions=V('predictions'), 
                                fmt='logits', axis=-1, save_to=V('metrics', mode='a'))
)

test_root = mnist.test.p.run_later(BATCH_SIZE, shuffle=True, n_epochs=1) # NOTE: n_epochs=1, i.e. one epoch over the test part

Now define a callable which will get the train pipeline with the model, the results of the current experiment, the path to the folder with the experiment results, and the current iteration of the research.

In [10]:
import glob
import shutil

def save_model(ppl, results, path, iteration):
    """Keep on disk only the model from the iteration with the best accuracy.

    If the highest value in the `accuracy` column of `results` belongs to the
    current `iteration`, every existing `model_*` directory inside `path` is
    removed and the 'conv' model of `ppl` is saved to `<path>/model_<iteration>`.
    Otherwise nothing on disk is touched.

    Parameters
    ----------
    ppl : pipeline
        Object providing `get_model_by_name`; its 'conv' model is saved.
    results : pandas.DataFrame
        Must contain `accuracy` and `iteration` columns.
    path : str
        Folder where the model directories are stored.
    iteration : int
        Current iteration of the research.

    Returns
    -------
    str
        `path`, unchanged (returned whether or not a model was saved).
    """
    # `idxmax` returns an index *label*, not a position, so it must not be fed
    # to `iloc`. Resetting the index makes labels unique and equal to positions,
    # which keeps the lookup correct for filtered or concatenated results whose
    # index is not 0..n-1.
    ranked = results.reset_index(drop=True)
    best_row = ranked.loc[ranked.accuracy.idxmax()]
    if best_row.iteration == iteration:
        # The current iteration is the best so far: drop previously saved models.
        for item in glob.glob(glob.escape(path) + '/model_*'):
            shutil.rmtree(item)
        model_path = os.path.join(path, 'model_{}'.format(iteration))
        ppl.get_model_by_name("conv").save(model_path)
    return path

In [11]:
res_name = 'save_model_research'

clear_previous_results(res_name)

To define the values of these parameters we will use named expressions. The `args` and `kwargs` of `RR` are passed to the `Results` initialization.

In [12]:
# Period used below for testing, metrics evaluation and model saving.
# NOTE(review): `[EXECUTE_EACH, 'last']` presumably means "every EXECUTE_EACH-th
# iteration and the last one" — confirm against the Research docs.
EXECUTE_EACH = 10

research = (Research()
    .init_domain(domain)
    .add_pipeline(train_root, train_ppl, variables='loss', name='train_ppl')
    # The test pipeline gets the train pipeline via `import_from=RP('train_ppl')`,
    # which is the value read by `C('import_from')` in `import_model`.
    .add_pipeline(test_root, test_ppl, variables='metrics', run=True, name='test_ppl',
                  import_from=RP('train_ppl'),
                  execute=[EXECUTE_EACH, 'last'], dump=[EXECUTE_EACH, 'last'])
    # Evaluate 'accuracy' from the 'metrics' variable of the test pipeline.
    .get_metrics(pipeline='test_ppl', metrics_var='metrics', metrics_name='accuracy',
                 returns='accuracy',
                 execute=[EXECUTE_EACH, 'last'], dump=[EXECUTE_EACH, 'last'])
    # `save_model` arguments are named expressions: the train pipeline, the results
    # of the current experiment, the folder with the experiment results and the
    # current iteration.
    .add_callable(save_model, returns='model_path', execute=[EXECUTE_EACH, 'last'],
                  ppl=RP('train_ppl'), 
                  results=RR(sample_index=RID(), names='test_ppl_metrics').df,
                  path=L(os.path.join)(RD(), REP()),
                  iteration=RI())
)

# Run 300 iterations of the research called `res_name` with a progress bar.
research.run(300, branches=4, name=res_name, bar=True)

Research save_model_research is starting...


Domain updated: 0: 100%|██████████| 300/300.0 [04:34<00:00,  1.09it/s]


<batchflow.research.research.Research at 0x7fda99321080>

Let's check that we have only the best models for each config.

In [13]:
results = research.load_results(concat_config=True).df

List of the saved models:

In [14]:
glob.glob(os.path.join(res_name, 'results', '*', '*', 'model*'))

['save_model_research/results/bias_True-layout_can-repetition_0-update_0/2843298119/model_299',
 'save_model_research/results/bias_False-layout_cna-repetition_0-update_0/3036401677/model_299',
 'save_model_research/results/bias_False-layout_can-repetition_0-update_0/3895489470/model_299',
 'save_model_research/results/bias_True-layout_cna-repetition_0-update_0/2198009815/model_299']

Iterations for each config with the best test accuracy:

In [15]:
results.groupby('config').apply(lambda x: x.loc[x.accuracy.idxmax()])[['config', 'accuracy', 'iteration']]

Unnamed: 0_level_0,config,accuracy,iteration
config,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bias_False-layout_can,bias_False-layout_can,0.967957,299
bias_False-layout_cna,bias_False-layout_cna,0.967821,299
bias_True-layout_can,bias_True-layout_can,0.968163,299
bias_True-layout_cna,bias_True-layout_cna,0.969032,299
