## Research and cross validation

In [1]:
import os
import sys
from copy import deepcopy
from itertools import combinations
from collections import defaultdict
sys.path.insert(0, os.path.join('..'))

import numpy as np
import matplotlib.pyplot as plt

from radial.batchflow import Dataset, FilesIndex, Pipeline, R, B, V, C
from radial.batchflow.models.tf import ResNet18, ResNet34
from radial.batchflow.research import Research, Option
from radial.core import RadialBatch
from radial.core.pipelines import create_preprocess_pipeline, create_train_pipeline
plt.style.use('ggplot')
%matplotlib inline

In [2]:
def create_datasets(path, batch, cross_val=None):
    """Create dataset(s) over the files matching `path`.

    Parameters
    ----------
    path : str
        Path mask passed to `FilesIndex`.
    batch : type
        Batch class used by every created `Dataset`.
    cross_val : int or None
        Number of cross-validation folds. `None` or `1` disables cross-validation.

    Returns
    -------
    Dataset or list
        If `cross_val` is None or equal to 1, a single `Dataset` on which `split()`
        has already been called. Otherwise a list of length `cross_val` whose items
        are `[train_dataset, test_dataset]` pairs; every fold is used as the test
        part exactly once and the remaining folds form the train part.
    """
    ix = FilesIndex(path=path)
    if cross_val is None or cross_val == 1:
        dset = Dataset(ix, batch)
        dset.split()
        return dset

    folds = np.array_split(ix.indices, cross_val)
    dsets = []
    # Hold out folds from the last one to the first one: the first pair is tested
    # on the last fold, the last pair is tested on the first fold.
    for held_out in reversed(range(cross_val)):
        train_indices = np.concatenate(folds[:held_out] + folds[held_out + 1:])
        test_indices = folds[held_out]
        dset_train = Dataset(index=ix.create_subset(train_indices), batch_class=batch)
        dset_test = Dataset(index=ix.create_subset(test_indices), batch_class=batch)
        dsets.append([dset_train, dset_test])
    return dsets

## Подготовка данных
Если хотите обучать пайплайны с кросс-валидацией, укажите количество фолдов в `cross_val`; иначе (при `cross_val = None` или `cross_val = 1`) будет выполнено стандартное разделение на train/test.

In [3]:
# Number of cross-validation folds and the mask of the data files.
cross_val = 2
path = './data/*'

# With `cross_val` > 1 this is a list of [train, test] dataset pairs.
dataset = create_datasets(path=path, batch=RadialBatch, cross_val=cross_val)

# Создание пайплайнов

In [4]:
# Second dimension of the `points` input placeholder.
N_SAMPLES = 50

# Model configuration; `C('filters')` is a named-config placeholder
# whose key matches the `filters` option of the research grid.
model_config = {
    'inputs': {
        'points': {'shape': (2, N_SAMPLES)},
        'targets': {'name': 'target', 'shape': 1},
    },
    'initial_block/inputs': 'points',
    'head': {'layout': 'f', 'units': 1},
    'body/num_blocks': [1, 1, 1],
    'body/filters': [8, 16, 32],
    'initial_block/filters': C('filters'),
    'loss': 'mse',
    'optimizer': 'Adam',
}

In [5]:
# Batch size passed to the lazy `run` call of both the train and the test pipeline.
B_SIZE = 50
# Build the train and test pipelines for the research here, without loading data:
# no dataset is attached at this point.
prep_pipeline = create_preprocess_pipeline(N_SAMPLES, np.random.random)
# `C('model')` is a named-config placeholder; its key matches the `model` option of the research grid.
train_pipeline = create_train_pipeline(C('model'), model_config, prep_pipeline).run(B_SIZE, n_epochs=None, drop_last=True, lazy=True)

# Test pipeline: preprocessing followed by prediction with the model imported
# from the pipeline named by the `import_from` config key.
# `predictions` and `targets` are re-initialised as empty lists on each run and
# extended (mode='e') batch by batch.
# NOTE(review): `drop_last=True` presumably skips the last incomplete batch, so some
# test items may never be evaluated - confirm this is intended.
test_pipeline = prep_pipeline + (Pipeline()
                        .init_variable('predictions', init_on_each_run=list)
                        .init_variable('targets', init_on_each_run=list)
                        .import_model('model', C('import_from'))
                        .update_variable('targets', B('target'), mode='e')
                        .predict_model('model', fetches='predictions',
                                                feed_dict={'points': B('points'),
                                                           'targets': B('target')},
                                        save_to=V('predictions'), mode='e')
                        .run(B_SIZE, n_epochs=1, drop_last=True, lazy=True)
)

# Метрики
По умолчанию будем считать две метрики: MAPE и «катежеков-like» — долю примеров (в процентах), у которых относительная ошибка меньше 30% (`get_mape30`).

In [6]:
def get_mape(iteration, experiment, pipeline):
    """ Calculate the absolute percentage error of every prediction.

    No averaging is done here: the per-item errors are returned so that callers
    can aggregate them (e.g. `get_mape30` thresholds them).

    Parameters
    ----------
    iteration : int
        Current research iteration (unused).
    experiment : mapping
        Research experiment; `experiment[pipeline].pipeline` must expose
        `get_variable` with the 'predictions' and 'targets' variables.
    pipeline : str
        Name of the pipeline to read the variables from.

    Returns
    -------
    np.ndarray
        1-D array with `|y_true - y_pred| / |y_true|` for every item.
    """
    _ = iteration
    ppl = experiment[pipeline].pipeline
    # Flatten both arrays: if targets are stored as column vectors while predictions
    # are flat, the subtraction would broadcast into an N x N matrix.
    y_pred = np.array(ppl.get_variable('predictions')).reshape(-1)
    y_true = np.array(ppl.get_variable('targets')).reshape(-1)
    # The absolute value in the denominator keeps the error non-negative for negative targets.
    return np.abs(y_true - y_pred) / np.abs(y_true)

def get_mape30(iteration, experiment, pipeline):
    """ Calculate the percentage of items whose absolute percentage error is below 30%.

    Parameters
    ----------
    iteration : int
        Current research iteration, forwarded to `get_mape`.
    experiment : mapping
        Research experiment, forwarded to `get_mape`.
    pipeline : str
        Name of the pipeline holding 'predictions' and 'targets'.

    Returns
    -------
    float
        Value in [0, 100].
    """
    # `get_mape` returns per-item errors; the original code called the undefined name `mape`.
    errors = get_mape(iteration, experiment, pipeline)
    return np.mean(errors < 0.3) * 100

# Also save the trained models, just in case.
def save_model(iteration, experiment, pipeline, model_name, path='./'):
    """ Save a model from the given research pipeline.

    The model is stored under `path` in a location named
    `<config alias>_<iteration>`.

    Parameters
    ----------
    iteration : int
        Current research iteration; appended to the save location name.
    experiment : mapping
        Research experiment; `experiment[pipeline]` provides `config` and `pipeline`.
    pipeline : str
        Name of the pipeline that owns the model.
    model_name : str
        Name of the model inside the pipeline.
    path : str
        Directory prefix for the saved model.
    """
    unit = experiment[pipeline]
    save_name = unit.config.alias(as_string=True) + '_' + str(iteration)
    unit.pipeline.save_model(model_name, os.path.join(path, save_name))

## Создание Research

In [7]:
# Change these to your own parameters.
# Grid of options: every model class is combined with every `filters` value.
opts = Option('model', [ResNet18, ResNet34]) \
      * Option('filters', [4, 8, 16])

# Research layout: a 'train' pipeline, a 'test' pipeline that imports the model
# from 'train', two metric functions evaluated on 'test', and a model-saving
# function bound to 'train'.
# NOTE(review): both metric functions are registered with returns='loss' -
# presumably the name under which their result is stored; confirm it is intended.
research = (Research()
            .pipeline(train_pipeline, variables='loss', name='train')
            .pipeline(test_pipeline, name='test', execute='%5',
                      run=True, import_from='train')
            .grid(opts)
            .function(get_mape, returns='loss', name='test_mape',
                      execute='%5', pipeline='test')
            .function(get_mape30, returns='loss', name='test_mape30',
                      execute='%5', pipeline='test')
            .function(save_model, execute=-1, pipeline='train',
                  model_name='model', path='saved_models/')
)

In [8]:
def _bind_datasets(res, train_dset, test_dset):
    """Return a deep copy of `res` whose 'train' and 'test' executables run on the given datasets."""
    research = deepcopy(res)
    train = Research().pipeline(train_pipeline << train_dset, variables='loss', name='train')
    # The test executable must be built from `test_pipeline`: it is the one that imports
    # the trained model and collects 'predictions' and 'targets' for the metric functions.
    test = Research().pipeline(test_pipeline << test_dset, name='test', execute='%5',
                               run=True, import_from='train')
    research.executables['train'] = train.executables['train']
    research.executables['test'] = test.executables['test']
    return research


def execute_research_with_cv(cross_val, res, dataset, n_reps, n_iters, research_name='research'):
    """Execute a research on a single dataset or on every cross-validation fold.

    Parameters
    ----------
    cross_val : int
        Number of cross-validation folds; used only when `dataset` is a list.
    res : Research
        Research template. It is deep-copied and never modified.
    dataset : Dataset or list
        Either a dataset with `train` and `test` parts, or a list of
        `[train_dataset, test_dataset]` pairs (one per fold).
    n_reps : int
        Passed to `Research.run` as `n_reps`.
    n_iters : int
        Passed to `Research.run` as `n_iters`.
    research_name : str
        Name of the research. With cross-validation the fold number is appended
        as `_cv_<i>`.

    Returns
    -------
    Research or list of Research
        The executed research, or one executed research per fold.
    """
    if not isinstance(dataset, list):
        research = _bind_datasets(res, dataset.train, dataset.test)
        research.run(n_reps=n_reps, n_iters=n_iters, name=research_name, progress_bar=True)
        return research

    research_list = []
    print('number of bins: ', cross_val)
    for i in range(cross_val):
        dset_train, dset_test = dataset[i]
        research = _bind_datasets(res, dset_train, dset_test)
        research_name_cv = research_name + '_cv_%d' % i
        research.run(n_reps=n_reps, n_iters=n_iters, name=research_name_cv, progress_bar=True)
        research_list.append(research)
    return research_list

In [9]:
# Smoke run: one repetition and one iteration per configuration on every fold.
execute_research_with_cv(cross_val, research, dataset, n_reps=1, n_iters=1)

number of bins:  2
Research research_cv_0 is starting...


  0%|          | 0/6 [00:00<?, ?it/s]

Distributor has 6 jobs with 1 iterations. Totally: 6


100%|██████████| 6/6 [00:33<00:00,  5.54s/it]


Research research_cv_1 is starting...


  0%|          | 0/6 [00:00<?, ?it/s]

Distributor has 6 jobs with 1 iterations. Totally: 6


100%|██████████| 6/6 [00:32<00:00,  5.35s/it]


[<radial.batchflow.research.research.Research at 0x10e2b3160>,
 <radial.batchflow.research.research.Research at 0x1c3fb7d400>]