In [1]:
# Online training

Imports

In [2]:
# !cat /project/python/lib/ml/cnnscore.py

In [3]:
# -*- coding: utf-8 -*-

from __future__ import print_function, absolute_import

import os
import logging

import pandas as pd
import numpy as np
import copy

from tseries.chartdata import get_range
import datetime
from database import db_session
from models import Algo

import chainer
from chainer import optimizers
# from chainer import serializers
from chainer import Variable
import chainer.functions as F
import chainer.links as L

from trading import tm


def _get_algo_data(algo_id):
    """Fetch the stored settings of an algorithm.

    Args:
        algo_id: primary key of the `Algo` row to load.

    Returns:
        Tuple `(model_params, dataset, score, start, end)`:
        the strategy's `model_params`, the `"data"` entry of the model's
        dataset, the algorithm's score object, and the start / end of the
        current backtest reached through that score.

    The scoped session is removed in every case, including when one of
    the lookups raises.
    """
    try:
        algo = db_session.query(Algo).get(algo_id)
        model_params = algo.strategy.model_params
        dataset = algo.model.dataset["data"]
        score = algo.score
        backtest = score.algo.current_backtest
        start, end = backtest.start, backtest.end
    finally:
        db_session.remove()
    return model_params, dataset, score, start, end


def _center_01(x):
    """Center data between 0 and 1."""
    c = (x - x.min()) / (x.max() - x.min())
    return np.array([i for i in c]).astype(np.float32)


def _discard_grad_and_normalizers(mp):
    """Discard gradients and normalizers from model parameters."""
    features = []
    for idxi, i in enumerate(mp["features"]):
        if "_dg2" in i["id"]:
            continue
        else:
            tmp = i
            tmp[u"normalizers"] = []
            features.append(tmp)
            if tmp["indicator_type"] == "DelayedGrad":
                tmp["indicator_type"] = "Identity"
            tmp.pop("scale", None)
    mp["features"] = features


def _get_indicators_group(mp):
    """Group similar indicators.

    Group indicators by names, and keep related ones together,
    ex. "MACD" and "MACD-signal".
    """

    scaling_list = {"Function": [], "Identity": [], "MA": [], "MACD": [],
                    "StdDev": [], "BBANDS": [], "RSI": [], "UMustache": [],
                    "LMustache": [], "CandleSize": [], "PivotPoint": [],
                    "PlusDI": [], "MinusDI": [], "ADX": [], "Momentum": [],
                    "Stochastic": [], "SMI": [], "Operator": [],
                    "Ichimoku": [], "HLBand": [], "RCI": [], "DiffM": [],
                    "Mustache": [],  # added
                    }
    _linked_ohlc = ["BBANDS", "HLBand"]

    for idxi, i in enumerate(mp["features"]):
        try:
            t = i["indicator_type"]
            if t == "Identity" or t == "DelayedGrad" or i in _linked_ohlc:
                scaling_list["Identity"].append(idxi)
            elif "Mustache" in t:
                scaling_list["Mustache"].append(idxi)
            elif t == "MACD":
                if "-signal" not in i["id"]:
                    scaling_list[i["id"]] = [idxi]
                else:
                    scaling_list[i["id"][:-7]].append(idxi)
            else:
                scaling_list[t].append(idxi)
        except KeyError:
            scaling_list[t] = [idxi]

    indicators_groups = []
    for i in scaling_list.values():
        if i != []:
            indicators_groups.append(i)
    return indicators_groups


def _check_negative_inputs(dataset):
    if any([i["label"] == ['neg_icon'] for i in dataset]):
        raise NotImplementedError("Can't handle negative inputs yet")


def _load_df_range(mp, indicators, start, end):
    """Load the price/indicator DataFrame covering `start`..`end`.

    The requested range is widened by 15 days on both sides before the
    data is fetched with `get_range`.

    Args:
        mp (dict): model parameters; 'symbol', 'timeframe' and "features"
            are forwarded to `get_range`.
        indicators (list): column names; the entries from position 4 on
            are kept after the four OHLC columns.
        start: first date needed.
        end: last date needed.

    Returns:
        float32 DataFrame indexed by "Date" with the columns
        "Open", "High", "Low", "Close" followed by `indicators[4:]`.
    """
    margin = datetime.timedelta(15)
    range_start = start - margin
    range_end = end + margin

    raw = get_range(range_start, range_end,
                    mp['symbol'], mp['timeframe'], mp["features"])

    # The dates are set aside so the float32 cast only touches the values.
    dates = raw.pop("Date", None)
    frame = pd.DataFrame.from_dict(raw).astype(np.float32)
    frame["Date"] = dates
    # presumably nanosecond timestamps scaled down to seconds for `tm.T`
    # -- TODO confirm against `get_range`
    frame["Date"] = frame["Date"].apply(lambda stamp: tm.T(stamp / 1e9))
    frame = frame.set_index("Date")

    ordered_columns = ["Open", "High", "Low", "Close"] + indicators[4:]
    return frame[ordered_columns]


def _easing(t, b, c, d):
    t /= d
    return c * t * t + b


def _process_range(r, df, align, justify_right=True):
    """Cut an `align`-row window out of `df` around the range `r`.

    Args:
        r (dict): range with "start" and "end" entries accepted by `tm.T`.
        df (pandas.DataFrame): data indexed by date.
        align (int): number of rows of the returned window.
        justify_right (bool): when True the window is the `align` rows
            that precede the end of the range; otherwise the range is
            centered in the window, which is shifted as needed to stay
            inside `df`.

    Returns:
        `(x_values, y_values)`: the window as a float32 DataFrame and a
        float32 array of `align` targets that is zero outside the range
        and ramps up as `(t / datalen) ** 3` inside it.

    Raises:
        AssertionError: if the window does not fit inside `df`.
    """

    startidx = df.index.get_loc(tm.T(r["start"]), method="ffill")
    endidx = df.index.get_loc(tm.T(r["end"]), method="bfill")

    datalen = endidx - startidx
    if datalen >= align:
        # the range is longer than the window: keep only its latest rows
        startidx = endidx - align + 1
        datalen = endidx - startidx
    assert (datalen < align)

    if justify_right:
        leftmost = endidx - align
        rightmost = endidx
        start_margin = 0
    else:
        # floor division keeps the margin usable as a slice index on
        # every Python version
        start_margin = (align - datalen) // 2
        leftmost = max(startidx - start_margin, 0)
        rightmost = min(leftmost + align, df.shape[0])

        if rightmost - leftmost < align:
            # shift back the leftmost point
            leftmost -= align - (rightmost - leftmost)

    assert (leftmost >= 0)
    assert (rightmost <= df.shape[0])
    assert (rightmost - leftmost == align)

    x_values = df[leftmost:rightmost]

    y_values = np.zeros(align, dtype=np.float32)
    if datalen > 0:
        # An empty or reversed range has no ramp to write; skipping it
        # also avoids `y_values[-0:]`, which would address the whole array.
        t = np.arange(1, datalen + 1, dtype=np.float32)
        ramp = _easing(t, 0.0, t / datalen, datalen)
        if justify_right:
            y_values[-datalen:] = ramp
        else:
            y_values[start_margin:start_margin + datalen] = ramp

    return x_values.astype(np.float32), y_values


def _select_normalize_from_df(r, df, align, indicators_groups,
                              justify_right=True):
    """Extract the window of a range and normalize it group by group.

    Args:
        r (dict): range forwarded to `_process_range`.
        df (pandas.DataFrame): source data.
        align (int): window length in rows.
        indicators_groups (list): lists of column positions; the columns
            of one group are rescaled together by `_center_01`.
        justify_right (bool): forwarded to `_process_range`.

    Returns:
        `(values, targets)`: the normalized window as a float32 array and
        the targets computed by `_process_range`.
    """

    window, targets = _process_range(r, df, align,
                                     justify_right=justify_right)

    values = window.values
    for columns in indicators_groups:
        normalized = _center_01(values[:, columns])
        values[:, columns] = normalized
    return values.astype(np.float32), targets


def _get_output_size(*args):
    """Compute input size for fully connected layer after CNN.

    Args:
        *args: (in,
        [l_pad, l_k, l_ch, l_st],
        [l_pad, l_k, l_ch, l_st],
        ...)

    Returns:
        size (int)

    Example:
        (input_size,
         [layer_1_padding, layer_1_filter, layer_1_channel, layer_1_stride],
         [layer_2_padding, layer_2_filter, layer_2_channel, layer_2_stride],
         ...
         [layer_N_padding, layer_N_filter, layer_N_channel, layer_N_stride],
        )

    Computed as:
    $$n_{out}=((n_{in} + 2*n_{padding} - n_{filter})\
/ n_{stride} + 1) * n_{channel}$$
    """
    out = -1
    previous = 1
    for idx, i in enumerate(args):
        if idx == 0:
            out = i
            continue
        out = ((out / previous + 2 * i[0] - i[1]) / i[3] + 1) * i[2]
        previous = i[2]
    return out


class _CNN_1D(chainer.Chain):
    """1D convolutional network built from (k, 1) 2D convolutions.

    Three convolution + batch normalization stages followed by two fully
    connected layers, with dropout after every hidden stage.
    The hyperparameters are arbitrary and were chosen empirically.
    """

    def __init__(self, input_size, input_channels):
        # hyper parameters, one (padding, kernel, channels, stride) entry
        # per convolution stage
        conv_specs = [
            (2, 11, 32, 3),
            (2, 5, 64, 2),
            (2, 3, 128, 1),
        ]

        # parameters
        self.input_size = input_size
        self.input_channels = input_channels

        self.fullco_val = _get_output_size(
            self.input_size, *[list(spec) for spec in conv_specs])

        # The links are created in the order c1_1, bn1_1, c2_1, ... l5.
        links = {}
        in_channels = self.input_channels
        for stage, (pad, ksize, out_channels, stride) in enumerate(
                conv_specs, 1):
            links["c%d_1" % stage] = L.Convolution2D(
                in_channels, out_channels, (ksize, 1),
                stride=(stride, 1), pad=(pad, 0))
            links["bn%d_1" % stage] = L.BatchNormalization(out_channels)
            in_channels = out_channels

        # The problem with this fully connected layer is that it's
        # preventing the network to work on arbitrary sized inputs...
        #
        # Use global average pooling to get rid of this!
        # http://image-net.org/challenges/LSVRC/
        # 2014/slides/ILSVRC2014_NUS_release.pdf    slide 6
        # https://arxiv.org/pdf/1312.4400.pdf
        # http://cnnlocalization.csail.mit.edu/
        #
        links["l4"] = L.Linear(self.fullco_val, 200)
        links["l5"] = L.Linear(200, 1)

        super(_CNN_1D, self).__init__(**links)

    def __call__(self, x, train=False):
        """Run a forward pass; dropout and batch statistics follow `train`."""
        is_test = not train
        stages = (
            (self.c1_1, self.bn1_1),
            (self.c2_1, self.bn2_1),
            (self.c3_1, self.bn3_1),
        )

        h = x
        for conv, batch_norm in stages:
            h = F.relu(batch_norm(conv(h), test=is_test))
            h = F.dropout(h, ratio=0.5, train=train)

        h = F.relu(self.l4(h))
        h = F.dropout(h, ratio=0.5, train=train)
        # can add a F.tanh if we want to restrain it to -1,+1
        return self.l5(h)


def _prepare_samples_CNN(postive_data, negative_data, indicators_groups,
                         augment_factor=1, asymetric_augment=1):
    """Reshape samples to be fed into the CNN."""

    # selected_data.shape[0]*2 #10000
    size_dataset = postive_data.shape[0] * augment_factor * 2
    len_samples = postive_data.shape[1]

    # Trivial negative data
    # fake_data = center(np.cumsum(np.random.randn(size_dataset/2,
    # len_samples),axis=1).astype(np.float32)) #*2 - 1
    # fake_data = np.array([
    #         center(i) for i in np.cumsum(np.random.randn(
    # size_dataset/2, len_samples),axis=1).astype(np.float32)
    #     ])

    # Randomly sampled and normalized negative data
    negative_samples = np.empty(
        (size_dataset / 2 * asymetric_augment, len_samples,
         postive_data.shape[2]), dtype=np.float32)
    for idxi, i in enumerate(negative_samples):
        rand = np.random.randint(0, len(negative_data) - (len_samples))
        c = copy.deepcopy(negative_data[rand:rand + len_samples])
        for i in indicators_groups:
            c[:, i] = _center_01(c[:, i])

        negative_samples[idxi] = c

    if augment_factor > 1:
        positive_samples = np.repeat(postive_data, augment_factor,
                                     axis=0)
    else:
        positive_samples = postive_data

    # Reshape data for convenient use with CNN
    data = np.concatenate(
        (positive_samples, negative_samples)).astype(np.float32)
    labels = np.array([[1]] * len(positive_samples) + [[-1]]
                      * len(negative_samples)).astype(np.int32)
    data = np.swapaxes(data, 1, 2)
    data = np.reshape(data, data.shape + (1,))

    permut = np.random.permutation(len(data))
    data = data[permut]
    labels = labels[permut]
    return data, labels


class _train_CNN(chainer.Chain):
    """Training wrapper: runs the predictor and scores it with MSE."""

    def __init__(self, predictor):
        super(_train_CNN, self).__init__(predictor=predictor)

    def __call__(self, x, t, train=True):
        """Return the mean squared error between `predictor(x)` and `t`.

        The loss is also stored on the instance as `self.loss`.
        """
        prediction = self.predictor(x, train=train)
        loss = F.mean_squared_error(prediction, t)
        self.loss = loss
        return loss


def _eval_net(network, x, y, eval_on=10, batch_size=400, gpu_id=None):
    """Evaluate network performance on a random batch.

    A batch of `batch_size` samples is drawn from `x` / `y` (with repeats
    when fewer samples are available), run through `network` in test
    mode, and the signs of the predictions are compared with the signs
    of the labels.

    Args:
        network: callable model, invoked as `network(examples, train=False)`.
        x (numpy.array): samples.
        y (numpy.array): labels; the sign of the first value of each
            label is used.
        eval_on: ignored, kept for backward compatibility (the rates are
            always computed over `batch_size`).
        batch_size (int): number of samples evaluated.
        gpu_id (int): GPU id, default: `env['GPU_ID']` or 0.

    Returns:
        `(accuracy, precision, recall, f1_score, confusion_matrix)` where
        `confusion_matrix` is `[tp, fp, fn, tn]` as fractions of
        `batch_size`. Precision, recall and F1 are 0 when undefined.

    Raises:
        ZeroDivisionError: if `x` is empty.
    """

    if gpu_id is None:
        gpu_id = int(os.environ.get('GPU_ID', 0))

    N = x.shape[0]

    # Random permutations, permissive for N < batch_size
    # (floor division: the permutation size must be an integer)
    perm = np.random.permutation(max(N, (batch_size // N + 1) * N)) % N
    np.random.shuffle(perm)
    perm = perm[:max(N, batch_size)]

    b = 0 if batch_size >= N else np.random.randint(0, N - batch_size)
    picked = perm[b:b + batch_size]

    _examples = x[picked]
    _labels = y[picked]

    examples = Variable(_examples)
    examples.to_gpu(gpu_id)

    pred = network(examples, train=False)
    pred.to_cpu()

    predicted_sign = np.sign(pred.data[:, 0])
    expected_sign = np.sign(
        np.reshape(_labels, (len(_labels), -1))[:, 0])

    # true/false positive, true/false negative
    positive = predicted_sign > 0
    agree = predicted_sign == expected_sign
    tp = int(np.sum(positive & agree))
    fp = int(np.sum(positive & ~agree))
    tn = int(np.sum(~positive & agree))
    fn = int(np.sum(~positive & ~agree))

    total = float(batch_size)
    accuracy = (tp + tn) / total
    precision = 0 if tp + fp == 0 else tp / float(tp + fp)
    recall = 0 if tp + fn == 0 else tp / float(tp + fn)
    confusion_matrix = [tp / total,
                        fp / total,
                        fn / total,
                        tn / total]
    if precision + recall == 0:
        f1_score = 0
    else:
        # harmonic mean
        f1_score = 2 * ((precision * recall) / (precision + recall))
    return accuracy, precision, recall, f1_score, confusion_matrix


def _gpu_trainer(network, trainer, optimizer,
                 x_train, x_test, y_train, y_test,
                 n_steps=100, batch_size=400,
                 log=False, display=False, print_nth=10, gpu_id=None):
    """Convenience function to train neural network on GPU.

    Train a neural network on GPU, optionally logging and printing the
    progress.


    Args:
        network     (_CNN_1D): neural network.
        trainer  (_train_CNN): network trainer.
        optimizer   (chainer): optimizer.
        x_train (numpy.array): training data.
        x_test  (numpy.array): testing data; must not be empty when
                               `log` is set.
        y_train (numpy.array): training labels.
        y_test  (numpy.array): testing labels.
        n_steps         (int): number of training steps, default 100.
        batch_size      (int): batch size, default 400; also used for
                               the evaluation batches.
        log            (bool): log progress, default False.
        display        (bool): display progress, default False.
        print_nth       (int): number of progress reports over the run,
                               default 10; values <= 0 or above
                               `n_steps` report every step.
        gpu_id          (int): GPU id.

    Returns:
        If `log`, return evaluation results on train and test,
        and the last confusion matrix on train and test:
        `log_train, log_test, cm, cm_`
        (the matrices are `None` when no step was run),
        otherwise:
        `None, None, None, None`.
    """

    if gpu_id is None:
        gpu_id = int(os.environ.get('GPU_ID', 0))

    network.to_gpu(gpu_id)
    trainer.to_gpu(gpu_id)

    N = x_train.shape[0]

    logger = logging.getLogger(__name__)

    # defined up front so the return below works even with n_steps == 0
    cm, cm_ = None, None

    if log:
        if display:
            logger.debug("> Training for %d steps", n_steps)
            logger.debug("> Training samples: %d. Testing samples: %d\n",
                         len(y_train), len(y_test))
            logger.debug("          (Accuracy) train , test   || \
(Precison/Recall) train , test ||  (F1_score) train , test")
            logger.debug("-" * 100)
        if print_nth <= 0 or print_nth > n_steps:
            print_nth = n_steps
        # floor division: the period is used as a range step and modulus
        p_every = max(1, n_steps // max(1, print_nth))
        n_rows = len(range(0, n_steps - 1, p_every)) + 1
        log_train = np.zeros((n_rows, 5))
        log_test = np.zeros((n_rows, 5))
        log_idx = 0

    for step in range(n_steps):
        network.zerograds()

        # Random permutations, permissive for N < batch_size
        perm = np.random.permutation(
            max(N, (batch_size // N + 1) * N)) % N
        np.random.shuffle(perm)
        perm = perm[:max(N, batch_size)]

        b = 0 if batch_size >= N else np.random.randint(0, N - batch_size)
        picked = perm[b:b + batch_size]

        examples = Variable(x_train[picked])
        labels = Variable(y_train[picked].astype(np.float32))
        examples.to_gpu(gpu_id)
        labels.to_gpu(gpu_id)

        optimizer.update(trainer, examples, labels)

        if log and (step % p_every == 0 or step == n_steps - 1):
            lo, pr, re, f1, cm = _eval_net(
                network, x_train, y_train, batch_size=batch_size,
                gpu_id=gpu_id)
            lo_, pr_, re_, f1_, cm_ = _eval_net(
                network, x_test, y_test, batch_size=batch_size,
                gpu_id=gpu_id)
            log_train[log_idx] = np.array([step, lo, pr, re, f1])
            log_test[log_idx] = np.array([step, lo_, pr_, re_, f1_])
            log_idx += 1
            if display:
                logger.debug("Step %s:        %0.2f , %0.2f   ||    \
%0.2f/%0.2f , %0.2f/%0.2f      ||       %0.2f , %0.2f",
                             str(step).ljust(8),
                             lo, lo_, pr, re,
                             pr_, re_, f1, f1_)

    if log:
        return log_train, log_test, cm, cm_
    return None, None, None, None


def _split_train_test(ratio, data, labels):
    """Split between train and test."""
    assert data.shape[0] == labels.shape[0]
    l = int(len(data) * ratio)
    x_train, x_test = data[:l], data[l:]
    y_train, y_test = labels[:l], labels[l:]

    return x_train, x_test, y_train, y_test


def _rolling_window_lastaxis(a, window):
    """Rollout an array on one axis."""
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


def _rolling_window(a, window):
    """Rollout an array on multiple axis."""
    for i, win in enumerate(window):
        if win > 1:
            a = a.swapaxes(i, -1)
            a = _rolling_window_lastaxis(a, win)
            a = a.swapaxes(-2, i)
    return a


def _normalize_range(df, indicators_groups, window_size):
    """Normalize every rolling window of a DataFrame, group by group.

    Each window of `window_size` consecutive rows is rescaled to [0, 1]
    using the minimum and maximum taken over the window and over all the
    columns of one group, so that related columns keep their relative
    positions.

    Only the sizes of the sorted groups are used: the groups are taken to
    be consecutive runs of columns, in that order.
    NOTE(review): groups made of non-adjacent columns would be scaled
    differently here than by index-based normalization -- confirm that
    the column order always satisfies this.

    Args:
        df (pandas.DataFrame): data to normalize.
        indicators_groups (list): lists of column positions.
        window_size (int): number of rows per window.

    Returns:
        Array shaped (windows, columns, window_size, 1). A group that is
        constant over a window maps to zeros instead of NaN.
    """

    data = df.values

    indicators_size = [len(i) for i in sorted(indicators_groups)]
    indicators_loc = np.r_[0, np.cumsum(indicators_size)[:-1]]

    # expand data over a rolling window
    rolled = _rolling_window(data, (window_size, data.shape[1]))
    rolled = np.reshape(
        rolled, (rolled.shape[0], window_size, rolled.shape[-1]))

    # maximum value over each window for each group
    _max = np.maximum.reduceat(np.max(rolled, axis=1), indicators_loc, axis=1)
    r_max = np.repeat(_max, indicators_size, axis=1)

    # minimum value over each window for each group
    _min = np.minimum.reduceat(np.min(rolled, axis=1), indicators_loc, axis=1)
    r_min = np.repeat(_min, indicators_size, axis=1)

    # normalize between 0 and 1
    difference = r_max - r_min
    # A zero range would give 0/0 = NaN; dividing by 1 instead leaves the
    # (all-zero) numerator as the result.
    difference[difference == 0] = 1
    subtracted = rolled - r_min[:, np.newaxis, :]
    divided = subtracted / difference[:, np.newaxis, :]

    # consistent shape with the training samples
    divided = np.swapaxes(divided, 1, 2)
    return np.reshape(divided, divided.shape + (1,))


def _output(cnn, arr, gpu_id=None):
    """Run one batch through the network in test mode.

    Args:
        cnn: callable model, invoked as `cnn(batch, train=False)`.
        arr (numpy.array): batch of samples.
        gpu_id (int): GPU id, default: `env['GPU_ID']` or 0.

    Returns:
        The prediction data, moved back to the CPU.
    """
    device = int(os.environ.get('GPU_ID', 0)) if gpu_id is None else gpu_id

    batch = Variable(arr, volatile=True)
    batch.to_gpu(device)

    prediction = cnn(batch, train=False)
    prediction.to_cpu()
    return prediction.data


def train(*args, **kwargs):
    """Do nothing: accept any arguments, ignore them and return None."""
    return None


def calc_score(algo_id, gpu_id=None):
    """Compute CNN scores.

    Train a CNN network and predict scores.

    Steps: load the algorithm settings, keep the dataset entries whose
    label is `[]` as positive samples, load the DataFrame covering both
    the dataset and the backtest range, build positive and random
    negative training samples, train a fresh `_CNN_1D` for 100 steps and
    run it over every rolling window of the DataFrame.


    Args:
        algo_id         (int): algorithm id.
        gpu_id          (int): GPU id, optional, default: `env['GPU_ID']`.

    Returns:
        A pandas DataFrame indexed by date with a confidence value
        (negative predictions are replaced by 0).
        For example:

        Date                | confidence
        --------------------+-------------
        2015-08-25 07:45:00 | 0.000000
        2015-08-25 07:50:00 | 0.666309
        2015-08-25 07:55:00 | 0.711061

    Example:
        `calc_score(42)`
    """

    logger = logging.getLogger(__name__)

    if gpu_id is None:
        gpu_id = int(os.environ.get('GPU_ID', 0))

    # local parameters
    align = 25            # candles to take before entry point
    gpu_batch_size = 400  # max batch size to match GPU
    seed = 1337           # random seed

    mp, dataset, sc, bt_start, bt_end = _get_algo_data(algo_id)

    # modifies `mp` in place
    _discard_grad_and_normalizers(mp)

    indicators = [str(i["id"]) for i in mp["features"]]
    indicators_groups = _get_indicators_group(mp)

    # negative inputs are not handled yet: only the entries with an empty
    # label are kept
    # _check_negative_inputs(dataset)
    dataset = [i for i in dataset if i['label'] == []]

    # ensure to load everything we will need
    # NOTE(review): raises IndexError when no entry is left, and takes the
    # first / last entries as the earliest / latest ones -- confirm the
    # dataset is ordered by date.
    ds_start = tm.T(dataset[0]["start"]).to_pydatetime()
    ds_end = tm.T(dataset[-1]["end"]).to_pydatetime()
    _df_start = min(ds_start, bt_start)
    _df_end = max(ds_end, bt_end)

    # get the DataFrame corresponding to dataset range plus margins
    # **high time consuming step**
    df = _load_df_range(mp, indicators, start=_df_start, end=_df_end)

    # get positive samples from the dataset
    # NOTE(review): indexing the result of `zip` below only works where
    # `zip` returns a list (Python 2).
    x_and_y = zip(*[_select_normalize_from_df(sample,
                                              df,
                                              align,
                                              indicators_groups,
                                              justify_right=True
                                              ) for sample in dataset])
    # `labels` is overwritten by `_prepare_samples_CNN` further down
    selected_data, labels = np.array(x_and_y[0]), np.array(x_and_y[1])

    # get random negative samples
    # NOTE(review): 'start' is later than 'end' in the range below; with
    # justify_right=False this looks like a way to obtain a normalized
    # `_align`-row window of `df` to draw negatives from -- confirm.
    _align = min(100000, len(df))
    tmp, _ = _select_normalize_from_df({u'end': u'2005/01/01 17:45',
                                        u'start': u'2018/12/31 15:00'},
                                       df, _align, indicators_groups,
                                       justify_right=False)
    data_indicators = np.array(tmp)

    # prepare the data to feed into the neural network
    samples, labels = _prepare_samples_CNN(selected_data, data_indicators,
                                           indicators_groups,
                                           augment_factor=1,
                                           asymetric_augment=1)

    # splitting with a ratio=1, no validation data
    x_train, x_test, y_train, y_test = _split_train_test(1, samples, labels)

    logger.debug("done preparing training data")

    # fixed random seed (set after the random negative sampling above, so
    # it covers network initialization and training batches only)
    with chainer.cuda.get_device(gpu_id):
        chainer.cuda.cupy.random.seed(seed=seed)
    np.random.seed(seed)

    # setup CNN
    cnn = _CNN_1D(input_size=selected_data.shape[1],
                  input_channels=selected_data.shape[2])
    t_cnn = _train_CNN(cnn)
    optimizer = optimizers.Adam()
    optimizer.setup(t_cnn)

    # train CNN on GPU; with log=False the four returned values are None
    log_train, log_test, cm, cm_ = _gpu_trainer(
        cnn, t_cnn, optimizer, x_train, x_test, y_train, y_test,
        n_steps=100, batch_size=gpu_batch_size,
        log=False, display=False, print_nth=25, gpu_id=gpu_id)
    logger.debug("done training CNN")

    # normalize the full range to process
    # **high time consuming step**
    normalized_range = _normalize_range(df, indicators_groups, align)

    # predict over the range, one GPU batch at a time
    cnn_prediction = np.zeros(len(normalized_range))
    for i in range(0, len(normalized_range), gpu_batch_size):
        cnn_prediction[i:i + gpu_batch_size] = _output(
            cnn,
            normalized_range[
                i:i + gpu_batch_size],
            gpu_id=gpu_id)[:, 0]

    logger.debug("done processing range")

    # keep the non-negative predictions, everything else becomes 0
    cnn_prediction = np.where(cnn_prediction >= 0.0,
                              cnn_prediction, np.zeros(len(cnn_prediction)))

    # one prediction per rolling window: align them on the last rows of df
    output = copy.deepcopy(df[-len(cnn_prediction):])
    output["confidence"] = cnn_prediction

    return output[["confidence"]]


def list_suggestions(algo_id, gpu_id=None):
    """Find the most similar points locally.

    Find the most similar points around the points selected in the dataset
    of the algorithm referred to by `algo_id`, using CNN. Call
    `calc_score()` and keep the strict local maxima of the confidence.


    Args:
        algo_id         (int): algorithm id.
        gpu_id          (int): GPU id, optional, default: `env['GPU_ID']`.

    Returns:
        A pandas DataFrame indexed by date ordered by decreasing
        confidence value (equal values keep their date order); empty
        when the scores have no strict local maximum. For example:

        Date                | confidence
        --------------------+-------------
        2015-09-09 07:55:00 | 0.845748
        2015-09-22 10:05:00 | 0.737587
        2015-09-16 09:25:00 | 0.723020

    Example:
        `suggestions = list_suggestions(42)`
    """

    if gpu_id is None:
        gpu_id = int(os.environ.get('GPU_ID', 0))

    # get scores
    output = calc_score(algo_id, gpu_id=gpu_id)
    scores = output["confidence"].values
    if len(scores) == 0:
        return output

    # select local maxima: strictly above both neighbours, the two ends
    # only having one neighbour to beat
    above_previous = np.r_[True, scores[1:] > scores[:-1]]
    above_next = np.r_[scores[:-1] > scores[1:], True]
    maxima = np.where(above_previous & above_next)[0]

    # sort by decreasing score; the stable sort keeps ties in date order
    order = np.argsort(-scores[maxima], kind="mergesort")

    return output.iloc[maxima[order]]



In [4]:
%%time
# `global` at cell (module) level has no effect; `_samples` is not assigned
# in this cell.
global _samples
# Run the whole pipeline (data loading, training, prediction) for algorithm 1.
out = calc_score(1)

CPU times: user 5.94 s, sys: 720 ms, total: 6.66 s
Wall time: 52 s


In [None]:
def _prepare_samples_CNN(postive_data, negative_samples, indicators_groups,
                         augment_factor=1, asymetric_augment=1):
    """Reshape samples to be fed into the CNN."""

    # selected_data.shape[0]*2 #10000
    size_dataset = postive_data.shape[0] * augment_factor * 2
    len_samples = postive_data.shape[1]

    if augment_factor > 1:
        positive_samples = np.repeat(postive_data, augment_factor,
                                     axis=0)
    else:
        positive_samples = postive_data

    # Reshape data for convenient use with CNN
    data = np.concatenate(
        (positive_samples, negative_samples)).astype(np.float32)
    labels = np.array([[1]] * len(positive_samples) + [[-1]]
                      * len(negative_samples)).astype(np.int32)
    data = np.swapaxes(data, 1, 2)
    data = np.reshape(data, data.shape + (1,))

    permut = np.random.permutation(len(data))
    data = data[permut]
    labels = labels[permut]
    return data, labels

In [None]:
# Exploratory cell: rebuild the training set of `calc_score` step by step,
# using explicit negative ranges instead of windows drawn from the whole
# DataFrame.

# `align` is local to `calc_score`; the same value is defined here so the
# cell runs on its own.
align = 25  # candles to take before entry point

mp, dataset, sc, bt_start, bt_end = _get_algo_data(1)

_discard_grad_and_normalizers(mp)

indicators = [str(i["id"]) for i in mp["features"]]
indicators_groups = _get_indicators_group(mp)

# negative inputs are not handled yet
# _check_negative_inputs(dataset)
# NOTE(review): `_check_negative_inputs` compares labels with the list
# ['neg_icon'] while this cell uses the string 'neg_icon' -- confirm which
# form the dataset really stores.
positive_ranges = [i for i in dataset if i['label'] == []]
negative_ranges = [i for i in dataset if i['label'] == 'neg_icon']

# ensure to load everything we will need
ds_start = tm.T(dataset[0]["start"]).to_pydatetime()
ds_end = tm.T(dataset[-1]["end"]).to_pydatetime()
_df_start = min(ds_start, bt_start)
_df_end = max(ds_end, bt_end)

# get the DataFrame corresponding to dataset range plus margins
# **high time consuming step**
df = _load_df_range(mp, indicators, start=_df_start, end=_df_end)

# get positive samples from the dataset
# (`list(...)` so the result can be indexed on every Python version)
x_and_y = list(zip(*[_select_normalize_from_df(sample,
                                               df,
                                               align,
                                               indicators_groups,
                                               justify_right=True
                                               ) for sample in positive_ranges]))
selected_data, labels = np.array(x_and_y[0]), np.array(x_and_y[1])

# top up the negative ranges with randomly chosen dates until there are as
# many negatives as positives
# NOTE(review): nothing prevents a random date from falling inside a
# positive range.
if len(positive_ranges) > len(negative_ranges):
    add_n_samples = len(positive_ranges) - len(negative_ranges)
    # the first `align` rows are skipped so a full window fits before
    # every sampled date
    _samples = df[align:].sample(n=add_n_samples).index.sort_values()

    stamp_format = "%Y/%m/%d %H:%M"
    # `timedelta(align)` is `align` days, not candles; `_process_range`
    # keeps only the last candles of a range longer than the window.
    lookback = datetime.timedelta(align)
    negative_ranges += [
        {u'end': stamp.strftime(stamp_format),
         u'label': 'neg_icon',
         u'start': (stamp - lookback).strftime(stamp_format)}
        for stamp in _samples]

neg = list(zip(*[_select_normalize_from_df(sample,
                                           df,
                                           align,
                                           indicators_groups,
                                           justify_right=True
                                           ) for sample in negative_ranges]))
data_indicators = np.array(neg[0])

# prepare the data to feed into the neural network
samples, labels = _prepare_samples_CNN(selected_data, data_indicators,
                                       indicators_groups,
                                       augment_factor=1,
                                       asymetric_augment=1)