# Back Test and Trade

# Notebook Environment

For a unified research environment, enable the flags below:

In [None]:
# Setup switches for this notebook; both are off by default, so a plain run
# only prints the interpreter version at the end of the cell.
# UPGRADE_PY: re-point the interpreters under /opt/conda/bin at a conda env.
UPGRADE_PY = False
# INSTALL_DEPS: pip-install the pinned dependencies listed below.
INSTALL_DEPS = False
if INSTALL_DEPS:
  # Only the transformers pin is active; the other pins are kept commented
  # out for reference.
  # !pip install -q tensorboard==2.15.2
  # !pip install -q tensorflow[and-cuda]==2.15.1
  # !pip install -q tensorflow==2.15.0
  # !pip install -q tensorflow-io-gcs-filesystem==0.36.0
  # !pip install -q tensorflow-text==2.15.0
  # !pip install -q tf_keras==2.15.1
  # !pip install -q tokenizers==0.15.2
  # !pip install -q torch==2.2.0+cpu
  # !pip install -q torch-xla==2.2.0+libtpu
  # !pip install -q torchdata==0.7.1
  !pip install -q transformers==4.38.2

if UPGRADE_PY:
    # NOTE(review): the env is created as `py311`, but every later command
    # refers to `py312` (while installing python=3.11) — confirm which name
    # is intended before enabling this flag.
    !mamba create -n py311 -y
    !source /opt/conda/bin/activate py312 && mamba install python=3.11 jupyter mamba -y

    # Replace the python3 / python3.10 / python entries in /opt/conda/bin
    # with symlinks to the env's python3.
    !sudo rm /opt/conda/bin/python3
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3
    !sudo rm /opt/conda/bin/python3.10
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3.10
    !sudo rm /opt/conda/bin/python
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python

# Show which interpreter version `python` resolves to.
!python --version

In [None]:
import os
import sys
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Presumably lowers the verbosity of TensorFlow's native logging — confirm the level.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Transformers cannot use keras3
os.environ['TF_USE_LEGACY_KERAS'] = '1'
# Platform flags; both start out False.
IN_KAGGLE = IN_COLAB = False
# NOTE(review): each `!` line runs in its own subshell, so these exports
# presumably never reach the kernel process — confirm, and set them through
# os.environ if they are meant to take effect.
!export CUDA_LAUNCH_BLOCKING=1
!export XLA_FLAGS=--xla_cpu_verbose=0

# Detect the hosting platform (Colab, Kaggle or a plain local run) and choose
# the dataset / model locations accordingly. The flags are initialised here so
# that this section does not depend on earlier cell state.
IN_COLAB = False
IN_KAGGLE = False

try:
    # https://www.tensorflow.org/install/pip#windows-wsl2
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_PATH = "/content/drive/MyDrive/EDT dataset"
    MODEL_PATH = "./models/bert_news"
    IN_COLAB = True
    print('Colab!')
except ImportError:
    # google.colab could not be imported: treated as "not running in Colab",
    # which is the expected outcome everywhere else.
    IN_COLAB = False
except Exception as colab_error:
    # Keep the best-effort fallback to the Kaggle/local paths, but report the
    # failure instead of hiding it. Unlike a bare `except:`, this no longer
    # swallows KeyboardInterrupt / SystemExit.
    print(f'Colab setup failed, falling back to non-Colab paths: {colab_error}')
    IN_COLAB = False

if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ and not IN_COLAB:
    print('Running in Kaggle...')
    # List every file available under the Kaggle input directory.
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    DATA_PATH = "/kaggle/input/uscorpactionnews"
    MODEL_PATH = "/kaggle/input/bert_news/tensorflow2/bert_news/1/bert_news"
    IN_KAGGLE = True
    print('Kaggle!')
elif not IN_COLAB:
    # Neither Colab nor Kaggle: use paths relative to the working directory.
    DATA_PATH = "./data/"
    MODEL_PATH = "./models/bert_news"
    print('Normal!')

# Hugging Face identifier of the base checkpoint (used for the tokenizer).
MODEL_BASE = "google-bert/bert-base-cased"

# Accelerators Configuration

If you have a GPU or TPU, or are running in one of the hosted notebook environments (Colab, Kaggle), configure your setup below:

In [None]:
import numpy as np
import math
import shutil
import pandas as pd

from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import mixed_precision

print(f'Tensorflow version: [{tf.__version__}]')

tf.get_logger().setLevel('INFO')

#tf.config.set_soft_device_placement(True)
#tf.config.experimental.enable_op_determinism()
#tf.random.set_seed(1)

# Choose the distribution strategy: TPU first, then GPU(s), and finally the
# default strategy when no accelerator is found.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as tpu_error:
    # Not an exception, just no TPUs available, GPU is fallback
    # https://www.tensorflow.org/guide/mixed_precision
    print(tpu_error)
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_global_policy(policy)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, False)
            # Expose the first GPU as one virtual device with memory_limit=12288
            # (megabytes according to the TensorFlow docs — confirm for your card).
            tf.config.experimental.set_virtual_device_configuration(
                gpus[0],
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=12288)],
            )
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            strategy = tf.distribute.MirroredStrategy()

            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as gpu_error:
            # Device configuration was rejected (presumably because the runtime
            # was already initialised, e.g. the cell was re-run). Still build a
            # strategy so `strategy` is never left undefined for later cells.
            print(gpu_error)
            strategy = tf.distribute.MirroredStrategy()
        finally:
            print("Running on", len(tf.config.list_physical_devices('GPU')), "GPU(s)")
    else:
        # CPU is final fallback
        strategy = tf.distribute.get_strategy()
        print("Running on CPU")

def is_tpu_strategy(strategy):
    """Tell whether ``strategy`` is an instance of ``tf.distribute.TPUStrategy``."""
    tpu_strategy_type = tf.distribute.TPUStrategy
    return isinstance(strategy, tpu_strategy_type)

# Report how many replicas the selected strategy keeps in sync.
print("Number of accelerators:", strategy.num_replicas_in_sync)
# Bare expression: as the last statement of a notebook cell it displays the
# current working directory.
os.getcwd()

# Back Test

In [None]:
from transformers import BertTokenizerFast

from tensorflow.keras.models import load_model
from tensorflow.keras.models import save_model

from tqdm import tqdm
import json
from dateutil import parser

# Length cap shared by the loaders below: the word limit applied to each
# article and the tokenizer's max_length.
MAX_LEN = 256
# Global batch size: 8 examples for each replica of the distribution strategy.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync # Default 8

def create_dataset(encodings, ner_labels, seq_labels):
    """Build a ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'`` and,
            optionally, ``'token_type_ids'``.
        ner_labels: Array-like of labels emitted under the ``'ner_output'`` key.
        seq_labels: Array-like of labels emitted under the ``'seq_output'`` key.

    Returns:
        A dataset sliced along the first axis whose elements are a feature
        dict and a label dict (keys ``'seq_output'`` and ``'ner_output'``).
    """
    features = {
        'input_ids': np.array(encodings['input_ids']),
        'attention_mask': np.array(encodings['attention_mask']),
    }
    # Only add the segment ids when they are present: a ``None`` entry cannot
    # be sliced into tensors by ``from_tensor_slices``.
    if 'token_type_ids' in encodings:
        features['token_type_ids'] = np.array(encodings['token_type_ids'])

    labels = {
        'seq_output': np.array(seq_labels),
        'ner_output': np.array(ner_labels),
    }
    return tf.data.Dataset.from_tensor_slices((features, labels))

def load_seq_data_from_json(path, max_len=MAX_LEN):
    """Load news items from a JSON file as truncated texts with dummy labels.

    Args:
        path: Path of a JSON file holding a list of items, each with a
            ``'title'`` and a ``'text'`` string.
        max_len: Maximum number of whitespace-separated words kept per item.

    Returns:
        ``(texts, labels)``: each text is the title and body joined by a
        space, re-joined with single spaces and cut to ``max_len`` words;
        ``labels`` holds one placeholder ``0`` per text.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts = []
    for item in tqdm(data):
        words = (item['title'] + " " + item['text']).split()
        texts.append(" ".join(words[:max_len]))
    # No ground truth is read from the file; every item gets label 0.
    labels = [0] * len(texts)
    return texts, labels

def load_and_cache_predict_dataset(DATA_PATH, model = MODEL_BASE):
    """Tokenize the evaluation news and pair it with placeholder labels.

    Args:
        DATA_PATH: Directory that contains ``Trading_benchmark/evaluate_news.json``.
        model: Name or path handed to ``BertTokenizerFast.from_pretrained``.

    Returns:
        ``(encodings, seq_labels, ner_labels)``: the tokenizer output, the
        labels returned by ``load_seq_data_from_json`` and an all-zero array
        with one row of ``MAX_LEN`` entries per text.
    """
    bert_tokenizer = BertTokenizerFast.from_pretrained(model)
    news_file = f'{DATA_PATH}/Trading_benchmark/evaluate_news.json'
    news_texts, seq_labels = load_seq_data_from_json(news_file)
    encodings = bert_tokenizer(
        news_texts,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
    )
    ner_labels = np.zeros((len(news_texts), MAX_LEN))
    return encodings, seq_labels, ner_labels

# Build the prediction input pipeline under the selected distribution strategy.
with strategy.scope():
    predict_encodings, predict_seq_label, predict_ner_label = load_and_cache_predict_dataset(DATA_PATH)
    dataset = create_dataset(predict_encodings, predict_ner_label, predict_seq_label)
    # Batch first, then cache the batched elements and prefetch with an
    # automatically tuned buffer.
    dataset = (dataset.batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE))

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.optimizers import AdamW, Adam
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.losses import Loss, SparseCategoricalCrossentropy, CategoricalFocalCrossentropy, CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import Metric, SparseCategoricalAccuracy, Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, Callback, ReduceLROnPlateau, TerminateOnNaN
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.mixed_precision import LossScaleOptimizer
from tensorflow.keras.utils import register_keras_serializable

from transformers import TFBertModel, BertConfig

# Sentinel label id; it is the default ``null_class`` of the masked losses.
UNK_ID = -100

# Event class names in label-id order; the trailing 'NoEvent' is the
# "nothing detected" class.
_EVENT_NAMES = (
    'Acquisitions',
    'Clinical Trials',
    'Dividend Cut',
    'Dividend Increase',
    'Guidance Change',
    'New Contract',
    'Regular Dividend',
    'Reverse Stock Split',
    'Special Dividend',
    'Stock Repurchase',
    'Stock Split',
    'NoEvent',
)
# Events flagged as not positive; everything else is flagged positive.
_NEGATIVE_EVENTS = frozenset({'Dividend Cut', 'Reverse Stock Split'})

# Label id (as a string) -> event name, and the reverse lookup.
index2event = {str(label_id): name for label_id, name in enumerate(_EVENT_NAMES)}
event2index = {name: label_id for label_id, name in index2event.items()}
# Number of real event classes, i.e. everything except 'NoEvent'.
NUM_EVENTS = len(event2index) - 1
NOEVENT_ID = int(event2index['NoEvent'])
# Event name -> whether the event is treated as positive; covers every real
# event plus the extra 'Sentiment' key.
IS_POSITIVE = {
    name: name not in _NEGATIVE_EVENTS
    for name in _EVENT_NAMES[:NUM_EVENTS] + ('Sentiment',)
}

# https://www.tensorflow.org/api_docs/python/tf/keras/Metric
@register_keras_serializable(package='Custom', name='MultilabelBinaryAccuracy')
class MultilabelBinaryAccuracy(Metric):
    """Element-wise accuracy for multi-label predictions thresholded at 0.5.

    Each element of ``y_pred`` above 0.5 counts as a predicted 1, anything
    else as 0. Targets outside ``[0, labels_len)`` are replaced by 0 before
    the comparison. The metric is the (optionally weighted) number of
    matching elements divided by the total number of target elements.
    """

    def __init__(self, name='multilabel_binary_accuracy', labels_len=11, **kwargs):
        """Create the metric.

        Args:
            name: Metric name.
            labels_len: Exclusive upper bound of target values that are kept.
            **kwargs: Forwarded to ``Metric.__init__``.
        """
        super().__init__(name=name, **kwargs)
        self.correct_predictions = self.add_weight(name='correct', initializer='zeros')
        self.total_predictions = self.add_weight(name='total', initializer='zeros')
        self.labels_len = labels_len

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate matches between thresholded predictions and targets."""
        # Targets outside [0, labels_len) are zeroed rather than skipped.
        mask = tf.logical_and(tf.greater_equal(y_true, 0), tf.less(y_true, self.labels_len))
        y_true_masked = tf.where(mask, y_true, tf.zeros_like(y_true))
        y_true_masked = tf.cast(y_true_masked, tf.float32)

        y_pred = tf.cast(tf.greater(y_pred, 0.5), tf.float32)
        matches = tf.cast(tf.equal(y_true_masked, y_pred), tf.float32)

        if sample_weight is not None:
            matches = tf.multiply(matches, tf.cast(sample_weight, tf.float32))

        self.correct_predictions.assign_add(tf.reduce_sum(matches))
        # The denominator counts every target element, weighted or not.
        self.total_predictions.assign_add(tf.cast(tf.size(y_true), tf.float32))

    def reset_state(self):
        """Zero both accumulators."""
        self.correct_predictions.assign(0.)
        self.total_predictions.assign(0.)

    def reset_states(self):
        """Alias of ``reset_state`` kept for callers using the older name."""
        self.reset_state()

    def result(self):
        """Return the accuracy so far, or 0 when nothing was accumulated."""
        # divide_no_nan avoids a NaN result before the first update.
        return tf.math.divide_no_nan(
            tf.cast(self.correct_predictions, tf.float32),
            tf.cast(self.total_predictions, tf.float32),
        )

    def get_config(self):
        """Return the config, including ``labels_len``, for serialization."""
        config = super().get_config()
        config['labels_len'] = self.labels_len
        return config

@register_keras_serializable(package='Custom', name='MaskedWeightedMultiClassBCE')
class MaskedWeightedMultiClassBCE(Loss):
    """Binary cross-entropy averaged over all per-sample losses.

    ``class_weight``, ``labels_len``, ``null_class`` and ``focal_gamma`` are
    stored on the instance, but ``call`` does not use them.
    """

    def __init__(self,
                 from_logits=False,
                 name='masked_weighted_multi_bce',
                 class_weight=None,
                 labels_len=MAX_LEN,
                 null_class=UNK_ID,
                 focal_gamma=None, **kwargs):
        """Store the loss settings.

        Args:
            from_logits: Forwarded to ``BinaryCrossentropy``.
            name: Loss name.
            class_weight: Optional mapping; its values, ordered by sorted key,
                are kept as a float32 tensor.
            labels_len: Stored as given.
            null_class: Stored as a float32 tensor.
            focal_gamma: Stored as given.
            **kwargs: Forwarded to ``Loss.__init__``.
        """
        super().__init__(name=name, **kwargs)
        self.from_logits = from_logits
        self.labels_len = labels_len
        self.focal_gamma = focal_gamma
        self.null_class = tf.cast(null_class, tf.float32)
        if class_weight is None:
            self.class_weight = None
        else:
            ordered_weights = [class_weight[key] for key in sorted(class_weight)]
            self.class_weight = tf.convert_to_tensor(ordered_weights, dtype=tf.dtypes.float32)

    def call(self, y_true, y_pred):
        """Return the mean binary cross-entropy as a float32 scalar."""
        targets = tf.cast(y_true, tf.float32)
        # Ask Keras for unreduced losses, then average them explicitly.
        bce = BinaryCrossentropy(from_logits=self.from_logits,
                                 reduction=tf.keras.losses.Reduction.NONE)
        per_sample_loss = bce(targets, y_pred)
        mean_loss = tf.reduce_mean(per_sample_loss)
        return tf.cast(mean_loss, tf.float32)

# https://www.tensorflow.org/api_docs/python/tf/keras/losses/CategoricalFocalCrossentropy
@register_keras_serializable(package='Custom', name='MaskedWeightedSCCE')
class MaskedWeightedSCCE(Loss):
    """Sparse categorical cross-entropy with label masking, optional class
    weights and optional focal modulation.

    Labels outside ``[0, labels_len - 1)`` are masked: they are replaced by 0
    for the loss computation and their loss is zeroed afterwards. The result
    is the summed loss divided by the number of unmasked labels.
    """

    def __init__(self,
                 from_logits=False,
                 name='masked_weighted_scce',
                 class_weight=None,
                 labels_len=MAX_LEN,
                 null_class=UNK_ID,
                 focal_gamma=None,
                 **kwargs):
        """Store the loss settings.

        Args:
            from_logits: Forwarded to ``SparseCategoricalCrossentropy``.
            name: Loss name.
            class_weight: Optional mapping; its values, ordered by sorted key,
                are kept as a float32 tensor and gathered by label id.
            labels_len: Upper bound used by the label mask in ``call``.
            null_class: Stored as a float32 tensor; not read by ``call``.
            focal_gamma: Exponent of the focal modulation; ``None`` disables it.
            **kwargs: Forwarded to ``Loss.__init__``.
        """
        # NOTE(review): labels_len defaults to MAX_LEN — confirm this is the
        # intended bound for label ids.
        super().__init__(name=name, **kwargs)
        self.from_logits = from_logits
        self.null_class = tf.cast(null_class, tf.float32)
        self.class_weight = None
        self.labels_len = labels_len
        if class_weight is not None:
            class_weights_list = [class_weight[i] for i in sorted(class_weight)]
            self.class_weight = tf.convert_to_tensor(class_weights_list, dtype=tf.dtypes.float32)
        self.focal_gamma = focal_gamma

    def call(self, y_true, y_pred):
        """Return the masked (and optionally focal / class-weighted) loss as a scalar."""
        y_true = tf.cast(y_true, tf.float32)
        # Keep labels in [0, labels_len - 1); everything else is masked out.
        mask = tf.logical_and(tf.greater_equal(y_true, 0), tf.less(y_true, self.labels_len - 1))
        # Masked positions get label 0 so the cross-entropy can be evaluated;
        # their contribution is removed again further down.
        y_true_masked = tf.where(mask, y_true, tf.zeros_like(y_true))
        y_true_masked = tf.cast(y_true_masked, tf.float32)
        # The sanity checks in this method only run in eager mode.
        if tf.executing_eagerly():
            tf.debugging.assert_greater(tf.reduce_sum(tf.cast(mask, tf.int32)),
                                        0, message="All data are masked!")

        # https://www.tensorflow.org/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy
        loss_fn = SparseCategoricalCrossentropy(from_logits=self.from_logits, reduction=tf.keras.losses.Reduction.NONE)

        if self.focal_gamma is not None:
            # inspired by: https://github.com/artemmavrin/focal-loss/blob/master/src/focal_loss/_categorical_focal_loss.py
            loss = loss_fn(y_true_masked, y_pred)
            # NOTE(review): softmax is applied to y_pred here regardless of
            # from_logits — confirm y_pred is always logits on this path.
            y_pred = tf.clip_by_value(y_pred, clip_value_min=-100., clip_value_max=100.)
            proba = tf.nn.softmax(y_pred)
            y_true_rank = y_true_masked.shape.rank

            # Probability assigned to the (masked) true class of each element.
            p_t = tf.gather(proba, tf.cast(y_true_masked, tf.int32),
                            axis=-1, batch_dims=y_true_rank)
            # p_t is clipped to [0.01, 0.99] before the modulation is computed.
            focal_modulation = tf.cast((1. - tf.clip_by_value(p_t, 0.01, 0.99)) ** self.focal_gamma, tf.float32)

            loss *= focal_modulation
            if self.class_weight is not None:
                loss *= tf.gather(self.class_weight, tf.cast(y_true_masked, tf.int32))
            if tf.executing_eagerly():
                tf.debugging.assert_all_finite(focal_modulation, "Focal contains NaN or Inf")
        else:
          # We remove weights from focal loss as we zero the UNK class (ln(0)).
          loss = loss_fn(y_true_masked, y_pred,
                         sample_weight=tf.gather(self.class_weight,
                                                 tf.cast(y_true_masked, tf.int32)) if self.class_weight is not None
                                                 else None)
        loss = tf.cast(loss, tf.float32)
        # Zero the loss of every masked position.
        loss *=  tf.cast(mask, tf.float32)
        # Avoid div by 0.
        sum_mask = tf.reduce_sum(tf.cast(mask, tf.float32))
        if tf.executing_eagerly():
            tf.debugging.assert_positive(sum_mask, message="sum_mask zeroed.")
        # Average over the unmasked positions only; 0 when there are none.
        loss = (tf.reduce_sum(loss) / sum_mask
                      if sum_mask > 0.
                      else tf.constant(0., dtype=tf.float32))
        if tf.executing_eagerly():
            tf.debugging.assert_positive(loss, message="Loss masked to zero.")

        return loss


In [None]:
def get_positive_for_event(pred_dir, NER=False, SEQ=False, max_seq_len=256, seq_threshold=0,
                           ignore_event_list=('Regular Dividend',)):
    """Collect, per event type, the prediction indices that signal that event.

    Args:
        pred_dir: Directory containing ``ner_pred.npy`` and/or ``seq_pred.npy``.
        NER: Use the token-level predictions (reshaped to rows of
            ``max_seq_len`` values, first column dropped).
        SEQ: Use the sequence-level predictions (first row dropped); a class
            is positive when its score exceeds ``seq_threshold``.
        max_seq_len: Row width used when reshaping the NER predictions.
        seq_threshold: Score threshold for the sequence-level predictions.
        ignore_event_list: Event names that never produce a signal.

    Returns:
        Dict mapping each event name (``NoEvent`` excluded) to the list of
        prediction indices flagged for it.
    """
    print('Finding trading signals for events with NER={}, SEQ={}, MAX_SEQUENCE_LEN={}, seq_threshold={}'.format(NER, SEQ, max_seq_len, seq_threshold))
    count = 0
    ignore_list = [int(event2index[event]) for event in (ignore_event_list or ())]
    all_positive = {index2event[str(label)]: [] for label in range(NUM_EVENTS)}

    if NER:
        ner_preds = np.load(os.path.join(pred_dir, 'ner_pred.npy'))
        ner_preds = ner_preds.reshape([-1, max_seq_len])
        # Drop the first position of every sequence.
        ner_preds = ner_preds[:, 1:]
    if SEQ:
        seq_preds = np.load(os.path.join(pred_dir, 'seq_pred.npy'))
        # NOTE(review): this drops the first *row*; presumably the saved array
        # starts with a placeholder row — confirm, otherwise the indices are
        # shifted by one relative to the news list.
        seq_preds = seq_preds[1:, :]

    if NER:
        for index, pred in enumerate(ner_preds):
            pred[pred == -100] = NOEVENT_ID
            tags = set(pred)
            if SEQ:
                tags |= set(np.where(seq_preds[index] > seq_threshold)[0])

            # Remove the "nothing found" tag if present. discard() does not
            # raise when no position was tagged NoEvent, and a prediction
            # whose only tag is a real event still yields a signal.
            tags.discard(NOEVENT_ID)
            for tag in tags:
                tag = int(tag)
                if tag not in ignore_list:
                    all_positive[index2event[str(tag)]].append(index)
                    count += 1
    elif SEQ:
        for index, pred in enumerate(seq_preds):
            pos_label = set(np.where(pred > seq_threshold)[0])
            # Skip rows with no positive class or where NoEvent itself fires.
            if not pos_label or NOEVENT_ID in pos_label:
                continue
            for pos in pos_label:
                if pos not in ignore_list:
                    all_positive[index2event[str(pos)]].append(index)
                    count += 1

    print('Find {} trading signals with events'.format(count))
    return all_positive

def load_evaluation_news(data_dir):
    """Load the evaluation news from a JSON file.

    Args:
        data_dir: Path of the JSON *file* to read (despite the name, it is
            opened directly and must not be a directory).

    Returns:
        The decoded JSON content.
    """
    print("Loading data from {}".format(data_dir))
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(data_dir, "r", encoding="utf-8") as f:
        evaluation_news = json.load(f)
    return evaluation_news

def _initialize_dicts_for_data_storage(event_list):
    """Create the empty nested result store used by the backtest.

    The layout is ``results[start_type][policy][period][event]`` with
    ``start_type`` in open/close, ``policy`` in end/best, ``period`` in
    '1'/'2'/'3' and ``event`` ranging over ``event_list`` plus ``'All'``.
    Each leaf holds nine numeric metrics set to 0 and two empty dicts
    (``win_index`` and ``loss_index``).
    """
    metric_names = ('big_win_count', 'win_count', 'loss_count', 'total_count', 'win_rate', 'win_change_rate',
                    'loss_change_rate', 'total_change_rate', 'big_win_rate')
    index_names = ('win_index', 'loss_index')

    def _empty_stats():
        # Every leaf needs its own dict objects, so build a fresh one each time.
        stats = dict.fromkeys(metric_names, 0)
        for index_name in index_names:
            stats[index_name] = {}
        return stats

    event_names = [*event_list, 'All']
    return {
        start_type: {
            policy: {
                period: {event: _empty_stats() for event in event_names}
                for period in ('1', '2', '3')
            }
            for policy in ('end', 'best')
        }
        for start_type in ('open', 'close')
    }

def _update_backtest_results_with_change_rate(index, change_rate, result_dict):
    """Fold one trade outcome into ``result_dict`` in place.

    A change rate of at least 0 counts as a win (at least 0.01 also as a big
    win), anything else as a loss. The counters and summed change rates are
    incremented, and the rate is recorded under ``index`` in ``win_index``
    or ``loss_index``.
    """
    is_win = change_rate >= 0
    is_loss = change_rate < 0
    # Boolean flags are added / multiplied as 0 or 1.
    increments = (
        ('big_win_count', change_rate >= 0.01),
        ('win_count', is_win),
        ('loss_count', is_loss),
        ('total_count', 1),
        ('win_change_rate', is_win * change_rate),
        ('loss_change_rate', is_loss * change_rate),
        ('total_change_rate', change_rate),
    )
    for metric, amount in increments:
        result_dict[metric] += amount

    outcome_key = 'win_index' if is_win else 'loss_index'
    result_dict[outcome_key][index] = change_rate


def backtest(all_positive, evaluation_news, save_dir, buy_pub_same_time=False, stoploss=0.0):
    """Backtest the trading signals and save the aggregated statistics.

    For every signal the change rate is computed from the open and from the
    close start price, over 1, 2 and 3 days, under two policies: ``'end'``
    (price at the end of the period) and ``'best'`` (highest price for
    positive events, lowest price for the others). Events that are not
    positive are evaluated as trades that profit from a falling price.

    Args:
        all_positive: Mapping of event name to the indices (into
            ``evaluation_news``) flagged for that event.
        evaluation_news: Indexable collection of news items; each item
            provides ``'labels'`` (start / end / highest / lowest prices and
            ``'start_time'``) and ``'pub_time'``.
        save_dir: Directory that receives ``backtest_results.json``; created
            when it does not exist.
        buy_pub_same_time: Skip signals whose start time differs from the
            publication time.
        stoploss: When non-zero, an ``'end'`` change rate is replaced by
            ``-stoploss`` if the worst move within the period went below
            ``-stoploss``. ``'best'`` rates are not affected.

    Returns:
        The nested results dict (also written to disk as JSON).
    """
    print("Perform backtesting with buy_pub_same_time={}, stoploss={}".format(buy_pub_same_time, stoploss))
    start_types = ('open', 'close')
    policies = ('end', 'best')
    periods = ('1', '2', '3')
    summed_metrics = ('big_win_count', 'win_count', 'loss_count', 'total_count', 'win_rate', 'win_change_rate',
                      'loss_change_rate', 'total_change_rate', 'big_win_rate')

    event_list = all_positive.keys()
    results = _initialize_dicts_for_data_storage(event_list)

    for event in event_list:
        positive = IS_POSITIVE[event]

        for index in all_positive[event]:
            item = evaluation_news[index]
            labels = item['labels']
            if len(labels) <= 1:
                continue

            # Skip the signal if the stock buy time is different from the
            # article publish time. On the one hand, all the news articles
            # that are not published in the market hours are ignored. On the
            # other hand, since there are missing values in our historical
            # stock data, some market hour signals whose historical data are
            # incomplete are also ignored.
            if buy_pub_same_time and parser.parse(labels['start_time']) != parser.parse(item['pub_time']):
                continue

            start_prices = {
                'open': labels['start_price_open'],
                'close': labels['start_price_close'],
            }

            for period in periods:
                end_key = 'end_price_{}day'.format(period)
                high_key = 'highest_price_{}day'.format(period)
                low_key = 'lowest_price_{}day'.format(period)
                # The favourable extreme is the high for positive events and
                # the low otherwise; the adverse extreme is the opposite one.
                best_key, worst_key = (high_key, low_key) if positive else (low_key, high_key)

                for start_type in start_types:
                    start_price = start_prices[start_type]
                    if positive:
                        end_rate = (labels[end_key] - start_price) / start_price
                        best_rate = (labels[best_key] - start_price) / start_price
                    else:
                        end_rate = (start_price - labels[end_key]) / start_price
                        best_rate = (start_price - labels[best_key]) / start_price

                    if stoploss:
                        if positive:
                            max_loss = (labels[worst_key] - start_price) / start_price
                        else:
                            max_loss = (start_price - labels[worst_key]) / start_price
                        # The stop was hit at some point during the period.
                        if max_loss < -stoploss:
                            end_rate = -stoploss

                    _update_backtest_results_with_change_rate(
                        index, end_rate, results[start_type]['end'][period][event])
                    _update_backtest_results_with_change_rate(
                        index, best_rate, results[start_type]['best'][period][event])

    for start_type in start_types:
        for policy in policies:
            for period in periods:
                per_event = results[start_type][policy][period]

                # Accumulate the per-event sums into the 'All' entry.
                overall = per_event['All']
                for event in event_list:
                    for metric in summed_metrics:
                        overall[metric] += per_event[event][metric]
                    # win_index / loss_index are not merged into 'All'.

                # Turn the accumulated sums into rates and averages.
                for event in list(event_list) + ['All']:
                    stats = per_event[event]
                    total = max(1, stats['total_count'])
                    stats['big_win_rate'] = stats['big_win_count'] / total
                    stats['win_rate'] = stats['win_count'] / total
                    stats['win_change_rate'] = stats['win_change_rate'] / max(1, stats['win_count'])
                    stats['loss_change_rate'] = stats['loss_change_rate'] / max(1, stats['loss_count'])
                    stats['total_change_rate'] = stats['total_change_rate'] / total

    for policy in policies:
        for period in periods:
            print(results['open'][policy][period]['All'])
    for event in event_list:
        one_day_open = results['open']['end']['1'][event]
        print("{}: {} {}".format(event, one_day_open['total_change_rate'], one_day_open['total_count']))

    # calculate earnings and save them in "results"
    # NOTE(review): sequential_backtest is not defined in the visible part of
    # this notebook — confirm it is provided before this function is called.
    all_earnings = sequential_backtest(results, event_list, evaluation_news)

    for start_type in start_types:
        for period in periods:
            results[start_type]['end'][period]['All']['earning'] = all_earnings[start_type]['end'][period]

    # save the backtest results
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    save_dir = os.path.join(save_dir, "backtest_results.json")

    print("Saving backtesting results in {}".format(save_dir))

    with open(save_dir, "w") as f:
        json.dump(results, f)

    return results

In [None]:
# Load the trained model, predict on the evaluation news, store the raw
# predictions and backtest the trading signals derived from them.
with strategy.scope():
    loaded_model = load_model(MODEL_PATH)
    loaded_model.summary()
    y1 = loaded_model.predict(dataset)
    print(f"NER labels shape: {y1[0].shape}")
    print(f"Sequence labels shape: {y1[1].shape}")

    source_path = './data/predict'
    # np.save does not create missing directories.
    os.makedirs(source_path, exist_ok=True)
    np.save(os.path.join(source_path, 'ner_pred.npy'), y1[0])
    np.save(os.path.join(source_path, 'seq_pred.npy'), y1[1])

    all_positive = get_positive_for_event(pred_dir=source_path, SEQ=True, NER=True, seq_threshold=5)

    # load_evaluation_news opens its argument as a file, so pass the news
    # JSON itself (the same file the prediction dataset was built from)
    # rather than the DATA_PATH directory.
    evaluation_news = load_evaluation_news(f'{DATA_PATH}/Trading_benchmark/evaluate_news.json')
    _ = backtest(all_positive, evaluation_news, save_dir="./backtest", buy_pub_same_time=True, stoploss=0.2)