## Imports

In [6]:
%load_ext autoreload
%autoreload 2

import logging
import os

import pandas as pd

import helpers.dbg as dbg
import helpers.printing as prnt

In [9]:
prnt.config_notebook()

# dbg.init_logger(verbosity=logging.DEBUG)
dbg.init_logger(verbosity=logging.INFO)
# dbg.test_logger()
_LOG = logging.getLogger(__name__)

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /root/.local/share/jupyter/runtime/kernel-26b9c9dd-34a1-43ca-8ac4-d0545f037ca0.json'


# Real-time node

## Build pipeline


In [None]:
import core.config as cconfig
import core.dataflow as cdataf
import core.dataflow.real_time as cdrt
import dataflow_amp.returns.pipeline as darp

dag_builder = darp.ReturnsPipeline()
config = dag_builder.get_config_template()

# # Add the source node.
# source_config = cconfig.get_config_from_nested_dict(
#     {
#         "func": cldns.load_single_instrument_data,
#         "func_kwargs": {
#             "start_date": datetime.date(2010, 6, 29),
#             "end_date": datetime.date(2010, 7, 13),
#         },
#     }
# )
# config["load_prices"] = source_config
# config["resample_prices_to_1min", "func_kwargs", "volume_cols"] = ["volume"]
# config["compute_vwap", "func_kwargs", "rule"] = "15T"
# config["compute_vwap", "func_kwargs", "volume_col"] = "volume"

if False:
    from im.kibot.data.config import S3_PREFIX

    ticker = "AAPL"
    file_path = os.path.join(S3_PREFIX, "pq/sp_500_1min", ticker + ".pq")
    source_node_kwargs = {
        "func": cdataf.load_data_from_disk,
        "func_kwargs": {
            "file_path": file_path,
            "start_date": pd.to_datetime("2010-01-04 9:30:00"),
            "end_date": pd.to_datetime("2010-01-04 16:05:00"),
        },
    }
    config["load_prices"] = cconfig.get_config_from_nested_dict(
        source_node_kwargs
    )

else:
    start_date = pd.Timestamp("2010-01-04 09:30:00")
    end_date = pd.Timestamp("2010-01-04 11:30:00")

    source_node_kwargs = {
        "columns": ["close", "vol"],
        "start_date": start_date,
        "end_date": end_date,
    }
    config["load_prices"] = cconfig.get_config_from_nested_dict(
        {
            "source_node_name": "real_time_synthetic",
            "source_node_kwargs": source_node_kwargs,
        }
    )

print(config)

In [None]:
dag = dag_builder.get_dag(config)

In [None]:
if False:
    # nid = "compute_ret_0"
    nid = "load_prices"
    node = dag.get_node("load_prices")
    node.reset_current_time()
    node.set_current_time(pd.to_datetime("2010-01-06 9:30:00"))

    dict_ = dag.run_leq_node(nid, "fit")

    print(dict_)

In [None]:
node = dag.get_node("load_prices")
node.reset_current_time()

for now in cdrt.get_now_time(start_date, end_date):
    print("now=", now)
    execute = cdrt.is_dag_to_execute(now)
    if execute:
        print("Time to execute the DAG")
        node = dag.get_node("load_prices")
        node.set_current_time(now)
        #
        sink = dag.get_unique_sink()
        dict_ = dag.run_leq_node(sink, "fit")
        print(dict_["df_out"].tail(3))

## Use real_time_return_pipeline

In [5]:
import dataflow_amp.real_time.real_time_return_pipeline as dtfart

dag_builder = dtfart.RealTimeReturnPipeline()

config = dag_builder.get_config_template()
print("\n# config=\n%s" % config)

dag_builder.validate_config(config)

dag = dag_builder.get_dag(config)


# config=
load_prices:
  source_node_name: RealTimeDataSource
  source_node_kwargs:
    delay_in_secs: 0.0
    external_clock: <bound method ReplayRealTime.get_replayed_current_time of <core.dataflow.real_time.ReplayRealTime object>>
    data_builder: <function generate_synthetic_data>
    data_builder_kwargs:
      columns: ['close', 'vol']
      start_datetime: 2010-01-04 09:30:00
      end_datetime: 2010-01-04 11:30:00
filter_weekends:
  col_mode: replace_all
filter_ath:
  col_mode: replace_all
  transformer_kwargs:
    start_time: 09:30:00
    end_time: 16:00:00
resample_prices_to_1min:
  func_kwargs:
    rule: 1T
    price_cols: ['close']
    volume_cols: ['vol']
compute_vwap:
  func_kwargs:
    rule: 5T
    price_col: close
    volume_col: vol
    add_bar_start_timestamps: True
    add_epoch: True
    add_last_price: True
compute_ret_0:
  cols: ['twap', 'vwap']
  col_mode: merge_all
  transformer_kwargs:
    mode: pct_change


In [None]:
# print(dag)
cdataf.draw(dag)

In [None]:
# # Align on a even second.
# cdrt.align_on_even_second()
# #
# sleep_interval_in_secs = 1.0
# num_iterations = 3
# get_current_time = rrt.get_replayed_current_time
# need_to_execute = cdrt.execute_every_2_seconds
# #
# execution_trace, results = cdrt.execute_dag_with_real_time_loop(
#     sleep_interval_in_secs,
#     num_iterations,
#     get_current_time,
#     need_to_execute,
#     dag,
# )

In [None]:
results[0][1]["df_out"]

In [None]:
##

import core.dataflow.real_time as cdrt

In [14]:
import helpers.datetime_ as hdatetime

start_datetime = pd.Timestamp("2010-01-04 09:30:00", tz=hdatetime.get_ET_tz())
end_datetime = pd.Timestamp("2010-01-05 09:30:00", tz=hdatetime.get_ET_tz())

# Use a replayed real-time starting at the same time as the data.
rrt = cdrt.ReplayRealTime(start_datetime)
get_current_time = rrt.get_replayed_current_time

initial_replayed_dt=2010-01-04 09:30:00-05:00
Using tz 'America/New_York'
now='2021-07-30 11:15:53.159357-04:00'


In [26]:
import core.dataflow as cdtf

execute_rt_loop_kwargs = {
    "sleep_interval_in_secs": 1.0,
    "num_iterations": 3,
    "get_current_time": rrt.get_replayed_current_time,
    "need_to_execute": cdrt.execute_every_2_seconds,
}

#
kwargs = {
    "config": config,
    "dag_builder": dag_builder,
    "fit_state": None,
    #
    "execute_rt_loop_kwargs": execute_rt_loop_kwargs,
    #
    "dst_dir": None,
}

dag_runner = cdtf.RealTimeDagRunner(**kwargs)

# Align on a even second.
cdrt.align_on_even_second()
result = dag_runner.predict()

Using tz 'America/New_York'
now='2021-07-30 13:09:56.001007-04:00'
event='RealTimeEvent(num_it=1, current_time=Timestamp('2010-01-04 11:24:02.841650-0500', tz='America/New_York'), wall_clock_time=Timestamp('2021-07-30 13:09:56.001580-0400', tz='America/New_York'), need_execute=True)'
  -> execute


run_leq_node:   0%|          | 0/6 [00:00<?, ?it/s]

Using tz 'None'
now='2021-07-30 17:09:56.022644'
Using tz 'America/New_York'
now='2021-07-30 13:09:57.123659-04:00'
event='RealTimeEvent(num_it=2, current_time=Timestamp('2010-01-04 11:24:03.964302-0500', tz='America/New_York'), wall_clock_time=Timestamp('2021-07-30 13:09:57.125097-0400', tz='America/New_York'), need_execute=False)'
Using tz 'America/New_York'
now='2021-07-30 13:09:58.129532-04:00'
event='RealTimeEvent(num_it=3, current_time=Timestamp('2010-01-04 11:24:04.970175-0500', tz='America/New_York'), wall_clock_time=Timestamp('2021-07-30 13:09:58.130575-0400', tz='America/New_York'), need_execute=True)'
  -> execute


run_leq_node:   0%|          | 0/6 [00:00<?, ?it/s]

Using tz 'None'
now='2021-07-30 17:09:58.165454'


In [32]:
result[0]

[RealTimeEvent(num_it=1, current_time=Timestamp('2010-01-04 11:24:02.841650-0500', tz='America/New_York'), wall_clock_time=Timestamp('2021-07-30 13:09:56.001580-0400', tz='America/New_York'), need_execute=True),
 RealTimeEvent(num_it=2, current_time=Timestamp('2010-01-04 11:24:03.964302-0500', tz='America/New_York'), wall_clock_time=Timestamp('2021-07-30 13:09:57.125097-0400', tz='America/New_York'), need_execute=False),
 RealTimeEvent(num_it=3, current_time=Timestamp('2010-01-04 11:24:04.970175-0500', tz='America/New_York'), wall_clock_time=Timestamp('2021-07-30 13:09:58.130575-0400', tz='America/New_York'), need_execute=True)]

In [31]:
len(result[1])

2

In [17]:
import pandas as pd
import helpers.unit_test as hut

num_cols = 2
seed = 42
date_range_kwargs = {
    "start": pd.Timestamp("2010-01-01"),
    "end": pd.Timestamp("2010-02-01"),
    "freq": "1B",
}
#pd.date_range(**date_range_kwargs)
data = hut.get_random_df(num_cols, seed=seed, date_range_kwargs=date_range_kwargs)
print(data)


config = {
        "rule": "1B",
        "agg_func": "last",
        "resample_kwargs": None,
        "agg_func_kwargs": None,
    }
node = cdnt.Reample("resample", **config)
df_out = node.fit(data)["df_out"]

                   0         1
2010-01-01  0.374540  0.950714
2010-01-04  0.731994  0.598658
2010-01-05  0.156019  0.155995
2010-01-06  0.058084  0.866176
2010-01-07  0.601115  0.708073
2010-01-08  0.020584  0.969910
2010-01-11  0.832443  0.212339
2010-01-12  0.181825  0.183405
2010-01-13  0.304242  0.524756
2010-01-14  0.431945  0.291229
2010-01-15  0.611853  0.139494
2010-01-18  0.292145  0.366362
2010-01-19  0.456070  0.785176
2010-01-20  0.199674  0.514234
2010-01-21  0.592415  0.046450
2010-01-22  0.607545  0.170524
2010-01-25  0.065052  0.948886
2010-01-26  0.965632  0.808397
2010-01-27  0.304614  0.097672
2010-01-28  0.684233  0.440152
2010-01-29  0.122038  0.495177
2010-02-01  0.034389  0.909320


NameError: name 'cconfig' is not defined

In [22]:
import core.dataflow.nodes.transformers as cdtfnt

nid = "nop"
def func(df_in):
    return df_in


func_kwargs = {}

node = cdtfnt.FunctionWrapper(nid,
                       func,
                       func_kwargs)

node.fit(data)

{'df_out':                    0         1
 2010-01-01  0.374540  0.950714
 2010-01-04  0.731994  0.598658
 2010-01-05  0.156019  0.155995
 2010-01-06  0.058084  0.866176
 2010-01-07  0.601115  0.708073
 2010-01-08  0.020584  0.969910
 2010-01-11  0.832443  0.212339
 2010-01-12  0.181825  0.183405
 2010-01-13  0.304242  0.524756
 2010-01-14  0.431945  0.291229
 2010-01-15  0.611853  0.139494
 2010-01-18  0.292145  0.366362
 2010-01-19  0.456070  0.785176
 2010-01-20  0.199674  0.514234
 2010-01-21  0.592415  0.046450
 2010-01-22  0.607545  0.170524
 2010-01-25  0.065052  0.948886
 2010-01-26  0.965632  0.808397
 2010-01-27  0.304614  0.097672
 2010-01-28  0.684233  0.440152
 2010-01-29  0.122038  0.495177
 2010-02-01  0.034389  0.909320}

In [32]:
import core.dataflow.nodes.test.test_dag as test_dag

dag_builder = test_dag._NaivePipeline()

config = dag_builder.get_config_template()

dag_builder.get_dag(config)

<core.dataflow.core.DAG at 0x7f9ad9dbd430>