In [1]:
import os
import sys
sys.path.append(os.getcwd())

import os
import pandas as pd
from mom_trans.model_inputs import ModelFeatures
from data.symbols import SYMBOLS, ASSET_CLASS_DICT

In [2]:
features_file_path = "data/quandl_cpd_nonelbw.csv"
raw_data = pd.read_csv(features_file_path, index_col=0, parse_dates=True)
raw_data["date"] = raw_data["date"].astype("datetime64[ns]")

In [4]:
raw_data = raw_data[raw_data['symbol']=='SPY']

In [7]:
train_interval = (2010, 2022, 2025)
changepoint_lbws = None 
asset_class_dictionary = ASSET_CLASS_DICT

total_time_steps = 63
split_tickers_individually = True

train_valid_ratio = 0.9
# Add feature "static_ticker". Not sure how this is used!
add_ticker_as_static = False # True for TFT
# Add features: [days_from_start, day_of_week, day_of_month, week_of_year]
time_features = False
force_output_sharpe_length = None

model_features = ModelFeatures(
    raw_data,
    total_time_steps=total_time_steps,
    start_boundary=train_interval[0],
    test_boundary=train_interval[1],
    test_end=train_interval[2],
    changepoint_lbws=changepoint_lbws,
    split_tickers_individually=split_tickers_individually,
    train_valid_ratio=train_valid_ratio,
    add_ticker_as_static=add_ticker_as_static,
    time_features=time_features,
    lags=force_output_sharpe_length,
    asset_class_dictionary=asset_class_dictionary,
)

print(f"train input shape: {model_features.train['inputs'].shape}")
print(f"train output shape: {model_features.train['outputs'].shape}")

def start_date(series, ticker='QQQ'):
    ticker_dates = series['date'][series['identifier'] == ticker]
    return ticker_dates[ticker_dates != ''].min()

def end_date(series, ticker='QQQ'):
    ticker_dates = series['date'][series['identifier'] == ticker]
    return ticker_dates[ticker_dates != ''].max()


print(f"train start: {start_date(model_features.train)}")
print(f"train end: {end_date(model_features.train)}")
print()

print(f"valid input shape: {model_features.valid['inputs'].shape}")
print(f"valid output shape: {model_features.valid['outputs'].shape}")
print(f"valid start: {start_date(model_features.valid)}")
print(f"valid end: {end_date(model_features.valid)}")
print()

print(f"test_fixed input shape: {model_features.test_fixed['inputs'].shape}")
print(f"test_fixed output shape: {model_features.test_fixed['outputs'].shape}")
print(f"test_fixed start: {start_date(model_features.test_fixed)}")
print(f"test_fixed end: {end_date(model_features.test_fixed)}")
print()

print(f"test_sliding input shape: {model_features.test_sliding['inputs'].shape}")
print(f"test_sliding output shape: {model_features.test_sliding['outputs'].shape}")
print(f"test_sliding start: {start_date(model_features.test_sliding)}")
print(f"test_sliding end: {end_date(model_features.test_sliding)}")


train input shape: (40, 63, 8)
train output shape: (40, 63, 1)


ValueError: zero-size array to reduction operation minimum which has no identity

The `ModelFeatures` constructor creates the following dictionaries from the input DataFrame:
  * train
  * valid
  * test_fixed
  * test_sliding

Each dictionary has the following entries:
  * 'identifier': (None, 63, 1), object
  * 'date': (None, 63, 1), object
  * 'inputs': (None, 63, 8), float64
  * 'outputs': (None, 63, 1), float64
  * 'active_entries': (None, 63), float64

All values are Numpy arrays.
Values can be inactive because the input timeseries is not long enough to fill the last input sample for a ticker.
The 'identifier' and 'date' values are actually Python strings.

The constructor also fits scalers for categorical/real-values input features and target values. The scalers are stored in:
  * model_features._real_scalers: sklearn.preprocessing.StandardScaler
  * model_features._cat_scalers: sklearn.preprocessing.LabelEncoder
  * model_featues._target_scaler: sklearn.preprocessing.StandardScaler

The mapping between the original DataFrame columns and the real/categorical/target categories is defined in the schema returned by `model_features.get_column_definition()`.

Note:
  * The `ModelFeatures._target_scaler()` is only called in `ModelFeatures.format_predictions()` but this method is never called.
  * The `ModelFeatures._real_scaler()` is only called if `ModelFeatures.transform_real_inputs` is true. The default value for this flag is false and the value is never set to anything else.

## Property `input_params`
`ModelFeatures.input_params` returns metadata that is requires for building the Keras model.

In [None]:
model_features.input_params

In [11]:
model_features.train['outputs'][0]

array([[-0.01179294],
       [ 0.00302856],
       [-0.00570757],
       [ 0.02178671],
       [ 0.01233551],
       [ 0.00257654],
       [-0.00102902],
       [ 0.00056333],
       [ 0.00367964],
       [ 0.00397272],
       [ 0.00602909],
       [ 0.00067476],
       [ 0.00094263],
       [-0.00496152],
       [ 0.0063477 ],
       [-0.00461521],
       [ 0.00269816],
       [ 0.00720565],
       [ 0.00357365],
       [-0.00167098],
       [ 0.00047152],
       [ 0.00172463],
       [ 0.00087552],
       [-0.00197658],
       [ 0.00030112],
       [ 0.01325998],
       [-0.00070138],
       [ 0.00671803],
       [-0.00256364],
       [-0.00260078],
       [-0.00168885],
       [ 0.00482397],
       [ 0.01247057],
       [-0.00224919],
       [ 0.01010508],
       [ 0.00238548],
       [-0.01397839],
       [-0.00183067],
       [ 0.00317129],
       [ 0.00809627],
       [ 0.00077925],
       [ 0.00565535],
       [ 0.00365754],
       [-0.0263066 ],
       [ 0.01007314],
       [ 0

In [16]:
#for index, row in raw_data.iterrows():
#    print(row['daily_returns'], row['target_returns'])
raw_data['daily_returns']

Date
2010-11-24    0.014774
2010-11-26   -0.011647
2010-11-29    0.003030
2010-11-30   -0.005623
2010-12-01    0.021268
                ...   
2024-11-08    0.004332
2024-11-11    0.000953
2024-11-12   -0.003106
2024-11-13    0.000486
2024-11-14   -0.006430
Name: daily_returns, Length: 3517, dtype: float64

In [20]:
for a,b in zip(raw_data['daily_returns'], raw_data['target_returns'].shift(1)):
    print(a,b)

0.0147741666955438 nan
-0.0116472040934177 -0.0117929384883811
0.0030303080900992 0.0030285606668284
-0.0056227406659009 -0.005707565454859
0.0212676539614613 0.0217867120476611
0.0128088207173018 0.0123355053463069
0.0026925737404401 0.0025765422230269
-0.0010578342750722 -0.0010290226967503
0.0005702158162765 0.0005633306670484
0.0036635751977038 0.0036796409267889
0.0038936028689693 0.0039727209422377
0.0058177214631152 0.0060290863773814
0.0006426269240271 0.0006747565337195
0.000883113459438 0.0009426253187274
-0.0045720678847832 -0.0049615175042496
0.0058017826716836 0.006347701206647
-0.0041659721546046 -0.0046152080201109
0.0024134788012706 0.0026981633382609
0.0063402963499348 0.0072056529380877
0.0031102910243783 0.0035736539893162
-0.001431072562591 -0.0016709820229313
0.0003981134742458 0.0004715242779317
0.0014325531475518 0.0017246261320704
0.0007152216210841 0.000875515238627
-0.0015882858255275 -0.0019765759423346
0.0002386158050078 0.0003011181505656
0.0103379964354497