In [1]:
import os

# The cells below use paths relative to the working directory (e.g. 'cached_data/...'),
# so switch to the project checkout first. Set the PROJECT_BUFFALO_ROOT environment
# variable to run on another machine; the original author's checkout path stays as the
# fallback so existing behavior is unchanged when the variable is not set.
os.chdir(os.environ.get('PROJECT_BUFFALO_ROOT', r'C:\Users\carlo\GitHub\ProjectBuffalo'))

import buffalo.ingestion as ingestion
import buffalo.predictor as predictor
import buffalo.algorithm as algorithm
import buffalo.predictor.models as modeling
import torch
import pickle
import pandas as pd
from buffalo.utility import expand_grid, do_call_for_each_group
from tqdm.auto import tqdm

# Fix the torch RNG seed for the model runs below.
torch.manual_seed(0)

<torch._C.Generator at 0x2580db88f10>

In [2]:
# Build the ingestion object, configured with the ADVANTAGE member of the API enum.
ingestor = ingestion.DataIngestion(ingestion.enum.API.ADVANTAGE)

In [3]:
# Load data from the SQLite file under cached_data/ (path is relative to the working directory).
ingestor.load_data(r'cached_data/ingestion.sqlite')

In [4]:
# Stock table stored under the 'ADJUSTED_DAILY_STOCK' key; later cells filter it by its 'symbol' column.
target_stock = ingestor.data['ADJUSTED_DAILY_STOCK']

In [5]:
# Quarterly fundamentals: keep the rows with freq == "quarterly", drop the metadata
# columns, then discard every column that is entirely NaN.
# NOTE(review): these three frames are not referenced again in the cells visible here.
target_income_statement = (
    ingestor.data['COMPANY_INCOME_STATEMENT']
    .query('freq == "quarterly"')
    .drop(columns=['reported_currency', 'symbol', 'freq', 'function'])
    .dropna(axis=1, how='all')
)
target_balance_sheet = (
    ingestor.data['COMPANY_BALANCE_SHEET']
    .query('freq == "quarterly"')
    .drop(columns=['reported_currency', 'symbol', 'freq', 'function'])
    .dropna(axis=1, how='all')
)
# The cash-flow table additionally drops its 'net_income' column.
target_cash_flow = (
    ingestor.data['COMPANY_CASH_FLOW']
    .query('freq == "quarterly"')
    .drop(columns=['reported_currency', 'symbol', 'freq', 'function', 'net_income'])
    .dropna(axis=1, how='all')
)

In [6]:
# Macro-economic series: take the 'value' column of each table and give it a descriptive
# name so the columns stay distinguishable once they are merged into one frame.
fed_funds_rate = (
    ingestor.data['FEDERAL_FUNDS_RATE'][['value']]
    .rename(columns={'value': 'effective_federal_funds_rate'})
    .dropna(axis=1, how='all')
)
payroll = (
    ingestor.data['NONFARM_PAYROLL'][['value']]
    .rename(columns={'value': 'total_nonfarm_payroll'})
    .dropna(axis=1, how='all')
)
cpi = (
    ingestor.data['CPI'][['value']]
    .rename(columns={'value': 'consumer_price_index'})
    .dropna(axis=1, how='all')
)
unemployment = (
    ingestor.data['UNEMPLOYMENT'][['value']]
    .rename(columns={'value': 'unemployment_rate'})
    .dropna(axis=1, how='all')
)
real_gdp = (
    ingestor.data['REAL_GDP'][['value']]
    .rename(columns={'value': 'real_gross_domestic_product'})
    .dropna(axis=1, how='all')
)
# Unlike the series above, this one is not passed through dropna.
real_gdp_per_capita = (
    ingestor.data['REAL_GDP_PER_CAPITA'][['value']]
    .rename(columns={'value': 'real_gross_domestic_product_per_capita'})
)

# Treasury yields: one column per maturity, then flatten the two-level column index
# into 'treasury_yield_<maturity>' names.
treasury_yield = (
    ingestor.data['TREASURY_YIELD'][['value', 'maturity']]
    .pivot(columns=['maturity'], values=['value'])
    .dropna(axis=1, how='all')
)
treasury_yield.columns = 'treasury_yield_' + treasury_yield.columns.droplevel(level=0)

In [15]:
def combine_colwise_stocks(stock, all_symbols=('MSFT', 'IBM', 'JNJ', 'PFE', 'UNH', 'XLV', 'JPM', 'BAC', 'GS', 'XLF', 'AAPL', 'GE', 'KO', 'PEP', 'NKE', 'XLP', 'PG', 'HON', 'MMM', 'XLI')):
    """Append the price/volume columns of every other symbol to one symbol's rows.

    Args:
        stock: DataFrame holding the rows of a single symbol; the symbol is read from
            the first row of its ``symbol`` column.
        all_symbols: Iterable of symbols whose columns should be attached. The entry
            equal to ``stock``'s own symbol is skipped. The default is an immutable
            tuple so it cannot be modified between calls.

    Returns:
        ``stock`` aligned with one ``<SYMBOL>_open/high/low/close/volume`` column set
        per other symbol, with the ``symbol``, ``dividend_amount``,
        ``split_coefficient``, ``interval`` and ``adjusted`` columns dropped.

    Note:
        The other symbols' rows are read from the notebook-level ``target_stock``
        frame, not from an argument, so that global must still hold the full
        multi-symbol table when this function runs.
    """
    curr_symb = stock['symbol'].iloc[0]
    for symb in all_symbols:
        if symb == curr_symb:
            continue
        # For the other symbols the adjusted close is exposed under the name 'close'.
        temp = target_stock[target_stock['symbol'] == symb][['open', 'high', 'low', 'adjusted_close', 'volume']].rename(columns={'adjusted_close': 'close'})
        temp.columns = symb + '_' + temp.columns
        stock = predictor.util.align_dataframe_by_time(stock, temp)
    return stock.drop(columns=['symbol', 'dividend_amount', 'split_coefficient', 'interval', 'adjusted'])

In [16]:
# Run combine_colwise_stocks through do_call_for_each_group with ['symbol'] as the grouping
# columns (presumably once per symbol group — confirm against buffalo.utility).
# NOTE(review): this rebinds target_stock, which combine_colwise_stocks also reads as a
# global, so re-running this cell operates on the already-combined table.
target_stock = do_call_for_each_group(target_stock, combine_colwise_stocks, ['symbol'])

In [19]:
# Display the rows whose symbol is MSFT as a spot check of the combined table.
target_stock[target_stock['symbol'] == 'MSFT']

Unnamed: 0,symbol,time,open,high,low,close,adjusted_close,volume,MSFT_open,MSFT_high,...,XLI_open,XLI_high,XLI_low,XLI_close,XLI_volume,AAPL_open,AAPL_high,AAPL_low,AAPL_close,AAPL_volume
72535,MSFT,1999-11-01 00:00:00-05:00,93.250000,94.190002,92.120003,92.370003,28.873610,26630600,,,...,28.020000,28.020000,27.410000,17.690912,3100.0,80.000000,80.690002,77.370003,0.589041,2487300.0
72536,MSFT,1999-11-02 00:00:00-05:00,92.750000,94.500000,91.940002,92.559998,28.933001,23174500,,,...,27.690001,28.030001,27.660000,17.910355,25700.0,78.000000,81.690002,77.309998,0.608999,3564600.0
72537,MSFT,1999-11-03 00:00:00-05:00,92.940002,93.500000,91.500000,92.000000,28.757952,22258500,,,...,27.750000,27.750000,27.559999,17.787725,8400.0,81.620003,83.250000,81.000000,0.618485,2932700.0
72538,MSFT,1999-11-04 00:00:00-05:00,92.309998,92.750000,90.309998,91.750000,28.679806,27119700,,,...,27.750000,27.910000,27.360001,17.736092,17800.0,82.059998,85.370003,80.620003,0.634574,3384700.0
72539,MSFT,1999-11-05 00:00:00-05:00,91.809998,92.870003,90.500000,91.559998,28.620415,35083700,,,...,27.629999,27.860001,27.559999,17.832905,88600.0,84.620003,88.370003,84.000000,0.670165,3721500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78422,MSFT,2023-03-27 00:00:00-05:00,280.500000,281.458893,275.519989,276.380005,276.380005,26840212,,,...,97.690002,98.050003,97.080002,97.720001,7668573.0,159.940002,160.770004,157.869995,158.279999,52390266.0
78423,MSFT,2023-03-28 00:00:00-05:00,275.790009,276.140015,272.045105,275.230011,275.230011,21878647,,,...,97.529999,98.610001,97.529999,98.220001,9909054.0,157.970001,158.490005,155.979996,157.649994,45992152.0
78424,MSFT,2023-03-29 00:00:00-05:00,278.959991,281.139801,278.410004,280.510010,280.510010,25087032,,,...,99.110001,99.699997,98.919998,99.650002,9387900.0,159.369995,161.050003,159.350006,160.770004,51305691.0
78425,MSFT,2023-03-30 00:00:00-05:00,284.230011,284.459991,281.480011,284.049988,284.049988,25053410,,,...,100.139999,100.339996,99.540001,99.860001,8160225.0,161.529999,162.470001,161.270996,162.360001,49501689.0


In [17]:
# Align every macro-economic series onto the stock table, grouped by symbol.
# Each result is written back to target_stock so the macro columns accumulate there.
# Previously only the fed funds rate reached target_stock: the other six results were
# assigned to the macro variables themselves (payroll, cpi, ...), overwriting the source
# frames and leaving target_stock without those columns.
# NOTE(review): the saved output of this cell is an AssertionError with no message; its
# cause is not visible from this notebook — confirm the cell runs cleanly after this change.
for macro_frame in (fed_funds_rate, payroll, cpi, unemployment, real_gdp, real_gdp_per_capita, treasury_yield):
    target_stock = do_call_for_each_group(target_stock, predictor.util.align_dataframe_by_time, ['symbol'], other_df=macro_frame)

AssertionError: 

In [None]:
def combine_colwise_indicators(stock, indicators):
    """Align each indicator frame for this group's symbol onto ``stock``.

    The symbol is taken from the first row of ``stock['symbol']``. Every frame in
    ``indicators`` is filtered to that symbol, has its ``symbol`` column dropped, and
    is then passed together with the running result to
    ``predictor.util.align_dataframe_by_time``.

    Returns the result of the last alignment, or ``stock`` itself when ``indicators``
    is empty.
    """
    symbol = stock['symbol'].iloc[0]
    symbol_filter = f'symbol == "{symbol}"'
    aligned = stock
    for indicator in indicators:
        own_rows = indicator.query(symbol_filter).drop(columns=['symbol'])
        aligned = predictor.util.align_dataframe_by_time(aligned, own_rows)
    return aligned

In [None]:
# Technical indicators: keep the daily rows, spread each (time_period, series_type)
# combination into its own column, drop all-NaN columns, then flatten the multi-level
# column labels into '-'-joined strings.
# NOTE(review): pivot is given an explicit `values` list, so a 'symbol' column would not
# survive into the result unless it is part of the index — combine_colwise_indicators
# filters on `symbol`, so confirm the index layout of these tables.
sma = (
    ingestor.data['SMA']
    .query('interval == "daily"')
    .pivot(columns=['time_period', 'series_type'], values=['sma'])
    .dropna(axis=1, how='all')
)
sma.columns = sma.columns.map(lambda levels: '-'.join(str(level) for level in levels))

roc = (
    ingestor.data['ROC']
    .query('interval == "daily"')
    .pivot(columns=['time_period', 'series_type'], values=['roc'])
    .dropna(axis=1, how='all')
)
roc.columns = roc.columns.map(lambda levels: '-'.join(str(level) for level in levels))

# HT_SINE contributes two value columns per combination: 'lead_sine' and 'sine'.
ht_sine = (
    ingestor.data['HT_SINE']
    .query('interval == "daily"')
    .pivot(columns=['time_period', 'series_type'], values=['lead_sine', 'sine'])
    .dropna(axis=1, how='all')
)
ht_sine.columns = ht_sine.columns.map(lambda levels: '-'.join(str(level) for level in levels))

mom = (
    ingestor.data['MOM']
    .query('interval == "daily"')
    .pivot(columns=['time_period', 'series_type'], values=['mom'])
    .dropna(axis=1, how='all')
)
mom.columns = mom.columns.map(lambda levels: '-'.join(str(level) for level in levels))

In [None]:
# Merge the technical indicators into the stock table, grouped by symbol. The result is
# stored in target_stock (it was previously assigned to treasury_yield, so the table that
# gets pickled and modeled below never received the indicator columns).
target_stock = do_call_for_each_group(target_stock, combine_colwise_indicators, ['symbol'], indicators=[sma, roc, ht_sine, mom])

In [None]:
# Cache the merged table. The context manager closes the file handle even if pickling
# fails, so the file is always flushed and released.
with open(r'cached_data/target_stock.pickle', 'wb') as cache_file:
    pickle.dump(target_stock, cache_file)

In [None]:
# Reload the cached table. pickle.load can execute arbitrary code, so only load a file
# that this notebook (or another trusted source) wrote.
with open(r'cached_data/target_stock.pickle', 'rb') as cache_file:
    target_stock = pickle.load(cache_file)

In [None]:
# Display (rows, columns) of target_stock; its column count is used as the models' input_size below.
target_stock.shape

In [None]:
# Passed as label_len to TimeSeriesData and as n_ahead to the models in the sweep cells below.
n_head = 1

In [None]:
# Build the dataset: 'adjusted_close' is the endogenous series, every other column of
# target_stock is passed as exogenous input, with seq_len=180 and label_len=n_head.
# NOTE(review): `target_symbol` is not assigned in any cell visible in this notebook, so the
# f-string below raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
time_series_data = predictor.util.TimeSeriesData(endog=target_stock[['adjusted_close']], exog=target_stock.drop(columns=['adjusted_close']), seq_len=180, label_len=n_head, name=f'DAILY_ADJUSTED_CLOSE_{target_symbol}')

#### Offline Learning

In [None]:
# Hyper-parameter grid for the offline sweeps. Keys are kept in this order because it is
# the keyword order handed to expand_grid, and the sweep loops address rows by position.
offline_grid = {
    'hidden_size': [32, 64, 128],
    'num_layers': [1, 2, 4],
    'dropout': [0.0, 0.2, 0.4],
    'batch_size': [32, 64, 128],
    'learning_rate': [0.001, 0.005, 0.0001],
    'weight_decay': [0.001, 0.0001],
    'epochs': [20, 30],
    'bidirectional': [True, False],
    # Passed as a bare scalar rather than a list, exactly as before.
    'n_fold': 1,
}
sweep_params = expand_grid(**offline_grid)

In [None]:
# Offline sweep over modeling.RNN: one train/evaluate run per row of sweep_params, each
# serialized to cached_data/record.sqlite.
# NOTE(review): iteration starts at row 6, so rows 0-5 of the grid are skipped —
# presumably left over from resuming an earlier run; confirm before a full re-run.
for i in tqdm(range(6, len(sweep_params))):
    param = sweep_params.loc[i].to_dict()
    rnn = modeling.RNN(
        input_size=target_stock.shape[1], n_ahead=n_head,
        hidden_size=param['hidden_size'], output_size=1,
        num_layers=param['num_layers'], dropout=param['dropout'],
        bidirectional=param['bidirectional'], use_gpu=True,
    )
    optimizer = torch.optim.Adam(rnn.parameters(), lr=param['learning_rate'], weight_decay=param['weight_decay'])
    loss_func = torch.nn.MSELoss()
    # Pointwise prediction
    training_record = predictor.train_and_evaluate_model(
        rnn, optimizer, loss_func, time_series_data,
        epochs_per_fold=param['epochs'], test_ratio=0.2, n_fold=param['n_fold'],
        clip_grad=1, batch_size=param['batch_size'],
    )
    training_record.serialize_to_file(r'cached_data/record.sqlite', additional_note_dataset='', additonal_note_model='')

In [None]:
# Offline sweep over modeling.LSTM: one train/evaluate run per row of sweep_params, each
# serialized to cached_data/record.sqlite.
# The model variable keeps the name `rnn` even though it holds an LSTM here.
for i in tqdm(range(len(sweep_params))):
    param = sweep_params.loc[i].to_dict()
    rnn = modeling.LSTM(
        input_size=target_stock.shape[1], n_ahead=n_head,
        hidden_size=param['hidden_size'], output_size=1,
        num_layers=param['num_layers'], dropout=param['dropout'],
        bidirectional=param['bidirectional'], use_gpu=True,
    )
    optimizer = torch.optim.Adam(rnn.parameters(), lr=param['learning_rate'], weight_decay=param['weight_decay'])
    loss_func = torch.nn.MSELoss()
    # Pointwise prediction
    training_record = predictor.train_and_evaluate_model(
        rnn, optimizer, loss_func, time_series_data,
        epochs_per_fold=param['epochs'], test_ratio=0.2, n_fold=param['n_fold'],
        clip_grad=1, batch_size=param['batch_size'],
    )
    training_record.serialize_to_file(r'cached_data/record.sqlite', additional_note_dataset='', additonal_note_model='')

#### Online Learning

In [None]:
# Hyper-parameter grid for the online-learning sweep; this rebinds sweep_params, replacing
# the offline grid. Key order matches the keyword order handed to expand_grid.
online_grid = {
    'hidden_size': [32, 64, 128],
    'num_layers': [1, 2, 3, 4],
    'dropout': [0.0, 0.2, 0.4],
    'batch_size': [32, 64, 128],
    'learning_rate': [0.001, 0.005, 0.0001],
    'weight_decay': [0.001, 0.0001, 0.00001],
    'epochs': [40],
    'epochs_per_update': [1, 5, 10, 15],
    'update_freq': [1, 5, 10, 15],
    'bidirectional': [True, False],
}
sweep_params = expand_grid(**online_grid)

In [None]:
# Online-learning sweep over modeling.RNN: one run per row of sweep_params, each
# serialized to cached_data/record.sqlite.
for i in tqdm(range(sweep_params.shape[0])):
    # Read the row once into a dict, as the offline sweep cells do, instead of repeating
    # sweep_params.loc[i, <col>] scalar lookups for every argument.
    # NOTE(review): this also relies on Series.to_dict() returning native Python values
    # rather than NumPy scalars (e.g. numpy.int64) — confirm on the pandas version in use.
    param = sweep_params.loc[i,:].to_dict()
    rnn = modeling.RNN(
        input_size=target_stock.shape[1],
        n_ahead=n_head,
        hidden_size=param['hidden_size'],
        output_size=1,
        num_layers=param['num_layers'],
        dropout=param['dropout'],
        bidirectional=param['bidirectional'],
        use_gpu=True)
    optimizer = torch.optim.Adam(
        rnn.parameters(),
        lr=param['learning_rate'],
        weight_decay=param['weight_decay'])
    loss_func = torch.nn.MSELoss()
    # Gradient clipping is applied with norm 1 during training and disabled (None) for updates.
    update_rule = algorithm.online_update.IncrementalBatchGradientDescent(
        epochs=param['epochs'],
        epochs_per_update=param['epochs_per_update'],
        update_freq=param['update_freq'],
        clip_grad_norm_update=None,
        clip_grad_norm_train=1)
    training_record = predictor.train_and_evaluate_model_online(
        rnn,
        time_series_data,
        update_rule,
        optimizer,
        loss_func,
        train_ratio=0.3,
        batch_size=param['batch_size']) # Pointwise prediction
    training_record.serialize_to_file(r'cached_data/record.sqlite', additional_note_dataset='', additonal_note_model='')

In [None]:
# Single online-learning run: 2-layer unidirectional RNN, dropout 0.5, weight decay 0.01.
rnn = modeling.RNN(
    input_size=target_stock.shape[1], n_ahead=1, hidden_size=64, output_size=1,
    num_layers=2, dropout=0.5, bidirectional=False, use_gpu=True,
)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001, weight_decay=0.01)
update_rule = algorithm.online_update.IncrementalBatchGradientDescent(
    epochs=80, epochs_per_update=10, update_freq=5,
    clip_grad_norm_update=None, clip_grad_norm_train=1,
)
loss_func = torch.nn.MSELoss()
# Pointwise prediction
training_record = predictor.train_and_evaluate_model_online(
    rnn, time_series_data, update_rule, optimizer, loss_func,
    train_ratio=0.3, batch_size=64,
)
training_record.serialize_to_file(r'cached_data/record.sqlite', additional_note_dataset='', additonal_note_model='')

In [None]:
# Reload a stored online-training record from the SQLite file and plot it.
# NOTE(review): the second argument (2) presumably selects which stored record to load —
# confirm it refers to the intended run.
training_record = predictor.util.ModelPerformanceOnline.deserialize_from_file(r'cached_data/record.sqlite', 2)
training_record.plot_training_records()
training_record.plot_logs()
training_record.plot_residuals()

In [None]:
# Single online-learning run: 2-layer bidirectional RNN, dropout 0.2, weight decay 0.001.
rnn = modeling.RNN(
    input_size=target_stock.shape[1], n_ahead=1, hidden_size=64, output_size=1,
    num_layers=2, dropout=0.2, bidirectional=True, use_gpu=True,
)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001, weight_decay=0.001)
update_rule = algorithm.online_update.IncrementalBatchGradientDescent(
    epochs=80, epochs_per_update=5, update_freq=5,
    clip_grad_norm_update=None, clip_grad_norm_train=1,
)
loss_func = torch.nn.MSELoss()
# Pointwise prediction
training_record = predictor.train_and_evaluate_model_online(
    rnn, time_series_data, update_rule, optimizer, loss_func,
    train_ratio=0.3, batch_size=64,
)
training_record.serialize_to_file(r'cached_data/record.sqlite', additional_note_dataset='', additonal_note_model='')

# NOTE(review): the record reloaded for plotting is selected by the literal 1, not by
# anything returned from the serialize call above — confirm it is the run just written.
training_record = predictor.util.ModelPerformanceOnline.deserialize_from_file(r'cached_data/record.sqlite', 1)
training_record.plot_training_records()
training_record.plot_logs()
training_record.plot_residuals()

In [None]:
# Single online-learning run: 3-layer unidirectional RNN, dropout 0.2, weight decay 0.001.
rnn = modeling.RNN(
    input_size=target_stock.shape[1], n_ahead=1, hidden_size=64, output_size=1,
    num_layers=3, dropout=0.2, bidirectional=False, use_gpu=True,
)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001, weight_decay=0.001)
update_rule = algorithm.online_update.IncrementalBatchGradientDescent(
    epochs=80, epochs_per_update=5, update_freq=5,
    clip_grad_norm_update=None, clip_grad_norm_train=1,
)
loss_func = torch.nn.MSELoss()
# Pointwise prediction
training_record = predictor.train_and_evaluate_model_online(
    rnn, time_series_data, update_rule, optimizer, loss_func,
    train_ratio=0.3, batch_size=64,
)
training_record.serialize_to_file(r'cached_data/record.sqlite', additional_note_dataset='', additonal_note_model='')

# NOTE(review): the record reloaded for plotting is selected by the literal 1, not by
# anything returned from the serialize call above — confirm it is the run just written.
training_record = predictor.util.ModelPerformanceOnline.deserialize_from_file(r'cached_data/record.sqlite', 1)
training_record.plot_training_records()
training_record.plot_logs()
training_record.plot_residuals()

In [None]:
# Single online-learning run: 3-layer bidirectional RNN, dropout 0.2, weight decay 0.001.
rnn = modeling.RNN(
    input_size=target_stock.shape[1], n_ahead=1, hidden_size=64, output_size=1,
    num_layers=3, dropout=0.2, bidirectional=True, use_gpu=True,
)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001, weight_decay=0.001)
update_rule = algorithm.online_update.IncrementalBatchGradientDescent(
    epochs=80, epochs_per_update=5, update_freq=5,
    clip_grad_norm_update=None, clip_grad_norm_train=1,
)
loss_func = torch.nn.MSELoss()
# Pointwise prediction
training_record = predictor.train_and_evaluate_model_online(
    rnn, time_series_data, update_rule, optimizer, loss_func,
    train_ratio=0.3, batch_size=64,
)
training_record.serialize_to_file(r'cached_data/record.sqlite', additional_note_dataset='', additonal_note_model='')

# NOTE(review): the record reloaded for plotting is selected by the literal 1, not by
# anything returned from the serialize call above — confirm it is the run just written.
training_record = predictor.util.ModelPerformanceOnline.deserialize_from_file(r'cached_data/record.sqlite', 1)
training_record.plot_training_records()
training_record.plot_logs()
training_record.plot_residuals()