In [1]:
import os

os.chdir(r'C:\Users\carlo\GitHub\ProjectBuffalo')

In [2]:
import buffalo.ingestion as ingestion
import buffalo.predictor as predictor

In [3]:
import torch
import pandas as pd

In [None]:
ingestor = ingestion.DataIngestion(ingestion.enum.API.ADVANTAGE)

In [None]:
ingestor.load_data(r'cached_data/ingestion.sqlite')

In [None]:
target_symbol = 'AAPL'

In [None]:
ingestor.data['ADJUSTED_DAILY_STOCK'].symbol.unique()

In [None]:
target_stock = ingestor.data['ADJUSTED_DAILY_STOCK'].query('symbol == @target_symbol')[['open', 'high', 'low', 'adjusted_close', 'volume']]

In [None]:
other_stocks = ingestor.data['ADJUSTED_DAILY_STOCK'].query('symbol != "AAPL"')

In [None]:
for symbol in ['AAPL', 'MSFT', 'IBM', 'META']:
    temp = other_stocks[other_stocks['symbol'] == symbol][['open', 'high', 'low', 'adjusted_close', 'volume']].rename(columns={'adjusted_close': 'close'})
    temp.columns = symbol + '_' + temp.columns
    target_stock = predictor.util.align_dataframe_by_time(target_stock, temp)

In [None]:
target_income_statement = ingestor.data['COMPANY_INCOME_STATEMENT'].query('symbol == @target_symbol & freq == "quarterly"').drop(columns=['reported_currency', 'symbol', 'freq', 'function']).dropna(axis=1, how='all')
target_balance_sheet = ingestor.data['COMPANY_BALANCE_SHEET'].query('symbol == @target_symbol & freq == "quarterly"').drop(columns=['reported_currency', 'symbol', 'freq', 'function']).dropna(axis=1, how='all')
target_cash_flow = ingestor.data['COMPANY_CASH_FLOW'].query('symbol == @target_symbol & freq == "quarterly"').drop(columns=['reported_currency', 'symbol', 'freq', 'function', 'net_income']).dropna(axis=1, how='all')
fed_funds_rate = ingestor.data['FEDERAL_FUNDS_RATE'][['value']].rename(columns={'value': 'effective_federal_funds_rate'}).dropna(axis=1, how='all')
payroll = ingestor.data['NONFARM_PAYROLL'][['value']].rename(columns={'value': 'total_nonfarm_payroll'}).dropna(axis=1, how='all')
cpi = ingestor.data['CPI'][['value']].rename(columns={'value': 'consumer_price_index'}).dropna(axis=1, how='all')
unemployment = ingestor.data['UNEMPLOYMENT'][['value']].rename(columns={'value': 'unemployment_rate'}).dropna(axis=1, how='all')
real_gdp = ingestor.data['REAL_GDP'][['value']].rename(columns={'value': 'real_gross_domestic_product'}).dropna(axis=1, how='all')
real_gdp_per_capita = ingestor.data['REAL_GDP_PER_CAPITA'][['value']].rename(columns={'value': 'real_gross_domestic_product_per_capita'})
treasury_yield = ingestor.data['TREASURY_YIELD'][['value', 'maturity']].pivot(columns=['maturity'], values=['value']).dropna(axis=1, how='all')
treasury_yield.columns = 'treasury_yield_' + treasury_yield.columns.droplevel(level=0)

In [None]:
print(target_stock.shape)
target_stock = predictor.util.align_dataframe_by_time(target_stock, fed_funds_rate)
print(target_stock.shape)
target_stock = predictor.util.align_dataframe_by_time(target_stock, payroll)
print(target_stock.shape)
target_stock = predictor.util.align_dataframe_by_time(target_stock, cpi)
print(target_stock.shape)
target_stock = predictor.util.align_dataframe_by_time(target_stock, unemployment)
print(target_stock.shape)
target_stock = predictor.util.align_dataframe_by_time(target_stock, real_gdp)
print(target_stock.shape)
target_stock = predictor.util.align_dataframe_by_time(target_stock, real_gdp_per_capita)
print(target_stock.shape)
target_stock = predictor.util.align_dataframe_by_time(target_stock, treasury_yield)
print(target_stock.shape)
#target_stock = predictor.util.align_dataframe_by_time(target_stock, target_income_statement)
#print(target_stock.shape)
#target_stock = predictor.util.align_dataframe_by_time(target_stock, target_balance_sheet)
#print(target_stock.shape)
#target_stock = predictor.util.align_dataframe_by_time(target_stock, target_cash_flow)
#print(target_stock.shape)

In [4]:
import pickle

In [None]:
pickle.dump(target_stock, open('cached_data/target_stock.pickle', 'wb'))

In [5]:
target_stock = pickle.load(open('cached_data/target_stock.pickle', 'rb'))

In [6]:
target_stock.shape

(5246, 24)

In [10]:
time_series_data = predictor.util.TimeSeries(endog=target_stock[['close']], exog=target_stock.drop(columns='close'), seq_len=180)
trainset, testset = time_series_data.get_traintest_splitted_dataset(0.7)

In [8]:
import buffalo.predictor.models as modeling

In [None]:
rnn = modeling.RNN(input_size=target_stock.shape[1], hidden_size=64, seq_len=180, output_size=1, num_layers=2, dropout=0.2, bidirectional=True, use_gpu=True)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001, weight_decay=0.001)
loss_func = torch.nn.MSELoss()
training_record = modeling.train_model(rnn, optimizer, loss_func, trainset, 0.1, multi_fold_valiation=True, epochs=60, batch_size=64, save_model=True, save_path=r'cached_data/rnn_bi_2lr_64hi_2do.pth') # Pointwise prediction
modeling.test_model(rnn, testset, loss_func, batch_size=128)

In [11]:
lstm = modeling.LSTM(input_size=target_stock.shape[1], hidden_size=64, seq_len=180, output_size=1, num_layers=2, dropout=0.2, bidirectional=False, use_gpu=True)
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001, weight_decay=0.001)
loss_func = torch.nn.MSELoss()
training_record = modeling.train_model(lstm, optimizer, loss_func, trainset, 0.1, multi_fold_valiation=True, epochs=60, batch_size=64, save_model=True, save_path=r'cached_data/lstm_2lr_64hi_2do.pth') # Pointwise prediction
modeling.test_model(lstm, testset, loss_func, batch_size=128)

Multi-fold validation:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Averaged validation loss: 350.34965693155925. Best validation loss: 198.17682647705078


Batched testing:   0%|          | 0/12 [00:00<?, ?it/s]

Test loss: 267.18861198425293


182     -4.933001
186     -2.956239
188     -3.939749
191     -3.098645
192     -3.592588
          ...    
5233     1.932236
5236    -6.733215
5238    -8.815125
5239    -8.024551
5244   -11.384621
Length: 1520, dtype: float32

In [12]:
lstm = modeling.LSTM(input_size=target_stock.shape[1], hidden_size=64, seq_len=180, output_size=1, num_layers=2, dropout=0.2, bidirectional=False, use_gpu=True)
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001, weight_decay=0.001)
loss_func = torch.nn.MSELoss()
training_record = modeling.train_model(lstm, optimizer, loss_func, trainset, 0.1, multi_fold_valiation=True, epochs=60, batch_size=64, save_model=True, save_path=r'cached_data/lstm_bi_2lr_64hi_2do.pth') # Pointwise prediction
modeling.test_model(lstm, testset, loss_func, batch_size=128)

Multi-fold validation:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Averaged validation loss: 348.2230374654134. Best validation loss: 181.2894515991211


Batched testing:   0%|          | 0/12 [00:00<?, ?it/s]

Test loss: 247.7506472269694


182     -5.669872
186     -1.579164
188     -3.023251
191     -4.569116
192     -6.246744
          ...    
5233     3.102585
5236    -2.991196
5238    -1.872986
5239    -6.973541
5244   -10.747673
Length: 1520, dtype: float32