In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from capstone.visuals.style import set_style

# Apply the project style helper imported from capstone.visuals.style
# (presumably plot styling — its effects are defined in that module, not here).
set_style()

- Modeling
    - 1 Feature: Past Prices
    - 9 Features: Price/Return Indicators

- Scoring Stocks:
    - Expected return (TODO: complete this item — the original entry was cut off mid-formula)



In [2]:
from capstone.data import load_data

# Dataset names handed to load_data; the two results are unpacked in this order.
# NOTE(review): the first argument 'data' is presumably the directory to read from — confirm in capstone.data.
data = ['features', 'target_prices']
features_df, prices_df = load_data('data', data)

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

seq_len = 30
m1 = LinearRegression()
scaler = StandardScaler()

# Row layout (positional slices):
#   features                -> every feature row except the last 3*seq_len
#   targets                 -> prices seq_len rows after each training feature row
#   input_to_predict_future -> the seq_len feature rows just before the final seq_len rows
# NOTE(review): this assumes features_df and prices_df are row-aligned — confirm in load_data.
features = features_df[:-seq_len*3]
input_to_predict_future = features_df[-seq_len*2:-seq_len]

# The target slice does not depend on the stock, so take it once outside the loop.
targets = prices_df[seq_len:-seq_len*2]
assert len(features) == len(targets), (
    "features and shifted price targets must have the same number of rows"
)

# Collect per-stock results and build each DataFrame once, instead of
# inserting one column per loop iteration.
stock_predictions = {}
stock_coefs = {}
for stock in prices_df.columns:
    target = targets[stock]
    # The same scaler/model instances are refit for every stock.
    lr_pipe = make_pipeline(scaler, m1)
    lr_pipe.fit(features, target)
    pred = lr_pipe.predict(input_to_predict_future)
    stock_predictions[stock] = pred
    # Copy so the stored coefficients cannot alias the shared model's state.
    stock_coefs[stock] = lr_pipe[1].coef_.copy()

# Predictions are indexed by the dates of the input rows, one column per stock.
m1_predictions = pd.DataFrame(stock_predictions, index=input_to_predict_future.index)
# Coefficients: one row per feature, one column per stock.
m1_coefs = pd.DataFrame(stock_coefs, index=features.columns)

# Show coefficients with one row per stock (the layout of the saved cell output).
m1_coefs.T

Unnamed: 0,tkrs_avg_roc_30d,tkrs_avg_std_30d,tkrs_avg_rs_roc_30d,tkrs_avg_vol_avg_roc_30d,tkrs_avg_turnover_avg_roc_30d,stock_idx_pct_change_30d,comm_idx_pct_change_30d,bond_idx_pct_change_30d,vol_idx_pct_change_30d
ACN,5.012045,29.987164,-12.846020,-17.290073,14.125514,-13.129519,23.064448,-0.809661,-5.804103
ADBE,-2.543328,77.275214,-44.809646,-32.779202,25.350649,-10.124527,48.887305,46.324047,6.993816
ADM,0.310886,3.603702,-0.922276,-2.118021,0.632167,-5.830374,3.178915,-2.883932,-3.098982
AFL,1.794352,1.249279,-0.346035,-0.684347,0.099685,-1.368401,2.576454,-3.123580,-1.102302
AMAT,7.954939,12.110838,-8.458663,-6.230192,5.795942,-7.705327,11.360656,-2.026676,-1.157109
...,...,...,...,...,...,...,...,...,...
WRB,2.258296,4.516341,-1.318094,-2.186858,1.208375,-3.088795,3.040117,-3.045037,-1.843096
WST,0.367758,51.259679,-24.456104,-29.371063,23.425608,-17.900577,35.175287,9.179931,-8.186510
WYNN,5.555940,-7.764635,-3.301675,6.901101,-5.286684,3.952100,3.931079,3.757294,6.681324
YUM,3.364224,7.108030,-3.059044,-4.666061,2.838066,-1.672831,6.420607,-3.178969,-1.497249


In [4]:
from capstone.modeling.model_eval import rank_stocks
from capstone.data import growth
from trade_metrics import Metrics

def backtest_model(predictions, prices, seq_len, n_stocks):
    """Backtest the first ``n_stocks`` ranked stocks over the last ``seq_len`` price rows.

    Args:
        predictions: Model predictions, passed unchanged to ``rank_stocks``.
        prices: Price frame with one column per stock.
        seq_len: Number of trailing rows of ``prices`` used as the backtest window.
        n_stocks: How many rows to keep from the start of the ranking.

    Returns:
        A tuple ``(metrics, metrics.mean())`` where ``metrics`` is whatever
        ``Metrics(returns).calculate()`` produces for the selected stocks.
    """
    # rank_stocks is expected to return something with a 'stock' column; keep its first n_stocks entries.
    top_picks = rank_stocks(predictions)[:n_stocks]['stock'].to_list()

    # Period-over-period returns of the picks across the backtest window.
    window_returns = prices[-seq_len:][top_picks].pct_change()

    # Zero out the row at the smallest index label (the NaN row from pct_change
    # when the index is sorted ascending).
    earliest_label = window_returns.index.min()
    window_returns.loc[earliest_label] = 0

    per_stock_metrics = Metrics(window_returns).calculate()
    return per_stock_metrics, per_stock_metrics.mean()

# Backtest the first five stocks in the ranking of the linear-regression predictions.
m1_stats, m1_mean_stats = backtest_model(
    predictions=m1_predictions,
    prices=prices_df,
    seq_len=seq_len,
    n_stocks=5,
)

# Per-stock metrics first, then their mean across the selected stocks.
display(m1_stats, m1_mean_stats)

Unnamed: 0,Cumulative Return,Annual Return,Annual Volatility,Sharpe Ratio,Max Drawdown
BA,0.153221,1.239667,0.286271,4.260524,-0.039541
GPN,0.189281,1.496949,0.276212,5.347158,-0.027682
WYNN,0.204976,1.657943,0.424157,3.861643,-0.068829
UAL,0.274005,2.13179,0.434292,4.862609,-0.066602
WBD,0.495459,3.579448,0.608471,5.849825,-0.104944


Cumulative Return    0.263388
Annual Return        2.021159
Annual Volatility    0.405881
Sharpe Ratio         4.836352
Max Drawdown        -0.061520
dtype: float64

In [8]:
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

# Random forest settings; random_state is pinned so each refit is repeatable.
m2 = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
)

# Refit the same estimator per stock, gather each stock's predictions,
# then assemble the frame in one go (indexed by the input rows' dates).
rf_predictions_by_stock = {}
for stock in tqdm(prices_df.columns):
    target = prices_df[seq_len:-seq_len*2][stock]
    m2.fit(features, target)
    pred = m2.predict(input_to_predict_future)
    rf_predictions_by_stock[stock] = pred

m2_predictions = pd.DataFrame(
    rf_predictions_by_stock, index=input_to_predict_future.index
)

100%|██████████| 97/97 [00:57<00:00,  1.70it/s]


In [12]:
# Backtest the first five stocks in the ranking of the random-forest predictions.
m2_stats, m2_mean_stats = backtest_model(
    predictions=m2_predictions,
    prices=prices_df,
    seq_len=seq_len,
    n_stocks=5,
)

# Show the per-stock table, then the mean metrics as a one-column frame.
m2_mean_table = pd.DataFrame(m2_mean_stats, columns=['Metric'])
display(m2_stats, m2_mean_table)

Unnamed: 0,Cumulative Return,Annual Return,Annual Volatility,Sharpe Ratio,Max Drawdown
BA,0.153221,1.239667,0.286271,4.260524,-0.039541
UAL,0.274005,2.13179,0.434292,4.862609,-0.066602
GPN,0.189281,1.496949,0.276212,5.347158,-0.027682
WYNN,0.204976,1.657943,0.424157,3.861643,-0.068829
BKNG,0.255629,1.940979,0.211687,9.074625,-0.019732


Unnamed: 0,Metric
Cumulative Return,0.215422
Annual Return,1.693466
Annual Volatility,0.326524
Sharpe Ratio,5.481312
Max Drawdown,-0.044477


In [15]:
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import set_random_seed
from warnings import filterwarnings
# NOTE(review): this silences every warning category for the rest of the kernel session.
filterwarnings('ignore')

# Seed applied before each model is built so a rerun gives the same predictions.
# set_random_seed also reseeds Python's `random` and NumPy's global generator.
LSTM_SEED = 42

# LSTM expects input shape (n_samples, n_timesteps, n_features).
# Each feature row is fed as a sequence of length 1 (n_timesteps == 1), so the
# network sees one row at a time, not a seq_len-long window.
n_features = features.shape[1]

# Standardize in 2D (statistics fitted on the training rows only), then add the
# single-timestep axis. This yields the same arrays as reshaping to 3D first.
scaler = StandardScaler()
features_reshaped = scaler.fit_transform(features.values).reshape(-1, 1, n_features)
input_to_predict_future_reshaped = scaler.transform(
    input_to_predict_future.values
).reshape(-1, 1, n_features)

# The target slice is the same for every stock; take it once.
targets = prices_df[seq_len:-seq_len*2]

# Loop through all stocks to train and predict; predictions are collected in a
# dict and turned into a DataFrame once at the end.
stock_predictions = {}
for stock in tqdm(prices_df.columns):
    target = targets[stock]
    # Drop Keras state left by the previous stock's model so memory does not
    # accumulate across the loop, then reseed for a reproducible fit.
    clear_session()
    set_random_seed(LSTM_SEED)
    # Create LSTM model
    model = Sequential()
    model.add(LSTM(50, input_shape=(features_reshaped.shape[1], features_reshaped.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    # Fit the model on plain arrays (the target's index plays no role in training)
    model.fit(features_reshaped, target.to_numpy(), epochs=50, batch_size=32, verbose=0)
    # Make predictions
    pred = model.predict(input_to_predict_future_reshaped, verbose=0)
    # Store predictions as a flat vector, one value per input row
    stock_predictions[stock] = pred.flatten()

# Predictions are indexed by the dates of the input rows, one column per stock.
m3_predictions = pd.DataFrame(stock_predictions, index=input_to_predict_future.index)

100%|██████████| 97/97 [30:10<00:00, 18.67s/it]


In [16]:
# Backtest the first five stocks in the ranking of the LSTM predictions.
m3_stats, m3_mean_stats = backtest_model(
    predictions=m3_predictions,
    prices=prices_df,
    seq_len=seq_len,
    n_stocks=5,
)

# Show the per-stock table, then the mean metrics as a one-column frame.
m3_mean_table = pd.DataFrame(m3_mean_stats, columns=['Metric'])
display(m3_stats, m3_mean_table)

Unnamed: 0,Cumulative Return,Annual Return,Annual Volatility,Sharpe Ratio,Max Drawdown
WYNN,0.204976,1.657943,0.424157,3.861643,-0.068829
FMC,0.076044,0.638331,0.213571,2.895204,-0.037637
ICE,0.067388,0.563781,0.178585,3.044942,-0.025498
CZR,0.101566,0.954084,0.538628,1.734189,-0.15383
BBY,0.116256,0.948843,0.219936,4.223235,-0.056482


Unnamed: 0,Metric
Cumulative Return,0.113246
Annual Return,0.952596
Annual Volatility,0.314976
Sharpe Ratio,3.151842
Max Drawdown,-0.068455


In [20]:
# Inspect the LSTM predictions: rows are the input rows' dates, columns are stocks.
m3_predictions

Unnamed: 0_level_0,ACN,ADBE,ADM,AFL,AMAT,AMP,APD,AVB,AXP,BA,...,VRSK,WBA,WBD,WDC,WM,WRB,WST,WYNN,YUM,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-11-03,115.725616,123.01664,56.076927,41.280216,78.699707,110.746826,115.765282,114.317451,93.704811,119.793671,...,105.482727,50.058842,25.357821,60.270412,88.442451,51.004097,114.296455,71.06617,72.684532,46.118309
2022-11-04,115.830742,123.080887,53.28767,39.928684,73.45417,110.796722,115.772255,114.39241,93.81189,119.945602,...,105.576408,52.739037,25.54414,61.933628,88.695114,49.647419,114.377052,90.364334,77.280891,46.071003
2022-11-07,115.864662,123.118835,37.634495,30.790466,34.334225,110.824707,115.799538,114.431488,93.886635,119.969543,...,105.663651,54.963242,26.12221,58.761509,88.756866,30.258249,114.365959,94.098602,79.029335,42.674393
2022-11-08,115.864738,123.120293,28.009398,26.434925,21.274481,110.824692,115.792412,114.437347,93.895119,119.968773,...,105.683563,55.267006,26.892557,51.008087,88.757263,17.604622,114.285362,94.428978,78.606491,32.140171
2022-11-09,115.679779,122.938927,51.409916,34.093487,61.404964,110.668427,115.639839,114.267624,93.599808,119.803726,...,105.322128,53.021721,25.936972,60.63493,88.290451,46.243015,114.065392,84.559944,56.139584,45.707684
2022-11-10,115.872932,123.13131,25.689728,30.991379,23.704473,110.841362,115.833549,114.453644,93.911835,119.98011,...,105.70871,54.646297,27.648664,39.43285,88.768654,16.209381,114.355003,94.611763,80.400848,21.834242
2022-11-11,115.878174,123.134743,26.769665,33.123348,23.934402,110.847343,115.842316,114.457596,93.91526,119.98362,...,105.713333,54.632156,27.750645,38.907921,88.773994,17.248257,114.399704,94.627602,80.903603,19.888268
2022-11-14,115.866127,123.125481,35.928387,32.786259,36.337372,110.83149,115.818146,114.442909,93.903015,119.969543,...,105.693863,53.000401,27.252001,46.497814,88.758888,24.613642,114.381042,94.528679,79.384285,34.92556
2022-11-15,115.832611,123.096046,42.066242,40.95031,58.518383,110.783714,115.786324,114.398247,93.848045,119.926636,...,105.63575,49.555035,26.836433,47.90588,88.681778,38.609501,114.343437,94.438522,78.186676,39.192261
2022-11-16,115.779892,123.057205,31.739241,41.217152,36.95826,110.66301,115.759232,114.358192,93.781532,119.887779,...,105.563736,50.411083,25.963297,37.629757,88.513443,30.88273,114.176949,94.531151,77.246727,28.563356


In [None]:
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from sklearn.model_selection import train_test_split

# # Data Preprocessing
# X = returns_df.to_numpy()  # Convert DataFrame to NumPy array
# seq_len = 10  # Choose your sequence length

# # Create sequences
# X = np.array([X[i:i+seq_len] for i in range(X.shape[0] - seq_len)])

# # Split data into training and test sets
# X_train, X_test = train_test_split(X, test_size=0.2, shuffle=False)

# # Seq2Seq Model Architecture
# encoder_inputs = tf.keras.layers.Input(shape=(seq_len, X.shape[2]))
# encoder = tf.keras.layers.LSTM(50, return_state=True)
# encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# encoder_states = [state_h, state_c]

# decoder_inputs = tf.keras.layers.Input(shape=(seq_len, X.shape[2]))
# decoder_lstm = tf.keras.layers.LSTM(50, return_sequences=True, return_state=True)
# decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# decoder_dense = tf.keras.layers.Dense(X.shape[2])
# decoder_outputs = decoder_dense(decoder_outputs)

# model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# # Compile and Train
# model.compile(optimizer='adam', loss='mean_squared_error')
# model.fit([X_train, X_train], X_train, batch_size=64, epochs=50, validation_split=0.2)

# # Prediction
# predicted_stock_returns = model.predict([X_test, X_test])

In [None]:
# NOTE(review): the only assignment to `X` visible in this notebook is inside the
# commented-out Seq2Seq cell above, so this line presumably raises NameError on a
# fresh kernel (Restart & Run All) — confirm before keeping it.
X.shape

(1945, 10, 93)