In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import yfinance as yf
from functools import reduce
import operator
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

data = yf.download("AAPL", period = "60d", interval="5m")  # For minute-level bars
data = data[['Open', 'High', 'Low', 'Close', 'Volume']]
df = data.dropna()
df = df[~pd.isnull(df['Close'])].copy()
df.reset_index(inplace=True)
df.columns = df.columns.get_level_values(0)

df['range'] = df['High'] - df['Low']
df['body_length'] = abs(df['Close'] - df['Open'])
df['upper_tail'] = df['High'] - df[['Close','Open']].apply(lambda x: max(x), axis=1)
df['lower_tail'] = df[['Close','Open']].apply(lambda x: min(x), axis=1) - df['Low']
df['body_length/range'] = df['body_length'] / df['range']
df['upper_tail/range'] = df['upper_tail'] / df['range']
df['lower_tail/range'] = df['lower_tail'] / df['range']
df['return'] = df['Close'].pct_change()
df['local_vol'] = df['return'].rolling(window=10, min_periods=3).std()
df['rolling_mean_volume'] = df['Volume'].rolling(window = 10, min_periods = 3).mean()
df['volume_spike'] = df[['Volume','rolling_mean_volume']].apply(lambda x: x[0]/x[1] if ( not any(np.isnan([x[0] ,x[1] ]) )) else None, axis=1 )
print(df.shape)

df = df.dropna()
print(df.shape)
df.reset_index(inplace=True)
df = df.iloc[:,1:]
print(df.shape)
df[:3]

[*********************100%***********************]  1 of 1 completed


(4678, 17)
(4675, 17)
(4675, 17)


Price,Datetime,Open,High,Low,Close,Volume,range,body_length,upper_tail,lower_tail,body_length/range,upper_tail/range,lower_tail/range,return,local_vol,rolling_mean_volume,volume_spike
0,2025-03-27 13:45:00+00:00,221.389999,221.574997,220.910004,221.070007,445106,0.664993,0.319992,0.184998,0.160004,0.481196,0.278195,0.240609,-0.0014,0.001002,724236.5,0.614587
1,2025-03-27 13:50:00+00:00,221.080002,221.789902,220.975006,221.460007,516126,0.814896,0.380005,0.329895,0.104996,0.466323,0.404831,0.128846,0.001764,0.001295,682614.4,0.756102
2,2025-03-27 13:55:00+00:00,221.509995,222.074997,221.509995,221.554993,375238,0.565002,0.044998,0.520004,0.0,0.079642,0.920358,0.0,0.000429,0.001124,631385.0,0.594309


Consider 30 rolling pts for training data, and summarise the features across 30 - 5m candles to get a single points for training.

In [2]:
window = 10  # Considering 10 such 5m candles and collapsing into a single data that will represent this window
look_ahead_period = 3

#avg_columns = ['average_range','average_body_length','average_upper_tail','average_lower_tail','average_body_length/range','average_upper_tail/range','average_lower_tail/range','average_return']
extra_cols = ['Green_candle_perc' , 'breakout_up', 'breakout_down' , '2ndHalf_1stHalf_return_diff', 'trend_slope', 'cumulative_return', 'compounded_return', 'rolling_'+str(window) + '_vol' , 'coeff_of_variation' , 'volume_trend_slope', 'secondHalf_volume_average', 'price_to_volume_correlation' ]
x_train = pd.DataFrame(columns = extra_cols)
y_train = []
y_train_threshold = []
d = df.copy()
for i in tqdm(range(0, len(df) - window )):

  d = df[i:i+window].copy()
  d.reset_index(inplace=True)

  avg_future_closing_price = (df['Close'].values[i+window :  i+window + look_ahead_period]).mean()

  data = pd.DataFrame(columns = x_train.columns)

  # Average statistics
  #data.loc[i,avg_columns] = d.loc[:,['range','body_length','upper_tail','lower_tail','body_length/range','upper_tail/range','lower_tail/range','return']].mean().values

  # Price_action features
  data.loc[i,'Green_candle_perc'] = sum(d.loc[:window,['Open', 'Close']].apply(lambda x: x[1] > x[0], axis=1))/window
  data.loc[i, 'breakout_up'] = 1 if d.loc[window-1,'Close'] > max(d['Close'].values[:-1]) else 0
  data.loc[i, 'breakout_down'] = 1 if d.loc[window-1,'Close'] < min(d['Close'].values[:-1]) else 0

  # Momentum features
  data.loc[i, '2ndHalf_1stHalf_return_diff'] = (d.loc[(window/2):window,'return'].values - d.loc[:(window/2)-1,'return'].values ).mean()
  data.loc[i, 'trend_slope'] =   np.polyfit(np.arange(0,window), d['Close'].values, 1)[0]
  data.loc[i, 'cumulative_return'] =  (d.loc[:,'Close'][1]/ d.loc[:,'Close'][0] ) - 1
  data.loc[i, 'compounded_return'] = pow(reduce(operator.mul, list(1 + d['return'] ) ),1/window ) - 1

  # Volatility features
  data.loc[i, 'rolling_'+str(window) + '_vol'] = d.loc[:, 'return'].std()
  data.loc[i, 'coeff_of_variation'] =  d.loc[:, 'return'].std() / d.loc[:, 'return'].mean()

  # Volume features
  data.loc[i, 'volume_trend_slope'] = np.polyfit(np.arange(window/2), d.loc[(window)/2 :i+window, 'Volume'].values, 1)[0] /  d.loc[ (window)/2 :window, 'Volume'].mean()
  data.loc[i, 'secondHalf_volume_average'] = (d.loc[(window)/2 :window, 'Volume'].values).mean() / d.loc[:, 'Volume'].mean()

  # Price_to_Volume Correlation
  data.loc[i, 'price_to_volume_correlation'] = np.corrcoef( d.loc[:,'Close'], d.loc[:,'Volume'] )[0,1]

  x_train = pd.concat([x_train, data],axis=0, ignore_index=True)

  y_train.append( 1 if avg_future_closing_price > d.loc[window-1,'Close'] else 0)

  y_train_threshold.append( 1 if avg_future_closing_price > d.loc[window-1,'Close'] * 1.005 else -1 if avg_future_closing_price < d.loc[window-1,'Close'] * 0.995 else 0)
data

100%|██████████| 4665/4665 [00:36<00:00, 129.54it/s]


Unnamed: 0,Green_candle_perc,breakout_up,breakout_down,2ndHalf_1stHalf_return_diff,trend_slope,cumulative_return,compounded_return,rolling_10_vol,coeff_of_variation,volume_trend_slope,secondHalf_volume_average,price_to_volume_correlation
4664,0.4,1,0,0.000565,0.022267,9.7e-05,8.4e-05,0.000951,11.228608,0.219006,1.051782,0.338837


In [3]:
print(pd.Series(y_train).value_counts())
print(pd.Series(y_train_threshold).value_counts())
x_train.shape, len(y_train), len(y_train_threshold)

0    2337
1    2328
Name: count, dtype: int64
 0    4300
-1     201
 1     164
Name: count, dtype: int64


((4665, 12), 4665, 4665)

Check for multi-collinearity

In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Example: df is your DataFrame of features
x = add_constant(x_train)  # adds intercept term
x = x.astype(float)

vif_data = pd.DataFrame()
vif_data['feature'] = x.columns
vif_data['VIF'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]

print(vif_data)

                        feature        VIF
0                         const  37.426027
1             Green_candle_perc   1.508046
2                   breakout_up   1.289694
3                 breakout_down   1.341431
4   2ndHalf_1stHalf_return_diff   1.341921
5                   trend_slope   6.254567
6             cumulative_return   1.308649
7             compounded_return   6.984663
8                rolling_10_vol   1.019930
9            coeff_of_variation   1.000478
10           volume_trend_slope   1.145340
11    secondHalf_volume_average   1.095564
12  price_to_volume_correlation   1.082518


In [5]:
from sklearn.model_selection import train_test_split

X_tr, X_test ,Y_tr, Y_test = train_test_split(x_train, y_train_threshold, test_size = 0.3, shuffle=True, random_state = 42)

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier

x_train = x_train.astype(float)

strat_kFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model = LGBMClassifier(objective='multiclass', num_class=3, random_state=42)

x_tr, x_cv ,y_tr, y_cv = [], [], [], []

for train_idx, cv_idx in strat_kFold.split(X_tr, Y_tr):

  x_tr, x_cv = x_train.loc[train_idx,:], x_train.loc[cv_idx,:]
  y_tr, y_cv = [y_train_threshold[i] for i in train_idx], [y_train_threshold[i] for i in cv_idx]


  model.fit(x_tr, y_tr)

  y_pred = model.predict(x_cv)

  print("Accuracy:", accuracy_score(y_cv, y_pred))
  print(classification_report(y_cv, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2309
[LightGBM] [Info] Number of data points in the train set: 2612, number of used features: 12
[LightGBM] [Info] Start training from score -2.863925
[LightGBM] [Info] Start training from score -0.106552
[LightGBM] [Info] Start training from score -3.122939
Accuracy: 0.8759571209800919
              precision    recall  f1-score   support

          -1       0.30      0.08      0.12        39
           0       0.90      0.98      0.94       577
           1       0.23      0.08      0.12        37

    accuracy                           0.88       653
   macro avg       0.48      0.38      0.39       653
weighted avg       0.82      0.88      0.84       653

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to