In [8]:
# Algorithmic Trader Python 

from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
warnings.filterwarnings('ignore')



In [9]:
# load data and sanitize 

sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500['Symbol'] = sp500['Symbol'].replace('.','-')

symbols_list = sp500['Symbol'].unique().tolist()

end_date = '2024-01-08'

start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

df = yf.download(tickers=symbols_list,
                start=start_date,
                end=end_date).stack()

df.index.names = ['date','ticker']

df.columns = df.columns.str.lower()



[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2016-01-10 00:00:00 -> 2024-01-08)')


In [10]:
# calculate features and technical indicators

# Garman-Klass Volatility
df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)

# RSI
df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

# Bollinger Bands
df['bb_low'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,0])
                                                          
df['bb_mid'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,1])
                                                          
df['bb_high'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,2])

def compute_atr(stock_data):
    atr = pandas_ta.atr(high=stock_data['high'],
                        low=stock_data['low'],
                        close=stock_data['close'],
                        length=14)
    return atr.sub(atr.mean()).div(atr.std())

# ATR
df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

def compute_macd(close):
    macd = pandas_ta.macd(close=close, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

# MACD
df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)

# Dollar volume
df['dollar_volume'] = (df['adj close']*df['volume'])/1e6

df



Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,dollar_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-01-11,A,35.557140,37.939999,38.900002,37.410000,38.709999,2818400.0,-0.002025,,,,,,,100.214244
2016-01-11,AAL,39.257923,41.080002,41.200001,39.900002,40.560001,15877500.0,0.000103,,,,,,,623.317674
2016-01-11,AAPL,22.425259,24.632500,24.764999,24.334999,24.742500,198957600.0,-0.003582,,,,,,,4461.675638
2016-01-11,ABBV,38.137871,53.880001,55.980000,52.830002,55.860001,10483300.0,-0.054587,,,,,,,399.810741
2016-01-11,ABT,35.062431,40.730000,40.900002,40.099998,40.770000,7839700.0,-0.008591,,,,,,,274.878943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-05,YUM,128.339996,128.339996,129.100006,127.440002,128.850006,1408800.0,0.000078,51.979174,4.837772,4.867868,4.897964,-0.073669,0.354146,180.805387
2024-01-05,ZBH,119.980003,119.980003,121.300003,119.690002,119.720001,1391000.0,0.000087,61.239035,4.765302,4.791236,4.817170,-1.062166,0.746261,166.892185
2024-01-05,ZBRA,252.690002,252.690002,257.160004,252.149994,252.210007,293500.0,0.000192,52.779791,5.451475,5.561812,5.672149,-0.071003,0.815839,74.164516
2024-01-05,ZION,44.049999,44.049999,44.139999,41.770000,41.919998,2552400.0,0.000574,60.926847,3.668305,3.775123,3.881942,0.379873,1.535518,112.433218


In [12]:
# Aggregate to monthly level and filter top 150 most liquid stocks for each month

last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_volume', 'volume', 'open',
                                                          'high', 'low', 'close']]

data = (pd.concat([df.unstack('ticker')['dollar_volume'].resample('M').mean().stack('ticker').to_frame('dollar_volume'),
                   df.unstack()[last_cols].resample('M').last().stack('ticker')],
                  axis=1)).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_volume,adj close,atr,bb_high,bb_low,bb_mid,garman_klass_vol,macd,rsi
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-02-29,A,91.918054,35.004200,-0.941847,3.620122,3.522492,3.571307,-0.001914,-0.160794,50.730036
2016-02-29,AAL,338.977823,39.288383,0.978469,3.729458,3.547833,3.638646,-0.000319,0.621469,56.744053
2016-02-29,AAPL,3548.073957,22.125889,-1.024249,3.155194,3.106253,3.130723,-0.003013,-0.309962,50.633140
2016-02-29,ABBV,357.006440,39.060566,-0.483842,3.734669,3.638599,3.686634,-0.049102,-0.335900,49.898459
2016-02-29,ABT,245.865712,33.559803,-0.796850,3.570935,3.483741,3.527338,-0.009363,-0.345717,48.856125
...,...,...,...,...,...,...,...,...,...,...
2024-01-31,ABNB,531.243402,135.979996,-1.049542,5.013877,4.887410,4.950643,0.000144,0.138212,51.351696
2024-01-31,CEG,168.186979,116.239998,0.125507,4.804735,4.724048,4.764392,0.000204,-0.918562,48.774127
2024-01-31,GEHC,180.872613,76.620003,-1.037387,4.410788,4.251088,4.330938,0.000270,0.715577,61.023250
2024-01-31,KVUE,510.430410,21.350000,-1.326498,3.124828,3.069709,3.097269,0.000142,1.695743,56.680129


In [None]:
data['dollar_volume'] = (data.loc[:, 'dollar_volume'].unstack('ticker').rolling(5*12, min_periods=12).mean().stack())

data['dollar_vol_rank'] = (data.groupby('date')['dollar_volume'].rank(ascending=False))

data = data[data['dollar_vol_rank']<150].drop(['dollar_volume', 'dollar_vol_rank'], axis=1)

data