# 1. Download SP500 Price Data

## 1.1 Import all necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplt
import statsmodels.api as sm
import pandas_datareader.data as web
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
from statsmodels.regression.rolling import RollingOLS

In [2]:
warnings.filterwarnings('ignore')

## 1.2 Download the SP500 Constituent Data

In [3]:
sp500 = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
print(type(sp500[0]))


<class 'pandas.core.frame.DataFrame'>


## 1.3 Clean the Data

- Isolate the ticker symbols
- Store the isoalted ticker symbols into a list

In [4]:
tickers_list = sp500[0]["Symbol"].tolist()

## 1.4 Take the start and end date to determine a range

- For this example, I will use 8 years

In [5]:
end_date = dt.datetime.today()
start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

## 1.5.1 Per ticker, download the necessary information from the start date to the end date

In [6]:
df = yf.download(tickers=tickers_list, 
                 start = start_date, 
                 end = end_date).stack()
df

[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2016-05-05 17:45:02.035843 -> 2024-05-03 17:45:02.035843)')
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')


Unnamed: 0_level_0,Price,Adj Close,Close,High,Low,Open,Volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-05,A,38.436127,40.959999,41.029999,40.509998,40.639999,1084000.0
2016-05-05,AAL,31.531178,32.810001,33.520000,32.730000,33.279999,11606100.0
2016-05-05,AAPL,21.438967,23.309999,23.517500,23.170000,23.500000,143562000.0
2016-05-05,ABBV,44.179386,62.340000,62.400002,61.090000,61.410000,8646200.0
2016-05-05,ABT,33.229511,38.500000,38.730000,38.099998,38.279999,16531000.0
...,...,...,...,...,...,...,...
2024-05-02,XYL,135.990005,135.990005,137.179993,132.020004,132.429993,2784800.0
2024-05-02,YUM,135.070007,135.070007,136.610001,134.759995,135.500000,2425400.0
2024-05-02,ZBH,118.459999,118.459999,123.959999,117.599998,121.989998,2931700.0
2024-05-02,ZBRA,312.709991,312.709991,315.170013,303.209991,312.230011,507800.0



# 1.5.2 Make Data More Readable

In [7]:
df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-05,A,38.436127,40.959999,41.029999,40.509998,40.639999,1084000.0
2016-05-05,AAL,31.531178,32.810001,33.520000,32.730000,33.279999,11606100.0
2016-05-05,AAPL,21.438967,23.309999,23.517500,23.170000,23.500000,143562000.0
2016-05-05,ABBV,44.179386,62.340000,62.400002,61.090000,61.410000,8646200.0
2016-05-05,ABT,33.229511,38.500000,38.730000,38.099998,38.279999,16531000.0
...,...,...,...,...,...,...,...
2024-05-02,XYL,135.990005,135.990005,137.179993,132.020004,132.429993,2784800.0
2024-05-02,YUM,135.070007,135.070007,136.610001,134.759995,135.500000,2425400.0
2024-05-02,ZBH,118.459999,118.459999,123.959999,117.599998,121.989998,2931700.0
2024-05-02,ZBRA,312.709991,312.709991,315.170013,303.209991,312.230011,507800.0


In [8]:
df.to_csv("x.csv")

# 2. Calculate Different Technical Indicators per Stock

- RSI
- Bollinger Bands
- MACD
- ATR
- Garman-Klass Volatility
- Dollar Volume

In [9]:
def compute_atr(tickers_list):
    atr = pandas_ta.atr(high = tickers_list['high'],
                        low = tickers_list['low'],
                        close = tickers_list['close'],
                        length = 14)
    return atr.sub(atr.mean()).div(atr.std())   

In [10]:


def compute_macd(close):
    macd = pandas_ta.macd(close=close, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())


In [11]:
df['rsi'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.rsi(close = x, length= 20))

df['bb_low'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,0])

df['bb_mid'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,1])

df['bb_upper'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,2])

df ['atr'] = df.groupby (level = 1, group_keys = False).apply(compute_atr)

df['ema12'] = df['close'].ewm(span = 12, adjust = False).mean()

df['ema26'] = df['close'].ewm(span = 26, adjust = False).mean()

df['macd'] = df['ema12'] - df['ema26']

df['dollar_volume'] = (df['adj close'] * df['volume']) / 1e6

df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)

In [12]:
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,rsi,bb_low,bb_mid,bb_upper,atr,ema12,ema26,macd,dollar_volume,garman_klass_vol
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016-05-05,A,38.436127,40.959999,41.029999,40.509998,40.639999,1084000.0,,,,,,40.959999,40.959999,0.000000,41.664761,-0.001120
2016-05-05,AAL,31.531178,32.810001,33.520000,32.730000,33.279999,11606100.0,,,,,,39.706153,40.356296,-0.650142,365.953999,-0.000841
2016-05-05,AAPL,21.438967,23.309999,23.517500,23.170000,23.500000,143562000.0,,,,,,37.183668,39.093607,-1.909939,3077.820945,-0.003144
2016-05-05,ABBV,44.179386,62.340000,62.400002,61.090000,61.410000,8646200.0,,,,,,41.053873,40.815562,0.238311,381.983808,-0.041668
2016-05-05,ABT,33.229511,38.500000,38.730000,38.099998,38.279999,16531000.0,,,,,,40.660969,40.644039,0.016931,549.317051,-0.007599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-02,XYL,135.990005,135.990005,137.179993,132.020004,132.429993,2784800.0,68.550712,4.836793,4.871289,4.905784,0.892935,118.974125,130.199460,-11.225335,378.704967,0.000463
2024-05-02,YUM,135.070007,135.070007,136.610001,134.759995,135.500000,2425400.0,43.375371,4.905459,4.938441,4.971423,0.682124,121.450414,130.560242,-9.109827,327.598796,0.000089
2024-05-02,ZBH,118.459999,118.459999,123.959999,117.599998,121.989998,2931700.0,38.892780,4.763220,4.816631,4.870043,-0.192443,120.990351,129.663927,-8.673577,347.289179,0.001054
2024-05-02,ZBRA,312.709991,312.709991,315.170013,303.209991,312.230011,507800.0,62.286364,5.578997,5.674417,5.769836,0.513987,150.485680,143.222895,7.262785,158.794134,0.000747


# 3. Aggregate on Monthly Level and Filter per Month the Most Liquid Stocks

In [13]:
last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_volume', ' volume', 'open',
                                                         'high', ' low', 'close']]


data = (pd.concat([df.unstack('ticker')['dollar_volume'].resample('M').mean().stack('ticker').to_frame('dollar_volume'),
        df.unstack()[last_cols].resample('M').last().stack('ticker')], 
        axis = 1)).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_volume,adj close,atr,bb_low,bb_mid,bb_upper,ema12,ema26,garman_klass_vol,low,macd,rsi,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-06-30,A,99.655612,41.736324,-1.169227,3.721142,3.769610,3.818078,60.551535,63.274646,-0.000631,43.590000,-2.723112,50.861527,2181100.0
2016-06-30,AAL,351.266378,27.206570,0.667327,3.253283,3.392594,3.531905,55.591298,60.684673,0.000214,27.490000,-5.093374,41.491302,11148300.0
2016-06-30,AAPL,3133.495532,21.981606,-1.202583,3.102071,3.143204,3.184338,50.715714,57.959882,-0.001853,23.575001,-7.244168,49.803542,143345600.0
2016-06-30,ABBV,418.335110,43.874649,-0.988771,3.744297,3.788811,3.833325,52.437912,58.252483,-0.045187,61.139999,-5.814572,52.484026,7920600.0
2016-06-30,ABT,338.847581,33.928635,-1.154670,3.488460,3.527148,3.565837,50.418233,56.849336,-0.006377,38.570000,-6.431103,56.070849,10425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-31,GEHC,324.076345,77.830002,1.484963,4.371661,4.453593,4.535524,144.054576,155.860922,0.000272,77.190002,-11.806346,37.078468,2995200.0
2024-05-31,KVUE,337.615761,19.120001,-1.293339,2.969300,3.015172,3.061045,98.572423,120.833905,0.000106,18.940001,-22.261483,42.941614,22724700.0
2024-05-31,VLTO,110.929116,93.849998,0.335532,4.462109,4.520273,4.578436,192.327913,201.056941,0.000189,93.110001,-8.729028,61.409364,1265900.0
2024-05-31,GEV,452.206660,153.830002,0.218104,4.811928,4.950923,5.089918,129.367714,146.370228,0.000302,151.658997,-17.002514,61.409384,2530200.0


## 3.1 Compute the 5-year rolling average of the dollar volume per stock

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_volume,adj close,atr,bb_low,bb_mid,bb_upper,ema12,ema26,garman_klass_vol,low,macd,rsi,volume,dollar_volume_new
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-06-30,A,99.655612,41.736324,-1.169227,3.721142,3.769610,3.818078,60.551535,63.274646,-0.000631,43.590000,-2.723112,50.861527,2181100.0,
2016-06-30,AAL,351.266378,27.206570,0.667327,3.253283,3.392594,3.531905,55.591298,60.684673,0.000214,27.490000,-5.093374,41.491302,11148300.0,
2016-06-30,AAPL,3133.495532,21.981606,-1.202583,3.102071,3.143204,3.184338,50.715714,57.959882,-0.001853,23.575001,-7.244168,49.803542,143345600.0,
2016-06-30,ABBV,418.335110,43.874649,-0.988771,3.744297,3.788811,3.833325,52.437912,58.252483,-0.045187,61.139999,-5.814572,52.484026,7920600.0,
2016-06-30,ABT,338.847581,33.928635,-1.154670,3.488460,3.527148,3.565837,50.418233,56.849336,-0.006377,38.570000,-6.431103,56.070849,10425000.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-31,GEHC,324.076345,77.830002,1.484963,4.371661,4.453593,4.535524,144.054576,155.860922,0.000272,77.190002,-11.806346,37.078468,2995200.0,
2024-05-31,KVUE,337.615761,19.120001,-1.293339,2.969300,3.015172,3.061045,98.572423,120.833905,0.000106,18.940001,-22.261483,42.941614,22724700.0,
2024-05-31,VLTO,110.929116,93.849998,0.335532,4.462109,4.520273,4.578436,192.327913,201.056941,0.000189,93.110001,-8.729028,61.409364,1265900.0,
2024-05-31,GEV,452.206660,153.830002,0.218104,4.811928,4.950923,5.089918,129.367714,146.370228,0.000302,151.658997,-17.002514,61.409384,2530200.0,


# 4. Calculate Monthly Returns for Different Time-Horizons

# 5. Download Fama-French Factors; Calculate Rolling Factor Betas Per Stock

# 6.  Per Month, Create a K-means Clustering Model to group similar assets based on their Features

# 7. Per Month, Select Assets Based on the Cluster and Build a Portfolio Based on That

# 8. Visualize the Portfolio Returns and Compare That with the SP500 