# 1. Download SP500 Price Data

## 1.1 Import all necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplt
import statsmodels.api as sm
import pandas_datareader.data as web
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
from statsmodels.regression.rolling import RollingOLS

In [3]:
warnings.filterwarnings('ignore')

## 1.2 Download the SP500 Constituent Data

In [4]:
sp500 = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
print(type(sp500[0]))


<class 'pandas.core.frame.DataFrame'>


## 1.3 Clean the Data

- Isolate the ticker symbols
- Store the isoalted ticker symbols into a list

In [5]:
tickers_list = sp500[0]["Symbol"].tolist()

## 1.4 Take the start and end date to determine a range

- For this example, I will use 8 years

In [6]:
end_date = dt.datetime.today()
start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

## 1.5.1 Per ticker, download the necessary information from the start date to the end date

In [36]:
df = yf.download(tickers=tickers_list, 
                 start = start_date, 
                 end = end_date).stack()
df

[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2016-04-29 14:11:55.915228 -> 2024-04-27 14:11:55.915228)')


Unnamed: 0_level_0,Price,Adj Close,Close,High,Low,Open,Volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-04-29,A,38.398594,40.919998,41.040001,40.259998,40.919998,2027900.0
2016-04-29,AAL,33.241791,34.689999,35.830002,34.099998,35.810001,17807200.0
2016-04-29,AAPL,21.423494,23.434999,23.680000,23.127501,23.497499,274126000.0
2016-04-29,ABBV,43.229752,61.000000,61.450001,60.480000,61.029999,13052700.0
2016-04-29,ABT,33.574753,38.900002,40.380001,38.580002,40.369999,30646800.0
...,...,...,...,...,...,...,...
2024-04-26,XYL,132.399994,132.399994,132.649994,129.789993,130.410004,1112300.0
2024-04-26,YUM,141.770004,141.770004,142.789993,140.639999,140.830002,1979100.0
2024-04-26,ZBH,119.349998,119.349998,119.940002,118.660004,119.489998,1099200.0
2024-04-26,ZBRA,297.209991,297.209991,299.149994,292.779999,293.500000,461400.0


# 1.5.2 Make Data More Readable

In [38]:
df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-04-29,A,38.398594,40.919998,41.040001,40.259998,40.919998,2027900.0
2016-04-29,AAL,33.241791,34.689999,35.830002,34.099998,35.810001,17807200.0
2016-04-29,AAPL,21.423494,23.434999,23.680000,23.127501,23.497499,274126000.0
2016-04-29,ABBV,43.229752,61.000000,61.450001,60.480000,61.029999,13052700.0
2016-04-29,ABT,33.574753,38.900002,40.380001,38.580002,40.369999,30646800.0
...,...,...,...,...,...,...,...
2024-04-26,XYL,132.399994,132.399994,132.649994,129.789993,130.410004,1112300.0
2024-04-26,YUM,141.770004,141.770004,142.789993,140.639999,140.830002,1979100.0
2024-04-26,ZBH,119.349998,119.349998,119.940002,118.660004,119.489998,1099200.0
2024-04-26,ZBRA,297.209991,297.209991,299.149994,292.779999,293.500000,461400.0


# 2. Calculate Different Technical Indicators per Stock

- RSI
- Bollinger Bands
- MACD
- ATR
- Garman-Klass Volatility
- Dollar Volume

In [42]:
df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)

df['rsi'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.rsi(close = x, length= 20))

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-04-29,A,38.398594,40.919998,41.040001,40.259998,40.919998,2027900.0,-0.001378,
2016-04-29,AAL,33.241791,34.689999,35.830002,34.099998,35.810001,17807200.0,-0.000915,
2016-04-29,AAPL,21.423494,23.434999,23.680000,23.127501,23.497499,274126000.0,-0.003020,
2016-04-29,ABBV,43.229752,61.000000,61.450001,60.480000,61.029999,13052700.0,-0.045809,
2016-04-29,ABT,33.574753,38.900002,40.380001,38.580002,40.369999,30646800.0,-0.012083,
...,...,...,...,...,...,...,...,...,...
2024-04-26,XYL,132.399994,132.399994,132.649994,129.789993,130.410004,1112300.0,0.000149,64.817989
2024-04-26,YUM,141.770004,141.770004,142.789993,140.639999,140.830002,1979100.0,0.000098,62.949891
2024-04-26,ZBH,119.349998,119.349998,119.940002,118.660004,119.489998,1099200.0,0.000057,38.675346
2024-04-26,ZBRA,297.209991,297.209991,299.149994,292.779999,293.500000,461400.0,0.000171,57.265359


# 3. Aggregate on Monthly Level and Filter per Month the Most Liquid Stocks

# 4. Calculate Monthly Returns for Different Time-Horizons

# 5. Download Fama-French Factors; Calculate Rolling Factor Betas Per Stock

# 6.  Per Month, Create a K-means Clustering Model to group similar assets based on their Features

# 7. Per Month, Select Assets Based on the Cluster and Build a Portfolio Based on That

# 8. Visualize the Portfolio Returns and Compare That with the SP500 