# 1. Download SP500 Price Data

## 1.1 Import all necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplt
import statsmodels.api as sm
import pandas_datareader.data as web
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
from statsmodels.regression.rolling import RollingOLS

In [3]:
warnings.filterwarnings('ignore')

## 1.2 Download the SP500 Constituent Data

In [4]:
sp500 = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
print(type(sp500[0]))


<class 'pandas.core.frame.DataFrame'>


## 1.3 Clean the Data

- Isolate the ticker symbols
- Store the isoalted ticker symbols into a list

In [5]:
tickers_list = sp500[0]["Symbol"].tolist()

## 1.4 Take the start and end date to determine a range

- For this example, I will use 8 years

In [6]:
end_date = dt.datetime.today()
start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

## 1.5.1 Per ticker, download the necessary information from the start date to the end date

In [7]:
df = yf.download(tickers=tickers_list, 
                 start = start_date, 
                 end = end_date).stack()
df

[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2016-04-30 11:59:28.612588 -> 2024-04-28 11:59:28.612588)')
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')


Unnamed: 0_level_0,Price,Adj Close,Close,High,Low,Open,Volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-02,A,39.139919,41.709999,41.750000,41.130001,41.200001,1285300.0
2016-05-02,AAL,33.088032,34.430000,35.169998,34.400002,34.990002,8829800.0
2016-05-02,AAPL,21.400639,23.410000,23.520000,23.100000,23.492500,192640400.0
2016-05-02,ABBV,43.541569,61.439999,62.000000,60.700001,60.790001,9512000.0
2016-05-02,ABT,33.531590,38.849998,39.119999,38.419998,38.849998,21292300.0
...,...,...,...,...,...,...,...
2024-04-26,XYL,132.399994,132.399994,132.649994,129.789993,130.410004,1112300.0
2024-04-26,YUM,141.770004,141.770004,142.789993,140.639999,140.830002,1979100.0
2024-04-26,ZBH,119.349998,119.349998,119.940002,118.660004,119.489998,1099200.0
2024-04-26,ZBRA,297.209991,297.209991,299.149994,292.779999,293.500000,461400.0


# 1.5.2 Make Data More Readable

In [8]:
df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-02,A,39.139919,41.709999,41.750000,41.130001,41.200001,1285300.0
2016-05-02,AAL,33.088032,34.430000,35.169998,34.400002,34.990002,8829800.0
2016-05-02,AAPL,21.400639,23.410000,23.520000,23.100000,23.492500,192640400.0
2016-05-02,ABBV,43.541569,61.439999,62.000000,60.700001,60.790001,9512000.0
2016-05-02,ABT,33.531590,38.849998,39.119999,38.419998,38.849998,21292300.0
...,...,...,...,...,...,...,...
2024-04-26,XYL,132.399994,132.399994,132.649994,129.789993,130.410004,1112300.0
2024-04-26,YUM,141.770004,141.770004,142.789993,140.639999,140.830002,1979100.0
2024-04-26,ZBH,119.349998,119.349998,119.940002,118.660004,119.489998,1099200.0
2024-04-26,ZBRA,297.209991,297.209991,299.149994,292.779999,293.500000,461400.0


# 2. Calculate Different Technical Indicators per Stock

- RSI
- Bollinger Bands
- MACD
- ATR
- Garman-Klass Volatility
- Dollar Volume

In [9]:
df['rsi'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.rsi(close = x, length= 20))

df['bb_low'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,0])

df['bb_mid'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,1])

df['bb_upper'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,2])



df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)

In [10]:
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_mid,bb_upper
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-05-02,A,39.139919,41.709999,41.750000,41.130001,41.200001,1285300.0,-0.000904,,,,
2016-05-02,AAL,33.088032,34.430000,35.169998,34.400002,34.990002,8829800.0,-0.000962,,,,
2016-05-02,AAPL,21.400639,23.410000,23.520000,23.100000,23.492500,192640400.0,-0.003197,,,,
2016-05-02,ABBV,43.541569,61.439999,62.000000,60.700001,60.790001,9512000.0,-0.042794,,,,
2016-05-02,ABT,33.531590,38.849998,39.119999,38.419998,38.849998,21292300.0,-0.008209,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-26,XYL,132.399994,132.399994,132.649994,129.789993,130.410004,1112300.0,0.000149,64.817989,4.842647,4.864700,4.886754
2024-04-26,YUM,141.770004,141.770004,142.789993,140.639999,140.830002,1979100.0,0.000098,62.949891,4.912565,4.939073,4.965581
2024-04-26,ZBH,119.349998,119.349998,119.940002,118.660004,119.489998,1099200.0,0.000057,38.675346,4.770082,4.830899,4.891715
2024-04-26,ZBRA,297.209991,297.209991,299.149994,292.779999,293.500000,461400.0,0.000171,57.265359,5.589446,5.665682,5.741919


# 3. Aggregate on Monthly Level and Filter per Month the Most Liquid Stocks

# 4. Calculate Monthly Returns for Different Time-Horizons

# 5. Download Fama-French Factors; Calculate Rolling Factor Betas Per Stock

# 6.  Per Month, Create a K-means Clustering Model to group similar assets based on their Features

# 7. Per Month, Select Assets Based on the Cluster and Build a Portfolio Based on That

# 8. Visualize the Portfolio Returns and Compare That with the SP500 