# 1. Download SP500 Price Data

## 1.1 Import all necessary libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplt
import statsmodels.api as sm
import pandas_datareader.data as web
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
from statsmodels.regression.rolling import RollingOLS

In [9]:
warnings.filterwarnings('ignore')

## 1.2 Download the SP500 Constituent Data

In [10]:
sp500 = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
print(type(sp500[0]))


<class 'pandas.core.frame.DataFrame'>


## 1.3 Clean the Data

- Isolate the ticker symbols
- Store the isoalted ticker symbols into a list

In [11]:
tickers_list = sp500[0]["Symbol"].tolist()

## 1.4 Take the start and end date to determine a range

- For this example, I will use 8 years

In [23]:
end_date = dt.datetime.today()
start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

## 1.5.1 Per ticker, download the necessary information from the start date to the end date

In [24]:
df = yf.download(tickers=tickers_list, 
                 start = start_date, 
                 end = end_date).stack()
df

[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2016-05-05 16:08:34.621139 -> 2024-05-03 16:08:34.621139)')
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')


Unnamed: 0_level_0,Price,Adj Close,Close,High,Low,Open,Volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-05,A,38.436123,40.959999,41.029999,40.509998,40.639999,1084000.0
2016-05-05,AAL,31.531179,32.810001,33.520000,32.730000,33.279999,11606100.0
2016-05-05,AAPL,21.438957,23.309999,23.517500,23.170000,23.500000,143562000.0
2016-05-05,ABBV,44.179379,62.340000,62.400002,61.090000,61.410000,8646200.0
2016-05-05,ABT,33.229504,38.500000,38.730000,38.099998,38.279999,16531000.0
...,...,...,...,...,...,...,...
2024-05-02,XYL,135.990005,135.990005,137.179993,132.020004,132.429993,2784800.0
2024-05-02,YUM,135.070007,135.070007,136.610001,134.759995,135.500000,2425400.0
2024-05-02,ZBH,118.459999,118.459999,123.959999,117.599998,121.989998,2931700.0
2024-05-02,ZBRA,312.709991,312.709991,315.170013,303.209991,312.230011,507800.0



# 1.5.2 Make Data More Readable

In [25]:
df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-05,A,38.436123,40.959999,41.029999,40.509998,40.639999,1084000.0
2016-05-05,AAL,31.531179,32.810001,33.520000,32.730000,33.279999,11606100.0
2016-05-05,AAPL,21.438957,23.309999,23.517500,23.170000,23.500000,143562000.0
2016-05-05,ABBV,44.179379,62.340000,62.400002,61.090000,61.410000,8646200.0
2016-05-05,ABT,33.229504,38.500000,38.730000,38.099998,38.279999,16531000.0
...,...,...,...,...,...,...,...
2024-05-02,XYL,135.990005,135.990005,137.179993,132.020004,132.429993,2784800.0
2024-05-02,YUM,135.070007,135.070007,136.610001,134.759995,135.500000,2425400.0
2024-05-02,ZBH,118.459999,118.459999,123.959999,117.599998,121.989998,2931700.0
2024-05-02,ZBRA,312.709991,312.709991,315.170013,303.209991,312.230011,507800.0


# 2. Calculate Different Technical Indicators per Stock

- RSI
- Bollinger Bands
- MACD
- ATR
- Garman-Klass Volatility
- Dollar Volume

In [26]:
def compute_atr(tickers_list):
    atr = pandas_ta.atr(high = tickers_list['high'],
                        low = tickers_list['low'],
                        close = tickers_list['close'],
                        length = 14)
    return atr.sub(atr.mean()).div(atr.std())   

In [27]:
def compute_macd(close):
    macd = pandas_ta.macd(close=close, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())


In [28]:
df['rsi'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.rsi(close = x, length= 20))

df['bb_low'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,0])

df['bb_mid'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,1])

df['bb_upper'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:,2])

df ['atr'] = df.groupby (level = 1, group_keys = False).apply(compute_atr)

df['macd'] = df.groupby(level = 1, group_keys = False).apply(compute_macd)

df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)

AttributeError: 'NoneType' object has no attribute 'iloc'

In [None]:
df

# 3. Aggregate on Monthly Level and Filter per Month the Most Liquid Stocks

# 4. Calculate Monthly Returns for Different Time-Horizons

# 5. Download Fama-French Factors; Calculate Rolling Factor Betas Per Stock

# 6.  Per Month, Create a K-means Clustering Model to group similar assets based on their Features

# 7. Per Month, Select Assets Based on the Cluster and Build a Portfolio Based on That

# 8. Visualize the Portfolio Returns and Compare That with the SP500 