In [1]:
# Unsupervised Learning Trading Strategy

# 1. Download/Load SP500 stocks prices data.
# 2. Calculate different features and indicators on each stock.
# 3. Aggregate on monthly level and filter top 150 most liquid stocks.
# 4. Calculate Monthly Returns for different time-horizons.
# 5. Download Fama-French Factors and Calculate Rolling Factor Betas.
# 6. For each month fit a K-Means Clustering Algorithm to group similar assets based on their features.
# 7. For each month select assets based on the cluster and form a portfolio based on Efficient Frontier max sharpe ratio optimization.
# 8. Visualize Portfolio returns and compare to SP500 returns.

# pandas, numpy, matplotlib, statsmodels, pandas_datareader, datetime, yfinance, sklearn, PyPortfolioOpt

In [5]:
!pip install pandas-ta

Collecting pandas-ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
     ---------------------------------------- 0.0/115.1 kB ? eta -:--:--
     ------ ------------------------------ 20.5/115.1 kB 320.0 kB/s eta 0:00:01
     ----------------------- ------------- 71.7/115.1 kB 787.7 kB/s eta 0:00:01
     -------------------------------------- 115.1/115.1 kB 1.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pandas-ta
  Building wheel for pandas-ta (setup.py): started
  Building wheel for pandas-ta (setup.py): finished with status 'done'
  Created wheel for pandas-ta: filename=pandas_ta-0.3.14b0-py3-none-any.whl size=218928 sha256=3f98782561b8d22c37df2c276f91fe43fdf145bcae17b2530a17f030f829f275
  Stored in directory: c:\users\lenovo t490s\appdata\local\pip\cache\wheels\fd\ed\18\2a12fd1b7906c63efca6accb351929f2c7f6bbc674e1c0ba5d
Successfully built pandas-ta
Installing collecte

In [127]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings

In [129]:
#1. Download/Load SP500 stocks prices data.

In [155]:
warnings.filterwarnings('ignore')

sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')

symbols_list = sp500['Symbol'].unique().tolist()

end_date = '2024-12-05'

start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

df = yf.download(tickers=symbols_list,
                 start=start_date,
                 end=end_date).stack()

df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

# Convert 'date' index to show only date (remove time and timezone)
# df.index = pd.MultiIndex.from_tuples(
#     [(d.date(), t) for d, t in df.index],
#     names=['date', 'ticker']
# )

df

[*********************100%***********************]  503 of 503 completed


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-12-07 00:00:00+00:00,A,42.288265,44.990002,44.990002,44.110001,44.560001,1815200.0
2016-12-07 00:00:00+00:00,AAPL,25.706537,27.757500,27.797501,27.290001,27.315001,119994800.0
2016-12-07 00:00:00+00:00,ABBV,42.548874,59.990002,60.650002,58.799999,60.259998,13095600.0
2016-12-07 00:00:00+00:00,ABT,33.294872,38.480000,38.490002,37.419998,38.130001,10888600.0
2016-12-07 00:00:00+00:00,ACGL,26.904089,28.293333,28.306667,28.013332,28.083332,858000.0
...,...,...,...,...,...,...,...
2024-12-04 00:00:00+00:00,XYL,127.540001,127.540001,128.419998,127.269997,127.940002,1157300.0
2024-12-04 00:00:00+00:00,YUM,138.160004,138.160004,138.500000,137.070007,137.490005,1522700.0
2024-12-04 00:00:00+00:00,ZBH,110.620003,110.620003,111.980003,110.269997,110.519997,1438200.0
2024-12-04 00:00:00+00:00,ZBRA,411.940002,411.940002,412.940002,407.769989,408.709991,230100.0


In [157]:
#2. Calculate features and technical indicators for each stock.
# Garman-Klass Volatility
# RSI
# Bollinger Bands
# ATR
# MACD
# Dollar Volume

In [159]:
df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)

df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

df['bb_low'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,0])
                                                          
df['bb_mid'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,1])
                                                          
df['bb_high'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,2])

def compute_atr(stock_data):
    atr = pandas_ta.atr(high=stock_data['high'],
                        low=stock_data['low'],
                        close=stock_data['close'],
                        length=14)
    return atr.sub(atr.mean()).div(atr.std())

df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

def compute_macd(close):
    macd = pandas_ta.macd(close=close, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)

df['dollar_volume'] = (df['adj close']*df['volume'])/1e6

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,dollar_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-12-07 00:00:00+00:00,A,42.288265,44.990002,44.990002,44.110001,44.560001,1815200.0,-0.000863,,,,,,,76.761659
2016-12-07 00:00:00+00:00,AAPL,25.706537,27.757500,27.797501,27.290001,27.315001,119994800.0,-0.001253,,,,,,,3084.650796
2016-12-07 00:00:00+00:00,ABBV,42.548874,59.990002,60.650002,58.799999,60.259998,13095600.0,-0.046306,,,,,,,557.203033
2016-12-07 00:00:00+00:00,ABT,33.294872,38.480000,38.490002,37.419998,38.130001,10888600.0,-0.006705,,,,,,,362.534546
2016-12-07 00:00:00+00:00,ACGL,26.904089,28.293333,28.306667,28.013332,28.083332,858000.0,-0.000657,,,,,,,23.083708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-04 00:00:00+00:00,XYL,127.540001,127.540001,128.419998,127.269997,127.940002,1157300.0,0.000037,51.175950,4.798319,4.832672,4.867025,0.579851,-0.247383,147.602043
2024-12-04 00:00:00+00:00,YUM,138.160004,138.160004,138.500000,137.070007,137.490005,1522700.0,0.000045,57.620501,4.884074,4.916817,4.949560,0.391878,0.662421,210.376238
2024-12-04 00:00:00+00:00,ZBH,110.620003,110.620003,111.980003,110.269997,110.519997,1438200.0,0.000118,54.012327,4.689100,4.712716,4.736333,-0.729156,0.410483,159.093688
2024-12-04 00:00:00+00:00,ZBRA,411.940002,411.940002,412.940002,407.769989,408.709991,230100.0,0.000055,66.110686,5.946108,5.989031,6.031955,-0.219933,0.884988,94.787395


In [161]:
#3. Aggregate to monthly level and filter top 150 most liquid stocks for each month.
# To reduce training time and experiment with features and strategies, we convert the business-daily data to month-end frequency.

In [163]:
last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_volume', 'volume', 'open', 'high', 'low', 'close']]

data = (pd.concat([df.unstack('ticker')['dollar_volume'].resample('M').mean().stack('ticker').to_frame('dollar_volume'),
                   df.unstack()[last_cols].resample('M').last().stack('ticker')],
                  axis=1)).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_volume,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-31 00:00:00+00:00,A,102.349486,46.162560,-0.000339,61.302703,3.803595,3.833158,3.862721,-1.393466,0.152586
2017-01-31 00:00:00+00:00,AAPL,3120.691359,28.095907,-0.002159,73.638511,3.327728,3.356231,3.384733,-1.459895,-0.147605
2017-01-31 00:00:00+00:00,ABBV,344.889481,43.780560,-0.038072,49.752534,3.779777,3.810983,3.842188,-1.463515,-0.333496
2017-01-31 00:00:00+00:00,ABT,371.924577,36.375000,-0.004523,73.900535,3.557928,3.591215,3.624503,-1.421809,0.276686
2017-01-31 00:00:00+00:00,ACGL,29.688113,28.003962,-0.001044,59.526822,3.331300,3.355690,3.380080,-1.173415,-0.089095
...,...,...,...,...,...,...,...,...,...,...
2024-12-31 00:00:00+00:00,XYL,153.968072,127.540001,0.000037,51.175950,4.798319,4.832672,4.867025,0.579851,-0.247383
2024-12-31 00:00:00+00:00,YUM,233.261120,138.160004,0.000045,57.620501,4.884074,4.916817,4.949560,0.391878,0.662421
2024-12-31 00:00:00+00:00,ZBH,153.370169,110.620003,0.000118,54.012327,4.689100,4.712716,4.736333,-0.729156,0.410483
2024-12-31 00:00:00+00:00,ZBRA,113.083748,411.940002,0.000055,66.110686,5.946108,5.989031,6.031955,-0.219933,0.884988


In [165]:
# Calculate 5-year rolling average of dollar volume for each stocks before filtering.

data['dollar_volume'] = (data.loc[:, 'dollar_volume'].unstack('ticker').rolling(5*12, min_periods=12).mean().stack())

data['dollar_vol_rank'] = (data.groupby('date')['dollar_volume'].rank(ascending=False))

data = data[data['dollar_vol_rank']<150].drop(['dollar_volume', 'dollar_vol_rank'], axis=1)

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-12-31 00:00:00+00:00,AAPL,39.811539,-0.001775,47.784605,3.698682,3.724109,3.749536,-1.267751,-0.271406
2017-12-31 00:00:00+00:00,ABBV,71.096039,-0.039988,55.000914,4.253741,4.278529,4.303317,-0.928835,0.084570
2017-12-31 00:00:00+00:00,ABT,50.525517,-0.006559,59.877822,3.884984,3.922018,3.959052,-1.383264,0.162317
2017-12-31 00:00:00+00:00,ACN,137.983582,-0.004850,65.003498,4.891671,4.921786,4.951901,-1.169710,0.223477
2017-12-31 00:00:00+00:00,ADBE,175.240005,0.000056,51.008417,5.137116,5.167383,5.197650,-1.365585,-0.180396
...,...,...,...,...,...,...,...,...,...
2024-12-31 00:00:00+00:00,VZ,42.520000,0.000201,49.844828,3.702627,3.767658,3.832689,-0.247612,0.959244
2024-12-31 00:00:00+00:00,WFC,73.059998,0.000150,60.118814,4.259402,4.316143,4.372884,1.787578,2.697188
2024-12-31 00:00:00+00:00,WMT,94.449997,0.000079,78.263378,4.404536,4.486676,4.568816,2.481566,4.621761
2024-12-31 00:00:00+00:00,XOM,114.279999,0.000210,39.841787,4.761482,4.789825,4.818168,0.396881,-0.531204


In [167]:
# 4. Calculate Monthly Returns for different time horizons as features.
# To capture time series dynamics that reflect, for example, momentum patterns, 
# we compute historical returns using the method .pct_change(lag), that is, returns over various monthly periods as identified by lags.

def calculate_returns(df):

    outlier_cutoff = 0.005

    lags = [1, 2, 3, 6, 9, 12]

    for lag in lags:

        df[f'return_{lag}m'] = (df['adj close']
                              .pct_change(lag)
                              .pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff),
                                                     upper=x.quantile(1-outlier_cutoff)))
                              .add(1)
                              .pow(1/lag)
                              .sub(1))
    return df
    
    
data = data.groupby(level=1, group_keys=False).apply(calculate_returns).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-12-31 00:00:00+00:00,AAPL,37.665623,-0.000835,37.072878,3.585046,3.697134,3.809221,-0.681157,-1.247614,-0.116698,-0.132708,-0.101117,-0.025181,-0.005637,-0.004607
2018-12-31 00:00:00+00:00,ABBV,70.416023,-0.026539,57.957325,4.152361,4.223280,4.294199,0.731986,-0.094307,-0.022064,0.088210,-0.004972,0.002641,0.000530,-0.000801
2018-12-31 00:00:00+00:00,ABT,65.209160,-0.003189,55.190172,4.111693,4.169437,4.227180,0.303489,-0.324267,-0.023228,0.024291,-0.003340,0.030303,0.022655,0.021488
2018-12-31 00:00:00+00:00,ACN,129.389038,-0.002485,36.633860,4.807611,4.940240,5.072869,-0.158300,-1.467880,-0.142901,-0.054156,-0.057936,-0.022971,-0.007415,-0.005345
2018-12-31 00:00:00+00:00,ADBE,226.240005,0.000161,46.330419,5.329066,5.452557,5.576047,-0.049695,-0.756427,-0.098250,-0.040535,-0.057180,-0.012388,0.005118,0.021515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 00:00:00+00:00,VRTX,463.980011,0.000371,47.012568,6.078846,6.160841,6.242836,2.307121,-1.453169,-0.008865,-0.012686,-0.000789,-0.001693,0.011660,0.011002
2024-12-31 00:00:00+00:00,VZ,42.520000,0.000201,49.844828,3.702627,3.767658,3.832689,-0.247612,0.959244,-0.041046,0.004618,-0.012980,0.010436,0.006806,0.015521
2024-12-31 00:00:00+00:00,WFC,73.059998,0.000150,60.118814,4.259402,4.316143,4.372884,1.787578,2.697188,-0.040830,0.063894,0.091611,0.037436,0.028238,0.035712
2024-12-31 00:00:00+00:00,WMT,94.449997,0.000079,78.263378,4.404536,4.486676,4.568816,2.481566,4.621761,0.021081,0.073560,0.053626,0.057542,0.052112,0.050775


In [171]:
#Convert 'date' index to show only date (remove time and timezone)
data.index = pd.MultiIndex.from_tuples(
    [(d.date(), t) for d, t in data.index],
    names=['date', 'ticker']
)

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-12-31,AAPL,37.665623,-0.000835,37.072878,3.585046,3.697134,3.809221,-0.681157,-1.247614,-0.116698,-0.132708,-0.101117,-0.025181,-0.005637,-0.004607
2018-12-31,ABBV,70.416023,-0.026539,57.957325,4.152361,4.223280,4.294199,0.731986,-0.094307,-0.022064,0.088210,-0.004972,0.002641,0.000530,-0.000801
2018-12-31,ABT,65.209160,-0.003189,55.190172,4.111693,4.169437,4.227180,0.303489,-0.324267,-0.023228,0.024291,-0.003340,0.030303,0.022655,0.021488
2018-12-31,ACN,129.389038,-0.002485,36.633860,4.807611,4.940240,5.072869,-0.158300,-1.467880,-0.142901,-0.054156,-0.057936,-0.022971,-0.007415,-0.005345
2018-12-31,ADBE,226.240005,0.000161,46.330419,5.329066,5.452557,5.576047,-0.049695,-0.756427,-0.098250,-0.040535,-0.057180,-0.012388,0.005118,0.021515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31,VRTX,463.980011,0.000371,47.012568,6.078846,6.160841,6.242836,2.307121,-1.453169,-0.008865,-0.012686,-0.000789,-0.001693,0.011660,0.011002
2024-12-31,VZ,42.520000,0.000201,49.844828,3.702627,3.767658,3.832689,-0.247612,0.959244,-0.041046,0.004618,-0.012980,0.010436,0.006806,0.015521
2024-12-31,WFC,73.059998,0.000150,60.118814,4.259402,4.316143,4.372884,1.787578,2.697188,-0.040830,0.063894,0.091611,0.037436,0.028238,0.035712
2024-12-31,WMT,94.449997,0.000079,78.263378,4.404536,4.486676,4.568816,2.481566,4.621761,0.021081,0.073560,0.053626,0.057542,0.052112,0.050775


In [173]:
# 5. Download Fama-French Factors and Calculate Rolling Factor Betas.

# We will introduce the Fama—French data to estimate the exposure of assets to common risk factors using linear regression.

# The five Fama—French factors, namely market risk, size, value, operating profitability, 
# and investment have been shown empirically to explain asset returns and are commonly used to assess the risk/return profile of portfolios. Hence, it is natural to include past factor exposures as financial features in models.

# We can access the historical factor returns using the pandas-datareader 
# and estimate historical exposures using the RollingOLS rolling linear regression.

factor_data = web.DataReader('F-F_Research_Data_5_Factors_2x3',
                               'famafrench',
                               start='2019')[0].drop('RF', axis=1)

factor_data.index = factor_data.index.to_timestamp()

factor_data = factor_data.resample('M').last().div(100)

factor_data.index.name = 'date'

factor_data = factor_data.join(data['return_1m']).sort_index()

factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-31,AAPL,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.055154
2019-01-31,ABBV,0.0840,0.0301,-0.0045,-0.0078,-0.0152,-0.118400
2019-01-31,ABT,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.013665
2019-01-31,ACN,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.088930
2019-01-31,ADBE,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.095385
...,...,...,...,...,...,...,...
2024-10-31,VRTX,-0.0097,-0.0087,0.0089,-0.0140,0.0103,0.023437
2024-10-31,VZ,-0.0097,-0.0087,0.0089,-0.0140,0.0103,-0.047257
2024-10-31,WFC,-0.0097,-0.0087,0.0089,-0.0140,0.0103,0.149230
2024-10-31,WMT,-0.0097,-0.0087,0.0089,-0.0140,0.0103,0.014861


In [175]:
# Filter out stocks with less than 10 months of data.

observations = factor_data.groupby(level=1).size()

valid_stocks = observations[observations >= 10]

factor_data = factor_data[factor_data.index.get_level_values('ticker').isin(valid_stocks.index)]

factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-31,AAPL,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.055154
2019-01-31,ABBV,0.0840,0.0301,-0.0045,-0.0078,-0.0152,-0.118400
2019-01-31,ABT,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.013665
2019-01-31,ACN,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.088930
2019-01-31,ADBE,0.0840,0.0301,-0.0045,-0.0078,-0.0152,0.095385
...,...,...,...,...,...,...,...
2024-10-31,VRTX,-0.0097,-0.0087,0.0089,-0.0140,0.0103,0.023437
2024-10-31,VZ,-0.0097,-0.0087,0.0089,-0.0140,0.0103,-0.047257
2024-10-31,WFC,-0.0097,-0.0087,0.0089,-0.0140,0.0103,0.149230
2024-10-31,WMT,-0.0097,-0.0087,0.0089,-0.0140,0.0103,0.014861


In [177]:
# Calculate Rolling Factor Betas.
betas = (factor_data.groupby(level=1,
                            group_keys=False)
         .apply(lambda x: RollingOLS(endog=x['return_1m'], 
                                     exog=sm.add_constant(x.drop('return_1m', axis=1)),
                                     window=min(24, x.shape[0]),
                                     min_nobs=len(x.columns)+1)
         .fit(params_only=True)
         .params
         .drop('const', axis=1)))

betas

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-31,AAPL,,,,,
2019-01-31,ABBV,,,,,
2019-01-31,ABT,,,,,
2019-01-31,ACN,,,,,
2019-01-31,ADBE,,,,,
...,...,...,...,...,...,...
2024-10-31,VRTX,0.875565,0.561738,-0.138340,1.074647,-0.149229
2024-10-31,VZ,0.734815,-0.727149,0.404253,0.236961,0.079426
2024-10-31,WFC,0.938970,-0.274122,1.298546,-1.246690,-0.662803
2024-10-31,WMT,0.650253,-0.021231,-0.191272,1.048214,0.088269


In [179]:
# Join the rolling factors data to the main features dataframe.
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']

data = (data.join(betas.groupby('ticker').shift()))

data.loc[:, factors] = data.groupby('ticker', group_keys=False)[factors].apply(lambda x: x.fillna(x.mean()))

data = data.drop('adj close', axis=1)

data = data.dropna()

data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10272 entries, (datetime.date(2018, 12, 31), 'AAPL') to (datetime.date(2024, 12, 31), 'XOM')
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   garman_klass_vol  10272 non-null  float64
 1   rsi               10272 non-null  float64
 2   bb_low            10272 non-null  float64
 3   bb_mid            10272 non-null  float64
 4   bb_high           10272 non-null  float64
 5   atr               10272 non-null  float64
 6   macd              10272 non-null  float64
 7   return_1m         10272 non-null  float64
 8   return_2m         10272 non-null  float64
 9   return_3m         10272 non-null  float64
 10  return_6m         10272 non-null  float64
 11  return_9m         10272 non-null  float64
 12  return_12m        10272 non-null  float64
 13  Mkt-RF            10272 non-null  float64
 14  SMB               10272 non-null  float64
 15  HML               10

In [181]:
# At this point we have to decide on what ML model and approach to use for predictions etc.

In [183]:
#6. For each month fit a K-Means Clustering Algorithm to group similar assets based on their features.
# K-Means Clustering
# You may want to initialize predefined centroids for each cluster based on your research.

# For visualization purpose of this tutorial we will initially rely on the ‘k-means++’ initialization.

# Then we will pre-define our centroids for each cluster.

In [185]:
from sklearn.cluster import KMeans

data = data.drop('cluster', axis=1)

def get_clusters(df):
    df['cluster'] = KMeans(n_clusters=4,
                           random_state=0,
                           init=initial_centroids).fit(df).labels_
    return df

data = data.dropna().groupby('date', group_keys=False).apply(get_clusters)

data

KeyError: "['cluster'] not found in axis"