In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import os
import sys

from collections import defaultdict
from sklearn.mixture import GaussianMixture

# Make the project-local `Filtering` package importable by adding a parent path to sys.path.
# NOTE(review): the [:-10] slice strips the last 10 characters of the current working
# directory — presumably the name of the folder holding this notebook. This only works
# when the notebook is launched from that exact folder — TODO confirm / replace with
# an explicit path.
sys.path.append(os.getcwd()[:-10])
# Star import: `filter_universe` used further down is expected to come from this module.
from Filtering.filtering import *

### In this notebook, we will be clustering securities in our trading universe.

As an example, we will download 6 months of historical stock data for our trading universe, preprocess the data to obtain observation values and perform clustering using a Gaussian Mixture Model.

More precisely, the observations are smoothed daily returns obtained using the formulas provided by Zura Kakushadze and Willie Yu
    \begin{align*}
    \\
    &S^i_t := \text{Security } i \text{ close price, time } t\\
    &R^i_t := \text{Returns of } S^i_t\\
    &\sigma^i := \text {Serial standard deviation of }R^i_t\\
    &\tilde{R^i_t} := \frac{R^i_t}{\sigma^i}, \quad \text{"Normalised Returns"}\\
    &\hat{R^i_t} := \frac{\tilde{R^i_t}}{u^i}, \quad \text{"Smoothed Returns"}\\
    &u^i := \max(
                 \exp(
                      \log(\sigma^i) - (
                          \text{Median}(\log(\sigma^i)) - 3 * \text{Mean Absolute Deviation}(\log(\sigma^i))
                                       )
                      )
    , 1)\\\\
    &\text{Median(·) and MAD(·) above are cross-sectional.}
    \end{align*}

References:
- Z. Kakushadze, W. Yu. Statistical Industry Classification. arXiv:1607.04883


### Filtering Trading Universe

In [2]:
# Read every HTML table on the Wikipedia page, keep the first one and index it by ticker symbol.
sp500_wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
wiki_tables = pd.read_html(sp500_wiki_url)
listed_companies = wiki_tables[0].set_index('Symbol')

# pd.to_datetime cannot be called directly due to inconsistent data structures
def try_mapping_to_datetime(date):
    """Convert a single raw value to a ``pd.Timestamp``, falling back to NaN.

    Parameters
    ----------
    date : object
        Raw cell value (typically a date string) to convert.

    Returns
    -------
    pd.Timestamp or float
        The parsed timestamp, or ``np.nan`` when ``pd.Timestamp`` rejects the value.
    """
    try:
        return pd.Timestamp(date)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (ValueError, TypeError, OverflowError):
        return np.nan
    
# Removing companies listed after 2020/01/01
universe_cutoff = pd.Timestamp('2020-01-01')
listed_companies['Date first added'] = listed_companies['Date first added'].map(try_mapping_to_datetime)
# Drop only the rows whose listing date is missing or could not be parsed. A bare
# dropna() would also discard companies that merely have a NaN in an unrelated column.
listed_companies = listed_companies.dropna(subset=['Date first added'])
listed_companies = listed_companies[listed_companies['Date first added'] < universe_cutoff]

# Filtering
# NOTE(review): filter_universe is expected to come from the star import of
# Filtering.filtering; the exact meaning of `lookback` and `percentile` is defined there.
trading_universe = filter_universe(securities=listed_companies.index.to_list(),
                                   current_time=universe_cutoff,
                                   lookback=30,
                                   percentile=.05)

print(trading_universe)

[*********************100%***********************]  422 of 422 completed
['GOOGL', 'GOOG', 'AMZN', 'AAPL', 'BA', 'FB', 'MU', 'MSFT', 'NFLX', 'NVDA', 'V', 'DIS']


In [3]:
# Download daily bars for the filtered universe over the second half of 2019.
price_history = yf.download(
    tickers=trading_universe,
    start='2019-07-01',
    end='2020-01-01',
)

# Keep the adjusted close only, and drop every ticker that has a missing value in the window.
daily_bar_data = price_history['Adj Close'].dropna(axis=1)
daily_bar_data.head()

[*********************100%***********************]  12 of 12 completed


Unnamed: 0_level_0,AAPL,AMZN,BA,DIS,FB,GOOG,GOOGL,MSFT,MU,NFLX,NVDA,V
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-07-01,49.459972,1922.189941,350.111084,139.939774,193.0,1097.949951,1100.0,132.745392,40.011353,374.600006,41.378731,171.285995
2019-07-02,49.749546,1934.310059,347.852051,140.809143,195.0,1111.25,1112.599976,133.6259,39.502602,375.429993,40.39761,172.60556
2019-07-03,50.161819,1939.0,348.156494,141.253708,197.199997,1121.579956,1122.98999,134.486893,39.49263,381.720001,40.5271,174.171265
2019-07-05,50.117641,1942.910034,349.521729,141.601624,196.399994,1131.589966,1132.670044,134.095535,39.323044,380.549988,39.899586,173.964493
2019-07-08,49.084515,1952.319946,344.86618,140.180145,195.759995,1116.349976,1116.790039,133.997711,40.310612,376.160004,39.14756,173.501648


### Creating observation features

In [4]:
# calculate returns (the first row has no previous price, so it is dropped)
returns = daily_bar_data.pct_change()
returns = returns.dropna()

# normalising returns: scale each security's return series by its own standard deviation
standard_deviation = returns.std(axis=0)
normalised_returns = returns / standard_deviation

# smoothing returns
log_standard_deviation = np.log(standard_deviation)
# Cross-sectional mean absolute deviation (around the mean) of the log volatilities.
# Computed explicitly because Series.mad() was deprecated in pandas 1.5 and removed in 2.0.
log_std_mad = (log_standard_deviation - log_standard_deviation.mean()).abs().mean()
smoothing_factor = np.exp(
    log_standard_deviation - (log_standard_deviation.median() - 3 * log_std_mad)
)
# u = max(u, 1): factors below 1 are floored at 1, so no series is scaled up by the division.
smoothing_factor = smoothing_factor.clip(lower=1)
smoothed_returns = normalised_returns / smoothing_factor

In [5]:
# Preview the first rows of the smoothed returns (one column per security).
smoothed_returns.head()

Unnamed: 0_level_0,AAPL,AMZN,BA,DIS,FB,GOOG,GOOGL,MSFT,MU,NFLX,NVDA,V
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-07-02,0.21318,0.346405,-0.154417,0.243913,0.375291,0.431239,0.438858,0.395792,-0.152619,0.035802,-0.342279,0.401878
2019-07-03,0.301743,0.133203,0.020946,0.123959,0.408585,0.330928,0.357787,0.38447,-0.00303,0.270724,0.046272,0.473196
2019-07-05,-0.032068,0.110784,0.093845,0.096704,-0.14692,0.317725,0.330255,-0.173639,-0.051542,-0.049528,-0.223518,-0.06193
2019-07-08,-0.750591,0.266076,-0.318769,-0.394134,-0.118014,-0.479449,-0.537149,-0.04353,0.301445,-0.186404,-0.272082,-0.138791
2019-07-09,0.222094,1.012476,0.134275,0.164265,0.638252,0.270422,0.257298,-0.217841,0.279207,0.161947,0.009183,0.455961


### Clustering

In [6]:
securities = smoothed_returns.columns

# One observation per security: transpose so that rows are securities and columns are dates.
observations = smoothed_returns.values.T

# GaussianMixture rejects n_components > n_samples, so cap the target of 8 clusters
# at the number of securities that survived the earlier dropna.
n_clusters = min(8, len(securities))

# Fix the seed: EM initialisation is random, so without it the cluster assignments
# change from run to run.
clustering_seed = 42
clustering_model = GaussianMixture(n_components=n_clusters, random_state=clustering_seed)
cluster_tags = clustering_model.fit_predict(observations)

# Group ticker symbols by their assigned cluster label.
clusters_of_securities = defaultdict(list)
for security, cluster_tag in zip(securities, cluster_tags):
    clusters_of_securities[cluster_tag].append(security)

In [7]:
# Display the mapping from cluster label to the list of securities assigned to it.
clusters_of_securities

defaultdict(list,
            {7: ['AAPL'],
             4: ['AMZN'],
             5: ['BA'],
             3: ['DIS'],
             6: ['FB'],
             0: ['GOOG', 'GOOGL'],
             2: ['MSFT', 'V'],
             1: ['MU', 'NFLX', 'NVDA']})

### Improvements

Clustering is a complex task. The algorithm presented here is overly simplistic, so there are many areas for improvement. Some questions to consider are:

- Optimal clustering length: Securities rarely exhibit highly correlated behaviours over long periods of time (6 months in this case). On the other hand, choosing a period that is too short leads to spurious results.
- Number of clusters: Determining this number algorithmically rather than fixing it a priori at one's own discretion
- Randomness of clustering algorithms: How to ensure consistent performance
- Clustering algorithm: Hierarchical clustering VS Gaussian Mixture Models
- ONC algorithm suggested by López de Prado