In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot as plt

In [2]:
#get data
# Directory holding one feature CSV per ticker. The location is defined once
# (instead of being repeated in every read) and can be overridden through the
# SP500_FEATURES_DIR environment variable so the notebook is not tied to a
# single machine; the default is the original local path.
DATA_DIR = os.environ.get(
    'SP500_FEATURES_DIR',
    r'C:\Users\xyyh\Desktop\Cornell\ORIE 5370\Project\sp500_data_features',
)

# os.listdir returns entries in arbitrary order; sorting makes the column order
# of every downstream feature matrix reproducible across runs and machines.
all_files = sorted(os.listdir(DATA_DIR))

# Map each file name to its DataFrame.
data_dict = {}
for file in all_files:
    data_dict[file] = pd.read_csv(os.path.join(DATA_DIR, file))

# The first file's Date column defines the number of rows used downstream.
# Reuse the frame that was just loaded rather than reading the CSV a second time.
dates = data_dict[all_files[0]].Date


In [4]:
#get features
# For each of Return / EMA / MACD / RSI build a (len(dates), len(all_files))
# matrix whose entry [i, j] is value[i, j] * volume[i, j] / (sum of row i's volumes),
# i.e. every file's contribution weighted by its share of that row's total volume.
#
# The original implementation looped over every (date, file) pair in Python and
# grew per-day arrays with np.append (quadratic per day). Stacking the columns
# once and broadcasting produces the same matrices in a handful of array ops.
n_dates = len(dates)

# One 2-D array per raw column: rows follow `dates`, columns follow `all_files`.
# Only the first n_dates rows of each file are used, matching the original
# range(len(dates)) indexing; a file with fewer rows makes column_stack raise.
# NOTE(review): assumes each frame keeps the default 0..n-1 index produced by
# read_csv, so positional rows equal the labels the original looked up.
stacked = {
    column: np.column_stack(
        [data_dict[file][column].to_numpy()[:n_dates] for file in all_files]
    )
    for column in ('Return', 'Volume', 'EMA', 'MACD', 'RSI')
}

volume = stacked['Volume']
# Row-wise total volume, kept as an (n_dates, 1) column so it broadcasts
# against the (n_dates, n_files) numerators below.
total_volume_by_day = volume.sum(axis=1, keepdims=True)

vw_mu = stacked['Return'] * volume / total_volume_by_day
vw_ema_mu = stacked['EMA'] * volume / total_volume_by_day
vw_macd = stacked['MACD'] * volume / total_volume_by_day
vw_rsi = stacked['RSI'] * volume / total_volume_by_day

In [10]:
#Define the functions of persistence scores
def persistence_score(arr):
    '''
    Score: fraction of positions at which the state differs from the previous
    one (the lower the better, i.e. the more persistent the states).

    arr: 1-D sequence of state labels (list or numpy array).

    Returns the number of switches divided by len(arr) as a float. An empty
    sequence has no switches and yields 0.0 instead of raising
    ZeroDivisionError.
    '''
    n_obs = len(arr)
    if n_obs == 0:
        return 0.0
    states = np.asarray(arr)
    # Compare every element with its predecessor in one vectorised pass.
    n_switches = int(np.count_nonzero(states[1:] != states[:-1]))
    # Normalised by the sequence length (not the number of adjacent pairs),
    # matching the original definition of the score.
    return n_switches / n_obs
    
def avg_persistence(n_state, mu, train_len, freq, days_per_year=253):
    '''
    Average persistence score of Gaussian-mixture state labels over rolling
    training windows.

    n_state: the number of market states (int);
    mu: the feature used to cluster data points; it is sliced along its first
        axis and each slice is passed to GaussianMixture.fit/predict;
    train_len: the time span of training set (in years);
    freq: days between each training set (int);
    days_per_year: number of rows counted as one year when converting
        train_len into a window length (default 253, the value that was
        previously hard-coded).

    Returns the mean of persistence_score over all windows.

    Raises ValueError when freq < 1, when the window length rounds down to
    zero rows, or when mu is too short to yield a single window. Previously a
    too-short mu skipped the loop and returned -0.0 (indistinguishable from a
    perfect score) or raised a bare ZeroDivisionError.
    '''
    if freq < 1:
        raise ValueError('freq must be a positive integer, got {}'.format(freq))

    train_days = int(train_len * days_per_year)
    if train_days < 1:
        raise ValueError(
            'train_len={} gives an empty training window'.format(train_len))

    # Same window count as before: floor of (number of valid start rows) / freq.
    n_iter = (len(mu) - train_days + 1) // freq
    if n_iter < 1:
        raise ValueError(
            'not enough data: {} rows cannot provide a {}-row training window '
            'at freq={}'.format(len(mu), train_days, freq))

    total_persist_score = 0
    for i in range(n_iter):
        start = i * freq
        mu_temp = mu[start:start + train_days]
        # Fixed random_state keeps the fitted mixture reproducible across runs.
        gm = GaussianMixture(n_components=n_state, random_state=0).fit(mu_temp)
        total_persist_score += persistence_score(gm.predict(mu_temp))
    return total_persist_score / n_iter
        


In [15]:
# Candidate configurations explored by the grid search.
mu_str = ['vw_mu', 'vw_ema_mu', 'vw_macd', 'vw_rsi']
cand_mu = [vw_mu, vw_ema_mu, vw_macd, vw_rsi]  # feature matrices, same order as mu_str
cand_states = list(range(2, 6))  # number of market states to try
cand_train_len = [0.5, 1, 2, 3, 4]  # training set length (lookback period)
cand_freq = [1, 5, 21, 63, 252]  # rebalancing

# Total number of combinations = product of the candidate-list lengths.
total_tasks = 1
for candidates in (cand_states, cand_mu, cand_train_len, cand_freq):
    total_tasks *= len(candidates)
print(total_tasks)

400


In [17]:
# Evaluate avg_persistence for every candidate combination and collect one
# entry per combination in five parallel lists (one list per output column).
res_output, state_output, mu_output = [], [], []
train_len_output, freq_output = [], []

count = 0  # combinations finished so far, used for the progress line
for state in cand_states:
    print('Number of states:' + str(state))
    for i, feature_values in enumerate(cand_mu):
        feature_name = mu_str[i]
        print(feature_name)
        for train_len in cand_train_len:
            print('Training set length:' + str(train_len))
            for freq in cand_freq:
                print('Rebalancing Frequency:' + str(freq))
                avg_per = avg_persistence(state, feature_values, train_len, freq)

                res_output.append(avg_per)
                state_output.append(state)
                mu_output.append(feature_name)
                train_len_output.append(train_len)
                freq_output.append(freq)

                count += 1
                # Progress is reported as a fraction of total_tasks.
                print('{} completed'.format(str(count / total_tasks)))

# Assemble the results table and write it to the working directory.
output_columns = {
    'Average Persistence': res_output,
    'Number of States': state_output,
    'Feature': mu_output,
    'Train Length': train_len_output,
    'Frequency': freq_output,
}
df_output = pd.DataFrame(output_columns)
df_output.to_csv('MarketStateEvalution.csv', index=False)

Number of states:2
vw_mu
Training set length:0.5
Rebalancing Frequency:1
0.0025 completed
Rebalancing Frequency:5
0.005 completed
Rebalancing Frequency:21
0.0075 completed
Rebalancing Frequency:63
0.01 completed
Rebalancing Frequency:252
0.0125 completed
Training set length:1
Rebalancing Frequency:1
0.015 completed
Rebalancing Frequency:5
0.0175 completed
Rebalancing Frequency:21
0.02 completed
Rebalancing Frequency:63
0.0225 completed
Rebalancing Frequency:252
0.025 completed
Training set length:2
Rebalancing Frequency:1
0.0275 completed
Rebalancing Frequency:5
0.03 completed
Rebalancing Frequency:21
0.0325 completed
Rebalancing Frequency:63
0.035 completed
Rebalancing Frequency:252
0.0375 completed
Training set length:3
Rebalancing Frequency:1
0.04 completed
Rebalancing Frequency:5
0.0425 completed
Rebalancing Frequency:21
0.045 completed
Rebalancing Frequency:63
0.0475 completed
Rebalancing Frequency:252
0.05 completed
Training set length:4
Rebalancing Frequency:1
0.0525 completed
R



0.1775 completed
Rebalancing Frequency:5
0.18 completed
Rebalancing Frequency:21
0.1825 completed
Rebalancing Frequency:63
0.185 completed
Rebalancing Frequency:252
0.1875 completed
vw_rsi
Training set length:0.5
Rebalancing Frequency:1
0.19 completed
Rebalancing Frequency:5
0.1925 completed
Rebalancing Frequency:21
0.195 completed
Rebalancing Frequency:63
0.1975 completed
Rebalancing Frequency:252
0.2 completed
Training set length:1
Rebalancing Frequency:1
0.2025 completed
Rebalancing Frequency:5
0.205 completed
Rebalancing Frequency:21
0.2075 completed
Rebalancing Frequency:63
0.21 completed
Rebalancing Frequency:252
0.2125 completed
Training set length:2
Rebalancing Frequency:1
0.215 completed
Rebalancing Frequency:5
0.2175 completed
Rebalancing Frequency:21
0.22 completed
Rebalancing Frequency:63
0.2225 completed
Rebalancing Frequency:252
0.225 completed
Training set length:3
Rebalancing Frequency:1
0.2275 completed
Rebalancing Frequency:5
0.23 completed
Rebalancing Frequency:21
0.

0.625 completed
vw_macd
Training set length:0.5
Rebalancing Frequency:1
0.6275 completed
Rebalancing Frequency:5
0.63 completed
Rebalancing Frequency:21
0.6325 completed
Rebalancing Frequency:63
0.635 completed
Rebalancing Frequency:252
0.6375 completed
Training set length:1
Rebalancing Frequency:1
0.64 completed
Rebalancing Frequency:5
0.6425 completed
Rebalancing Frequency:21
0.645 completed
Rebalancing Frequency:63
0.6475 completed
Rebalancing Frequency:252
0.65 completed
Training set length:2
Rebalancing Frequency:1
0.6525 completed
Rebalancing Frequency:5
0.655 completed
Rebalancing Frequency:21
0.6575 completed
Rebalancing Frequency:63
0.66 completed
Rebalancing Frequency:252
0.6625 completed
Training set length:3
Rebalancing Frequency:1
0.665 completed
Rebalancing Frequency:5
0.6675 completed
Rebalancing Frequency:21
0.67 completed
Rebalancing Frequency:63
0.6725 completed
Rebalancing Frequency:252
0.675 completed
Training set length:4
Rebalancing Frequency:1
0.6775 completed
Re



0.8025 completed
Rebalancing Frequency:5
0.805 completed
Rebalancing Frequency:21
0.8075 completed
Rebalancing Frequency:63
0.81 completed
Rebalancing Frequency:252
0.8125 completed
vw_ema_mu
Training set length:0.5
Rebalancing Frequency:1
0.815 completed
Rebalancing Frequency:5
0.8175 completed
Rebalancing Frequency:21
0.82 completed
Rebalancing Frequency:63
0.8225 completed
Rebalancing Frequency:252
0.825 completed
Training set length:1
Rebalancing Frequency:1
0.8275 completed
Rebalancing Frequency:5
0.83 completed
Rebalancing Frequency:21
0.8325 completed
Rebalancing Frequency:63
0.835 completed
Rebalancing Frequency:252
0.8375 completed
Training set length:2
Rebalancing Frequency:1
0.84 completed
Rebalancing Frequency:5
0.8425 completed
Rebalancing Frequency:21
0.845 completed
Rebalancing Frequency:63
0.8475 completed
Rebalancing Frequency:252
0.85 completed
Training set length:3
Rebalancing Frequency:1
0.8525 completed
Rebalancing Frequency:5
0.855 completed
Rebalancing Frequency: