# Hurst Exponent Research

The Hurst exponent is an excellent tool for determining whether or not a stock is trending $(H > 0.5)$, mean-reverting $(H < 0.5)$, or exhibiting random walk behavior $(H \approx 0.5)$. In this notebook, we will be calculating the Hurst exponent for each of our chosen ETFs over the minute ticker data we scraped. The goal is to use this data as a target variable for machine learning purposes. If we can discover ETFs that exhibit mean-reverting properties, we can trade them using an indicator known as the relative strength index, a signal that represents whether or not an asset is overbought or oversold.

In [1]:
import os
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import alpaca_trade_api as tradeapi
import datetime
import seaborn as sns
from gensim.models import Word2Vec
from nltk import word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, CuDNNLSTM, Embedding, Conv2D
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.optimizers import SGD
from keras import backend as K
from sklearn.utils import class_weight
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
np.random.seed(0)

api = tradeapi.REST(
    base_url=os.environ['APCA_API_BASE_URL'],
    key_id=os.environ['APCA_API_KEY_ID'],
    secret_key=os.environ['APCA_API_SECRET_KEY']
)

Using TensorFlow backend.


In [2]:
# Pick ETF Universe
symbols = [
    'XLF', # Financials
    'GDX', # Gold miners
    'VXX', # Volatility (Options)
    'EEM', # Emerging Markets
    'XRT', # S&P Retail
    'VTI', # Vanguard Total Stock Market
    'EWJ', # Japanese Market 
    'FXI', # China Large Cap
    'XHB', # S&P Homebuilders (Tracks real estate)
    'TLT', # 20 yr Treasury Bond
    'USO', # US Oil Fund
    'DBC', # Commodity Tracking
    'GLD', # Gold
    'SPY', # S&P 500
    'QQQ', # Nasdaq 100
    'XSW', # Computer Software
]

In [3]:
# Load our ticker data
ticker_data = pd.read_csv('data/ticker_data.csv', index_col=0)
ticker_data.index = pd.to_datetime(ticker_data.index, utc=True).tz_convert('US/Eastern') 
ticker_data.head()

Unnamed: 0_level_0,open_XLF,high_XLF,low_XLF,close_XLF,volume_XLF,open_GDX,high_GDX,low_GDX,close_GDX,volume_GDX,...,open_QQQ,high_QQQ,low_QQQ,close_QQQ,volume_QQQ,open_XSW,high_XSW,low_XSW,close_XSW,volume_XSW
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-12-02 09:30:00-05:00,21.51,21.53,21.5,21.52,104372.0,21.8,21.84,21.78,21.83,257282.0,...,85.83,85.86,85.81,85.85,271251.0,44.275,44.275,44.275,44.275,360.0
2013-12-02 09:31:00-05:00,21.52,21.53,21.51,21.52,15900.0,21.84,21.8799,21.8,21.81,171214.0,...,85.84,85.85,85.79,85.8,130647.0,44.275,44.275,44.275,44.275,360.0
2013-12-02 09:32:00-05:00,21.51,21.52,21.49,21.495,753186.0,21.81,21.81,21.77,21.78,286029.0,...,85.8,85.83,85.78,85.81,185792.0,44.275,44.275,44.275,44.275,360.0
2013-12-02 09:33:00-05:00,21.495,21.51,21.49,21.51,46024.0,21.7799,21.8,21.771,21.78,157994.0,...,85.8,85.85,85.7985,85.85,108899.0,44.275,44.275,44.275,44.275,360.0
2013-12-02 09:34:00-05:00,21.51,21.52,21.51,21.52,47806.0,21.78,21.8,21.77,21.79,125530.0,...,85.84,85.865,85.83,85.85,92022.0,44.275,44.275,44.275,44.275,360.0


In [None]:
def calc_hurst(data, lag_end=20):
    """ Calculates the hurst exponent over a single time window. """
    lags = range(2, lag_end)
    tau = [np.sqrt(np.std(np.subtract(data[lag:], data[:-lag]))) for lag in lags]

    # calculate Hurst as slope of log-log plot
    m = np.polyfit(np.log(lags), np.log(tau), 1)
    hurst = m[0]*2.0
    return hurst

def get_hurst(ticker_data=ticker_data, symbols=symbols):
    """ Gets daily hurst exponent for each asset during trading hours and 30 minutes before and after hours. """
    out = {}
    for symbol in symbols:
        out[symbol] = ticker_data['close_' + symbol].between_time('09:00', '16:30').groupby(pd.Grouper(freq='D')).apply(calc_hurst).dropna()
        
    return pd.concat(out, axis=1)

In [13]:
#df = get_hurst()
df = pd.read_csv('data/daily_hurst.csv', index_col=0)

In [14]:
df.tail()

Unnamed: 0_level_0,DBC,EEM,EWJ,FXI,GDX,GLD,QQQ,SPY,TLT,USO,VTI,VXX,XHB,XLF,XRT,XSW
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-06-03 00:00:00-04:00,0.523221,0.352668,0.484593,0.316632,0.33753,0.459752,0.501581,0.512639,0.517266,0.456661,0.48747,0.538203,0.481924,0.460337,0.485652,0.560335
2019-06-04 00:00:00-04:00,0.345413,0.432663,0.473177,0.425388,0.411641,0.092307,0.530037,0.517668,0.437447,0.412134,0.334451,0.500213,0.461554,0.455927,0.581165,0.356441
2019-06-05 00:00:00-04:00,0.491973,0.540032,0.400742,0.512187,0.448627,0.450032,0.466509,0.448117,0.477804,0.440339,0.409789,0.362629,0.484543,0.518187,0.504278,0.473029
2019-06-06 00:00:00-04:00,0.506061,0.473082,0.463997,0.438473,0.489791,0.494113,0.400016,0.134377,0.529334,0.468419,0.43627,0.38457,0.568998,0.446966,0.573632,0.524979
2019-06-07 00:00:00-04:00,0.389846,0.450185,0.323267,0.561966,0.404821,0.392652,0.418428,0.373632,0.376463,0.421355,0.558162,0.499161,0.273208,0.201832,0.331988,0.596721


In [15]:
df.mean()

DBC    0.443312
EEM    0.413877
EWJ    0.386687
FXI    0.390307
GDX    0.455717
GLD    0.441969
QQQ    0.446088
SPY    0.456076
TLT    0.459751
USO    0.440997
VTI    0.444687
VXX    0.461719
XHB    0.472657
XLF    0.387299
XRT    0.348658
XSW    0.477285
dtype: float64

We can see that most assets trade below the 0.5 Hurst exponent (at 20 min lag) on average. This indicates that it is possible a mean-reversion strategy would be profitable. To test this, let's create an RSI strategy and conduct a backtest on our minute ticker data.

In [None]:
# Save data for later
df.to_csv('data/daily_hurst.csv')

# RSI Research

The goal of calculating the Hurst exponent is to determine whether or not a mean reversion strategy is viable on a given day. To implement an actual strategy, we need to have a metric that is calculable "on the fly." Relative Strength Index, or RSI, is a measure of how overbought or oversold a certain asset is. When RSI is below 30, we assume that the asset is oversold and would be a viable buy. When RSI is above 70, we assume the asset is overbought and would be a viable short sell. When RSI crosses back over or the trading day has finished, we exit our position. Let's calculate the RSI for every day similar to how we calculated the Hurst exponent. Then, trading the above strategy, we will determine how much money we have made or lost such that we trade 100 shares per trade.

For more info on how to calculate RSI, visit here: https://stockcharts.com/school/doku.php?id=chart_school:technical_indicators:relative_strength_index_rsi.

In [15]:
def calc_rsi(data, symbol, lag=20):
    """ Calculates RSI on a given lag for a given symbol. """
    returns = (data['close_' + symbol] - data['open_' + symbol]) / data['open_' + symbol]
    up, down = returns.copy(), returns.copy()
    up[up < 0] = 0
    down[down > 0] = 0
    rolling_gains = up.rolling(lag).mean()
    rolling_losses = down.rolling(lag).mean().abs()
    rs = rolling_gains / rolling_losses
    rsi = 100 - 100 / (1 + rs)
    return rsi


def calc_rsi_strategy(data, symbol, lag=20):
    """ Executes RSI strategy. """
    # Get RSI and create DF
    rsi = calc_rsi(data, symbol, lag=lag)
    rsi = pd.DataFrame(rsi, columns=['rsi'], index=rsi.index)
    # Concatenate for strategy run-through
    trade_data = pd.concat([data['close_' + symbol], rsi], axis=1).dropna()
    # Init holdings and current trade
    assets, curr_trade = 0, 0
    # Init that we do not have a position
    position = None
    for index, row in trade_data.iterrows():
        if position == 'long':
            # Criteria for exiting long position
            if row['rsi'] >= 70:
                position = None
                curr_trade = row['close_' + symbol] - curr_trade
                assets += curr_trade
                curr_trade = 0
        elif position == 'short':
            # Criteria for exiting short position
            if row['rsi'] <= 30:
                position = None
                curr_trade -= row['close_' + symbol]
                assets += curr_trade
                curr_trade = 0
        else:
            # Criteria for entering long position
            if row['rsi'] >= 70:
                position = 'long'
                curr_trade = row['close_' + symbol]
            # Criteria for entering short position
            elif row['rsi'] <= 30:
                position = 'short'
                curr_trade = row['close_' + symbol]
    
    # Liquidate at end of day
    if position == 'long':
        curr_trade = row['close_' + symbol] - curr_trade
        assets += curr_trade 
    elif position == 'short':
        curr_trade -= row['close_' + symbol]
        assets += curr_trade
        
    return assets
            
    
def get_rsi_df(ticker_data=ticker_data, symbols=symbols):
    """ Gets daily hurst exponent for each asset during trading hours. """
    out = {}
    for symbol in symbols:
        # Only up until 1 min before market close - need to ensure we can liquidate
        out[symbol] = ticker_data.between_time('09:30', '15:59').groupby(pd.Grouper(freq='D')).apply(calc_rsi_strategy, symbol=symbol).dropna()
        
    return pd.concat(out, axis=1)  

In [16]:
rsi_df = get_rsi_df()
rsi_df.head()

Unnamed: 0_level_0,DBC,EEM,EWJ,FXI,GDX,GLD,QQQ,SPY,TLT,USO,VTI,VXX,XHB,XLF,XRT,XSW
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-12-02 00:00:00-05:00,0.1294,0.2384,-0.253,0.1311,0.3389,0.1487,0.0313,0.0365,0.0929,0.1004,0.3593,9.76,-0.37,0.1127,0.017,0.0
2013-12-03 00:00:00-05:00,0.1327,4.0689,-0.185,0.2751,0.0269,-0.6725,0.3759,0.1039,-0.51,0.2419,0.1056,33.744,0.1485,-0.136,0.268,0.0
2013-12-04 00:00:00-05:00,-0.0008,0.1267,-0.128,-0.129,-0.4183,-2.4022,-0.1939,-0.6831,0.2399,0.1081,0.4433,-10.148,-0.0464,0.2069,0.002,0.0
2013-12-05 00:00:00-05:00,-0.0924,-0.05,-0.085,-0.15,-0.0977,0.8751,-0.0349,0.2659,0.2075,-0.1,-0.1182,-2.676,-0.145,0.0718,0.244,0.0
2013-12-06 00:00:00-05:00,-0.0223,-3.6815,0.018,-0.3641,-0.2823,-0.6591,-0.0859,0.4112,-0.245,0.1313,0.0358,3.564,-0.1857,-0.0511,-0.18,0.0


The below value represents the overall profit / loss in terms of dollars per share we would have generated on average. Notice how certain ETFs, such as the EWJ, perform poorly on average, while others such as VXX, perform well on average.

In [17]:
rsi_df.mean()

DBC   -0.000728
EEM   -0.012859
EWJ   -0.001590
FXI   -0.009738
GDX   -0.001003
GLD   -0.007450
QQQ    0.035389
SPY    0.034351
TLT   -0.005774
USO    0.001398
VTI   -0.016500
VXX    0.276356
XHB    0.005329
XLF   -0.000058
XRT    0.326016
XSW   -0.008840
dtype: float64

Let's see how each ETF would have performed had we only traded on ETFs where the strategy was profitable.

In [18]:
rsi_df[rsi_df > 0].mean()

DBC    0.066406
EEM    0.552019
EWJ    0.140098
FXI    0.141495
GDX    0.165682
GLD    0.282171
QQQ    0.511624
SPY    0.695543
TLT    0.301368
USO    0.114268
VTI    0.348426
VXX    4.260037
XHB    0.159411
XLF    0.095041
XRT    3.691052
XSW    0.234871
dtype: float64

We see that even ETFs that do poorly on average would do much better if we could accurately predict whether or not this strategy would have worked on a given day. Our goal will be to use our news and morning ticker data to predict whether or not RSI mean-reversion trading will work on any given day.

In [19]:
# Save for later
rsi_df.to_csv('data/rsi_pl.csv')

# Volatility and Volume Research

One of the key aspects that makes an asset good for day trading is higher than usual volume and volatility. Why is that? Well, when an asset has high volume, it is very liquid, meaning it is easy to enter an exit a position at any given time. When an asset is volatile, it is moving in either direction consistently, making it more likely to profit from a position that relies on mean-reversion to be successful. Our goal is to predict whether or not a mean-reversion strategy will make money on any given day. While it may be very difficult to predict this at market open, it may be easier to predict whether or not an asset will have higher than usual daily volatility or volume. We measure volatility as the standard deviation of minute close prices and volume as the total sum of minute volume.

In [20]:
def get_volatility(data, symbols=symbols):
    out = {}
    for symbol in symbols:
        out[symbol] = data['close_' + symbol].between_time('09:30', '15:59').groupby(pd.Grouper(freq='D')).std().dropna()
    return pd.concat(out, axis=1) 


def get_volume(data, symbols=symbols):
    out = {}
    for symbol in symbols:
        out[symbol] = data['volume_' + symbol].between_time('09:30', '15:59').groupby(pd.Grouper(freq='D')).sum().dropna()
    return pd.concat(out, axis=1) 

We want to test that if we only trade assets that have either high volatility or high volume for a given day, that our RSI mean-reversion strategy would have better performance than if we traded all assets every day. To do this, we can multiply each DataFrame element wise and compare average performance.

In [21]:
volatility_df = get_volatility(ticker_data)
volume_df = get_volume(ticker_data)
rsi_df = pd.read_csv('data/rsi_pl.csv', index_col = 0)
rsi_df.index = pd.to_datetime(rsi_df.index, utc=True).tz_convert('US/Eastern') 

In [22]:
volatility_df.head()

Unnamed: 0_level_0,DBC,EEM,EWJ,FXI,GDX,GLD,QQQ,SPY,TLT,USO,VTI,VXX,XHB,XLF,XRT,XSW
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-12-02 00:00:00-05:00,0.051233,0.216552,0.060327,0.128639,0.203226,0.384325,0.085193,0.237505,0.133732,0.074812,0.133183,2.772955,0.134465,0.049406,0.076197,0.190521
2013-12-03 00:00:00-05:00,0.03298,1.80202,0.054884,0.143678,0.074387,0.147071,0.132806,0.307561,0.091001,0.1483,0.164234,9.313067,0.089652,0.074913,0.068045,0.089437
2013-12-04 00:00:00-05:00,0.03067,0.114592,0.091087,0.123803,0.256995,0.885402,0.218932,0.492466,0.133805,0.095286,0.266965,6.686612,0.167535,0.057636,0.144782,0.051499
2013-12-05 00:00:00-05:00,0.019377,0.079208,0.069555,0.091675,0.094308,0.423058,0.090315,0.187192,0.125224,0.052713,0.08813,2.849088,0.044636,0.032177,0.06418,0.116347
2013-12-06 00:00:00-05:00,0.015236,1.844113,0.040856,0.099273,0.097473,0.298526,0.12179,0.216121,0.156708,0.032668,0.105778,4.24124,0.088779,0.041699,0.062296,0.158202


In [23]:
volume_df.head()

Unnamed: 0_level_0,DBC,EEM,EWJ,FXI,GDX,GLD,QQQ,SPY,TLT,USO,VTI,VXX,XHB,XLF,XRT,XSW
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-12-02 00:00:00-05:00,4451701.0,57884449.0,8347820.0,13494093.0,44102259.0,8310859.0,20393276.0,66389410.0,5992629.0,4039467.0,3105557.0,907921.0,3262302.0,18589285.0,8327410.0,156048.0
2013-12-03 00:00:00-05:00,1367857.0,95197430.0,8009656.0,20447053.0,28762548.0,5476432.0,25703684.0,74679552.0,4326378.0,9269610.0,2007762.0,1640559.0,3261899.0,26029698.0,4331750.0,176472.0
2013-12-04 00:00:00-05:00,1859862.0,47288805.0,12365473.0,15145565.0,39070084.0,7643957.0,32583033.0,96965740.0,6236937.0,7000074.0,2196678.0,1701089.0,3602432.0,34223915.0,8704198.0,159086.0
2013-12-05 00:00:00-05:00,1657078.0,51099572.0,11436411.0,10675814.0,25984565.0,7807211.0,31676851.0,76837414.0,5283952.0,4522063.0,1820562.0,1109820.0,2144908.0,33275145.0,6538672.0,885906.0
2013-12-06 00:00:00-05:00,1032843.0,65154870.0,6488826.0,17368376.0,19451212.0,6231604.0,26268568.0,80387895.0,6066351.0,2656304.0,1913441.0,1510266.0,2287232.0,24831059.0,6910938.0,190400.0


In [24]:
rsi_df.head()

Unnamed: 0_level_0,DBC,EEM,EWJ,FXI,GDX,GLD,QQQ,SPY,TLT,USO,VTI,VXX,XHB,XLF,XRT,XSW
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-12-02 00:00:00-05:00,0.1294,0.2384,-0.253,0.1311,0.3389,0.1487,0.0313,0.0365,0.0929,0.1004,0.3593,9.76,-0.37,0.1127,0.017,0.0
2013-12-03 00:00:00-05:00,0.1327,4.0689,-0.185,0.2751,0.0269,-0.6725,0.3759,0.1039,-0.51,0.2419,0.1056,33.744,0.1485,-0.136,0.268,0.0
2013-12-04 00:00:00-05:00,-0.0008,0.1267,-0.128,-0.129,-0.4183,-2.4022,-0.1939,-0.6831,0.2399,0.1081,0.4433,-10.148,-0.0464,0.2069,0.002,0.0
2013-12-05 00:00:00-05:00,-0.0924,-0.05,-0.085,-0.15,-0.0977,0.8751,-0.0349,0.2659,0.2075,-0.1,-0.1182,-2.676,-0.145,0.0718,0.244,0.0
2013-12-06 00:00:00-05:00,-0.0223,-3.6815,0.018,-0.3641,-0.2823,-0.6591,-0.0859,0.4112,-0.245,0.1313,0.0358,3.564,-0.1857,-0.0511,-0.18,0.0


In [25]:
volatility_weighted = (volatility_df > volatility_df.quantile(0.5)).astype('int64') * rsi_df

In [26]:
volatility_weighted.mean() > rsi_df.mean()

DBC     True
EEM    False
EWJ     True
FXI     True
GDX     True
GLD     True
QQQ     True
SPY     True
TLT     True
USO     True
VTI     True
VXX     True
XHB     True
XLF     True
XRT     True
XSW    False
dtype: bool

In [27]:
volume_weighted = (volume_df > volume_df.quantile(0.5)).astype('int64') * rsi_df

In [28]:
volume_weighted.mean() > rsi_df.mean()

DBC     True
EEM     True
EWJ     True
FXI     True
GDX     True
GLD     True
QQQ    False
SPY    False
TLT    False
USO    False
VTI     True
VXX     True
XHB    False
XLF    False
XRT    False
XSW    False
dtype: bool

It seems that trading these strategies independently allows us to have higher expected return on most assets. Let's observe what happens if we combine these strategies.

In [29]:
comb_weighted = ((volume_df > volume_df.quantile(0.5)) & (volatility_df > volatility_df.quantile(0.5))).dropna().astype('int64') * rsi_df

In [30]:
comb_weighted.mean() > rsi_df.mean()

DBC     True
EEM     True
EWJ     True
FXI     True
GDX     True
GLD     True
QQQ     True
SPY     True
TLT     True
USO     True
VTI     True
VXX     True
XHB     True
XLF     True
XRT    False
XSW    False
dtype: bool

Well that's pretty fantastic. If we were able to predict days when both volatility and volume would be higher than usual, we can expect higher than average returns on 14 out of 16 etfs if we were only to execute our RSI strategy on those assets. Let's also observe how well the strategy does from a dollar per share standpoint on average.

In [31]:
comb_weighted.mean()

DBC    0.000823
EEM    0.010903
EWJ    0.003563
FXI   -0.004037
GDX    0.006365
GLD   -0.001977
QQQ    0.055716
SPY    0.043313
TLT   -0.001211
USO    0.002793
VTI   -0.011175
VXX    0.519429
XHB    0.008361
XLF    0.002828
XRT    0.191499
XSW   -0.011037
dtype: float64

The next step is building an algorithm which attempts to predict when an asset's volume and volatility will be higher than usual! Let's save this dataframe; we will use it later as set of target variables.

In [32]:
vv_target = ((volume_df > volume_df.quantile(0.5)) & (volatility_df > volatility_df.quantile(0.5))).dropna()
vv_target.head()

Unnamed: 0_level_0,DBC,EEM,EWJ,FXI,GDX,GLD,QQQ,SPY,TLT,USO,VTI,VXX,XHB,XLF,XRT,XSW
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-12-02 00:00:00-05:00,True,True,False,True,True,True,False,False,False,False,False,False,True,False,False,True
2013-12-03 00:00:00-05:00,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False
2013-12-04 00:00:00-05:00,False,True,True,True,True,True,False,True,False,False,True,False,True,True,False,False
2013-12-05 00:00:00-05:00,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False
2013-12-06 00:00:00-05:00,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,True


In [33]:
# Save as csv
vv_target.to_csv('data/vv_target.csv')

In [9]:
# Test load
vv_target = pd.read_csv('data/vv_target.csv', index_col=0).astype('int64')

In [11]:
# Observe total distribution
vv_target.sum().sum() / (vv_target.shape[0] * vv_target.shape[1])

0.3988138030194105