![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

### KDE Model MI Index Research

Research the best method for calculating the mispricing index (MI) given a KDE model. 
Factors to consider are: 
* Computational speed
* **TODO**: list the remaining factors

In [38]:
#----------------------------------------------------------------------------------------------------
# UTIL FUNCTIONS
#----------------------------------------------------------------------------------------------------
import numpy as np
class Parameter():
    """Date window and bar resolution used when requesting history.

    Attributes:
        Resolution: the ``resolution`` argument, stored unchanged.
        Start: start date at 09:30:00.
        End: start date shifted by the requested years/months/days, at 16:30:00.
    """
    def __init__(self,
                resolution,
                start_year, 
                start_month=1, 
                start_day=1,
                n_years=0,
                n_months=0, 
                n_days=0): 
        """
        Args:
            resolution: value stored as ``self.Resolution``.
            start_year, start_month, start_day: calendar start date.
            n_years, n_months, n_days: length of the window. Months that run
                past December roll into the following year and days that run
                past the month end roll into the following month.
        Raises:
            ValueError: if the start date itself is not a valid calendar date.
        """
        # Local imports so the class does not depend on names pre-loaded by
        # the notebook environment.
        import calendar
        from datetime import datetime, timedelta

        self.Resolution = resolution

        # Month arithmetic on a zero-based month count so overflow carries
        # into the year instead of producing an invalid month number.
        months_from_january = (start_month - 1) + n_months
        end_year = start_year + n_years + months_from_january // 12
        end_month = months_from_january % 12 + 1

        # Clamp the anchor day to the target month's length, then let
        # timedelta carry the day offset across month boundaries.
        last_day_of_end_month = calendar.monthrange(end_year, end_month)[1]
        end_anchor = datetime(end_year, end_month,
                              min(start_day, last_day_of_end_month), 16, 30, 0)

        self.Start = datetime(start_year, start_month, start_day, 9, 30, 0)
        self.End = end_anchor + timedelta(days=n_days)
        
class Data():
    """Wrapper around a DataFrame whose rows with missing values are dropped."""

    def __init__(self, df):
        # Only complete rows are kept; the caller's frame is left untouched.
        complete_rows = df.dropna()
        self._df = complete_rows

    def plot(self, *args, **kwargs): 
        """Forward every argument to the wrapped frame's ``plot`` method."""
        frame = self._df
        return frame.plot(*args, **kwargs)

    @property
    def columns(self):
        """Column labels of the wrapped frame."""
        frame = self._df
        return frame.columns

    @property
    def returns(self):
        """Row-to-row first difference of the wrapped frame (first row is NaN)."""
        frame = self._df
        return frame.diff() 

    @property 
    def prices(self):
        """The wrapped frame itself."""
        frame = self._df
        return frame

import random

def sample_from_bivariate(x_domain, y_domain, weights, n_samples):
    """Draw weighted random (x, y) points from a gridded bivariate density.

    Args:
        x_domain: array of x coordinates (any shape; flattened internally).
        y_domain: array of y coordinates, same number of elements as x_domain.
        weights: array of non-negative sampling weights, one per grid point.
            NaN entries are replaced through ``np.nan_to_num`` before sampling.
        n_samples: number of points to draw (with replacement).
    Returns:
        Numpy array of shape (n_samples, 2) holding the sampled (x, y) pairs.
    Raises:
        ValueError: propagated from ``random.choices`` when the number of
            weights differs from the number of points or the weights do not
            sum to a positive value.
    """
    # Local import so the function does not rely on a module-level import.
    import random

    x_flat = x_domain.ravel()
    y_flat = y_domain.ravel()
    clean_weights = np.nan_to_num(weights.ravel()) # account for nans

    # One row per grid point: column 0 is x, column 1 is y.
    population = np.array([x_flat, y_flat]).T
    return np.array(random.choices(population, clean_weights, k=n_samples))

# Utils
def np_remove_nan(x): 
    """Return the rows of the 2-D array ``x`` that contain no NaN."""
    rows_with_nan = np.isnan(x).any(axis=1)
    return x[~rows_with_nan]

def np_remove_inf(x):
    """Return the rows of the 2-D array ``x`` that contain no +/-inf."""
    rows_with_inf = np.isinf(x).any(axis=1)
    return x[~rows_with_inf]

def np_remove_inf_1D(x):
    """Return the elements of ``x`` that are not +/-inf."""
    keep = np.logical_not(np.isinf(x))
    return x[keep]

def np_remove_nan_1D(x):
    """Return the elements of ``x`` that are not NaN (infinities are kept)."""
    # Filter on isnan: the previous isinf mask removed infinities and left
    # the NaN values in place.
    return x[~np.isnan(x)]

# Self-check of the row filters.
def check_for_nan(x):
    """True when ``x`` holds no NaN."""
    return not np.isnan(x).any()

def check_for_inf(x):
    """True when ``x`` holds no +/-inf."""
    return not np.isinf(x).any()

# Five rows: one made of inf, one holding a NaN, three fully numeric.
data = np.array([[np.inf, np.inf],
                 [1, 2],
                 [1, 3],
                 [np.nan, 4],
                 [5, 6]])

data_no_nan = np_remove_nan(data)
data_no_inf = np_remove_inf(data)
data_no_non_num = np_remove_nan(np_remove_inf(data))

assert check_for_nan(data_no_nan)
assert check_for_inf(data_no_inf)
assert check_for_nan(data_no_non_num) and check_for_inf(data_no_non_num)
# only the three fully numeric rows survive both filters
assert data_no_non_num.shape == (3,2)


In [39]:
#----------------------------------------------------------------------------------------------------
# LOAD DATA
#----------------------------------------------------------------------------------------------------

print("Setting parameters...")
# parameters: window start date and length
START_YEAR, START_MONTH, START_DAY = 2011, 1, 1
N_YEARS, N_MONTHS, N_DAYS = 5, 0, 0
RESOLUTION = Resolution.Daily
PARAM = Parameter(resolution=RESOLUTION,
                  start_year=START_YEAR,
                  start_month=START_MONTH,
                  start_day=START_DAY,
                  n_years=N_YEARS,
                  n_months=N_MONTHS,
                  n_days=N_DAYS)

print("Loading QuantBook...")
qb = QuantBook()

# Specify list of correlated tickers for S&P 500 
tickers = ["SPY","XLK", "VGT", "IYW", "IGV"]

# register tickers to quantbook
for ticker in tickers: 
    qb.AddEquity(ticker)

print("Loading historical data...")
# get historical prices over the window held by PARAM
history = qb.History(tickers, PARAM.Start, PARAM.End, PARAM.Resolution)

print("Preparing historical data...")
# Unpack dataframe: one table per price field, index level 0 moved to columns
Open, Close, High, Low = (
    history[field].unstack(level=0)
    for field in ('open', 'close', 'high', 'low')
)

# create data object
close = Data(Close)
print("Done")

In [40]:
# Plot every column of the cleaned close-price table.
close.prices.plot()

In [41]:
#----------------------------------------------------------------------------------------------------
# ESTIMATE EMPIRICAL MARGINAL DISTRIBUTIONS AND CALCULATE MARGINAL VALUES
#----------------------------------------------------------------------------------------------------

from statsmodels.distributions.empirical_distribution import ECDF

# Empirically estimate marginal distributions
# One empirical CDF per asset, fitted on the differenced close prices.
_asset_returns = close.returns
marginal_dist = {
    asset: ECDF(_asset_returns[asset])
    for asset in close.columns
}

# Map every observation through its own asset's empirical CDF.
marginal_values = pd.DataFrame({
    asset: _asset_returns[asset].apply(marginal_dist[asset])
    for asset in close.columns
})
    

In [42]:
# Shape of the differenced close-price table as a numpy array.
np.array(close.returns).shape

In [43]:

def get_max_corr_pair(corr_matrix): 
    """Return the labels of the most strongly correlated pair of columns.

    Only entries strictly above the diagonal are considered, so self
    correlations and mirrored duplicates are ignored. NaN entries are skipped.

    Args:
        corr_matrix: square pandas DataFrame of pairwise correlations.
    Returns:
        Tuple ``(row_label, column_label)`` of the largest off-diagonal entry.
    Raises:
        ValueError: if there is no usable entry above the diagonal.
    """
    values = np.asarray(corr_matrix, dtype=float)

    # Boolean mask of the strict upper triangle (k=1 excludes the diagonal).
    above_diagonal = np.triu(np.ones(values.shape, dtype=bool), k=1)
    usable = above_diagonal & ~np.isnan(values)
    if not usable.any():
        raise ValueError("correlation matrix has no off-diagonal entries to compare")

    # Entries outside the mask are pushed to -inf so argmax never picks them.
    candidates = np.where(usable, values, -np.inf)
    row, col = np.unravel_index(np.argmax(candidates), candidates.shape)

    # Labels are read from the matrix that was passed in, not from globals.
    return(corr_matrix.index[row], corr_matrix.columns[col])

#----------------------------------------------------------------------------------------------------
# CALCULATE KENDALL TAU CORRELATION
#----------------------------------------------------------------------------------------------------
# Kendall tau correlation between the differenced close-price columns.
corr = close.returns.corr(method='kendall')
corr.style.background_gradient(cmap='viridis')

pair_override = ("SPY R735QTJ8XC9X", "XLK RGRPZX100F39") # tuple pair
pair = get_max_corr_pair(corr)
# an explicit override, when set, replaces the automatically selected pair
if pair_override is not None:
    pair = pair_override
print("Pair with maximum correlation: {}".format(pair))

# marginal (ECDF) values of the two selected assets as an independent array
marginals = marginal_values[list(pair)].to_numpy(copy=True)

In [44]:
# Display the Kendall tau correlation matrix.
corr

In [45]:


from time import time
from scipy.stats import norm
import statsmodels.api as sm

#----------------------------------------------------------------------------------------------------
# FIT KDE MODEL TO DATA MAPPED USING GAUSSIAN TRANSFORMATION
#----------------------------------------------------------------------------------------------------
# transform samples to R^2 domain
# KDE settings: bandwidth selection rule and estimator class
_BANDWIDTH = 'cv_ml' #'normal_reference' # 'cv_ml', 'cv_ls'
kernel = sm.nonparametric.KDEMultivariate

# map the marginal values through the inverse normal cdf and drop the rows
# where that mapping produced an infinity
marginals_R_raw = norm.ppf(marginals)
marginals_R = np_remove_inf(marginals_R_raw)

# fit a kernel density estimate with two continuous variables, timing the fit
t = time()
model = kernel(data=marginals_R, var_type='cc', bw=_BANDWIDTH)
elapsed = time() - t 

print("Model fit in {} seconds".format(elapsed))
print("Model bandwidth {}".format(model.bw))


In [46]:
_SCATTER_FIGSIZE = (15, 15)

# marginal values of the selected pair
plt.figure(figsize=_SCATTER_FIGSIZE)
plt.scatter(x=marginals[:,0], y=marginals[:,1], s=0.3)

# the same sample after the inverse-normal mapping (infinite rows removed)
plt.figure(figsize=_SCATTER_FIGSIZE)
plt.scatter(x=marginals_R[:,0], y=marginals_R[:,1])

In [47]:
# define domain in [0,1]^2 
_equidistant_grid = False
if not _equidistant_grid: 
    # an evenly spaced grid pushed through the normal cdf is denser near
    # the extremes of [0,1]
    _min, _max, w = -3, 3, 0.01
    x_valuesU = norm.cdf(np.arange(_min, _max, w))
    y_valuesU = norm.cdf(np.arange(_min, _max, w))
else: 
    # evenly spaced grid; the lower bound is offset from 0 to avoid inf
    _min, _max, w = 0.01, 1, 0.001
    x_valuesU = np.arange(_min, _max, w)
    y_valuesU = np.arange(_min, _max, w)

# transform to R^2 domain and make meshgrid
x_hat_valuesR, y_hat_valuesR = norm.ppf(x_valuesU), norm.ppf(y_valuesU)
x_hat_meshR, y_hat_meshR = np.meshgrid(x_hat_valuesR, y_hat_valuesR)

# one row per grid point, ready to be passed to the KDE model
data_predictR = pd.DataFrame({
    "Xhat": x_hat_meshR.ravel(),
    "Yhat": y_hat_meshR.ravel(),
})


In [49]:
import time
# sample from KDE model to fit interpolation model
USE_CDF = False
t = time.time()
# evaluate either the KDE cdf or the KDE pdf on every grid point
_evaluate_kde = model.cdf if USE_CDF else model.pdf
z_hat_meshR = _evaluate_kde(data_predict=data_predictR.to_numpy())

elapsed = time.time() - t 
print("Sampling took {} seconds".format(elapsed))


In [50]:
'''Sampling from KDE model takes a long time - around 60s. Try and address this here. '''
import multiprocessing as mp
import time

n_worker = 4
data = data_predictR.to_numpy()
print(data.shape)
# Chunk size is counted in rows (the items pool.map iterates over), not in
# array elements, so that the work is spread over all n_worker processes.
_chunksize = max(1, len(data) // n_worker)
print(_chunksize)

with mp.Pool(n_worker) as pool: 
    t  = time.time()
    # Stored under its own name: z_hat_meshR must keep the full-grid density
    # that the later back-transformation cell reads.
    z_hat_pooled = pool.map(model.pdf, data, chunksize=_chunksize)
    elapsed = time.time() - t 

    print(f"Time taken: {elapsed}s")

In [95]:
# Calculating pdf for a chunk of data with n_worker = 4 takes around 14 seconds
t = time.time() 
# Evaluate only the first 1/n_worker of the grid rows. The result is kept in
# its own variable so the full-grid z_hat_meshR is not replaced by a
# partial array.
z_hat_chunk = model.pdf(data[0:int(len(data)/n_worker)])
elapsed = time.time() - t 
print(f"Sampling took {elapsed} seconds")
print(z_hat_chunk)

In [121]:
'''
The overhead with creating a pool is too great to be advantageous apparently - we get at least 
double the computation time. 
'''
i = 1000
n_worker = 4
testdata = data[0:i,:]
# rows dispatched to a worker at a time; never below one
_chunksize = max(1, len(testdata) // n_worker)
print(f"num workers: {n_worker}")
print(f"Chunksize: {_chunksize}")

# single process reference timing
t = time.time()
res = model.pdf(testdata)
elapsed = time.time() - t
print(f"basic func took {elapsed} seconds")
print(res)

with mp.Pool(n_worker) as pool:
    t = time.time()
    # use the chunk size reported above instead of a hard-coded value
    res = pool.map(model.pdf, testdata, chunksize=_chunksize)
    elapsed = time.time() - t 
    print(f"pooled basic_func {elapsed} seconds")
    print(res)

In [None]:
'''
Pretty much the same result with this method. Might be due to overhead with serialisation of the KDE object 
itself. 
'''
# parameters 
i = 1000000
i = min(len(data), i)
n_worker = 1
testdata = data[0:i,:]
_chunksize = int(len(testdata)/n_worker)
_remainder = len(testdata) % n_worker
print(f"num data: {i}")
print(f"Chunk size: {_chunksize}")
print(f"_remainder: {_remainder}")

# single process
t = time.time()
res = model.pdf(testdata)
elapsed = time.time() - t
print(f"basic func took {elapsed} seconds")
# print(res)

# --- multiprocess ---
# split the data set into n_worker contiguous chunks; the last chunk also
# takes the rows left over when the data does not divide evenly
chunklist = []
for worker_index in range(n_worker):
    start_index = _chunksize * worker_index
    end_index = start_index + _chunksize
    if worker_index == n_worker - 1:
        end_index += _remainder
    print(f"chunk range: {start_index} to {end_index}")
    chunk = testdata[start_index:end_index,:]
    chunklist.append(chunk)    

with mp.Pool(n_worker) as pool:
    t = time.time()
    # one model.pdf call per chunk; res is a list with one array per chunk
    res = pool.map(model.pdf, chunklist)
    elapsed = time.time() - t 
    print(f"pooled basic_func {elapsed} seconds")
    

In [144]:
import sys

def get_size(obj, seen=None):
    """Sum ``sys.getsizeof`` over an object and everything reachable from it.

    Dictionaries contribute their keys and values, objects with a
    ``__dict__`` contribute that dictionary, and other iterables (except
    str/bytes/bytearray) contribute their items. Every object is counted
    once, identified by ``id``.

    Args:
        obj: the object to measure.
        seen: set of ids already counted; created on the outermost call.
    Returns:
        Total size as an int; 0 when ``obj`` was already counted.
    """
    seen = set() if seen is None else seen
    marker = id(obj)
    if marker in seen:
        return 0
    # Register before descending so self-referential objects terminate.
    seen.add(marker)

    total = sys.getsizeof(obj)
    if isinstance(obj, dict):
        total += sum(get_size(value, seen) for value in obj.values())
        total += sum(get_size(key, seen) for key in obj.keys())
        return total
    if hasattr(obj, '__dict__'):
        return total + get_size(obj.__dict__, seen)
    if hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        total += sum(get_size(item, seen) for item in obj)
    return total

# Recursive size estimate (summed sys.getsizeof values) of the fitted KDE model.
get_size(model)

In [None]:
# Earlier cells rebind the name `time` to the module, so the timer is taken
# from the module explicitly instead of calling `time()`.
import time

t = time.time()
# transform predictions in R^2 domain back to [0,1]^2 domain by dividing the
# KDE values by the product of the standard normal densities at each point
_normal_density = norm.pdf(data_predictR.Xhat) * norm.pdf(data_predictR.Yhat)
z_valuesU = np.array(z_hat_meshR / _normal_density)

# make meshgrid in [0,1]^2 domain for plotting
x_meshU, y_meshU = np.meshgrid(x_valuesU, y_valuesU)
z_meshU = z_valuesU.reshape(x_hat_meshR.shape)
elapsed2 = time.time() - t

In [None]:
# contour plot of resulting kde esimate of copula pdf in [0,1]^2
from matplotlib import cm
plt.figure(figsize=(12,10))
plt.contourf(x_meshU, y_meshU, z_meshU, 1000, cmap=cm.coolwarm)
plt.colorbar()

# 3d plot of resulting kde estimate of copula pdf in [0,1]^2
from mpl_toolkits import mplot3d

fig = plt.figure(figsize=(10,10))
# Figure.gca() no longer accepts a projection keyword; create the 3d axes
# explicitly instead.
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(x_meshU, y_meshU, z_meshU,
                       linewidth=0, antialiased=False, cmap=cm.coolwarm)
ax.view_init(45, 135)

So far we have obtained a grid of pdf values using the Gaussian transformation method. We now interpolate between these points to obtain a continuous approximation of the copula pdf. The interpolation model implements an integral function, so it can be used to calculate cdf values and conditional cdf values. 

In [None]:
# Earlier cells rebind the name `time` to the module, so the timer is taken
# from the module explicitly instead of calling `time()`.
import time

import scipy
# `import scipy` alone does not guarantee that the interpolate submodule is loaded
import scipy.interpolate

t = time.time()

# Use degree 1 for interpolation as larger degrees can overshoot to lead to negative pdf values
# do not enforce bbox as it causes the interpolation to overshoot into negative values _BBOX = [0,1,0,1]
# z_meshU uses the np.meshgrid layout (rows follow y, columns follow x), while
# RectBivariateSpline expects z with shape (x.size, y.size); transpose so that
# interp_model(u, v) is the density at (u, v) rather than at (v, u).
# Grid-style calls interp_model(x, y) return values in that same (x, y) layout.
interp_model = scipy.interpolate.RectBivariateSpline(x_valuesU, y_valuesU, z_meshU.T, kx=1, ky=1)
elapsed = time.time() - t
print("Interpolation model fit in {}s".format(elapsed))

In [None]:
# ---------------------------------------------------------------------------------
# plot the interpolated model
# ---------------------------------------------------------------------------------
from matplotlib import cm

# Evaluate point-wise on the mesh so that z_mesh_interp[j, i] is the model
# value at (x_valuesU[i], y_valuesU[j]), the layout contourf and plot_surface
# expect. The grid-call form interp_model(x, y) returns the transposed
# (x.size, y.size) layout.
z_mesh_interp = interp_model.ev(x_meshU, y_meshU)

plt.figure(figsize=(12,10))
plt.contourf(x_meshU, y_meshU, z_mesh_interp, 300, cmap=cm.coolwarm)
plt.colorbar()

# 3d plot of resulting kde estimate of copula pdf in [0,1]^2
from mpl_toolkits import mplot3d

fig = plt.figure(figsize=(20,20))
# Figure.gca() no longer accepts a projection keyword; create the 3d axes
# explicitly instead.
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(x_meshU, y_meshU, z_mesh_interp,
                       linewidth=0, antialiased=False, cmap=cm.coolwarm)
ax.view_init(45, 135)

print("SSE: {}".format(interp_model.get_residual()))

Above we investigated an alternative approach: the original method used the KDE + Interpolation to estimate the copula pdf. From this we would exploit the interpolation model's ease of integral calculation to find the cdf and the conditional copula values. 

The alternative was to use the KDE + Interpolation to estimate the cdf directly by using the KDE cdf member function. This avoids having to integrate the pdf to find the cdf and the conditional, which requires an extra computational step for normalisation. 

However, it has been shown above that the latter approach is unable to capture the lower tail dependencies well enough. 

We therefore resort to using the former method. 

In [None]:
# test that the cdf gives us the pdf: the mixed partial derivative of the
# interpolated cdf, scaled by its maximum, is plotted for visual comparison
if USE_CDF: 
    z_values = interp_model.ev(x_meshU, y_meshU, dx=1, dy=1)
    z_values_norm = z_values / np.max(z_values)
    z_values_norm = z_values_norm.reshape(x_meshU.shape)
    plt.figure(figsize=(12,10))
    plt.contourf(x_meshU, y_meshU, z_values_norm, 500, cmap=cm.coolwarm)
    plt.colorbar()

    # 3d plot of resulting kde estimate of copula pdf in [0,1]^2
    from mpl_toolkits import mplot3d

    fig = plt.figure(figsize=(20,20))
    # Figure.gca() no longer accepts a projection keyword; create the 3d
    # axes explicitly instead.
    ax = fig.add_subplot(111, projection='3d')
    surf = ax.plot_surface(x_meshU, y_meshU, z_values_norm,
                           linewidth=0, antialiased=False, cmap=cm.coolwarm)
    ax.view_init(45, 135)

    print("SSE: {}".format(interp_model.get_residual()))
    

We now consider how to compute the mispricing index efficiently. From the definition of the copula density, note that:  

$$
    c(u,v) = 
    \frac{ \partial^2 C(u,v) }{ \partial u \partial v } = 
    \frac{\partial}{\partial u}\bigg( \frac{\partial C(u,v)}{\partial v} \bigg) = 
    \frac{\partial}{\partial u}\bigg( C(u|v) \bigg)
$$

Integrating with respect to the first argument therefore gives the conditional copula:

$$
   C(u|v) = \int_0^{u} c(s,v)\,ds
$$


In [None]:
# Calculate MI for u | v first
from scipy import integrate

# SciPy renamed cumtrapz to cumulative_trapezoid and later dropped the old
# name; use whichever this SciPy version provides.
_cumulative_trapezoid = getattr(integrate, "cumulative_trapezoid", None) or integrate.cumtrapz

# evaluation point (u, v)
u_val = 0.000795
v_val = 0.000795

# integration step and grids: [0, u_val) for the conditional value and
# [0, 1) for the normalisation constant
d = 0.0001
rng = np.arange(0,u_val,d)
fullrng = np.arange(0,1,d)

# interpolated copula density along the first argument at fixed v_val
c_val_for_norm = interp_model.ev(fullrng, v_val)
c_values = interp_model.ev(rng, v_val)

c = np.trapz(c_values, rng)
norm_const = np.trapz(c_val_for_norm, fullrng)
integral = c / norm_const

print("Non-normalised sum: {}".format(c))
print("Normalisation const: {}".format(norm_const))
print("Normalised sum: {}".format(integral))

# for plotting
pdf_norm_domain = c_values / norm_const
pdf_full_norm_domain = c_val_for_norm / norm_const
cdf_full_norm_domain = _cumulative_trapezoid(pdf_full_norm_domain, fullrng) 
cdf_norm_domain = _cumulative_trapezoid(pdf_norm_domain, rng)

fig,ax = plt.subplots(2,1, figsize=(10,10))
ax[0].plot(fullrng, pdf_full_norm_domain)
ax[0].plot(rng, pdf_norm_domain)
# the cumulative integral has one element fewer than its grid
ax[1].plot(fullrng[1:], cdf_full_norm_domain)
ax[1].plot(rng[1:], cdf_norm_domain)
ax[0].grid()
ax[1].grid()
ax[0].set_title("pdf")
ax[1].set_title("cdf")

# Check for violation
_TOL= 1e-5
_max_cdf = np.max(cdf_full_norm_domain)
_min_pdf = np.min(pdf_norm_domain)
assert _max_cdf <= 1.0 + _TOL, "Violation: CDF exceeds 1 with value {}".format(_max_cdf)
assert integral <= 1.0 + _TOL, "Violation: Conditional exceeds 1 with value {}".format(integral)
# the message reports the value that was actually tested
assert _min_pdf >= 0.0, "Violation: negative pdf with value {}".format(_min_pdf)


In [77]:
# now do it to account for both 
# integrate for particular value of u or v
from scipy import integrate

def _get_range_linear(val=None, width=1e-3):
    '''
        Using this is not recommended as any u,v values with resolution
        smaller than 'width' will not yield sensible results. 
        Args: 
            val: The range upper limit. If None, defaults to 1. 
        Returns: 
            Numpy array within range [0,val] with spacing = width
    '''
    _MINRNG = 0
    _MAXRNG = 1 if val is None else val
    return np.arange(_MINRNG, _MAXRNG, width)

def _get_range_gaussian_transformation(val=None, width=1e-3): 
    '''
        Using this is recommended as opposed to the _get_range_linear as
        this has higher sample density in the extremes and therefore avoids
        the issues with results where val is smaller than the 'width' value. 
        Args: 
            val: The range upper limit. If None, defaults to 1. 
        Returns: 
            Numpy array within range [0,val] with spacing = width
    '''
    _MINRNG = -4
    _MAXRNG_default = 4
    # take account for None and inf values
    cond = ((val is None) or (norm.ppf(val) > _MAXRNG_default))
    _MAXRNG = _MAXRNG_default if cond else norm.ppf(val)
    linrng = np.arange(_MINRNG, _MAXRNG, width)
    return norm.cdf(linrng)

def _integrate(yval, xval): 
    '''Wrapper for numpy numerical integration via trapezoidal rule'''
    return np.trapz(yval, xval)

def MI(u, v, res=1e-3, model=None, debug=False): 
    '''
        Mispricing index pair for one (u, v) observation.
        Each value is the integral of the model density along one argument
        up to the observed value, divided by the integral over the full grid.
        Args: 
            u: Scalar bounded in range [0,1]. 
            v: Scalar bounded in range [0,1]. 
            res: width for numerical integration. Larger value 
            leads to faster computation for less accuracy
            model: object exposing ev(x, y), e.g. the interpolation model.
            debug: when True, print the intermediate integrals.
        Returns: 
            Tuple (C(u|v), C(v|u)).
    '''
    grid_full = _get_range_gaussian_transformation(width=res)
    grid_u = _get_range_gaussian_transformation(u, width=res)
    grid_v = _get_range_gaussian_transformation(v, width=res)

    # partial integrals of c(s, v) over s <= u and of c(u, s) over s <= v
    C_uv = _integrate(model.ev(grid_u, v), grid_u)
    C_vu = _integrate(model.ev(u, grid_v), grid_v)

    # the same integrands over the full grid give the normalisation constants
    Z_uv = _integrate(model.ev(grid_full, v), grid_full)
    Z_vu = _integrate(model.ev(u, grid_full), grid_full)

    mi_uv, mi_vu = C_uv/Z_uv, C_vu/Z_vu

    if debug: 
        print("non-normalised values: \n C(u|v): {}, C(v|u): {}".format(C_uv, C_vu))
        print("normalisation const: \n Z(u|v): {}, Z(v|u): {}".format(Z_uv, Z_vu))
        print("normalised values: \n C(u|v): {}, C(v|u): {}".format(mi_uv, mi_vu))

    return mi_uv, mi_vu

# smoke test of MI at a single (u, v) point, with intermediate values printed
u, v = 0.000795, 0.9
mi_uv, mi_vu = MI(u, v, model=interp_model, res=1e-3, debug=True)

print("Mispricing Index C(u|v): {}".format(mi_uv))
print("Mispricing Index C(v|u): {}".format(mi_vu))


We now have a method for calculating the mispricing index. Let's verify that this generates valid signals. 

In [78]:
def MI_series(x, model, pair):
    """Row adapter for MI: read u and v from ``x`` using the two labels in ``pair``."""
    u_value = x[pair[0]]
    v_value = x[pair[1]]
    return MI(u_value, v_value, model=model)

def signaltype_cond(x, siglvl): 
    '''
    Classify one row of mispricing-index values as a trade signal.
    Long signal is Buy U and Sell V (+(U-V))
    Short signal is Sell U and Buy V (-(U-V))
    Args: 
        x: mapping with keys 'C_uv' and 'C_vu'.
        siglvl: significance level; the thresholds are siglvl (lower)
                and 1 - siglvl (upper).
    Returns: 
        1 for a long signal, -1 for a short signal, 0 otherwise.
    '''
    # Thresholds come from the argument rather than from notebook globals,
    # so the function gives the same answer wherever it is called.
    upper, lower = 1 - siglvl, siglvl
    c_uv, c_vu = x['C_uv'], x['C_vu']

    if c_uv > upper and c_vu < lower: 
        return 1
    if c_uv < lower and c_vu > upper: 
        return -1 
    return 0

test_marginals = marginal_values[list(pair)]
# calculate MI values: one (C_uv, C_vu) tuple per row, expanded to two columns
df_MI_values = \
    test_marginals.apply(MI_series, args=(interp_model, pair), axis=1).apply(pd.Series)
df_MI_values.columns = ['C_uv', 'C_vu']

# identify rows where the MI surpasses the threshold
siglvl = 0.1
UPPER, LOWER = 1 - siglvl, siglvl
# +1 / -1 / 0 per row; computed once and reused for both derived columns
_signal_direction = df_MI_values.apply(signaltype_cond, args=(siglvl,), axis=1)
df_MI_values['signal'] = _signal_direction != 0
df_MI_values['cumulative_signal'] = _signal_direction.cumsum()

# join to main df for comparison (prices rebased to their first row)
prices = close.prices / close.prices.iloc[0]
df = prices[list(pair)].join(df_MI_values, how='left')
df_signal = df[df.signal==True]

# Calculate pnl
_TRADE_SIZE = 100 
# Each row's P&L is the position held coming into that row (previous row's
# cumulative signal) times the change in the U - V spread. Multiplying the
# position by the spread *level* and summing does not measure profit.
_spread = df[pair[0]] - df[pair[1]]
_position_held = df.cumulative_signal.shift(1)
df['pnl'] = (_position_held * _spread.diff() * _TRADE_SIZE).fillna(0).cumsum()


In [79]:
_MARKER = 'o'
_MARKER_SIZE = 2.5
_XLIM = [datetime(2011,6,1), datetime(2012,1,1)]

# df = df[(df.index > _XLIM[0]) & (df.index < _XLIM[1])]

fig, ax = plt.subplots(3, figsize=(20,40))
col2drop = ["C_uv", "C_vu", "signal", "cumulative_signal", "pnl"]

# panel 0: rebased prices of the pair; panel 1: their row-to-row differences
_pair_prices = df.drop(labels=col2drop, axis=1)
_pair_prices.plot(ax=ax[0], marker=_MARKER, ms=_MARKER_SIZE)
returns = _pair_prices.diff()
returns.plot(ax=ax[1], marker=_MARKER, ms=_MARKER_SIZE)

returns_signal = returns[df.signal == True].join(df[["C_uv", "C_vu"]], how='left')

# panel 2 (and overlay on panel 1): difference between the two return series
diff = pd.DataFrame(returns[pair[0]] - returns[pair[1]])
diff.columns = ["diff"]
ax[2].vlines(diff.index.values, [0], diff.values)
ax[1].vlines(diff.index.values, [0], diff.values)
_YLIM = [-0.015,0.015]
ax[2].set_ylim(_YLIM)

# Loop variables use their own names so the notebook-level `data` array and
# `d` step defined in earlier cells are not overwritten.
for index, signal_row in returns_signal.iterrows():
    ax[1].annotate('x<-'+str(round(signal_row["C_uv"],2)), xy=(index, signal_row[pair[0]]))
    ax[1].annotate('x<-'+str(round(signal_row["C_vu"],2)), xy=(index, signal_row[pair[1]]))

diff_signal = diff[df.signal==True].join(df[["C_uv", "C_vu"]], how='left')
for index, signal_row in diff_signal.iterrows(): 
    mi_label = (round(signal_row["C_uv"],2), round(signal_row["C_vu"],2))
    ax[2].annotate('x<-' + str(mi_label), xy=(index, signal_row["diff"]))
    ax[1].annotate('x<-' + str(mi_label), xy=(index, signal_row["diff"]))

# mark every signal date on the price panel with a short vertical tick
_offset = 0.05
for index, signal_row in df_signal.iterrows(): 
    ax[0].vlines(index, signal_row[pair[1]]-_offset, signal_row[pair[1]] + _offset)
    ax[0].vlines(index, signal_row[pair[0]]-_offset, signal_row[pair[0]] + _offset)

# major_ticks = 
# for axis in ax: 
#     axis.grid()

#     axis.set_xticks(major_ticks)
#     axis.set_xticks(minor_ticks, minor=True)
#     axis.set_yticks(major_ticks)
#     axis.set_yticks(minor_ticks, minor=True)

In [80]:
# reset df_ object
# df_ is rebuilt from scratch below, so no explicit reset is required
fig, ax = plt.subplots(figsize=(20,20))
df_ = df.join(diff, how='left')
ax.vlines(df_.index.values, [0], df_['diff'].values)
# Loop variables use their own names so the notebook-level `data` array and
# `d` step defined in earlier cells are not overwritten.
for index, joined_row in df_.iterrows(): 
    mi_label = (round(joined_row["C_uv"],2), round(joined_row["C_vu"],2))
    ax.annotate('x<-' + str(mi_label), xy=(index, joined_row["diff"]))

Take a look at cumulative MI signal

In [81]:
_MARKER = 'o'
_MARKER_SIZE = 2.5
_XLIM = [datetime(2011,6,1), datetime(2012,1,1)]

# limit the data to specified range in _XLIM
# df = df[(df.index > _XLIM[0]) & (df.index < _XLIM[1])]


fig, ax = plt.subplots(4, figsize=(20,40))
col2drop = ["C_uv", "C_vu", "signal", "cumulative_signal", "pnl"]

# First plot - price series
_pair_prices = df.drop(labels=col2drop, axis=1)
_pair_prices.plot(ax=ax[0], marker=_MARKER, ms=_MARKER_SIZE)

# Second plot - Returns difference plot
returns = _pair_prices.diff()
returns_signal = returns[df.signal == True].join(df[["C_uv", "C_vu"]], how='left')
diff = pd.DataFrame(returns[pair[0]] - returns[pair[1]])
diff.columns = ["diff"]
ax[1].vlines(diff.index.values, [0], diff.values)
_YLIM = [-0.015,0.015]
ax[1].set_ylim(_YLIM)

# Third plot - cumulative signal
df.cumulative_signal.plot(ax=ax[2], marker=_MARKER, ms=_MARKER_SIZE)

# Fourth plot - estimated Pnl value (excluding trade commission)
df.pnl.plot(ax=ax[3], marker=_MARKER, ms=_MARKER_SIZE)
ax[3].legend(['Pairs Trading', 'Benchmark'])

# annotations 
# Loop variables use their own names so the notebook-level `data` array and
# `d` step defined in earlier cells are not overwritten.
diff_signal = diff[df.signal==True].join(df[["C_uv", "C_vu"]], how='left')
for index, signal_row in diff_signal.iterrows(): 
    mi_label = (round(signal_row["C_uv"],2), round(signal_row["C_vu"],2))
    ax[1].annotate('x<-' + str(mi_label), xy=(index, signal_row["diff"]))

# mark every signal date on the price panel with a short vertical tick
_offset = 0.1
for index, signal_row in df_signal.iterrows(): 
    ax[0].vlines(index, signal_row[pair[1]]-_offset, signal_row[pair[1]] + _offset)
    ax[0].vlines(index, signal_row[pair[0]]-_offset, signal_row[pair[0]] + _offset)


In [82]:
# Drawdown of the P&L series relative to its running maximum.
# NOTE(review): pnl is a cumulative P&L that can be zero or negative, in which
# case the ratio below is not a meaningful drawdown - confirm the intended base.
Roll_Max = df.pnl.cummax()
Daily_Drawdown = df.pnl/Roll_Max - 1.0
Max_Daily_Drawdown = Daily_Drawdown.cummin()
# Daily_Drawdown is a fraction; scale by 100 so the value matches the "%" suffix
print(f"Max Daily Drawdown: {Max_Daily_Drawdown.iloc[-1] * 100}%")

# Annualised Sharpe ratio from the daily percentage change of the P&L series,
# skipping the first two rows and annualising with 255 periods per year.
_PERIODS_PER_YEAR = 255
daily_return = df.pnl.pct_change(1)
_usable_returns = daily_return.iloc[2:]
sharpe_ratio = _PERIODS_PER_YEAR**0.5*_usable_returns.mean() / _usable_returns.std()
print(f"Sharpe ratio: {sharpe_ratio}")

In [83]:
# Daily percentage changes of the P&L series, skipping the first two rows.
daily_return.iloc[2:]

In [84]:
# SPY close prices divided by their first value.
plt.plot(close.prices['SPY'] / close.prices['SPY'].iloc[0])