![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

In [1]:
# QuantBook Analysis Tool
# For more information see [https://www.quantconnect.com/docs/research/overview]
# This cell creates the QuantBook handle, subscribes the index ETF and pulls its close prices.
qb = QuantBook()
spy = qb.AddEquity("SPY")

# Subscribe every index ETF and keep whatever AddEquity returns, keyed by ticker.
# NOTE(review): "SPY" was already added two lines above, so it is added a second time here —
# confirm the repeated call is harmless.
index_ticker = "SPY"
index_etf = [index_ticker]
symdict = {}
for ticker in index_etf: 
    symdict[ticker] = qb.AddEquity(ticker) 

# Request window and bar size shared by the history requests in this notebook.
ystart = 2006
n_years = 10
# Used further down as a positional row count when splitting plots into two segments.
# NOTE(review): history is requested at Resolution.Hour, so 252*2 rows are bars, not days — confirm.
n_train_days = 252*2
start, end = datetime(ystart,1,11,9,30,0), datetime(ystart+n_years, 1, 1, 16,30,0)    
res = Resolution.Hour
index_history = qb.History(index_etf, start, end, res)
# tickers = history.index.get_level_values(level=0).unique()
# Pivot index level 0 into columns so that each column holds one instrument's close series.
index_close = index_history['close'].unstack(level=0)
index_close

In [2]:
def min_max_df_scale(df):
    """Rescale values linearly using the minimum and maximum of the data.

    ``df.min()`` and ``df.max()`` supply the bounds, so a DataFrame is scaled
    column by column (pandas reduces along the index by default).

    Returns ``(df - min) / (max - min)``.
    """
    lower = df.min()
    spread = df.max() - lower
    return (df - lower) / spread

def norm_df_scale(df):
    """Standardise values: subtract the mean and divide by the standard deviation.

    Both statistics come from ``df.mean()`` / ``df.std()`` (pandas defaults), so a
    DataFrame is standardised column by column.
    """
    centred = df - df.mean()
    return centred / df.std()

def rel_norm_df_scale(df):
    """Express values as the change from the first row, in units of standard deviation.

    Returns ``(df - df.iloc[0]) / df.std()``; the first row of the result is zero.
    """
    first_row = df.iloc[0]
    return (df - first_row) / df.std()

def rel_log_df_scale(df):
    """Return the natural log of each value relative to the first row.

    Computes ``log(df / df.iloc[0])``; the first row of the result is zero.
    """
    ratio = df / df.iloc[0]
    return np.log(ratio)


We investigate relationships between assets and their corresponding market index. 

In [3]:
# Subscribe the sector ETFs, pull their close prices and derive normalised prices and log returns.
# tickers = ["MSFT", "GOOGL", "AAPL", "AMZN", "FB", "JPM", "JNJ", "KO", "PEP", "WFC", "C", "IBM"]
tickers = ["XLK", "VGT", "IYW", "FTEC", "IGV"]
securities = []
for ticker in tickers: 
    securities.append(qb.AddEquity(ticker))

history = qb.History(tickers, start, end, res)
# tickers = history.index.get_level_values(level=0).unique()

close = history['close'].unstack(level=0) # one column of close prices per instrument
# axis=1 averages across the columns of index_close, giving one index value per timestamp.
# NOTE(review): this column is stored under the key "SPY", while later cells read "SPY 2T" —
# confirm which column label the history request actually produces.
close["SPY"] = index_close.mean(axis=1)

# normalize price values (z-score per column, see norm_df_scale)
norm_close = norm_df_scale(close)
# get log returns between consecutive rows; the first row is NaN because of the shift
# returns = np.log(close / close.shift(1))#.dropna()
returns = np.log(close/close.shift(1))#.dropna(axis=1,how='all').dropna(axis=0,how='any')

# -- plots --
norm_close.plot(figsize=(20,10)) # plot normalised price series
plt.grid()

returns.plot(figsize=(20,10)) # plot log returns
plt.grid()


In [4]:
# Compare one asset with the index: price and return overlays, then a return-vs-return scatter.
import matplotlib.cm as cm 
name = "XLK"
ticker = name + " 2T"  # column label used for this asset in norm_close / returns

norm_close[[ticker, "SPY 2T"]].plot(figsize=(20,10))
returns[[ticker, "SPY 2T"]].plot(figsize=(20,10))

sec = returns[ticker].values
markt = returns["SPY 2T"].values

# Both arrays are columns of the same frame, so they have one entry per row of `returns`.
assert len(sec) == len(markt)
plt.figure(figsize=(20,10))
# One rainbow colour per row, in row order, so the colour encodes position in the sample.
colours = cm.rainbow(np.linspace(0,1,len(sec)))
plt.scatter(sec, markt, c=colours, marker='x', s=5)

plt.grid()
plt.xlabel(name)
plt.ylabel('SPY')


In [5]:
# Spearman rank correlations of the returns over the first two years (365*2 calendar days)
# counted from the first timestamp in `returns`.
returns_end = returns.index[0] + timedelta(days=365*2)
returns_corr_spearman = returns.loc[returns.index < returns_end].corr(method="spearman")
returns_corr_spearman.style.background_gradient(cmap='inferno') # render the matrix as a heat-mapped table

In [6]:
# Kendall tau correlations of the returns over the first two years (365*2 calendar days)
# counted from the first timestamp in `returns`.
returns_end = returns.index[0] + timedelta(days=365*2)
returns_corr_kendall = returns.loc[returns.index < returns_end].corr(method="kendall")
returns_corr_kendall.style.background_gradient(cmap='inferno') # render the matrix as a heat-mapped table

In [7]:
# For every asset whose Kendall correlation with the base ticker is high, draw one row of
# three panels: normalised prices, price spread, and a scatter of returns.

# base ticker
base_ticker = "SPY 2T"

# Keep assets with correlation in [0.5, 1); the strict upper bound drops the base
# ticker's correlation with itself.
corr = returns_corr_kendall
high_corr_tickers = corr[base_ticker][(corr[base_ticker] >= 0.5) & (corr[base_ticker] < 1)].index
high_corr_df = norm_close[high_corr_tickers]

n_plots = 3
n_assets = len(high_corr_tickers)
print(n_assets)


def _plot_train_test(axis, x_values, y_values, colour, label=None):
    """Draw the first n_train_days points solid and all remaining points dotted.

    Only the solid segment carries `label`, so a legend shows one entry per series.
    """
    axis.plot(x_values[0:n_train_days], y_values[0:n_train_days],
              c=colour, linestyle='-', label=label)
    # Slice to the end: the previous `[n_train_days:-1]` silently dropped the last sample.
    axis.plot(x_values[n_train_days:], y_values[n_train_days:], c=colour, linestyle=':')


# plt.subplots rejects zero rows, so only build the figure when something qualified.
if n_assets > 0:
    # squeeze=False keeps `ax` two-dimensional even for a single asset, so one code
    # path serves every value of n_assets.
    fig, ax = plt.subplots(n_assets, n_plots, figsize=(20,10*n_assets), squeeze=False)
    market = norm_close[base_ticker].values

    for i, col in enumerate(high_corr_df.columns):
        asset = high_corr_df[col].values
        price_ax, spread_ax, scatter_ax = ax[i]

        # Panel 1: normalised prices of the base ticker (red) and the asset (blue).
        _plot_train_test(price_ax, norm_close.index, market, 'r', label=base_ticker)
        _plot_train_test(price_ax, norm_close.index, asset, 'b', label=col)
        price_ax.grid()
        price_ax.legend()

        # Panel 2: spread between the two normalised price series.
        # NOTE(review): rows with NaN are dropped first, so the positional split here can
        # fall on a different timestamp than in panel 1 — confirm this is acceptable.
        spread = (norm_close[base_ticker] - high_corr_df[col]).dropna()
        _plot_train_test(spread_ax, spread.index, spread.values, 'g')
        spread_ax.grid()

        # Panel 3: asset returns against base-ticker returns.
        scatter_ax.scatter(returns[base_ticker].values, returns[col].values, s=3, marker='x')
        scatter_ax.grid()
        scatter_ax.set_xlabel(base_ticker)
        scatter_ax.set_ylabel(col)
        


We try to model the marginal distributions.

In [8]:
from scipy.stats import t, laplace

# Parametric family fitted to the return marginals by the helpers and cells below;
# swap in `t` (imported above) to try a Student-t fit instead.
distribution = laplace
# ----------------------------------------------------------------------------------------------------
# plotting helpers for the marginal distributions
# ----------------------------------------------------------------------------------------------------
def plot_kde_vs_dist(some_returns, ax=None, title=None):
    """Overlay a histogram, a KDE and a fitted parametric density for one return series.

    Parameters
    ----------
    some_returns : pandas.Series
        Returns to analyse; NaNs are dropped before the parametric fit.
    ax : matplotlib axes, optional
        Axes to draw on. When omitted, the axes created by the histogram call are used.
    title : str, optional
        Title for the axes.

    The parametric curve is a KDE of 2000 random draws from the module-level
    `distribution` fitted by maximum likelihood, so it varies slightly between runs.
    """
    # Histogram of the raw data, normalised so it is comparable with the density curves.
    if ax is None:
        ax = some_returns.hist(figsize=(20,10), bins=500, density=True)
    else:
        some_returns.hist(figsize=(20,10), bins=500, density=True, ax=ax)

    # Non-parametric estimate of the same data.
    some_returns.plot.kde(linestyle='-', linewidth=3, ax=ax)

    # Fit the configured distribution and visualise it through simulated draws.
    par = distribution.fit(some_returns.dropna())
    print(f"Distribution ML parametrs:{par}")
    simulated = pd.DataFrame(distribution.rvs(*par, 2000))
    simulated.plot.kde(linestyle='--', linewidth=3, ax=ax)

    # Label the fitted curve with the distribution's short name; str(distribution) is
    # the object repr and is unreadable in a legend.
    dist_label = getattr(distribution, 'name', str(distribution))
    ax.legend(['KDE', dist_label])
    ax.set_xlim([-0.025,0.025])
    ax.grid()
    ax.set_title(title)
    
def compare_dist_plot(returns_list, legend_list=None):
    """Plot, on one new figure, the fitted density for each return window in a list.

    Parameters
    ----------
    returns_list : iterable of pandas.DataFrame
        Return windows. All-NaN columns are dropped, then rows containing any NaN.
        Windows with no data left are skipped.
    legend_list : sequence of str, optional
        One label per entry of `returns_list`. Missing labels fall back to
        ``"series <position>"``.

    Each curve is a KDE of 2000 random draws from the module-level `distribution`
    fitted to the window, so curves vary slightly between runs.
    NOTE(review): a window with several columns is passed to `fit` as a 2-D frame —
    confirm that pooling all columns into one fit is intended.
    """
    fig, ax = plt.subplots(1,1, figsize=(20,10))

    # Labels are collected only for windows that were actually drawn, so a skipped
    # window cannot shift the remaining labels onto the wrong curves.
    labels = []
    for position, window in enumerate(returns_list):
        r = window.dropna(axis=1,how='all').dropna(axis=0,how='any')
        if r.size == 0:
            continue

        par = distribution.fit(r)
        print(f"Distribution ML parametrs: {par}")
        simulated = pd.DataFrame(distribution.rvs(*par, 2000))
        simulated.plot.kde(linestyle='--', linewidth=3, ax=ax)

        if legend_list is not None and position < len(legend_list):
            labels.append(legend_list[position])
        else:
            labels.append(f"series {position}")

    # ax.legend(None) raises, so only pass labels when there are curves to label.
    if labels:
        ax.legend(labels)
    ax.set_xlim([-0.025, 0.025])
    

# Two side-by-side panels for the index: 24-month and 12-month return distributions.
fig, ax = plt.subplots(1,2, figsize=(40,40), sharex=True, sharey=True)

# Nested windows of `returns`, all anchored at the first timestamp.
window_start = returns.index[0]
some_returns = {
    '24months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=24)],
    '12months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=12)],
    '6months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=6)],
    '3months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=3)],
}

plot_kde_vs_dist(some_returns['24months']['SPY 2T'], ax[0], title="24 Months")
plot_kde_vs_dist(some_returns['12months']['SPY 2T'], ax[1], title="12 Months")

# Fitted densities of all four windows on one figure, labelled by window name.
returns_lst = list(some_returns.values())
legend_lst  = list(some_returns.keys())
compare_dist_plot(returns_lst, legend_lst)


In [9]:
# Marginal return distributions of each highly correlated asset, compared with the
# parametric `distribution` configured above.
# (Earlier note from the author: in general, a t-distribution does not seem to be a good fit.)

# The windows do not depend on the ticker, so build them once outside the loop.
# The '24months' window previously used months=120, contradicting its key and plot title.
# NOTE(review): if the whole 10-year sample was intended, rename the key and title instead.
window_start = returns.index[0]
some_returns = {
    '24months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=24)],
    '12months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=12)],
    '6months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=6)],
    '3months' : returns.loc[window_start:window_start+pd.offsets.DateOffset(months=3)],
}
legend_lst = list(some_returns.keys())

for ticker in high_corr_tickers:
    fig, ax = plt.subplots(1,2, figsize=(40,40), sharex=True, sharey=True)

    # Plot the asset named in the title; the column was previously hard-coded to 'SPY 2T',
    # so every iteration drew the index under a different title.
    plot_kde_vs_dist(some_returns['24months'][ticker], ax[0], title="24 Months for "+str(ticker))
    plot_kde_vs_dist(some_returns['12months'][ticker], ax[1], title="12 Months for "+str(ticker))

    # Single-column frames keep the comparison specific to this ticker instead of
    # redrawing the same all-asset figure on every iteration.
    returns_lst = [window[[ticker]] for window in some_returns.values()]
    compare_dist_plot(returns_lst, legend_lst)


In [10]:
# Fit the parametric `distribution` to the index and to one asset, push each return through
# its fitted CDF, and scatter the two sets of CDF values against each other.
ticker = "XLK 2T"

# Drop rows where either series is missing so both arrays stay aligned row by row;
# cleaning each column separately can produce arrays of different lengths.
joint_returns = returns[['SPY 2T', ticker]].dropna()

# fit index marginal
index_params = distribution.fit(joint_returns['SPY 2T'])
# Pass the tuple itself: unpacking it into a single "{}" printed only the first parameter.
print("Index parameters: {}".format(index_params))
index_cdf = distribution.cdf(joint_returns['SPY 2T'], *index_params)

# fit asset marginal
asset_params = distribution.fit(joint_returns[ticker])
print(ticker + " parameters: {}".format(asset_params))
asset_cdf = distribution.cdf(joint_returns[ticker], *asset_params)

print(f"{len(index_cdf)},{len(asset_cdf)}")
plt.figure(figsize=(10,10))
plt.scatter(index_cdf, asset_cdf, s=0.9)
plt.xlabel("Index")
plt.ylabel(ticker)
plt.grid()

In [11]:
# Rather than use a parametric distribution, use the empirical CDF instead,
# assuming we have enough data.
from statsmodels.distributions.empirical_distribution import ECDF

# ["XLK", "VGT", "IYW", "FTEC", "IGV"]
index_ticker = "SPY 2T"
asset_ticker = "XLK 2T"

# Keep only rows where both series are present so the two CDF arrays pair up row by row.
joint_returns = returns[[index_ticker, asset_ticker]].dropna()

index_data = joint_returns[index_ticker]
index_dist = ECDF(index_data) # initialise empirical cdf with data
index_cdf = index_dist(index_data)

asset_data = joint_returns[asset_ticker]
asset_dist = ECDF(asset_data)
asset_cdf = asset_dist(asset_data)

print(f"{len(index_cdf)},{len(asset_cdf)}")
plt.figure(figsize=(10,10))
plt.scatter(index_cdf, asset_cdf, s=0.9)
plt.xlabel("Index")
# Label with this cell's own asset; `ticker` is a leftover global from an earlier cell.
plt.ylabel(asset_ticker)
plt.grid()


In [12]:
import sys 
from scipy.integrate import quad as integrate
from scipy.optimize import minimize

def ClaytonCdf(u,v,theta):
    """Clayton copula CDF, ``C(u, v) = max(u^-theta + v^-theta - 1, 0)^(-1/theta)``.

    Parameters
    ----------
    u, v : float or array-like
        Marginal CDF values; the two inputs are broadcast against each other.
    theta : float
        Copula parameter. ``theta == 0`` is treated as the independence limit ``u * v``.

    Returns
    -------
    numpy.ndarray
        CDF values with the broadcast shape of `u` and `v`. Entries where either
        input is not strictly positive are 0.
    """
    # The signature uses lowercase names; the body previously read undefined `U` / `V`.
    U = np.asarray(u, dtype=float)
    V = np.asarray(v, dtype=float)

    if theta == 0:
        return np.multiply(U, V)

    positive = (U > 0) & (V > 0)
    # Non-positive inputs produce inf/nan intermediates that the mask below discards.
    with np.errstate(divide='ignore', invalid='ignore'):
        base = np.power(U, -theta) + np.power(V, -theta) - 1
        # The clamp only matters for negative theta, where `base` can drop below zero.
        values = np.power(np.maximum(base, 0), -1.0 / theta)

    return np.where(positive, values, 0.0)

def GumbelCdf(U,V,theta):
    """Gumbel copula CDF, ``exp(-((-ln U)^theta + (-ln V)^theta)^(1/theta))``.

    For ``theta == 1`` the product ``U * V`` is returned directly.
    """
    if theta == 1:
        return np.multiply(U, V)

    neg_log_u = -np.log(U)
    neg_log_v = -np.log(V)
    inner = np.power(neg_log_u, theta) + np.power(neg_log_v, theta)
    return np.exp(-np.power(inner, 1.0 / theta))

def FrankCdf(u,v,theta):
    """Frank copula CDF.

    ``C(u, v) = -1/theta * ln(1 + (e^(-theta u) - 1)(e^(-theta v) - 1) / (e^(-theta) - 1))``

    Parameters
    ----------
    u, v : float or array-like
        Marginal CDF values; the two inputs are broadcast against each other.
    theta : float
        Copula parameter. ``theta == 0`` is treated as the independence limit ``u * v``.
    """
    # The signature uses lowercase names; the body previously read undefined `U` / `V`.
    U = np.asarray(u, dtype=float)
    V = np.asarray(v, dtype=float)

    if theta == 0:
        # The general formula divides by theta and by exp(-theta) - 1, both zero here.
        return np.multiply(U, V)

    # expm1 / log1p compute exp(x) - 1 and log(1 + x) without cancellation for small x.
    num = np.multiply(np.expm1(np.multiply(-theta, U)), np.expm1(np.multiply(-theta, V)))
    den = np.expm1(-theta)

    return -1.0 / theta * np.log1p(num / den)

def ClaytonPdf(U,V,theta):
    """Clayton copula density.

    ``(theta + 1) * (U V)^-(theta + 1) * (U^-theta + V^-theta - 1)^(-(2 theta + 1) / theta)``
    """
    exponent = -(2 * theta + 1) / theta
    kernel = np.power(U, -theta) + np.power(V, -theta) - 1
    scale = (theta + 1) * np.power(np.multiply(U, V), -(theta + 1))
    return scale * np.power(kernel, exponent)

def GumbelPdf(U,V,theta):
    """Gumbel copula density.

    With ``s = (-ln U)^theta + (-ln V)^theta`` and ``C = exp(-s^(1/theta))``::

        c(U, V) = C / (U V) * s^(-2 + 2/theta) * (ln U * ln V)^(theta - 1)
                  * (1 + (theta - 1) * s^(-1/theta))

    Parameters
    ----------
    U, V : float or array-like
        Marginal CDF values in (0, 1]; broadcast against each other.
    theta : float
        Copula parameter. ``theta == 1`` is the independence copula.
    """
    if theta == 1:
        # The independence copula has CDF U*V and therefore density 1 everywhere.
        # The previous code returned U*V here, which is the CDF, not the density.
        return np.ones(np.broadcast(U, V).shape)

    log_u = np.log(U)
    log_v = np.log(V)
    tmp = np.power(-log_u, theta) + np.power(-log_v, theta)

    # Copula CDF, computed from the already available `tmp`.
    cdf = np.exp(-np.power(tmp, 1.0 / theta))

    a = 1.0 / np.multiply(U, V)
    b = np.power(tmp, -2 + 2.0 / theta)
    c = np.power(np.multiply(log_u, log_v), theta - 1)
    d = 1 + (theta - 1) * np.power(tmp, -1.0 / theta)

    return cdf * a * b * c * d

def FrankPdf(u,v,theta):
    """Frank copula density.

    With ``g(z) = exp(-theta z) - 1``::

        c(u, v) = -theta * g(1) * (1 + g(u + v)) / (g(u) g(v) + g(1))^2

    Parameters
    ----------
    u, v : float or array-like
        Marginal CDF values; broadcast against each other.
    theta : float
        Copula parameter. ``theta == 0`` is the independence copula.
    """
    # The signature uses lowercase names; the body previously read undefined `U` / `V`.
    U = np.asarray(u, dtype=float)
    V = np.asarray(v, dtype=float)

    if theta == 0:
        # The independence copula has density 1 everywhere (u*v is its CDF).
        return np.ones(np.broadcast(U, V).shape)

    # Same generator as FrankCdf: exp(-theta z) - 1. With the opposite sign in the
    # exponent the expression is negative for theta > 0, which is not a density.
    def _g(z):
        return np.expm1(np.multiply(-theta, z))

    num = np.multiply(np.multiply(-theta, _g(1)), 1 + _g(np.add(U, V)))
    aux = np.multiply(_g(U), _g(V)) + _g(1)
    den = np.power(aux, 2)

    return num / den

def GumbelConditionalPdf(u,given_v,theta):
    """Conditional distribution of U given V = v under the Gumbel copula.

    Evaluates the partial derivative of the copula CDF with respect to v. With
    ``s = (-ln u)^theta + (-ln v)^theta`` and ``C = exp(-s^(1/theta))``::

        dC/dv = C * s^((1 - theta) / theta) * (-ln v)^(theta - 1) / v

    Despite the function name, this is a conditional CDF: it rises from 0 to 1 as
    `u` goes from 0 to 1.

    Parameters
    ----------
    u : float or array-like
        Value(s) of the first margin in (0, 1].
    given_v : float or array-like
        Conditioning value(s) of the second margin in (0, 1).
    theta : float
        Gumbel copula parameter.
    """
    v = given_v
    neg_log_u = -np.log(u)
    neg_log_v = -np.log(v)
    s = neg_log_u**theta + neg_log_v**theta

    # The leading factor is the copula CDF. The copula density was used here before,
    # which does not give a function that reaches 1 at u = 1.
    C = np.exp(-s**(1.0/theta))
    K = s**((1-theta)/theta)
    Q = neg_log_v**(theta-1)/v
    res = C*K*Q

    return res

def ClaytonConditionalPdf(u,given_v,theta):
    """Conditional distribution of U given V = v under the Clayton copula.

    Evaluates the partial derivative of the copula CDF with respect to v::

        dC/dv = v^(-theta - 1) * (u^-theta + v^-theta - 1)^(-1/theta - 1)

    Despite the function name, this is a conditional CDF: it equals 1 at ``u = 1``.

    Parameters
    ----------
    u : float or array-like
        Value(s) of the first margin in (0, 1].
    given_v : float or array-like
        Conditioning value(s) of the second margin in (0, 1].
    theta : float
        Clayton copula parameter (non-zero).
    """
    v = given_v
    # Differentiating (...)^(-1/theta) lowers the exponent to -1/theta - 1; the
    # previous exponent 1/theta - 1 had the wrong sign on the first term.
    K = (u**(-theta)+v**(-theta)-1)**(-1.0/theta-1)
    res = v**(-theta-1)*K
    return res

def FrankConditionalPdf(u, given_v, theta):
    """Conditional distribution of U given V = v under the Frank copula.

    Evaluates the partial derivative of the copula CDF with respect to v. With
    ``f(x) = exp(-x) - 1``::

        dC/dv = (f(theta u) f(theta v) + f(theta u)) / (f(theta u) f(theta v) + f(theta))

    Despite the function name, this is a conditional CDF: it is 0 at ``u = 0`` and
    1 at ``u = 1``.

    Parameters
    ----------
    u : float or array-like
        Value(s) of the first margin in [0, 1].
    given_v : float or array-like
        Conditioning value(s) of the second margin.
    theta : float
        Frank copula parameter (non-zero).
    """
    v = given_v

    # The helper must accept its argument; it was declared as a zero-argument lambda
    # and failed as soon as it was called.
    def f(x):
        return np.exp(-x) - 1

    shared = f(theta*u)*f(theta*v)
    den = shared + f(theta)
    num = shared + f(theta*u)

    res = num/den

    return res
    

class ArchimedeanCopula():
    def __init__(self, family, tau): 
        
        self._theta_func = {
            'Gumbel' : self._gumbel_theta(tau),
            'Clayton': 2*tau/(1-tau),
            'Frank'  : self._frank_theta(tau)
        }
        
        self._pdf_func = {
            'Gumbel' : GumbelPdf, 
            'Clayton': ClaytonPdf, 
            'Frank'  : FrankPdf
        }
        
        self._cdf_func = {
            'Gumbel' : GumbelCdf, 
            'Clayton': ClaytonCdf, 
            'Frank'  : FrankCdf
        }
        
        self.theta = self._theta_func[family]
        self._pdf = self._pdf_func[family]
        self._cdf = self._cdf_func[family]
    
    def _gumbel_theta(self, tau): 
        if tau == 1: 
            raise ValueError('Kendall Tau value for Gumbel Copula cannot be 1')
        return 1/(1-tau)
    
    def _frank_theta(self, tau): 
        
        # integrand
        integrand = lambda t: t / (np.exp(t) - 1) 
        
        # theta function
        frank_fnc = \
        lambda theta: ((tau -1) / 4. - (integrate(integrand, sys.float_info.epsilon, theta)[0]/theta - 1)) / theta
        theta = minimize(frank_fnc, 4, method='BFGS',tol=1e-5).x
        
        return theta 
    
    def pdf(self, u, v):
        return self._pdf(u,v,self.theta)
    
    def cdf(self, u, v):
        return self._cdf(u,v,self.theta)
        

In [13]:
from scipy.stats import kendalltau
# Empirical CDF values of index and asset returns, plus their Kendall tau.
# ["XLK", "VGT", "IYW", "FTEC", "IGV"]
index_ticker = "SPY 2T"
asset_ticker = "XLK 2T"

# kendalltau needs two equal-length, row-aligned samples, so drop every row where
# either series is missing instead of cleaning each column on its own.
joint_returns = returns[[index_ticker, asset_ticker]].dropna()

index_data = joint_returns[index_ticker]
index_dist = ECDF(index_data) # initialise empirical cdf with data
index_cdf = index_dist(index_data)

asset_data = joint_returns[asset_ticker]
asset_dist = ECDF(asset_data)
asset_cdf = asset_dist(asset_data)

tau = kendalltau(asset_data, index_data)
print(f"{index_ticker} & {asset_ticker} Kendall Tau {tau.correlation}")

In [14]:
# Build one copula per family, each parameterised from the Kendall tau computed above.
Gumbel = ArchimedeanCopula('Gumbel', tau.correlation)
Clayton = ArchimedeanCopula('Clayton', tau.correlation)

In [15]:
# Clayton copula density evaluated at each pair of empirical CDF values (asset, index).
pdfs = Clayton.pdf(asset_cdf, index_cdf)

In [16]:
# 3-D surface of the selected copula's density over the unit square.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np

copula = Clayton

fig = plt.figure(figsize=(20,10))
# add_subplot works across Matplotlib versions; gca(projection=...) was deprecated
# and later removed.
ax = fig.add_subplot(111, projection='3d')

# NOTE(review): not used by the plot below — the surface uses copula.theta. Kept only
# in case a later cell reads it.
theta = 3

# Stay strictly inside (0, 1): the density is evaluated with negative powers of u and v.
x = np.arange(0.01, 1, 0.01)
y = np.arange(0.01, 1, 0.01)
X,Y = np.meshgrid(x,y)

# Evaluate on the grid directly so Z[i, j] matches (X[i, j], Y[i, j]). The previous
# element-wise loop filled Z[i, j] from (x[i], y[j]), the transpose of the meshgrid layout.
Z = np.asarray(copula.pdf(X, Y), dtype=float)

# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

# Customize the z axis.
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
angle = -90
ax.view_init(60, angle)
# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)

plt.show()