![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

In [1]:
# utils 
from scipy.stats import iqr
def freedman_diaconis(emp_data):
    """Freedman-Diaconis histogram bin width for ``emp_data``.

    The width is ``2 * IQR(emp_data) * n ** (-1/3)`` where ``n`` is the
    number of observations in ``emp_data``.
    """
    spread = iqr(emp_data)
    sample_size = len(emp_data)
    shrink = np.power(sample_size, -1/3)
    return 2 * spread * shrink

def square_root_choice(emp_data):
    """Return the square-root-rule number of histogram bins for ``emp_data``.

    The count is ``ceil(sqrt(n))`` with ``n = len(emp_data)``. It is returned
    as a built-in ``int`` because bin counts are passed to functions such as
    ``np.linspace`` and ``np.histogram2d`` that require an integer.
    """
    return int(np.ceil(np.sqrt(len(emp_data))))


In [2]:
from statsmodels.distributions.empirical_distribution import ECDF
from copulas.bivariate.base import Bivariate
from copulas.multivariate import GaussianMultivariate

class Parameter():
    """History-request settings: data resolution plus a start/end window.

    Attributes:
        Resolution: the ``resolution`` argument, stored unchanged.
        Start: ``datetime`` at 09:30:00 on the start date.
        End: ``datetime`` at 16:30:00 on the start date shifted by
            ``n_years``, ``n_months`` and ``n_days``.
    """

    def __init__(self,
                 resolution,
                 start_year,
                 start_month=1,
                 start_day=1,
                 n_years=0,
                 n_months=0,
                 n_days=0):
        """Build the window.

        Args:
            resolution: resolution object handed through to the data request.
            start_year, start_month, start_day: calendar start date.
            n_years, n_months, n_days: length of the window. Months that
                run past December roll into the following year(s); if the
                start day does not exist in the resulting month it is
                clamped to that month's last day; ``n_days`` is then added
                as a plain day offset.

        Raises:
            ValueError: if the start date itself is not a valid date.
        """
        # Function-scope imports so the class does not depend on names the
        # surrounding environment may or may not have pre-imported.
        import calendar
        from datetime import datetime, timedelta

        self.Resolution = resolution
        self.Start = datetime(start_year, start_month, start_day, 9, 30, 0)

        # Month arithmetic on a zero-based month index so overflow past
        # December (or underflow below January) carries into the year.
        carry_years, month_index = divmod(start_month - 1 + n_months, 12)
        end_year = start_year + n_years + carry_years
        end_month = month_index + 1

        # e.g. Jan 31 + 1 month -> Feb 28/29 rather than an invalid date.
        last_day = calendar.monthrange(end_year, end_month)[1]
        end_day = min(start_day, last_day)

        # Adding days via timedelta lets the offset cross month boundaries.
        self.End = (datetime(end_year, end_month, end_day, 16, 30, 0)
                    + timedelta(days=n_days))
    
class Data():
    """Thin wrapper around a price DataFrame with incomplete rows removed."""

    def __init__(self, df):
        # Rows containing any missing value are dropped once, up front.
        cleaned = df.dropna()
        self._df = cleaned

    @property
    def prices(self):
        """The cleaned price frame."""
        return self._df

    @property
    def returns(self):
        """Row-over-row first difference of the prices (first row is NaN)."""
        frame = self._df
        return frame.diff()

    @property
    def columns(self):
        """Column labels of the cleaned frame."""
        frame = self._df
        return frame.columns

    def plot(self, *args, **kwargs):
        """Forward every argument to the frame's ``plot`` and return its result."""
        frame = self._df
        return frame.plot(*args, **kwargs)
    
class EmpiricalCopula():
    """Empirical copula built from paired observations.

    Only ``cdf`` is implemented; ``pdf`` and ``cond_pdf`` are placeholders
    that return ``None``.
    """

    def __init__(self, X):
        '''
            args:
                X: numpy array of shape (n_pairs, 2)
        '''
        self.data = X
        self._len = X.shape[0]

    def __len__(self):
        """Number of observation pairs."""
        return self._len

    @staticmethod
    def _get_bin_width(data):
        """Freedman-Diaconis bin width for ``data``.

        A static method, so it takes no ``self``; it only forwards to the
        module-level ``freedman_diaconis`` helper.
        """
        return freedman_diaconis(data)

    def cdf(self, u, v):
        '''Return C(u,v) = P(U <= u, V <= v).

        Computed as the fraction of stored pairs whose first coordinate is
        <= ``u`` and whose second coordinate is <= ``v``.
        '''
        cnt = np.sum((self.data[:, 0] <= u) & (self.data[:, 1] <= v))
        return float(cnt / self._len)

    def pdf(self, u, v):
        """Placeholder for the copula density; not implemented, returns ``None``."""
        return None

    def cond_pdf(self, u, given_p):
        '''P(U<=u | V=v) = dC(u,v)/dv = P(U<=u, V=v)/p(V=v)

        Placeholder; not implemented, returns ``None``.
        '''
        return None
        
def emp_copula_test():
    """Sanity-check ``EmpiricalCopula`` on a 100-point sample.

    Both coordinates are the integers 1..100 in the same order, and the
    expected values asserted below are ``min(u, v) / 100``.
    """
    # One column vector 1..100 per coordinate, stacked side by side.
    ranks = np.arange(1, 101).reshape(-1, 1)
    u = ranks
    v = ranks.copy()
    x = np.hstack((u, v))
    print("u:",u.shape)
    print("v:",v.shape)
    print("x:",x.shape)

    test_copula = EmpiricalCopula(x)
    assert len(test_copula) == len(u), f"{len(test_copula)} != {len(u)}"

    # Points on the diagonal.
    pvalue = test_copula.cdf(50,50)
    assert pvalue == 0.5, f"p = {pvalue} not 0.5"
    pvalue = test_copula.cdf(25,25)
    assert pvalue == 0.25, f"p = {pvalue} not 0.25"

    # One threshold below every observation: nothing qualifies.
    pvalue = test_copula.cdf(100,0)
    assert pvalue == 0.0, f"p={pvalue} not 0"
    pvalue = test_copula.cdf(0,100)
    assert pvalue == 0.0, f"p={pvalue} not 0"

    # One threshold at the maximum: the other coordinate decides.
    pvalue = test_copula.cdf(20,100)
    assert pvalue == 0.2, f"p={pvalue} not 0.2"
    pvalue = test_copula.cdf(100,20)
    assert pvalue == 0.2, f"p={pvalue} not 0.2"

    # Both thresholds at the maximum: everything qualifies.
    pvalue = test_copula.cdf(100,100)
    assert pvalue == 1.0, f"p={pvalue} not 1.0"


emp_copula_test()
                
        
            


In [4]:
# QuantBook Analysis Tool
# For more information see [https://www.quantconnect.com/docs/research/overview]
# NOTE(review): QuantBook and Resolution are not imported anywhere in this
# export; presumably the QuantConnect research environment provides them.
qb = QuantBook()

# parameters: the window starts on START_YEAR-START_MONTH-START_DAY and is
# extended by the N_* offsets (see Parameter).
START_YEAR = 2005
START_MONTH = 1
START_DAY = 1
N_YEARS = 10
N_MONTHS = 0
N_DAYS = 0
RESOLUTION = Resolution.Daily
PARAM = Parameter(RESOLUTION, START_YEAR, START_MONTH, START_DAY,
                  N_YEARS, N_MONTHS, N_DAYS)

# Specify list of correlated tickers for S&P 500
tickers = ["SPY","XLK", "VGT", "IYW", "IGV"]

# register tickers to quantbook
for ticker in tickers: 
    qb.AddEquity(ticker)
    
# get historical prices for all tickers over [PARAM.Start, PARAM.End]
history = qb.History(tickers, PARAM.Start, PARAM.End, PARAM.Resolution)

# Unpack dataframe: move index level 0 into the columns, giving one frame
# per price field (presumably level 0 is the symbol -- confirm).
Open = history['open'].unstack(level=0)
Close = history['close'].unstack(level=0)
High = history['high'].unstack(level=0)
Low = history['low'].unstack(level=0)

# create data object (only the close prices are wrapped)
close = Data(Close)


In [5]:
# Calculate Kendall Tau between every pair of return columns
corr = close.returns.corr(method='kendall')
# Colour-grade the correlation table with the viridis colormap
corr.style.background_gradient(cmap='viridis')

In [6]:
# Empirically estimate marginal distributions: one ECDF per asset, fitted
# on that asset's return series.
asset_returns = close.returns
marginal_dist = {asset: ECDF(asset_returns[asset]) for asset in close.columns}

# Push every return through its own ECDF, one column per asset.
marginal_values = pd.DataFrame()
for asset, ecdf in marginal_dist.items():
    marginal_values[asset] = asset_returns[asset].apply(ecdf)

In [7]:
def get_max_corr_pair(corr_matrix):
    """Return the labels of the two distinct columns with the highest correlation.

    Args:
        corr_matrix: square pandas DataFrame of pairwise correlations whose
            index and columns carry the same labels in the same order.

    Returns:
        ``(col1, col2)``: the row label and column label of the largest
        entry strictly above the diagonal.

    Raises:
        ValueError: if there is no off-diagonal entry to choose from
            (fewer than two columns, or every candidate is NaN).
    """
    # Keep only the strict upper triangle so self-correlations and the
    # mirrored duplicates below the diagonal are ignored.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    sol = corr_matrix.where(upper_mask).stack().dropna()
    if sol.empty:
        raise ValueError("corr_matrix needs at least two columns to form a pair")

    # The stacked Series is indexed by (row label, column label).
    col1, col2 = sol.idxmax()
    return (col1, col2)

# Pick the most strongly correlated pair from the Kendall matrix
pair = get_max_corr_pair(corr)
print("Pair with maximum correlation: {}".format(pair))

# Two-column array of the ECDF-transformed returns for the selected pair
X = np.array(marginal_values[list(pair)].values)

In [37]:
# Shift every price series to start at zero, then standardise each column
# (subtract its mean, divide by its standard deviation).
close_zeroed = (close._df - close._df.iloc[0])
close_norm = (close_zeroed - close_zeroed.mean())/close_zeroed.std()
# NOTE(review): the two assets are chosen by hard-coded column positions
# 3 and 2 rather than by the `pair` found earlier -- confirm this is intended.
close_norm[[close_norm.columns[3], close_norm.columns[2]]].plot(figsize=(20,10))

# Spread between the two standardised series, plotted in its own figure
diff = close_norm[close_norm.columns[3]] - close_norm[close_norm.columns[2]]
plt.figure()
diff.plot(figsize=(20,10))

# Scatter of one standardised series against the other
plt.figure(figsize=(20,10))
plt.scatter(close_norm[close_norm.columns[3]], close_norm[close_norm.columns[2]])

In [26]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the spread `diff`; the fields read
# below are positions 0, 1 and 4 of the returned tuple.
result = adfuller(diff)
adf_stat, p_value, critical_values = result[0], result[1], result[4]

print('ADF Statistic: %f' % adf_stat)
print('p-value: %f' % p_value)
print('Critical Values:')
for level, threshold in critical_values.items():
    print('\t%s: %.3f' % (level, threshold))

In [7]:
# data: scatter of the ECDF-transformed pair
plt.figure(figsize=(10,10))
plt.scatter(X[:,0], X[:,1], s=1.0)
plt.grid()

# Archimedean copula sample data.
# NOTE(review): the variable is called `gumbel` but the family fitted is
# Clayton; the name is kept so other cells still find it -- confirm which
# family was intended.
gumbel = Bivariate(copula_type="clayton")
gumbel.fit(X)
samples = gumbel.sample(len(marginal_values))
plt.figure(figsize=(10,10))
plt.scatter(samples[:,0], samples[:,1], s=1.0)

# gauss sample data (indexed by column label 0 / 1 rather than by position)
gauss = GaussianMultivariate()
gauss.fit(X)
samples = gauss.sample(len(marginal_values))
plt.figure(figsize=(10,10))
plt.scatter(samples[0], samples[1], s=1.0)

plt.grid()

# 50x50 density-normalised 2-D histogram of the data, drawn as a
# bilinearly interpolated image positioned at the bin centres.
from matplotlib.image import NonUniformImage
fig = plt.figure(figsize=(10, 10))
H, xedges, yedges = np.histogram2d(X[:,0], X[:,1], bins=50, density=True)
ax = fig.add_subplot(title='NonUniformImage: interpolated',
                     aspect='equal', xlim=xedges[[0, -1]], ylim=yedges[[0, -1]])
im = NonUniformImage(ax, interpolation='bilinear')
xcenters = (xedges[:-1] + xedges[1:]) / 2
ycenters = (yedges[:-1] + yedges[1:]) / 2

im.set_data(xcenters, ycenters, H)
im.set_extent(im.get_extent()) # workaround for valuetype error
# Register the image through the public API; the Axes.images list can no
# longer be appended to on current Matplotlib releases.
ax.add_image(im)

print(H)

# plt.figure(figsize=(20,20))
# ax = fig.add_subplot(title='pcolormesh: actual edges',
#          aspect='equal')
# x, y = np.meshgrid(xedges, yedges)
# ax.pcolormesh(x, y, H)
# plt.show()
# print(histgrid)
# print(len(X))

Use Kernel Density Estimation to estimate the copula density

### 1. Naive Kernel Density Estimation


In [8]:

from scipy import stats
from sklearn.neighbors import KernelDensity 
from sklearn.model_selection import GridSearchCV
import multiprocessing as mp

# Switch between cross-validated bandwidth selection and a fixed bandwidth.
_USE_CV = True

if _USE_CV: 
    # use grid search cross-validation to optimize the bandwidth for KDE;
    # candidates are 20 log-spaced values from 1e-5 to 1.
    params = {'bandwidth': np.logspace(-5, 0, 20)}
    _NJOBS = 10
    _CV_SIZE = len(X) # use leave-one-out cv as sample size is small (one fold per observation)
    grid = GridSearchCV(KernelDensity(breadth_first=False), params, verbose=1, n_jobs=_NJOBS, cv=_CV_SIZE)
    grid.fit(X)


    print("Best Esimator Bandwidth: {}".format(grid.best_estimator_.bandwidth))
    model = grid.best_estimator_
    
else: 
    # Fixed-bandwidth fallback: Gaussian kernel with bandwidth 0.1.
    _KERNEL = 'gaussian'
    _BANDWIDTH = float(1/10)
    model = KernelDensity(kernel=_KERNEL, bandwidth=_BANDWIDTH)
    model.fit(X)

# Draw as many synthetic points from the fitted KDE as there are observations.
samples = model.sample(len(X))

In [9]:
# Number of histogram bins per axis (square-root rule). Cast to int because
# np.linspace and np.histogram2d require an integer count.
get_n_bins = lambda x: int(square_root_choice(x)) # alternative: (max(x)-min(x)) / freedman_diaconis(x)

u_nbins = get_n_bins(X[:,0])
v_nbins = get_n_bins(X[:,1])

u_edges = np.linspace(min(X[:,0]), max(X[:,0]), u_nbins)
v_edges = np.linspace(min(X[:,1]), max(X[:,1]), v_nbins)

# original: density-normalised 2-D histogram of the observed data
# (`density=` replaces the removed `normed=` keyword).
H, xedges, yedges = np.histogram2d(X[:,0], X[:,1], bins=[u_nbins, v_nbins], density=True)

xcenters = (xedges[:-1] + xedges[1:]) / 2
ycenters = (yedges[:-1] + yedges[1:]) / 2

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(title='NonUniformImage: interpolated',
                     aspect='equal', xlim=xedges[[0, -1]], ylim=yedges[[0, -1]])
im = NonUniformImage(ax, interpolation='bilinear')
im.set_data(xcenters, ycenters, H)
im.set_extent(im.get_extent()) # workaround for valuetype error
# The Axes.images list can no longer be appended to on current Matplotlib.
ax.add_image(im)

# samples: same plot for the points drawn from the fitted model
H, xedges, yedges = np.histogram2d(samples[:,0], samples[:,1], bins=[u_nbins, v_nbins], density=True)

# The sample histogram has its own bin edges, so its centres must be
# recomputed instead of reusing those of the data histogram.
xcenters = (xedges[:-1] + xedges[1:]) / 2
ycenters = (yedges[:-1] + yedges[1:]) / 2

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(title='NonUniformImage: interpolated',
                     aspect='equal', xlim=xedges[[0, -1]], ylim=yedges[[0, -1]])
im = NonUniformImage(ax, interpolation='bilinear')
im.set_data(xcenters, ycenters, H)
im.set_extent(im.get_extent()) # workaround for valuetype error
ax.add_image(im)

In [10]:
from mpl_toolkits.mplot3d import Axes3D
from scipy import interpolate

# Density-normalised 2-D histogram of the data. `density=` replaces the
# removed `normed=` keyword, and bin counts are cast to int because
# np.histogram2d requires integer counts.
H, xedges, yedges = np.histogram2d(X[:,0], X[:,1], bins=[int(u_nbins), int(v_nbins)], density=True)

# Anchor of every bar: the lower bin edges, nudged by 0.001.
xpos, ypos = np.meshgrid(xedges[:-1] + 0.001, yedges[:-1] + 0.001, indexing="ij")
xpos = xpos.ravel()
ypos = ypos.ravel()
zpos = 0

# Bar footprint (1/nbins per axis) and bar heights (histogram values).
dx = (1/u_nbins)*np.ones_like(zpos)
dy = (1/v_nbins)*np.ones_like(zpos)
dz = H.ravel()

# Fit a bivariate spline through the bar heights and evaluate it on a
# 100x100 grid over the unit square.
tck = interpolate.bisplrep(xpos, ypos, dz)
xnew, ynew = np.mgrid[0:1:100j, 0:1:100j]
z = interpolate.bisplev(xnew[:,0], ynew[0,:], tck)
plt.figure(figsize=(15,10))
plt.pcolor(xnew, ynew, z)
plt.colorbar()

# 3-D view: histogram bars with the spline surface drawn over them.
fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(111, projection='3d')
ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')
ax.plot_surface(xnew, ynew, z, cmap='viridis', edgecolor='none')
ax.view_init(45, 40)


In [11]:
# Smallest value of the spline-interpolated surface `z`
# (presumably a check for the spline dipping below zero -- confirm).
print(np.min(z))

In [12]:
# Two stacked panels: observed data on top, model samples below.
fig, ax = plt.subplots(2, figsize=(10,20))
ax[0].scatter(X[:,0],X[:,1], s=0.3)
ax[1].scatter(samples[:,0],samples[:,1], s=0.3)


### Transformation Estimator

Let $\Phi$ be the standard normal cdf and $\phi$ its density. Then the random vector $(X,Y) = (\Phi^{-1}(U), \Phi^{-1}(V))$ has normally distributed margins and is supported on the full $\mathbb{R}^2$. By Sklar's theorem, its density $f$ can be written as:

$$f(x,y) = c(\Phi(x), \Phi(y))\phi(x)\phi(y)$$

To estimate this density, we transform samples $(U_i,V_i)$ to $(X_i,Y_i) = (\Phi^{-1}(U_i), \Phi^{-1}(V_i))$ for $i = 1,...,n$. To this transformed sample, we now apply the standard kernel density estimator for $(x,y) \in \mathbb{R}^2$: 

$$\hat{f}_n(x,y) = \frac{1}{n}\sum_{i=1}^{n}K_{b_n}(x-X_i)K_{b_n}(y-Y_i)$$

Hence, the copula density can be expressed as : 

$$
    \hat{c}_n(u,v) = \frac{\sum_{i=1}^{n}K_{b_n}\big(\Phi^{-1}(u) - \Phi^{-1}(U_i)\big)K_{b_n}\big(\Phi^{-1}(v) - \Phi^{-1}(V_i)\big)}{n\phi(\Phi^{-1}(u))\phi(\Phi^{-1}(v))}
$$
for all $(u,v) \in [0,1]^2$

In [13]:
from scipy.stats import norm

# Standard normal quantile function and cdf
inv_Phi = norm.ppf
Phi = norm.cdf

# transform from [0,1]^2 to R^2 domain
W = inv_Phi(X)

plt.figure(figsize=(10,10))
plt.scatter(W[:,0], W[:,1], s=0.3)

# use grid search cross-validation to optimize the bandwidth for KDE
params = {'bandwidth': np.logspace(-5, 0, 20)}
_NJOBS = 10
# NOTE(review): the first row of W is excluded from the fit below, so the
# fold count is len(W)-1; presumably that row is non-finite after the
# transform -- confirm.
_CV_SIZE = len(W)-1 # use leave-one-out cv as sample size is small 
grid = GridSearchCV(KernelDensity(breadth_first=False), params, verbose=1, n_jobs=_NJOBS, cv=_CV_SIZE)
grid.fit(W[1:,:])


print("Best Esimator Bandwidth: {}".format(grid.best_estimator_.bandwidth))
model = grid.best_estimator_

# Draw 500 synthetic points per observation from the fitted KDE and plot them
samples = model.sample(len(W)*500)
plt.figure(figsize=(10,10))
plt.scatter(samples[:,0], samples[:,1], s=0.3)



In [14]:
# Evaluate the fitted KDE on a regular grid spanning the sampled region.
# Meshing the raw samples would create an n-by-n grid with n = len(samples),
# so a fixed-size grid over the same range is used instead.
_GRID_SIZE = 100
grid_u = np.linspace(samples[:,0].min(), samples[:,0].max(), _GRID_SIZE)
grid_v = np.linspace(samples[:,1].min(), samples[:,1].max(), _GRID_SIZE)
x_axis, y_axis = np.meshgrid(grid_u, grid_v)

# score_samples requires the query points as an (n_points, 2) array and
# returns log-densities, hence the exp; reshape back onto the grid.
grid_points = np.column_stack((x_axis.ravel(), y_axis.ravel()))
f = np.exp(model.score_samples(grid_points)).reshape(x_axis.shape)
print(x_axis)

In [18]:
# Sample mean and covariance of the transformed data (first row excluded)
mean = np.mean(W[1:,:],axis=0)
cov = np.cov(W[1:,:], rowvar=0)
print(mean)
print(cov)
from scipy.stats import multivariate_normal 
# Fit a bivariate normal with those moments and draw len(W) points from it
rv = multivariate_normal(mean, cov)
samples = rv.rvs(len(W))
plt.figure(figsize=(10,10))
plt.scatter(samples[:,0], samples[:,1], s=0.3)


# Standard normal density at each sample coordinate. This does NOT map the
# samples back to [0,1]^2 (that would need the cdf); the product is
# presumably the phi(x)*phi(y) denominator of the transformation
# estimator -- confirm.
invg_u = norm.pdf(samples[:,0])
invg_v = norm.pdf(samples[:,1])

den = np.multiply(invg_u, invg_v)


In [19]:
# Show the per-sample product of the two standard normal densities
den

In [None]:
close.columns
asset1, asset2 = "VGT 2T", "XLK 2T"

# Scatter of the raw return series of the two assets
returns1 = close.returns[asset1]
returns2 = close.returns[asset2]
plt.figure(figsize=(10,10))
plt.scatter(returns1.values, returns2.values, s=3)

from sklearn import linear_model

# Regression inputs: NaN-free returns as column vectors, plus a grid with
# step 0.001 over the range of asset1's returns for drawing the fitted line.
fit_inputs = returns1.dropna().values.reshape(-1,1)
fit_targets = returns2.dropna().values.reshape(-1,1)
line_grid = np.arange(fit_inputs.min(), fit_inputs.max(), 0.001)

ransac = linear_model.RANSACRegressor()
ransac.fit(fit_inputs, fit_targets)
ransac_x = line_grid
ransac_y = ransac.predict(ransac_x.reshape(-1,1))

# NOTE(review): `ridge` is a second RANSACRegressor, not a ridge model --
# confirm whether a different estimator was intended here.
ridge = linear_model.RANSACRegressor()
ridge.fit(fit_inputs, fit_targets)
ridge_x = line_grid.copy()
ridge_y = ridge.predict(ridge_x.reshape(-1,1))

# Overlay both fitted lines (red and green) on the scatter
plt.scatter(ransac_x, ransac_y, color='r', s=0.5)
plt.scatter(ridge_x, ridge_y, color='g', s=0.5)
plt.grid()

In [11]:
# Number of non-NaN return observations for one asset
close.returns['IGV 2T'].dropna().values.shape

In [12]:
# NOTE(review): no step is passed, so np.arange uses its default step of 1
# over the range of asset1's returns -- confirm this is intended (the other
# cells use a step of 0.001).
ransac_x = np.arange(close.returns[asset1].dropna().values.min(), close.returns[asset1].dropna().values.max())

In [13]:

# Refit RANSAC 1000 times on the same data and keep the fitted coefficient
# of each run. The inputs do not change between runs, so build them once.
refit_inputs = close.returns[asset1].dropna().values.reshape(-1,1)
refit_targets = close.returns[asset2].dropna().values.reshape(-1,1)

coeff_list = []
for i in range(1000):
    ransac = linear_model.RANSACRegressor()
    ransac.fit(refit_inputs, refit_targets)
    coeff_list.append(ransac.estimator_.coef_)

In [14]:
# Distribution of the fitted RANSAC coefficients over all refits
array = np.array(coeff_list).flatten()
plt.hist(array, bins=250)
from scipy.stats import mode 
binned = np.histogram(array, bins=250)

# Use the centre of the most populated bin as the representative
# coefficient; the bin's left edge would sit half a bin width too low.
counts, bin_edges = binned
modal_bin = np.argmax(counts)
coeff = (bin_edges[modal_bin] + bin_edges[modal_bin + 1]) / 2


# Return scatter with the line y = coeff * x drawn over it
plt.figure(figsize=(20,10))
plt.scatter(close.returns[asset1].values, close.returns[asset2].values, s=3)

x = np.arange(close.returns[asset1].dropna().values.min(), close.returns[asset1].dropna().values.max(), 0.001)
y = coeff*x
plt.scatter(x,y,s=3)
# plt.scatter(ridge_x, ridge_y, color='g', s=0.5)


In [15]:
# Both price series shifted to start at zero; asset1 is additionally scaled
# by `coeff` so the two can be compared on one axis.
plt.figure(figsize=(20,10))
(close.prices[asset2]-close.prices[asset2].iloc[0]).plot(linewidth=0.3)
((close.prices[asset1]-close.prices[asset1].iloc[0])*coeff).plot(linewidth=0.3)