![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

# Copula KDE Model

Here, we will research and develop a model for estimating copula density from returns distributions. In the previous Jupyter notebook, the transformation method was used to estimate the copula density for stock returns of the S&P 500 ETFs. This was done as an initial investigation into the feasibility of using this approach. 

Since it has been established that a KDE model may be appropriate, here we will try to develop a data pipeline involving: 
* Preprocessing asset price data
* Fitting a model to the data
* Optimising the model
    * What kernels are most effective
    * How to quantitatively evaluate the performance of a model
* Generating insights
    * Lookback window for training
    * Frequency of refitting the model to new data

This will provide insights into the components necessary for building an alpha model. 

In [80]:
#------------------------------------------------------------------------------------------
# util objects 
#------------------------------------------------------------------------------------------
from statsmodels.distributions.empirical_distribution import ECDF
from copulas.bivariate.base import Bivariate
from copulas.multivariate import GaussianMultivariate

class Parameter():
    '''
        Date window and data resolution for a history request.

        Attributes:
            Resolution: the resolution object passed in, stored unchanged.
            Start: datetime of the start date at 09:30:00.
            End: datetime of the start date shifted by n_years, n_months
                 and n_days, at 16:30:00.
    '''
    def __init__(self,
                resolution,
                start_year, 
                start_month=1, 
                start_day=1,
                n_years=0,
                n_months=0, 
                n_days=0): 
        '''
            Args:
                resolution: data resolution, stored as-is
                start_year: calendar year of the window start
                start_month: calendar month of the window start (1-12)
                start_day: day of month of the window start
                n_years: number of years to add to the start date
                n_months: number of months to add to the start date
                n_days: number of days to add to the start date
            Raises:
                ValueError: if the start date is not a valid calendar date
        '''
        # Local imports keep this block self-contained; `datetime` itself is
        # expected to be in scope already, as in the original code.
        import calendar
        from datetime import timedelta

        self.Resolution = resolution
        self.Start = datetime(start_year, start_month, start_day, 9, 30, 0)

        # Roll month overflow into the year so that e.g. November + 3 months
        # lands in February of the following year instead of "month 14".
        month_index = (start_month - 1) + n_months
        end_year = start_year + n_years + month_index // 12
        end_month = month_index % 12 + 1

        # Clamp the anchor day to the length of the end month (Jan 31 + 1 month
        # becomes the last day of February), then add the day offset with
        # timedelta so it can cross month and year boundaries.
        last_day = calendar.monthrange(end_year, end_month)[1]
        anchor_day = min(start_day, last_day)
        self.End = (datetime(end_year, end_month, anchor_day, 16, 30, 0)
                    + timedelta(days=n_days))
    
class Data():
    '''
        Wrapper around a price DataFrame (one column per asset).

        Rows containing NaN are dropped on construction. The object exposes
        the prices, their first differences ("returns"), the ECDF-transformed
        returns ("marginals") and pairwise correlations of the returns.
    '''
    def __init__(self, df):
        '''
            Args:
                df: DataFrame of prices with one column per asset
        '''
        self._df = df.dropna()
        self.marginals = pd.DataFrame()
        
        self.init_data()
        
    def init_data(self): 
        '''Populate the derived data (currently only the marginals).'''
        self.get_marginals()
    
    @property 
    def prices(self):
        '''The NaN-free price DataFrame.'''
        return self._df
    
    @property
    def returns(self):
        '''
            Row-to-row first differences of the prices. The first row is NaN
            because it has no predecessor.
        '''
        return self._df.diff() 
    
    @property
    def columns(self):
        '''Column labels of the price DataFrame.'''
        return self._df.columns
    
    @property
    def assets(self):
        '''Alias of `columns`: one label per asset.'''
        return self._df.columns
    
    def get_marginals(self):
        '''
            Fill `self.marginals` with the ECDF-transformed returns of every
            asset and return it.
        '''
        for asset in self.assets: 
            self.marginals[asset] = self.fit_transform_ECDF(asset)
        return self.marginals
    
    def fit_ECDF(self, asset=None):
        '''
            Return the Empirical Cumulative Distribution Function fit to the
            returns data for the specified asset. If asset is None, return a
            dictionary mapping every asset to its fitted ECDF. 
        '''
        # Compare against None explicitly so that falsy column labels
        # (e.g. 0 or "") still select a single asset.
        if asset is not None:
            return ECDF(self.returns[asset])
        returns = self.returns
        return {inst: ECDF(returns[inst]) for inst in self._df.columns}
    
    def transform(self, model, asset=None):
        '''
            Transform the returns with the callable `model`. With an asset the
            model is called on that asset's returns; with asset=None it is
            applied to every column.
        '''
        if asset is not None:
            return model(self.returns[asset])
        return self.returns.apply(model, axis=0)
    
    def fit_transform_ECDF(self, asset):
        '''
            Return the returns of `asset` after fitting an ECDF to them and
            evaluating that ECDF at every observation. 
        '''
        ecdf = self.fit_ECDF(asset=asset)
        return self.returns[asset].apply(ecdf)
    
    def plot(self, *args, **kwargs): 
        '''Plot the prices; arguments are forwarded to DataFrame.plot.'''
        return self._df.plot(*args, **kwargs)
    
    def _get_corr_pair_desc(self, corr):
        '''
            Return a Series of the strictly-upper-triangle entries of the
            correlation matrix `corr`, indexed by (row, column) pair and
            sorted in descending order.
        '''
        # Builtin bool replaces the removed np.bool alias.
        upper = np.triu(np.ones(corr.shape), k=1).astype(bool)
        # dropna() keeps only the unmasked pairs whether or not stack()
        # itself discards NaN entries.
        return (corr.where(upper)
                     .stack()
                     .dropna()
                     .sort_values(ascending=False))
    
    def _get_max_corr_pair(self, corr): 
        '''
            Return the (col1, col2) labels of the most correlated pair in `corr`.

            Raises:
                ValueError: if `corr` contains no pair of distinct assets
        '''
        sol = self._get_corr_pair_desc(corr)
        if sol.empty:
            raise ValueError("No asset pairs available in correlation matrix")
        # `sol` is sorted descending, so its first index entry is the best
        # pair; this avoids the removed MultiIndex.labels attribute.
        col1, col2 = sol.index[0]
        return (col1, col2)
    
    def max_returns_corr_pair(self, method='kendall'):
        '''
            Returns a tuple pair of assets with highest returns correlation. 
        '''
        # Use this instance's returns, not a notebook-level global.
        corr = self.returns_corr(method=method)
        return self._get_max_corr_pair(corr)
    
    def returns_corr(self, method='kendall'):
        '''Correlation matrix of this instance's returns for `method`.'''
        return self.returns.corr(method=method)
    
    def pairs_by_returns_correlation(self, method='kendall'):
        '''All distinct asset pairs ordered by descending returns correlation.'''
        return self._get_corr_pair_desc(self.returns_corr(method=method))

In [1]:
#------------------------------------------------------------------------------------------
# Copula Model
#------------------------------------------------------------------------------------------
from scipy.stats import norm
import statsmodels.api as sm
from collections import namedtuple

# Record pairing an estimator class ("model") with a dict of its parameter
# options ("parameters"); KDECopula stores these in its model registry.
_KDEMODEL = namedtuple("KDEMODEL", ["model", "parameters"])

# Parameter options for statsmodels' KDEMultivariate.
#   var_type: one character per variable; "cc" = two continuous variables.
#   bw: bandwidth selection methods accepted by KDEMultivariate
#       (presumably the candidates to compare - TODO confirm once fit() exists).
_STATSLIB_KDE_MULTIVARIATE_PARAMS = {
    "var_type" : "cc", 
    "bw" : ['normal_reference', 'cv_ml', 'cv_ls']
}

class KDECopula():
    '''
        Bivariate copula density model based on kernel density estimation.

        Holds an (n, 2) sample and a registry of available KDE estimators.
        Fitting is not implemented yet: `fit` only validates its input and
        `transform_fit` is a placeholder.
    '''
    def __init__(self, data):
        '''
            Args:
                data: array-like of shape (n, 2)
            Raises:
                ValueError: if data is not two-dimensional with two columns
        '''
        # Convert first, then validate and slice the converted array, so that
        # lists and DataFrames are accepted and bad shapes fail with a clear
        # ValueError rather than an indexing error.
        self._data = np.array(data)
        self._check_input(self._data)
        self._u = self._data[:,0]
        self._v = self._data[:,1]
        
        self._models = {
            "statslib_KDEMultivariate" : _KDEMODEL(model=sm.nonparametric.KDEMultivariate,
                                                   parameters=_STATSLIB_KDE_MULTIVARIATE_PARAMS)
        }
    
    @staticmethod
    def _check_input(data):
        '''
            Validate that `data` has shape (n, 2).

            Raises:
                ValueError: if data is not 2-D or its last axis is not 2
        '''
        dim = data.shape
        if len(dim) != 2: 
            raise ValueError("Input shape {}, expected (n,2)".format(dim))
        
        if dim[-1] != 2: 
            raise ValueError("Input is {} dimensional. Expected 2D.".format(dim[-1]))
    
    @staticmethod
    def transform(data:"np.array"):
        '''
            Transform from [0,1]^2 domain to R^2 domain using Gaussian method.

            Applies the standard normal quantile function element-wise and
            drops every row that contains an infinite value (inputs of
            exactly 0 or 1 map to -inf / +inf).

            Args:
                data: 2-D array of values in [0, 1]
            Returns:
                2-D array of the transformed rows that are free of infinities
        '''
        transformed = norm.ppf(data)
        # Filter the transformed values, not the raw input.
        keep = ~np.isinf(transformed).any(axis=1)
        return transformed[keep]
    
    def fit(self, data:"np.array"): 
        '''
            Fit a KDE model to the input data.

            TODO: fitting is not implemented; currently this only validates
            the input shape and returns None.
        '''
        self._check_input(data)
    
    def transform_fit(self):
        '''Placeholder for transform-then-fit; not implemented yet.'''
        pass 
        
        
    
    
    

In [81]:
#------------------------------------------------------------------------------------------
# model comparison criteria
#------------------------------------------------------------------------------------------
def AIC(log_lik, n_params):
    '''
        Akaike Information Criterion: AIC = 2k - 2*ln(L)
        where 
            k = number of parameters
            L = likelihood
        Args: 
            log_lik: total log likelihood over all samples
            n_params: number of parameters
    '''
    complexity_penalty = 2 * n_params
    goodness_of_fit = 2 * log_lik
    return complexity_penalty - goodness_of_fit

def BIC(log_lik, n_samples, n_params):
    '''
        Bayesian Information Criterion: BIC = k*ln(N) - 2*ln(L)
        where 
            k = number of parameters
            N = number of samples 
            L = likelihood
        Args: 
            log_lik: total log likelihood over all samples
            n_samples: number of samples
            n_params: number of parameters
    '''
    # k in the formula above is the n_params argument.
    return n_params * np.log(n_samples) - 2 * log_lik

In [82]:
#------------------------------------------------------------------------------------------
# Util functions
#------------------------------------------------------------------------------------------
def np_remove_nan(x): 
    '''Return the rows of the 2-D input `x` that contain no NaN.'''
    # A row is kept when every one of its entries is not-NaN.
    row_is_clean = (~np.isnan(x)).all(axis=1)
    return x[row_is_clean]

def np_remove_inf(x):
    '''Return the rows of the 2-D input `x` that contain no +/-inf.'''
    # A row is kept when none of its entries is infinite.
    row_is_bounded = (~np.isinf(x)).all(axis=1)
    return x[row_is_bounded]

In [83]:
# QuantBook Analysis Tool 
# For more information see [https://www.quantconnect.com/docs/research/overview]
qb = QuantBook()

# Research window: start date plus the length of the lookback
START_YEAR, START_MONTH, START_DAY = 2005, 1, 1
N_YEARS, N_MONTHS, N_DAYS = 10, 0, 0
RESOLUTION = Resolution.Daily
PARAM = Parameter(resolution=RESOLUTION,
                  start_year=START_YEAR,
                  start_month=START_MONTH,
                  start_day=START_DAY,
                  n_years=N_YEARS,
                  n_months=N_MONTHS,
                  n_days=N_DAYS)

# Candidate ticker groups; only the S&P 500 group is used below
sp500_tickers = ["SPY","XLK", "VGT", "IYW", "IGV"]
djia_tickers = ["DIA", "IYY", "DDM"]
tickers = sp500_tickers

# Register every selected ticker with the QuantBook
for ticker in tickers: 
    qb.AddEquity(ticker)
    
# Request historical prices over the research window
history = qb.History(tickers, PARAM.Start, PARAM.End, PARAM.Resolution)

# Split the history into one DataFrame per price field
Open, Close, High, Low = (
    history[field].unstack(level=0)
    for field in ("open", "close", "high", "low")
)

# Data object built from the close prices, which are used by default
close = Data(Close)

In [84]:
#---------------------------------------------------------------------------------------------------
# Data Preprocessing
#---------------------------------------------------------------------------------------------------
