## Startup

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

import os
import pickle

In [None]:
import pandas as pd
import numpy as np
import pandas_datareader as web

import scipy.stats as scs
from scipy.stats import norm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import acf, pacf

import datetime as dt
import seaborn as sb

In [2]:
# Fix NumPy's global random seed so any stochastic step gives the same result on every run.
np.random.seed(42)

In [3]:
from scripts.params import get_params

# Project parameters; the data-retrieval cell below reads params["tablename"] from this object.
params = get_params()

In [18]:
# Folder layout used by the notebook; every route is relative to the working directory.
dataroute = os.path.join("..", "data")
# A plain ".." parent reference, like the sibling routes ("..." is not a valid parent reference).
processedroute = os.path.join("..", "processed")
resultsroute = os.path.join("..", "results")
# Destination of the descriptive figures saved by the plotting cells.
descriptivegraphsroute = os.path.join(resultsroute, "graphs", "descriptive")

## Data Retrieval

In [5]:
# Build the path of the pickled dataframe for the table selected in params
# (presumably the training split, judging by the "finaldf_train_" prefix - confirm).
name = f'finaldf_train_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
# NOTE(review): pickle.load can execute arbitrary code; only open files produced by this project.
with open(filename, "rb") as handle:
    df = pickle.load(handle)

In [6]:
# Display the first rows as a quick check of the loaded columns.
df.head()

Unnamed: 0,^MERV_rets,^MERV_log_rets,^MERV_gk_vol,GGAL.BA_rets,GGAL.BA_log_rets,GGAL.BA_gk_vol,GGAL_rets,GGAL_log_rets,GGAL_gk_vol,YPFD.BA_rets,...,BBAR.BA_gk_vol,BBAR_rets,BBAR_log_rets,BBAR_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,USD_^MERV_rets,USD_^MERV_log_rets,USD_^MERV_gk_vol
2013-01-03,0.007552,0.007524,0.000129,0.010616,0.01056,0.000677,-0.012748,-0.01283,0.001228,-0.006862,...,0.000169,-0.005725,-0.005742,0.00096,0.00883,0.008792,1.4e-05,0.001247,0.001246,0.000129
2013-01-04,0.007092,0.007067,0.000158,-0.006303,-0.006323,0.000208,-0.010043,-0.010094,0.000554,0.004936,...,0.000406,-0.019194,-0.019381,0.000635,0.018043,0.017883,0.000133,-0.005727,-0.005744,0.000158
2013-01-07,-0.001035,-0.001035,2.2e-05,0.002114,0.002112,6.3e-05,-0.014493,-0.014599,0.000517,0.010805,...,0.000492,0.015656,0.015534,0.000511,-0.002489,-0.002492,4.8e-05,-0.009769,-0.009817,2.2e-05
2013-01-08,0.008285,0.008251,8.2e-05,-0.008439,-0.008475,0.000153,-0.016176,-0.016309,0.001085,0.049563,...,0.000438,-0.015414,-0.015534,0.000642,0.015356,0.015239,6.4e-05,-0.001117,-0.001118,8.2e-05
2013-01-09,0.017826,0.017669,0.000273,0.0,0.0,0.0,0.011958,0.011887,0.005238,0.0,...,0.0,-0.003914,-0.003921,0.000147,-0.008671,-0.008709,0.001065,0.017245,0.017098,0.000273


## Descriptive graphs

In [8]:
# Select seaborn's "darkgrid" style and "deep" colour palette for the figures drawn afterwards.
sb.set_style(style='darkgrid')
sb.set_palette(sb.color_palette(palette='deep'))

In [None]:
# Save one line plot per "*log_rets" column under <descriptive graphs>/log_rets.
log_rets_dir = os.path.join(descriptivegraphsroute, "log_rets")
os.makedirs(log_rets_dir, exist_ok=True)  # savefig raises if the target folder does not exist
for column in df.columns:
    if column.endswith("log_rets"):
        fig = df[column].plot(title=column).get_figure()
        fig.savefig(os.path.join(log_rets_dir, f"{column}.png"))
        plt.close(fig)  # release the figure so the next column starts on a fresh one

In [23]:
# Save one line plot per "*gk_vol" column under <descriptive graphs>/gk_vol.
gk_vol_dir = os.path.join(descriptivegraphsroute, "gk_vol")
os.makedirs(gk_vol_dir, exist_ok=True)  # savefig raises if the target folder does not exist
for column in df.columns:
    if column.endswith("gk_vol"):
        fig = df[column].plot(title=column).get_figure()
        fig.savefig(os.path.join(gk_vol_dir, f"{column}.png"))
        plt.close(fig)  # release the figure so the next column starts on a fresh one

In [None]:
def autocorrelograms(df, names, lags):
    """
    Save a three-panel autocorrelogram figure for every requested stock.

    For each entry of ``names`` the ACF of the 'Log Returns', 'Abs Returns'
    and 'Sqr Returns' columns of ``df[name]`` is drawn on three stacked axes
    that share the x axis. The figure is written to
    ``route_graphs + f'{name}_autocorrs_returns.png'`` and then closed.

    ``route_graphs`` is a module-level name that is not defined in the
    visible part of the notebook; it must exist before calling this function.

    :param df: dictionary holding one dataframe per stock name
    :param names: list with the n stock names to process
    :param lags: value forwarded to ``plot_acf`` as ``lags``
    :return: None; n autocorrelation figures are saved under ``route_graphs``
    """
    for name in names:
        fig = plt.figure(figsize=[13.6, 10.2])
        ax1 = fig.add_subplot(3, 1, 1)
        ax2 = fig.add_subplot(3, 1, 2, sharex=ax1)
        ax3 = fig.add_subplot(3, 1, 3, sharex=ax1)

        # One panel per transformation of the returns, top to bottom.
        for kind, ax in (('Log', ax1), ('Abs', ax2), ('Sqr', ax3)):
            plot_acf(df[name][f'{kind} Returns'],
                     lags=lags,                 # how many lags to autocorrelate
                     zero=False,                # whether lag zero is included
                     alpha=0.05,                # level of the shaded confidence band
                     use_vlines=False,          # vertical lines joining each point to the x axis
                     ax=ax)                     # position in the figure
            ax.grid(True)
            ax.set_xlabel('Lag')
            ax.set_ylabel('Autocorrelation')
            ax.set_title(f'Autocorrelation of {kind} returns for {name}')

        fig.savefig(route_graphs + f'{name}_autocorrs_returns.png')
        # Close the figure once saved so a long list of names does not pile up open figures.
        plt.close(fig)

# autocorrelograms(dfs, stocks, 252)

def ac_test(df, names, lags):
    """
    Unused in the final analysis.

    For every return kind in the module-level ``return_lists``, compute the
    autocorrelation function of ``df[name][rets]`` for each name and write two
    CSV tables under ``route_tables``: the first five autocorrelations
    (rounded to 3 decimals) and the first five Ljung-Box p-values (rounded to
    3 decimals), one row per name.

    NOTE(review): the autocorrelation array presumably starts at lag 0 while
    the p-values start at lag 1, so the columns of both tables may refer to
    different lags - confirm against the statsmodels ``acf`` documentation.

    :param df: dictionary holding one dataframe per stock name
    :param names: list of stock names; used as the index of both tables
    :param lags: value forwarded to ``acf`` as ``nlags``
    :return: tuple (autocorrelation table, p-value table) of the last return kind processed
    """
    for rets in return_lists:
        # acf(..., qstat=True, alpha=...) yields (acf, confint, qstat, pvalues) per name.
        outcomes = [acf(df[name][rets], nlags=lags, qstat=True, alpha=0.05) for name in names]

        ac_df = pd.DataFrame([np.round(outcome[0][:5], 3) for outcome in outcomes])
        ac_df.index = names
        ac_df.to_csv(route_tables + f'{rets}_ac_table.csv')

        ac_p_df = pd.DataFrame(np.round([outcome[3][:5] for outcome in outcomes], 3))
        ac_p_df.index = names
        ac_p_df.to_csv(route_tables + f'{rets}_ac_pval_table.csv')

    return ac_df, ac_p_df

# ac_test(dfs, stocks, 252)

def pac_test(df, names, lags):
    """
    Unused in the final analysis.

    For every return kind in the module-level ``return_lists``, compute the
    partial autocorrelation of ``df[name][rets]`` for each name, keep the
    first five values rounded to 3 decimals and write them, one row per name,
    to ``route_tables + f'{rets}_pac_table.csv'``.

    :param df: dictionary holding one dataframe per stock name
    :param names: list of stock names; used as the index of the table
    :param lags: value forwarded to ``pacf`` as ``nlags``
    :return: the table built for the last return kind processed
    """
    for rets in return_lists:
        rows = [np.round(pacf(df[name][rets], nlags=lags)[:5], 3) for name in names]

        pac_df = pd.DataFrame(rows)
        pac_df.index = names
        pac_df.to_csv(route_tables + f'{rets}_pac_table.csv')

    return pac_df

# pac_test(dfs, stocks, 252)


def partial_autocorrelograms(df, names, lags):
    """
    Save a three-panel partial-autocorrelogram figure for every requested stock.

    For each entry of ``names`` the PACF of the 'Log Returns', 'Abs Returns'
    and 'Sqr Returns' columns of ``df[name]`` is drawn on three stacked axes
    that share the x axis. The figure is written to
    ``route_graphs + f'{name}_partial_autocorrs_returns.png'`` and then closed.

    ``route_graphs`` is a module-level name that is not defined in the
    visible part of the notebook; it must exist before calling this function.

    :param df: dictionary holding one dataframe per stock name
    :param names: list with the n stock names to process
    :param lags: value forwarded to ``plot_pacf`` as ``lags``
    :return: None; n partial-autocorrelation figures are saved under ``route_graphs``
    """
    for name in names:
        fig = plt.figure(figsize=[13.6, 10.2])
        ax1 = fig.add_subplot(3, 1, 1)
        ax2 = fig.add_subplot(3, 1, 2, sharex=ax1)
        ax3 = fig.add_subplot(3, 1, 3, sharex=ax1)

        # One panel per transformation of the returns, top to bottom.
        for kind, ax in (('Log', ax1), ('Abs', ax2), ('Sqr', ax3)):
            # NOTE(review): recent statsmodels releases appear to have renamed
            # 'ywunbiased' to 'ywadjusted' - confirm against the installed version.
            plot_pacf(df[name][f'{kind} Returns'],
                      method='ywunbiased',          # Yule-Walker with bias correction of the autocovariances
                      lags=lags,                    # how many lags to autocorrelate
                      zero=False,                   # whether lag zero is included
                      alpha=0.05,                   # level of the shaded confidence band
                      use_vlines=False,             # vertical lines joining each point to the x axis
                      ax=ax)                        # position in the figure
            ax.grid(True)
            ax.set_xlabel('Lag')
            ax.set_ylabel('Partial Autocorrelation')
            ax.set_title(f'Partial Autocorrelation of {kind} returns for {name}')

        fig.savefig(route_graphs + f'{name}_partial_autocorrs_returns.png')
        # Close the figure once saved so a long list of names does not pile up open figures.
        plt.close(fig)

# partial_autocorrelograms(dfs,stocks,252)

def histograms(df, names):
    """
    Save a three-panel histogram figure for every requested stock.

    For each entry of ``names`` the 'Log Returns', 'Abs Returns' and
    'Sqr Returns' columns of ``df[name]`` (missing values replaced by 0) are
    drawn with ``sb.distplot`` on three stacked axes. The first panel also
    gets an explicit KDE and a fitted normal curve. The figure is written to
    ``route_graphs + f'{name}_histogram_returns.png'`` and then closed.

    ``route_graphs`` is a module-level name that is not defined in the
    visible part of the notebook; it must exist before calling this function.

    :param df: dictionary holding one dataframe per stock name
    :param names: list of stock names to process
    :return: None; one figure per name is saved under ``route_graphs``
    """
    for name in names:
        fig = plt.figure(figsize=[13.6, 10.2])
        ax1 = fig.add_subplot(3, 1, 1)
        ax2 = fig.add_subplot(3, 1, 2)
        ax3 = fig.add_subplot(3, 1, 3)

        # Same bin count for the three panels, derived from the number of rows.
        n_bins = int(np.ceil(np.log2(len(df[name])) + 15))

        # Only the log-return panel is compared against a theoretical curve.
        # The half-normal / chi-squared fits once tried for the other two
        # panels were deliberately dropped by the author.
        log_extras = {
            "kde": True,
            "kde_kws": {"color": "k", "lw": 2, "label": "KDE"},
            "fit": norm,
            "fit_kws": {"color": "r", "lw": 3, "label": "Normal Teorica"},
        }
        panels = (
            ('Log Returns', 'Log returns', ax1, log_extras),
            ('Abs Returns', 'Abs Returns', ax2, {}),
            ('Sqr Returns', 'Sqr Returns', ax3, {}),
        )

        for column, title_label, ax, extras in panels:
            sb.distplot(df[name][column].fillna(0),
                        ax=ax,
                        hist=True,
                        bins=n_bins,
                        label='Datos observados',
                        **extras)
            # TODO: the relative frequencies are nowhere near the expected values
            # (presumably because distplot draws a density, not counts, when a
            # KDE or fit is shown - verify).
            # Label through the axes object: the pyplot helpers act on the
            # current axes, which is the last subplot created, not this panel.
            ax.grid(True)
            ax.set_xlabel(column)
            ax.set_ylabel('Frequency')
            ax.set_title(f'Histogram for {title_label} frequency for {name}')

        fig.savefig(route_graphs + f'{name}_histogram_returns.png')
        # Close the figure once saved so a long list of names does not pile up open figures.
        plt.close(fig)

# histograms(dfs, stocks)
def histogram_normal(df, names):
    """
    Save a histogram of simple returns with a fitted normal curve per stock.

    For each entry of ``names`` the 'Returns' column of ``df[name]`` (missing
    values replaced by 0) is drawn with ``sb.distplot`` together with a fitted
    normal density. The figure is written to
    ``route_graphs + f'{name}_normality_histogram_returns.png'`` and then closed.

    ``route_graphs`` is a module-level name that is not defined in the
    visible part of the notebook; it must exist before calling this function.

    :param df: dictionary holding one dataframe per stock name
    :param names: list of stock names to process
    :return: None; one figure per name is saved under ``route_graphs``
    """
    for name in names:
        fig = plt.figure(figsize=[13.6, 5.1])
        ax1 = fig.add_subplot(1, 1, 1)

        sb.distplot(df[name]['Returns'].fillna(0),
                    ax=ax1,
                    hist=True,
                    bins=int(np.ceil(np.log2(len(df[name])) + 15)),
                    label='Datos observados',
                    fit=norm,
                    fit_kws={"color": "r", "lw": 3, "label": "Normal Teorica"})

        ax1.grid(True)
        # The plotted column is 'Returns' (simple returns, as the title says), not log returns.
        ax1.set_xlabel('Returns')
        ax1.set_ylabel('Frequency')
        ax1.set_title(f'Histogram for simple return frequency for {name}')
        fig.savefig(route_graphs + f'{name}_normality_histogram_returns.png')
        # Close the figure once saved so a long list of names does not pile up open figures.
        plt.close(fig)

# histogram_normal(dfs, stocks)

def normality_test(arr):
    """
    Print skewness, kurtosis and normality diagnostics for a series of values.

    :param arr: object exposing ``fillna`` (e.g. a pandas Series); missing
        values are replaced by 0 before the statistics are computed
    :return: None; five lines are printed, every number rounded to 2 decimals
    """
    # NOTE(review): zero-filling keeps the sample length but adds artificial
    # observations at 0; confirm this is preferred over dropping the gaps.
    arr = arr.fillna(0)
    print('Skewness coefficient: ' + str(np.round(scs.skew(arr), 2)))
    # Print the p-values themselves (not their complement) so the numbers match
    # their labels and the p-values stored by normality_table.
    print('Skewness test p-value: ' + str(np.round(scs.skewtest(arr)[1], 2)))
    print('Kurtosis coefficient: ' + str(np.round(scs.kurtosis(arr), 2)))
    print('Kurtosis test p-value: ' + str(np.round(scs.kurtosistest(arr)[1], 2)))
    print('Normality test p-value: ' + str(np.round(scs.normaltest(arr)[1], 2)))

def normality_table(df, names, values):
    """
    Build and save a table of skewness, kurtosis and normality statistics.

    For every name, ``df[name][values]`` is summarised by its skewness and
    kurtosis coefficients and by the p-values of the scipy skewness, kurtosis
    and normality tests, all rounded to 3 decimals. The table (one row per
    name) is written to ``route_tables+f'norm_table_{values}.csv'``.

    ``route_tables`` is a module-level name that is not defined in the
    visible part of the notebook; it must exist before calling this function.

    :param df: dictionary holding one dataframe per stock name
    :param names: list of stock names; used as the index of the table
    :param values: name of the column to analyse in each dataframe
    :return: the resulting pandas DataFrame
    """
    collected = {
        'Skewness': [],
        'Skew p-value': [],
        'Kurtosis': [],
        'Kurtosis p-value': [],
        'Normality test p-value': [],
    }

    for name in names:
        sample = df[name][values]
        collected['Skewness'].append(np.round(scs.skew(sample), 3))
        collected['Skew p-value'].append(np.round(scs.skewtest(sample)[1], 3))
        collected['Kurtosis'].append(np.round(scs.kurtosis(sample), 3))
        collected['Kurtosis p-value'].append(np.round(scs.kurtosistest(sample)[1], 3))
        collected['Normality test p-value'].append(np.round(scs.normaltest(sample)[1], 3))

    table = pd.DataFrame(collected)
    table.index = names
    table.to_csv(route_tables+f'norm_table_{values}.csv')

    return table

# for rets in return_lists:
#     normality_table(dfs, stocks, rets)


def describe_and_test_norm(df, names, values):
    """
    Unused in the final analysis.

    For every name, print the ``describe()`` summary of ``df[name][values]``
    followed by the output of ``normality_test`` on the same column, then
    build the summary table with ``normality_table``.

    :param df: dictionary holding one dataframe per stock name
    :param names: list of stock names to report on
    :param values: name of the column to analyse in each dataframe
    :return: the table returned by ``normality_table(df, names, values)``
    """
    separator = '-' * 10
    for name in names:
        heading = values + ' tests for ' + name

        # Descriptive statistics first ...
        print(heading)
        print(df[name][values].describe())
        print('')

        # ... then the normality diagnostics under the same heading.
        print(heading)
        normality_test(df[name][values])
        print('')
        print(separator)

    return normality_table(df, names, values)
            

# describe_and_test_norm(dfs, stocks, 'Returns')


""

## Residual diagnostics
def residuals_test_plot(residual, lags, alpha_graph=0.05,**kwargs):
    """
    TODO: this function is not used anywhere yet.

    Plot the ACF and PACF of a model residual, save the figure and run a
    Ljung-Box test on the residual.

    :param residual: the model residual we want to test
    :param lags: the amount of lags we want to run the ACF/PACF on and the lags on the Ljung-Box test
    :param alpha_graph: the significance of the graph region
    :param name: (keyword only, default 'model') prefix of the saved file ``f'{name}_ACF_PACF'``
    :return: the Ljung-Box p-values (also printed); a ACF/PACF graph is saved as a side effect
    """
    # Use the pyplot factory: a bare plt.Figure is not registered with pyplot,
    # so plt.show() would not display it.
    fig = plt.figure(figsize=[10.2, 13.6])
    ax1 = fig.add_subplot(2, 1, 1)
    ax2 = fig.add_subplot(2, 1, 2)

    plot_acf(residual, lags=lags, zero=False, alpha=alpha_graph, ax=ax1)
    plot_pacf(residual, lags=lags, zero=False, alpha=alpha_graph, ax=ax2)

    # Save before showing: some backends close the figure once it has been shown.
    name = kwargs.get('name', 'model')
    fig.savefig(f'{name}_ACF_PACF')
    plt.show()

    test = acorr_ljungbox(residual, lags=lags)
    # Depending on the statsmodels version the result is either a DataFrame
    # with an 'lb_pvalue' column or a (statistic, p-value) tuple.
    if isinstance(test, pd.DataFrame):
        p_values = test['lb_pvalue']
    else:
        p_values = test[1]

    print('The p-values for the residuals are:')
    print(p_values)

    return p_values
