# Data Analysis

## Setup

### Imports

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

from datetime import datetime, timedelta
from pathlib import Path

import IPython.display as IPydisplay
import matplotlib.pyplot as plt
import mplfinance as mpf
import numpy as np
import pandas as pd

from qbm.metrics import compute_annualized_volatility, compute_correlation_coefficients
from qbm.plotting import (
    mpf_style,
    plot_correlation_coefficients,
    plot_histogram_grid,
    plot_qq_grid,
    plot_violin,
)
from qbm.utils import (
    binarize_df,
    get_binarization_params,
    prepare_training_data,
    unbinarize_df,
    load_log_returns,
    load_raw_data,
    compute_df_stats,
    compute_stats_over_dfs,
    get_project_dir,
    get_rng,
    load_artifact,
    lr_exp_decay,
    save_artifact,
)
from qbm.sampling import generate_rbm_sample, generate_rbm_samples_df

time: 717 ms (started: 2021-11-15 11:57:40 +01:00)


### Data Import

In [2]:
project_dir = get_project_dir()
data_dir = project_dir / "data"
if not (data_dir / "train").exists():
    (data_dir / "train").mkdir()
artifacts_dir = project_dir / "artifacts"
plots_dir = project_dir / "results/plots/data_analysis"

data_source = "dukascopy"
start_date = datetime(1999, 1, 1)
end_date = datetime(2020, 12, 31)
currency_pairs = ["EURUSD", "GBPUSD", "USDCAD", "USDJPY"]

dfs, log_returns_raw = load_raw_data(
    data_dir,
    data_source,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2021, 12, 31),
)
log_returns_raw.to_csv(data_dir / f"train/{data_source}.csv")

# filter dates
for pair, df in dfs.items():
    dfs[pair] = df.loc[(df.index >= start_date) & (df.index <= end_date)]
log_returns_raw = log_returns_raw.loc[
    (log_returns_raw.index >= start_date) & (log_returns_raw.index <= end_date)
]

print("EURUSD")
display(dfs["EURUSD"])
display(compute_df_stats(dfs["EURUSD"]))

print("Log Returns")
display(log_returns_raw)
display(compute_df_stats((log_returns_raw)))

EURUSD


Unnamed: 0_level_0,open,high,low,close,volume,return,log_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1999-01-04,1.17400,1.19060,1.16950,1.18005,19664.0000,0.005153,0.005140
1999-01-06,1.17620,1.18130,1.15500,1.16815,31654.0000,-0.006844,-0.006868
1999-01-07,1.16250,1.17670,1.16050,1.16860,62349.0000,0.005247,0.005234
1999-01-08,1.17110,1.17330,1.15320,1.16325,43164.0000,-0.006703,-0.006726
1999-01-11,1.15650,1.16070,1.14850,1.15460,45083.0000,-0.001643,-0.001644
...,...,...,...,...,...,...,...
2020-12-23,1.21871,1.22210,1.21538,1.21939,186474.2969,0.000558,0.000558
2020-12-24,1.21940,1.22156,1.21773,1.21820,150064.3594,-0.000984,-0.000985
2020-12-29,1.22196,1.22750,1.22193,1.22519,152109.0313,0.002643,0.002640
2020-12-30,1.22520,1.23101,1.22511,1.22984,157635.7188,0.003787,0.003780


Unnamed: 0,open,high,low,close,volume,return,log_return
min,0.8267,0.8324,0.8229,0.8264,54.0,-0.026897,-0.027265
max,1.5972,1.60389,1.58643,1.59721,2697619.0,0.034954,0.034357
mean,1.200355,1.205907,1.194972,1.20042,480081.5,7.8e-05,5.9e-05
median,1.20316,1.20829,1.19808,1.20328,220040.6,7.4e-05,7.4e-05
std,0.1626,0.163168,0.161807,0.162564,529659.5,0.00611,0.006108


Log Returns


Unnamed: 0_level_0,EURUSD,GBPUSD,USDCAD,USDJPY
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999-01-04,0.005140,-0.000904,-0.003269,-0.013065
1999-01-06,-0.006868,-0.009712,-0.000992,0.013452
1999-01-07,0.005234,-0.003025,0.001191,-0.014041
1999-01-08,-0.006726,-0.000244,-0.001124,-0.001622
1999-01-11,-0.001644,0.000429,-0.003977,-0.018750
...,...,...,...,...
2020-12-23,0.000558,0.008271,-0.003847,0.000174
2020-12-24,-0.000985,0.001730,0.001027,0.001014
2020-12-29,0.002640,0.002433,-0.001613,-0.001351
2020-12-30,0.003780,0.009725,-0.005568,-0.004122


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
min,-0.027265,-0.063726,-0.037117,-0.034561
max,0.034357,0.035111,0.034804,0.052261
mean,5.9e-05,-4e-06,-5.8e-05,-6.5e-05
median,7.4e-05,0.0,-5.2e-05,-3.8e-05
std,0.006108,0.005795,0.005381,0.006275


time: 1.77 s (started: 2021-11-15 11:57:40 +01:00)


## Raw Data Analysis Plots

### Candlestick OHLC Plots

In [3]:
for pair in currency_pairs:
    df = dfs[pair]
    mpf.plot(
        df[df.index > df.index.max() - timedelta(days=365)],
        title=pair,
        type="candlestick",
        style=mpf_style,
        figsize=(15, 6),
        volume=True,
        tight_layout=True,
        savefig=dict(fname=plots_dir / f"candlestick_{pair}.png", dpi=300),
    )

time: 3.6 s (started: 2021-11-15 11:57:42 +01:00)


In [4]:
IPydisplay.Image(plots_dir / f"candlestick_EURUSD.png")

<IPython.core.display.Image object>

time: 3.46 ms (started: 2021-11-15 11:57:46 +01:00)


In [5]:
IPydisplay.Image(plots_dir / f"candlestick_GBPUSD.png")

<IPython.core.display.Image object>

time: 12 ms (started: 2021-11-15 11:57:46 +01:00)


In [6]:
IPydisplay.Image(plots_dir / f"candlestick_USDCAD.png")

<IPython.core.display.Image object>

time: 19.6 ms (started: 2021-11-15 11:57:46 +01:00)


In [7]:
IPydisplay.Image(plots_dir / f"candlestick_USDJPY.png")

<IPython.core.display.Image object>

time: 11.1 ms (started: 2021-11-15 11:57:46 +01:00)


### Histograms

In [8]:
params = {
    "xlims": (-0.03, 0.03),
    "ylims": (0, 100),
    "xticks": np.linspace(-0.03, 0.03, 7),
    "yticks": np.linspace(0, 100, 6),
}
fig, ax = plot_histogram_grid(log_returns_raw, params, bins=80, density=True)
plt.savefig(plots_dir / "histograms.png")

<Figure size 3000x1800 with 4 Axes>

time: 1.6 s (started: 2021-11-15 11:57:46 +01:00)


### Violin/Box Plot

In [9]:
params = {"ylims": (-0.0667, 0.0667), "yticks": np.linspace(-0.06, 0.06, 7)}
fig, ax = plot_violin(log_returns_raw, params)
plt.savefig(plots_dir / "violin.png")

<Figure size 2700x1800 with 1 Axes>

time: 740 ms (started: 2021-11-15 11:57:47 +01:00)


## Data Curation

In [10]:
# find and remove outliers above threshold
log_returns = load_log_returns(
    "dukascopy", start_date="1999-01-01", end_date="2019-12-31", outlier_threshold=8
)

print("\nRaw Data")
display(compute_df_stats(log_returns_raw))
print("Curated")
display(compute_df_stats(log_returns))


Raw Data


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
min,-0.027265,-0.063726,-0.037117,-0.034561
max,0.034357,0.035111,0.034804,0.052261
mean,5.9e-05,-4e-06,-5.8e-05,-6.5e-05
median,7.4e-05,0.0,-5.2e-05,-3.8e-05
std,0.006108,0.005795,0.005381,0.006275


Curated


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
min,-0.027265,-0.038992,-0.037117,-0.034561
max,0.034357,0.028512,0.034804,0.037652
mean,5e-05,-3e-06,-4.9e-05,-6.8e-05
median,6.7e-05,-2e-05,-4.7e-05,-1.8e-05
std,0.006149,0.005639,0.005385,0.006268


time: 28.3 ms (started: 2021-11-15 11:57:48 +01:00)


In [11]:
params = {"ylims": (-0.0667, 0.0667), "yticks": np.linspace(-0.06, 0.06, 7)}
fig, ax = plot_violin(log_returns, params)
plt.savefig(plots_dir / "violin_train.png")

<Figure size 2700x1800 with 1 Axes>

time: 813 ms (started: 2021-11-15 11:57:48 +01:00)


## Binarization of Data

### Binarization of Data

In [12]:
n_bits = 16
binarization_params = {}
for column in log_returns.columns:
    binarization_params[column] = {
        "n_bits": n_bits,
        "x_min": log_returns[column].min(),
        "x_max": log_returns[column].max(),
    }

log_returns_binarized = binarize_df(log_returns, binarization_params)
log_returns_unbinarized = unbinarize_df(log_returns_binarized, binarization_params)
print(f"Discretization Errors for n_bits = {n_bits}")
display((log_returns - log_returns_unbinarized).abs().agg(["max", "mean"]))

Discretization Errors for n_bits = 16


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
max,4.699172e-07,5.149917e-07,5.486262e-07,5.508586e-07
mean,2.316821e-07,2.548981e-07,2.74841e-07,2.774857e-07


time: 90.1 ms (started: 2021-11-15 11:57:49 +01:00)


## Correlation Coefficients

In [13]:
combinations = (
    ("EURUSD", "GBPUSD"),
    ("EURUSD", "USDJPY"),
    ("EURUSD", "USDCAD"),
    ("GBPUSD", "USDJPY"),
    ("GBPUSD", "USDCAD"),
    ("USDJPY", "USDCAD"),
)
correlation_coefficients = compute_correlation_coefficients(log_returns, combinations)
correlation_coefficients

Unnamed: 0,Pearson,Spearman,Kendall
EURUSD/GBPUSD,0.616834,0.617062,0.444424
EURUSD/USDJPY,-0.265169,-0.303933,-0.212593
EURUSD/USDCAD,-0.441786,-0.411412,-0.28661
GBPUSD/USDJPY,-0.150245,-0.208661,-0.145539
GBPUSD/USDCAD,-0.421101,-0.37331,-0.258017
USDJPY/USDCAD,0.009134,0.061499,0.04288


time: 23.8 ms (started: 2021-11-15 11:57:49 +01:00)


## Probability Density Estimation

### KDE

In [14]:
from sklearn.neighbors import KernelDensity

kde = KernelDensity(kernel="gaussian", bandwidth=0.0001).fit(log_returns.to_numpy())
samples = pd.DataFrame(kde.sample(log_returns.shape[0]), columns=log_returns.columns)
samples

Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
0,0.007472,0.003541,0.000296,0.003696
1,0.015639,0.005325,-0.003944,-0.008312
2,0.003876,-0.001721,0.004473,0.001946
3,0.011120,0.003819,-0.000757,-0.001916
4,0.003980,0.000570,-0.002885,0.004076
...,...,...,...,...
5159,-0.011640,0.012370,0.009715,0.004072
5160,0.001809,0.004267,-0.001626,-0.003865
5161,0.010613,0.012972,-0.005126,0.001136
5162,0.004318,-0.000599,0.015656,-0.002522


time: 59.1 ms (started: 2021-11-15 11:57:49 +01:00)


In [47]:
params = {
    "title": "test",
    "xlims": (-0.045, 0.045),
    "ylims": (-0.045, 0.045),
    "xticks": np.linspace(-0.04, 0.04, 9),
    "yticks": np.linspace(-0.04, 0.04, 9),
}
fig, axs = plot_qq_grid(log_returns, samples, params)
plt.savefig(plots_dir / "kde_qq.png")

correlation_coefficients_data = compute_correlation_coefficients(
    log_returns, combinations
)
correlation_coefficients_sample = compute_stats_over_dfs(
    [
        compute_correlation_coefficients(
            pd.DataFrame(kde.sample(log_returns.shape[0]), columns=log_returns.columns),
            combinations,
        )
        for i in range(100)
    ]
)

fig, axs = plot_correlation_coefficients(
    correlation_coefficients_data, correlation_coefficients_sample
)
plt.savefig(plots_dir / "kde_correlation_coefficients.png")

<Figure size 2700x2700 with 4 Axes>

<Figure size 3000x1800 with 6 Axes>

time: 6.45 s (started: 2021-11-15 12:28:57 +01:00)


In [16]:
print("Annualized Volatility")
pd.DataFrame(
    {
        "data": compute_annualized_volatility(log_returns),
        "sample": compute_annualized_volatility(samples),
    }
)

Annualized Volatility


Unnamed: 0,data,sample
EURUSD,0.097614,0.096869
GBPUSD,0.089519,0.08995
USDCAD,0.085481,0.084287
USDJPY,0.099496,0.098679


time: 6.62 ms (started: 2021-11-15 11:57:55 +01:00)


## Volatiliy

In [17]:
from qbm.metrics import compute_rolling_volatility
from qbm.utils import binarize_volatility
from qbm.plotting import plot_volatility_grid
from datetime import date

volatility = compute_rolling_volatility(log_returns, timedelta(days=90))
volatility_binarized = binarize_volatility(volatility)
train_data = pd.merge(
    log_returns, volatility_binarized, left_index=True, right_index=True
)
display(train_data)

params = {
    "yscale": "log",
    "xlims": (
        volatility.index.min() - timedelta(days=90),
        volatility.index.max() + timedelta(days=90),
    ),
    "ylims": (3e-2, 3e-1),
    "xticks": [date(year, 1, 1) for year in range(1996, 2025, 4)],
    "label": "3M Rolling",
}
fig, ax = plot_volatility_grid(volatility, params)
plt.savefig(plots_dir / "rolling_volatility.png")

Unnamed: 0_level_0,EURUSD,GBPUSD,USDCAD,USDJPY,EURUSD_volatility_binary,GBPUSD_volatility_binary,USDCAD_volatility_binary,USDJPY_volatility_binary
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1999-04-06,0.006283,-0.005453,0.001598,-0.008912,0,0,0,1
1999-04-07,-0.003327,0.003012,-0.003334,0.006361,0,0,0,1
1999-04-08,0.003571,-0.000311,0.000600,-0.002059,0,0,0,1
1999-04-12,0.002124,0.000124,-0.005615,-0.003732,0,0,0,1
1999-04-13,-0.001853,-0.000247,-0.000939,-0.006247,0,0,0,1
...,...,...,...,...,...,...,...,...
2019-12-23,0.001046,-0.004786,0.000145,-0.000987,0,1,0,0
2019-12-24,-0.000406,-0.001013,0.000738,-0.000073,0,1,0,0
2019-12-27,0.006286,0.005752,-0.002467,-0.000968,0,1,0,0
2019-12-30,0.001796,0.001641,-0.001064,-0.005167,0,1,0,0


<Figure size 3000x1800 with 4 Axes>

time: 5.06 s (started: 2021-11-15 11:57:55 +01:00)


In [62]:
def filter_df_on_values(df, column_values, drop_filter_columns=True):
    """
    Return a copy of the dataframe filtered conditionally on provided
    column values.
    
    :param df: Dataframe to filter.
    :param column_values: Dictionary where the keys are column names, and the
        values are values on which to filter the dataframe.
    :param drop_filter_columns: If True returns a copy of the dataframe with
        the filtered columns dropped.
        
    :returns: A dataframe filtered conditionally on the provided column values.
    """
    df = df.copy()
    for column, value in column_values.items():
        df = df.loc[df[column] == value]

    if drop_filter_columns:
        df.drop(column_values.keys(), axis=1, inplace=True)

    return df


low_volatility_column_values = {
    f"{pair}_volatility_binary": 1 for pair in currency_pairs
}

compute_annualized_volatility(
    filter_df_on_values(train_data, low_volatility_column_values)
)

EURUSD    0.131185
GBPUSD    0.126346
USDCAD    0.128737
USDJPY    0.128116
dtype: float64

time: 4.39 ms (started: 2021-11-15 12:41:52 +01:00)


In [19]:
high_volatility = train_data[
    (train_data["EURUSD_volatility_binary"] == 1) & 
    (train_data["GBPUSD_volatility_binary"] == 1) & 
    (train_data["USDCAD_volatility_binary"] == 1) & 
    (train_data["USDJPY_volatility_binary"] == 1)
]
compute_annualized_volatility(high_volatility)

EURUSD                      0.131185
GBPUSD                      0.126346
USDCAD                      0.128737
USDJPY                      0.128116
EURUSD_volatility_binary    0.000000
GBPUSD_volatility_binary    0.000000
USDCAD_volatility_binary    0.000000
USDJPY_volatility_binary    0.000000
dtype: float64

time: 11.7 ms (started: 2021-11-15 11:58:00 +01:00)


In [89]:
from qbm.plotting import plot_tail_concentrations
dfs = {
    "Data": log_returns,
}
plot_tail_concentrations(dfs, combinations)

(<Figure size 3300x3300 with 6 Axes>,
 array([[<AxesSubplot:title={'center':'EURUSD/GBPUSD'}>,
         <AxesSubplot:title={'center':'EURUSD/USDJPY'}>],
        [<AxesSubplot:title={'center':'EURUSD/USDCAD'}>,
         <AxesSubplot:title={'center':'GBPUSD/USDJPY'}>],
        [<AxesSubplot:title={'center':'GBPUSD/USDCAD'}>,
         <AxesSubplot:title={'center':'USDJPY/USDCAD'}>]], dtype=object))

<Figure size 3300x3300 with 6 Axes>

time: 5.87 s (started: 2021-11-15 19:02:07 +01:00)
