# Data Analysis

## Setup

### Imports

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

from datetime import date, datetime, timedelta
from pathlib import Path

import IPython.display as IPydisplay
import matplotlib.pyplot as plt
import mplfinance as mpf
import numpy as np
import pandas as pd

from qbm.metrics import compute_annualized_volatility, compute_correlation_coefficients
from qbm.plotting import (
    mpf_style,
    plot_correlation_coefficients,
    plot_histogram_grid,
    plot_qq_grid,
    plot_violin,
)
from qbm.utils import (
    binarize_df,
    get_binarization_params,
    PowerTransformer,
    prepare_training_data,
    unbinarize_df,
    load_log_returns,
    load_raw_data,
    compute_df_stats,
    compute_stats_over_dfs,
    get_project_dir,
    get_rng,
    load_artifact,
    lr_exp_decay,
    save_artifact,
)
from qbm.sampling import generate_rbm_sample, generate_rbm_samples_df

time: 1.44 s (started: 2021-11-18 02:29:09 +01:00)


### Data Import

In [2]:
# --- Paths -------------------------------------------------------------------
project_dir = get_project_dir()
data_dir = project_dir / "data"
artifacts_dir = project_dir / "artifacts"
plots_dir = project_dir / "results/plots/data_analysis"

# Create every directory this notebook writes to. `exist_ok` makes the cell
# safely re-runnable and `parents` covers a fresh checkout; without this the
# CSV export below and the later `savefig` calls fail on a missing directory.
for output_dir in (data_dir / "train", plots_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# --- Analysis configuration --------------------------------------------------
data_source = "dukascopy"
start_date = datetime(1999, 1, 1)
end_date = datetime(2019, 12, 31)
currency_pairs = ["EURUSD", "GBPUSD", "USDCAD", "USDJPY"]

# --- Load --------------------------------------------------------------------
# The raw data is requested for a wider window than the analysis period; the
# log returns are written to CSV *before* being restricted to the period.
dfs, log_returns_raw = load_raw_data(
    data_dir,
    data_source,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2021, 12, 31),
)
log_returns_raw.to_csv(data_dir / f"train/{data_source}.csv")


def _restrict_to_analysis_period(frame):
    """Return the rows of ``frame`` whose index lies in [start_date, end_date]."""
    in_period = (frame.index >= start_date) & (frame.index <= end_date)
    return frame.loc[in_period]


# filter dates
# NOTE: from here on `log_returns_raw` (and every frame in `dfs`) only covers
# the analysis period; later cells see the truncated data.
for pair, df in dfs.items():
    dfs[pair] = _restrict_to_analysis_period(df)
log_returns_raw = _restrict_to_analysis_period(log_returns_raw)

print("EURUSD")
display(dfs["EURUSD"])
display(compute_df_stats(dfs["EURUSD"]))

print("Log Returns")
display(log_returns_raw)
display(compute_df_stats(log_returns_raw))

EURUSD


Unnamed: 0_level_0,open,high,low,close,volume,return,log_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1999-01-04,1.17400,1.19060,1.16950,1.18005,19664.0000,0.005153,0.005140
1999-01-06,1.17620,1.18130,1.15500,1.16815,31654.0000,-0.006844,-0.006868
1999-01-07,1.16250,1.17670,1.16050,1.16860,62349.0000,0.005247,0.005234
1999-01-08,1.17110,1.17330,1.15320,1.16325,43164.0000,-0.006703,-0.006726
1999-01-11,1.15650,1.16070,1.14850,1.15460,45083.0000,-0.001643,-0.001644
...,...,...,...,...,...,...,...
2019-12-23,1.10801,1.10958,1.10699,1.10917,108289.4297,0.001047,0.001046
2019-12-24,1.10918,1.10938,1.10690,1.10873,80970.6797,-0.000406,-0.000406
2019-12-27,1.11013,1.11883,1.10987,1.11713,124575.9063,0.006306,0.006286
2019-12-30,1.11812,1.12207,1.11806,1.12013,130773.8750,0.001798,0.001796


Unnamed: 0,open,high,low,close,volume,return,log_return
min,0.8267,0.8324,0.8229,0.8264,54.0,-0.026897,-0.027265
max,1.5972,1.60389,1.58643,1.59721,2697619.0,0.034954,0.034357
mean,1.203174,1.208784,1.197732,1.20323,483517.5,7.1e-05,5.2e-05
median,1.213745,1.2188,1.20693,1.213665,217876.1,6.7e-05,6.7e-05
std,0.165674,0.166238,0.164876,0.165638,537392.5,0.006168,0.006167


Log Returns


Unnamed: 0_level_0,EURUSD,GBPUSD,USDCAD,USDJPY
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999-01-04,0.005140,-0.000904,-0.003269,-0.013065
1999-01-06,-0.006868,-0.009712,-0.000992,0.013452
1999-01-07,0.005234,-0.003025,0.001191,-0.014041
1999-01-08,-0.006726,-0.000244,-0.001124,-0.001622
1999-01-11,-0.001644,0.000429,-0.003977,-0.018750
...,...,...,...,...
2019-12-23,0.001046,-0.004786,0.000145,-0.000987
2019-12-24,-0.000406,-0.001013,0.000738,-0.000073
2019-12-27,0.006286,0.005752,-0.002467,-0.000968
2019-12-30,0.001796,0.001641,-0.001064,-0.005167


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
min,-0.027265,-0.063726,-0.037117,-0.034561
max,0.034357,0.035111,0.034804,0.052261
mean,5.2e-05,-8e-06,-5e-05,-6.3e-05
median,6.7e-05,-2e-05,-4.7e-05,-1.8e-05
std,0.006167,0.005728,0.005397,0.006321


time: 1.2 s (started: 2021-11-18 02:29:11 +01:00)


## Raw Data Analysis Plots

### Candlestick OHLC Plots

In [3]:
# For every currency pair, plot the rows dated within 365 days of the latest
# available date as a candlestick chart (volume enabled) and save it to plots_dir.
lookback = timedelta(days=365)
for pair in currency_pairs:
    ohlc = dfs[pair]
    cutoff = ohlc.index.max() - lookback
    recent = ohlc[ohlc.index > cutoff]
    mpf.plot(
        recent,
        title=pair,
        type="candlestick",
        style=mpf_style,
        figsize=(15, 6),
        volume=True,
        tight_layout=True,
        savefig={"fname": plots_dir / f"candlestick_{pair}.png", "dpi": 300},
    )

time: 3.2 s (started: 2021-11-17 20:25:54 +01:00)


In [4]:
# Show the saved EURUSD candlestick chart (plain literal: nothing to interpolate).
IPydisplay.Image(plots_dir / "candlestick_EURUSD.png")

<IPython.core.display.Image object>

time: 2.83 ms (started: 2021-11-17 20:25:57 +01:00)


In [5]:
# Show the saved GBPUSD candlestick chart (plain literal: nothing to interpolate).
IPydisplay.Image(plots_dir / "candlestick_GBPUSD.png")

<IPython.core.display.Image object>

time: 13.1 ms (started: 2021-11-17 20:25:57 +01:00)


In [6]:
# Show the saved USDCAD candlestick chart (plain literal: nothing to interpolate).
IPydisplay.Image(plots_dir / "candlestick_USDCAD.png")

<IPython.core.display.Image object>

time: 13.8 ms (started: 2021-11-17 20:25:57 +01:00)


In [7]:
# Show the saved USDJPY candlestick chart (plain literal: nothing to interpolate).
IPydisplay.Image(plots_dir / "candlestick_USDJPY.png")

<IPython.core.display.Image object>

time: 16.1 ms (started: 2021-11-17 20:25:57 +01:00)


### Histograms

In [8]:
# Histogram grid of the raw log returns (density-normalised, 80 bins), saved
# to plots_dir. Axis limits and ticks are symmetric around zero on x.
x_bound = 0.03
y_bound = 100
params = dict(
    xlims=(-x_bound, x_bound),
    ylims=(0, y_bound),
    xticks=np.linspace(-x_bound, x_bound, 7),
    yticks=np.linspace(0, y_bound, 6),
)
fig, ax = plot_histogram_grid(log_returns_raw, params, bins=80, density=True)
plt.savefig(plots_dir / "histograms.png")

<Figure size 3000x1800 with 4 Axes>

time: 1.15 s (started: 2021-11-17 20:25:57 +01:00)


### Violin/Box Plot

In [9]:
# Violin plot of the raw log returns, saved to plots_dir.
params = dict(
    ylims=(-0.0667, 0.0667),
    yticks=np.linspace(-0.06, 0.06, 7),
)
fig, ax = plot_violin(log_returns_raw, params)
plt.savefig(plots_dir / "violin.png")

<Figure size 2700x1800 with 1 Axes>

time: 538 ms (started: 2021-11-17 20:25:58 +01:00)


## Data Curation

In [10]:
# Load the curated log returns: same source and period as configured in the
# Data Import cell, with outliers above the threshold removed by the loader.
# The source and dates are derived from the shared configuration instead of
# being repeated as literals, so the raw and curated frames cannot drift apart.
# NOTE(review): the unit of `outlier_threshold` is defined by
# `load_log_returns` and is not visible here — confirm before changing it.
date_format = "%Y-%m-%d"  # string format this call was originally written with
log_returns = load_log_returns(
    data_source,
    start_date=start_date.strftime(date_format),
    end_date=end_date.strftime(date_format),
    outlier_threshold=10,
)

# Side-by-side summary statistics to see the effect of the curation.
print("\nRaw Data")
display(compute_df_stats(log_returns_raw))
print("Curated")
display(compute_df_stats(log_returns))


Raw Data


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
min,-0.027265,-0.063726,-0.037117,-0.034561
max,0.034357,0.035111,0.034804,0.052261
mean,5.2e-05,-8e-06,-5e-05,-6.3e-05
median,6.7e-05,-2e-05,-4.7e-05,-1.8e-05
std,0.006167,0.005728,0.005397,0.006321


Curated


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
min,-0.027265,-0.038992,-0.037117,-0.034561
max,0.034357,0.035111,0.034804,0.052261
mean,5.6e-05,4e-06,-5.3e-05,-5.8e-05
median,6.7e-05,-2e-05,-4.7e-05,-1.8e-05
std,0.00616,0.00566,0.005394,0.006309


time: 23.8 ms (started: 2021-11-17 20:25:59 +01:00)


In [11]:
# Violin plot of the curated log returns, saved to plots_dir.
params = dict(
    ylims=(-0.0667, 0.0667),
    yticks=np.linspace(-0.06, 0.06, 7),
)
fig, ax = plot_violin(log_returns, params)
plt.savefig(plots_dir / "violin_train.png")

<Figure size 2700x1800 with 1 Axes>

time: 531 ms (started: 2021-11-17 20:25:59 +01:00)


## Data Transformation

In [12]:
# Build a PowerTransformer (power=0.5, threshold=1) from the raw log returns
# and apply it to that same frame.
# NOTE(review): this operates on `log_returns_raw`, not on the curated
# `log_returns` produced in the Data Curation section — confirm that
# transforming the uncurated data is intended.
transformer = PowerTransformer(log_returns_raw, power=0.5, threshold=1)
log_returns_transformed = transformer.transform(log_returns_raw)

time: 11.3 ms (started: 2021-11-17 20:25:59 +01:00)


### Histograms

In [13]:
# Histogram grid of the transformed log returns (density-normalised, 30 bins),
# saved to plots_dir.
x_bound = 0.03
y_bound = 100
params = dict(
    xlims=(-x_bound, x_bound),
    ylims=(0, y_bound),
    xticks=np.linspace(-x_bound, x_bound, 7),
    yticks=np.linspace(0, y_bound, 6),
)
fig, ax = plot_histogram_grid(log_returns_transformed, params, bins=30, density=True)
plt.savefig(plots_dir / "histograms_transformed.png")

<Figure size 3000x1800 with 4 Axes>

time: 928 ms (started: 2021-11-17 20:25:59 +01:00)


### Violin/Box Plot

In [14]:
# Violin plot of the transformed log returns, saved to plots_dir.
# NOTE(review): the requested yticks span [-0.04, 0.04] while ylims is
# (-0.0233, 0.0233); how `plot_violin` reconciles the two is not visible
# here — confirm the rendered axis is what was intended.
params = dict(
    ylims=(-0.0233, 0.0233),
    yticks=np.linspace(-0.04, 0.04, 9),
)
fig, ax = plot_violin(log_returns_transformed, params)
plt.savefig(plots_dir / "violin_transformed.png")

<Figure size 2700x1800 with 1 Axes>

time: 538 ms (started: 2021-11-17 20:26:00 +01:00)


## Binarization of Data

### Binarization of Data

In [15]:
# Per-column binarization settings: a fixed bit width plus the column's own
# observed minimum and maximum as the value range.
n_bits = 16
binarization_params = {
    column: {
        "n_bits": n_bits,
        "x_min": log_returns[column].min(),
        "x_max": log_returns[column].max(),
    }
    for column in log_returns.columns
}

# Round-trip the curated data through binarize/unbinarize and report the
# largest and the average absolute deviation per column.
log_returns_binarized = binarize_df(log_returns, binarization_params)
log_returns_unbinarized = unbinarize_df(log_returns_binarized, binarization_params)
round_trip_error = (log_returns - log_returns_unbinarized).abs()
print(f"Discretization Errors for n_bits = {n_bits}")
display(round_trip_error.agg(["max", "mean"]))

Discretization Errors for n_bits = 16


Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
max,4.699172e-07,5.653249e-07,5.486262e-07,6.623774e-07
mean,2.317001e-07,2.837606e-07,2.748302e-07,3.292663e-07


time: 63.9 ms (started: 2021-11-17 20:26:01 +01:00)


## Correlation Coefficients

In [16]:
# All six unordered pairs of the four currency pairs. The order is fixed by
# hand and determines the row order of the resulting table; `combinations`
# is reused by the KDE section further down.
combinations = (
    ("EURUSD", "GBPUSD"),
    ("EURUSD", "USDJPY"),
    ("EURUSD", "USDCAD"),
    ("GBPUSD", "USDJPY"),
    ("GBPUSD", "USDCAD"),
    ("USDJPY", "USDCAD"),
)
# Correlation coefficients of the curated log returns for each pair above.
correlation_coefficients = compute_correlation_coefficients(log_returns, combinations)
# Bare last expression so the notebook renders the table.
correlation_coefficients

Unnamed: 0,Pearson,Spearman,Kendall
EURUSD/GBPUSD,0.618646,0.617284,0.444639
EURUSD/USDJPY,-0.255886,-0.303177,-0.212124
EURUSD/USDCAD,-0.443807,-0.411753,-0.286886
GBPUSD/USDJPY,-0.138721,-0.207959,-0.145096
GBPUSD/USDCAD,-0.423923,-0.373673,-0.258303
USDJPY/USDCAD,0.00219,0.060884,0.042477


time: 18.2 ms (started: 2021-11-17 20:26:01 +01:00)


## Probability Density Estimation

### KDE

In [17]:
from sklearn.neighbors import KernelDensity

# Fixed seed so the drawn sample — and the QQ plot and volatility comparison
# built from it in the following cells — is identical on every
# Restart & Run All instead of changing with each execution.
kde_seed = 42

# Fit a Gaussian KDE on the curated log returns and draw as many synthetic
# rows as there are observations.
kde = KernelDensity(kernel="gaussian", bandwidth=0.0001).fit(log_returns.to_numpy())
samples = pd.DataFrame(
    kde.sample(log_returns.shape[0], random_state=kde_seed),
    columns=log_returns.columns,
)
samples

Unnamed: 0,EURUSD,GBPUSD,USDCAD,USDJPY
0,0.003734,0.004766,-0.003005,0.008220
1,0.002564,0.001632,0.003112,0.004052
2,0.000345,-0.003756,-0.002496,-0.004794
3,0.007721,0.003112,0.003966,-0.009438
4,-0.004092,-0.001645,0.001767,0.000392
...,...,...,...,...
5160,-0.002548,-0.000704,0.007154,0.013358
5161,0.000652,0.002378,0.000327,0.001645
5162,-0.004745,-0.009863,0.005019,0.005476
5163,0.012994,0.002949,0.001464,-0.006887


time: 46.7 ms (started: 2021-11-17 20:26:01 +01:00)


In [18]:
# --- QQ plot: curated data vs. the KDE sample drawn in the previous cell ------
# NOTE(review): "test" looks like a placeholder title; left unchanged because
# it is passed through to `plot_qq_grid` — confirm and replace if it is rendered.
params = {
    "title": "test",
    "xlims": (-0.045, 0.045),
    "ylims": (-0.045, 0.045),
    "xticks": np.linspace(-0.04, 0.04, 9),
    "yticks": np.linspace(-0.04, 0.04, 9),
}
fig, axs = plot_qq_grid(log_returns, samples, params)
plt.savefig(plots_dir / "kde_qq.png")

# --- Correlation coefficients: data vs. repeated KDE resamples ----------------
correlation_coefficients_data = compute_correlation_coefficients(
    log_returns, combinations
)

# Each resample gets its own fixed seed, so the aggregate statistics are
# reproducible across runs while the individual resamples still differ.
n_resamples = 100
n_observations = log_returns.shape[0]
column_names = log_returns.columns
correlation_coefficients_sample = compute_stats_over_dfs(
    [
        compute_correlation_coefficients(
            pd.DataFrame(
                kde.sample(n_observations, random_state=seed), columns=column_names
            ),
            combinations,
        )
        for seed in range(n_resamples)
    ]
)

fig, axs = plot_correlation_coefficients(
    correlation_coefficients_data, correlation_coefficients_sample
)
plt.savefig(plots_dir / "kde_correlation_coefficients.png")

<Figure size 2700x2700 with 4 Axes>

<Figure size 3000x1800 with 6 Axes>

time: 3.98 s (started: 2021-11-17 20:26:01 +01:00)


In [19]:
# Compare the annualized volatility of the curated data with that of the
# KDE sample, one column each.
print("Annualized Volatility")
annualized_volatility_by_source = {
    "data": compute_annualized_volatility(log_returns),
    "sample": compute_annualized_volatility(samples),
}
pd.DataFrame(annualized_volatility_by_source)

Annualized Volatility


Unnamed: 0,data,sample
EURUSD,0.097785,0.09756
GBPUSD,0.089846,0.091575
USDCAD,0.085625,0.086258
USDJPY,0.100156,0.095926


time: 5.45 ms (started: 2021-11-17 20:26:05 +01:00)


## Volatility

In [5]:
from qbm.metrics import compute_rolling_volatility
from qbm.plotting import plot_volatility_grid
from qbm.utils import binarize_volatility

# Single definition of the rolling window; it is used both for the look-back
# before the analysis period and for the volatility computation itself.
volatility_window = timedelta(days=90)
# Horizontal margin added on each side of the plotted date range.
x_axis_padding = timedelta(days=90)

# Select the analysis period plus one window of look-back, reusing the
# start_date / end_date configured in the Data Import cell.
# NOTE(review): `log_returns_raw` was already truncated to
# [start_date, end_date] in the Data Import cell, so the look-back selects no
# additional rows here and the merged frame only starts in April 1999.
# Keep an untruncated copy of the raw returns if the look-back is required.
log_returns_volatility = log_returns_raw.loc[
    (log_returns_raw.index >= start_date - volatility_window)
    & (log_returns_raw.index <= end_date)
]
volatility = compute_rolling_volatility(log_returns_volatility, volatility_window)
volatility_binarized = binarize_volatility(volatility)

# Inner join on the date index (the `pd.merge` default): only dates present in
# both the log returns and the binarized volatility are kept.
train_data = pd.merge(
    log_returns_volatility, volatility_binarized, left_index=True, right_index=True
)
display(train_data)

params = {
    "yscale": "log",
    "xlims": (
        volatility.index.min() - x_axis_padding,
        volatility.index.max() + x_axis_padding,
    ),
    "ylims": (3e-2, 3e-1),
    "xticks": [date(year, 1, 1) for year in range(1996, 2025, 4)],
    "label": "3M Rolling",
}
fig, ax = plot_volatility_grid(volatility, params)
plt.savefig(plots_dir / "rolling_volatility.png")

Unnamed: 0_level_0,EURUSD,GBPUSD,USDCAD,USDJPY,EURUSD_volatility_binary,GBPUSD_volatility_binary,USDCAD_volatility_binary,USDJPY_volatility_binary
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1999-04-06,0.006283,-0.005453,0.001598,-0.008912,0,0,0,1
1999-04-07,-0.003327,0.003012,-0.003334,0.006361,0,0,0,1
1999-04-08,0.003571,-0.000311,0.000600,-0.002059,0,0,0,1
1999-04-12,0.002124,0.000124,-0.005615,-0.003732,0,0,0,1
1999-04-13,-0.001853,-0.000247,-0.000939,-0.006247,0,0,0,1
...,...,...,...,...,...,...,...,...
2019-12-23,0.001046,-0.004786,0.000145,-0.000987,0,1,0,0
2019-12-24,-0.000406,-0.001013,0.000738,-0.000073,0,1,0,0
2019-12-27,0.006286,0.005752,-0.002467,-0.000968,0,1,0,0
2019-12-30,0.001796,0.001641,-0.001064,-0.005167,0,1,0,0


<Figure size 3000x1800 with 4 Axes>

time: 3.08 s (started: 2021-11-18 02:29:57 +01:00)
