# Bitcoin Historical Data
Bitcoin data at 1-min intervals from select exchanges, Jan 2012 to March 2021

Link: https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, sum_models, to_classifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
df = pd.read_csv(
    "../../data/mczielinski_bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv"
)
df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="s")
df.fillna(0, inplace=True)
df.set_index("Timestamp", inplace=True)
df["Period"] = df.index.round("30min")
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price,Period
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-12-31 07:52:00,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000,2011-12-31 08:00:00
2011-12-31 07:53:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
2011-12-31 07:54:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
2011-12-31 07:55:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
2011-12-31 07:56:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
...,...,...,...,...,...,...,...,...
2021-03-30 23:56:00,58714.31,58714.31,58686.00,58686.00,1.384487,81259.372187,58692.753339,2021-03-31 00:00:00
2021-03-30 23:57:00,58683.97,58693.43,58683.97,58685.81,7.294848,428158.146640,58693.226508,2021-03-31 00:00:00
2021-03-30 23:58:00,58693.43,58723.84,58693.43,58723.84,1.705682,100117.070370,58696.198496,2021-03-31 00:00:00
2021-03-30 23:59:00,58742.18,58770.38,58742.18,58760.59,0.720415,42332.958633,58761.866202,2021-03-31 00:00:00


<IPython.core.display.Javascript object>

In [4]:
df.index.year.value_counts()

2012    527040
2016    527040
2020    527040
2013    525600
2014    525600
2017    525600
2018    525600
2019    525600
2015    519128
2021    128161
2011       968
Name: Timestamp, dtype: int64

<IPython.core.display.Javascript object>

In [5]:
df = df[df.index.year == 2019].copy()
df.shape

(525600, 8)

<IPython.core.display.Javascript object>

# Prepare

In [6]:
scaler = StandardScaler()

<IPython.core.display.Javascript object>

In [7]:
X = df[["Open", "High", "Low", "Close"]].copy()
X[X.columns] = scaler.fit_transform(X)
X["Period"] = X.index.round("30min")
X

Unnamed: 0_level_0,Open,High,Low,Close,Period
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01 00:00:00,-1.203847,-1.202548,-1.203487,-1.202431,2019-01-01
2019-01-01 00:01:00,-1.203546,-1.203951,-1.204592,-1.204967,2019-01-01
2019-01-01 00:02:00,-1.205268,-1.205671,-1.205983,-1.206358,2019-01-01
2019-01-01 00:03:00,-1.204192,-1.204596,-1.205396,-1.204380,2019-01-01
2019-01-01 00:04:00,-1.204364,-1.204768,-1.204699,-1.205074,2019-01-01
...,...,...,...,...,...
2019-12-31 23:55:00,-0.002605,-0.003733,-0.001091,-0.002168,2020-01-01
2019-12-31 23:56:00,-0.000259,-0.001730,-0.002987,-0.004404,2020-01-01
2019-12-31 23:57:00,-0.006562,-0.005760,-0.005939,-0.005546,2020-01-01
2019-12-31 23:58:00,-0.005641,-0.006966,-0.005822,-0.005404,2020-01-01


<IPython.core.display.Javascript object>

In [8]:
# https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html
# https://otus.ru/nest/post/1024/
X = extract_features(
    X, column_id="Period", default_fc_parameters=ComprehensiveFCParameters()
)
X.shape

Feature Extraction: 100%|███████████████████████████████████████████████| 30/30 [25:36<00:00, 51.21s/it]


(17521, 3152)

<IPython.core.display.Javascript object>

In [9]:
y = df[["Weighted_Price", "Period"]].groupby("Period").mean()
y

Unnamed: 0_level_0,Weighted_Price
Period,Unnamed: 1_level_1
2019-01-01 00:00:00,2767.072716
2019-01-01 00:30:00,2545.995022
2019-01-01 01:00:00,2860.814664
2019-01-01 01:30:00,2163.408399
2019-01-01 02:00:00,3205.776311
...,...
2019-12-31 22:00:00,7158.608490
2019-12-31 22:30:00,6918.165685
2019-12-31 23:00:00,7172.228421
2019-12-31 23:30:00,6918.312048


<IPython.core.display.Javascript object>

In [10]:
y["Weighted_Price_1"] = y["Weighted_Price"].shift(-1).fillna(0)
y["Target"] = (y["Weighted_Price_1"] > y["Weighted_Price"]).astype(int)
y

Unnamed: 0_level_0,Weighted_Price,Weighted_Price_1,Target
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 00:00:00,2767.072716,2545.995022,0
2019-01-01 00:30:00,2545.995022,2860.814664,1
2019-01-01 01:00:00,2860.814664,2163.408399,0
2019-01-01 01:30:00,2163.408399,3205.776311,1
2019-01-01 02:00:00,3205.776311,3551.331120,1
...,...,...,...
2019-12-31 22:00:00,7158.608490,6918.165685,0
2019-12-31 22:30:00,6918.165685,7172.228421,1
2019-12-31 23:00:00,7172.228421,6918.312048,0
2019-12-31 23:30:00,6918.312048,6693.794769,0


<IPython.core.display.Javascript object>

In [11]:
y = y[["Target"]]
y.value_counts()

Target
1         8830
0         8691
dtype: int64

<IPython.core.display.Javascript object>

In [12]:
X.join(y).to_csv(
    "../../data/mczielinski_bitcoin-historical-data/bitstampUSD_30min_tsfresh_2019.csv.gz",
    compression="gzip",
)

<IPython.core.display.Javascript object>

# Train

In [13]:
X.shape, y.shape

((17521, 3152), (17521, 1))

<IPython.core.display.Javascript object>

## Hyperparameter tuning

In [14]:
tscv = TimeSeriesSplit(n_splits=10)

<IPython.core.display.Javascript object>

In [15]:
model = CatBoostClassifier(logging_level="Silent")

# https://catboost.ai/en/docs/concepts/parameter-tuning
# https://docs.aws.amazon.com/sagemaker/latest/dg/catboost-tuning.html
tuned_params = {
    "learning_rate": np.concatenate(
        (
            np.arange(1, 10, 1) / 10,
            np.arange(1, 10, 1) / 100,
            np.arange(1, 10, 1) / 1000,
        )
    ),
    "depth": np.arange(1, 11),
    "l2_leaf_reg": np.concatenate((np.arange(1, 10), np.arange(10, 110, 10))),
    "random_strength": np.arange(1, 11),
    "iterations": np.arange(100, 1100, 100),
}

grid_search_result = model.randomized_search(
    tuned_params, Pool(X, y), cv=3, verbose=False, plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<IPython.core.display.Javascript object>

In [16]:
best_model_params = grid_search_result["params"]
best_model_params

{'random_strength': 3,
 'depth': 6,
 'l2_leaf_reg': 8,
 'iterations': 900,
 'learning_rate': 0.05}

<IPython.core.display.Javascript object>

# KFold

In [18]:
X_train, X_true, y_train, y_true = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=42
)

X_train.shape, X_true.shape, y_train.shape, y_true.shape

((15768, 3152), (1753, 3152), (15768, 1), (1753, 1))

<IPython.core.display.Javascript object>

In [19]:
ensemble = []

for train_index, val_index in tscv.split(X_train):
    X_sub_train, X_sub_valid = X_train.iloc[train_index], X_train.iloc[val_index]
    y_sub_train, y_sub_valid = (
        y_train.iloc[train_index],
        y_train.iloc[val_index],
    )

    train_pool = Pool(X_sub_train, y_sub_train)
    valid_pool = Pool(X_sub_valid, y_sub_valid)

    model = CatBoostClassifier(**best_model_params, logging_level="Silent")
    model.fit(train_pool, eval_set=valid_pool, verbose=False)

    ensemble.append(model)
    print(model.get_best_score())

{'learn': {'Logloss': 0.035770811132765444}, 'validation': {'Logloss': 0.6235385013763012}}
{'learn': {'Logloss': 0.09709476980597082}, 'validation': {'Logloss': 0.6101612403113889}}
{'learn': {'Logloss': 0.1544228025528578}, 'validation': {'Logloss': 0.6297671039355128}}
{'learn': {'Logloss': 0.19336155906296243}, 'validation': {'Logloss': 0.5564755910506902}}
{'learn': {'Logloss': 0.22483174388129343}, 'validation': {'Logloss': 0.5312223392785445}}
{'learn': {'Logloss': 0.2443064261727121}, 'validation': {'Logloss': 0.5851915584725029}}
{'learn': {'Logloss': 0.26920817126347374}, 'validation': {'Logloss': 0.538208671536707}}
{'learn': {'Logloss': 0.28708932548696403}, 'validation': {'Logloss': 0.5968971235273015}}
{'learn': {'Logloss': 0.308191851005501}, 'validation': {'Logloss': 0.5677059121177677}}
{'learn': {'Logloss': 0.31716078879697596}, 'validation': {'Logloss': 0.5742902548688824}}


<IPython.core.display.Javascript object>

In [20]:
models_avrg = sum_models(ensemble, weights=[1.0 / len(ensemble)] * len(ensemble))
models_avrg = to_classifier(models_avrg)
models_avrg

<catboost.core.CatBoostClassifier at 0x7fbc54d8db70>

<IPython.core.display.Javascript object>

In [22]:
y_pred = models_avrg.predict(X_true)
y_pred

array([1, 1, 1, ..., 0, 1, 1])

<IPython.core.display.Javascript object>

In [29]:
y_pred_1 = models_avrg.predict_proba(X_true)[:, 1]
y_pred_1

array([0.65494822, 0.63493497, 0.79766308, ..., 0.38299251, 0.78587039,
       0.7582567 ])

<IPython.core.display.Javascript object>

In [23]:
(y_true["Target"] == y_pred).sum() / len(y_true)

0.7181973759269823

<IPython.core.display.Javascript object>