# Bitcoin Historical Data
Bitcoin data at 1-min intervals from select exchanges, Jan 2012 to March 2021

Link: https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool, sum_models, to_classifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
df = pd.read_csv(
    "../data/mczielinski_bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv"
)
df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="s")
df.fillna(0, inplace=True)
df.set_index("Timestamp", inplace=True)
df["Date"] = df.index.round("30min")
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price,Date
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-12-31 07:52:00,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000,2011-12-31 08:00:00
2011-12-31 07:53:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
2011-12-31 07:54:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
2011-12-31 07:55:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
2011-12-31 07:56:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,2011-12-31 08:00:00
...,...,...,...,...,...,...,...,...
2021-03-30 23:56:00,58714.31,58714.31,58686.00,58686.00,1.384487,81259.372187,58692.753339,2021-03-31 00:00:00
2021-03-30 23:57:00,58683.97,58693.43,58683.97,58685.81,7.294848,428158.146640,58693.226508,2021-03-31 00:00:00
2021-03-30 23:58:00,58693.43,58723.84,58693.43,58723.84,1.705682,100117.070370,58696.198496,2021-03-31 00:00:00
2021-03-30 23:59:00,58742.18,58770.38,58742.18,58760.59,0.720415,42332.958633,58761.866202,2021-03-31 00:00:00


<IPython.core.display.Javascript object>

In [4]:
df.index.year.value_counts()

2012    527040
2016    527040
2020    527040
2013    525600
2014    525600
2017    525600
2018    525600
2019    525600
2015    519128
2021    128161
2011       968
Name: Timestamp, dtype: int64

<IPython.core.display.Javascript object>

In [5]:
# построение прогноза только для 1 года
df = df[df.index.year == 2019].copy()
df.shape

(525600, 8)

<IPython.core.display.Javascript object>

# Подготовка признаков

In [6]:
scaler = MinMaxScaler()

<IPython.core.display.Javascript object>

In [7]:
X = df[["Open", "High", "Low", "Close"]].copy()
X[X.columns] = scaler.fit_transform(X)
X["Date"] = X.index.round("30min")
X

Unnamed: 0_level_0,Open,High,Low,Close,Date
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01 00:00:00,0.266641,0.266483,0.267089,0.266993,2019-01-01
2019-01-01 00:01:00,0.266704,0.266190,0.266857,0.266462,2019-01-01
2019-01-01 00:02:00,0.266344,0.265831,0.266566,0.266171,2019-01-01
2019-01-01 00:03:00,0.266569,0.266055,0.266689,0.266585,2019-01-01
2019-01-01 00:04:00,0.266533,0.266019,0.266835,0.266440,2019-01-01
...,...,...,...,...,...
2019-12-31 23:55:00,0.517979,0.517052,0.518847,0.518151,2020-01-01
2019-12-31 23:56:00,0.518470,0.517470,0.518450,0.517683,2020-01-01
2019-12-31 23:57:00,0.517151,0.516628,0.517832,0.517444,2020-01-01
2019-12-31 23:58:00,0.517344,0.516376,0.517857,0.517473,2020-01-01


<IPython.core.display.Javascript object>

In [8]:
# https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html
# https://otus.ru/nest/post/1024/
X = extract_features(
    X, column_id="Date", default_fc_parameters=ComprehensiveFCParameters()
)
X.shape

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 30/30 [28:16<00:00, 56.56s/it]


(17521, 3176)

<IPython.core.display.Javascript object>

In [9]:
y = df[["Weighted_Price", "Date"]].groupby("Date").mean()
y

Unnamed: 0_level_0,Weighted_Price
Date,Unnamed: 1_level_1
2019-01-01 00:00:00,2767.072716
2019-01-01 00:30:00,2545.995022
2019-01-01 01:00:00,2860.814664
2019-01-01 01:30:00,2163.408399
2019-01-01 02:00:00,3205.776311
...,...
2019-12-31 22:00:00,7158.608490
2019-12-31 22:30:00,6918.165685
2019-12-31 23:00:00,7172.228421
2019-12-31 23:30:00,6918.312048


<IPython.core.display.Javascript object>

In [10]:
y["Weighted_Price_1"] = y["Weighted_Price"].shift(-1).fillna(0)
y["Target"] = (y["Weighted_Price_1"] > y["Weighted_Price"]).astype(int)
y

Unnamed: 0_level_0,Weighted_Price,Weighted_Price_1,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 00:00:00,2767.072716,2545.995022,0
2019-01-01 00:30:00,2545.995022,2860.814664,1
2019-01-01 01:00:00,2860.814664,2163.408399,0
2019-01-01 01:30:00,2163.408399,3205.776311,1
2019-01-01 02:00:00,3205.776311,3551.331120,1
...,...,...,...
2019-12-31 22:00:00,7158.608490,6918.165685,0
2019-12-31 22:30:00,6918.165685,7172.228421,1
2019-12-31 23:00:00,7172.228421,6918.312048,0
2019-12-31 23:30:00,6918.312048,6693.794769,0


<IPython.core.display.Javascript object>

In [11]:
y = y[["Target"]]
y.value_counts()

Target
1         8830
0         8691
dtype: int64

<IPython.core.display.Javascript object>

# Подбор гиперпараметров

In [12]:
tscv = TimeSeriesSplit(n_splits=10)

<IPython.core.display.Javascript object>

In [13]:
model = CatBoostClassifier(logging_level="Silent")

# https://effectiveml.com/using-grid-search-to-optimise-catboost-parameters.html
grid_params = {
    "depth": [3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
    "iterations": [250, 100, 500, 1000],
    "learning_rate": [0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
    "l2_leaf_reg": [3, 1, 5, 10, 100],
}

grid_search_result = model.randomized_search(
    grid_params, Pool(X, y), cv=tscv, verbose=False, plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<IPython.core.display.Javascript object>

In [14]:
best_model_params = grid_search_result["params"]
best_model_params

{'depth': 6, 'l2_leaf_reg': 5, 'iterations': 1000, 'learning_rate': 0.03}

<IPython.core.display.Javascript object>

# Обучение модели

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15768, 3176), (1753, 3176), (15768, 1), (1753, 1))

<IPython.core.display.Javascript object>

In [16]:
ensemble = []

for train_index, val_index in tscv.split(X_train):
    X_sub_train, X_sub_valid = X_train.iloc[train_index], X_train.iloc[val_index]
    y_sub_train, y_sub_valid = (
        y_train.iloc[train_index],
        y_train.iloc[val_index],
    )

    train_pool = Pool(X_sub_train, y_sub_train)
    valid_pool = Pool(X_sub_valid, y_sub_valid)

    model = CatBoostClassifier(**best_model_params)
    model.fit(train_pool, eval_set=valid_pool, verbose=False)

    ensemble.append(model)
    print(model.get_best_score())

{'learn': {'Logloss': 0.05427629441962693}, 'validation': {'Logloss': 0.6236366783726128}}
{'learn': {'Logloss': 0.14815629504743305}, 'validation': {'Logloss': 0.605044209339462}}
{'learn': {'Logloss': 0.21801102491814928}, 'validation': {'Logloss': 0.621397205253102}}
{'learn': {'Logloss': 0.2641709495275425}, 'validation': {'Logloss': 0.5555572451604841}}
{'learn': {'Logloss': 0.2880504727915731}, 'validation': {'Logloss': 0.5276045438609853}}
{'learn': {'Logloss': 0.3051795771886179}, 'validation': {'Logloss': 0.5789902524416514}}
{'learn': {'Logloss': 0.33001438406239836}, 'validation': {'Logloss': 0.5336285971621911}}
{'learn': {'Logloss': 0.34335096373735463}, 'validation': {'Logloss': 0.5959970901029138}}
{'learn': {'Logloss': 0.3640614142117156}, 'validation': {'Logloss': 0.5658283239673291}}
{'learn': {'Logloss': 0.37264384423457264}, 'validation': {'Logloss': 0.5680855194920233}}


<IPython.core.display.Javascript object>

In [17]:
models_avrg = sum_models(ensemble, weights=[1.0 / len(ensemble)] * len(ensemble))
models_avrg = to_classifier(models_avrg)
models_avrg

<catboost.core.CatBoostClassifier at 0x7f568fefee80>

<IPython.core.display.Javascript object>

In [18]:
y_pred = models_avrg.predict(X_test)
y_pred

array([1, 1, 1, ..., 0, 1, 1])

<IPython.core.display.Javascript object>

In [19]:
accuracy_score(y_test, y_pred)

0.7221905305191101

<IPython.core.display.Javascript object>