# Bitcoin Historical Data
Bitcoin data at 1-min intervals from select exchanges, Jan 2012 to March 2021

Link: https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data

In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters

<IPython.core.display.Javascript object>

In [3]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [4]:
df = pd.read_csv(
    "../data/mczielinski_bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv.gz"
)
df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="s")
df.fillna(0, inplace=True)
df.set_index("Timestamp", inplace=True)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31 07:52:00,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000
2011-12-31 07:53:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000
2011-12-31 07:54:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000
2011-12-31 07:55:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000
2011-12-31 07:56:00,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
2021-03-30 23:56:00,58714.31,58714.31,58686.00,58686.00,1.384487,81259.372187,58692.753339
2021-03-30 23:57:00,58683.97,58693.43,58683.97,58685.81,7.294848,428158.146640,58693.226508
2021-03-30 23:58:00,58693.43,58723.84,58693.43,58723.84,1.705682,100117.070370,58696.198496
2021-03-30 23:59:00,58742.18,58770.38,58742.18,58760.59,0.720415,42332.958633,58761.866202


<IPython.core.display.Javascript object>

In [5]:
df.index.year.value_counts()

2012    527040
2016    527040
2020    527040
2013    525600
2014    525600
2017    525600
2018    525600
2019    525600
2015    519128
2021    128161
2011       968
Name: Timestamp, dtype: int64

<IPython.core.display.Javascript object>

In [6]:
# построение прогноза только для 1 года
df = df[df.index.year == 2019]
df.shape

(525600, 7)

<IPython.core.display.Javascript object>

# Подготовка признаков

In [8]:
scaler = MinMaxScaler()

<IPython.core.display.Javascript object>

In [9]:
X = df[["Open", "High", "Low", "Close"]].copy()
X[X.columns] = scaler.fit_transform(X)
X["Date"] = X.index.round("30min")
X

Unnamed: 0_level_0,Open,High,Low,Close,Date
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01 00:00:00,0.266641,0.266483,0.267089,0.266993,2019-01-01
2019-01-01 00:01:00,0.266704,0.266190,0.266857,0.266462,2019-01-01
2019-01-01 00:02:00,0.266344,0.265831,0.266566,0.266171,2019-01-01
2019-01-01 00:03:00,0.266569,0.266055,0.266689,0.266585,2019-01-01
2019-01-01 00:04:00,0.266533,0.266019,0.266835,0.266440,2019-01-01
...,...,...,...,...,...
2019-12-31 23:55:00,0.517979,0.517052,0.518847,0.518151,2020-01-01
2019-12-31 23:56:00,0.518470,0.517470,0.518450,0.517683,2020-01-01
2019-12-31 23:57:00,0.517151,0.516628,0.517832,0.517444,2020-01-01
2019-12-31 23:58:00,0.517344,0.516376,0.517857,0.517473,2020-01-01


<IPython.core.display.Javascript object>

In [12]:
# https://otus.ru/nest/post/1024/
settings = ComprehensiveFCParameters()
X = extract_features(X, column_id="Date", default_fc_parameters=settings)
X.shape

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 30/30 [25:02<00:00, 50.08s/it]


(17521, 3176)

<IPython.core.display.Javascript object>

In [14]:
y = df[["Weighted_Price"]].copy()
y["Date"] = y.index.round("30min")
y = y.groupby("Date").mean()
y

Unnamed: 0_level_0,Weighted_Price
Date,Unnamed: 1_level_1
2019-01-01 00:00:00,2767.072716
2019-01-01 00:30:00,2545.995022
2019-01-01 01:00:00,2860.814664
2019-01-01 01:30:00,2163.408399
2019-01-01 02:00:00,3205.776311
...,...
2019-12-31 22:00:00,7158.608490
2019-12-31 22:30:00,6918.165685
2019-12-31 23:00:00,7172.228421
2019-12-31 23:30:00,6918.312048


<IPython.core.display.Javascript object>

In [15]:
# для бинарного обучения классификатора (up/down)
def accuracy_target(labels):
    labels_acc = labels.copy()
    labels_acc["Weighted_Price_1"] = labels.shift(1)["Weighted_Price"].fillna(0)
    labels_acc["Target"] = (
        labels_acc["Weighted_Price"] > labels_acc["Weighted_Price_1"]
    ).astype(int)
    return labels_acc


y = accuracy_target(y)
y = y[["Target"]]
y

Unnamed: 0_level_0,Target
Date,Unnamed: 1_level_1
2019-01-01 00:00:00,1
2019-01-01 00:30:00,0
2019-01-01 01:00:00,1
2019-01-01 01:30:00,0
2019-01-01 02:00:00,1
...,...
2019-12-31 22:00:00,1
2019-12-31 22:30:00,0
2019-12-31 23:00:00,1
2019-12-31 23:30:00,0


<IPython.core.display.Javascript object>

# Обучение модели

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15768, 3176), (1753, 3176), (15768, 1), (1753, 1))

<IPython.core.display.Javascript object>

In [17]:
tscv = TimeSeriesSplit(n_splits=5)

<IPython.core.display.Javascript object>

In [18]:
ensemble = []

for train_index, val_index in tscv.split(X_train):
    X_sub_train, X_sub_valid = X_train.iloc[train_index], X_train.iloc[val_index]
    y_sub_train, y_sub_valid = (
        y_train.iloc[train_index],
        y_train.iloc[val_index],
    )

    train_pool = Pool(X_sub_train, y_sub_train)
    valid_pool = Pool(X_sub_valid, y_sub_valid)

    model = CatBoostClassifier()
    model.fit(train_pool, eval_set=valid_pool, verbose=False)

    ensemble.append(model)
    print(model.get_best_score())

{'learn': {'Logloss': 0.0671312398925038}, 'validation': {'Logloss': 0.6063262268635125}}
{'learn': {'Logloss': 0.12422802604315307}, 'validation': {'Logloss': 0.5723687548426111}}
{'learn': {'Logloss': 0.16342434043632673}, 'validation': {'Logloss': 0.5306976791775349}}
{'learn': {'Logloss': 0.18785014217847015}, 'validation': {'Logloss': 0.610581707336832}}
{'learn': {'Logloss': 0.21553184510076837}, 'validation': {'Logloss': 0.5823707942003101}}


<IPython.core.display.Javascript object>

In [26]:
preds = [m.predict(X_test) for m in ensemble]
preds = pd.DataFrame(
    np.array(preds).T, columns=["Model_%d" % i for i in range(len(ensemble))]
)
preds

Unnamed: 0,Model_0,Model_1,Model_2,Model_3,Model_4
0,1,1,1,1,1
1,1,1,1,1,1
2,1,1,1,1,1
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
1748,1,1,1,1,1
1749,0,0,0,0,0
1750,1,1,1,1,1
1751,0,0,0,0,0


<IPython.core.display.Javascript object>

In [31]:
y_pred = preds.mean(axis=1).astype(int)
y_pred

0       1
1       1
2       1
3       0
4       0
       ..
1748    1
1749    0
1750    1
1751    0
1752    0
Length: 1753, dtype: int64

<IPython.core.display.Javascript object>

In [32]:
accuracy_score(y_test, y_pred)

0.7090701654306902

<IPython.core.display.Javascript object>