In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pathlib
import datetime  
from time import process_time
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split 
from sklearn.multioutput import MultiOutputRegressor 
from sklearn.metrics import mean_absolute_error
from sklearn.inspection import permutation_importance

In [2]:
blocks_df = pd.read_csv('pool_blocks_2.csv', index_col='height')
print(blocks_df.shape)
blocks_df.tail()

(3049, 15)


Unnamed: 0_level_0,next_block_mean_fee,hour_mean_fee,six_hour_mean_fee,day_mean_fee,time_btwn_blocks,next_block_sat/vB,hour_block_sat/vB,six_hour_block_sat/vB,day_block_sat/vB,mempool_mean_fee,mempool_mean_vBytes,month,day,hour,minute
height,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
664549,33980.46515,11384.315789,9187.204423,8829.081983,401.667629,19.756141,9.881303,8.25,8.212914,0.000107,95.017185,1,4,18,43
664550,29661.188679,11332.593466,8521.071743,8693.17392,121.387896,19.803509,8.021448,7.628581,7.691099,8.8e-05,100.031976,1,4,18,45
664551,25751.310393,8703.527446,8196.988623,7633.777096,617.352392,19.788886,11.670282,7.50536,7.504585,9.8e-05,95.282053,1,4,18,55
664552,11928.25,10031.604167,8043.71152,7179.456915,111.200127,12.67168,6.616592,6.490284,6.503805,7.6e-05,105.228578,1,4,18,57
664553,28603.645748,11213.539877,7928.369882,1.0,872.896248,19.857541,19.775785,4.443284,1.0,0.000106,102.803255,1,4,19,12


# Split Data

In [3]:
# satoshis per virtual byte
target = ['next_block_sat/vB', 'hour_block_sat/vB', 'six_hour_block_sat/vB', 'day_block_sat/vB'] 
y = blocks_df[target]
X = blocks_df.drop(columns=target)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=42)

assert len(X_train) + len(X_test) == len(X)

# Baseline

In [5]:
y_pred = [y_train.mean()] * len(y_train)

print('Baseline MAE:', mean_absolute_error(y_train, y_pred, multioutput='raw_values'))

Baseline MAE: [7.23462807 9.17287839 5.11920087 2.67068187]


# Model

In [6]:
xgb_more = MultiOutputRegressor(xgb.XGBRegressor(objective ='reg:squarederror', seed = 42))

In [7]:
param_dist = {"estimator__n_estimators" : range(75, 200, 25),
             "estimator__subsample" : np.arange(0.5, 1.0, 0.1),
             "estimator__max_depth" : range(2, 6, 1)}

cv_model = RandomizedSearchCV(
    xgb_more,
    param_distributions=param_dist,
    cv=5,
    n_jobs=-1,
    n_iter=50
)

# cv_model = GridSearchCV(
#     xgb_more,
#     param_grid=param_dist,
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

cv_model.fit(X_train, y_train);

In [8]:
cv_model.best_params_

{'estimator__subsample': 0.8999999999999999,
 'estimator__n_estimators': 75,
 'estimator__max_depth': 5}

# Metrics

In [9]:
print('Training MAE:', mean_absolute_error(y_train, cv_model.predict(X_train), multioutput='raw_values'))
print('Test MAE:', mean_absolute_error(y_test, cv_model.predict(X_test), multioutput='raw_values'))

# Grid
# Training MAE: [1.1658414  1.84030962 0.35672489 0.12263735]
# Test MAE: [3.01844642 4.32106921 1.24775473 0.48603203]

# 10
# Training MAE: [0.60933472 0.98675779 0.17975069 0.06058145]
# Test MAE: [3.03491914 4.62674619 1.2299598  0.47650211]

Training MAE: [1.1658414  1.84030962 0.35672489 0.12263735]
Test MAE: [3.01844642 4.32106921 1.24775473 0.48603203]


In [10]:
perm = permutation_importance(cv_model, X_test, y_test, n_jobs=-1, random_state=42)

data = {
    'importances_mean' : perm['importances_mean'],
    'importances_std' : perm['importances_std']
}

permutation_importances = pd.DataFrame(data, index=X_test.columns).sort_values(by='importances_mean', ascending=False)
permutation_importances

Unnamed: 0,importances_mean,importances_std
six_hour_mean_fee,0.384138,0.018795
day_mean_fee,0.333376,0.013001
time_btwn_blocks,0.222768,0.017883
mempool_mean_fee,0.191023,0.018065
day,0.162706,0.009587
mempool_mean_vBytes,0.088564,0.008068
next_block_mean_fee,0.069326,0.0045
hour_mean_fee,0.062814,0.0032
hour,0.019266,0.002247
month,0.003345,0.00051
