In [27]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from skforecast.ForecasterAutoreg import ForecasterAutoreg

In [16]:
# Location of the raw Bitcoin dataset, relative to the notebook's working directory.
DATA_PATH = "data/bitcoin_data.csv"

# Read the whole CSV into memory, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

Unnamed: 0,Date,btc_market_price,btc_total_bitcoins,btc_market_cap,btc_trade_volume,btc_blocks_size,btc_avg_block_size,btc_n_orphaned_blocks,btc_n_transactions_per_block,btc_median_confirmation_time,...,btc_cost_per_transaction_percent,btc_cost_per_transaction,btc_n_unique_addresses,btc_n_transactions,btc_n_transactions_total,btc_n_transactions_excluding_popular,btc_n_transactions_excluding_chains_longer_than_100,btc_output_volume,btc_estimated_transaction_volume,btc_estimated_transaction_volume_usd
0,2010-02-23 00:00:00,0.0,2110700.0,0.0,0.0,0.0,0.000216,0.0,1.0,0.0,...,25100.0,0.0,252.0,252.0,42613.0,252.0,252.0,12600.0,50.0,0.0
1,2010-02-24 00:00:00,0.0,2120200.0,0.0,0.0,0.0,0.000282,0.0,1.0,0.0,...,179.245283,0.0,195.0,196.0,42809.0,196.0,196.0,14800.0,5300.0,0.0
2,2010-02-25 00:00:00,0.0,2127600.0,0.0,0.0,0.0,0.000227,0.0,1.0,0.0,...,1057.142857,0.0,150.0,150.0,42959.0,150.0,150.0,8100.0,700.0,0.0
3,2010-02-26 00:00:00,0.0,2136100.0,0.0,0.0,0.0,0.000319,0.0,1.0,0.0,...,64.582059,0.0,176.0,176.0,43135.0,176.0,176.0,29349.0,13162.0,0.0
4,2010-02-27 00:00:00,0.0,2144750.0,0.0,0.0,0.0,0.000223,0.0,1.0,0.0,...,1922.222222,0.0,176.0,176.0,43311.0,176.0,176.0,9101.0,450.0,0.0


Let's do some very basic initial processing so we can immediately do some naive linear regression modelling. We need to drop the Date column first so that we only have numeric predictors.


In [17]:
# LinearRegression needs numeric inputs and Date is read in as a string, so remove it.
df = df.drop(columns=["Date"])
df.head(n=5)

Unnamed: 0,btc_market_price,btc_total_bitcoins,btc_market_cap,btc_trade_volume,btc_blocks_size,btc_avg_block_size,btc_n_orphaned_blocks,btc_n_transactions_per_block,btc_median_confirmation_time,btc_hash_rate,...,btc_cost_per_transaction_percent,btc_cost_per_transaction,btc_n_unique_addresses,btc_n_transactions,btc_n_transactions_total,btc_n_transactions_excluding_popular,btc_n_transactions_excluding_chains_longer_than_100,btc_output_volume,btc_estimated_transaction_volume,btc_estimated_transaction_volume_usd
0,0.0,2110700.0,0.0,0.0,0.0,0.000216,0.0,1.0,0.0,3.2e-05,...,25100.0,0.0,252.0,252.0,42613.0,252.0,252.0,12600.0,50.0,0.0
1,0.0,2120200.0,0.0,0.0,0.0,0.000282,0.0,1.0,0.0,3.6e-05,...,179.245283,0.0,195.0,196.0,42809.0,196.0,196.0,14800.0,5300.0,0.0
2,0.0,2127600.0,0.0,0.0,0.0,0.000227,0.0,1.0,0.0,2.8e-05,...,1057.142857,0.0,150.0,150.0,42959.0,150.0,150.0,8100.0,700.0,0.0
3,0.0,2136100.0,0.0,0.0,0.0,0.000319,0.0,1.0,0.0,3.2e-05,...,64.582059,0.0,176.0,176.0,43135.0,176.0,176.0,29349.0,13162.0,0.0
4,0.0,2144750.0,0.0,0.0,0.0,0.000223,0.0,1.0,0.0,3.3e-05,...,1922.222222,0.0,176.0,176.0,43311.0,176.0,176.0,9101.0,450.0,0.0


In [18]:
# For each row, count how many of its columns are NaN ...
missing = df.isna().sum(axis="columns")
# ... then show how many rows share each count.
missing_counts = missing.value_counts()
print(missing_counts)

0    2899
1      21
Name: count, dtype: int64


There are only 21 rows that contain missing values. This is less than 1% of the total number of observations, so dropping these rows isn't really harmful.

In [19]:
# Discard every row that has at least one missing value (pandas defaults, written out).
df = df.dropna(axis=0, how="any")


Let's see the correlation between different predictors in our dataset

We want to predict btc_market_price 4 days ahead, using the data from the most recent 3 days to do so. Therefore, for each row we keep the current day's features and create "lag" variables that hold the same features from the previous 2 days. We also create a new response column, btc_market_price_future, that holds the market price in 4 days' time.

In [20]:
# Build the modelling table: the current row's features, the same features 1 and 2
# rows back, and the response (market price 4 rows ahead).
# NOTE: shifts are by row position, not by calendar day.
N, p = df.shape
original_cols = df.columns.tolist()
y = df['btc_market_price']

# One shifted Series per (column, lag) pair, ordered col_lag1, col_lag2 for each column.
lagged = {
    f'{col}_lag{k}': df[col].shift(k)
    for col in original_cols
    for k in (1, 2)
}
# assign() works on a copy, so `df` itself is left untouched.
df_reshaped = df.assign(**lagged)

# Response: a negative shift pulls the value from 4 rows later up to the current row.
df_reshaped['btc_market_price_future'] = y.shift(-4)

# Each original column appears three times (current, lag1, lag2), plus one response column.
assert len(df_reshaped.columns) == 3 * p + 1

# Shifting leaves NaNs at the edges (no lags at the start, no response at the end); drop them.
df_reshaped = df_reshaped.dropna()

In [21]:
# Sanity check: width of the engineered frame, width of the original frame,
# and the engineered column names.
print(df_reshaped.shape[1])
print(df.shape[1])
print(df_reshaped.columns)

70
23
Index(['btc_market_price', 'btc_total_bitcoins', 'btc_market_cap',
       'btc_trade_volume', 'btc_blocks_size', 'btc_avg_block_size',
       'btc_n_orphaned_blocks', 'btc_n_transactions_per_block',
       'btc_median_confirmation_time', 'btc_hash_rate', 'btc_difficulty',
       'btc_miners_revenue', 'btc_transaction_fees',
       'btc_cost_per_transaction_percent', 'btc_cost_per_transaction',
       'btc_n_unique_addresses', 'btc_n_transactions',
       'btc_n_transactions_total', 'btc_n_transactions_excluding_popular',
       'btc_n_transactions_excluding_chains_longer_than_100',
       'btc_output_volume', 'btc_estimated_transaction_volume',
       'btc_estimated_transaction_volume_usd', 'btc_market_price_lag1',
       'btc_market_price_lag2', 'btc_total_bitcoins_lag1',
       'btc_total_bitcoins_lag2', 'btc_market_cap_lag1', 'btc_market_cap_lag2',
       'btc_trade_volume_lag1', 'btc_trade_volume_lag2',
       'btc_blocks_size_lag1', 'btc_blocks_size_lag2',
       'btc_avg_bl

Now we split the data, using the first 80% for training and the last 20% as a test set. Note that we do not split randomly because the data is a time series: the model should be trained on the past and evaluated on the future.

In [22]:
# Chronological 80/20 split: the earliest rows train the model, the latest rows test it.
train_size = int(0.8 * len(df_reshaped))
train_df, test_df = df_reshaped.iloc[:train_size], df_reshaped.iloc[train_size:]

# Separate the response from the predictors, which is the form sklearn expects.
response_col_name = 'btc_market_price_future'
X_train, y_train = train_df.drop(columns=response_col_name), train_df[response_col_name]
X_test, y_test = test_df.drop(columns=response_col_name), test_df[response_col_name]
X_train.head()

Unnamed: 0,btc_market_price,btc_total_bitcoins,btc_market_cap,btc_trade_volume,btc_blocks_size,btc_avg_block_size,btc_n_orphaned_blocks,btc_n_transactions_per_block,btc_median_confirmation_time,btc_hash_rate,...,btc_n_transactions_excluding_popular_lag1,btc_n_transactions_excluding_popular_lag2,btc_n_transactions_excluding_chains_longer_than_100_lag1,btc_n_transactions_excluding_chains_longer_than_100_lag2,btc_output_volume_lag1,btc_output_volume_lag2,btc_estimated_transaction_volume_lag1,btc_estimated_transaction_volume_lag2,btc_estimated_transaction_volume_usd_lag1,btc_estimated_transaction_volume_usd_lag2
2,0.0,2127600.0,0.0,0.0,0.0,0.000227,0.0,1.0,0.0,2.8e-05,...,196.0,252.0,196.0,252.0,14800.0,12600.0,5300.0,50.0,0.0,0.0
3,0.0,2136100.0,0.0,0.0,0.0,0.000319,0.0,1.0,0.0,3.2e-05,...,150.0,196.0,150.0,196.0,8100.0,14800.0,700.0,5300.0,0.0,0.0
4,0.0,2144750.0,0.0,0.0,0.0,0.000223,0.0,1.0,0.0,3.3e-05,...,176.0,150.0,176.0,150.0,29349.0,8100.0,13162.0,700.0,0.0,0.0
5,0.0,2152850.0,0.0,0.0,0.0,0.000291,0.0,1.0,0.0,3e-05,...,176.0,176.0,176.0,176.0,9101.0,29349.0,450.0,13162.0,0.0,0.0
6,0.0,2162150.0,0.0,0.0,0.0,0.000228,0.0,1.0,0.0,3.5e-05,...,165.0,176.0,165.0,176.0,13399.0,9101.0,5250.0,450.0,0.0,0.0


Now we'll naively apply linear regression just to see what accuracy we get

In [23]:
# Baseline: ordinary least squares on all of the raw (unscaled) predictors.
lr = LinearRegression().fit(X_train, y_train)

# NOTE: this R^2 is measured on the training data, so it does not measure generalisation.
lr_Rsquared_score = lr.score(X_train, y_train)
print("Naive R^2 score: ", lr_Rsquared_score)

# Predict the held-out set once and reuse the result under both names
# (the prediction used to be computed twice).
pred = lr.predict(X_test)
yhat = pred

# Root mean squared error on the test set, in the same units as the price.
loss = mean_squared_error(y_test, yhat)
print("RMSE: ", np.sqrt(loss))

Naive R^2 score:  0.9854199669244095
RMSE:  834.2082934642567


R^2 is practically 1 on the training set and our Root Mean Squared Error on the test set is only ~$800, which is already quite good. This is likely because we have included predictors such as the current and previous market_price and market_cap, which are very highly correlated with the market price a few days later.

In [26]:
# Rank every column by its correlation with the 4-day-ahead price, strongest positive first.
future_corr = df_reshaped.corr().loc[:, 'btc_market_price_future']
future_corr.sort_values(ascending=False)

btc_market_price_future                  1.000000
btc_market_price                         0.989783
btc_market_cap                           0.989521
btc_market_price_lag1                    0.987504
btc_market_cap_lag1                      0.987222
                                           ...   
btc_cost_per_transaction_percent_lag1   -0.011026
btc_cost_per_transaction_percent_lag2   -0.012509
btc_n_orphaned_blocks_lag1              -0.083645
btc_n_orphaned_blocks_lag2              -0.083720
btc_n_orphaned_blocks                   -0.083866
Name: btc_market_price_future, Length: 70, dtype: float64

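Let's standardise all of the predictors (current and lagged) and fit a ridge regression, choosing the regularisation strength by cross-validation.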

In [28]:
# The ridge penalty depends on feature scale, so standardise the predictors first.
# The scaler is fitted on the training set only and then applied to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regularisation strengths; RidgeCV picks the best by cross-validation.
alphas = [0.1, 1.0, 10.0, 100.0]

# `store_cv_values=True` is intentionally not passed: the stored values were never read,
# and the argument is deprecated in scikit-learn 1.5 and removed in 1.7.
# NOTE(review): with the default `cv=None`, RidgeCV uses leave-one-out CV, which ignores
# the time ordering of the rows - confirm that is acceptable for this data.
ridge_cv = RidgeCV(alphas=alphas)
ridge_cv.fit(X_train_scaled, y_train)
print("Best alpha:", ridge_cv.alpha_)

# Evaluate on the held-out (later) 20% of the data.
y_pred = ridge_cv.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = ridge_cv.score(X_test_scaled, y_test)
print("R^2:", r2)
print("RMSE:", rmse)

Best alpha: 0.1
R^2: 0.9641527659754008
RMSE: 821.9259421733764


Even with ridge regression there are some unnecessary variables such as "market_cap" which is just a linear transformation of 