In [3]:
!pip install -U scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   --- ------------------------------------ 0.8/8.0 MB 6.1 MB/s eta 0:00:02
   ---------------------------------------  7.9/8.0 MB 24.6 MB/s eta 0:00:01
   ---------------------------------------- 8.0/8.0 MB 23.3 MB/s  0:00:00
Downloading joblib-1.5.3-py3-none-any.whl (309 kB)
Downloading scipy-1.16.3-cp313-cp313-win_amd64.whl (38.5 MB)
   ---


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

plt.rcParams["figure.figsize"] = (12, 5)


In [7]:
start_date = "2016-01-01"
end_date = datetime.today().strftime("%Y-%m-%d")

btc = yf.download("BTC-USD", start=start_date, end=end_date, auto_adjust=False)
gold = yf.download("GC=F", start=start_date, end=end_date, auto_adjust=False)


btc_close = btc["Close"]
if isinstance(btc_close, pd.DataFrame):
    btc_close = btc_close.iloc[:, 0]
btc_close.name = "BTC_Close"

gold_close = gold["Close"]
if isinstance(gold_close, pd.DataFrame):
    gold_close = gold_close.iloc[:, 0]
gold_close.name = "Gold_Close"

data = pd.concat([btc_close, gold_close], axis=1).dropna()

data.head(), data.shape


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


(             BTC_Close   Gold_Close
 Date                               
 2016-01-04  433.091003  1075.099976
 2016-01-05  431.959991  1078.400024
 2016-01-06  429.105011  1091.900024
 2016-01-07  458.048004  1107.699951
 2016-01-08  453.230011  1097.800049,
 (2518, 2))

In [8]:
# Log returns
log_returns = np.log(data / data.shift(1)).dropna()
log_returns.columns = ["BTC_lr", "Gold_lr"]

# Rolling 30-day volatility 
vol30 = log_returns.rolling(30).std()
vol30.columns = ["BTC_vol30", "Gold_vol30"]

df = pd.concat([log_returns, vol30], axis=1).dropna()
df["BTC_lr_lag1"] = df["BTC_lr"].shift(1)
df["Gold_lr_lag1"] = df["Gold_lr"].shift(1)

df = df.dropna()
df.head(), df.shape


(              BTC_lr   Gold_lr  BTC_vol30  Gold_vol30  BTC_lr_lag1  \
 Date                                                                 
 2016-02-18  0.014430  0.012309   0.046036    0.013640     0.021447   
 2016-02-19 -0.003767  0.003501   0.046026    0.013553     0.014430   
 2016-02-22  0.039521 -0.017132   0.044983    0.013935    -0.003767   
 2016-02-23 -0.039638  0.010527   0.045495    0.013815     0.039521   
 2016-02-24  0.009978  0.013328   0.045523    0.013896    -0.039638   
 
             Gold_lr_lag1  
 Date                      
 2016-02-18      0.002646  
 2016-02-19      0.012309  
 2016-02-22      0.003501  
 2016-02-23     -0.017132  
 2016-02-24      0.010527  ,
 (2487, 6))

In [9]:
train = df.loc[: "2023-12-31"].copy()
test  = df.loc["2024-01-01":].copy()

features = ["Gold_lr", "BTC_lr_lag1", "Gold_lr_lag1", "BTC_vol30", "Gold_vol30"]
target = "BTC_lr"

X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

X_train.shape, X_test.shape


((1978, 5), (509, 5))

In [21]:
pred_base = np.zeros(len(y_test))
mae_base = mean_absolute_error(y_test, pred_base)
rmse_base = np.sqrt(mean_squared_error(y_test, pred_base))
r2_base = r2_score(y_test, pred_base)


lin = LinearRegression()
lin.fit(X_train, y_train)
pred_lin = lin.predict(X_test)

mae_lin = mean_absolute_error(y_test, pred_lin)
rmse_lin = np.sqrt(mean_squared_error(y_test, pred_lin))
r2_lin = r2_score(y_test, pred_lin)

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
r2_rf = r2_score(y_test, pred_rf)


results = pd.DataFrame({
    "Model": ["Baseline(0)", "LinearRegression", "RandomForest"],
    "MAE": [mae_base, mae_lin, mae_rf],
    "RMSE": [rmse_base, rmse_lin, rmse_rf],
    "R2": [r2_base, r2_lin, r2_rf],
})

results


Unnamed: 0,Model,MAE,RMSE,R2
0,Baseline(0),0.022636,0.030604,-0.002456
1,LinearRegression,0.022733,0.030809,-0.015922
2,RandomForest,0.022857,0.030667,-0.006553


In [13]:
df_vol = df.copy()

df_vol["BTC_vol30_next"] = df_vol["BTC_vol30"].shift(-1)

df_vol = df_vol.dropna()

df_vol.head(), df_vol.shape


(              BTC_lr   Gold_lr  BTC_vol30  Gold_vol30  BTC_lr_lag1  \
 Date                                                                 
 2016-02-18  0.014430  0.012309   0.046036    0.013640     0.021447   
 2016-02-19 -0.003767  0.003501   0.046026    0.013553     0.014430   
 2016-02-22  0.039521 -0.017132   0.044983    0.013935    -0.003767   
 2016-02-23 -0.039638  0.010527   0.045495    0.013815     0.039521   
 2016-02-24  0.009978  0.013328   0.045523    0.013896    -0.039638   
 
             Gold_lr_lag1  BTC_vol30_next  
 Date                                      
 2016-02-18      0.002646        0.046026  
 2016-02-19      0.012309        0.044983  
 2016-02-22      0.003501        0.045495  
 2016-02-23     -0.017132        0.045523  
 2016-02-24      0.010527        0.045236  ,
 (2486, 7))

In [14]:
train_v = df_vol.loc[: "2023-12-31"].copy()
test_v  = df_vol.loc["2024-01-01":].copy()

features_v = ["BTC_vol30", "Gold_vol30", "BTC_lr", "Gold_lr", "BTC_lr_lag1", "Gold_lr_lag1"]
target_v = "BTC_vol30_next"

X_train_v, y_train_v = train_v[features_v], train_v[target_v]
X_test_v, y_test_v = test_v[features_v], test_v[target_v]

X_train_v.shape, X_test_v.shape


((1978, 6), (508, 6))

In [15]:

pred_base_v = X_test_v["BTC_vol30"].values

mae_base_v = mean_absolute_error(y_test_v, pred_base_v)
rmse_base_v = np.sqrt(mean_squared_error(y_test_v, pred_base_v))
r2_base_v = r2_score(y_test_v, pred_base_v)

mae_base_v, rmse_base_v, r2_base_v


(0.0007622488603812455, np.float64(0.0014019137772830323), 0.9677556505587929)

In [16]:
lin_v = LinearRegression()
lin_v.fit(X_train_v, y_train_v)
pred_lin_v = lin_v.predict(X_test_v)

mae_lin_v = mean_absolute_error(y_test_v, pred_lin_v)
rmse_lin_v = np.sqrt(mean_squared_error(y_test_v, pred_lin_v))
r2_lin_v = r2_score(y_test_v, pred_lin_v)

mae_lin_v, rmse_lin_v, r2_lin_v


(0.0007890843587782631, np.float64(0.0013981918668655698), 0.9679266329334456)

In [17]:
rf_v = RandomForestRegressor(
    n_estimators=400,
    max_depth=8,
    random_state=42,
    n_jobs=-1
)
rf_v.fit(X_train_v, y_train_v)
pred_rf_v = rf_v.predict(X_test_v)

mae_rf_v = mean_absolute_error(y_test_v, pred_rf_v)
rmse_rf_v = np.sqrt(mean_squared_error(y_test_v, pred_rf_v))
r2_rf_v = r2_score(y_test_v, pred_rf_v)

mae_rf_v, rmse_rf_v, r2_rf_v


(0.0010693471165764087, np.float64(0.0018916285809469607), 0.941293965433259)

In [18]:
results_v = pd.DataFrame({
    "Model": ["Baseline(persist)", "LinearRegression", "RandomForest"],
    "MAE": [mae_base_v, mae_lin_v, mae_rf_v],
    "RMSE": [rmse_base_v, rmse_lin_v, rmse_rf_v],
    "R2": [r2_base_v, r2_lin_v, r2_rf_v],
})

results_v


Unnamed: 0,Model,MAE,RMSE,R2
0,Baseline(persist),0.000762,0.001402,0.967756
1,LinearRegression,0.000789,0.001398,0.967927
2,RandomForest,0.001069,0.001892,0.941294


In [20]:
imp_v = pd.Series(rf_v.feature_importances_, index=features_v).sort_values(ascending=False)
imp_v


BTC_vol30       0.986738
Gold_vol30      0.005310
Gold_lr_lag1    0.002675
BTC_lr_lag1     0.002181
BTC_lr          0.001807
Gold_lr         0.001288
dtype: float64

The feature importance results show that "BTC_vol30" is by far the most important feature for predicting next-day Bitcoin volatility. Gold related features contribute very little, which suggests Bitcoin’s risk behavior is mainly driven by its own past volatility rather than gold market movements.

In Phase 3, I used ML models to predict next-day Bitcoin volatility(30-day rolling volatility).  I used a time-based split (train: 2016–2023, test: 2024+), and compared a persistence baseline with Linear Regression and Random Forest.

The baseline already performs very well, and Linear Regression performs similarly. Random Forest performs worse in this setup.  Overall, the results suggest that Bitcoin volatility is mostly explained by its own past volatility, and gold variables add little predictive value.
