# Analyze Effectiveness of Various Machine Learning Models
## Predicting the price direction for bitcoin

### Import required libraries and metrics

In [1]:
# Import libraries
import pandas as pd
import hvplot.pandas
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Process Data

# Location of the daily bitcoin metrics export
btc_metrics_file = 'data/btc_metrics.csv'

# Read the metrics with the 'date' column parsed into the index.
# `infer_datetime_format` is no longer passed: it is deprecated in pandas 2.x
# (format inference is the default there) and was only a parsing speed hint
# before that, so the loaded values are unchanged.
btc_df = pd.read_csv(
    btc_metrics_file,
    index_col='date',
    parse_dates=True,
)

# Show both ends of the frame to confirm the covered date range
display(btc_df.head(2))
display(btc_df.tail(2))

Unnamed: 0_level_0,price,price_pct_change,a_sopr,puell_multiple,exchange_netflow,difficulty_compression_band,mvrv_z_score,nonzero_balance_addresses,%_utxo_in_profit,nvt,nupl,stablecoin_supply,rhodl,cvdd,rpv,balanced_price,investor_capitalization
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016-01-01,434.883982,,1.0179,1.430684,3084.261365,0.149233,0.707518,6810666.0,0.829427,7.160297,0.310327,,336.693607,169.528788,0.000537,232.840772,3017275000.0
2016-01-02,434.92242,8.8e-05,1.006339,1.539487,-1595.518899,0.151165,0.707028,6795517.0,0.8252,7.16968,0.308427,,322.022994,169.51377,0.000265,232.941445,3017969000.0


Unnamed: 0_level_0,price,price_pct_change,a_sopr,puell_multiple,exchange_netflow,difficulty_compression_band,mvrv_z_score,nonzero_balance_addresses,%_utxo_in_profit,nvt,nupl,stablecoin_supply,rhodl,cvdd,rpv,balanced_price,investor_capitalization
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-12-07,50592.088498,0.001974,1.040021,1.183472,-14231.298949,0.064475,2.130263,39113906.0,0.893479,16.819511,0.515705,-0.458717,10066.092817,12270.999536,0.00389,20595.60386,428317200000.0
2021-12-08,50513.855144,-0.001546,,,,,,,,,,,,,,,


### Refine the metrics into relevant formats and create signals

In [3]:
# Create a clean dataframe in which to modify metrics.
# An explicit copy is required: a plain `dataset = btc_df` only binds a second
# name to the same object, so the columns added below would also be written
# into the raw frame.
dataset = btc_df.copy()

# Convert metrics that only go up into daily percent change
dataset['nonzero_addy_pct_change'] = dataset['nonzero_balance_addresses'].pct_change()
dataset['investor_capitalization_pct_change'] = dataset['investor_capitalization'].pct_change()
dataset['balanced_price_change'] = dataset['balanced_price'].pct_change()
dataset['cvdd_change'] = dataset['cvdd'].pct_change()

# Remove the unmodified columns & eliminate metrics that don't exist for the entire data range
# (the raw 'cvdd' column is intentionally left in place, as before)
dataset = dataset.drop(
    columns=[
        'investor_capitalization',
        'stablecoin_supply',
        'nonzero_balance_addresses',
        'balanced_price',
    ]
)

In [4]:
# Heatmap of pairwise correlations between every metric except the raw price,
# used to help decide which metrics to keep
display(
    dataset
    .drop(columns=['price'])
    .corr()
    .hvplot.heatmap(height=700, width=800, rot=90)
)
# Image export is disabled; uncomment to save through matplotlib
#plt.savefig('images/correlation_matrix.png')

<Figure size 432x288 with 0 Axes>

In [5]:
# Line plot of every column in the dataset, to look for correlations or extreme values
display(
    dataset.hvplot(
        height=500,
        width=1000,
        rot=90,
    )
)

# Image export is disabled; uncomment to save through matplotlib
#plt.savefig('images/dataset_plot.png')

<Figure size 432x288 with 0 Axes>

In [6]:
# Initialize the signal dataframe on the same date index as `dataset`
signal_df = pd.DataFrame(index=dataset.index)

# Daily price change with a short (2-day) and long (10-day) simple moving average
signal_df['price_pct_change'] = dataset['price_pct_change']
signal_df['price_change_ssma'] = signal_df['price_pct_change'].rolling(window=2).mean()
signal_df['price_change_lsma'] = signal_df['price_pct_change'].rolling(window=10).mean()

# Share of UTXOs in profit with 10-day and 45-day moving averages
signal_df['%_utxo_in_profit'] = dataset['%_utxo_in_profit']
signal_df['utxo_change_ssma'] = signal_df['%_utxo_in_profit'].rolling(window=10).mean()
signal_df['utxo_change_lsma'] = signal_df['%_utxo_in_profit'].rolling(window=45).mean()

# Difficulty compression band is used as-is
signal_df['difficulty_compression_band'] = dataset['difficulty_compression_band']

# Daily percent change of exchange netflow (the column keeps the raw metric's name).
# NOTE(review): netflow is signed, so this ratio can be extreme
# (e.g. -34.79 on 2016-02-18 in the stored output) -- confirm this is intended.
signal_df['exchange_netflow'] = dataset['exchange_netflow'].pct_change()

# Daily percent change of rhodl
signal_df['rhodl_change'] = dataset['rhodl'].pct_change()

# Daily percent change of the puell multiple (the column keeps the raw metric's name)
signal_df['puell_multiple'] = dataset['puell_multiple'].pct_change()

# Daily percent change of a_sopr with 2-day and 14-day moving averages
signal_df['asopr_change'] = dataset['a_sopr'].pct_change()
signal_df['asopr_change_ssma'] = signal_df['asopr_change'].rolling(window=2).mean()
signal_df['asopr_change_lsma'] = signal_df['asopr_change'].rolling(window=14).mean()

# Daily change in non-zero-balance addresses with 7-day and 21-day moving averages
signal_df['nonzero_addy_pct_change'] = dataset['nonzero_addy_pct_change']
signal_df['addy_change_ssma'] = signal_df['nonzero_addy_pct_change'].rolling(window=7).mean()
signal_df['addy_change_lsma'] = signal_df['nonzero_addy_pct_change'].rolling(window=21).mean()

# Drop the warm-up rows that still contain NaN, then define the signal:
# 1 ('buy') when the day's price change is >= 0, otherwise 0 ('sell').
# A single boolean cast replaces the earlier default-then-overwrite sequence;
# .assign() works on a fresh copy, so nothing is mutated in place.
signal_df = signal_df.dropna().assign(
    signal=lambda frame: (frame['price_pct_change'] >= 0).astype('int64')
)

# Review the signal df
signal_df.head()

Unnamed: 0_level_0,price_pct_change,price_change_ssma,price_change_lsma,%_utxo_in_profit,utxo_change_ssma,utxo_change_lsma,difficulty_compression_band,exchange_netflow,rhodl_change,puell_multiple,asopr_change,asopr_change_ssma,asopr_change_lsma,nonzero_addy_pct_change,addy_change_ssma,addy_change_lsma,signal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016-02-14,0.040762,0.031179,0.004819,0.807892,0.743173,0.766557,0.189377,1.970932,0.010025,0.092198,0.01085,0.007335,0.001514,0.004287,0.003562,0.000423,1
2016-02-15,-0.019101,0.010831,0.003981,0.792673,0.746975,0.76574,0.192431,-0.378469,0.074463,-0.078748,-0.009019,0.000915,0.002671,0.00337,0.003636,0.00056,0
2016-02-16,0.021138,0.001019,0.008362,0.810395,0.756294,0.765411,0.191979,-3.583639,0.077703,-0.092366,0.007247,-0.000886,0.00168,0.003977,0.004022,0.000661,1
2016-02-17,0.018809,0.019974,0.009923,0.812491,0.765256,0.765373,0.19162,-1.009661,0.046359,0.154033,0.013427,0.010337,0.003091,0.004183,0.004329,0.000819,1
2016-02-18,0.015107,0.016958,0.012818,0.820258,0.778057,0.76535,0.19135,-34.787963,0.004699,0.102681,-0.01773,-0.002152,0.000522,0.005311,0.004451,0.001067,1


### Separate the metrics to analyze and predict, train and test

In [7]:
# Define X: drop the target and four of the raw series, then shift every
# remaining feature down one row so each date is paired with the previous
# day's values. The shift leaves the first row all-NaN, and dropna() removes it.
X = (
    signal_df
    .drop(columns=['signal', 'asopr_change', 'rhodl_change', 'nonzero_addy_pct_change', '%_utxo_in_profit'])
    .shift()
    .dropna()
)

# Define y on exactly the dates that survived in X, so X and y always have the
# same length and index (previously y kept the row that X had dropped).
y = signal_df.loc[X.index, 'signal']

# Remove incomplete rows from the wider metrics frame and label each day:
# 1.0 when the price change is >= 0, otherwise 0.0
dataset = dataset.dropna().assign(
    signal=lambda frame: (frame['price_pct_change'] >= 0).astype(float)
)
dataset.head()

In [8]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from pandas.tseries.offsets import DateOffset

# Chronological split: the first three months of X form the training window
training_begin = X.index.min()
training_end = training_begin + DateOffset(months=3)

# Training rows (label-based slicing includes both end dates)
X_train, y_train = X.loc[training_begin:training_end], y.loc[training_begin:training_end]

# Testing rows start the day after the training window closes
X_test, y_test = (
    X.loc[training_end + DateOffset(days=1):],
    y.loc[training_end + DateOffset(days=1):],
)

# Create a RobustScaler instance
scaler = RobustScaler()

# Fit the scaler on the training features only
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Instantiate Logistic Regression Model

In [9]:
# machine learning libraries
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.linear_model import LogisticRegression

# Create the logistic regression classifier and train it on the scaled
# training data in one step (fit() hands back the fitted estimator)
lr_model = LogisticRegression().fit(X_train_scaled, y_train)

# Predict the signal for every row of the scaled testing data
lr_model_pred = lr_model.predict(X_test_scaled)

# Show the predicted values
lr_model_pred

array([1, 1, 1, ..., 1, 1, 1])

In [10]:
# Score the LR predictions against the true test labels
testing_report = classification_report(y_true=y_test, y_pred=lr_model_pred)

# Print the heading followed by the report
print('LR Report', testing_report, sep='\n')

LR Report
              precision    recall  f1-score   support

           0       0.54      0.08      0.14       926
           1       0.55      0.94      0.69      1106

    accuracy                           0.55      2032
   macro avg       0.54      0.51      0.42      2032
weighted avg       0.54      0.55      0.44      2032



In [11]:
# Start the results frame from the daily price change, aligned to the test dates
predictions_df = dataset['price_pct_change'].reindex(X_test.index).to_frame()

# Store the LR model predictions next to the actual price change
predictions_df['LR Predicted'] = lr_model_pred

# Strategy return: the day's price change when the prediction is 1, zero when it is 0
predictions_df['LR Strategy Returns'] = predictions_df['price_pct_change'].mul(
    predictions_df['LR Predicted']
)

# Review both ends of the DataFrame
display(predictions_df.head(3), predictions_df.tail(3))

Unnamed: 0_level_0,price_pct_change,LR Predicted,LR Strategy Returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-16,-0.013778,1,-0.013778
2016-05-17,-0.00171,1,-0.00171
2016-05-18,0.002451,1,0.002451


Unnamed: 0_level_0,price_pct_change,LR Predicted,LR Strategy Returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-12-05,0.004329,1,0.004329
2021-12-06,0.026793,1,0.026793
2021-12-07,0.001974,1,0.001974


In [12]:
# Cumulative growth of the actual price change versus the LR strategy returns
lr_returns_plot = (
    predictions_df[['price_pct_change', 'LR Strategy Returns']]
    .add(1)
    .cumprod()
    .hvplot(
        title='LR Strategy Returns',
        height=500,
        width=1200,
        ylabel='Return (NOT %)',
    )
)

# Image export is disabled; uncomment to save through matplotlib
#plt.savefig('images/LR_returns_vs_HODL.png')

# Display LR strategy Returns
lr_returns_plot

<Figure size 432x288 with 0 Axes>

### Generate the Neural Network

In [19]:
# Seed Python, NumPy and TensorFlow before any layer is created, so weight
# initialisation and batch shuffling can be repeated from run to run
from tensorflow.keras.utils import set_random_seed

NN_SEED = 42
set_random_seed(NN_SEED)

# Instantiate the model
nn = Sequential()

# Define neuron data values; the input width is taken from the matrix the
# network is actually trained on
inputs = X_train_scaled.shape[1]
L1_nodes = 20
L2_nodes = 10

# Create the first layer
nn.add(Dense(units=L1_nodes, input_dim=inputs, activation='relu'))

# Create the second layer
nn.add(Dense(units=L2_nodes, activation='relu'))

# Create the output layer: one sigmoid unit for the binary signal
nn.add(Dense(units=1, activation='sigmoid'))

# Compile the model
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

# Fit the model and keep the returned training history
model_robust = nn.fit(X_train_scaled, y_train, epochs=150, batch_size=16)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [20]:
# Run the trained network on the scaled testing features
predicted_y = nn.predict(x=X_test_scaled)

# Show the raw sigmoid outputs
predicted_y

array([[0.8725821 ],
       [0.83662   ],
       [0.4356387 ],
       ...,
       [0.99977934],
       [0.99991775],
       [0.8112062 ]], dtype=float32)

In [21]:
# Compute the loss and accuracy of the network on the test data
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)

# Report both metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

64/64 - 0s - loss: 2.5029 - accuracy: 0.5546 - 208ms/epoch - 3ms/step
Loss: 2.5028817653656006, Accuracy: 0.5546259880065918


In [22]:
# Review summary of NN model parameters: prints each layer with its output
# shape and parameter count, followed by the totals
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 20)                260       
                                                                 
 dense_4 (Dense)             (None, 10)                210       
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 481
Trainable params: 481
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Convert NN predictions to buy or sell signals (1 or 0) and add to predictions_df.
# The network output has shape (n, 1). ndarray.flatten() returns a NEW 1-D
# array rather than reshaping in place, so its result is used directly here
# (the earlier bare `predicted_y.flatten()` call had no effect).
predictions_df['predicted_signal_nn'] = np.where(predicted_y.flatten() >= 0.5, 1, 0)

# Calculate & add the NN strategy returns to the DataFrame
predictions_df['NN Strategy Returns'] = predictions_df['price_pct_change'] * predictions_df['predicted_signal_nn']

# Calculate the difference between the returns of the two strategies
predictions_df['returns_difference'] = predictions_df['NN Strategy Returns'] - predictions_df['LR Strategy Returns']

# Review df
predictions_df.head()

Unnamed: 0_level_0,price_pct_change,LR Predicted,LR Strategy Returns,predicted_signal_nn,NN Strategy Returns,returns_difference
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-05-16,-0.013778,1,-0.013778,1,-0.013778,0.0
2016-05-17,-0.00171,1,-0.00171,1,-0.00171,0.0
2016-05-18,0.002451,1,0.002451,0,0.0,-0.002451
2016-05-19,-0.036067,1,-0.036067,1,-0.036067,0.0
2016-05-20,0.015203,1,0.015203,1,0.015203,0.0


In [24]:
# Cumulative growth of the actual price change (HODL) versus the NN strategy returns
nn_returns_plot = (
    predictions_df[['price_pct_change', 'NN Strategy Returns']]
    .add(1)
    .cumprod()
    .hvplot(
        title='NN Strategy Returns',
        height=500,
        width=1200,
        ylabel='Return (NOT %)',
    )
)

# Image export is disabled; uncomment to save through matplotlib
#plt.savefig('images/NN_returns_vs_HODL.png')

# Display NN Returns vs HODL
nn_returns_plot

<Figure size 432x288 with 0 Axes>

In [25]:
# Daily gap between the two strategies (NN minus LR), followed by a preview
predictions_df['returns_difference'] = predictions_df['NN Strategy Returns'].sub(
    predictions_df['LR Strategy Returns']
)
predictions_df.head()

Unnamed: 0_level_0,price_pct_change,LR Predicted,LR Strategy Returns,predicted_signal_nn,NN Strategy Returns,returns_difference
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-05-16,-0.013778,1,-0.013778,1,-0.013778,0.0
2016-05-17,-0.00171,1,-0.00171,1,-0.00171,0.0
2016-05-18,0.002451,1,0.002451,0,0.0,-0.002451
2016-05-19,-0.036067,1,-0.036067,1,-0.036067,0.0
2016-05-20,0.015203,1,0.015203,1,0.015203,0.0


In [26]:
# Plot the actual returns versus the strategy returns
model_difference = predictions_df['returns_difference'].hvplot(
    title='Difference Between Strategy Returns',
    height=500,
    width=1200,
    ylabel='Model Return Discrepancies'
)

# Use .savefig to export the plot image 
#plt.savefig('images/model_returns_difference.png')

# Display Plot
model_difference

<Figure size 432x288 with 0 Axes>

In [27]:
# Combine the LR and NN return charts into one figure.
# NOTE(review): `*` is expected to overlay the two hvPlot objects on shared
# axes (HoloViews overlay operator) -- confirm against the rendered output.
lr_nn_hodl = lr_returns_plot * nn_returns_plot

# Image export is disabled; uncomment to save through matplotlib
#plt.savefig('images/lr_nn_hodl.png')

# Display Plot
lr_nn_hodl

<Figure size 432x288 with 0 Axes>

What I discovered:  
- Accuracy of the model was extremely sensitive to any change in the data. Depending upon the exact duration of the SMAs, the models either massively underperformed or just slightly underperformed. Also, at the end of the project, I refreshed all the data to incorporate the recent crypto market drawdown (in the previous week, Bitcoin fell from its price in a violent 20-25% crash). This new data completely changed the performance of the models! That is not a good thing.  
    
- Also, the markets have fundamentally changed as more and more data has accumulated on this nascent asset class. For example, people did not previously pay as close attention to the hashrate of the bitcoin network, whereas in the current market cycle lots of attention has been paid to the difficulty rates. The NN model did not outperform overall without that metric: with it, the model outperforms only in recent years, whereas without it, it outperformed in the bear markets of the previous cycle. 

- To summarize, I would not trust these models to execute trades with real money without significant refinement.  
    
- Potential improvements would be to obtain additional relevant data, like derivatives market data. My API did not include access to derivatives data beyond the last 7 days, and that metric has a substantial impact on market volatility. Another improvement would be a multifaceted signal, triggered when these various metrics hit different parameters, rather than just `NGU` or `NGD` as seen through the daily price percent change. A further improvement would be a trading strategy with different methods of buying and selling: for example, buying a partial position each day the buy signal is displayed, selling a percentage of the position as the signal turns bearish, and also having a range where you simply HODL.  
    
- One final point of concern about this model is that it doesn't take into account the potential transaction fees, nor does it take into account capital gains or losses.  