# **Bitcoin Movement Prediction**

---
In this section of our project we aim to predict the movement of cryptocurrencies using 'LSTM' networks, a type of recurrent neural network. The data includes historical Bitcoin price data, as well as sentiment scores obtained through two distinct approaches: VADER and BERT.


# **Install Required Libraries**

---



In [None]:
# Provides tools for diverse machine learning tasks, including data preprocessing, model development,
# evaluation, and hyperparameter tuning.
!pip install scikit-learn

# To develop neural networks
!pip install keras

# Install the TensorFlow machine learning framework
!pip install tensorflow

# For displaying results in an organized manner.
!pip install PrettyTable

# **Import Required Libraries**

---



In [None]:
# Importing the pandas library to manipulate data
import pandas as pd

# Importing the numpy library for  numerical operations
import numpy as np

# Importing resample for class balancing
from sklearn.utils import resample

# to split the data into training and test groups
from sklearn.model_selection import train_test_split

# Machine learning task framework
import tensorflow as tf

# Metrics for assessing model performance
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Importing GridSearchCV to discover good hyperparameters for the models
from sklearn.model_selection import GridSearchCV

# Regarding the usage of Keras with scikit-learn
from keras.wrappers.scikit_learn import KerasClassifier

# For the step-by-step development of neural networks
from keras.models import Sequential
# Importing Neural network subcomponents
from keras.layers import LSTM, Dense, Bidirectional, Dropout

# For Normalizing the data to ensure consistent scaling
from sklearn.preprocessing import MinMaxScaler

#For Creating organized and clear result tables
from prettytable import PrettyTable

# Seed NumPy's and TensorFlow's global random number generators with a fixed
# value for reproducibility.
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Mount Google Drive (Colab only) so the CSV files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted drive that holds every input CSV file.
base_path = '/content/drive/My Drive/MyData/'

# Load the sentiment scores and one BTC price file per year (2017-2019).
Sentiment_Scores = pd.read_csv(base_path + 'Sentiment_Scores_Dataset.csv')
data_2017 = pd.read_csv(base_path + 'BTC-2017min.csv')
data_2018 = pd.read_csv(base_path + 'BTC-2018min.csv')
data_2019 = pd.read_csv(base_path + 'BTC-2019min.csv')

Mounted at /content/drive


# **Overview the Data**

---



In [None]:
# Preview the first rows of the 2017 price file.
# NOTE(review): the preview shows day-first dates ('31/12/2017 23:59'), unlike
# the ISO-style dates in the 2018 and 2019 previews.
data_2017.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USD
0,1514764740,31/12/2017 23:59,BTC/USD,13913.28,13913.28,13867.18,13880.0,0.591748,8213.456549
1,1514764680,31/12/2017 23:58,BTC/USD,13913.26,13953.83,13884.69,13953.77,1.398784,19518.30966
2,1514764620,31/12/2017 23:57,BTC/USD,13908.73,13913.26,13874.99,13913.26,0.775012,10782.94429
3,1514764560,31/12/2017 23:56,BTC/USD,13827.0,13908.69,13827.0,13859.58,0.666459,9236.841134
4,1514764500,31/12/2017 23:55,BTC/USD,13825.05,13825.05,13825.05,13825.05,0.065501,905.56013


In [None]:
# Preview the first rows of the 2018 price file.
data_2018.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USD
0,1546300740,2018-12-31 23:59:00,BTC/USD,3689.26,3693.3,3689.26,3693.3,9.838855,36337.841473
1,1546300680,2018-12-31 23:58:00,BTC/USD,3689.26,3691.35,3689.26,3691.35,0.56,2067.156
2,1546300620,2018-12-31 23:57:00,BTC/USD,3688.83,3689.26,3688.83,3689.26,0.560833,2069.059602
3,1546300560,2018-12-31 23:56:00,BTC/USD,3687.87,3689.65,3686.92,3686.92,7.61024,28058.346208
4,1546300500,2018-12-31 23:55:00,BTC/USD,3688.28,3688.85,3685.0,3688.85,7.665703,28277.629729


In [None]:
# Preview the first rows of the 2019 price file.
data_2019.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USD
0,1577836740,2019-12-31 23:59:00,BTC/USD,7167.3,7171.22,7167.3,7168.36,0.191448,1372.366393
1,1577836680,2019-12-31 23:58:00,BTC/USD,7166.89,7167.3,7161.99,7167.3,0.568868,4077.245538
2,1577836620,2019-12-31 23:57:00,BTC/USD,7164.22,7170.8,7161.65,7166.89,3.95944,28376.874525
3,1577836560,2019-12-31 23:56:00,BTC/USD,7182.49,7182.49,7170.2,7170.2,6.248892,44805.80255
4,1577836500,2019-12-31 23:55:00,BTC/USD,7175.69,7176.68,7175.69,7176.68,0.016877,121.122623


# **Combine the Price Datasets**

---



In [None]:
# Stack the three yearly price frames into one frame; ignore_index=True
# replaces the per-file row labels with a fresh 0..n-1 index.
combined_data = pd.concat([data_2017, data_2018, data_2019], ignore_index=True)

# **Data Quality Checking**

---



In [None]:
# Preview the first rows of the combined price frame.
combined_data.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USD
0,1514764740,31/12/2017 23:59,BTC/USD,13913.28,13913.28,13867.18,13880.0,0.591748,8213.456549
1,1514764680,31/12/2017 23:58,BTC/USD,13913.26,13953.83,13884.69,13953.77,1.398784,19518.30966
2,1514764620,31/12/2017 23:57,BTC/USD,13908.73,13913.26,13874.99,13913.26,0.775012,10782.94429
3,1514764560,31/12/2017 23:56,BTC/USD,13827.0,13908.69,13827.0,13859.58,0.666459,9236.841134
4,1514764500,31/12/2017 23:55,BTC/USD,13825.05,13825.05,13825.05,13825.05,0.065501,905.56013


As is evident, the dataset contains some irrelevant columns, so we remove them using the 'drop' function.

# **Drop Unnecessary Columns**

---



In [None]:
# Remove the Unix timestamp and the constant trading-pair symbol; neither is
# used in the steps that follow.
combined_data = combined_data.drop(columns=['unix', 'symbol'])

In [None]:
# Confirm which columns remain after the drop.
print(combined_data.columns)

Index(['date', 'open', 'high', 'low', 'close', 'Volume BTC', 'Volume USD'], dtype='object')


In [None]:
# Preview the first rows of the combined frame after dropping columns.
combined_data.head()

Unnamed: 0,date,open,high,low,close,Volume BTC,Volume USD
0,31/12/2017 23:59,13913.28,13913.28,13867.18,13880.0,0.591748,8213.456549
1,31/12/2017 23:58,13913.26,13953.83,13884.69,13953.77,1.398784,19518.30966
2,31/12/2017 23:57,13908.73,13913.26,13874.99,13913.26,0.775012,10782.94429
3,31/12/2017 23:56,13827.0,13908.69,13827.0,13859.58,0.666459,9236.841134
4,31/12/2017 23:55,13825.05,13825.05,13825.05,13825.05,0.065501,905.56013


# **Check Missing Values**

---

Following that, we checked the 'combined_data' dataset for missing values.

In [None]:
# Count the missing (NaN) entries in each column and show the totals.
missing_count = combined_data.isnull().sum()
print(missing_count)

date          0
open          0
high          0
low           0
close         0
Volume BTC    0
Volume USD    0
dtype: int64


According to the result, we can confirm that there are no missing values in the dataset.

# **Check Duplicates**

---
Then, we examined the ‘combined_data’ dataset for duplicates.



In [None]:
# Count rows that repeat an earlier row in every column.
num_duplicates = combined_data.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


According to the result, we found no duplicates in the dataset.

In [None]:
# Save the cleaned, combined price data as a CSV file on the mounted drive;
# index=False leaves the row index out of the file.
path_to_save = '/content/drive/My Drive/MyData/'
combined_data.to_csv(path_to_save + 'combined_data.csv', index=False)


As we can see, there are no duplicates in our dataset.

# **Overview the 'Sentiment_Scores' Dataset**

---



In [None]:
# Preview the first rows of the sentiment dataset (VADER and BERT scores).
Sentiment_Scores.head()

Unnamed: 0,date,compound,VADER_Sentiment_Scores,BERT_Compound,BERT_Sentiment_Scores
0,2017-12-05 12:51:08,0.8861,1,-0.003518,0
1,2017-10-02 15:11:26,0.0,0,0.074483,1
2,2018-08-08 16:39:30,-0.3415,-1,0.074879,1
3,2018-01-30 16:05:54,0.9791,1,0.088999,1
4,2015-10-15 10:05:27,0.2023,1,0.044378,0


## **Merge Datasets Based on Date**

---

In this step, we combine the 'Sentiment_Scores' and 'combined_data' datasets into a new dataframe called 'merged_df', using the 'date' column as the key, to streamline further analysis.

In [None]:
# (rows, columns) of the sentiment dataset before merging.
Sentiment_Scores.shape

(83242, 5)

In [None]:
# (rows, columns) of the combined price dataset before merging.
combined_data.shape

(1576797, 7)

In [None]:
# Convert the 'date' column of the sentiment dataframe to datetime format.
Sentiment_Scores['date'] = pd.to_datetime(Sentiment_Scores['date'])

# The yearly price files do not share one date format: the 2017 file stores
# day-first strings ('31/12/2017 23:59') while the 2018/2019 files store ISO
# strings ('2018-12-31 23:59:00'). Parsing them in a single call lets ambiguous
# day-first values be read month-first (e.g. '01/07/2017', 1 July, becomes
# 7 January), so each format is parsed on its own.
raw_dates = combined_data['date'].astype(str)
is_day_first = raw_dates.str.contains('/', regex=False)
parsed_dates = pd.Series(pd.NaT, index=combined_data.index, dtype='datetime64[ns]')
parsed_dates.loc[is_day_first] = pd.to_datetime(
    raw_dates[is_day_first], dayfirst=True
).to_numpy()
parsed_dates.loc[~is_day_first] = pd.to_datetime(raw_dates[~is_day_first]).to_numpy()
combined_data['date'] = parsed_dates

# Merge the dataframes based on 'date' (inner join: only timestamps present in
# both frames are kept).
# NOTE(review): sentiment timestamps carry seconds (e.g. 12:51:08) while prices
# are per minute, so this exact-match join only keeps sentiment rows stamped at
# :00 seconds. Consider flooring the sentiment timestamps to the minute.
merged_df = pd.merge(Sentiment_Scores, combined_data, on='date', how='inner')

In [None]:
# Preview the first five rows of the merged sentiment + price frame.
merged_df.head(5)

Unnamed: 0,date,compound,VADER_Sentiment_Scores,BERT_Compound,BERT_Sentiment_Scores,open,high,low,close,Volume BTC,Volume USD
0,2018-11-28 06:15:00,-0.3058,-1,-0.018622,0,3996.48,3998.76,3991.1,3998.76,7.812108,31238.746586
1,2017-10-16 16:37:00,0.9337,1,0.649525,1,5641.0,5649.5,5640.0,5641.1,3.029607,17090.31509
2,2018-01-22 12:26:00,-0.7506,-1,-0.392861,-1,11306.54,11334.63,11301.01,11334.33,4.280901,48521.139418
3,2017-08-21 11:58:00,0.9117,1,0.024353,0,4033.0,4044.38,4033.0,4044.38,3.665888,14826.24257
4,2018-08-07 11:58:00,0.3818,1,-0.737118,-1,7100.25,7100.25,7089.1,7089.12,2.38748,16925.130658


In [None]:
# (rows, columns) of the merged frame, i.e. how many timestamps matched.
merged_df.shape

(1170, 11)

# **Extract Daily Data**

---



In [None]:
def predominant_sentiment(x):
    """Return the most frequent non-missing value of a pandas Series.

    Used as an aggregation function when collapsing the sentiment labels of
    one day into a single label.

    Args:
        x: pandas Series of sentiment labels; may be empty or contain NaN.

    Returns:
        The value with the highest count, or None when the Series holds no
        non-missing values (empty or all-NaN).
    """
    # value_counts() leaves NaN out, so an all-NaN group yields an empty result
    # just like an empty group; calling idxmax() on it would raise.
    counts = x.value_counts()
    if counts.empty:
        return None
    return counts.idxmax()

In [None]:
# Order the rows chronologically.
# NOTE(review): presumably needed so that the later per-day 'first'/'last'
# aggregations pick the earliest/latest row of each day - confirm.
merged_df = merged_df.sort_values(by='date')


In [None]:
# Ensure 'date' is a datetime column before resampling on it.
merged_df['date'] = pd.to_datetime(merged_df['date'])

# How each column is collapsed into a single value per calendar day.
daily_aggregations = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'Volume BTC': 'sum',
    'Volume USD': 'sum',
    'VADER_Sentiment_Scores': predominant_sentiment,
    'BERT_Sentiment_Scores': predominant_sentiment,
}

# Group rows by day, aggregate them, then discard days with any missing value.
daily_data = (
    merged_df
    .resample('D', on='date')
    .agg(daily_aggregations)
    .dropna()
)


In [None]:
# Preview the first five rows of the daily aggregates.
daily_data.head(5)

Unnamed: 0_level_0,open,high,low,close,Volume BTC,Volume USD,VADER_Sentiment_Scores,BERT_Sentiment_Scores
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-07,2435.94,2435.94,2435.92,2435.94,1.864778,4542.486225,-1.0,-1.0
2017-01-10,4276.11,4279.23,4248.99,4249.0,14.503526,62062.394631,1.0,0.0
2017-01-11,6597.0,6597.0,6588.4,6597.0,2.634528,17379.97937,1.0,0.0
2017-01-17,883.05,883.05,883.05,883.05,0.856119,755.996024,-1.0,-1.0
2017-01-23,916.61,916.95,916.61,916.95,0.870818,798.496473,1.0,0.0


# **Derive the Target Variable**

---




In [None]:
# Label each day by comparing its close with its open:
# 1 = closed above the open, 2 = closed below the open, 0 = unchanged.
closed_higher = daily_data['close'] > daily_data['open']
closed_lower = daily_data['close'] < daily_data['open']
daily_data['movement'] = np.select([closed_higher, closed_lower], [1, 2], default=0)
daily_data.head()

Unnamed: 0_level_0,open,high,low,close,Volume BTC,Volume USD,VADER_Sentiment_Scores,BERT_Sentiment_Scores,movement
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-07,2435.94,2435.94,2435.92,2435.94,1.864778,4542.486225,-1.0,-1.0,0
2017-01-10,4276.11,4279.23,4248.99,4249.0,14.503526,62062.394631,1.0,0.0,2
2017-01-11,6597.0,6597.0,6588.4,6597.0,2.634528,17379.97937,1.0,0.0,0
2017-01-17,883.05,883.05,883.05,883.05,0.856119,755.996024,-1.0,-1.0,0
2017-01-23,916.61,916.95,916.61,916.95,0.870818,798.496473,1.0,0.0,1


In [None]:
# Inspect the last five rows of the labelled daily data.
daily_data.tail(5)

Unnamed: 0_level_0,open,high,low,close,Volume BTC,Volume USD,VADER_Sentiment_Scores,BERT_Sentiment_Scores,movement
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-03-21,4045.37,4046.67,4044.03,4046.67,0.690801,2795.44344,1.0,1.0,1
2019-03-22,3983.1,3983.1,3983.1,3983.1,0.356305,1419.200357,1.0,0.0,0
2019-03-24,3977.76,3977.76,3975.18,3975.18,3.147545,12512.056621,1.0,1.0,2
2019-03-25,3966.88,3966.88,3966.88,3966.88,0.02428,96.317751,1.0,0.0,0
2019-03-27,4008.0,4009.5,4007.99,4009.5,2.63111,10549.434382,1.0,1.0,1


# **Check Class Balance**

---



In [None]:
# Count how many days fall into each 'movement' class (0, 1 or 2).
class_counts = daily_data['movement'].value_counts()

# Print the counts to check whether the classes are balanced.
print(class_counts)

1    263
2    222
0     45
Name: movement, dtype: int64


As we can see, the classes are not balanced, so we need to balance them using an oversampling technique.

# **Oversampling**

---



In [None]:
# Separate each class (movement: 2 = close below open, 0 = unchanged,
# 1 = close above open).
class_minus_one = daily_data[daily_data['movement'] == 2]
class_zero = daily_data[daily_data['movement'] == 0]
class_one = daily_data[daily_data['movement'] == 1]

# Calculate the target number of samples per class (using the size of the majority class)
num_samples = max(len(class_minus_one), len(class_zero), len(class_one))


def _oversample(frame, target_size):
    """Return `frame` grown to `target_size` rows without losing original rows.

    Every original row is kept; only the shortfall is drawn with replacement.
    A frame that already has `target_size` rows or more is returned unchanged.
    Bootstrapping a whole class to its target size, by contrast, would
    silently drop part of its real observations.
    """
    shortfall = target_size - len(frame)
    if shortfall <= 0:
        return frame
    extra_rows = resample(frame, replace=True, n_samples=shortfall, random_state=42)
    return pd.concat([frame, extra_rows])


# Oversample each class up to the majority-class size
oversampled_minus_one = _oversample(class_minus_one, num_samples)
oversampled_zero = _oversample(class_zero, num_samples)
oversampled_one = _oversample(class_one, num_samples)

# Combine the oversampled classes
oversampled_data = pd.concat([oversampled_minus_one, oversampled_zero, oversampled_one])

# Shuffle the oversampled data
# NOTE(review): the frame now contains repeated days; a row-wise train/test
# split applied to it can place copies of one day on both sides.
oversampled_data = oversampled_data.sample(frac=1, random_state=42)

# Check the class distribution after oversampling
oversampled_class_counts = oversampled_data['movement'].value_counts()
print(oversampled_class_counts)

0    263
2    263
1    263
Name: movement, dtype: int64


In [None]:
# Preview the first five rows of the shuffled, oversampled data.
oversampled_data.head(5)

Unnamed: 0_level_0,open,high,low,close,Volume BTC,Volume USD,VADER_Sentiment_Scores,BERT_Sentiment_Scores,movement
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-04-03,1276.43,1276.43,1276.43,1276.43,0.0,0.0,-1.0,-1.0,0
2019-03-01,3831.26,3833.32,3812.64,3812.64,1.895688,7256.624552,-1.0,-1.0,2
2018-09-08,6445.0,6445.0,6144.76,6177.99,89.916275,557348.913289,1.0,1.0,2
2017-08-30,4634.01,4638.01,4603.61,4613.39,19.956641,92304.45079,1.0,0.0,2
2018-09-02,7286.98,7293.41,7269.18,7269.18,1.125385,8197.766598,1.0,0.0,2


# **Define features**

---



In [None]:
# Define features for each model.
# All three sets share the same price/volume columns in the same order, so the
# sentiment column is the only difference between the baseline and the two
# sentiment models. The order matters because each column is later fed to the
# LSTM as one step of the input sequence.
price_volume_columns = ['open', 'high', 'low', 'Volume BTC', 'Volume USD']

features_combined_data = oversampled_data[price_volume_columns]
features_vader = oversampled_data[price_volume_columns + ['VADER_Sentiment_Scores']]
features_bert = oversampled_data[price_volume_columns + ['BERT_Sentiment_Scores']]

In [None]:
# Preview the first five rows of the price + VADER feature set.
features_vader.head(5)

Unnamed: 0_level_0,open,high,low,Volume BTC,Volume USD,VADER_Sentiment_Scores
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-04-03,1276.43,1276.43,1276.43,0.0,0.0,-1.0
2019-03-01,3831.26,3833.32,3812.64,1.895688,7256.624552,-1.0
2018-09-08,6445.0,6445.0,6144.76,89.916275,557348.913289,1.0
2017-08-30,4634.01,4638.01,4603.61,19.956641,92304.45079,1.0
2018-09-02,7286.98,7293.41,7269.18,1.125385,8197.766598,1.0


# **Split Data into Training and Testing**


---




In [None]:
# The oversampled frame contains repeated days (rows drawn with replacement),
# so a purely positional 80/20 cut can put copies of the same day in both sets
# and leak test rows into training. Split on unique days instead: every copy
# of a day ends up on one side only.
unique_days = oversampled_data.index.unique()  # order of first appearance
train_days = unique_days[:int(0.8 * len(unique_days))]
is_train = oversampled_data.index.isin(train_days)

# Number of training rows. Kept under its original name; the training rows are
# no longer one contiguous slice, so select them with `is_train`.
split_idx = int(is_train.sum())

# Split combined features
X_train_combined = features_combined_data.loc[is_train]
X_test_combined = features_combined_data.loc[~is_train]

# Split VADER features
X_train_vader = features_vader.loc[is_train]
X_test_vader = features_vader.loc[~is_train]

# Split BERT features
X_train_bert = features_bert.loc[is_train]
X_test_bert = features_bert.loc[~is_train]

# Split target variable
y_train = oversampled_data['movement'].loc[is_train]
y_test = oversampled_data['movement'].loc[~is_train]


# **Normalize the Data**


---



In [None]:
# Learn each column's min/max from the training rows only, then apply that
# same scaling to the held-out rows, once per feature set.
scaler_combined = MinMaxScaler()
X_train_combined_scaled = scaler_combined.fit_transform(X_train_combined)
X_test_combined_scaled = scaler_combined.transform(X_test_combined)

scaler_vader = MinMaxScaler()
X_train_vader_scaled = scaler_vader.fit_transform(X_train_vader)
X_test_vader_scaled = scaler_vader.transform(X_test_vader)

scaler_bert = MinMaxScaler()
X_train_bert_scaled = scaler_bert.fit_transform(X_train_bert)
X_test_bert_scaled = scaler_bert.transform(X_test_bert)


# **Preparing the Data for the LSTM Model**

---


LSTM requires the input data to have 3 dimensions: (number of samples, number of time steps, number of features per step). Here, for simplicity, we treat each feature column of a row as one time step carrying a single value, so every row becomes a sequence whose length equals the number of features.

In [None]:
def _as_lstm_input(matrix):
    """Reshape `matrix` to (rows, columns, 1) using its first two dimensions."""
    n_rows, n_columns = matrix.shape[:2]
    return matrix.reshape((n_rows, n_columns, 1))


# Give every scaled feature matrix the trailing axis of size one.
X_train_combined_scaled = _as_lstm_input(X_train_combined_scaled)
X_test_combined_scaled = _as_lstm_input(X_test_combined_scaled)

X_train_vader_scaled = _as_lstm_input(X_train_vader_scaled)
X_test_vader_scaled = _as_lstm_input(X_test_vader_scaled)

X_train_bert_scaled = _as_lstm_input(X_train_bert_scaled)
X_test_bert_scaled = _as_lstm_input(X_test_bert_scaled)


# **Hyperparameter Tuning**

---



In [None]:
# Number of target classes (movement labels 0, 1 and 2); read by the model
# builder for the size of its output layer.
num_classes = 3

# Shape of one baseline sample: every dimension after the sample axis.
input_shape_combined = X_train_combined_scaled.shape[1:]

# Candidate values explored by the grid search.
param_grid = {
    'num_neurons': [50, 100, 150],
    'dropout_rate': [0.2, 0.3, 0.4],
    'activation': ['tanh', 'relu'],
}

# Model builder used both by the grid search and for the final models.
def create_lstm_model_hyper(num_neurons, dropout_rate, activation, input_shape):
    """Build and compile a two-layer LSTM classifier.

    The network is a bidirectional LSTM that returns the full sequence, a
    second LSTM, and a softmax layer with `num_classes` units (`num_classes`
    is read from module scope).

    Args:
        num_neurons: number of units in each LSTM layer.
        dropout_rate: used for both `dropout` and `recurrent_dropout` of the
            two LSTM layers.
        activation: activation function name passed to the LSTM layers.
        input_shape: shape of one input sample, passed to the first layer.

    Returns:
        The compiled Keras Sequential model.
    """
    # Options shared by both LSTM layers.
    lstm_options = dict(
        activation=activation,
        dropout=dropout_rate,
        recurrent_dropout=dropout_rate,
    )
    network = Sequential([
        Bidirectional(
            LSTM(num_neurons, return_sequences=True, **lstm_options),
            input_shape=input_shape,
        ),
        LSTM(num_neurons, **lstm_options),
        Dense(num_classes, activation='softmax'),
    ])
    # Integer class labels are expected, hence the sparse categorical loss.
    network.compile(
        optimizer='RMSprop',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Wrap the model builder so scikit-learn can treat it as an estimator; each
# candidate is trained for 20 epochs with a batch size of 64.
model_combined = KerasClassifier(build_fn=create_lstm_model_hyper, input_shape=input_shape_combined, epochs=20, batch_size=64, verbose=0)

# Exhaustive search over param_grid, scored by accuracy with 3-fold cross-validation.
grid_search = GridSearchCV(estimator=model_combined, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1)

# Run the search on the baseline (price/volume only) training data.
grid_result = grid_search.fit(X_train_combined_scaled, y_train)

# Print the best parameters and the best cross-validated accuracy
print("Best Parameters: ", grid_result.best_params_)
print("Best Accuracy: ", grid_result.best_score_)



# **Train the Models**

---



In [None]:
# Unpack the winning hyperparameters reported by the grid search.
best_params = grid_result.best_params_
best_num_neurons, best_dropout_rate, best_activation = (
    best_params[key] for key in ('num_neurons', 'dropout_rate', 'activation')
)

# Per-sample input shape of each feature set (everything after the sample axis).
input_shape_combined = X_train_combined_scaled.shape[1:]
input_shape_vader = X_train_vader_scaled.shape[1:]
input_shape_bert = X_train_bert_scaled.shape[1:]

# Build one model per feature set, all with the same tuned hyperparameters.
tuned_hyperparameters = (best_num_neurons, best_dropout_rate, best_activation)
best_model_combined = create_lstm_model_hyper(*tuned_hyperparameters, input_shape_combined)
best_model_vader = create_lstm_model_hyper(*tuned_hyperparameters, input_shape_vader)
best_model_bert = create_lstm_model_hyper(*tuned_hyperparameters, input_shape_bert)

# Train each model; the test split is passed as validation data so its metrics
# are reported after every epoch.
training_options = {'epochs': 10, 'batch_size': 64}
best_history_combined = best_model_combined.fit(
    X_train_combined_scaled, y_train,
    validation_data=(X_test_combined_scaled, y_test),
    **training_options,
)
best_history_vader = best_model_vader.fit(
    X_train_vader_scaled, y_train,
    validation_data=(X_test_vader_scaled, y_test),
    **training_options,
)
best_history_bert = best_model_bert.fit(
    X_train_bert_scaled, y_train,
    validation_data=(X_test_bert_scaled, y_test),
    **training_options,
)

# **Evaluate the Models**

---



In [None]:
def _score_model(model, X_test, y_true):
    """Predict class labels with `model` and score them against `y_true`.

    Returns a tuple (y_pred, accuracy, precision, recall, f1); precision,
    recall and F1 use the 'weighted' average over classes.
    """
    # The model outputs one probability per class; keep the most likely class.
    y_pred = np.argmax(model.predict(X_test), axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return y_pred, accuracy, precision, recall, f1


# Score the baseline (historical data only) model on the test split.
(y_pred_best_combined, accuracy_best_combined, precision_best_combined,
 recall_best_combined, f1_best_combined) = _score_model(
    best_model_combined, X_test_combined_scaled, y_test)

# Score the model that also sees the VADER sentiment label.
(y_pred_best_vader, accuracy_best_vader, precision_best_vader,
 recall_best_vader, f1_best_vader) = _score_model(
    best_model_vader, X_test_vader_scaled, y_test)

# Score the model that also sees the BERT sentiment label.
(y_pred_best_bert, accuracy_best_bert, precision_best_bert,
 recall_best_bert, f1_best_bert) = _score_model(
    best_model_bert, X_test_bert_scaled, y_test)

# Summarise the three models side by side, four decimals per metric.
best_table = PrettyTable()
best_table.field_names = ["Model", "Precision", "Recall", "F1-score", "Accuracy"]

summary_rows = [
    ("Historical Data",
     precision_best_combined, recall_best_combined,
     f1_best_combined, accuracy_best_combined),
    ("Historical Data & VADER Sentiment Scores",
     precision_best_vader, recall_best_vader,
     f1_best_vader, accuracy_best_vader),
    ("Historical Data & BERT Sentiment Scores",
     precision_best_bert, recall_best_bert,
     f1_best_bert, accuracy_best_bert),
]
for label, *scores in summary_rows:
    best_table.add_row([label, *(f"{score:.4f}" for score in scores)])

print(best_table)