## Networks for sentiment analysis in cryptocurrency

### Setup

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import quantstats as qs
import seaborn as sns
import yaml
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score,  precision_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
# Apply seaborn's default theme; the next cell then switches matplotlib to the 'ggplot' style sheet.
sns.set()

In [None]:
# Render figures inside the notebook output and use the 'ggplot' style sheet for every plot below.
%matplotlib inline
plt.style.use('ggplot')

In [None]:
# Project-local helpers.
# NOTE(review): later cells use `preprocess.load_data`, `train_model`, `predict`, `print_loss_metrics`,
# `get_important_features` and `joblib`, none of which is imported by name in this notebook; presumably
# they are re-exported by these star imports. Confirm, and prefer explicit imports.
from preprocess import *
from train import *
from inference import *
from interpret import *

In [None]:
# Read the run configuration and keep the directory that holds the data files.
params = yaml.safe_load(Path("Model/params.yaml").read_text())
data_dir = params['data_dir']

### Load and Prepare the Data

In [None]:
# Load the input CSV (BTC price + sentiment, going by the file name) through the project's preprocess helper.
file_name = "price_sentiment_btc.csv"
data = preprocess.load_data(file_name)

In [None]:
# Split `data` into train / validation / test frames (train_frac=0.7) and display their shapes.
# NOTE(review): plot_df=True presumably makes prep_data write the plot_df.csv read in the next section; confirm.
train_df, val_df, test_df = preprocess.prep_data(df=data, train_frac=0.7, plot_df=True)
train_df.shape, val_df.shape, test_df.shape

### Visualise the data

In [None]:
# Plotting frame: add the one-step relative change of Close, then drop every row that contains a NaN
# (this includes the first row, whose change is undefined).
plot_df = (
    pd.read_csv(Path(data_dir, 'plot_df.csv'))
    .assign(Close_Change=lambda frame: frame.Close.pct_change(1))
    .dropna()
)

In [None]:
# Close price over the rows of plot_df.
# NOTE(review): the x-axis is labelled "Date", but plot_df is read without a date index, so the ticks are
# presumably row numbers; the `label` is never displayed because no legend is drawn. Confirm both.
plot_df['Close'].plot(figsize=(16,7), label='Training data')
plt.xlabel("Date", fontsize=16)
plt.ylabel("USD price", fontsize=16)
plt.title('Bitcoin Close Price')
plt.show()

In [None]:
# Three stacked panels: (a) the raw price change, (b) its 90-row rolling mean, (c) its 90-row rolling
# standard deviation. They are used below to argue that the series is not stationary.
fig, axes = plt.subplots(nrows=3, ncols=1)
plot_df['Close_Change'].plot(ax=axes[0], figsize=(16,6), rot=90, title='a. Price Change', sharex=True)
plot_df['Close_Change'].rolling(90).mean().plot(ax=axes[1], figsize=(16,6), rot=90, title='b. Rolling Mean', color='orange')
plot_df['Close_Change'].rolling(90).std().plot(ax=axes[2], figsize=(16,6), rot=0, title='c. Rolling Standard Deviation', color='purple')
plt.show()

Traditional statistical forecasting techniques require the data to be stationary, i.e., having constant mean, standard deviation, and autocorrelation. If, however, the conditions for stationarity are not achieved, forecasting techniques, like ARMA, cannot model the dependence structure of the data over time and therefore other techniques have to be used.
We see from the figures above that the mean and the standard deviation of the price change vary over time, indicating that the time series at hand (Close prices) is not stationary. The time series shows strong, irregular dynamics, for which traditional forecasting techniques are less suited. RNNs, and LSTM models in particular, have on the other hand proven effective at finding patterns in complex, non-stationary time series.

In [None]:
# Correlation heatmap of the training features, leaving out the price/indicator columns listed below.
excluded_columns = ['Open', 'High', 'Low', 'RSI', 'MACD', 'OBV', 'Williams', 'Stochastic_oscillator']
matrix_df = train_df.drop(columns=excluded_columns)
corrMatrix = matrix_df.corr()

fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(corrMatrix, cmap="Blues", annot=True, ax=ax)
plt.show()

At first glance, we cannot see a strong correlation between the sentiment-analysis features and the price-related ones. The only point to note is the correlation between the Bitcoin volume and the number of tweets, which may indicate that the number of tweets correlates with the volatility of cryptocurrencies.

### Train the LSTM model

In [None]:
# Training configuration, passed to train_model / predict below.
sequence_length = 24   # number of consecutive rows fed to the model per sample
batch_size = 32
n_epochs = 10000       # upper bound on epochs
n_epochs_stop = 50     # presumably the early-stopping patience; confirm in train.py
label_name = 'Close'   # column the model predicts

In [None]:
# Fit the model. `hist` is plotted in the next cell (presumably the per-epoch loss history; confirm).
hist = train_model(train_df, val_df, label_name, sequence_length, batch_size, n_epochs, n_epochs_stop)

In [None]:
# Plot the training history returned by train_model.
hist.plot(figsize=(15,6))
plt.show()

### Evaluate the model

In [None]:
# Run inference on the test split and display how many predictions / labels came back.
# Going by the names, both are already converted back to the original price scale.
predictions_descaled, labels_descaled = predict(df=test_df, label_name=label_name, sequence_length=sequence_length)
len(predictions_descaled), len(labels_descaled)

In [None]:
# Actual vs. predicted price on the test split.
# NOTE(review): the x-axis is labelled 'Date' but the series are plotted against their position.
fig = plt.figure(figsize = (16, 7))
plt.plot(labels_descaled, label='Actual Price')
plt.plot(predictions_descaled, label='Predicted Price')

plt.legend()
plt.xlabel('Date', fontsize=18, fontweight='bold')
plt.ylabel('Price', fontsize=18, fontweight='bold')
plt.title('Actual price and predicted price for test data', fontsize=20, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
def buy_sell_trades(actual, predicted, threshold, initial_capital=10000, fee=0.001):
    """Simulate a long-only, threshold-based trading strategy driven by price predictions.

    At every step ``i`` the predicted relative change from step ``i`` to ``i + 1`` is
    compared with ``threshold``:

    * above ``+threshold``: buy the largest fraction of one unit (1.0, 0.999, ..., 0.001)
      that the available cash can pay for at ``actual[i]``;
    * below ``-threshold``: sell the largest such fraction that is currently held.

    The fee is deducted from the units received on a buy and from the cash received on a
    sell. Whatever is still held after the last step is valued at the last actual price
    (no fee). A summary (final money, buy & hold result, number of trades) is printed.

    Parameters
    ----------
    actual : sequence of float
        Observed prices, indexable by position.
    predicted : sequence of float
        Predicted prices, aligned with ``actual`` and at least as long.
    threshold : float
        Minimum predicted move, in percent, required to trigger a trade.
    initial_capital : float, default 10000
        Starting cash, also used for the buy & hold reference.
    fee : float, default 0.001
        Proportional transaction fee applied to every buy and sell.

    Returns
    -------
    money : float
        Final cash after valuing the remaining position at the last actual price.
    portfolio : list of float
        Mark-to-market value (position + cash) after each of the ``len(actual) - 1`` steps.

    Raises
    ------
    ValueError
        If ``actual`` is empty or ``predicted`` is shorter than ``actual``.
    """
    n_prices = len(actual)
    if n_prices == 0:
        raise ValueError("actual must contain at least one price")
    if len(predicted) < n_prices:
        raise ValueError("predicted must be at least as long as actual")

    threshold = threshold / 100  # percent -> fraction

    pred_df = pd.DataFrame()
    pred_df['Predictions'] = predicted
    # Positional array: a non-default index on `predicted` cannot break the [i + 1] lookups below.
    predicted_change = pred_df['Predictions'].pct_change().to_numpy()

    first_price = actual[0]
    last_price = actual[n_prices - 1]
    # Buy & hold reference: the initial capital scaled by the total relative price move.
    left = (((last_price - first_price) / first_price) + 1) * initial_capital

    money = initial_capital
    number_of_stocks = 0
    trades = 0
    portfolio = []

    # Candidate order sizes, largest first; loop-invariant, so built once.
    order_sizes = np.arange(1, 0, -0.001)

    for i in range(n_prices - 1):
        price = actual[i]
        change = predicted_change[i + 1]
        if change > threshold:
            # Buy: largest size the cash covers; the fee reduces the units received.
            for size in order_sizes:
                if money >= size * price:
                    trades += 1
                    money -= size * price
                    number_of_stocks += size * (1 - fee)
                    break
        elif change < -threshold:
            # Sell: largest size currently held; the fee reduces the cash received.
            for size in order_sizes:
                if number_of_stocks >= size:
                    trades += 1  # sells are trades too (previously only buys were counted)
                    money += (size * (1 - fee)) * price
                    number_of_stocks -= size
                    break
        portfolio.append((number_of_stocks * price) + money)

    # Value the remaining position at the last price.
    money += number_of_stocks * last_price

    print('Money if we traded: ',money) #Money if we traded
    print('Buy and hold strategy: ',left)  #Buy and hold
    print('Number of trades done: ', trades)

    return money, portfolio

In [None]:
# Simulate the strategy on the test split with a 0% threshold, i.e. any predicted move triggers a trade.
money, portfolio= buy_sell_trades(labels_descaled, predictions_descaled, 0)

In [None]:
# Step-to-step relative change of the actual and of the predicted price on the test data.
real = plt.plot(np.diff(labels_descaled)/labels_descaled[:-1], label='Actual Change')
pred = plt.plot(np.diff(predictions_descaled)/predictions_descaled[:-1], label='Predicted Change')

# Use the line labels for the legend (the curves are changes, not prices) and put time on the x-axis;
# the two axis labels were previously swapped.
plt.legend()
plt.xlabel('Time', fontsize=16)
plt.ylabel('Relative price change', fontsize=16)
plt.title('Actual price change and predicted price change for test data', fontsize=20)
plt.gcf().set_size_inches(15, 10, forward=True)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

def generate_predicted_result_based_on_previous_actual(actual, y_pred):
    """Rebuild a price series from predicted differences and compare it with the actual prices.

    Each predicted difference ``y_pred[t]`` is added to the actual price at ``t`` to obtain
    a predicted price for ``t + 1``. The function plots the actual prices (from the second
    element on) against these reconstructed prices and prints their RMSE and MAPE.

    Parameters
    ----------
    actual : array-like
        Actual prices.
    y_pred : array-like
        Predicted differences; must have one element fewer than ``actual``.

    Returns
    -------
    None
    """
    previous_actual = actual[:-1]
    target = actual[1:]

    # Actual price at time t + predicted difference = predicted price at time t + 1.
    new = np.add(previous_actual, y_pred)

    plt.gcf().set_size_inches(12, 8, forward=True)
    plt.title('Plot of real price and predicted price against number of days for test set')
    plt.xlabel('Number of days')
    plt.ylabel('Adjusted Close Price($)')

    plt.plot(target, label='Actual Price')
    plt.plot(new, label='Predicted Price')

    # `squared=False` is deprecated since scikit-learn 1.4 and removed in 1.6; taking the
    # square root explicitly gives the same RMSE on every version.
    print('RMSE: ', np.sqrt(mean_squared_error(target, new)))
    print('MAPE: ', mean_absolute_percentage_error(target, new))

    plt.legend(['Actual Price', 'Predicted Price'])
    plt.show()

In [None]:
# Step-to-step differences of the predicted prices; the helper adds them to the previous actual price.
# Note: the name `diff` is reused further down for a pandas Series of relative changes.
diff = np.diff(predictions_descaled)
generate_predicted_result_based_on_previous_actual(labels_descaled, diff)

In [None]:
# Keep every 4th sample of the labels and predictions (the plots below call this the 1h interval).
hourly_positions = range(0, len(labels_descaled), 4)
hourly_labels = [labels_descaled[pos] for pos in hourly_positions]
hourly_predictions = [predictions_descaled[pos] for pos in hourly_positions]

In [None]:
# Same strategy on the every-4th-sample series, again with a 0% threshold.
money, portfolio = buy_sell_trades(hourly_labels, hourly_predictions, 0)

In [None]:
# Buy & hold benchmark: compound 10000 through the relative price changes of the hourly labels.
df = pd.DataFrame(hourly_labels, columns=['price'])
diff = df['price'].pct_change().dropna()

bh = [10000]
# The last return is left out, so `bh` has one value per entry of `portfolio`
# (len(prices) - 1 values, each valued at the price of the same step).
for step_return in diff.iloc[:-1]:
    bh.append(bh[-1] * (1 + step_return))

In [None]:
# Strategy portfolio value vs. the buy & hold benchmark on the hourly test series.
fig = plt.figure(figsize = (16, 7))
plt.plot(portfolio, label='Hourly portfolio')
plt.plot(bh, label='Buy & hold benchmark')
plt.legend()
plt.xlabel('Time', fontsize=16)
plt.ylabel('Price', fontsize=16)
# Title spelling fixed ("comparaison" -> "comparison") to match the other portfolio figures.
plt.title('Portfolio performance comparison (1h interval)', fontsize=20)
plt.show()

In [None]:
# Print the project's loss metrics for the full test-set predictions.
print('Error on all test data:')
print_loss_metrics(labels_descaled, predictions_descaled)

In [None]:
# Option A: evaluate price direction on the full-resolution series.
# NOTE(review): the next cell reassigns both names, so on "Run All" this selection has no effect.
predicted = predictions_descaled
actual = labels_descaled

In [None]:
# Option B: evaluate price direction on the every-4th-sample series.
# NOTE(review): this overrides the selection made in the previous cell when the notebook runs top to bottom.
predicted = hourly_predictions
actual = hourly_labels

In [None]:
# Direction labels: 1 when the next value is strictly higher than the current one, otherwise 0.
# NOTE(review): positions are visited with a stride of 4. When `predicted`/`actual` are the hourly
# series (already every 4th sample), only a quarter of the hourly moves are scored. Confirm the
# stride is intended for that input.
sampled_positions = range(0, len(predicted) - 1, 4)
predicted_movement = [1 if predicted[pos + 1] > predicted[pos] else 0 for pos in sampled_positions]
actual_movement = [1 if actual[pos + 1] > actual[pos] else 0 for pos in sampled_positions]
    

In [None]:
# Direction-of-move classification quality: confusion matrix, accuracy, and precision for the "up" class (1).
print(confusion_matrix(actual_movement, predicted_movement))
print(accuracy_score(actual_movement, predicted_movement))
print(precision_score(actual_movement, predicted_movement))

In [None]:
# Per-class precision / recall / F1 for the same direction labels.
print(classification_report(actual_movement, predicted_movement))

### Model generalisation

In [None]:
# Load the data set used for the generalisation check.
# NOTE(review): the path is hard-coded rather than built from `data_dir`; confirm both point to the same folder.
unseen = pd.read_csv('Data/unseen_price_sentiment_btc.csv')

In [None]:
# Set the timestamps aside, build the model features and scale them with the scaler saved under Model/.
# NOTE(review): `unseen` is overwritten here, so re-running this cell without re-reading the CSV fails on
# the missing 'Time' column; `save_time` is not used in the cells shown.
save_time = unseen['Time']
unseen = preprocess.create_features(unseen.drop(columns=['Time']))

scaler = joblib.load("Model/scaler.gz")
scaled_values = scaler.transform(unseen)
unseen_df = pd.DataFrame(scaled_values, index=unseen.index, columns=unseen.columns)

In [None]:
# Run inference on the unseen data.
# NOTE: this overwrites the test-set `predictions_descaled` / `labels_descaled`; every cell below works
# on the unseen data, and re-running earlier evaluation cells afterwards would silently use it too.
predictions_descaled, labels_descaled = predict(df=unseen_df, label_name=label_name, sequence_length=sequence_length)
len(predictions_descaled), len(labels_descaled)

In [None]:
# Actual vs. predicted price on the unseen data; the figure is also exported as EPS.
fig = plt.figure(figsize = (16, 7))
plt.plot(labels_descaled, label='Actual Price')
plt.plot(predictions_descaled, label='Predicted Price')

plt.legend()
plt.xlabel('Date', fontsize=18, fontweight='bold')
plt.ylabel('Price', fontsize=18, fontweight='bold')
plt.title('Actual price and predicted price for unseen data', fontsize=20, fontweight='bold')
fig.tight_layout()

# Save next to the notebook instead of an absolute path on one author's machine, and make sure the
# folder exists so the export cannot fail with FileNotFoundError.
figure_dir = Path('figures')
figure_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_dir / 'general.eps', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# Run the strategy on the unseen data with 0%, 0.2% and 0.5% thresholds.
# `money` is overwritten by each call; only the three portfolio curves are used afterwards.
money, portfolio0 = buy_sell_trades(labels_descaled, predictions_descaled, 0)
money, portfolio2 = buy_sell_trades(labels_descaled, predictions_descaled, 0.2)
money, portfolio5 = buy_sell_trades(labels_descaled, predictions_descaled, 0.5)

In [None]:
# Buy & hold benchmark on the unseen data: compound 10000 through the relative price changes.
df = pd.DataFrame(labels_descaled, columns=['price'])
diff = df['price'].pct_change().dropna()

bh = [10000]
# The last return is left out, so `bh` has as many values as each strategy portfolio.
for step_return in diff.iloc[:-1]:
    bh.append(bh[-1] * (1 + step_return))

In [None]:
# Portfolio value of the three threshold variants vs. buy & hold on the unseen data.
# NOTE(review): the y-axis shows portfolio value although it is labelled 'Price'.
fig = plt.figure(figsize = (16, 7))
plt.plot(portfolio0, label='Model portfolio (no threshold)')
plt.plot(portfolio2, label='Model portfolio (0.2% threshold)')
plt.plot(portfolio5, label='Model portfolio (0.5% threshold)')
plt.plot(bh, label='Buy & hold benchmark')

plt.legend()
plt.xlabel('Date', fontsize=18, fontweight='bold')
plt.ylabel('Price', fontsize=18, fontweight='bold')
plt.title('Portfolio performance comparison (15 min interval)', fontsize=20, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# Keep every 4th sample of the unseen labels / predictions, then rerun the strategy at 0% and 0.2%.
hourly_positions = range(0, len(labels_descaled), 4)
hourly_labels = [labels_descaled[pos] for pos in hourly_positions]
hourly_predictions = [predictions_descaled[pos] for pos in hourly_positions]

money, portfolio = buy_sell_trades(hourly_labels, hourly_predictions, 0)
money, portfolio2 = buy_sell_trades(hourly_labels, hourly_predictions, 0.2)

In [None]:
# Buy & hold benchmark on the hourly unseen series.
df = pd.DataFrame(hourly_labels, columns=['price'])
diff = df.price.pct_change().dropna()

# Start from 10000, the capital buy_sell_trades starts with (this cell previously used 16000, which
# made the benchmark and the strategy curves incomparable).
bh = [10000]
# The last return is left out, so `bh` has as many values as each strategy portfolio.
for i in range(len(diff)-1):
    bh.append(bh[i]*(1+diff[i+1]))

In [None]:
# Portfolio value of the 0% and 0.2% threshold variants vs. buy & hold on the hourly unseen series.
# NOTE(review): the y-axis shows portfolio value although it is labelled 'Price'.
fig = plt.figure(figsize = (16, 7))
plt.plot(portfolio, label='Model portfolio (no threshold)')
plt.plot(portfolio2, label='Model portfolio (0.2% threshold)')
plt.plot(bh, label='Buy & hold benchmark')

plt.legend()
plt.xlabel('Date', fontsize=18, fontweight='bold')
plt.ylabel('Price', fontsize=18, fontweight='bold')
plt.title('Portfolio performance comparison (1h interval)', fontsize=20, fontweight='bold')
fig.tight_layout()
plt.show()

### Find important features

In [None]:
# Settings passed to get_important_features.
background_data_size = 2000   # presumably the number of background samples for the explainer; confirm
test_sample_size = 800        # presumably the number of samples that get explained; confirm
sequence_length = 24          # same value as in the training configuration above

In [None]:
# Compute the SHAP values with the project helper. The next cell wraps the result in a DataFrame
# with one column per training feature.
shap_values = get_important_features(
    background_data_size,
    test_sample_size,
    sequence_length,
)

In [None]:
# One column per training feature; assumes shap_values has one row per time step of the input window
# (TODO confirm against get_important_features).
shap_plot = pd.DataFrame(shap_values, columns=train_df.columns.tolist())
# Time-step axis relative to the prediction point: -sequence_length, ..., -1. Derived from
# `sequence_length` instead of the hard-coded 24/25, so it follows the configured window size.
shap_plot['days'] = list(range(-sequence_length, 0))

In [None]:
# Stacked area chart of the per-feature SHAP values along the 'days' axis.
shap_plot.plot.area(x='days',figsize=(10, 6), cmap='viridis')
plt.title("Deep SHAP - Feature Importance")
plt.show()

In [None]:
# Total SHAP value per feature, summed over the time steps. The 'days' column is only the plot axis and
# is dropped first so that it is not reported as a feature.
total_importance = pd.DataFrame(shap_plot.drop(columns='days').sum(), columns=['Features'])
total_importance.nlargest(40, 'Features')