In [5]:
# Import all necessary libraries
import os
import json
import time
import datetime
import requests
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path



# **Data Collection & Cleaning**

1. Bitcoin articles collection and analysis
2. Bitcoin historical prices collection and manipulation
3. Analyze tones in Bitcoin articles



In [2]:
load_dotenv()

True

## 1.1 Bitcoin Articles

Collect Bitcoin articles using GNews API


In [6]:
# Use while-loop to iterate through a range of dates in a url to pull articles from each day
start_date = datetime.date(2020, 12, 18)
end_date = datetime.date(2021, 1, 18)
delta = datetime.timedelta(days=1)
articles = []
while start_date <= end_date:
    
    # Print(start_date)
    gnews_api = os.environ["gnews_api"]
    gnews_url = f"https://gnews.io/api/v4/search?q=bitcoin&in=cryptocurrency&from={start_date}T00:01:36Z&to={start_date}T23:59:36Z&lang=en&token={gnews_api}"
    response = requests.get(gnews_url)
    data = response.json()
    articles.append(data)
    start_date += delta
    
    # Use time.sleep to pause the loop every 4 seconds due api restrictions, api only allows a request for every 3 seconds
    time.sleep(4)

KeyError: 'gnews_api'

In [3]:
# Convert json to dataframe pulling only "publishAt", "title", "description", and "content"
articles_df = pd.json_normalize(articles, record_path=['articles'], meta='totalArticles')
articles_df['title&description'] = articles_df['title'] + " " + articles_df['description']
articles_df = articles_df[['publishedAt', 'title&description', 'totalArticles']]
articles_df.rename(columns={'publishedAt':'date'}, inplace=True)
articles_df.head()

AttributeError: module 'pandas' has no attribute 'json_normalize'

In [3]:
# Clean dataframe and add a column that is composed of "title" and "description"
articles_df['date'] = pd.to_datetime(articles_df['date'], infer_datetime_format=True).dt.date

# Group dataframe by "publish date"
bitcoin_articles = articles_df.groupby(by=["date",'totalArticles']).sum()
bitcoin_articles.head()

NameError: name 'articles_df' is not defined

## 1.2. Bitcoin Articles

Analyze Bitcoin articles using SentimentIntensityAnalyzer

In [7]:
# Import the libraries for sentiment scoring using Vader
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alhamduliallah/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [9]:
# Define two lists to store vader sentiment scoring
y_vader_pred = []
y_vader_prob = []

In [10]:
# Score sentiment of test set using Vader
for text in bitcoin_articles["title&description"]:
    y_vader_prob.append(analyzer.polarity_scores(text)["pos"])
    sentiment_score = analyzer.polarity_scores(text)["compound"]
    if sentiment_score >= 0.1:
        y_vader_pred.append(1)
    else:
        y_vader_pred.append(0)

In [19]:
# Add sentiment score to bitcoin_articles dataframe
bitcoin_articles["Sentiment Score"] = y_vader_pred
bitcoin_articles.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title&description,Sentiment Score
date,totalArticles,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,1
2020-12-19,6,How will Bitcoin shift the power in the coffee...,1
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",1
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,0
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,1


## 2.1. Bitcoin Historical Prices

Collect Bitcoin historical prices data from Kraken

In [12]:
# Import
import ccxt

# Get bitcoin historical prices from kraken
kraken_public_key = os.getenv("KRAKEN_PUBLIC_KEY")
kraken_secret_key = os.getenv("KRAKEN_SECRET_KEY")
kraken = ccxt.kraken({"apiKey": kraken_public_key, "secret": kraken_secret_key})

historical_prices = kraken.fetch_ohlcv("BTC/USD","1d")

In [13]:
# Create a dataframe for historical prices
historical_prices_df = pd.DataFrame(historical_prices, columns=["date", "open", "high", "low", "close", "volume"])
historical_prices_df["date"] = pd.to_datetime(historical_prices_df["date"], unit="ms")
historical_prices_df.set_index("date",inplace=True)
historical_prices_df.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-02-04,3416.6,3437.8,3396.9,3413.9,1359.629904
2019-02-05,3411.0,3433.7,3397.8,3429.5,1429.386854
2019-02-06,3430.0,3445.0,3337.0,3367.4,2289.491212
2019-02-07,3367.2,3382.4,3348.1,3357.1,1512.883808
2019-02-08,3359.5,3704.9,3341.4,3622.1,4231.506468


## 2.2. Bitcoin Historical Prices

Analyzing Bitcoin historical prices using daily return, changes in volumn, and spread

In [14]:
# Drop NAs and calculate daily percent return
historical_prices_df['daily_return'] = historical_prices_df['close'].dropna().pct_change()
historical_prices_df['volume change'] = historical_prices_df['volume'].pct_change().shift(-1)
historical_prices_df['spread'] = (historical_prices_df['high'] - historical_prices_df['low']) / historical_prices_df['open']
historical_prices_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,daily_return,volume change,spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-19,36622.5,37860.0,35900.0,35925.6,6298.676377,-0.019029,-0.156197,0.053519
2021-01-20,35925.5,36396.7,33374.0,35511.8,9170.190746,-0.011518,0.455892,0.084138
2021-01-21,35511.8,35614.1,30056.1,30832.6,19513.432956,-0.131765,1.12792,0.156511
2021-01-22,30832.5,33850.0,28800.0,33000.0,15251.549792,0.070296,-0.218408,0.163788
2021-01-23,33000.1,33496.4,31350.0,32069.4,4710.182129,-0.0282,-0.691167,0.065042


In [20]:
# Combine bitcoin sentiment dataframe with historical prices dataframe
combined_df = bitcoin_articles.join(historical_prices_df)
combined_df['articles'] = combined_df['articles'].shift(-1)
combined_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title&description,Sentiment Score,open,high,low,close,volume,daily_return,volume change,spread
date,totalArticles,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,1,22811.8,23317.1,22308.2,23130.5,6050.937552,0.013331,-0.68142,0.044227
2020-12-19,6,How will Bitcoin shift the power in the coffee...,1,23132.8,24288.2,22800.0,23871.5,6255.127803,0.032036,0.033745,0.064333
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",1,23871.5,24297.7,23084.9,23480.7,5876.100372,-0.016371,-0.060595,0.050805
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,0,23480.8,24090.0,21885.6,22716.1,10783.18265,-0.032563,0.835092,0.093881
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,1,22724.6,23837.0,22354.2,23828.2,5728.007668,0.048956,-0.468802,0.065251


In [21]:
# Import
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# Analyzing tones of bitcoin articles with ibm_watson tone analyzer SDK
ibm_key = os.getenv("ibm_key")
authenticator = IAMAuthenticator(ibm_key)
tone_analyzer = ToneAnalyzerV3(
    version = '2017-09-21',
    authenticator = authenticator
)

tone_analyzer.set_service_url('https://api.us-east.tone-analyzer.watson.cloud.ibm.com')
tone_analyzer.set_disable_ssl_verification(True)

In [22]:
# Initialize a list of all tones analyzed
tone_analyzed = []
for text in combined_df["title&description"]:
    tone_analysis = tone_analyzer.tone(
    {'text': text},
    content_type = 'application/json').get_result()
    tone_analyzed.append(tone_analysis)

In [23]:
# Convert tone_analyzed into sparse matrix that will be merged with combined_df so we evaluate extra features
tone = []
for text in tone_analyzed:
    dic = {}
    tone.append(dic)
    for emotions in text["document_tone"]["tones"]:
        dic.update({emotions['tone_id']:emotions['score']})
data = pd.DataFrame(tone)
data.fillna(0,inplace=True)
data.head()

Unnamed: 0,joy,fear,sadness,tentative,analytical
0,0.590572,0.592034,0.534758,0.515201,0.0
1,0.529512,0.0,0.0,0.68286,0.0
2,0.565506,0.0,0.508825,0.594668,0.0
3,0.589725,0.0,0.0,0.759095,0.0
4,0.0,0.0,0.578295,0.716654,0.559117


In [24]:
# Combine Bitcoin articles sentiment analysis, Bitcoin historical prices with tone data
combined_df.reset_index(inplace=True)
all_df = combined_df.join(data)
all_df.set_index("date",inplace=True)

In [27]:
all_df.head()

Unnamed: 0_level_0,totalArticles,title&description,Sentiment Score,open,high,low,close,volume,daily_return,volume change,spread,joy,fear,sadness,tentative,analytical
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,1,22811.8,23317.1,22308.2,23130.5,6050.937552,0.013331,-0.68142,0.044227,0.590572,0.592034,0.534758,0.515201,0.0
2020-12-19,6,How will Bitcoin shift the power in the coffee...,1,23132.8,24288.2,22800.0,23871.5,6255.127803,0.032036,0.033745,0.064333,0.529512,0.0,0.0,0.68286,0.0
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",1,23871.5,24297.7,23084.9,23480.7,5876.100372,-0.016371,-0.060595,0.050805,0.565506,0.0,0.508825,0.594668,0.0
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,0,23480.8,24090.0,21885.6,22716.1,10783.18265,-0.032563,0.835092,0.093881,0.589725,0.0,0.0,0.759095,0.0
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,1,22724.6,23837.0,22354.2,23828.2,5728.007668,0.048956,-0.468802,0.065251,0.0,0.0,0.578295,0.716654,0.559117


In [28]:
# Save as csv for future use
all_df.to_csv('sentiment_prices_tone_text.csv')

# **Trade Signal Indicators**

Generate different trade signal indicators using:
1. Exponential moving average for mean
2. Exponential moving average for standard deviation
3. Bollinger band mean and standard deviation
4. Constructing the indicator

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path

In [None]:
# Set path to CSV and read in CSV
csv_path = Path('sentiment_prices_tone_text.csv')
btc_df=pd.read_csv(csv_path, index_col=[0])
btc_df.head()

In [None]:
# Set index as datetime object
btc_df.set_index(pd.to_datetime(btc_df.index, infer_datetime_format=True), inplace=True)
btc_df.head()

## 1. Exponential moving average for mean

In [None]:
# Set short and long windows
short_window = 1
long_window = 10

# Construct a `Fast` and `Slow` Exponential Moving Average from short and long windows, respectively
btc_df['fast_close'] = btc_df['close'].ewm(halflife=short_window).mean()
btc_df['slow_close'] = btc_df['close'].ewm(halflife=long_window).mean()

# Construct a crossover trading signal
btc_df['crossover_long'] = np.where(btc_df['fast_close'] > btc_df['slow_close'], 1.0, 0.0)
btc_df['crossover_short'] = np.where(btc_df['fast_close'] < btc_df['slow_close'], -1.0, 0.0)
btc_df['crossover_signal'] = btc_df['crossover_long'] + btc_df['crossover_short']

btc_df.head()

## 2. Exponential moving average for standard deviation

In [None]:
# Set short and long volatility windows
short_vol_window = 1
long_vol_window = 10

# Construct a `Fast` and `Slow` Exponential Moving Average from short and long windows, respectively
btc_df['fast_vol'] = btc_df['daily_return'].ewm(halflife=short_vol_window).std()
btc_df['slow_vol'] = btc_df['daily_return'].ewm(halflife=long_vol_window).std()

# Construct a crossover trading signal
btc_df['vol_trend_long'] = np.where(btc_df['fast_vol'] < btc_df['slow_vol'], 1.0, 0.0)
btc_df['vol_trend_short'] = np.where(btc_df['fast_vol'] > btc_df['slow_vol'], -1.0, 0.0) 
btc_df['vol_trend_signal'] = btc_df['vol_trend_long'] + btc_df['vol_trend_short']

btc_df.head()

## 3. Bollinger band mean and standard deviation

In [None]:
# Set bollinger band window
bollinger_window = 20

# Calculate rolling mean and standard deviation
btc_df['bollinger_mid_band'] = btc_df['close'].rolling(window=bollinger_window).mean()
btc_df['bollinger_std'] = btc_df['close'].rolling(window=20).std()

# Calculate upper and lowers bands of bollinger band
btc_df['bollinger_upper_band']  = btc_df['bollinger_mid_band'] + (btc_df['bollinger_std'] * 1)
btc_df['bollinger_lower_band']  = btc_df['bollinger_mid_band'] - (btc_df['bollinger_std'] * 1)

# Calculate bollinger band trading signal
btc_df['bollinger_long'] = np.where(btc_df['close'] < btc_df['bollinger_lower_band'], 1.0, 0.0)
btc_df['bollinger_short'] = np.where(btc_df['close'] > btc_df['bollinger_upper_band'], -1.0, 0.0)
btc_df['bollinger_signal'] = btc_df['bollinger_long'] + btc_df['bollinger_short']

btc_df.head()

## 4. Constructing the indicator

In [None]:
# Construct the dependent variable where if daily return is greater than 0, then 1, else, 0
btc_df['Positive Return'] = np.where(btc_df['daily_return'] > 0, 1.0, 0.0)
btc_df.head()

In [None]:
# Save as csv for future use
btc_df.to_csv('dataframe_with_tradesignals.csv')

# **Deep Network**

Creating a deep network by:
1. Slicing data for testing and training
2. Setting up a model with 4 layers
3. Compiling the model
4. Evaluating the model's performance
5. Conducting a confusion matrix
4. Conducting a classification report

In [None]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path

%matplotlib inline

## 1.1 Data Preparation

In [None]:
# Read available data
df = pd.read_csv('bitcoin_model_data_nyt.csv', index_col="date", infer_datetime_format=True, parse_dates=True)

# Slice for new dataframe
start_date = '2020-10-30'
end_date = df.index.max().strftime(format= '%Y-%m-%d')
df_2 = df[start_date:end_date]
df_2.head()

In [None]:
# Shift data in all column by 1 except for 'Positive Return' column
for col in df_2.columns:
    if col != 'Positive Return':
        df_2[col] = df_2[col].shift(1)
    else:
        None
        
# Drop NAs
df_2.dropna(inplace=True)

In [None]:
# Creating the X and y sets
X = df_2.iloc[:, 0:15].values
y = df_2["positive_return"].values

In [None]:
# Import the StandardScaler from sklearn
from sklearn.preprocessing import StandardScaler

# Scale the data
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

## 2.1. Testing & Training Data for Model

In [None]:
# Import
from sklearn.model_selection import train_test_split

# Creating training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=78)

## 2.2. Model Set-up

In [None]:
# Import Keras modules for model creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Model set-up
number_input_features = 15
hidden_nodes_layer1 = 20
hidden_nodes_layer2 = 15
hidden_nodes_layer3 = 10
hidden_nodes_layer4 = 5

In [None]:
# Define the LSTM RNN model
model = Sequential()

# Layer 1
model.add(Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Layer 2
model.add(Dense(units=hidden_nodes_layer2, activation="relu"))

# Layer 3
model.add(Dense(units=hidden_nodes_layer3, activation="relu"))

# Layer 4 
model.add(Dense(units=hidden_nodes_layer4, activation="relu"))

# Output layer
model.add(Dense(1, activation="sigmoid"))

## 3. Model compiled and trained

In [None]:
# Compile the model
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        "accuracy",
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
    ],
)

In [None]:
# Summarize the model
model.summary()

In [None]:
# Training the model
batch_size = 1000
epochs = 500
model_training = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

## 4. Model Evaluation

In [None]:
# Plotting loss
loss_df = pd.DataFrame(
    {
        "Epoch": range(1, epochs + 1),
        "Train": model_training.history["loss"],
        "Val": model_training.history["val_loss"],
    }
)
loss_df.set_index("Epoch", inplace=True)
loss_df.plot(title="Loss")

In [None]:
# Plotting accuracy
accuracy_df = pd.DataFrame(
    {
        "Epoch": range(1, epochs + 1),
        "Train": model_training.history["accuracy"],
        "Val": model_training.history["val_accuracy"],
    }
)
accuracy_df.set_index("Epoch", inplace=True)
accuracy_df.plot(title="Accuracy")

In [None]:
 # Plotting AUC
auc_df = pd.DataFrame(
    {
        "Epoch": range(1, epochs + 1),
        "Train": model_training.history["auc"],
        "Val": model_training.history["val_auc"],
    }
)
auc_df.set_index("Epoch", inplace=True)
auc_df.plot(title="AUC")

In [None]:
# Evaluate the model
scores = model.evaluate(X_test, y_test, verbose=0)

# Define metrics dictionary
metrics = {k: v for k, v in zip(model.metrics_names, scores)}

# Display evaluation metrics results
display(metrics)

## 5. Confusion Matrix

In [None]:
 # Define the confusion matrix data
cm_df = pd.DataFrame(
    {
        "Positive (1)": [f"TP={metrics['tp']}", f"FP={metrics['fn']}"],
        "Negative (0)": [f"FN={metrics['fp']}", f"TN={metrics['tn']}"],
    },
    index=["Positive(1)", "Negative(0)"],
)
cm_df.index.name = "Actual"
cm_df.columns.name = "Predicted"

# Show the confusion matrix
cm_df

## 6. Classification report

In [None]:
# Import the classification_report method from sklearn
from sklearn.metrics import classification_report

# Predict classes using testing data
y_predict_classes = model.predict_classes(X_test, batch_size=1000)

# Display classification report
print(classification_report(y_predict_classes, y_test))