In [1]:
import os
import json
import time
import datetime
import requests
import pandas as pd
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
# use while-loop to iterate through a range of dates in a url to pull articles from each day. Use time.sleep to pause the loop every 4 seconds due api restrictions, api only allows a request for every 3 seconds.
start_date = datetime.date(2020, 12, 18)
end_date = datetime.date(2021, 1, 18)
delta = datetime.timedelta(days=1)
articles=[]
while start_date <= end_date:
    # print(start_date)
    gnews_api = os.environ["gnews_api"]
    gnews_url =f"https://gnews.io/api/v4/search?q=bitcoin&in=cryptocurrency&from={start_date}T00:01:36Z&to={start_date}T23:59:36Z&lang=en&token={gnews_api}"
    response = requests.get(gnews_url)
    data = response.json()
    articles.append(data)
    start_date += delta
    time.sleep(4)

In [20]:
# convert json to dataframe pulling only "publishAt", "title", "description", and "content"
articles_df = pd.json_normalize(articles, record_path=['articles'], meta='totalArticles')
articles_df['title&description']= articles_df['title']+ " " +articles_df['description']
articles_df = articles_df[['publishedAt', 'title&description', 'totalArticles']]
articles_df.rename(columns={'publishedAt':'date'}, inplace=True)
articles_df.head()

Unnamed: 0,date,title&description,totalArticles
0,2020-12-18T23:55:00Z,How to invest in bitcoin: The major ways to bu...,18
1,2020-12-18T22:30:35Z,Which Bitcoin Fund Should You Buy? Bitcoin is ...,18
2,2020-12-18T17:52:00Z,Bitcoin's market cap could hit $1 trillion in ...,18
3,2020-12-18T16:58:10Z,New bitcoin investors buying $20 million or mo...,18
4,2020-12-18T16:35:13Z,Forget Gold and Bitcoin. I’d Use the Stock Mar...,18


In [32]:
# clean dataframe and add a column that is composed of "title" and "description". Also group dataframe by "publish date"
articles_df['date']=pd.to_datetime(articles_df['date'], infer_datetime_format=True).dt.date
bitcoin_articles= articles_df.groupby(by=["date",'totalArticles']).sum()
bitcoin_articles.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title&description
date,totalArticles,Unnamed: 2_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...
2020-12-19,6,How will Bitcoin shift the power in the coffee...
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large..."
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...


In [6]:
# Import the libraries for sentiment scoring using Vader
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alhamduliallah/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
# Define two lists to store vader sentiment scoring
y_vader_pred = []
y_vader_prob = []

In [9]:
# Score sentiment of test set using Vader
for text in bitcoin_articles["title&description"]:
    y_vader_prob.append(analyzer.polarity_scores(text)["pos"])
    sentiment_score = analyzer.polarity_scores(text)["compound"]
    if sentiment_score >= 0.1:
        y_vader_pred.append(1)
    else:
        y_vader_pred.append(0)

In [22]:
# add sentiment score to bitcoin_articles dataframe
bitcoin_articles["Sentiment Score"]=y_vader_pred
bitcoin_articles['number of articles']= 
bitcoin_articles.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title&description,Sentiment Score
date,totalArticles,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,1
2020-12-19,6,How will Bitcoin shift the power in the coffee...,1
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",1
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,0
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,1


In [11]:
# import ccxt SDK and get bitcoin historical prices from kraken
import ccxt
kraken_public_key = os.getenv("KRAKEN_PUBLIC_KEY")
kraken_secret_key = os.getenv("KRAKEN_SECRET_KEY")
kraken = ccxt.kraken({"apiKey": kraken_public_key, "secret": kraken_secret_key})

historical_prices = kraken.fetch_ohlcv("BTC/USD","1d")

In [23]:
historical_prices_df = pd.DataFrame(historical_prices, columns=["date", "open", "high", "low", "close", "volume"])
historical_prices_df["date"] = pd.to_datetime(
    historical_prices_df["date"], unit="ms"
)
historical_prices_df.set_index("date",inplace=True)
historical_prices_df.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-02-03,3467.7,3472.9,3388.6,3415.2,1300.82888
2019-02-04,3416.6,3437.8,3396.9,3413.9,1359.629904
2019-02-05,3411.0,3433.7,3397.8,3429.5,1429.386854
2019-02-06,3430.0,3445.0,3337.0,3367.4,2289.491212
2019-02-07,3367.2,3382.4,3348.1,3357.1,1512.883808


In [24]:
# Drop NAs and calculate daily percent return
historical_prices_df['daily_return'] = historical_prices_df['close'].dropna().pct_change()
historical_prices_df['volume change'] = historical_prices_df['volume'].pct_change()
historical_prices_df['spread'] = (historical_prices_df['high'] - historical_prices_df['low'])/historical_prices_df['open']
historical_prices_df.tail()

Unnamed: 0_level_0,open,high,low,close,volume,daily_return,volume change,spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-18,35815.6,37400.0,34754.5,36622.5,7464.626338,0.022506,-0.172243,0.073864
2021-01-19,36622.5,37860.0,35900.0,35925.6,6298.676377,-0.019029,-0.156197,0.053519
2021-01-20,35925.5,36396.7,33374.0,35511.8,9170.190746,-0.011518,0.455892,0.084138
2021-01-21,35511.8,35614.1,30056.1,30832.6,19513.432956,-0.131765,1.12792,0.156511
2021-01-22,30832.5,32844.7,28800.0,32377.7,11897.779434,0.050113,-0.390277,0.131183


In [37]:
combined_df = bitcoin_articles.join(historical_prices_df)
# combined_df['articles'].shift(periods=1) code for shifting articles by 1 day. Hasn't been done yet.
combined_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title&description,open,high,low,close,volume,daily_return,volume change,spread
date,totalArticles,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,22811.8,23317.1,22308.2,23130.5,6050.937552,0.013331,-0.68142,0.044227
2020-12-19,6,How will Bitcoin shift the power in the coffee...,23132.8,24288.2,22800.0,23871.5,6255.127803,0.032036,0.033745,0.064333
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",23871.5,24297.7,23084.9,23480.7,5876.100372,-0.016371,-0.060595,0.050805
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,23480.8,24090.0,21885.6,22716.1,10783.18265,-0.032563,0.835092,0.093881
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,22724.6,23837.0,22354.2,23828.2,5728.007668,0.048956,-0.468802,0.065251


In [None]:
# analyzing tone of bitcoin articles with ibm_watson tone analyzer SDK
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

ibm_key = os.getenv("ibm_key")
authenticator = IAMAuthenticator(ibm_key)
tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    authenticator=authenticator
)

tone_analyzer.set_service_url('https://api.us-east.tone-analyzer.watson.cloud.ibm.com')
tone_analyzer.set_disable_ssl_verification(True)

In [None]:
tone_analyzed = []
for text in combined_df["Title & Description"]:
    tone_analysis = tone_analyzer.tone(
    {'text': text},
    content_type='application/json',
    sentences=False).get_result()
    tone_analyzed.append(tone_analysis)

In [None]:
# working on figuring out how to convert tone_analyzed into sparse matrix that will be merged with combined_df so we evaluate extra features.