In [1]:
import os
import json
import time
import datetime
import requests
import pandas as pd
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
# use while-loop to iterate through a range of dates in a url to pull articles from each day. Use time.sleep to pause the loop every 4 seconds due api restrictions, api only allows a request for every 3 seconds.
start_date = datetime.date(2020, 12, 18)
end_date = datetime.date(2021, 1, 18)
delta = datetime.timedelta(days=1)
articles=[]
while start_date <= end_date:
    # print(start_date)
    gnews_api = os.environ["gnews_api"]
    gnews_url =f"https://gnews.io/api/v4/search?q=bitcoin&in=cryptocurrency&from={start_date}T00:01:36Z&to={start_date}T23:59:36Z&lang=en&token={gnews_api}"
    response = requests.get(gnews_url)
    data = response.json()
    articles.append(data)
    start_date += delta
    time.sleep(4)

In [2]:
# convert json to dataframe pulling only "publishAt", "title", "description", and "content"
articles_df = pd.json_normalize(articles, record_path=['articles'], meta='totalArticles')
articles_df['title&description']= articles_df['title']+ " " +articles_df['description']
articles_df = articles_df[['publishedAt', 'title&description', 'totalArticles']]
articles_df.rename(columns={'publishedAt':'date'}, inplace=True)
articles_df.head()

NameError: name 'articles' is not defined

In [3]:
# clean dataframe and add a column that is composed of "title" and "description". Also group dataframe by "publish date"
articles_df['date']=pd.to_datetime(articles_df['date'], infer_datetime_format=True).dt.date
bitcoin_articles= articles_df.groupby(by=["date",'totalArticles']).sum()
bitcoin_articles.head()

NameError: name 'articles_df' is not defined

In [7]:
# Import the libraries for sentiment scoring using Vader
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alhamduliallah/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [9]:
# Define two lists to store vader sentiment scoring
y_vader_pred = []
y_vader_prob = []

In [10]:
# Score sentiment of test set using Vader
for text in bitcoin_articles["title&description"]:
    y_vader_prob.append(analyzer.polarity_scores(text)["pos"])
    sentiment_score = analyzer.polarity_scores(text)["compound"]
    if sentiment_score >= 0.1:
        y_vader_pred.append(1)
    else:
        y_vader_pred.append(0)

In [19]:
# add sentiment score to bitcoin_articles dataframe
bitcoin_articles["Sentiment Score"]=y_vader_pred
bitcoin_articles.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title&description,Sentiment Score
date,totalArticles,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,1
2020-12-19,6,How will Bitcoin shift the power in the coffee...,1
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",1
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,0
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,1


In [12]:
# import ccxt SDK and get bitcoin historical prices from kraken
import ccxt
kraken_public_key = os.getenv("KRAKEN_PUBLIC_KEY")
kraken_secret_key = os.getenv("KRAKEN_SECRET_KEY")
kraken = ccxt.kraken({"apiKey": kraken_public_key, "secret": kraken_secret_key})

historical_prices = kraken.fetch_ohlcv("BTC/USD","1d")

In [13]:
historical_prices_df = pd.DataFrame(historical_prices, columns=["date", "open", "high", "low", "close", "volume"])
historical_prices_df["date"] = pd.to_datetime(
    historical_prices_df["date"], unit="ms"
)
historical_prices_df.set_index("date",inplace=True)
historical_prices_df.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-02-04,3416.6,3437.8,3396.9,3413.9,1359.629904
2019-02-05,3411.0,3433.7,3397.8,3429.5,1429.386854
2019-02-06,3430.0,3445.0,3337.0,3367.4,2289.491212
2019-02-07,3367.2,3382.4,3348.1,3357.1,1512.883808
2019-02-08,3359.5,3704.9,3341.4,3622.1,4231.506468


In [14]:
# Drop NAs and calculate daily percent return
historical_prices_df['daily_return'] = historical_prices_df['close'].dropna().pct_change()
historical_prices_df['volume change'] = historical_prices_df['volume'].pct_change().shift(-1)
historical_prices_df['spread'] = (historical_prices_df['high'] - historical_prices_df['low'])/historical_prices_df['open']
historical_prices_df.head
()

Unnamed: 0_level_0,open,high,low,close,volume,daily_return,volume change,spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-19,36622.5,37860.0,35900.0,35925.6,6298.676377,-0.019029,-0.156197,0.053519
2021-01-20,35925.5,36396.7,33374.0,35511.8,9170.190746,-0.011518,0.455892,0.084138
2021-01-21,35511.8,35614.1,30056.1,30832.6,19513.432956,-0.131765,1.12792,0.156511
2021-01-22,30832.5,33850.0,28800.0,33000.0,15251.549792,0.070296,-0.218408,0.163788
2021-01-23,33000.1,33496.4,31350.0,32069.4,4710.182129,-0.0282,-0.691167,0.065042


In [20]:
combined_df = bitcoin_articles.join(historical_prices_df)
combined_df['articles'] = combined_df['articles'].shift(-1)
combined_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title&description,Sentiment Score,open,high,low,close,volume,daily_return,volume change,spread
date,totalArticles,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,1,22811.8,23317.1,22308.2,23130.5,6050.937552,0.013331,-0.68142,0.044227
2020-12-19,6,How will Bitcoin shift the power in the coffee...,1,23132.8,24288.2,22800.0,23871.5,6255.127803,0.032036,0.033745,0.064333
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",1,23871.5,24297.7,23084.9,23480.7,5876.100372,-0.016371,-0.060595,0.050805
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,0,23480.8,24090.0,21885.6,22716.1,10783.18265,-0.032563,0.835092,0.093881
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,1,22724.6,23837.0,22354.2,23828.2,5728.007668,0.048956,-0.468802,0.065251


In [21]:
# analyzing tone of bitcoin articles with ibm_watson tone analyzer SDK
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

ibm_key = os.getenv("ibm_key")
authenticator = IAMAuthenticator(ibm_key)
tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    authenticator=authenticator
)

tone_analyzer.set_service_url('https://api.us-east.tone-analyzer.watson.cloud.ibm.com')
tone_analyzer.set_disable_ssl_verification(True)

In [22]:
tone_analyzed = []
for text in combined_df["title&description"]:
    tone_analysis = tone_analyzer.tone(
    {'text': text},
    content_type='application/json').get_result()
    tone_analyzed.append(tone_analysis)

In [23]:
# working on figuring out how to convert tone_analyzed into sparse matrix that will be merged with combined_df so we evaluate extra features.
tone=[]
for text in tone_analyzed:
    dic={}
    tone.append(dic)
    for emotions in text["document_tone"]["tones"]:
        dic.update({emotions['tone_id']:emotions['score']})
data = pd.DataFrame(tone)
data.fillna(0,inplace=True)
data.head()


Unnamed: 0,joy,fear,sadness,tentative,analytical
0,0.590572,0.592034,0.534758,0.515201,0.0
1,0.529512,0.0,0.0,0.68286,0.0
2,0.565506,0.0,0.508825,0.594668,0.0
3,0.589725,0.0,0.0,0.759095,0.0
4,0.0,0.0,0.578295,0.716654,0.559117


In [24]:
combined_df.reset_index(inplace=True)
all_df = combined_df.join(data)
all_df.set_index("date",inplace=True)


In [27]:
all_df.head()

Unnamed: 0_level_0,totalArticles,title&description,Sentiment Score,open,high,low,close,volume,daily_return,volume change,spread,joy,fear,sadness,tentative,analytical
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-12-18,18,How to invest in bitcoin: The major ways to bu...,1,22811.8,23317.1,22308.2,23130.5,6050.937552,0.013331,-0.68142,0.044227,0.590572,0.592034,0.534758,0.515201,0.0
2020-12-19,6,How will Bitcoin shift the power in the coffee...,1,23132.8,24288.2,22800.0,23871.5,6255.127803,0.032036,0.033745,0.064333,0.529512,0.0,0.0,0.68286,0.0
2020-12-20,14,"Tesla's Elon Musk asks about converting ""large...",1,23871.5,24297.7,23084.9,23480.7,5876.100372,-0.016371,-0.060595,0.050805,0.565506,0.0,0.508825,0.594668,0.0
2020-12-21,13,Ripple to face SEC suit over XRP cryptocurrenc...,0,23480.8,24090.0,21885.6,22716.1,10783.18265,-0.032563,0.835092,0.093881,0.589725,0.0,0.0,0.759095,0.0
2020-12-22,10,SEC Sues Ripple Over XRP Cryptocurrency The ag...,1,22724.6,23837.0,22354.2,23828.2,5728.007668,0.048956,-0.468802,0.065251,0.0,0.0,0.578295,0.716654,0.559117


In [28]:
all_df.to_csv('sentiment_prices_tone_text.csv')