# Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re


%matplotlib inline

In [2]:
# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lendl\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Import CSV files

In [3]:
# Get Tweets data
tweets = pd.read_csv('../Data/texts.csv', infer_datetime_format=True, parse_dates=True)

# Convert timestamp to datetime
tweets['timestamp'] = pd.to_datetime(tweets['timestamp'], unit='ms', infer_datetime_format=True)
tweets = tweets.rename(columns={'text':'Tweets'})
tweets.head()

Unnamed: 0,id,timestamp,Tweets
0,1432832769269379000,2021-08-31 22:28:47,With the end of August already here\t it’s nic...
1,1432809586763964400,2021-08-31 20:56:40,#BNB https://t.co/eCnAd8JEi8
2,1432790221838499800,2021-08-31 19:39:43,So #BTC is cool\t but maybe you want something...
3,1432753822858989600,2021-08-31 17:15:05,Missed one of the #Binance Responsible Trading...
4,1432730654471557000,2021-08-31 15:43:01,$500 can be yours over on Featured by #Binance...


In [4]:
btcusd = pd.read_csv('../Data/btcusd.csv')
btcusd.head()

# Convert 'Date' to datetime
btcusd['Date'] = btcusd['Date'].astype('datetime64[ns]') 

# Drop columns and keep only 'Close'
btcusd = btcusd.drop(['Open', 'High','Low','Adj Close','Volume'], axis=1)
btcusd.set_index("Date", inplace=True)

# Data Preprocessing

In [5]:
# Create function to clean tweets (remove @mentions, #, RT, Hyperlinks, and Symbols)
def cleanText(tweet):
  tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet) 
  tweet = re.sub(r'#', '', tweet)
  tweet = re.sub(r'RT[\s]+', '', tweet)
  tweet = re.sub(r'https?:\/\/\S+', '', tweet)
  tweet = re.sub('\\t', '', tweet)

  return tweet

# Clean tweets
tweets["Tweets"] = tweets['Tweets'].apply(cleanText)
tweets


Unnamed: 0,id,timestamp,Tweets
0,1432832769269379000,2021-08-31 22:28:47,With the end of August already here it’s nice ...
1,1432809586763964400,2021-08-31 20:56:40,BNB
2,1432790221838499800,2021-08-31 19:39:43,So BTC is cool but maybe you want something el...
3,1432753822858989600,2021-08-31 17:15:05,Missed one of the Binance Responsible Trading ...
4,1432730654471557000,2021-08-31 15:43:01,$500 can be yours over on Featured by Binance
...,...,...,...
532,1430518089746112500,2021-08-25 13:11:04,OnlyFans CEO Blames Banks for Scrapping Adult ...
533,1430398366823985200,2021-08-25 05:15:20,Tezos Strikes Three Key Partnerships Targeting...
534,1429414195578163200,2021-08-22 12:04:36,How Hackers and Cybercriminals launder Crypto ...
535,1428928505069326300,2021-08-21 03:54:38,US Mayor Wants to Give Every Resident $1000 in...


In [6]:
# Filter and grab tweets related to Bitcoin, Ethereum
btc_tweets = tweets[tweets['Tweets'].str.contains("BTC", "Bitcoin")]
eth_tweets = tweets[tweets['Tweets'].str.contains('ETH', 'Ethereum')]
crypto_tweets = tweets[tweets['Tweets'].str.contains('crypto', 'cryptocurrency')]

In [7]:
print(f"Bitcoin has {btc_tweets.shape[0]} rows of tweets")
print(f"Ethereum has {eth_tweets.shape[0]} rows of tweets")
print(f"Crypto has {crypto_tweets.shape[0]} rows of tweets")


Bitcoin has 38 rows of tweets
Ethereum has 17 rows of tweets
Crypto has 64 rows of tweets


# Sentiment Analysis

In [8]:
# Create a Bitcoin sentiment scores Dataframe
bitcoin_sentiments = []
analyzer = SentimentIntensityAnalyzer()

for tweets in btc_tweets["Tweets"]:
    text = tweets
    sentiment = analyzer.polarity_scores(text)
    compound = sentiment["compound"]
    pos = sentiment["pos"]
    neu = sentiment["neu"]
    neg = sentiment["neg"]
        
    bitcoin_sentiments.append({
        "Tweet": text,
        "Compound": compound,
        "Positive": pos,
        "Negative": neg,
        "Neutral": neu,
    })

# Grab dates
btc_dates = []
for d in btc_tweets["timestamp"]:
    date = d

    btc_dates.append({
        "Date":d
    })

# Create the sentiment and date dataframe
btc_dates_df = pd.DataFrame(btc_dates)        

btc_sentiment_df = pd.DataFrame(bitcoin_sentiments)
cols = ["Compound","Negative","Neutral","Positive","Tweet"]
btc_sentiment_df = btc_sentiment_df[cols]

# Join the date and sentiment dataframe
btc_sentiment_df = pd.concat([btc_sentiment_df, btc_dates_df], join='outer', axis=1)

# Normalize the 'Date' and remove the time component
btc_sentiment_df['Date'] = pd.to_datetime(btc_sentiment_df['Date']).dt.normalize()
btc_sentiment_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Tweet,Date
0,0.6986,0.0,0.715,0.285,So BTC is cool but maybe you want something el...,2021-08-31
1,0.4588,0.0,0.87,0.13,: The August $BTC option expiry is shaping up ...,2021-08-27
2,0.9855,0.016,0.83,0.153,announced today that its broker-dealer MML In...,2021-08-20
3,0.9672,0.03,0.845,0.125,As of August the firm’s data center had a mini...,2021-08-31
4,0.0,0.0,1.0,0.0,Canaan has sold Genesis 20K bitcoin miners and...,2021-08-31


In [9]:
# Create ETHEREUM sentiment scores Dataframe
eth_sentiments = []
analyzer = SentimentIntensityAnalyzer()

for tweets in eth_tweets["Tweets"]:
    text = tweets
    sentiment = analyzer.polarity_scores(text)
    compound = sentiment["compound"]
    pos = sentiment["pos"]
    neu = sentiment["neu"]
    neg = sentiment["neg"]
        
    eth_sentiments.append({
        "Tweet": text,
        "Compound": compound,
        "Positive": pos,
        "Negative": neg,
        "Neutral": neu,
    })

# Grab dates
dates = []
for d in eth_tweets["timestamp"]:
    date = d

    dates.append({
        "Date":d
    })

# Create the sentiment and date dataframe
eth_dates = pd.DataFrame(dates)

eth_sentiment_df = pd.DataFrame(eth_sentiments)
cols = ["Compound","Negative","Neutral","Positive","Tweet"]
eth_sentiment_df = eth_sentiment_df[cols]

# Join the date and sentiment dataframe
eth_sentiment_df = pd.concat([eth_sentiment_df, eth_dates], join='outer', axis=1)

# Normalize the 'Date' and remove the time component
eth_sentiment_df['Date'] = pd.to_datetime(eth_sentiment_df['Date']).dt.normalize()
eth_sentiment_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Tweet,Date
0,0.6808,0.0,0.732,0.268,Is it time to ETH? There is growing confidence...,2021-08-31
1,-0.2263,0.193,0.711,0.096,Costly Mistake: Mutant Ape NFT Sold for 17 USD...,2021-08-31
2,-0.1027,0.085,0.915,0.0,Leading Ethereum Developer Proposes Date for M...,2021-07-07
3,0.0,0.0,1.0,0.0,SHIB Becomes ETH’s 3rd Largest Consumer of Gas...,2021-07-07
4,-0.4215,0.141,0.859,0.0,Ethereum’s Max. Price Lies in the Range of $75...,2021-07-07


# Correlate Score and Returns

In [10]:
# Isolate the score and date of the bitcoin sentiment df
btc_score = btc_sentiment_df[['Compound','Date']].copy()
btc_score.head()

Unnamed: 0,Compound,Date
0,0.6986,2021-08-31
1,0.4588,2021-08-27
2,0.9855,2021-08-20
3,0.9672,2021-08-31
4,0.0,2021-08-31


In [11]:
# Aggregate daily sentiment score
btc_daily_score = btc_score.groupby(["Date"]).sum()
btc_daily_score.head()

Unnamed: 0_level_0,Compound
Date,Unnamed: 1_level_1
2020-01-24,0.3818
2021-02-05,0.0
2021-02-16,0.0
2021-02-25,0.0
2021-03-01,0.0


In [12]:
# Calculate Bitcoin daily returns
btcusd['Returns'] = btcusd['Close']/btcusd['Close'].shift(1) - 1
btcusd.head()

Unnamed: 0_level_0,Close,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-31,11680.820313,
2020-09-01,11970.478516,0.024798
2020-09-02,11414.03418,-0.046485
2020-09-03,10245.296875,-0.102395
2020-09-04,10511.813477,0.026014


In [13]:
# Check relationship between sentiment score and returns
# Lag the sentiment score and shift the score down 1
btc_daily_score['Previous Score'] = btc_daily_score.shift(1)
btc_daily_score.head()

Unnamed: 0_level_0,Compound,Previous Score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-24,0.3818,
2021-02-05,0.0,0.3818
2021-02-16,0.0,0.0
2021-02-25,0.0,0.0
2021-03-01,0.0,0.0


In [14]:
# Merge the daily returns to the lagged sentiment scores
btcreturnsScore = pd.merge( btc_daily_score[['Previous Score']], btcusd[['Returns']], left_index=True, right_index=True, how='left')

In [15]:
# Clean the data
btcreturnsScore["Returns shift"] = btcreturnsScore["Returns"].shift(3)
btcreturnsScore.fillna(0, inplace=True)

In [16]:
# Calculate correlation
btcreturnsScore['Returns shift'].corr(btcreturnsScore['Previous Score'])

-0.21870085377471477

# Scale Sentiment Score

In [17]:
eth_sentiment_scaled = eth_sentiment_df["Compound"].apply(lambda x: int(((x*100)+100)/2))
eth_sentiment_scaled.head()

0    84
1    38
2    44
3    50
4    28
Name: Compound, dtype: int64

In [18]:
# Scale the Compound Score to be between 0 and 100
btc_sentiment_scaled = btc_sentiment_df["Compound"].apply(lambda x: int(((x*100)+100)/2))
btc_sentiment_scaled.head()

0    84
1    72
2    99
3    98
4    50
Name: Compound, dtype: int64

In [19]:
# Save btc_sentiment_scaled and eth_sentiment_scaled to csv
btc_sentiment_scaled.to_csv("../Data/btc_sentiment.csv")
eth_sentiment_scaled.to_csv("../Data/eth_sentiment.csv")