# Predicting Change in Daily Bitcoin Prices using Social Media Interest Measures

Here, we train a classifier to use Daily twitter sentiment scores, daily reddit sentiment scores, and daily google trend scores to categorize the change in bitcoin prices into one of 5 categories: highly negative (-2), moderately negative (-1), no commendable change (0), moderately positive (1), highly negative (2)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Prepare Data

In [None]:
#Import Data

bitcoin = pd.read_csv('categorical_bitcoin_price.csv', sep='\t', index_col=0)
reddit = pd.read_csv('redditDailySentiment.csv', sep='\t', index_col=None)
trends= pd.read_csv('trends_daily_score.csv', sep='\t', index_col=0)
twitter = pd.read_csv('twitterSentiment.csv', sep=',', index_col=None)

In [None]:
bitcoin.head()

Unnamed: 0,Date,open,high,low,close,volume,Diff_Open,Diff_Category
0,2021-01-01,28923.63,29600.0,28624.57,29331.69,54182.925011,,0
1,2021-01-02,29331.7,33300.0,28946.53,32178.33,129993.873362,408.07,0
2,2021-01-03,32176.45,34778.11,31962.99,33000.05,120957.56675,2844.75,1
3,2021-01-04,33000.05,33600.0,28130.0,31988.71,140899.88569,823.6,1
4,2021-01-05,31989.75,34360.0,29900.0,33949.53,116049.997038,-1010.3,-1


In [None]:
reddit.head()

Unnamed: 0,Date,neg,neu,pos,compound
0,2021-01-01,0.05239,0.771753,0.172513,0.238452
1,2021-01-02,0.057414,0.811156,0.121446,0.135624
2,2021-01-03,0.063241,0.78381,0.148659,0.152421
3,2021-01-04,0.061042,0.815152,0.121297,0.167221
4,2021-01-05,0.06179,0.801428,0.135118,0.147205


In [None]:
trends.head()

Unnamed: 0,Date,avg_score,weighted_score
0,2021-01-01,12.660625,14.447561
1,2021-01-02,34.144375,37.360744
2,2021-01-03,48.853125,53.128124
3,2021-01-04,32.544375,35.352064
4,2021-01-05,24.34125,26.472194


In [None]:
twitter.head()

Unnamed: 0,Day,neg,neu,pos,compound
0,2021-02-05,0.030948,0.892471,0.076579,0.125674
1,2021-02-06,0.028127,0.896185,0.07568,0.129964
2,2021-02-07,0.028718,0.890963,0.080317,0.13965
3,2021-02-08,0.025719,0.887855,0.086424,0.157384
4,2021-02-09,0.027866,0.890614,0.081516,0.149618


In [None]:
#Rename merging column ("Date")
reddit = reddit.rename(columns={"Day": "Date"})
trends = trends.rename(columns={"date": "Date"})
twitter = twitter.rename(columns={"Day": "Date"})

In [None]:
#Combine Data into dataframe

data = pd.DataFrame()
data["Date"] = bitcoin["Date"]
data = data.merge(twitter, on='Date')
data = data.merge(reddit, on='Date')
data = data.merge(trends, on='Date')
data = data.merge(bitcoin, on='Date')
data.head()

Unnamed: 0,Date,neg_x,neu_x,pos_x,compound_x,neg_y,neu_y,pos_y,compound_y,avg_score,weighted_score,open,high,low,close,volume,Diff_Open,Diff_Category
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,51.557635,38795.69,46794.45,37988.89,46374.87,138597.536914,-385.32,0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,67.025286,46374.86,48142.19,44961.09,46420.42,115499.861712,7579.17,2
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,37.993114,46420.42,47310.0,43727.0,44807.58,97154.1822,45.56,0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,19.782415,47298.15,48150.0,46202.53,47153.69,63768.097399,-670.51,-1
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,28.54808,47156.78,49707.43,47014.17,48577.79,73735.475533,-141.37,0


In [None]:
#Rename columns, drop unnecessary columns
data = data.drop(['weighted_score', 'high', 'low','close','volume'],axis=1)
data = data.rename(columns={'neg_x':'neg_twitter', 'neu_x':'neu_twitter','pos_x':'pos_twitter','compound_x':'compound_twitter','neg_y':'neg_reddit', 'neu_y':'neu_reddit','pos_y':'pos_reddit','compound_y':'compound_reddit','avg_score':'avg_trend','Diff_Open':'diff_open','Diff_Category':'label'})
data.head()

Unnamed: 0,Date,neg_twitter,neu_twitter,pos_twitter,compound_twitter,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,38795.69,-385.32,0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,46374.86,7579.17,2
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,46420.42,45.56,0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,47298.15,-670.51,-1
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,47156.78,-141.37,0


In [None]:
data['label'].value_counts()

 1    20
-1    18
 0    12
 2     2
Name: label, dtype: int64

In [None]:
data.columns

Index(['Date', 'neg_twitter', 'neu_twitter', 'pos_twitter', 'compound_twitter',
       'neg_reddit', 'neu_reddit', 'pos_reddit', 'compound_reddit',
       'avg_trend', 'open', 'diff_open', 'label'],
      dtype='object')

In [None]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_twitter','compound_reddit','avg_trend']]
y = data[['label']]

#we need to use the y value for the NEXT day as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## Training the Random Forest Classifier

Reference: https://www.geeksforgeeks.org/random-forest-classifier-using-scikit-learn/

In [None]:
#Train Model
clf = RandomForestClassifier(n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
#Reference: https://scikit-learn.org/stable/modules/model_evaluation.html

#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.09090909090909091
F1-Score:  0.04958677685950413
Precision:  0.03409090909090909
Recall:  0.09090909090909091


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
y_pred

array([ 1,  1, -1, -1, -1,  0, -1, -1, -1, -1, -1])

## Training a Support Vector Classifier

Reference: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [None]:
#Train Model
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.36363636363636365
F1-Score:  0.19393939393939394
Precision:  0.1322314049586777
Recall:  0.36363636363636365


  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree Classifier

In [None]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.36363636363636365
F1-Score:  0.3590909090909091
Precision:  0.3727272727272727
Recall:  0.36363636363636365


  _warn_prf(average, modifier, msg_start, len(result))


# 3-way Classification

Here, we attempt to categorize change in bitcoin prices to one of 3 categories: negative change (-1), no change (0), positive change (1)

In [None]:
data.head()

Unnamed: 0,Date,neg_twitter,neu_twitter,pos_twitter,compound_twitter,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,38795.69,-385.32,0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,46374.86,7579.17,2
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,46420.42,45.56,0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,47298.15,-670.51,-1
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,47156.78,-141.37,0


In [None]:
#Transform Label
data.loc[data['label'] >0, 'label_3'] = 1  
data.loc[data['label'] <0, 'label_3'] = -1  
data.loc[data['label'] ==0, 'label_3'] = 0

data.head()

Unnamed: 0,Date,neg_twitter,neu_twitter,pos_twitter,compound_twitter,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label,label_3
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,38795.69,-385.32,0,0.0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,46374.86,7579.17,2,1.0
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,46420.42,45.56,0,0.0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,47298.15,-670.51,-1,-1.0
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,47156.78,-141.37,0,0.0


In [None]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_twitter','compound_reddit','avg_trend']]
y = data[['label_3']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [None]:
#Train Decision Tree
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.2727272727272727
F1-Score:  0.2621015348288076
Precision:  0.33116883116883117
Recall:  0.2727272727272727



# Predicting Change in Hourly Bitcoin Prices using Social Media Interest Measures
Here, we train a classifier to use Daily hourly sentiment scores, hourly reddit sentiment scores, and hourly google trend scores to categorize the change in bitcoin prices into one of 3 categories: negative (-1), no commendable change (0), positive (1)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Prepare Data

In [2]:
#Import Data

bitcoin = pd.read_csv('categorical_bitcoin_hourly.csv', sep='\t', index_col=None)
reddit = pd.read_csv('redditHourlySentiment.csv', sep='\t', index_col=None)
trends= pd.read_csv('trends_hourly_score.csv', sep='\t', index_col=None)
twitter = pd.read_csv('twitterSentiment_hourly.csv', sep=',', index_col=None)

In [3]:
bitcoin.tail()

Unnamed: 0,date,open,high,low,close,Volume BTC,Volume USD,Diff_Open,Diff_Category
5827,2021-01-01 04:00:00,29351.85,29458.27,29121.8,29290.38,325.5657,9535943.0,57.27,0
5828,2021-01-01 03:00:00,29249.33,29414.48,29216.61,29345.63,284.146568,8338460.0,-102.52,0
5829,2021-01-01 02:00:00,29484.39,29535.95,29189.34,29249.33,290.242637,8489403.0,235.06,1
5830,2021-01-01 01:00:00,29070.66,29543.93,29010.36,29479.12,648.597652,19120090.0,-413.73,-1
5831,2021-01-01 00:00:00,28999.63,29099.0,28774.64,29056.94,397.964775,11563640.0,-71.03,0


In [4]:
reddit = reddit.rename(columns={'Hour': 'date'})
reddit.tail()

Unnamed: 0,date,neg,neu,pos,compound
1547,2021-08-30 23:00:00,0.019,0.862,0.119,0.7579
1548,2021-08-31 00:00:00,0.019,0.862,0.119,0.7579
1549,2021-08-31 01:00:00,0.019,0.862,0.119,0.7579
1550,2021-08-31 02:00:00,0.019,0.862,0.119,0.7579
1551,2021-08-31 03:00:00,0.019,0.862,0.119,0.7579


In [5]:
trends.tail()

Unnamed: 0,date,bitcoin,btc,bitcoin price,bitcoin kurs,bitcoin usd,bitcoin stock,bitcoin dollar,bitcoin euro,buy bitcoin,buy btc,btc usd,btc inr,price btc,btc stock,btc coin,btc euro,avg_score
5827,2021-08-31 19:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5828,2021-08-31 20:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5829,2021-08-31 21:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5830,2021-08-31 22:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5831,2021-08-31 23:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875


In [9]:
twitter.tail(20)

Unnamed: 0,Hour,neg,neu,pos,compound
1971,2021-11-19 15:00:00,0.032943,0.864558,0.102496,0.213237
1972,2021-11-19 16:00:00,0.026469,0.872561,0.100972,0.236397
1973,2021-11-19 17:00:00,0.032144,0.852734,0.115126,0.258893
1974,2021-11-19 18:00:00,0.040772,0.854975,0.10425,0.179953
1975,2021-11-19 19:00:00,0.031683,0.865458,0.102859,0.233449
1976,2021-11-19 20:00:00,0.028347,0.871458,0.100195,0.241766
1977,2021-11-19 21:00:00,0.028735,0.879008,0.092264,0.215446
1978,2021-11-19 22:00:00,0.033254,0.865578,0.101166,0.244083
1979,2021-11-19 23:00:00,0.026742,0.875645,0.097604,0.228104
1980,"['Airdrop', 'Airdrops', 'Airdropinspector', 'B...",0.0,1.0,0.0,0.0


In [15]:
twitter = twitter.iloc[:1980,:]
twitter = twitter.rename(columns={'Hour':'date'})

In [16]:
twitter.tail()

Unnamed: 0,date,neg,neu,pos,compound
1975,2021-11-19 19:00:00,0.031683,0.865458,0.102859,0.233449
1976,2021-11-19 20:00:00,0.028347,0.871458,0.100195,0.241766
1977,2021-11-19 21:00:00,0.028735,0.879008,0.092264,0.215446
1978,2021-11-19 22:00:00,0.033254,0.865578,0.101166,0.244083
1979,2021-11-19 23:00:00,0.026742,0.875645,0.097604,0.228104


In [14]:
print(bitcoin.shape[0])
print(trends.shape[0])
print(reddit.shape[0])
print(twitter.shape[0])

5832
5832
1552
1980


In [55]:
#Combine Data into dataframe

data = pd.DataFrame()
data["date"] = bitcoin["date"]
data = data.merge(bitcoin[['date','Diff_Category']], on='date')
data = data.merge(trends[['date','avg_score']], on='date', how='outer')
data = data.merge(reddit[['date','compound']], on='date', how='inner')
data = data.merge(twitter[['date','compound']], on='date', how='inner')

data.tail()

Unnamed: 0,date,Diff_Category,avg_score,compound_x,compound_y
550,2021-02-09 04:00:00,-1,65.42875,0.169034,0.177088
551,2021-02-08 20:00:00,-1,50.626875,0.137459,0.17137
552,2021-02-08 03:00:00,1,50.626875,0.176463,0.182238
553,2021-02-08 02:00:00,1,50.626875,0.133948,0.117205
554,2021-02-08 01:00:00,1,50.626875,0.053669,0.106501


In [56]:
#trying with all attributes
#GIVES WORSE PERFORMANCE- DO NOT USE
'''
data = pd.DataFrame()
data["date"] = bitcoin["date"]
data = data.merge(bitcoin[['date','Diff_Category']], on='date')
data = data.merge(trends, on='date', how='outer')
data = data.merge(reddit, on='date', how='inner')
data = data.merge(twitter, on='date', how='inner')
data.tail()
'''

'\ndata = pd.DataFrame()\ndata["date"] = bitcoin["date"]\ndata = data.merge(bitcoin[[\'date\',\'Diff_Category\']], on=\'date\')\ndata = data.merge(trends, on=\'date\', how=\'outer\')\ndata = data.merge(reddit, on=\'date\', how=\'inner\')\ndata = data.merge(twitter, on=\'date\', how=\'inner\')\ndata.tail()\n'

In [43]:
#data = data.fillna(0)
#data.tail()

In [57]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_x','compound_y','avg_score']]
y = data[['Diff_Category']]

#we need to use the y value for the NEXT hour as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=7)

In [58]:
#trying with all attributes
#get training and test data
'''
cols = data.columns
X = data[cols[2:]]
y = data[['Diff_Category']]

#we need to use the y value for the NEXT hour as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=7)
'''

"\ncols = data.columns\nX = data[cols[2:]]\ny = data[['Diff_Category']]\n\n#we need to use the y value for the NEXT hour as the classification variable\nX = X.iloc[:-1,:]\ny= y.iloc[1:,:]\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=7)\n"

## Training a Decision Tree Classifier

In [59]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [60]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.4594594594594595
F1-Score:  0.3639768520922193
Precision:  0.45849928876244667
Recall:  0.4594594594594595


## Training a Random Forrest Classifier

In [61]:
#Train Model
clf = RandomForestClassifier(n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.32432432432432434
F1-Score:  0.31132645803698433
Precision:  0.3266975181868799
Recall:  0.32432432432432434


  This is separate from the ipykernel package so we can avoid doing imports until


## Training an SVM Classifier

In [62]:
#Train Model
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)


#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.3963963963963964
F1-Score:  0.26215824912784486
Precision:  0.2056829556829557
Recall:  0.3963963963963964


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
