# Predicting Change in Daily Bitcoin Prices using Social Media Interest Measures

Here, we train a classifier to use Daily twitter sentiment scores, daily reddit sentiment scores, and daily google trend scores to categorize the change in bitcoin prices into one of 5 categories: highly negative (-2), moderately negative (-1), no commendable change (0), moderately positive (1), highly negative (2)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Prepare Data

In [None]:
#Import Data

bitcoin = pd.read_csv('categorical_bitcoin_price.csv', sep='\t', index_col=0)
reddit = pd.read_csv('redditDailySentiment.csv', sep='\t', index_col=None)
trends= pd.read_csv('trends_daily_score.csv', sep='\t', index_col=0)
twitter = pd.read_csv('twitterSentiment.csv', sep=',', index_col=None)

In [None]:
bitcoin.head()

Unnamed: 0,Date,open,high,low,close,volume,Diff_Open,Diff_Category
0,2021-01-01,28923.63,29600.0,28624.57,29331.69,54182.925011,,0
1,2021-01-02,29331.7,33300.0,28946.53,32178.33,129993.873362,408.07,0
2,2021-01-03,32176.45,34778.11,31962.99,33000.05,120957.56675,2844.75,1
3,2021-01-04,33000.05,33600.0,28130.0,31988.71,140899.88569,823.6,1
4,2021-01-05,31989.75,34360.0,29900.0,33949.53,116049.997038,-1010.3,-1


In [None]:
reddit.head()

Unnamed: 0,Date,neg,neu,pos,compound
0,2021-01-01,0.05239,0.771753,0.172513,0.238452
1,2021-01-02,0.057414,0.811156,0.121446,0.135624
2,2021-01-03,0.063241,0.78381,0.148659,0.152421
3,2021-01-04,0.061042,0.815152,0.121297,0.167221
4,2021-01-05,0.06179,0.801428,0.135118,0.147205


In [None]:
trends.head()

Unnamed: 0,Date,avg_score,weighted_score
0,2021-01-01,12.660625,14.447561
1,2021-01-02,34.144375,37.360744
2,2021-01-03,48.853125,53.128124
3,2021-01-04,32.544375,35.352064
4,2021-01-05,24.34125,26.472194


In [None]:
twitter.head()

Unnamed: 0,Day,neg,neu,pos,compound
0,2021-02-05,0.030948,0.892471,0.076579,0.125674
1,2021-02-06,0.028127,0.896185,0.07568,0.129964
2,2021-02-07,0.028718,0.890963,0.080317,0.13965
3,2021-02-08,0.025719,0.887855,0.086424,0.157384
4,2021-02-09,0.027866,0.890614,0.081516,0.149618


In [None]:
#Rename merging column ("Date")
reddit = reddit.rename(columns={"Day": "Date"})
trends = trends.rename(columns={"date": "Date"})
twitter = twitter.rename(columns={"Day": "Date"})

In [None]:
#Combine Data into dataframe

data = pd.DataFrame()
data["Date"] = bitcoin["Date"]
data = data.merge(twitter, on='Date')
data = data.merge(reddit, on='Date')
data = data.merge(trends, on='Date')
data = data.merge(bitcoin, on='Date')
data.head()

Unnamed: 0,Date,neg_x,neu_x,pos_x,compound_x,neg_y,neu_y,pos_y,compound_y,avg_score,weighted_score,open,high,low,close,volume,Diff_Open,Diff_Category
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,51.557635,38795.69,46794.45,37988.89,46374.87,138597.536914,-385.32,0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,67.025286,46374.86,48142.19,44961.09,46420.42,115499.861712,7579.17,2
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,37.993114,46420.42,47310.0,43727.0,44807.58,97154.1822,45.56,0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,19.782415,47298.15,48150.0,46202.53,47153.69,63768.097399,-670.51,-1
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,28.54808,47156.78,49707.43,47014.17,48577.79,73735.475533,-141.37,0


In [None]:
#Rename columns, drop unnecessary columns
data = data.drop(['weighted_score', 'high', 'low','close','volume'],axis=1)
data = data.rename(columns={'neg_x':'neg_twitter', 'neu_x':'neu_twitter','pos_x':'pos_twitter','compound_x':'compound_twitter','neg_y':'neg_reddit', 'neu_y':'neu_reddit','pos_y':'pos_reddit','compound_y':'compound_reddit','avg_score':'avg_trend','Diff_Open':'diff_open','Diff_Category':'label'})
data.head()

Unnamed: 0,Date,neg_twitter,neu_twitter,pos_twitter,compound_twitter,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,38795.69,-385.32,0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,46374.86,7579.17,2
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,46420.42,45.56,0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,47298.15,-670.51,-1
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,47156.78,-141.37,0


In [None]:
data['label'].value_counts()

 1    20
-1    18
 0    12
 2     2
Name: label, dtype: int64

In [None]:
data.columns

Index(['Date', 'neg_twitter', 'neu_twitter', 'pos_twitter', 'compound_twitter',
       'neg_reddit', 'neu_reddit', 'pos_reddit', 'compound_reddit',
       'avg_trend', 'open', 'diff_open', 'label'],
      dtype='object')

In [None]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_twitter','compound_reddit','avg_trend']]
y = data[['label']]

#we need to use the y value for the NEXT day as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## Training the Random Forest Classifier

Reference: https://www.geeksforgeeks.org/random-forest-classifier-using-scikit-learn/

In [None]:
#Train Model
clf = RandomForestClassifier(n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
#Reference: https://scikit-learn.org/stable/modules/model_evaluation.html

#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.09090909090909091
F1-Score:  0.04958677685950413
Precision:  0.03409090909090909
Recall:  0.09090909090909091


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
y_pred

array([ 1,  1, -1, -1, -1,  0, -1, -1, -1, -1, -1])

## Training a Support Vector Classifier

Reference: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [None]:
#Train Model
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.36363636363636365
F1-Score:  0.19393939393939394
Precision:  0.1322314049586777
Recall:  0.36363636363636365


  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree Classifier

In [None]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.36363636363636365
F1-Score:  0.3590909090909091
Precision:  0.3727272727272727
Recall:  0.36363636363636365


  _warn_prf(average, modifier, msg_start, len(result))


# 3-way Classification

Here, we attempt to categorize change in bitcoin prices to one of 3 categories: negative change (-1), no change (0), positive change (1)

In [None]:
data.head()

Unnamed: 0,Date,neg_twitter,neu_twitter,pos_twitter,compound_twitter,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,38795.69,-385.32,0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,46374.86,7579.17,2
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,46420.42,45.56,0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,47298.15,-670.51,-1
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,47156.78,-141.37,0


In [None]:
#Transform Label
data.loc[data['label'] >0, 'label_3'] = 1  
data.loc[data['label'] <0, 'label_3'] = -1  
data.loc[data['label'] ==0, 'label_3'] = 0

data.head()

Unnamed: 0,Date,neg_twitter,neu_twitter,pos_twitter,compound_twitter,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label,label_3
0,2021-02-08,0.025719,0.887855,0.086424,0.157384,0.065759,0.796466,0.137774,0.131102,50.626875,38795.69,-385.32,0,0.0
1,2021-02-09,0.027866,0.890614,0.081516,0.149618,0.063627,0.804396,0.129117,0.159778,65.42875,46374.86,7579.17,2,1.0
2,2021-02-10,0.028736,0.885619,0.085641,0.158076,0.059693,0.800772,0.137013,0.153127,35.38625,46420.42,45.56,0,0.0
3,2021-02-13,0.029296,0.885487,0.085214,0.148188,0.067005,0.800504,0.129159,0.142842,18.039375,47298.15,-670.51,-1,-1.0
4,2021-02-14,0.026092,0.859811,0.114094,0.255904,0.067111,0.798853,0.130681,0.14122,25.83625,47156.78,-141.37,0,0.0


In [None]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_twitter','compound_reddit','avg_trend']]
y = data[['label_3']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [None]:
#Train Decision Tree
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.2727272727272727
F1-Score:  0.2621015348288076
Precision:  0.33116883116883117
Recall:  0.2727272727272727



# Predicting Change in Hourly Bitcoin Prices using Social Media Interest Measures
Here, we train a classifier to use Daily hourly sentiment scores, hourly reddit sentiment scores, and hourly google trend scores to categorize the change in bitcoin prices into one of 3 categories: negative (-1), no commendable change (0), positive (1)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Prepare Data

In [2]:
#Import Data

bitcoin = pd.read_csv('categorical_bitcoin_hourly.csv', sep='\t', index_col=None)
reddit = pd.read_csv('redditHourlySentiment.csv', sep='\t', index_col=None)
trends= pd.read_csv('trends_hourly_score.csv', sep='\t', index_col=None)
#GET TWITTER DATA

In [3]:
bitcoin.head()

Unnamed: 0,date,open,high,low,close,Volume BTC,Volume USD,Diff_Open,Diff_Category
0,2021-08-31 23:00:00,47102.74,47365.57,47000.05,47156.09,52.281015,2465368.0,,0
1,2021-08-31 22:00:00,46942.58,47127.13,46771.13,47100.38,199.495397,9396309.0,-160.16,-1
2,2021-08-31 21:00:00,47020.1,47213.49,46845.05,46950.0,138.092514,6483444.0,77.52,0
3,2021-08-31 20:00:00,47306.81,47345.65,46926.4,47003.1,81.363207,3824323.0,286.71,1
4,2021-08-31 19:00:00,47326.7,47422.72,47205.86,47321.52,95.043733,4497614.0,19.89,0


In [5]:
reddit = reddit.rename(columns={'Hour': 'date'})
reddit.head()

Unnamed: 0,date,neg,neu,pos,compound
0,2021-01-01 12:00:00,0.0421,0.78873,0.15917,0.249644
1,2021-01-01 20:00:00,0.057535,0.763265,0.179185,0.232856
2,2021-01-02 04:00:00,0.07086,0.80588,0.11331,0.087468
3,2021-01-02 11:00:00,0.059394,0.755424,0.154909,0.265852
4,2021-01-02 12:00:00,0.047412,0.823184,0.12191,0.135536


In [7]:
trends.head()

Unnamed: 0,date,bitcoin,btc,bitcoin price,bitcoin kurs,bitcoin usd,bitcoin stock,bitcoin dollar,bitcoin euro,buy bitcoin,buy btc,btc usd,btc inr,price btc,btc stock,btc coin,btc euro,avg_score
0,2021-01-01 00:00:00,9.99,13.2,8.88,14.96,11.76,3.8,26.88,30.78,10.5,8.36,20.4,20.0,6.3,3.0,2.88,10.88,12.660625
1,2021-01-01 01:00:00,9.99,13.2,8.88,14.96,11.76,3.8,26.88,30.78,10.5,8.36,20.4,20.0,6.3,3.0,2.88,10.88,12.660625
2,2021-01-01 02:00:00,9.99,13.2,8.88,14.96,11.76,3.8,26.88,30.78,10.5,8.36,20.4,20.0,6.3,3.0,2.88,10.88,12.660625
3,2021-01-01 03:00:00,9.99,13.2,8.88,14.96,11.76,3.8,26.88,30.78,10.5,8.36,20.4,20.0,6.3,3.0,2.88,10.88,12.660625
4,2021-01-01 04:00:00,9.99,13.2,8.88,14.96,11.76,3.8,26.88,30.78,10.5,8.36,20.4,20.0,6.3,3.0,2.88,10.88,12.660625


In [8]:
#Combine Data into dataframe

data = pd.DataFrame()
data["date"] = bitcoin["date"]
data = data.merge(reddit[['date','compound']], on='date')
data = data.merge(trends[['date','avg_score']], on='date')
data = data.merge(bitcoin[['date','Diff_Category']], on='date')
data.head()

Unnamed: 0,date,compound,avg_score,Diff_Category
0,2021-08-01 03:00:00,0.170187,14.223125,-1
1,2021-08-01 02:00:00,0.118415,14.223125,0
2,2021-08-01 01:00:00,0.102585,14.223125,0
3,2021-07-31 10:00:00,0.109353,13.428125,0
4,2021-07-31 09:00:00,0.051863,13.428125,0


In [10]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound','avg_score']]
y = data[['Diff_Category']]

#we need to use the y value for the NEXT hour as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## Training a Decision Tree Classifier

In [11]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [12]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.4269005847953216
F1-Score:  0.40722710038499516
Precision:  0.4212347166975116
Recall:  0.4269005847953216


## Training a Random Forrest Classifier

In [13]:
#Train Model
clf = RandomForestClassifier(n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.3508771929824561
F1-Score:  0.3535459861775651
Precision:  0.359349218015693
Recall:  0.3508771929824561


  This is separate from the ipykernel package so we can avoid doing imports until


## Training an SVM Classifier

In [14]:
#Train Model
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)


#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.38596491228070173
F1-Score:  0.3150943504084421
Precision:  0.27074933711089316
Recall:  0.38596491228070173


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
