# Predicting Change in Bitcoin Prices using Social Media Interest Measures

Here, we train a classifier to use Daily twitter sentiment scores, daily reddit sentiment scores, and daily google trend scores to categorize the change in bitcoin prices into one of 5 categories: highly negative (-2), moderately negative (-1), no commendable change (0), moderately positive (1), highly negative (2)

In [58]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Prepare Data

In [8]:
#Import Data

bitcoin = pd.read_csv('categorical_bitcoin_price.csv', sep='\t', index_col=0)
reddit = pd.read_csv('redditDailySentiment.csv', sep='\t', index_col=None)
trends= pd.read_csv('trends_daily_score.csv', sep='\t', index_col=0)
#twitter = READ TWITTER DATA

In [6]:
bitcoin.head()

Unnamed: 0,Date,open,high,low,close,volume,Diff_Open,Diff_Category
0,2021-01-01,28923.63,29600.0,28624.57,29331.69,54182.925011,,0
1,2021-01-02,29331.7,33300.0,28946.53,32178.33,129993.873362,408.07,0
2,2021-01-03,32176.45,34778.11,31962.99,33000.05,120957.56675,2844.75,1
3,2021-01-04,33000.05,33600.0,28130.0,31988.71,140899.88569,823.6,1
4,2021-01-05,31989.75,34360.0,29900.0,33949.53,116049.997038,-1010.3,-1


In [17]:
reddit.head()

Unnamed: 0,Date,neg,neu,pos,compound
0,2021-01-01,0.05239,0.771753,0.172513,0.238452
1,2021-01-02,0.057414,0.811156,0.121446,0.135624
2,2021-01-03,0.063241,0.78381,0.148659,0.152421
3,2021-01-04,0.061042,0.815152,0.121297,0.167221
4,2021-01-05,0.06179,0.801428,0.135118,0.147205


In [16]:
trends.head()

Unnamed: 0,Date,avg_score,weighted_score
0,2021-01-01,12.660625,14.447561
1,2021-01-02,34.144375,37.360744
2,2021-01-03,48.853125,53.128124
3,2021-01-04,32.544375,35.352064
4,2021-01-05,24.34125,26.472194


In [15]:
#Rename merging column ("Date")
reddit = reddit.rename(columns={"Day": "Date"})
trends = trends.rename(columns={"date": "Date"})

In [20]:
#Combine Data into dataframe

data = pd.DataFrame()
data["Date"] = bitcoin["Date"]
data = data.merge(reddit, on='Date')
data = data.merge(trends, on='Date')
data = data.merge(bitcoin, on='Date')
data.head()

Unnamed: 0,Date,neg,neu,pos,compound,avg_score,weighted_score,open,high,low,close,volume,Diff_Open,Diff_Category
0,2021-01-01,0.05239,0.771753,0.172513,0.238452,12.660625,14.447561,28923.63,29600.0,28624.57,29331.69,54182.925011,,0
1,2021-01-02,0.057414,0.811156,0.121446,0.135624,34.144375,37.360744,29331.7,33300.0,28946.53,32178.33,129993.873362,408.07,0
2,2021-01-03,0.063241,0.78381,0.148659,0.152421,48.853125,53.128124,32176.45,34778.11,31962.99,33000.05,120957.56675,2844.75,1
3,2021-01-04,0.061042,0.815152,0.121297,0.167221,32.544375,35.352064,33000.05,33600.0,28130.0,31988.71,140899.88569,823.6,1
4,2021-01-05,0.06179,0.801428,0.135118,0.147205,24.34125,26.472194,31989.75,34360.0,29900.0,33949.53,116049.997038,-1010.3,-1


In [22]:
#Rename columns, drop unnecessary columns
data = data.drop(['weighted_score', 'high', 'low','close','volume'],axis=1)
data = data.rename(columns={'neg':'neg_reddit', 'neu':'neu_reddit','pos':'pos_reddit','compound':'compound_reddit','avg_score':'avg_trend','Diff_Open':'diff_open','Diff_Category':'label'})
data.head()

Unnamed: 0,Date,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label
0,2021-01-01,0.05239,0.771753,0.172513,0.238452,12.660625,28923.63,,0
1,2021-01-02,0.057414,0.811156,0.121446,0.135624,34.144375,29331.7,408.07,0
2,2021-01-03,0.063241,0.78381,0.148659,0.152421,48.853125,32176.45,2844.75,1
3,2021-01-04,0.061042,0.815152,0.121297,0.167221,32.544375,33000.05,823.6,1
4,2021-01-05,0.06179,0.801428,0.135118,0.147205,24.34125,31989.75,-1010.3,-1


In [26]:
data['label'].value_counts()

 1    69
-1    68
 0    42
 2     8
-2     4
Name: label, dtype: int64

In [27]:
data.columns

Index(['Date', 'neg_reddit', 'neu_reddit', 'pos_reddit', 'compound_reddit',
       'avg_trend', 'open', 'diff_open', 'label'],
      dtype='object')

In [40]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_reddit','avg_trend']]
y = data[['label']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## Training the Random Forest Classifier

Reference: https://www.geeksforgeeks.org/random-forest-classifier-using-scikit-learn/

In [48]:
#Train Model
clf = RandomForestClassifier(n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
#Reference: https://scikit-learn.org/stable/modules/model_evaluation.html

#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.3333333333333333
F1-Score:  0.3230769230769231
Precision:  0.31776556776556775
Recall:  0.3333333333333333


  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
y_pred

array([ 1,  0,  1,  1,  1, -1, -1,  1,  0,  1, -1,  1,  0,  0, -1,  2, -2,
       -1,  0, -1,  1,  1,  1, -1, -1,  0, -1, -1,  1,  1,  1,  1,  1,  1,
       -1, -1,  0, -1, -1])

## Training a Support Vector Classifier

Reference: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [56]:
#Train Model
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [57]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.358974358974359
F1-Score:  0.22833333333333336
Precision:  0.24074074074074076
Recall:  0.358974358974359


  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree Classifier

In [59]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [60]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.4358974358974359
F1-Score:  0.44045584045584046
Precision:  0.466925231631114
Recall:  0.4358974358974359


# 3-way Classification

Here, we attempt to categorize change in bitcoin prices to one of 3 categories: negative change (-1), no change (0), positive change (1)

In [61]:
data.head()

Unnamed: 0,Date,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label
0,2021-01-01,0.05239,0.771753,0.172513,0.238452,12.660625,28923.63,,0
1,2021-01-02,0.057414,0.811156,0.121446,0.135624,34.144375,29331.7,408.07,0
2,2021-01-03,0.063241,0.78381,0.148659,0.152421,48.853125,32176.45,2844.75,1
3,2021-01-04,0.061042,0.815152,0.121297,0.167221,32.544375,33000.05,823.6,1
4,2021-01-05,0.06179,0.801428,0.135118,0.147205,24.34125,31989.75,-1010.3,-1


In [63]:
#Transform Label
data.loc[data['label'] >0, 'label_3'] = 1  
data.loc[data['label'] <0, 'label_3'] = -1  
data.loc[data['label'] ==0, 'label_3'] = 0

data.head()

Unnamed: 0,Date,neg_reddit,neu_reddit,pos_reddit,compound_reddit,avg_trend,open,diff_open,label,label_3
0,2021-01-01,0.05239,0.771753,0.172513,0.238452,12.660625,28923.63,,0,0.0
1,2021-01-02,0.057414,0.811156,0.121446,0.135624,34.144375,29331.7,408.07,0,0.0
2,2021-01-03,0.063241,0.78381,0.148659,0.152421,48.853125,32176.45,2844.75,1,1.0
3,2021-01-04,0.061042,0.815152,0.121297,0.167221,32.544375,33000.05,823.6,1,1.0
4,2021-01-05,0.06179,0.801428,0.135118,0.147205,24.34125,31989.75,-1010.3,-1,-1.0


In [64]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_reddit','avg_trend']]
y = data[['label_3']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [65]:
#Train Decision Tree
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [66]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.41025641025641024
F1-Score:  0.3998565537027075
Precision:  0.3957264957264957
Recall:  0.41025641025641024
