In [60]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import datetime
import sys
from sklearn.preprocessing import MinMaxScaler
sys.path.insert(1, '../../')
from custom_utils import get_num_lags, log_return_transformation

In [134]:
# Load the prepared ("cooked") dataset from a path relative to the working directory.
dataset_path = "../../data/cooked_data/cooked_complete_dataset.csv"
df = pd.read_csv(dataset_path)

In [135]:
# Run the project helper on the BTC adjusted-close column (presumably converting
# prices to log returns, going by its name -- TODO confirm in custom_utils).
df = log_return_transformation(df, ['Adj_Close_BTC-USD'])

# Binarise the transformed column: 1 when the value is strictly positive, else 0.
# A NaN compares as "not > 0" and therefore also becomes 0.
df['Adj_Close_BTC-USD'] = (df['Adj_Close_BTC-USD'] > 0).astype('int64')

In [136]:
# Inspect the frame after the target column has been binarised
# (pandas elides the middle rows of a long frame in its repr).
df

Unnamed: 0,date,Adj_Close_BTC-USD,Open_BTC-USD,High_BTC-USD,Low_BTC-USD,Volume_BTC-USD,Adj_Close_SPY,Adj_Close_GLD,Adj_Close_CHFUSD=X,Adj_Close_CNYUSD=X,Adj_Close_EURUSD=X,Adj_Close_GBPUSD=X,Adj_Close_JPYUSD=X,coindesk_sentiment,num_of_coindesk_posts,reddit_comments_sentiments,top_50_reddit_posts_sentiments,blockchain_transactions_per_block,blockchain_hash_rates
1,15/12/20,1,19246.91992,19525.00781,19079.84180,2.674198e+10,366.819824,173.940002,1.127930,0.152679,1.214890,1.333084,0.009614,0.173773,18,0.101930,0.447277,2288.857143,133351912.2
2,16/12/20,1,19418.81836,21458.90820,19298.31641,4.440901e+10,367.395508,174.899994,1.129382,0.152945,1.215430,1.344447,0.009649,0.341491,11,0.127344,0.480809,2204.314685,132323572.3
3,17/12/20,1,21308.35156,23642.66016,21234.67578,7.137861e+10,369.449982,176.740005,1.129446,0.153109,1.219959,1.350293,0.009664,0.197572,10,0.135945,0.539729,2399.077519,132373208.7
4,18/12/20,1,22806.79688,23238.60156,22399.81250,4.038790e+10,367.974792,176.440002,1.130301,0.153090,1.226272,1.357018,0.009696,0.315601,2,0.135441,0.449503,2392.031847,131791042.0
5,19/12/20,1,23132.86523,24085.85547,22826.47266,3.848755e+10,367.974792,176.440002,1.130301,0.153090,1.226272,1.357018,0.009696,0.255413,7,0.129564,0.384758,2174.278481,133832920.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,9/4/21,0,58326.56250,58937.04688,57807.86328,4.665521e+10,411.489990,163.270004,1.082064,0.152672,1.191568,1.373438,0.009147,0.239549,17,0.112655,0.341179,2136.315789,165551986.7
117,10/4/21,1,58253.77734,61276.66406,58038.70703,5.823847e+10,411.489990,163.270004,1.082064,0.152672,1.191568,1.373438,0.009147,0.316376,2,0.128121,0.277659,1905.006369,167595961.7
118,11/4/21,1,59846.23047,60790.55469,59289.79688,4.628025e+10,411.489990,163.270004,1.082064,0.152672,1.191568,1.373438,0.009147,-0.035314,1,0.206358,0.375233,1747.924138,171868015.6
119,12/4/21,0,60175.94531,61253.03516,59589.87500,5.182869e+10,411.640015,162.279999,1.080742,0.152625,1.190051,1.371215,0.009116,0.240797,17,0.121495,0.280850,2023.395833,171703705.8


In [137]:
# Remove variables that are multicollinear with others or only weakly correlated.
columns_to_drop = [
    "Open_BTC-USD",
    "High_BTC-USD",
    "Low_BTC-USD",
    "Volume_BTC-USD",
    "Adj_Close_CNYUSD=X",
    "num_of_coindesk_posts",
]
df = df.drop(columns=columns_to_drop)

In [138]:
# Dates arrive as day/month/two-digit-year strings (e.g. 15/12/20); parse them into
# datetimes so the date-range filters further down compare chronologically.
parsed_dates = pd.to_datetime(df['date'], format='%d/%m/%y')
df['date'] = parsed_dates

In [139]:
# Project helper from custom_utils, called with an empty configuration dict.
# NOTE(review): later cells reference `*_lag_1` columns, so this presumably adds
# one-step lagged features -- confirm what the empty dict selects.
lag_config = {}
df = get_num_lags(df, lag_config)

In [140]:
# Chronological split on the parsed date column:
#   training : 2021-01-01 through 2021-03-14, both ends included
#   hold-out : 2021-03-15 up to, but not including, 2021-04-13
date_col = df['date']
in_train_window = (date_col >= '2021-01-01') & (date_col <= '2021-03-14')
in_holdout_window = (date_col >= '2021-03-15') & (date_col < '2021-04-13')

train = df.loc[in_train_window].sort_values('date')
validation_and_test = df.loc[in_holdout_window].sort_values('date')

In [141]:
# Sentiment-derived features; removed to build the "no sentiment" design matrices.
sentiment_columns = [
    'coindesk_sentiment_lag_1',
    'reddit_comments_sentiments_lag_1',
    'top_50_reddit_posts_sentiments_lag_1',
]

# Min-max scale features to [0, 1]. The scaler is fitted on the training window only
# and re-used unchanged on the hold-out window.
sc = MinMaxScaler(feature_range=(0, 1))

# --- training window ---
# NOTE(review): features are taken positionally (everything from the third column on);
# this assumes the first two columns are the date and the target -- confirm against
# the column order produced by get_num_lags.
y_train = train['Adj_Close_BTC-USD']
train_features = train.iloc[:, 2:]
column_names = train_features.columns
X_train = pd.DataFrame(sc.fit_transform(train_features), columns=list(column_names))
X_train_no_sentiment = X_train.drop(columns=sentiment_columns)

# --- hold-out window ---
y_test = validation_and_test['Adj_Close_BTC-USD']
holdout_features = validation_and_test.iloc[:, 2:]
X_test = pd.DataFrame(sc.transform(holdout_features), columns=list(column_names))
X_test_no_sentiment = X_test.drop(columns=sentiment_columns)



In [142]:
# Class balance of the binary target in the training window.
y_train.value_counts()

1    40
0    33
Name: Adj_Close_BTC-USD, dtype: int64

In [143]:
# Class balance of the binary target in the hold-out window.
y_test.value_counts()

1    16
0    13
Name: Adj_Close_BTC-USD, dtype: int64

### Logistic regression with complete set of independent variables (With sentiments)

In [144]:
# Fit a liblinear logistic regression on the full (sentiment-inclusive) feature set.
lr = LogisticRegression(solver='liblinear', max_iter=500)
lr.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=500, solver='liblinear')

In [145]:
# Score the sentiment-inclusive model on the training window, then on the hold-out
# window. Both passes report the same metrics, so they share one loop instead of two
# copy-pasted stanzas; the printed text is unchanged.
output_with_sentiment = []  # class predictions: training rows first, then hold-out rows
evaluation_splits = [('training', X_train, y_train), ('test', X_test, y_test)]

for split_number, (split_name, split_X, split_y) in enumerate(evaluation_splits):
    if split_number > 0:
        print('*'*80)  # visual separator between the two reports

    y_pred_class = lr.predict(split_X)
    output_with_sentiment.extend(y_pred_class)
    print('Accuracy score in {} set: {}'.format(split_name, metrics.accuracy_score(split_y, y_pred_class)))
    print('Precision score in {} set: {}'.format(split_name, metrics.precision_score(split_y, y_pred_class)))
    print('Recall score in {} set: {}'.format(split_name, metrics.recall_score(split_y, y_pred_class)))
    print('Confusion matrix:')
    print(metrics.confusion_matrix(split_y, y_pred_class))

    # ROC AUC is scored on the second predict_proba column (the probability of the
    # second class), not on the hard class labels.
    y_pred_prob = lr.predict_proba(split_X)[:, 1]
    print('ROC_AUC score in {} set: {}'.format(split_name, metrics.roc_auc_score(split_y, y_pred_prob)))

Accuracy score in training set: 0.6164383561643836
Precision score in training set: 0.6
Recall score in training set: 0.9
Confusion matrix:
[[ 9 24]
 [ 4 36]]
ROC_AUC score in training set: 0.6575757575757576
********************************************************************************
Accuracy score in test set: 0.5517241379310345
Precision score in test set: 0.56
Recall score in test set: 0.875
Confusion matrix:
[[ 2 11]
 [ 2 14]]
ROC_AUC score in test set: 0.42307692307692313


### Logistic regression with all independent variables except the sentiment features (Without sentiments)

In [146]:
# Fit the same liblinear logistic regression on the feature set without sentiment columns.
# Note: this re-binds `lr`, so the previously fitted model is no longer reachable by that name.
lr = LogisticRegression(solver='liblinear', max_iter=500)
lr.fit(X=X_train_no_sentiment, y=y_train)

LogisticRegression(max_iter=500, solver='liblinear')

In [147]:
# Score the sentiment-free model on the training window, then on the hold-out window.
# Both passes report the same metrics, so they share one loop instead of two
# copy-pasted stanzas; the printed text is unchanged.
output_without_sentiment = []  # class predictions: training rows first, then hold-out rows
evaluation_splits = [
    ('training', X_train_no_sentiment, y_train),
    ('test', X_test_no_sentiment, y_test),
]

for split_number, (split_name, split_X, split_y) in enumerate(evaluation_splits):
    if split_number > 0:
        print('*'*80)  # visual separator between the two reports

    y_pred_class = lr.predict(split_X)
    output_without_sentiment.extend(y_pred_class)
    print('Accuracy score in {} set: {}'.format(split_name, metrics.accuracy_score(split_y, y_pred_class)))
    print('Precision score in {} set: {}'.format(split_name, metrics.precision_score(split_y, y_pred_class)))
    print('Recall score in {} set: {}'.format(split_name, metrics.recall_score(split_y, y_pred_class)))
    print('Confusion matrix:')
    print(metrics.confusion_matrix(split_y, y_pred_class))

    # ROC AUC is scored on the second predict_proba column (the probability of the
    # second class), not on the hard class labels.
    y_pred_prob = lr.predict_proba(split_X)[:, 1]
    print('ROC_AUC score in {} set: {}'.format(split_name, metrics.roc_auc_score(split_y, y_pred_prob)))

Accuracy score in training set: 0.5753424657534246
Precision score in training set: 0.5737704918032787
Recall score in training set: 0.875
Confusion matrix:
[[ 7 26]
 [ 5 35]]
ROC_AUC score in training set: 0.625
********************************************************************************
Accuracy score in test set: 0.5172413793103449
Precision score in test set: 0.5357142857142857
Recall score in test set: 0.9375
Confusion matrix:
[[ 0 13]
 [ 1 15]]
ROC_AUC score in test set: 0.30288461538461536


In [148]:
# Gather both models' class predictions next to the dates they belong to.
# Each prediction list holds training rows first and hold-out rows second, so the
# date list is assembled in the same order.
# NOTE(review): the two prediction keys use different prefixes ('prediction_' vs
# 'predictions_'); they are left as-is because they would become column names if
# this dict is exported.
prediction_dates = [*train['date'], *validation_and_test['date']]
predictions = {
    'prediction_with_sentiments': output_with_sentiment,
    'predictions_without_sentiments': output_without_sentiment,
    'date': prediction_dates,
}

In [150]:
# pd.DataFrame(predictions).to_csv('logistic_predictions.csv', index=False)