In [11]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
from sklearn.preprocessing import MinMaxScaler
import sys
sys.path.insert(1, '../../')
from custom_utils import *

In [20]:
dataset_path = "../../data/cooked_data/cooked_complete_dataset.csv"

# Columns excluded because of multicollinearity or weak correlation.
columns_to_drop = [
    "Open_BTC-USD",
    "High_BTC-USD",
    "Low_BTC-USD",
    "Volume_BTC-USD",
    "Adj_Close_CNYUSD=X",
    "num_of_coindesk_posts",
]

# Read the prepared dataset, parse the day/month/two-digit-year 'date' strings
# into datetimes, then remove the excluded columns.
df = (
    pd.read_csv(dataset_path)
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d/%m/%y'))
    .drop(columns=columns_to_drop)
)

In [21]:
# Show the frame that remains after the column drop (the saved output elides the middle rows).
df

Unnamed: 0,date,Adj_Close_BTC-USD,Adj_Close_SPY,Adj_Close_GLD,Adj_Close_CHFUSD=X,Adj_Close_EURUSD=X,Adj_Close_GBPUSD=X,Adj_Close_JPYUSD=X,coindesk_sentiment,reddit_comments_sentiments,top_50_reddit_posts_sentiments,blockchain_transactions_per_block,blockchain_hash_rates
0,2020-12-14,19246.64453,361.926788,171.539993,1.125442,1.213340,1.331824,0.009621,0.249489,0.158060,0.677618,2167.931034,134533587.6
1,2020-12-15,19417.07617,366.819824,173.940002,1.127930,1.214890,1.333084,0.009614,0.173773,0.101930,0.447277,2288.857143,133351912.2
2,2020-12-16,21310.59766,367.395508,174.899994,1.129382,1.215430,1.344447,0.009649,0.341491,0.127344,0.480809,2204.314685,132323572.3
3,2020-12-17,22805.16211,369.449982,176.740005,1.129446,1.219959,1.350293,0.009664,0.197572,0.135945,0.539729,2399.077519,132373208.7
4,2020-12-18,23137.96094,367.974792,176.440002,1.130301,1.226272,1.357018,0.009696,0.315601,0.135441,0.449503,2392.031847,131791042.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,2021-04-09,58245.00391,411.489990,163.270004,1.082064,1.191568,1.373438,0.009147,0.239549,0.112655,0.341179,2136.315789,165551986.7
117,2021-04-10,59793.23438,411.489990,163.270004,1.082064,1.191568,1.373438,0.009147,0.316376,0.128121,0.277659,1905.006369,167595961.7
118,2021-04-11,60204.96484,411.489990,163.270004,1.082064,1.191568,1.373438,0.009147,-0.035314,0.206358,0.375233,1747.924138,171868015.6
119,2021-04-12,59893.45313,411.640015,162.279999,1.080742,1.190051,1.371215,0.009116,0.240797,0.121495,0.280850,2023.395833,171703705.8


In [22]:
# List the remaining column labels; later cells select features by position and by name.
df.columns

Index(['date', 'Adj_Close_BTC-USD', 'Adj_Close_SPY', 'Adj_Close_GLD',
       'Adj_Close_CHFUSD=X', 'Adj_Close_EURUSD=X', 'Adj_Close_GBPUSD=X',
       'Adj_Close_JPYUSD=X', 'coindesk_sentiment',
       'reddit_comments_sentiments', 'top_50_reddit_posts_sentiments',
       'blockchain_transactions_per_block', 'blockchain_hash_rates'],
      dtype='object')

In [23]:
# Derive lagged features with a project helper (not defined in this notebook;
# presumably provided by the `custom_utils` star import — confirm).
# NOTE(review): the helper's behavior is not visible here. Later cells reference
# columns ending in `_lag_1`, so it presumably adds one-step lag columns when
# given an empty options dict — verify against the helper's source.
# NOTE(review): this rebinds `df`, so re-running the cell passes the already
# transformed frame back into the helper.
df = get_num_lags(df, {})

In [24]:
dates = df['date']

# Training window: 2021-01-01 through 2021-03-14, both ends included.
in_train_window = (dates >= '2021-01-01') & (dates <= '2021-03-14')
train = df.loc[in_train_window].sort_values(by='date')

# Hold-out window: 2021-03-15 up to, but not including, 2021-04-13.
in_holdout_window = (dates >= '2021-03-15') & (dates < '2021-04-13')
validation_and_test = df.loc[in_holdout_window].sort_values(by='date')

In [25]:
# Sentiment-derived columns removed to build the "no sentiment" feature sets.
sentiment_columns = [
    'coindesk_sentiment_lag_1',
    'reddit_comments_sentiments_lag_1',
    'top_50_reddit_posts_sentiments_lag_1',
]

# Target is the BTC adjusted close; features are every column from position 2 onward.
y_train = train['Adj_Close_BTC-USD']
train_features = train.iloc[:, 2:]
column_names = train_features.columns

# The scaler is fitted on the training window only and then reused unchanged
# for the hold-out window. Scaled values are wrapped back into DataFrames so
# that columns can be dropped by name.
sc = MinMaxScaler(feature_range=(0, 1))
X_train = pd.DataFrame(sc.fit_transform(train_features), columns=list(column_names))
X_train_no_sentiment = X_train.drop(columns=sentiment_columns)

y_test = validation_and_test['Adj_Close_BTC-USD']
test_features = validation_and_test.iloc[:, 2:]
X_test = pd.DataFrame(sc.transform(test_features), columns=list(column_names))
X_test_no_sentiment = X_test.drop(columns=sentiment_columns)


### Linear regression with sentiments

In [29]:
# Ordinary least squares on the full scaled feature set (sentiment columns included).
# The `normalize` argument is not passed: it was deprecated in scikit-learn 1.0 and
# removed in 1.2, where supplying it raises a TypeError. The inputs are already
# min-max scaled before this point.
# NOTE(review): for a full-rank design the unregularized fit should give the same
# predictions with or without `normalize`; re-run and compare the RMSE values to confirm.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(normalize=True)

In [30]:
# In-sample predictions go first in the combined output list.
y_pred_return = model.predict(X_train)
output_with_sentiment = list(y_pred_return)
train_rmse = math.sqrt(mean_squared_error(y_pred_return, y_train))
print("RMSE in training set: {}".format(train_rmse))

# Hold-out predictions are appended after the in-sample ones.
y_pred_return = model.predict(X_test)
output_with_sentiment.extend(y_pred_return)
test_rmse = math.sqrt(mean_squared_error(y_pred_return, y_test))
print("RMSE in test set: {}".format(test_rmse))

RMSE in training set: 2869.4498913466623
RMSE in test set: 6976.001755833121


### Linear regression without sentiments

In [31]:
# Ordinary least squares on the scaled features with the sentiment columns removed.
# The `normalize` argument is not passed: it was deprecated in scikit-learn 1.0 and
# removed in 1.2, where supplying it raises a TypeError. The inputs are already
# min-max scaled before this point.
# NOTE(review): this rebinds `model`, replacing the estimator fitted on the full
# feature set; cells must run in order for the evaluations to use the right model.
model = LinearRegression()
model.fit(X_train_no_sentiment, y_train)

LinearRegression(normalize=True)

In [32]:
# Expects `model` to be the estimator fitted on X_train_no_sentiment.
# In-sample predictions go first in the combined output list.
y_pred_return = model.predict(X_train_no_sentiment)
output_without_sentiment = list(y_pred_return)
train_rmse = math.sqrt(mean_squared_error(y_pred_return, y_train))
print("RMSE in training set: {}".format(train_rmse))

# Hold-out predictions are appended after the in-sample ones.
# This score is computed against y_test, so it is labelled as the test-set RMSE
# (the label previously read "training set").
y_pred_return = model.predict(X_test_no_sentiment)
output_without_sentiment.extend(y_pred_return)
test_rmse = math.sqrt(mean_squared_error(y_pred_return, y_test))
print("RMSE in test set: {}".format(test_rmse))

RMSE in training set: 3078.521251044796
RMSE in test set: 8162.758535503783


In [34]:
# Dates follow the same order as the prediction lists: training rows first,
# then the hold-out rows.
prediction_dates = [*train['date'], *validation_and_test['date']]

# Column order of the exported table: both prediction series, then the date.
predictions = {}
predictions['prediction_with_sentiments'] = output_with_sentiment
predictions['predictions_without_sentiments'] = output_without_sentiment
predictions['date'] = prediction_dates

In [35]:
# Write the combined predictions to a CSV in the working directory, without the row index.
predictions_frame = pd.DataFrame(predictions)
predictions_frame.to_csv('linear_predictions.csv', index=False)