In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler

In [3]:
names = ['BTC', 'ETH', 'LTC', 'EOS', 'DOGE']

In [4]:
data = pd.read_csv('data/' + names[0] + '-USD.csv')
data['Date'] = pd.to_datetime(data['Date'])
for name in names[1:]:
    new_data = pd.read_csv('data/' + names[1] + '-USD.csv')
    new_data['Date'] = pd.to_datetime(new_data['Date'])
    data = data.merge(new_data, on='Date', how='left', suffixes=('', ' ' + name))

data.columns = list([data.columns[0]]) + list([_ + ' ' + names[0] for _ in data.columns[1:7]]) + list(data.columns[7:])
data = data.dropna()

In [5]:
data.head()

Unnamed: 0,Date,Open BTC,High BTC,Low BTC,Close BTC,Adj Close BTC,Volume BTC,Open ETH,High ETH,Low ETH,...,Low EOS,Close EOS,Adj Close EOS,Volume EOS,Open DOGE,High DOGE,Low DOGE,Close DOGE,Adj Close DOGE,Volume DOGE
1847,2015-08-06,278.0,279.600006,274.279999,277.890015,277.890015,11919665,0.6747,3.0,0.6747,...,0.6747,3.0,3.0,371.0,0.6747,3.0,0.6747,3.0,3.0,371.0
1848,2015-08-07,277.890015,278.920013,257.420013,258.600006,258.600006,22308123,3.0,3.0,0.15,...,0.15,1.2,1.2,1438.0,3.0,3.0,0.15,1.2,1.2,1438.0
1849,2015-08-08,258.600006,266.75,258.559998,263.869995,263.869995,15154749,1.2,1.2,1.2,...,1.2,1.2,1.2,0.0,1.2,1.2,1.2,1.2,1.2,0.0
1850,2015-08-09,263.869995,266.630005,260.519989,263.299988,263.299988,12873441,1.2,1.2,1.2,...,1.2,1.2,1.2,0.0,1.2,1.2,1.2,1.2,1.2,0.0
1851,2015-08-10,263.299988,269.899994,261.440002,269.029999,269.029999,13681939,1.2,1.2,0.6504,...,0.6504,0.99,0.99,7419.0,1.2,1.2,0.6504,0.99,0.99,7419.0


In [6]:
X = data.iloc[:, data.columns != 'Close BTC']
y = data['Close BTC']

In [7]:
data.iloc[:, 1::6].head()

Unnamed: 0,Open BTC,Open ETH,Open LTC,Open EOS,Open DOGE
1847,278.0,0.6747,0.6747,0.6747,0.6747
1848,277.890015,3.0,3.0,3.0,3.0
1849,258.600006,1.2,1.2,1.2,1.2
1850,263.869995,1.2,1.2,1.2,1.2
1851,263.299988,1.2,1.2,1.2,1.2


In [8]:
data = pd.read_csv('data/BTC-USD.csv')
data['Change'] = (data['Close'] - data['Open']) / data['Open']

In [9]:
scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(data['Change'].values.reshape(-1, 1))
data['col'] = scaled

In [10]:
WINDOW = 50
scaler = MinMaxScaler((-1, 1))
data['Change Scaled'] = scaler.fit_transform(data['Change'].values.reshape(-1, 1))
X = data['Change Scaled']
for i in range(1, WINDOW):
    X = pd.concat([X, data['Change Scaled'].shift(i)], axis=1)
X = X[WINDOW:]
X = X.values.reshape((X.shape[0], WINDOW, 1))

In [11]:
label_averages = []
for d in pd.date_range(start=data.head(1)['Date'].values[0], end=data.tail(1)['Date'].values[0]):
    with open('labels_btc/text_' + '{:04d}'.format(d.year) + '{:02d}'.format(d.month) + '{:02d}'.format(d.day) + '.txt', 'r') as f:
        labels = []
        for line in f:
            labels.append(int(line[-2]))
        label_averages.append(np.mean(labels))
data['Label Average'] = label_averages

In [12]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Change,col,Change Scaled,Label Average
0,2010-07-16,0.04951,0.04951,0.04951,0.04951,0.04951,0,0.0,-0.709649,-0.709649,3.6
1,2010-07-17,0.04951,0.08585,0.05941,0.08584,0.08584,5,0.733791,-0.337208,-0.337208,2.9
2,2010-07-18,0.08584,0.09307,0.07723,0.0808,0.0808,49,-0.058714,-0.739449,-0.739449,3.5
3,2010-07-19,0.0808,0.08181,0.07426,0.07474,0.07474,20,-0.075,-0.747716,-0.747716,3.3
4,2010-07-20,0.07474,0.07921,0.06634,0.07921,0.07921,42,0.059807,-0.679293,-0.679293,3.3


In [13]:
X = data['Label Average'][:-1]
y = data['Change'].shift(-1)[:-1]

In [14]:
model = sm.OLS(y, X)
results = model.fit()

In [15]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 Change   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     14.59
Date:                Thu, 11 Apr 2019   Prob (F-statistic):           0.000136
Time:                        22:05:30   Log-Likelihood:                 3222.7
No. Observations:                3181   AIC:                            -6443.
Df Residuals:                    3180   BIC:                            -6437.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Label Average     0.0017      0.000      3.820