In [141]:
# Python ≥3.5
import sys
assert sys.version_info >= (3, 5)

import numpy as np
import os
import seaborn as sns
import pandas as pd
import datetime as dt
from sklearn_pandas import DataFrameMapper

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from mpl_toolkits.mplot3d import Axes3D
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Scikit-Learn ≥0.20
import sklearn
assert sklearn.__version__ >= "0.20"


from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
from zlib import crc32
import hashlib
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from six.moves import urllib
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit,
StratifiedKFold, GroupShuffleSplit,
GroupKFold, StratifiedShuffleSplit,cross_val_score)
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (MinMaxScaler, StandardScaler, PolynomialFeatures,OrdinalEncoder,OneHotEncoder)
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm
import warnings
from itertools import product
from datetime import datetime
warnings.filterwarnings('ignore')
plt.style.use('seaborn-poster')
from ta.utils import dropna
from ta.volatility import BollingerBands
from ta import add_all_ta_features
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb
import operator

import settings
from ta import *
import utils
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.metrics import mean_absolute_error

import datetime, pytz
#define a conversion function for the native timestamps in the csv file
def dateparse (time_in_secs):    
    return pytz.utc.localize(datetime.datetime.fromtimestamp(float(time_in_secs)))

# Donde guardar las figuras
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Model definition
- ARIMA
- LTSM

In [142]:

def training_loop(n_epochs, lstm, optimizer, loss_fn, X_train, y_train,
                  X_test, y_test):
    for epoch in range(n_epochs):
        lstm.train()
        outputs = lstm.forward(X_train) # forward pass
        optimizer.zero_grad() # calculate the gradient, manually setting to 0
        # obtain the loss function
        loss = loss_fn(outputs, y_train)
        loss.backward() # calculates the loss of the loss function
        optimizer.step() # improve from loss, i.e backprop
        # test loss
        lstm.eval()
        test_preds = lstm(X_test)
        test_loss = loss_fn(test_preds, y_test)
        if epoch % 100 == 0:
            print("Epoch: %d, train loss: %1.5f, test loss: %1.5f" % (epoch, 
                                                                      loss.item(), 
                                                                      test_loss.item()))

In [143]:
class LSTM(nn.Module):
    # tendriamos que ir probando iterativamente la cantidad de capas
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super().__init__()
        self.num_classes = num_classes # output size
        self.num_layers = num_layers # number of recurrent layers in the lstm
        self.input_size = input_size # input size
        self.hidden_size = hidden_size # neurons in each lstm layer
        # LSTM model
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=0.2) # lstm
        self.fc_1 =  nn.Linear(hidden_size, 128) # fully connected 
        self.fc_2 = nn.Linear(128, num_classes) # fully connected last layer
        self.relu = nn.ReLU()
        
    def forward(self,x):
        # hidden state
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))
        # cell state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))
        # propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0)) # (input, hidden, and internal state)
        hn = hn.view(-1, self.hidden_size) # reshaping the data for Dense layer next
        out = self.relu(hn)
        out = self.fc_1(out) # first dense
        out = self.relu(out) # relu
        out = self.fc_2(out) # final output
        return out

# Find best hyperparameters
GridsearchCV

## Split train - test

In [153]:
bitcoin_hist = pd.read_csv("data/processed_bitcoin_history.csv")
df = bitcoin_hist.drop('Timestamp',axis=1)

In [154]:
last_items = int(df.Close.count()*0.2)
X = df.head(df.Close.count()-last_items)
y = df.tail(last_items)

In [164]:
n_epochs = 1000 # 1000 epochs
learning_rate = 0.1 # 0.001 lr

input_size = 25 # number of features
hidden_size = 13 # number of features in hidden state
num_layers = 1 # number of stacked lstm layers

num_classes = 1 # number of output classes 

lstm = LSTM(num_classes, input_size, hidden_size, num_layers)

criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [165]:
tscv = TimeSeriesSplit(n_splits=2) #TODO: revisar si tenemos que fijar el tamaño para test
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = X_train[['Close']], X_test[['Close']]
    X_train = X_train.drop('Close', axis=1) 
    X_test = X_test.drop('Close', axis=1)

    #https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/
    X_train_tensors = Variable(torch.Tensor(X_train
                                            .values))
    X_test_tensors = Variable(torch.Tensor(X_test
                                           .values))

    y_train_tensors = Variable(torch.Tensor(y_train.values))
    y_test_tensors = Variable(torch.Tensor(y_test.values))

    # reshaping to rows, timestamps, features
    X_train_tensors_final = torch.reshape(X_train_tensors,   
                                          (X_train_tensors.shape[0], 1, 
                                           X_train_tensors.shape[1]))
    X_test_tensors_final = torch.reshape(X_test_tensors,  
                                         (X_test_tensors.shape[0], 1, 
                                          X_test_tensors.shape[1])) 
    training_loop(n_epochs=n_epochs,
              lstm=lstm,
              optimizer=optimizer,
              loss_fn=criterion,
              X_train=X_train_tensors_final,
              y_train=y_train_tensors,
              X_test=X_test_tensors_final,
              y_test=y_test_tensors)

TRAIN: [     0      1      2 ... 239426 239427 239428] TEST: [239429 239430 239431 ... 478855 478856 478857]
Epoch: 0, train loss: nan, test loss: nan
Epoch: 100, train loss: nan, test loss: nan
Epoch: 200, train loss: nan, test loss: nan
Epoch: 300, train loss: nan, test loss: nan
Epoch: 400, train loss: nan, test loss: nan
Epoch: 500, train loss: nan, test loss: nan
Epoch: 600, train loss: nan, test loss: nan
Epoch: 700, train loss: nan, test loss: nan
Epoch: 800, train loss: nan, test loss: nan
Epoch: 900, train loss: nan, test loss: nan
TRAIN: [     0      1      2 ... 478855 478856 478857] TEST: [478858 478859 478860 ... 718284 718285 718286]
Epoch: 0, train loss: nan, test loss: nan
Epoch: 100, train loss: nan, test loss: nan
Epoch: 200, train loss: nan, test loss: nan


KeyboardInterrupt: 

## Evaluation

In [None]:
ground_truth = y.Close
test = y.drop('Close', axis=1)
test_tensor = Variable(torch.Tensor(test
                                        .values))
# reshaping to rows, timestamps, features
test_tensor_final = torch.reshape(test_tensor,   
                                      (test_tensor.shape[0], 1, 
                                       test_tensor.shape[1]))
train_predict = lstm(test_tensor_final) # forward pass
data_predict = train_predict.data.numpy() # numpy conversion
dataY_plot = ground_truth.data.numpy()

data_predict = mm.inverse_transform(data_predict) # reverse transformation
dataY_plot = mm.inverse_transform(dataY_plot)
true, preds = [], []
for i in range(len(dataY_plot)):
    true.append(dataY_plot[i][0])
for i in range(len(data_predict)):
    preds.append(data_predict[i][0])
plt.figure(figsize=(10,6)) #plotting
plt.axvline(x=train_test_cutoff, c='r', linestyle='--') # size of the training set

plt.plot(true, label='Actual Data') # actual plot
plt.plot(preds, label='Predicted Data') # predicted plot
plt.title('Time-Series Prediction')
plt.legend()
plt.savefig("whole_plot.png", dpi=300)
plt.show() 