In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.regularizers import l2
from sklearn.preprocessing import MinMaxScaler

model_data = pd.read_csv('data\model_data.csv')

Using TensorFlow backend.


In [2]:
model_data.dropna(inplace=True)
model_data.drop(['close_future'], inplace=True, axis=1)

In [3]:
# 80% train and 20% test splitting
# Isso pega mais ou menos dois anos de teste (a partir de 2016)
# Não fiz o train_test_split aleatório para não avacalhar com a relação temporal da série...

train = model_data[0:1965]
test = model_data[1965:2457]

In [4]:
# Normalizando as features
scaler = MinMaxScaler()

x_train = train.drop(['daily_percentage_change', 'date'], axis=1)
x_train = scaler.fit_transform(x_train)
y_train = train['daily_percentage_change']
x_test = test.drop(['daily_percentage_change', 'date'], axis=1)
x_test = scaler.fit_transform(x_test)
y_test = test['daily_percentage_change']

In [5]:
# Transformando em arrays. Acho que só colocar o dataframe.values também funciona kkk

y_train = np.array(y_train, dtype=float)
y_test = np.array(y_test, dtype=float)

In [6]:
# Fazendo um reshape dos dados para alimentar o modelo. O LSTM requer que os dados sejam alimentados na forma
# [samples, time_steps, features]

x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1])
x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1])

In [7]:
# Função que transforma as percentages de variação diárias em decisões de compra, venda ou hold
# Os limites são baseados na distribuição que está no notebook correlations_and_features

def make_position(data):
    new_array = []
    for percent in data:
        if percent <= -0.025:
            position = 'sell'
        elif percent >= 0.022:
            position = 'buy'
        else:
            position = 'hold'
        new_array.append(position)
    return np.array(new_array)

In [8]:
y_train_categorical = make_position(y_train)
y_test_categorical = make_position(y_test)

# A função get_dummies cria uma variável para cada categoria. Logo, a matriz Y tem 3 dimensões (uma coluna para cada categoria,
# sempre com 0 e 1)

targets_train = pd.DataFrame({'y_train': y_train_categorical})
y_train_dummies = pd.get_dummies(targets_train)
targets_test = pd.DataFrame({'y_train': y_test_categorical})
y_test_dummies = pd.get_dummies(targets_test)

In [9]:
y_train_dummies.head()

Unnamed: 0,y_train_buy,y_train_hold,y_train_sell
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,0,1


In [10]:
# Avaliando todos os shapes

print('Shape X treino :', x_train.shape)
print('Shape Y treino :', y_train_dummies.shape)
print('Shape X teste :', x_test.shape)
print('Shape Y teste :', y_test_dummies.shape)

Shape X treino : (1965, 1, 20)
Shape Y treino : (1965, 3)
Shape X teste : (492, 1, 20)
Shape Y teste : (492, 3)


In [11]:
# Verificando o balanceamento das classes
# Estão desbalanceadas! Isso pode estar prejudicando o modelo...

unique, counts = np.unique(y_train_categorical, return_counts=True)
dict(zip(unique, counts))

{'buy': 369, 'hold': 1228, 'sell': 368}

In [12]:
n_timesteps, n_features, n_outputs = x_train.shape[1], x_train.shape[2], y_train_dummies.shape[1]

In [13]:
model = Sequential()  
model.add(LSTM(100, input_shape=(n_timesteps,n_features)))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.3, noise_shape=None, seed=None))
model.add(Dense(n_outputs, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               48400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 58,803
Trainable params: 58,803
Non-trainable params: 0
_________________________________________________________________


In [14]:
history = model.fit(x_train, y_train_dummies.values, nb_epoch=100, verbose=1, validation_data=(x_test, y_test_dummies.values))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 1965 samples, validate on 492 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [15]:
# O predict dele retorna também um vetor com 3 dimensões. Cada linha é uma observação e cada coluna é a probabilidade
# da categoria associada a ela. Respectivamente: buy, hold e sell. É a mesma ordem que passamos nas features.

results = model.predict(x_test)
results_train = model.predict(x_train)

results_train

array([[0.4623177 , 0.9999573 , 0.0438669 ],
       [0.26145646, 0.99968433, 0.06965423],
       [0.26766717, 0.9995252 , 0.09406734],
       ...,
       [0.28920412, 0.19152501, 0.5909601 ],
       [0.16219757, 0.24679211, 0.6144338 ],
       [0.14539704, 0.29869723, 0.6022797 ]], dtype=float32)

In [16]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = np.array(range(400)), y=history.history['loss'], name='train'))
fig.add_trace(go.Scatter(x = np.array(range(400)), y=history.history['val_loss'], name='test'))
fig.update_layout(title='<b> Função custo ao longo do treino </b>')
fig.update_xaxes(title_text='Épocas')
fig.show()

In [17]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = np.array(range(400)), y=history.history['acc'], name='train'))
fig.add_trace(go.Scatter(x = np.array(range(400)), y=history.history['val_acc'], name='test'))
fig.update_layout(title='<b> accuracy ao longo do treino </b>')
fig.update_xaxes(title_text='Épocas')
fig.show()