# Enzo Yamamura

## Bitcoin x Sentimentos do Reddit

Machine Learning

***
# Imports iniciais

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
import plotly.figure_factory as ff
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout, LeakyReLU
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.random import set_seed

  import pandas.util.testing as tm


---
# Análises preliminares

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Enzo/db.csv', index_col = 0)

In [6]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),sent_sum,positivo,neutro,negativo
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01,321.0,321.0,312.6,313.81,3087.436554,15.780143,750,262,323
2015-01-02,313.82,317.01,311.96,315.42,3468.281375,14.801278,827,280,457
2015-01-03,315.42,316.58,280.0,282.0,21752.719146,19.01267,1089,336,546
2015-01-04,280.0,289.39,255.0,264.0,41441.278553,24.593739,1361,368,735
2015-01-05,264.55,280.0,264.07,276.8,9528.271002,24.207751,1294,355,668


In [7]:
# Mudança percentual do Close em relação ao dia anterior:
df['Close_chg'] = df['Close'].pct_change(1)

In [8]:
fig = px.line(df['Close'], title ='Preço do Close do Bitcoin (USD) por Dia')
fig.update_layout(yaxis_title = 'Preço')
fig.show()

In [9]:
fig = px.line(df['Close_chg'], title ='Variação Diária Percentual do Close do Bitcoin (USD)')
fig.update_layout(yaxis_title = 'Variação')
fig.show()

fig = px.line(df['Close_chg'].rolling(90).mean(), title ='Média Móvel 90 Dias')
fig.update_layout(yaxis_title = 'Variação')
fig.show()

fig = px.line(df['Close_chg'].rolling(90).std(), title ='Desvio Padrão Móvel 90 Dias')
fig.update_layout(yaxis_title = 'Variação')
fig.show()

Nem a média nem o desvio padrão móvel são constantes ao longo do tempo, sinalizando não-estacionariedade.

---
# Feature Engineering e Normalização

In [10]:
# Intraday High - low
df['High_Low'] = (df['High'] / df['Low'] - 1)

# Diferença overnight (close de hoje e open de amanhã):
df['Overnight'] = df['Open'].shift(-1) / df['Close'] - 1

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),sent_sum,positivo,neutro,negativo,Close_chg,High_Low,Overnight
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01,321.0,321.0,312.6,313.81,3087.436554,15.780143,750,262,323,,0.026871,3.2e-05
2015-01-02,313.82,317.01,311.96,315.42,3468.281375,14.801278,827,280,457,0.00513,0.016188,0.0
2015-01-03,315.42,316.58,280.0,282.0,21752.719146,19.01267,1089,336,546,-0.105954,0.130643,-0.007092
2015-01-04,280.0,289.39,255.0,264.0,41441.278553,24.593739,1361,368,735,-0.06383,0.134863,0.002083
2015-01-05,264.55,280.0,264.07,276.8,9528.271002,24.207751,1294,355,668,0.048485,0.060325,0.0


	Usaremos: Volume_(BTC)	sent_sum	positivo	neutro	negativo	Close_chg	High_Low	Overnight

  Separaremos sent_sum das contagens de positivo, neutro e negativo em treinos diferentes pois são redundantes.

In [11]:
# Trocando Explicativas:
# df_ = df[['Close', 'sent_sum','High_Low']].copy()
df_ = df[['Close','sent_sum']].copy()
df_.dropna(inplace=True)

In [12]:
# Separando em treino e teste:
train = df_.loc[:'2018-12-31']
test = df_.loc['2019-01-01':]

In [13]:
# Padronizando
scaler = MinMaxScaler() #converge mais rápido
scaler.fit(df_)
df_scaled = scaler.fit_transform(df_)

# Definindo explicada x explicativas:


features = df_scaled
target = df_scaled [:,0]

In [14]:
# Separando em treino e teste:
# shuffle = false pois a ordem importa
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=42, shuffle = False)

---
# LSTM com Sentimentos Agregados:

In [13]:
registro = pd.DataFrame(columns=['MSE','MAE','MAPE'])
win_length = 0
batch_size = 0
for win_length in [7,14,28,70]:# janelas de 7 em 7
  for batch_size in [4,8,32,64,256]:# batches consagrados
    num_features = x_train.shape[1]

    # Generator para teste e treino
    train_generator = TimeseriesGenerator(x_train, y_train, length= win_length, sampling_rate = 1, batch_size= batch_size)
    test_generator = TimeseriesGenerator(x_test, y_test, length= win_length, sampling_rate = 1, batch_size= batch_size)

    # Arquitetura LSTM
    model = Sequential()
    model.add(LSTM(128, input_shape = (win_length, num_features), return_sequences = True))
    model.add(LeakyReLU(alpha=0.5))
    model.add(LSTM(128, return_sequences=True))
    model.add(LeakyReLU(alpha=0.5))
    model.add(Dropout(0.3))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(1)) #output layer

    # Parada precoce, sempre que não melhora depois de 3 iterações
    early_stopping = EarlyStopping(monitor='val_loss',patience = 3, mode='min')

    # Adicionando decaimento exponencial de -0.1
    def scheduler(epoch, lr):
        return np.clip(lr * tf.math.exp(-0.1), 0.000001, 0.001)


    callbacklr = tf.keras.callbacks.LearningRateScheduler(scheduler)

    model.compile(loss = 'mae',
                  optimizer = tf.optimizers.Adam(learning_rate=0.001),
                  metrics=['mse','mae','mape'])

    # aqui testei com e sem decaimento exponencial
    # Rodando:
    set_seed(42)
    history = model.fit(train_generator, epochs = 200, validation_data = test_generator,
                                  shuffle=False, callbacks = [early_stopping, callbacklr])
    # Criando tabela comparativa
    registro.loc[f'Batches:{batch_size}, Janela:{win_length}']= pd.Series(dict(zip(['MSE','MAE','MAPE'],model.evaluate(test_generator, verbose=0))))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epo

In [14]:
# Checando Melhor Modelo
registro.index.names = ['Configuração']
registro

Unnamed: 0_level_0,MSE,MAE,MAPE
Configuração,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Batches:4, Janela:7",0.036964,0.002086,0.036964
"Batches:8, Janela:7",0.031455,0.001775,0.031455
"Batches:32, Janela:7",0.049986,0.003773,0.049986
"Batches:64, Janela:7",0.050934,0.003374,0.050934
"Batches:256, Janela:7",0.030376,0.001684,0.030376
"Batches:4, Janela:14",0.178074,0.045094,0.178074
"Batches:8, Janela:14",0.029148,0.001342,0.029148
"Batches:32, Janela:14",0.044508,0.003423,0.044508
"Batches:64, Janela:14",0.104148,0.014979,0.104148
"Batches:256, Janela:14",0.034186,0.002424,0.034186


In [15]:
# Vamos priorizar MAE pois queremos penalizar erros grandes de previsão
registro.sort_values(['MAE','MSE','MAPE']).head(3)

Unnamed: 0_level_0,MSE,MAE,MAPE
Configuração,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Batches:8, Janela:28",0.028412,0.001287,0.028412
"Batches:8, Janela:14",0.029148,0.001342,0.029148
"Batches:4, Janela:28",0.030221,0.00149,0.030221


In [15]:
# Rodando para os acima:
win_length = 28
batch_size = 8

num_features = x_train.shape[1]

# Generator para teste e treino
train_generator = TimeseriesGenerator(x_train, y_train, length= win_length, sampling_rate = 1, batch_size= batch_size)
test_generator = TimeseriesGenerator(x_test, y_test, length= win_length, sampling_rate = 1, batch_size= batch_size)

# Arquitetura LSTM
model = Sequential()
model.add(LSTM(128, input_shape = (win_length, num_features), return_sequences = True))
model.add(LeakyReLU(alpha=0.5))
model.add(LSTM(128, return_sequences=True))
model.add(LeakyReLU(alpha=0.5))
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(1)) #output layer

# Parada precoce, sempre que não melhora depois de 3 iterações
early_stopping = EarlyStopping(monitor='val_loss',patience = 3, mode='min')

# Adicionando decaimento exponencial de -0.1
def scheduler(epoch, lr):
  return np.clip(lr * tf.math.exp(-0.1), 0.000001, 0.001)


callbacklr = tf.keras.callbacks.LearningRateScheduler(scheduler)

model.compile(loss = 'mae',
            optimizer = tf.optimizers.Adam(learning_rate=0.001),
            metrics=['mse','mape'])

# aqui testei com e sem decaimento exponencial
# Rodando:
set_seed(42)
history = model.fit(train_generator, epochs = 200, validation_data = test_generator,
                            shuffle=False, callbacks = [early_stopping, callbacklr])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200


In [24]:
metricas = history.history
del metricas ['mape']
del metricas['val_mape']
fig = px.line(metricas, title = 'Evolução nas Métricas por Epochs')
fig.update_layout(xaxis_title = 'Epochs')

In [17]:
s# Prevendo
predictions = model.predict(test_generator)

In [18]:
predictions.shape #diferença da janela de 14 dias usada

(338, 1)

In [19]:
# Consideraremos só:
# x_test [:, 1:][win_length:]

In [20]:
# Concatenando previsto com teste:
df_pred = pd.concat([pd.DataFrame(predictions), pd.DataFrame(x_test [:, 1:][win_length:])],axis=1)

# Revertendo scaling:
rev_trans = scaler.inverse_transform(df_pred)

In [21]:
# Pegando só datas que foram preditas:
df_final = df_[predictions.shape[0]*-1:]

df_final.count()

Close       338
sent_sum    338
dtype: int64

In [22]:
df_final ['Close_Pred'] = rev_trans[:,0]

In [23]:
fig = px.line(df_final[['Close','Close_Pred']], title='Predito x Observado: Preço do Close de BTC em 2019')
fig.show()