# Cryptocurrency Prophet

### Predicción mediante librería prophet

"Projecto" : "Análisis_criptomonedas"  
"Título" : "Predicción mediante librería prophet"  
"Autor" : "Cristian García Díaz"  
"Fecha de creación" : "20180821"  
"Fecha de modificación" : "20180826"  
"Fuentes":  
>https://facebook.github.io/prophet/docs/quick_start.html

In [20]:
#Se importan las librerias necesarias
#Gestión de archivos y datos
import os
from datetime import datetime
import pickle

#Gráficas
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
%matplotlib inline

# Se configura el modo offline
py.init_notebook_mode(connected=True)

#Procesamiento de datos
import pandas as pd
import numpy as np
import seaborn as sns

#predicción
import pandas as pd

In [21]:
# Comprobar si no esta creada la carpeta de archivos para almacenar los datos
directorio = "cryptocurrency_analysis_files"
if not os.path.exists(directorio):
    os.mkdir(directorio)

In [22]:
#Se añaden los datos de Bitcoin
# Se define una función get_json_data para cargar los datos de la API Poloniex
"""pickle --> para no descargar de nuevo los mismo datos"""
"""La función devuelve un Dataframe Pandas"""

def get_json_data(json_url,cache_path):
    cache_path = directorio +'\{}.pkl'.format(cache_path)
    """Descargamos en cache los datos en formato json"""
    try:
        f = open(cache_path,'rb')
        df = pickle.load(f)
        print('Dataset {} cargado del cache'.format(json_url))
    except (OSError,IOError) as e:
        print('Descargando datos {} mediante la API Poloniex'.format(json_url))
        df = pd.read_json(json_url)
        df.to_pickle(cache_path)
        print('Cargado {} de {} en el cache'.format(json_url,cache_path))
    return df

In [23]:
# Se define la función genera las peticiones vía HTTP a Poloniex API y se llamará a la función get_json_data para guardar los datos obtenidos
base_url = 'https://poloniex.com/public?command=returnChartData&currencyPair={}&start={}&end={}&period={}'
start_date = datetime.strptime('2015-01-01', '%Y-%m-%d')
end_date = datetime.now()
# Periodos válidos: '15m': 900, '5m': 300, '30m': 1800, '4h': 14400, '2h': 7200, '1d': 86400
period = 86400

def get_crypto_data(poloniex_pair):
    '''Captura de los datos de criptomonedas de la API Poliniex'''
    json_url = base_url.format(poloniex_pair,start_date.timestamp(),end_date.timestamp(),period)
    data_df = get_json_data(json_url,poloniex_pair)
    data_df = data_df.set_index('date')
    return data_df

# URL de ejemplo: https://poloniex.com/public?command=returnChartData&currencyPair=BTC_ETH&start=1420066800.0&end=1483225200.0&period=86400

In [24]:
# Diccionario altcoins formado por un dataframe por cada criptomoneda.
# Cada dataframe contiene el ratio medio de cammbio entre altcoins y BTC.

#altcoins = ['ETH','LTC','XRP','ETC','STR','DASH','SC','XMR','XEM']

altcoins = ['BTC']

altcoin_data = {}
for altcoin in altcoins:
    coinpair = 'USDT_{}'.format(altcoin)
    
    BTC_price_real_df = get_crypto_data(coinpair)
    altcoin_data[altcoin] = BTC_price_real_df

Dataset https://poloniex.com/public?command=returnChartData&currencyPair=USDT_BTC&start=1420066800.0&end=1536007142.03716&period=86400 cargado del cache


In [25]:
BTC_price_real_df.head()

Unnamed: 0_level_0,close,high,low,open,quoteVolume,volume,weightedAverage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-02-19,244.0,244.0,225.0,225.0,0.193117,46.276313,239.627778
2015-02-20,240.25,245.0,240.25,240.250118,0.230429,55.894897,242.568479
2015-02-21,245.0,245.0,245.0,245.0,0.060091,14.722239,245.0
2015-02-22,235.0,249.0,235.0,245.0,0.539055,129.121248,239.532608
2015-02-23,235.0,235.001,235.0,235.000002,0.410926,96.567562,235.000062


In [26]:
BTC_price_real_df.tail()

Unnamed: 0_level_0,close,high,low,open,quoteVolume,volume,weightedAverage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-08-22,6354.767602,6894.597094,6255.608559,6479.13229,2172.958589,14297910.0,6579.926417
2018-08-23,6530.616799,6567.216243,6344.41097,6354.767602,592.459626,3817506.0,6443.486423
2018-08-24,6685.522183,6718.0,6458.002476,6525.836537,759.907347,4998491.0,6577.763106
2018-08-25,6726.091103,6789.0,6665.0,6693.306879,519.285013,3486018.0,6713.112009
2018-08-26,6672.4847,6778.395678,6571.29781,6726.258293,449.810054,2998773.0,6666.755151


In [27]:
# Gráfico del precio del BTC
btc_trace = go.Scatter(x=BTC_price_real_df.index, y=BTC_price_real_df['close'])
py.iplot([btc_trace])

In [28]:
# Se define la función para visualizar los datos
def df_scatter(df, title,seperate_y_axis=False, y_axis_label='',scale='linear',initial_hide=False):
    # Se definen la lista de los nombres de cada dataframe como una lista label_arr = ['BITSTAMP', 'COINBASE', 'ITBIT', 'KRAKEN']
    label_arr = list(df)
    # Aplicamos una función lambda para mapear cada columnas y asignar la etiqueta correspondiente
    # Se guarda como otra lista series_arr
    series_arr = list(map(lambda col:df[col],label_arr))
    
    # Se definen los parametros de la salida gráfica
    layout = go.Layout(
        title = title,
        legend = dict(orientation='h'),
        xaxis = dict(type='date'),
        yaxis = dict(
            title = y_axis_label, 
            showticklabels = not seperate_y_axis,
            type = scale
        )
    )
    
    # Se define la configuración del eje y
    y_axis_config = dict(
        overlaying = 'y',
        showticklabels = False,
        type = scale
    )
    
    # Se define la visibilidad
    visibility = 'visible'
    if initial_hide:
        visibility = 'legendonly'
        
    # Se define la forma para cada serie de datos
    trace_arr = []
    for index, series in enumerate(series_arr):
        trace = go.Scatter(
        x = series.index,
        y = series,
        name = label_arr[index],
        visible = visibility
        )
        
        #Añadir un eje separado para cada serie
        if seperate_y_axis:
            trace['yaxis'] = 'y{format}'.format(index + 1)
            layout['yaxis{}'.format(index + 1)] = y_axis_config
        trace_arr.append(trace)
    
    fig = go.Figure(data = trace_arr, layout = layout)
    py.iplot(fig)

In [29]:
#Copiamos los datos a un DF nuevo para poder trabajar con él sin afectar a los datos originales
BTC_price_transform = BTC_price_real_df['weightedAverage']

In [30]:
# Comprobamos si hay datos que sean nulos en cualquier lugar del DF
# Estamos de suerte no hay datos nulos
BTC_price_transform.isnull().any()

False

In [31]:
BTC_price_real_df.index

DatetimeIndex(['2015-02-19', '2015-02-20', '2015-02-21', '2015-02-22',
               '2015-02-23', '2015-02-24', '2015-02-25', '2015-02-26',
               '2015-02-27', '2015-02-28',
               ...
               '2018-08-17', '2018-08-18', '2018-08-19', '2018-08-20',
               '2018-08-21', '2018-08-22', '2018-08-23', '2018-08-24',
               '2018-08-25', '2018-08-26'],
              dtype='datetime64[ns]', name='date', length=1285, freq=None)

In [32]:
BTC_price_real_df['weightedAverage'].values

array([ 239.62777823,  242.56847926,  245.        , ..., 6577.76310567,
       6713.11200926, 6666.7551505 ])

In [33]:
# Se define la función para visualizar los datos
def df_scatter(df, title,seperate_y_axis=False, y_axis_label='',scale='linear',initial_hide=False):
    # Se definen la lista de los nombres de cada dataframe como una lista label_arr = ['BITSTAMP', 'COINBASE', 'ITBIT', 'KRAKEN']
    label_arr = list(df)
    # Aplicamos una función lambda para mapear cada columnas y asignar la etiqueta correspondiente
    # Se guarda como otra lista series_arr
    series_arr = list(map(lambda col:df[col],label_arr))
    
    # Se definen los parametros de la salida gráfica
    layout = go.Layout(
        title = title,
        legend = dict(orientation='h'),
        xaxis = dict(type='date'),
        yaxis = dict(
            title = y_axis_label, 
            showticklabels = not seperate_y_axis,
            type = scale
        )
    )
    
    # Se define la configuración del eje y
    y_axis_config = dict(
        overlaying = 'y',
        showticklabels = False,
        type = scale
    )
    
    # Se define la visibilidad
    visibility = 'visible'
    if initial_hide:
        visibility = 'legendonly'
        
    # Se define la forma para cada serie de datos
    trace_arr = []
    for index, series in enumerate(series_arr):
        trace = go.Scatter(
        x = series.index,
        y = series,
        name = label_arr[index],
        visible = visibility
        )
        
        #Añadir un eje separado para cada serie
        if seperate_y_axis:
            trace['yaxis'] = 'y{format}'.format(index + 1)
            layout['yaxis{}'.format(index + 1)] = y_axis_config
        trace_arr.append(trace)
    
    fig = go.Figure(data = trace_arr, layout = layout)
    py.iplot(fig)

In [34]:
mesures_bitcoin = BTC_price_real_df

In [35]:
# Crear los datos de entreno, test y evaluación
# Entreno
train_from_date = '2016-01-01'
train_end_date =  '2018-06-22'
# Test
test_from_date = '2018-06-23'
test_end_date = '2018-08-16'
# Evaluación
# '2018-08-17'
evaluation_from_date = '2018-08-10'
evaluation_end_date = '2018-08-22'

df_train = mesures_bitcoin.loc[train_from_date:train_end_date]
df_test = mesures_bitcoin.loc[test_from_date:test_end_date]
df_evaluation = mesures_bitcoin.loc[evaluation_from_date:evaluation_end_date]


print(df_train.size," días de entreno\n",df_test.size," días de test\n",df_evaluation.size," días de evaluación\n")

train_days = mesures_bitcoin.loc[train_from_date:train_end_date].count()
test_days = mesures_bitcoin.loc[test_from_date:test_end_date].count()
evalutacion_days = mesures_bitcoin.loc[evaluation_from_date:evaluation_end_date].count()
print(train_days, " desde ",train_from_date," hasta ",train_end_date )
print(test_days, " desde ",test_from_date," hasta ",test_end_date )
print(evalutacion_days, " desde ",evaluation_from_date," hasta ",evaluation_end_date )

6328  días de entreno
 385  días de test
 91  días de evaluación

close              904
high               904
low                904
open               904
quoteVolume        904
volume             904
weightedAverage    904
dtype: int64  desde  2016-01-01  hasta  2018-06-22
close              55
high               55
low                55
open               55
quoteVolume        55
volume             55
weightedAverage    55
dtype: int64  desde  2018-06-23  hasta  2018-08-16
close              13
high               13
low                13
open               13
quoteVolume        13
volume             13
weightedAverage    13
dtype: int64  desde  2018-08-10  hasta  2018-08-22


In [36]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from time import time
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.callbacks import EarlyStopping
from keras import initializers

from keras.layers import Activation
from keras.layers import Dropout

Unnamed: 0_level_0,close,high,low,open,quoteVolume,volume,weightedAverage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01,434.990000,435.000000,428.260002,428.260002,6.887257,2.975027e+03,431.961057
2016-01-02,436.949900,438.140000,430.500000,434.990000,2.179981,9.448533e+02,433.422678
2016-01-03,428.140000,435.614998,426.453089,432.310000,1.596349,6.839185e+02,428.426673
2016-01-04,432.000011,435.999999,427.291140,427.291141,1.673428,7.249760e+02,433.228179
2016-01-05,430.376774,435.999999,429.569500,430.140211,0.992530,4.276473e+02,430.865842
2016-01-06,427.500020,435.000000,427.290001,430.170473,1.903358,8.193448e+02,430.473321
2016-01-07,458.897002,458.897002,427.500573,427.500573,24.013560,1.070624e+04,445.841533
2016-01-08,453.050000,468.000000,447.000000,451.100011,12.930452,5.912948e+03,457.288610
2016-01-09,447.420001,457.398054,447.000379,453.050000,4.181581,1.879485e+03,449.467648
2016-01-10,450.728520,453.030000,440.020007,447.420001,20.461995,9.116861e+03,445.550928


In [40]:
# Variable independientes
df_train_x = df_train.loc[:,:]
# Variables predictoras
df_train_y = df_train.loc[:,"weightedAverage"]

# Variable independientes
df_test_x = df_test.loc[:,:]
# Variables predictoras
df_test_y = df_test.loc[:,"weightedAverage"]

# Variable independientes
df_evaluation_x = df_evaluation.loc[:,:]
# Variables predictoras
df_evaluation_y = df_evaluation.loc[:,"weightedAverage"]

In [47]:
window_len = 10
mesures_name = ["close",
               "high",
               "low",
               "open",
               "quoteVolume",
               "volume",
               "weightedAverage"]

In [48]:
# ENTRENO
#Variables explicativas
LSTM_training_inputs = []

for i in range(len(df_train_x)-window_len):
    temp_set = df_train_x[i:(i+window_len)].copy()
    for j in mesures_name:
        temp_set.loc[:, j] = temp_set[j]/temp_set[j].iloc[0] - 1
    LSTM_training_inputs.append(temp_set)
    
#Variable explicada
LSTM_training_outputs = (df_train_y[window_len:].values/df_train_y[:-window_len].values)-1

# TEST
#Variables explicativas
LSTM_test_inputs = []

for i in range(len(df_test_x)-window_len):
    temp_set = df_test_x[i:(i+window_len)].copy()
    for j in mesures_name:
        temp_set.loc[:, j] = temp_set[j]/temp_set[j].iloc[0] - 1
    LSTM_test_inputs.append(temp_set)
    
#Variable explicada 
LSTM_test_outputs = (df_test_y[window_len:].values/df_test_y[:-window_len].values)-1

# EVALUATION
#Variables explicativas
LSTM_evaluation_inputs = []

for i in range(len(df_evaluation_x)-window_len):
    temp_set = df_evaluation_x[i:(i+window_len)].copy()
    for j in mesures_name:
        temp_set.loc[:, j] = temp_set[j]/temp_set[j].iloc[0] - 1
    LSTM_evaluation_inputs.append(temp_set)
    
#Variable explicada 
LSTM_evaluation_outputs = (df_evaluation_y[window_len:].values/df_evaluation_y[:-window_len].values)-1

# Convertimos los datos de dataframe a matrices numpy ya que tenemos datos númericos
LSTM_training_inputs = [np.array(LSTM_training_input) for LSTM_training_input in LSTM_training_inputs]
LSTM_training_inputs = np.array(LSTM_training_inputs)

LSTM_test_inputs = [np.array(LSTM_test_inputs) for LSTM_test_inputs in LSTM_test_inputs]
LSTM_test_inputs = np.array(LSTM_test_inputs)

LSTM_evaluation_inputs = [np.array(LSTM_evaluation_inputs) for LSTM_evaluation_inputs in LSTM_evaluation_inputs]
LSTM_evaluation_inputs = np.array(LSTM_evaluation_inputs)

In [58]:
# La salida del modelo se normaliza
LSTM_training_outputs = (df_train['weightedAverage'][window_len:].values/df_train['weightedAverage'][:-window_len].values)-1

In [59]:
LSTM_training_outputs

array([ 3.68419980e-02,  2.49174854e-02,  4.28732421e-03, -7.26912379e-03,
       -8.59346345e-02, -1.55691038e-01, -1.43780455e-01, -1.56926570e-01,
       -1.38364880e-01, -9.25597999e-02, -8.28039134e-02, -1.25636960e-01,
       -9.56375449e-02, -8.69920140e-02,  4.03092317e-03,  7.64339767e-02,
        3.00223526e-02, -1.34032902e-02, -3.72261397e-02, -6.71793826e-02,
       -8.77876821e-02, -4.41063729e-02, -4.08026036e-02, -7.25702027e-02,
       -5.85005575e-02,  1.37052137e-03, -3.58956924e-02, -1.23856612e-02,
       -3.34405345e-04, -5.72084041e-03,  9.02417005e-03,  2.12514311e-02,
        5.91225568e-03,  6.81067805e-02,  7.04824467e-02,  2.94814752e-02,
        7.62011607e-02,  1.01018184e-01,  1.24376704e-01,  1.18365628e-01,
        1.41339396e-01,  1.52390072e-01,  1.59595943e-01,  9.69805371e-02,
        4.13184334e-02,  1.04849047e-02,  4.01468462e-02,  3.32002289e-02,
       -3.48308349e-03,  4.46811956e-02,  2.48487103e-03, -1.18875143e-02,
       -3.38177952e-02, -

In [94]:
# Se define una función para construir el modelo de red neuronal
# Se construye un modelo vacío sequencial y se agrega una capa LSTM.
# El modelo se ha configurado para que se adapte a una entrada n x m.
# Se incluye la función de activación.

def build_model(inputs, output_size, neurons, activ_func="linear",
                dropout=0.25, loss="mae", optimizer="adam"):
    model = Sequential()

    model.add(LSTM(neurons, input_shape=(inputs.shape[1], inputs.shape[2])))
    model.add(Dropout(dropout))
    model.add(Dense(units=output_size))
    model.add(Activation(activ_func))

    model.compile(loss=loss, optimizer=optimizer)
    return model

In [95]:
# Se define una semilla para generar los números pseudoaleatorios
np.random.seed(14)

# Se inicializa el modelo
model_btc = build_model(LSTM_training_inputs, output_size=1, neurons = 20)

# La salida del modelo se normaliza
LSTM_training_outputs = (df_train['weightedAverage'][window_len:].values/df_train['weightedAverage'][:-window_len].values)-1

# Comprobar el tiempo
start_time = time()

#Se entrena al modelo. model_btc_history contiene información del error por entreno
model_btc_history = model_btc.fit(LSTM_training_inputs, LSTM_training_outputs, 
                            epochs=50, batch_size=1, verbose=2, shuffle=True)

#Se entrena al modelo. model_btc_history contiene información del error por entreno
#model_btc_history = model_btc.fit(LSTM_training_inputs, LSTM_training_outputs, 
#                           epochs=50, batch_size=1, verbose=2, shuffle=False,
#                           validation_data=(LSTM_test_inputs, LSTM_test_outputs),
#                           callbacks = [EarlyStopping(monitor='val_loss', min_delta=5e-5, patience=20, verbose=1)])
# Comprobar el tiempo
final_time = time() - start_time

Epoch 1/50
 - 6s - loss: 0.0643
Epoch 2/50
 - 5s - loss: 0.0449
Epoch 3/50
 - 4s - loss: 0.0362
Epoch 4/50
 - 4s - loss: 0.0343
Epoch 5/50
 - 4s - loss: 0.0324
Epoch 6/50
 - 5s - loss: 0.0315
Epoch 7/50
 - 6s - loss: 0.0309
Epoch 8/50
 - 6s - loss: 0.0302
Epoch 9/50
 - 5s - loss: 0.0307
Epoch 10/50
 - 6s - loss: 0.0300
Epoch 11/50
 - 6s - loss: 0.0298
Epoch 12/50
 - 5s - loss: 0.0302
Epoch 13/50
 - 6s - loss: 0.0284
Epoch 14/50
 - 6s - loss: 0.0280
Epoch 15/50
 - 5s - loss: 0.0286
Epoch 16/50
 - 5s - loss: 0.0293
Epoch 17/50
 - 4s - loss: 0.0276
Epoch 18/50
 - 4s - loss: 0.0285
Epoch 19/50
 - 4s - loss: 0.0275
Epoch 20/50
 - 5s - loss: 0.0283
Epoch 21/50
 - 4s - loss: 0.0280
Epoch 22/50
 - 4s - loss: 0.0274
Epoch 23/50
 - 4s - loss: 0.0290
Epoch 24/50
 - 5s - loss: 0.0276
Epoch 25/50
 - 5s - loss: 0.0269
Epoch 26/50
 - 5s - loss: 0.0275
Epoch 27/50
 - 5s - loss: 0.0279
Epoch 28/50
 - 5s - loss: 0.0272
Epoch 29/50
 - 5s - loss: 0.0278
Epoch 30/50
 - 5s - loss: 0.0273
Epoch 31/50
 - 4s -

In [96]:
# Tiempo de ejecución
print('Tiempo de ejecución de la red neural es de: {0:.3f}'.format(final_time))

Tiempo de ejecución de la red neural es de: 234.919


In [97]:
# Gráfico del error MAE
history_error_btc = go.Scatter(x=model_btc_history.epoch, y=model_btc_history.history['loss'])
py.iplot([history_error_btc])

In [98]:
predicted = ((np.transpose(model_btc.predict(LSTM_training_inputs))+1) * df_train_y.values[:-window_len])[0]
observated = df_train_y.values[window_len:]

In [99]:
# Visualización de 
trace1 = go.Scatter(
    x = np.arange(0, len(predicted), 1),
    y = predicted,
    mode = 'lines',
    name = 'Predicted',
    line = dict(color=('rgb(244, 146, 65)'), width=2)
)
trace2 = go.Scatter(
    x = np.arange(0, len(predicted), 1),
    y = observated,
    mode = 'lines',
    name = 'Observaciones',
    line = dict(color=('rgb(66, 244, 155)'), width=2)
)

data = [trace1, trace2]
layout = dict(title = 'Comparison of true prices (on the test dataset) with prices our model predicted',
             xaxis = dict(title = 'Day number'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='results_demonstrating0')

In [100]:
# MSE
print("MSE: %.3f" % mean_squared_error(predicted,observated ))
# RMSE Root Mean Square Error
RMSE = sqrt(mean_squared_error(predicted,observated ))
print('RMSE: %.3f' % RMSE)
from sklearn.metrics import mean_absolute_error
# MAE
print("MAE: %.3f" % mean_absolute_error(predicted,observated ))

MSE: 60746.957
RMSE: 246.469
MAE: 108.731


In [101]:
MAE = mean_absolute_error(LSTM_test_outputs, model_btc.predict(LSTM_test_inputs))

In [102]:
print(MAE)

0.019096092989025867


In [None]:
"""
linear sin validación 20 neuronas
MSE: 66863.095
RMSE: 258.579
MAE: 101

linear con validación 20 neuronas
MSE: 66863.095
RMSE: 258.579
MAE: 116.466
"""