In [98]:

import warnings
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

import Data_Process as dp

warnings.filterwarnings('ignore')
%reload_ext autoreload
%autoreload 2


In [102]:
# Read the asset metadata table and the training table from the competition folder.
asset_details_path = "g-research-crypto-forecasting/asset_details.csv"
train_path = "g-research-crypto-forecasting/train.csv"
info = pd.read_csv(asset_details_path)
ctrain = pd.read_csv(train_path)

ValueError: Only callable can be used as callback

In [None]:
# Build the per-asset frame for asset_id 1 with the project helper (stored as `btc`).
btc = dp.c_time_sub(data=ctrain, asset_id=1)

In [None]:
# Quick look at the raw Close column.
btc["Close"].plot()

In [None]:
# Box-Cox transformation of the monthly-mean series
btc_month = btc.resample("M").mean()
# stats.boxcox returns the transformed values together with the fitted lambda
monthly_close_box, lmbda = stats.boxcox(btc_month.Close)
btc_month["close_box"] = monthly_close_box
# Element [1] of the adfuller result is the p-value
monthly_adf_p = adfuller(btc_month.close_box)[1]
print("Dickey–Fuller test: p=%f" % monthly_adf_p)

In [None]:
# Box-Cox transformation of the un-resampled (minute-level) series
btc["close_box"], lmbda_2=stats.boxcox(btc.Close)
# Run the stationarity test on the series transformed in this cell (btc), not on
# btc_month, so the printed p-value describes the minute-level data.
# NOTE(review): adfuller may be slow / memory-hungry on a long minute-level series.
print("Dickey–Fuller test: p=%f" % adfuller(btc.close_box)[1])


In [None]:
# Seasonal differencing (12 months): each value minus the value 12 rows earlier
lag_12 = 12
btc_month['box_diff_seasonal_12'] = btc_month.close_box.diff(lag_12)
# The first 12 rows have no predecessor and are NaN, so they are skipped for the test
adf_p_lag_12 = adfuller(btc_month.box_diff_seasonal_12.iloc[lag_12:])[1]
print("Dickey–Fuller test: p=%f" % adf_p_lag_12)

In [None]:
# Seasonal differencing (3 months): each value minus the value 3 rows earlier
lag_3 = 3
btc_month['box_diff_seasonal_3'] = btc_month.close_box.diff(lag_3)
# The first 3 rows have no predecessor and are NaN, so they are skipped for the test
adf_p_lag_3 = adfuller(btc_month.box_diff_seasonal_3.iloc[lag_3:])[1]
print("Dickey–Fuller test: p=%f" % adf_p_lag_3)

Initial approximation of parameters using autocorrelation (ACF) and partial autocorrelation (PACF) plots
for the monthly values

In [None]:
# Two stacked panels: ACF on top, PACF below, both for the monthly Close values
fig, (acf_ax, pacf_ax) = plt.subplots(2, 1)
monthly_close_values = btc_month.Close.values.squeeze()
plot_acf(monthly_close_values, lags=12, ax=acf_ax)
plot_pacf(monthly_close_values, lags=12, ax=pacf_ax)
plt.tight_layout()
plt.show()

We see that there are 4 significant spikes in the ACF, followed by an almost significant spike at lag 5.
In the PACF, there are 4 significant spikes and no significant spikes thereafter (apart from one just outside the bounds). We can ignore a significant spike in either plot if it is only just outside the limits and is not among the first few lags.

In [None]:
# ACF and PACF (each in its own figure) for the Box-Cox transformed monthly series
monthly_boxcox_values = btc_month.close_box.values.squeeze()
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(monthly_boxcox_values, lags=12)

plt.tight_layout()
plt.show()

ARIMA for Close

In [None]:
PCT_VALIDATION = 0.1 # last 10% of the data are used as validation set

# Keep the leading (1 - PCT_VALIDATION) share of rows, in chronological order
n_month_train_rows = int(len(btc_month)*(1-PCT_VALIDATION))
btc_month_train = btc_month.iloc[:n_month_train_rows]
# Displayed value: the fraction of rows that ended up in the training slice
len(btc_month_train)/len(btc_month)

In [None]:
# Initial approximation of parameters: every (p, q) pair with p, q in 0..4 and fixed d

qs = range(0, 5)
ps = range(0, 5)
d=1
parameters = product(ps, qs)
parameters_list = list(parameters)

# Model Selection: fit SARIMAX(p, d, q) on the training Close series for each pair
# and keep the fit with the lowest AIC.
results = []
best_aic = float("inf")
# Bound up front so a run in which every fit fails is detected below instead of
# surfacing later as a NameError in another cell.
best_model = None
best_param = None
warnings.filterwarnings('ignore')
for param in parameters_list:
    try:
        model = SARIMAX(btc_month_train.Close, order=(param[0], d, param[1])).fit(disp=-1)
    except ValueError:
        # Some (p, q) combinations are rejected by the model; report and move on
        print('bad parameter combination:', param)
        continue
    aic = model.aic
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, aic])

if best_model is None:
    raise RuntimeError("No SARIMAX order in the grid produced a model with a usable AIC")

In [None]:
# Best Models: tabulate (parameters, aic) pairs and show the five lowest-AIC rows
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']
lowest_aic_rows = result_table.sort_values(by = 'aic', ascending=True).head()
print(lowest_aic_rows)

In [None]:
# Diagnostic plots for the lowest-AIC model from the grid search
best_model_fig_size = (15, 12)
best_model.plot_diagnostics(figsize=best_model_fig_size)
plt.show()

In [None]:
# ADF p-value for the best model's residuals, skipping the first 13 of them
best_resid_adf_p = adfuller(best_model.resid[13:])[1]
print("Dickey–Fuller test:: p=%f" % best_resid_adf_p)

Auto tuning of params

In [None]:
#we will use pmdarima library to fit automatically
#!pip install pmdarima
import numpy as np
import pmdarima as pm
import pmdarima as pm
from pmdarima.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

If you are unsure (as is common) of the best parameters for your model, let auto_arima figure it out for you. auto_arima is similar to an ARIMA-specific grid search, but (by default) uses a more intelligent stepwise algorithm laid out in a paper by Hyndman and Khandakar (2008). If stepwise is False, the models will be fit similar to a gridsearch. Note that it is possible for auto_arima not to find a model that will converge; if this is the case, it will raise a ValueError.



In [None]:
# fitting a stepwise model:

stepwise_search_options = dict(
    start_p=1, start_q=1, max_p=3, max_q=3, m=12,
    start_P=0, seasonal=True, d=1, D=1, trace=True,
    error_action='ignore',   # don't want to know if an order does not work
    suppress_warnings=True,  # don't want convergence warnings
    stepwise=True,           # set to stepwise
)
stepwise_fit = pm.auto_arima(btc_month.Close, **stepwise_search_options)

stepwise_fit.summary()

In [None]:
# Diagnostic plots for the stepwise auto_arima fit
stepwise_fig_size = (15, 12)
stepwise_fit.plot_diagnostics(figsize=stepwise_fig_size)
plt.show()

In [None]:
#stepwise is False
random_search_options = dict(
    start_p=1, start_q=1, max_p=3, max_q=3, m=12,
    start_P=0, seasonal=True, d=1, D=1, trace=True,
    n_jobs=-1,               # We can run this in parallel by controlling this option
    error_action='ignore',   # don't want to know if an order does not work
    suppress_warnings=True,  # don't want convergence warnings
    stepwise=False, random=True, random_state=42,  # we can fit a random search (not exhaustive)
    n_fits=25,
)
rs_fit = pm.auto_arima(btc_month.Close, **random_search_options)

rs_fit.summary()

In [None]:
# Diagnostic plots for the random-search auto_arima fit
rs_fig_size = (15, 12)
rs_fit.plot_diagnostics(figsize=rs_fig_size)
plt.show()


In [None]:
# First 35 monthly observations go to train, the remainder to test
monthly_close_series = btc_month.Close
train, test = train_test_split(monthly_close_series, train_size=35)

In [None]:
# Fit the model
model = pm.auto_arima(train, seasonal=True, m=12)

# make the forecasts: one step per held-out observation, with interval bounds
forecasts,conf_int = model.predict(test.shape[0],return_conf_int=True)  # predict N steps into the future

# Visualize the forecasts (blue=train, green=forecasts)
# The split point is taken from `train` itself rather than a hard-coded 35, so the
# plot stays aligned if the split size in the previous cell is changed.
n_train_points = train.shape[0]
x = np.arange(btc_month.Close.shape[0])
plt.plot(x[:n_train_points], train, c='blue')
plt.plot(x[n_train_points:], forecasts, c='green')
plt.show()

In [None]:
# Diagnostic plots for the auto_arima model fitted on the training split
model_fig_size = (15, 12)
model.plot_diagnostics(figsize=model_fig_size)
plt.show()

In [None]:
def plt_arima_result(y_train = train, forecasts = forecasts,y_test = test,conf_int = conf_int ):
    """Plot training data, forecasts and held-out actuals on a single axis.

    Parameters
    ----------
    y_train : array-like with ``.shape``
        Training observations, drawn in blue at x = 0 .. len(y_train) - 1.
    forecasts : array-like with ``.shape``
        Predicted values, drawn in green starting right after the training span.
    y_test : sized array-like
        Actual values for the forecast horizon, drawn in red starting right after
        the training span.
    conf_int : 2-D array-like or None
        Column 0 is used as the lower bound and column 1 as the upper bound of the
        shaded band. The band is drawn only when there is exactly one row per
        forecast; otherwise it is omitted.

    Notes
    -----
    The default values are bound to the objects named ``train``, ``forecasts``,
    ``test`` and ``conf_int`` at the moment this cell is executed, not at call
    time. Pass every argument explicitly when plotting a different model;
    a left-over ``conf_int`` of the wrong length is skipped rather than raising.
    """
    fig = plt.figure(figsize=(16, 8))
    ax = fig.add_subplot(1, 1, 1)

    n_train = y_train.shape[0]
    n_forecast = forecasts.shape[0]
    x = np.arange(n_train + n_forecast)
    x_forecast = x[n_train:]
    # y_test gets its own x-range so a horizon that differs in length from the
    # forecasts is still plotted instead of failing on a shape mismatch.
    x_test = np.arange(n_train, n_train + len(y_test))

    ax.plot(x[:n_train], y_train, color='blue', label='Training Data')
    ax.plot(x_forecast, forecasts, color='green', marker='o',
            label='Predicted')
    ax.plot(x_test, y_test, color='red', label='Actual')
    ax.legend(loc='lower left', borderaxespad=0.5)
    ax.set_title('Predicted Foo')
    ax.set_ylabel('# Foo')

    # Shade the interval only if it lines up with the forecasts one row per step.
    band = None if conf_int is None else np.asarray(conf_int)
    if (band is not None and band.ndim == 2
            and band.shape[0] == n_forecast and band.shape[1] >= 2):
        ax.fill_between(x_forecast,
                        band[:, 0], band[:, 1],
                        alpha=0.1, color='b')

    plt.show()

In [None]:
# Pass the interval explicitly instead of relying on the value captured as the
# function's default when it was defined.
plt_arima_result(y_train = train, forecasts = forecasts, y_test = test, conf_int = conf_int)

In [None]:
import pmdarima as pm
from pmdarima import arima
from pmdarima import model_selection
from pmdarima import pipeline
from pmdarima import preprocessing
from pmdarima.datasets._base import load_date_example

import numpy as np
from matplotlib import pyplot as plt

Predict with BoxCoxEndogTransformer

In [None]:
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import BoxCoxEndogTransformer
import pickle

# Load/split your data: chronological 80/20 split of the monthly Close series
y = btc_month.Close
train_size=int(len(y)*0.8)
train, test = train_test_split(y, train_size=train_size)

# Define and fit your pipeline
pipeline = Pipeline([
    ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6)),  # lmbda2 avoids negative values
    ('arima', pm.AutoARIMA(seasonal=True, m=12, #monthly data
                           suppress_warnings=True,
                           trace=True))
])

pipeline.fit(train)

# Serialize your model just like you would in scikit:
with open('model.pkl', 'wb') as pkl:
    pickle.dump(pipeline, pkl)

# Load it and make predictions seamlessly.
# NOTE: pickle.load executes code from the file; only load files written by this notebook.
with open('model.pkl', 'rb') as pkl:
    mod = pickle.load(pkl)

# Forecast exactly one step per held-out observation so the forecasts, the interval
# and `test` all have the same length, then reuse that single prediction for both
# the printout and the plot.
forecasts, conf_int = mod.predict(len(test), return_conf_int=True)
print(forecasts)

plt_arima_result(y_train = train, forecasts = forecasts, y_test = test, conf_int = conf_int)

Hourly prediction

In [None]:
# Hourly mean of every column
btc_hour=btc.resample("H").mean()

import pmdarima as pm
from pmdarima.model_selection import train_test_split
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import BoxCoxEndogTransformer
import pickle

# Chronological 80/20 split of the hourly Close series
y = btc_hour.Close
train_size=int(len(y)*0.8)
train, test = train_test_split(y, train_size=train_size)

# Pipeline: Box-Cox on the target, then an automatically selected ARIMA with m=1
boxcox_step = ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6))  # lmbda2 avoids negative values
arima_step = ('arima', pm.AutoARIMA(seasonal=True, m=1,
                                    suppress_warnings=True,
                                    trace=True))
pipeline = Pipeline([boxcox_step, arima_step])

pipeline.fit(train)

# Round-trip the fitted pipeline through a pickle file
with open('model_H.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

with open('model_H.pkl', 'rb') as model_file:
    mod = pickle.load(model_file)

# One forecast step for every observation outside the training slice
n_holdout = len(y)-train_size
forecasts = mod.predict(n_holdout)
forecasts

In [None]:
# plt_arima_result's default conf_int was captured when the function was defined and
# belongs to a different model, so request the interval from the model being plotted
# (`mod`) and pass it explicitly.
forecasts, conf_int = mod.predict(len(test), return_conf_int=True)
plt_arima_result(y_train = train, forecasts = forecasts, y_test = test, conf_int = conf_int)


In [None]:
# Hourly mean of every column
btc_hour=btc.resample("H").mean()

import pmdarima as pm
from pmdarima.model_selection import train_test_split
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import BoxCoxEndogTransformer
import pickle

# Chronological 80/20 split of the hourly Close series
y = btc_hour.Close
train_size=int(len(y)*0.8)
train, test = train_test_split(y, train_size=train_size)

# Pipeline: Box-Cox on the target, then an automatically selected ARIMA with m=12
boxcox_step = ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6))  # lmbda2 avoids negative values
arima_step = ('arima', pm.AutoARIMA(seasonal=True, m=12,
                                    suppress_warnings=True,
                                    trace=True))
pipeline = Pipeline([boxcox_step, arima_step])

pipeline.fit(train)

# Round-trip the fitted pipeline through a pickle file
with open('model_H.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

with open('model_H.pkl', 'rb') as model_file:
    mod = pickle.load(model_file)

# One forecast step for every observation outside the training slice
n_holdout = len(y)-train_size
forecasts = mod.predict(n_holdout)
forecasts



In [None]:
# plt_arima_result's default conf_int was captured when the function was defined and
# belongs to a different model, so request the interval from the model being plotted
# (`mod`) and pass it explicitly.
forecasts, conf_int = mod.predict(len(test), return_conf_int=True)
plt_arima_result(y_train = train, forecasts = forecasts, y_test = test, conf_int = conf_int)

In [None]:
# Hourly mean of every column
btc_hour=btc.resample("H").mean()

import pmdarima as pm
from pmdarima.model_selection import train_test_split
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import BoxCoxEndogTransformer
import pickle

# Load/split your data: chronological 80/20 split of the hourly Close series
y = btc_hour.Close
train_size=int(len(y)*0.8)
train, test = train_test_split(y, train_size=train_size)

# Define and fit your pipeline
pipeline = Pipeline([
    ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6)),  # lmbda2 avoids negative values
    ('arima', pm.AutoARIMA(seasonal=True, m=24, #hourly data
                           suppress_warnings=True,
                           trace=True))
])

pipeline.fit(train)

# Serialize your model just like you would in scikit:
with open('model_H.pkl', 'wb') as pkl:
    pickle.dump(pipeline, pkl)

# Load it and make predictions seamlessly.
# NOTE: pickle.load executes code from the file; only load files written by this notebook.
with open('model_H.pkl', 'rb') as pkl:
    mod = pickle.load(pkl)

# Ask for the interval together with the forecasts and hand it to the plot, so the
# shaded band belongs to this model rather than to plt_arima_result's default.
forecasts, conf_int = mod.predict(len(y)-train_size, return_conf_int=True)

plt_arima_result(y_train = train, forecasts = forecasts, y_test = test, conf_int = conf_int)

Minute prediction

In [None]:
# Keep only the trailing 180 days' worth of minute rows
n_steps_to_train = 180*24*60 #0.5 year
btc_mini = btc.iloc[-n_steps_to_train:]

In [None]:
# Show the final row of the frame
btc.tail(1)

In [None]:
# ACF and PACF (each in its own figure) of the minute-level Close values, 60 lags
minute_close_values = btc_mini.Close.values.squeeze()
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(minute_close_values, lags=60)

plt.tight_layout()
plt.show()







# Load/split your data: chronological 80/20 split of the minute-level Close series
y = btc_mini.Close

train_size=int(len(y)*0.8)
train, test = train_test_split(y, train_size=train_size)

# Define and fit your pipeline
pipeline = Pipeline([
    ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6)),  # lmbda2 avoids negative values
    ('arima', pm.AutoARIMA(seasonal=True, m =2, #min data
                           suppress_warnings=True,
                           trace=True))
])

pipeline.fit(train)

# Serialize your model just like you would in scikit:
with open('model_H.pkl', 'wb') as pkl:
    pickle.dump(pipeline, pkl)

# Load it and make predictions seamlessly.
# NOTE: pickle.load executes code from the file; only load files written by this notebook.
with open('model_H.pkl', 'rb') as pkl:
    mod = pickle.load(pkl)
forecasts ,conf_int = mod.predict(len(y)-train_size,return_conf_int=True)

# Hand the interval computed above to the plot; without it the function falls back
# to the conf_int captured when it was defined, which belongs to another model.
plt_arima_result(y_train = train, forecasts = forecasts, y_test = test, conf_int = conf_int)

In [None]:

# Keep only the trailing 180 days' worth of minute rows
n_steps_to_train = 180*24*60 #0.5 year
btc_mini =  btc[-n_steps_to_train:]

# Load/split your data: chronological 80/20 split of the minute-level Close series
y = btc_mini.Close

train_size=int(len(y)*0.8)
train, test = train_test_split(y, train_size=train_size)

# Define and fit your pipeline
pipeline = Pipeline([
    ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6)),  # lmbda2 avoids negative values
    ('arima', pm.AutoARIMA(seasonal=True, m =60, #min data
                           suppress_warnings=True,
                           trace=True))
])

pipeline.fit(train)

# Serialize your model just like you would in scikit:
with open('model_H.pkl', 'wb') as pkl:
    pickle.dump(pipeline, pkl)

# Load it and make predictions seamlessly.
# NOTE: pickle.load executes code from the file; only load files written by this notebook.
with open('model_H.pkl', 'rb') as pkl:
    mod = pickle.load(pkl)
forecasts ,conf_int = mod.predict(len(y)-train_size,return_conf_int=True)

# Hand the interval computed above to the plot; without it the function falls back
# to the conf_int captured when it was defined, which belongs to another model.
plt_arima_result(y_train = train, forecasts = forecasts, y_test = test, conf_int = conf_int)