# Multiple Variable Wavelet Preprocessing

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from crypr.data import get_data
from crypr.util import common

from crypr.features.build_features import make_single_feature, series_to_predict_matrix, make_features, data_to_supervised
from crypr.features.wavelets import *

from scipy.fftpack import fft, fftfreq, fftshift
from scipy import signal
import pywt

import pickle
import gc
p = print  # short alias; later cells use it to print array shapes

In [15]:
# Notebook configuration: everything a reader might want to tune lives here.
SYM = 'BTC'            # symbol; selects the raw CSV and is embedded in the output file names
LAST_N_HOURS = 16000   # passed to make_features as train_on_x_last_hours
TARGET = 'close'       # name of the column treated as the prediction target
Tx = 72                # passed to data_to_supervised as Tx (input window length)
Ty = 1                 # passed to data_to_supervised as Ty (output length)
TEST_SIZE = 0.05       # test_size handed to train_test_split

In [16]:
# Load the raw data for the configured symbol; the first CSV column becomes the index.
raw_csv_path = '../data/raw/{}.csv'.format(SYM)
data = pd.read_csv(raw_csv_path, index_col=0)

In [17]:
"""
Get percent change feature and target data.

The target column comes from the TARGET config constant so that changing the
configuration cell actually changes what is built here.
"""
df = make_features(
    input_df=data,
    target_col=TARGET,
    moving_average_lags=[],
    train_on_x_last_hours=LAST_N_HOURS,
)
# Turn the feature frame into supervised-learning form: Tx lagged steps in, Ty steps out.
X, y = data_to_supervised(input_df=df, Tx=Tx, Ty=Ty)
p(X.shape, y.shape)
X.head()

(15927, 504) (15927, 1)


Unnamed: 0,var1(t-72),var2(t-72),var3(t-72),var4(t-72),var5(t-72),var6(t-72),var7(t-72),var1(t-71),var2(t-71),var3(t-71),...,var5(t-2),var6(t-2),var7(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1)
28075,775.26,773.39,767.71,767.54,1429043.84,1850.95,-0.973867,767.71,770.35,768.02,...,747410.78,945.51,-0.118026,778.57,780.2,778.96,778.03,511678.97,656.76,0.050092
28076,767.71,770.35,768.02,765.6,1723087.38,2224.14,0.04038,768.02,770.55,766.86,...,511678.97,656.76,0.050092,778.96,780.21,774.04,773.32,1440954.63,1847.61,-0.631611
28077,768.02,770.55,766.86,765.97,967075.41,1258.18,-0.151038,766.86,768.69,767.87,...,1440954.63,1847.61,-0.631611,774.04,777.01,775.94,772.24,860687.72,1110.05,0.245465
28078,766.86,768.69,767.87,765.72,823248.02,1067.15,0.131706,767.87,769.84,768.77,...,860687.72,1110.05,0.245465,775.94,777.89,776.11,774.52,772375.45,987.86,0.021909
28079,767.87,769.84,768.77,767.88,330765.44,425.38,0.117207,768.77,767.97,767.38,...,772375.45,987.86,0.021909,776.11,780.79,779.7,776.48,648214.25,825.66,0.462563


In [18]:
"""
Confirm data reshape and target/feature creation was done correctly.

The target of row i should equal the last lagged feature column (t-1) of row i+1.
"""
y_values_except_last = np.squeeze(y.iloc[:-1].values)
t_minus_1_x_values_except_first = X.iloc[1:, -1].values

# Compare the two arrays element by element. Calling `.all()` on each side and
# comparing the results only compares two scalar booleans, which is True for
# almost any pair of arrays and therefore verifies nothing.
np.allclose(y_values_except_last, t_minus_1_x_values_except_first)

True

In [19]:
"""
Sample index kept for comparing different transformations.
"""
sample_ix = int(1000)

In [20]:
"""
Reshape the data into a 3d array (samples, variables, time steps) if multiple variables.
"""
# The supervised columns are time-major: var1(t-72)..var7(t-72), var1(t-71)..var7(t-71), ...
# Each row therefore has to be split into (Tx, n_variables) first and only then
# transposed to (n_variables, Tx). Reshaping directly to (n, -1, Tx) would cut the
# row into chunks of Tx consecutive values, mixing different variables and lags.
if X.ndim == 2:  # guard: re-running the cell must not reshape the 3d array again
    X = np.asarray(X).reshape((X.shape[0], Tx, -1)).swapaxes(1, 2)
p(X.shape)

(15927, 7, 72)


In [33]:
"""
Apply the wavelet transformation to the feature data.

The transform selected by `wt_type` is applied to every 1d series along the
last axis of X.
"""
wt_type = 'DWT_HAAR'

if wt_type == 'RICKER':
    # NOTE(review): `widths` is not defined anywhere in this notebook; presumably it
    # is expected from the `crypr.features.wavelets` star import -- confirm before use.
    # NOTE(review): scipy.signal.cwt / ricker are deprecated in recent SciPy releases --
    # confirm against the pinned SciPy version.
    wt_transform_fun = lambda x: signal.cwt(x, wavelet=signal.ricker, widths=widths)
elif wt_type == 'HAAR':
    wt_transform_fun = lambda x: Haar(x).getpower()
elif wt_type == 'DWT_HAAR':
    # pywt.dwt returns two coefficient arrays; stack them into one (2, n_coeffs) array.
    wt_transform_fun = lambda x: np.stack(pywt.dwt(x, 'haar'))
else:
    # Fail loudly: merely printing would let the cell continue with an undefined
    # (or stale, from an earlier run) transform function.
    raise NotImplementedError('Wavelet transform {!r} is not implemented.'.format(wt_type))

X_wt = np.apply_along_axis(func1d=wt_transform_fun, axis=-1, arr=X)

X_wt.shape

(15927, 7, 2, 36)

In [34]:
"""
Merge the variable axis and the wavelet-output axis into a single feature axis.
"""
wt_shape = X_wt.shape
condensed_shape = (wt_shape[0], wt_shape[1] * wt_shape[2], wt_shape[-1])
X_wt = np.reshape(X_wt, condensed_shape)
# N is the per-sample shape; it is embedded in the output file names further down.
N = X_wt.shape[-2:]
X_wt.shape, N

((15927, 14, 36), (14, 36))

In [35]:
"""
Swap the last two axes so the time/coefficient axis becomes the 2nd dimension.
"""
X_wt_rs = np.swapaxes(X_wt, -2, -1)
p(X_wt_rs.shape)

(15927, 36, 14)


In [36]:
"""
Train/test split without shuffling, so the rows keep their original order.
"""
split_parts = train_test_split(
    X_wt_rs,
    y,
    test_size=TEST_SIZE,
    shuffle=False,
)
X_train, X_test, y_train, y_test = split_parts

In [37]:
"""
Save the train/test feature and target arrays to ../data/processed.
"""
# Each entry pairs an output path template with the array written to it.
save_targets = (
    ('../data/processed/X_train_{}_{}_{}x{}', X_train),
    ('../data/processed/X_test_{}_{}_{}x{}', X_test),
    ('../data/processed/y_train_{}_{}_{}x{}', y_train),
    ('../data/processed/y_test_{}_{}_{}x{}', y_test),
)
for path_template, array_to_save in save_targets:
    out_path = path_template.format(SYM, wt_type, Tx, N)
    np.save(file=out_path, arr=array_to_save, allow_pickle=True)