In [None]:
import numpy as np
import pandas as pd

from numba import jit

In [None]:
def get_multiple_ts_dataset(id, data, target, tr_win=5, tt_win=0, point_target=True):
    """Build a sliding-window supervised dataset from row-aligned 2-D arrays.

    Sample ``i`` uses rows ``i .. i + tr_win - 1`` of ``data`` as its
    (flattened) input window.

    Parameters
    ----------
    id : 2-D array
        Per-row identifiers; sliced with ``id[a:b, :]``.
    data : 2-D array
        Input features, one row per time step.
    target : 2-D array
        Target values, one row per time step.
    tr_win : int
        Number of consecutive rows in each input window.
    tt_win : int
        Size of the target horizon, in rows.
    point_target : bool
        If True, Y holds the single target row at index
        ``i + tr_win + tt_win - 1``.  If False, Y holds the flattened
        ``tt_win`` target rows that follow the input window
        (``i + tr_win .. i + tr_win + tt_win - 1``).

    Returns
    -------
    ID : array, rows ``tr_win - 1 .. tr_win + num_samples - 2`` of ``id``
        (the identifier of the last row of each input window).
    X : float array of shape ``(num_samples, tr_win * data.shape[1])``.
    Y : float array of shape ``(num_samples, target.shape[1])`` for a point
        target, ``(num_samples, tt_win * target.shape[1])`` otherwise.

    Raises
    ------
    ValueError
        If ``data`` has fewer than ``tr_win + tt_win - 1`` rows.
    """
    num_samples = data.shape[0] - tr_win - tt_win + 1
    if num_samples < 0:
        raise ValueError(
            'Not enough rows (%d) for tr_win=%d and tt_win=%d'
            % (data.shape[0], tr_win, tt_win)
        )

    X = np.zeros((num_samples, tr_win*(data.shape[1])))
    for i in range(num_samples):
        X[i] = data[i:i + tr_win].flatten()

    if point_target:
        Y = np.zeros((num_samples, target.shape[1]))
        for i in range(num_samples):
            Y[i] = target[i + tr_win + tt_win - 1, :]
    else:
        Y = np.zeros((num_samples, tt_win*(target.shape[1])))
        for i in range(num_samples):
            # The target window starts right after the input window, so its
            # last row coincides with the row used by the point target.
            Y[i] = target[i + tr_win:i + tr_win + tt_win].flatten()

    ID = id[tr_win-1:tr_win+num_samples-1, :]

    return ID, X, Y

In [None]:
def get_ts_cv_folds(data_size, n_folds=5, frac_min_tr=0.4, max_tr='frac_min_tr'):
    """Compute index bounds for time-ordered cross-validation folds.

    Each fold is a 4-tuple ``(tr_start, tr_end, cv_start, cv_end)`` where
    ``cv_start = tr_end + 1`` and ``cv_end = cv_start + cv_size``.  Fold ``k``
    is shifted by ``k * cv_size`` relative to fold 0.

    Parameters
    ----------
    data_size : int
        Total number of samples.
    n_folds : int
        Number of folds to generate.
    frac_min_tr : float
        Fraction of ``data_size`` used as the (minimum) training size.
    max_tr : str
        ``'frac_min_tr'`` keeps the training window at a fixed size and slides
        it forward; ``'full'`` always starts the training window at index 0.
        Any other value yields an empty list.

    Returns
    -------
    list of tuple of int
        The bounds are not clipped to ``data_size``; the last ``cv_end`` is
        ``n_folds * cv_size + tr_size + 1``.
    """
    tr_size = int(data_size*frac_min_tr)
    cv_size = int(((1 - frac_min_tr)/n_folds)*data_size)

    if max_tr not in ('frac_min_tr', 'full'):
        return []

    sliding = max_tr == 'frac_min_tr'
    folds = []
    for fold in range(n_folds):
        shift = fold*cv_size
        tr_end = shift + tr_size
        cv_start = tr_end + 1
        folds.append((shift if sliding else 0, tr_end, cv_start, cv_start + cv_size))
    return folds

In [None]:
def data_validation(data):
    """Check that ``data`` contains exactly the four known time gaps.

    A gap is any pair of consecutive ``OPEN_TIME`` values whose difference is
    not 60000.  The function verifies the number of gaps and that every gap
    starts and ends at the hard-coded timestamps below.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose an ``OPEN_TIME`` column convertible to integers.

    Raises
    ------
    ValueError
        If the number of gaps is not 4, or if any gap start / end timestamp
        differs from the expected one.
    """
    expected_gap_starts = np.array(
        [1530104340000, 1530663720000, 1539928740000, 1542160740000], dtype=np.int64)
    expected_gap_ends = np.array(
        [1530110700000, 1530691200000, 1539941400000, 1542186000000], dtype=np.int64)

    # int64 explicitly: plain ``int`` is 32-bit on some platforms and would
    # overflow on timestamps of this magnitude.
    open_times = data.OPEN_TIME.astype(np.int64).values
    open_times_diff = np.diff(open_times)

    missing_points = np.where(open_times_diff != 60000)[0]
    if len(missing_points) != 4:
        raise ValueError('Number of missing gaps not matching to 4!!!')

    # Every timestamp has to match; a single mismatch is a failure.
    if not np.array_equal(open_times[missing_points], expected_gap_starts):
        raise ValueError('Missing gap timestamps not matching!!!')

    if not np.array_equal(open_times[missing_points + 1], expected_gap_ends):
        raise ValueError('Missing gap timestamps not matching!!!')

In [None]:
def fill_time_gaps(data, period=60000, interpolate=False):
    """Insert placeholder rows for missing time steps.

    Wherever two consecutive ``OPEN_TIME`` values differ by something other
    than ``period``, rows are added for every multiple of ``period`` strictly
    between them.  Added rows carry ``OPEN_TIME`` and
    ``CLOSE_TIME = OPEN_TIME + period - 1``; all other columns are NaN unless
    ``interpolate`` is set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose an ``OPEN_TIME`` column convertible to integers.
    period : int
        Expected spacing between consecutive ``OPEN_TIME`` values.
    interpolate : bool
        If truthy, ``DataFrame.interpolate()`` is applied to the result.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``OPEN_TIME`` with a fresh 0..n-1 index; ``data``
        itself is not modified.
    """
    open_times = data.OPEN_TIME.astype(np.int64).values
    missing_points = np.where(np.diff(open_times) != period)[0]

    # One array of missing open times per gap (empty when the step is smaller
    # than ``period``).
    gap_open_times = [
        np.arange(open_times[i] + period, open_times[i + 1], period, dtype=np.int64)
        for i in missing_points
    ]
    if gap_open_times:
        ot_missing = np.concatenate(gap_open_times)
    else:
        ot_missing = np.array([], dtype=np.int64)

    if len(ot_missing):
        # Only the time columns are set; concat fills the rest with NaN.
        filler = pd.DataFrame({
            'OPEN_TIME': ot_missing,
            'CLOSE_TIME': ot_missing + period - 1,
        })
        df = pd.concat([data, filler], ignore_index=True, sort=False)
    else:
        df = data

    df = df.sort_values('OPEN_TIME', axis=0, ascending=True).reset_index(drop=True)

    if interpolate:
        df = df.interpolate()

    return df

In [None]:
def find_error_data(data, period=60000):
    """Print the neighbourhood of every row with an inconsistent close time.

    A row is inconsistent when ``CLOSE_TIME != OPEN_TIME + period - 1``.  For
    each such row, up to two rows before it, the row itself and one row after
    it are printed.  Nothing is printed when all rows are consistent.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose ``OPEN_TIME`` and ``CLOSE_TIME`` columns.
    period : int
        Expected bar length; a consistent bar closes ``period - 1`` after it
        opens.
    """
    mismatch = (data.OPEN_TIME + (period - 1) != data.CLOSE_TIME).values
    error_points = np.where(mismatch)[0]
    for error_point in error_points:
        # Clamp at 0: a negative start would be read as "from the end" and
        # produce an empty slice for errors in the first two rows.
        print(data.iloc[max(error_point - 2, 0):error_point + 2])

In [None]:
def clean_data(data):
    """Repair the known bad ``CLOSE_TIME`` of the bar opening at 1530663720000.

    The row(s) whose ``OPEN_TIME`` equals 1530663720000 get
    ``CLOSE_TIME = 1530663780000 - 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose ``OPEN_TIME`` and ``CLOSE_TIME`` columns.  It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient chaining.
    """
    data.loc[data.OPEN_TIME == 1530663720000, 'CLOSE_TIME'] = 1530663780000 - 1
    return data

In [None]:
def TR_CV_TT_split(data, tr_frac, cv_frac, tt_frac):
    """Split ``data`` into consecutive train / validation / test slices.

    The first ``int(len(data) * tr_frac)`` items form the training slice, the
    items up to ``int(len(data) * (tr_frac + cv_frac))`` form the validation
    slice, and everything after that is the test slice.

    ``tt_frac`` is accepted but not used: the test slice always takes the
    remainder.

    Returns
    -------
    tuple
        ``(TR_data, CV_data, TT_data)``, each obtained by slicing ``data``.
    """
    n_rows = len(data)
    tr_stop = int(n_rows*tr_frac)
    cv_stop = int(n_rows*(tr_frac+cv_frac))
    return data[:tr_stop], data[tr_stop:cv_stop], data[cv_stop:]

In [None]:
def interval_transform(data, period=5):
    """Aggregate consecutive bars into coarser bars of ``period`` rows each.

    Grouping is anchored at the end of ``data``: the last ``period`` rows form
    the last output bar, the ``period`` rows before them the previous one, and
    so on.  Leading rows that do not fill a complete group are dropped, so the
    result has ``len(data) // period`` rows (none if ``data`` is shorter than
    ``period``).

    For each group: ``OPEN_TIME`` and ``OPEN`` come from its first row,
    ``CLOSE_TIME`` and ``CLOSE`` from its last row, ``HIGH`` is the maximum of
    ``HIGH`` and ``LOW`` the minimum of ``LOW``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose ``OPEN_TIME``, ``CLOSE_TIME``, ``OPEN``, ``HIGH``, ``LOW``
        and ``CLOSE`` columns, in chronological row order.
    period : int
        Number of input rows per output bar; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``OPEN_TIME, CLOSE_TIME, OPEN, HIGH, LOW, CLOSE`` in
        chronological order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError('period must be a positive integer')

    n_bars = len(data) // period
    # Rows before ``offset`` cannot fill a whole bar and are ignored.
    offset = len(data) - n_bars*period

    open_pos = offset + period*np.arange(n_bars)
    close_pos = open_pos + period - 1

    window = data.iloc[offset:]
    bar_id = np.arange(len(window)) // period

    return pd.DataFrame({
        'OPEN_TIME': data['OPEN_TIME'].values[open_pos],
        'CLOSE_TIME': data['CLOSE_TIME'].values[close_pos],
        'OPEN': data['OPEN'].values[open_pos],
        'HIGH': window['HIGH'].groupby(bar_id).max().values,
        'LOW': window['LOW'].groupby(bar_id).min().values,
        'CLOSE': data['CLOSE'].values[close_pos],
    }, columns=['OPEN_TIME', 'CLOSE_TIME', 'OPEN', 'HIGH', 'LOW', 'CLOSE'])

In [2]:
from numpy.linalg import norm

def distribution_diff(data_p, data_q, n_bins=1000, plot=False):
    """Euclidean distance between the binned distributions of two samples.

    Both samples are histogrammed on the same ``n_bins`` equal-width bins
    spanning their combined range; the counts are converted to relative
    frequencies and the 2-norm of their difference is returned.

    Parameters
    ----------
    data_p, data_q : array-like
        The two samples; flattened before use.
    n_bins : int
        Number of histogram bins.
    plot : bool
        If truthy, draw one histogram per sample on the shared bins.  This
        requires ``matplotlib.pyplot`` to be available as ``plt`` in the
        notebook namespace.

    Returns
    -------
    float
        ``norm(p - q)``; 0 when both samples have identical binned
        frequencies.

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    values_p = np.asarray(data_p).ravel()
    values_q = np.asarray(data_q).ravel()
    if values_p.size == 0 or values_q.size == 0:
        raise ValueError('distribution_diff requires two non-empty samples')

    min_pq = min(values_p.min(), values_q.min())
    max_pq = max(values_p.max(), values_q.max())

    # Let numpy build the shared edges from the combined range, then reuse
    # exactly the same edges for the second sample.
    freq_p, bins = np.histogram(values_p, bins=n_bins, range=(min_pq, max_pq))
    freq_q, _ = np.histogram(values_q, bins=bins)

    if plot:
        for sample in (values_p, values_q):
            plt.figure(figsize=(20, 5))
            plt.hist(sample, bins=bins)
            plt.xlim((min_pq, max_pq))
            plt.show()

    # float() keeps this a true division regardless of Python version.
    p = freq_p/float(values_p.size)
    q = freq_q/float(values_q.size)

    return norm(p-q)