In [2]:
import json 
import os 
import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
main_dir = os.path.dirname(os.getcwd())

In [4]:
# CSV file contains candlestick data for interval = 1min
df = pd.read_csv(main_dir+'/data/BTC_USD_1month_features.csv', usecols = [0,1,2,3,4,5,6])
df['Close Time']  = pd.to_datetime(df['Close Time']/1000, unit='s')
df['Close Time'] = [ str(item).split('.')[0].replace('T', ' ') for item in df['Close Time'].values ]
df = df.rename( { 'Close Time':'ts'}, axis=1).drop('Open Time', axis=1)
df.columns = [ c.lower() for c in df.columns ] 
df = df[['ts','low', 'high', 'open', 'close', 'volume']]
df.head()

Unnamed: 0,ts,low,high,open,close,volume
0,2020-10-01 00:00:59,10776.59,10786.88,10776.59,10781.13,71.839045
1,2020-10-01 00:01:59,10779.87,10799.9,10781.02,10796.0,149.010542
2,2020-10-01 00:02:59,10791.5,10796.0,10796.0,10791.5,41.615967
3,2020-10-01 00:03:59,10791.5,10799.0,10791.51,10798.9,41.85655
4,2020-10-01 00:04:59,10798.8,10826.19,10798.9,10811.29,232.593454


In [136]:
def ConvertMinutesToHour(df_feats, training_window = 180, feature_window=30):
    '''
    
    '''
    # Price features
    ts = df_feats.ts.values[-1]
    open_hour = df_feats.open.values[0]
    high_hour = df_feats.close.max()
    low_hour = df_feats.close.min() 
    close_hour = df_feats.close.values[-1]
    vol_hour = df_feats.volume.sum() #is this correct?
    
    # Latest price
    open_hour_latest = df_feats.open.values[-1]
    high_hour_latest = df_feats.high.values[-1]
    low_hour_latest = df_feats.low.values[-1]
    close_hour_latest = df_feats.close.values[-1]
    
    # Overall difference
    overall_price_diff = df_feats.close.values[-1] - df_feats.close.values[0]
    overall_price_acceleration = (df_feats.close.values[-1] / df_feats.close.values[0]) - 1
        
    # Price acceleration
    acc_index=0

    average_close_price = 0
    average_close_price_acceleration = 0
    average_close_price_diff = 0
    av_window_volume = 0
    window_volume_acceleration = 0
    prev_window_volume_acceleration = 0
    average_volume_acceleration = 0
    
    for k, (idx, row) in enumerate(df_feats.iloc[:-1,:].iterrows()):
        acc_vol = row['volume']
        av_window_volume += acc_vol 
        window_volume_acceleration += acc_vol
        
        acc_close = row['close']
        average_close_price += acc_close
        
        acc_index +=1
        if acc_index%feature_window == 0:
            close_price_acceleration = (acc_vol / df_feats.close.values[k-feature_window]) - 1
            average_close_price_acceleration += close_price_acceleration
            #features.append(close_price_acceleration)
            #feature_names.append("close_price_acc_"+str(acc_index)) #not sure what is this
            close_price_diff = acc_close - df_feats.close.values[k-feature_window]
            average_close_price_diff +=  close_price_diff
            if prev_window_volume_acceleration != 0:
                volume_acceleration = (window_volume_acceleration / prev_window_volume_acceleration) - 1
                #features.append(volume_acceleration)
                #feature_names.append("volume_acc_"+str(acc_index))   #not sure what is this
            average_volume_acceleration += window_volume_acceleration
            prev_window_volume_acceleration = window_volume_acceleration
            window_volume_acceleration = 0
            
    average_close_price /= training_window
    average_close_price_acceleration /= training_window
    av_window_volume /= training_window
    average_volume_acceleration /= training_window

    hour_candle = {
                   'ts': ts, 'open':open_hour, 'high':high_hour, 'low':low_hour, 'close':close_hour, 'volume':vol_hour,
                   'open_latest':open_hour_latest , 'high_latest':high_hour_latest , 'low_latest':low_hour_latest, 'close_latest':close_hour_latest ,
                   'overall_close_diff':overall_price_diff, 'overall_close_acc':overall_price_acceleration, 
                   'av_close':average_close_price, 'av_close_acc':average_close_price_acceleration , 
                   'av_vol':av_window_volume, 'av_vol_acc':average_volume_acceleration
                  }
    
    return hour_candle
    

In [137]:
def convert_MinToHour(df, training_window = 180, labeling_window = 60):
    ''' 
    
    '''
    candlesticks = []
    for i, (idx, row) in enumerate(df[labeling_window:].iterrows()):
        if idx%labeling_window==0:
            if idx-training_window>=0:
                df_tmp = df.iloc[idx-training_window:idx, :]
                candle = ConvertMinutesToHour(df_tmp)
                candlesticks.append(candle)
    df_hour = pd.DataFrame(candlesticks)
    return df_hour

In [138]:
def LabelData(df):
    '''
    Creates labels for each timestamp on the new DataFrame.
    '''
    df_labeled = df.copy()
    df_labeled['close_shift'] = df.close.shift(-1)
    df_labeled['label'] = df_labeled.apply(lambda row: 1 if row['close_shift']>= row['close'] else 0, axis=1)
    df_labeled = df_labeled.drop(['close_shift'], axis=1)
    return df_labeled


In [139]:
df_hour = convert_MinToHour(df)
df_hour = LabelData(df_hour)
display(df_hour.head())

Unnamed: 0,ts,open,high,low,close,volume,open_latest,high_latest,low_latest,close_latest,overall_close_diff,overall_close_acc,av_close,av_close_acc,av_vol,av_vol_acc,label
0,2020-10-01 02:59:59,10776.59,10846.59,10781.13,10817.14,5001.180825,10812.66,10817.14,10812.65,10817.14,36.01,0.00334,10750.509611,-0.02771,27.69342,25.56781,0
1,2020-10-01 03:59:59,10788.3,10846.59,10787.82,10798.18,3812.020351,10794.65,10798.98,10794.64,10798.18,4.87,0.000451,10751.593444,-0.027733,21.061759,18.847278,1
2,2020-10-01 04:59:59,10838.89,10846.59,10790.91,10800.01,3290.332749,10801.6,10801.85,10800.0,10800.01,-46.58,-0.004294,10751.067611,-0.027746,18.243071,15.669111,1
3,2020-10-01 05:59:59,10817.14,10826.41,10790.91,10821.07,3286.59199,10821.3,10821.3,10821.06,10821.07,-1.17,-0.000108,10746.8235,-0.027745,18.130671,14.463628,1
4,2020-10-01 06:59:59,10798.38,10843.21,10792.86,10821.29,4093.025101,10821.35,10821.36,10821.29,10821.29,24.52,0.002271,10754.9685,-0.027738,22.670377,18.541785,1
