In [1]:
# Third-party stack used by the notebook.
import pandas as pd
import numpy as np
import sklearn
import torch
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Display options for printed frames. NOTE(review): 'display.max_column' is an
# abbreviated option name that pandas resolves by pattern match, presumably to
# 'display.max_columns' -- confirm and spell it out in full.
pd.set_option('display.max_column', 100)
# None removes the row limit, so printing a frame shows every row.
pd.set_option('display.max_rows', None)
from datetime import datetime
import warnings
# Installs a global "ignore" filter: warnings raised by any later cell
# (including pandas and numpy warnings) are not shown.
warnings.filterwarnings('ignore')

In [2]:
def order_book(csv_path='688301.SH.csv', skip_rows=6, drop_range=(3232, 3251),
               shift_start=1608, shift_seconds=5400):
    """Load a five-level order book CSV and return it as a frame plus arrays.

    Processing steps, in order:
      1. read ``csv_path`` and drop its first ``skip_rows`` rows, then reset
         the index;
      2. drop the rows at positions ``drop_range[0]:drop_range[1]`` (the index
         is NOT reset afterwards, so the returned frame keeps that gap);
      3. parse ``DateTime`` and derive ``seconds`` as the elapsed seconds since
         the earliest remaining timestamp;
      4. subtract ``shift_seconds`` from ``seconds`` for every row at position
         ``shift_start`` or later (presumably removing the midday trading
         break from the clock -- TODO confirm against the data).

    Parameters
    ----------
    csv_path : str
        CSV file with a ``DateTime`` column and the level columns
        ``bid1..bid5``, ``bsize1..bsize5``, ``ask1..ask5``, ``asize1..asize5``.
    skip_rows : int
        Number of leading rows to discard.
    drop_range : tuple of (int, int)
        Half-open positional range of rows to discard after the leading cut.
    shift_start : int
        First row position whose ``seconds`` value is shifted.
    shift_seconds : int or float
        Amount subtracted from ``seconds`` from ``shift_start`` onwards.

    Returns
    -------
    tuple
        ``(df, datetime, seconds, bid1, bsize1, ..., bid5, bsize5,
        ask1, asize1, ..., ask5, asize5)`` -- 23 items.  ``datetime`` is a
        ``datetime64[s]`` array, ``seconds`` an ``int64`` array and the level
        arrays are ``float64`` with NaN replaced by 0.  The arrays are copies;
        the NaN replacement is not applied to ``df``.
    """
    level_columns = ['bid1', 'bsize1', 'bid2', 'bsize2', 'bid3', 'bsize3',
                     'bid4', 'bsize4', 'bid5', 'bsize5',
                     'ask1', 'asize1', 'ask2', 'asize2', 'ask3', 'asize3',
                     'ask4', 'asize4', 'ask5', 'asize5']
    column_order = ['DateTime', 'seconds'] + level_columns

    df = pd.read_csv(csv_path)
    df = df.drop(df.index[0:skip_rows]).reset_index(drop=True)
    df = df.drop(df.index[drop_range[0]:drop_range[1]])
    df['DateTime'] = pd.to_datetime(df['DateTime'])

    # Build the shifted clock on a private array and assign it once.  The
    # previous chained form (df['seconds'][n:] = ...) wrote through a temporary
    # Series, which does not reach the frame under pandas Copy-on-Write.
    elapsed = (df['DateTime'] - df['DateTime'].min()).dt.total_seconds().to_numpy().copy()
    elapsed[shift_start:] -= shift_seconds
    df['seconds'] = elapsed
    df = df[column_order]

    timestamps = df['DateTime'].to_numpy().astype('datetime64[s]')
    seconds = df['seconds'].to_numpy().astype('int64')

    level_arrays = []
    for column in level_columns:
        # astype() returns a new array, so zero-filling leaves df untouched.
        values = df[column].to_numpy().astype('float64')
        values[np.isnan(values)] = 0
        level_arrays.append(values)

    return (df, timestamps, seconds, *level_arrays)

In [3]:
def weighted_features(w1, w2, w3, w4, w5, bsize1, bsize2, bsize3, bsize4, bsize5, asize1, asize2, asize3,
                      asize4, asize5):
    """Return the weighted depth ratio and weighted order-book imbalance.

    Each side's five level sizes are combined with the weights ``w1..w5``:

        side = w1*size1 + w2*size2 + w3*size3 + w4*size4 + w5*size5

    Returns
    -------
    (weighted_depth, weighted_obi)
        ``weighted_depth`` is ``bid / ask`` and ``weighted_obi`` is
        ``(bid - ask) / (bid + ask)``, both passed through ``np.nan_to_num``.
        With numpy's defaults that turns NaN (e.g. 0/0) into 0 and +/-inf
        (e.g. x/0) into the largest finite float of matching sign.
    """
    def _combine(level1, level2, level3, level4, level5):
        # Same left-to-right sum for both sides of the book.
        return w1 * level1 + w2 * level2 + w3 * level3 + w4 * level4 + w5 * level5

    bid_side = _combine(bsize1, bsize2, bsize3, bsize4, bsize5)
    ask_side = _combine(asize1, asize2, asize3, asize4, asize5)

    depth_ratio = np.nan_to_num(bid_side / ask_side)
    imbalance = np.nan_to_num((bid_side - ask_side) / (bid_side + ask_side))
    return depth_ratio, imbalance

In [4]:
def rise_ratio(ask1, seconds, before_time):
    """Percentage change of ``ask1`` relative to its value ``before_time`` seconds earlier.

    For every sample ``i`` the reference sample is:

    * sample 0, while ``seconds`` has not yet reached ``before_time``;
    * otherwise the first earlier sample ``j < i`` with
      ``seconds[j] >= seconds[i] - before_time``;
    * sample ``i`` itself (change 0.0) when no earlier sample lies inside the
      window, e.g. after a gap longer than ``before_time``.

    The change is ``(ask1[i] - ask1[ref]) / ask1[ref] * 100`` rounded to five
    decimals.

    Parameters
    ----------
    ask1 : numpy.ndarray
        Best ask prices.  NOTE: zero entries are overwritten IN PLACE with the
        mean of the array (zeros included in that mean), so the caller's array
        is modified.
    seconds : numpy.ndarray
        Elapsed time of each sample, same length as ``ask1``.
    before_time : float
        Look-back window in seconds.

    Returns
    -------
    list
        One rounded percentage per sample; empty for empty input.
    """
    n_samples = len(ask1)
    if n_samples == 0:
        return []

    ask1[ask1 == 0] = np.mean(ask1)

    # First sample whose clock has reached the window length.  If the series
    # is shorter than the window, every sample is compared with sample 0.
    reached = np.where(seconds >= before_time)[0]
    warmup_end = reached[0] if len(reached) else n_samples

    ratios = []
    for i in range(0, warmup_end):
        ratios.append(round((ask1[i] - ask1[0]) * (1.0) / ask1[0] * 100, 5))

    for i in range(warmup_end, n_samples):
        inside_window = np.where(seconds[:i] >= seconds[i] - before_time)[0]
        # No earlier sample inside the window: fall back to the sample itself
        # instead of failing on an empty lookup.
        ref = inside_window[0] if len(inside_window) else i
        ratios.append(round((ask1[i] - ask1[ref]) * (1.0) / ask1[ref] * 100, 5))

    return ratios

In [5]:
# Load the order book into notebook-level names.
# NOTE: this rebinds the module-level name `datetime`, replacing the class
# brought in by `from datetime import datetime` in the import cell.
(
    df, datetime, seconds,
    bid1, bsize1, bid2, bsize2, bid3, bsize3, bid4, bsize4, bid5, bsize5,
    ask1, asize1, ask2, asize2, ask3, asize3, ask4, asize4, ask5, asize5,
) = order_book()

In [6]:
def feature_engineering(seconds, bid1, bsize1, bid2, bsize2, bid3, bsize3, bid4,\
            bsize4, bid5, bsize5, ask1, asize1, ask2, asize2, ask3, asize3, ask4,\
            asize4, ask5, asize5):
    """Build the weighted-size and ask-rise features for one order book.

    Two feature families are produced:

    * 24 weighting schemes over the five book levels.  Each scheme yields a
      ``(weighted_depth, weighted_obi)`` pair from ``weighted_features``.  The
      five-digit tag next to each scheme gives the level weights in tens
      (``62110`` -> 60/20/10/10/0); the single-level schemes use 100.
    * 36 ``rise_ratio`` series of ``ask1`` for look-back windows from 180 s to
      1230 s in 30 s steps.

    Only ``seconds``, ``ask1`` and the ten size arrays are read; the remaining
    price arguments are accepted but unused.  ``rise_ratio`` overwrites zero
    entries of ``ask1`` in place, so the caller's ``ask1`` is modified.

    Returns
    -------
    tuple
        84 items: the 24 ``(depth, obi)`` pairs flattened in scheme order
        (48 arrays), followed by the 36 rise-ratio lists in window order.
    """
    # Level weights (w1..w5) per scheme, in output order.  Six rows previously
    # carried weights copied from a neighbouring scheme and therefore produced
    # duplicate features; they now match their tags.
    level_weights = (
        (100.0, 0.0, 0.0, 0.0, 0.0),     # 10000
        (0.0, 100.0, 0.0, 0.0, 0.0),     # 01000
        (0.0, 0.0, 100.0, 0.0, 0.0),     # 00100
        (0.0, 0.0, 0.0, 100.0, 0.0),     # 00010
        (0.0, 0.0, 0.0, 0.0, 100.0),     # 00001
        (90.0, 10.0, 0.0, 0.0, 0.0),     # 91000
        (80.0, 10.0, 10.0, 0.0, 0.0),    # 81100
        (70.0, 30.0, 0.0, 0.0, 0.0),     # 73000
        (70.0, 20.0, 10.0, 0.0, 0.0),    # 72100
        (70.0, 10.0, 10.0, 10.0, 0.0),   # 71110 (was 70/10/10/0/0)
        (60.0, 40.0, 0.0, 0.0, 0.0),     # 64000
        (60.0, 30.0, 10.0, 0.0, 0.0),    # 63100
        (60.0, 20.0, 20.0, 0.0, 0.0),    # 62200
        (60.0, 20.0, 10.0, 10.0, 0.0),   # 62110
        (60.0, 10.0, 10.0, 10.0, 10.0),  # 61111
        (50.0, 50.0, 0.0, 0.0, 0.0),     # 55000
        (50.0, 40.0, 10.0, 0.0, 0.0),    # 54100
        (50.0, 30.0, 20.0, 0.0, 0.0),    # 53200
        (50.0, 20.0, 20.0, 10.0, 0.0),   # 52210
        (50.0, 20.0, 10.0, 10.0, 10.0),  # 52111 (was a copy of 52210)
        (40.0, 50.0, 10.0, 0.0, 0.0),    # 45100 (was a copy of 61111)
        (30.0, 70.0, 0.0, 0.0, 0.0),     # 37000 (was a copy of 52210)
        (20.0, 30.0, 50.0, 0.0, 0.0),    # 23500 (was a copy of 52210)
        (10.0, 20.0, 70.0, 0.0, 0.0),    # 12700 (was a copy of 52210)
    )
    sizes = (bsize1, bsize2, bsize3, bsize4, bsize5,
             asize1, asize2, asize3, asize4, asize5)

    features = []
    for weights in level_weights:
        depth, obi = weighted_features(*weights, *sizes)
        features.append(depth)
        features.append(obi)

    # Look-back windows: 3 min, 3.5 min, ..., 20.5 min (36 windows).
    for step in range(36):
        before_time = 60.0 * 3 + 30 * step
        features.append(rise_ratio(ask1, seconds, before_time))

    return tuple(features)

In [7]:
def traded_label(seconds, bid1, ask1, horizon=100):
    """Label each sample by whether the ask later drops below the current bid.

    Sample ``i`` is labelled 1 when ``bid1[i]`` is strictly greater than the
    minimum of ``ask1`` over the ``horizon`` samples starting at ``i``
    (sample ``i`` itself included), otherwise 0.  Near the end of the series
    the window simply contains the samples that remain.

    Parameters
    ----------
    seconds : sequence
        Only its length is used; it sets the number of labels produced.
    bid1, ask1 : numpy.ndarray
        Best bid and best ask per sample.
    horizon : int, default 100
        Number of samples in the look-ahead window; must be at least 1.

    Returns
    -------
    list of int
        One 0/1 label per sample.

    Raises
    ------
    ValueError
        If ``horizon`` is smaller than 1 (the window would be empty).
    """
    if horizon < 1:
        raise ValueError("horizon must be at least 1")

    return [
        1 if bid1[i] > np.min(ask1[i:i + horizon]) else 0
        for i in range(len(seconds))
    ]

In [8]:
def setup():
    """Load the order book, derive features and labels, and return one frame.

    The returned DataFrame has one row per sample and integer column labels:
    column 0 is the ``traded_label`` output, followed by the values returned
    by ``feature_engineering`` in their original order.

    ``feature_engineering`` runs before ``traded_label``.  That order matters:
    ``rise_ratio`` overwrites zero entries of ``ask1`` in place, so the labels
    are computed on the filled ``ask1`` array.
    """
    book = order_book()

    # book = (df, datetime, seconds, bid1, bsize1, ..., ask1, asize1, ..., asize5);
    # everything from position 2 onwards is exactly the argument list of
    # feature_engineering.
    level_arrays = book[2:]
    seconds = book[2]
    bid1 = book[3]
    ask1 = book[13]

    features = feature_engineering(*level_arrays)
    traded = traded_label(seconds, bid1, ask1)

    # Stack label and features as rows, then flip so each becomes a column.
    stacked = np.array([traded, *features])
    return pd.DataFrame(stacked).T
    