In [152]:
import os  
import os
import glob
import h5py
import matplotlib.pyplot as plt
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Summary
1. Loading and cleaning data, 
2. Feature engineering, 
3. HFT strategy, 
4. Prediction of the last 5 minutes of each trading day

### 1. Loading and cleaning the data of 002521 and 300132 for 4 months

- Dropping columns that consist entirely of zeros or NaNs, as well as ID columns, since these are not used in subsequent calculations.
- Downcasting numeric columns to smaller dtypes to reduce memory consumption.
- Converting the DataTime column to datetime values and sorting all rows chronologically by it.

In [6]:
def combine_h5_files(folder_path, file_pattern='*.h5'):
    """Load every matching HDF5 file in a folder into a single DataFrame.

    Each top-level key of a file is read in full and becomes one column of
    that file's DataFrame; the per-file frames are then concatenated row-wise
    with a fresh integer index.

    Parameters
    ----------
    folder_path : str
        Directory that is searched for files.
    file_pattern : str, default '*.h5'
        Glob pattern, relative to ``folder_path``, selecting the files to load.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all matching files, or an empty DataFrame when no
        file matches the pattern.
    """
    pattern = os.path.join(folder_path, file_pattern)

    frames = []
    # glob.glob returns paths in arbitrary order; sorting them makes the row
    # order of the combined frame reproducible across runs and machines.
    for file_path in sorted(glob.glob(pattern)):
        with h5py.File(file_path, 'r') as f:
            # {column_name: numpy array of values}; `[:]` reads the whole
            # dataset into memory so it stays usable after the file is closed.
            # NOTE(review): assumes every key is a 1-D dataset of equal length — confirm.
            data_dict = {col: f[col][:] for col in f.keys()}

        frames.append(pd.DataFrame(data_dict))

    if not frames:
        # No file matched the pattern.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [7]:
def clean_dataframe(df):
    """Drop uninformative columns from ``df`` in place and return it.

    Removed columns:

    * the identifier columns ``Nano``, ``TradingDay`` and ``InstrumentID``
      (when present);
    * every column whose values are all zero;
    * every column whose values are all NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after the columns were dropped.
    """
    id_columns = ('Nano', 'TradingDay', 'InstrumentID')
    to_drop = [col for col in id_columns if col in df.columns]

    # `.all()` on an empty Series is True, so a frame without rows would
    # otherwise lose every column; only test values when there are rows.
    if len(df) > 0:
        to_drop += [
            col for col in df.columns
            if col not in to_drop
            and ((df[col] == 0).all() or df[col].isna().all())
        ]

    df.drop(columns=to_drop, inplace=True)

    return df

def sort_by_datatime(df):
    """Parse the ``DataTime`` column and sort ``df`` chronologically, in place.

    If ``DataTime`` is not already a datetime column it is parsed with the
    format ``%Y%m%d%H%M%S%f``. When parsing raises ``ValueError`` the column is
    left untouched, a ``RuntimeWarning`` is emitted, and the frame is sorted
    by the raw values instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DataTime`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object, sorted by ``DataTime`` with a fresh 0..n-1 index.
    """
    import warnings  # local import keeps this cell self-contained

    if not pd.api.types.is_datetime64_any_dtype(df['DataTime']):
        try:
            df['DataTime'] = pd.to_datetime(df['DataTime'], format='%Y%m%d%H%M%S%f')
        except ValueError as exc:
            # Best effort: keep going, but make the failure visible because
            # time-based features need real timestamps.
            warnings.warn(
                "DataTime could not be parsed with format '%Y%m%d%H%M%S%f' "
                f"({exc}); sorting by the raw values instead.",
                RuntimeWarning,
                stacklevel=2,
            )

    # A stable sort keeps rows that share a timestamp in their incoming order.
    df.sort_values(by='DataTime', kind='mergesort', inplace=True)

    df.reset_index(drop=True, inplace=True)

    return df

def reduce_column_size(df):
    """Return a copy of ``df`` whose 64-bit numeric columns are downcast.

    ``int64`` columns are passed through ``pd.to_numeric(..., downcast='integer')``
    and ``float64`` columns through ``pd.to_numeric(..., downcast='float')``.
    Columns of any other dtype are copied unchanged, and the input frame is
    not modified.
    """
    compact = df.copy()

    # (dtype to look for, downcast mode handed to pd.to_numeric)
    downcast_plan = (('int64', 'integer'), ('float64', 'float'))

    for source_dtype, downcast_mode in downcast_plan:
        matching = compact.select_dtypes(include=[source_dtype]).columns
        for name in matching:
            compact[name] = pd.to_numeric(compact[name], downcast=downcast_mode)

    return compact




In [8]:
# Per instrument, innermost call first:
# load -> clean -> parse/sort timestamps -> reduce RAM.
df_002521 = reduce_column_size(
    sort_by_datatime(
        clean_dataframe(
            combine_h5_files('interview', '002521*.h5')
        )
    )
)
df_300132 = reduce_column_size(
    sort_by_datatime(
        clean_dataframe(
            combine_h5_files('interview', '300132*.h5')
        )
    )
)

### 2. Feature engineering

These features were developed to reflect the imbalance, liquidity, price dynamics, and statistics of volume and prices
- imbalance = total_bid_volume / (total_bid_volume + total_ask_volume), where each total is the sum of the volumes at levels 1-10
- delta_imbalance = imbalance - (last imbalance)
- rolling_delta_imb = (sum of delta_imbalance in past 5 rows)/5
- returns = price.pct_change()
- rolling volatilities (standard deviation of returns) over different periods: 1s, 10s, 1min, 1h, 1D, 7D, 30D
- midprice = (BidPrice1 + AskPrice1)/2
- spread = (AskPrice1 - BidPrice1)
- momentum = turnover * return
- size_imbalance = total_bid_volume / total_ask_volume
- spread_intensity = spread.diff()
- market_urgency = spread * liquidity_imbalance
- order_book_imbalance = (total_ask_volume - total_bid_volume)/(total_ask_volume + total_bid_volume + 1e-6)
- weighted_bid_price based on the bid_volume1-10
- weighted_ask_price based on the ask_volume1-10
- bid_price_gap = weighted_bid_price - BidPrice1
- ask_price_gap = weighted_ask_price - AskPrice1
- mean, std, median of the bid prices across levels 1-10
- mean, std, median of the ask prices across levels 1-10
- top3 ask volume sum
- top3 bid volume sum

In [31]:
def generate_level2_imbalance(df, bid_levels=10, ask_levels=10, time_col='DataTime'):
    """Add order-book volume imbalance features to a time-sorted copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``BidVolume{i}`` / ``AskVolume{i}`` columns and ``time_col``.
        It is not modified.
    bid_levels, ask_levels : int, default 10
        Highest book level to include on each side. Levels whose column is
        missing from ``df`` are skipped.
    time_col : str, default 'DataTime'
        Column the rows are sorted by before the difference features are built.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with these added columns:

        * ``total_bid_volume`` / ``total_ask_volume`` - row-wise volume sums;
        * ``total_volume`` - sum of both sides;
        * ``imbalance`` - bid share of total volume, 0.5 where it is undefined
          (e.g. both sides are empty);
        * ``delta_imbalance`` - change of ``imbalance`` from the previous row;
        * ``rolling_delta_imb`` - mean of ``delta_imbalance`` over 5 rows.
    """
    df = df.sort_values(by=time_col).copy()

    bid_volume_cols = [f"BidVolume{i}" for i in range(1, bid_levels + 1) if f"BidVolume{i}" in df.columns]
    ask_volume_cols = [f"AskVolume{i}" for i in range(1, ask_levels + 1) if f"AskVolume{i}" in df.columns]

    df["total_bid_volume"] = df[bid_volume_cols].sum(axis=1)
    df["total_ask_volume"] = df[ask_volume_cols].sum(axis=1)
    df["total_volume"] = df["total_bid_volume"] + df["total_ask_volume"]

    # 0/0 gives NaN when the book is empty on both sides; treat that as balanced.
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form does not update `df` under pandas Copy-on-Write.
    df["imbalance"] = (df["total_bid_volume"] / df["total_volume"]).fillna(0.5)

    df["delta_imbalance"] = df["imbalance"].diff()

    df['rolling_delta_imb'] = df['delta_imbalance'].rolling(window=5).mean()
    return df
def add_rolling_volatility(
    df,
    time_col='DataTime',
    price_col='LastPrice',
    window='1D',
    new_col='volatility',
    use_log_returns=False
):
    """Return a copy of ``df`` with returns and their time-based rolling std.

    Steps, all performed on a copy so the input frame is left untouched:

    1. ``time_col`` is converted with ``pd.to_datetime(errors='coerce')`` if it
       is not already a datetime column.
    2. Rows are sorted by ``time_col``.
    3. ``returns`` is (re)computed from ``price_col`` - the simple percentage
       change, or its ``log(1 + r)`` when ``use_log_returns`` is true.
    4. Rows whose ``returns`` is NaN are dropped, so every call removes at
       least the first row of the frame.
    5. ``new_col`` receives the rolling standard deviation of ``returns`` over
       the time window ``window`` measured on ``time_col``.
    """
    result = df.copy()

    already_datetime = pd.api.types.is_datetime64_any_dtype(result[time_col])
    if not already_datetime:
        result[time_col] = pd.to_datetime(result[time_col], errors='coerce')

    result.sort_values(by=time_col, inplace=True)

    simple_returns = result[price_col].pct_change()
    result['returns'] = np.log(simple_returns + 1) if use_log_returns else simple_returns

    # The first row (and any other row without a return) is discarded.
    result.dropna(subset=['returns'], inplace=True)

    time_window = result.rolling(window=window, on=time_col)
    result[new_col] = time_window['returns'].std()

    return result

In [32]:
# Imbalance features for both instruments, built with identical settings.
df_002521_signals, df_300132_signals = (
    generate_level2_imbalance(raw_frame, bid_levels=10, ask_levels=10, time_col='DataTime')
    for raw_frame in (df_002521, df_300132)
)

In [33]:
# Volatility features: rolling std of LastPrice returns over several horizons.
# Both instruments go through the same list, so each gets every column.
# (Previously the last two calls targeted df_002521_signals a second time,
# leaving df_300132_signals without 'sec_vol' and 'tenth_sec_vol'.)
# NOTE(review): '1T', '1H' and '1S' are legacy pandas offset aliases; newer
# releases prefer 'min', 'h' and 's' - confirm against the installed version.
VOLATILITY_WINDOWS = [
    ('1D', 'daily_vol'),
    ('7D', 'weekly_vol'),
    ('30D', 'monthly_vol'),
    ('1H', 'hourly_vol'),
    ('1T', 'min_vol'),
    ('1S', 'sec_vol'),
    ('10S', 'tenth_sec_vol'),
]
for vol_window, vol_col in VOLATILITY_WINDOWS:
    df_002521_signals = add_rolling_volatility(df_002521_signals, window=vol_window, new_col=vol_col)
    df_300132_signals = add_rolling_volatility(df_300132_signals, window=vol_window, new_col=vol_col)

In [34]:
# Drop ticks whose LastPrice equals 0 and index the frames by timestamp.
# `set_index` without inplace returns a new frame, so the column assignments
# below write to an independent object rather than to a filtered slice
# (which would trigger SettingWithCopyWarning).
# NOTE(review): 'returns' and the *_vol columns were computed before this
# filter, i.e. with the zero-price rows still present - consider filtering
# before the return/volatility step. TODO confirm.
df_002521_signals = df_002521_signals[df_002521_signals['LastPrice'] != 0].set_index('DataTime')
df_300132_signals = df_300132_signals[df_300132_signals['LastPrice'] != 0].set_index('DataTime')

# Features of momentum, midprice and spread.
for signals in (df_002521_signals, df_300132_signals):
    signals['momentum'] = signals['Turnover'] * signals['returns']
    signals['midprice'] = (signals['AskPrice1'] + signals['BidPrice1']) / 2
    signals['spread'] = signals['AskPrice1'] - signals['BidPrice1']

In [105]:
# Spread + imbalance features, added in place to both instruments.
for df in [df_002521_signals, df_300132_signals]:
    # Signed share of resting volume: +1 = bids only, -1 = asks only.
    df["liquidity_imbalance"] = df.eval("(total_bid_volume - total_ask_volume)/(total_bid_volume + total_ask_volume)")
    # Bid-to-ask volume ratio. The divisor must be the ask side: dividing the
    # bid volume by itself only ever yields 1 (or NaN for an empty bid side).
    df["size_imbalance"] = df.eval("total_bid_volume / total_ask_volume")
    # Row-to-row change of the quoted spread.
    df['spread_intensity'] = df['spread'].diff()
    df['market_urgency'] = df['spread'] * df['liquidity_imbalance']
    # Ask-minus-bid share; 1e-6 guards against an empty book.
    df['order_book_imbalance'] = (df['total_ask_volume']-df['total_bid_volume'])/(df['total_ask_volume']+df['total_bid_volume']+1e-6)

# Order-book depth features, added in place to both instruments.
# Column names for the ten book levels on each side, and for the top three.
bid_volume_cols = [f'BidVolume{i}' for i in range(1, 11)]
ask_volume_cols = [f'AskVolume{i}' for i in range(1, 11)]
bid_price_cols = [f'BidPrice{i}' for i in range(1, 11)]
ask_price_cols = [f'AskPrice{i}' for i in range(1, 11)]
top3_bid_cols = [f'BidVolume{i}' for i in range(1, 4)]
top3_ask_cols = [f'AskVolume{i}' for i in range(1, 4)]
for df_features in [df_002521_signals, df_300132_signals]:
    # Volume-weighted average price per side. The volumes are passed as a
    # plain array because DataFrame.mul aligns on column labels: multiplying
    # the BidPrice* frame by the BidVolume* frame directly matches no column,
    # gives an all-NaN product, and the row sum then collapses to 0.
    # Prices are widened to float64 so the products are not accumulated in a
    # downcast dtype. 1e-6 guards against an empty side of the book.
    df_features['weighted_bid_price'] = (
        df_features[bid_price_cols].astype('float64')
        .mul(df_features[bid_volume_cols].to_numpy())
        .sum(axis=1)
        / (df_features['total_bid_volume'] + 1e-6)
    )
    df_features['weighted_ask_price'] = (
        df_features[ask_price_cols].astype('float64')
        .mul(df_features[ask_volume_cols].to_numpy())
        .sum(axis=1)
        / (df_features['total_ask_volume'] + 1e-6)
    )
    # Distance of the volume-weighted price from the best quote on each side.
    df_features['bid_price_gap'] = df_features['weighted_bid_price'] - df_features['BidPrice1']
    df_features['ask_price_gap'] = df_features['weighted_ask_price'] - df_features['AskPrice1']

    # Price step between adjacent levels (level i vs. level i+1).
    for i in range(1, 10):
        df_features[f'bid_price_diff_{i}_{i+1}'] = df_features[f'BidPrice{i}'] - df_features[f'BidPrice{i+1}']
        df_features[f'ask_price_diff_{i}_{i+1}'] = df_features[f'AskPrice{i+1}'] - df_features[f'AskPrice{i}']

    # Row-wise statistics of the quoted prices across the ten levels.
    df_features['avg_bid_price'] = df_features[bid_price_cols].mean(axis=1)
    df_features['avg_ask_price'] = df_features[ask_price_cols].mean(axis=1)
    df_features['median_bid_price'] = df_features[bid_price_cols].median(axis=1)
    df_features['median_ask_price'] = df_features[ask_price_cols].median(axis=1)
    df_features['std_bid_price'] = df_features[bid_price_cols].std(axis=1)
    df_features['std_ask_price'] = df_features[ask_price_cols].std(axis=1)

    # Bid/ask volume ratios at the best level, over all levels, and over the top three.
    df_features['best_volume_ratio'] = df_features['BidVolume1'] / (df_features['AskVolume1'] + 1e-6)
    df_features['volume_ratio'] = df_features['total_bid_volume'] / (df_features['total_ask_volume'] + 1e-6)

    df_features['top3_bid_volume'] = df_features[top3_bid_cols].sum(axis=1)
    df_features['top3_ask_volume'] = df_features[top3_ask_cols].sum(axis=1)
    df_features['top3_volume_ratio'] = df_features['top3_bid_volume'] / (df_features['top3_ask_volume'] + 1e-6)

    # Ask-minus-bid price at each level.
    for i in range(1, 11):
        df_features[f'spread_level_{i}'] = df_features[f'AskPrice{i}'] - df_features[f'BidPrice{i}']


