In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import os
import zstandard as zstd
import io
import random
import pickle

warnings.filterwarnings("ignore")

In [2]:
conda install zstandard

Channels:
 - defaults
 - conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [2]:
def process_csv_files(directory, ticker):
    """
    Convert input csv to dataframes and prepare to process through trading signal.

    Params:
    directory (String): Name of directory that contains list of csv files.

    Returns:
    return (list[DataFrame]): List of pandas DataFrames, one for each trading day.
    """
    
    # Directory containing the CSV files
    
    ret = []
    
    # Loop through all files in the directory and convert .zst csv to df
    for filename in os.listdir(directory):
        
        if filename.endswith('.zst'):
            filepath = os.path.join(directory, filename)

            try:
                # Open the compressed file in binary mode
                with open(filepath, 'rb') as compressed_file:
                    # Initialize the decompressor
                    dctx = zstd.ZstdDecompressor()
                    
                    # Decompress the file into an in-memory buffer
                    with dctx.stream_reader(compressed_file) as decompressed_stream:
                        text_stream = io.TextIOWrapper(decompressed_stream, encoding='utf-8')
                        print(f"Processing {filename}\n")
                        # Read the decompressed data into a pandas DataFrame
                        df = pd.read_csv(text_stream, parse_dates=['ts_recv', 'ts_event'])
                        print(f"Processed {filename}\n")
                    
            except Exception as e:
                print(f"Error processing {filename}\n")
                print(e)
                continue
                
            ret.append(df)
            
    return ret

In [6]:
dfs = process_csv_files('equity-data', 'ANF')

Processing xnas-itch-20240823.mbp-10.csv.zst

Processed xnas-itch-20240823.mbp-10.csv.zst

Processing xnas-itch-20240830.mbp-10.csv.zst

Processed xnas-itch-20240830.mbp-10.csv.zst

Processing xnas-itch-20240822.mbp-10.csv.zst

Processed xnas-itch-20240822.mbp-10.csv.zst

Processing xnas-itch-20240828.mbp-10.csv.zst

Processed xnas-itch-20240828.mbp-10.csv.zst

Processing xnas-itch-20240820.mbp-10.csv.zst

Processed xnas-itch-20240820.mbp-10.csv.zst

Processing xnas-itch-20240827.mbp-10.csv.zst

Processed xnas-itch-20240827.mbp-10.csv.zst

Processing xnas-itch-20240826.mbp-10.csv.zst

Processed xnas-itch-20240826.mbp-10.csv.zst

Processing xnas-itch-20240821.mbp-10.csv.zst

Processed xnas-itch-20240821.mbp-10.csv.zst

Processing xnas-itch-20240829.mbp-10.csv.zst

Processed xnas-itch-20240829.mbp-10.csv.zst

Processing xnas-itch-20240815.mbp-10.csv.zst

Processed xnas-itch-20240815.mbp-10.csv.zst

Processing xnas-itch-20240808.mbp-10.csv.zst

Processed xnas-itch-20240808.mbp-10.csv.zst


In [10]:
# Save down as pkl file - for use in case user needs to reset kernel, can fetch back dataframes faster
day_df = {}

i = 1
for df in dfs:
    df.to_pickle("day_" + str(i) + ".pkl")
    print(str(i) + ' saved file as pickle')
    i +=1
    day_df[i] = df


1 saved file as pickle
2 saved file as pickle
3 saved file as pickle
4 saved file as pickle
5 saved file as pickle
6 saved file as pickle
7 saved file as pickle
8 saved file as pickle
9 saved file as pickle
10 saved file as pickle
11 saved file as pickle
12 saved file as pickle
13 saved file as pickle
14 saved file as pickle
15 saved file as pickle
16 saved file as pickle
17 saved file as pickle
18 saved file as pickle
19 saved file as pickle
20 saved file as pickle
21 saved file as pickle
22 saved file as pickle


In [3]:
directory = 'equity-pkl'

dfs = []

for filename in os.listdir(directory):
    if filename.endswith('.pkl'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as file:
            data = pickle.load(file)
            dfs.append(data)
            print(f"Loaded {filename}")


Loaded day_19.pkl
Loaded day_18.pkl
Loaded day_20.pkl
Loaded day_21.pkl
Loaded day_22.pkl
Loaded day_9.pkl
Loaded day_8.pkl
Loaded day_6.pkl
Loaded day_7.pkl
Loaded day_5.pkl
Loaded day_4.pkl
Loaded day_1.pkl
Loaded day_3.pkl
Loaded day_2.pkl
Loaded day_13.pkl
Loaded day_12.pkl
Loaded day_10.pkl
Loaded day_11.pkl
Loaded day_15.pkl
Loaded day_14.pkl
Loaded day_16.pkl
Loaded day_17.pkl


In [6]:
# Process data
bbo_dfs_nvda = []
bbo_dfs_anf = []

tickers = ['NVDA', 'ANF']

i = 0
for df in dfs:
    for tkr in tickers:
        print(i)
        if 'ts_event' in list(df.columns):
            df['ts_event'] = pd.to_datetime(df['ts_event'])
            # Set 'ts_event' as index
            df = df.set_index('ts_event')
                    
        bbo_df = df.between_time('13:40', '19:55', inclusive='left')
        
        tkr_df_nvda = bbo_df[bbo_df['symbol'] == 'NVDA'].resample('100ms').last().ffill()
        tkr_df_anf = bbo_df[bbo_df['symbol'] == 'ANF'].resample('100ms').last().ffill()

        bbo_dfs_nvda.append(tkr_df_nvda)
        bbo_dfs_anf.append(tkr_df_anf)

    i+=1

0
0
1
1
2
2
3
3
4
4
5
5
6
6
7
7
8
8
9
9
10
10
11
11
12
12
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21


In [14]:
def moving_average_signal(df, short_window, long_window, b):
    """
    Generate trading signals based on moving average cross-over strategy.

    Params:
    prices (Series): asset prices
    short_window (int): Window size for the short-term moving average
    long_window (int): Window size for the long-term moving average
    b (int): Bandwidth parameter that determines the buy/sell thresholds

    Return:
    return (Series): Series with trading signals (+1 for buy, -1 for sell, 0 for hold)
    """

    # Create midprice column (average of bid and ask prices)
    mid_price = (df['bid_px_00'] + df['ask_px_00']) / 2
    
    # Calculate short-term and long-term moving averages
    short_ma = mid_price.rolling(window=short_window).mean()
    long_ma = mid_price.rolling(window=long_window).mean()
    
    # Define thresholds
    upper_threshold = (1 + b) * long_ma
    lower_threshold = (1 - b) * long_ma
    
    signal = pd.Series(0, index=df.index)

    # Generate buy signals (+1 where short_ma > upper_threshold)
    signal[short_ma > upper_threshold] = 1
    
    # Generate sell signals (-1 where short_ma < lower_threshold)
    signal[short_ma < lower_threshold] = -1

    return signal

In [8]:
def update_order_book(bids, asks, bid_price, bid_size, ask_price, ask_size):
    """
    Update the 5-level deep order book with new bid and ask data.

    Params:
     bids (DataFrame): DataFrame maintaining top 5 bid prices and sizes.
     asks (DataFrame): DataFrame maintaining top 5 ask prices and sizes.
     bid_price (float): New bid price.
     bid_size (int): New bid size.
     ask_price (float): New ask price.
     ask_size (int): New ask size.

    Returns:
    - Updated bids and asks DataFrames.
    """

    # Update bids
    if bid_price > 0 and bid_size > 0:
        if (bids['price'] == bid_price).any():
            # If bid price already exists, update the size
            bids.loc[bids['price'] == bid_price, 'size'] = bid_size
        else:
            # If it's a new bid price, add it and sort
            new_bid = pd.DataFrame({'price': [bid_price], 'size': [bid_size]})
            bids = pd.concat([bids, new_bid]).nlargest(5, 'price').reset_index(drop=True)

    # Update asks
    if ask_price > 0 and ask_size > 0:
        if (asks['price'] == ask_price).any():
            # If ask price already exists, update the size
            asks.loc[asks['price'] == ask_price, 'size'] = ask_size
        else:
            # If it's a new ask price, add it and sort
            new_ask = pd.DataFrame({'price': [ask_price], 'size': [ask_size]})
            asks = pd.concat([asks, new_ask]).nsmallest(5, 'price').reset_index(drop=True)

    return bids, asks

def process_mpb10_data(mpb10_df):
    """
    Process MPB-10 data to maintain a 5-level deep order book.

    Params:
    - mpb10_df (DataFrame): DataFrame containing the MPB-10 data.

    Returns:
    - bids (DataFrame): Final top 5 bid prices and sizes.
    - asks (DataFrame): Final top 5 ask prices and sizes.
    """

    # Initialize empty DataFrames for the top 5 levels of bids and asks
    bids = pd.DataFrame(columns=['price', 'size'], index=range(5))
    asks = pd.DataFrame(columns=['price', 'size'], index=range(5))

    for index, row in mpb10_df.iterrows():
        # Extract bid and ask prices and sizes from each row
        bid_price = row['bid_px_00']
        bid_size = row['bid_sz_00']
        ask_price = row['ask_px_00']
        ask_size = row['ask_sz_00']

        # Update the order book
        bids, asks = update_order_book(bids, asks, bid_price, bid_size, ask_price, ask_size)

    return bids, asks

In [12]:
def execute_trading_signal(bbo_df, signal, order_size=1):
    """
    Execute the trading signal based on the available order book depth and calculate actual P&L.

    bbo_df (DataFrame): DataFrame containing the order book and signals for each day.
    signal (Series): Generated trading signals (+1 for buy, -1 for sell, 0 for hold).
    order_size (int): Size of the order (e.g., $1M or 1 share for initial).

    :return: DataFrame with actual P&L calculated for each signal.
    """

    # Initialize return and position tracking
    bbo_df['return'] = 0
    position = 0  # Tracks the position (positive for buy, negative for sell)
    
    # Loop through the DataFrame and execute trades based on the signal
    for i in range(1, len(bbo_df)):
        current_signal = signal.iloc[i]
        previous_signal = signal.iloc[i-1]
        
        if current_signal != previous_signal:  # Only act when signal changes
            # Extract the order book state
            bids, asks = process_mpb10_data(bbo_df.iloc[:i+1])

            # Trade on buy signal (+1) and sell signal (-1)
            if current_signal == 1 and position == 0:
                # Buy at the best ask price, fill up to the order size
                best_ask_price = asks.iloc[0]['price']
                position = order_size
                bbo_df.loc[bbo_df.index[i], 'return'] = -best_ask_price  # negative because we are buying
                
            elif current_signal == -1 and position > 0:
                # Sell at the best bid price, close the position
                best_bid_price = bids.iloc[0]['price']
                bbo_df.loc[bbo_df.index[i], 'return'] = (best_bid_price - abs(bbo_df['return'][i-1]))  # sell return
                position = 0  # Reset position after selling

    # Scale return to $1M for actual P&L calculation as per assignment description
    bbo_df['scaled_return'] = bbo_df['return'] * 1_000_000
    
    # Cumulative returns for analysis
    bbo_df['cumulative_return'] = bbo_df['scaled_return'].cumsum()
    
    return bbo_df

In [9]:
test = bbo_dfs_anf[0]

In [None]:
signal = moving_average_signal(test, short_window=50, long_window=500, b=0.0001)

returns = execute_trading_signal(test, signal)

In [None]:
returns['cumulative_return']

In [None]:
# theoretical, hopefully this works

returns = []
for df in bbo_dfs_anf:
    ret = execute_trading_signal(df, signal)
    returns.append(ret)