In [7]:
import vaex
import numpy as np
import pandas as pd
import statsmodels.api as sm
import datetime as dt
import os
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re

print(os.getcwd())

# Read in the data

/Users/macos/Financial-Big-Data/Labs


In [8]:
def load_TRTH_trade(filename,
             tz_exchange="America/New_York",
             only_non_special_trades=True,
             only_regular_trading_hours=True,
             open_time="09:30:00",
             close_time="16:00:00",
             merge_sub_trades=True):
    try:
        if re.search('(csv|csv\\.gz)$',filename):
            DF = pd.read_csv(filename)
        if re.search(r'arrow$',filename):
            DF = pd.read_arrow(filename)
        if re.search('parquet$',filename):
            DF = pd.read_parquet(filename)

    except Exception as e:
     #   print("load_TRTH_trade could not load "+filename)
     #   print(e)
        return None
    
    try:
        DF.shape
    except Exception as e: # DF does not exist
        print("DF does not exist")
        print(e)
        return None

    
    if DF.shape[0]==0:
        return None
    
    if only_non_special_trades:
        DF = DF[DF["trade-stringflag"]=="uncategorized"]

    DF.drop(columns=["trade-rawflag","trade-stringflag"],axis=1,inplace=True)
    
    DF.index = pd.to_datetime(DF["xltime"],unit="d",origin="1899-12-30",utc=True)
    DF.index = DF.index.tz_convert(tz_exchange)  # .P stands for Arca, which is based at New York
    DF.drop(columns="xltime",inplace=True)
    
    if only_regular_trading_hours:
        DF=DF.between_time(open_time,close_time)    # warning: ever heard e.g. about Thanksgivings?
    
    if merge_sub_trades:
           DF=DF.groupby(DF.index).agg(trade_price=pd.NamedAgg(column='trade-price', aggfunc='mean'),
                                       trade_volume=pd.NamedAgg(column='trade-volume', aggfunc='sum'))
    
    return DF


def load_TRTH_bbo(filename,
             tz_exchange="America/New_York",
             only_regular_trading_hours=True,
             merge_sub_trades=True):
    try:
        if re.search(r'(csv|csv\.gz)$',filename):
            DF = pd.read_csv(filename)
        if re.search(r'arrow$',filename):
            DF = pd.read_arrow(filename)
        if re.search(r'parquet$',filename):
            DF = pd.read_parquet(filename) 
    except Exception as e:
       # print("load_TRTH_bbo could not load "+filename)
        return None
    
    try:
        DF.shape
    except Exception as e: # DF does not exist
        print("DF does not exist")
        print(e)
        return None

    if DF.shape[0]==0:
        return None
    
        
    DF.index = pd.to_datetime(DF["xltime"],unit="d",origin="1899-12-30",utc=True)
    DF.index = DF.index.tz_convert(tz_exchange)  # .P stands for Arca, which is based at New York
    DF.drop(columns="xltime",inplace=True)
    
    if only_regular_trading_hours:
        DF=DF.between_time("09:30:00","16:00:00")    # ever heard about Thanksgivings?
        
    if merge_sub_trades:
        DF=DF.groupby(DF.index).last()
    

        
    return DF

In [9]:
def load_merge_trade_bbo(ticker,date,
                         country="US",
                         dirBase="../data/raw/TRTH/equities/",
                         suffix="parquet",
                         suffix_save=None,
                         dirSaveBase="../data/clean/TRTH/equities/events",
                         saveOnly=False,
                         doSave=False
                        ):
    
    file_trade=dirBase+"/"+country+"/trade/"+ticker+"/"+str(date.date())+"-"+ticker+"-trade."+suffix
    file_bbo=file_trade.replace("trade","bbo")
    trades=load_TRTH_trade(file_trade)
    bbos  =load_TRTH_bbo(file_bbo)
    try:
        trades.shape + bbos.shape
    except:
        return None
    
    events=trades.join(bbos,how="outer")
    
    if doSave:
        dirSave=dirSaveBase+"/"+country+"/events/"+ticker
        if not os.path.isdir(dirSave):
            os.makedirs(dirSave)

        if suffix_save:
            suffix=suffix_save
        
        file_events=dirSave+"/"+str(date.date())+"-"+ticker+"-events"+"."+suffix
       # pdb.set_trace()

        saved=False
        if suffix=="arrow":
            events=vaex.from_pandas(events,copy_index=True)
            events.export_arrow(file_events)
            saved=True
        if suffix=="parquet":
         #   pdb.set_trace()
            events.to_parquet(file_events,use_deprecated_int96_timestamps=True)
            saved=True
            
        if not saved:
            print("suffix "+suffix+" : format not recognized")
            
        if saveOnly:
            return saved
    return events

In [24]:
def response(df, shift):
    return df["sign"]*(df["mid-price"].shift(shift)-df["mid-price"])

In [26]:
df = load_merge_trade_bbo("SPY.P", dt.datetime(2010,5,5), suffix="csv")
df.dropna(inplace=True)
df["mid-price"] = (df["bid-price"]+df["ask-price"])/2
df["sign"] = np.concatenate(([np.nan],np.sign(df["trade_price"].values[1:]-(df["bid-price"].values[-1]+df["ask-price"].values[-1])/2)))
df.dropna(inplace=True)
df["response"] = response(df, 5)
df

Unnamed: 0_level_0,trade_price,trade_volume,bid-price,bid-volume,ask-price,ask-volume,mid-price,sign,response
xltime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-05-05 09:30:01.194000128-04:00,116.540000,1800.0,116.51,1056.0,116.53,32.0,116.520,-1.0,
2010-05-05 09:30:01.212000-04:00,116.520000,600.0,116.51,1074.0,116.53,5.0,116.520,-1.0,
2010-05-05 09:30:01.252000512-04:00,116.523333,500.0,116.52,40.0,116.54,88.0,116.530,-1.0,
2010-05-05 09:30:01.502000384-04:00,116.520000,700.0,116.52,100.0,116.55,8.0,116.535,-1.0,
2010-05-05 09:30:02.069000192-04:00,116.530000,12731.0,116.52,32.0,116.53,108.0,116.525,-1.0,
...,...,...,...,...,...,...,...,...,...
2010-05-05 15:59:57.794000384-04:00,116.800000,100.0,116.79,1440.0,116.82,197.0,116.805,-1.0,0.035
2010-05-05 15:59:58.926000640-04:00,116.820000,3000.0,116.81,286.0,116.82,311.0,116.815,-1.0,0.035
2010-05-05 15:59:58.966001152-04:00,116.820000,1000.0,116.81,477.0,116.82,189.0,116.815,-1.0,0.035
2010-05-05 15:59:59.159000064-04:00,116.830000,100.0,116.82,327.0,116.83,784.0,116.825,0.0,-0.000
