# Data processing
Input: TAQ millisecond trade data (CSV)
Output: the same trades aggregated into 15-minute bars (to use a different interval, change the `freq='15min'` arguments in the code below)

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Path of the raw TAQ millisecond CSV that the notebook reads.
TAQ_MS_FILE = "data/TAQ_Millisecond_AAPL_2023.csv"

# Number of CSV rows pulled into memory per read (65536).
chunksize = 1 << 16

# Column name -> callable passed to pd.read_csv(converters=...) to turn the
# raw text of each value into a fixed-width numeric type.
converters = dict(
    SIZE=np.int64,
    PRICE=np.float64,
    TR_CORR=np.int64,
    TR_SEQNUM=np.int64,
    TR_ID=np.int64,
)

In [None]:
# The TAQ file is too big to load into memory at once, so it is streamed in
# pieces of `chunksize` rows and each piece is reduced to 15-minute bars.
def _to_bars(trades):
    """Aggregate a datetime-indexed trade frame into 15-minute bars.

    Each bar holds the first/max/min/last/mean of PRICE and the sum of SIZE.
    """
    return trades.groupby(pd.Grouper(freq='15min')).agg(
        OPEN      = pd.NamedAgg(column="PRICE", aggfunc="first"),
        HIGH      = pd.NamedAgg(column="PRICE", aggfunc="max"),
        LOW       = pd.NamedAgg(column="PRICE", aggfunc="min"),
        CLOSE     = pd.NamedAgg(column="PRICE", aggfunc="last"),
        AVG_PRICE = pd.NamedAgg(column="PRICE", aggfunc="mean"),
        VOLUME    = pd.NamedAgg(column="SIZE" , aggfunc="sum"),
    )


chunks = []
# Trades belonging to the last (possibly incomplete) interval read so far.
# They are held back and prepended to the next chunk so that an interval cut
# in two by a chunk boundary is aggregated once, from all of its trades.
# Aggregating the two halves separately would leave two partial rows whose
# means a later "mean" merge weights equally, regardless of trade counts.
carry = None
for chunk in pd.read_csv(TAQ_MS_FILE, chunksize=chunksize, converters=converters):
    chunk['datetime'] = pd.to_datetime(chunk['DATE'] + ' ' + chunk['TIME_M'])
    chunk.index = chunk['datetime']
    # Progress output: timestamp of the first row of the chunk just read.
    print(chunk['datetime'].iloc[0].strftime('%Y-%m-%d %X'))

    if carry is not None and not carry.empty:
        chunk = pd.concat([carry, chunk])

    # NOTE: assumes the file is in chronological order, so only the latest
    # interval of a chunk can continue into the next chunk.
    tail_start = chunk.index.max().floor('15min')
    in_tail = chunk.index >= tail_start
    carry = chunk[in_tail]
    complete = chunk[~in_tail]
    if not complete.empty:
        chunks.append(_to_bars(complete))

# Flush the trades of the final interval, which has no following chunk.
if carry is not None and not carry.empty:
    chunks.append(_to_bars(carry))
print("Loaded {} data chunks".format(len(chunks)))

In [None]:
# Stack the per-chunk bar frames into a single time-indexed frame.
result = pd.concat(chunks)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
result.info()

In [None]:
# Chunks are cut by row count, not on the 15-minute grid, so the stacked frame
# can contain more than one partial row for the same interval. Collapse those
# rows back into a single row per interval.
# Note: for an interval that appears more than once, AVG_PRICE becomes the
# plain (unweighted) mean of its partial means.
merge_rules = dict(
    OPEN="first",
    HIGH="max",
    LOW="min",
    CLOSE="last",
    AVG_PRICE="mean",
    VOLUME="sum",
)
result = result.groupby(pd.Grouper(freq='15min')).agg(merge_rules)

In [None]:
# Keep only intervals that fall on Monday-Friday (dayofweek 0-4) and that
# contain at least one traded share.
is_weekday = result.index.dayofweek <= 4
has_trades = result['VOLUME'] > 0
result = result[is_weekday & has_trades]
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
result.info()

In [None]:
# Persist the 15-minute bars; the datetime index is written as the first column.
result.to_csv(path_or_buf="data/TAQ_15Min_AAPL_2023.csv")

## Validation
We validate the computed 15-minute bars against the TAQ Daily data set for the same time period.

In [None]:
# this is for inspecting the data
import matplotlib.pyplot as plt

In [None]:
# Load the TAQ Daily file and index it by its parsed DATE column so it can be
# plotted on the same time axis as the computed bars.
year = pd.read_csv("data/TAQ_Daily_AAPL_2023.csv")
year.index = pd.to_datetime(year['DATE'])
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
year.info()

In [None]:
# CHECK: the computed average price should track the daily buy/sell averages.
daily_price_series = (
    ('avg_buy_price_LR', "daily_buyprice"),
    ('avg_sell_price_LR', "daily_sellprice"),
)
for column, label in daily_price_series:
    plt.plot(year.index, year[column], label=label)
plt.plot(result.index, result['AVG_PRICE'], label="computed_price")
plt.legend()

In [None]:
# CHECK: the 15-minute volumes summed per calendar day should match the daily
# file's total volume.
# '1D' is the documented alias for a calendar-day frequency; the lowercase
# '1d' spelling is deprecated in recent pandas releases.
daily_computed_vol = result.groupby(pd.Grouper(freq='1D')).agg({'VOLUME':'sum'})
# Grouping by day also creates rows for days with no bars; drop them.
daily_computed_vol = daily_computed_vol[daily_computed_vol['VOLUME'] > 0]

# Log-scale y axis for both volume series.
plt.semilogy(year.index, year['total_vol'], label="daily_vol")
plt.semilogy(daily_computed_vol.index, daily_computed_vol['VOLUME'], label="computed_vol")
plt.legend()