In [54]:
import configparser
import os

# Load per-user settings from config.ini, resolved relative to the
# notebook's current working directory.
config = configparser.ConfigParser()
loaded_files = config.read('config.ini')
if not loaded_files:
    # ConfigParser.read() silently skips files it cannot open; without this
    # check a missing file only shows up later as a KeyError on the profile.
    raise FileNotFoundError(
        f"config.ini not found in working directory: {os.getcwd()}"
    )

# Current active profile (must match a [section] name in config.ini)
user_profile = 'Xuting'
if user_profile not in config:
    raise KeyError(
        f"Profile {user_profile!r} not found in config.ini; "
        f"available sections: {config.sections()}"
    )

# Base directories come from the profile's 'input =' and 'output =' keys.
input_base = config[user_profile]['input']
output_base = config[user_profile]['output']

print(f"Reading data from: {input_base}")
print(f"Saving results to: {output_base}")

Reading data from: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData
Saving results to: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData


In [55]:
import pyarrow.parquet as pq
import polars as pl

# --- Locate the raw quotes/trades files under the configured input dir ---
quotes_file = "data_quotes_2024_03_15.parquet"
trades_file = "data_trades_2024_03_15.parquet"

quotes_path = os.path.join(input_base, quotes_file)
trades_path = os.path.join(input_base, trades_file)

# --- Row counts straight from the parquet footer (no column data is read) ---
quotes_meta = pq.read_metadata(quotes_path)
trades_meta = pq.read_metadata(trades_path)
quotes_rows = quotes_meta.num_rows
trades_rows = trades_meta.num_rows

print(f"Quotes rows: {quotes_rows:,}")
print(f"Trades rows: {trades_rows:,}")

Quotes rows: 1,665,669,705
Trades rows: 82,461,761


In [56]:
import pyarrow.parquet as pq
from datetime import datetime

def _open_and_show_schema(heading, parquet_path):
    """Open a parquet file, print `heading` and its Arrow schema, return the handle."""
    handle = pq.ParquetFile(parquet_path)
    print(heading)
    print(handle.schema_arrow)     # column names and types only
    return handle


# Both raw files live under the base directory taken from the config file.
quotes_file = "data_quotes_2024_03_15.parquet"
trades_file = "data_trades_2024_03_15.parquet"
quotes_path = os.path.join(input_base, quotes_file)
trades_path = os.path.join(input_base, trades_file)

# Quotes first, then trades; each handle stays available under its own name.
pf_quotes = _open_and_show_schema("🧾 Quotes schema:", quotes_path)
pf_trades = _open_and_show_schema("\n💹 Trades schema:", trades_path)

🧾 Quotes schema:
DATE: date32[day]
TIME_M: time64[us]
EX: string
BID: double
BIDSIZ: int64
ASK: double
ASKSIZ: int64
QU_COND: string
QU_SEQNUM: int64
NATBBO_IND: string
QU_CANCEL: string
QU_SOURCE: string
SYM_ROOT: string
SYM_SUFFIX: string

💹 Trades schema:
DATE: date32[day]
TIME_M: time64[us]
EX: string
SYM_ROOT: string
SYM_SUFFIX: string
TR_SCOND: string
SIZE: int64
PRICE: double
TR_STOP_IND: string
TR_CORR: string
TR_SEQNUM: int64
TR_ID: int64
TR_SOURCE: string
TR_RF: string


In [57]:
#Trades

target_file = os.path.join(input_base, "data_trades_2024_03_15.parquet")
target_output = os.path.join(output_base, "2024_03_15/processed_output_trades/")

%run polar_try_partitioned.py --FILE_PATH=$target_file --OUTPUT_DIR=$target_output

Starting chunked read from: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/data_trades_2024_03_15.parquet
Saving output to directory: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_trades/
File contains 666 row groups (chunks).
✅ Chunk 0001/0666 saved: 123919 rows.
✅ Chunk 0002/0666 saved: 123214 rows.
✅ Chunk 0003/0666 saved: 123873 rows.
✅ Chunk 0004/0666 saved: 124035 rows.
✅ Chunk 0005/0666 saved: 124193 rows.
✅ Chunk 0006/0666 saved: 123727 rows.
✅ Chunk 0007/0666 saved: 123513 rows.
✅ Chunk 0008/0666 saved: 124206 rows.
✅ Chunk 0009/0666 saved: 123832 rows.
✅ Chunk 0010/0666 saved: 123111 rows.
✅ Chunk 0011/0666 saved: 124895 rows.
✅ Chunk 0012/0666 saved: 122903 rows.
✅ Chunk 0013/0666 saved: 124647 rows.
✅ Chunk 0014/0666 saved: 124418 rows.
✅ Chunk 0015/0666 saved: 122911 rows.
✅ Chunk 0016/0666 saved: 124216 rows.
✅ Chunk 0017/0666 saved: 123228 rows.
✅ Chunk 0018/0666 saved: 124430 rows.
✅ Chunk 0019/0666 saved: 124340 rows.
✅ Chunk 

In [None]:
# Quotes: run the same chunking script used for trades on the raw quotes file.
# Source file is read from input_base; chunks are written under output_base.
quotes_input_file = os.path.join(input_base, "data_quotes_2024_03_15.parquet")
quotes_output_dir = os.path.join(output_base, "2024_03_15/processed_output_quotes/")

# Ensure the output directory exists to avoid errors
# (exist_ok=True makes re-running this cell harmless).
os.makedirs(quotes_output_dir, exist_ok=True)

# IPython expands $name to the variable's value before %run parses arguments.
%run polar_try_partitioned.py --FILE_PATH=$quotes_input_file --OUTPUT_DIR=$quotes_output_dir

Starting chunked read from: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/data_quotes_2024_03_15.parquet
Saving output to directory: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_quotes/
File contains 13445 row groups (chunks).
✅ Chunk 0001/13445 saved: 124057 rows.
✅ Chunk 0002/13445 saved: 124631 rows.
✅ Chunk 0003/13445 saved: 123058 rows.
✅ Chunk 0004/13445 saved: 123057 rows.
✅ Chunk 0005/13445 saved: 123347 rows.
✅ Chunk 0006/13445 saved: 123260 rows.
✅ Chunk 0007/13445 saved: 123161 rows.
✅ Chunk 0008/13445 saved: 123552 rows.
✅ Chunk 0009/13445 saved: 124192 rows.
✅ Chunk 0010/13445 saved: 123754 rows.
✅ Chunk 0011/13445 saved: 123955 rows.
✅ Chunk 0012/13445 saved: 123672 rows.
✅ Chunk 0013/13445 saved: 123636 rows.
✅ Chunk 0014/13445 saved: 123667 rows.
✅ Chunk 0015/13445 saved: 124269 rows.
✅ Chunk 0016/13445 saved: 124241 rows.
✅ Chunk 0017/13445 saved: 124167 rows.
✅ Chunk 0018/13445 saved: 124046 rows.
✅ Chunk 0019/13445 saved: 

In [59]:
# Widen polars' table printing: up to 50 columns, 200 characters, 200 rows.
# Each setter hands back the Config class, so the calls chain; that class is
# also the value this cell displays.
(
    pl.Config
    .set_tbl_cols(50)
    .set_tbl_width_chars(200)
    .set_tbl_rows(200)
)

polars.config.Config

In [60]:
# Input file remains the same
trades_input = os.path.join(input_base, "data_trades_2024_03_15.parquet")

# Output file with the "_upper" suffix
trades_output_upper = os.path.join(input_base, "data_trades_2024_03_15_upper.parquet")

%run makesTradesColsUpper.py --INPUT=$trades_input --OUTPUT=$trades_output_upper

pf_trades = pq.ParquetFile(trades_input)
print("\n💹 Trades schema:")
print(pf_trades.schema_arrow)

Uppercasing column names (preserve types): 100%|██████████| 666/666 [00:32<00:00, 20.73it/s]


Done. Wrote: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/data_trades_2024_03_15_upper.parquet
Input TIME_M type: time64[us]
Output TIME_M type: time64[us]

💹 Trades schema:
DATE: date32[day]
TIME_M: time64[us]
EX: string
SYM_ROOT: string
SYM_SUFFIX: string
TR_SCOND: string
SIZE: int64
PRICE: double
TR_STOP_IND: string
TR_CORR: string
TR_SEQNUM: int64
TR_ID: int64
TR_SOURCE: string
TR_RF: string


In [61]:
from pathlib import Path
import pyarrow.parquet as pq
import os

input_file_upper = os.path.join(input_base, "data_trades_2024_03_15_upper.parquet")
output_dir_clean = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_clean_from_single/")

os.makedirs(output_dir_clean, exist_ok=True)


%run rewrite2.py --IN_FILE=$input_file_upper --OUT_DIR=$output_dir_clean --BATCH_ROWS=25000


DIR = Path(output_dir_clean)

files = sorted(DIR.glob("*.parquet"))
if not files:
    raise RuntimeError(f"No parquet files found in {DIR}")

# Check the schema of the first generated file to verify results
for f in files[:1]:
    arrow_schema = pq.read_schema(f)
    print(f"File Name: {f.name}")
    print(f"Schema:\n{arrow_schema}")


Rewrite2 (single->chunks): 100%|██████████| 3299/3299 [00:44<00:00, 74.49it/s]



Done. Written chunks: 3299

Row count check:
  IN_FILE : 82461761
  OUT_DIR : 82461761
  DIFF    : 0
Sanity check passed: scan_parquet works on rewritten chunks.
File Name: chunk_000001.parquet
Schema:
DATE: date32[day]
TIME_M: time64[us]
EX: string
SYM_ROOT: string
SYM_SUFFIX: string
TR_SCOND: string
SIZE: int64
PRICE: double
TR_STOP_IND: string
TR_CORR: string
TR_SEQNUM: int64
TR_ID: int64
TR_SOURCE: string
TR_RF: string


In [62]:
# Inputs for query.py: the rewritten trades chunks and the chunked quotes.
# Both are passed as bare directories here (no "*.parquet" glob).
trades_clean_dir = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_clean_from_single")
quotes_processed_dir = os.path.join(output_base, "2024_03_15/processed_output_quotes")

# Text report produced by query.py. Per the script's log output, an existing
# file at this path is deleted before processing.
analysis_output_file = os.path.join(output_base, "2024_03_15/taq_analysis_output.txt")

# IPython expands $name to the variable's value before %run parses arguments.
%run query.py --TRADES_UPPER=$trades_clean_dir --QUOTES_OLD=$quotes_processed_dir --OUTPUT_FILE=$analysis_output_file


[INFO] Deleted existing output file before processing: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/taq_analysis_output.txt

Analysis successfully executed and results written to /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/taq_analysis_output.txt


  print(trades_lf.fetch(n_rows=10), file=f)
  print(quotes_lf.fetch(n_rows=10), file=f)


In [63]:
import os
import polars as pl


quotes_path = os.path.join(input_base, "data_quotes_2024_03_15.parquet")

# Plain SPY only: root symbol is SPY and the suffix is missing or empty.
is_spy_root = pl.col("SYM_ROOT") == "SPY"
has_no_suffix = pl.col("SYM_SUFFIX").fill_null("") == ""

# Lazy scan of the raw quotes file; only matching rows are materialised.
q = pl.scan_parquet(quotes_path).filter(is_spy_root & has_no_suffix).collect()

# Row count of the SPY quotes (shown as the cell's result).
q.height

27516767

In [64]:
# Same SPY count as on the raw file, but taken over the chunked quotes output.
QUOTES_DIR = os.path.join(output_base, "2024_03_15/processed_output_quotes/")

quotes_chunks_lf = pl.scan_parquet(f"{QUOTES_DIR}/*.parquet")
plain_spy = (pl.col("SYM_ROOT") == "SPY") & (pl.col("SYM_SUFFIX").fill_null("") == "")

q = quotes_chunks_lf.filter(plain_spy).collect()
q.height

27516767

In [65]:
# SPY quote count over the top-50 quotes extract.
QUOTES_DIR = os.path.join(output_base, "2024_03_15/processed_output_quotes_top50/")

# Root symbol SPY with a null-or-empty suffix.
root_is_spy = pl.col("SYM_ROOT") == "SPY"
suffix_is_blank = pl.col("SYM_SUFFIX").fill_null("") == ""

top50_quotes_lf = pl.scan_parquet(f"{QUOTES_DIR}/*.parquet")
q = top50_quotes_lf.filter(root_is_spy & suffix_is_blank).collect()

# First element of shape is the number of rows.
q.shape[0]

27516767

In [66]:
trades_path = os.path.join(input_base, "data_trades_2024_03_15.parquet")

# Select plain SPY trades (suffix null or empty) from the raw trades file.
spy_trades_filter = (
    (pl.col("SYM_ROOT") == "SPY")
    & (pl.col("SYM_SUFFIX").fill_null("") == "")
)
raw_trades_lf = pl.scan_parquet(trades_path)
t = raw_trades_lf.filter(spy_trades_filter).collect()

# Number of SPY trade rows.
t.height

694399

In [67]:
# Same SPY count, this time on the "_upper" copy of the trades file.
trades_upper_path = os.path.join(input_base, "data_trades_2024_03_15_upper.parquet")

symbol_is_spy = pl.col("SYM_ROOT") == "SPY"
suffix_is_empty = pl.col("SYM_SUFFIX").fill_null("") == ""

t = (
    pl.scan_parquet(trades_upper_path)
    .filter(symbol_is_spy & suffix_is_empty)
    .collect()
)

t.shape[0]

694399

In [68]:
# SPY trade count over the chunked trades output.
TRADES_DIR = os.path.join(output_base, "2024_03_15/processed_output_trades/")

trade_chunks_lf = pl.scan_parquet(f"{TRADES_DIR}/*.parquet")
only_plain_spy = (pl.col("SYM_ROOT") == "SPY") & (pl.col("SYM_SUFFIX").fill_null("") == "")

# NOTE(review): the result is bound to `q` although it holds trades, not quotes.
q = trade_chunks_lf.filter(only_plain_spy).collect()

q.height

694399

In [69]:
import polars as pl

# Explicit polars dtypes for every trades column, in file column order.
schema = dict(
    DATE=pl.Date,
    TIME_M=pl.Time,
    EX=pl.Utf8,
    SYM_ROOT=pl.Utf8,
    SYM_SUFFIX=pl.Utf8,
    TR_SCOND=pl.Utf8,
    SIZE=pl.Int64,
    PRICE=pl.Float64,
    TR_STOP_IND=pl.Utf8,
    TR_CORR=pl.Utf8,
    TR_SEQNUM=pl.Int64,
    TR_ID=pl.Int64,
    TR_SOURCE=pl.Utf8,
    TR_RF=pl.Utf8,
)

TRADES = os.path.join(input_base, "data_trades_2024_03_15_upper.parquet")

# One lazy scan with the explicit schema, reused for both counts below.
trades_upper_lf = pl.scan_parquet(TRADES, schema=schema)
plain_spy_only = (pl.col("SYM_ROOT") == "SPY") & (pl.col("SYM_SUFFIX").fill_null("") == "")

# Count 1: plain SPY rows.
t = trades_upper_lf.filter(plain_spy_only).collect()
print(t.height)

# Count 2: every row in the file (`t` ends up holding the full frame).
t = trades_upper_lf.collect()
print(t.height)

694399
82461761


In [70]:
import polars as pl

# Column -> dtype mapping for trades, listed in file column order.
# In this cell only the disabled variant further down consumes it.
schema = dict([
    ("DATE", pl.Date),
    ("TIME_M", pl.Time),
    ("EX", pl.Utf8),
    ("SYM_ROOT", pl.Utf8),
    ("SYM_SUFFIX", pl.Utf8),
    ("TR_SCOND", pl.Utf8),
    ("SIZE", pl.Int64),
    ("PRICE", pl.Float64),
    ("TR_STOP_IND", pl.Utf8),
    ("TR_CORR", pl.Utf8),
    ("TR_SEQNUM", pl.Int64),
    ("TR_ID", pl.Int64),
    ("TR_SOURCE", pl.Utf8),
    ("TR_RF", pl.Utf8),
])

# Disabled variant: same SPY count against the "upper_clean" directory,
# scanned with the explicit schema.
#TRADES_CLEAN = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_clean/")
#t_clean_spy = pl.scan_parquet(TRADES_CLEAN, schema=schema).filter(
#   (pl.col("SYM_ROOT") == "SPY") & (pl.col("SYM_SUFFIX").fill_null("") == "")
#).collect()
#print(f"Clean SPY: {len(t_clean_spy)}")

TRADES_SINGLE = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_clean_from_single/")

# Plain SPY: root symbol SPY with a null-or-empty suffix.
spy_without_suffix = (pl.col("SYM_ROOT") == "SPY") & (pl.col("SYM_SUFFIX").fill_null("") == "")

# SPY rows in the rewritten chunks (schema inferred from the files).
single_spy_lf = pl.scan_parquet(TRADES_SINGLE).filter(spy_without_suffix)
t_single_spy = single_spy_lf.collect()
print(f"Single SPY: {len(t_single_spy)}")

# All rows in the rewritten chunks.
single_all_lf = pl.scan_parquet(TRADES_SINGLE)
t_single_all = single_all_lf.collect()
print(f"Single Total: {len(t_single_all)}")

Single SPY: 694399
Single Total: 82461761


In [71]:
%run offending_datatypes.py --TRADES_DIR=/Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_trades_upper_clean_from_single

Found 3299 parquet files in /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_trades_upper_clean_from_single
  scanned schemas: 200/3299
  scanned schemas: 400/3299
  scanned schemas: 600/3299
  scanned schemas: 800/3299
  scanned schemas: 1000/3299
  scanned schemas: 1200/3299
  scanned schemas: 1400/3299
  scanned schemas: 1600/3299
  scanned schemas: 1800/3299
  scanned schemas: 2000/3299
  scanned schemas: 2200/3299
  scanned schemas: 2400/3299
  scanned schemas: 2600/3299
  scanned schemas: 2800/3299
  scanned schemas: 3000/3299
  scanned schemas: 3200/3299
  scanned schemas: 3299/3299

=== Columns with >1 dtype across files ===

=== SIZE dtype breakdown ===
SIZE dtype Int64: 3299 files
  examples: ['chunk_000001.parquet', 'chunk_000002.parquet', 'chunk_000003.parquet', 'chunk_000004.parquet', 'chunk_000005.parquet', 'chunk_000006.parquet', 'chunk_000007.parquet', 'chunk_000008.parquet', 'chunk_000009.parquet', 'chunk_000010.parquet']

=== Offending

In [72]:
# Run query.py with explicit "*.parquet" globs for the trades and quotes inputs.
# All paths are derived from output_base rather than hard-coded absolute paths.
trades_clean = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_clean_from_single/*.parquet")
quotes_old = os.path.join(output_base, "2024_03_15/processed_output_quotes/*.parquet")
# Text report; per the script's log output an existing file is deleted first.
output_txt = os.path.join(output_base, "2024_03_15/taq_analysis_output.txt")

# IPython expands $name to the variable's value before %run parses arguments.
%run query.py --TRADES_UPPER=$trades_clean --QUOTES_OLD=$quotes_old --OUTPUT_FILE=$output_txt

[INFO] Deleted existing output file before processing: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/taq_analysis_output.txt

Analysis successfully executed and results written to /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/taq_analysis_output.txt


  print(trades_lf.fetch(n_rows=10), file=f)
  print(quotes_lf.fetch(n_rows=10), file=f)


In [73]:
# Run stats.py over the rewritten trades chunks and the chunked quotes.
# All paths are derived from output_base rather than hard-coded absolute paths.
# NOTE(review): per the script's log output, an existing OUT_DIR is deleted
# before processing, so anything previously stored there is lost.
t_dir = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_clean_from_single/")
q_dir = os.path.join(output_base, "2024_03_15/processed_output_quotes/")
o_dir = os.path.join(output_base, "2024_03_15/stats_out_simple/")

# IPython expands $name to the variable's value before %run parses arguments.
%run stats.py --TRADES_DIR=$t_dir --QUOTES_DIR=$q_dir --OUT_DIR=$o_dir

[INFO] Deleted existing OUT_DIR before processing: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/stats_out_simple
Output dir: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/stats_out_simple
[TRADES] scanning 3299 files in /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_trades_upper_clean_from_single …
  → 1/3299 chunk_000001.parquet: 1 row-groups
    RG 1/1 … unique symbols so far: 1
  → 2/3299 chunk_000002.parquet: 1 row-groups
    RG 1/1 … unique symbols so far: 2
  → 3/3299 chunk_000003.parquet: 1 row-groups
    RG 1/1 … unique symbols so far: 2
  → 4/3299 chunk_000004.parquet: 1 row-groups
    RG 1/1 … unique symbols so far: 14
  → 5/3299 chunk_000005.parquet: 1 row-groups
    RG 1/1 … unique symbols so far: 14
  → 6/3299 chunk_000006.parquet: 1 row-groups
    RG 1/1 … unique symbols so far: 14
  → 7/3299 chunk_000007.parquet: 1 row-groups
    RG 1/1 … unique symbols so far: 18
  → 8/3299 chunk_000008.parquet: 1

  df = pl.DataFrame(rows, schema=["SYM_ROOT", "SYM_SUFFIX", "TRADES_ROWS", "TRADES_VOL"])



[QUOTES] scanning 13445 files in /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_quotes …
  → 1/13445 chunk_0001.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 124,057
  → 2/13445 chunk_0002.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 248,688
  → 3/13445 chunk_0003.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 371,746
  → 4/13445 chunk_0004.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 494,803
  → 5/13445 chunk_0005.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 618,150
  → 6/13445 chunk_0006.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 741,410
  → 7/13445 chunk_0007.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 864,571
  → 8/13445 chunk_0008.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 988,123
  → 9/13445 chunk_0009.parquet: 2 row-groups
    RG 2/2 … cumulative QUOTES rows: 1,112,315
  → 10/13445 chunk_0010.parquet: 2 row-groups
    RG 2/2 … cumu

  df = pl.DataFrame(rows, schema=["SYM_ROOT", "SYM_SUFFIX", "QUOTES_ROWS"])


In [74]:
# Locations of the two top-50 summary CSVs inside the stats output directory.
OUT_DIR = os.path.join(output_base, "2024_03_15/stats_out_simple")
TRADES_OUT_CSV = os.path.join(OUT_DIR, "top50_trades_by_volume.csv")
QUOTES_OUT_CSV = os.path.join(OUT_DIR, "top50_quotes_by_rows.csv")

print(f"Reading {TRADES_OUT_CSV} and {QUOTES_OUT_CSV} …")
# Trades CSV is read first, then quotes.
trades_df, quotes_df = (
    pl.read_csv(csv_path) for csv_path in (TRADES_OUT_CSV, QUOTES_OUT_CSV)
)

# Trades, largest row count first.
print("\n🔥 Top Trades (Sorted by TRADES_ROWS):")
trades_ranked = trades_df.sort("TRADES_ROWS", descending=True)
print(trades_ranked)

# Quotes, largest row count first.
print("\n📊 Top Quotes (Sorted by QUOTES_ROWS):")
quotes_ranked = quotes_df.sort("QUOTES_ROWS", descending=True)
print(quotes_ranked)

Reading /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/stats_out_simple/top50_trades_by_volume.csv and /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/stats_out_simple/top50_quotes_by_rows.csv …

🔥 Top Trades (Sorted by TRADES_ROWS):
shape: (50, 4)
┌──────────┬────────────┬─────────────┬────────────┐
│ SYM_ROOT ┆ SYM_SUFFIX ┆ TRADES_ROWS ┆ TRADES_VOL │
│ ---      ┆ ---        ┆ ---         ┆ ---        │
│ str      ┆ str        ┆ i64         ┆ i64        │
╞══════════╪════════════╪═════════════╪════════════╡
│ TSLA     ┆            ┆ 1107061     ┆ 107574931  │
│ AAPL     ┆            ┆ 771409      ┆ 162289433  │
│ AMD      ┆            ┆ 726550      ┆ 107720633  │
│ VERB     ┆            ┆ 723088      ┆ 546980770  │
│ SPY      ┆            ┆ 694399      ┆ 117996619  │
│ AMZN     ┆            ┆ 465377      ┆ 96036298   │
│ SOUN     ┆            ┆ 327723      ┆ 130961335  │
│ JTAI     ┆            ┆ 296601      ┆ 101697821  │
│ GERN     ┆            ┆ 2

In [75]:
# Extract the top-50 symbols' trades and corresponding quotes into their own
# directories. All paths are derived from output_base rather than hard-coded
# absolute paths.
import os


# --- 1. Resolve input/output locations ---
# Directory holding the stats CSVs, and the top-50 CSV's file name inside it.
stats_dir = os.path.join(output_base, "2024_03_15/stats_out_simple/")
top50_csv = "top50_trades_by_volume.csv"

# Trades: full rewritten chunks in, top-50 subset out.
trades_src = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_clean_from_single/")
trades_out = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_top50/")

# Quotes: full chunked quotes in, top-50 subset out.
quotes_src = os.path.join(output_base, "2024_03_15/processed_output_quotes/")
quotes_out = os.path.join(output_base, "2024_03_15/processed_output_quotes_top50/")

# NOTE(review): per the script's log output it deletes existing output
# directories before processing, so these only guarantee the paths exist.
os.makedirs(trades_out, exist_ok=True)
os.makedirs(quotes_out, exist_ok=True)

# --- 2. Run the Top 50 Processing Script ---
# IPython expands $name to the variable's value before %run parses arguments.
%run top50_trades_corresponding_quotes.py \
    --OUT_DIR=$stats_dir \
    --TRADES_TOP50_CSV=$top50_csv \
    --TRADES_SRC_DIR=$trades_src \
    --TRADES_OUT_DIR=$trades_out \
    --QUOTES_SRC_DIR=$quotes_src \
    --QUOTES_OUT_DIR=$quotes_out \
    --MERGED_OUT_CSV="merged_top50_sorted.csv"


🔧 Paths configured:
  • Top-50 CSV        : /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/stats_out_simple/top50_trades_by_volume.csv
  • Trades src        : /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_trades_upper_clean_from_single
  • Trades out (top50): /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_trades_upper_top50
  • Quotes src        : /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_quotes
  • Quotes out (top50): /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_quotes_top50
  • Summary CSV       : /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/stats_out_simple/merged_top50_sorted.csv
[INFO] Deleted existing TRADES_OUT_DIR before processing: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/processed_output_trades_upper_top50
[INFO] Deleted existing QUOTES_OUT_DIR before processing: /User

In [76]:
# Merge the top-50 trades with their quotes into a single parquet file.
# All paths are derived from output_base rather than hard-coded absolute paths.
# NOTE(review): per the script's log output, an existing OUT_DIR is deleted
# before processing.

# These rebind t_dir / q_dir to the top-50 extract directories.
t_dir = os.path.join(output_base, "2024_03_15/processed_output_trades_upper_top50/")
q_dir = os.path.join(output_base, "2024_03_15/processed_output_quotes_top50/")
s_dir = os.path.join(output_base, "2024_03_15/stats_out_simple/")
out_dir = os.path.join(output_base, "2024_03_15/merged_output_top50/")

# IPython expands $name to the variable's value before %run parses arguments.
%run top50_trades_corresponding_quotes_persist.py \
    --TRADES_DIR=$t_dir \
    --QUOTES_DIR=$q_dir \
    --STATS_DIR=$s_dir \
    --TOP50_CSV="top50_trades_by_volume.csv" \
    --OUT_DIR=$out_dir \
    --OUT_FILE="merged_all.parquet" \
    --CHUNK_SIZE=25000

[INFO] Deleted existing OUT_DIR before processing: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/merged_output_top50
📄 Loading top50: /Users/alessiaaaa/Desktop/WithAmitG/WithAmitG/TAQData/2024_03_15/stats_out_simple/top50_trades_by_volume.csv

—— [1/50] BB: 2024-03-15 → 2024-03-15 | tol=1s | chunk=25,000


  out_df = shard.collect(engine="streaming")


📝 Opened writer → merged_all.parquet
    ✅ RG 00001: BB 2024-03-15 04:00 — 4 rows
    ✅ RG 00002: BB 2024-03-15 05:00 — 3 rows
    ✅ RG 00003: BB 2024-03-15 06:00 — 3 rows
    ✅ RG 00004: BB 2024-03-15 08:00 — 20 rows
    ✅ RG 00005: BB 2024-03-15 09:00 — 2,800 rows
    ✅ RG 00006: BB 2024-03-15 10:00 — 6,534 rows
    ✅ RG 00007: BB 2024-03-15 11:00 — 3,781 rows
    ✅ RG 00008: BB 2024-03-15 12:00 — 3,948 rows
    ✅ RG 00009: BB 2024-03-15 13:00 — 5,259 rows
    ✅ RG 00010: BB 2024-03-15 14:00 — 5,843 rows
    ✅ RG 00011: BB 2024-03-15 15:00 — 11,721 rows
    ✅ RG 00012: BB 2024-03-15 16:00 — 215 rows
    ✅ RG 00013: BB 2024-03-15 17:00 — 43 rows
    ✅ RG 00014: BB 2024-03-15 18:00 — 35 rows
    ✅ RG 00015: BB 2024-03-15 19:00 — 30 rows

—— [2/50] SPY: 2024-03-15 → 2024-03-15 | tol=1s | chunk=25,000
    ✅ RG 00016: SPY 2024-03-15 04:00 — 549 rows
    ✅ RG 00017: SPY 2024-03-15 05:00 — 274 rows
    ✅ RG 00018: SPY 2024-03-15 06:00 — 328 rows
    ✅ RG 00019: SPY 2024-03-15 07:00 — 750 ro

In [77]:
# Run the microstructure analysis (ms.py) on the merged top-50 file for three
# tickers. Paths are derived from output_base rather than hard-coded.


# Input: the merged trades+quotes parquet. Output: directory for ms.py results.
merged_file = os.path.join(output_base, "2024_03_15/merged_output_top50/merged_all.parquet")
ms_out_dir = os.path.join(output_base, "2024_03_15/ms_out/")


# --TICKERS takes a comma-separated list of symbols.
%run ms.py --MERGED_FILE=$merged_file --OUT_DIR=$ms_out_dir --TICKERS=VERB,FSR,BMY

Null TR_SEQNUM : 0
Null QU_SEQNUM : 206422
Zero BID and ASK : 15134

🔎 Running microstructure analysis for: VERB
shape: (1, 19)
┌────────┬────────┬────────────┬──────────┬─────────────┬─────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ TICKER ┆ N_ROWS ┆ TOTAL_SIZE ┆ VWAP     ┆ AVG_SPREAD_ ┆ AVG_DEPTH_T ┆ IS_MEAN_BP ┆ IS_MEDIAN_ ┆ IS_MIN_BPS ┆ IS_MAX_BPS ┆ MI_MEAN_BP ┆ MI_MEDIAN_ ┆ MI_MIN_BPS ┆ MI_MAX_BPS ┆ MR_MEAN_BP ┆ MR_MEDIAN_ ┆ MR_MIN_BPS ┆ MR_MAX_BPS ┆ FWD_TRADES │
│ ---    ┆ ---    ┆ ---        ┆ ---      ┆ BPS         ┆ OB          ┆ S          ┆ BPS        ┆ ---        ┆ ---        ┆ S          ┆ BPS        ┆ ---        ┆ ---        ┆ S          ┆ BPS        ┆ ---        ┆ ---        ┆ _FOR_MR    │
│ str    ┆ u32    ┆ i64        ┆ f64      ┆ ---         ┆ ---         ┆ ---        ┆ ---        ┆ f64        ┆ f64        ┆ ---        ┆ ---        ┆

In [78]:
# Display the per-ticker summary table.
# NOTE(review): `summary_df` is not assigned in the cells visible here;
# presumably `%run ms.py` leaves it in the notebook namespace — TODO confirm.
summary_df

TICKER,N_ROWS,TOTAL_SIZE,VWAP,AVG_SPREAD_BPS,AVG_DEPTH_TOB,IS_MEAN_BPS,IS_MEDIAN_BPS,IS_MIN_BPS,IS_MAX_BPS,MI_MEAN_BPS,MI_MEDIAN_BPS,MI_MIN_BPS,MI_MAX_BPS,MR_MEAN_BPS,MR_MEDIAN_BPS,MR_MIN_BPS,MR_MAX_BPS,FWD_TRADES_FOR_MR
str,u32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
"""VERB""",708281,536025444,0.541075,136.421152,52.06326,64.211393,33.997086,0.0,9983.981312,0.0,-0.0,-0.0,0.0,-56.28771,-4.004271,-6194600.0,51917.366786,10
"""FSR""",155584,308787285,0.173557,54.212612,141.000778,84.190576,6.22665,0.0,8148.492743,0.0,0.0,-0.0,0.0,-57.566389,-2.476167,-43793.793794,43446.046743,10
"""BMY""",150102,18828776,52.502794,1679.59321,8.06389,10.47267,0.949758,0.0,1330.453564,0.0,0.0,-0.0,0.0,-8.337166,-0.946163,-1330.453564,1174.227983,10
