In [1]:
import polars as pl
import os

QUOTES_DIR = "/home/amazon/Documents/TAQData/2024_03_15/processed_output_quotes/"

# 1. Grab the first file in the directory to use as a template
try:
    first_file = next(f for f in os.listdir(QUOTES_DIR) if f.endswith(('.parquet', '.csv', '.ipc')))
    file_path = os.path.join(QUOTES_DIR, first_file)

    # 2. Use scan_ functions for 'lazy' access (doesn't load the data)
    if file_path.endswith('.parquet'):
        lf = pl.scan_parquet(file_path)
    elif file_path.endswith('.csv'):
        lf = pl.scan_csv(file_path)
    else:
        lf = pl.scan_ipc(file_path)

    # 3. Print the schema
    print(f"Schema for: {first_file}")
    print(lf.schema)

except StopIteration:
    print("No compatible files found in the directory.")

Schema for: chunk_5002.parquet
Schema({'DATE': Date, 'TIME_M': Time, 'EX': String, 'BID': Float64, 'BIDSIZ': Int64, 'ASK': Float64, 'ASKSIZ': Int64, 'QU_COND': String, 'QU_SEQNUM': Int64, 'NATBBO_IND': String, 'QU_CANCEL': String, 'QU_SOURCE': String, 'SYM_ROOT': String, 'SYM_SUFFIX': String})


  print(lf.schema)


In [2]:
import polars as pl
import os

paths = {
    "Quotes Top 50": "/home/amazon/Documents/TAQData/2024_03_15/processed_output_quotes_top50/",
    "Trades Top 50": "/home/amazon/Documents/TAQData/2024_03_15/processed_output_trades_upper_top50/"
}

for name, directory in paths.items():
    try:
        # Get the first file in the directory
        files = [f for f in os.listdir(directory) if f.endswith(('.parquet', '.csv', '.ipc'))]
        if not files:
            print(f"--- {name}: No files found ---")
            continue
            
        sample_path = os.path.join(directory, files[0])
        
        # Use scan_parquet for immediate schema access
        lf = pl.scan_parquet(sample_path)
        
        print(f"--- Schema for {name} ---")
        print(lf.schema)
        print("\n")
        
    except Exception as e:
        print(f"Error reading {name}: {e}")

--- Schema for Quotes Top 50 ---
Schema({'DATE': Date, 'TIME_M': Time, 'EX': String, 'BID': Float64, 'BIDSIZ': Int64, 'ASK': Float64, 'ASKSIZ': Int64, 'QU_COND': String, 'QU_SEQNUM': Int64, 'NATBBO_IND': String, 'QU_CANCEL': String, 'QU_SOURCE': String, 'SYM_ROOT': String, 'SYM_SUFFIX': String})


--- Schema for Trades Top 50 ---
Schema({'DATE': Date, 'TIME_M': Time, 'EX': String, 'SYM_ROOT': String, 'SYM_SUFFIX': String, 'TR_SCOND': String, 'SIZE': Int64, 'PRICE': Float64, 'TR_STOP_IND': String, 'TR_CORR': String, 'TR_SEQNUM': Int64, 'TR_ID': Int64, 'TR_SOURCE': String, 'TR_RF': String})




  print(lf.schema)


In [3]:
import polars as pl

# Update this to the specific directory you want to check
PATH = "/home/amazon/Documents/TAQData/2024_03_15/processed_output_quotes_top50/*.parquet"

# 1. Scan the directory (glob pattern handles all files in the folder)
# 2. Select only the 'EX' column
# 3. Call unique() and collect() to get the result
distinct_exchanges = (
    pl.scan_parquet(PATH)
    .select("EX")
    .unique()
    .collect()
)

print("Distinct Exchange Codes Found:")
print(distinct_exchanges)

Distinct Exchange Codes Found:
shape: (17, 1)
┌─────┐
│ EX  │
│ --- │
│ str │
╞═════╡
│ N   │
│ K   │
│ C   │
│ Y   │
│ H   │
│ …   │
│ B   │
│ M   │
│ V   │
│ Q   │
│ A   │
└─────┘


In [4]:
distinct_exchanges

EX
str
"""N"""
"""K"""
"""C"""
"""Y"""
"""H"""
…
"""B"""
"""M"""
"""V"""
"""Q"""
