In [1]:
import boto3 #Amazon AWS Python SDK
from botocore.config import Config #Config for SDK
from dotenv import load_dotenv # Load .ENV file containing protected information
import os # Ability to manage and access neigboring files 
import pandas as pd
import gzip
import io
from collections import defaultdict
from datetime import time
import pytz
from datetime import time
from tqdm import tqdm

In [2]:
# Make the environment variables available to python from the .env file
load_dotenv()
# Load the environment variables into python variables
ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [3]:
# Initialize a session using the AWS keys
session = boto3.Session( # Session object used to configure users and environment control
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

In [4]:
# Create a client with session and speficy the endpoint (where the data is located)
s3 = session.client(
    's3', # Connecting to the S3 (Simple Storage Service) specifically (can connect to any aws service here)
    endpoint_url='https://files.polygon.io', # Base url for the service you want to access
    config=Config(signature_version='s3v4'), # Ensures client is using AWS signature Version 4 protocol by prohibiting api requests unless supplied with
                                             # a secret key. Used for hashsing
)
# The previous code is everything needed to accesss the S3 flatfiles, from here you can use commands like list objects or get objects

In [5]:
# Initialize a paginator for listing objects
paginator = s3.get_paginator('list_objects_v2')

In [6]:
# Choose the appropriate prefix depending on the data you need:
# - 'global_crypto' for global cryptocurrency data
# - 'global_forex' for global forex data
# - 'us_indices' for US indices data
# - 'us_options_opra' for US options (OPRA) data
# - 'us_stocks_sip' for US stocks (SIP) data
prefix = 'us_stocks_sip/daily_aggs_v1/2022'  # Example: Change this prefix to match your data need
object_keys = []

# List objects using the selected prefix
for page in paginator.paginate(Bucket='flatfiles', Prefix=prefix):
    for obj in page['Contents']:
        print(obj['Key'])
        object_keys.append(obj['Key'])

prefix = 'us_stocks_sip/daily_aggs_v1/2023'

# List objects using the selected prefix
for page in paginator.paginate(Bucket='flatfiles', Prefix=prefix):
    for obj in page['Contents']:
        object_keys.append(obj['Key'])

prefix = 'us_stocks_sip/daily_aggs_v1/2024'

# List objects using the selected prefix
for page in paginator.paginate(Bucket='flatfiles', Prefix=prefix):
    for obj in page['Contents']:
        object_keys.append(obj['Key'])

us_stocks_sip/minute_aggs_v1/2022/01/2022-01-03.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-04.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-05.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-06.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-07.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-10.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-11.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-12.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-13.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-14.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-18.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-19.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-20.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-21.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-24.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-25.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-26.csv.gz
us_stocks_sip/minute_aggs_v1/2022/01/2022-01-27.csv.gz
us_stocks_

In [7]:
# Market hours in Eastern Time
market_open = time(9, 30)
market_close = time(16, 0)

# Convert timezone?
USE_TIMEZONE_CONVERSION = True  # Set to False if timestamps are already in ET

# Tracks ticker -> list of daily in-market sample counts
ticker_sample_counts = defaultdict(list)

for object_key in tqdm(object_keys):
    # Read the file from S3 into memory
    response = s3.get_object(Bucket='flatfiles', Key=object_key)
    with gzip.GzipFile(fileobj=io.BytesIO(response['Body'].read())) as f:
        df = pd.read_csv(f)

    # Convert timestamp column (from nanoseconds)
    df['timestamp'] = pd.to_datetime(df['window_start'], unit='ns')

    if USE_TIMEZONE_CONVERSION:
        # Assume timestamps are in UTC and convert to US/Eastern
        df['timestamp'] = df['timestamp'].dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

    # Filter only rows during market hours
    df = df[df['timestamp'].dt.time.between(market_open, market_close)]

    # Count in-market samples per ticker
    counts = df.groupby('ticker').size()

    # Store daily count for each ticker
    for ticker, count in counts.items():
        ticker_sample_counts[ticker].append(count)

# Filter: only keep tickers that have the same number of in-market samples each day
valid_tickers = [ticker for ticker, counts in ticker_sample_counts.items() if len(set(counts)) == 1]

print("Valid tickers:", valid_tickers)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 753/753 [56:50<00:00,  4.53s/it]

Valid tickers: ['CIT', 'CITpB', 'FB', 'MGLN', 'VIAC', 'YSACW', 'ZXIET', 'FXCOU', 'EVSTC', 'SHELw', 'NTEST.L', 'EVGBC', 'EVLMC', 'ALACU', 'BBUCw', 'ESABw', 'CBX', 'GLUrw', 'NTEST.Y', 'NTEST.Z', 'ZVZZC', 'RODI', 'MIMO.WS.C', 'MMM.WD', 'EMPW', 'ADEAV', 'RSFrw', 'ZCZZT', 'ARMKw', 'Kw', 'WPCw', 'WORw', 'CDLRw', 'ROIS', 'XLYO', 'INXBV', 'INXB', 'ILMNV', 'AFCGV', 'SUNSV', 'HHHw', 'LIAE', 'AVKrw', 'LIAG', 'Jw', 'EFFI', 'JHCR', 'LIAF', 'LIAV', 'WXET', 'BRIF', 'EFFE', 'ONEG']





In [8]:
len(valid_tickers)

53