In [1]:
import boto3
from botocore.config import Config
import os
from dotenv import load_dotenv
from datetime import datetime

# Download daily trade flat files from the Polygon S3-compatible endpoint.
#
# Credentials are read from `env.env` (keys `accesskey` / `secretkey`). If either
# is missing, os.getenv returns None and boto3 falls back to its default
# credential lookup, so a misconfigured env file shows up later as an auth error.
load_dotenv("env.env")
session = boto3.Session(
  aws_access_key_id=os.getenv('accesskey'),
  aws_secret_access_key=os.getenv('secretkey'),
)
s3 = session.client(
  's3',
  endpoint_url='https://files.polygon.io',
  config=Config(signature_version='s3v4'),
)
bucket_name = 'flatfiles'
paginator = s3.get_paginator('list_objects_v2')
prefix = 'us_stocks_sip'
min_date = datetime(2019, 2, 4)  # only files dated on/after this day are downloaded
download_dir = './datafiles'

# download_file does not create parent directories, so make sure the target exists.
os.makedirs(download_dir, exist_ok=True)

for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    # A page without matching objects has no 'Contents' key at all.
    for obj in page.get('Contents', []):
        key = obj['Key']
        if '/trades_v1/' not in key:
            continue
        parts = key.split('/')
        if len(parts) < 4 or not parts[-1].endswith('.csv.gz'):
            continue

        # The file name starts with the trading date, e.g. '2019-02-04.csv.gz'.
        date_str = parts[-1].split('.')[0]
        try:
            file_date = datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            # Not a dated daily file; skip it.
            continue
        if file_date < min_date:
            continue

        local_file_name = parts[-1]
        local_file_path = download_dir + '/' + local_file_name
        try:
            s3.download_file(bucket_name, key, local_file_path)
        except Exception as exc:
            # Keep the run best-effort, but report the failure instead of hiding it.
            print(f"Failed to download {key}: {exc}")
            continue
        print(key)

us_stocks_sip/trades_v1/2019/02/2019-02-04.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-05.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-06.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-07.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-08.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-11.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-12.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-13.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-14.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-15.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-19.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-20.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-21.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-22.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-25.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-26.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-27.csv.gz
us_stocks_sip/trades_v1/2019/02/2019-02-28.csv.gz
us_stocks_sip/trades_v1/2019/03/2019-03-01.csv.gz
us_stocks_sip/trades_v1/2019/03/2019-03-04.csv.gz


In [None]:
import os
import gzip
import pandas as pd

# Settings
columns_to_keep = ['ticker', 'exchange', 'participant_timestamp', 'price', 'size']
valid_exchanges = {2, 12, 17, 202, 203}
output_file = './datafiles/output.csv'

# Remove output file if it exists, so re-running the cell does not append to stale data
if os.path.exists(output_file):
    os.remove(output_file)

# Sort files chronologically (alphabetically works for YYYY-MM-DD.csv.gz)
data_dir = './datafiles'
file_list = sorted(
    os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv.gz')
)

# State for del_t and del_p, carried across files
last_seen = {}  # {ticker: (last_timestamp, last_price)}
header_written = False


def compute_deltas(trades, state):
    """Compute per-ticker differences to the previous trade of the same ticker.

    Parameters
    ----------
    trades : pandas.DataFrame
        Must contain 'ticker', 'participant_timestamp' and 'price' columns;
        rows are processed in their current order.
    state : dict
        Maps ticker -> (last_timestamp, last_price). Updated in place so the
        caller can carry it over to the next file.

    Returns
    -------
    (list, list)
        del_t and del_p values, one per row. The first time a ticker is seen
        (no entry in `state`) both are 0.
    """
    del_t_list = []
    del_p_list = []
    # Iterating zipped columns yields the same scalars as iterrows() without
    # building a Series for every row, which dominates runtime on trade data.
    for ticker, timestamp, price in zip(
        trades['ticker'], trades['participant_timestamp'], trades['price']
    ):
        if ticker in state:
            last_time, last_price = state[ticker]
            del_t = timestamp - last_time
            del_p = price - last_price
        else:
            del_t = 0
            del_p = 0
        state[ticker] = (timestamp, price)
        del_t_list.append(del_t)
        del_p_list.append(del_p)
    return del_t_list, del_p_list


for file in file_list:
    print(file)
    with gzip.open(file, 'rt') as f:
        df = pd.read_csv(f, usecols=columns_to_keep)
    # Filter valid exchanges and deduplicate
    df = df[df['exchange'].isin(valid_exchanges)].drop_duplicates()
    # Sort by participant_timestamp. A stable sort keeps the file's original order
    # for trades sharing a timestamp, so del_t/del_p are reproducible across runs.
    df = df.sort_values('participant_timestamp', kind='mergesort').reset_index(drop=True)
    # Compute del_t and del_p
    del_t_list, del_p_list = compute_deltas(df, last_seen)
    df['del_t'] = del_t_list
    df['del_p'] = del_p_list
    # Write to output file (header only once, then append)
    print("Writing started")
    df.to_csv(output_file, mode='a', header=not header_written, index=False)
    header_written = True

./datafiles/2015-08-10.csv.gz
Writing started
./datafiles/2015-08-11.csv.gz
Writing started
./datafiles/2015-08-12.csv.gz
Writing started
./datafiles/2015-08-13.csv.gz
Writing started
./datafiles/2015-08-14.csv.gz
Writing started
./datafiles/2015-08-17.csv.gz
Writing started
./datafiles/2015-08-18.csv.gz
Writing started
./datafiles/2015-08-19.csv.gz
Writing started
./datafiles/2015-08-20.csv.gz
Writing started
./datafiles/2015-08-21.csv.gz
Writing started
./datafiles/2015-08-24.csv.gz
