In [1]:
import boto3 #Amazon AWS Python SDK
from botocore.config import Config #Config for SDK
from dotenv import load_dotenv # Load .ENV file containing protected information
import os # Ability to manage and access neigboring files 
import pandas as pd

In [2]:
# Load the variables defined in the .env file into the process environment
load_dotenv()
# Read the three credentials from the environment in one pass;
# any variable that is not set comes back as None
ACCESS_KEY, SECRET_KEY, POLYGON_KEY = (
    os.environ.get(name)
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "POLYGON_API_KEYS")
)

In [3]:
# Build a boto3 Session (the object that holds credentials and configuration)
# from the access key / secret key pair read from the environment
session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)

In [4]:
# Create an S3 client from the session and point it at the endpoint where the data lives.
#   service_name      - the AWS service to talk to; 's3' is the Simple Storage Service
#   endpoint_url      - base URL of the S3-compatible host to send requests to
#   signature_version - 's3v4' makes the client sign requests with AWS Signature Version 4
s3 = session.client(
    service_name='s3',
    endpoint_url='https://files.polygon.io',
    config=Config(signature_version='s3v4'),
)
# Everything above is the full setup for reaching the S3 flat files; from here on the
# client can be used for operations such as listing or downloading objects

In [5]:
# Ask the client for a paginator bound to the 'list_objects_v2' operation
paginator = s3.get_paginator(operation_name='list_objects_v2')

## 🌐 Understanding Requests and Paginators in S3 (Conceptual Overview)

### 📤 What is a Request?

A **request** is a single operation sent from your client (e.g., Python code) to a server (e.g., AWS S3 or Polygon’s S3-compatible endpoint). For example, when you ask to list files in a folder-like structure in a bucket, that is a request.

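S3’s `list_objects_v2` request returns a maximum of 1,000 objects (files) per call. If more objects match, the response contains only the first "page" and signals that more data is available by setting `IsTruncated` to true and including a `NextContinuationToken` to pass to the next request.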

---

### 🔁 What is a Paginator?

A **paginator** is a built-in tool provided by `boto3` that automatically handles repeated requests when the response is paginated. 

Instead of manually tracking continuation tokens and sending new requests, the paginator transparently performs this for you. It lets you iterate over all the data as if it were returned in one big response.

---

### 🪣 S3 Paginators Specifically

S3 paginators are used to retrieve more than 1000 files (objects) from a bucket. You create a paginator specifically for the `list_objects_v2` operation, which is the improved version of the original S3 listing API.

The paginator handles:
- Sending the first request
- Detecting if the result is truncated (chopped off)
- Sending follow-up requests with the continuation token
- Returning each full page of results one after the other

---

### 📌 Key Parameters Used with S3 Paginators

- **Bucket**: The name of the S3 bucket you are querying.
- **Prefix**: A folder-like path that limits the results to objects that begin with that string.
- **Delimiter** (optional): Used to group files as if they were in folders (commonly set to `/`).
- **PaginationConfig** (optional): Allows advanced control, like page size or starting from a specific point.

---

### ✅ Summary

- A **request** retrieves a single chunk of data from S3.
- A **paginator** automates multiple requests so you can work with large datasets easily.
- S3 paginators are essential when listing more than 1000 files in a bucket or folder-like structure.


In [6]:
from polygon import RESTClient
from polygon.rest.models import (
    TickerSnapshot,
    Agg,
) #Python libraries for polygon

# Create the Polygon REST API client, authenticated with the API key
client = RESTClient(POLYGON_KEY)

# Request the snapshot for every ticker in the "stocks" market
# (per the original author's note: daily activity for over 10000 tickers)
snapshot = client.get_snapshot_all("stocks")

In [7]:
# Turn the snapshot result into a DataFrame so it can be manipulated as a table
df = pd.DataFrame(data=snapshot)

In [8]:
# Expand the nested records in the 'day' column into their own columns,
# keeping only the two that matter here: volume and VWAP
day_df = pd.json_normalize(df['day']).loc[:, ['volume', 'vwap']]
# Place the ticker column to the left of the volume / VWAP columns
final_df = pd.concat([df['ticker'], day_df], axis='columns')

In [9]:
# Dollar volume = volume * VWAP. Both inputs are already columns of final_df, so multiply
# the columns directly instead of re-reading the raw 'day' records one row at a time.
# A missing value now yields NaN for that row rather than raising inside a lambda.
final_df['dollar_volume'] = final_df['volume'] * final_df['vwap']
# Order rows from highest to lowest dollar volume and renumber the index from 0
final_df = final_df.sort_values(by='dollar_volume', ascending=False).reset_index(drop=True)

In [10]:
# Take the tickers from the first 500 rows of final_df as a plain Python list
ticker_list = final_df['ticker'].head(500).tolist()
# Confirm how many tickers were collected
print(len(ticker_list))

500


In [11]:
# Count the missing values in each column of final_df as a sanity check
final_df.isna().sum()

ticker           0
volume           0
vwap             0
dollar_volume    0
dtype: int64

In [13]:
# Example: copy one flat file from the bucket into the current directory

# Bucket that holds the flat files, and the key of the object to fetch
bucket_name = 'flatfiles'
object_key = 'us_stocks_sip/minute_aggs_v1/2025/03/2025-03-20.csv.gz'

# The local file name is whatever follows the last '/' in the object key
local_file_name = object_key.rsplit('/', 1)[-1]

# Save the file next to the notebook, under that name
local_file_path = f'./{local_file_name}'

# Fetch the object and write it to local_file_path
s3.download_file(Bucket=bucket_name, Key=object_key, Filename=local_file_path)

In [14]:
# Read the gzip-compressed CSV saved at local_file_path (the file must already be downloaded).
# Note: this rebinds `df`, so the snapshot DataFrame built earlier is no longer reachable by that name.
df = pd.read_csv(filepath_or_buffer=local_file_path, compression='gzip')

# Show the first five rows as a quick look at the data
df.head(5)

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions
0,A,11905,121.72,122.055,122.055,121.72,1742477400000000000,36
1,A,1268,122.075,122.075,122.075,122.075,1742477460000000000,17
2,A,565,122.075,122.075,122.075,122.075,1742477520000000000,16
3,A,8163,122.075,122.07,122.08,121.44,1742477580000000000,81
4,A,4209,122.065,120.81,122.065,120.81,1742477640000000000,117


In [16]:
# Keep only the rows whose ticker appears in ticker_list, then print how many distinct
# tickers survived the filter (to compare against the length of ticker_list).

filtered_df = df.loc[df['ticker'].isin(ticker_list)]
print(filtered_df['ticker'].nunique(dropna=False))


498


In [17]:
# List the tickers that are in ticker_list but have no rows in filtered_df

# Tickers actually present after filtering vs. tickers we asked for
filtered_tickers = set(filtered_df['ticker'])
expected_tickers = set(ticker_list)

# Anything expected but not present is missing
missing = expected_tickers.difference(filtered_tickers)

print("Missing tickers:")
# Alphabetical order keeps the output stable from run to run
for ticker in sorted(missing):
    print(ticker)


Missing tickers:
CRWV
NMAX
