In [None]:
import pandas as pd
import os
import sys

# Read the raw OHLCV export into memory.
# NOTE(review): pd.read_csv's default NA parsing turns the literal text "NA" into NaN,
# so NaN tickers reported below may really be a symbol spelled "NA" - confirm against the raw file.
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
df = pd.read_csv(csv_path)

# Show the dtypes pandas inferred on load
separator = "=" * 50
print(separator)
print(df.dtypes)

# Derive a midnight-normalized datetime column from the nanosecond timestamp.
# It stays a pandas datetime (not a Python date) so .dt accessors such as .dt.year keep working.
df = df.assign(
    date=pd.to_datetime(df['window_start'], unit='ns').dt.normalize()
)

# Store tickers as a categorical when the column is present
if 'ticker' in df.columns:
    df['ticker'] = df['ticker'].astype('category')

# Give the raw timestamp column a self-describing name
df = df.rename(columns={'window_start': 'unix_nsec_timestamp'})

# Keep only the columns of interest, in a fixed order
desired_order = ['date', 'unix_nsec_timestamp', 'ticker', 'open', 'close', 'high', 'low', 'volume', 'transactions']
df = df[desired_order]

print(separator)
print(df.head())
print(df.dtypes)

# Report how many rows are missing a ticker and how that affects the unique-ticker count
tickers = df['ticker']
nan_total = tickers.isna().sum()
print(f"NaN count: {nan_total}")
print(f"NaN percentage: {nan_total / len(df) * 100:.2f}%")
print(f"Unique tickers (with NaN): {len(tickers.unique())}")
print(f"Unique tickers (without NaN): {len(tickers.dropna().unique())}")



In [4]:
# More comprehensive analysis: Distribution of data points per ticker
# This helps determine an informed threshold rather than an arbitrary 30 days

import pandas as pd
import numpy as np

# Load in OHLCV Data.
# keep_default_na=False with na_values=[''] makes only empty fields count as missing.
# With pandas' defaults the literal text "NA" is parsed as NaN, and groupby then silently
# drops every row of such a ticker (presumably the symbol "NA" - verify against the raw file).
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
df = pd.read_csv(csv_path, keep_default_na=False, na_values=[''])

df['date'] = pd.to_datetime(df['window_start'], unit='ns').dt.normalize()

# Per-ticker coverage statistics, computed with vectorized groupby aggregations instead of
# a Python loop over every ticker group. Rows whose date is NaT are not counted as a day.
by_ticker = df.groupby('ticker', observed=True)
stats_df = by_ticker['date'].agg(num_days='nunique', start_date='min', end_date='max')
stats_df['num_rows'] = by_ticker.size()

# Calendar days between first and last observation (0 for single-day tickers).
# Subtracting the two datetime columns yields timedeltas, so .dt.days works on any pandas version.
stats_df['date_span_days'] = (stats_df['end_date'] - stats_df['start_date']).dt.days

# One row per ticker, same column order as before
stats_df = stats_df.reset_index()[
    ['ticker', 'num_days', 'num_rows', 'date_span_days', 'start_date', 'end_date']
]

# Report header
banner = "=" * 70
print(banner)
print("DISTRIBUTION OF DATA POINTS PER TICKER")
print(banner)

days_per_ticker = stats_df['num_days']

# Summary statistics plus selected percentiles of the unique-day count per ticker
print("\n1. NUMBER OF UNIQUE DAYS PER TICKER:")
print(days_per_ticker.describe())
print(f"\nPercentiles:")
for p in (1, 5, 10, 25, 50, 75, 90, 95, 99):
    val = days_per_ticker.quantile(p/100)
    # Number of tickers strictly below this percentile value
    count = (days_per_ticker < val).sum()
    print(f"  {p}th percentile: {val:.0f} days ({count:,} tickers have fewer)")

# Calendar span between each ticker's first and last observation
print("\n2. TIME SPAN (CALENDAR DAYS FROM FIRST TO LAST):")
print(stats_df['date_span_days'].describe())
print(f"\nNote: A ticker with 30 days spread over 2 years is different from 30 consecutive days")

# How many tickers have exactly k days of data, for the 50 smallest k
print("\n3. TICKERS BY DATA POINT COUNT:")
print("Count of tickers by number of days:")
day_count_histogram = days_per_ticker.value_counts().sort_index()
print(day_count_histogram.head(50))

print("\n4. RECOMMENDATION ANALYSIS:")
print("=" * 70)

# Test different thresholds
thresholds = [10, 20, 30, 50, 60, 90, 120]
print("\nThreshold | Tickers Removed | % of Total | Rows Removed | % of Rows")
print("-" * 70)

total_tickers = len(stats_df)
total_rows = len(df)

for threshold in thresholds:
    # Tickers with fewer unique days than the candidate threshold
    below_threshold = stats_df['num_days'] < threshold
    tickers_to_remove = stats_df.loc[below_threshold, 'ticker'].tolist()

    # stats_df already stores each ticker's row count, so summing it gives the same number
    # as filtering the full row-level frame with isin() - without rescanning it per threshold.
    rows_removed = int(stats_df.loc[below_threshold, 'num_rows'].sum())

    ticker_pct = (len(tickers_to_remove) / total_tickers) * 100
    row_pct = (rows_removed / total_rows) * 100

    print(f"   {threshold:3d}   |     {len(tickers_to_remove):5,}      |  {ticker_pct:5.2f}%  |   {rows_removed:8,}   |  {row_pct:5.2f}%")

# Static guidance text, printed line by line
considerations = [
    "\n5. CONSIDERATIONS:",
    "   - 30 trading days ≈ 1.5 months (reasonable for basic analysis)",
    "   - 60 trading days ≈ 3 months (better for trend analysis)",
    "   - 90 trading days ≈ 4.5 months (good for seasonal patterns)",
    "   - Also consider: time span (not just count) - consecutive vs spread out",
    "   - Many of these short-series tickers are likely:",
    "     * Test tickers (ATEST.A, ATEST.B, etc.)",
    "     * Warrants/rights (.w, .WS suffixes)",
    "     * Preferred shares (.pA, .pB, etc.)",
    "     * Which you're already filtering with sanitize_non_equities()",
]
suggested_approach = [
    "\n6. SUGGESTED APPROACH:",
    "   Option A: Filter by count only (simple): 30-60 days",
    "   Option B: Filter by count AND time span (more robust):",
    "             - At least 30-60 trading days",
    "             - AND span at least 3-6 months",
    "   Option C: Let other filters handle it (non-equities, low volume)",
    "             and only filter extreme cases (< 10-20 days)",
]

for note_line in considerations + suggested_approach:
    print(note_line)

print("=" * 70)


DISTRIBUTION OF DATA POINTS PER TICKER

1. NUMBER OF UNIQUE DAYS PER TICKER:
count    23621.000000
mean       990.949663
std        858.446496
min          1.000000
25%        254.000000
50%        695.000000
75%       1663.000000
max       2468.000000
Name: num_days, dtype: float64

Percentiles:
  1th percentile: 4 days (235 tickers have fewer)
  5th percentile: 25 days (1,172 tickers have fewer)
  10th percentile: 88 days (2,344 tickers have fewer)
  25th percentile: 254 days (5,899 tickers have fewer)
  50th percentile: 695 days (11,807 tickers have fewer)
  75th percentile: 1663 days (17,711 tickers have fewer)
  90th percentile: 2468 days (20,399 tickers have fewer)
  95th percentile: 2468 days (20,399 tickers have fewer)
  99th percentile: 2468 days (20,399 tickers have fewer)

2. TIME SPAN (CALENDAR DAYS FROM FIRST TO LAST):
count    23621.000000
mean      1547.307989
std       1245.705514
min          0.000000
25%        500.000000
50%       1135.000000
75%       2636.000000
ma

In [3]:
# Check the distribution of tickers that have very little continuous trading data and determine a threshold to filter out, perhaps less than 30 days

import pandas as pd
import os
import sys

# Tickers with fewer unique trading days than this are reported as having too little data
MIN_TRADING_DAYS = 30

# Load in OHLCV Data.
# keep_default_na=False with na_values=[''] makes only empty fields count as missing.
# With pandas' defaults the literal text "NA" is parsed as NaN, and groupby then silently
# drops every row of such a ticker (presumably the symbol "NA" - verify against the raw file).
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
df = pd.read_csv(csv_path, keep_default_na=False, na_values=[''])

df['date'] = pd.to_datetime(df['window_start'], unit='ns').dt.normalize()

# Unique days per ticker in one vectorized pass. dropna=False keeps the count identical to
# len(unique()), which also counts a missing date as a value.
days_per_ticker = df.groupby('ticker', observed=True)['date'].nunique(dropna=False)

# The groupby index is sorted by ticker, so the list order matches a loop over the groups
tickers_too_little_data = days_per_ticker[days_per_ticker < MIN_TRADING_DAYS].index.tolist()

print(len(tickers_too_little_data))
print(tickers_too_little_data)

1292
['AANw', 'AAPGV', 'AAUM', 'AAw', 'ACAw', 'ACCL', 'ACE', 'ACEI', 'ACPrw', 'ACSV', 'ADAT', 'ADEAV', 'ADNTw', 'ADOCU', 'ADSw', 'ADYX', 'ADZ', 'AEBIV', 'AEC', 'AED.CL', 'AEK.CL', 'AESEW', 'AESpC.CL', 'AEXA', 'AFA.CL', 'AFCGV', 'AFGE.CL', 'AFIw', 'AFRU', 'AFW.CL', 'AGCC', 'AGMpB.CL', 'AGRZ', 'AHCOW', 'AHMA', 'AHTpA.CL', 'AHTpE.CL', 'AIFER', 'AIIA.U', 'AIMCV', 'AIVpZ.CL', 'ALH', 'ALISU', 'ALKSV', 'ALLYpB.CL', 'ALLpA.CL', 'ALLpD.CL', 'ALLpE.CL', 'ALLpF.CL', 'ALPpO.CL', 'ALTMW', 'ALVw', 'AMTMw', 'AMUN', 'AMYY', 'ANGIV', 'AOUTV', 'APADR', 'APDw', 'APRB', 'APTVw', 'APVOV', 'APYw', 'AQBTV', 'AREpE.CL', 'ARHpC.CL', 'ARIpA.CL', 'ARIpC.CL', 'ARKIU', 'ARKT', 'ARLOw', 'ARMKw', 'ARMW', 'ARNCw', 'ARRpA.CL', 'ARRpB.CL', 'ARU.CL', 'ASAIw', 'ASCI', 'ASHw', 'ASIXw', 'AST.WSw', 'ASXw', 'ATEST.A', 'ATEST.B', 'ATEST.C', 'ATMUw', 'ATUSw', 'AURE', 'AURU', 'AVKr', 'AVKrw', 'AVOL', 'AVV.CL', 'AVXX', 'AWIw', 'AXAC.U', 'AXARU', 'AXG', 'AXPW', 'AXPWW', 'AXSpD.CL', 'AXUP', 'AZYY', 'BABW', 'BACpY.CL', 'BACpZ.CL', 

In [None]:
# Analyze types in ticker column
import pandas as pd
from collections import Counter

# Load in OHLCV Data.
# NOTE(review): pd.read_csv's default NA parsing turns the literal text "NA" into NaN, so float/NaN
# entries found below may really be a symbol spelled "NA" - confirm against the raw file.
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
df = pd.read_csv(csv_path)

print("=" * 70)
print("TICKER COLUMN TYPE ANALYSIS")
print("=" * 70)

# Column dtype
print(f"\nColumn dtype: {df['ticker'].dtype}")

# Python type name of every ticker value, computed once and reused for counting and sampling
# (previously the whole column was re-scanned in pure Python once per type just to keep 5 samples)
ticker_type_names = df['ticker'].map(lambda val: type(val).__name__)

# Count Python types
type_counts = Counter(ticker_type_names)
print(f"\nPython type distribution:")
for py_type, count in type_counts.most_common():
    percentage = (count / len(df)) * 100
    print(f"  {py_type}: {count:,} rows ({percentage:.2f}%)")

# NaN analysis
nan_count = df['ticker'].isna().sum()
print(f"\nNaN values: {nan_count:,} rows ({(nan_count/len(df)*100):.2f}%)")

# Sample values by type: the first five values of each type, in file order
print(f"\nSample values by type:")
for py_type in type_counts.keys():
    sample_values = df.loc[ticker_type_names == py_type, 'ticker'].head(5).tolist()
    print(f"  {py_type}: {sample_values}")

# Check for numeric tickers (might be read as float)
numeric_tickers = df[pd.to_numeric(df['ticker'], errors='coerce').notna()]
if len(numeric_tickers) > 0:
    print(f"\n⚠️  Found {len(numeric_tickers)} rows with numeric ticker values:")
    print(f"   Sample: {numeric_tickers['ticker'].head(10).tolist()}")

# Summary
print(f"\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"Total rows: {len(df):,}")
print(f"String tickers: {type_counts.get('str', 0):,}")
print(f"Non-string tickers: {sum(count for type_name, count in type_counts.items() if type_name != 'str'):,}")
print(f"NaN tickers: {nan_count:,}")


In [None]:
# Inspect rows with NaN ticker values
import pandas as pd

# Read the raw OHLCV export.
# NOTE(review): pd.read_csv's default NA parsing turns the literal text "NA" into NaN, so the rows
# inspected here may belong to a real symbol spelled "NA" - confirm against the raw file.
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
df = pd.read_csv(csv_path)

# Midnight-normalized datetime derived from the nanosecond timestamp
df['date'] = pd.to_datetime(df['window_start'], unit='ns').dt.normalize()

# Subset of rows whose ticker is missing
nan_ticker_rows = df.loc[df['ticker'].isna()]
nan_row_total = len(nan_ticker_rows)

rule = "=" * 70
print(rule)
print("ROWS WITH NaN TICKER VALUES")
print(rule)
print(f"Total rows with NaN ticker: {nan_row_total}")
print(f"\nSample of rows with NaN ticker (first 20):")
print(nan_ticker_rows.head(20))

# First and last date on which a ticker-less row appears
nan_dates = nan_ticker_rows['date']
print(f"\n\nDate range for NaN ticker rows:")
print(f"  From: {nan_dates.min()}")
print(f"  To: {nan_dates.max()}")

# List every column that has missing values within these rows
print(f"\n\nOther columns with NaN values in these rows:")
for col in nan_ticker_rows.columns:
    nan_count = nan_ticker_rows[col].isna().sum()
    if nan_count <= 0:
        continue
    print(f"  {col}: {nan_count} NaN values ({nan_count/nan_row_total*100:.1f}%)")

print(f"\n\nVolume statistics for NaN ticker rows:")
print(nan_ticker_rows['volume'].describe())

print(f"\n\nPrice statistics for NaN ticker rows:")
price_columns = ['open', 'close', 'high', 'low']
print(nan_ticker_rows[price_columns].describe())

# How the ticker-less rows are spread over time
print(f"\n\nUnique dates with NaN tickers: {nan_dates.nunique()}")
print(f"Date distribution (top 10 dates with most NaN tickers):")
print(nan_dates.value_counts().head(10))


In [None]:
# Performance Test: String (object) vs Category dtype for ticker column
import pandas as pd
import numpy as np
import time

# Read the raw OHLCV export (ticker column stays object dtype here)
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
df_test = pd.read_csv(csv_path)

# Midnight-normalized datetime derived from the nanosecond timestamp
df_test['date'] = pd.to_datetime(df_test['window_start'], unit='ns').dt.normalize()

# Move 'date' to the front, leaving the remaining columns in their existing order
cols = ['date', *df_test.columns.drop('date')]
df_test = df_test[cols]

row_total = len(df_test)
ticker_total = df_test['ticker'].nunique()

print("=" * 70)
print("DATASET INFO")
print("=" * 70)
print(f"Total rows: {row_total:,}")
print(f"Unique tickers: {ticker_total:,}")
print(f"Average rows per ticker: {row_total / ticker_total:.1f}")

# Deep memory footprint of the ticker column as object dtype, in MiB
print(f"\nMemory usage (object dtype):")
mem_obj = df_test['ticker'].memory_usage(deep=True) / 1024**2
print(f"{mem_obj:.2f} MB")

# Independent copy of the frame whose ticker column is categorical
df_cat = df_test.copy()
df_cat['ticker'] = df_cat['ticker'].astype('category')

# Same measurement for the categorical column, plus the relative saving
print(f"\nMemory usage (category dtype):")
mem_cat = df_cat['ticker'].memory_usage(deep=True) / 1024**2
print(f"{mem_cat:.2f} MB")
print(f"Memory savings: {(1 - mem_cat / mem_obj) * 100:.1f}%")

print("\n" + "=" * 70)
print("PERFORMANCE TESTS")
print("=" * 70)


def _timed(operation):
    """Run operation() once and return (result, elapsed_seconds).

    Uses time.perf_counter(), the high-resolution clock intended for measuring short
    durations; time.time() can be coarse enough that a fast operation measures as 0.0.
    """
    start = time.perf_counter()
    result = operation()
    return result, time.perf_counter() - start


def _report_timings(time_obj, time_cat):
    """Print both timings and the object/category speedup ratio.

    A zero category timing is reported as an infinite speedup instead of raising
    ZeroDivisionError.
    """
    print(f"Object dtype: {time_obj:.4f} seconds")
    print(f"Category dtype: {time_cat:.4f} seconds")
    speedup = time_obj / time_cat if time_cat > 0 else float('inf')
    print(f"Speedup: {speedup:.2f}x")


# Test 1: groupby operations
print("\n1. GROUPBY OPERATIONS")
print("-" * 70)
result_obj, time_obj = _timed(lambda: df_test.groupby('ticker')['volume'].sum())
result_cat, time_cat = _timed(lambda: df_cat.groupby('ticker')['volume'].sum())
_report_timings(time_obj, time_cat)

# Test 2: Filtering with isin()
print("\n2. FILTERING WITH isin()")
print("-" * 70)

# Get a sample of tickers to filter
sample_tickers = df_test['ticker'].unique()[:1000].tolist()

filtered_obj, time_obj = _timed(lambda: df_test[df_test['ticker'].isin(sample_tickers)])
filtered_cat, time_cat = _timed(lambda: df_cat[df_cat['ticker'].isin(sample_tickers)])
_report_timings(time_obj, time_cat)

# Test 3: Sorting
print("\n3. SORTING")
print("-" * 70)
sorted_obj, time_obj = _timed(lambda: df_test.sort_values('ticker'))
sorted_cat, time_cat = _timed(lambda: df_cat.sort_values('ticker'))
_report_timings(time_obj, time_cat)

# Test 4: value_counts()
print("\n4. value_counts()")
print("-" * 70)
vc_obj, time_obj = _timed(lambda: df_test['ticker'].value_counts())
vc_cat, time_cat = _timed(lambda: df_cat['ticker'].value_counts())
_report_timings(time_obj, time_cat)

# Test 5: set_index with ticker
print("\n5. set_index() with ticker")
print("-" * 70)
idx_obj, time_obj = _timed(lambda: df_test.set_index(['ticker', 'date']))
idx_cat, time_cat = _timed(lambda: df_cat.set_index(['ticker', 'date']))
_report_timings(time_obj, time_cat)

# Test 6: Aggregation operations
print("\n6. MULTIPLE AGGREGATIONS")
print("-" * 70)

# The same aggregation spec is applied to both frames
multi_agg_spec = {
    'volume': ['sum', 'mean', 'max'],
    'close': ['mean', 'std']
}
agg_obj, time_obj = _timed(lambda: df_test.groupby('ticker').agg(multi_agg_spec))
agg_cat, time_cat = _timed(lambda: df_cat.groupby('ticker').agg(multi_agg_spec))
_report_timings(time_obj, time_cat)

In [None]:
# Analyzing how much of the data (i.e. how many ticker series) traded at MOST 100 shares on any single day across their ENTIRE series
# and never reached above $0.01 across their ENTIRE series.

# Goal is to potentially remove some untradeable noise, but first to see how much of that noise we would be removing

import re
from collections import defaultdict
import pandas as pd
import numpy as np

# Filter criteria, applied to each ticker's all-time maximum volume and maximum close.
# NOTE(review): the cell description mentions 100 shares and "and", while the filter uses
# 1000 and "or" - values kept as they were; confirm which is intended.
MAX_VOLUME_THRESHOLD = 1000
MIN_PRICE_THRESHOLD = 0.01

# Load in OHLCV Data.
# keep_default_na=False with na_values=[''] makes only empty fields count as missing.
# With pandas' defaults the literal text "NA" is parsed as NaN, and groupby then silently
# drops every row of such a ticker (presumably the symbol "NA" - verify against the raw file).
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
OHLCV_data = pd.read_csv(csv_path, keep_default_na=False, na_values=[''])

# Convert to pandas datetime and normalize to date (removes time component)
# Keeping as pandas datetime (not Python date) for pandas operations like .dt.year
OHLCV_data['date'] = pd.to_datetime(OHLCV_data['window_start'], unit='ns').dt.normalize()

# Reorder columns to put 'date' first. The result must be assigned back:
# a bare OHLCV_data[cols] expression is evaluated and then discarded.
cols = ['date'] + [col for col in OHLCV_data.columns if col != 'date']
OHLCV_data = OHLCV_data[cols]

# Compute max volume and max close per ticker in one pass
ticker_stats = OHLCV_data.groupby('ticker').agg({
    'volume': 'max',
    'close': 'max'
}).reset_index()

# See what the distribution of price and volume is across our series
print(ticker_stats['volume'].min())
print(ticker_stats['volume'].quantile([0.01, 0.05, 0.10, 0.25, 0.50]))
print(ticker_stats['close'].min())

# Collect tickers whose peak volume OR peak close falls below the respective threshold
invalid_tickers = ticker_stats[
    (ticker_stats['volume'] < MAX_VOLUME_THRESHOLD) |
    (ticker_stats['close'] < MIN_PRICE_THRESHOLD)
]['ticker'].tolist()

print(invalid_tickers)
print(len(invalid_tickers))

In [None]:
# Look at all of the suffixes we have for the unique tickers
import re
from collections import defaultdict
import pandas as pd
import numpy as np

# Load in OHLCV Data.
# keep_default_na=False with na_values=[''] makes only empty fields count as missing.
# With pandas' defaults the literal text "NA" is parsed as NaN, and the dropna() below would
# then silently discard that ticker (presumably the symbol "NA" - verify against the raw file).
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
OHLCV_data = pd.read_csv(csv_path, keep_default_na=False, na_values=[''])

# Convert to pandas datetime and normalize to date (removes time component)
# Keeping as pandas datetime (not Python date) for pandas operations like .dt.year
OHLCV_data['date'] = pd.to_datetime(OHLCV_data['window_start'], unit='ns').dt.normalize()

# Reorder columns to put 'date' first. The result must be assigned back:
# a bare OHLCV_data[cols] expression is evaluated and then discarded.
cols = ['date'] + [col for col in OHLCV_data.columns if col != 'date']
OHLCV_data = OHLCV_data[cols]

# Get unique tickers, drop genuinely missing ones, and convert each to a plain string
unique_tickers = OHLCV_data['ticker'].dropna().unique()
unique_tickers = [str(ticker) for ticker in unique_tickers]

# Function to extract suffix from a ticker
def extract_suffix(ticker):
    """
    Split a ticker into a (base, suffix) pair.

    Rules, checked in order:
      1. Any ticker containing "test" (case-insensitive) -> (ticker, 'TEST').
      2. Exactly ZVZZT or ZWZZT -> (ticker, None).
      3. Otherwise the ticker is cut at its first lowercase letter or period:
         everything before is the base, the rest is the suffix. This only applies
         when the base is non-empty.
      4. Anything else -> (ticker, None), meaning no suffix was detected.
    """
    # Rule 1: test tickers are tagged with the synthetic 'TEST' suffix
    if re.search(r'(?i)test', ticker) is not None:
        return ticker, 'TEST'

    # Rule 2: ZVZZT/ZWZZT are reported without a suffix.
    # NOTE(review): these look like test symbols too but are not tagged 'TEST' - confirm intent.
    if re.match(r'^(ZVZZT|ZWZZT)$', ticker) is not None:
        return ticker, None

    # Rule 3: non-equities carry a lowercase/period suffix after the base symbol
    split = re.match(r'^([^a-z.]*)([a-z.].*)$', ticker)
    if split is None:
        return ticker, None

    head, tail = split.group(1), split.group(2)
    if head and tail:
        return head, tail

    # Rule 4: a ticker that starts with the suffix character has no usable base
    return ticker, None

# Bucket every ticker under its detected suffix; tickers without one share the 'no suffix' bucket
suffix_groups = defaultdict(list)
for ticker in unique_tickers:
    suffix = extract_suffix(ticker)[1]
    suffix_groups[suffix if suffix else 'no suffix'].append(ticker)

# Length of the largest bucket (0 when there are no buckets at all)
max_len = max(map(len, suffix_groups.values()), default=0)

# DataFrame columns must be equally long, so right-pad every bucket with NaN
for suffix in list(suffix_groups):
    members = suffix_groups[suffix]
    suffix_groups[suffix] = members + [np.nan] * (max_len - len(members))

# One column per suffix, columns in sorted suffix order
suffix_df = pd.DataFrame({key: suffix_groups[key] for key in sorted(suffix_groups)})

# take a look at the dataframe
print(suffix_df)

# Map each suffix (column name) to the number of tickers under it.
# count() ignores the NaN padding. The dict is only read once below but is handy to keep around.
suffix_dict = {}
for column_name, column_data in suffix_df.items():
    tickers_in_column = column_data.count()
    suffix_dict[column_name] = tickers_in_column
    print(f"{column_name} : {tickers_in_column}")

# number of unique tickers with no suffix
count_no_suffix = suffix_dict['no suffix']
# number of unique tickers for which the regex found a suffix
count_suffix = len(unique_tickers) - count_no_suffix

# Share of all unique tickers that carry a suffix
# (these are the ones that will be removed)
print(count_suffix / len(unique_tickers))

In [None]:
# Analyzing duplicates
import re
from collections import defaultdict
import pandas as pd
import numpy as np

# Load in OHLCV Data.
# keep_default_na=False with na_values=[''] makes only empty fields count as missing.
# With pandas' defaults the literal text "NA" is parsed as NaN, which would leave that
# ticker's rows without a symbol (presumably the symbol "NA" - verify against the raw file).
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
OHLCV_data = pd.read_csv(csv_path, keep_default_na=False, na_values=[''])

# Convert to pandas datetime and normalize to date (removes time component)
# Keeping as pandas datetime (not Python date) for pandas operations like .dt.year
OHLCV_data['date'] = pd.to_datetime(OHLCV_data['window_start'], unit='ns').dt.normalize()

# Reorder columns to put 'date' first. The result must be assigned back:
# a bare OHLCV_data[cols] expression is evaluated and then discarded.
cols = ['date'] + [col for col in OHLCV_data.columns if col != 'date']
OHLCV_data = OHLCV_data[cols]

# Find the duplicate rows: every row whose (ticker, date) pair occurs more than once.
# keep=False marks all members of a duplicate group, not just the repeats.
duplicate_rows = OHLCV_data[OHLCV_data.duplicated(subset=['ticker', 'date'], keep=False)]
print(type(duplicate_rows))

# Keep the identifying columns plus close/open/volume for comparison, sorted by ticker
dupes = duplicate_rows[['ticker', 'date', 'close','open', 'volume']].sort_values(by='ticker')

print(dupes)
print(len(dupes))

In [1]:
# Calculate API calls needed for comprehensive data collection
import pandas as pd
import numpy as np
from math import ceil

from cleaning import run_cleaning

def _split_into_series(dates, gap_threshold_days):
    """Split a sorted, non-empty DatetimeIndex into (start, end) pairs at gaps above the threshold."""
    if gap_threshold_days is None or len(dates) < 2:
        return [(dates[0], dates[-1])]

    # Whole-day distance between each pair of neighbouring dates
    gap_days = np.asarray((dates[1:] - dates[:-1]).days)
    # breaks[k] is the position of the last date before an over-threshold gap
    breaks = np.flatnonzero(gap_days > gap_threshold_days)
    starts = np.concatenate(([0], breaks + 1))
    ends = np.concatenate((breaks, [len(dates) - 1]))
    return [(dates[s], dates[e]) for s, e in zip(starts, ends)]


def _count_days(series_start, series_end, use_trading_days):
    """Count the days from series_start to series_end inclusive (Mon-Fri only if use_trading_days)."""
    if use_trading_days:
        # bdate_range skips weekends but knows nothing about market holidays
        return len(pd.bdate_range(start=series_start, end=series_end, inclusive='both'))
    return (series_end - series_start).days + 1


def calculate_api_calls_needed(
    df: pd.DataFrame,
    frequency_days: int = 1,
    use_trading_days: bool = True,
    gap_threshold_days: int = None,
    ticker_col: str = 'ticker',  # Parameter: defaults to 'ticker' - the name of your ticker column
    date_col: str = 'date'        # Parameter: defaults to 'date' - the name of your date column
) -> pd.DataFrame:
    """
    Calculate the number of API calls needed to collect data for each ticker's continuous series.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing at least the ticker and date columns. It is not modified.
    frequency_days : int, default=1
        How often the API is called (1=daily, 7=weekly, 30=monthly, etc.). Must be positive.
    use_trading_days : bool, default=True
        If True, counts business days (weekends excluded, holidays NOT excluded).
        If False, counts calendar days.
    gap_threshold_days : int, optional
        If provided, a ticker's dates are split into separate series wherever two consecutive
        dates are more than this many days apart (handles ticker reuse).
        If None, each ticker is one series from its first to its last date.
    ticker_col : str, default='ticker'
        Name of the column containing ticker symbols.
    date_col : str, default='date'
        Name of the column containing dates; converted with pd.to_datetime if not already datetime.

    Returns:
    --------
    pd.DataFrame with one row per continuous series and columns:
        - ticker: Ticker symbol
        - series_start: Start date of the continuous series
        - series_end: End date of the continuous series
        - days_in_range: Number of days (trading or calendar) in the range
        - api_calls_needed: ceil(days_in_range / frequency_days), never less than 1
    An input without usable rows yields an empty frame that still has these columns.

    Raises:
    -------
    ValueError
        If frequency_days is not positive.
    """
    if frequency_days <= 0:
        raise ValueError(f"frequency_days must be positive, got {frequency_days!r}")

    result_columns = ['ticker', 'series_start', 'series_end', 'days_in_range', 'api_calls_needed']

    # Only the two relevant columns are needed; this avoids copying or sorting the whole frame
    pairs = df[[ticker_col, date_col]]
    if not pd.api.types.is_datetime64_any_dtype(pairs[date_col]):
        pairs = pairs.assign(**{date_col: pd.to_datetime(pairs[date_col])})
    # A missing date cannot start or end a series
    pairs = pairs.dropna(subset=[date_col])

    results = []

    # groupby returns the tickers in sorted order
    for ticker, ticker_dates in pairs.groupby(ticker_col, observed=True)[date_col]:
        # Series.unique() returns a NumPy array or a DatetimeArray depending on the pandas version;
        # wrapping it in a DatetimeIndex guarantees Timestamp elements (and thus .days on differences)
        dates = pd.DatetimeIndex(ticker_dates.unique()).sort_values()

        if len(dates) == 1:
            # A ticker seen on a single date always counts as one day, even on a weekend
            spans = [(dates[0], dates[0], 1)]
        else:
            spans = [
                (start, end, _count_days(start, end, use_trading_days))
                for start, end in _split_into_series(dates, gap_threshold_days)
            ]

        for series_start, series_end, days_in_range in spans:
            results.append({
                'ticker': ticker,
                'series_start': series_start,
                'series_end': series_end,
                'days_in_range': days_in_range,
                # Always at least one call, rounded up
                'api_calls_needed': max(1, ceil(days_in_range / frequency_days)),
            })

    result_df = pd.DataFrame(results, columns=result_columns)

    # Make sure the date columns are datetime typed when there is data
    if len(result_df) > 0:
        result_df['series_start'] = pd.to_datetime(result_df['series_start'])
        result_df['series_end'] = pd.to_datetime(result_df['series_end'])

    return result_df


def summarize_api_calls(api_calls_df: pd.DataFrame) -> None:
    """
    Print a human-readable report of the API calls required.

    Shows overall totals and averages, the distribution of calls per series, the ten
    tickers with the most calls, and the ten individual series with the most calls.
    Prints a short notice and returns when the frame has no rows.

    Parameters:
    -----------
    api_calls_df : pd.DataFrame
        DataFrame returned from calculate_api_calls_needed()
    """
    total_series = len(api_calls_df)
    if not total_series:
        print("No data to summarize.")
        return

    calls_per_series = api_calls_df['api_calls_needed']
    total_calls = calls_per_series.sum()
    unique_tickers = api_calls_df['ticker'].nunique()
    rule = "=" * 70

    # Headline totals and averages
    print(rule)
    print("API CALLS SUMMARY")
    print(rule)
    print(f"Total API calls needed: {total_calls:,}")
    print(f"Unique tickers: {unique_tickers:,}")
    print(f"Total continuous series: {total_series:,}")
    print(f"Average calls per ticker: {total_calls / unique_tickers:.2f}")
    print(f"Average calls per series: {total_calls / total_series:.2f}")
    print()

    print("Distribution of API calls per series:")
    print(calls_per_series.describe())
    print()

    # A ticker can own several series, so its calls are summed before ranking
    print("Top 10 tickers by API calls needed:")
    calls_by_ticker = api_calls_df.groupby('ticker')['api_calls_needed'].sum()
    top_tickers = calls_by_ticker.sort_values(ascending=False).head(10)
    for ticker, calls in top_tickers.items():
        print(f"  {ticker}: {calls:,} calls")
    print()

    print("Series with most API calls needed:")
    report_columns = ['ticker', 'series_start', 'series_end', 'days_in_range', 'api_calls_needed']
    top_series = api_calls_df.nlargest(10, 'api_calls_needed')[report_columns]
    print(top_series.to_string(index=False))
    print(rule)



# Load the raw export and pass it through the project's cleaning step
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
df = run_cleaning(pd.read_csv(csv_path))


# Three polling cadences. Every scenario counts business days and treats each ticker as a
# single first-to-last series (gap_threshold_days=None disables gap splitting).
# NOTE(review): the result names say daily/weekly/monthly, but the cadences actually passed
# are 7, 14 and 30 days - confirm which was intended.
shared_options = dict(use_trading_days=True, gap_threshold_days=None)

# 1. One call every 7 days, no gap detection
api_calls_daily = calculate_api_calls_needed(df, frequency_days=7, **shared_options)
summarize_api_calls(api_calls_daily)

# 2. One call every 14 days, no gap detection
api_calls_weekly = calculate_api_calls_needed(df, frequency_days=14, **shared_options)
summarize_api_calls(api_calls_weekly)

# 3. One call every 30 days, no gap detection
api_calls_monthly = calculate_api_calls_needed(df, frequency_days=30, **shared_options)
summarize_api_calls(api_calls_monthly)


Dataframe before cleaning:
Count of Unique Tickers: 23622
Number of Rows: 23408074


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df.loc[:, 'window_start'], unit='ns').dt.normalize()


Dataframe after cleaning:
Count of Unique Tickers: 20827
Number of Rows: 21967114
API CALLS SUMMARY
Total API calls needed: 3,488,466
Unique tickers: 20,827
Total continuous series: 20,827
Average calls per ticker: 167.50
Average calls per series: 167.50

Distribution of API calls per series:
count    20827.000000
mean       167.497287
std        128.056701
min          1.000000
25%         58.000000
50%        128.000000
75%        294.000000
max        366.000000
Name: api_calls_needed, dtype: float64

Top 10 tickers by API calls needed:
  AADR: 366 calls
  AAL: 366 calls
  A: 366 calls
  AA: 366 calls
  AAP: 366 calls
  AAON: 366 calls
  AAOI: 366 calls
  AAME: 366 calls
  GDV: 366 calls
  GDO: 366 calls

Series with most API calls needed:
ticker series_start series_end  days_in_range  api_calls_needed
     A   2016-01-04 2025-10-24           2560               366
    AA   2016-01-04 2025-10-24           2560               366
  AADR   2016-01-04 2025-10-24           2560          