In [None]:
import pandas as pd
import os
import sys

# Load the historical OHLCV data.
# NOTE(review): absolute, machine-specific path — consider reading it from a
# configurable data directory so the notebook runs on other machines.
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
OHLCV_data = pd.read_csv(csv_path)

# Interpret 'window_start' as nanoseconds since the epoch, convert to pandas
# datetimes and normalize to midnight (drops the time-of-day component).
# Kept as pandas datetime (not Python date) so .dt accessors like .dt.year work.
OHLCV_data['date'] = pd.to_datetime(OHLCV_data['window_start'], unit='ns').dt.normalize()

# Reorder columns to put 'date' first. Selecting columns returns a new frame,
# so the result must be assigned back for the new order to persist.
cols = ['date'] + [col for col in OHLCV_data.columns if col != 'date']
OHLCV_data = OHLCV_data[cols]

# Show a small preview as the cell output instead of the whole frame.
OHLCV_data.head()


In [None]:
# Analyzing how much of the data (i.e. how many ticker series) traded at MOST 100 shares on any single day across their ENTIRE series
# and never closed above $0.01 across their ENTIRE series.

# Goal is to potentially remove some untradeable noise, but first to see how much of that noise we would be removing

import re
from collections import defaultdict
import pandas as pd
import numpy as np

# Load the historical OHLCV data.
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
OHLCV_data = pd.read_csv(csv_path)

# Interpret 'window_start' as nanoseconds since the epoch, convert to pandas
# datetimes and normalize to midnight (drops the time-of-day component).
# Kept as pandas datetime (not Python date) so .dt accessors like .dt.year work.
OHLCV_data['date'] = pd.to_datetime(OHLCV_data['window_start'], unit='ns').dt.normalize()

# Reorder columns to put 'date' first. Selecting columns returns a new frame,
# so the result must be assigned back for the new order to persist.
cols = ['date'] + [col for col in OHLCV_data.columns if col != 'date']
OHLCV_data = OHLCV_data[cols]

# Not used by the statistics below; kept so the name stays defined after this cell.
unique_tickers = OHLCV_data['ticker'].unique()

# Cutoffs applied to each ticker's all-time maximum volume and maximum close.
# NOTE(review): the description of this analysis talks about 100 shares and
# joins the two criteria with "and", while the filter has always used 1000
# shares and OR. The existing behavior is kept here — confirm which is intended.
MIN_PEAK_VOLUME = 1000
MIN_PEAK_CLOSE = 0.01

# Per-ticker maximum volume and maximum close over the whole series, in one pass.
ticker_stats = OHLCV_data.groupby('ticker').agg({
    'volume': 'max',
    'close': 'max'
}).reset_index()

# Distribution of the per-ticker peaks: smallest peak volume, lower quantiles
# of peak volume, and smallest peak close.
print(ticker_stats['volume'].min())
print(ticker_stats['volume'].quantile([0.01, 0.05, 0.10, 0.25, 0.50]))
print(ticker_stats['close'].min())

# Tickers whose peak volume OR peak close falls below its cutoff.
invalid_tickers = ticker_stats[
    (ticker_stats['volume'] < MIN_PEAK_VOLUME) |
    (ticker_stats['close'] < MIN_PEAK_CLOSE)
]['ticker'].tolist()

print(invalid_tickers)
print(len(invalid_tickers))

In [None]:
# Look at all of the suffixes we have for the unique tickers
import re
from collections import defaultdict
import pandas as pd
import numpy as np

# Load the historical OHLCV data.
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
OHLCV_data = pd.read_csv(csv_path)

# Interpret 'window_start' as nanoseconds since the epoch, convert to pandas
# datetimes and normalize to midnight (drops the time-of-day component).
# Kept as pandas datetime (not Python date) so .dt accessors like .dt.year work.
OHLCV_data['date'] = pd.to_datetime(OHLCV_data['window_start'], unit='ns').dt.normalize()

# Reorder columns to put 'date' first. Selecting columns returns a new frame,
# so the result must be assigned back for the new order to persist.
cols = ['date'] + [col for col in OHLCV_data.columns if col != 'date']
OHLCV_data = OHLCV_data[cols]

# Unique tickers with missing values dropped, converted to plain strings so the
# regex-based suffix extraction can be applied to every entry.
unique_tickers = [str(ticker) for ticker in OHLCV_data['ticker'].dropna().unique()]

# Function to extract suffix from a ticker
def extract_suffix(ticker):
    """
    Split a ticker into (base, suffix).

    Rules, checked in order:
      1. A ticker containing "test" (case-insensitive) is returned whole with
         the suffix 'TEST'.
      2. The test symbols ZVZZT and ZWZZT are returned whole with the suffix
         'TEST' as well.
      3. Otherwise the ticker is split at its first lowercase letter or period:
         everything before it is the base, that character and everything after
         it is the suffix. The split only counts when the base is non-empty.
      4. If nothing applies, returns (ticker, None).

    Args:
        ticker: ticker symbol as a string.

    Returns:
        Tuple (base, suffix); suffix is None when no suffix was detected.
    """

    # Match test tickers (case-insensitive)
    if re.search(r'(?i)test', ticker):
        return ticker, 'TEST'

    # ZVZZT/ZWZZT are test tickers that do not contain the word "test".
    # Returning None here would be indistinguishable from the fall-through at
    # the bottom and would file them as ordinary un-suffixed tickers.
    if re.match(r'^(ZVZZT|ZWZZT)$', ticker):
        return ticker, 'TEST'

    # Match non-equities (lowercase/period suffixes)
    non_equities_match = re.match(r'^([^a-z.]*)([a-z.].*)$', ticker)
    if non_equities_match:
        base, suffix = non_equities_match.groups()
        # The suffix group always holds at least one character, so only the
        # base can be empty (e.g. a ticker that starts with a period).
        if base:
            return base, suffix

    # If no pattern matches, return the whole ticker as base with no suffix
    return ticker, None

# Group tickers by suffix - simple dictionary mapping suffix -> list of tickers.
# Tickers with no detected suffix are collected under the 'no suffix' key.
suffix_groups = defaultdict(list)

for ticker in unique_tickers:
    base, suffix = extract_suffix(ticker)
    suffix_groups[suffix if suffix else 'no suffix'].append(ticker)

# Longest group determines the number of rows in the DataFrame (0 if no groups).
max_len = max((len(tickers) for tickers in suffix_groups.values()), default=0)

# Pad all lists to the same length with NaN so they can be used as columns.
for suffix in suffix_groups:
    suffix_groups[suffix] = suffix_groups[suffix] + [np.nan] * (max_len - len(suffix_groups[suffix]))

# One column per suffix, columns ordered alphabetically by suffix.
suffix_df = pd.DataFrame(dict(sorted(suffix_groups.items())))

# Take a look at the DataFrame
print(suffix_df)

# Dictionary mapping each suffix (column name) to the COUNT of tickers under it.
# Not used beyond the summary below, but handy to have.
# .count() ignores the NaN padding, so it equals the real group size.
suffix_dict = {}

for column_name, column_data in suffix_df.items():
    ticker_count = column_data.count()
    suffix_dict[column_name] = ticker_count
    print(f"{column_name} : {ticker_count}")

# Number of unique tickers without a suffix; 0 when every ticker had one
# (the 'no suffix' column does not exist in that case).
count_no_suffix = suffix_dict.get('no suffix', 0)
# Number of unique tickers for which a suffix was detected by the regex
total_tickers = len(unique_tickers)
count_suffix = total_tickers - count_no_suffix

# Fraction of unique tickers that carry a suffix (these will be removed).
# Guarded so an empty ticker list prints 0.0 instead of raising ZeroDivisionError.
print(count_suffix / total_tickers if total_tickers else 0.0)

In [9]:
# Analyzing duplicates
import re
from collections import defaultdict
import pandas as pd
import numpy as np

# Load the historical OHLCV data.
csv_path = r'C:\Users\carso\Development\emerytrading\Data\Stocks\Polygon\OHLCV_Historical_2016-01-01_to_2025-10-26.csv'
OHLCV_data = pd.read_csv(csv_path)

# Interpret 'window_start' as nanoseconds since the epoch, convert to pandas
# datetimes and normalize to midnight (drops the time-of-day component).
# Kept as pandas datetime (not Python date) so .dt accessors like .dt.year work.
OHLCV_data['date'] = pd.to_datetime(OHLCV_data['window_start'], unit='ns').dt.normalize()

# Reorder columns to put 'date' first. Selecting columns returns a new frame,
# so the result must be assigned back for the new order to persist.
cols = ['date'] + [col for col in OHLCV_data.columns if col != 'date']
OHLCV_data = OHLCV_data[cols]

# Find the duplicate rows: every row whose (ticker, date) pair occurs more than
# once. keep=False flags all members of each duplicate group, not just the repeats.
duplicate_rows = OHLCV_data[OHLCV_data.duplicated(subset=['ticker', 'date'], keep=False)]

# Keep the columns needed to compare the conflicting rows, sorted by ticker and
# then date so rows belonging to the same (ticker, date) pair sit next to each other.
dupes = duplicate_rows[['ticker', 'date', 'close', 'open', 'volume']].sort_values(by=['ticker', 'date'])

print(dupes)
print(len(dupes))

        ticker       date     close      open  volume
7326351   AMUB 2019-08-13   14.6595   14.6595       0
7334826   AMUB 2019-08-13   14.7448   14.7448       1
7326911   BNKU 2019-08-13   44.2546   44.2546    1000
7335391   BNKU 2019-08-13   45.9859   45.9859       5
7326988   BSCE 2019-08-13   26.1300   26.1300      10
7335466   BSCE 2019-08-13   26.0950   26.0950      57
7335521    BUY 2019-08-13   18.2100   18.2600     500
7327043    BUY 2019-08-13   17.9200   17.9200       0
7327802   DAUD 2019-08-13   36.4729   36.4729       0
7336290   DAUD 2019-08-13   35.6398   35.5929     620
7336818   EMSG 2019-08-13   24.5109   24.5109       0
7328324   EMSG 2019-08-13   24.1899   24.1899       0
7328606   FAUS 2019-08-13   30.4600   30.4600       0
7337099   FAUS 2019-08-13   31.0524   31.0524       0
7328778   FLEU 2019-08-13  127.5950  127.5950       0
7337276   FLEU 2019-08-13  129.6009  129.6009     100
7329193   GLBY 2019-08-13   27.5450   27.5450       0
7337699   GLBY 2019-08-13   