check that i have active and inactive tickers for all days when exchanges were open.

In [3]:
# setup client for AWS S3
import sys
import os
# Add the parent directory to Python path to import api_key module
sys.path.append(os.path.dirname(os.path.abspath('')))
import settings

In [4]:
tickers_dir = os.path.join(settings.ABSOLUTE_DATA_DIR, 'tickers')
active_tickers_file = os.path.join(tickers_dir, 'tickers_history_active.csv')
inactive_tickers_file = os.path.join(tickers_dir, 'tickers_history_inactive.csv')

In [37]:
import csv

def collect_unique_dates(file_path):
    unique_dates = set()
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the first line (header)
        prev_date = None
        line_counter = 0
        for row in reader:
            if row:  # skip empty lines
                date = row[0]
                # optimization: check against the hashset only if the date is different from the previous one
                if date != prev_date:
                    unique_dates.add(date)
                    prev_date = date
            else:
                print(f"Empty line found in {file_path} at position {line_counter}, skipping.")
                return None
            line_counter += 1
    return unique_dates

In [6]:
import pandas as pd

dates = collect_unique_dates(active_tickers_file)
df = pd.DataFrame(list(dates), columns=['Date'])
df

Unnamed: 0,Date
0,2005-08-29
1,2017-03-22
2,2023-11-06
3,2004-03-09
4,2017-03-02
...,...
5672,2009-03-09
5673,2024-07-16
5674,2006-01-17
5675,2015-03-12


In [10]:
# sort the DataFrame by date
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date').reset_index(drop=True)
# save the DataFrame to a CSV file
df.to_csv(os.path.join(tickers_dir, 'unique_active_dates.csv'), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'])


In [35]:
import pandas_market_calendars as mcal

nyse = mcal.get_calendar('NYSE')

schedule = nyse.schedule(start_date='2003-09-10', end_date='2025-06-11')
# get dates from the schedule
schedule_dates = pd.Series(schedule.index.date)
schedule_dates

0       2003-09-10
1       2003-09-11
2       2003-09-12
3       2003-09-15
4       2003-09-16
           ...    
5469    2025-06-05
5470    2025-06-06
5471    2025-06-09
5472    2025-06-10
5473    2025-06-11
Length: 5474, dtype: object

In [36]:
# convert df['Date'] to datetime
active_dates = pd.Series(pd.to_datetime(df['Date']).dt.date)
active_dates

0       2003-09-10
1       2003-09-11
2       2003-09-12
3       2003-09-15
4       2003-09-16
           ...    
5671    2025-06-05
5672    2025-06-06
5673    2025-06-09
5674    2025-06-10
5675    2025-06-11
Name: Date, Length: 5676, dtype: object

In [32]:
# check that unique_days are in the NYSE schedule
print(schedule_dates.isin(active_dates).all())
# and that the NYSE schedule is in unique_days
print(active_dates.isin(schedule_dates).all())


True
False


this True, False result is expected because active_dates were downloaded for all days except weekends. means that it has dates for federal holidays and also days of disasters when exchanges were closed. one more check below.

In [31]:
# Find dates that are in schedule_dates but not in active_dates
missing_in_active = schedule_dates[~schedule_dates.isin(active_dates)]
print("Dates in NYSE schedule but not in active dates:")
print(missing_in_active)

# Find dates that are in active_dates but not in schedule_dates
extra_in_active = active_dates[~active_dates.isin(schedule_dates)]
print("Dates in active dates but not in NYSE schedule:")
print(extra_in_active)

Dates in NYSE schedule but not in active dates:
Series([], dtype: object)
Dates in active dates but not in NYSE schedule:
56      2003-11-27
76      2003-12-25
81      2004-01-01
93      2004-01-19
113     2004-02-16
           ...    
5566    2025-01-09
5573    2025-01-20
5593    2025-02-17
5637    2025-04-18
5663    2025-05-26
Name: Date, Length: 202, dtype: object


In [38]:
# now do the same for inactive tickers
inactive_dates = collect_unique_dates(inactive_tickers_file)

df = pd.DataFrame(list(inactive_dates), columns=['Date'])
df

Unnamed: 0,Date
0,2005-08-29
1,2017-03-22
2,2023-11-06
3,2004-03-09
4,2017-03-02
...,...
5671,2009-03-09
5672,2024-07-16
5673,2006-01-17
5674,2015-03-12


In [39]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date').reset_index(drop=True)
# save the DataFrame to a CSV file
df.to_csv(os.path.join(tickers_dir, 'unique_inactive_dates.csv'), index=False)

In [40]:
inactive_dates = pd.Series(pd.to_datetime(df['Date']).dt.date)
inactive_dates

0       2003-09-11
1       2003-09-12
2       2003-09-15
3       2003-09-16
4       2003-09-17
           ...    
5671    2025-06-06
5672    2025-06-09
5673    2025-06-10
5674    2025-06-11
5675    2025-06-12
Name: Date, Length: 5676, dtype: object

In [41]:
# check that unique_days are in the NYSE schedule
print(schedule_dates.isin(active_dates).all())
# and that the NYSE schedule is in unique_days
print(active_dates.isin(schedule_dates).all())

True
False


ok. so i have active and inactive tickers for all days when exchanges were open. next step is to try to calculate renames.