In [1]:
import os


def read_processed_tickers(file_path):
    """
    Safely read the processed tickers file without disrupting ongoing writes.
    Returns a set of processed ticker symbols.
    """
    processed_tickers = set()

    # Check if file exists first
    if not os.path.exists(file_path):
        return processed_tickers

    try:
        # Try a few times in case the file is being written to
        max_attempts = 3
        for attempt in range(max_attempts):
            try:
                with open(file_path, "r") as f:
                    return set(line.strip() for line in f if line.strip())
            except (IOError, PermissionError) as e:
                # If there's a file access issue, wait briefly and retry
                if attempt < max_attempts - 1:
                    print(
                        f"Attempt {attempt+1} to read {file_path} failed: {e}. Retrying..."
                    )
                    time.sleep(0.5)
                else:
                    print(
                        f"Failed to read processed tickers after {max_attempts} attempts: {e}"
                    )
                    raise
    except Exception as e:
        print(f"Error reading processed tickers file: {e}")

    return processed_tickers

In [2]:
def remove_ticker_from_processed(lbw, ticker_to_remove):
    """
    Remove a specific ticker from the processed_tickers.txt file

    Parameters:
    -----------
    lbw : int
        Lookback window length
    ticker_to_remove : str
        Ticker symbol to remove from the processed list
    """
    import os
    from settings.default import CPD_OPENBB_OUTPUT_FOLDER

    # Get the path to the processed_tickers.txt file
    progress_file = os.path.join(
        CPD_OPENBB_OUTPUT_FOLDER(lbw), "processed_tickers.txt"
    )

    # Read the current list of processed tickers
    try:
        with open(progress_file, "r") as f:
            tickers = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"File {progress_file} not found.")
        return

    # Check if ticker exists in the list
    if ticker_to_remove not in tickers:
        print(
            f"Ticker '{ticker_to_remove}' not found in processed tickers list."
        )
        return

    # Remove the ticker from the list
    tickers.remove(ticker_to_remove)

    # Write the updated list back to the file
    with open(progress_file, "w") as f:
        for ticker in tickers:
            f.write(f"{ticker}\n")

    print(
        f"Successfully removed '{ticker_to_remove}' from processed tickers list."
    )
    print(f"Remaining tickers: {tickers}")

In [3]:
processed_tickers = {}

In [4]:
import os
from settings.default import CPD_OPENBB_OUTPUT_FOLDER

# Example usage:
lbw = 21  # Your lookback window length
progress_file = os.path.join(
    CPD_OPENBB_OUTPUT_FOLDER(lbw), "processed_tickers.txt"
)
processed_tickers[lbw] = read_processed_tickers(progress_file)
print(f"Currently processed {len(processed_tickers[lbw])} tickers")
print(f"Processed tickers: {processed_tickers[lbw]}")

Currently processed 35 tickers
Processed tickers: {'DX', 'CL', 'RP', 'ZC', 'NG', 'ZS', '6J', 'RY', 'ZW', 'SB', 'CT', 'HG', 'SY', 'RB', 'OJ', 'SI', 'GC', 'ZN', 'HO', 'SS', 'CC', 'YM', 'ZB', 'QG', '6Z', 'ZJ', '6C', 'ES', 'ZF', 'RF', 'ZR', 'AR', 'QM', 'NQ', 'KC'}


In [7]:
# Example usage:
lbw = 126  # Your lookback window length
progress_file = os.path.join(
    CPD_OPENBB_OUTPUT_FOLDER(lbw), "processed_tickers.txt"
)
processed_tickers[lbw] = read_processed_tickers(progress_file)
print(f"Currently processed {len(processed_tickers[lbw])} tickers")
print(f"Processed tickers: {processed_tickers[lbw]}")

Currently processed 35 tickers
Processed tickers: {'DX', 'CL', 'RP', 'ZC', 'NG', 'ZS', '6J', 'RY', 'ZW', 'SB', 'CT', 'HG', 'RB', 'SY', 'OJ', 'SI', 'GC', 'ZN', 'HO', 'SS', 'CC', 'YM', 'ZB', 'QG', '6Z', 'ZJ', '6C', 'ES', 'ZF', 'RF', 'ZR', 'AR', 'QM', 'NQ', 'KC'}


In [6]:
# Which tickers have been processed for both lbw?
common_tickers = set(processed_tickers[21]) - set(processed_tickers[126])
print(f"Common tickers for lbw 21 and 126: {list(common_tickers)}")

Common tickers for lbw 21 and 126: ['HO']


In [8]:
import os
import pandas as pd
from settings.default import CPD_OPENBB_OUTPUT_FOLDER


def get_ticker_date_ranges(lbw, tickers):
    """
    Get the date range for each processed ticker's data file

    Parameters:
    -----------
    lbw : int
        Lookback window length
    tickers : list or set
        List of ticker symbols to check

    Returns:
    --------
    dict: Dictionary mapping tickers to their (start_date, end_date) tuples
    """
    date_ranges = {}
    folder_path = CPD_OPENBB_OUTPUT_FOLDER(lbw)

    for ticker in tickers:
        file_path = os.path.join(folder_path, f"{ticker}.csv")
        if os.path.exists(file_path):
            try:
                # Read the CSV file
                df = pd.read_csv(file_path, index_col=0, parse_dates=True)
                if not df.empty:
                    start_date = df.index.min().strftime("%Y-%m-%d")
                    end_date = df.index.max().strftime("%Y-%m-%d")
                    date_ranges[ticker] = (start_date, end_date)
                else:
                    date_ranges[ticker] = ("Empty file", "No data")
            except Exception as e:
                date_ranges[ticker] = (f"Error: {str(e)}", "")
        else:
            date_ranges[ticker] = ("File not found", "")

    return date_ranges


# Print date ranges for lookback window 21
print("Date ranges for LBW = 21:")
date_ranges_21 = get_ticker_date_ranges(21, processed_tickers[21])
for ticker, (start_date, end_date) in sorted(date_ranges_21.items()):
    print(f"{ticker}: {start_date} to {end_date}")

print("\nDate ranges for LBW = 126:")
date_ranges_126 = get_ticker_date_ranges(126, processed_tickers[126])
for ticker, (start_date, end_date) in sorted(date_ranges_126.items()):
    print(f"{ticker}: {start_date} to {end_date}")

# Find tickers with different date ranges between the two lookback windows
print("\nTickers with different date ranges between LBW 21 and 126:")
common_tickers = set(processed_tickers[21]) & set(processed_tickers[126])
for ticker in common_tickers:
    if ticker in date_ranges_21 and ticker in date_ranges_126:
        if date_ranges_21[ticker] != date_ranges_126[ticker]:
            print(f"{ticker}:")
            print(
                f"  LBW 21: {date_ranges_21[ticker][0]} to {date_ranges_21[ticker][1]}"
            )
            print(
                f"  LBW 126: {date_ranges_126[ticker][0]} to {date_ranges_126[ticker][1]}"
            )

Date ranges for LBW = 21:
6C: 2000-08-24 to 2021-12-30
6J: 2001-01-10 to 2021-12-28
6Z: 2011-06-24 to 2018-06-28
AR: Empty file to No data
CC: 2006-10-17 to 2019-05-15
CL: 2004-01-23 to 2019-12-23
CT: 2000-02-02 to 2021-12-29
DX: 2008-06-26 to 2016-06-27
ES: 2000-10-17 to 2021-12-30
GC: 2000-09-29 to 2021-12-30
HG: 2000-09-29 to 2021-12-30
HO: 2000-10-03 to 2021-12-30
KC: 2000-02-02 to 2021-12-30
NG: 2014-10-13 to 2021-12-30
NQ: 2001-04-10 to 2021-12-30
OJ: 2010-12-14 to 2021-12-30
QG: 2009-05-15 to 2021-12-30
QM: 2010-04-01 to 2021-12-30
RB: 2006-04-19 to 2021-12-30
RF: 2006-11-09 to 2021-12-30
RP: 2007-03-16 to 2021-12-30
RY: 2000-08-18 to 2021-12-30
SB: 2007-01-30 to 2021-12-30
SI: 2004-02-02 to 2021-12-30
SS: 2012-04-04 to 2021-12-30
SY: 2016-04-29 to 2021-12-30
YM: 2008-07-29 to 2021-12-30
ZB: 2006-08-15 to 2021-12-30
ZC: 2007-06-22 to 2021-12-30
ZF: 2013-01-04 to 2021-12-30
ZJ: 2003-08-07 to 2021-12-30
ZN: 2005-04-21 to 2021-12-30
ZR: 2003-09-12 to 2021-12-30
ZS: Empty file to No

In [10]:
import os
import pandas as pd
from datetime import datetime
from settings.default import CPD_OPENBB_OUTPUT_FOLDER


def merge_ticker_data_from_alt_dir(ticker, lbw=21):
    """
    Merge data for a specific ticker from alternate directory and sort by date

    Parameters:
    -----------
    ticker : str
        Ticker symbol
    lbw : int
        Lookback window length (default 21)

    Returns:
    --------
    bool: True if merge was successful, False otherwise
    """
    # Define paths
    main_dir = CPD_OPENBB_OUTPUT_FOLDER(lbw)
    alt_dir = os.path.join(os.path.dirname(main_dir), "openbb_cpd_21lbw(1)")

    main_file = os.path.join(main_dir, f"{ticker}.csv")
    alt_file = os.path.join(alt_dir, f"{ticker}.csv")

    # Check if both files exist
    if not os.path.exists(main_file):
        print(f"Main file for {ticker} doesn't exist in {main_dir}")
        return False

    if not os.path.exists(alt_file):
        print(f"Alternative file for {ticker} doesn't exist in {alt_dir}")
        return False

    try:
        # Read both files
        print(f"Reading data from {main_file}...")
        main_data = pd.read_csv(main_file, parse_dates=["date"])

        print(f"Reading data from {alt_file}...")
        alt_data = pd.read_csv(alt_file, parse_dates=["date"])

        # Display stats before merging
        print(f"\nBefore merging:")
        print(
            f"Main data: {len(main_data)} rows, from {main_data['date'].min()} to {main_data['date'].max()}"
        )
        print(
            f"Alt data: {len(alt_data)} rows, from {alt_data['date'].min()} to {alt_data['date'].max()}"
        )

        # Combine datasets
        combined_data = pd.concat([main_data, alt_data])

        # Remove duplicates (if any)
        combined_data = combined_data.drop_duplicates(
            subset=["date"]
        ).reset_index(drop=True)

        # Sort by date
        combined_data = combined_data.sort_values("date")

        # Display stats after merging
        print(f"\nAfter merging:")
        print(
            f"Combined data: {len(combined_data)} rows, from {combined_data['date'].min()} to {combined_data['date'].max()}"
        )

        # Create backup of original file
        backup_file = main_file + ".bak"
        print(f"\nCreating backup of original file at {backup_file}")
        main_data.to_csv(backup_file, index=False)

        # Save the merged data
        print(f"Saving merged data to {main_file}")
        combined_data.to_csv(main_file, index=False)

        print(f"Merge and sort complete for {ticker}!")

        # Also update progress file if needed
        progress_file = main_file + ".progress"
        alt_progress_file = alt_file + ".progress"

        if os.path.exists(progress_file) and os.path.exists(alt_progress_file):
            print(f"Merging progress files for {ticker}...")
            # Read both sets of processed windows
            with open(progress_file, "r") as f:
                main_processed = set(
                    int(line.strip()) for line in f.readlines()
                )

            with open(alt_progress_file, "r") as f:
                alt_processed = set(
                    int(line.strip()) for line in f.readlines()
                )

            # Combine processed windows
            all_processed = main_processed.union(alt_processed)

            # Create backup of original progress file
            with open(progress_file + ".bak", "w") as f:
                for window in sorted(main_processed):
                    f.write(f"{window}\n")

            # Write combined progress file
            with open(progress_file, "w") as f:
                for window in sorted(all_processed):
                    f.write(f"{window}\n")

            print(
                f"Updated progress file with {len(all_processed)} processed windows"
            )

        return True

    except Exception as e:
        print(f"Error merging data for {ticker}: {str(e)}")
        return False


# First, get all tickers with a starting date after 2007
lbw = 21
print(f"Analyzing tickers from LBW = {lbw}...")

# Get date ranges for all tickers
date_ranges = get_ticker_date_ranges(lbw, processed_tickers[lbw])

# Filter for tickers with start date after 2007
post_2007_tickers = []
for ticker, (start_date, end_date) in date_ranges.items():
    try:
        if (
            isinstance(start_date, str)
            and not start_date.startswith("Error")
            and not start_date.startswith("Empty")
        ):
            start_year = datetime.strptime(start_date, "%Y-%m-%d").year
            if start_year > 2007:
                post_2007_tickers.append(ticker)
    except Exception as e:
        print(f"Error parsing date for {ticker}: {str(e)}")

print(f"Found {len(post_2007_tickers)} tickers with start date after 2007:")
print(post_2007_tickers)

Analyzing tickers from LBW = 21...
Found 10 tickers with start date after 2007:
['DX', 'NG', 'SY', 'OJ', 'SS', 'YM', 'QG', '6Z', 'ZF', 'QM']


In [46]:
# Process each ticker
success_count = 0
for i, ticker in enumerate(post_2007_tickers):
    print(f"\nProcessing ticker {i+1}/{len(post_2007_tickers)}: {ticker}")
    if merge_ticker_data_from_alt_dir(ticker, lbw):
        success_count += 1
    print("-" * 50)

print(
    f"\nMerge operation completed: Successfully processed {success_count} out of {len(post_2007_tickers)} tickers"
)


Processing ticker 1/5: 6Z
Reading data from data\openbb_cpd_21lbw\6Z.csv...
Reading data from data\openbb_cpd_21lbw(1)\6Z.csv...

Before merging:
Main data: 883 rows, from 2018-06-29 00:00:00 to 2021-12-30 00:00:00
Alt data: 1530 rows, from 2001-05-03 00:00:00 to 2007-06-15 00:00:00

After merging:
Combined data: 2413 rows, from 2001-05-03 00:00:00 to 2021-12-30 00:00:00

Creating backup of original file at data\openbb_cpd_21lbw\6Z.csv.bak
Saving merged data to data\openbb_cpd_21lbw\6Z.csv
Merge and sort complete for 6Z!
--------------------------------------------------

Processing ticker 2/5: CL
Reading data from data\openbb_cpd_21lbw\CL.csv...
Reading data from data\openbb_cpd_21lbw(1)\CL.csv...

Before merging:
Main data: 1081 rows, from 2017-06-14 00:00:00 to 2021-12-30 00:00:00
Alt data: 828 rows, from 2000-09-22 00:00:00 to 2004-01-22 00:00:00

After merging:
Combined data: 1909 rows, from 2000-09-22 00:00:00 to 2021-12-30 00:00:00

Creating backup of original file at data\open

In [13]:
import os
import pandas as pd
from settings.default import CPD_OPENBB_OUTPUT_FOLDER
from data.pull_data import pull_openbb_sample_data


def create_missing_progress_files(lbw=21):
    """
    Find tickers that have CPD result files but no progress files,
    then create appropriate progress files for them by identifying
    which dates/indexes have already been processed.
    Excludes tickers that are already in processed_tickers.txt.

    Parameters:
    -----------
    lbw : int
        Lookback window length (default 21)
    """
    print(f"\nSearching for tickers without progress files in LBW={lbw}...")

    # Define paths
    main_dir = CPD_OPENBB_OUTPUT_FOLDER(lbw)
    processed_tickers_file = os.path.join(main_dir, "processed_tickers.txt")

    # Read the list of processed tickers to exclude them
    processed_tickers = set()
    # if os.path.exists(processed_tickers_file):
    #     with open(processed_tickers_file, "r") as f:
    #         processed_tickers = set(line.strip() for line in f if line.strip())
    #     print(
    #         f"Found {len(processed_tickers)} tickers in processed_tickers.txt (will exclude these)"
    #     )

    # Get all ticker files in the directory
    all_files = [
        f
        for f in os.listdir(main_dir)
        if f.endswith(".csv")
        and not f.endswith(".bak")
        and not f == "processed_tickers.txt"
    ]
    all_tickers = [os.path.splitext(f)[0] for f in all_files]

    # Filter out tickers that are already in processed_tickers.txt
    candidates = [
        ticker for ticker in all_tickers if ticker not in processed_tickers
    ]

    # Check which candidates don't have progress files
    tickers_without_progress = []
    for ticker in candidates:
        progress_file = os.path.join(main_dir, f"{ticker}.csv.progress")
        if not os.path.exists(progress_file):
            tickers_without_progress.append(ticker)

    print(
        f"Found {len(tickers_without_progress)} tickers with CSV files but no progress files"
    )
    if len(tickers_without_progress) > 0:
        print(f"First 10 tickers: {tickers_without_progress[:10]}")

    # Process each ticker without a progress file
    success_count = 0
    for i, ticker in enumerate(tickers_without_progress):
        print(f"\n{'='*50}")
        print(
            f"Processing ticker {i+1}/{len(tickers_without_progress)}: {ticker}"
        )
        print(f"{'='*50}")

        cpd_file = os.path.join(main_dir, f"{ticker}.csv")
        progress_file = cpd_file + ".progress"

        try:
            # Get original price data
            print(f"Pulling original price data for {ticker}...")
            orig_data = pull_openbb_sample_data(ticker)
            orig_data = orig_data[orig_data.index < "2021-12-31"]

            # Read CPD results file
            print(f"Reading CPD data from {cpd_file}...")
            cpd_data = pd.read_csv(cpd_file, parse_dates=["date"])

            # Find which dates have been processed
            orig_dates = set(orig_data.index.date)
            cpd_dates = set(pd.to_datetime(cpd_data["date"]).dt.date)

            # Create a mapping from date to index in original data
            orig_data_reset = orig_data.reset_index()
            date_to_idx = {
                d.date(): i for i, d in enumerate(orig_data_reset["date"])
            }

            # Create list of processed indices
            processed_indices = [
                date_to_idx[date] for date in cpd_dates if date in date_to_idx
            ]

            # Write progress file
            with open(progress_file, "w") as f:
                for idx in sorted(processed_indices):
                    f.write(f"{idx}\n")

            print(
                f"Created progress file with {len(processed_indices)} processed indices"
            )
            success_count += 1

        except Exception as e:
            print(f"Error processing {ticker}: {str(e)}")

    print(
        f"\nProcess complete: Created progress files for {success_count} out of {len(tickers_without_progress)} tickers"
    )
    return success_count


# Create missing progress files for lookback window 21
create_missing_progress_files(lbw=21)
create_missing_progress_files(lbw=126)


Searching for tickers without progress files in LBW=21...
Found 2 tickers with CSV files but no progress files
First 10 tickers: ['6J', 'CT']

Processing ticker 1/2: 6J
Pulling original price data for 6J...
Reading CPD data from data\openbb_cpd_21lbw\6J.csv...
Created progress file with 1141 processed indices

Processing ticker 2/2: CT
Pulling original price data for CT...
Reading CPD data from data\openbb_cpd_21lbw\CT.csv...
Created progress file with 1162 processed indices

Process complete: Created progress files for 2 out of 2 tickers

Searching for tickers without progress files in LBW=126...
Found 0 tickers with CSV files but no progress files

Process complete: Created progress files for 0 out of 0 tickers


0

In [14]:
import os
import pandas as pd
from settings.default import CPD_OPENBB_OUTPUT_FOLDER, OPENBB_2003_TICKERS
from data.pull_data import pull_openbb_sample_data
from mom_trans.data_prep import calc_returns


def find_and_remove_missing_date_indexes(ticker, lbw=21):
    """
    Find missing dates between CPD results and original price data,
    then remove those indexes from the progress file.

    Parameters:
    -----------
    ticker : str
        Ticker symbol
    lbw : int
        Lookback window length (default 21)
    """
    print(f"\n{'='*50}")
    print(f"Processing ticker: {ticker}")
    print(f"{'='*50}")

    # Define paths
    main_dir = CPD_OPENBB_OUTPUT_FOLDER(lbw)
    cpd_file = os.path.join(main_dir, f"{ticker}.csv")
    progress_file = cpd_file + ".progress"

    # Check if files exist
    if not os.path.exists(cpd_file):
        print(f"CPD file for {ticker} doesn't exist in {main_dir}")
        return False

    if not os.path.exists(progress_file):
        print(f"Progress file for {ticker} doesn't exist")
        return False

    try:
        # Step 1: Get original price data
        print(f"Pulling original price data for {ticker}...")
        orig_data = pull_openbb_sample_data(ticker)
        orig_data = orig_data[
            orig_data.index < "2021-12-31"
        ]  # Same filter as used previously

        # Step 2: Read CPD results file
        print(f"Reading CPD data from {cpd_file}...")
        cpd_data = pd.read_csv(cpd_file, parse_dates=["date"])

        # Step 3: Find missing dates
        orig_dates = set(orig_data.index.date)
        cpd_dates = set(pd.to_datetime(cpd_data["date"]).dt.date)

        missing_dates = orig_dates - cpd_dates

        print(f"Found {len(missing_dates)} missing dates in CPD file")

        if not missing_dates:
            print("No missing dates found. Progress file remains unchanged.")
            return True

        # Step 4: Read progress file
        with open(progress_file, "r") as f:
            indices = [int(line.strip()) for line in f if line.strip()]

        # Create backup of original progress file
        with open(progress_file + ".bak", "w") as f:
            for idx in indices:
                f.write(f"{idx}\n")

        print(f"Created backup of progress file at {progress_file}.bak")

        # Step 5: For each missing date, remove the corresponding indices
        # The indexes in the progress file correspond to positions in the original data
        # First, create a mapping from date to index in original data
        orig_data_reset = orig_data.reset_index()
        date_to_idx = {
            d.date(): i for i, d in enumerate(orig_data_reset["date"])
        }

        # Find indices to remove
        indices_to_remove = []
        for missing_date in missing_dates:
            if missing_date in date_to_idx:
                indices_to_remove.append(date_to_idx[missing_date])

        print(
            f"Identified {len(indices_to_remove)} indexes to remove from progress file"
        )
        if len(indices_to_remove) > 0:
            print(
                f"Sample indexes to remove: {sorted(indices_to_remove)[:5]} ..."
            )

            # Step 6: Filter out these indices from the progress file
            original_count = len(indices)
            filtered_indices = [
                idx for idx in indices if idx not in indices_to_remove
            ]
            removed_count = original_count - len(filtered_indices)

            # Step 7: Write the filtered indices back to the progress file
            with open(progress_file, "w") as f:
                for idx in sorted(filtered_indices):
                    f.write(f"{idx}\n")

            print(f"Removed {removed_count} entries from progress file")
            print(
                f"Original count: {original_count}, New count: {len(filtered_indices)}"
            )

        return True

    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")
        return False


# Process each ticker
tickers = OPENBB_2003_TICKERS
lbw = 21

success_count = 0
for ticker in tickers:
    if find_and_remove_missing_date_indexes(ticker, lbw):
        success_count += 1

print(
    f"\nProcessing complete: Successfully processed {success_count} out of {len(tickers)} tickers"
)


Processing ticker: 6C
Pulling original price data for 6C...
Reading CPD data from data\openbb_cpd_21lbw\6C.csv...
Found 701 missing dates in CPD file
Created backup of progress file at data\openbb_cpd_21lbw\6C.csv.progress.bak
Identified 701 indexes to remove from progress file
Sample indexes to remove: [0, 1, 2, 3, 4] ...
Removed 0 entries from progress file
Original count: 4701, New count: 4701

Processing ticker: 6J
Pulling original price data for 6J...
Reading CPD data from data\openbb_cpd_21lbw\6J.csv...
Found 4163 missing dates in CPD file
Created backup of progress file at data\openbb_cpd_21lbw\6J.csv.progress.bak
Identified 4163 indexes to remove from progress file
Sample indexes to remove: [0, 1, 2, 3, 4] ...
Removed 0 entries from progress file
Original count: 1141, New count: 1141

Processing ticker: 6Z
Pulling original price data for 6Z...
Reading CPD data from data\openbb_cpd_21lbw\6Z.csv...
Found 3424 missing dates in CPD file
Created backup of progress file at data\open