## Documents Extraction and Processing

In [4]:
%load_ext autoreload
%autoreload 2

Lets first obtain the file that maps the tickers with the CIKs

In [2]:
import requests
import json
from pathlib import Path
import os

# --- Configuration (from previous step) ---
HEADERS = {
    "User-Agent": "EdgarTutorial/1.0 (YourName your.email@domain.com)" 
}
TICKER_CIK_URL = "https://www.sec.gov/files/company_tickers.json"
OUTPUT_FILE = Path("sec_data/company_tickers.json")

# Ensure directory exists
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
# ----------------------------------------

# 1. Download the JSON data
print("Downloading CIK-Ticker map...")
response = requests.get(TICKER_CIK_URL, headers=HEADERS, timeout=15)
response.raise_for_status()
raw_data = response.json() # Load into Python dictionary

# 2. Open the file and use json.dump() with indent=4
print(f"Saving JSON in readable format to {OUTPUT_FILE.absolute()}...")

# Use 'w' mode to write the file
with open(OUTPUT_FILE, 'w') as f:
    # Key Fix: The 'indent=4' parameter tells the JSON module to format the output 
    # with 4 spaces for each level of nesting, adding line breaks automatically.
    json.dump(raw_data, f, indent=4) 

print("✅ JSON saved successfully with proper line breaks and indentation.")

# --- Optional: Print a Snippet to Console (Also Pretty-Printed) ---
# If you want to print to the console instead of a file, use json.dumps()
print("\n--- Console Snippet (Pretty-Printed) ---")
# Print the first 3 key-value pairs from the dictionary
keys = list(raw_data.keys())
snippet = {k: raw_data[k] for k in keys[:3]}

# Use json.dumps() with indent=2 to format the string output
pretty_string = json.dumps(snippet, indent=2)
print(pretty_string)

Downloading CIK-Ticker map...
Saving JSON in readable format to /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/sec_data/company_tickers.json...
✅ JSON saved successfully with proper line breaks and indentation.

--- Console Snippet (Pretty-Printed) ---
{
  "0": {
    "cik_str": 1045810,
    "ticker": "NVDA",
    "title": "NVIDIA CORP"
  },
  "1": {
    "cik_str": 1652044,
    "ticker": "GOOGL",
    "title": "Alphabet Inc."
  },
  "2": {
    "cik_str": 320193,
    "ticker": "AAPL",
    "title": "Apple Inc."
  }
}


### Vanguard Index Funds

In [2]:
import pandas as pd
from io import StringIO
from edgar import Company, set_identity
import sys
from pathlib import Path

from src.simple_rag.models.fund import FilingMetadata
from src.simple_rag.extraction.parser import BlackRockFiling


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "VOO"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    
    latest_date_str = max(f.report_date for f in all_filings)
    
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    print("Found filings: ", len(latest_filings), "for year: ", target_year)

funds_total = []
performance_funds = []
df_performance = []
for filing in latest_filings:

    print("Processing filing: ", filing.report_date)
    html_content = filing.html()
    filing_metadata = FilingMetadata(
        accession_number=filing.accession_number,
        reporting_date=filing.report_date,
        filing_date=filing.filing_date,
        form=filing.form,
        url=filing.url

    )
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        fund.ncsr_metadata = filing_metadata
        if fund.performance_table is not None:
            performance_funds.append(fund.ticker)
            count += 1

    df_performance.append(parser.get_financial_highlights())

    print(count)
    print("Adding funds: ", len(funds))
    
    funds_total.extend(funds)

print(len(performance_funds))
print(performance_funds)
print(len(df_performance))


  from .autonotebook import tqdm as notebook_tqdm


Found filings:  2 for year:  2024
Processing filing:  2024-12-31
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007779Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007779Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007782Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007782Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007780Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007780Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007781Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007781Member
Failed to extract tables from block:  oef:LineGraph

In [3]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1]], ignore_index=True)

returns_lookup = total_df.copy()

display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']
for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:        
            returns_lookup[f'{col}_clean'] = (
                returns_lookup[col]
                .astype(str)
                .str.replace('%', '')
                .str.replace('$', '')
                .str.replace(',', '')
                .replace('N/A', '0')
                .replace('', '0')
                .replace('None', '0')
                .astype(float)
            )
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)

        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "™" in share_class:
        share_class = share_class.replace("™", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        
       
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",len(funds_total))
    

2


array(['Small-Cap Index Fund', 'Small-Cap Growth Index Fund',
       'Small-Cap Value Index Fund', 'Extended Market Index Fund',
       'Mid-Cap Index Fund', 'Mid-Cap Growth Index Fund',
       'Mid-Cap Value Index Fund', 'Total Stock Market Index Fund',
       '500 Index Fund', 'Growth Index Fund', 'Value Index Fund',
       'Large-Cap Index Fund'], dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - ShareClassType.INVESTOR
Cleaned name: 'Extended Market Index Fund'
  Found 30 name matches
  Found 5 matching records
  2024: Total Return = 16.76%, Expense Ratio = 0.19%, Net Assets = 195.0, Net Income Ratio = 1.09, Turnover = 11.0, Net Assets Value Begining = 124.78, Net Assets Value End = 144.2
  2023: Total Return = 25.22%, Expense Ratio = 0.19%, Net Assets = 232.0, Net Income Ratio = 1.28, Turnover = 11.0, Net Assets Value Begining = 100.93, Net Assets Value End = 124.78
  2022: Total Return = -26.56%, Expense Ratio = 0.19%, Net Assets = 229.0, Net Income Ratio = 1.14, Turnover = 11.0, Net Assets Value Begining = 138.8, Net Assets Value End = 100.93
  2021: Total Return = 12.31%, Expense Ratio = 0.19%, Net Assets = 399.0, Net Income Ratio = 0.87, Turnover = 19.0, Net Assets Value Begining = 124.83, Net Assets Value End = 138.8
  2020: Total Return = 32.04%, Expense Ratio = 0.19%, Net Assets = 454.0, Net Income Ratio = 

In [4]:
import sys
%reload_ext autoreload
from src.simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
  2015 Return: $10,000.00 -> $10,125.00 = 1.25%
  2016 Return: $10,125.00 -> $11,321.00 = 11.81%
  2017 Return: $11,321.00 -> $13,774.00 = 21.67%
  2018 Return: $13,774.00 -> $13,151.00 = -4.52%
  2019 Return: $13,151.00 -> $17,271.00 = 31.33%
  2020 Return: $17,271.00 -> $20,423.00 = 18.25%
  2021 Return: $20,423.00 -> $26,250.00 = 28.53%
  2022 Return: $26,250.00 -> $21,465.00 = -18.23%
  2023 Return: $21,465.00 -> $27,069.00 = 26.11%
  2024 Return: $27,069.00 -> $33,794.00 = 24.84%

Final Annual Returns:
  VFINX: {'2015': 1.25, '2016': 11.81, '2017': 21.67, '2018': -4.52, '2019': 31.33, '2020': 18.25, '2021': 28.53, '2022': -18.23, '2023': 26.11, '2024': 24.84}
dict_keys(['2024', '2023', '2022', '2021', '2020'])
    2015: turnover=0.0 expense_ratio=0.0 total_return=1.

  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')


## Vanguard World Fund

In [5]:
import pandas as pd
from io import StringIO
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))
%reload_ext autoreload
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company

from src.simple_rag.models.fund import FilingMetadata

set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "MGK"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    target_year = "2024"
    filings2 = sorted(
        [f for f in all_filings if f.report_date and f.report_date.startswith(target_year)],
        key=lambda f: f.report_date,
        reverse=True
    )

    latest_filings.append(filings2[0])
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
df_performance = []
world_funds = set()

abort = False
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    filing_metadata = FilingMetadata(
        accession_number=filing.accession_number,
        reporting_date=filing.report_date,
        filing_date=filing.filing_date,
        form=filing.form,
        url=filing.url

    )
    for fund in funds:
        if fund.performance_table is not None:
            if fund.ticker not in performance_funds:
                performance_funds.append(fund.ticker)
                count += 1
        if fund.ticker not in world_funds:
            world_funds.add(fund.ticker)
        else:
            print("Exiting filing, repeated ticker found: ", fund.ticker)
            abort = True
            break
        fund.ncsr_metadata = filing_metadata 
    if not abort:

        df_performance.append(parser.get_financial_highlights())
        print(count)
        print("Adding funds: ", len(funds))
        funds_total.extend(funds)

print("Total world funds added: ", len(world_funds))
print(len(performance_funds))
print(performance_funds)

print(len(df_performance))


Found filings:  4 for year:  2024
Processing: Mega Cap Growth Index Fund
Extracting context:  From2024-10-01to2025-09-30_C000055216Member
Processing: Mega Cap Growth Index Fund
Extracting context:  From2024-10-01to2025-09-30_C000055215Member
Tag not found:  dei:SecurityExchangeName From2024-10-01to2025-09-30_C000055215Member
2
Adding funds:  2
Processing: Vanguard Extended Duration Treasury Index Fund
Extracting context:  FY2025_C000051981Member
Tag not found:  dei:SecurityExchangeName FY2025_C000051981Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: Vanguard Extended Duration Treasury Index Fund
Extracting context:  FY2025_C000051979Member
Tag not found:  dei:SecurityExchangeName FY2025_C000051979Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: Vanguard ESG U.S. Stock ETF
Extracting context:  FY2025_C000204567Member
Tag not found:  dei:SecurityExchangeName FY2025_C000204567Membe

In [6]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1], df_performance[2]], ignore_index=True)

returns_lookup = total_df.copy()

# Optional: Clean the total_return column (remove % sign if needed)
print(returns_lookup.head())
display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']


for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:  
            try:      
                returns_lookup[f'{col}_clean'] = (
                    returns_lookup[col]
                    .astype(str)
                    .str.replace('%', '')
                    .str.replace('$', '')
                    .str.replace(',', '')
                    .replace('N/A', '0')
                    .replace('', '0')
                    .replace('None', '0')
                    .astype(float)
                )
            except Exception as e:
                print(f"Error cleaning column '{col}': {str(e)}")
                print(returns_lookup[col].to_string())
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "™" in name:
        name = name.replace("™", "")
    elif "®" in name:
        name = name.replace("®", "")
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "™" in share_class:
        share_class = share_class.replace("™", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    elif len(share_class_matches) > 5:
        print("  More than 5 share class matches found:")
        print(share_class_matches)
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        
       
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",world_funds)
    

3
                    fund_name share_class  year  net_assets  nav_beginning  \
0  Mega Cap Growth Index Fund  ETF Shares  2025     31195.0         321.87   
1  Mega Cap Growth Index Fund  ETF Shares  2024     22954.0         314.83   
2  Mega Cap Growth Index Fund  ETF Shares  2024     21996.0         241.25   
3  Mega Cap Growth Index Fund  ETF Shares  2023     14376.0         195.20   
4  Mega Cap Growth Index Fund  ETF Shares  2022     11168.0         248.50   

   nav_end  total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0   402.45         25.58           0.07              0.42                14.0   
1   321.87          2.35           0.07              0.40                 6.0   
2   314.83         31.16           0.07              0.51                14.0   
3   241.25         24.39           0.07              0.62                 7.0   
4   195.20        -21.08           0.07              0.51                 5.0   

  distribution_shares  
0                N

array(['Mega Cap Growth Index Fund',
       'Extended Duration Treasury Index Fund', 'ESG U.S. Stock ETF',
       'ESG International Stock ETF', 'Global Wellington Fund',
       'Global Wellesley Income Fund', 'ESG U.S. Corporate Bond ETF',
       'U.S. Growth Fund', 'International Growth Fund',
       'FTSE Social Index Fund', 'Communication Services Index Fund',
       'Consumer Discretionary Index Fund', 'Consumer Staples Index Fund',
       'Energy Index Fund', 'Financials Index Fund',
       'Health Care Index Fund', 'Industrials Index Fund',
       'Information Technology Index Fund', 'Materials Index Fund',
       'Utilities Index Fund', 'Mega Cap Index Fund',
       'Mega Cap Value Index Fund'], dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - ShareClassType.INVESTOR
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VEXMX

Processing fund object: Vanguard Extended Market Index Fund - ShareClassType.ETF
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VXF

Processing fund object: Vanguard Extended Market Index Fund - ShareClassType.ADMIRAL
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VEXAX

Processing fund object: Vanguard Extended Market Index Fund - ShareClassType.INSTITUTIONAL
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VIEIX

Processing fund object: Vanguard Extended Market Index Fund - ShareClassType.INSTITUTIONAL_PLUS
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VEMPX

Processing fund object: Vanguard Extended Market Index Fund - ShareClassType.INSTITUTIONAL_SELECT
Cleaned name: 'Extended Marke

In [7]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2016 Return: $10,724.00 -> $11,409.00 = 6.39%
  2017 Return: $11,409.00 -> $14,772.00 = 29.48%
  2018 Return: $14,772.00 -> $14,349.00 = -2.86%
  2019 Return: $14,349.00 -> $19,736.00 = 37.54%
  2020 Return: $19,736.00 -> $27,826.00 = 40.99%
  2021 Return: $27,826.00 -> $35,753.00 = 28.49%
  2022 Return: $35,753.00 -> $23,755.00 = -33.56%
  2023 Return: $23,755.00 -> $36,004.00 = 51.56%
  2024 Return: $36,004.00 -> $47,873.00 = 32.97%
  2025 Return: $47,873.00 -> $56,289.00 = 17.58%

Final Annual Returns:
  MGK: {'2016': 6.39, '2017': 29.48, '2018': -2.86, '2019': 37.54, '2020': 40.99, '2021': 28.49, '2022': -33.56, '2023': 51.56, '2024': 32.97, '2025': 17.58}
dict_keys(['2025', '2024', '2023', '2022', '2021'])
    2016: turnover=0.0 expense_ratio=0.0 total_return=6.39

In [8]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup_metadata.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Successfully saved 93 funds to pickle file
File size: 403.82 KB


## Vanguard Specialized Funds

In [9]:
import pandas as pd
from io import StringIO
import sys
from pathlib import Path
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company
from src.simple_rag.models.fund import FilingMetadata

set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "VDIGX"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
specialized_funds = set()
df_performance = []
abort = False
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    filing_metadata = FilingMetadata(
        accession_number=filing.accession_number,
        reporting_date=filing.report_date,
        filing_date=filing.filing_date,
        form=filing.form,
        url=filing.url

    )
    for fund in funds:
        if fund.performance_table is not None:
            if fund.ticker not in performance_funds: 
                performance_funds.append(fund.ticker)
                count += 1
        if fund.ticker not in specialized_funds:
            specialized_funds.add(fund.ticker)
        else:
            print("Exiting filing, repeated ticker found: ", fund.ticker)
            abort = True
            break
        fund.ncsr_metadata = filing_metadata
    if abort:
        break
    df_performance.append(parser.get_financial_highlights())

    print(count)
    print("Adding funds: ", len(funds))
    funds_total.extend(funds)

print("Specialized funds: ", len(specialized_funds))
print(len(performance_funds))
print(performance_funds)
print(len(df_performance))



Found filings:  2 for year:  2025
Processing: Dividend Growth Fund
Extracting context:  From2024-02-01to2025-01-31_C000008004Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008004Member
Processing: Energy Fund
Extracting context:  From2024-02-01to2025-01-31_C000008005Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008005Member
Processing: Energy Fund
Extracting context:  From2024-02-01to2025-01-31_C000008006Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008006Member
Processing: Health Care Fund
Extracting context:  From2024-02-01to2025-01-31_C000008007Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008007Member
Processing: Health Care Fund
Extracting context:  From2024-02-01to2025-01-31_C000008008Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008008Member
Processing: Dividend Appreciation Index Fund
Extracting context:  From2024-02

In [10]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1]], ignore_index=True)

returns_lookup = total_df.copy()

# Optional: Clean the total_return column (remove % sign if needed)
print(returns_lookup.head())
display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']


for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:  
            try:      
                returns_lookup[f'{col}_clean'] = (
                    returns_lookup[col]
                    .astype(str)
                    .str.replace('%', '')
                    .str.replace('$', '')
                    .str.replace(',', '')
                    .replace('N/A', '0')
                    .replace('', '0')
                    .replace('None', '0')
                    .astype(float)
                )
            except Exception as e:
                print(f"Error cleaning column '{col}': {str(e)}")
                print(returns_lookup[col].to_string())
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    if fund_obj.ticker not in specialized_funds:
        continue
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "™" in name:
        name = name.replace("™", "")
    elif "®" in name:
        name = name.replace("®", "")
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "™" in share_class:
        share_class = share_class.replace("™", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Available share classes: {name_matches}")
        print(f"  Found {len(share_class_matches)} matching records")
        count += 1
        # Add all matching returns
        for _, row in share_class_matches.iterrows():
            year = str(row['year'])
            highlights = FinancialHighlights(
                turnover=row.get('portfolio_turnover_clean', 0),
                expense_ratio=row.get('expense_ratio_clean', 0),
                total_return=row['total_return'],
                net_assets=row.get('net_assets_clean', 0),
                net_assets_value_begining=row.get('nav_beginning_clean', 0),
                net_assets_value_end=row.get('nav_end_clean', 0),
                net_income_ratio=row.get('net_income_ratio_clean', 0.0)
            )
            
            fund_obj.financial_highlights[year] = highlights
            print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
        continue
    elif len(share_class_matches) > 5:
        print("  More than 5 share class matches found:")
        print(share_class_matches)
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",specialized_funds)
    

2
              fund_name share_class  year  net_assets  nav_beginning  nav_end  \
0  Dividend Growth Fund        None  2025     50424.0          37.76    37.14   
1  Dividend Growth Fund        None  2024     52553.0          35.42    37.76   
2  Dividend Growth Fund        None  2023     53452.0          37.85    35.42   
3  Dividend Growth Fund        None  2022     54186.0          31.82    37.85   
4  Dividend Growth Fund        None  2021     45099.0          30.63    31.82   

   total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0         10.20           0.22              1.68                16.0   
1          9.11           0.29              1.74                 9.0   
2         -0.76           0.30              1.68                11.0   
3         25.66           0.27              1.56                15.0   
4          7.03           0.26              1.85                15.0   

  distribution_shares  
0                None  
1                None  
2     

array(['Dividend Growth Fund', 'Energy Fund', 'Health Care Fund',
       'Dividend Appreciation Index Fund', 'Real Estate Index Fund',
       'Real Estate II Index Fund', 'Global Capital Cycles Fund',
       'Global ESG Select Stock Fund'], dtype=object)


Processing fund object: Dividend Growth Fund - ShareClassType.INVESTOR
Cleaned name: 'Dividend Growth Fund'
  Found 5 name matches
  No share class matches found for 'ShareClassType.INVESTOR' ticker:  VDIGX
  Available share classes:               fund_name share_class  year  net_assets  nav_beginning  nav_end  \
0  Dividend Growth Fund        None  2025     50424.0          37.76    37.14   
1  Dividend Growth Fund        None  2024     52553.0          35.42    37.76   
2  Dividend Growth Fund        None  2023     53452.0          37.85    35.42   
3  Dividend Growth Fund        None  2022     54186.0          31.82    37.85   
4  Dividend Growth Fund        None  2021     45099.0          30.63    31.82   

   total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0         10.20           0.22              1.68                16.0   
1          9.11           0.29              1.74                 9.0   
2         -0.76           0.30              1.68              

In [11]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2016 Return: $10,619.00 -> $10,874.00 = 2.40%
  2017 Return: $10,874.00 -> $12,860.00 = 18.26%
  2018 Return: $12,860.00 -> $14,143.00 = 9.98%
  2019 Return: $14,143.00 -> $16,879.00 = 19.35%
  2020 Return: $16,879.00 -> $17,545.00 = 3.95%
  2021 Return: $17,545.00 -> $23,646.00 = 34.77%
  2022 Return: $23,646.00 -> $22,953.00 = -2.93%
  2023 Return: $22,953.00 -> $23,134.00 = 0.79%
  2024 Return: $23,134.00 -> $28,121.00 = 21.56%
  2025 Return: $28,121.00 -> $28,555.00 = 1.54%

Final Annual Returns:
  VDIGX: {'2016': 2.4, '2017': 18.26, '2018': 9.98, '2019': 19.35, '2020': 3.95, '2021': 34.77, '2022': -2.93, '2023': 0.79, '2024': 21.56, '2025': 1.54}
dict_keys([])
    2016: turnover=0.0 expense_ratio=0.0 total_return=2.4 net_assets=0.0 net_assets_value_begining=0.0 ne

In [12]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup_metadata.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Successfully saved 107 funds to pickle file
File size: 461.90 KB


In [14]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path

RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup_metadata.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Loaded 107 funds from pickle file


## Vanguard Whitehall Funds

In [15]:
import pandas as pd
from io import StringIO
import sys
from pathlib import Path
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company
from src.simple_rag.models.fund import FundData

set_identity("luis.alvarez.conde@alumnos.upm.es")
ticker = "VMGRX"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
whitehall_funds = set()
df_performance = []
abort = False

for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    filing_metadata = FilingMetadata(
        accession_number=filing.accession_number,
        reporting_date=filing.report_date,
        filing_date=filing.filing_date,
        form=filing.form,
        url=filing.url

    )
    for fund in funds:
        if fund.performance_table is not None:
            if fund.ticker not in performance_funds: 
                performance_funds.append(fund.ticker)
                count += 1

        if fund.ticker not in whitehall_funds:
            whitehall_funds.add(fund.ticker)
        else:
            print("Exiting filing, repeated ticker found: ", fund.ticker)
            abort = True
            break
        fund.ncsr_metadata = filing_metadata
    if abort:
        break

    df_performance.append(parser.get_financial_highlights())

    print(count)
    print("Adding funds: ", len(funds))
    funds_total.extend(funds)

print("Whitehall funds: ", len(whitehall_funds))
print(len(performance_funds))
print(whitehall_funds)
print(len(df_performance))


Found filings:  2 for year:  2025
Processing: Mid-Cap Growth Fund
Extracting context:  From2024-11-01to2025-10-31_C000012166Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000012166Member
Processing: Selected Value Fund
Extracting context:  From2024-11-01to2025-10-31_C000012167Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000012167Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2024-11-01to2025-10-31_C000126408Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2024-11-01to2025-10-31_C000126407Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000126407Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2024-11-01to2025-10-31_C000126409Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000126409Member
Processing: Global Minimum Volatility Fund
Extracting context:  From2

In [16]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1]], ignore_index=True)

returns_lookup = total_df.copy()

# Optional: Clean the total_return column (remove % sign if needed)
print(returns_lookup.head())
returns_lookup['fund_name'] = (
    returns_lookup['fund_name']
    .str.replace('\n', ' ', regex=False)
)
display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']


for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:  
            try:      
                returns_lookup[f'{col}_clean'] = (
                    returns_lookup[col]
                    .astype(str)
                    .str.replace('%', '')
                    .str.replace('$', '')
                    .str.replace(',', '')
                    .replace('N/A', '0')
                    .replace('', '0')
                    .replace('None', '0')
                    .astype(float)
                )
            except Exception as e:
                print(f"Error cleaning column '{col}': {str(e)}")
                print(returns_lookup[col].to_string())
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    if fund_obj.ticker not in whitehall_funds:
        continue
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "™" in name:
        name = name.replace("™", "")
    elif "®" in name:
        name = name.replace("®", "")
    elif "\n" in name:
        name = name.replace("\n", " ")
        print(name)
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "™" in share_class:
        share_class = share_class.replace("™", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Found {len(name_matches)} name records")
        count += 1
        # Add all matching returns
        for _, row in name_matches.iterrows():
            year = str(row['year'])
            highlights = FinancialHighlights(
                turnover=row.get('portfolio_turnover_clean', 0),
                expense_ratio=row.get('expense_ratio_clean', 0),
                total_return=row['total_return'],
                net_assets=row.get('net_assets_clean', 0),
                net_assets_value_begining=row.get('nav_beginning_clean', 0),
                net_assets_value_end=row.get('nav_end_clean', 0),
                net_income_ratio=row.get('net_income_ratio_clean', 0.0)
            )
            
            fund_obj.financial_highlights[year] = highlights
            print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
        continue
    elif len(share_class_matches) > 5:
        print("  More than 5 share class matches found:")
        print(share_class_matches)
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",whitehall_funds)
    

2
             fund_name share_class  year  net_assets  nav_beginning  nav_end  \
0  Mid-Cap Growth Fund        None  2025      3116.0          26.21    29.55   
1  Mid-Cap Growth Fund        None  2024      3042.0          19.38    26.21   
2  Mid-Cap Growth Fund        None  2023      2530.0          19.24    19.38   
3  Mid-Cap Growth Fund        None  2022      2956.0          38.72    19.24   
4  Mid-Cap Growth Fund        None  2021      5290.0          29.89    38.72   

   total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0         14.77           0.32              0.26              1285.0   
1         35.77           0.33              0.37                69.0   
2          0.99           0.37              0.37                87.0   
3        -32.22           0.35              0.14                71.0   
4         37.68           0.33             -0.04                98.0   

  distribution_shares  
0                None  
1                None  
2           

array(['Mid-Cap Growth Fund', 'Selected Value Fund',
       'Emerging Markets Government Bond Index Fund',
       'Global Minimum Volatility Fund',
       'International Dividend Appreciation Index Fund',
       'International High Dividend Yield Index Fund',
       'International Dividend Growth Fund',
       'Advice Select International Growth Fund',
       'Advice Select Dividend Growth Fund',
       'Advice Select Global Value Fund', 'International Explorer Fund',
       'High Dividend Yield Index Fund'], dtype=object)


Processing fund object: Mid-Cap Growth Fund - ShareClassType.INVESTOR
Cleaned name: 'Mid-Cap Growth Fund'
  Found 5 name matches
  No share class matches found for 'ShareClassType.INVESTOR' ticker:  VMGRX
  Found 5 name records
  2025: Total Return = 14.77%, Expense Ratio = 0.32%, Net Assets = 3116.0, Net Income Ratio = 0.26, Turnover = 1285.0, Net Assets Value Begining = 26.21, Net Assets Value End = 29.55
  2024: Total Return = 35.77%, Expense Ratio = 0.33%, Net Assets = 3042.0, Net Income Ratio = 0.37, Turnover = 69.0, Net Assets Value Begining = 19.38, Net Assets Value End = 26.21
  2023: Total Return = 0.99%, Expense Ratio = 0.37%, Net Assets = 2530.0, Net Income Ratio = 0.37, Turnover = 87.0, Net Assets Value Begining = 19.24, Net Assets Value End = 19.38
  2022: Total Return = -32.22%, Expense Ratio = 0.35%, Net Assets = 2956.0, Net Income Ratio = 0.14, Turnover = 71.0, Net Assets Value Begining = 38.72, Net Assets Value End = 19.24
  2021: Total Return = 37.68%, Expense Ratio 

In [17]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2016 Return: $10,000.00 -> $9,451.00 = -5.49%
  2017 Return: $9,451.00 -> $11,595.00 = 22.69%
  2018 Return: $11,595.00 -> $12,710.00 = 9.62%
  2019 Return: $12,710.00 -> $14,434.00 = 13.56%
  2020 Return: $14,434.00 -> $17,079.00 = 18.32%
  2021 Return: $17,079.00 -> $23,515.00 = 37.68%
  2022 Return: $23,515.00 -> $15,939.00 = -32.22%
  2023 Return: $15,939.00 -> $16,097.00 = 0.99%
  2024 Return: $16,097.00 -> $21,856.00 = 35.78%
  2025 Return: $21,856.00 -> $25,084.00 = 14.77%

Final Annual Returns:
  VMGRX: {'2016': -5.49, '2017': 22.69, '2018': 9.62, '2019': 13.56, '2020': 18.32, '2021': 37.68, '2022': -32.22, '2023': 0.99, '2024': 35.78, '2025': 14.77}
dict_keys(['2025', '2024', '2023', '2022', '2021'])
    2016: turnover=0.0 expense_ratio=0.0 total_return=-5.49 

  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')


In [18]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup_metadata.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Successfully saved 125 funds to pickle file
File size: 552.28 KB


In [None]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/src")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Loaded 125 funds from pickle file


## Ishares

### Low RAM mode

In [19]:
import gc
from concurrent.futures import ProcessPoolExecutor, as_completed
import signal
from concurrent.futures import ProcessPoolExecutor, as_completed
import resource
import os
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
from typing import List
import sys
from pathlib import Path
from tqdm import tqdm
%reload_ext autoreload
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company
from src.simple_rag.models.fund import FilingMetadata
set_identity("luis.alvarez.conde@alumnos.upm.es")

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException("HTML parsing timed out")

# --- 1. UPDATED WORKER (Remove the memory limit code) ---
def process_single_filing_multiprocess(filing_data):
    """
    Process a single filing. 
    REMOVED: resource.setrlimit (Let the OS manage memory)
    """
    try:
        # Unpack data
        html_content, report_date, accession_number, filing_date, form, url = filing_data
        
        # Import inside function
        from src.simple_rag.extraction.parser import BlackRockFiling
        from src.simple_rag.models.fund import FilingMetadata
        
        parser = BlackRockFiling(html_content)
        funds = parser.get_funds()
        
        performance_tickers = []
        df_performance = None        
        
        filing_metadata = FilingMetadata(
            accession_number=accession_number,
            reporting_date=report_date,
            filing_date=filing_date,
            form=form,
            url=url
        )
        
        all_tickers = []
        processed_funds = []
        
        for fund in funds:
            fund.ncsr_metadata = filing_metadata
            processed_funds.append(fund)
            all_tickers.append(fund.ticker)
            
            if fund.performance_table is not None:
                if fund.ticker not in performance_tickers:
                    performance_tickers.append(fund.ticker)
        
        try:
            df_performance = parser.get_financial_highlights2()
        except Exception as e:
            pass # Suppress minor parsing errors
        
        return {
            'funds': processed_funds,
            'all_tickers': all_tickers,
            'performance_tickers': performance_tickers,
            'df_performance': df_performance,
            'report_date': report_date
        }

    except Exception as e:
        print(f"❌ Error in worker: {e}")
        return None

# --- 2. NEW BATCHED MAIN LOOP ---

# Configuration
BATCH_SIZE = 5  # Process only 5 filings at a time to save RAM
MAX_WORKERS = 8 # Keep this low (2-4) to prevent CPU/RAM saturation


ticker = "HEZU"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")

if all_filings:
    unique_dates = sorted({f.report_date for f in all_filings if f.report_date})
    print("Unique report dates:", unique_dates)
    
    # Filter for filings from 2024-08-31 onward
    cutoff_date = "2024-09-31"
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date >= cutoff_date
    ]
    
    print("Found filings: ", len(latest_filings), "from", cutoff_date, "onward")
    
    # Optional: Show the dates of filtered filings
    print("Filtered filing dates:", sorted({f.report_date for f in latest_filings}))

# Global results containers
all_funds_total = []

performance_funds = []
df_performances = []
ishares_funds = []

# Helper to chunk the list
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

print(f"🚀 Starting processing with Batch Size: {BATCH_SIZE} | Workers: {MAX_WORKERS}")

# Iterate through filings in small groups
for batch_index, filing_batch in enumerate(chunker(latest_filings, BATCH_SIZE)):
    
    print(f"\n📦 Processing Batch {batch_index + 1} ({len(filing_batch)} filings)...")
    
    # 1. Prepare Data for this Batch ONLY
    batch_data = []
    
    for filing in filing_batch:
        try:
            # Your existing timeout logic
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(10) # 10s timeout
            html_content = filing.html()
            signal.alarm(0)
            
            if html_content:
                batch_data.append((
                    html_content, 
                    filing.report_date,
                    filing.accession_number,
                    filing.filing_date,
                    filing.form,
                    filing.url
                ))
        except Exception as e:
            signal.alarm(0)
            print(f"   ⚠️ Skipped {filing.report_date}: {e}")

    # 2. Process this Batch Immediately
    if batch_data:
        with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_data = {executor.submit(process_single_filing_multiprocess, d): d for d in batch_data}
            
            for future in as_completed(future_to_data):
                res = future.result()
                if res:
                    # Aggregate results
                    funds_total.extend(res['funds'])
                    ishares_funds.extend(res['all_tickers'])
                    performance_funds.extend(res['performance_tickers'])
                    if res['df_performance'] is not None:
                        df_performances.append(res['df_performance'])
    
    # 3. CRITICAL: Clear memory
    print(f"   🧹 Cleaning up batch memory...")
    del batch_data
    gc.collect() # Force Python to release RAM

print(f"\n✅ DONE!")
print(f"Total funds: {len(funds_total)}")
print(f"Performance Tables: {len(df_performances)}")

Unique report dates: ['2003-04-30', '2003-07-31', '2004-02-29', '2004-03-31', '2004-04-30', '2004-07-31', '2005-02-28', '2005-03-31', '2005-04-30', '2005-07-31', '2006-02-28', '2006-03-31', '2006-04-30', '2006-07-31', '2007-02-28', '2007-03-31', '2007-04-30', '2007-07-31', '2008-02-29', '2008-03-31', '2008-04-30', '2008-07-31', '2009-02-28', '2009-03-31', '2009-04-30', '2009-07-31', '2009-08-31', '2010-02-28', '2010-03-31', '2010-04-30', '2010-07-31', '2010-08-31', '2011-02-28', '2011-03-31', '2011-04-30', '2011-07-31', '2011-08-31', '2011-10-31', '2012-02-29', '2012-03-31', '2012-04-30', '2012-07-31', '2012-08-31', '2012-10-31', '2013-02-28', '2013-03-31', '2013-04-30', '2013-07-31', '2013-08-31', '2013-10-31', '2014-02-28', '2014-03-31', '2014-04-30', '2014-07-31', '2014-08-31', '2014-10-31', '2015-02-28', '2015-03-31', '2015-04-30', '2015-07-31', '2015-08-31', '2015-10-31', '2016-02-29', '2016-03-31', '2016-04-30', '2016-07-31', '2016-08-31', '2016-10-31', '2017-02-28', '2017-03-31'

In [30]:
for fund in funds_total:
    print(fund.name)
    print(fund.ncsr_metadata)
    break

Vanguard Extended Market Index Fund
accession_number='0001104659-25-020311' filing_date=datetime.date(2025, 3, 4) reporting_date=datetime.date(2024, 12, 31) url='https://www.sec.gov/Archives/edgar/data/36405/0001104659-25-020311-index.html' form='N-CSR'


In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import resource
import os
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
from typing import List
import sys
from pathlib import Path
from tqdm import tqdm
%reload_ext autoreload
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company
from src.simple_rag.models.fund import FilingMetadata
set_identity("luis.alvarez.conde@alumnos.upm.es")

def process_single_filing_multiprocess(filing_data):
    """Process a single filing with memory limits"""
    try:
        # Limit memory to 1.5GB per process
        soft, hard = resource.getrlimit(resource.RLIMIT_AS)
        resource.setrlimit(resource.RLIMIT_AS, (1500 * 1024 * 1024, hard))
        
        # Your existing code here...
        html_content, report_date, accession_number, filing_date, form, url = filing_data
        
        import sys
        from pathlib import Path
        sys.path.append('../src')
        from src.simple_rag.extraction.parser import BlackRockFiling
        from src.simple_rag.models.fund import FilingMetadata
        
        parser = BlackRockFiling(html_content)
        funds = parser.get_funds()
        
        performance_funds = []
        df_performance = None        
        count = 0
        
        filing_metadata = FilingMetadata(
            accession_number=accession_number,
            reporting_date=report_date,
            filing_date=filing_date,
            form=form,
            url=url
        )
        
        all_tickers = []  # Fix: collect all tickers
        
        for fund in funds:
            if fund.performance_table is not None:
                if fund.ticker not in performance_funds:
                    performance_funds.append(fund.ticker)
                    count += 1
            
            all_tickers.append(fund.ticker)  # Fix: collect here instead
            fund.ncsr_metadata = filing_metadata
        
        print(f"Calling get_financial_highlights2 for {report_date}")
        df_performance = parser.get_financial_highlights2()
        
        print(f"Filing {report_date}: Found {count} funds with performance tables, Total funds: {len(funds)}")
        
        return {
            'funds': funds,
            'all_tickers': all_tickers,  # Fix: return all tickers
            'performance_tickers': performance_funds,
            'df_performance': df_performance,
            'report_date': report_date
        }
    except MemoryError:
        print(f"❌ Memory limit exceeded for filing {filing_data[1]}")
        return None
    except Exception as e:
        print(f"Error processing filing: {e}")
        return None

# Calculate safe worker count based on available RAM
import psutil
available_ram_gb = psutil.virtual_memory().available / (1024**3)
# Assume each worker needs ~2GB, leave 2GB for system
max_workers = max(1, min(4, int((available_ram_gb - 2) / 2)))
max_workers = 1
print(f"Using {max_workers} workers based on {available_ram_gb:.1f}GB available RAM")

ticker = "HEZU"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")

if all_filings:
    unique_dates = sorted({f.report_date for f in all_filings if f.report_date})
    print("Unique report dates:", unique_dates)
    
    # Filter for filings from 2024-08-31 onward
    cutoff_date = "2024-09-31"
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date >= cutoff_date
    ]
    
    print("Found filings: ", len(latest_filings), "from", cutoff_date, "onward")
    
    # Optional: Show the dates of filtered filings
    print("Filtered filing dates:", sorted({f.report_date for f in latest_filings}))


# Prepare data for multiprocessing (fetch HTML first)
filing_data_list = []
failed_filings = []
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException("HTML parsing timed out")

sgml_filings = []

for filing in latest_filings:
    try:
        # Set a 10-second timeout for HTML fetching
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(10)
        
        html_content = filing.html()
        
        # Cancel the alarm
        signal.alarm(0)
        
        if html_content:  # Only add if HTML content exists
            filing_data_list.append((
                html_content, 
                filing.report_date,
                filing.accession_number,
                filing.filing_date,
                filing.form,
                filing.url
            ))
        else:
            print(f"⚠️  No HTML content for filing: {filing.report_date}")
            failed_filings.append(filing)
            
    except TimeoutException:
        signal.alarm(0)
        print(f"⏭️  Timeout (likely SGML): {filing.report_date}")
        sgml_filings.append(filing)
        
    except ValueError as e:
        signal.alarm(0)
        print(f"❌ Error processing filing {filing.report_date}: {e}")
        failed_filings.append(filing)
        
    except Exception as e:
        signal.alarm(0)
        print(f"❌ Unexpected error for filing {filing.report_date}: {e}")
        failed_filings.append(filing)

print(f"✅ Successfully prepared {len(filing_data_list)} filings")
print(f"⏭️  Skipped {len(sgml_filings)} SGML/timeout filings")
print(f"❌ Failed to prepare {len(failed_filings)} filings")
# Continue with successful filings only

performance_funds = []
df_performances = []
funds_total = []  # Make sure this is defined

# Process with limited workers
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    future_to_data = {executor.submit(process_single_filing_multiprocess, data): data 
                      for data in filing_data_list}
    
    for future in tqdm(as_completed(future_to_data), total=len(filing_data_list), desc="Processing filings"):
        result = future.result()
        if result:
            funds_total.extend(result['funds'])
            ishares_funds.extend(result['all_tickers'])  # Fix: use returned tickers
            performance_funds.extend(result['performance_tickers'])
            
            if result['df_performance'] is not None:
                df_performances.append(result['df_performance'])

print(f"\nTotal funds processed: {len(funds_total)}")
print(f"Unique iShares funds: {len(set(ishares_funds))}")
print(f"Performance dataframes: {len(df_performances)}")

  from .autonotebook import tqdm as notebook_tqdm


Using 1 workers based on 5.9GB available RAM


KeyboardInterrupt: 

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
from typing import List
import sys
from pathlib import Path
from tqdm import tqdm
%reload_ext autoreload
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company
from src.simple_rag.models.fund import FilingMetadata
set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "HEZU"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")

def process_single_filing_multiprocess(filing_data):
    """
    Process a single filing (for multiprocessing).
    Note: Must pass serializable data, not the filing object directly
    """
    try:
        # Import inside function for multiprocessing
        import sys
        from pathlib import Path
        sys.path.append('../src')
        from src.simple_rag.extraction.parser import BlackRockFiling
        
        html_content, report_date, accession_number, filing_date, form, url = filing_data
        parser = BlackRockFiling(html_content)
        funds = parser.get_funds()
        
        performance_funds = []
        df_performance = None        
        count = 0
        filing_metadata = FilingMetadata(
        accession_number=accession_number,
        reporting_date=report_date,
        filing_date=filing_date,
        form=form,
        url=url

    )
        for fund in funds:

            if fund.performance_table is not None:
                if fund.ticker not in performance_funds:
                    performance_funds.append(fund.ticker)
                    count += 1
            if fund.ticker not in ishares_funds:
                ishares_funds.append(fund.ticker)
            fund.ncsr_metadata = filing_metadata
            print("Calling get_financial_highlights2")
        df_performance = parser.get_financial_highlights2()
        
        print(f"Filing {report_date}: Found {count} funds with performance tables, Total funds: {len(funds)}")
        
        return {
            'funds': funds,
            'performance_tickers': performance_funds,
            'df_performance': df_performance,
            'report_date': report_date
        }
    except Exception as e:
        print(f"Error processing filing: {e}")
        return None

if all_filings:
    unique_dates = sorted({f.report_date for f in all_filings if f.report_date})
    print("Unique report dates:", unique_dates)
    
    # Filter for filings from 2024-08-31 onward
    cutoff_date = "2024-09-31"
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date >= cutoff_date
    ]
    
    print("Found filings: ", len(latest_filings), "from", cutoff_date, "onward")
    
    # Optional: Show the dates of filtered filings
    print("Filtered filing dates:", sorted({f.report_date for f in latest_filings}))

# Prepare data for multiprocessing (fetch HTML first)
filing_data_list = []
failed_filings = []
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException("HTML parsing timed out")

sgml_filings = []

for filing in latest_filings:
    try:
        # Set a 10-second timeout for HTML fetching
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(10)
        
        html_content = filing.html()
        
        # Cancel the alarm
        signal.alarm(0)
        
        if html_content:  # Only add if HTML content exists
            filing_data_list.append((
                html_content, 
                filing.report_date,
                filing.accession_number,
                filing.filing_date,
                filing.form,
                filing.url
            ))
        else:
            print(f"⚠️  No HTML content for filing: {filing.report_date}")
            failed_filings.append(filing)
            
    except TimeoutException:
        signal.alarm(0)
        print(f"⏭️  Timeout (likely SGML): {filing.report_date}")
        sgml_filings.append(filing)
        
    except ValueError as e:
        signal.alarm(0)
        print(f"❌ Error processing filing {filing.report_date}: {e}")
        failed_filings.append(filing)
        
    except Exception as e:
        signal.alarm(0)
        print(f"❌ Unexpected error for filing {filing.report_date}: {e}")
        failed_filings.append(filing)

print(f"✅ Successfully prepared {len(filing_data_list)} filings")
print(f"⏭️  Skipped {len(sgml_filings)} SGML/timeout filings")
print(f"❌ Failed to prepare {len(failed_filings)} filings")
# Continue with successful filings only

performance_funds = []
df_performances = []
ishares_funds = []

# Use ProcessPoolExecutor
with ProcessPoolExecutor(max_workers=2) as executor:
    future_to_data = {executor.submit(process_single_filing_multiprocess, data): data 
                      for data in filing_data_list}
    
    for future in tqdm(as_completed(future_to_data), total=len(filing_data_list), desc="Processing filings"):
        result = future.result()
        if result:
            ishares_funds.extend(result['funds'])
            funds_total.extend(result['funds'])
            performance_funds.extend(result['performance_tickers'])
            
            if result['df_performance'] is not None:
                df_performances.append(result['df_performance'])

print(len(df_performances))
print(f"Total funds processed: {len(ishares_funds)}")


Unique report dates: ['2003-04-30', '2003-07-31', '2004-02-29', '2004-03-31', '2004-04-30', '2004-07-31', '2005-02-28', '2005-03-31', '2005-04-30', '2005-07-31', '2006-02-28', '2006-03-31', '2006-04-30', '2006-07-31', '2007-02-28', '2007-03-31', '2007-04-30', '2007-07-31', '2008-02-29', '2008-03-31', '2008-04-30', '2008-07-31', '2009-02-28', '2009-03-31', '2009-04-30', '2009-07-31', '2009-08-31', '2010-02-28', '2010-03-31', '2010-04-30', '2010-07-31', '2010-08-31', '2011-02-28', '2011-03-31', '2011-04-30', '2011-07-31', '2011-08-31', '2011-10-31', '2012-02-29', '2012-03-31', '2012-04-30', '2012-07-31', '2012-08-31', '2012-10-31', '2013-02-28', '2013-03-31', '2013-04-30', '2013-07-31', '2013-08-31', '2013-10-31', '2014-02-28', '2014-03-31', '2014-04-30', '2014-07-31', '2014-08-31', '2014-10-31', '2015-02-28', '2015-03-31', '2015-04-30', '2015-07-31', '2015-08-31', '2015-10-31', '2016-02-29', '2016-03-31', '2016-04-30', '2016-07-31', '2016-08-31', '2016-10-31', '2017-02-28', '2017-03-31'

Processing filings:   0%|          | 0/22 [00:00<?, ?it/s]

Processing: iShares iBonds 1-5 Year Corporate Ladder ETF
Extracting context:  From2024-11-01to2025-10-31_C000254885Member
Tag not found:  oef:ClassName From2024-11-01to2025-10-31_C000254885Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares iBonds 1-5 Year High Yield and Income Ladder ETF
Extracting context:  From2024-11-01to2025-10-31_C000254886Member
Tag not found:  oef:ClassName From2024-11-01to2025-10-31_C000254886Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares iBonds 1-5 Year TIPS Ladder ETF
Extracting context:  From2024-11-01to2025-10-31_C000254887Member
Tag not found:  oef:ClassName From2024-11-01to2

Processing filings:   5%|▍         | 1/22 [00:01<00:39,  1.90s/it]

Processing: iShares Climate Conscious & Transition MSCI USA ETF
Extracting context:  From2024-09-01to2025-08-31_C000242847Member
Tag not found:  oef:ClassName From2024-09-01to2025-08-31_C000242847Member
Processing: iShares ESG Advanced MSCI USA ETF
Extracting context:  From2024-09-01to2025-08-31_C000219749Member
Tag not found:  oef:ClassName From2024-09-01to2025-08-31_C000219749Member
Processing: iShares ESG Aware MSCI USA ETF
Extracting context:  From2024-09-01to2025-08-31_C000174221Member
Tag not found:  oef:ClassName From2024-09-01to2025-08-31_C000174221Member
Processing: iShares ESG Aware MSCI USA Growth ETF
Extracting context:  From2024-09-01to2025-08-31_C000240124Member
Tag not found:  oef:ClassName From2024-09-01to2025-08-31_C000240124Member
Processing: iShares ESG Aware MSCI USA Small-Cap ETF
Extracting context:  From2024-09-01to2025-08-31_C000199548Member
Tag not found:  oef:ClassName From2024-09-01to2025-08-31_C000199548Member
Processing: iShares ESG Aware MSCI USA Value ETF


Processing filings:   9%|▉         | 2/22 [00:17<03:22, 10.13s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000244562Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​#  Ten largest countries/geographic regions are p...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​#  Ten largest countries/geographic regions are p...
Processing: iShares iBonds 2028 Term High Yield and Income ETF
Extracting context:  From2024-11-01to2025-10-31_C000234589Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Failed to extract tables from block:  oef:AvgAnnlRtrTableTextBlock
Tables not found.
Processing: iShares MSCI Emerging Markets Value Factor ETF
Extracting context:  FY2025_C000244564Member
Tag not found:  dei:SecurityExchangeName FY2025_C000244564Member
Unknown Table:        0                                                  1
0  ​

Processing filings:  14%|█▎        | 3/22 [00:37<04:33, 14.38s/it]

Tag not found:  oef:ClassName From2024-11-01to2025-10-31_C000210857Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares 0-5 Year Investment Grade Corporate Bond ETF
Extracting context:  FY2025_C000131292Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Processing: iShares Large Cap Accelerated Outcome ETF
Extracting context:  FY2025_C000256144Member
Tag not found:  dei:SecurityExchangeName FY2025_C000256144Member
Unknown Table:        0                                                 1
0  ​(a)  The underlying fund is iShares Core S&P 500 ETF.
1  ​(b)                      Excludes money ma

Processing filings:  18%|█▊        | 4/22 [00:39<02:55,  9.74s/it]

Unknown Table:        Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Unknown table type:       Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Processing: iShares iBonds Dec 2029 Term Muni Bond ETF
Extracting context:  From2024-11-01to2025-10-31_C000242692Member
Processing: iShares Breakthrough Environmental Solutions ETF
Extracting context:  FY2025_C000240517Member
Unknown Table:        0                             1
0  ​(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ​(a)  Excludes money market funds.
Tag not found:  oef:ClassName From2024-11-01to2025-10-31_C000242692Member
Tag not found:  dei:SecurityExchangeName FY2025_C000236812Member
Unknown Table: 

Processing filings:  23%|██▎       | 5/22 [00:47<02:32,  8.97s/it]

Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares U.S. Small-Cap Equity Factor ETF
Extracting context:  From2024-08-01to2025-07-31_C000153272Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares Core MSCI International Developed Markets ETF
Extracting context:  FY2025_C000179059Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C000153272Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares U.S. Tech Breakthrough Multisector ETF
Extracting context:  From2024-08-01to2025-07-31_C000216288Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_

Processing filings:  27%|██▋       | 6/22 [00:49<01:44,  6.54s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000194633Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1  ​(b)                          Rounds to less than 0.1%.
2    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1  ​(b)                          Rounds to less than 0.1%.
2    ​*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares Core MSCI Pacific ETF
Extracting context:  FY2025_C000140338Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares CMBS ETF
Extracting context:  FY2025_C000110079Member
Tag not found:  dei:SecurityExchangeName FY2025_C000140338Member
Unknown Table:        0          

Processing filings:  32%|███▏      | 7/22 [01:15<03:13, 12.91s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000149539Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​#  Ten largest countries/geographic regions are p...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​#  Ten largest countries/geographic regions are p...
Tag not found:  dei:SecurityExchangeName FY2025_C000249959Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block: Failed to extract tables from block:   oef:LineGraphT

Processing filings:  36%|███▋      | 8/22 [01:18<02:17,  9.82s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000152180Member
Tag not found:  dei:SecurityExchangeName FY2025_C000148919Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares iBonds Dec 2026 Term Corporate ETF
Extracting context:  FY2025_C000173141Member
Unknown Table:        0                             1
0  ​(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ​(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares MSCI

Processing filings:  41%|████      | 9/22 [01:26<02:01,  9.37s/it]

Tag not found:  dei:SecurityExchangeName FY2024_C000236812Member
Unknown Table:        0                                                  1
0  ​(a)  The underlying fund is iShares 20+ Year Treasu...
1  ​(b)                       Excludes money market funds.
Unknown table type:       0                                                  1
0  ​(a)  The underlying fund is iShares 20+ Year Treasu...
1  ​(b)                       Excludes money market funds.
Tag not found:  dei:SecurityExchangeName FY2025_C000204503Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares MSCI EAFE Small-Cap ETF
Extracting context:  FY2025_C000052896Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Exclu

Processing filings:  45%|████▌     | 10/22 [01:49<02:40, 13.36s/it]

Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000012051Member
Unknown Table:        0                             1
0  ​(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ​(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares High Yield Corporate Bond BuyWrite Strategy ETF
Extracting context:  FY2024_C000236814Member
Tag not found:  oef:FactorsAffectingPerfTextBlock From2024-05-01to2025-04-30_C000012051Member
Tag not found:  dei:SecurityExchangeName FY2025_C000237776Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
2    ​#  Ten largest countries/geographic regions are p...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market

Processing filings:  50%|█████     | 11/22 [01:54<02:01, 11.03s/it]

Processing: iShares U.S. Financials ETF
Extracting context:  From2024-05-01to2025-04-30_C000012053Member
Tag not found:  dei:SecurityExchangeName FY2025_C000247831Member
Unknown Table:        0                                                 1
0  ​(a)  The underlying fund is iShares Russell 2000 ETF.
1  ​(b)                      Excludes money market funds.
Unknown table type:       0                                                 1
0  ​(a)  The underlying fund is iShares Russell 2000 ETF.
1  ​(b)                      Excludes money market funds.
Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000012053Member
Tag not found:  oef:FactorsAffectingPerfTextBlock From2024-05-01to2025-04-30_C000012053Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares S&P 500 BuyWrite ETF
Extracting context:  FY2025_C000247832Member
Tag not found:  dei:SecurityExchangeName FY2024_C000211404Member
Unknown Table:        0                    

Processing filings:  55%|█████▍    | 12/22 [02:09<02:00, 12.08s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000012089Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares Global Financials ETF
Extracting context:  FY2025_C000012100Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012100Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares Global Healthcare ETF
Extracting context:  FY2025_C000012102Member
Processing: iShares U.S. Utilities ETF
Extracting context:  From2024-05-01to2025-04-30_C000012062Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012102Member
Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000012062Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares Global Industrials ETF
Extracting context:  FY2025_C000024196Member
No data obtained
Processing: iShares iBonds Dec 2024 Term Corporate ETF
Extracting context:  FY2024_C000152179Mem

Processing filings:  59%|█████▉    | 13/22 [02:13<01:28,  9.80s/it]

Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares Global Timber & Forestry ETF
Extracting context:  FY2025_C000065076Member
Unknown Table:        0                             1
0  ​(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ​(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares Global Utilities ETF
Extracting context:  FY2025_C000024197Member
Tag not found:  dei:SecurityExchangeName FY2025_C000024197Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares India 50 ETF
Extracting context:  FY2025_C000064225Member
Unknown Table:        0                             1
0  ​(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ​(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGra

Processing filings:  64%|██████▎   | 14/22 [02:20<01:10,  8.82s/it]

Tag not found:  dei:SecurityExchangeName FY2024_C000191091Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares iBonds Dec 2028 Term Corporate ETF
Extracting context:  FY2024_C000204503Member
Processing: iShares Mortgage Real Estate ETF
Extracting context:  FY2025_C000042587Member
Tag not found:  dei:SecurityExchangeName FY2025_C000042587Member
Unknown Table:        0                             1
0  ​(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ​(a)  Excludes money market funds.
Failed to extract 

Processing filings:  68%|██████▊   | 15/22 [02:38<01:21, 11.70s/it]

Tag not found:  oef:ClassName From2024-04-01to2025-03-31_C000012086Member
Tag not found:  dei:SecurityExchangeName FY2024_C000228040Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
Tables not found.
Processing: iShares iBonds Dec 2034 Term Corporate ETF
Extracting context:  FY2024_C000249961Member
Processing: iShares Russell 2000 ETF
Extracting context:  From2024-04-0

Processing filings:  73%|███████▎  | 16/22 [03:03<01:34, 15.71s/it]

Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Found 3 potential Financial Highlights sections
Total funds extracted: 0
Filing 2025-03-31: Found 18 funds with performance tables, Total funds: 21


Processing filings:  77%|███████▋  | 17/22 [03:04<00:55, 11.14s/it]

No data obtained
Processing: iShares iBonds Dec 2054 Term Treasury ETF
Extracting context:  FY2024_C000250199Member
Tag not found:  oef:FactorsAffectingPerfTextBlock FY2024_C000250199Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
No data obtained
Processing: iShares 0-3 Month Treasury Bond ETF
Extracting context:  From2024-03-01to2025-02-28_C000219740Member
Tag not found:  oef:ClassName From2024-03-01to2025-02-28_C000219740Member
No data obtained
Processing: iShares iBonds Oct 2034 Term TIPS ETF
Extracting context:  FY2024_C000249962Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds

Processing filings:  82%|████████▏ | 18/22 [03:13<00:41, 10.37s/it]

Tag not found:  dei:SecurityExchangeName FY2024_C000249962Member
Tag not found:  oef:FactorsAffectingPerfTextBlock FY2024_C000249962Member
Unknown Table:        0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ​(a)                       Excludes money market funds.
1    ​*  Credit quality ratings shown reflect the ratin...


In [20]:
import pandas as pd
import re
%reload_ext autoreload
from simple_rag.models.fund import FinancialHighlights

if df_performances:
    df_performance = pd.concat(df_performances, ignore_index=True)
else:
    df_performance = pd.DataFrame() # Empty fallback
    print("No performance data found.")

print(df_performance.head())

def clean_financial_number(val):
    """
    Parses financial strings like '23.19 %(b)' or '(24.82 )%'.
    - Extracts the numerical value.
    - Handles (12.34) as negative -12.34.
    - Ignores footnote markers like (a), (b).
    - Removes %, $, and commas.
    """
    if pd.isna(val) or val is None:
        return None
    
    # Convert to string and strip whitespace
    s = str(val).strip()
    
    # 1. Regex to find the number (handles decimals and commas)
    # Looks for digits, optional commas, and optional decimal part
    match = re.search(r'(\d{1,3}(?:,\d{3})*\.?\d*|\d*\.?\d+)', s)
    
    if not match:
        return None
        
    # Get the raw number string (e.g., "24.82" or "1,234.56")
    num_str = match.group(0)
    
    # 2. Check for negative indication: "(" at the start of the string
    # Accounting format always puts the negative parenthesis at the start: (24.82)%
    is_negative = s.startswith('(')
    
    try:
        # Remove commas and convert to float
        clean_num = float(num_str.replace(',', ''))
        
        # Apply negative sign if detected
        return -clean_num if is_negative else clean_num
        
    except ValueError:
        return None
returns_lookup = df_performance.copy()


# Apply to all financial columns
financial_cols = ['total_return', 'expense_ratio', 'net_income_ratio', 'portfolio_turnover', 'nav_end', 'nav_beginning', 'net_assets']
for col in financial_cols:
    if col in returns_lookup.columns:
        returns_lookup[f'{col}_clean'] = returns_lookup[col].apply(clean_financial_number)

ishares_tickers = [fund_obj for fund_obj in ishares_funds]
print("Tickers in ishares_funds:", ishares_tickers)


# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    
    if fund_obj.ticker not in ishares_tickers:
        continue

    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}

    if not hasattr(fund_obj, 'financial_highlights') or fund_obj.financial_highlights is None:
        fund_obj.financial_highlights = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "®" in name:
        name = name.replace("®", "")
    if "™" in name:
        name = name.replace("™", "")
        
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.contains(name, case=False, na=False, regex=False)]
    
    if len(name_matches) == 0:
        print("  No name matches found")
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    fund_obj.share_class = "ETF Shares"
    share_class = fund_obj.share_class
    
    if "™" in share_class:
        share_class = share_class.replace("™", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if name_matches['share_class'].isna().all():
        fund_obj.annual_returns = dict(zip(name_matches['year'], name_matches['total_return_clean']))
        print("Annual return: ", fund_obj.annual_returns)
        continue
        
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}'")
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    print(f"  Found {len(share_class_matches)} matching records")
    
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        
        # Store annual return
        fund_obj.annual_returns[year] = row['total_return_clean']
        
        # Store full financial highlights snapshot
        fund_obj.financial_highlights[year] = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean'),
            expense_ratio=row.get('expense_ratio_clean'),
            total_return=row.get('total_return_clean'),
            net_assets=row.get('net_assets'),  # You may need to add this cleaning
            net_assets_value_begining=row.get('nav_beginning_clean'),
            net_assets_value_end=row.get('nav_end_clean') ,
            net_income_ratio=row.get('net_income_ratio_clean')
        )
    
    print(f"  Annual returns: {fund_obj.annual_returns}")
    print(f"  Financial highlights years: {list(fund_obj.financial_highlights.keys())}")
    for key, value in fund_obj.financial_highlights.items():
        print(f"    {key}: {value}")

                            fund_name share_class  year   net_assets  \
0  iShares ESG Advanced MSCI EAFE ETF  ETF Shares  2025  821452000.0   
1  iShares ESG Advanced MSCI EAFE ETF  ETF Shares  2024  708068000.0   
2  iShares ESG Advanced MSCI EAFE ETF  ETF Shares  2023  475876000.0   
3  iShares ESG Advanced MSCI EAFE ETF  ETF Shares  2022  354432000.0   
4  iShares ESG Advanced MSCI EAFE ETF  ETF Shares  2021  245846000.0   

   nav_beginning  nav_end  total_return  expense_ratio  net_income_ratio  \
0          71.52    74.00          6.47           0.12              2.43   
1          59.48    71.52         23.19           0.12              2.39   
2          51.37    59.48         18.17           0.12              2.41   
3          70.24    51.37          0.00           0.12              2.73   
4          55.79    70.24         27.47           0.12              2.06   

   portfolio_turnover distribution_shares  
0                14.0                None  
1                24.0 

In [21]:
import re
from collections import defaultdict
import pandas as pd

def infer_first_col_format(value: object) -> str:
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return "EMPTY"

    s = str(value).strip()
    if s == "" or s.lower() == "nan":
        return "EMPTY"

    # Jan 23, Aug 15
    if re.match(r"^[A-Za-z]{3}\s+\d{2}$", s):
        return "MON_YY"

    # 2015
    if re.match(r"^\d{4}$", s):
        return "YYYY"

    # 2024-08-31
    if re.match(r"^\d{4}-\d{2}-\d{2}$", s):
        return "YYYY_MM_DD"

    # 08/31/24 or 8/31/2024
    if re.match(r"^\d{1,2}/\d{1,2}/\d{2,4}$", s):
        return "MM_DD_YY(YY)"

    # 31/08/24 (if you ever have EU style)
    if re.match(r"^\d{1,2}-\d{1,2}-\d{2,4}$", s):
        return "DD_MM_YY(YY)_or_MM_DD_YY(YY)_DASH"

    # Fallbacks
    if re.search(r"\d", s):
        return "OTHER_HAS_DIGITS"

    return "OTHER_TEXT"


def describe_first_column_formats(
    dfs,
    names=None,
    samples_per_df=3,
    max_groups_to_show=50,
    max_dfs_per_group_to_print=5,
):
    if names is None:
        names = [f"df[{i}]" for i in range(len(dfs))]

    groups = defaultdict(list)

    for name, df in zip(names, dfs):
        if df is None or not isinstance(df, pd.DataFrame) or df.empty:
            groups["EMPTY_DF"].append((name, df))
            continue

        first_col = df.columns[0]
        # take first non-empty sample from first column
        series = df[first_col].astype(str)
        sample_vals = [v for v in series.head(20).tolist() if str(v).strip() and str(v).lower() != "nan"]

        fmt = infer_first_col_format(sample_vals[0]) if sample_vals else "EMPTY_FIRST_COL"
        groups[fmt].append((name, df))

    sorted_groups = sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)

    print(f"Total dataframes: {len(dfs)}")
    print(f"Unique first-column formats: {len(sorted_groups)}\n")

    for gi, (fmt, members) in enumerate(sorted_groups[:max_groups_to_show], start=1):
        print("=" * 100)
        print(f"Group #{gi}: {fmt}")
        print(f"Count: {len(members)}")

        example_shapes = [m[1].shape for m in members if isinstance(m[1], pd.DataFrame)]
        print(f"Example shapes (first 10): {example_shapes[:10]}")

        # Print a few examples per group
        for ex_i, (name, df) in enumerate(members[:max_dfs_per_group_to_print], start=1):
            if df is None or not isinstance(df, pd.DataFrame) or df.empty:
                print(f"  [Example {ex_i}] {name}: EMPTY/None")
                continue

            first_col = df.columns[0]
            vals = [v for v in df[first_col].head(20).tolist() if str(v).strip() and str(v).lower() != "nan"]
            vals = vals[:samples_per_df]

            print(f"  [Example {ex_i}] {name}")
            print(f"    first_col: {first_col!r}")
            print(f"    columns: {list(df.columns)[:12]}{' ...' if len(df.columns) > 12 else ''}")
            print(f"    first_col_samples: {vals}")

        print()


# Example usage with your list of performance tables
performances = []
perf_names = []
for i, fund in enumerate(funds_total):
    if fund.ticker in performance_funds and fund.performance_table is not None:
        performances.append(fund.performance_table)
        perf_names.append(f"{fund.ticker} | {fund.name} | {fund.share_class}")

describe_first_column_formats(performances, names=perf_names)

Total dataframes: 105
Unique first-column formats: 2

Group #1: MON_YY
Count: 104
Example shapes (first 10): [(28, 4), (64, 4), (106, 4), (32, 4), (90, 4), (32, 4), (77, 4), (44, 4), (121, 4), (121, 4)]
  [Example 1] USCL | iShares Climate Conscious & Transition MSCI USA ETF | ShareClassType.OTHER
    first_col: 'Unnamed: 0'
    columns: ['Unnamed: 0', 'Fund', 'MSCI USA Index', 'MSCI USA Extended Climate Action Index']
    first_col_samples: ['May 23', 'Jun 23', 'Jul 23']
  [Example 2] USXF | iShares ESG Advanced MSCI USA ETF | ShareClassType.OTHER
    first_col: 'Unnamed: 0'
    columns: ['Unnamed: 0', 'Fund', 'MSCI USA Index', 'MSCI USA Choice ESG Screened Index']
    first_col_samples: ['May 20', 'Jun 20', 'Jul 20']
  [Example 3] ESGU | iShares ESG Aware MSCI USA ETF | ShareClassType.OTHER
    first_col: 'Unnamed: 0'
    columns: ['Unnamed: 0', 'Fund', 'MSCI USA Index', 'MSCI USA Extended ESG Focus Index (Spliced)']
    first_col_samples: ['Nov 16', 'Dec 16', 'Jan 17']
  [Example 4]

In [22]:
import sys
from pathlib import Path
%reload_ext autoreload
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


from src.simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        print(fund.performance_table)
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        print("---")

   Unnamed: 0     Fund MSCI USA Index MSCI USA Extended Climate Action Index
0      May 23  $10,000        $10,000                                $10,000
1      Jun 23  $10,384        $10,994                                $10,960
2      Jul 23  $10,786        $11,374                                $11,386
3      Aug 23  $10,641        $11,180                                $11,248
4      Sep 23  $10,150        $10,656                                $10,730
5      Oct 23   $9,880        $10,410                                $10,432
6      Nov 23  $10,811        $11,391                                $11,425
7      Dec 23  $11,295        $11,927                                $11,923
8      Jan 24  $11,534        $12,122                                $12,176
9      Feb 24  $12,212        $12,085                                $12,216
10     Mar 24  $12,579        $12,467                                $13,281
11     Apr 24  $12,093        $12,630                                $12,768

In [23]:
tickers = []
unique_funds = []
duplicates = 0

for fund in funds_total:
    if fund.ticker in tickers:
        print(f"DUPLICATE: {fund.name} ({fund.ticker})")
        duplicates += 1
    else:
        tickers.append(fund.ticker)
        unique_funds.append(fund)

# Replace the original list
funds_total = unique_funds
print(f"Removed {duplicates} duplicates")
print(f"Remaining funds: {len(funds_total)}")

DUPLICATE: iShares 0-5 Year High Yield Corporate Bond ETF (SHYG)
DUPLICATE: iShares 0-5 Year Investment Grade Corporate Bond ETF (SLQD)
DUPLICATE: iShares 1-3 Year International Treasury Bond ETF (ISHG)
DUPLICATE: iShares 20+ Year Treasury Bond BuyWrite Strategy ETF (TLTW)
DUPLICATE: iShares Aaa - A Rated Corporate Bond ETF (QLTA)
DUPLICATE: iShares BB Rated Corporate Bond ETF (HYBB)
DUPLICATE: iShares Broad USD High Yield Corporate Bond ETF (USHY)
DUPLICATE: iShares CMBS ETF (CMBS)
DUPLICATE: iShares Convertible Bond ETF (ICVT)
DUPLICATE: iShares Core 1-5 Year USD Bond ETF (ISTB)
DUPLICATE: iShares Core International Aggregate Bond ETF (IAGG)
DUPLICATE: iShares ESG Advanced High Yield Corporate Bond ETF (HYXF)
DUPLICATE: iShares Fallen Angels USD Bond ETF (FALN)
DUPLICATE: iShares Floating Rate Bond ETF (FLOT)
DUPLICATE: iShares GNMA Bond ETF (GNMA)
DUPLICATE: iShares High Yield Corporate Bond BuyWrite Strategy ETF (HYGW)
DUPLICATE: iShares iBonds 2025 Term High Yield and Income ETF (

In [24]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup_metadata.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Successfully saved 460 funds to pickle file
File size: 4146.67 KB


In [1]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("../")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


SRC_DIR = Path("../src")
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

PKL_PATH = Path("./funds_backup_metadata.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Loaded 460 funds from pickle file


## Summary Prospectus

In [None]:
import multiprocessing as mp
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import gc
from src.simple_rag.extraction.general_info import FundInfoExtractor
from src.simple_rag.models.fund import FilingMetadata

tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

def safe_date_convert(date_value):
    """Safely convert a date value to date object, handling None/NaN."""
    if date_value is None or pd.isna(date_value):
        return None
    try:
        dt = pd.to_datetime(date_value)
        if pd.isna(dt):
            return None
        return dt.date()
    except Exception as e:
        print(f"Warning: Could not convert date '{date_value}': {e}")
        return None

def process_single_filing(filing, ticker):
    """Extracts data from a single filing object."""
    try:
        text = filing.text()
        extractor = FundInfoExtractor(text, ticker=ticker)
        fund_data = extractor.get_structured_data()
        
        filing_metadata = FilingMetadata(
            accession_number=filing.accession_number,
            reporting_date=safe_date_convert(filing.report_date),
            filing_date=safe_date_convert(filing.filing_date),
            form=filing.form,
            url=filing.url
        )
        
        md = extractor.get_clean_markdown()
        
        result = {
            'ticker': fund_data['ticker'],
            'managers': fund_data['managers'],
            'strategies': fund_data['strategies'],
            'risks': fund_data['risks'],
            'objective': fund_data['objective'],
            'summary_prospectus': md,
            'filing_metadata': filing_metadata
        }
        
        del text, extractor, fund_data, md
        return result
    except Exception as e:
        print(f"❌ Error processing filing {filing.accession_number} for {ticker}: {e}")
        return None

def process_ticker(ticker):
    """
    Worker function: Fetches and processes all 497K filings for a single ticker.
    Runs entirely within a separate CPU process.
    """
    # Setting identity inside the worker ensures each new OS process
    # has the correct EDGAR credentials initialized.
    set_identity('luis.alvarez.conde@alumnos.upm.es')
    
    print(f"⚙️ Starting extraction for {ticker}...")
    company = Company(ticker)
    filings = company.get_filings(form="497K")
    
    ticker_results = []
    processed_accessions = set() 
    
    for filing in filings:
        if filing.accession_number in processed_accessions:
            continue
            
        res = process_single_filing(filing, ticker)
        if res:
            processed_accessions.add(filing.accession_number)
            ticker_results.append(res)
            
        gc.collect() 
        
    return ticker, ticker_results

def main():
    all_extracted_data = {}
    
    print(f"{'='*60}\nStarting Multiprocessing Pool for {len(tickers)} tickers\n{'='*60}")
    
    # Create a pool of workers equal to your CPU cores
    # mp.cpu_count() is usually a good default, or you can hardcode a number like processes=4
    with mp.Pool(processes=mp.cpu_count()) as pool:
        # pool.imap yields results as they finish, similar to executor.map
        for ticker, results in pool.imap_unordered(process_ticker, tickers):
            all_extracted_data[ticker] = results
            print(f"✅ Completed {ticker}: {len(results)} distinct filings extracted.")
            
    print(f"\n{'='*60}\nPipeline Complete! Data ready in memory.\n{'='*60}")
    return all_extracted_data

if __name__ == '__main__':
    # 1. Run the extraction
    funds_data_dict = main()
    
    # 2. Update logic goes here...

In [2]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from IPython.display import display, Markdown
from src.simple_rag.extraction.general_info import FundInfoExtractor
from pathlib import Path
from src.simple_rag.models.fund import FilingMetadata
import gc
import pickle

set_identity('luis.alvarez.conde@alumnos.upm.es')
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

# Create cache directory
cache_dir = Path("results/497k_cache")
cache_dir.mkdir(parents=True, exist_ok=True)

def safe_date_convert(date_value):
    """Safely convert a date value to date object, handling None/NaN."""
    if date_value is None or pd.isna(date_value):
        return None
    
    try:
        dt = pd.to_datetime(date_value)
        if pd.isna(dt):
            return None
        return dt.date()
    except Exception as e:
        print(f"Warning: Could not convert date '{date_value}': {e}")
        return None

def process_filing(filing, ticker):
    """Process a single filing and return extracted data"""
    try:
        text = filing.text()
        extractor = FundInfoExtractor(text, ticker=ticker)
        fund_data = extractor.get_structured_data()
        
        # Safely convert dates
        reporting_date = safe_date_convert(filing.report_date)
        filing_date = safe_date_convert(filing.filing_date)
       
        filing_metadata = FilingMetadata(
            accession_number=filing.accession_number,
            reporting_date=reporting_date,
            filing_date=filing_date,
            form=filing.form,
            url=filing.url
        )
        
        md = extractor.get_clean_markdown()
        
        result = {
            'ticker': fund_data['ticker'],
            'managers': fund_data['managers'],
            'strategies': fund_data['strategies'],
            'risks': fund_data['risks'],
            'objective': fund_data['objective'],
            'summary_prospectus': md,
            'filing_metadata': filing_metadata
        }
        
        # Clean up immediately
        del text, extractor, fund_data, md
        gc.collect()
        
        return result
    except Exception as e:
        print(f"❌ Error processing filing for {ticker}: {e}")
        return None

# Process each ticker
BATCH_SIZE = 5  # Process 5 filings at a time

for ticker in tickers:
    print(f"\n{'='*60}")
    print(f"Processing ticker: {ticker}")
    print(f"{'='*60}")
    
    # Check if already processed
    ticker_cache = cache_dir / f"{ticker}_processed.pkl"
    if ticker_cache.exists():
        print(f"⏭️  Skipping {ticker} - already processed (delete {ticker_cache} to reprocess)")
        continue
    
    company = Company(ticker)
    processed_funds = []
    filings = company.get_filings(form="497K")
    
    # Convert to list to enable batching
    filings_list = list(filings)
    print(f"Found {len(filings_list)} 497K filings for {ticker}")
    
    # Process filings in batches
    filing_results = []
    stop_processing = False
    
    for i in range(0, len(filings_list), BATCH_SIZE):
        if stop_processing:
            break
            
        batch = filings_list[i:i+BATCH_SIZE]
        print(f"\nProcessing batch {i//BATCH_SIZE + 1}/{(len(filings_list)-1)//BATCH_SIZE + 1}")
        
        for filing in tqdm(batch, desc=f"Batch {i//BATCH_SIZE + 1}"):
            result = process_filing(filing, ticker)
            
            if result:
                # Check for duplicates
                if result['ticker'] in processed_funds:
                    print(f"⚠️  First duplicate: {result['ticker']}")
                    stop_processing = True
                    break
                
                processed_funds.append(result['ticker'])
                filing_results.append(result)
            
            # Clear memory after each filing
            gc.collect()
        
        # Save batch progress
        batch_cache = cache_dir / f"{ticker}_batch_{i//BATCH_SIZE}.pkl"
        with open(batch_cache, 'wb') as f:
            pickle.dump(filing_results, f)
        
        print(f"✅ Saved batch progress ({len(filing_results)} filings so far)")
    
    # Mark ticker as complete
    with open(ticker_cache, 'wb') as f:
        pickle.dump({'ticker': ticker, 'count': len(processed_funds)}, f)
    
    print(f"\n✅ Processed {len(processed_funds)} funds for ticker: {ticker}")
    
    # Clear memory before next ticker
    del company, filings, filings_list, filing_results
    gc.collect()

print("\n" + "="*60)
print("All tickers processed! Now updating funds_total...")
print("="*60)

# Update funds_total with cached results
updated_count = 0

for ticker in tickers:
    ticker_cache = cache_dir / f"{ticker}_processed.pkl"
    
    if not ticker_cache.exists():
        print(f"⚠️  No cache found for {ticker}")
        continue
    
    # Load all batches for this ticker
    batch_files = sorted(cache_dir.glob(f"{ticker}_batch_*.pkl"))
    
    for batch_file in batch_files:
        with open(batch_file, 'rb') as f:
            filing_results = pickle.load(f)
        
        # Update funds_total
        for result in tqdm(filing_results, desc=f"Updating {ticker}"):
            for fund in funds_total:
                if fund.ticker == result['ticker']:
                    print(result['risks'])
                    fund.summary_prospectus = result['summary_prospectus']
                    fund.managers = result['managers']
                    fund.strategies = result['strategies']
                    fund.risks = result['risks']
                    fund.objective = result['objective']
                    fund.summary_prospectus_metadata = result['filing_metadata']
                    updated_count += 1
                    break
        
        # Clean up
        del filing_results
        gc.collect()

print(f"\n✅ Updated {updated_count} funds in funds_total")


Processing ticker: VOO
Found 909 497K filings for VOO

Processing batch 1/182


Batch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 1:  20%|██        | 1/5 [00:00<00:00,  4.08it/s]

Batch 1:  40%|████      | 2/5 [00:00<00:00,  4.16it/s]

Batch 1:  60%|██████    | 3/5 [00:00<00:00,  4.42it/s]

Batch 1:  80%|████████  | 4/5 [00:00<00:00,  4.48it/s]

Batch 1: 100%|██████████| 5/5 [00:01<00:00,  4.43it/s]


✅ Saved batch progress (5 filings so far)

Processing batch 2/182


Batch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 2:  20%|██        | 1/5 [00:00<00:00,  4.68it/s]

Batch 2:  40%|████      | 2/5 [00:00<00:00,  4.24it/s]

Batch 2:  60%|██████    | 3/5 [00:00<00:00,  4.40it/s]

Batch 2:  80%|████████  | 4/5 [00:00<00:00,  4.30it/s]

Batch 2: 100%|██████████| 5/5 [00:01<00:00,  4.40it/s]


✅ Saved batch progress (10 filings so far)

Processing batch 3/182


Batch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 3:  20%|██        | 1/5 [00:00<00:00,  4.84it/s]

Batch 3:  40%|████      | 2/5 [00:00<00:00,  4.79it/s]

Batch 3:  60%|██████    | 3/5 [00:00<00:00,  4.68it/s]

Batch 3:  80%|████████  | 4/5 [00:00<00:00,  4.55it/s]

Batch 3: 100%|██████████| 5/5 [00:01<00:00,  4.58it/s]


✅ Saved batch progress (15 filings so far)

Processing batch 4/182


Batch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 4:  20%|██        | 1/5 [00:00<00:00,  4.41it/s]

Batch 4:  40%|████      | 2/5 [00:00<00:00,  4.54it/s]

Batch 4:  60%|██████    | 3/5 [00:00<00:00,  4.61it/s]

Batch 4:  80%|████████  | 4/5 [00:00<00:00,  4.66it/s]

Batch 4: 100%|██████████| 5/5 [00:01<00:00,  4.57it/s]


✅ Saved batch progress (20 filings so far)

Processing batch 5/182


Batch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 5:  20%|██        | 1/5 [00:00<00:00,  4.72it/s]

Batch 5:  40%|████      | 2/5 [00:00<00:00,  4.42it/s]

Batch 5:  60%|██████    | 3/5 [00:00<00:00,  4.29it/s]

Batch 5:  80%|████████  | 4/5 [00:00<00:00,  4.36it/s]

Batch 5: 100%|██████████| 5/5 [00:01<00:00,  4.36it/s]


✅ Saved batch progress (25 filings so far)

Processing batch 6/182


Batch 6:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 6:  20%|██        | 1/5 [00:00<00:00,  4.17it/s]

Batch 6:  40%|████      | 2/5 [00:00<00:00,  4.10it/s]

Batch 6:  60%|██████    | 3/5 [00:00<00:00,  3.98it/s]

Batch 6:  80%|████████  | 4/5 [00:00<00:00,  3.99it/s]

Batch 6: 100%|██████████| 5/5 [00:01<00:00,  4.01it/s]


✅ Saved batch progress (30 filings so far)

Processing batch 7/182


Batch 7:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 7:  20%|██        | 1/5 [00:00<00:00,  4.14it/s]

Batch 7:  40%|████      | 2/5 [00:00<00:00,  4.33it/s]

Batch 7:  60%|██████    | 3/5 [00:00<00:00,  4.38it/s]

Batch 7:  80%|████████  | 4/5 [00:00<00:00,  4.28it/s]

Batch 7: 100%|██████████| 5/5 [00:01<00:00,  4.34it/s]


✅ Saved batch progress (35 filings so far)

Processing batch 8/182


Batch 8:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 8:  20%|██        | 1/5 [00:00<00:00,  4.46it/s]

Batch 8:  40%|████      | 2/5 [00:00<00:00,  4.08it/s]

Batch 8:  60%|██████    | 3/5 [00:00<00:00,  3.71it/s]

Batch 8:  80%|████████  | 4/5 [00:01<00:00,  3.79it/s]

Batch 8: 100%|██████████| 5/5 [00:01<00:00,  3.86it/s]


✅ Saved batch progress (40 filings so far)

Processing batch 9/182


Batch 9:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 9:  20%|██        | 1/5 [00:00<00:00,  4.41it/s]

Batch 9:  40%|████      | 2/5 [00:00<00:00,  4.50it/s]

Batch 9:  60%|██████    | 3/5 [00:00<00:00,  4.44it/s]

Batch 9:  80%|████████  | 4/5 [00:01<00:00,  2.92it/s]

Batch 9: 100%|██████████| 5/5 [00:01<00:00,  3.54it/s]


✅ Saved batch progress (45 filings so far)

Processing batch 10/182


Batch 10:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 10:  20%|██        | 1/5 [00:00<00:00,  4.62it/s]

Batch 10:  40%|████      | 2/5 [00:00<00:00,  4.69it/s]

Batch 10:  60%|██████    | 3/5 [00:00<00:00,  4.76it/s]

Batch 10:  60%|██████    | 3/5 [00:00<00:00,  3.86it/s]


⚠️  First duplicate: VSCIX
✅ Saved batch progress (48 filings so far)

✅ Processed 48 funds for ticker: VOO

Processing ticker: MGK
Found 648 497K filings for MGK

Processing batch 1/130


Batch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 1:  20%|██        | 1/5 [00:00<00:00,  4.25it/s]

Batch 1:  40%|████      | 2/5 [00:00<00:00,  4.17it/s]

Batch 1:  60%|██████    | 3/5 [00:00<00:00,  4.25it/s]

Batch 1:  80%|████████  | 4/5 [00:00<00:00,  4.29it/s]

Batch 1: 100%|██████████| 5/5 [00:01<00:00,  4.15it/s]


✅ Saved batch progress (5 filings so far)

Processing batch 2/130


Batch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 2:  20%|██        | 1/5 [00:00<00:01,  3.47it/s]

Batch 2:  40%|████      | 2/5 [00:00<00:00,  3.86it/s]

Batch 2:  60%|██████    | 3/5 [00:00<00:00,  3.94it/s]

Batch 2:  80%|████████  | 4/5 [00:00<00:00,  4.18it/s]

Batch 2: 100%|██████████| 5/5 [00:01<00:00,  4.05it/s]


✅ Saved batch progress (10 filings so far)

Processing batch 3/130


Batch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 3:  20%|██        | 1/5 [00:00<00:00,  4.21it/s]

Batch 3:  40%|████      | 2/5 [00:00<00:00,  4.16it/s]

Batch 3:  60%|██████    | 3/5 [00:00<00:00,  4.29it/s]

Batch 3:  80%|████████  | 4/5 [00:00<00:00,  4.31it/s]

Batch 3: 100%|██████████| 5/5 [00:01<00:00,  4.30it/s]


✅ Saved batch progress (15 filings so far)

Processing batch 4/130


Batch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 4:  20%|██        | 1/5 [00:00<00:00,  4.50it/s]

Batch 4:  40%|████      | 2/5 [00:00<00:00,  4.27it/s]

Batch 4:  60%|██████    | 3/5 [00:00<00:00,  4.29it/s]

Batch 4:  80%|████████  | 4/5 [00:00<00:00,  4.25it/s]

Batch 4: 100%|██████████| 5/5 [00:01<00:00,  4.31it/s]


✅ Saved batch progress (20 filings so far)

Processing batch 5/130


Batch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 5:  20%|██        | 1/5 [00:00<00:00,  4.35it/s]

Batch 5:  40%|████      | 2/5 [00:00<00:00,  4.45it/s]

Batch 5:  60%|██████    | 3/5 [00:00<00:00,  4.32it/s]

Batch 5:  80%|████████  | 4/5 [00:00<00:00,  4.15it/s]

Batch 5: 100%|██████████| 5/5 [00:01<00:00,  4.16it/s]


✅ Saved batch progress (25 filings so far)

Processing batch 6/130


Batch 6:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 6:  20%|██        | 1/5 [00:00<00:01,  3.95it/s]

Batch 6:  40%|████      | 2/5 [00:00<00:00,  3.95it/s]

Batch 6:  60%|██████    | 3/5 [00:00<00:00,  4.05it/s]

Batch 6:  80%|████████  | 4/5 [00:00<00:00,  4.20it/s]

Batch 6: 100%|██████████| 5/5 [00:01<00:00,  4.15it/s]


✅ Saved batch progress (30 filings so far)

Processing batch 7/130


Batch 7:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 7:  20%|██        | 1/5 [00:00<00:00,  4.36it/s]

Batch 7:  40%|████      | 2/5 [00:00<00:00,  4.23it/s]

Batch 7:  60%|██████    | 3/5 [00:00<00:00,  4.31it/s]

Batch 7:  80%|████████  | 4/5 [00:00<00:00,  4.38it/s]

Batch 7: 100%|██████████| 5/5 [00:01<00:00,  4.34it/s]


✅ Saved batch progress (35 filings so far)

Processing batch 8/130


Batch 8:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 8:  20%|██        | 1/5 [00:00<00:00,  4.42it/s]

Batch 8:  40%|████      | 2/5 [00:00<00:00,  4.32it/s]

Batch 8:  60%|██████    | 3/5 [00:00<00:00,  3.97it/s]

Batch 8:  60%|██████    | 3/5 [00:00<00:00,  3.41it/s]


⚠️  First duplicate: VEXC
✅ Saved batch progress (38 filings so far)

✅ Processed 38 funds for ticker: MGK

Processing ticker: HEZU
Found 6251 497K filings for HEZU

Processing batch 1/1251


Batch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 1:  20%|██        | 1/5 [00:00<00:01,  2.97it/s]

Batch 1:  40%|████      | 2/5 [00:00<00:00,  3.18it/s]

Batch 1:  60%|██████    | 3/5 [00:00<00:00,  3.38it/s]

Batch 1:  80%|████████  | 4/5 [00:01<00:00,  3.29it/s]

Batch 1: 100%|██████████| 5/5 [00:01<00:00,  3.31it/s]


✅ Saved batch progress (5 filings so far)

Processing batch 2/1251


Batch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 2:  20%|██        | 1/5 [00:00<00:01,  3.31it/s]

Batch 2:  40%|████      | 2/5 [00:00<00:00,  3.18it/s]

Batch 2:  60%|██████    | 3/5 [00:00<00:00,  3.19it/s]

Batch 2:  80%|████████  | 4/5 [00:01<00:00,  3.21it/s]

Batch 2: 100%|██████████| 5/5 [00:01<00:00,  3.28it/s]


✅ Saved batch progress (10 filings so far)

Processing batch 3/1251


Batch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 3:  20%|██        | 1/5 [00:00<00:01,  3.40it/s]

Batch 3:  40%|████      | 2/5 [00:00<00:00,  3.33it/s]

Batch 3:  60%|██████    | 3/5 [00:00<00:00,  3.15it/s]

Batch 3:  80%|████████  | 4/5 [00:01<00:00,  3.28it/s]

Batch 3: 100%|██████████| 5/5 [00:01<00:00,  3.25it/s]


✅ Saved batch progress (15 filings so far)

Processing batch 4/1251


Batch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 4:  20%|██        | 1/5 [00:00<00:01,  2.93it/s]

Batch 4:  40%|████      | 2/5 [00:00<00:01,  2.93it/s]

Batch 4:  60%|██████    | 3/5 [00:01<00:00,  2.94it/s]

Batch 4:  80%|████████  | 4/5 [00:01<00:00,  3.05it/s]

Batch 4: 100%|██████████| 5/5 [00:01<00:00,  2.88it/s]


✅ Saved batch progress (20 filings so far)

Processing batch 5/1251


Batch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 5:  20%|██        | 1/5 [00:00<00:01,  2.93it/s]

Batch 5:  40%|████      | 2/5 [00:00<00:01,  2.98it/s]

Batch 5:  60%|██████    | 3/5 [00:00<00:00,  3.02it/s]

Batch 5:  80%|████████  | 4/5 [00:01<00:00,  3.00it/s]

Batch 5: 100%|██████████| 5/5 [00:01<00:00,  3.07it/s]


✅ Saved batch progress (25 filings so far)

Processing batch 6/1251


Batch 6:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 6:  20%|██        | 1/5 [00:00<00:01,  2.51it/s]

Batch 6:  40%|████      | 2/5 [00:00<00:01,  2.78it/s]

Batch 6:  60%|██████    | 3/5 [00:01<00:00,  2.90it/s]

Batch 6:  80%|████████  | 4/5 [00:01<00:00,  3.08it/s]

Batch 6: 100%|██████████| 5/5 [00:01<00:00,  3.03it/s]


✅ Saved batch progress (30 filings so far)

Processing batch 7/1251


Batch 7:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 7:  20%|██        | 1/5 [00:00<00:01,  3.56it/s]

Batch 7:  40%|████      | 2/5 [00:00<00:00,  3.52it/s]

Batch 7:  60%|██████    | 3/5 [00:00<00:00,  3.35it/s]

Batch 7:  80%|████████  | 4/5 [00:01<00:00,  3.14it/s]

Batch 7: 100%|██████████| 5/5 [00:01<00:00,  3.23it/s]


✅ Saved batch progress (35 filings so far)

Processing batch 8/1251


Batch 8:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 8:  20%|██        | 1/5 [00:00<00:01,  3.11it/s]

Batch 8:  40%|████      | 2/5 [00:00<00:00,  3.07it/s]

Batch 8:  60%|██████    | 3/5 [00:00<00:00,  3.13it/s]

Batch 8:  80%|████████  | 4/5 [00:01<00:00,  3.26it/s]

Batch 8: 100%|██████████| 5/5 [00:01<00:00,  3.18it/s]


✅ Saved batch progress (40 filings so far)

Processing batch 9/1251


Batch 9:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 9:  20%|██        | 1/5 [00:00<00:01,  3.32it/s]

Batch 9:  40%|████      | 2/5 [00:00<00:01,  2.76it/s]

Batch 9:  60%|██████    | 3/5 [00:01<00:00,  2.88it/s]

Batch 9:  80%|████████  | 4/5 [00:01<00:00,  2.79it/s]

Batch 9: 100%|██████████| 5/5 [00:01<00:00,  2.89it/s]


✅ Saved batch progress (45 filings so far)

Processing batch 10/1251


Batch 10:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 10:  20%|██        | 1/5 [00:00<00:01,  3.39it/s]

Batch 10:  40%|████      | 2/5 [00:00<00:00,  3.28it/s]

Batch 10:  60%|██████    | 3/5 [00:00<00:00,  3.09it/s]

Batch 10:  80%|████████  | 4/5 [00:01<00:00,  3.23it/s]

Batch 10: 100%|██████████| 5/5 [00:01<00:00,  3.29it/s]


✅ Saved batch progress (50 filings so far)

Processing batch 11/1251


Batch 11:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 11:  20%|██        | 1/5 [00:00<00:01,  2.80it/s]

Batch 11:  40%|████      | 2/5 [00:00<00:01,  2.79it/s]

Batch 11:  60%|██████    | 3/5 [00:01<00:00,  3.02it/s]

Batch 11:  80%|████████  | 4/5 [00:01<00:00,  3.27it/s]

Batch 11: 100%|██████████| 5/5 [00:01<00:00,  3.13it/s]


✅ Saved batch progress (55 filings so far)

Processing batch 12/1251


Batch 12:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 12:  20%|██        | 1/5 [00:00<00:01,  3.28it/s]

Batch 12:  40%|████      | 2/5 [00:00<00:00,  3.58it/s]

Batch 12:  60%|██████    | 3/5 [00:00<00:00,  3.54it/s]

Batch 12:  80%|████████  | 4/5 [00:01<00:00,  3.38it/s]

Batch 12: 100%|██████████| 5/5 [00:01<00:00,  3.46it/s]


✅ Saved batch progress (60 filings so far)

Processing batch 13/1251


Batch 13:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 13:  20%|██        | 1/5 [00:00<00:01,  3.40it/s]

Batch 13:  40%|████      | 2/5 [00:00<00:00,  3.15it/s]

Batch 13:  60%|██████    | 3/5 [00:00<00:00,  3.30it/s]

Batch 13:  80%|████████  | 4/5 [00:01<00:00,  3.34it/s]

Batch 13: 100%|██████████| 5/5 [00:01<00:00,  3.31it/s]


✅ Saved batch progress (65 filings so far)

Processing batch 14/1251


Batch 14:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 14:  20%|██        | 1/5 [00:00<00:01,  3.61it/s]

Batch 14:  40%|████      | 2/5 [00:00<00:00,  3.62it/s]

Batch 14:  60%|██████    | 3/5 [00:00<00:00,  3.45it/s]

Batch 14:  80%|████████  | 4/5 [00:01<00:00,  3.37it/s]

Batch 14: 100%|██████████| 5/5 [00:01<00:00,  3.44it/s]


✅ Saved batch progress (70 filings so far)

Processing batch 15/1251


Batch 15:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 15:  20%|██        | 1/5 [00:00<00:01,  3.49it/s]

Batch 15:  40%|████      | 2/5 [00:00<00:00,  3.85it/s]

Batch 15:  60%|██████    | 3/5 [00:00<00:00,  3.69it/s]

Batch 15:  80%|████████  | 4/5 [00:01<00:00,  3.63it/s]

Batch 15: 100%|██████████| 5/5 [00:01<00:00,  3.66it/s]


✅ Saved batch progress (75 filings so far)

Processing batch 16/1251


Batch 16:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 16:  20%|██        | 1/5 [00:00<00:01,  3.77it/s]

Batch 16:  40%|████      | 2/5 [00:00<00:00,  3.45it/s]

Batch 16:  60%|██████    | 3/5 [00:00<00:00,  3.43it/s]

Batch 16:  80%|████████  | 4/5 [00:01<00:00,  3.22it/s]

Batch 16: 100%|██████████| 5/5 [00:01<00:00,  3.32it/s]


✅ Saved batch progress (80 filings so far)

Processing batch 17/1251


Batch 17:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 17:  20%|██        | 1/5 [00:00<00:01,  3.11it/s]

Batch 17:  40%|████      | 2/5 [00:00<00:00,  3.25it/s]

Batch 17:  60%|██████    | 3/5 [00:00<00:00,  3.46it/s]

Batch 17:  80%|████████  | 4/5 [00:01<00:00,  3.61it/s]

Batch 17: 100%|██████████| 5/5 [00:01<00:00,  3.55it/s]


✅ Saved batch progress (85 filings so far)

Processing batch 18/1251


Batch 18:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 18:  20%|██        | 1/5 [00:00<00:01,  3.89it/s]

Batch 18:  40%|████      | 2/5 [00:00<00:00,  3.79it/s]

Batch 18:  60%|██████    | 3/5 [00:00<00:00,  3.52it/s]

Batch 18:  80%|████████  | 4/5 [00:01<00:00,  3.61it/s]

Batch 18: 100%|██████████| 5/5 [00:01<00:00,  3.65it/s]


✅ Saved batch progress (90 filings so far)

Processing batch 19/1251


Batch 19:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 19:  20%|██        | 1/5 [00:00<00:01,  3.52it/s]

Batch 19:  40%|████      | 2/5 [00:00<00:00,  3.71it/s]

Batch 19:  60%|██████    | 3/5 [00:00<00:00,  3.71it/s]

Batch 19:  80%|████████  | 4/5 [00:01<00:00,  3.69it/s]

Batch 19: 100%|██████████| 5/5 [00:01<00:00,  3.68it/s]


✅ Saved batch progress (95 filings so far)

Processing batch 20/1251


Batch 20:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 20:  20%|██        | 1/5 [00:00<00:01,  3.97it/s]

Batch 20:  40%|████      | 2/5 [00:00<00:00,  3.76it/s]

Batch 20:  60%|██████    | 3/5 [00:00<00:00,  3.65it/s]

Batch 20:  80%|████████  | 4/5 [00:01<00:00,  3.72it/s]

Batch 20: 100%|██████████| 5/5 [00:01<00:00,  3.75it/s]


✅ Saved batch progress (100 filings so far)

Processing batch 21/1251


Batch 21:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 21:  20%|██        | 1/5 [00:00<00:01,  3.76it/s]

Batch 21:  40%|████      | 2/5 [00:00<00:00,  3.92it/s]

Batch 21:  60%|██████    | 3/5 [00:00<00:00,  3.77it/s]

Batch 21:  80%|████████  | 4/5 [00:01<00:00,  3.82it/s]

Batch 21: 100%|██████████| 5/5 [00:01<00:00,  3.81it/s]


✅ Saved batch progress (105 filings so far)

Processing batch 22/1251


Batch 22:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 22:  20%|██        | 1/5 [00:00<00:01,  3.91it/s]

Batch 22:  40%|████      | 2/5 [00:00<00:00,  3.88it/s]

Batch 22:  60%|██████    | 3/5 [00:00<00:00,  3.75it/s]

Batch 22:  80%|████████  | 4/5 [00:01<00:00,  3.78it/s]

Batch 22: 100%|██████████| 5/5 [00:01<00:00,  3.73it/s]


✅ Saved batch progress (110 filings so far)

Processing batch 23/1251


Batch 23:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 23:  20%|██        | 1/5 [00:00<00:01,  3.81it/s]

Batch 23:  40%|████      | 2/5 [00:00<00:00,  3.96it/s]

Batch 23:  60%|██████    | 3/5 [00:00<00:00,  3.73it/s]

Batch 23:  80%|████████  | 4/5 [00:01<00:00,  3.76it/s]

Batch 23: 100%|██████████| 5/5 [00:01<00:00,  3.76it/s]


✅ Saved batch progress (115 filings so far)

Processing batch 24/1251


Batch 24:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 24:  20%|██        | 1/5 [00:00<00:01,  3.53it/s]

Batch 24:  40%|████      | 2/5 [00:00<00:00,  3.69it/s]

Batch 24:  60%|██████    | 3/5 [00:00<00:00,  3.55it/s]

Batch 24:  80%|████████  | 4/5 [00:01<00:00,  3.64it/s]

Batch 24: 100%|██████████| 5/5 [00:01<00:00,  3.62it/s]


✅ Saved batch progress (120 filings so far)

Processing batch 25/1251


Batch 25:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 25:  20%|██        | 1/5 [00:00<00:00,  4.05it/s]

Batch 25:  40%|████      | 2/5 [00:00<00:00,  3.87it/s]

Batch 25:  60%|██████    | 3/5 [00:00<00:00,  3.64it/s]

Batch 25:  80%|████████  | 4/5 [00:01<00:00,  3.74it/s]

Batch 25: 100%|██████████| 5/5 [00:01<00:00,  3.67it/s]


✅ Saved batch progress (125 filings so far)

Processing batch 26/1251


Batch 26:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 26:  20%|██        | 1/5 [00:00<00:01,  3.94it/s]

Batch 26:  40%|████      | 2/5 [00:00<00:00,  3.50it/s]

Batch 26:  60%|██████    | 3/5 [00:00<00:00,  3.66it/s]

Batch 26:  80%|████████  | 4/5 [00:01<00:00,  3.74it/s]

Batch 26: 100%|██████████| 5/5 [00:01<00:00,  3.62it/s]


✅ Saved batch progress (130 filings so far)

Processing batch 27/1251


Batch 27:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 27:  20%|██        | 1/5 [00:00<00:01,  3.25it/s]

Batch 27:  40%|████      | 2/5 [00:00<00:00,  3.23it/s]

Batch 27:  60%|██████    | 3/5 [00:00<00:00,  3.41it/s]

Batch 27:  80%|████████  | 4/5 [00:01<00:00,  3.54it/s]

Batch 27: 100%|██████████| 5/5 [00:01<00:00,  3.53it/s]


✅ Saved batch progress (135 filings so far)

Processing batch 28/1251


Batch 28:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 28:  20%|██        | 1/5 [00:00<00:01,  3.81it/s]

Batch 28:  40%|████      | 2/5 [00:00<00:00,  3.65it/s]

Batch 28:  60%|██████    | 3/5 [00:00<00:00,  3.37it/s]

Batch 28:  80%|████████  | 4/5 [00:01<00:00,  3.39it/s]

Batch 28: 100%|██████████| 5/5 [00:01<00:00,  3.49it/s]


✅ Saved batch progress (140 filings so far)

Processing batch 29/1251


Batch 29:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 29:  20%|██        | 1/5 [00:00<00:01,  3.98it/s]

Batch 29:  40%|████      | 2/5 [00:00<00:00,  3.59it/s]

Batch 29:  60%|██████    | 3/5 [00:00<00:00,  3.32it/s]

Batch 29:  80%|████████  | 4/5 [00:01<00:00,  3.20it/s]

Batch 29: 100%|██████████| 5/5 [00:01<00:00,  3.27it/s]


✅ Saved batch progress (145 filings so far)

Processing batch 30/1251


Batch 30:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 30:  20%|██        | 1/5 [00:00<00:01,  3.45it/s]

Batch 30:  40%|████      | 2/5 [00:00<00:00,  3.47it/s]

Batch 30:  60%|██████    | 3/5 [00:00<00:00,  3.29it/s]

Batch 30:  80%|████████  | 4/5 [00:01<00:00,  3.28it/s]

Batch 30: 100%|██████████| 5/5 [00:01<00:00,  3.33it/s]


✅ Saved batch progress (150 filings so far)

Processing batch 31/1251


Batch 31:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 31:  20%|██        | 1/5 [00:00<00:01,  3.34it/s]

Batch 31:  40%|████      | 2/5 [00:00<00:00,  3.19it/s]

Batch 31:  60%|██████    | 3/5 [00:00<00:00,  3.19it/s]

Batch 31:  80%|████████  | 4/5 [00:01<00:00,  3.42it/s]

Batch 31: 100%|██████████| 5/5 [00:01<00:00,  3.29it/s]


✅ Saved batch progress (155 filings so far)

Processing batch 32/1251


Batch 32:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 32:  20%|██        | 1/5 [00:00<00:01,  3.61it/s]

Batch 32:  40%|████      | 2/5 [00:00<00:00,  3.63it/s]

Batch 32:  60%|██████    | 3/5 [00:00<00:00,  3.44it/s]

Batch 32:  80%|████████  | 4/5 [00:01<00:00,  3.55it/s]

Batch 32: 100%|██████████| 5/5 [00:01<00:00,  3.34it/s]


✅ Saved batch progress (160 filings so far)

Processing batch 33/1251


Batch 33:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 33:  20%|██        | 1/5 [00:00<00:01,  2.95it/s]

Batch 33:  40%|████      | 2/5 [00:00<00:00,  3.12it/s]

Batch 33:  60%|██████    | 3/5 [00:01<00:00,  2.97it/s]

Batch 33:  80%|████████  | 4/5 [00:01<00:00,  3.01it/s]

Batch 33: 100%|██████████| 5/5 [00:01<00:00,  3.05it/s]


✅ Saved batch progress (165 filings so far)

Processing batch 34/1251


Batch 34:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 34:  20%|██        | 1/5 [00:00<00:01,  3.10it/s]

Batch 34:  40%|████      | 2/5 [00:00<00:00,  3.15it/s]

Batch 34:  60%|██████    | 3/5 [00:01<00:01,  1.96it/s]

Batch 34:  80%|████████  | 4/5 [00:01<00:00,  2.31it/s]

Batch 34: 100%|██████████| 5/5 [00:01<00:00,  2.52it/s]


✅ Saved batch progress (170 filings so far)

Processing batch 35/1251


Batch 35:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 35:  20%|██        | 1/5 [00:00<00:01,  3.16it/s]

Batch 35:  40%|████      | 2/5 [00:00<00:00,  3.36it/s]

Batch 35:  60%|██████    | 3/5 [00:00<00:00,  3.64it/s]

Batch 35:  80%|████████  | 4/5 [00:01<00:00,  3.43it/s]

Batch 35: 100%|██████████| 5/5 [00:01<00:00,  3.35it/s]


✅ Saved batch progress (175 filings so far)

Processing batch 36/1251


Batch 36:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 36:  20%|██        | 1/5 [00:00<00:01,  3.94it/s]

Batch 36:  40%|████      | 2/5 [00:00<00:00,  3.61it/s]

Batch 36:  60%|██████    | 3/5 [00:01<00:00,  2.47it/s]

Batch 36:  80%|████████  | 4/5 [00:01<00:00,  2.92it/s]

Batch 36: 100%|██████████| 5/5 [00:01<00:00,  3.08it/s]


✅ Saved batch progress (180 filings so far)

Processing batch 37/1251


Batch 37:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 37:  20%|██        | 1/5 [00:00<00:00,  4.15it/s]

Batch 37:  40%|████      | 2/5 [00:00<00:00,  4.03it/s]

Batch 37:  60%|██████    | 3/5 [00:00<00:00,  4.04it/s]

Batch 37:  80%|████████  | 4/5 [00:01<00:00,  3.82it/s]

Batch 37: 100%|██████████| 5/5 [00:01<00:00,  3.90it/s]


✅ Saved batch progress (185 filings so far)

Processing batch 38/1251


Batch 38:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 38:  20%|██        | 1/5 [00:00<00:01,  3.99it/s]

Batch 38:  40%|████      | 2/5 [00:00<00:00,  3.78it/s]

Batch 38:  60%|██████    | 3/5 [00:00<00:00,  3.55it/s]

Batch 38:  80%|████████  | 4/5 [00:01<00:00,  3.81it/s]

Batch 38: 100%|██████████| 5/5 [00:01<00:00,  3.76it/s]


✅ Saved batch progress (190 filings so far)

Processing batch 39/1251


Batch 39:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 39:  20%|██        | 1/5 [00:00<00:00,  4.30it/s]

Batch 39:  40%|████      | 2/5 [00:00<00:00,  3.81it/s]

Batch 39:  60%|██████    | 3/5 [00:00<00:00,  3.80it/s]

Batch 39:  80%|████████  | 4/5 [00:01<00:00,  4.01it/s]

Batch 39: 100%|██████████| 5/5 [00:01<00:00,  3.99it/s]


✅ Saved batch progress (195 filings so far)

Processing batch 40/1251


Batch 40:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 40:   0%|          | 0/5 [00:00<?, ?it/s]

⚠️  First duplicate: EUIG
✅ Saved batch progress (195 filings so far)

✅ Processed 195 funds for ticker: HEZU

Processing ticker: VMGRX





Found 262 497K filings for VMGRX

Processing batch 1/53


Batch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 1:  20%|██        | 1/5 [00:00<00:00,  4.17it/s]

Batch 1:  40%|████      | 2/5 [00:00<00:00,  4.39it/s]

Batch 1:  60%|██████    | 3/5 [00:00<00:00,  4.67it/s]

Batch 1:  80%|████████  | 4/5 [00:00<00:00,  4.83it/s]

Batch 1: 100%|██████████| 5/5 [00:01<00:00,  4.75it/s]


✅ Saved batch progress (5 filings so far)

Processing batch 2/53


Batch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 2:  20%|██        | 1/5 [00:00<00:00,  4.29it/s]

Batch 2:  40%|████      | 2/5 [00:00<00:00,  4.18it/s]

Batch 2:  60%|██████    | 3/5 [00:00<00:00,  4.17it/s]

Batch 2:  80%|████████  | 4/5 [00:00<00:00,  4.47it/s]

Batch 2: 100%|██████████| 5/5 [00:01<00:00,  4.47it/s]


✅ Saved batch progress (10 filings so far)

Processing batch 3/53


Batch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 3:  20%|██        | 1/5 [00:00<00:00,  4.87it/s]

Batch 3:  40%|████      | 2/5 [00:00<00:00,  4.60it/s]

Batch 3:  60%|██████    | 3/5 [00:00<00:00,  4.79it/s]

Batch 3:  80%|████████  | 4/5 [00:00<00:00,  4.49it/s]

Batch 3: 100%|██████████| 5/5 [00:01<00:00,  4.50it/s]


✅ Saved batch progress (15 filings so far)

Processing batch 4/53


Batch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 4:  20%|██        | 1/5 [00:00<00:00,  4.62it/s]

Batch 4:  40%|████      | 2/5 [00:00<00:00,  4.35it/s]

Batch 4:  40%|████      | 2/5 [00:00<00:00,  3.50it/s]


⚠️  First duplicate: VMGRX
✅ Saved batch progress (17 filings so far)

✅ Processed 17 funds for ticker: VMGRX

Processing ticker: VDIGX
Found 239 497K filings for VDIGX

Processing batch 1/48


Batch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 1:  20%|██        | 1/5 [00:00<00:00,  5.57it/s]

Batch 1:  40%|████      | 2/5 [00:00<00:00,  5.63it/s]

Batch 1:  60%|██████    | 3/5 [00:00<00:00,  5.48it/s]

Batch 1:  80%|████████  | 4/5 [00:00<00:00,  5.54it/s]

Batch 1: 100%|██████████| 5/5 [00:00<00:00,  5.48it/s]


✅ Saved batch progress (5 filings so far)

Processing batch 2/48


Batch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 2:  20%|██        | 1/5 [00:00<00:00,  5.13it/s]

Batch 2:  40%|████      | 2/5 [00:00<00:00,  5.54it/s]

Batch 2:  60%|██████    | 3/5 [00:00<00:00,  5.53it/s]

Batch 2:  80%|████████  | 4/5 [00:00<00:00,  5.44it/s]

Batch 2: 100%|██████████| 5/5 [00:00<00:00,  5.42it/s]


✅ Saved batch progress (10 filings so far)

Processing batch 3/48


Batch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Batch 3:  20%|██        | 1/5 [00:00<00:00,  5.37it/s]

Batch 3:  20%|██        | 1/5 [00:00<00:01,  3.24it/s]


⚠️  First duplicate: VGSNX
✅ Saved batch progress (11 filings so far)

✅ Processed 11 funds for ticker: VDIGX

All tickers processed! Now updating funds_total...


Updating VOO:   0%|          | 0/5 [00:00<?, ?it/s]


NameError: name 'funds_total' is not defined

In [None]:
 # Update funds_total
count = 0
for result in tqdm(filing_results, desc=f"Updating {ticker}"):
    
    print(result['risks'])
    print(result['ticker'])
    count += 1
print(count)
    
    

Updating VOO: 100%|██████████| 5/5 [00:00<00:00, 47339.77it/s]

An investment in the Fund could lose money over short or long periods of time. You should expect the Fund’s share price and total return to fluctuate within awide range. The Fund is subject to the following risks, which could affect theFund’s performance:

• Stock market risk, which is the chance that stock prices overall will decline. Stock markets tend to move in cycles, with periods of rising prices and periods offalling prices. The Fund’s target index tracks a subset of the U. S. stock market, which could cause the Fund to perform differently from the overall stock market. In addition, the Fund’s target index may, at times, become focused in stocks of a

2

particular market sector, which would subject the Fund to proportionately higherexposure to the risks of that sector.

• Investment style risk, which is the chance that returns from mid-capitalizationstocks will trail returns from the overall stock market. Historically, mid-cap stockshave been more volatile in price than the lar




In [11]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from IPython.display import display, Markdown
from src.simple_rag.extraction.general_info import FundInfoExtractor
from pathlib import Path
from src.simple_rag.models.fund import FilingMetadata
set_identity('luis.alvarez.conde@alumnos.upm.es')
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

def safe_date_convert(date_value):
    """Safely convert a date value to date object, handling None/NaN."""
    if date_value is None or pd.isna(date_value):
        return None
    
    try:
        dt = pd.to_datetime(date_value)
        if pd.isna(dt):
            return None
        return dt.date()
    except Exception as e:
        print(f"Warning: Could not convert date '{date_value}': {e}")
        return None

for ticker in tickers:
    company = Company(ticker)
    processed_funds = []
    filings = company.get_filings(form="497K")

    for filing in filings:
        text = filing.text()
        extractor = FundInfoExtractor(text, ticker=ticker)
        fund_data = extractor.get_structured_data()
        
        if fund_data['ticker'] in processed_funds:
            print("First duplicate: ", fund_data['ticker'])
            break

        # Safely convert dates
        reporting_date = safe_date_convert(filing.report_date)
        filing_date = safe_date_convert(filing.filing_date)
       
        filing_metadata = FilingMetadata(
            accession_number=filing.accession_number,
            reporting_date=reporting_date,
            filing_date=filing_date,
            form=filing.form,
            url=filing.url
        )

        processed_funds.append(fund_data['ticker'])
        md = extractor.get_clean_markdown()
        
        for fund in funds_total:
            if fund.ticker == fund_data['ticker']:
                fund.summary_prospectus = md
                fund.managers = fund_data['managers']
                fund.strategies = fund_data['strategies']
                fund.risks = fund_data['risks']
                fund.objective = fund_data['objective']
                fund.summary_prospectus_metadata = filing_metadata
                break
        
    print("Processed funds: ", len(processed_funds), "for ticker: ", ticker)

First duplicate:  VSCIX
Processed funds:  48 for ticker:  VOO


First duplicate:  VEXC
Processed funds:  38 for ticker:  MGK


First duplicate:  EUIG
Processed funds:  195 for ticker:  HEZU


First duplicate:  VMGRX
Processed funds:  17 for ticker:  VMGRX


First duplicate:  VGSNX
Processed funds:  11 for ticker:  VDIGX


In [29]:
managers = set()
for fund in funds_total:
    if fund.managers is not None:
        # If fund.managers is a list, add each manager individually
        if isinstance(fund.managers, list):
            managers.update(fund.managers)  # ✅ Use update() for lists
        else:
            managers.add(fund.managers)    # ✅ Use add() for single values

print(f"Unique managers found: {len(managers)}")
for manager in sorted(managers):
    print(f"  - {manager}")

Unique managers found: 32
  - Aaron Choi
  - Asian Economic Risk
  - Aurélie Denis
  - Brett Barakett
  - Chris Nieves
  - Christopher Chung
  - Consumer Goods
  - Erin Armstrong
  - Gary Robinson
  - Investment Manager
  - Jake Riley
  - James Mauro
  - Jena Stenger
  - Jennifer Hsui
  - Jonathan Graves
  - Kenny Narzikul
  - Lawrence Burns
  - Managing Director
  - Matt Waldron
  - Michael Cling
  - Michelle Louie
  - Nataliya Kofman
  - Natasha Kuhlkin
  - Nick Birkett
  - Peter Sietsema
  - Senior Managing Director
  - Simon Webber
  - Steven White
  - Suzanne Ly
  - Thomas Coutts
  - Tom Slater
  - Walter Nejman


In [30]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup_metadata.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Successfully saved 460 funds to pickle file
File size: 11257.74 KB


In [10]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
import sys
from pathlib import Path
import pickle

# Get the absolute path to the project root (parent of notebooks directory)
# This assumes the notebook is running in the 'notebooks' folder
PROJECT_ROOT = Path.cwd().parent

# Add project root to sys.path if not present
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Add src directory to sys.path
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from src.simple_rag.models.fund import FundData

PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
funds_total = []
with PKL_PATH.open("rb") as f:

    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup.pkl
Loaded 420 funds from pickle file


## NPORT (Portfolio Composition)

In [31]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from pathlib import Path
import gc
import pickle
from concurrent.futures import ProcessPoolExecutor, as_completed
import psutil

from src.simple_rag.extraction.nport import NPortProcessor
from src.simple_rag.models.fund import PortfolioHolding, Derivatives, NonDerivatives, FilingMetadata

company_json_path = Path("../notebooks/sec_data/company_tickers.json")

set_identity('luis.alvarez.conde@alumnos.upm.es')

# Create cache directory
cache_dir = Path("results/nport_cache")
cache_dir.mkdir(parents=True, exist_ok=True)

def safe_date_convert(date_value):
    """Safely convert a date value to date object, handling None/NaN."""
    if date_value is None or pd.isna(date_value):
        return None
    
    try:
        dt = pd.to_datetime(date_value)
        if pd.isna(dt):
            return None
        return dt.date()
    except Exception as e:
        print(f"Warning: Could not convert date '{date_value}': {e}")
        return None

def process_single_filing_worker(args):
    """Worker function for multiprocessing - processes a single filing"""
    filing_data, ticker, company_json_path = args
    
    try:
        # Re-import in worker process
        import sys
        from pathlib import Path
        sys.path.append(str(Path.cwd()))
        from src.simple_rag.extraction.nport import NPortProcessor
        from src.simple_rag.models.fund import FilingMetadata
        from edgar import Company, set_identity
        import gc
        
        set_identity('luis.alvarez.conde@alumnos.upm.es')
        
        # Reconstruct filing object from saved data
        company = Company(ticker)
        filings = company.get_filings(form="NPORT-P")
        filing = None
        
        for f in filings:
            if f.accession_number == filing_data['accession_number']:
                filing = f
                break
        
        if filing is None:
            print(f"Could not find filing {filing_data['accession_number']}")
            return None
        
        xml_data = filing.obj()
        fund_name = xml_data.get_fund_series().name
        reporting_period = xml_data.reporting_period
        portfolio_list = xml_data.investments
        derivatives = xml_data.derivatives
        series_id = xml_data.get_fund_series().series_id

        # Convert dates safely
        reporting_date = safe_date_convert(filing.report_date)
        filing_date = safe_date_convert(filing.filing_date)

        # Process holdings
        proc = NPortProcessor(company_tickers_json_path=company_json_path, min_similarity=0.74)
        holdings = proc.process_holdings(portfolio_list)
        result = proc.enrich_tickers(holdings, verbose=False)
        
        filing_metadata = FilingMetadata(
            accession_number=filing.accession_number,
            reporting_date=reporting_date,
            filing_date=filing_date,
            form=filing.form,
            url=filing.url
        )

        not_matches = result[result['matched_ticker'].isna() | (result['matched_ticker'] == '')]
        
        output = {
            'fund_name': fund_name,
            'reporting_period': reporting_period,
            'holdings': holdings,
            'result': result,
            'derivatives': derivatives,
            'not_matches': not_matches,
            'ticker': ticker,
            'series_id': series_id,
            'report_date': filing.report_date,
            'nport_metadata': filing_metadata
        }
        
        # Clean up immediately
        del xml_data, proc, holdings, result, filing
        gc.collect()
        
        return output
        
    except Exception as e:
        print(f"❌ Error processing filing {filing_data.get('accession_number', 'unknown')}: {e}")
        return None

def process_ticker_parallel(ticker, company_json_path, batch_size=3, max_workers=2):
    """Process all filings for a single ticker with parallel batch processing"""
    try:
        # Check if already processed
        ticker_cache = cache_dir / f"{ticker}_processed.pkl"
        if ticker_cache.exists():
            print(f"⏭️  Skipping {ticker} - already processed (delete to reprocess)")
            with open(ticker_cache, 'rb') as f:
                return pickle.load(f)
        
        nport_file = Company(ticker)
        filings = sorted(nport_file.get_filings(form="NPORT-P"), 
                        key=lambda x: x.report_date, reverse=True)
        
        # Convert to list for batching
        filings_list = list(filings)
        
        if not filings_list:
            print(f"No filings found for {ticker}")
            return None
            
        print(f"\n{'='*60}")
        print(f"Processing ticker: {ticker}")
        print(f"Found {len(filings_list)} filings, most recent: {filings_list[0].report_date}")
        print(f"{'='*60}")
        
        funds_processed_set = set()
        ticker_results = []
        stop_processing = False
        
        # Process filings in batches with multiprocessing
        for i in range(0, len(filings_list), batch_size):
            if stop_processing:
                break
                
            batch = filings_list[i:i+batch_size]
            print(f"\n📦 Batch {i//batch_size + 1}/{(len(filings_list)-1)//batch_size + 1}")
            
            # Prepare batch data (lightweight metadata only)
            batch_args = []
            for filing in batch:
                filing_data = {
                    'accession_number': filing.accession_number,
                    'report_date': filing.report_date
                }
                batch_args.append((filing_data, ticker, company_json_path))
            
            # Process batch in parallel
            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                futures = {executor.submit(process_single_filing_worker, args): args 
                          for args in batch_args}
                
                for future in tqdm(as_completed(futures), total=len(batch_args), 
                                 desc=f"{ticker} batch {i//batch_size + 1}"):
                    result = future.result()
                    
                    if result is not None:
                        # Check if we've already processed this fund
                        if result['fund_name'].lower() in funds_processed_set:
                            print(f"  ⚠️  Stopping - already processed fund: {result['fund_name']}")
                            stop_processing = True
                            break
                        
                        funds_processed_set.add(result['fund_name'].lower())
                        ticker_results.append(result)
                        
                        print(f"  ✅ {result['fund_name']}: {len(result['holdings'])} holdings, "
                              f"{len(result['not_matches'])} unmatched")
            
            # Save batch progress
            batch_cache = cache_dir / f"{ticker}_batch_{i//batch_size}.pkl"
            with open(batch_cache, 'wb') as f:
                pickle.dump(ticker_results, f)
            
            print(f"  💾 Saved batch progress ({len(ticker_results)} filings)")
            
            # Clean memory between batches
            gc.collect()
        
        final_result = {
            'ticker': ticker,
            'results': ticker_results,
            'funds_processed': list(funds_processed_set)
        }
        
        # Save final result
        with open(ticker_cache, 'wb') as f:
            pickle.dump(final_result, f)
        
        print(f"\n✅ Completed {ticker}: {len(funds_processed_set)} funds processed")
        
        # Clean up
        del nport_file, filings, filings_list, ticker_results
        gc.collect()
        
        return final_result
    
    except Exception as e:
        print(f"❌ Error processing ticker {ticker}: {e}")
        return None

# Calculate safe worker counts
available_ram_gb = psutil.virtual_memory().available / (1024**3)
# Each NPORT worker needs ~2-3GB, be conservative
workers_per_batch = max(1, min(2, int((available_ram_gb - 4) / 3)))
batch_size = 4  # Process 4 filings per batch
workers_per_batch = 8
print(f"💻 System: {available_ram_gb:.1f}GB available RAM")
print(f"⚙️  Settings: {workers_per_batch} workers per batch, {batch_size} filings per batch")

# Main execution
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

print("\n" + "="*60)
print("Processing tickers sequentially (parallel within each ticker)")
print("="*60)

all_results = []

for ticker in tickers:
    result = process_ticker_parallel(
        ticker, 
        company_json_path, 
        batch_size=batch_size,
        max_workers=workers_per_batch
    )
    if result:
        all_results.append(result)
    
    # Force cleanup between tickers
    gc.collect()

💻 System: 17.1GB available RAM
⚙️  Settings: 8 workers per batch, 4 filings per batch

Processing tickers sequentially (parallel within each ticker)

Processing ticker: VOO
Found 301 filings, most recent: 2025-09-30

📦 Batch 1/76


VOO batch 1:  25%|██▌       | 1/4 [00:01<00:04,  1.43s/it]

  ✅ VANGUARD GROWTH INDEX FUND: 166 holdings, 4 unmatched
  ✅ VANGUARD MID-CAP VALUE INDEX FUND: 186 holdings, 4 unmatched


VOO batch 1:  75%|███████▌  | 3/4 [00:02<00:00,  1.36it/s]

  ✅ VANGUARD EXTENDED MARKET INDEX FUND: 3458 holdings, 89 unmatched


VOO batch 1: 100%|██████████| 4/4 [00:05<00:00,  1.39s/it]

  ✅ VANGUARD SMALL-CAP INDEX FUND: 1335 holdings, 15 unmatched
  💾 Saved batch progress (4 filings)






📦 Batch 2/76


VOO batch 2: 100%|██████████| 4/4 [00:01<00:00,  2.77it/s]

  ✅ VANGUARD LARGE-CAP INDEX FUND: 458 holdings, 7 unmatched
  ✅ VANGUARD 500 INDEX FUND: 532 holdings, 8 unmatched
  ✅ VANGUARD MID-CAP GROWTH INDEX FUND: 125 holdings, 2 unmatched
  ✅ VANGUARD SMALL-CAP VALUE INDEX FUND: 849 holdings, 10 unmatched





  💾 Saved batch progress (8 filings)

📦 Batch 3/76


VOO batch 3:  75%|███████▌  | 3/4 [00:01<00:00,  2.51it/s]

  ✅ VANGUARD SMALL-CAP GROWTH INDEX FUND: 578 holdings, 9 unmatched
  ✅ VANGUARD MID-CAP INDEX FUND: 302 holdings, 4 unmatched
  ✅ VANGUARD VALUE INDEX FUND: 333 holdings, 5 unmatched


VOO batch 3: 100%|██████████| 4/4 [00:02<00:00,  1.61it/s]

  ✅ VANGUARD TOTAL STOCK MARKET INDEX FUND: 3560 holdings, 70 unmatched
  💾 Saved batch progress (12 filings)






📦 Batch 4/76


VOO batch 4:   0%|          | 0/4 [00:01<?, ?it/s]

  ⚠️  Stopping - already processed fund: VANGUARD GROWTH INDEX FUND





  💾 Saved batch progress (12 filings)

✅ Completed VOO: 12 funds processed

Processing ticker: MGK
Found 550 filings, most recent: 2025-11-30

📦 Batch 1/138


MGK batch 1:  25%|██▌       | 1/4 [00:01<00:03,  1.05s/it]

  ✅ VANGUARD FTSE SOCIAL INDEX FUND: 391 holdings, 5 unmatched
  ✅ VANGUARD MEGA CAP INDEX FUND: 184 holdings, 4 unmatched
  ✅ VANGUARD HEALTH CARE INDEX FUND: 426 holdings, 8 unmatched


MGK batch 1: 100%|██████████| 4/4 [00:01<00:00,  3.19it/s]

  ✅ VANGUARD ESG U.S. STOCK ETF: 1277 holdings, 9 unmatched
  💾 Saved batch progress (4 filings)






📦 Batch 2/138


MGK batch 2: 100%|██████████| 4/4 [00:01<00:00,  3.34it/s]

  ✅ VANGUARD CONSUMER STAPLES INDEX FUND: 110 holdings, 3 unmatched
  ✅ VANGUARD ENERGY INDEX FUND: 114 holdings, 3 unmatched
  ✅ VANGUARD INFORMATION TECHNOLOGY INDEX FUND: 328 holdings, 3 unmatched
  ✅ VANGUARD MEGA CAP VALUE INDEX FUND: 126 holdings, 3 unmatched





  💾 Saved batch progress (8 filings)

📦 Batch 3/138


MGK batch 3:  50%|█████     | 2/4 [00:01<00:01,  1.87it/s]

  ✅ VANGUARD INDUSTRIALS INDEX FUND: 394 holdings, 4 unmatched
  ✅ VANGUARD FINANCIALS INDEX FUND: 421 holdings, 4 unmatched


MGK batch 3:  75%|███████▌  | 3/4 [00:04<00:01,  1.61s/it]

  ✅ VANGUARD EMERGING MARKETS EX-CHINA ETF: 1025 holdings, 530 unmatched


MGK batch 3: 100%|██████████| 4/4 [00:19<00:00,  4.92s/it]

  ✅ VANGUARD ESG INTERNATIONAL STOCK ETF: 6620 holdings, 3153 unmatched
  💾 Saved batch progress (12 filings)






📦 Batch 4/138


MGK batch 4:  25%|██▌       | 1/4 [00:01<00:03,  1.19s/it]

  ✅ VANGUARD COMMUNICATION SERVICES INDEX FUND: 126 holdings, 5 unmatched


MGK batch 4: 100%|██████████| 4/4 [00:02<00:00,  1.62it/s]

  ✅ VANGUARD GLOBAL WELLESLEY INCOME FUND: 595 holdings, 127 unmatched
  ✅ VANGUARD ESG U.S. CORPORATE BOND ETF: 2736 holdings, 104 unmatched
  ✅ VANGUARD GLOBAL WELLINGTON FUND: 647 holdings, 138 unmatched





  💾 Saved batch progress (16 filings)

📦 Batch 5/138


MGK batch 5:  25%|██▌       | 1/4 [00:01<00:03,  1.13s/it]

  ✅ VANGUARD UTILITIES INDEX FUND: 72 holdings, 2 unmatched
  ✅ VANGUARD MATERIALS INDEX FUND: 117 holdings, 2 unmatched


MGK batch 5: 100%|██████████| 4/4 [00:01<00:00,  2.58it/s]

  ✅ VANGUARD CONSUMER DISCRETIONARY INDEX FUND: 294 holdings, 9 unmatched
  ✅ VANGUARD INTERNATIONAL GROWTH FUND: 129 holdings, 39 unmatched





  💾 Saved batch progress (20 filings)

📦 Batch 6/138


MGK batch 6:  25%|██▌       | 1/4 [00:01<00:03,  1.30s/it]

  ✅ VANGUARD U.S. GROWTH FUND: 115 holdings, 7 unmatched


MGK batch 6:  75%|███████▌  | 3/4 [00:01<00:00,  2.37it/s]

  ✅ VANGUARD MEGA CAP GROWTH INDEX FUND: 70 holdings, 3 unmatched
  ✅ VANGUARD EXTENDED DURATION TREASURY INDEX FUND: 83 holdings, 1 unmatched


MGK batch 6:  75%|███████▌  | 3/4 [00:01<00:00,  1.75it/s]


  ⚠️  Stopping - already processed fund: VANGUARD FTSE SOCIAL INDEX FUND
  💾 Saved batch progress (23 filings)

✅ Completed MGK: 23 funds processed

Processing ticker: HEZU
Found 7963 filings, most recent: 2025-11-30

📦 Batch 1/1991


HEZU batch 1:  25%|██▌       | 1/4 [00:04<00:12,  4.02s/it]

  ✅ iShares Paris-Aligned Climate Optimized MSCI USA ETF: 138 holdings, 0 unmatched


HEZU batch 1:  50%|█████     | 2/4 [00:04<00:03,  1.77s/it]

  ✅ iShares Currency Hedged MSCI Eurozone ETF: 9 holdings, 4 unmatched


HEZU batch 1: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]

  ✅ iShares ESG Aware MSCI USA Small-Cap ETF: 916 holdings, 3 unmatched
  ✅ iShares ESG Aware MSCI USA Value ETF: 236 holdings, 2 unmatched
  💾 Saved batch progress (4 filings)






📦 Batch 2/1991


HEZU batch 2:  50%|█████     | 2/4 [00:04<00:03,  1.71s/it]

  ✅ iShares Climate Conscious & Transition MSCI USA ETF: 278 holdings, 2 unmatched
  ✅ iShares ESG Aware MSCI USA ETF: 283 holdings, 2 unmatched


HEZU batch 2: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]

  ✅ iShares MSCI UAE ETF: 59 holdings, 22 unmatched
  ✅ iShares MSCI Saudi Arabia ETF: 129 holdings, 50 unmatched
  💾 Saved batch progress (8 filings)






📦 Batch 3/1991


HEZU batch 3:  25%|██▌       | 1/4 [00:03<00:10,  3.63s/it]

  ✅ iShares ESG Advanced MSCI USA ETF: 305 holdings, 2 unmatched
  ✅ iShares Currency Hedged MSCI Japan ETF: 10 holdings, 7 unmatched


HEZU batch 3: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]

  ✅ iShares MSCI Finland ETF: 39 holdings, 32 unmatched
  ✅ iShares MSCI Ireland ETF: 30 holdings, 5 unmatched
  💾 Saved batch progress (12 filings)






📦 Batch 4/1991


HEZU batch 4:  25%|██▌       | 1/4 [00:03<00:11,  3.79s/it]

  ✅ iShares ESG Aware MSCI USA Growth ETF: 93 holdings, 0 unmatched


HEZU batch 4:  50%|█████     | 2/4 [00:04<00:03,  1.76s/it]

  ✅ iShares ESG MSCI USA Leaders ETF: 267 holdings, 2 unmatched


HEZU batch 4:  75%|███████▌  | 3/4 [00:04<00:01,  1.05s/it]

  ✅ iShares MSCI United Kingdom Small-Cap ETF: 202 holdings, 104 unmatched


HEZU batch 4: 100%|██████████| 4/4 [00:21<00:00,  5.33s/it]

  ✅ iShares National Muni Bond ETF: 6091 holdings, 2695 unmatched
  💾 Saved batch progress (16 filings)






📦 Batch 5/1991


HEZU batch 5:  25%|██▌       | 1/4 [00:04<00:12,  4.22s/it]

  ✅ iShares Agency Bond ETF: 110 holdings, 5 unmatched


HEZU batch 5:  50%|█████     | 2/4 [00:04<00:03,  1.91s/it]

  ✅ iShares 7-10 Year Treasury Bond ETF: 18 holdings, 0 unmatched


HEZU batch 5:  75%|███████▌  | 3/4 [00:08<00:02,  2.75s/it]

  ✅ iShares California Muni Bond ETF: 1389 holdings, 539 unmatched


HEZU batch 5: 100%|██████████| 4/4 [00:12<00:00,  3.02s/it]

  ✅ iShares Short-Term National Muni Bond ETF: 2554 holdings, 1212 unmatched
  💾 Saved batch progress (20 filings)






📦 Batch 6/1991


HEZU batch 6:  25%|██▌       | 1/4 [00:03<00:10,  3.55s/it]

  ✅ iShares 25+ Year Treasury STRIPS Bond ETF: 23 holdings, 0 unmatched


HEZU batch 6:  50%|█████     | 2/4 [00:04<00:03,  1.97s/it]

  ✅ iShares Short Treasury Bond ETF: 68 holdings, 0 unmatched


HEZU batch 6:  75%|███████▌  | 3/4 [00:04<00:01,  1.21s/it]

  ✅ iShares Long-Term National Muni Bond ETF: 241 holdings, 111 unmatched


HEZU batch 6: 100%|██████████| 4/4 [00:05<00:00,  1.43s/it]

  ✅ iShares New York Muni Bond ETF: 734 holdings, 313 unmatched
  💾 Saved batch progress (24 filings)






📦 Batch 7/1991


HEZU batch 7:  75%|███████▌  | 3/4 [00:04<00:01,  1.20s/it]

  ✅ iShares 3-7 Year Treasury Bond ETF: 84 holdings, 0 unmatched
  ✅ iShares 10-20 Year Treasury Bond ETF: 66 holdings, 0 unmatched
  ✅ iShares 0-3 Month Treasury Bond ETF: 24 holdings, 0 unmatched


HEZU batch 7: 100%|██████████| 4/4 [00:04<00:00,  1.25s/it]

  ✅ iShares BBB Rated Corporate Bond ETF: 1125 holdings, 56 unmatched
  💾 Saved batch progress (28 filings)






📦 Batch 8/1991


HEZU batch 8:  25%|██▌       | 1/4 [00:04<00:12,  4.20s/it]

  ✅ iShares 1-3 Year Treasury Bond ETF: 89 holdings, 0 unmatched


HEZU batch 8:  50%|█████     | 2/4 [00:04<00:04,  2.16s/it]

  ✅ iShares 20+ Year Treasury Bond ETF: 45 holdings, 0 unmatched


HEZU batch 8:  75%|███████▌  | 3/4 [00:05<00:01,  1.50s/it]

  ✅ iShares iBoxx $ Investment Grade Corporate Bond ETF: 3008 holdings, 192 unmatched


HEZU batch 8: 100%|██████████| 4/4 [00:17<00:00,  4.28s/it]

  ✅ iShares Broad USD Investment Grade Corporate Bond ETF: 11176 holdings, 1771 unmatched





  💾 Saved batch progress (32 filings)

📦 Batch 9/1991


HEZU batch 9:  25%|██▌       | 1/4 [00:04<00:14,  4.80s/it]

  ✅ iShares ESG Aware 1-5 Year USD Corporate Bond ETF: 1657 holdings, 99 unmatched


HEZU batch 9:  50%|█████     | 2/4 [00:05<00:04,  2.23s/it]

  ✅ iShares ESG MSCI EM Leaders ETF: 432 holdings, 165 unmatched


HEZU batch 9:  75%|███████▌  | 3/4 [00:06<00:01,  1.82s/it]

  ✅ iShares ESG Aware USD Corporate Bond ETF: 4171 holdings, 326 unmatched


HEZU batch 9: 100%|██████████| 4/4 [00:08<00:00,  2.18s/it]

  ✅ iShares Core 5-10 Year USD Bond ETF: 3441 holdings, 628 unmatched





  💾 Saved batch progress (36 filings)

📦 Batch 10/1991


HEZU batch 10:  25%|██▌       | 1/4 [00:05<00:17,  5.70s/it]

  ✅ iShares ESG Aware MSCI EAFE ETF: 368 holdings, 169 unmatched


HEZU batch 10:  50%|█████     | 2/4 [00:06<00:06,  3.01s/it]

  ✅ iShares MSCI China ETF: 567 holdings, 169 unmatched


HEZU batch 10:  75%|███████▌  | 3/4 [00:09<00:03,  3.08s/it]

  ✅ iShares ESG Advanced Universal USD Bond ETF: 4157 holdings, 721 unmatched


HEZU batch 10: 100%|██████████| 4/4 [00:10<00:00,  2.57s/it]

  ✅ iShares Intermediate Government/Credit Bond ETF: 6045 holdings, 540 unmatched





  💾 Saved batch progress (40 filings)

📦 Batch 11/1991


HEZU batch 11:  25%|██▌       | 1/4 [00:04<00:13,  4.34s/it]

  ✅ iShares MSCI Japan Value ETF: 110 holdings, 45 unmatched


HEZU batch 11:  50%|█████     | 2/4 [00:04<00:04,  2.03s/it]

  ✅ iShares MSCI India ETF: 166 holdings, 76 unmatched


HEZU batch 11:  75%|███████▌  | 3/4 [00:07<00:02,  2.34s/it]

  ✅ iShares 5-10 Year Investment Grade Corporate Bond ETF: 2963 holdings, 432 unmatched


HEZU batch 11: 100%|██████████| 4/4 [00:09<00:00,  2.37s/it]

  ✅ iShares Core 10+ Year USD Bond ETF: 3783 holdings, 614 unmatched





  💾 Saved batch progress (44 filings)

📦 Batch 12/1991


HEZU batch 12:  25%|██▌       | 1/4 [00:04<00:13,  4.40s/it]

  ✅ iShares MSCI China Small-Cap ETF: 269 holdings, 97 unmatched
  ✅ iShares MSCI Emerging Markets Value Factor ETF: 289 holdings, 116 unmatched


HEZU batch 12:  75%|███████▌  | 3/4 [00:09<00:02,  2.98s/it]

  ✅ iShares 1-5 Year Investment Grade Corporate Bond ETF: 4441 holdings, 681 unmatched


HEZU batch 12: 100%|██████████| 4/4 [00:21<00:00,  5.32s/it]

  ✅ iShares Core U.S. Aggregate Bond ETF: 12954 holdings, 2032 unmatched





  💾 Saved batch progress (48 filings)

📦 Batch 13/1991


HEZU batch 13:  25%|██▌       | 1/4 [00:03<00:11,  3.72s/it]

  ✅ iShares MSCI Kuwait ETF: 38 holdings, 14 unmatched


HEZU batch 13:  50%|█████     | 2/4 [00:04<00:03,  1.70s/it]

  ✅ iShares MSCI Peru and Global Exposure ETF: 27 holdings, 12 unmatched


HEZU batch 13:  75%|███████▌  | 3/4 [00:04<00:01,  1.04s/it]

  ✅ iShares MSCI Global Sustainable Development Goals ETF: 126 holdings, 45 unmatched


HEZU batch 13: 100%|██████████| 4/4 [00:09<00:00,  2.48s/it]

  ✅ iShares ESG Aware U.S. Aggregate Bond ETF: 5122 holdings, 679 unmatched





  💾 Saved batch progress (52 filings)

📦 Batch 14/1991


HEZU batch 14:  25%|██▌       | 1/4 [00:03<00:11,  3.88s/it]

  ✅ iShares High Yield Systematic Bond ETF: 268 holdings, 58 unmatched


HEZU batch 14:  50%|█████     | 2/4 [00:04<00:04,  2.13s/it]

  ✅ iShares MSCI Emerging Markets Quality Factor ETF: 315 holdings, 126 unmatched


HEZU batch 14:  75%|███████▌  | 3/4 [00:08<00:03,  3.01s/it]

  ✅ iShares 10+ Year Investment Grade Corporate Bond ETF: 3813 holdings, 664 unmatched


HEZU batch 14: 100%|██████████| 4/4 [00:26<00:00,  6.75s/it]

  ✅ iShares MBS ETF: 11185 holdings, 3391 unmatched





  💾 Saved batch progress (56 filings)

📦 Batch 15/1991


HEZU batch 15:  25%|██▌       | 1/4 [00:03<00:11,  3.87s/it]

  ✅ iShares MSCI Philippines ETF: 36 holdings, 12 unmatched


HEZU batch 15:  50%|█████     | 2/4 [00:04<00:03,  1.75s/it]

  ✅ iShares MSCI Denmark ETF: 44 holdings, 29 unmatched


HEZU batch 15:  75%|███████▌  | 3/4 [00:05<00:01,  1.52s/it]

  ✅ iShares MSCI India Small-Cap ETF: 501 holdings, 243 unmatched


HEZU batch 15: 100%|██████████| 4/4 [00:06<00:00,  1.73s/it]

  ✅ iShares Government/Credit Bond ETF: 3092 holdings, 287 unmatched





  💾 Saved batch progress (60 filings)

📦 Batch 16/1991


HEZU batch 16:  25%|██▌       | 1/4 [00:04<00:13,  4.60s/it]

  ✅ iShares ESG Advanced MSCI EAFE ETF: 418 holdings, 210 unmatched


HEZU batch 16:  50%|█████     | 2/4 [00:04<00:04,  2.08s/it]

  ✅ iShares Paris-Aligned Climate Optimized MSCI World ex USA ETF: 432 holdings, 194 unmatched


HEZU batch 16:  75%|███████▌  | 3/4 [00:06<00:01,  1.85s/it]

  ✅ iShares iBoxx $ High Yield Corporate Bond ETF: 1329 holdings, 354 unmatched


HEZU batch 16: 100%|██████████| 4/4 [00:08<00:00,  2.16s/it]

  ✅ iShares MSCI World Small-Cap ETF: 3383 holdings, 934 unmatched





  💾 Saved batch progress (64 filings)

📦 Batch 17/1991


HEZU batch 17:  25%|██▌       | 1/4 [00:03<00:10,  3.45s/it]

  ✅ iShares MSCI New Zealand ETF: 28 holdings, 6 unmatched


HEZU batch 17:  75%|███████▌  | 3/4 [00:04<00:01,  1.01s/it]

  ✅ iShares MSCI Indonesia ETF: 85 holdings, 46 unmatched
  ✅ iShares Investment Grade Systematic Bond ETF: 508 holdings, 29 unmatched


HEZU batch 17: 100%|██████████| 4/4 [00:05<00:00,  1.31s/it]

  ✅ iShares ESG Advanced MSCI EM ETF: 554 holdings, 203 unmatched





  💾 Saved batch progress (68 filings)

📦 Batch 18/1991


HEZU batch 18:  50%|█████     | 2/4 [00:04<00:03,  1.79s/it]

  ✅ iShares MSCI Qatar ETF: 33 holdings, 18 unmatched
  ✅ iShares MSCI Poland ETF: 36 holdings, 21 unmatched


HEZU batch 18:  75%|███████▌  | 3/4 [00:04<00:01,  1.23s/it]

  ✅ iShares MSCI Norway ETF: 60 holdings, 38 unmatched


HEZU batch 18: 100%|██████████| 4/4 [00:05<00:00,  1.27s/it]

  ✅ iShares MSCI Global Quality Factor ETF: 495 holdings, 148 unmatched





  💾 Saved batch progress (72 filings)

📦 Batch 19/1991


HEZU batch 19:  25%|██▌       | 1/4 [00:03<00:11,  3.93s/it]

  ✅ iShares MSCI United Kingdom ETF: 76 holdings, 23 unmatched
  ✅ iShares MSCI Brazil Small-Cap ETF: 74 holdings, 47 unmatched


HEZU batch 19:  75%|███████▌  | 3/4 [00:04<00:01,  1.13s/it]

  ✅ iShares MSCI China Multisector Tech ETF: 170 holdings, 45 unmatched


HEZU batch 19: 100%|██████████| 4/4 [00:04<00:00,  1.15s/it]

  ✅ iShares MSCI EAFE Min Vol Factor ETF: 240 holdings, 115 unmatched





  💾 Saved batch progress (76 filings)

📦 Batch 20/1991


HEZU batch 20:  25%|██▌       | 1/4 [00:04<00:12,  4.01s/it]

  ✅ iShares iBonds Dec 2027 Term Corporate ETF: 685 holdings, 42 unmatched


HEZU batch 20:  50%|█████     | 2/4 [00:04<00:03,  1.86s/it]

  ✅ iShares MSCI Intl Quality Factor ETF: 305 holdings, 135 unmatched


HEZU batch 20:  75%|███████▌  | 3/4 [00:04<00:01,  1.28s/it]

  ✅ iShares CMBS ETF: 521 holdings, 206 unmatched


HEZU batch 20: 100%|██████████| 4/4 [00:09<00:00,  2.37s/it]

  ✅ iShares MSCI EAFE Small-Cap ETF: 2029 holdings, 1161 unmatched





  💾 Saved batch progress (80 filings)

📦 Batch 21/1991


HEZU batch 21:  25%|██▌       | 1/4 [00:04<00:12,  4.24s/it]

  ✅ iShares iBonds Dec 2034 Term Corporate ETF: 375 holdings, 23 unmatched
  ✅ iShares iBonds Oct 2034 Term TIPS ETF: 2 holdings, 0 unmatched


HEZU batch 21:  75%|███████▌  | 3/4 [00:04<00:01,  1.18s/it]

  ✅ iShares Treasury Floating Rate Bond ETF: 10 holdings, 0 unmatched


HEZU batch 21: 100%|██████████| 4/4 [00:04<00:00,  1.20s/it]

  ✅ iShares iBonds Dec 2028 Term Corporate ETF: 698 holdings, 57 unmatched





  💾 Saved batch progress (84 filings)

📦 Batch 22/1991


HEZU batch 22:  25%|██▌       | 1/4 [00:03<00:10,  3.66s/it]

  ✅ iShares iBonds Dec 2034 Term Treasury ETF: 5 holdings, 0 unmatched


HEZU batch 22:  50%|█████     | 2/4 [00:04<00:03,  1.90s/it]

  ✅ iShares 1-3 Year International Treasury Bond ETF: 167 holdings, 155 unmatched


HEZU batch 22:  75%|███████▌  | 3/4 [00:04<00:01,  1.18s/it]

  ✅ iShares iBonds Dec 2029 Term Corporate ETF: 605 holdings, 44 unmatched


HEZU batch 22: 100%|██████████| 4/4 [00:04<00:00,  1.25s/it]

  ✅ iShares International Equity Factor ETF: 472 holdings, 223 unmatched





  💾 Saved batch progress (88 filings)

📦 Batch 23/1991


HEZU batch 23:  25%|██▌       | 1/4 [00:03<00:11,  3.84s/it]

  ✅ iShares 20+ Year Treasury Bond BuyWrite Strategy ETF: 3 holdings, 0 unmatched
  ✅ iShares Convertible Bond ETF: 367 holdings, 7 unmatched
  ✅ iShares iBonds Dec 2025 Term Corporate ETF: 69 holdings, 3 unmatched


HEZU batch 23: 100%|██████████| 4/4 [00:10<00:00,  2.66s/it]

  ✅ iShares Core MSCI EAFE ETF: 2618 holdings, 1428 unmatched





  💾 Saved batch progress (92 filings)

📦 Batch 24/1991


HEZU batch 24:  25%|██▌       | 1/4 [00:04<00:12,  4.07s/it]

  ✅ iShares iBonds Dec 2054 Term Treasury ETF: 4 holdings, 0 unmatched


HEZU batch 24:  50%|█████     | 2/4 [00:04<00:03,  1.80s/it]

  ✅ iShares Self-Driving EV and Tech ETF: 51 holdings, 8 unmatched


HEZU batch 24: 100%|██████████| 4/4 [00:04<00:00,  1.17s/it]

  ✅ iShares Currency Hedged MSCI EAFE ETF: 99 holdings, 12 unmatched
  ✅ iShares Global REIT ETF: 331 holdings, 94 unmatched





  💾 Saved batch progress (96 filings)

📦 Batch 25/1991


HEZU batch 25:  50%|█████     | 2/4 [00:04<00:03,  1.72s/it]

  ✅ iShares Currency Hedged MSCI EAFE Small-Cap ETF: 51 holdings, 2 unmatched
  ✅ iShares S&P 500 BuyWrite ETF: 3 holdings, 2 unmatched


HEZU batch 25: 100%|██████████| 4/4 [00:04<00:00,  1.14s/it]

  ✅ iShares International Developed Real Estate ETF: 264 holdings, 116 unmatched
  ✅ iShares MSCI EAFE Value ETF: 426 holdings, 208 unmatched





  💾 Saved batch progress (100 filings)

📦 Batch 26/1991


HEZU batch 26:  50%|█████     | 2/4 [00:04<00:03,  1.91s/it]

  ✅ iShares iBonds 2031 Term High Yield and Income ETF: 219 holdings, 51 unmatched
  ✅ iShares ESG Advanced High Yield Corporate Bond ETF: 626 holdings, 100 unmatched


HEZU batch 26:  75%|███████▌  | 3/4 [00:04<00:01,  1.18s/it]

  ✅ iShares MSCI China A ETF: 387 holdings, 132 unmatched


HEZU batch 26: 100%|██████████| 4/4 [00:07<00:00,  1.88s/it]

  ✅ iShares iBonds Dec 2030 Term Muni Bond ETF: 1206 holdings, 526 unmatched





  💾 Saved batch progress (104 filings)

📦 Batch 27/1991


HEZU batch 27:  25%|██▌       | 1/4 [00:04<00:14,  4.82s/it]

  ✅ iShares Floating Rate Bond ETF: 476 holdings, 108 unmatched
  ✅ iShares Environmentally Aware Real Estate ETF: 356 holdings, 109 unmatched


HEZU batch 27:  75%|███████▌  | 3/4 [00:07<00:02,  2.29s/it]

  ✅ iShares Core MSCI Pacific ETF: 1367 holdings, 676 unmatched


HEZU batch 27: 100%|██████████| 4/4 [00:15<00:00,  3.89s/it]

  ✅ iShares Core 1-5 Year USD Bond ETF: 7008 holdings, 1685 unmatched





  💾 Saved batch progress (108 filings)

📦 Batch 28/1991


HEZU batch 28:  25%|██▌       | 1/4 [00:04<00:14,  4.80s/it]

  ✅ iShares Russell 2000 BuyWrite ETF: 3 holdings, 2 unmatched


HEZU batch 28:  50%|█████     | 2/4 [00:07<00:06,  3.32s/it]

  ✅ iShares MSCI ACWI Low Carbon Target ETF: 981 holdings, 232 unmatched
  ✅ iShares Aaa - A Rated Corporate Bond ETF: 3361 holdings, 210 unmatched


HEZU batch 28: 100%|██████████| 4/4 [00:24<00:00,  6.14s/it]

  ✅ iShares Core International Aggregate Bond ETF: 7102 holdings, 4374 unmatched





  💾 Saved batch progress (112 filings)

📦 Batch 29/1991


HEZU batch 29:  25%|██▌       | 1/4 [00:04<00:12,  4.31s/it]

  ✅ iShares High Yield Corporate Bond BuyWrite Strategy ETF: 3 holdings, 0 unmatched


HEZU batch 29:  75%|███████▌  | 3/4 [00:05<00:01,  1.29s/it]

  ✅ iShares iBonds Dec 2030 Term Corporate ETF: 714 holdings, 48 unmatched
  ✅ iShares Global Equity Factor ETF: 630 holdings, 182 unmatched


HEZU batch 29: 100%|██████████| 4/4 [00:09<00:00,  2.44s/it]

  ✅ iShares Core MSCI International Developed Markets ETF: 2259 holdings, 1122 unmatched





  💾 Saved batch progress (116 filings)

📦 Batch 30/1991


HEZU batch 30:  25%|██▌       | 1/4 [00:05<00:16,  5.55s/it]

  ✅ iShares Fallen Angels USD Bond ETF: 156 holdings, 28 unmatched


HEZU batch 30:  50%|█████     | 2/4 [00:05<00:04,  2.50s/it]

  ✅ iShares GNMA Bond ETF: 313 holdings, 0 unmatched


HEZU batch 30:  75%|███████▌  | 3/4 [00:06<00:01,  1.52s/it]

  ✅ iShares MSCI EAFE Growth ETF: 368 holdings, 174 unmatched


HEZU batch 30: 100%|██████████| 4/4 [00:06<00:00,  1.65s/it]

  ✅ iShares MSCI Intl Value Factor ETF: 348 holdings, 153 unmatched





  💾 Saved batch progress (120 filings)

📦 Batch 31/1991


HEZU batch 31:  25%|██▌       | 1/4 [00:04<00:14,  4.71s/it]

  ✅ iShares Breakthrough Environmental Solutions ETF: 48 holdings, 13 unmatched


HEZU batch 31:  50%|█████     | 2/4 [00:05<00:05,  2.55s/it]

  ✅ iShares J.P. Morgan Broad USD Emerging Markets Bond ETF: 267 holdings, 66 unmatched


HEZU batch 31:  75%|███████▌  | 3/4 [00:06<00:01,  1.54s/it]

  ✅ iShares BB Rated Corporate Bond ETF: 1045 holdings, 205 unmatched


HEZU batch 31: 100%|██████████| 4/4 [00:14<00:00,  3.71s/it]

  ✅ iShares Core MSCI Total International Stock ETF: 4194 holdings, 1978 unmatched





  💾 Saved batch progress (124 filings)

📦 Batch 32/1991


HEZU batch 32:  25%|██▌       | 1/4 [00:05<00:15,  5.18s/it]

  ✅ iShares International Select Dividend ETF: 109 holdings, 43 unmatched


HEZU batch 32:  50%|█████     | 2/4 [00:06<00:05,  2.68s/it]

  ✅ iShares Genomics Immunology and Healthcare ETF: 47 holdings, 1 unmatched


HEZU batch 32:  75%|███████▌  | 3/4 [00:07<00:02,  2.17s/it]

  ✅ iShares iBonds 2026 Term High Yield and Income ETF: 359 holdings, 40 unmatched


HEZU batch 32: 100%|██████████| 4/4 [00:08<00:00,  2.07s/it]

  ✅ iShares Systematic Bond ETF: 635 holdings, 98 unmatched





  💾 Saved batch progress (128 filings)

📦 Batch 33/1991


HEZU batch 33:  50%|█████     | 2/4 [00:04<00:04,  2.00s/it]

  ✅ iShares iBonds 2025 Term High Yield and Income ETF: 35 holdings, 3 unmatched
  ✅ iShares Energy Storage & Materials ETF: 66 holdings, 22 unmatched


HEZU batch 33:  75%|███████▌  | 3/4 [00:07<00:02,  2.37s/it]

  ✅ iShares International Treasury Bond ETF: 891 holdings, 840 unmatched


HEZU batch 33: 100%|██████████| 4/4 [00:08<00:00,  2.05s/it]

  ✅ iShares 0-5 Year High Yield Corporate Bond ETF: 1176 holdings, 338 unmatched





  💾 Saved batch progress (132 filings)

📦 Batch 34/1991


HEZU batch 34:  25%|██▌       | 1/4 [00:06<00:19,  6.56s/it]

  ✅ iShares MSCI EAFE ETF: 700 holdings, 340 unmatched


HEZU batch 34:  50%|█████     | 2/4 [00:08<00:08,  4.04s/it]

  ✅ iShares International Small-Cap Equity Factor ETF: 1102 holdings, 617 unmatched


HEZU batch 34: 100%|██████████| 4/4 [00:10<00:00,  2.67s/it]

  ✅ iShares MSCI ACWI ex U.S. ETF: 1734 holdings, 724 unmatched
  ✅ iShares Broad USD High Yield Corporate Bond ETF: 1951 holdings, 527 unmatched





  💾 Saved batch progress (136 filings)

📦 Batch 35/1991


HEZU batch 35:  25%|██▌       | 1/4 [00:05<00:16,  5.37s/it]

  ✅ iShares Investment Grade Corporate Bond BuyWrite Strategy ETF: 3 holdings, 0 unmatched


HEZU batch 35:  50%|█████     | 2/4 [00:06<00:06,  3.07s/it]

  ✅ iShares 0-5 Year Investment Grade Corporate Bond ETF: 2887 holdings, 160 unmatched


HEZU batch 35:  75%|███████▌  | 3/4 [00:07<00:01,  1.78s/it]

  ✅ iShares iBonds Dec 2044 Term Treasury ETF: 9 holdings, 0 unmatched


HEZU batch 35: 100%|██████████| 4/4 [00:07<00:00,  1.87s/it]

  ✅ iShares MSCI Intl Momentum Factor ETF: 304 holdings, 128 unmatched





  💾 Saved batch progress (140 filings)

📦 Batch 36/1991


HEZU batch 36:  25%|██▌       | 1/4 [00:07<00:21,  7.21s/it]

  ✅ iShares Cybersecurity and Tech ETF: 36 holdings, 6 unmatched


HEZU batch 36:  75%|███████▌  | 3/4 [00:08<00:01,  1.91s/it]

  ✅ iShares USD Green Bond ETF: 331 holdings, 117 unmatched
  ✅ iShares China Large-Cap ETF: 53 holdings, 15 unmatched
❌ Error processing filing 0001410368-25-043373: The read operation timed out


HEZU batch 36: 100%|██████████| 4/4 [00:10<00:00,  2.68s/it]


  💾 Saved batch progress (143 filings)

📦 Batch 37/1991


HEZU batch 37:  50%|█████     | 2/4 [00:05<00:04,  2.43s/it]

  ✅ iShares Future Exponential Technologies ETF: 271 holdings, 39 unmatched
  ✅ iShares Neuroscience and Healthcare ETF: 61 holdings, 5 unmatched


HEZU batch 37:  75%|███████▌  | 3/4 [00:07<00:01,  1.99s/it]

  ✅ iShares iBonds 2027 Term High Yield and Income ETF: 163 holdings, 46 unmatched


HEZU batch 37: 100%|██████████| 4/4 [00:11<00:00,  2.76s/it]

  ✅ iShares MSCI ACWI ETF: 2255 holdings, 712 unmatched





  💾 Saved batch progress (147 filings)

📦 Batch 38/1991


HEZU batch 38:  50%|█████     | 2/4 [00:06<00:05,  2.77s/it]

  ✅ iShares iBonds Dec 2031 Term Corporate ETF: 462 holdings, 33 unmatched
  ✅ iShares Currency Hedged MSCI ACWI ex U.S. ETF: 185 holdings, 13 unmatched


HEZU batch 38:  75%|███████▌  | 3/4 [00:07<00:02,  2.06s/it]

  ✅ iShares MSCI Kokusai ETF: 1145 holdings, 276 unmatched


HEZU batch 38: 100%|██████████| 4/4 [00:08<00:00,  2.14s/it]

  ✅ iShares J.P. Morgan USD Emerging Markets Bond ETF: 662 holdings, 181 unmatched





  💾 Saved batch progress (151 filings)

📦 Batch 39/1991


HEZU batch 39:  25%|██▌       | 1/4 [00:06<00:18,  6.24s/it]

  ✅ iShares Select Dividend ETF: 104 holdings, 2 unmatched


HEZU batch 39:  50%|█████     | 2/4 [00:06<00:05,  2.95s/it]

  ✅ iShares Morningstar Small-Cap ETF: 1556 holdings, 9 unmatched


HEZU batch 39:  75%|███████▌  | 3/4 [00:07<00:01,  1.70s/it]

  ✅ iShares Core U.S. REIT ETF: 136 holdings, 0 unmatched


HEZU batch 39: 100%|██████████| 4/4 [00:08<00:00,  2.15s/it]

  ✅ iShares Core Dividend ETF: 439 holdings, 6 unmatched





  💾 Saved batch progress (155 filings)

📦 Batch 40/1991


HEZU batch 40:  50%|█████     | 2/4 [00:06<00:05,  2.61s/it]

  ✅ iShares Morningstar Mid-Cap Value ETF: 285 holdings, 2 unmatched
  ✅ iShares Core High Dividend ETF: 79 holdings, 1 unmatched
  ✅ iShares Morningstar Value ETF: 407 holdings, 5 unmatched


HEZU batch 40: 100%|██████████| 4/4 [00:08<00:00,  2.04s/it]

  ✅ iShares Core Dividend Growth ETF: 403 holdings, 3 unmatched





  💾 Saved batch progress (159 filings)

📦 Batch 41/1991


HEZU batch 41:  25%|██▌       | 1/4 [00:06<00:20,  6.87s/it]

  ✅ iShares U.S. Financial Services ETF: 104 holdings, 0 unmatched


HEZU batch 41:  50%|█████     | 2/4 [00:07<00:06,  3.12s/it]

  ✅ iShares LifePath Target Date 2050 ETF: 12 holdings, 3 unmatched


HEZU batch 41:  75%|███████▌  | 3/4 [00:08<00:02,  2.15s/it]

  ✅ iShares Morningstar U.S. Equity ETF: 567 holdings, 5 unmatched


HEZU batch 41: 100%|██████████| 4/4 [00:08<00:00,  2.20s/it]

  ✅ iShares MSCI Europe Small-Cap ETF: 855 holdings, 565 unmatched





  💾 Saved batch progress (163 filings)

📦 Batch 42/1991


HEZU batch 42:  25%|██▌       | 1/4 [00:08<00:24,  8.17s/it]

  ✅ iShares U.S. Transportation ETF: 47 holdings, 1 unmatched


HEZU batch 42:  75%|███████▌  | 3/4 [00:09<00:02,  2.17s/it]

  ✅ iShares LifePath Target Date 2070 ETF: 8 holdings, 3 unmatched
  ✅ iShares LifePath Target Date 2035 ETF: 15 holdings, 4 unmatched


HEZU batch 42: 100%|██████████| 4/4 [00:09<00:00,  2.33s/it]


  ✅ iShares U.S. Consumer Staples ETF: 58 holdings, 2 unmatched
  💾 Saved batch progress (167 filings)

📦 Batch 43/1991


HEZU batch 43:  25%|██▌       | 1/4 [00:05<00:15,  5.30s/it]

  ✅ iShares LifePath Target Date 2055 ETF: 8 holdings, 3 unmatched


HEZU batch 43:  50%|█████     | 2/4 [00:06<00:05,  2.62s/it]

  ✅ iShares U.S. Basic Materials ETF: 40 holdings, 0 unmatched


HEZU batch 43:  75%|███████▌  | 3/4 [00:06<00:01,  1.72s/it]

  ✅ iShares Core 40/60 Moderate Allocation ETF: 8 holdings, 3 unmatched


HEZU batch 43: 100%|██████████| 4/4 [00:07<00:00,  1.79s/it]

  ✅ iShares Dow Jones U.S. ETF: 972 holdings, 4 unmatched





  💾 Saved batch progress (171 filings)

📦 Batch 44/1991


HEZU batch 44:  25%|██▌       | 1/4 [00:05<00:15,  5.27s/it]

  ✅ iShares Global Clean Energy ETF: 126 holdings, 39 unmatched


HEZU batch 44:  50%|█████     | 2/4 [00:05<00:04,  2.36s/it]

  ✅ iShares LifePath Target Date 2040 ETF: 15 holdings, 4 unmatched


HEZU batch 44:  75%|███████▌  | 3/4 [00:07<00:02,  2.04s/it]

  ✅ iShares MSCI All Country Asia ex Japan ETF: 937 holdings, 335 unmatched
❌ Error processing filing 0001004726-25-003170: The read operation timed out


HEZU batch 44: 100%|██████████| 4/4 [00:10<00:00,  2.54s/it]


  💾 Saved batch progress (174 filings)

📦 Batch 45/1991


HEZU batch 45:  25%|██▌       | 1/4 [00:04<00:14,  4.98s/it]

  ✅ iShares MSCI USA Quality Factor ETF: 128 holdings, 1 unmatched


HEZU batch 45:  50%|█████     | 2/4 [00:05<00:05,  2.59s/it]

  ✅ iShares U.S. Consumer Discretionary ETF: 172 holdings, 3 unmatched


HEZU batch 45:  75%|███████▌  | 3/4 [00:06<00:01,  1.57s/it]

  ✅ iShares U.S. Healthcare ETF: 106 holdings, 0 unmatched


HEZU batch 45: 100%|██████████| 4/4 [00:06<00:00,  1.65s/it]

  ✅ iShares iBonds 1-5 Year Treasury Ladder ETF: 6 holdings, 0 unmatched





  💾 Saved batch progress (178 filings)

📦 Batch 46/1991


HEZU batch 46:  25%|██▌       | 1/4 [00:04<00:12,  4.11s/it]

  ✅ iShares ESG Optimized MSCI USA ETF: 189 holdings, 2 unmatched


HEZU batch 46:  50%|█████     | 2/4 [00:04<00:03,  1.94s/it]

  ✅ iShares MSCI USA Size Factor ETF: 548 holdings, 2 unmatched


HEZU batch 46:  75%|███████▌  | 3/4 [00:07<00:02,  2.34s/it]

  ✅ iShares Core MSCI Europe ETF: 1022 holdings, 604 unmatched
❌ Error processing filing 0001004726-25-003161: The read operation timed out


HEZU batch 46: 100%|██████████| 4/4 [00:08<00:00,  2.15s/it]


  💾 Saved batch progress (181 filings)

📦 Batch 47/1991


HEZU batch 47:  50%|█████     | 2/4 [00:04<00:04,  2.01s/it]

  ✅ iShares ESG MSCI KLD 400 ETF: 404 holdings, 3 unmatched
  ✅ iShares iBonds 1-5 Year TIPS Ladder ETF: 6 holdings, 5 unmatched


HEZU batch 47:  75%|███████▌  | 3/4 [00:05<00:01,  1.24s/it]

  ✅ iShares LifePath Target Date 2045 ETF: 14 holdings, 3 unmatched


HEZU batch 47: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it]

  ✅ iShares LifePath Target Date 2065 ETF: 8 holdings, 3 unmatched





  💾 Saved batch progress (185 filings)

📦 Batch 48/1991


HEZU batch 48:  25%|██▌       | 1/4 [00:04<00:12,  4.27s/it]

  ✅ iShares MSCI USA Min Vol Factor ETF: 174 holdings, 2 unmatched
  ✅ iShares Core 30/70 Conservative Allocation ETF: 8 holdings, 3 unmatched


HEZU batch 48: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]

  ✅ iShares U.S. Industrials ETF: 202 holdings, 1 unmatched
  ✅ iShares LifePath Target Date 2060 ETF: 8 holdings, 3 unmatched





  💾 Saved batch progress (189 filings)

📦 Batch 49/1991


HEZU batch 49:  25%|██▌       | 1/4 [00:03<00:11,  3.80s/it]

  ✅ iShares U.S. Utilities ETF: 46 holdings, 1 unmatched
  ✅ iShares U.S. Technology ETF: 144 holdings, 2 unmatched


HEZU batch 49: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]

  ✅ iShares Core 80/20 Aggressive Allocation ETF: 9 holdings, 4 unmatched
  ✅ iShares MSCI Europe Financials ETF: 93 holdings, 48 unmatched





  💾 Saved batch progress (193 filings)

📦 Batch 50/1991


HEZU batch 50: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]

  ✅ iShares Morningstar Multi-Asset Income ETF: 12 holdings, 2 unmatched
  ✅ iShares Core 60/40 Balanced Allocation ETF: 8 holdings, 3 unmatched
  ✅ iShares U.S. Financials ETF: 145 holdings, 1 unmatched
  ✅ iShares iBonds 1-5 Year High Yield and Income Ladder ETF: 6 holdings, 5 unmatched





  💾 Saved batch progress (197 filings)

📦 Batch 51/1991


HEZU batch 51:  25%|██▌       | 1/4 [00:04<00:12,  4.13s/it]

  ✅ iShares U.S. Energy ETF: 41 holdings, 0 unmatched


HEZU batch 51: 100%|██████████| 4/4 [00:04<00:00,  1.12s/it]

  ✅ iShares MSCI USA Momentum Factor ETF: 128 holdings, 2 unmatched
  ✅ iShares LifePath Retirement ETF: 14 holdings, 3 unmatched
  ✅ iShares iBonds 1-5 Year Corporate Ladder ETF: 6 holdings, 5 unmatched





  💾 Saved batch progress (201 filings)

📦 Batch 52/1991


HEZU batch 52:  50%|█████     | 2/4 [00:03<00:03,  1.62s/it]

  ✅ iShares MSCI USA Value Factor ETF: 149 holdings, 2 unmatched
  ✅ iShares Large Cap Max Buffer Dec ETF: 5 holdings, 3 unmatched
  ✅ iShares Large Cap 10% Target Buffer Sep ETF: 6 holdings, 4 unmatched


HEZU batch 52: 100%|██████████| 4/4 [00:04<00:00,  1.00s/it]


  ✅ iShares Large Cap Max Buffer Sep ETF: 5 holdings, 3 unmatched
  💾 Saved batch progress (205 filings)

📦 Batch 53/1991


HEZU batch 53:  25%|██▌       | 1/4 [00:03<00:11,  3.94s/it]

  ✅ iShares Large Cap Max Buffer Mar ETF: 7 holdings, 1 unmatched
  ✅ iShares Large Cap 10% Target Buffer Dec ETF: 6 holdings, 4 unmatched


HEZU batch 53: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]

  ✅ iShares Large Cap Accelerated Outcome ETF: 5 holdings, 3 unmatched
  ✅ iShares Large Cap Max Buffer Jun ETF: 5 holdings, 1 unmatched





  💾 Saved batch progress (209 filings)

📦 Batch 54/1991


HEZU batch 54:  25%|██▌       | 1/4 [00:03<00:11,  3.91s/it]

  ✅ iShares Large Cap 10% Target Buffer Jun ETF: 6 holdings, 1 unmatched


HEZU batch 54:  75%|███████▌  | 3/4 [00:04<00:01,  1.09s/it]

  ✅ iShares Large Cap 10% Target Buffer Mar ETF: 5 holdings, 1 unmatched
  ✅ iShares iBonds Dec 2032 Term Corporate ETF: 415 holdings, 23 unmatched


HEZU batch 54: 100%|██████████| 4/4 [00:04<00:00,  1.20s/it]

  ✅ iShares iBonds 2030 Term High Yield and Income ETF: 302 holdings, 62 unmatched





  💾 Saved batch progress (213 filings)

📦 Batch 55/1991


HEZU batch 55:  25%|██▌       | 1/4 [00:04<00:13,  4.66s/it]

  ✅ iShares iBonds Oct 2027 Term TIPS ETF: 6 holdings, 0 unmatched


HEZU batch 55:  75%|███████▌  | 3/4 [00:05<00:01,  1.19s/it]

  ✅ iShares iBonds Dec 2035 Term Treasury ETF: 4 holdings, 0 unmatched
  ✅ iShares iBonds 2029 Term High Yield and Income ETF: 433 holdings, 102 unmatched


HEZU batch 55: 100%|██████████| 4/4 [00:08<00:00,  2.16s/it]

  ✅ iShares iBonds Dec 2029 Term Muni Bond ETF: 1263 holdings, 606 unmatched





  💾 Saved batch progress (217 filings)

📦 Batch 56/1991


HEZU batch 56:  50%|█████     | 2/4 [00:03<00:03,  1.61s/it]

  ✅ iShares ESG Aware 80/20 Aggressive Allocation ETF: 6 holdings, 0 unmatched
  ✅ iShares ESG Aware 60/40 Balanced Allocation ETF: 6 holdings, 0 unmatched


HEZU batch 56:  75%|███████▌  | 3/4 [00:04<00:01,  1.13s/it]

  ✅ iShares iBonds Dec 2033 Term Corporate ETF: 332 holdings, 20 unmatched


HEZU batch 56: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]

  ✅ iShares iBonds Dec 2025 Term Muni Bond ETF: 301 holdings, 136 unmatched





  💾 Saved batch progress (221 filings)

📦 Batch 57/1991


HEZU batch 57:  25%|██▌       | 1/4 [00:04<00:12,  4.31s/it]

  ✅ iShares Morningstar Mid-Cap ETF: 411 holdings, 1 unmatched


HEZU batch 57:  50%|█████     | 2/4 [00:04<00:03,  1.90s/it]

  ✅ iShares iBonds Oct 2029 Term TIPS ETF: 7 holdings, 0 unmatched
  ✅ iShares U.S. Tech Breakthrough Multisector ETF: 171 holdings, 2 unmatched


HEZU batch 57: 100%|██████████| 4/4 [00:05<00:00,  1.50s/it]

  ✅ iShares iBonds Dec 2030 Term Treasury ETF: 24 holdings, 0 unmatched





  💾 Saved batch progress (225 filings)

📦 Batch 58/1991


HEZU batch 58:  25%|██▌       | 1/4 [00:04<00:12,  4.13s/it]

  ✅ iShares ESG Aware 30/70 Conservative Allocation ETF: 6 holdings, 0 unmatched
  ✅ iShares iBonds Dec 2045 Term Treasury ETF: 8 holdings, 0 unmatched
  ✅ iShares Morningstar Small-Cap Value ETF: 1118 holdings, 7 unmatched
❌ Error processing filing 0002071691-25-009485: The read operation timed out


HEZU batch 58: 100%|██████████| 4/4 [00:08<00:00,  2.01s/it]


  💾 Saved batch progress (228 filings)

📦 Batch 59/1991


HEZU batch 59: 100%|██████████| 4/4 [00:04<00:00,  1.04s/it]

  ✅ iShares iBonds Oct 2032 Term TIPS ETF: 4 holdings, 0 unmatched
  ✅ iShares iBonds Dec 2055 Term Treasury ETF: 4 holdings, 0 unmatched
  ✅ iShares iBonds Oct 2031 Term TIPS ETF: 3 holdings, 0 unmatched
  ✅ iShares iBonds Dec 2027 Term Treasury ETF: 48 holdings, 0 unmatched





  💾 Saved batch progress (232 filings)

📦 Batch 60/1991


HEZU batch 60:  25%|██▌       | 1/4 [00:04<00:13,  4.43s/it]

  ✅ iShares iBonds Dec 2032 Term Treasury ETF: 14 holdings, 0 unmatched


HEZU batch 60:  50%|█████     | 2/4 [00:04<00:04,  2.05s/it]

  ✅ iShares ESG Aware 40/60 Moderate Allocation ETF: 6 holdings, 0 unmatched
  ✅ iShares MSCI USA Small-Cap Min Vol Factor ETF: 299 holdings, 2 unmatched


HEZU batch 60: 100%|██████████| 4/4 [00:05<00:00,  1.42s/it]

  ✅ iShares U.S. Equity Factor ETF: 290 holdings, 1 unmatched





  💾 Saved batch progress (236 filings)

📦 Batch 61/1991


HEZU batch 61:  25%|██▌       | 1/4 [00:03<00:11,  3.92s/it]

  ✅ iShares iBonds Dec 2033 Term Treasury ETF: 5 holdings, 0 unmatched
  ✅ iShares iBonds Oct 2030 Term TIPS ETF: 4 holdings, 0 unmatched


HEZU batch 61:  75%|███████▌  | 3/4 [00:08<00:02,  2.54s/it]

  ✅ iShares iBonds Dec 2026 Term Muni Bond ETF: 1399 holdings, 652 unmatched
❌ Error processing filing 0002071691-25-009471: The read operation timed out


HEZU batch 61: 100%|██████████| 4/4 [00:08<00:00,  2.08s/it]


  💾 Saved batch progress (239 filings)

📦 Batch 62/1991


HEZU batch 62:  25%|██▌       | 1/4 [00:03<00:10,  3.67s/it]

  ✅ iShares 0-5 Year TIPS Bond ETF: 26 holdings, 0 unmatched


HEZU batch 62:  50%|█████     | 2/4 [00:04<00:03,  1.72s/it]

  ✅ iShares iBonds Dec 2026 Term Treasury ETF: 45 holdings, 0 unmatched


HEZU batch 62:  75%|███████▌  | 3/4 [00:04<00:01,  1.05s/it]

  ✅ iShares U.S. Treasury Bond ETF: 214 holdings, 0 unmatched


HEZU batch 62: 100%|██████████| 4/4 [00:05<00:00,  1.31s/it]

  ✅ iShares iBonds Dec 2031 Term Muni Bond ETF: 508 holdings, 237 unmatched





  💾 Saved batch progress (243 filings)

📦 Batch 63/1991


HEZU batch 63:  75%|███████▌  | 3/4 [00:03<00:01,  1.02s/it]

  ✅ iShares Morningstar Mid-Cap Growth ETF: 276 holdings, 0 unmatched
  ✅ iShares iBonds Oct 2026 Term TIPS ETF: 6 holdings, 0 unmatched
  ✅ iShares iBonds Dec 2029 Term Treasury ETF: 27 holdings, 0 unmatched


HEZU batch 63: 100%|██████████| 4/4 [00:28<00:00,  7.00s/it]

  ✅ iShares Core Universal USD Bond ETF: 17464 holdings, 2997 unmatched





  💾 Saved batch progress (247 filings)

📦 Batch 64/1991


HEZU batch 64:  25%|██▌       | 1/4 [00:04<00:13,  4.34s/it]

  ✅ iShares iBonds Dec 2028 Term Treasury ETF: 36 holdings, 0 unmatched


HEZU batch 64:  75%|███████▌  | 3/4 [00:04<00:01,  1.15s/it]

  ✅ iShares iBonds Dec 2025 Term Treasury ETF: 7 holdings, 0 unmatched
  ✅ iShares U.S. Small-Cap Equity Factor ETF: 870 holdings, 7 unmatched


HEZU batch 64: 100%|██████████| 4/4 [00:09<00:00,  2.28s/it]

  ✅ iShares iBonds Dec 2028 Term Muni Bond ETF: 1715 holdings, 800 unmatched





  💾 Saved batch progress (251 filings)

📦 Batch 65/1991


HEZU batch 65:  25%|██▌       | 1/4 [00:04<00:12,  4.04s/it]

  ✅ iShares Morningstar Growth ETF: 357 holdings, 1 unmatched


HEZU batch 65: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]

  ✅ iShares iBonds Oct 2033 Term TIPS ETF: 3 holdings, 0 unmatched
  ✅ iShares iBonds 2028 Term High Yield and Income ETF: 285 holdings, 61 unmatched
  ✅ iShares iBonds Dec 2031 Term Treasury ETF: 17 holdings, 0 unmatched





  💾 Saved batch progress (255 filings)

📦 Batch 66/1991


HEZU batch 66:  25%|██▌       | 1/4 [00:03<00:11,  3.82s/it]

  ✅ iShares iBonds Oct 2035 Term TIPS ETF: 3 holdings, 0 unmatched


HEZU batch 66:  75%|███████▌  | 3/4 [00:04<00:00,  1.00it/s]

  ✅ iShares Yield Optimized Bond ETF: 13 holdings, 1 unmatched
  ✅ iShares TIPS Bond ETF: 49 holdings, 0 unmatched


HEZU batch 66: 100%|██████████| 4/4 [00:08<00:00,  2.18s/it]

  ✅ iShares iBonds Dec 2027 Term Muni Bond ETF: 1551 holdings, 730 unmatched





  💾 Saved batch progress (259 filings)

📦 Batch 67/1991


HEZU batch 67:  50%|█████     | 2/4 [00:04<00:03,  1.87s/it]

  ✅ iShares iBonds Oct 2028 Term TIPS ETF: 7 holdings, 0 unmatched
  ✅ iShares Nasdaq Top 30 Stocks ETF: 34 holdings, 0 unmatched


HEZU batch 67: 100%|██████████| 4/4 [00:04<00:00,  1.17s/it]


  ✅ iShares iBonds Dec 2035 Term Corporate ETF: 385 holdings, 15 unmatched
  ✅ iShares Morningstar Small-Cap Growth ETF: 940 holdings, 2 unmatched
  💾 Saved batch progress (263 filings)

📦 Batch 68/1991


HEZU batch 68:  25%|██▌       | 1/4 [00:04<00:13,  4.64s/it]

  ✅ iShares Russell 2000 Growth ETF: 1105 holdings, 13 unmatched
  ✅ iShares Russell Mid-Cap Growth ETF: 285 holdings, 0 unmatched
  ✅ iShares Russell 2000 Value ETF: 1431 holdings, 18 unmatched


HEZU batch 68: 100%|██████████| 4/4 [00:04<00:00,  1.21s/it]

  ✅ iShares Nasdaq-100 ex Top 30 ETF: 72 holdings, 0 unmatched





  💾 Saved batch progress (267 filings)

📦 Batch 69/1991


HEZU batch 69:  50%|█████     | 2/4 [00:04<00:03,  1.78s/it]

  ✅ iShares Semiconductor ETF: 33 holdings, 0 unmatched
  ✅ iShares North American Natural Resources ETF: 131 holdings, 1 unmatched


HEZU batch 69:  75%|███████▌  | 3/4 [00:04<00:01,  1.04s/it]

  ✅ iShares Expanded Tech-Software Sector ETF: 117 holdings, 1 unmatched


HEZU batch 69: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it]

  ✅ iShares Russell 2000 ETF: 1981 holdings, 21 unmatched





  💾 Saved batch progress (271 filings)

📦 Batch 70/1991


HEZU batch 70: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]

  ✅ iShares S&P Mid-Cap 400 Value ETF: 303 holdings, 2 unmatched
  ✅ iShares S&P 500 ex S&P 100 ETF: 403 holdings, 4 unmatched
  ✅ iShares Expanded Tech Sector ETF: 283 holdings, 1 unmatched
  ✅ iShares Russell Mid-Cap ETF: 821 holdings, 4 unmatched





  💾 Saved batch progress (275 filings)

📦 Batch 71/1991


HEZU batch 71:  25%|██▌       | 1/4 [00:04<00:13,  4.51s/it]

  ✅ iShares Russell 2500 ETF: 506 holdings, 3 unmatched


HEZU batch 71:  50%|█████     | 2/4 [00:04<00:04,  2.01s/it]

  ✅ iShares S&P Small-Cap 600 Growth ETF: 363 holdings, 0 unmatched
  ✅ iShares U.S. Digital Infrastructure and Real Estate ETF: 30 holdings, 1 unmatched


HEZU batch 71: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it]

  ✅ iShares Core S&P 500 ETF: 507 holdings, 5 unmatched





  💾 Saved batch progress (279 filings)

📦 Batch 72/1991


HEZU batch 72:  50%|█████     | 2/4 [00:04<00:03,  1.77s/it]

  ✅ iShares Biotechnology ETF: 254 holdings, 3 unmatched
  ✅ iShares S&P 500 3% Capped ETF: 504 holdings, 5 unmatched


HEZU batch 72:  75%|███████▌  | 3/4 [00:04<00:01,  1.11s/it]

  ✅ iShares Core S&P Total U.S. Stock Market ETF: 2492 holdings, 26 unmatched
❌ Error processing filing 0002071691-25-007619: The read operation timed out


HEZU batch 72: 100%|██████████| 4/4 [00:08<00:00,  2.10s/it]


  💾 Saved batch progress (282 filings)

📦 Batch 73/1991


HEZU batch 73:  25%|██▌       | 1/4 [00:04<00:12,  4.12s/it]

  ✅ iShares S&P Mid-Cap 400 Growth ETF: 248 holdings, 0 unmatched
  ✅ iShares Global Healthcare ETF: 117 holdings, 17 unmatched


HEZU batch 73:  75%|███████▌  | 3/4 [00:05<00:01,  1.41s/it]

  ✅ iShares Micro-Cap ETF: 1345 holdings, 16 unmatched


HEZU batch 73: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it]

  ✅ iShares JPX-Nikkei 400 ETF: 392 holdings, 185 unmatched





  💾 Saved batch progress (286 filings)

📦 Batch 74/1991


HEZU batch 74:  25%|██▌       | 1/4 [00:04<00:12,  4.27s/it]

  ✅ iShares Global Materials ETF: 92 holdings, 28 unmatched
  ✅ iShares Lithium Miners and Producers ETF: 36 holdings, 4 unmatched


HEZU batch 74: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]

  ✅ iShares Copper and Metals Mining ETF: 51 holdings, 8 unmatched
  ✅ iShares Global Financials ETF: 219 holdings, 55 unmatched





  💾 Saved batch progress (290 filings)

📦 Batch 75/1991


HEZU batch 75:  25%|██▌       | 1/4 [00:04<00:12,  4.04s/it]

  ✅ iShares Future Metaverse Tech and Communications ETF: 38 holdings, 7 unmatched


HEZU batch 75:  50%|█████     | 2/4 [00:04<00:03,  1.78s/it]

  ✅ iShares Asia 50 ETF: 57 holdings, 13 unmatched


HEZU batch 75: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]

  ✅ iShares Global Consumer Discretionary ETF: 141 holdings, 40 unmatched
  ✅ iShares Global 100 ETF: 106 holdings, 15 unmatched





  💾 Saved batch progress (294 filings)

📦 Batch 76/1991


HEZU batch 76:  50%|█████     | 2/4 [00:04<00:03,  1.90s/it]

  ✅ iShares Blockchain and Tech ETF: 41 holdings, 1 unmatched
  ✅ iShares Global Timber & Forestry ETF: 29 holdings, 13 unmatched


HEZU batch 76:  75%|███████▌  | 3/4 [00:04<00:01,  1.09s/it]

  ✅ iShares Global Consumer Staples ETF: 99 holdings, 33 unmatched


HEZU batch 76: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it]

  ✅ iShares Global Utilities ETF: 72 holdings, 20 unmatched





  💾 Saved batch progress (298 filings)

📦 Batch 77/1991


HEZU batch 77:  50%|█████     | 2/4 [00:04<00:03,  1.83s/it]

  ✅ iShares Environmental Infrastructure and Industrials ETF: 71 holdings, 30 unmatched
  ✅ iShares Global Tech ETF: 124 holdings, 15 unmatched


HEZU batch 77:  75%|███████▌  | 3/4 [00:04<00:01,  1.06s/it]

  ✅ iShares Global Industrials ETF: 217 holdings, 71 unmatched


HEZU batch 77: 100%|██████████| 4/4 [00:04<00:00,  1.22s/it]

  ✅ iShares International Dividend Growth ETF: 420 holdings, 202 unmatched





  💾 Saved batch progress (302 filings)

📦 Batch 78/1991


HEZU batch 78:  25%|██▌       | 1/4 [00:04<00:12,  4.11s/it]

  ✅ iShares Emerging Markets Infrastructure ETF: 31 holdings, 10 unmatched
  ✅ iShares Global Energy ETF: 54 holdings, 7 unmatched
  ✅ iShares Global Infrastructure ETF: 79 holdings, 20 unmatched


HEZU batch 78: 100%|██████████| 4/4 [00:05<00:00,  1.31s/it]

  ✅ iShares International Developed Small Cap Value Factor ETF: 494 holdings, 306 unmatched





  💾 Saved batch progress (306 filings)

📦 Batch 79/1991


HEZU batch 79:  25%|██▌       | 1/4 [00:04<00:12,  4.06s/it]

  ✅ iShares Core S&P Small-Cap ETF: 612 holdings, 1 unmatched


HEZU batch 79: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]

  ✅ iShares Global Comm Services ETF: 73 holdings, 17 unmatched
  ✅ iShares S&P 500 Growth ETF: 219 holdings, 2 unmatched
  ✅ iShares Russell Top 200 Value ETF: 155 holdings, 4 unmatched





  💾 Saved batch progress (310 filings)

📦 Batch 80/1991


HEZU batch 80:  50%|█████     | 2/4 [00:04<00:03,  1.82s/it]

  ✅ iShares Russell Top 200 ETF: 201 holdings, 4 unmatched
  ✅ iShares ESG Select Screened S&P Mid-Cap ETF: 362 holdings, 2 unmatched


HEZU batch 80: 100%|██████████| 4/4 [00:04<00:00,  1.21s/it]

  ✅ iShares S&P 100 ETF: 103 holdings, 2 unmatched
  ✅ iShares Russell 3000 ETF: 2589 holdings, 29 unmatched





  💾 Saved batch progress (314 filings)

📦 Batch 81/1991


HEZU batch 81:  50%|█████     | 2/4 [00:04<00:03,  1.70s/it]

  ✅ iShares Core S&P U.S. Value ETF: 694 holdings, 9 unmatched
  ✅ iShares Russell Top 200 Growth ETF: 113 holdings, 2 unmatched


HEZU batch 81: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]

  ✅ iShares Mortgage Real Estate ETF: 34 holdings, 0 unmatched
  ✅ iShares S&P Small-Cap 600 Value ETF: 463 holdings, 1 unmatched





  💾 Saved batch progress (318 filings)

📦 Batch 82/1991


HEZU batch 82:  25%|██▌       | 1/4 [00:04<00:13,  4.41s/it]

  ✅ iShares ESG Select Screened S&P 500 ETF: 443 holdings, 6 unmatched
  ✅ iShares Core S&P U.S. Growth ETF: 465 holdings, 2 unmatched


HEZU batch 82: 100%|██████████| 4/4 [00:04<00:00,  1.20s/it]

  ✅ iShares Preferred and Income Securities ETF: 449 holdings, 2 unmatched
  ✅ iShares Russell Mid-Cap Value ETF: 722 holdings, 6 unmatched





  💾 Saved batch progress (322 filings)

📦 Batch 83/1991


HEZU batch 83:  50%|█████     | 2/4 [00:04<00:03,  1.75s/it]

  ✅ iShares Core S&P Mid-Cap ETF: 407 holdings, 2 unmatched
  ✅ iShares ESG Select Screened S&P Small-Cap ETF: 554 holdings, 1 unmatched
  ✅ iShares Residential and Multisector Real Estate ETF: 42 holdings, 0 unmatched


HEZU batch 83: 100%|██████████| 4/4 [00:04<00:00,  1.17s/it]

  ✅ iShares S&P 500 Value ETF: 399 holdings, 6 unmatched





  💾 Saved batch progress (326 filings)

📦 Batch 84/1991


HEZU batch 84:  50%|█████     | 2/4 [00:04<00:03,  1.85s/it]

  ✅ iShares U.S. Infrastructure ETF: 158 holdings, 2 unmatched
  ✅ iShares MSCI USA Quality GARP ETF: 133 holdings, 0 unmatched
  ✅ iShares Russell 1000 Growth ETF: 394 holdings, 2 unmatched


HEZU batch 84: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it]

  ✅ iShares U.S. Oil Equipment & Services ETF: 32 holdings, 1 unmatched





  💾 Saved batch progress (330 filings)

📦 Batch 85/1991


HEZU batch 85:  25%|██▌       | 1/4 [00:04<00:12,  4.26s/it]

  ✅ iShares U.S. Regional Banks ETF: 36 holdings, 0 unmatched
  ✅ iShares U.S. Medical Devices ETF: 53 holdings, 1 unmatched


HEZU batch 85: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]

  ✅ iShares U.S. Broker-Dealers & Securities Exchanges ETF: 36 holdings, 1 unmatched
  ✅ iShares U.S. Manufacturing ETF: 111 holdings, 1 unmatched





  💾 Saved batch progress (334 filings)

📦 Batch 86/1991


HEZU batch 86:  25%|██▌       | 1/4 [00:03<00:11,  3.77s/it]

  ✅ iShares US Small Cap Value Factor ETF: 256 holdings, 4 unmatched
  ✅ iShares U.S. Oil & Gas Exploration & Production ETF: 49 holdings, 0 unmatched


HEZU batch 86: 100%|██████████| 4/4 [00:04<00:00,  1.05s/it]

  ✅ iShares U.S. Pharmaceuticals ETF: 46 holdings, 0 unmatched
  ✅ iShares U.S. Telecommunications ETF: 25 holdings, 1 unmatched





  💾 Saved batch progress (338 filings)

📦 Batch 87/1991


HEZU batch 87:  75%|███████▌  | 3/4 [00:04<00:01,  1.13s/it]

  ✅ iShares U.S. Home Construction ETF: 49 holdings, 1 unmatched
  ✅ iShares Texas Equity ETF: 199 holdings, 1 unmatched
  ✅ iShares U.S. Aerospace & Defense ETF: 42 holdings, 1 unmatched


HEZU batch 87: 100%|██████████| 4/4 [00:04<00:00,  1.18s/it]

  ✅ iShares Europe ETF: 367 holdings, 173 unmatched





  💾 Saved batch progress (342 filings)

📦 Batch 88/1991


HEZU batch 88:  25%|██▌       | 1/4 [00:04<00:12,  4.12s/it]

  ✅ iShares U.S. Healthcare Providers ETF: 65 holdings, 0 unmatched


HEZU batch 88:  75%|███████▌  | 3/4 [00:04<00:01,  1.05s/it]

  ✅ iShares U.S. Real Estate ETF: 64 holdings, 0 unmatched
  ✅ iShares Russell 1000 ETF: 1015 holdings, 5 unmatched


HEZU batch 88: 100%|██████████| 4/4 [00:04<00:00,  1.17s/it]

  ✅ iShares India 50 ETF: 52 holdings, 22 unmatched





  💾 Saved batch progress (346 filings)

📦 Batch 89/1991


HEZU batch 89:  50%|█████     | 2/4 [00:04<00:03,  1.78s/it]

  ✅ iShares Future AI & Tech ETF: 51 holdings, 4 unmatched
  ✅ iShares Russell 1000 Value ETF: 873 holdings, 4 unmatched


HEZU batch 89:  75%|███████▌  | 3/4 [00:04<00:01,  1.10s/it]

  ✅ iShares U.S. Insurance ETF: 60 holdings, 1 unmatched


HEZU batch 89: 100%|██████████| 4/4 [00:04<00:00,  1.21s/it]

  ✅ iShares Latin America 40 ETF: 45 holdings, 14 unmatched





  💾 Saved batch progress (350 filings)

📦 Batch 90/1991


HEZU batch 90:   0%|          | 0/4 [00:04<?, ?it/s]

  ⚠️  Stopping - already processed fund: iShares 10-20 Year Treasury Bond ETF





  💾 Saved batch progress (350 filings)

✅ Completed HEZU: 350 funds processed

Processing ticker: VMGRX
Found 256 filings, most recent: 2025-10-31

📦 Batch 1/64


VMGRX batch 1: 100%|██████████| 4/4 [00:01<00:00,  2.52it/s]

  ✅ VANGUARD MID-CAP GROWTH FUND: 139 holdings, 3 unmatched
  ✅ VANGUARD INTERNATIONAL DIVIDEND GROWTH FUND: 47 holdings, 16 unmatched
  ✅ VANGUARD SELECTED VALUE FUND: 130 holdings, 2 unmatched
  ✅ VANGUARD HIGH DIVIDEND YIELD INDEX FUND: 571 holdings, 6 unmatched
  💾 Saved batch progress (4 filings)






📦 Batch 2/64


VMGRX batch 2:  50%|█████     | 2/4 [00:01<00:01,  1.42it/s]

  ✅ VANGUARD ADVICE SELECT DIVIDEND GROWTH FUND: 31 holdings, 2 unmatched
  ✅ VANGUARD ADVICE SELECT GLOBAL VALUE FUND: 107 holdings, 17 unmatched


VMGRX batch 2:  75%|███████▌  | 3/4 [00:02<00:00,  1.55it/s]

  ✅ VANGUARD INTERNATIONAL DIVIDEND APPRECIATION INDEX FUND: 349 holdings, 159 unmatched


VMGRX batch 2: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]

  ✅ VANGUARD EMERGING MARKETS GOVERNMENT BOND INDEX FUND: 841 holdings, 129 unmatched
  💾 Saved batch progress (8 filings)






📦 Batch 3/64


VMGRX batch 3:  25%|██▌       | 1/4 [00:01<00:04,  1.58s/it]

  ✅ VANGUARD ADVICE SELECT INTERNATIONAL GROWTH FUND: 29 holdings, 7 unmatched


VMGRX batch 3:  50%|█████     | 2/4 [00:01<00:01,  1.29it/s]

  ✅ VANGUARD GLOBAL MINIMUM VOLATILITY FUND: 238 holdings, 49 unmatched


VMGRX batch 3:  75%|███████▌  | 3/4 [00:02<00:00,  1.57it/s]

  ✅ VANGUARD INTERNATIONAL EXPLORER FUND: 340 holdings, 156 unmatched


VMGRX batch 3: 100%|██████████| 4/4 [00:05<00:00,  1.42s/it]

  ✅ VANGUARD INTERNATIONAL HIGH DIVIDEND YIELD INDEX FUND: 1559 holdings, 725 unmatched
  💾 Saved batch progress (12 filings)






📦 Batch 4/64


VMGRX batch 4:   0%|          | 0/4 [00:01<?, ?it/s]

  ⚠️  Stopping - already processed fund: VANGUARD ADVICE SELECT INTERNATIONAL GROWTH FUND





  💾 Saved batch progress (12 filings)

✅ Completed VMGRX: 12 funds processed

Processing ticker: VDIGX
Found 175 filings, most recent: 2025-10-31

📦 Batch 1/44


VDIGX batch 1:  25%|██▌       | 1/4 [00:01<00:04,  1.46s/it]

  ✅ VANGUARD ENERGY FUND: 42 holdings, 6 unmatched
  ✅ VANGUARD HEALTH CARE FUND: 99 holdings, 5 unmatched


VDIGX batch 1:  75%|███████▌  | 3/4 [00:01<00:00,  2.08it/s]

  ✅ VANGUARD GLOBAL CAPITAL CYCLES FUND: 75 holdings, 17 unmatched
❌ Error processing filing 0000734383-25-000149: The read operation timed out


VDIGX batch 1: 100%|██████████| 4/4 [00:05<00:00,  1.41s/it]


  💾 Saved batch progress (3 filings)

📦 Batch 2/44


VDIGX batch 2:  50%|█████     | 2/4 [00:01<00:01,  1.25it/s]

  ✅ VANGUARD GLOBAL ESG SELECT STOCK FUND: 38 holdings, 7 unmatched
  ✅ VANGUARD REAL ESTATE INDEX FUND: 165 holdings, 4 unmatched
  ⚠️  Stopping - already processed fund: VANGUARD ENERGY FUND
  💾 Saved batch progress (5 filings)






✅ Completed VDIGX: 5 funds processed


In [10]:
import pickle
import os
from pathlib import Path

# Specify the folder path
folder_path = './results/nport_cache'  # Update this path

# Initialize an empty list to store all objects
all_objects = []

# Get all files ending with '_processed.pickle' (or '_processed.pkl')
folder = Path(folder_path)
processed_files = list(folder.glob('*_processed.pickle')) + list(folder.glob('*_processed.pkl'))

# Read each pickle file and add objects to the array
for file_path in processed_files:
    try:
        with open(file_path, 'rb') as file:
            data = pickle.load(file)
            # If data is already a list, extend; otherwise append
            if isinstance(data, list):
                all_objects.extend(data)
            else:
                all_objects.append(data)
        print(f"Loaded: {file_path.name}")
    except Exception as e:
        print(f"Error loading {file_path.name}: {e}")

print(f"\nTotal objects loaded: {len(all_objects)}")

Loaded: VOO_processed.pkl
Loaded: MGK_processed.pkl
Loaded: VMGRX_processed.pkl
Loaded: VDIGX_processed.pkl
Loaded: HEZU_processed.pkl

Total objects loaded: 5


In [24]:
for result in all_objects:
    print(result['results'][0]['result'])
    break

                                          holding_name ticker_before  \
0                                          ABIOMED Inc          None   
1                                            Adobe Inc          None   
2                           Advanced Micro Devices Inc          None   
3                                           Airbnb Inc          None   
4                          Alnylam Pharmaceuticals Inc          None   
..                                                 ...           ...   
161                                                N/A          NQZ5   
162  Vanguard Cmt Funds-Vanguard Market Liquidity Fund          None   
163                                                N/A          None   
164                                                N/A          None   
165                        Royal Caribbean Cruises Ltd          None   

    ticker_after matched_ticker                  matched_title  similarity  \
0           None           None                          

In [16]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
from threading import Lock
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from src.simple_rag.extraction.nport import NPortProcessor
from src.simple_rag.models.fund import PortfolioHolding, Derivatives, NonDerivatives, FilingMetadata
from pathlib import Path

company_json_path = Path("./sec_data/company_tickers.json")

set_identity('luis.alvarez.conde@alumnos.upm.es')

def process_single_filing(filing, ticker, company_json_path):
    """Process a single filing - can be parallelized"""
    try:
        xml_data = filing.obj()
        fund_name = xml_data.get_fund_series().name
        reporting_period = xml_data.reporting_period
        portfolio_list = xml_data.investments
        derivatives = xml_data.derivatives
        series_id = xml_data.get_fund_series().series_id

        # Process holdings
        proc = NPortProcessor(company_tickers_json_path=company_json_path, min_similarity=0.74)
        holdings = proc.process_holdings(portfolio_list)
        result = proc.enrich_tickers(holdings, verbose=False)  # Set verbose=False to reduce I/O
        holdings_df = proc.to_df(holdings)
        filing_metadata = FilingMetadata(
            accession_number=filing.accession_number,
            reporting_date=reporting_period,
            filing_date=filing.filing_date,
            form=filing.form,
            url=filing.url
        )

        not_matches = result[result['matched_ticker'].isna() | (result['matched_ticker'] == '')]
        
        return {
            'fund_name': fund_name,
            'reporting_period': reporting_period,
            'holdings': holdings,
            'holdings_df': holdings_df,
            'result': result,
            'derivatives': derivatives,
            'not_matches': not_matches,
            'ticker': ticker,
            'series_id': series_id,
            'report_date': reporting_period,
            'nport_metadata': filing_metadata
        }
    except Exception as e:
        print(f"Error processing filing for {ticker}: {e}")
        return None

def process_ticker(ticker, company_json_path):
    """Process all filings for a single ticker - SEQUENTIAL within ticker"""
    try:
        nport_file = Company(ticker)
        filings = sorted(nport_file.get_filings(form="NPORT-P"), 
                        key=lambda x: x.report_date, reverse=True)
        
        if not filings:
            print(f"No filings found for {ticker}")
            return None
            
        print(f"Processing ticker: {ticker}, most recent filing date: {filings[0].report_date}")
        
        funds_processed_set = set()
        ticker_results = []
        
        # Process filings SEQUENTIALLY for this ticker (to respect the stop condition)
        for filing in filings:
            result = process_single_filing(filing, ticker, company_json_path)
            
            if result is not None:
                # Check if we've already processed this fund
                if result['fund_name'].lower() in funds_processed_set:
                    print(f"Stopping - already processed fund: {result['fund_name']}")
                    break
                
                funds_processed_set.add(result['fund_name'].lower())
                ticker_results.append(result)
                
                print(f"{ticker} - Fund: {result['fund_name']}, Holdings: {len(result['holdings'])}, Unmatched: {len(result['not_matches'])}")
        
        return {
            'ticker': ticker,
            'results': ticker_results,
            'funds_processed': list(funds_processed_set)
        }
    
    except Exception as e:
        print(f"Error processing ticker {ticker}: {e}")
        return None

# Main execution - PARALLEL across tickers only
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

# Use fewer workers to avoid overwhelming the system
max_workers = min(5, len(tickers))  # Start with 3 workers
print(f"Using {max_workers} workers for tickers")

all_results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_ticker = {
        executor.submit(process_ticker, ticker, company_json_path): ticker 
        for ticker in tickers
    }
    
    for future in tqdm(as_completed(future_to_ticker), total=len(tickers), desc="Processing tickers"):
        ticker = future_to_ticker[future]
        try:
            result = future.result()
            if result:
                all_results.append(result)
                print(f"\nCompleted {ticker}: {len(result['funds_processed'])} funds processed")
        except Exception as e:
            print(f"Error with ticker {ticker}: {e}")

# Update funds_total object with the results
print("\n=== Updating funds_total ===")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using 5 workers for tickers


Processing tickers:   0%|          | 0/5 [00:00<?, ?it/s]

Processing ticker: VOO, most recent filing date: 2025-09-30
VOO - Fund: VANGUARD MID-CAP VALUE INDEX FUND, Holdings: 186, Unmatched: 4
Processing ticker: MGK, most recent filing date: 2025-11-30
MGK - Fund: VANGUARD ESG U.S. STOCK ETF, Holdings: 1277, Unmatched: 9
Processing ticker: VMGRX, most recent filing date: 2025-10-31
MGK - Fund: VANGUARD HEALTH CARE INDEX FUND, Holdings: 426, Unmatched: 8
VMGRX - Fund: VANGUARD SELECTED VALUE FUND, Holdings: 130, Unmatched: 2VOO - Fund: VANGUARD EXTENDED MARKET INDEX FUND, Holdings: 3458, Unmatched: 89

MGK - Fund: VANGUARD FTSE SOCIAL INDEX FUND, Holdings: 391, Unmatched: 5
MGK - Fund: VANGUARD MEGA CAP INDEX FUND, Holdings: 184, Unmatched: 4Processing ticker: VDIGX, most recent filing date: 2025-10-31

VOO - Fund: VANGUARD GROWTH INDEX FUND, Holdings: 166, Unmatched: 4
VMGRX - Fund: VANGUARD HIGH DIVIDEND YIELD INDEX FUND, Holdings: 571, Unmatched: 6
VDIGX - Fund: VANGUARD DIVIDEND GROWTH FUND, Holdings: 56, Unmatched: 3
MGK - Fund: VANGUARD 

Processing tickers:  20%|██        | 1/5 [00:09<00:37,  9.33s/it]

Stopping - already processed fund: VANGUARD ENERGY FUND

Completed VDIGX: 7 funds processed
VOO - Fund: VANGUARD 500 INDEX FUND, Holdings: 532, Unmatched: 8
VOO - Fund: VANGUARD SMALL-CAP GROWTH INDEX FUND, Holdings: 578, Unmatched: 9
VOO - Fund: VANGUARD MID-CAP INDEX FUND, Holdings: 302, Unmatched: 4
Processing ticker: HEZU, most recent filing date: 2025-11-30
HEZU - Fund: iShares Currency Hedged MSCI Eurozone ETF, Holdings: 9, Unmatched: 4
VOO - Fund: VANGUARD TOTAL STOCK MARKET INDEX FUND, Holdings: 3560, Unmatched: 70
HEZU - Fund: iShares ESG Aware MSCI USA Small-Cap ETF, Holdings: 916, Unmatched: 3
HEZU - Fund: iShares Paris-Aligned Climate Optimized MSCI USA ETF, Holdings: 138, Unmatched: 0
VOO - Fund: VANGUARD VALUE INDEX FUND, Holdings: 333, Unmatched: 5
HEZU - Fund: iShares ESG Aware MSCI USA Value ETF, Holdings: 236, Unmatched: 2
VMGRX - Fund: VANGUARD EMERGING MARKETS GOVERNMENT BOND INDEX FUND, Holdings: 841, Unmatched: 129
HEZU - Fund: iShares MSCI Saudi Arabia ETF, Holdi

Processing tickers:  40%|████      | 2/5 [00:23<00:35, 11.90s/it]

Stopping - already processed fund: VANGUARD EXTENDED MARKET INDEX FUND

Completed VOO: 12 funds processed
HEZU - Fund: iShares Climate Conscious & Transition MSCI USA ETF, Holdings: 278, Unmatched: 2
HEZU - Fund: iShares ESG Aware MSCI USA ETF, Holdings: 283, Unmatched: 2
HEZU - Fund: iShares MSCI Ireland ETF, Holdings: 30, Unmatched: 5
HEZU - Fund: iShares ESG Advanced MSCI USA ETF, Holdings: 305, Unmatched: 2
VMGRX - Fund: VANGUARD INTERNATIONAL DIVIDEND APPRECIATION INDEX FUND, Holdings: 349, Unmatched: 159
HEZU - Fund: iShares Currency Hedged MSCI Japan ETF, Holdings: 10, Unmatched: 7
HEZU - Fund: iShares MSCI Finland ETF, Holdings: 39, Unmatched: 32
HEZU - Fund: iShares ESG MSCI USA Leaders ETF, Holdings: 267, Unmatched: 2
VMGRX - Fund: VANGUARD INTERNATIONAL EXPLORER FUND, Holdings: 340, Unmatched: 156
HEZU - Fund: iShares MSCI United Kingdom Small-Cap ETF, Holdings: 202, Unmatched: 104
HEZU - Fund: iShares ESG Aware MSCI USA Growth ETF, Holdings: 93, Unmatched: 0
VMGRX - Fund: V

Processing tickers:  60%|██████    | 3/5 [00:52<00:40, 20.14s/it]

Stopping - already processed fund: VANGUARD ADVICE SELECT INTERNATIONAL GROWTH FUND

Completed VMGRX: 12 funds processed
MGK - Fund: VANGUARD ESG INTERNATIONAL STOCK ETF, Holdings: 6620, Unmatched: 3153
MGK - Fund: VANGUARD EMERGING MARKETS EX-CHINA ETF, Holdings: 1025, Unmatched: 530
MGK - Fund: VANGUARD FINANCIALS INDEX FUND, Holdings: 421, Unmatched: 4
HEZU - Fund: iShares National Muni Bond ETF, Holdings: 6091, Unmatched: 2695
MGK - Fund: VANGUARD GLOBAL WELLESLEY INCOME FUND, Holdings: 595, Unmatched: 127
MGK - Fund: VANGUARD ESG U.S. CORPORATE BOND ETF, Holdings: 2736, Unmatched: 104
MGK - Fund: VANGUARD GLOBAL WELLINGTON FUND, Holdings: 647, Unmatched: 138
MGK - Fund: VANGUARD COMMUNICATION SERVICES INDEX FUND, Holdings: 126, Unmatched: 5
MGK - Fund: VANGUARD UTILITIES INDEX FUND, Holdings: 72, Unmatched: 2
MGK - Fund: VANGUARD INTERNATIONAL GROWTH FUND, Holdings: 129, Unmatched: 39
MGK - Fund: VANGUARD MATERIALS INDEX FUND, Holdings: 117, Unmatched: 2
MGK - Fund: VANGUARD CONSU

Processing tickers:  80%|████████  | 4/5 [01:36<00:29, 29.40s/it]

Stopping - already processed fund: VANGUARD FTSE SOCIAL INDEX FUND

Completed MGK: 23 funds processed
HEZU - Fund: iShares Short-Term National Muni Bond ETF, Holdings: 2554, Unmatched: 1212
HEZU - Fund: iShares California Muni Bond ETF, Holdings: 1389, Unmatched: 539
HEZU - Fund: iShares 7-10 Year Treasury Bond ETF, Holdings: 18, Unmatched: 0
HEZU - Fund: iShares Agency Bond ETF, Holdings: 110, Unmatched: 5
HEZU - Fund: iShares 25+ Year Treasury STRIPS Bond ETF, Holdings: 23, Unmatched: 0
HEZU - Fund: iShares New York Muni Bond ETF, Holdings: 734, Unmatched: 313
HEZU - Fund: iShares Long-Term National Muni Bond ETF, Holdings: 241, Unmatched: 111
HEZU - Fund: iShares Short Treasury Bond ETF, Holdings: 68, Unmatched: 0
HEZU - Fund: iShares BBB Rated Corporate Bond ETF, Holdings: 1125, Unmatched: 56
HEZU - Fund: iShares 0-3 Month Treasury Bond ETF, Holdings: 24, Unmatched: 0
HEZU - Fund: iShares 10-20 Year Treasury Bond ETF, Holdings: 66, Unmatched: 0
HEZU - Fund: iShares 3-7 Year Treasur

Processing tickers: 100%|██████████| 5/5 [08:22<00:00, 100.43s/it]

Stopping - already processed fund: iShares National Muni Bond ETF

Completed HEZU: 356 funds processed

=== Updating funds_total ===





In [18]:
for fund in funds_total:
    print(fund.name)
    if "vanguard" not in fund.name.lower() and "ishares" not in fund.name.lower():
        fund.name = "Vanguard " + fund.name
        

Vanguard Extended Market Index Fund
Vanguard Extended Market Index Fund
Vanguard Extended Market Index Fund
Vanguard Extended Market Index Fund
Vanguard Extended Market Index Fund
Vanguard Extended Market Index Fund
Vanguard Mid-Cap Index Fund
Vanguard Mid-Cap Index Fund
Vanguard Mid-Cap Index Fund
Vanguard Mid-Cap Index Fund
Vanguard Mid-Cap Index Fund
Vanguard Mid-Cap Growth Index Fund
Vanguard Mid-Cap Growth Index Fund
Vanguard Mid-Cap Growth Index Fund
Vanguard Mid-Cap Value Index Fund
Vanguard Mid-Cap Value Index Fund
Vanguard Mid-Cap Value Index Fund
Vanguard Small-Cap Index Fund
Vanguard Small-Cap Index Fund
Vanguard Small-Cap Index Fund
Vanguard Small-Cap Index Fund
Vanguard Small-Cap Index Fund
Vanguard Small-Cap Growth Index Fund
Vanguard Small-Cap Growth Index Fund
Vanguard Small-Cap Growth Index Fund
Vanguard Small-Cap Growth Index Fund
Vanguard Small-Cap Value Index Fund
Vanguard Small-Cap Value Index Fund
Vanguard Small-Cap Value Index Fund
Vanguard Small-Cap Value Index 

In [19]:
print("="*60)
print("SIMPLE NAME MATCHING ANALYSIS")
print("="*60)

unmatched_count = 0
matched_count = 0
unmateched_funds = []

for ticker_result in all_results:
    for filing in ticker_result['results']:
        filing_name = filing['fund_name']
        print(filing_name)
        
        # We will collect all "candidates" from your funds_total list
        found_candidates = []
        
        for fund in funds_total:
            # 1. Clean both names (lowercase, strip whitespace)
            f_name = filing_name.lower().strip()
            db_name = fund.name.lower().strip()
            
            # 2. Check: Is one inside the other?
            # We check both directions: "Vanguard 500" in "Vanguard 500 Index" OR vice versa
            if f_name in db_name:
                found_candidates.append(fund)
            
        # --- PRINT RESULTS ---
        if not found_candidates:
            unmatched_count += 1
            unmateched_funds.append(filing)
            print(f"❌ NO MATCH FOUND")
            print(f"   Filing Name: '{filing_name}'")
            print("   (Check: Is the spelling significantly different?)")
            print("-" * 40)
        else:
            matched_count += 1
            # Only print if you want to verify the logic (optional: comment out to reduce noise)
            print(f"✅  MATCHED: '{filing_name}'")
            print(f"   -> Linked to {len(found_candidates)} funds in DB:")
            found_name = found_candidates[0].name
            for c in found_candidates:
                if c.name == found_name:
                    print(f"      • {c.ticker} | {c.name}")
                    c.series_id = filing['series_id']
            print("-" * 40)

print("\n" + "="*30)
print("SUMMARY")
print(f"Total Filings Processed: {matched_count + unmatched_count}")
print(f"Matches Found:           {matched_count}")
print(f"Unmatched:               {unmatched_count}")
print("="*30)

SIMPLE NAME MATCHING ANALYSIS
VANGUARD DIVIDEND GROWTH FUND
✅  MATCHED: 'VANGUARD DIVIDEND GROWTH FUND'
   -> Linked to 1 funds in DB:
      • VDIGX | Vanguard Dividend Growth Fund
----------------------------------------
VANGUARD ENERGY FUND
✅  MATCHED: 'VANGUARD ENERGY FUND'
   -> Linked to 2 funds in DB:
      • VGENX | Vanguard Energy Fund
      • VGELX | Vanguard Energy Fund
----------------------------------------
VANGUARD GLOBAL CAPITAL CYCLES FUND
✅  MATCHED: 'VANGUARD GLOBAL CAPITAL CYCLES FUND'
   -> Linked to 1 funds in DB:
      • VGPMX | Vanguard Global Capital Cycles Fund
----------------------------------------
VANGUARD HEALTH CARE FUND
✅  MATCHED: 'VANGUARD HEALTH CARE FUND'
   -> Linked to 2 funds in DB:
      • VGHCX | Vanguard Health Care Fund
      • VGHAX | Vanguard Health Care Fund
----------------------------------------
VANGUARD REAL ESTATE INDEX FUND
✅  MATCHED: 'VANGUARD REAL ESTATE INDEX FUND'
   -> Linked to 4 funds in DB:
      • VGSIX | Vanguard Real Estat

In [9]:
for fund in funds_total:
    if fund.non_derivatives is not None:
        print(fund.series_id)
        break

In [24]:

processor = NPortProcessor()
for ticker_result in all_results:
    
    for filing_result in ticker_result['results']:
        print(f"Processing ticker: {filing_result['ticker']} {filing_result['fund_name']}")
        ticker = filing_result['ticker']
        reporting_period = filing_result['reporting_period']
        
        # Use the pre-computed holdings DataFrame
        holdings_df = filing_result['holdings_df']  # ← Already processed and enriched
        raw_derivatives = filing_result['derivatives']
        series_id = filing_result['series_id']
        filing_metadata = filing_result['nport_metadata']
        
        # Update your funds_total structure
        for fund in funds_total:
            if hasattr(fund, 'series_id'):
                if series_id == fund.series_id:
                    print(f"Updating fund: {fund.name}")
                    
                    print(holdings_df)
                    fund.non_derivatives = NonDerivatives(
                    date=reporting_period,
                    holdings_df=holdings_df
                    )
                    df2 = processor.to_df(derivatives)
                    fund.derivatives = Derivatives(
                        date=reporting_period,
                        derivatives_df=df2
                    )
                    fund.series_id = series_id
                    fund.nport_metadata = filing_metadata
                    break
        
print("\n=== Processing Complete ===")
print(f"Total tickers processed: {len(all_results)}")
for result in all_results:
    print(f"{result['ticker']}: {len(result['funds_processed'])} funds")



def verify_fund_data_integrity(funds_list):
    """
    Iterates through funds to verify that the holdings DataFrame is populated 
    and counts how many are None or missing.
    """
    print("\n" + "="*40)
    print("DATA INTEGRITY VERIFICATION")
    print("="*40)
    
    none_count = 0
    valid_count = 0
    
    for fund in funds_list:
        # We need to check layers: 
        # 1. Does fund.non_derivatives exist? 
        # 2. Is it not None?
        # 3. Is the holdings_df inside it not None?
        
        has_data = False
        row_count = 0
        
        try:
            if (hasattr(fund, 'non_derivatives') and 
                fund.non_derivatives is not None and 
                fund.non_derivatives.holdings_df is not None):
                
                has_data = True
                row_count = len(fund.non_derivatives.holdings_df)
        except Exception:
            # If any attribute access fails, treat as no data
            has_data = False

        if has_data:
            valid_count += 1
            # Optional: Print success if you want to see the good ones
            # print(f"✓ {fund.name:<20} | Rows: {row_count}")
        else:
            none_count += 1
            print(f"❌ {fund.name:<20} | Status: DATAFRAME IS NONE/MISSING")

    print("-" * 40)
    print(f"Total Funds Checked: {len(funds_list)}")
    print(f"Valid DataFrames:    {valid_count}")
    print(f"None/Missing Values: {none_count}")
    print("=" * 40)

# Run the verification
# Note: Ensure 'funds_total' is defined in your scope (it was referenced in your loop)
if 'funds_total' in locals():
    verify_fund_data_integrity(funds_total)
else:
    print("Error: 'funds_total' variable not found in current scope.")

Processing ticker: VDIGX VANGUARD DIVIDEND GROWTH FUND
Updating fund: Vanguard Dividend Growth Fund
                                                 name  ticker      cusip  \
0                            JP Morgan Securities LLC    MGHL       None   
1                                     Credit Agricole   CRARF       None   
2                             Procter & Gamble Co/The      PG  742718109   
3                                      S&P Global Inc    SPGI  78409V104   
4                                           Apple Inc    AAPL  037833100   
5                     Bank of America Securities, LLC   BACRP       None   
6                              Trane Technologies PLC      TT  G8994E103   
7                     Bank of America Securities, LLC   BACRP       None   
8                                        Stryker Corp     SYK  863667101   
9                                      Microsoft Corp    MSFT  594918104   
10                                           Visa Inc       V  9

In [25]:
for fund in funds_total:
    if fund.non_derivatives.holdings_df is not None:
        print(fund.non_derivatives.holdings_df)
    break


                                    name ticker      cusip          isin  \
0                    Aadi Bioscience Inc   RKDA  00032Q104  US00032Q1040   
1                               AAON Inc   AAON  000360206  US0003602069   
2                               AAR Corp    AIR  000361105  US0003611052   
3                       ACCO Brands Corp   ACCO  00081T108  US00081T1088   
4     New Issuer: BB Company ID:69859197   None  000847103  US0008471031   
...                                  ...    ...        ...           ...   
3453                      Dorian LPG Ltd    LPG  Y2106R110  MHY2106R1100   
3454                            Flex Ltd   FLEX  Y2573F102  SG9999000020   
3455        Genco Shipping & Trading Ltd    GNK  Y2685T131  MHY2685T1313   
3456           International Seaways Inc   INSW  Y41053102  MHY410531021   
3457              WaVe Life Sciences Ltd    WVE  Y95308105  SG9999014716   

                       lei     shares  market_value      weight_pct currency  \
0     5

In [26]:
import pickle
from pathlib import Path

PKL_PATH = Path("./funds_backup_metadata.pkl")
TMP_PATH = PKL_PATH.with_suffix(PKL_PATH.suffix + ".tmp")

with TMP_PATH.open("wb") as f:
    pickle.dump(funds_total, f, protocol=pickle.HIGHEST_PROTOCOL)

TMP_PATH.replace(PKL_PATH)

print(f"Saved {len(funds_total)} funds to pickle file: {PKL_PATH.resolve()}")
print(f"File size: {PKL_PATH.stat().st_size / (1024 * 1024):.2f} MB")

Saved 460 funds to pickle file: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
File size: 82.53 MB


In [1]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("/home/luis/Desktop/code/RAG/src/")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup_metadata.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/luis/Desktop/code/RAG/notebooks
PKL_PATH resolves to: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
Loaded 460 funds from pickle file


## Processing Phase

### General information about the fund

In [37]:

for fund in funds_total:
    if 'Vanguard' in fund.name:
        fund.provider = 'The Vanguard Group, Inc'
    else:
        fund.provider = 'BlackRock, Inc'

In [38]:
registrants = set()
for fund in funds_total:
    if '\xa0' in fund.registrant:
        fund.registrant = fund.registrant.replace('\xa0', ' ')
    registrants.add(fund.registrant)

print(registrants)

{'Vanguard Index Funds', 'Vanguard Specialized Funds', 'Vanguard World Fund', 'iShares Trust', 'Vanguard Whitehall Funds'}


In [39]:
from src.simple_rag.models.fund import ShareClassType

unique_share_classes = set(fund.share_class for fund in funds_total)
print("Unique Share Classes:")
for share_class in unique_share_classes:
    print(share_class)

from collections import Counter

# Count funds by share class
share_counts = Counter()
for fund in funds_total:
    if fund.share_class:
        if fund.share_class == ShareClassType.OTHER:
            fund.share_class = ShareClassType.ETF
        share_counts[fund.share_class] += 1

print("Funds by share class:")
for share_type, count in share_counts.items():
    print(f"  {share_type.value}: {count} funds")



Unique Share Classes:
ShareClassType.OTHER
ShareClassType.INSTITUTIONAL
ShareClassType.INVESTOR
ShareClassType.ETF
ShareClassType.ADMIRAL
ShareClassType.INSTITUTIONAL_SELECT
ShareClassType.INSTITUTIONAL_PLUS
Funds by share class:
  Investor Shares: 27 funds
  ETF Shares: 370 funds
  Admiral Shares: 40 funds
  Institutional Shares: 16 funds
  Institutional Plus Shares: 4 funds
  Institutional Select Shares: 3 funds


In [40]:
from collections import Counter
import unicodedata

def normalize_name(text):
    if not text:
        return "Unknown"
    # NFKD normalization turns non-breaking spaces into normal spaces
    return unicodedata.normalize("NFKD", text).strip()

# Your original loop, but with the fix applied:
share_counts = Counter()

for fund in funds_total:
    if fund.registrant:
        # Apply the fix here
        clean_registrant = normalize_name(fund.registrant)
        share_counts[clean_registrant] += 1

print("Funds by share class (Merged):")
for share_type, count in share_counts.items():
    print(f"  {share_type}: {count} funds")

Funds by share class (Merged):
  Vanguard Index Funds: 52 funds
  Vanguard World Fund: 41 funds
  Vanguard Specialized Funds: 14 funds
  Vanguard Whitehall Funds: 18 funds
  iShares Trust: 335 funds


In [41]:
from collections import Counter
import unicodedata

def normalize_name(text):
    if not text:
        return "Unknown"
    # NFKD normalization turns non-breaking spaces into normal spaces
    return unicodedata.normalize("NFKD", text).strip()

fund_counts = Counter()

for fund in funds_total:
    if fund.name:
        # 1. Normalize the name first
        clean_name = normalize_name(fund.name)
        
        # 2. Check if it starts with iShares or Vanguard
        # We check case-sensitively, but you could use .lower() if needed
        if not (clean_name.startswith("iShares") or clean_name.startswith("Vanguard")):
            # 3. Add 'Vanguard' if missing
            clean_name = f"Vanguard {clean_name}"
            
        # 4. Count the final modified name
        fund_counts[clean_name] += 1

print("Funds by name (Merged & Fixed):")
for fund_name, count in fund_counts.items():
    print(f"  {fund_name}: {count} funds")



Funds by name (Merged & Fixed):
  Vanguard Extended Market Index Fund: 6 funds
  Vanguard Mid-Cap Index Fund: 5 funds
  Vanguard Mid-Cap Growth Index Fund: 3 funds
  Vanguard Mid-Cap Value Index Fund: 3 funds
  Vanguard Small-Cap Index Fund: 5 funds
  Vanguard Small-Cap Growth Index Fund: 4 funds
  Vanguard Small-Cap Value Index Fund: 4 funds
  Vanguard Total Stock Market Index Fund: 6 funds
  Vanguard 500 Index Fund: 4 funds
  Vanguard Value Index Fund: 4 funds
  Vanguard Growth Index Fund: 4 funds
  Vanguard Large-Cap Index Fund: 4 funds
  Vanguard Mega Cap Growth Index Fund: 2 funds
  Vanguard Extended Duration Treasury Index Fund: 2 funds
  Vanguard ESG U.S. Stock ETF: 1 funds
  Vanguard ESG International Stock ETF: 1 funds
  Vanguard Global WellingtonTM Fund: 2 funds
  Vanguard Global Wellesley® Income Fund: 2 funds
  Vanguard ESG U.S. Corporate Bond ETF: 1 funds
  Vanguard FTSE Social Index Fund: 2 funds
  Vanguard Materials Index Fund: 2 funds
  Vanguard Communication Services I

In [42]:
sum = 0
for fund in funds_total:
    # Format correctly the numeric fields
    fund.net_assets *= 1e6
    fund.advisory_fees *= 1e3
   

In [43]:
from src.simple_rag.utils.chart_utils import extract_flexible_performance
from src.simple_rag.models.fund import AverageReturnSnapshot
for fund in funds_total:
    
    year = fund.report_date.year
    if fund.avg_annual_returns is None:
        continue 
    result = extract_flexible_performance(fund.avg_annual_returns)
    
    # Create AverageReturnSnapshot instance
    snapshot = AverageReturnSnapshot(
        return_1y=result.get('1_year', None),
        return_5y=result.get('5_year', None),
        return_10y=result.get('10_year', None),
        return_inception=result.get('since_inception', None)
    )
    
    fund.performance[str(year)] = snapshot 
    

In [44]:
for fund in funds_total:
    if fund.financial_highlights is not None:
        print(fund.financial_highlights)
        for _, values in fund.financial_highlights.items():
            values.net_assets = values.net_assets * 1e6

{'2024': FinancialHighlights(turnover=11.0, expense_ratio=0.19, total_return=16.76, net_assets=195.0, net_assets_value_begining=124.78, net_assets_value_end=144.2, net_income_ratio=1.09), '2023': FinancialHighlights(turnover=11.0, expense_ratio=0.19, total_return=25.22, net_assets=232.0, net_assets_value_begining=100.93, net_assets_value_end=124.78, net_income_ratio=1.28), '2022': FinancialHighlights(turnover=11.0, expense_ratio=0.19, total_return=-26.56, net_assets=229.0, net_assets_value_begining=138.8, net_assets_value_end=100.93, net_income_ratio=1.14), '2021': FinancialHighlights(turnover=19.0, expense_ratio=0.19, total_return=12.31, net_assets=399.0, net_assets_value_begining=124.83, net_assets_value_end=138.8, net_income_ratio=0.87), '2020': FinancialHighlights(turnover=19.0, expense_ratio=0.19, total_return=32.04, net_assets=454.0, net_assets_value_begining=95.66, net_assets_value_end=124.83, net_income_ratio=1.04)}
{'2024': FinancialHighlights(turnover=11.0, expense_ratio=0.06

In [45]:
managers_set = set()
for fund in funds_total:
    if hasattr(fund, 'managers') and fund.managers is not None:
        managers_set.update(fund.managers)
for manager in sorted(managers_set):
    print(manager)

Aaron Choi
Asian Economic Risk
Aurélie Denis
Brett Barakett
Chris Nieves
Christopher Chung
Consumer Goods
Erin Armstrong
Gary Robinson
Investment Manager
Jake Riley
James Mauro
Jena Stenger
Jennifer Hsui
Jonathan Graves
Kenny Narzikul
Lawrence Burns
Managing Director
Matt Waldron
Michael Cling
Michelle Louie
Nataliya Kofman
Natasha Kuhlkin
Nick Birkett
Peter Sietsema
Senior Managing Director
Simon Webber
Steven White
Suzanne Ly
Thomas Coutts
Tom Slater
Walter Nejman


Enhance the summary prospectus 

In [46]:
def update_summary_prospectus_with_header(fund):
    """Remove all existing headers and add new header."""
    
    header = f"# FUND PROFILE({fund.ticker} - {fund.name} - {fund.share_class if fund.share_class else 'N/A'}):"
    
    if not fund.summary_prospectus or not fund.summary_prospectus.strip():
        return header
    
    # Remove all header lines and keep content
    lines = fund.summary_prospectus.split('\n')
    content_lines = [line for line in lines if not line.strip().startswith("# FUND PROFILE(")]
    
    # Add new header at beginning with proper spacing
    if content_lines:
        return f"{header}\n" + '\n'.join(content_lines)
    else:
        return header

# Update all funds
for i, fund in enumerate(funds_total):
    fund.summary_prospectus = update_summary_prospectus_with_header(fund)
    print(f"✅ {i+1}/{len(funds_total)}: Updated {fund.ticker}")

✅ 1/460: Updated VEXMX
✅ 2/460: Updated VXF
✅ 3/460: Updated VEXAX
✅ 4/460: Updated VIEIX
✅ 5/460: Updated VEMPX
✅ 6/460: Updated VSEMX
✅ 7/460: Updated VIMSX
✅ 8/460: Updated VO
✅ 9/460: Updated VIMAX
✅ 10/460: Updated VMCIX
✅ 11/460: Updated VMCPX
✅ 12/460: Updated VMGIX
✅ 13/460: Updated VOT
✅ 14/460: Updated VMGMX
✅ 15/460: Updated VMVIX
✅ 16/460: Updated VOE
✅ 17/460: Updated VMVAX
✅ 18/460: Updated NAESX
✅ 19/460: Updated VB
✅ 20/460: Updated VSMAX
✅ 21/460: Updated VSCIX
✅ 22/460: Updated VSCPX
✅ 23/460: Updated VISGX
✅ 24/460: Updated VBK
✅ 25/460: Updated VSGAX
✅ 26/460: Updated VSGIX
✅ 27/460: Updated VISVX
✅ 28/460: Updated VBR
✅ 29/460: Updated VSIAX
✅ 30/460: Updated VSIIX
✅ 31/460: Updated VTSMX
✅ 32/460: Updated VTI
✅ 33/460: Updated VTSAX
✅ 34/460: Updated VITSX
✅ 35/460: Updated VSMPX
✅ 36/460: Updated VSTSX
✅ 37/460: Updated VFINX
✅ 38/460: Updated VOO
✅ 39/460: Updated VFIAX
✅ 40/460: Updated VFFSX
✅ 41/460: Updated VIVAX
✅ 42/460: Updated VTV
✅ 43/460: Updated VVIAX

In [41]:
# Check a few funds to see the result
for fund in funds_total[:3]:
    print(f"\n{fund.ticker}:")
    print(f"First 100 chars: ...")
    print(fund.summary_prospectus)
    print(f"Has header: {'# FUND PROFILE(' in fund.summary_prospectus}")


VEXMX:
First 100 chars: ...
# FUND PROFILE(VEXMX - Vanguard Extended Market Index Fund - ShareClassType.INVESTOR):

## Investment Objective
The Fund seeks to track the performance of a benchmark index that measuresthe investment return of small- and mid-capitalization stocks.

## Principal Investment Strategies
The Fund employs an indexing investment approach designed to track theperformance of the S& P Completion Index (the Index), a broadly diversified indexof stocks of small and mid-size U. S. companies. The Index contains all of theU. S. common stocks regularly traded on the New York Stock Exchange, Cboe, and the Nasdaq over-the-counter market, except those stocks included in theS& P 500 Index. The Fund invests by sampling the Index, meaning that it holds abroadly diversified collection of securities that, in the aggregate, approximatesthe full Index in terms of key characteristics. These characteristics includeindustry weightings and market capitalization, as well as certain fina

### Geographic Allocation

In [47]:

for fund in funds_total:
    df = fund.geographic_allocation
    if df is not None:
        df = df.iloc[:,[0,1]]
        df.columns = ['Country', 'Percentage']
        df.iloc[:, 0] = df.iloc[:, 0].astype(str).str.replace('\u200b', '', regex=True).str.strip()
        df.iloc[:, 0] = df.iloc[:, 0].replace(['', 'nan', 'None', 'Other#', 'Country/Geographic Region'], 'Other')
        df.iloc[:, 1] = pd.to_numeric(df.iloc[:, 1], errors='coerce')
        df = df[df.iloc[:, 1].notna()]
        
        fund.geographic_allocation = df
        
countries = set()
for fund in funds_total:
    df = fund.geographic_allocation
    if df is not None:
        # Get unique values from the first column
        unique_values = df.iloc[:, 0].unique()
        countries.update(unique_values)
print(f"Unique countries: {len(countries)}")
print("Countries:", sorted(countries))
# Verify data types
for fund in funds_total:  # Check first fund
    if fund.geographic_allocation is not None:
        print(f"\nSample cleaned data for {fund.ticker}:")
        print(f"Shape: {fund.geographic_allocation.shape}")
        print(f"Data types: {fund.geographic_allocation.dtypes}")
        print(f"Sample rows:")
        print(fund.geographic_allocation)
        break


Unique countries: 42
Countries: ['Australia', 'Belgium', 'Brazil', 'Canada', 'Chile', 'China', 'Colombia', 'Denmark', 'Finland', 'France', 'Germany', 'Hong Kong', 'India', 'Indonesia', 'Israel', 'Italy', 'Japan', 'Malaysia', 'Mexico', 'Netherlands', 'New Zealand', 'Norway', 'Other', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Saudi Arabia', 'Singapore', 'South Africa', 'South Korea', 'Spain', 'Supranational', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'United Arab Emirates', 'United Kingdom', 'United States']

Sample cleaned data for DMXF:
Shape: (11, 2)
Data types: Country       object
Percentage    object
dtype: object
Sample rows:
           Country Percentage
1            Japan       25.1
2      Switzerland       11.8
3           France       10.3
4      Netherlands        8.6
5          Germany        8.2
6   United Kingdom        7.5
7           Sweden        6.1
8        Australia        4.5
9        Hong Kong        3.3
10       Singapore        2.6
11 

### Sector Allocation


In [48]:
from src.simple_rag.utils.chart_utils import validate_and_clean_allocation

for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        df = validate_and_clean_allocation(df, 'Sector', sort_by_value=True)
        fund.sector_allocation = df


VALIDATION REPORT
Category Column: 'Sector' | Value Column: 'Percent of Total Investments(a)'
❌ Row  0: 'Portfolio Composition % of Net' | 'nan     ' | Numeric: True | Range OK: False
✅ Row  1: 'Communication Services        ' | '4.3%    ' | Numeric: True | Range OK: True
✅ Row  2: 'Consumer Discretionary        ' | '12.0%   ' | Numeric: True | Range OK: True
✅ Row  3: 'Consumer Staples              ' | '3.0%    ' | Numeric: True | Range OK: True
✅ Row  4: 'Energy                        ' | '4.1%    ' | Numeric: True | Range OK: True
✅ Row  5: 'Financials                    ' | '18.0%   ' | Numeric: True | Range OK: True
✅ Row  6: 'Health Care                   ' | '11.4%   ' | Numeric: True | Range OK: True
✅ Row  7: 'Industrials                   ' | '17.4%   ' | Numeric: True | Range OK: True
✅ Row  8: 'Information Technology        ' | '17.9%   ' | Numeric: True | Range OK: True
✅ Row  9: 'Materials                     ' | '4.7%    ' | Numeric: True | Range OK: True
✅ Row 10: 'Real

In [49]:
import pandas as pd
import re

# 1. Get all sectors from your funds
sectors = set()
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        sectors.update(df.iloc[:, 0].unique())

# 2. The Cleaning Function
def get_standard_key(text):
    if not isinstance(text, str):
        return text
        
    # A. Convert to lowercase
    clean = text.lower()
    
    # B. Remove the specific word "sector"
    clean = re.sub(r'\bsector\b', '', clean)
    
    # C. Remove punctuation
    clean = re.sub(r'[^a-z0-9\s]', '', clean)
    
    # D. Remove extra whitespace
    clean = re.sub(r'\s+', ' ', clean).strip()
    
    return clean

# 3. Create mapping DataFrame
sectors_list = list(sectors)
df_mapping = pd.DataFrame({'Original': sectors_list})
df_mapping['Clean_Key'] = df_mapping['Original'].apply(get_standard_key)

# 4. Create canonical mapping (pick shortest name as standard)
canonical_map = df_mapping.groupby('Clean_Key')['Original'].transform(lambda x: sorted(x, key=len)[0])
df_mapping['Standardized'] = canonical_map

# 5. Create the mapping dictionary
standardization_dict = dict(zip(df_mapping['Original'], df_mapping['Standardized']))

# 6. Review the mapping
print("Standardization Mapping:")
print(df_mapping[['Original', 'Standardized']].drop_duplicates().sort_values('Standardized'))

# 7. Apply standardization to all fund DataFrames
updated_funds = 0
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        # Make a copy to avoid SettingWithCopyWarning
        df_copy = df.copy()
        
        # Apply standardization to first column
        original_values = df_copy.iloc[:, 0].copy()
        standardized_values = original_values.map(standardization_dict)
        df_copy.iloc[:, 0] = standardized_values
        df_copy.iloc[:, 0] = (
            df_copy.iloc[:, 0]
            .str.replace(r'\bSector\b', '', regex=True)  # Remove multiple words
            .str.replace(r'\s+', ' ', regex=True)         # Fix extra spaces
            .str.strip()                                 # Remove leading/trailing spaces
        )
        # Update the fund's DataFrame
        fund.sector_allocation = df_copy
        updated_funds += 1

print(f"\n✅ Updated {updated_funds} funds with standardized sector names")

# 8. Verify the results
final_sectors = set()
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        final_sectors.update(df.iloc[:, 0].unique())

print(f"\nBefore standardization: {len(sectors)} unique sectors")
print(f"After standardization: {len(final_sectors)} unique sectors")
print(f"Reduction: {len(sectors) - len(final_sectors)} duplicates removed")

# 9. Show some examples of the changes
print(f"\nExamples of standardization:")
for original, standardized in standardization_dict.items():
    if original != standardized:
        print(f"  '{original}' → '{standardized}'")

Standardization Mapping:
                                Original                         Standardized
70                   Aerospace & Defense                  Aerospace & Defense
128             Aerospace Defense Sector                  Aerospace & Defense
140                               Africa                               Africa
67    Agricultural Farm Machinery Sector   Agricultural Farm Machinery Sector
65          Air Freight Logistics Sector              Air Freight & Logistics
..                                   ...                                  ...
160                     Utilities Sector                            Utilities
9    Vanguard Real Estate II Index Fund1  Vanguard Real Estate II Index Fund1
4                 Water Utilities Sector                      Water Utilities
98                       Water Utilities                      Water Utilities
48   Wireless Telecommunication Services  Wireless Telecommunication Services

[161 rows x 2 columns]

✅ Updated 287 

In [50]:
# Get unique values from first column (index 0) of all sector allocation DataFrames
sectors = set()
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        # Get unique values from first column
        unique_values = df.iloc[:, 0].unique()
        sectors.update(unique_values)

print(f"Unique sectors: {len(sectors)}")
print("Sectors:", sorted(sectors))

Unique sectors: 120
Sectors: ['Aerospace & Defense', 'Africa', 'Agricultural Farm Machinery', 'Air Freight & Logistics', 'Asia', 'Automobile Components', 'Automobile Manufacturers', 'Automobiles', 'Banks', 'Basic Materials', 'Beverages', 'Biotechnology', 'Broadline Retail', 'Building Products', 'Capital Markets', 'Chemicals', 'Coal & Consumable Fuels', 'Commercial Services & Supplies', 'Communication Services', 'Communications', 'Communications Equipment', 'Construction & Engineering', 'Construction Machinery Heavy Transportation Equipment', 'Construction Materials', 'Consumer Discretionary', 'Consumer Finance', 'Consumer Staples', 'Consumer Staples Distribution & Retail', 'Containers & Packaging', 'Data Center REITs', 'Distributors', 'Diversified Consumer Services', 'Diversified Telecommunication Services', 'Domestic Equity', 'Domestic Fixed Income', 'Electric Utilities', 'Electrical Components Equipment', 'Electrical Equipment', 'Electronic Components', 'Electronic Equipment, Instrum

### Industry Allocation

In [51]:
from src.simple_rag.utils.chart_utils import validate_and_clean_allocation

for fund in funds_total:
    df = fund.industry_allocation
    if df is not None:
        df = validate_and_clean_allocation(df, 'Industry', sort_by_value=True)
        fund.industry_allocation = df


🔍 Detected header row at index 0
New columns: ['Industry', 'Percent of Total Investments(a)']
VALIDATION REPORT
Category Column: 'Industry' | Value Column: 'Percent of Total Investments(a)'
✅ Row  0: 'Semiconductors & Semiconductor' | '37.9    ' | Numeric: True | Range OK: True
✅ Row  1: 'Machinery                     ' | '18.7    ' | Numeric: True | Range OK: True
✅ Row  2: 'Electrical Equipment          ' | '17.9    ' | Numeric: True | Range OK: True
✅ Row  3: 'Automobiles                   ' | '14.4    ' | Numeric: True | Range OK: True
✅ Row  4: 'Automobile Components         ' | '6.4     ' | Numeric: True | Range OK: True
✅ Row  5: 'Chemicals                     ' | '4.1     ' | Numeric: True | Range OK: True
✅ Row  6: 'Electronic Equipment, Instrume' | '0.6     ' | Numeric: True | Range OK: True

✅ Final: 7 valid rows (sorted by value)
🔍 Detected header row at index 0
New columns: ['Industry', 'Percent of Total  Investments(a)']
VALIDATION REPORT
Category Column: 'Industry' | Val

In [52]:
# Get unique values from first column (index 0) of all sector allocation DataFrames
sectors = set()
for fund in funds_total:
    df = fund.industry_allocation
    if df is not None:
        # Get unique values from first column
        unique_values = df.iloc[:, 0].unique()
        sectors.update(unique_values)

print(f"Unique sectors: {len(sectors)}")
print("Sectors:", sorted(sectors))

Unique sectors: 129
Sectors: ['Aerospace & Defense', 'Alternative Carriers', 'Automobile Components', 'Automobiles', 'Automobiles & Components', 'Banks', 'Biotechnology', 'Brewers', 'Building Products', 'Cable & Satellite', 'Capital Goods', 'Capital Markets', 'Chemicals', 'Coal & Consumable Fuels', 'Commercial & Professional Services', 'Commodity Chemicals', 'Communications Equipment', 'Construction & Engineering', 'Construction Machinery & Heavy Transportation Equipment', 'Construction Materials', 'Consumer Discretionary Distribution & Retail', 'Consumer Durables & Apparel', 'Consumer Finance', 'Consumer Services', 'Consumer Staples Merchandise Retail', 'Containers & Packaging', 'Copper', 'Data Center REITs', 'Distillers & Vintners', 'Diversified Banks', 'Diversified Metals & Mining', 'Diversified REITs', 'Diversified Real Estate Activities', 'Diversified Telecommunication Services', 'Electric Utilities', 'Electrical Components & Equipment', 'Electrical Equipment', 'Electronic Equipme

In [53]:
import pickle
from pathlib import Path

PKL_PATH = Path("./funds_backup_metadata.pkl")
TMP_PATH = PKL_PATH.with_suffix(PKL_PATH.suffix + ".tmp")

with TMP_PATH.open("wb") as f:
    pickle.dump(funds_total, f, protocol=pickle.HIGHEST_PROTOCOL)

TMP_PATH.replace(PKL_PATH)

print(f"Saved {len(funds_total)} funds to pickle file: {PKL_PATH.resolve()}")
print(f"File size: {PKL_PATH.stat().st_size / (1024 * 1024):.2f} MB")

Saved 460 funds to pickle file: /home/luis/Desktop/code/RAG/notebooks/funds_backup_metadata.pkl
File size: 11.06 MB
