## Documents Extraction and Processing

In [1]:
%load_ext autoreload
%autoreload 2

Lets first obtain the file that maps the tickers with the CIKs

In [1]:
import requests
import json
from pathlib import Path
import os

# --- Configuration (from previous step) ---
HEADERS = {
    "User-Agent": "EdgarTutorial/1.0 (YourName your.email@domain.com)" 
}
TICKER_CIK_URL = "https://www.sec.gov/files/company_tickers.json"
OUTPUT_FILE = Path("sec_data/company_tickers.json")

# Ensure directory exists
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
# ----------------------------------------

# 1. Download the JSON data
print("Downloading CIK-Ticker map...")
response = requests.get(TICKER_CIK_URL, headers=HEADERS, timeout=15)
response.raise_for_status()
raw_data = response.json() # Load into Python dictionary

# 2. Open the file and use json.dump() with indent=4
print(f"Saving JSON in readable format to {OUTPUT_FILE.absolute()}...")

# Use 'w' mode to write the file
with open(OUTPUT_FILE, 'w') as f:
    # Key Fix: The 'indent=4' parameter tells the JSON module to format the output 
    # with 4 spaces for each level of nesting, adding line breaks automatically.
    json.dump(raw_data, f, indent=4) 

print("‚úÖ JSON saved successfully with proper line breaks and indentation.")

# --- Optional: Print a Snippet to Console (Also Pretty-Printed) ---
# If you want to print to the console instead of a file, use json.dumps()
print("\n--- Console Snippet (Pretty-Printed) ---")
# Print the first 3 key-value pairs from the dictionary
keys = list(raw_data.keys())
snippet = {k: raw_data[k] for k in keys[:3]}

# Use json.dumps() with indent=2 to format the string output
pretty_string = json.dumps(snippet, indent=2)
print(pretty_string)

Downloading CIK-Ticker map...
Saving JSON in readable format to /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/sec_data/company_tickers.json...
‚úÖ JSON saved successfully with proper line breaks and indentation.

--- Console Snippet (Pretty-Printed) ---
{
  "0": {
    "cik_str": 1045810,
    "ticker": "NVDA",
    "title": "NVIDIA CORP"
  },
  "1": {
    "cik_str": 320193,
    "ticker": "AAPL",
    "title": "Apple Inc."
  },
  "2": {
    "cik_str": 789019,
    "ticker": "MSFT",
    "title": "MICROSOFT CORP"
  }
}


### Vanguard Index Funds

In [None]:
import pandas as pd
from io import StringIO

import sys
from pathlib import Path

RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

from src.simple_rag.extraction.parser import BlackRockFiling


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "VOO"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]

    print("Found filings: ", len(latest_filings), "for year: ", target_year)


funds_total = []
performance_funds = []
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            performance_funds.append(fund.ticker)
            count += 1

    if count == 0:
        df_performance = parser.get_financial_highlights()

    print(count)
    print("Adding funds: ", len(funds))
    
    funds_total.extend(funds)

print(len(performance_funds))
print(performance_funds)

print(df_performance)


Found filings:  2 for year:  2024
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007779Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007779Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007782Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007782Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007780Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007780Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007781Member
Tag not found:  dei:SecurityExcha

In [None]:

returns_lookup = df_performance[['fund_name', 'share_class', 'year', 'total_return']].copy()

# Optional: Clean the total_return column (remove % sign if needed)
returns_lookup['total_return_clean'] = returns_lookup['total_return'].str.replace('%', '').astype(float)
display(returns_lookup['fund_name'].unique())

# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    if fund_obj.ticker in performance_funds:
        continue
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.contains(name, case=False, na=False, regex=False)]
    
    if len(name_matches) == 0:
        print("  No name matches found")
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}'")
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    print(f"  Found {len(share_class_matches)} matching records")
    
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        fund_obj.annual_returns[row['year']] = row['total_return_clean']
    
    print(f"  Annual returns: {fund_obj.annual_returns}")

array(['Small-Cap Index Fund', 'Small-Cap Growth Index Fund',
       'Small-Cap Value Index Fund', 'Extended Market Index Fund',
       'Mid-Cap Index Fund', 'Mid-Cap Growth Index Fund',
       'Mid-Cap Value Index Fund', 'Total Stock Market Index Fund'],
      dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - Investor Shares
Cleaned name: 'Extended Market Index Fund'
  Found 30 name matches
  Found 5 matching records
  Annual returns: {'2024': '16.76%', '2023': '25.22%', '2022': '-26.56%', '2021': '12.31%', '2020': '32.04%'}

Processing fund object: Vanguard Extended Market Index Fund - ETF Shares
Cleaned name: 'Extended Market Index Fund'
  Found 30 name matches
  Found 5 matching records
  Annual returns: {'2024': '16.90%', '2023': '25.39%', '2022': '-26.46%', '2021': '12.44%', '2020': '32.20%'}

Processing fund object: Vanguard Extended Market Index Fund - Admiral‚Ñ¢ Shares
Cleaned name: 'Extended Market Index Fund'
  Found 30 name matches
  Found 5 matching records
  Annual returns: {'2024': '16.91%', '2023': '25.38%', '2022': '-26.47%', '2021': '12.45%', '2020': '32.21%'}

Processing fund object: Vanguard Extended Market Index Fund - Institutional Shares
Cleaned name: 'Extended Market Index Fund'
  Found 30 name matches
  F

In [None]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")

Detected format: Year (YYYY)
Found years: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
  2014 -> 2015: $10,000.00 -> $10,125.00 = 1.25%
  2015 -> 2016: $10,125.00 -> $11,321.00 = 11.81%
  2016 -> 2017: $11,321.00 -> $13,774.00 = 21.67%
  2017 -> 2018: $13,774.00 -> $13,151.00 = -4.52%
  2018 -> 2019: $13,151.00 -> $17,271.00 = 31.33%
  2019 -> 2020: $17,271.00 -> $20,423.00 = 18.25%
  2020 -> 2021: $20,423.00 -> $26,250.00 = 28.53%
  2021 -> 2022: $26,250.00 -> $21,465.00 = -18.23%
  2022 -> 2023: $21,465.00 -> $27,069.00 = 26.11%
  2023 -> 2024: $27,069.00 -> $33,794.00 = 24.84%

Final Annual Returns:
2015: 1.25%
2016: 11.81%
2017: 21.67%
2018: -4.52%
2019: 31.33%
2020: 18.25%
2021: 28.53%
2022: -18.23%
2023: 26.11%
2024: 24.84%
Detected format: Year (YYYY)
Found years: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.in

  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')


In [7]:
parser.print_fund_info(funds_list)

Showing information of 16 funds


### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007773Member
üé´ Ticker:          VFINX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.14
Turnover Rate       : 2
Costs per $10k      : 16
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,091","$10,095","$10,180"
2,2015,"$10,117","$10,123","$10,192"
3,2015,"$9,462","$9,471","$9,451"
4,2015,"$10,125","$10,138","$10,044"
5,2016,"$10,258","$10,275","$10,136"
6,2016,"$10,506","$10,527","$10,401"
7,2016,"$10,907","$10,933","$10,862"
8,2016,"$11,321","$11,351","$11,312"
9,2017,"$12,004","$12,039","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,24.84%,14.37%,12.95%
1,S&P 500 Index,25.02%,14.53%,13.10%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000092055Member
üé´ Ticker:          VOO
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.03
Turnover Rate       : 2
Costs per $10k      : 3
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,094","$10,095","$10,180"
2,2015,"$10,123","$10,123","$10,192"
3,2015,"$9,469","$9,471","$9,451"
4,2015,"$10,135","$10,138","$10,044"
5,2016,"$10,271","$10,275","$10,136"
6,2016,"$10,522","$10,527","$10,401"
7,2016,"$10,927","$10,933","$10,862"
8,2016,"$11,345","$11,351","$11,312"
9,2017,"$12,031","$12,039","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,24.98%,14.48%,13.06%
1,ETF Shares Market Price,24.94%,14.49%,13.06%
2,S&P 500 Index,25.02%,14.53%,13.10%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007774Member
üé´ Ticker:          VFIAX
üè∑Ô∏è Share Class:     Admiral‚Ñ¢ Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.04
Turnover Rate       : 2
Costs per $10k      : 4
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,094","$10,095","$10,180"
2,2015,"$10,123","$10,123","$10,192"
3,2015,"$9,470","$9,471","$9,451"
4,2015,"$10,136","$10,138","$10,044"
5,2016,"$10,272","$10,275","$10,136"
6,2016,"$10,523","$10,527","$10,401"
7,2016,"$10,928","$10,933","$10,862"
8,2016,"$11,345","$11,351","$11,312"
9,2017,"$12,032","$12,039","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,24.97%,14.48%,13.06%
1,S&P 500 Index,25.02%,14.53%,13.10%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000170274Member
üé´ Ticker:          VFFSX
üè∑Ô∏è Share Class:     Institutional Select Share Class
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.01
Turnover Rate       : 2
Costs per $10k      : 1
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Select Share Class,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,6/24/16,"$5,000,000,000","$5,000,000,000","$5,000,000,000"
1,6/30/16,"$5,152,656,423","$5,152,577,879","$5,147,526,005"
2,9/30/16,"$5,351,187,741","$5,351,059,644","$5,375,816,703"
3,12/31/16,"$5,556,006,018","$5,555,696,904","$5,598,236,535"
4,3/31/17,"$5,892,894,070","$5,892,712,361","$5,922,524,381"
5,6/30/17,"$6,074,753,290","$6,074,688,823","$6,100,664,780"
6,9/30/17,"$6,347,363,071","$6,346,856,796","$6,379,289,163"
7,12/31/17,"$6,769,029,785","$6,768,592,456","$6,782,952,897"
8,3/31/18,"$6,717,278,809","$6,717,209,557","$6,741,940,992"
9,6/30/18,"$6,947,757,117","$6,947,870,304","$7,003,081,023"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,Since Inception 6/24/16
0,Institutional Select Share Class,25.00%,14.52%,15.25%
1,S&P 500 Index,25.02%,14.53%,15.26%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,14.66%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007775Member
üé´ Ticker:          VIVAX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.17
Turnover Rate       : 9
Costs per $10k      : 18
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$9,940","$9,945","$10,180"
2,2015,"$9,984","$9,993","$10,192"
3,2015,"$9,257","$9,268","$9,451"
4,2015,"$9,897","$9,914","$10,044"
5,2016,"$10,055","$10,078","$10,136"
6,2016,"$10,434","$10,462","$10,401"
7,2016,"$10,748","$10,781","$10,862"
8,2016,"$11,554","$11,592","$11,312"
9,2017,"$11,926","$11,972","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,15.84%,9.80%,9.86%
1,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007778Member
üé´ Ticker:          VTV
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    NYSE

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.04
Turnover Rate       : 9
Costs per $10k      : 4
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$9,945","$9,945","$10,180"
2,2015,"$9,993","$9,993","$10,192"
3,2015,"$9,266","$9,268","$9,451"
4,2015,"$9,911","$9,914","$10,044"
5,2016,"$10,074","$10,078","$10,136"
6,2016,"$10,456","$10,462","$10,401"
7,2016,"$10,775","$10,781","$10,862"
8,2016,"$11,585","$11,592","$11,312"
9,2017,"$11,963","$11,972","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,16.00%,9.93%,10.00%
1,ETF Shares Market Price,15.94%,9.93%,10.00%
2,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007776Member
üé´ Ticker:          VVIAX
üè∑Ô∏è Share Class:     Admiral‚Ñ¢ Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.05
Turnover Rate       : 9
Costs per $10k      : 5
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$9,943","$9,945","$10,180"
2,2015,"$9,994","$9,993","$10,192"
3,2015,"$9,266","$9,268","$9,451"
4,2015,"$9,914","$9,914","$10,044"
5,2016,"$10,076","$10,078","$10,136"
6,2016,"$10,459","$10,462","$10,401"
7,2016,"$10,777","$10,781","$10,862"
8,2016,"$11,586","$11,592","$11,312"
9,2017,"$11,963","$11,972","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,15.99%,9.93%,9.99%
1,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007777Member
üé´ Ticker:          VIVIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.04
Turnover Rate       : 9
Costs per $10k      : 4
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Shares,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$5,000,000","$5,000,000","$5,000,000"
1,2015,"$4,971,871","$4,972,417","$5,090,054"
2,2015,"$4,997,132","$4,996,694","$5,096,078"
3,2015,"$4,633,467","$4,633,912","$4,725,720"
4,2015,"$4,957,449","$4,957,090","$5,022,045"
5,2016,"$5,038,463","$5,039,202","$5,067,957"
6,2016,"$5,228,516","$5,231,204","$5,200,452"
7,2016,"$5,387,648","$5,390,397","$5,431,090"
8,2016,"$5,793,893","$5,796,215","$5,655,796"
9,2017,"$5,982,618","$5,986,133","$5,983,419"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Institutional Shares,15.98%,9.94%,10.00%
1,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007783Member
üé´ Ticker:          VIGRX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.17
Turnover Rate       : 11
Costs per $10k      : 20
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,340","$10,346","$10,180"
2,2015,"$10,314","$10,325","$10,192"
3,2015,"$9,690","$9,705","$9,451"
4,2015,"$10,317","$10,338","$10,044"
5,2016,"$10,349","$10,376","$10,136"
6,2016,"$10,451","$10,482","$10,401"
7,2016,"$10,985","$11,021","$10,862"
8,2016,"$10,936","$10,975","$11,312"
9,2017,"$11,981","$12,031","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,32.50%,18.21%,15.61%
1,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007786Member
üé´ Ticker:          VUG
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    NYSE

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.04
Turnover Rate       : 11
Costs per $10k      : 5
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,344","$10,346","$10,180"
2,2015,"$10,321","$10,325","$10,192"
3,2015,"$9,701","$9,705","$9,451"
4,2015,"$10,332","$10,338","$10,044"
5,2016,"$10,367","$10,376","$10,136"
6,2016,"$10,473","$10,482","$10,401"
7,2016,"$11,010","$11,021","$10,862"
8,2016,"$10,965","$10,975","$11,312"
9,2017,"$12,018","$12,031","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,32.68%,18.36%,15.76%
1,ETF Shares Market Price,32.64%,18.37%,15.76%
2,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007784Member
üé´ Ticker:          VIGAX
üè∑Ô∏è Share Class:     Admiral‚Ñ¢ Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.05
Turnover Rate       : 11
Costs per $10k      : 6
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,344","$10,346","$10,180"
2,2015,"$10,320","$10,325","$10,192"
3,2015,"$9,699","$9,705","$9,451"
4,2015,"$10,330","$10,338","$10,044"
5,2016,"$10,365","$10,376","$10,136"
6,2016,"$10,470","$10,482","$10,401"
7,2016,"$11,009","$11,021","$10,862"
8,2016,"$10,963","$10,975","$11,312"
9,2017,"$12,014","$12,031","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,32.66%,18.36%,15.75%
1,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007785Member
üé´ Ticker:          VIGIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.04
Turnover Rate       : 11
Costs per $10k      : 5
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Shares,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$5,000,000","$5,000,000","$5,000,000"
1,2015,"$5,172,206","$5,173,192","$5,090,054"
2,2015,"$5,161,071","$5,162,318","$5,096,078"
3,2015,"$4,850,557","$4,852,558","$4,725,720"
4,2015,"$5,166,364","$5,169,146","$5,022,045"
5,2016,"$5,184,201","$5,187,759","$5,067,957"
6,2016,"$5,237,051","$5,240,785","$5,200,452"
7,2016,"$5,506,240","$5,510,594","$5,431,090"
8,2016,"$5,483,263","$5,487,362","$5,655,796"
9,2017,"$6,010,352","$6,015,412","$5,983,419"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Institutional Shares,32.68%,18.37%,15.76%
1,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007787Member
üé´ Ticker:          VLACX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.17
Turnover Rate       : 2
Costs per $10k      : 19
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,125","$10,131","$10,180"
2,2015,"$10,137","$10,147","$10,192"
3,2015,"$9,457","$9,470","$9,451"
4,2015,"$10,093","$10,111","$10,044"
5,2016,"$10,189","$10,217","$10,136"
6,2016,"$10,436","$10,471","$10,401"
7,2016,"$10,855","$10,894","$10,862"
8,2016,"$11,254","$11,298","$11,312"
9,2017,"$11,946","$11,998","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,24.95%,14.34%,12.87%
1,CRSP US Large Cap Index,25.15%,14.51%,13.05%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007790Member
üé´ Ticker:          VV
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    NYSE

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.04
Turnover Rate       : 2
Costs per $10k      : 5
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,129","$10,131","$10,180"
2,2015,"$10,145","$10,147","$10,192"
3,2015,"$9,468","$9,470","$9,451"
4,2015,"$10,107","$10,111","$10,044"
5,2016,"$10,207","$10,217","$10,136"
6,2016,"$10,459","$10,471","$10,401"
7,2016,"$10,881","$10,894","$10,862"
8,2016,"$11,284","$11,298","$11,312"
9,2017,"$11,982","$11,998","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,25.12%,14.48%,13.02%
1,ETF Shares Market Price,25.05%,14.48%,13.01%
2,CRSP US Large Cap Index,25.15%,14.51%,13.05%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007788Member
üé´ Ticker:          VLCAX
üè∑Ô∏è Share Class:     Admiral‚Ñ¢ Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.05
Turnover Rate       : 2
Costs per $10k      : 6
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,131","$10,131","$10,180"
2,2015,"$10,146","$10,147","$10,192"
3,2015,"$9,468","$9,470","$9,451"
4,2015,"$10,107","$10,111","$10,044"
5,2016,"$10,206","$10,217","$10,136"
6,2016,"$10,457","$10,471","$10,401"
7,2016,"$10,881","$10,894","$10,862"
8,2016,"$11,284","$11,298","$11,312"
9,2017,"$11,981","$11,998","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,25.10%,14.47%,13.01%
1,CRSP US Large Cap Index,25.15%,14.51%,13.05%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007789Member
üé´ Ticker:          VLISX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.04
Turnover Rate       : 2
Costs per $10k      : 5
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Shares,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$5,000,000","$5,000,000","$5,000,000"
1,2015,"$5,064,717","$5,065,516","$5,090,054"
2,2015,"$5,072,527","$5,073,505","$5,096,078"
3,2015,"$4,733,990","$4,735,147","$4,725,720"
4,2015,"$5,053,400","$5,055,301","$5,022,045"
5,2016,"$5,103,277","$5,108,683","$5,067,957"
6,2016,"$5,229,161","$5,235,336","$5,200,452"
7,2016,"$5,440,308","$5,446,752","$5,431,090"
8,2016,"$5,642,490","$5,649,147","$5,655,796"
9,2017,"$5,991,191","$5,998,908","$5,983,419"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Institutional Shares,25.12%,14.49%,13.02%
1,CRSP US Large Cap Index,25.15%,14.51%,13.05%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






## Vanguard World Fund

In [5]:
import pandas as pd
from io import StringIO

import sys
from pathlib import Path


sys.path.append('../src')
from simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "MGK"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    target_year = "2024"
    filings2 = sorted(
        [f for f in all_filings if f.report_date and f.report_date.startswith(target_year)],
        key=lambda f: f.report_date,
        reverse=True
    )

    latest_filings.append(filings2[0])
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            performance_funds.append(fund.ticker)
            count += 1

    if count == 0:
        df_performance = parser.get_financial_highlights()

    print(count)
    print("Adding funds: ", len(funds))
    funds_total.extend(funds)

print(len(performance_funds))
print(performance_funds)

print(df_performance)


Found filings:  4 for year:  2024
Processing: Mega Cap Growth Index Fund
Extracting context:  From2024-10-01to2025-09-30_C000055216Member
Processing: Mega Cap Growth Index Fund
Extracting context:  From2024-10-01to2025-09-30_C000055215Member
Tag not found:  dei:SecurityExchangeName From2024-10-01to2025-09-30_C000055215Member
2
Adding funds:  2
Processing: Vanguard Extended Duration Treasury Index Fund
Extracting context:  FY2025_C000051981Member
Tag not found:  dei:SecurityExchangeName FY2025_C000051981Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: Vanguard Extended Duration Treasury Index Fund
Extracting context:  FY2025_C000051979Member
Tag not found:  dei:SecurityExchangeName FY2025_C000051979Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: Vanguard ESG U.S. Stock ETF
Extracting context:  FY2025_C000

In [None]:

returns_lookup = df_performance[['fund_name', 'share_class', 'year', 'total_return']].copy()

# Optional: Clean the total_return column (remove % sign if needed)
returns_lookup['total_return_clean'] = returns_lookup['total_return'].str.replace('%', '').astype(float)
display(returns_lookup['fund_name'].unique())

# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    if fund_obj.ticker in performance_funds:
        continue
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "¬Æ" in name:
        name = name.replace("¬Æ", "")
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
        
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.contains(name, case=False, na=False, regex=False)]
    
    if len(name_matches) == 0:
        print("  No name matches found")
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}'")
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    print(f"  Found {len(share_class_matches)} matching records")
    
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        fund_obj.annual_returns[row['year']] = row['total_return_clean']
    
    print(f"  Annual returns: {fund_obj.annual_returns}")

array(['Extended Duration Treasury Index Fund', 'ESG U.S. Stock ETF',
       'ESG International Stock ETF', 'Global Wellington Fund',
       'Global Wellesley Income Fund', 'ESG U.S. Corporate Bond ETF'],
      dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - Investor Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - ETF Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Admiral‚Ñ¢ Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Plus Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Select Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Mid-Cap Index Fund - Investor Shares
Cleaned name: 'Mid-Cap Index 

In [None]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2015 -> 2016: $10,724.00 -> $11,409.00 = 6.39%
  2016 -> 2017: $11,409.00 -> $14,772.00 = 29.48%
  2017 -> 2018: $14,772.00 -> $14,349.00 = -2.86%
  2018 -> 2019: $14,349.00 -> $19,736.00 = 37.54%
  2019 -> 2020: $19,736.00 -> $27,826.00 = 40.99%
  2020 -> 2021: $27,826.00 -> $35,753.00 = 28.49%
  2021 -> 2022: $35,753.00 -> $23,755.00 = -33.56%
  2022 -> 2023: $23,755.00 -> $36,004.00 = 51.56%
  2023 -> 2024: $36,004.00 -> $47,873.00 = 32.97%
  2024 -> 2025: $47,873.00 -> $56,289.00 = 17.58%

Final Annual Returns:
  MGK: {'2016': 6.39, '2017': 29.48, '2018': -2.86, '2019': 37.54, '2020': 40.99, '2021': 28.49, '2022': -33.56, '2023': 51.56, '2024': 32.97, '2025': 17.58}
Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(

## Vanguard Specialized Funds

In [8]:
import pandas as pd
from io import StringIO

import sys
from pathlib import Path


sys.path.append('../src')
from simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "VDIGX"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            performance_funds.append(fund.ticker)
            count += 1

    if count == 0:
        df_performance = parser.get_financial_highlights()

    print(count)
    print("Adding funds: ", len(funds))
    funds_total.extend(funds)

print(len(performance_funds))
print(performance_funds)

print(df_performance)


Found filings:  2 for year:  2025
Processing: Dividend Growth Fund
Extracting context:  From2024-02-01to2025-01-31_C000008004Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008004Member
Processing: Energy Fund
Extracting context:  From2024-02-01to2025-01-31_C000008005Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008005Member
Processing: Energy Fund
Extracting context:  From2024-02-01to2025-01-31_C000008006Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008006Member
Processing: Health Care Fund
Extracting context:  From2024-02-01to2025-01-31_C000008007Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008007Member
Processing: Health Care Fund
Extracting context:  From2024-02-01to2025-01-31_C000008008Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008008Member
Processing: Dividend Appreciation Index Fund
Extracting context:  From2024-02

In [16]:

returns_lookup = df_performance[['fund_name', 'share_class', 'year', 'total_return']].copy()

# Optional: Clean the total_return column (remove % sign if needed)
returns_lookup['total_return_clean'] = returns_lookup['total_return'].str.replace('%', '').astype(float)
display(returns_lookup['fund_name'].unique())

# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    if fund_obj.ticker in performance_funds:
        continue
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "¬Æ" in name:
        name = name.replace("¬Æ", "")
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
        
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.contains(name, case=False, na=False, regex=False)]
    
    if len(name_matches) == 0:
        print("  No name matches found")
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    print(name)
    if "Cycles Fund" in name:
        fund_obj.annual_returns = dict(zip(name_matches['year'], name_matches['total_return_clean']))
        print("Annual return: ", fund_obj.annual_returns)
        continue
        
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}'")
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    print(f"  Found {len(share_class_matches)} matching records")
    
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        fund_obj.annual_returns[row['year']] = row['total_return_clean']
    
    print(f"  Annual returns: {fund_obj.annual_returns}")

array(['Real Estate Index Fund', 'Real Estate II Index Fund',
       'Global Capital Cycles Fund', 'Global ESG Select Stock Fund'],
      dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - Investor Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - ETF Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Admiral‚Ñ¢ Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Plus Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Select Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Mid-Cap Index Fund - Investor Shares
Cleaned name: 'Mid-Cap Index 

In [None]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2015 -> 2016: $10,619.00 -> $10,874.00 = 2.40%
  2016 -> 2017: $10,874.00 -> $12,860.00 = 18.26%
  2017 -> 2018: $12,860.00 -> $14,143.00 = 9.98%
  2018 -> 2019: $14,143.00 -> $16,879.00 = 19.35%
  2019 -> 2020: $16,879.00 -> $17,545.00 = 3.95%
  2020 -> 2021: $17,545.00 -> $23,646.00 = 34.77%
  2021 -> 2022: $23,646.00 -> $22,953.00 = -2.93%
  2022 -> 2023: $22,953.00 -> $23,134.00 = 0.79%
  2023 -> 2024: $23,134.00 -> $28,121.00 = 21.56%
  2024 -> 2025: $28,121.00 -> $28,555.00 = 1.54%

Final Annual Returns:
  VDIGX: {'2016': 2.4, '2017': 18.26, '2018': 9.98, '2019': 19.35, '2020': 3.95, '2021': 34.77, '2022': -2.93, '2023': 0.79, '2024': 21.56, '2025': 1.54}
Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np

In [None]:
print("Vanguard funds processed: ", len(funds_total))

## Vanguard Whitehall Funds

In [17]:
import pandas as pd
from io import StringIO

import sys
from pathlib import Path


sys.path.append('../src')
from simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "VMGRX"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            performance_funds.append(fund.ticker)
            count += 1

    if count == 0:
        df_performance = parser.get_financial_highlights()

    print(count)
    print("Adding funds: ", len(funds))
    funds_total.extend(funds)

print(len(performance_funds))
print(performance_funds)

print(df_performance)


Found filings:  2 for year:  2024
Processing: Mid-Cap Growth Fund
Extracting context:  From2023-11-01to2024-10-31_C000012166Member
Tag not found:  dei:SecurityExchangeName From2023-11-01to2024-10-31_C000012166Member
Processing: Selected Value Fund
Extracting context:  From2023-11-01to2024-10-31_C000012167Member
Tag not found:  dei:SecurityExchangeName From2023-11-01to2024-10-31_C000012167Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2023-11-01to2024-10-31_C000126408Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2023-11-01to2024-10-31_C000126407Member
Tag not found:  dei:SecurityExchangeName From2023-11-01to2024-10-31_C000126407Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2023-11-01to2024-10-31_C000126409Member
Tag not found:  dei:SecurityExchangeName From2023-11-01to2024-10-31_C000126409Member
Processing: Global Minimum Volatility Fund
Extracting context:  From2

In [21]:

returns_lookup = df_performance[['fund_name', 'share_class', 'year', 'total_return']].copy()

# Optional: Clean the total_return column (remove % sign if needed)
returns_lookup['total_return_clean'] = returns_lookup['total_return'].str.replace('%', '').astype(float)
display(returns_lookup['fund_name'].unique())

# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    if fund_obj.ticker in performance_funds:
        continue
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "¬Æ" in name:
        name = name.replace("¬Æ", "")
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
        
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.contains(name, case=False, na=False, regex=False)]
    
    if len(name_matches) == 0:
        print("  No name matches found")
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if name_matches['share_class'].isna().all():
        fund_obj.annual_returns = dict(zip(name_matches['year'], name_matches['total_return_clean']))
        print("Annual return: ", fund_obj.annual_returns)
        continue
        
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}'")
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    print(f"  Found {len(share_class_matches)} matching records")
    
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        fund_obj.annual_returns[row['year']] = row['total_return_clean']
    
    print(f"  Annual returns: {fund_obj.annual_returns}")

array(['Advice Select International Growth Fund',
       'Advice Select Dividend Growth Fund',
       'Advice Select Global Value Fund', 'International Explorer Fund',
       'High Dividend Yield Index Fund'], dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - Investor Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - ETF Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Admiral‚Ñ¢ Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Plus Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Extended Market Index Fund - Institutional Select Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found

Processing fund object: Vanguard Mid-Cap Index Fund - Investor Shares
Cleaned name: 'Mid-Cap Index 

In [None]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")

Detected format: Year (YYYY)
Found years: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
  2014 -> 2015: $10,000.00 -> $10,668.00 = 6.68%
  2015 -> 2016: $10,668.00 -> $10,082.00 = -5.49%
  2016 -> 2017: $10,082.00 -> $12,370.00 = 22.69%
  2017 -> 2018: $12,370.00 -> $13,559.00 = 9.61%
  2018 -> 2019: $13,559.00 -> $15,398.00 = 13.56%
  2019 -> 2020: $15,398.00 -> $18,220.00 = 18.33%
  2020 -> 2021: $18,220.00 -> $25,086.00 = 37.68%
  2021 -> 2022: $25,086.00 -> $17,003.00 = -32.22%
  2022 -> 2023: $17,003.00 -> $17,172.00 = 0.99%
  2023 -> 2024: $17,172.00 -> $23,315.00 = 35.77%

Final Annual Returns:
  VMGRX: {'2015': 6.68, '2016': -5.49, '2017': 22.69, '2018': 9.61, '2019': 13.56, '2020': 18.33, '2021': 37.68, '2022': -32.22, '2023': 0.99, '2024': 35.77}
Detected format: Year (YYYY)
Found years: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(20

  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')


## Ishares

In [None]:
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
%%python --bg --out my_log
def process_filing(data):
    # Unpack the tuple (index, filing)
    index, filing = data
    
    try:
        # Network I/O
        text = filing.text()
        
        output_dir = Path.cwd() / "ishares"
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Disk I/O
        # Using index for the filename ensures uniqueness without a global counter
        filename = output_dir / f"ishares_{index}.txt"
        
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
            
        print(f"Saved: {filename}")
        
    except Exception as e:
        print(f"Error processing filing {index}: {e}")

MAX_WORKERS = 10 
    

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_filing, item): item[0] 
               for item in enumerate(latest_filings, start=1)}
    
    for future in as_completed(futures):
        index = futures[future]
        try:
            future.result()
        except Exception as e:
            print(f"Filing {index} failed: {e}")

Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_1.txt
Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_2.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_4.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_5.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_8.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_6.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_10.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_9.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_11.txt


Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_12.txt


KeyboardInterrupt: 

Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_14.txt
Error processing filing 15: Socket operation on non-socket
Saved: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/ishares/ishares_13.txt


In [41]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
from typing import List
import sys
from pathlib import Path
from tqdm import tqdm
# Auto-reload setup (run once at the start)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company

set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "HEZU"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")

def process_single_filing_multiprocess(filing_data):
    """
    Process a single filing (for multiprocessing).
    Note: Must pass serializable data, not the filing object directly
    """
    try:
        # Import inside function for multiprocessing
        import sys
        from pathlib import Path
        sys.path.append('../src')
        from simple_rag.extraction.parser import BlackRockFiling
        
        html_content, report_date = filing_data
        parser = BlackRockFiling(html_content)
        funds = parser.get_funds()
        
        performance_fund_tickers = []
        df_performance = None
        
        count = 0
        for fund in funds:

            if fund.performance_table is not None:
                performance_fund_tickers.append(fund.ticker)
                count += 1
        
        if count == 0:
            print("Funds calling get_financial_highlights2:")
            for fund in funds:
                if fund.performance_table is None:
                    print(f"  - {getattr(fund, 'name', getattr(fund, 'fund_name', 'Unknown'))}")

            print("Calling get_financial_highlights2")
            df_performance = parser.get_financial_highlights2()
        
        print(f"Filing {report_date}: Found {count} funds with performance tables, Total funds: {len(funds)}")
        
        return {
            'funds': funds,
            'performance_tickers': performance_fund_tickers,
            'df_performance': df_performance,
            'report_date': report_date
        }
    except Exception as e:
        print(f"Error processing filing: {e}")
        return None

if all_filings:
    unique_dates = sorted({f.report_date for f in all_filings if f.report_date})
    print("Unique report dates:", unique_dates)
    
    # Filter for filings from 2024-08-31 onward
    cutoff_date = "2024-09-31"
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date >= cutoff_date
    ]
    
    print("Found filings: ", len(latest_filings), "from", cutoff_date, "onward")
    
    # Optional: Show the dates of filtered filings
    print("Filtered filing dates:", sorted({f.report_date for f in latest_filings}))

# Prepare data for multiprocessing (fetch HTML first)
filing_data_list = [(filing.html(), filing.report_date) for filing in latest_filings]


print("Saved ishares.html")

funds_total = []
performance_funds = []
df_performances = []
ishares_funds = []
# Use ProcessPoolExecutor
with ProcessPoolExecutor() as executor:
    future_to_data = {executor.submit(process_single_filing_multiprocess, data): data 
                      for data in filing_data_list}
    
    for future in tqdm(as_completed(future_to_data), total=len(filing_data_list), desc="Processing filings"):
        result = future.result()
        
        if result:
            ishares_funds.extend(result['funds'])
            funds_total.extend(result['funds'])
            performance_funds.extend(result['performance_tickers'])
            
            if result['df_performance'] is not None:
                df_performances.append(result['df_performance'])

print(len(df_performances))
print(f"Total funds processed: {len(ishares_funds)}")
print(f"Funds with performance tables: {len(performance_funds)}")
print(f"Unique performance fund tickers: {set(performance_funds)}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Unique report dates: ['2003-04-30', '2003-07-31', '2004-02-29', '2004-03-31', '2004-04-30', '2004-07-31', '2005-02-28', '2005-03-31', '2005-04-30', '2005-07-31', '2006-02-28', '2006-03-31', '2006-04-30', '2006-07-31', '2007-02-28', '2007-03-31', '2007-04-30', '2007-07-31', '2008-02-29', '2008-03-31', '2008-04-30', '2008-07-31', '2009-02-28', '2009-03-31', '2009-04-30', '2009-07-31', '2009-08-31', '2010-02-28', '2010-03-31', '2010-04-30', '2010-07-31', '2010-08-31', '2011-02-28', '2011-03-31', '2011-04-30', '2011-07-31', '2011-08-31', '2011-10-31', '2012-02-29', '2012-03-31', '2012-04-30', '2012-07-31', '2012-08-31', '2012-10-31', '2013-02-28', '2013-03-31', '2013-04-30', '2013-07-31', '2013-08-31', '2013-10-31', '2014-02-28', '2014-03-31', '2014-04-30', '2014-07-31', '2014-08-31', '2014-10-31', '2015-02-28', '2015-03-31', '2015-04-30', '2015-07-31', '2015-08-31', '2015-10-31', '2016-02-29', '2016-03

Processing filings:   0%|          | 0/19 [00:00<?, ?it/s]

Processing: iShares Large Cap Accelerated Outcome ETF
Extracting context:  FY2025_C000256144Member
Tag not found:  dei:SecurityExchangeName FY2025_C000256144Member
Unknown Table:        0                                                 1
0  ‚Äã(a)  The underlying fund is iShares Core S&P 500 ETF.
1  ‚Äã(b)                      Excludes money market funds.
2  ‚Äã(c)                         Rounds to less than 0.1%.
Unknown table type:       0                                                 1
0  ‚Äã(a)  The underlying fund is iShares Core S&P 500 ETF.
1  ‚Äã(b)                      Excludes money market funds.
2  ‚Äã(c)                         Rounds to less than 0.1%.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Unknown Table:                                                     0
0  As of the date of this report, the Fund does n...
Unknown table type:                                                    0
0  As o

Processing filings:   5%|‚ñå         | 1/19 [00:07<02:16,  7.57s/it]

From2024-09-01to2025-08-31_C000242847Member
Tag not found:  dei:SecurityExchangeName FY2025_C000141932Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Ten largest industries are presented. Addition...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Ten largest industries are presented. Addition...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares International Developed Real Estate ETF
Extracting context:  FY2025_C000050169Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C000219701Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Ten largest industries are presented. Additio

Processing filings:  11%|‚ñà         | 2/19 [00:10<01:24,  5.00s/it]

Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000012060Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.Tag not found: 
 oef:ClassName From2024-08-01to2025-07-31_C000171896Member
Tag not found:  oef:ClassName Processing: iShares ESG Aware MSCI USA ETFFrom2024-08-01to2025-07-31_C000069397Member

Extracting context:  From2024-09-01to2025-08-31_C000174221Member
Processing: iShares Morningstar Mid-Cap Growth ETF
Extracting context:  From2024-05-01to2025-04-30_C000012193Member
Processing: iShares Core U.S. REIT ETF
Extracting context:  FY2025_C000042588Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.Unknown Table: 
       Footnote                   Description
0  Footnote(a)  Excludes money market funds.Unknown table type:       Footnote         

Processing filings:  16%|‚ñà‚ñå        | 3/19 [00:16<01:27,  5.46s/it]

 oef:LineGraphTableTextBlock
Processing: iShares Blockchain and Tech ETF
Extracting context:  FY2025_C000235105Member
Filing 2025-04-30: Found 6 funds with performance tables, Total funds: 6


Processing filings:  21%|‚ñà‚ñà        | 4/19 [00:16<00:50,  3.37s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000012097Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Morningstar Value ETF
Extracting context:  FY2025_C000012099Member
Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000038163Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares Currency Hedged MSCI Eurozone ETF
Extracting context:  FY2025_C000141929Member
Processing: iShares Morningstar Multi-Asset Income ETF
Extracting context:  From2024-08-01to2025-07-31_C000112640Member
Tag not found:  dei:Secur

Processing filings:  26%|‚ñà‚ñà‚ñã       | 5/19 [00:22<00:59,  4.23s/it]

Processing: iShares ESG Aware MSCI EAFE ETF
Extracting context:  FY2025_C000170246Member
Tag not found:  oef:ClassName From2024-09-01to2025-08-31_C000232922Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã#  Ten largest countries/geographic regions are p...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã#  Ten largest countries/geographic regions are p...
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares MSCI USA Momentum Factor ETF
Extracting context:  From2024-08-01to2025-07-31_C000125223Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012056MemberTag not found: 
 oef:ClassName From2024-05-01

Processing filings:  32%|‚ñà‚ñà‚ñà‚ñè      | 6/19 [00:30<01:11,  5.47s/it]

Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown Table: Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds. 
      Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares U.S. Healthcare Providers ETF
Extracting context:  From2024-04-01to2025-03-31_C000025772Member
Processing: iShares LifePath Retirement ETFProcessing: iShares Core S&P Total U.S. Stock Market ETF
Extracting context: 
 Extracting context: From2024-08-01to2025-07-31_C000245481Member
 From2024-04-01to2025-03-31_C000012047Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012102Member
Tag not found:  dei:SecurityExchangeName FY2025_C000244564Member
Unknown Table:        0                                                  1
0  ‚Äã(a)        

Processing filings:  37%|‚ñà‚ñà‚ñà‚ñã      | 7/19 [00:53<02:15, 11.32s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000140338Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012071Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
1  ‚Äã(b)     Rounds to less than 0.1%.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
1  ‚Äã(b)     Rounds to less than 0.1%.
Processing: iShares 0-5 Year High Yield Corporate Bond ETF
Extracting context:  FY2024_C000131291Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown Table: Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
       0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block: Tag not found:  o

Processing filings:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 8/19 [00:54<01:25,  7.78s/it]

Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Core MSCI Total International Stock ETF
Extracting context:  FY2025_C000119716Member
Processing: iShares U.S. Oil & Gas Exploration & Production ETF
Extracting context:  From2024-04-01to2025-03-31_C000025768Member
Tag not found:  oef:FactorsAffectingPerfTextBlock From2024-05-01to2025-04-30_C000012052Member
Tag not found:  dei:SecurityExchangeName FY2025_C000145378Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012065Member
Processing: iShares North American Natural Resources ETF
Extracting context:  From2024-04-01to2025-03-31_C000012086Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown Table: Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds. 
      0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown tabl

Processing filings:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 9/19 [01:03<01:22,  8.28s/it]

  - iShares MSCI Poland ETF
  - iShares MSCI Qatar ETF
  - iShares MSCI Saudi Arabia ETF
  - iShares MSCI UAE ETF
  - iShares MSCI United Kingdom ETF
  - iShares MSCI World Small-Cap ETF
  - iShares Paris-Aligned Climate Optimized MSCI World ex USA ETF
Calling get_financial_highlights2
Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000012054Member
Tag not found:  oef:ClassName From2024-04-01to2025-03-31_C000012073Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  Tag not found: oef:LineGraphTableTextBlock
 oef:FactorsAffectingPerfTextBlock From2024-04-01to2025-03-31_C000025771Member
Processing: iShares Currency Hedged MSCI EAFE Small-Cap ETF
Extracting context:  FY2025_C000157306Member
Found 79 potential Financial Highlights sections
Processing table with shape: (61, 3)
Processing table with shape: (23, 6)
Cleaned Value: 821452 from label: net assets, end of year (000)
Cleaned Value: 708068 from label: net assets, end of year

Processing filings:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 10/19 [01:08<01:04,  7.18s/it]

Processing: iShares New York Muni Bond ETFUnknown Table:  
      Footnote                   Description
0  Footnote(a)  Excludes money market funds.Extracting context:  
From2024-03-01to2025-02-28_C000053740Member
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Aaa - A Rated Corporate Bond ETF
Extracting context:  FY2024_C000110081Member
Processing: iShares Russell 2500 ETF
Extracting context:  Processing: iShares U.S. Industrials ETF
From2024-04-01to2025-03-31_C000183467MemberExtracting context: 
 From2024-05-01to2025-04-30_C000012055Member
Processing: iShares U.S. Regional Banks ETF
Extracting context:  From2024-04-01to2025-03-31_C000025776Member
Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000012055Member
Tag not found:  oef:ClassName From2024-03-01to2025-02-28_C0000537

Processing filings:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 11/19 [01:21<01:11,  8.99s/it]

Tag not found:  oef:ClassName From2024-04-01to2025-03-31_C000012039Member
Tag not found:  Tag not found: oef:FactorsAffectingPerfTextBlock  oef:ClassName From2024-05-01to2025-04-30_C000012061MemberFrom2024-04-01to2025-03-31_C000012075Member

Tag not found:  dei:SecurityExchangeName FY2024_C000110079Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares 0-5 Year Investment Grade Corporate Bond ETF
Extracting context:  FY2024_C000131292MemberUnknown Table: 
       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...Tag not found:  
oef:ClassNameUnknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin... 
From2023-11-01to2024-10-31_C00021

Processing filings:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 12/19 [01:27<00:57,  8.24s/it]

Processing: iShares 10+ Year Investment Grade Corporate Bond ETF
Extracting context:  FY2025_C000080009Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares Top 20 U.S. Stocks ETF
Extracting context: Tag not found:   From2024-04-01to2025-03-31_C000254701Memberoef:FactorsAffectingPerfTextBlock
 From2024-04-01to2025-03-31_C000222498Member
Tag not found:  dei:SecurityExchangeName FY2025_C000154548Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Aaa - A Rated Corporate Bond ETF
Extracting context:  FY2024_C000110081Member
Tag not found:  oef:ClassName From2023-11-01to2024-10-31_C000093655Member
Filing 2025-03-31: Found 17 funds with performance tables, Total funds: 19


Processing filings:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 13/19 [01:29<00:36,  6.15s/it]

Unknown Table:  Failed to extract tables from block:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.oef:LineGraphTableTextBlock

Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.No tables found for block: 
 oef:LineGraphTableTextBlock
Processing: iShares MSCI ACWI ETF
Extracting context:  FY2025_C000061364Member
Processing: iShares TIPS Bond ETF
Extracting context:  From2023-11-01to2024-10-31_C000012093Member
Tag not found:  oef:ClassName From2024-04-01to2025-03-31_C000254701Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã#  Ten largest countries/geographic regions are p...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã#  Ten largest countries/geographic regions are p...
Tag not foun

Processing filings:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 14/19 [01:36<00:31,  6.37s/it]

Unknown Table:        Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Unknown table type:       Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Tag not found:  dei:SecurityExchangeName FY2024_C000194633Member
Processing: iShares iBonds Dec 2025 Term Treasury ETF
Extracting context:  From2023-11-01to2024-10-31_C000217186Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect

Processing filings:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 15/19 [02:13<01:03, 15.80s/it]

Tag not found:  oef:FactorsAffectingPerfTextBlock FY2024_C000249959Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares High Yield Systematic Bond ETF
Extracting context:  FY2025_C000182992Member
Unknown Table:        Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Unknown table type:       Footnote                                        Description
0    Footn

Processing filings:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 16/19 [02:37<00:54, 18.07s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000182993Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Tag not found:  dei:SecurityExchangeName FY2024_C000219751Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No table

Processing filings:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 17/19 [02:46<00:30, 15.42s/it]

No data obtained
No data obtained
Processing: iShares iBonds Dec 2034 Term Treasury ETF
Extracting context:  FY2024_C000250196Member
Tag not found:  oef:FactorsAffectingPerfTextBlock FY2024_C000250196Member
No data obtained
Processing: iShares iBonds Dec 2031 Term Corporate ETF
Extracting context:  FY2024_C000228040Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Tag not found:  dei:SecurityExchangeName FY2024_C000228040Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table ty

Processing filings:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 18/19 [03:55<00:31, 31.71s/it]

Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Russell 2000 BuyWrite ETF
Extracting context:  FY2024_C000247831Member
Tag not found:  dei:SecurityExchangeName FY2024_C000247831Member
Unknown Table:        0                                                 1
0  ‚Äã(a)  The underlying fund is iShares Russell 2000 ETF.
1  ‚Äã(b)                      Excludes money market funds.
Unknown table type:       0                                                 1
0  ‚Äã(a)  The underlying fund is iShares Russell 2000 ETF.
1  ‚Äã(b)                      Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Failed to extract tables from block:  oef:AvgAnnlRtrTableTextBlock
No tables found for block:  oef:AvgAnnlRtrTableTextBlock
Processing: iShares S&P 500 BuyWrite ETF
Extracting context:  FY2024_C000247832Membe

Processing filings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 19/19 [04:15<00:00, 13.46s/it]


11
Total funds processed: 389
Funds with performance tables: 119
Unique performance fund tickers: {'EVUS', 'IBMM', 'ELQD', 'IEO', 'EGUS', 'IBB', 'ESMV', 'IAI', 'IBHJ', 'ISCV', 'ITDI', 'IBTG', 'IBMQ', 'AOK', 'IWB', 'IBTL', 'IGE', 'ITDD', 'IBMO', 'ITDH', 'IBTF', 'TIP', 'IVV', 'IAT', 'TLH', 'IBHH', 'IMCB', 'GOVT', 'EFNL', 'IEI', 'IHF', 'IBTM', 'EIRL', 'IBHI', 'ESGU', 'IHE', 'LRGF', 'TLT', 'VLUE', 'EAOM', 'ITDJ', 'EWUS', 'SIZE', 'AOA', 'BYLD', 'IBMP', 'ILCG', 'USXF', 'FOVL', 'IRTR', 'SMLF', 'SHY', 'IJJ', 'ITA', 'IYLD', 'IAK', 'IHI', 'QUAL', 'ITDB', 'IBTJ', 'IWR', 'TECB', 'IJK', 'SMMV', 'IBTH', 'MTUM', 'STIP', 'ICF', 'EAOA', 'IWF', 'IBMN', 'ISCG', 'SMMD', 'AGZ', 'IEZ', 'IBTI', 'ESML', 'NYF', 'SUB', 'SHV', 'AOR', 'ITDF', 'IBMR', 'IMCG', 'SVAL', 'ITDC', 'IBTK', 'EAOR', 'IUSB', 'USCL', 'IFRA', 'IEF', 'IJT', 'IGM', 'ITDE', 'IGV', 'SUSL', 'IWO', 'ITDG', 'IWN', 'IWC', 'IYZ', 'MUB', 'USMV', 'EAOK', 'IWM', 'PABU', 'ITB', 'SGOV', 'LQDB', 'IWP', 'IDGT', 'IBDX', 'SOXX', 'ITOT', 'GOVZ', 'IBTE', 'AOM', 

In [42]:
import pandas as pd
import re
%reload_ext autoreload
from simple_rag.models.fund import FinancialHighlights

if df_performances:
    df_performance = pd.concat(df_performances, ignore_index=True)
else:
    df_performance = pd.DataFrame() # Empty fallback
    print("No performance data found.")

print(df_performance.head())
def clean_financial_number(val):
    """
    Parses financial strings like '23.19 %(b)' or '(24.82 )%'.
    - Extracts the numerical value.
    - Handles (12.34) as negative -12.34.
    - Ignores footnote markers like (a), (b).
    - Removes %, $, and commas.
    """
    if pd.isna(val) or val is None:
        return None
    
    # Convert to string and strip whitespace
    s = str(val).strip()
    
    # 1. Regex to find the number (handles decimals and commas)
    # Looks for digits, optional commas, and optional decimal part
    match = re.search(r'(\d{1,3}(?:,\d{3})*\.?\d*|\d*\.?\d+)', s)
    
    if not match:
        return None
        
    # Get the raw number string (e.g., "24.82" or "1,234.56")
    num_str = match.group(0)
    
    # 2. Check for negative indication: "(" at the start of the string
    # Accounting format always puts the negative parenthesis at the start: (24.82)%
    is_negative = s.startswith('(')
    
    try:
        # Remove commas and convert to float
        clean_num = float(num_str.replace(',', ''))
        
        # Apply negative sign if detected
        return -clean_num if is_negative else clean_num
        
    except ValueError:
        return None
returns_lookup = df_performance.copy()


# Apply to all financial columns
financial_cols = ['total_return', 'expense_ratio', 'net_income_ratio', 'portfolio_turnover', 'nav_end', 'nav_beginning', 'net_assets']
for col in financial_cols:
    if col in returns_lookup.columns:
        returns_lookup[f'{col}_clean'] = returns_lookup[col].apply(clean_financial_number)

# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    if fund_obj.ticker in performance_funds:
        continue
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}

    if not hasattr(fund_obj, 'financial_highlights') or fund_obj.financial_highlights is None:
        fund_obj.financial_highlights = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "¬Æ" in name:
        name = name.replace("¬Æ", "")
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
        
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.contains(name, case=False, na=False, regex=False)]
    
    if len(name_matches) == 0:
        print("  No name matches found")
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    fund_obj.share_class = "ETF Shares"
    share_class = fund_obj.share_class
    
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if name_matches['share_class'].isna().all():
        fund_obj.annual_returns = dict(zip(name_matches['year'], name_matches['total_return_clean']))
        print("Annual return: ", fund_obj.annual_returns)
        continue
        
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}'")
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    print(f"  Found {len(share_class_matches)} matching records")
    
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        
        # Store annual return
        fund_obj.annual_returns[year] = row['total_return_clean']
        
        # Store full financial highlights snapshot
        fund_obj.financial_highlights[year] = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean'),
            expense_ratio=row.get('expense_ratio_clean'),
            total_return=row.get('total_return_clean'),
            net_assets=row.get('net_assets'),  # You may need to add this cleaning
            net_assets_value_begining=row.get('nav_beginning_clean'),
            net_assets_value_end=row.get('nav_end_clean') ,
            net_income_ratio=row.get('net_income_ratio_clean')
        )
    
    print(f"  Annual returns: {fund_obj.annual_returns}")
    print(f"  Financial highlights years: {list(fund_obj.financial_highlights.keys())}")
    for key, value in fund_obj.financial_highlights.items():
        print(f"    {key}: {value}")

                                   fund_name share_class  year net_assets  \
0  iShares¬†Large¬†Cap¬†Accelerated¬†Outcome¬†ETF  ETF Shares  2025          $   
1  iShares¬†Large¬†Cap¬†Accelerated¬†Outcome¬†ETF  ETF Shares  2025   13473000   
2       iShares¬†Large¬†Cap¬†Max¬†Buffer¬†Mar¬†ETF  ETF Shares  2025   38203000   
3       iShares¬†Large¬†Cap¬†Max¬†Buffer¬†Jun¬†ETF  ETF Shares  2025  161134000   
4       iShares¬†Large¬†Cap¬†Max¬†Buffer¬†Jun¬†ETF  ETF Shares  2024   72687000   

  nav_beginning nav_end total_return expense_ratio net_income_ratio  \
0             $       $         None          None             None   
1         25.00   25.91         3.64          0.47             0.69   
2         25.00   25.81         3.25          0.47             0.65   
3         25.24   27.59        10.21          0.47             0.96   
4         25.00   25.24         0.95          0.47            (0.42   

  portfolio_turnover distribution_shares  
0               None                Non

In [45]:
import re
from collections import defaultdict
import pandas as pd

def infer_first_col_format(value: object) -> str:
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return "EMPTY"

    s = str(value).strip()
    if s == "" or s.lower() == "nan":
        return "EMPTY"

    # Jan 23, Aug 15
    if re.match(r"^[A-Za-z]{3}\s+\d{2}$", s):
        return "MON_YY"

    # 2015
    if re.match(r"^\d{4}$", s):
        return "YYYY"

    # 2024-08-31
    if re.match(r"^\d{4}-\d{2}-\d{2}$", s):
        return "YYYY_MM_DD"

    # 08/31/24 or 8/31/2024
    if re.match(r"^\d{1,2}/\d{1,2}/\d{2,4}$", s):
        return "MM_DD_YY(YY)"

    # 31/08/24 (if you ever have EU style)
    if re.match(r"^\d{1,2}-\d{1,2}-\d{2,4}$", s):
        return "DD_MM_YY(YY)_or_MM_DD_YY(YY)_DASH"

    # Fallbacks
    if re.search(r"\d", s):
        return "OTHER_HAS_DIGITS"

    return "OTHER_TEXT"


def describe_first_column_formats(
    dfs,
    names=None,
    samples_per_df=3,
    max_groups_to_show=50,
    max_dfs_per_group_to_print=5,
):
    if names is None:
        names = [f"df[{i}]" for i in range(len(dfs))]

    groups = defaultdict(list)

    for name, df in zip(names, dfs):
        if df is None or not isinstance(df, pd.DataFrame) or df.empty:
            groups["EMPTY_DF"].append((name, df))
            continue

        first_col = df.columns[0]
        # take first non-empty sample from first column
        series = df[first_col].astype(str)
        sample_vals = [v for v in series.head(20).tolist() if str(v).strip() and str(v).lower() != "nan"]

        fmt = infer_first_col_format(sample_vals[0]) if sample_vals else "EMPTY_FIRST_COL"
        groups[fmt].append((name, df))

    sorted_groups = sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)

    print(f"Total dataframes: {len(dfs)}")
    print(f"Unique first-column formats: {len(sorted_groups)}\n")

    for gi, (fmt, members) in enumerate(sorted_groups[:max_groups_to_show], start=1):
        print("=" * 100)
        print(f"Group #{gi}: {fmt}")
        print(f"Count: {len(members)}")

        example_shapes = [m[1].shape for m in members if isinstance(m[1], pd.DataFrame)]
        print(f"Example shapes (first 10): {example_shapes[:10]}")

        # Print a few examples per group
        for ex_i, (name, df) in enumerate(members[:max_dfs_per_group_to_print], start=1):
            if df is None or not isinstance(df, pd.DataFrame) or df.empty:
                print(f"  [Example {ex_i}] {name}: EMPTY/None")
                continue

            first_col = df.columns[0]
            vals = [v for v in df[first_col].head(20).tolist() if str(v).strip() and str(v).lower() != "nan"]
            vals = vals[:samples_per_df]

            print(f"  [Example {ex_i}] {name}")
            print(f"    first_col: {first_col!r}")
            print(f"    columns: {list(df.columns)[:12]}{' ...' if len(df.columns) > 12 else ''}")
            print(f"    first_col_samples: {vals}")

        print()


# Example usage with your list of performance tables
performances = []
perf_names = []
for i, fund in enumerate(funds_total):
    if fund.ticker in performance_funds and fund.performance_table is not None:
        performances.append(fund.performance_table)
        perf_names.append(f"{fund.ticker} | {fund.name} | {fund.share_class}")

describe_first_column_formats(performances, names=perf_names)

Total dataframes: 119
Unique first-column formats: 2

Group #1: MON_YY
Count: 118
Example shapes (first 10): [(62, 6), (62, 6), (62, 6), (62, 6), (107, 4), (120, 4), (120, 4), (67, 4), (120, 4), (120, 4)]
  [Example 1] EAOK | iShares ESG Aware 30/70 Conservative Allocation ETF | ETF Shares
    first_col: 'Unnamed: 0'
    columns: ['Unnamed: 0', 'Fund', 'Bloomberg U.S. Universal Index', 'MSCI All Country World Index (Net)', 'BlackRock ESG Aware Conservative Allocation Index', 'S&P Target Risk Conservative Index']
    first_col_samples: ['Jun 20', 'Jul 20', 'Aug 20']
  [Example 2] EAOM | iShares ESG Aware 40/60 Moderate Allocation ETF | ETF Shares
    first_col: 'Unnamed: 0'
    columns: ['Unnamed: 0', 'Fund', 'Bloomberg U.S. Universal Index', 'MSCI All Country World Index (Net)', 'BlackRock ESG Aware Moderate Allocation Index', 'S&P Target Risk Moderate Index']
    first_col_samples: ['Jun 20', 'Jul 20', 'Aug 20']
  [Example 3] EAOR | iShares ESG Aware 60/40 Balanced Allocation ETF | ET

In [None]:
import sys
from pathlib import Path
%reload_ext autoreload
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


from src.simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        print(fund.performance_table)
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        print("---")

   Unnamed: 0     Fund Bloomberg U.S. Universal Index  \
0      Jun 20  $10,091                        $10,039   
1      Jul 20  $10,350                        $10,216   
2      Aug 20  $10,498                        $10,157   
3      Sep 20  $10,374                        $10,139   
4      Oct 20  $10,269                        $10,102   
..        ...      ...                            ...   
57     Mar 25  $11,423                         $9,828   
58     Apr 25  $11,453                         $9,863   
59     May 25  $11,606                         $9,813   
60     Jun 25  $11,902                         $9,966   
61     Jul 25  $11,920                         $9,951   

   MSCI All Country World Index (Net)  \
0                             $10,171   
1                             $10,708   
2                             $11,364   
3                             $10,998   
4                             $10,730   
..                                ...   
57                          

In [1]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Loaded 383 funds from pickle file


## Summary Prospectus

In [11]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from IPython.display import display, Markdown
from src.simple_rag.extraction.general_info import FundInfoExtractor

from pathlib import Path


set_identity('luis.alvarez.conde@alumnos.upm.es')

tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]


for ticker in tickers:
    company = Company(ticker)
    processed_funds = []
    filings = company.get_filings(form="497K")

    for filing in filings:
        text = filing.text()
        extractor = FundInfoExtractor(text, ticker=ticker)
        fund_data = extractor.get_structured_data()
        if fund_data['ticker'] in processed_funds:
            print("First duplicate: ", fund_data['ticker'])
            break
        
        processed_funds.append(fund_data['ticker'])
        md = extractor.get_clean_markdown()
        for fund in funds_total:
            if fund.ticker == fund_data['ticker']:
                fund.summary_prospectus = md
                fund.managers = fund_data['managers']
                fund.strategies = fund_data['strategies']
                fund.risks = fund_data['risks']
                fund.objective = fund_data['objective']
                break
        
    print("Processed funds: ", len(processed_funds), "for ticker: ", ticker)



First duplicate:  VSCIX
Processed funds:  48 for ticker:  VOO


First duplicate:  VEXC
Processed funds:  36 for ticker:  MGK


First duplicate:  XT
Processed funds:  120 for ticker:  HEZU


First duplicate:  VMGRX
Processed funds:  1 for ticker:  VMGRX


First duplicate:  VGSNX
Processed funds:  11 for ticker:  VDIGX


## NPORT (Portfolio Composition)

In [2]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
from threading import Lock
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from src.simple_rag.extraction.nport import NPortProcessor
from src.simple_rag.models.fund import PortfolioHolding, Derivatives, NonDerivatives
from pathlib import Path

company_json_path = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/sec_data/company_tickers.json")

set_identity('luis.alvarez.conde@alumnos.upm.es')

def process_single_filing(filing, ticker, company_json_path):
    """Process a single filing - can be parallelized"""
    try:
        xml_data = filing.obj()
        fund_name = xml_data.get_fund_series().name
        reporting_period = xml_data.reporting_period
        portfolio_list = xml_data.investments
        
        # Process holdings
        proc = NPortProcessor(company_tickers_json_path=company_json_path, min_similarity=0.74)
        holdings = proc.process_holdings(portfolio_list)
        result = proc.enrich_tickers(holdings, verbose=False)  # Set verbose=False to reduce I/O
        
        not_matches = result[result['matched_ticker'].isna() | (result['matched_ticker'] == '')]
        
        return {
            'fund_name': fund_name,
            'reporting_period': reporting_period,
            'holdings': holdings,
            'result': result,
            'not_matches': not_matches,
            'ticker': ticker,
            'report_date': filing.report_date
        }
    except Exception as e:
        print(f"Error processing filing for {ticker}: {e}")
        return None

def process_ticker(ticker, company_json_path):
    """Process all filings for a single ticker - SEQUENTIAL within ticker"""
    try:
        nport_file = Company(ticker)
        filings = sorted(nport_file.get_filings(form="NPORT-P"), 
                        key=lambda x: x.report_date, reverse=True)
        
        if not filings:
            print(f"No filings found for {ticker}")
            return None
            
        print(f"Processing ticker: {ticker}, most recent filing date: {filings[0].report_date}")
        
        funds_processed_set = set()
        ticker_results = []
        
        # Process filings SEQUENTIALLY for this ticker (to respect the stop condition)
        for filing in filings:
            result = process_single_filing(filing, ticker, company_json_path)
            
            if result is not None:
                # Check if we've already processed this fund
                if result['fund_name'].lower() in funds_processed_set:
                    print(f"Stopping - already processed fund: {result['fund_name']}")
                    break
                
                funds_processed_set.add(result['fund_name'].lower())
                ticker_results.append(result)
                
                print(f"{ticker} - Fund: {result['fund_name']}, Holdings: {len(result['holdings'])}, Unmatched: {len(result['not_matches'])}")
        
        return {
            'ticker': ticker,
            'results': ticker_results,
            'funds_processed': list(funds_processed_set)
        }
    
    except Exception as e:
        print(f"Error processing ticker {ticker}: {e}")
        return None

# Main execution - PARALLEL across tickers only
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

# Use fewer workers to avoid overwhelming the system
max_workers = min(3, len(tickers))  # Start with 3 workers
print(f"Using {max_workers} workers for tickers")

all_results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_ticker = {
        executor.submit(process_ticker, ticker, company_json_path): ticker 
        for ticker in tickers
    }
    
    for future in tqdm(as_completed(future_to_ticker), total=len(tickers), desc="Processing tickers"):
        ticker = future_to_ticker[future]
        try:
            result = future.result()
            if result:
                all_results.append(result)
                print(f"\nCompleted {ticker}: {len(result['funds_processed'])} funds processed")
        except Exception as e:
            print(f"Error with ticker {ticker}: {e}")

# Update funds_total object with the results
print("\n=== Updating funds_total ===")
for ticker_result in all_results:
    for filing_result in ticker_result['results']:
        fund_name = filing_result['fund_name']
        reporting_period = filing_result['reporting_period']
        holdings = filing_result['holdings']
        
        # Update your funds_total structure
        for fund in funds_total:
            if fund_name.lower() == fund.name.lower():
                print(f"Updating fund: {fund.name}")
                # Note: 'derivatives' variable is not defined in your original code
                # You may need to extract it from the filing_result
                fund.non_derivatives = NonDerivatives(
                    date=reporting_period,
                    holdings_df=holdings
                )
                fund.derivatives = Derivatives(
                    date=reporting_period,
                    derivatives_df=derivatives
                )
                break

print("\n=== Processing Complete ===")
print(f"Total tickers processed: {len(all_results)}")
for result in all_results:
    print(f"{result['ticker']}: {len(result['funds_processed'])} funds")

Using 3 workers for tickers


Processing tickers:   0%|          | 0/5 [00:00<?, ?it/s]

Processing ticker: MGK, most recent filing date: 2025-09-30
Processing ticker: VOO, most recent filing date: 2025-09-30
MGK - Fund: VANGUARD MEGA CAP GROWTH INDEX FUND, Holdings: 70, Unmatched: 3
VOO - Fund: VANGUARD MID-CAP VALUE INDEX FUND, Holdings: 186, Unmatched: 3
MGK - Fund: VANGUARD FTSE SOCIAL INDEX FUND, Holdings: 417, Unmatched: 5
Processing ticker: HEZU, most recent filing date: 2025-10-31
MGK - Fund: VANGUARD COMMUNICATION SERVICES INDEX FUND, Holdings: 126, Unmatched: 7
HEZU - Fund: iShares MSCI EAFE Min Vol Factor ETF, Holdings: 240, Unmatched: 115
MGK - Fund: VANGUARD HEALTH CARE INDEX FUND, Holdings: 413, Unmatched: 8
MGK - Fund: VANGUARD MEGA CAP VALUE INDEX FUND, Holdings: 128, Unmatched: 4
MGK - Fund: VANGUARD EXTENDED DURATION TREASURY INDEX FUND, Holdings: 83, Unmatched: 1
MGK - Fund: VANGUARD INDUSTRIALS INDEX FUND, Holdings: 391, Unmatched: 6
MGK - Fund: VANGUARD CONSUMER STAPLES INDEX FUND, Holdings: 113, Unmatched: 5
MGK - Fund: VANGUARD UTILITIES INDEX FUND, 

Processing tickers:  20%|‚ñà‚ñà        | 1/5 [03:41<14:44, 221.10s/it]

Stopping - already processed fund: VANGUARD EXTENDED MARKET INDEX FUND

Completed VOO: 12 funds processed
Processing ticker: VMGRX, most recent filing date: 2025-10-31
HEZU - Fund: iShares iBonds Dec 2030 Term Muni Bond ETF, Holdings: 1206, Unmatched: 526
VMGRX - Fund: VANGUARD SELECTED VALUE FUND, Holdings: 130, Unmatched: 2
MGK - Fund: VANGUARD ESG U.S. STOCK ETF, Holdings: 1330, Unmatched: 14
VMGRX - Fund: VANGUARD HIGH DIVIDEND YIELD INDEX FUND, Holdings: 571, Unmatched: 5
MGK - Fund: VANGUARD ENERGY INDEX FUND, Holdings: 117, Unmatched: 3
VMGRX - Fund: VANGUARD MID-CAP GROWTH FUND, Holdings: 139, Unmatched: 3
VMGRX - Fund: VANGUARD INTERNATIONAL DIVIDEND GROWTH FUND, Holdings: 47, Unmatched: 16
MGK - Fund: VANGUARD INTERNATIONAL GROWTH FUND, Holdings: 128, Unmatched: 38
VMGRX - Fund: VANGUARD ADVICE SELECT DIVIDEND GROWTH FUND, Holdings: 31, Unmatched: 2


Processing tickers:  40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [03:59<05:06, 102.13s/it]

Stopping - already processed fund: VANGUARD MEGA CAP GROWTH INDEX FUND

Completed MGK: 22 funds processed
Processing ticker: VDIGX, most recent filing date: 2025-10-31
VDIGX - Fund: VANGUARD DIVIDEND GROWTH FUND, Holdings: 56, Unmatched: 3
VDIGX - Fund: VANGUARD ENERGY FUND, Holdings: 42, Unmatched: 6
VDIGX - Fund: VANGUARD GLOBAL CAPITAL CYCLES FUND, Holdings: 75, Unmatched: 16
VDIGX - Fund: VANGUARD HEALTH CARE FUND, Holdings: 99, Unmatched: 5
VDIGX - Fund: VANGUARD REAL ESTATE INDEX FUND, Holdings: 165, Unmatched: 4
VDIGX - Fund: VANGUARD GLOBAL ESG SELECT STOCK FUND, Holdings: 38, Unmatched: 7
VMGRX - Fund: VANGUARD EMERGING MARKETS GOVERNMENT BOND INDEX FUND, Holdings: 841, Unmatched: 129
VMGRX - Fund: VANGUARD ADVICE SELECT GLOBAL VALUE FUND, Holdings: 107, Unmatched: 17
VDIGX - Fund: VANGUARD DIVIDEND APPRECIATION INDEX FUND, Holdings: 344, Unmatched: 4


Processing tickers:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [04:14<02:04, 62.28s/it] 

Stopping - already processed fund: VANGUARD ENERGY FUND

Completed VDIGX: 7 funds processed
VMGRX - Fund: VANGUARD INTERNATIONAL DIVIDEND APPRECIATION INDEX FUND, Holdings: 349, Unmatched: 157
VMGRX - Fund: VANGUARD INTERNATIONAL EXPLORER FUND, Holdings: 340, Unmatched: 157
VMGRX - Fund: VANGUARD INTERNATIONAL HIGH DIVIDEND YIELD INDEX FUND, Holdings: 1559, Unmatched: 724
VMGRX - Fund: VANGUARD ADVICE SELECT INTERNATIONAL GROWTH FUND, Holdings: 29, Unmatched: 7
VMGRX - Fund: VANGUARD GLOBAL MINIMUM VOLATILITY FUND, Holdings: 238, Unmatched: 49


Processing tickers:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [04:34<00:45, 45.55s/it]

Stopping - already processed fund: VANGUARD ADVICE SELECT INTERNATIONAL GROWTH FUND

Completed VMGRX: 12 funds processed
HEZU - Fund: iShares Core 1-5 Year USD Bond ETF, Holdings: 7008, Unmatched: 1674
HEZU - Fund: iShares Core MSCI Pacific ETF, Holdings: 1367, Unmatched: 677
HEZU - Fund: iShares Environmentally Aware Real Estate ETF, Holdings: 356, Unmatched: 109
HEZU - Fund: iShares Floating Rate Bond ETF, Holdings: 476, Unmatched: 108
HEZU - Fund: iShares Core International Aggregate Bond ETF, Holdings: 7102, Unmatched: 4371
HEZU - Fund: iShares Aaa - A Rated Corporate Bond ETF, Holdings: 3361, Unmatched: 210
HEZU - Fund: iShares Russell 2000 BuyWrite ETF, Holdings: 3, Unmatched: 2
HEZU - Fund: iShares MSCI ACWI Low Carbon Target ETF, Holdings: 981, Unmatched: 233
HEZU - Fund: iShares Global Equity Factor ETF, Holdings: 630, Unmatched: 184
HEZU - Fund: iShares iBonds Dec 2030 Term Corporate ETF, Holdings: 714, Unmatched: 47
HEZU - Fund: iShares Core MSCI International Developed Mark

Processing tickers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [15:23<00:00, 184.67s/it]

Stopping - already processed fund: iShares iBonds 2032 Term High Yield and Income ETF

Completed HEZU: 356 funds processed

=== Updating funds_total ===
Updating fund: Vanguard Mid-Cap Value Index Fund





NameError: name 'derivatives' is not defined

In [None]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm

%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from src.simple_rag.extraction.nport import NPortProcessor
from src.simple_rag.models.fund import PortfolioHolding, Derivatives, NonDerivatives
from pathlib import Path

company_json_path = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/sec_data/company_tickers.json")

# 1. Initialize the Fund (can use Ticker or CIK)

set_identity('luis.alvarez.conde@alumnos.upm.es')
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]
for ticker in tickers:

    nport_file = Company(ticker)
    filings = sorted(nport_file.get_filings(form="NPORT-P"), key=lambda x: x.report_date, reverse=True)
    print(f"Processing ticker: {ticker}, most recent filing date: {filings[0].report_date}")
    funds_processed = []
    for filing in filings:

        print("Processing filing with date:", filing.report_date)
        xml_data = filing.obj() 
        # Show all attributes (filtering out internal python methods starting with __)
        #print([attr for attr in dir(xml_data) if not attr.startswith('__')])
        
        fund_name = xml_data.get_fund_series().name
        if fund_name.lower() in funds_processed:
            print("Last fund processed: ", fund_name)
            break
        print("Fund name:", fund_name)
        
        reporting_period = xml_data.reporting_period
        print("Reporting period:", reporting_period)
        
        portfolio_list = xml_data.investments
        
        proc = NPortProcessor(company_tickers_json_path=company_json_path, min_similarity=0.74)
        holdings = proc.process_holdings(portfolio_list)
        result = proc.enrich_tickers(holdings, verbose=True)
        
        print("Number of holdings:", len(holdings))
        # This method maps the title of the company to the ticker
        
        for fund in funds_total:
            if fund_name.lower() == fund.name.lower():
                print(f"Found fund: {fund.name}")
                funds_processed.append(fund.name.lower())
                fund.derivatives = Derivatives(
                    date=reporting_period,
                    derivatives_df=derivatives
                )
                fund.non_derivatives = NonDerivatives(
                    date=reporting_period,
                    holdings_df=holdings
                )
                break
        
        not_matches = result[result['matched_ticker'].isna() | (result['matched_ticker'] == '')]
        print(f"Number of unmatched holdings: {len(not_matches)}")
        print(not_matches.head())

    print(len(funds_processed))


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Processing ticker: VOO, most recent filing date: 2025-09-30
Processing filing with date: 2025-09-30
Fund name: VANGUARD MID-CAP VALUE INDEX FUND
Reporting period: 2025-09-30
Number of matched holdings: 181
Number of holdings: 186
Found fund: Vanguard Mid-Cap Value Index Fund
Number of unmatched holdings: 3
                                          holding_name ticker_before  \
147                                    Schlumberger NV          None   
173  Vanguard Cmt Funds-Vanguard Market Liquidity Fund          None   
183  Vanguard Cmt Funds-Vanguard Market Liquidity Fund          None   

    ticker_after matched_ticker matched_title  similarity  updated  
147         None           None          None    0.608696    False  
173         None           None          None    0.297297    False  
183         None           None          None    0.297297    False  
Processing filing with date: 2025-09-30

In [3]:
count = 0
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from src.simple_rag.extraction.nport import NPortProcessor

processor = NPortProcessor()
df = processor.to_df(holdings)

for fund in funds_total:
    if fund.non_derivatives is not None:
        
        df = processor.to_df(fund.non_derivatives.holdings_df)
        fund.non_derivatives.holdings_df = df
        count += 1
        print(df.head())
        
print(f"Found {count} funds with non-derivatives data")
print(f"Total funds processed: {len(funds_total)}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
     name ticker cusip  isin   lei shares market_value weight_pct currency  \
0    name   None  None  None  None   None         None       None     None   
1  ticker   None  None  None  None   None         None       None     None   
2   cusip   None  None  None  None   None         None       None     None   
3    isin   None  None  None  None   None         None       None     None   
4     lei   None  None  None  None   None         None       None     None   

  asset_category asset_category_desc issuer_category issuer_category_desc  \
0           None                None            None                 None   
1           None                None            None                 None   
2           None                None            None                 None   
3           None                None            None                 None   
4           None                None            None      

In [4]:
import pickle
from pathlib import Path

PKL_PATH = Path("./funds_backup.pkl")
TMP_PATH = PKL_PATH.with_suffix(PKL_PATH.suffix + ".tmp")

with TMP_PATH.open("wb") as f:
    pickle.dump(funds_total, f, protocol=pickle.HIGHEST_PROTOCOL)

TMP_PATH.replace(PKL_PATH)

print(f"Saved {len(funds_total)} funds to pickle file: {PKL_PATH.resolve()}")

Saved 383 funds to pickle file: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl


## Processing Phase

### Annual Returns

### Geographic Allocation

### Top Holdings


### Sector Allocation