## Documents Extraction and Processing

In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Lets first obtain the file that maps the tickers with the CIKs

In [2]:
import requests
import json
from pathlib import Path
import os

# --- Configuration (from previous step) ---
HEADERS = {
    "User-Agent": "EdgarTutorial/1.0 (YourName your.email@domain.com)" 
}
TICKER_CIK_URL = "https://www.sec.gov/files/company_tickers.json"
OUTPUT_FILE = Path("sec_data/company_tickers.json")

# Ensure directory exists
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
# ----------------------------------------

# 1. Download the JSON data
print("Downloading CIK-Ticker map...")
response = requests.get(TICKER_CIK_URL, headers=HEADERS, timeout=15)
response.raise_for_status()
raw_data = response.json() # Load into Python dictionary

# 2. Open the file and use json.dump() with indent=4
print(f"Saving JSON in readable format to {OUTPUT_FILE.absolute()}...")

# Use 'w' mode to write the file
with open(OUTPUT_FILE, 'w') as f:
    # Key Fix: The 'indent=4' parameter tells the JSON module to format the output 
    # with 4 spaces for each level of nesting, adding line breaks automatically.
    json.dump(raw_data, f, indent=4) 

print("‚úÖ JSON saved successfully with proper line breaks and indentation.")

# --- Optional: Print a Snippet to Console (Also Pretty-Printed) ---
# If you want to print to the console instead of a file, use json.dumps()
print("\n--- Console Snippet (Pretty-Printed) ---")
# Print the first 3 key-value pairs from the dictionary
keys = list(raw_data.keys())
snippet = {k: raw_data[k] for k in keys[:3]}

# Use json.dumps() with indent=2 to format the string output
pretty_string = json.dumps(snippet, indent=2)
print(pretty_string)

Downloading CIK-Ticker map...
Saving JSON in readable format to /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/sec_data/company_tickers.json...
‚úÖ JSON saved successfully with proper line breaks and indentation.

--- Console Snippet (Pretty-Printed) ---
{
  "0": {
    "cik_str": 1045810,
    "ticker": "NVDA",
    "title": "NVIDIA CORP"
  },
  "1": {
    "cik_str": 1652044,
    "ticker": "GOOGL",
    "title": "Alphabet Inc."
  },
  "2": {
    "cik_str": 320193,
    "ticker": "AAPL",
    "title": "Apple Inc."
  }
}


### 10K Company filings

In [8]:
from edgar import set_identity, Company

# 1. Set your identity (Required by SEC: "Name email@domain.com")
set_identity("luis.alvarez.conde@alumnos.upm.es")

# 2. Get the company (Apple)
ticker = "AAPL"
company = Company(ticker)

# 1. Get recent Form 4 filings (Insider Trading)
insider_filings = company.get_filings(form="4").head(20)

people_nodes = {}

for f in insider_filings:
    # Parse the Form 4 into an object
    form4 = f.obj()
    
    # Extract Reporting Owners (The Executives)
    for owner in form4.reporting_owners:
        name = owner.name
        title = owner.officer_title # e.g., "Chief Executive Officer"
        
        # Deduplicate
        if name not in people_nodes:
            people_nodes[name] = title

# Output for Graph Construction
for name, title in people_nodes.items():
    print(f"üë§ Found Person: {name} | Role: {title}")
    # Create Relation: (Person {name: name}) -[:HAS_ROLE {title: title}]-> (Company {ticker: 'AAPL'})

    
# 3. Get the latest 10-K filing
# We filter by form "10-K" and get the single latest result
latest_10k = company.get_filings(form="10-K").latest()

# 4. Get the URL of the primary document
# 'homepage_url' is the index page; 'document.url' is the actual HTML/Text file
filing_url = latest_10k.document.url

# 5. Extract the text content
# The library automatically cleans HTML tags when you use .text()
content = latest_10k.text()


tenk = latest_10k.obj()
business_text = tenk['Item 1']
print(business_text)
print(f"Type: {type(tenk)}")
print("\nProperties/Attributes:")
for attr in dir(tenk):
    if not attr.startswith('_'):
        print(f"  {attr}")
# 6. Save content to a text file
filename = f"{ticker}_10K_latest.txt"
with open(filename, "w", encoding="utf-8") as f:
    f.write(content)

# 7. Output results
print(f"‚úÖ Successfully saved 10-K content to {filename}")
print(f"üîó Link to official filing: {filing_url}")

üë§ Found Person: Katherine L. Adams | Role: SVP, GC and Secretary
üë§ Found Person: Chris Kondo | Role: Principal Accounting Officer
üë§ Found Person: Kevan Parekh | Role: Senior Vice President, CFO
üë§ Found Person: Timothy D Cook | Role: Chief Executive Officer
üë§ Found Person: Deirdre O'Brien | Role: Senior Vice President
üë§ Found Person: Sabih Khan | Role: COO
üë§ Found Person: Arthur D Levinson | Role: None
Item 1.¬†¬†¬†¬†Business

Company Background

The Company designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessories, and sells a variety of related services. The Company‚Äôs fiscal year is the 52- or 53-week period that ends on the last Saturday of September.

Products

iPhone

iPhone¬Æ is the Company‚Äôs line of smartphones based on its iOS operating system. The iPhone line includes iPhone 17 Pro, iPhone Air‚Ñ¢, iPhone 17, iPhone 16 and iPhone 16e.

Mac

Mac¬Æ is the Company‚Äôs line of personal computers based on its macOS¬

### Vanguard Index Funds

In [3]:
import pandas as pd
from io import StringIO
from edgar import Company, set_identity 
import sys
from pathlib import Path

RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

from src.simple_rag.extraction.parser import BlackRockFiling


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "VOO"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    
    latest_date_str = max(f.report_date for f in all_filings)
    
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    print("Found filings: ", len(latest_filings), "for year: ", target_year)


funds_total = []
performance_funds = []
df_performance = []
for filing in latest_filings:

    print("Processing filing: ", filing.report_date)
    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            performance_funds.append(fund.ticker)
            count += 1

    df_performance.append(parser.get_financial_highlights())

    print(count)
    print("Adding funds: ", len(funds))
    
    funds_total.extend(funds)

print(len(performance_funds))
print(performance_funds)
print(len(df_performance))


Found filings:  2 for year:  2024
Processing filing:  2024-12-31
Processing: Vanguard Extended Market Index Fund
Extracting context:  FY2024_C000007779Member
Tag not found:  dei:SecurityExchangeName FY2024_C000007779Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock


ValidationError: 1 validation error for FundData
provider
  Field required [type=missing, input_value={'name': 'Vanguard Extend...  9.33%  
4    12.48%  }, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing

In [None]:
parser.print_fund_info(funds_total)\
    

Showing information of 52 funds


### üè¶ Vanguard Extended Market Index Fund

üÜî Context ID:      FY2024_C000007779Member
üé´ Ticker:          VEXMX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 111,156
Expense Ratio       : 0.19
Turnover Rate       : 11
Costs per $10k      : 21
Advisory Fees       : 1,799
Number of Holdings  : 3,485

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark index.   U.S. economic growth hovered around 3% on a year-over-year basis for much of the period,..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,16.76%,9.75%,9.31%
3,S&P Completion Index,16.88%,9.77%,9.33%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,
1,Communication Services,4.3%
2,Consumer Discretionary,12.0%
3,Consumer Staples,3.0%
4,Energy,4.1%
5,Financials,18.0%
6,Health Care,11.4%
7,Industrials,17.4%
8,Information Technology,17.9%
9,Materials,4.7%






### üè¶ Vanguard Extended Market Index Fund

üÜî Context ID:      FY2024_C000007782Member
üé´ Ticker:          VXF
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 111,156
Expense Ratio       : 0.06
Turnover Rate       : 11
Costs per $10k      : 7
Advisory Fees       : 1,799
Number of Holdings  : 3,485

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark index.   U.S. economic growth hovered around 3% on a year-over-year basis for much of the period,..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,16.90%,9.89%,9.45%
3,ETF Shares Market Price,16.89%,9.90%,9.46%
4,S&P Completion Index,16.88%,9.77%,9.33%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,
1,Communication Services,4.3%
2,Consumer Discretionary,12.0%
3,Consumer Staples,3.0%
4,Energy,4.1%
5,Financials,18.0%
6,Health Care,11.4%
7,Industrials,17.4%
8,Information Technology,17.9%
9,Materials,4.7%






### üè¶ Vanguard Extended Market Index Fund

üÜî Context ID:      FY2024_C000007780Member
üé´ Ticker:          VEXAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 111,156
Expense Ratio       : 0.06
Turnover Rate       : 11
Costs per $10k      : 7
Advisory Fees       : 1,799
Number of Holdings  : 3,485

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark index.   U.S. economic growth hovered around 3% on a year-over-year basis for much of the period,..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,16.91%,9.89%,9.45%
3,S&P Completion Index,16.88%,9.77%,9.33%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,
1,Communication Services,4.3%
2,Consumer Discretionary,12.0%
3,Consumer Staples,3.0%
4,Energy,4.1%
5,Financials,18.0%
6,Health Care,11.4%
7,Industrials,17.4%
8,Information Technology,17.9%
9,Materials,4.7%






### üè¶ Vanguard Extended Market Index Fund

üÜî Context ID:      FY2024_C000007781Member
üé´ Ticker:          VIEIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 111,156
Expense Ratio       : 0.05
Turnover Rate       : 11
Costs per $10k      : 5
Advisory Fees       : 1,799
Number of Holdings  : 3,485

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark index.   U.S. economic growth hovered around 3% on a year-over-year basis for much of the period,..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Shares,16.91%,9.90%,9.47%
3,S&P Completion Index,16.88%,9.77%,9.33%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,
1,Communication Services,4.3%
2,Consumer Discretionary,12.0%
3,Consumer Staples,3.0%
4,Energy,4.1%
5,Financials,18.0%
6,Health Care,11.4%
7,Industrials,17.4%
8,Information Technology,17.9%
9,Materials,4.7%






### üè¶ Vanguard Extended Market Index Fund

üÜî Context ID:      FY2024_C000096110Member
üé´ Ticker:          VEMPX
üè∑Ô∏è Share Class:     Institutional Plus Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 111,156
Expense Ratio       : 0.04
Turnover Rate       : 11
Costs per $10k      : 4
Advisory Fees       : 1,799
Number of Holdings  : 3,485

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark index.   U.S. economic growth hovered around 3% on a year-over-year basis for much of the period,..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Plus Shares,16.94%,9.91%,9.48%
3,S&P Completion Index,16.88%,9.77%,9.33%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,
1,Communication Services,4.3%
2,Consumer Discretionary,12.0%
3,Consumer Staples,3.0%
4,Energy,4.1%
5,Financials,18.0%
6,Health Care,11.4%
7,Industrials,17.4%
8,Information Technology,17.9%
9,Materials,4.7%






### üè¶ Vanguard Extended Market Index Fund

üÜî Context ID:      FY2024_C000170275Member
üé´ Ticker:          VSEMX
üè∑Ô∏è Share Class:     Institutional Select Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 111,156
Expense Ratio       : 0.02
Turnover Rate       : 11
Costs per $10k      : 2
Advisory Fees       : 1,799
Number of Holdings  : 3,485

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark index.   U.S. economic growth hovered around 3% on a year-over-year basis for much of the period,..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,Since Inception (6/27/2016)
2,Institutional Select Shares,16.96%,9.94%,12.09%
3,S&P Completion Index,16.88%,9.77%,11.91%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,14.95%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,
1,Communication Services,4.3%
2,Consumer Discretionary,12.0%
3,Consumer Staples,3.0%
4,Energy,4.1%
5,Financials,18.0%
6,Health Care,11.4%
7,Industrials,17.4%
8,Information Technology,17.9%
9,Materials,4.7%






### üè¶ Vanguard Mid-Cap Index Fund

üÜî Context ID:      FY2024_C000007791Member
üé´ Ticker:          VIMSX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 176,987
Expense Ratio       : 0.17
Turnover Rate       : 16
Costs per $10k      : 18
Advisory Fees       : 2,958
Number of Holdings  : 327

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,15.09%,9.72%,9.42%
3,CRSP US Mid Cap Index,15.25%,9.86%,9.57%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,2.6%
2,Consumer Discretionary,13.1%
3,Consumer Staples,5.9%
4,Energy,5.5%
5,Financials,13.5%
6,Health Care,8.6%
7,Industrials,19.9%
8,Real Estate,7.6%
9,Technology,13.9%






### üè¶ Vanguard Mid-Cap Index Fund

üÜî Context ID:      FY2024_C000007794Member
üé´ Ticker:          VO
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 176,987
Expense Ratio       : 0.04
Turnover Rate       : 16
Costs per $10k      : 4
Advisory Fees       : 2,958
Number of Holdings  : 327

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,15.23%,9.85%,9.56%
3,ETF Shares Market Price,15.28%,9.87%,9.56%
4,CRSP US Mid Cap Index,15.25%,9.86%,9.57%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,2.6%
2,Consumer Discretionary,13.1%
3,Consumer Staples,5.9%
4,Energy,5.5%
5,Financials,13.5%
6,Health Care,8.6%
7,Industrials,19.9%
8,Real Estate,7.6%
9,Technology,13.9%






### üè¶ Vanguard Mid-Cap Index Fund

üÜî Context ID:      FY2024_C000007792Member
üé´ Ticker:          VIMAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 176,987
Expense Ratio       : 0.05
Turnover Rate       : 16
Costs per $10k      : 5
Advisory Fees       : 2,958
Number of Holdings  : 327

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,15.22%,9.85%,9.55%
3,CRSP US Mid Cap Index,15.25%,9.86%,9.57%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,2.6%
2,Consumer Discretionary,13.1%
3,Consumer Staples,5.9%
4,Energy,5.5%
5,Financials,13.5%
6,Health Care,8.6%
7,Industrials,19.9%
8,Real Estate,7.6%
9,Technology,13.9%






### üè¶ Vanguard Mid-Cap Index Fund

üÜî Context ID:      FY2024_C000007793Member
üé´ Ticker:          VMCIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 176,987
Expense Ratio       : 0.04
Turnover Rate       : 16
Costs per $10k      : 4
Advisory Fees       : 2,958
Number of Holdings  : 327

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Shares,15.23%,9.86%,9.56%
3,CRSP US Mid Cap Index,15.25%,9.86%,9.57%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,2.6%
2,Consumer Discretionary,13.1%
3,Consumer Staples,5.9%
4,Energy,5.5%
5,Financials,13.5%
6,Health Care,8.6%
7,Industrials,19.9%
8,Real Estate,7.6%
9,Technology,13.9%






### üè¶ Vanguard Mid-Cap Index Fund

üÜî Context ID:      FY2024_C000096111Member
üé´ Ticker:          VMCPX
üè∑Ô∏è Share Class:     Institutional Plus Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 176,987
Expense Ratio       : 0.03
Turnover Rate       : 16
Costs per $10k      : 3
Advisory Fees       : 2,958
Number of Holdings  : 327

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Plus Shares,15.25%,9.87%,9.57%
3,CRSP US Mid Cap Index,15.25%,9.86%,9.57%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,2.6%
2,Consumer Discretionary,13.1%
3,Consumer Staples,5.9%
4,Energy,5.5%
5,Financials,13.5%
6,Health Care,8.6%
7,Industrials,19.9%
8,Real Estate,7.6%
9,Technology,13.9%






### üè¶ Vanguard Mid-Cap Growth Index Fund

üÜî Context ID:      FY2024_C000034427Member
üé´ Ticker:          VMGIX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 27,704
Expense Ratio       : 0.19
Turnover Rate       : 21
Costs per $10k      : 21
Advisory Fees       : 456
Number of Holdings  : 143

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Growth Index.   U.S. economic growth hovered around 3% on a year-over-year b..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,16.27%,10.44%,10.26%
3,CRSP US Mid Cap Growth Index,16.48%,10.62%,10.45%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,1.1%
2,Consumer Discretionary,15.8%
3,Consumer Staples,1.2%
4,Energy,5.4%
5,Financials,8.5%
6,Health Care,11.7%
7,Industrials,21.3%
8,Real Estate,6.8%
9,Technology,21.7%






### üè¶ Vanguard Mid-Cap Growth Index Fund

üÜî Context ID:      FY2024_C000034428Member
üé´ Ticker:          VOT
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 27,704
Expense Ratio       : 0.07
Turnover Rate       : 21
Costs per $10k      : 8
Advisory Fees       : 456
Number of Holdings  : 143

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Growth Index.   U.S. economic growth hovered around 3% on a year-over-year b..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,16.41%,10.57%,10.40%
3,ETF Shares Market Price,16.30%,10.56%,10.39%
4,CRSP US Mid Cap Growth Index,16.48%,10.62%,10.45%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,1.1%
2,Consumer Discretionary,15.8%
3,Consumer Staples,1.2%
4,Energy,5.4%
5,Financials,8.5%
6,Health Care,11.7%
7,Industrials,21.3%
8,Real Estate,6.8%
9,Technology,21.7%






### üè¶ Vanguard Mid-Cap Growth Index Fund

üÜî Context ID:      FY2024_C000105306Member
üé´ Ticker:          VMGMX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 27,704
Expense Ratio       : 0.07
Turnover Rate       : 21
Costs per $10k      : 8
Advisory Fees       : 456
Number of Holdings  : 143

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Growth Index.   U.S. economic growth hovered around 3% on a year-over-year b..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,16.41%,10.57%,10.40%
3,CRSP US Mid Cap Growth Index,16.48%,10.62%,10.45%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,1.1%
2,Consumer Discretionary,15.8%
3,Consumer Staples,1.2%
4,Energy,5.4%
5,Financials,8.5%
6,Health Care,11.7%
7,Industrials,21.3%
8,Real Estate,6.8%
9,Technology,21.7%






### üè¶ Vanguard Mid-Cap Value Index Fund

üÜî Context ID:      FY2024_C000034429Member
üé´ Ticker:          VMVIX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 30,104
Expense Ratio       : 0.19
Turnover Rate       : 19
Costs per $10k      : 20
Advisory Fees       : 532
Number of Holdings  : 195

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Value Index.   U.S. economic growth hovered around 3% on a year-over-year ba..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,13.89%,8.63%,8.37%
3,CRSP US Mid Cap Value Index,14.05%,8.79%,8.53%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.9%
2,Consumer Discretionary,10.8%
3,Consumer Staples,9.9%
4,Energy,5.6%
5,Financials,17.7%
6,Health Care,6.0%
7,Industrials,18.8%
8,Real Estate,8.4%
9,Technology,7.4%






### üè¶ Vanguard Mid-Cap Value Index Fund

üÜî Context ID:      FY2024_C000034430Member
üé´ Ticker:          VOE
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 30,104
Expense Ratio       : 0.07
Turnover Rate       : 19
Costs per $10k      : 7
Advisory Fees       : 532
Number of Holdings  : 195

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Value Index.   U.S. economic growth hovered around 3% on a year-over-year ba..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,14.03%,8.76%,8.49%
3,ETF Shares Market Price,14.00%,8.76%,8.49%
4,CRSP US Mid Cap Value Index,14.05%,8.79%,8.53%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.9%
2,Consumer Discretionary,10.8%
3,Consumer Staples,9.9%
4,Energy,5.6%
5,Financials,17.7%
6,Health Care,6.0%
7,Industrials,18.8%
8,Real Estate,8.4%
9,Technology,7.4%






### üè¶ Vanguard Mid-Cap Value Index Fund

üÜî Context ID:      FY2024_C000105307Member
üé´ Ticker:          VMVAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 30,104
Expense Ratio       : 0.07
Turnover Rate       : 19
Costs per $10k      : 7
Advisory Fees       : 532
Number of Holdings  : 195

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed roughly in line with its benchmark, the CRSP US Mid Cap Value Index.   U.S. economic growth hovered around 3% on a year-over-year ba..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,14.03%,8.76%,8.50%
3,CRSP US Mid Cap Value Index,14.05%,8.79%,8.53%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.9%
2,Consumer Discretionary,10.8%
3,Consumer Staples,9.9%
4,Energy,5.6%
5,Financials,17.7%
6,Health Care,6.0%
7,Industrials,18.8%
8,Real Estate,8.4%
9,Technology,7.4%






### üè¶ Vanguard Small-Cap Index Fund

üÜî Context ID:      FY2024_C000007795Member
üé´ Ticker:          NAESX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 155,233
Expense Ratio       : 0.17
Turnover Rate       : 13
Costs per $10k      : 18
Advisory Fees       : 2,566
Number of Holdings  : 1,377

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis for much..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,14.10%,9.17%,8.96%
3,CRSP US Small Cap Index,14.22%,9.26%,9.06%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.5%
2,Consumer Discretionary,16.0%
3,Consumer Staples,3.6%
4,Energy,4.5%
5,Financials,14.6%
6,Health Care,10.5%
7,Industrials,21.7%
8,Real Estate,7.1%
9,Technology,13.4%






### üè¶ Vanguard Small-Cap Index Fund

üÜî Context ID:      FY2024_C000007798Member
üé´ Ticker:          VB
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 155,233
Expense Ratio       : 0.05
Turnover Rate       : 13
Costs per $10k      : 5
Advisory Fees       : 2,566
Number of Holdings  : 1,377

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis for much..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,14.23%,9.30%,9.09%
3,ETF Shares Market Price,14.13%,9.29%,9.09%
4,CRSP US Small Cap Index,14.22%,9.26%,9.06%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.5%
2,Consumer Discretionary,16.0%
3,Consumer Staples,3.6%
4,Energy,4.5%
5,Financials,14.6%
6,Health Care,10.5%
7,Industrials,21.7%
8,Real Estate,7.1%
9,Technology,13.4%






### üè¶ Vanguard Small-Cap Index Fund

üÜî Context ID:      FY2024_C000007796Member
üé´ Ticker:          VSMAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 155,233
Expense Ratio       : 0.05
Turnover Rate       : 13
Costs per $10k      : 5
Advisory Fees       : 2,566
Number of Holdings  : 1,377

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis for much..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,14.23%,9.30%,9.09%
3,CRSP US Small Cap Index,14.22%,9.26%,9.06%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.5%
2,Consumer Discretionary,16.0%
3,Consumer Staples,3.6%
4,Energy,4.5%
5,Financials,14.6%
6,Health Care,10.5%
7,Industrials,21.7%
8,Real Estate,7.1%
9,Technology,13.4%






### üè¶ Vanguard Small-Cap Index Fund

üÜî Context ID:      FY2024_C000007797Member
üé´ Ticker:          VSCIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 155,233
Expense Ratio       : 0.04
Turnover Rate       : 13
Costs per $10k      : 4
Advisory Fees       : 2,566
Number of Holdings  : 1,377

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis for much..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Shares,14.23%,9.31%,9.10%
3,CRSP US Small Cap Index,14.22%,9.26%,9.06%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.5%
2,Consumer Discretionary,16.0%
3,Consumer Staples,3.6%
4,Energy,4.5%
5,Financials,14.6%
6,Health Care,10.5%
7,Industrials,21.7%
8,Real Estate,7.1%
9,Technology,13.4%






### üè¶ Vanguard Small-Cap Index Fund

üÜî Context ID:      FY2024_C000096112Member
üé´ Ticker:          VSCPX
üè∑Ô∏è Share Class:     Institutional Plus Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 155,233
Expense Ratio       : 0.03
Turnover Rate       : 13
Costs per $10k      : 3
Advisory Fees       : 2,566
Number of Holdings  : 1,377

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Index.   U.S. economic growth hovered around 3% on a year-over-year basis for much..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Plus Shares,14.25%,9.32%,9.11%
3,CRSP US Small Cap Index,14.22%,9.26%,9.06%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,3.5%
2,Consumer Discretionary,16.0%
3,Consumer Staples,3.6%
4,Energy,4.5%
5,Financials,14.6%
6,Health Care,10.5%
7,Industrials,21.7%
8,Real Estate,7.1%
9,Technology,13.4%






### üè¶ Vanguard Small-Cap Growth Index Fund

üÜî Context ID:      FY2024_C000007799Member
üé´ Ticker:          VISGX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 38,107
Expense Ratio       : 0.19
Turnover Rate       : 21
Costs per $10k      : 21
Advisory Fees       : 628
Number of Holdings  : 596

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Growth Index.   U.S. economic growth hovered around 3% on a year-over-year basis f..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,16.35%,7.56%,8.96%
3,CRSP US Small Cap Growth Index,16.48%,7.66%,9.05%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,1.7%
2,Consumer Discretionary,16.1%
3,Consumer Staples,3.3%
4,Energy,5.3%
5,Financials,5.8%
6,Health Care,16.2%
7,Industrials,20.3%
8,Real Estate,5.2%
9,Technology,22.7%






### üè¶ Vanguard Small-Cap Growth Index Fund

üÜî Context ID:      FY2024_C000007801Member
üé´ Ticker:          VBK
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 38,107
Expense Ratio       : 0.07
Turnover Rate       : 21
Costs per $10k      : 8
Advisory Fees       : 628
Number of Holdings  : 596

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Growth Index.   U.S. economic growth hovered around 3% on a year-over-year basis f..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,16.49%,7.69%,9.09%
3,ETF Shares Market Price,16.49%,7.70%,9.09%
4,CRSP US Small Cap Growth Index,16.48%,7.66%,9.05%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,1.7%
2,Consumer Discretionary,16.1%
3,Consumer Staples,3.3%
4,Energy,5.3%
5,Financials,5.8%
6,Health Care,16.2%
7,Industrials,20.3%
8,Real Estate,5.2%
9,Technology,22.7%






### üè¶ Vanguard Small-Cap Growth Index Fund

üÜî Context ID:      FY2024_C000105304Member
üé´ Ticker:          VSGAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 38,107
Expense Ratio       : 0.07
Turnover Rate       : 21
Costs per $10k      : 8
Advisory Fees       : 628
Number of Holdings  : 596

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Growth Index.   U.S. economic growth hovered around 3% on a year-over-year basis f..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,16.49%,7.69%,9.09%
3,CRSP US Small Cap Growth Index,16.48%,7.66%,9.05%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,1.7%
2,Consumer Discretionary,16.1%
3,Consumer Staples,3.3%
4,Energy,5.3%
5,Financials,5.8%
6,Health Care,16.2%
7,Industrials,20.3%
8,Real Estate,5.2%
9,Technology,22.7%






### üè¶ Vanguard Small-Cap Growth Index Fund

üÜî Context ID:      FY2024_C000007800Member
üé´ Ticker:          VSGIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 38,107
Expense Ratio       : 0.06
Turnover Rate       : 21
Costs per $10k      : 6
Advisory Fees       : 628
Number of Holdings  : 596

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Growth Index.   U.S. economic growth hovered around 3% on a year-over-year basis f..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Shares,16.50%,7.70%,9.10%
3,CRSP US Small Cap Growth Index,16.48%,7.66%,9.05%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,1.7%
2,Consumer Discretionary,16.1%
3,Consumer Staples,3.3%
4,Energy,5.3%
5,Financials,5.8%
6,Health Care,16.2%
7,Industrials,20.3%
8,Real Estate,5.2%
9,Technology,22.7%






### üè¶ Vanguard Small-Cap Value Index Fund

üÜî Context ID:      FY2024_C000007802Member
üé´ Ticker:          VISVX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 58,332
Expense Ratio       : 0.19
Turnover Rate       : 16
Costs per $10k      : 20
Advisory Fees       : 997
Number of Holdings  : 845

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Value Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,12.25%,9.77%,8.54%
3,CRSP US Small Cap Value Index,12.42%,9.89%,8.67%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,4.9%
2,Consumer Discretionary,15.8%
3,Consumer Staples,3.9%
4,Energy,3.8%
5,Financials,21.3%
6,Health Care,6.1%
7,Industrials,22.8%
8,Real Estate,8.6%
9,Technology,6.2%






### üè¶ Vanguard Small-Cap Value Index Fund

üÜî Context ID:      FY2024_C000007804Member
üé´ Ticker:          VBR
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 58,332
Expense Ratio       : 0.07
Turnover Rate       : 16
Costs per $10k      : 7
Advisory Fees       : 997
Number of Holdings  : 845

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Value Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,12.39%,9.89%,8.67%
3,ETF Shares Market Price,12.30%,9.89%,8.67%
4,CRSP US Small Cap Value Index,12.42%,9.89%,8.67%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,4.9%
2,Consumer Discretionary,15.8%
3,Consumer Staples,3.9%
4,Energy,3.8%
5,Financials,21.3%
6,Health Care,6.1%
7,Industrials,22.8%
8,Real Estate,8.6%
9,Technology,6.2%






### üè¶ Vanguard Small-Cap Value Index Fund

üÜî Context ID:      FY2024_C000105305Member
üé´ Ticker:          VSIAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 58,332
Expense Ratio       : 0.07
Turnover Rate       : 16
Costs per $10k      : 7
Advisory Fees       : 997
Number of Holdings  : 845

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Value Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,12.39%,9.90%,8.67%
3,CRSP US Small Cap Value Index,12.42%,9.89%,8.67%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,4.9%
2,Consumer Discretionary,15.8%
3,Consumer Staples,3.9%
4,Energy,3.8%
5,Financials,21.3%
6,Health Care,6.1%
7,Industrials,22.8%
8,Real Estate,8.6%
9,Technology,6.2%






### üè¶ Vanguard Small-Cap Value Index Fund

üÜî Context ID:      FY2024_C000007803Member
üé´ Ticker:          VSIIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 58,332
Expense Ratio       : 0.06
Turnover Rate       : 16
Costs per $10k      : 6
Advisory Fees       : 997
Number of Holdings  : 845

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Small Cap Value Index.   U.S. economic growth hovered around 3% on a year-over-year basis fo..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Shares,12.41%,9.91%,8.68%
3,CRSP US Small Cap Value Index,12.42%,9.89%,8.67%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of ...,Portfolio Composition % of Net Assets (as of ...
1,Basic Materials,4.9%
2,Consumer Discretionary,15.8%
3,Consumer Staples,3.9%
4,Energy,3.8%
5,Financials,21.3%
6,Health Care,6.1%
7,Industrials,22.8%
8,Real Estate,8.6%
9,Technology,6.2%






### üè¶ Vanguard Total Stock Market Index Fund

üÜî Context ID:      FY2024_C000007805Member
üé´ Ticker:          VTSMX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,777,963
Expense Ratio       : 0.14
Turnover Rate       : 2
Costs per $10k      : 16
Advisory Fees       : 33,526
Number of Holdings  : 3,624

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Total Market Index.   U.S. economic growth hovered around 3% on a year-over-year basis for m..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Investor Shares,23.61%,13.69%,12.38%
3,CRSP US Total Market Index,23.77%,13.81%,12.50%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of D...,Portfolio Composition % of Net Assets (as of D...
1,Basic Materials,1.4%
2,Consumer Discretionary,15.1%
3,Consumer Staples,3.9%
4,Energy,3.4%
5,Financials,11.3%
6,Health Care,10.0%
7,Industrials,12.5%
8,Real Estate,2.6%
9,Technology,35.0%






### üè¶ Vanguard Total Stock Market Index Fund

üÜî Context ID:      FY2024_C000007808Member
üé´ Ticker:          VTI
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,777,963
Expense Ratio       : 0.03
Turnover Rate       : 2
Costs per $10k      : 3
Advisory Fees       : 33,526
Number of Holdings  : 3,624

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Total Market Index.   U.S. economic growth hovered around 3% on a year-over-year basis for m..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,ETF Shares Net Asset Value,23.75%,13.80%,12.50%
3,ETF Shares Market Price,23.71%,13.81%,12.50%
4,CRSP US Total Market Index,23.77%,13.81%,12.50%
5,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of D...,Portfolio Composition % of Net Assets (as of D...
1,Basic Materials,1.4%
2,Consumer Discretionary,15.1%
3,Consumer Staples,3.9%
4,Energy,3.4%
5,Financials,11.3%
6,Health Care,10.0%
7,Industrials,12.5%
8,Real Estate,2.6%
9,Technology,35.0%






### üè¶ Vanguard Total Stock Market Index Fund

üÜî Context ID:      FY2024_C000007806Member
üé´ Ticker:          VTSAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,777,963
Expense Ratio       : 0.04
Turnover Rate       : 2
Costs per $10k      : 4
Advisory Fees       : 33,526
Number of Holdings  : 3,624

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Total Market Index.   U.S. economic growth hovered around 3% on a year-over-year basis for m..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Admiral Shares,23.74%,13.80%,12.49%
3,CRSP US Total Market Index,23.77%,13.81%,12.50%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of D...,Portfolio Composition % of Net Assets (as of D...
1,Basic Materials,1.4%
2,Consumer Discretionary,15.1%
3,Consumer Staples,3.9%
4,Energy,3.4%
5,Financials,11.3%
6,Health Care,10.0%
7,Industrials,12.5%
8,Real Estate,2.6%
9,Technology,35.0%






### üè¶ Vanguard Total Stock Market Index Fund

üÜî Context ID:      FY2024_C000007807Member
üé´ Ticker:          VITSX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,777,963
Expense Ratio       : 0.03
Turnover Rate       : 2
Costs per $10k      : 3
Advisory Fees       : 33,526
Number of Holdings  : 3,624

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Total Market Index.   U.S. economic growth hovered around 3% on a year-over-year basis for m..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,10 Years
2,Institutional Shares,23.75%,13.81%,12.50%
3,CRSP US Total Market Index,23.77%,13.81%,12.50%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of D...,Portfolio Composition % of Net Assets (as of D...
1,Basic Materials,1.4%
2,Consumer Discretionary,15.1%
3,Consumer Staples,3.9%
4,Energy,3.4%
5,Financials,11.3%
6,Health Care,10.0%
7,Industrials,12.5%
8,Real Estate,2.6%
9,Technology,35.0%






### üè¶ Vanguard Total Stock Market Index Fund

üÜî Context ID:      FY2024_C000155407Member
üé´ Ticker:          VSMPX
üè∑Ô∏è Share Class:     Institutional Plus Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,777,963
Expense Ratio       : 0.02
Turnover Rate       : 2
Costs per $10k      : 2
Advisory Fees       : 33,526
Number of Holdings  : 3,624

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Total Market Index.   U.S. economic growth hovered around 3% on a year-over-year basis for m..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,Since Inception (4/28/2015)
2,Institutional Plus Shares,23.76%,13.82%,12.52%
3,CRSP US Total Market Index,23.77%,13.81%,12.51%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of D...,Portfolio Composition % of Net Assets (as of D...
1,Basic Materials,1.4%
2,Consumer Discretionary,15.1%
3,Consumer Staples,3.9%
4,Energy,3.4%
5,Financials,11.3%
6,Health Care,10.0%
7,Industrials,12.5%
8,Real Estate,2.6%
9,Technology,35.0%






### üè¶ Vanguard Total Stock Market Index Fund

üÜî Context ID:      FY2024_C000170276Member
üé´ Ticker:          VSTSX
üè∑Ô∏è Share Class:     Institutional Select Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,777,963
Expense Ratio       : 0.01
Turnover Rate       : 2
Costs per $10k      : 1
Advisory Fees       : 33,526
Number of Holdings  : 3,624

üìù Commentary: "How did the Fund perform during the reporting period?   For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Total Market Index.   U.S. economic growth hovered around 3% on a year-over-year basis for m..."


**üìä Average Annual Returns**

Unnamed: 0,0,1,2,3
0,Average Annual Total Returns,,,
1,,1 Year,5 Years,Since Inception (6/27/2016)
2,Institutional Select Shares,23.78%,13.83%,15.00%
3,CRSP US Total Market Index,23.77%,13.81%,14.98%
4,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,14.95%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1
0,Portfolio Composition % of Net Assets (as of D...,Portfolio Composition % of Net Assets (as of D...
1,Basic Materials,1.4%
2,Consumer Discretionary,15.1%
3,Consumer Staples,3.9%
4,Energy,3.4%
5,Financials,11.3%
6,Health Care,10.0%
7,Industrials,12.5%
8,Real Estate,2.6%
9,Technology,35.0%






### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007773Member
üé´ Ticker:          VFINX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.14
Turnover Rate       : 2
Costs per $10k      : 16
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,091","$10,095","$10,180"
2,2015,"$10,117","$10,123","$10,192"
3,2015,"$9,462","$9,471","$9,451"
4,2015,"$10,125","$10,138","$10,044"
5,2016,"$10,258","$10,275","$10,136"
6,2016,"$10,506","$10,527","$10,401"
7,2016,"$10,907","$10,933","$10,862"
8,2016,"$11,321","$11,351","$11,312"
9,2017,"$12,004","$12,039","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,24.84%,14.37%,12.95%
1,S&P 500 Index,25.02%,14.53%,13.10%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000092055Member
üé´ Ticker:          VOO
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.03
Turnover Rate       : 2
Costs per $10k      : 3
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,094","$10,095","$10,180"
2,2015,"$10,123","$10,123","$10,192"
3,2015,"$9,469","$9,471","$9,451"
4,2015,"$10,135","$10,138","$10,044"
5,2016,"$10,271","$10,275","$10,136"
6,2016,"$10,522","$10,527","$10,401"
7,2016,"$10,927","$10,933","$10,862"
8,2016,"$11,345","$11,351","$11,312"
9,2017,"$12,031","$12,039","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,24.98%,14.48%,13.06%
1,ETF Shares Market Price,24.94%,14.49%,13.06%
2,S&P 500 Index,25.02%,14.53%,13.10%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007774Member
üé´ Ticker:          VFIAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.04
Turnover Rate       : 2
Costs per $10k      : 4
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,094","$10,095","$10,180"
2,2015,"$10,123","$10,123","$10,192"
3,2015,"$9,470","$9,471","$9,451"
4,2015,"$10,136","$10,138","$10,044"
5,2016,"$10,272","$10,275","$10,136"
6,2016,"$10,523","$10,527","$10,401"
7,2016,"$10,928","$10,933","$10,862"
8,2016,"$11,345","$11,351","$11,312"
9,2017,"$12,032","$12,039","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,24.97%,14.48%,13.06%
1,S&P 500 Index,25.02%,14.53%,13.10%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ 500 Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000170274Member
üé´ Ticker:          VFFSX
üè∑Ô∏è Share Class:     Institutional Select Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 1,350,332
Expense Ratio       : 0.01
Turnover Rate       : 2
Costs per $10k      : 1
Advisory Fees       : 20,816
Number of Holdings  : 516

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed¬†in line with its benchmark, the Standard & Poor's 500 Index.U.S. economic growth hovered around 3% on a year-over-year basis for much ..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Select Share Class,S&P 500 Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,6/24/16,"$5,000,000,000","$5,000,000,000","$5,000,000,000"
1,6/30/16,"$5,152,656,423","$5,152,577,879","$5,147,526,005"
2,9/30/16,"$5,351,187,741","$5,351,059,644","$5,375,816,703"
3,12/31/16,"$5,556,006,018","$5,555,696,904","$5,598,236,535"
4,3/31/17,"$5,892,894,070","$5,892,712,361","$5,922,524,381"
5,6/30/17,"$6,074,753,290","$6,074,688,823","$6,100,664,780"
6,9/30/17,"$6,347,363,071","$6,346,856,796","$6,379,289,163"
7,12/31/17,"$6,769,029,785","$6,768,592,456","$6,782,952,897"
8,3/31/18,"$6,717,278,809","$6,717,209,557","$6,741,940,992"
9,6/30/18,"$6,947,757,117","$6,947,870,304","$7,003,081,023"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,Since Inception 6/24/16
0,Institutional Select Share Class,25.00%,14.52%,15.25%
1,S&P 500 Index,25.02%,14.53%,15.26%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,14.66%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Communication Services,9.4%,
1,Consumer Discretionary,11.2%,
2,Consumer Staples,5.5%,
3,Energy,3.2%,
4,Financials,13.6%,
5,Health Care,10.1%,
6,Industrials,8.1%,
7,Information Technology,32.4%,
8,Materials,1.9%,
9,Real Estate,2.1%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007775Member
üé´ Ticker:          VIVAX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.17
Turnover Rate       : 9
Costs per $10k      : 18
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$9,940","$9,945","$10,180"
2,2015,"$9,984","$9,993","$10,192"
3,2015,"$9,257","$9,268","$9,451"
4,2015,"$9,897","$9,914","$10,044"
5,2016,"$10,055","$10,078","$10,136"
6,2016,"$10,434","$10,462","$10,401"
7,2016,"$10,748","$10,781","$10,862"
8,2016,"$11,554","$11,592","$11,312"
9,2017,"$11,926","$11,972","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,15.84%,9.80%,9.86%
1,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007778Member
üé´ Ticker:          VTV
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    NYSE

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.04
Turnover Rate       : 9
Costs per $10k      : 4
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$9,945","$9,945","$10,180"
2,2015,"$9,993","$9,993","$10,192"
3,2015,"$9,266","$9,268","$9,451"
4,2015,"$9,911","$9,914","$10,044"
5,2016,"$10,074","$10,078","$10,136"
6,2016,"$10,456","$10,462","$10,401"
7,2016,"$10,775","$10,781","$10,862"
8,2016,"$11,585","$11,592","$11,312"
9,2017,"$11,963","$11,972","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,16.00%,9.93%,10.00%
1,ETF Shares Market Price,15.94%,9.93%,10.00%
2,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007776Member
üé´ Ticker:          VVIAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.05
Turnover Rate       : 9
Costs per $10k      : 5
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$9,943","$9,945","$10,180"
2,2015,"$9,994","$9,993","$10,192"
3,2015,"$9,266","$9,268","$9,451"
4,2015,"$9,914","$9,914","$10,044"
5,2016,"$10,076","$10,078","$10,136"
6,2016,"$10,459","$10,462","$10,401"
7,2016,"$10,777","$10,781","$10,862"
8,2016,"$11,586","$11,592","$11,312"
9,2017,"$11,963","$11,972","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,15.99%,9.93%,9.99%
1,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Value Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007777Member
üé´ Ticker:          VIVIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 183,569
Expense Ratio       : 0.04
Turnover Rate       : 9
Costs per $10k      : 4
Advisory Fees       : 3,184
Number of Holdings  : 348

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Value Index.U.S. economic growth hovered around 3% on a year-over-year basis for muc..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Shares,CRSP US Large Cap Value Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$5,000,000","$5,000,000","$5,000,000"
1,2015,"$4,971,871","$4,972,417","$5,090,054"
2,2015,"$4,997,132","$4,996,694","$5,096,078"
3,2015,"$4,633,467","$4,633,912","$4,725,720"
4,2015,"$4,957,449","$4,957,090","$5,022,045"
5,2016,"$5,038,463","$5,039,202","$5,067,957"
6,2016,"$5,228,516","$5,231,204","$5,200,452"
7,2016,"$5,387,648","$5,390,397","$5,431,090"
8,2016,"$5,793,893","$5,796,215","$5,655,796"
9,2017,"$5,982,618","$5,986,133","$5,983,419"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Institutional Shares,15.98%,9.94%,10.00%
1,CRSP US Large Cap Value Index,16.00%,9.93%,10.01%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.8%,
1,Consumer Discretionary,9.2%,
2,Consumer Staples,8.6%,
3,Energy,6.6%,
4,Financials,21.6%,
5,Health Care,15.5%,
6,Industrials,15.6%,
7,Real Estate,3.1%,
8,Technology,8.9%,
9,Telecommunications,3.4%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007783Member
üé´ Ticker:          VIGRX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.17
Turnover Rate       : 11
Costs per $10k      : 20
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,340","$10,346","$10,180"
2,2015,"$10,314","$10,325","$10,192"
3,2015,"$9,690","$9,705","$9,451"
4,2015,"$10,317","$10,338","$10,044"
5,2016,"$10,349","$10,376","$10,136"
6,2016,"$10,451","$10,482","$10,401"
7,2016,"$10,985","$11,021","$10,862"
8,2016,"$10,936","$10,975","$11,312"
9,2017,"$11,981","$12,031","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,32.50%,18.21%,15.61%
1,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007786Member
üé´ Ticker:          VUG
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    NYSE

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.04
Turnover Rate       : 11
Costs per $10k      : 5
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,344","$10,346","$10,180"
2,2015,"$10,321","$10,325","$10,192"
3,2015,"$9,701","$9,705","$9,451"
4,2015,"$10,332","$10,338","$10,044"
5,2016,"$10,367","$10,376","$10,136"
6,2016,"$10,473","$10,482","$10,401"
7,2016,"$11,010","$11,021","$10,862"
8,2016,"$10,965","$10,975","$11,312"
9,2017,"$12,018","$12,031","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,32.68%,18.36%,15.76%
1,ETF Shares Market Price,32.64%,18.37%,15.76%
2,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007784Member
üé´ Ticker:          VIGAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.05
Turnover Rate       : 11
Costs per $10k      : 6
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,344","$10,346","$10,180"
2,2015,"$10,320","$10,325","$10,192"
3,2015,"$9,699","$9,705","$9,451"
4,2015,"$10,330","$10,338","$10,044"
5,2016,"$10,365","$10,376","$10,136"
6,2016,"$10,470","$10,482","$10,401"
7,2016,"$11,009","$11,021","$10,862"
8,2016,"$10,963","$10,975","$11,312"
9,2017,"$12,014","$12,031","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,32.66%,18.36%,15.75%
1,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Growth Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007785Member
üé´ Ticker:          VIGIX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 284,976
Expense Ratio       : 0.04
Turnover Rate       : 11
Costs per $10k      : 5
Advisory Fees       : 4,355
Number of Holdings  : 183

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Growth Index.U.S. economic growth hovered around 3% on a year-over-year basis for mu..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Shares,CRSP US Large Cap Growth Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$5,000,000","$5,000,000","$5,000,000"
1,2015,"$5,172,206","$5,173,192","$5,090,054"
2,2015,"$5,161,071","$5,162,318","$5,096,078"
3,2015,"$4,850,557","$4,852,558","$4,725,720"
4,2015,"$5,166,364","$5,169,146","$5,022,045"
5,2016,"$5,184,201","$5,187,759","$5,067,957"
6,2016,"$5,237,051","$5,240,785","$5,200,452"
7,2016,"$5,506,240","$5,510,594","$5,431,090"
8,2016,"$5,483,263","$5,487,362","$5,655,796"
9,2017,"$6,010,352","$6,015,412","$5,983,419"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Institutional Shares,32.68%,18.37%,15.76%
1,CRSP US Large Cap Growth Index,32.73%,18.41%,15.80%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,0.7%,
1,Consumer Discretionary,19.8%,
2,Consumer Staples,0.4%,
3,Energy,0.8%,
4,Financials,2.7%,
5,Health Care,5.7%,
6,Industrials,8.4%,
7,Real Estate,1.3%,
8,Technology,59.0%,
9,Telecommunications,0.9%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007787Member
üé´ Ticker:          VLACX
üè∑Ô∏è Share Class:     Investor Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.17
Turnover Rate       : 2
Costs per $10k      : 19
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Investor Shares,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,125","$10,131","$10,180"
2,2015,"$10,137","$10,147","$10,192"
3,2015,"$9,457","$9,470","$9,451"
4,2015,"$10,093","$10,111","$10,044"
5,2016,"$10,189","$10,217","$10,136"
6,2016,"$10,436","$10,471","$10,401"
7,2016,"$10,855","$10,894","$10,862"
8,2016,"$11,254","$11,298","$11,312"
9,2017,"$11,946","$11,998","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Investor Shares,24.95%,14.34%,12.87%
1,CRSP US Large Cap Index,25.15%,14.51%,13.05%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007790Member
üé´ Ticker:          VV
üè∑Ô∏è Share Class:     ETF Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    NYSE

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.04
Turnover Rate       : 2
Costs per $10k      : 5
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,ETF Shares Net Asset Value,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,129","$10,131","$10,180"
2,2015,"$10,145","$10,147","$10,192"
3,2015,"$9,468","$9,470","$9,451"
4,2015,"$10,107","$10,111","$10,044"
5,2016,"$10,207","$10,217","$10,136"
6,2016,"$10,459","$10,471","$10,401"
7,2016,"$10,881","$10,894","$10,862"
8,2016,"$11,284","$11,298","$11,312"
9,2017,"$11,982","$11,998","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,ETF Shares Net Asset Value,25.12%,14.48%,13.02%
1,ETF Shares Market Price,25.05%,14.48%,13.01%
2,CRSP US Large Cap Index,25.15%,14.51%,13.05%
3,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007788Member
üé´ Ticker:          VLCAX
üè∑Ô∏è Share Class:     Admiral Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.05
Turnover Rate       : 2
Costs per $10k      : 6
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Admiral Shares,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$10,000","$10,000","$10,000"
1,2015,"$10,131","$10,131","$10,180"
2,2015,"$10,146","$10,147","$10,192"
3,2015,"$9,468","$9,470","$9,451"
4,2015,"$10,107","$10,111","$10,044"
5,2016,"$10,206","$10,217","$10,136"
6,2016,"$10,457","$10,471","$10,401"
7,2016,"$10,881","$10,894","$10,862"
8,2016,"$11,284","$11,298","$11,312"
9,2017,"$11,981","$11,998","$11,967"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Admiral Shares,25.10%,14.47%,13.01%
1,CRSP US Large Cap Index,25.15%,14.51%,13.05%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






### üè¶ Large-Cap Index Fund

üÜî Context ID:      From2024-01-01to2024-12-31_C000007789Member
üé´ Ticker:          VLISX
üè∑Ô∏è Share Class:     Institutional Shares
üìÖ Report Date:     December 31, 2024
üèõÔ∏è Sec Exchange:    N/A

--- üí∞ Costs & Financials ---
Net Assets          : 56,785
Expense Ratio       : 0.04
Turnover Rate       : 2
Costs per $10k      : 5
Advisory Fees       : 955
Number of Holdings  : 494

üìù Commentary: "How did the Fund perform during the reporting period?¬†For the 12 months ended December 31, 2024, the Fund performed in line with its benchmark, the CRSP US Large Cap Index.U.S. economic growth hovered around 3% on a year-over-year basis for much of t..."


**üìà Performance History**

Unnamed: 0.1,Unnamed: 0,Institutional Shares,CRSP US Large Cap Index,Dow Jones U.S. Total Stock Market Float Adjusted Index
0,2014,"$5,000,000","$5,000,000","$5,000,000"
1,2015,"$5,064,717","$5,065,516","$5,090,054"
2,2015,"$5,072,527","$5,073,505","$5,096,078"
3,2015,"$4,733,990","$4,735,147","$4,725,720"
4,2015,"$5,053,400","$5,055,301","$5,022,045"
5,2016,"$5,103,277","$5,108,683","$5,067,957"
6,2016,"$5,229,161","$5,235,336","$5,200,452"
7,2016,"$5,440,308","$5,446,752","$5,431,090"
8,2016,"$5,642,490","$5,649,147","$5,655,796"
9,2017,"$5,991,191","$5,998,908","$5,983,419"


**üìä Average Annual Returns**

Unnamed: 0.1,Unnamed: 0,1 Year,5 Years,10 Years
0,Institutional Shares,25.12%,14.49%,13.02%
1,CRSP US Large Cap Index,25.15%,14.51%,13.05%
2,Dow Jones U.S. Total Stock Market Float Adjust...,23.88%,13.78%,12.48%


**üèóÔ∏è Sector Allocation**

Unnamed: 0,0,1,2
0,Basic Materials,1.2%,
1,Consumer Discretionary,15.1%,
2,Consumer Staples,4.0%,
3,Energy,3.2%,
4,Financials,10.8%,
5,Health Care,9.8%,
6,Industrials,11.3%,
7,Real Estate,2.0%,
8,Technology,38.0%,
9,Telecommunications,2.0%,






In [None]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1]], ignore_index=True)

returns_lookup = total_df.copy()

display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']
for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:        
            returns_lookup[f'{col}_clean'] = (
                returns_lookup[col]
                .astype(str)
                .str.replace('%', '')
                .str.replace('$', '')
                .str.replace(',', '')
                .replace('N/A', '0')
                .replace('', '0')
                .replace('None', '0')
                .astype(float)
            )
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)

        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        
       
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",len(funds_total))
    

2


array(['Small-Cap Index Fund', 'Small-Cap Growth Index Fund',
       'Small-Cap Value Index Fund', 'Extended Market Index Fund',
       'Mid-Cap Index Fund', 'Mid-Cap Growth Index Fund',
       'Mid-Cap Value Index Fund', 'Total Stock Market Index Fund',
       '500 Index Fund', 'Growth Index Fund', 'Value Index Fund',
       'Large-Cap Index Fund'], dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - Investor Shares
Cleaned name: 'Extended Market Index Fund'
  Found 30 name matches
  Found 5 matching records
  2024: Total Return = 16.76%, Expense Ratio = 0.19%, Net Assets = 195.0, Net Income Ratio = 1.09, Turnover = 11.0, Net Assets Value Begining = 124.78, Net Assets Value End = 144.2
  2023: Total Return = 25.22%, Expense Ratio = 0.19%, Net Assets = 232.0, Net Income Ratio = 1.28, Turnover = 11.0, Net Assets Value Begining = 100.93, Net Assets Value End = 124.78
  2022: Total Return = -26.56%, Expense Ratio = 0.19%, Net Assets = 229.0, Net Income Ratio = 1.14, Turnover = 11.0, Net Assets Value Begining = 138.8, Net Assets Value End = 100.93
  2021: Total Return = 12.31%, Expense Ratio = 0.19%, Net Assets = 399.0, Net Income Ratio = 0.87, Turnover = 19.0, Net Assets Value Begining = 124.83, Net Assets Value End = 138.8
  2020: Total Return = 32.04%, Expense Ratio = 0.19%, Net Assets = 454.0, Net Income Ratio = 1.04, Tu

In [None]:
import sys
%reload_ext autoreload
from src.simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
  2015 Return: $10,000.00 -> $10,125.00 = 1.25%
  2016 Return: $10,125.00 -> $11,321.00 = 11.81%
  2017 Return: $11,321.00 -> $13,774.00 = 21.67%
  2018 Return: $13,774.00 -> $13,151.00 = -4.52%
  2019 Return: $13,151.00 -> $17,271.00 = 31.33%
  2020 Return: $17,271.00 -> $20,423.00 = 18.25%
  2021 Return: $20,423.00 -> $26,250.00 = 28.53%
  2022 Return: $26,250.00 -> $21,465.00 = -18.23%
  2023 Return: $21,465.00 -> $27,069.00 = 26.11%
  2024 Return: $27,069.00 -> $33,794.00 = 24.84%

Final Annual Returns:
  VFINX: {'2015': 1.25, '2016': 11.81, '2017': 21.67, '2018': -4.52, '2019': 31.33, '2020': 18.25, '2021': 28.53, '2022': -18.23, '2023': 26.11, '2024': 24.84}
dict_keys(['2024', '2023', '2022', '2021', '2020'])
    2015: turnover=0.0 expense_ratio=0.0 total_return=1.

  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')


## Vanguard World Fund

In [None]:
import pandas as pd
from io import StringIO
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))
%reload_ext autoreload
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "MGK"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    target_year = "2024"
    filings2 = sorted(
        [f for f in all_filings if f.report_date and f.report_date.startswith(target_year)],
        key=lambda f: f.report_date,
        reverse=True
    )

    latest_filings.append(filings2[0])
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
df_performance = []
world_funds = set()

abort = False
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            if fund.ticker not in performance_funds:
                performance_funds.append(fund.ticker)
                count += 1
        if fund.ticker not in world_funds:
            world_funds.add(fund.ticker)
        else:
            print("Exiting filing, repeated ticker found: ", fund.ticker)
            abort = True
            break
    if not abort:

        df_performance.append(parser.get_financial_highlights())
        print(count)
        print("Adding funds: ", len(funds))
        funds_total.extend(funds)

print("Total world funds added: ", len(world_funds))
print(len(performance_funds))
print(performance_funds)

print(len(df_performance))


Found filings:  4 for year:  2024
Processing: Mega Cap Growth Index Fund
Extracting context:  From2024-10-01to2025-09-30_C000055216Member
Processing: Mega Cap Growth Index Fund
Extracting context:  From2024-10-01to2025-09-30_C000055215Member
Tag not found:  dei:SecurityExchangeName From2024-10-01to2025-09-30_C000055215Member
2
Adding funds:  2
Processing: Vanguard Extended Duration Treasury Index Fund
Extracting context:  FY2025_C000051981Member
Tag not found:  dei:SecurityExchangeName FY2025_C000051981Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: Vanguard Extended Duration Treasury Index Fund
Extracting context:  FY2025_C000051979Member
Tag not found:  dei:SecurityExchangeName FY2025_C000051979Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: Vanguard ESG U.S. Stock ETF
Extracting context:  FY2025_C000

In [None]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1], df_performance[2]], ignore_index=True)

returns_lookup = total_df.copy()

# Optional: Clean the total_return column (remove % sign if needed)
print(returns_lookup.head())
display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']


for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:  
            try:      
                returns_lookup[f'{col}_clean'] = (
                    returns_lookup[col]
                    .astype(str)
                    .str.replace('%', '')
                    .str.replace('$', '')
                    .str.replace(',', '')
                    .replace('N/A', '0')
                    .replace('', '0')
                    .replace('None', '0')
                    .astype(float)
                )
            except Exception as e:
                print(f"Error cleaning column '{col}': {str(e)}")
                print(returns_lookup[col].to_string())
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
    elif "¬Æ" in name:
        name = name.replace("¬Æ", "")
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    elif len(share_class_matches) > 5:
        print("  More than 5 share class matches found:")
        print(share_class_matches)
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        
       
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",world_funds)
    

3
                    fund_name share_class  year  net_assets  nav_beginning  \
0  Mega Cap Growth Index Fund  ETF Shares  2025     31195.0         321.87   
1  Mega Cap Growth Index Fund  ETF Shares  2024     22954.0         314.83   
2  Mega Cap Growth Index Fund  ETF Shares  2024     21996.0         241.25   
3  Mega Cap Growth Index Fund  ETF Shares  2023     14376.0         195.20   
4  Mega Cap Growth Index Fund  ETF Shares  2022     11168.0         248.50   

   nav_end  total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0   402.45         25.58           0.07              0.42                14.0   
1   321.87          2.35           0.07              0.40                 6.0   
2   314.83         31.16           0.07              0.51                14.0   
3   241.25         24.39           0.07              0.62                 7.0   
4   195.20        -21.08           0.07              0.51                 5.0   

  distribution_shares  
0                N

array(['Mega Cap Growth Index Fund',
       'Extended Duration Treasury Index Fund', 'ESG U.S. Stock ETF',
       'ESG International Stock ETF', 'Global Wellington Fund',
       'Global Wellesley Income Fund', 'ESG U.S. Corporate Bond ETF',
       'U.S. Growth Fund', 'International Growth Fund',
       'FTSE Social Index Fund', 'Communication Services Index Fund',
       'Consumer Discretionary Index Fund', 'Consumer Staples Index Fund',
       'Energy Index Fund', 'Financials Index Fund',
       'Health Care Index Fund', 'Industrials Index Fund',
       'Information Technology Index Fund', 'Materials Index Fund',
       'Utilities Index Fund', 'Mega Cap Index Fund',
       'Mega Cap Value Index Fund'], dtype=object)


Processing fund object: Vanguard Extended Market Index Fund - Investor Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VEXMX

Processing fund object: Vanguard Extended Market Index Fund - ETF Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VXF

Processing fund object: Vanguard Extended Market Index Fund - Admiral Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VEXAX

Processing fund object: Vanguard Extended Market Index Fund - Institutional Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VIEIX

Processing fund object: Vanguard Extended Market Index Fund - Institutional Plus Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker:  VEMPX

Processing fund object: Vanguard Extended Market Index Fund - Institutional Select Shares
Cleaned name: 'Extended Market Index Fund'
  No name matches found for ticker

In [None]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2016 Return: $10,724.00 -> $11,409.00 = 6.39%
  2017 Return: $11,409.00 -> $14,772.00 = 29.48%
  2018 Return: $14,772.00 -> $14,349.00 = -2.86%
  2019 Return: $14,349.00 -> $19,736.00 = 37.54%
  2020 Return: $19,736.00 -> $27,826.00 = 40.99%
  2021 Return: $27,826.00 -> $35,753.00 = 28.49%
  2022 Return: $35,753.00 -> $23,755.00 = -33.56%
  2023 Return: $23,755.00 -> $36,004.00 = 51.56%
  2024 Return: $36,004.00 -> $47,873.00 = 32.97%
  2025 Return: $47,873.00 -> $56,289.00 = 17.58%

Final Annual Returns:
  MGK: {'2016': 6.39, '2017': 29.48, '2018': -2.86, '2019': 37.54, '2020': 40.99, '2021': 28.49, '2022': -33.56, '2023': 51.56, '2024': 32.97, '2025': 17.58}
dict_keys(['2025', '2024', '2023', '2022', '2021'])
    2016: turnover=0.0 expense_ratio=0.0 total_return=6.39

In [None]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Successfully saved 93 funds to pickle file
File size: 395.12 KB


## Vanguard Specialized Funds

In [None]:
import pandas as pd
from io import StringIO

import sys
from pathlib import Path
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company


set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "VDIGX"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
specialized_funds = set()
df_performance = []
abort = False
for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            if fund.ticker not in performance_funds: 
                performance_funds.append(fund.ticker)
                count += 1
        if fund.ticker not in specialized_funds:
            specialized_funds.add(fund.ticker)
        else:
            print("Exiting filing, repeated ticker found: ", fund.ticker)
            abort = True
            break
    if abort:
        break
    df_performance.append(parser.get_financial_highlights())

    print(count)
    print("Adding funds: ", len(funds))
    funds_total.extend(funds)

print("Specialized funds: ", len(specialized_funds))
print(len(performance_funds))
print(performance_funds)
print(len(df_performance))



Found filings:  2 for year:  2025
Processing: Dividend Growth Fund
Extracting context:  From2024-02-01to2025-01-31_C000008004Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008004Member
Processing: Energy Fund
Extracting context:  From2024-02-01to2025-01-31_C000008005Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008005Member
Processing: Energy Fund
Extracting context:  From2024-02-01to2025-01-31_C000008006Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008006Member
Processing: Health Care Fund
Extracting context:  From2024-02-01to2025-01-31_C000008007Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008007Member
Processing: Health Care Fund
Extracting context:  From2024-02-01to2025-01-31_C000008008Member
Tag not found:  dei:SecurityExchangeName From2024-02-01to2025-01-31_C000008008Member
Processing: Dividend Appreciation Index Fund
Extracting context:  From2024-02

In [None]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1]], ignore_index=True)

returns_lookup = total_df.copy()

# Optional: Clean the total_return column (remove % sign if needed)
print(returns_lookup.head())
display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']


for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:  
            try:      
                returns_lookup[f'{col}_clean'] = (
                    returns_lookup[col]
                    .astype(str)
                    .str.replace('%', '')
                    .str.replace('$', '')
                    .str.replace(',', '')
                    .replace('N/A', '0')
                    .replace('', '0')
                    .replace('None', '0')
                    .astype(float)
                )
            except Exception as e:
                print(f"Error cleaning column '{col}': {str(e)}")
                print(returns_lookup[col].to_string())
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    if fund_obj.ticker not in specialized_funds:
        continue
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
    elif "¬Æ" in name:
        name = name.replace("¬Æ", "")
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Available share classes: {name_matches}")
        print(f"  Found {len(share_class_matches)} matching records")
        count += 1
        # Add all matching returns
        for _, row in share_class_matches.iterrows():
            year = str(row['year'])
            highlights = FinancialHighlights(
                turnover=row.get('portfolio_turnover_clean', 0),
                expense_ratio=row.get('expense_ratio_clean', 0),
                total_return=row['total_return'],
                net_assets=row.get('net_assets_clean', 0),
                net_assets_value_begining=row.get('nav_beginning_clean', 0),
                net_assets_value_end=row.get('nav_end_clean', 0),
                net_income_ratio=row.get('net_income_ratio_clean', 0.0)
            )
            
            fund_obj.financial_highlights[year] = highlights
            print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
        continue
    elif len(share_class_matches) > 5:
        print("  More than 5 share class matches found:")
        print(share_class_matches)
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",specialized_funds)
    

2
              fund_name share_class  year  net_assets  nav_beginning  nav_end  \
0  Dividend Growth Fund        None  2025     50424.0          37.76    37.14   
1  Dividend Growth Fund        None  2024     52553.0          35.42    37.76   
2  Dividend Growth Fund        None  2023     53452.0          37.85    35.42   
3  Dividend Growth Fund        None  2022     54186.0          31.82    37.85   
4  Dividend Growth Fund        None  2021     45099.0          30.63    31.82   

   total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0         10.20           0.22              1.68                16.0   
1          9.11           0.29              1.74                 9.0   
2         -0.76           0.30              1.68                11.0   
3         25.66           0.27              1.56                15.0   
4          7.03           0.26              1.85                15.0   

  distribution_shares  
0                None  
1                None  
2     

array(['Dividend Growth Fund', 'Energy Fund', 'Health Care Fund',
       'Dividend Appreciation Index Fund', 'Real Estate Index Fund',
       'Real Estate II Index Fund', 'Global Capital Cycles Fund',
       'Global ESG Select Stock Fund'], dtype=object)


Processing fund object: Dividend Growth Fund - Investor Shares
Cleaned name: 'Dividend Growth Fund'
  Found 5 name matches
  No share class matches found for 'Investor Shares' ticker:  VDIGX
  Available share classes:               fund_name share_class  year  net_assets  nav_beginning  nav_end  \
0  Dividend Growth Fund        None  2025     50424.0          37.76    37.14   
1  Dividend Growth Fund        None  2024     52553.0          35.42    37.76   
2  Dividend Growth Fund        None  2023     53452.0          37.85    35.42   
3  Dividend Growth Fund        None  2022     54186.0          31.82    37.85   
4  Dividend Growth Fund        None  2021     45099.0          30.63    31.82   

   total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0         10.20           0.22              1.68                16.0   
1          9.11           0.29              1.74                 9.0   
2         -0.76           0.30              1.68                11.0   
3     

In [None]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2016 Return: $10,619.00 -> $10,874.00 = 2.40%
  2017 Return: $10,874.00 -> $12,860.00 = 18.26%
  2018 Return: $12,860.00 -> $14,143.00 = 9.98%
  2019 Return: $14,143.00 -> $16,879.00 = 19.35%
  2020 Return: $16,879.00 -> $17,545.00 = 3.95%
  2021 Return: $17,545.00 -> $23,646.00 = 34.77%
  2022 Return: $23,646.00 -> $22,953.00 = -2.93%
  2023 Return: $22,953.00 -> $23,134.00 = 0.79%
  2024 Return: $23,134.00 -> $28,121.00 = 21.56%
  2025 Return: $28,121.00 -> $28,555.00 = 1.54%

Final Annual Returns:
  VDIGX: {'2016': 2.4, '2017': 18.26, '2018': 9.98, '2019': 19.35, '2020': 3.95, '2021': 34.77, '2022': -2.93, '2023': 0.79, '2024': 21.56, '2025': 1.54}
dict_keys([])
    2016: turnover=0.0 expense_ratio=0.0 total_return=2.4 net_assets=0.0 net_assets_value_begining=0.0 ne

In [None]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Successfully saved 107 funds to pickle file
File size: 451.74 KB


In [None]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Loaded 107 funds from pickle file


## Vanguard Whitehall Funds

In [None]:
import pandas as pd
from io import StringIO
import sys
from pathlib import Path
from src.simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company


set_identity("luis.alvarez.conde@alumnos.upm.es")
ticker = "VMGRX"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")


if all_filings:
    # 1. Find the most recent date in the entire history (e.g., "2024-12-31")
    latest_date_str = max(f.report_date for f in all_filings)
    
    # 2. Extract just the YEAR (e.g., "2024")
    target_year = latest_date_str[:4]
    
    # 3. Filter: Keep ALL filings where the report_date starts with that year
    # This captures the March, June, and December reports for that fiscal year
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date.startswith(target_year)
    ]
    
    print("Found filings: ", len(latest_filings), "for year: ", target_year)



performance_funds = []
whitehall_funds = set()
df_performance = []
abort = False

for filing in latest_filings:

    html_content = filing.html()
    
    parser = BlackRockFiling(html_content)
    funds = parser.get_funds()
    count = 0
    for fund in funds:
        if fund.performance_table is not None:
            if fund.ticker not in performance_funds: 
                performance_funds.append(fund.ticker)
                count += 1

        if fund.ticker not in whitehall_funds:
            whitehall_funds.add(fund.ticker)
        else:
            print("Exiting filing, repeated ticker found: ", fund.ticker)
            abort = True
            break
    if abort:
        break

    df_performance.append(parser.get_financial_highlights())

    print(count)
    print("Adding funds: ", len(funds))
    funds_total.extend(funds)

print("Whitehall funds: ", len(whitehall_funds))
print(len(performance_funds))
print(whitehall_funds)
print(len(df_performance))


Found filings:  2 for year:  2025
Processing: Mid-Cap Growth Fund
Extracting context:  From2024-11-01to2025-10-31_C000012166Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000012166Member
Processing: Selected Value Fund
Extracting context:  From2024-11-01to2025-10-31_C000012167Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000012167Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2024-11-01to2025-10-31_C000126408Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2024-11-01to2025-10-31_C000126407Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000126407Member
Processing: Emerging Markets Government Bond Index Fund
Extracting context:  From2024-11-01to2025-10-31_C000126409Member
Tag not found:  dei:SecurityExchangeName From2024-11-01to2025-10-31_C000126409Member
Processing: Global Minimum Volatility Fund
Extracting context:  From2

In [None]:
from src.simple_rag.models.fund import FinancialHighlights
import pandas as pd
from IPython.display import display

print(len(df_performance))

total_df = pd.concat([df_performance[0], df_performance[1]], ignore_index=True)

returns_lookup = total_df.copy()

# Optional: Clean the total_return column (remove % sign if needed)
print(returns_lookup.head())
returns_lookup['fund_name'] = (
    returns_lookup['fund_name']
    .str.replace('\n', ' ', regex=False)
)
display(returns_lookup['fund_name'].unique())

numeric_columns = ['portfolio_turnover', 'expense_ratio', 'net_assets', 
                   'nav_beginning', 'nav_end', 'net_income_ratio', 'distribution_shares']


for col in numeric_columns:
    if col in returns_lookup.columns:
        if returns_lookup[col] is not None:  
            try:      
                returns_lookup[f'{col}_clean'] = (
                    returns_lookup[col]
                    .astype(str)
                    .str.replace('%', '')
                    .str.replace('$', '')
                    .str.replace(',', '')
                    .replace('N/A', '0')
                    .replace('', '0')
                    .replace('None', '0')
                    .astype(float)
                )
            except Exception as e:
                print(f"Error cleaning column '{col}': {str(e)}")
                print(returns_lookup[col].to_string())
count = 0
# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    if fund_obj.ticker not in whitehall_funds:
        continue
    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
    elif "¬Æ" in name:
        name = name.replace("¬Æ", "")
    elif "\n" in name:
        name = name.replace("\n", " ")
        print(name)
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.strip().str.lower() == name.lower()]
    if len(name_matches) == 0:
        print("  No name matches found for ticker: ", fund_obj.ticker)
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    share_class = fund_obj.share_class
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}' ticker: ", fund_obj.ticker)
        print(f"  Found {len(name_matches)} name records")
        count += 1
        # Add all matching returns
        for _, row in name_matches.iterrows():
            year = str(row['year'])
            highlights = FinancialHighlights(
                turnover=row.get('portfolio_turnover_clean', 0),
                expense_ratio=row.get('expense_ratio_clean', 0),
                total_return=row['total_return'],
                net_assets=row.get('net_assets_clean', 0),
                net_assets_value_begining=row.get('nav_beginning_clean', 0),
                net_assets_value_end=row.get('nav_end_clean', 0),
                net_income_ratio=row.get('net_income_ratio_clean', 0.0)
            )
            
            fund_obj.financial_highlights[year] = highlights
            print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
        continue
    elif len(share_class_matches) > 5:
        print("  More than 5 share class matches found:")
        print(share_class_matches)
    
    
    print(f"  Found {len(share_class_matches)} matching records")
    count += 1
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        highlights = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean', 0),
            expense_ratio=row.get('expense_ratio_clean', 0),
            total_return=row['total_return'],
            net_assets=row.get('net_assets_clean', 0),
            net_assets_value_begining=row.get('nav_beginning_clean', 0),
            net_assets_value_end=row.get('nav_end_clean', 0),
            net_income_ratio=row.get('net_income_ratio_clean', 0.0)
        )
        
        fund_obj.financial_highlights[year] = highlights
        print(f"  {year}: Total Return = {highlights.total_return}%, Expense Ratio = {highlights.expense_ratio}%, Net Assets = {highlights.net_assets}, Net Income Ratio = {highlights.net_income_ratio}, Turnover = {highlights.turnover}, Net Assets Value Begining = {highlights.net_assets_value_begining}, Net Assets Value End = {highlights.net_assets_value_end}")
print("count: ",count)
print("Total funds: ",whitehall_funds)
    

2
             fund_name share_class  year  net_assets  nav_beginning  nav_end  \
0  Mid-Cap Growth Fund        None  2025      3116.0          26.21    29.55   
1  Mid-Cap Growth Fund        None  2024      3042.0          19.38    26.21   
2  Mid-Cap Growth Fund        None  2023      2530.0          19.24    19.38   
3  Mid-Cap Growth Fund        None  2022      2956.0          38.72    19.24   
4  Mid-Cap Growth Fund        None  2021      5290.0          29.89    38.72   

   total_return  expense_ratio  net_income_ratio  portfolio_turnover  \
0         14.77           0.32              0.26              1285.0   
1         35.77           0.33              0.37                69.0   
2          0.99           0.37              0.37                87.0   
3        -32.22           0.35              0.14                71.0   
4         37.68           0.33             -0.04                98.0   

  distribution_shares  
0                None  
1                None  
2           

array(['Mid-Cap Growth Fund', 'Selected Value Fund',
       'Emerging Markets Government Bond Index Fund',
       'Global Minimum Volatility Fund',
       'International Dividend Appreciation Index Fund',
       'International High Dividend Yield Index Fund',
       'International Dividend Growth Fund',
       'Advice Select International Growth Fund',
       'Advice Select Dividend Growth Fund',
       'Advice Select Global Value Fund', 'International Explorer Fund',
       'High Dividend Yield Index Fund'], dtype=object)


Processing fund object: Mid-Cap Growth Fund - Investor Shares
Cleaned name: 'Mid-Cap Growth Fund'
  Found 5 name matches
  No share class matches found for 'Investor Shares' ticker:  VMGRX
  Found 5 name records
  2025: Total Return = 14.77%, Expense Ratio = 0.32%, Net Assets = 3116.0, Net Income Ratio = 0.26, Turnover = 1285.0, Net Assets Value Begining = 26.21, Net Assets Value End = 29.55
  2024: Total Return = 35.77%, Expense Ratio = 0.33%, Net Assets = 3042.0, Net Income Ratio = 0.37, Turnover = 69.0, Net Assets Value Begining = 19.38, Net Assets Value End = 26.21
  2023: Total Return = 0.99%, Expense Ratio = 0.37%, Net Assets = 2530.0, Net Income Ratio = 0.37, Turnover = 87.0, Net Assets Value Begining = 19.24, Net Assets Value End = 19.38
  2022: Total Return = -32.22%, Expense Ratio = 0.35%, Net Assets = 2956.0, Net Income Ratio = 0.14, Turnover = 71.0, Net Assets Value Begining = 38.72, Net Assets Value End = 19.24
  2021: Total Return = 37.68%, Expense Ratio = 0.33%, Net Ass

In [None]:
import sys
%reload_ext autoreload
sys.path.append('../src')


from simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        for year, return_ in returns.items():
            print(fund.financial_highlights.keys())
            if year not in fund.financial_highlights.keys():
                new_highlight = FinancialHighlights(
                year=int(year),
                total_return=return_,
                turnover=0.0,
                expense_ratio=0.0,
                net_assets=0.0,
                net_assets_value_begining=0.0,
                net_assets_value_end=0.0,
                net_income_ratio=0.0
                )
                fund.financial_highlights[year] = new_highlight
                print(f"    {year}: {new_highlight}")

Detected format: Year (YYYY)
Found years: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2016 Return: $10,000.00 -> $9,451.00 = -5.49%
  2017 Return: $9,451.00 -> $11,595.00 = 22.69%
  2018 Return: $11,595.00 -> $12,710.00 = 9.62%
  2019 Return: $12,710.00 -> $14,434.00 = 13.56%
  2020 Return: $14,434.00 -> $17,079.00 = 18.32%
  2021 Return: $17,079.00 -> $23,515.00 = 37.68%
  2022 Return: $23,515.00 -> $15,939.00 = -32.22%
  2023 Return: $15,939.00 -> $16,097.00 = 0.99%
  2024 Return: $16,097.00 -> $21,856.00 = 35.78%
  2025 Return: $21,856.00 -> $25,084.00 = 14.77%

Final Annual Returns:
  VMGRX: {'2016': -5.49, '2017': 22.69, '2018': 9.62, '2019': 13.56, '2020': 18.32, '2021': 37.68, '2022': -32.22, '2023': 0.99, '2024': 35.78, '2025': 14.77}
dict_keys(['2025', '2024', '2023', '2022', '2021'])
    2016: turnover=0.0 expense_ratio=0.0 total_return=-5.49 

Found year-end data for years: [np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
  2017 Return: $10,964.00 -> $13,177.00 = 20.18%
  2018 Return: $13,177.00 -> $12,249.00 = -7.04%
  2019 Return: $12,249.00 -> $14,572.00 = 18.96%
  2020 Return: $14,572.00 -> $15,116.00 = 3.73%
  2021 Return: $15,116.00 -> $19,640.00 = 29.93%
  2022 Return: $19,640.00 -> $15,143.00 = -22.90%
  2023 Return: $15,143.00 -> $16,799.00 = 10.94%
  2024 Return: $16,799.00 -> $20,576.00 = 22.48%
  2025 Return: $20,576.00 -> $22,440.00 = 9.06%

Final Annual Returns:
  VIGI: {'2017': 20.18, '2018': -7.04, '2019': 18.96, '2020': 3.73, '2021': 29.93, '2022': -22.9, '2023': 10.94, '2024': 22.48, '2025': 9.06}
dict_keys(['2025', '2024', '2023', '2022', '2021'])
    2017: turnover=0.0 expense_ratio=0.0 total_return=20.18 net_assets=0.0 net_assets_value_begining=0.0 net_assets_value_end=0.0 net_income_ratio=0.0


  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')
  df['parsed_date'] = pd.to_datetime(df[date_col], errors='coerce')


In [None]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Successfully saved 125 funds to pickle file
File size: 530.06 KB


In [None]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/src")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Loaded 125 funds from pickle file


## Ishares

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
from typing import List
import sys
from pathlib import Path
from tqdm import tqdm
%reload_ext autoreload
from simple_rag.extraction.parser import BlackRockFiling
from edgar import set_identity, Company

set_identity("luis.alvarez.conde@alumnos.upm.es")

ticker = "HEZU"
fund = Company(ticker)
all_filings = fund.get_filings(form="N-CSR")

def process_single_filing_multiprocess(filing_data):
    """
    Process a single filing (for multiprocessing).
    Note: Must pass serializable data, not the filing object directly
    """
    try:
        # Import inside function for multiprocessing
        import sys
        from pathlib import Path
        sys.path.append('../src')
        from simple_rag.extraction.parser import BlackRockFiling
        
        html_content, report_date = filing_data
        parser = BlackRockFiling(html_content)
        funds = parser.get_funds()
        
        performance_funds = []
        df_performance = None        
        count = 0
        for fund in funds:

            if fund.performance_table is not None:
                if fund.ticker not in performance_funds:
                    performance_funds.append(fund.ticker)
                    count += 1
            if fund.ticker not in ishares_funds:
                ishares_funds.append(fund.ticker)
        
            print("Calling get_financial_highlights2")
        df_performance = parser.get_financial_highlights2()
        
        print(f"Filing {report_date}: Found {count} funds with performance tables, Total funds: {len(funds)}")
        
        return {
            'funds': funds,
            'performance_tickers': performance_funds,
            'df_performance': df_performance,
            'report_date': report_date
        }
    except Exception as e:
        print(f"Error processing filing: {e}")
        return None

if all_filings:
    unique_dates = sorted({f.report_date for f in all_filings if f.report_date})
    print("Unique report dates:", unique_dates)
    
    # Filter for filings from 2024-08-31 onward
    cutoff_date = "2024-09-31"
    latest_filings = [
        f for f in all_filings 
        if f.report_date and f.report_date >= cutoff_date
    ]
    
    print("Found filings: ", len(latest_filings), "from", cutoff_date, "onward")
    
    # Optional: Show the dates of filtered filings
    print("Filtered filing dates:", sorted({f.report_date for f in latest_filings}))

# Prepare data for multiprocessing (fetch HTML first)
filing_data_list = []
failed_filings = []
for filing in latest_filings:
    try:
        html_content = filing.html()
        if html_content:  # Only add if HTML content exists
            filing_data_list.append((html_content, filing.report_date))
        else:
            print(f"‚ö†Ô∏è  No HTML content for filing: {filing.report_date}")
            failed_filings.append(filing)
    except ValueError as e:
        print(f"‚ùå Error processing filing {filing.report_date}: {e}")
        failed_filings.append(filing)
    except Exception as e:
        print(f"‚ùå Unexpected error for filing {filing.report_date}: {e}")
        failed_filings.append(filing)
print(f"‚úÖ Successfully prepared {len(filing_data_list)} filings")
print(f"‚ùå Failed to prepare {len(failed_filings)} filings")
# Continue with successful filings only

performance_funds = []
df_performances = []
ishares_funds = []

# Use ProcessPoolExecutor
with ProcessPoolExecutor() as executor:
    future_to_data = {executor.submit(process_single_filing_multiprocess, data): data 
                      for data in filing_data_list}
    
    for future in tqdm(as_completed(future_to_data), total=len(filing_data_list), desc="Processing filings"):
        result = future.result()
        if result:
            ishares_funds.extend(result['funds'])
            funds_total.extend(result['funds'])
            performance_funds.extend(result['performance_tickers'])
            
            if result['df_performance'] is not None:
                df_performances.append(result['df_performance'])

print(len(df_performances))
print(f"Total funds processed: {len(ishares_funds)}")


Unique report dates: ['2003-04-30', '2003-07-31', '2004-02-29', '2004-03-31', '2004-04-30', '2004-07-31', '2005-02-28', '2005-03-31', '2005-04-30', '2005-07-31', '2006-02-28', '2006-03-31', '2006-04-30', '2006-07-31', '2007-02-28', '2007-03-31', '2007-04-30', '2007-07-31', '2008-02-29', '2008-03-31', '2008-04-30', '2008-07-31', '2009-02-28', '2009-03-31', '2009-04-30', '2009-07-31', '2009-08-31', '2010-02-28', '2010-03-31', '2010-04-30', '2010-07-31', '2010-08-31', '2011-02-28', '2011-03-31', '2011-04-30', '2011-07-31', '2011-08-31', '2011-10-31', '2012-02-29', '2012-03-31', '2012-04-30', '2012-07-31', '2012-08-31', '2012-10-31', '2013-02-28', '2013-03-31', '2013-04-30', '2013-07-31', '2013-08-31', '2013-10-31', '2014-02-28', '2014-03-31', '2014-04-30', '2014-07-31', '2014-08-31', '2014-10-31', '2015-02-28', '2015-03-31', '2015-04-30', '2015-07-31', '2015-08-31', '2015-10-31', '2016-02-29', '2016-03-31', '2016-04-30', '2016-07-31', '2016-08-31', '2016-10-31', '2017-02-28', '2017-03-31'

Processing filings:   0%|          | 0/20 [00:00<?, ?it/s]

Processing: iShares iBonds 1-5 Year Corporate Ladder ETF
Extracting context:  From2024-11-01to2025-10-31_C000254885Member
Tag not found:  oef:ClassName From2024-11-01to2025-10-31_C000254885Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares iBonds 1-5 Year High Yield and Income Ladder ETF
Extracting context:  From2024-11-01to2025-10-31_C000254886Member
Tag not found:  oef:ClassName From2024-11-01to2025-10-31_C000254886Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares iBonds 1-5 Year TIPS Ladder ETF
Extracting context:  From2024-11-01to2025-10-31_C000254887Member
Tag not found:  oef:ClassName From2024-11-01to2

Processing filings:   5%|‚ñå         | 1/20 [00:01<00:37,  1.95s/it]

      Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares Currency Hedged MSCI Eurozone ETF
Extracting context:  FY2025_C000141929Member
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Tag not found:  dei:SecurityExchangeName FY2025_C000141929Member
Found 3 potential Financial Highlights sections
Total funds extracted: 0Unknown Table: 
 Filing 2025-10-31: Found 0 funds with performance tables, Total funds: 4
      0                                                  1
0  ‚Äã(a)  The underlying fund is iShares MSCI Eurozone ETF.
1  ‚Äã(b)                       Excludes money market funds.
Unknown table type:       0                                                  1
0  ‚Äã(a)  The underlying fund is iShares MSCI Eurozone ETF.
1  ‚Äã(b)        

Processing filings:  10%|‚ñà         | 2/20 [00:06<00:58,  3.25s/it]

FY2025_C000050169Member
Processing: iShares Morningstar Growth ETF
Extracting context:  From2024-05-01to2025-04-30_C000012098Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.Unknown Table: 
       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Ten largest industries are presented. Addition...
Failed to extract tables from block:  Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Ten largest industries are presented. Addition...
oef:LineGraphTableTextBlock
No tables found for block:  Unknown Table: oef:LineGraphTableTextBlock
       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                

Processing filings:  15%|‚ñà‚ñå        | 3/20 [00:07<00:39,  2.32s/it]

FY2025_C000210083Member
Processing: iShares Dow Jones U.S. ETF
Extracting context:  From2024-05-01to2025-04-30_C000012060Member
Tag not found: Processing: iShares ESG Aware 80/20 Aggressive Allocation ETF 
oef:ClassNameExtracting context:   From2024-08-01to2025-07-31_C000219702MemberFrom2024-05-01to2025-04-30_C000012101Member

Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Failed to extract tables from block:  Unknown Table: oef:LineGraphTableTextBlock 
      Footnote                   Description
0  Footnote(a)  Excludes money market funds.
No tables found for block: Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds. 
oef:LineGraphTableTextBlock
Processing: iShares MSCI Kuwait ETF
Extracting context:  FY2025_C000218229Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C00

Processing filings:  20%|‚ñà‚ñà        | 4/20 [00:09<00:36,  2.29s/it]

       0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlockUnknown Table: 
       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Processing: iShares Core Dividend Growth ETF
Extracting context: Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds. 
FY2025_C000141931Member
Tag not found:  Processing: iShares Morningstar Small-Cap Value ETFoef:ClassName 
From2024-08-01to2025-07-31_C000069398MemberExtracting context: 
 From2024-05-01to2025-04-30_C000012198Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C000153271Member
Tag not found:  dei:SecurityExchangeName FY2025_C000141931Member
Unknown Table:        0                             1
0  ‚Äã(a)  

Processing filings:  25%|‚ñà‚ñà‚ñå       | 5/20 [00:12<00:37,  2.50s/it]

Tag not found:  oef:ClassName From2024-05-01to2025-04-30_C000038163Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C000069400Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012097Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Tag not found: Failed to extract tables from block:   oef:FactorsAffectingPerfTextBlockFrom2024-05-01to2025-04-30_C000038163Member  oef:LineGraphTableTextBlock

No tables found for block: oef:LineGraphTableTextBlock
Unknown Table:  Processing: iShares Morningstar Value ETF      Footnote                   Description
0  Footnote(a)  Excludes money market funds.

Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.Extracting context: 
 FY2025_C000012099Member
Unknown Table:        Footnote                   Description
0  Footnote(a) 

Processing filings:  30%|‚ñà‚ñà‚ñà       | 6/20 [00:13<00:27,  1.96s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000012099Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C000112640Member
Found 18 potential Financial Highlights sections
ValueError: invalid literal for int() with base 10: ''
ValueError: invalid literal for int() with base 10: ''
ValueError: invalid literal for int() with base 10: ''
ValueError: invalid literal

Processing filings:  35%|‚ñà‚ñà‚ñà‚ñå      | 7/20 [00:14<00:23,  1.80s/it]

Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Blockchain and Tech ETF
Extracting context:  FY2025_C000235105Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C000231047Member
Tag not found:  dei:SecurityExchangeName FY2025_C000235105Member
Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Copper and Metals Mining ETF
Extracting context:  FY2025_C000241778Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown Table:        0   

Processing filings:  40%|‚ñà‚ñà‚ñà‚ñà      | 8/20 [00:23<00:48,  4.05s/it]

Processing: iShares Expanded Tech Sector ETF
Extracting context:  From2024-04-01to2025-03-31_C000012082Member
Tag not found:  oef:FactorsAffectingPerfTextBlock From2024-04-01to2025-03-31_C000025769Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Global Financials ETF
Extracting context:  FY2025_C000012100Member
Tag not found:  oef:ClassName From2024-08-01to2025-07-31_C000245481Member
Tag not found:  oef:FactorsAffectingPerfTextBlock From2024-11-01to2025-10-31_C000141927Member
Tag not found:  dei:SecurityExchangeName FY2025_C000012100Member
Processing: iShares U.S. Consumer Discretionary ETF
Extracting context:  From2024-05-01to2025-04-30_C000012050Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown Table:       

Processing filings:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 9/20 [00:41<01:31,  8.30s/it]

Tag not found:  oef:FactorsAffectingPerfTextBlock From2024-04-01to2025-03-31_C000025768Member
No data obtained
Processing: iShares iBonds Dec 2032 Term Corporate ETF
Extracting context:  From2024-11-01to2025-10-31_C000236700Member
Processing: iShares North American Natural Resources ETF
Extracting context:  From2024-04-01to2025-03-31_C000012086Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown Table:        Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Unknown table type:       Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.

Processing filings:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 10/20 [00:42<01:01,  6.20s/it]

Processing: iShares U.S. Financials ETF
Extracting context:  From2024-05-01to2025-04-30_C000012053Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Core MSCI Total International Stock ETF
Extracting context:  FY2025_C000119716Member
Tag not found:  oef:ClassName From2024-11-01to2025-10-31_C000236700Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block: Tag not found:   oef:ClassNameoef:LineGraphTableTextBlock 
From2023-11-01to2024-10-31_C000204675Member
Processing: iShares 0-5 Year Investment Grade Corporate Bond ETF
Extracting context:  FY2025_C000131292Member
Tag not found:  oef:ClassName From2024-04-01to2025-03-31_C000025770Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table typ

Processing filings:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 11/20 [01:08<01:49, 12.14s/it]


Extracting context:  FY2024_C000161648Member
Unknown Table:        Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Unknown table type:       Footnote                   Description
0  Footnote(a)  Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Core International Aggregate Bond ETF
Extracting context:  FY2025_C000161648Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Top 20 U.S. Stocks ETFProcessing: iShares International Small-Cap Equity Factor ETF

Extracting context: Extracting context:   FY2025_C000154548MemberFrom2024-04-01to2025-03-31_C000254701Member

Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Calling get_financial_highlights2
Callin

Processing filings:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 12/20 [01:09<01:09,  8.66s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000080009Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Processing: iShares iBonds Dec 2025 Term Treasury ETF
Extracting context:  From2024-11-01to2025-10-31_C000217186Member
Tag not found:  oef:ClassName From2024-04-01to2025-03-31_C000254701Member
Tag not found:  dei:SecurityExchangeName Tag not found: FY2025_C000154548Member 
oef:ClassName From2023-11-01to2024-10-31_C000217189Member
Tag not found:  dei:SecurityExchangeName FY2024_C000194633Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1   

Processing filings:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 13/20 [01:12<00:48,  6.91s/it]

Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares MSCI ACWI ex U.S. ETF
Extracting context:  FY2025_C000061363Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares ESG Advanced High Yield Corporate Bond ETF
Extracting context:  FY2025_C000170245Member
Tag not found:  oef:ClassName From2023-11-01to2024-10-31_C000217190Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown Table:        0                                                  1
0  ‚Äã(a)     

Processing filings:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 14/20 [01:15<00:35,  5.98s/it]

Unknown Table:        0                             1
0  ‚Äã(a)  Excludes money market funds.
Unknown table type:       0                             1
0  ‚Äã(a)  Excludes money market funds.
Tag not found:  dei:SecurityExchangeName FY2024_C000102031Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Tag not found:  oef:ClassName From2023-11-01to2024-10-31_C000228602Member
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares MSCI China A ETF
Extracting context:  FY2025_C000148919Member
Tag not found:  dei:SecurityExchangeName FY2024_C000153287Member
Un

Processing filings:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 15/20 [01:47<01:08, 13.73s/it]

Tag not found:  dei:SecurityExchangeName FY2025_C000249960Member
Unknown Table:      0                                                  1
0  ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:     0                                                  1
0  ‚Äã*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares iBonds Dec 2031 Term Corporate ETF
Extracting context:  FY2025_C000228040Member
Unknown Table:        Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Unknown table type:       Footnote                                        Description
0    Footnote*  Credit quality ratings shown reflect the ratin...
1  Footnote(a)                       Excludes money market funds.
Processing: i

Processing filings:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 16/20 [01:55<00:48, 12.10s/it]

Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares iBonds Dec 2027 Term Corporate ETF
Extracting context:  FY2024_C000191091Member
Tag not found:  dei:SecurityExchangeName FY2025_C000249962Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Exclude

Processing filings:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 17/20 [02:12<00:40, 13.50s/it]

Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Systematic Bond ETF
Extracting context:  FY2025_C000196720Member
No data obtained
No data obtained
Processing: iShares iBonds Dec 2031 Term Corporate ETF
Extracting context:  FY2024_C000228040Member
Tag not found:  dei:SecurityExchangeName FY2025_C000196720Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Processing: iShares Treasury Floating Rate Bond ETF
Extracting context:  F

Processing filings:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 18/20 [02:23<00:25, 12.57s/it]

No data obtained
No data obtained
Processing: iShares iBonds Dec 2044 Term Treasury ETF
Extracting context:  FY2024_C000250197Member
No data obtained
Processing: iShares iBonds Dec 2034 Term Treasury ETF
Extracting context:  FY2024_C000250196Member
Tag not found:  oef:FactorsAffectingPerfTextBlock FY2024_C000250197Member
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Unknown table type:       0                                                  1
0  ‚Äã(a)                       Excludes money market funds.
1    ‚Äã*  Credit quality ratings shown reflect the ratin...
Tag not found:  dei:SecurityExchangeName FY2024_C000250196Member
Tag not found:  oef:FactorsAffectingPerfTextBlock FY2024_C000250196Member
No data obtained
Unknown Table:        0                                                  1
0  ‚Äã(a)                       Excludes money m

Processing filings:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 19/20 [03:17<00:25, 25.16s/it]

Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Failed to extract tables from block:  oef:AvgAnnlRtrTableTextBlock
No tables found for block:  oef:AvgAnnlRtrTableTextBlock
Processing: iShares S&P 500 BuyWrite ETF
Extracting context:  FY2024_C000247832Member
Tag not found:  dei:SecurityExchangeName FY2024_C000247832Member
Unknown Table:        0                                                 1
0  ‚Äã(a)  The underlying fund is iShares Core S&P 500 ETF.
1  ‚Äã(b)                      Excludes money market funds.
Unknown table type:       0                                                 1
0  ‚Äã(a)  The underlying fund is iShares Core S&P 500 ETF.
1  ‚Äã(b)                      Excludes money market funds.
Failed to extract tables from block:  oef:LineGraphTableTextBlock
No tables found for block:  oef:LineGraphTableTextBlock
Failed to extract tables from block:  oef:AvgAnnlRtrTableTextBlock
No tables found for bl

Processing filings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [03:29<00:00, 10.48s/it]


18
Total funds processed: 380


In [None]:
failed_filings[0].url

'https://www.sec.gov/Archives/edgar/data/1100663/0001100663-25-000026-index.html'

In [None]:
import pandas as pd
import re
%reload_ext autoreload
from simple_rag.models.fund import FinancialHighlights

if df_performances:
    df_performance = pd.concat(df_performances, ignore_index=True)
else:
    df_performance = pd.DataFrame() # Empty fallback
    print("No performance data found.")

print(df_performance.head())

def clean_financial_number(val):
    """
    Parses financial strings like '23.19 %(b)' or '(24.82 )%'.
    - Extracts the numerical value.
    - Handles (12.34) as negative -12.34.
    - Ignores footnote markers like (a), (b).
    - Removes %, $, and commas.
    """
    if pd.isna(val) or val is None:
        return None
    
    # Convert to string and strip whitespace
    s = str(val).strip()
    
    # 1. Regex to find the number (handles decimals and commas)
    # Looks for digits, optional commas, and optional decimal part
    match = re.search(r'(\d{1,3}(?:,\d{3})*\.?\d*|\d*\.?\d+)', s)
    
    if not match:
        return None
        
    # Get the raw number string (e.g., "24.82" or "1,234.56")
    num_str = match.group(0)
    
    # 2. Check for negative indication: "(" at the start of the string
    # Accounting format always puts the negative parenthesis at the start: (24.82)%
    is_negative = s.startswith('(')
    
    try:
        # Remove commas and convert to float
        clean_num = float(num_str.replace(',', ''))
        
        # Apply negative sign if detected
        return -clean_num if is_negative else clean_num
        
    except ValueError:
        return None
returns_lookup = df_performance.copy()


# Apply to all financial columns
financial_cols = ['total_return', 'expense_ratio', 'net_income_ratio', 'portfolio_turnover', 'nav_end', 'nav_beginning', 'net_assets']
for col in financial_cols:
    if col in returns_lookup.columns:
        returns_lookup[f'{col}_clean'] = returns_lookup[col].apply(clean_financial_number)

ishares_tickers = [fund_obj.ticker for fund_obj in ishares_funds]
print("Tickers in ishares_funds:", ishares_tickers)


# Now you can efficiently match and update your funds
for fund_obj in funds_total:
    
    if fund_obj.ticker not in ishares_tickers:
        continue

    print(f"\nProcessing fund object: {fund_obj.name} - {fund_obj.share_class}")
    # Initialize annual returns
    if not hasattr(fund_obj, 'annual_returns') or fund_obj.annual_returns is None:
        fund_obj.annual_returns = {}

    if not hasattr(fund_obj, 'financial_highlights') or fund_obj.financial_highlights is None:
        fund_obj.financial_highlights = {}
    
    # Clean the name: remove "Vanguard" and strip whitespace
    name = fund_obj.name.replace("Vanguard", "").strip()
    print(f"Cleaned name: '{name}'")
    
    if "¬Æ" in name:
        name = name.replace("¬Æ", "")
    if "‚Ñ¢" in name:
        name = name.replace("‚Ñ¢", "")
        
    # Find matching rows based on fund name
    name_matches = returns_lookup[returns_lookup['fund_name'].str.contains(name, case=False, na=False, regex=False)]
    
    if len(name_matches) == 0:
        print("  No name matches found")
        continue
    
    print(f"  Found {len(name_matches)} name matches")
    
    # Clean share class (remove trademark symbol)
    fund_obj.share_class = "ETF Shares"
    share_class = fund_obj.share_class
    
    if "‚Ñ¢" in share_class:
        share_class = share_class.replace("‚Ñ¢", "")
    
    # Now match share class
    share_class_matches = name_matches[
        name_matches['share_class'].str.contains(share_class, case=False, na=False, regex=False)]
    
    if name_matches['share_class'].isna().all():
        fund_obj.annual_returns = dict(zip(name_matches['year'], name_matches['total_return_clean']))
        print("Annual return: ", fund_obj.annual_returns)
        continue
        
    if len(share_class_matches) == 0:
        print(f"  No share class matches found for '{share_class}'")
        print(f"  Available share classes: {name_matches['share_class'].unique()}")
        continue
    
    print(f"  Found {len(share_class_matches)} matching records")
    
    # Add all matching returns
    for _, row in share_class_matches.iterrows():
        year = str(row['year'])
        
        # Store annual return
        fund_obj.annual_returns[year] = row['total_return_clean']
        
        # Store full financial highlights snapshot
        fund_obj.financial_highlights[year] = FinancialHighlights(
            turnover=row.get('portfolio_turnover_clean'),
            expense_ratio=row.get('expense_ratio_clean'),
            total_return=row.get('total_return_clean'),
            net_assets=row.get('net_assets'),  # You may need to add this cleaning
            net_assets_value_begining=row.get('nav_beginning_clean'),
            net_assets_value_end=row.get('nav_end_clean') ,
            net_income_ratio=row.get('net_income_ratio_clean')
        )
    
    print(f"  Annual returns: {fund_obj.annual_returns}")
    print(f"  Financial highlights years: {list(fund_obj.financial_highlights.keys())}")
    for key, value in fund_obj.financial_highlights.items():
        print(f"    {key}: {value}")

                                   fund_name share_class  year   net_assets  \
0  iShares¬†Large¬†Cap¬†Accelerated¬†Outcome¬†ETF  ETF Shares  2025          0.0   
1  iShares¬†Large¬†Cap¬†Accelerated¬†Outcome¬†ETF  ETF Shares  2025   13473000.0   
2       iShares¬†Large¬†Cap¬†Max¬†Buffer¬†Mar¬†ETF  ETF Shares  2025   38203000.0   
3       iShares¬†Large¬†Cap¬†Max¬†Buffer¬†Jun¬†ETF  ETF Shares  2025  161134000.0   
4       iShares¬†Large¬†Cap¬†Max¬†Buffer¬†Jun¬†ETF  ETF Shares  2024   72687000.0   

   nav_beginning  nav_end  total_return  expense_ratio  net_income_ratio  \
0           0.00     0.00          0.00           0.00              0.00   
1          25.00    25.91          3.64           0.47              0.69   
2          25.00    25.81          3.25           0.47              0.65   
3          25.24    27.59         10.21           0.47              0.96   
4          25.00    25.24          0.95           0.47              0.00   

   portfolio_turnover distribution_share

In [None]:
import re
from collections import defaultdict
import pandas as pd

def infer_first_col_format(value: object) -> str:
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return "EMPTY"

    s = str(value).strip()
    if s == "" or s.lower() == "nan":
        return "EMPTY"

    # Jan 23, Aug 15
    if re.match(r"^[A-Za-z]{3}\s+\d{2}$", s):
        return "MON_YY"

    # 2015
    if re.match(r"^\d{4}$", s):
        return "YYYY"

    # 2024-08-31
    if re.match(r"^\d{4}-\d{2}-\d{2}$", s):
        return "YYYY_MM_DD"

    # 08/31/24 or 8/31/2024
    if re.match(r"^\d{1,2}/\d{1,2}/\d{2,4}$", s):
        return "MM_DD_YY(YY)"

    # 31/08/24 (if you ever have EU style)
    if re.match(r"^\d{1,2}-\d{1,2}-\d{2,4}$", s):
        return "DD_MM_YY(YY)_or_MM_DD_YY(YY)_DASH"

    # Fallbacks
    if re.search(r"\d", s):
        return "OTHER_HAS_DIGITS"

    return "OTHER_TEXT"


def describe_first_column_formats(
    dfs,
    names=None,
    samples_per_df=3,
    max_groups_to_show=50,
    max_dfs_per_group_to_print=5,
):
    if names is None:
        names = [f"df[{i}]" for i in range(len(dfs))]

    groups = defaultdict(list)

    for name, df in zip(names, dfs):
        if df is None or not isinstance(df, pd.DataFrame) or df.empty:
            groups["EMPTY_DF"].append((name, df))
            continue

        first_col = df.columns[0]
        # take first non-empty sample from first column
        series = df[first_col].astype(str)
        sample_vals = [v for v in series.head(20).tolist() if str(v).strip() and str(v).lower() != "nan"]

        fmt = infer_first_col_format(sample_vals[0]) if sample_vals else "EMPTY_FIRST_COL"
        groups[fmt].append((name, df))

    sorted_groups = sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)

    print(f"Total dataframes: {len(dfs)}")
    print(f"Unique first-column formats: {len(sorted_groups)}\n")

    for gi, (fmt, members) in enumerate(sorted_groups[:max_groups_to_show], start=1):
        print("=" * 100)
        print(f"Group #{gi}: {fmt}")
        print(f"Count: {len(members)}")

        example_shapes = [m[1].shape for m in members if isinstance(m[1], pd.DataFrame)]
        print(f"Example shapes (first 10): {example_shapes[:10]}")

        # Print a few examples per group
        for ex_i, (name, df) in enumerate(members[:max_dfs_per_group_to_print], start=1):
            if df is None or not isinstance(df, pd.DataFrame) or df.empty:
                print(f"  [Example {ex_i}] {name}: EMPTY/None")
                continue

            first_col = df.columns[0]
            vals = [v for v in df[first_col].head(20).tolist() if str(v).strip() and str(v).lower() != "nan"]
            vals = vals[:samples_per_df]

            print(f"  [Example {ex_i}] {name}")
            print(f"    first_col: {first_col!r}")
            print(f"    columns: {list(df.columns)[:12]}{' ...' if len(df.columns) > 12 else ''}")
            print(f"    first_col_samples: {vals}")

        print()


# Example usage with your list of performance tables
performances = []
perf_names = []
for i, fund in enumerate(funds_total):
    if fund.ticker in performance_funds and fund.performance_table is not None:
        performances.append(fund.performance_table)
        perf_names.append(f"{fund.ticker} | {fund.name} | {fund.share_class}")

describe_first_column_formats(performances, names=perf_names)

Total dataframes: 119
Unique first-column formats: 2

Group #1: MON_YY
Count: 118
Example shapes (first 10): [(62, 6), (62, 6), (62, 6), (62, 6), (107, 4), (120, 4), (120, 4), (67, 4), (120, 4), (120, 4)]
  [Example 1] EAOK | iShares ESG Aware 30/70 Conservative Allocation ETF | ETF Shares
    first_col: 'Unnamed: 0'
    columns: ['Unnamed: 0', 'Fund', 'Bloomberg U.S. Universal Index', 'MSCI All Country World Index (Net)', 'BlackRock ESG Aware Conservative Allocation Index', 'S&P Target Risk Conservative Index']
    first_col_samples: ['Jun 20', 'Jul 20', 'Aug 20']
  [Example 2] EAOM | iShares ESG Aware 40/60 Moderate Allocation ETF | ETF Shares
    first_col: 'Unnamed: 0'
    columns: ['Unnamed: 0', 'Fund', 'Bloomberg U.S. Universal Index', 'MSCI All Country World Index (Net)', 'BlackRock ESG Aware Moderate Allocation Index', 'S&P Target Risk Moderate Index']
    first_col_samples: ['Jun 20', 'Jul 20', 'Aug 20']
  [Example 3] EAOR | iShares ESG Aware 60/40 Balanced Allocation ETF | ET

In [None]:
import sys
from pathlib import Path
%reload_ext autoreload
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


from src.simple_rag.extraction.parser import compute_annual_returns

for fund in funds_total:
    if fund.ticker in performance_funds:
        print(fund.performance_table)
        returns = compute_annual_returns(fund.performance_table)
        print("\nFinal Annual Returns:")
        fund.annual_returns = returns
        print(f"  {fund.ticker}: {returns}")
        print("---")

    Unnamed: 0     Fund Morningstar US Market Index  \
0       May 15  $10,181                     $10,137   
1       Jun 15  $10,069                      $9,955   
2       Jul 15  $10,492                     $10,133   
3       Aug 15   $9,796                      $9,524   
4       Sep 15   $9,536                      $9,256   
..         ...      ...                         ...   
115     Dec 24  $40,608                     $32,198   
116     Jan 25  $41,636                     $33,206   
117     Feb 25  $40,089                     $32,622   
118     Mar 25  $36,755                     $30,706   
119     Apr 25  $37,541                     $30,526   

    Morningstar¬Æ US Large-Mid Cap Broad Growth Index‚Ñ†  
0                                             $10,183  
1                                             $10,074  
2                                             $10,500  
3                                              $9,803  
4                                              $9,545  


In [None]:
tickers = []
unique_funds = []
duplicates = 0

for fund in funds_total:
    if fund.ticker in tickers:
        print(f"DUPLICATE: {fund.name} ({fund.ticker})")
        duplicates += 1
    else:
        tickers.append(fund.ticker)
        unique_funds.append(fund)

# Replace the original list
funds_total = unique_funds
print(f"Removed {duplicates} duplicates")
print(f"Remaining funds: {len(funds_total)}")

DUPLICATE: iShares 0-5 Year High Yield Corporate Bond ETF (SHYG)
DUPLICATE: iShares 0-5 Year Investment Grade Corporate Bond ETF (SLQD)
DUPLICATE: iShares 1-3 Year International Treasury Bond ETF (ISHG)
DUPLICATE: iShares 20+ Year Treasury Bond BuyWrite Strategy ETF (TLTW)
DUPLICATE: iShares Aaa - A Rated Corporate Bond ETF (QLTA)
DUPLICATE: iShares BB Rated Corporate Bond ETF (HYBB)
DUPLICATE: iShares Broad USD High Yield Corporate Bond ETF (USHY)
DUPLICATE: iShares CMBS ETF (CMBS)
DUPLICATE: iShares Convertible Bond ETF (ICVT)
DUPLICATE: iShares Core 1-5 Year USD Bond ETF (ISTB)
DUPLICATE: iShares Core International Aggregate Bond ETF (IAGG)
DUPLICATE: iShares ESG Advanced High Yield Corporate Bond ETF (HYXF)
DUPLICATE: iShares Fallen Angels USD Bond ETF (FALN)
DUPLICATE: iShares Floating Rate Bond ETF (FLOT)
DUPLICATE: iShares GNMA Bond ETF (GNMA)
DUPLICATE: iShares High Yield Corporate Bond BuyWrite Strategy ETF (HYGW)
DUPLICATE: iShares iBonds 2025 Term High Yield and Income ETF (

In [None]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Successfully saved 420 funds to pickle file
File size: 3791.39 KB


In [4]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Loaded 420 funds from pickle file


## Summary Prospectus

In [60]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from IPython.display import display, Markdown
from src.simple_rag.extraction.general_info import FundInfoExtractor
from pathlib import Path

set_identity('luis.alvarez.conde@alumnos.upm.es')
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

for ticker in tickers:
    company = Company(ticker)
    processed_funds = []
    filings = company.get_filings(form="497K")

    for filing in filings:
        text = filing.text()
        extractor = FundInfoExtractor(text, ticker=ticker)
        fund_data = extractor.get_structured_data()
        if fund_data['ticker'] in processed_funds:
            print("First duplicate: ", fund_data['ticker'])
            break
        
        processed_funds.append(fund_data['ticker'])
        md = extractor.get_clean_markdown()
        for fund in funds_total:
            if fund.ticker == fund_data['ticker']:
                fund.summary_prospectus = md
                fund.managers = fund_data['managers']
                fund.strategies = fund_data['strategies']
                fund.risks = fund_data['risks']
                fund.objective = fund_data['objective']
                break
        
    print("Processed funds: ", len(processed_funds), "for ticker: ", ticker)


First duplicate:  VSCIX
Processed funds:  48 for ticker:  VOO


First duplicate:  VEXC
Processed funds:  36 for ticker:  MGK


First duplicate:  XT
Processed funds:  120 for ticker:  HEZU


First duplicate:  VMGRX
Processed funds:  1 for ticker:  VMGRX


First duplicate:  VGSNX
Processed funds:  11 for ticker:  VDIGX


In [61]:
managers = set()
for fund in funds_total:
    if fund.managers is not None:
        # If fund.managers is a list, add each manager individually
        if isinstance(fund.managers, list):
            managers.update(fund.managers)  # ‚úÖ Use update() for lists
        else:
            managers.add(fund.managers)    # ‚úÖ Use add() for single values

print(f"Unique managers found: {len(managers)}")
for manager in sorted(managers):
    print(f"  - {manager}")

Unique managers found: 32
  - Aaron Choi
  - Asian Economic Risk
  - Aur√©lie Denis
  - Brett Barakett
  - Chris Nieves
  - Christopher Chung
  - Consumer Goods
  - Erin Armstrong
  - Gary Robinson
  - Investment Manager
  - Jake Riley
  - James Mauro
  - Jena Stenger
  - Jennifer Hsui
  - Jonathan Graves
  - Kenny Narzikul
  - Lawrence Burns
  - Managing Director
  - Matt Waldron
  - Michael Cling
  - Michelle Louie
  - Nataliya Kofman
  - Natasha Kuhlkin
  - Nick Birkett
  - Peter Sietsema
  - Senior Managing Director
  - Simon Webber
  - Steven White
  - Suzanne Ly
  - Thomas Coutts
  - Tom Slater
  - Walter Nejman


In [62]:
import pickle
from pathlib import Path
import sys

# Add RAG directory to path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))

# Define pickle file path
PKL_PATH = Path("./funds_backup.pkl")

print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())

# Save to pickle file
try:
    with PKL_PATH.open("wb") as f:
        pickle.dump(funds_total, f)
    
    print(f"Successfully saved {len(funds_total)} funds to pickle file")
    print(f"File size: {PKL_PATH.stat().st_size / 1024:.2f} KB")
    
except Exception as e:
    print(f"Error saving to pickle file: {e}")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Successfully saved 420 funds to pickle file
File size: 50133.35 KB


In [None]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd

from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/src")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))
import sys
from simple_rag.models.fund import FundData

PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
funds_total = []
with PKL_PATH.open("rb") as f:

    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

ModuleNotFoundError: No module named 'simple_rag'

## NPORT (Portfolio Composition)

In [3]:
from edgar import Company, set_identity
import sys

# 1. Set Identity
set_identity('luis.alvarez.conde@alumnos.upm.es')

def test_series_extraction(ticker):
    print(f"--- Testing Series ID Extraction for {ticker} ---")
    
    try:
        # 2. Get Filings
        company = Company(ticker)
        
        # .latest(1) returns the single object directly in your version, not a list
        filing = company.get_filings(form="NPORT-P").latest(1)
        
        if not filing:
            print(f"No NPORT-P filings found for {ticker}")
            return

        print(f"Accessing filing date: {filing.report_date}")
        
        # 3. Parse XML
        # This downloads the actual data
        xml_data = filing.obj()
        
        # 4. Extract Identifiers
        # The Series ID is usually located in the fund_series object
        fund_series = xml_data.get_fund_series()
        series_id = fund_series.series_id
        series_name = fund_series.name
        print(series_id)
        # Verify General Info
        gen_info = xml_data.general_info
        
        print("\n=== EXTRACTION RESULTS ===")
        print(f"Ticker:         {ticker}")
        print(f"CIK:            {filing.cik}")
        print(f"Series ID:      {series_id}  <-- THIS IS YOUR KEY")
        print(f"Series Name:    {series_name}")
        
        # Optional: Check Class IDs (The specific share classes)
        if hasattr(gen_info, 'series_classes'):
            print("\nAssociated Share Classes (Tickers in this Series):")
            for cls in gen_info.series_classes:
                print(f" - Class ID: {cls.class_id} | Ticker: {cls.ticker_symbol}")
                
    except Exception as e:
        print(f"‚ùå Error: {e}")
        import traceback
        traceback.print_exc()

# Run the test
test_series_extraction("VOO")

--- Testing Series ID Extraction for VOO ---
Accessing filing date: 2025-09-30
S000012757

=== EXTRACTION RESULTS ===
Ticker:         VOO
CIK:            36405
Series ID:      S000012757  <-- THIS IS YOUR KEY
Series Name:    VANGUARD MID-CAP VALUE INDEX FUND


In [None]:
from edgar import Company, set_identity
import pandas as pd
from typing import List, Dict
import sys
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
from threading import Lock
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from src.simple_rag.extraction.nport import NPortProcessor
from src.simple_rag.models.fund import PortfolioHolding, Derivatives, NonDerivatives
from pathlib import Path

company_json_path = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/sec_data/company_tickers.json")

set_identity('luis.alvarez.conde@alumnos.upm.es')

def process_single_filing(filing, ticker, company_json_path):
    """Process a single filing - can be parallelized"""
    try:
        xml_data = filing.obj()
        fund_name = xml_data.get_fund_series().name
        reporting_period = xml_data.reporting_period
        portfolio_list = xml_data.investments
        derivatives = xml_data.derivatives
        series_id = xml_data.series_id

        # Process holdings
        proc = NPortProcessor(company_tickers_json_path=company_json_path, min_similarity=0.74)
        holdings = proc.process_holdings(portfolio_list)
        result = proc.enrich_tickers(holdings, verbose=False)  # Set verbose=False to reduce I/O
        
        not_matches = result[result['matched_ticker'].isna() | (result['matched_ticker'] == '')]
        
        return {
            'fund_name': fund_name,
            'reporting_period': reporting_period,
            'holdings': holdings,
            'result': result,
            'derivatives': derivatives,
            'not_matches': not_matches,
            'ticker': ticker,
            'series_id': series_id,
            'report_date': filing.report_date
        }
    except Exception as e:
        print(f"Error processing filing for {ticker}: {e}")
        return None

def process_ticker(ticker, company_json_path):
    """Process all filings for a single ticker - SEQUENTIAL within ticker"""
    try:
        nport_file = Company(ticker)
        filings = sorted(nport_file.get_filings(form="NPORT-P"), 
                        key=lambda x: x.report_date, reverse=True)
        
        if not filings:
            print(f"No filings found for {ticker}")
            return None
            
        print(f"Processing ticker: {ticker}, most recent filing date: {filings[0].report_date}")
        
        funds_processed_set = set()
        ticker_results = []
        
        # Process filings SEQUENTIALLY for this ticker (to respect the stop condition)
        for filing in filings:
            result = process_single_filing(filing, ticker, company_json_path)
            
            if result is not None:
                # Check if we've already processed this fund
                if result['fund_name'].lower() in funds_processed_set:
                    print(f"Stopping - already processed fund: {result['fund_name']}")
                    break
                
                funds_processed_set.add(result['fund_name'].lower())
                ticker_results.append(result)
                
                print(f"{ticker} - Fund: {result['fund_name']}, Holdings: {len(result['holdings'])}, Unmatched: {len(result['not_matches'])}")
        
        return {
            'ticker': ticker,
            'results': ticker_results,
            'funds_processed': list(funds_processed_set)
        }
    
    except Exception as e:
        print(f"Error processing ticker {ticker}: {e}")
        return None

# Main execution - PARALLEL across tickers only
tickers = ["VOO", "MGK", "HEZU", "VMGRX", "VDIGX"]

# Use fewer workers to avoid overwhelming the system
max_workers = min(5, len(tickers))  # Start with 3 workers
print(f"Using {max_workers} workers for tickers")

all_results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_ticker = {
        executor.submit(process_ticker, ticker, company_json_path): ticker 
        for ticker in tickers
    }
    
    for future in tqdm(as_completed(future_to_ticker), total=len(tickers), desc="Processing tickers"):
        ticker = future_to_ticker[future]
        try:
            result = future.result()
            if result:
                all_results.append(result)
                print(f"\nCompleted {ticker}: {len(result['funds_processed'])} funds processed")
        except Exception as e:
            print(f"Error with ticker {ticker}: {e}")

# Update funds_total object with the results
print("\n=== Updating funds_total ===")

processor = NPortProcessor()
for ticker_result in all_results:
    for filing_result in ticker_result['results']:
        ticker = filing_result['ticker']
        reporting_period = filing_result['reporting_period']
        holdings = filing_result['holdings']
        derivatives = filing_result['derivatives']
        series_id = filing_result['series_id']
        # Update your funds_total structure
        for fund in funds_total:
            if ticker == fund.ticker():
                print(f"Updating fund: {fund.name}")
                df = processor.to_df(holdings)

                fund.non_derivatives = NonDerivatives(
                    date=reporting_period,
                    holdings_df=df
                )
                df2 = processor.to_df(derivatives)
                fund.derivatives = Derivatives(
                    date=reporting_period,
                    derivatives_df=df2
                )
                fund.series_id = series_id
                break

print("\n=== Processing Complete ===")
print(f"Total tickers processed: {len(all_results)}")
for result in all_results:
    print(f"{result['ticker']}: {len(result['funds_processed'])} funds")



def verify_fund_data_integrity(funds_list):
    """
    Iterates through funds to verify that the holdings DataFrame is populated 
    and counts how many are None or missing.
    """
    print("\n" + "="*40)
    print("DATA INTEGRITY VERIFICATION")
    print("="*40)
    
    none_count = 0
    valid_count = 0
    
    for fund in funds_list:
        # We need to check layers: 
        # 1. Does fund.non_derivatives exist? 
        # 2. Is it not None?
        # 3. Is the holdings_df inside it not None?
        
        has_data = False
        row_count = 0
        
        try:
            if (hasattr(fund, 'non_derivatives') and 
                fund.non_derivatives is not None and 
                fund.non_derivatives.holdings_df is not None):
                
                has_data = True
                row_count = len(fund.non_derivatives.holdings_df)
        except Exception:
            # If any attribute access fails, treat as no data
            has_data = False

        if has_data:
            valid_count += 1
            # Optional: Print success if you want to see the good ones
            # print(f"‚úì {fund.name:<20} | Rows: {row_count}")
        else:
            none_count += 1
            print(f"‚ùå {fund.name:<20} | Status: DATAFRAME IS NONE/MISSING")

    print("-" * 40)
    print(f"Total Funds Checked: {len(funds_list)}")
    print(f"Valid DataFrames:    {valid_count}")
    print(f"None/Missing Values: {none_count}")
    print("=" * 40)

# Run the verification
# Note: Ensure 'funds_total' is defined in your scope (it was referenced in your loop)
if 'funds_total' in locals():
    verify_fund_data_integrity(funds_total)
else:
    print("Error: 'funds_total' variable not found in current scope.")

Using 5 workers for tickers


Processing tickers:   0%|          | 0/5 [00:00<?, ?it/s]

Processing ticker: VOO, most recent filing date: 2025-09-30
Error processing filing for VOO: 'FundReport' object has no attribute 'series_id'
Processing ticker: MGK, most recent filing date: 2025-09-30
Error processing filing for MGK: 'FundReport' object has no attribute 'series_id'
Processing ticker: VMGRX, most recent filing date: 2025-10-31
Processing ticker: VDIGX, most recent filing date: 2025-10-31
Error processing filing for VMGRX: 'FundReport' object has no attribute 'series_id'
Error processing filing for VDIGX: 'FundReport' object has no attribute 'series_id'
Error processing filing for VDIGX: 'FundReport' object has no attribute 'series_id'
Processing ticker: HEZU, most recent filing date: 2025-10-31
Error processing filing for VDIGX: 'FundReport' object has no attribute 'series_id'
Error processing filing for MGK: 'FundReport' object has no attribute 'series_id'
Error processing filing for VDIGX: 'FundReport' object has no attribute 'series_id'
Error processing filing for M

Processing tickers:  20%|‚ñà‚ñà        | 1/5 [10:02<40:10, 602.60s/it]

Error processing filing for VDIGX: 'FundReport' object has no attribute 'series_id'

Completed VDIGX: 0 funds processed
Error processing filing for VMGRX: 'FundReport' object has no attribute 'series_id'
Error processing filing for VOO: 'FundReport' object has no attribute 'series_id'
Error processing filing for VMGRX: 'FundReport' object has no attribute 'series_id'
Error processing filing for MGK: 'FundReport' object has no attribute 'series_id'
Error processing filing for MGK: 'FundReport' object has no attribute 'series_id'
Error processing filing for VMGRX: 'FundReport' object has no attribute 'series_id'
Error processing filing for MGK: 'FundReport' object has no attribute 'series_id'
Error processing filing for VMGRX: 'FundReport' object has no attribute 'series_id'
Error processing filing for VMGRX: 'FundReport' object has no attribute 'series_id'
Error processing filing for MGK: 'FundReport' object has no attribute 'series_id'
Error processing filing for VOO: 'FundReport' obje

In [65]:
import pickle
from pathlib import Path

PKL_PATH = Path("./funds_backup.pkl")
TMP_PATH = PKL_PATH.with_suffix(PKL_PATH.suffix + ".tmp")

with TMP_PATH.open("wb") as f:
    pickle.dump(funds_total, f, protocol=pickle.HIGHEST_PROTOCOL)

TMP_PATH.replace(PKL_PATH)

print(f"Saved {len(funds_total)} funds to pickle file: {PKL_PATH.resolve()}")
print(f"File size: {PKL_PATH.stat().st_size / (1024 * 1024):.2f} MB")

Saved 420 funds to pickle file: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
File size: 48.96 MB


In [10]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/src")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Loaded 420 funds from pickle file


## Processing Phase

### General information about the fund

In [30]:

for fund in funds_total:
    if 'Vanguard' in fund.name:
        fund.provider = 'The Vanguard Group, Inc'
    else:
        fund.provider = 'BlackRock, Inc'

In [31]:
registrants = set()
for fund in funds_total:
    if '\xa0' in fund.registrant:
        fund.registrant = fund.registrant.replace('\xa0', ' ')
    registrants.add(fund.registrant)

print(registrants)

{'Vanguard Specialized Funds', 'Vanguard Index Funds', 'Vanguard World Fund', 'iShares Trust', 'Vanguard Whitehall Funds'}


In [32]:
from src.simple_rag.models.fund import ShareClassType

unique_share_classes = set(fund.share_class for fund in funds_total)
print("Unique Share Classes:")
for share_class in unique_share_classes:
    print(share_class)

from collections import Counter

# Count funds by share class
share_counts = Counter()
for fund in funds_total:
    if fund.share_class:
        if fund.share_class == ShareClassType.OTHER:
            fund.share_class = ShareClassType.ETF
        share_counts[fund.share_class] += 1

print("Funds by share class:")
for share_type, count in share_counts.items():
    print(f"  {share_type.value}: {count} funds")



Unique Share Classes:
ShareClassType.ETF
ShareClassType.INVESTOR
ShareClassType.ADMIRAL
ShareClassType.OTHER
ShareClassType.INSTITUTIONAL_SELECT
ShareClassType.INSTITUTIONAL_PLUS
ShareClassType.INSTITUTIONAL
Funds by share class:
  Investor Shares: 27 funds
  ETF Shares: 330 funds
  Admiral Shares: 40 funds
  Institutional Shares: 16 funds
  Institutional Plus Shares: 4 funds
  Institutional Select Shares: 3 funds


In [33]:
from collections import Counter
import unicodedata

def normalize_name(text):
    if not text:
        return "Unknown"
    # NFKD normalization turns non-breaking spaces into normal spaces
    return unicodedata.normalize("NFKD", text).strip()

# Your original loop, but with the fix applied:
share_counts = Counter()

for fund in funds_total:
    if fund.registrant:
        # Apply the fix here
        clean_registrant = normalize_name(fund.registrant)
        share_counts[clean_registrant] += 1

print("Funds by share class (Merged):")
for share_type, count in share_counts.items():
    print(f"  {share_type}: {count} funds")

Funds by share class (Merged):
  Vanguard Index Funds: 52 funds
  Vanguard World Fund: 41 funds
  Vanguard Specialized Funds: 14 funds
  Vanguard Whitehall Funds: 18 funds
  iShares Trust: 295 funds


In [34]:
from collections import Counter
import unicodedata

def normalize_name(text):
    if not text:
        return "Unknown"
    # NFKD normalization turns non-breaking spaces into normal spaces
    return unicodedata.normalize("NFKD", text).strip()

fund_counts = Counter()

for fund in funds_total:
    if fund.name:
        # 1. Normalize the name first
        clean_name = normalize_name(fund.name)
        
        # 2. Check if it starts with iShares or Vanguard
        # We check case-sensitively, but you could use .lower() if needed
        if not (clean_name.startswith("iShares") or clean_name.startswith("Vanguard")):
            # 3. Add 'Vanguard' if missing
            clean_name = f"Vanguard {clean_name}"
            
        # 4. Count the final modified name
        fund_counts[clean_name] += 1

print("Funds by name (Merged & Fixed):")
for fund_name, count in fund_counts.items():
    print(f"  {fund_name}: {count} funds")



Funds by name (Merged & Fixed):
  Vanguard Extended Market Index Fund: 6 funds
  Vanguard Mid-Cap Index Fund: 5 funds
  Vanguard Mid-Cap Growth Index Fund: 3 funds
  Vanguard Mid-Cap Value Index Fund: 3 funds
  Vanguard Small-Cap Index Fund: 5 funds
  Vanguard Small-Cap Growth Index Fund: 4 funds
  Vanguard Small-Cap Value Index Fund: 4 funds
  Vanguard Total Stock Market Index Fund: 6 funds
  Vanguard 500 Index Fund: 4 funds
  Vanguard Value Index Fund: 4 funds
  Vanguard Growth Index Fund: 4 funds
  Vanguard Large-Cap Index Fund: 4 funds
  Vanguard Mega Cap Growth Index Fund: 2 funds
  Vanguard Extended Duration Treasury Index Fund: 2 funds
  Vanguard ESG U.S. Stock ETF: 1 funds
  Vanguard ESG International Stock ETF: 1 funds
  Vanguard Global WellingtonTM Fund: 2 funds
  Vanguard Global Wellesley¬Æ Income Fund: 2 funds
  Vanguard ESG U.S. Corporate Bond ETF: 1 funds
  Vanguard FTSE Social Index Fund: 2 funds
  Vanguard Materials Index Fund: 2 funds
  Vanguard Communication Services 

In [35]:
sum = 0
for fund in funds_total:
    # Format correctly the numeric fields
    fund.net_assets *= 1e6
    fund.advisory_fees *= 1e3
   

In [36]:
from src.simple_rag.utils.chart_utils import extract_flexible_performance
from src.simple_rag.models.fund import AverageReturnSnapshot
for fund in funds_total:
    
    year = fund.report_date.year
    if fund.avg_annual_returns is None:
        continue 
    result = extract_flexible_performance(fund.avg_annual_returns)
    
    # Create AverageReturnSnapshot instance
    snapshot = AverageReturnSnapshot(
        return_1y=result.get('1_year', None),
        return_5y=result.get('5_year', None),
        return_10y=result.get('10_year', None),
        return_inception=result.get('since_inception', None)
    )
    
    fund.performance[str(year)] = snapshot 
    

In [37]:
for fund in funds_total:
    if fund.financial_highlights is not None:
        print(fund.financial_highlights)
        for _, values in fund.financial_highlights.items():
            values.net_assets = values.net_assets * 1e6

{'2024': FinancialHighlights(turnover=11.0, expense_ratio=0.19, total_return=16.76, net_assets=195.0, net_assets_value_begining=124.78, net_assets_value_end=144.2, net_income_ratio=1.09), '2023': FinancialHighlights(turnover=11.0, expense_ratio=0.19, total_return=25.22, net_assets=232.0, net_assets_value_begining=100.93, net_assets_value_end=124.78, net_income_ratio=1.28), '2022': FinancialHighlights(turnover=11.0, expense_ratio=0.19, total_return=-26.56, net_assets=229.0, net_assets_value_begining=138.8, net_assets_value_end=100.93, net_income_ratio=1.14), '2021': FinancialHighlights(turnover=19.0, expense_ratio=0.19, total_return=12.31, net_assets=399.0, net_assets_value_begining=124.83, net_assets_value_end=138.8, net_income_ratio=0.87), '2020': FinancialHighlights(turnover=19.0, expense_ratio=0.19, total_return=32.04, net_assets=454.0, net_assets_value_begining=95.66, net_assets_value_end=124.83, net_income_ratio=1.04)}
{'2024': FinancialHighlights(turnover=11.0, expense_ratio=0.06

In [38]:
managers_set = set()
for fund in funds_total:
    if hasattr(fund, 'managers') and fund.managers is not None:
        managers_set.update(fund.managers)
for manager in sorted(managers_set):
    print(manager)

Aaron Choi
Asian Economic Risk
Aur√©lie Denis
Brett Barakett
Chris Nieves
Christopher Chung
Consumer Goods
Erin Armstrong
Gary Robinson
Investment Manager
Jake Riley
James Mauro
Jena Stenger
Jennifer Hsui
Jonathan Graves
Kenny Narzikul
Lawrence Burns
Managing Director
Matt Waldron
Michael Cling
Michelle Louie
Nataliya Kofman
Natasha Kuhlkin
Nick Birkett
Peter Sietsema
Senior Managing Director
Simon Webber
Steven White
Suzanne Ly
Thomas Coutts
Tom Slater
Walter Nejman


Enhance the summary prospectus 

In [63]:
def update_summary_prospectus_with_header(fund):
    """Remove all existing headers and add new header."""
    
    header = f"# FUND PROFILE({fund.ticker} - {fund.name} - {fund.share_class if fund.share_class else 'N/A'}):"
    
    if not fund.summary_prospectus or not fund.summary_prospectus.strip():
        return header
    
    # Remove all header lines and keep content
    lines = fund.summary_prospectus.split('\n')
    content_lines = [line for line in lines if not line.strip().startswith("# FUND PROFILE(")]
    
    # Add new header at beginning with proper spacing
    if content_lines:
        return f"{header}\n" + '\n'.join(content_lines)
    else:
        return header

# Update all funds
for i, fund in enumerate(funds_total):
    fund.summary_prospectus = update_summary_prospectus_with_header(fund)
    print(f"‚úÖ {i+1}/{len(funds_total)}: Updated {fund.ticker}")

‚úÖ 1/420: Updated VEXMX
‚úÖ 2/420: Updated VXF
‚úÖ 3/420: Updated VEXAX
‚úÖ 4/420: Updated VIEIX
‚úÖ 5/420: Updated VEMPX
‚úÖ 6/420: Updated VSEMX
‚úÖ 7/420: Updated VIMSX
‚úÖ 8/420: Updated VO
‚úÖ 9/420: Updated VIMAX
‚úÖ 10/420: Updated VMCIX
‚úÖ 11/420: Updated VMCPX
‚úÖ 12/420: Updated VMGIX
‚úÖ 13/420: Updated VOT
‚úÖ 14/420: Updated VMGMX
‚úÖ 15/420: Updated VMVIX
‚úÖ 16/420: Updated VOE
‚úÖ 17/420: Updated VMVAX
‚úÖ 18/420: Updated NAESX
‚úÖ 19/420: Updated VB
‚úÖ 20/420: Updated VSMAX
‚úÖ 21/420: Updated VSCIX
‚úÖ 22/420: Updated VSCPX
‚úÖ 23/420: Updated VISGX
‚úÖ 24/420: Updated VBK
‚úÖ 25/420: Updated VSGAX
‚úÖ 26/420: Updated VSGIX
‚úÖ 27/420: Updated VISVX
‚úÖ 28/420: Updated VBR
‚úÖ 29/420: Updated VSIAX
‚úÖ 30/420: Updated VSIIX
‚úÖ 31/420: Updated VTSMX
‚úÖ 32/420: Updated VTI
‚úÖ 33/420: Updated VTSAX
‚úÖ 34/420: Updated VITSX
‚úÖ 35/420: Updated VSMPX
‚úÖ 36/420: Updated VSTSX
‚úÖ 37/420: Updated VFINX
‚úÖ 38/420: Updated VOO
‚úÖ 39/420: Updated VFIAX
‚úÖ 40/420: Upd

In [64]:
# Check a few funds to see the result
for fund in funds_total[:3]:
    print(f"\n{fund.ticker}:")
    print(f"First 100 chars: ...")
    print(fund.summary_prospectus)
    print(f"Has header: {'# FUND PROFILE(' in fund.summary_prospectus}")


VEXMX:
First 100 chars: ...
# FUND PROFILE(VEXMX - Vanguard Extended Market Index Fund - Investor Shares):

## Investment Objective
The Fund seeks to track the performance of a benchmark index that measuresthe investment return of small- and mid-capitalization stocks.

## Principal Investment Strategies
The Fund employs an indexing investment approach designed to track theperformance of the S& P Completion Index (the Index), a broadly diversified indexof stocks of small and mid-size U. S. companies. The Index contains all of theU. S. common stocks regularly traded on the New York Stock Exchange, Cboe, and the Nasdaq over-the-counter market, except those stocks included in theS& P 500 Index. The Fund invests by sampling the Index, meaning that it holds abroadly diversified collection of securities that, in the aggregate, approximatesthe full Index in terms of key characteristics. These characteristics includeindustry weightings and market capitalization, as well as certain financialmea

### Geographic Allocation

In [11]:

for fund in funds_total:
    df = fund.geographic_allocation
    if df is not None:
        df = df.iloc[:,[0,1]]
        df.columns = ['Country', 'Percentage']
        df.iloc[:, 0] = df.iloc[:, 0].astype(str).str.replace('\u200b', '', regex=True).str.strip()
        df.iloc[:, 0] = df.iloc[:, 0].replace(['', 'nan', 'None', 'Other#', 'Country/Geographic Region'], 'Other')
        df.iloc[:, 1] = pd.to_numeric(df.iloc[:, 1], errors='coerce')
        df = df[df.iloc[:, 1].notna()]
        
        fund.geographic_allocation = df
        
countries = set()
for fund in funds_total:
    df = fund.geographic_allocation
    if df is not None:
        # Get unique values from the first column
        unique_values = df.iloc[:, 0].unique()
        countries.update(unique_values)
print(f"Unique countries: {len(countries)}")
print("Countries:", sorted(countries))
# Verify data types
for fund in funds_total:  # Check first fund
    if fund.geographic_allocation is not None:
        print(f"\nSample cleaned data for {fund.ticker}:")
        print(f"Shape: {fund.geographic_allocation.shape}")
        print(f"Data types: {fund.geographic_allocation.dtypes}")
        print(f"Sample rows:")
        print(fund.geographic_allocation)
        break


Unique countries: 42
Countries: ['Australia', 'Belgium', 'Brazil', 'Canada', 'Chile', 'China', 'Colombia', 'Denmark', 'Finland', 'France', 'Germany', 'Hong Kong', 'India', 'Indonesia', 'Israel', 'Italy', 'Japan', 'Malaysia', 'Mexico', 'Netherlands', 'New Zealand', 'Norway', 'Other', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Saudi Arabia', 'Singapore', 'South Africa', 'South Korea', 'Spain', 'Supranational', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'United Arab Emirates', 'United Kingdom', 'United States']

Sample cleaned data for ERET:
Shape: (10, 2)
Data types: Country       object
Percentage    object
dtype: object
Sample rows:
           Country Percentage
1    United States       62.5
2            Japan        9.7
3        Australia        5.1
4   United Kingdom        4.4
5        Singapore        3.0
6        Hong Kong        2.7
7           Sweden        2.2
8          Germany        2.2
9           France        2.0
10          Canada        2.0


### Sector Allocation


In [12]:
from src.simple_rag.utils.chart_utils import validate_and_clean_allocation

for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        df = validate_and_clean_allocation(df, 'Sector', sort_by_value=True)
        fund.sector_allocation = df


VALIDATION REPORT
Category Column: 'Sector' | Value Column: 'Percent of Total Investments(a)'
‚ùå Row  0: 'Portfolio Composition % of Net' | 'nan     ' | Numeric: True | Range OK: False
‚úÖ Row  1: 'Communication Services        ' | '4.3%    ' | Numeric: True | Range OK: True
‚úÖ Row  2: 'Consumer Discretionary        ' | '12.0%   ' | Numeric: True | Range OK: True
‚úÖ Row  3: 'Consumer Staples              ' | '3.0%    ' | Numeric: True | Range OK: True
‚úÖ Row  4: 'Energy                        ' | '4.1%    ' | Numeric: True | Range OK: True
‚úÖ Row  5: 'Financials                    ' | '18.0%   ' | Numeric: True | Range OK: True
‚úÖ Row  6: 'Health Care                   ' | '11.4%   ' | Numeric: True | Range OK: True
‚úÖ Row  7: 'Industrials                   ' | '17.4%   ' | Numeric: True | Range OK: True
‚úÖ Row  8: 'Information Technology        ' | '17.9%   ' | Numeric: True | Range OK: True
‚úÖ Row  9: 'Materials                     ' | '4.7%    ' | Numeric: True | Range OK: 

In [13]:
import pandas as pd
import re

# 1. Get all sectors from your funds
sectors = set()
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        sectors.update(df.iloc[:, 0].unique())

# 2. The Cleaning Function
def get_standard_key(text):
    if not isinstance(text, str):
        return text
        
    # A. Convert to lowercase
    clean = text.lower()
    
    # B. Remove the specific word "sector"
    clean = re.sub(r'\bsector\b', '', clean)
    
    # C. Remove punctuation
    clean = re.sub(r'[^a-z0-9\s]', '', clean)
    
    # D. Remove extra whitespace
    clean = re.sub(r'\s+', ' ', clean).strip()
    
    return clean

# 3. Create mapping DataFrame
sectors_list = list(sectors)
df_mapping = pd.DataFrame({'Original': sectors_list})
df_mapping['Clean_Key'] = df_mapping['Original'].apply(get_standard_key)

# 4. Create canonical mapping (pick shortest name as standard)
canonical_map = df_mapping.groupby('Clean_Key')['Original'].transform(lambda x: sorted(x, key=len)[0])
df_mapping['Standardized'] = canonical_map

# 5. Create the mapping dictionary
standardization_dict = dict(zip(df_mapping['Original'], df_mapping['Standardized']))

# 6. Review the mapping
print("Standardization Mapping:")
print(df_mapping[['Original', 'Standardized']].drop_duplicates().sort_values('Standardized'))

# 7. Apply standardization to all fund DataFrames
updated_funds = 0
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        # Make a copy to avoid SettingWithCopyWarning
        df_copy = df.copy()
        
        # Apply standardization to first column
        original_values = df_copy.iloc[:, 0].copy()
        standardized_values = original_values.map(standardization_dict)
        df_copy.iloc[:, 0] = standardized_values
        df_copy.iloc[:, 0] = (
            df_copy.iloc[:, 0]
            .str.replace(r'\bSector\b', '', regex=True)  # Remove multiple words
            .str.replace(r'\s+', ' ', regex=True)         # Fix extra spaces
            .str.strip()                                 # Remove leading/trailing spaces
        )
        # Update the fund's DataFrame
        fund.sector_allocation = df_copy
        updated_funds += 1

print(f"\n‚úÖ Updated {updated_funds} funds with standardized sector names")

# 8. Verify the results
final_sectors = set()
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        final_sectors.update(df.iloc[:, 0].unique())

print(f"\nBefore standardization: {len(sectors)} unique sectors")
print(f"After standardization: {len(final_sectors)} unique sectors")
print(f"Reduction: {len(sectors) - len(final_sectors)} duplicates removed")

# 9. Show some examples of the changes
print(f"\nExamples of standardization:")
for original, standardized in standardization_dict.items():
    if original != standardized:
        print(f"  '{original}' ‚Üí '{standardized}'")

Standardization Mapping:
                                Original                         Standardized
140                  Aerospace & Defense                  Aerospace & Defense
136             Aerospace Defense Sector                  Aerospace & Defense
70                                Africa                               Africa
97    Agricultural Farm Machinery Sector   Agricultural Farm Machinery Sector
117         Air Freight Logistics Sector              Air Freight & Logistics
..                                   ...                                  ...
141                            Utilities                            Utilities
67   Vanguard Real Estate II Index Fund1  Vanguard Real Estate II Index Fund1
124                      Water Utilities                      Water Utilities
64                Water Utilities Sector                      Water Utilities
111  Wireless Telecommunication Services  Wireless Telecommunication Services

[161 rows x 2 columns]

‚úÖ Updated 24

In [62]:
# Get unique values from first column (index 0) of all sector allocation DataFrames
sectors = set()
for fund in funds_total:
    df = fund.sector_allocation
    if df is not None:
        # Get unique values from first column
        unique_values = df.iloc[:, 0].unique()
        sectors.update(unique_values)

print(f"Unique sectors: {len(sectors)}")
print("Sectors:", sorted(sectors))

Unique sectors: 120
Sectors: ['Aerospace & Defense', 'Africa', 'Agricultural Farm Machinery', 'Air Freight & Logistics', 'Asia', 'Automobile Components', 'Automobile Manufacturers', 'Automobiles', 'Banks', 'Basic Materials', 'Beverages', 'Biotechnology', 'Broadline Retail', 'Building Products', 'Capital Markets', 'Chemicals', 'Coal & Consumable Fuels', 'Commercial Services & Supplies', 'Communication Services', 'Communications', 'Communications Equipment', 'Construction & Engineering', 'Construction Machinery Heavy Transportation Equipment', 'Construction Materials', 'Consumer Discretionary', 'Consumer Finance', 'Consumer Staples', 'Consumer Staples Distribution & Retail', 'Containers & Packaging', 'Data Center REITs', 'Distributors', 'Diversified Consumer Services', 'Diversified Telecommunication Services', 'Domestic Equity', 'Domestic Fixed Income', 'Electric Utilities', 'Electrical Components Equipment', 'Electrical Equipment', 'Electronic Components', 'Electronic Equipment, Instrum

### Industry Allocation

In [64]:
from src.simple_rag.utils.chart_utils import validate_and_clean_allocation

for fund in funds_total:
    df = fund.industry_allocation
    if df is not None:
        df = validate_and_clean_allocation(df, 'Industry', sort_by_value=True)
        fund.industry_allocation = df


üîç Detected header row at index 0
New columns: ['Industry', 'Percent of Total  Investments(a)']
VALIDATION REPORT
Category Column: 'Industry' | Value Column: 'Percent of Total  Investments(a)'
‚úÖ Row  0: 'Retail REITs                  ' | '18.0    ' | Numeric: True | Range OK: True
‚úÖ Row  1: 'Industrial REITs              ' | '13.0    ' | Numeric: True | Range OK: True
‚úÖ Row  2: 'Health Care REITs             ' | '9.9     ' | Numeric: True | Range OK: True
‚úÖ Row  3: 'Multi-Family Residential REITs' | '9.4     ' | Numeric: True | Range OK: True
‚úÖ Row  4: 'Real Estate Operating Companie' | '8.0     ' | Numeric: True | Range OK: True
‚úÖ Row  5: 'Office REITs                  ' | '6.7     ' | Numeric: True | Range OK: True
‚úÖ Row  6: 'Data Center REITs             ' | '6.6     ' | Numeric: True | Range OK: True
‚úÖ Row  7: 'Diversified REITs             ' | '6.5     ' | Numeric: True | Range OK: True
‚úÖ Row  8: 'Self Storage REITs            ' | '6.3     ' | Numeric: True | R

In [65]:
# Get unique values from first column (index 0) of all sector allocation DataFrames
sectors = set()
for fund in funds_total:
    df = fund.industry_allocation
    if df is not None:
        # Get unique values from first column
        unique_values = df.iloc[:, 0].unique()
        sectors.update(unique_values)

print(f"Unique sectors: {len(sectors)}")
print("Sectors:", sorted(sectors))

Unique sectors: 128
Sectors: ['Aerospace & Defense', 'Alternative Carriers', 'Automobile Components', 'Automobiles', 'Automobiles & Components', 'Banks', 'Biotechnology', 'Brewers', 'Building Products', 'Cable & Satellite', 'Capital Goods', 'Capital Markets', 'Chemicals', 'Coal & Consumable Fuels', 'Commercial & Professional Services', 'Commodity Chemicals', 'Communications Equipment', 'Construction & Engineering', 'Construction Machinery & Heavy Transportation Equipment', 'Construction Materials', 'Consumer Discretionary Distribution & Retail', 'Consumer Durables & Apparel', 'Consumer Finance', 'Consumer Services', 'Consumer Staples Merchandise Retail', 'Containers & Packaging', 'Copper', 'Data Center REITs', 'Distillers & Vintners', 'Diversified Banks', 'Diversified Metals & Mining', 'Diversified REITs', 'Diversified Real Estate Activities', 'Diversified Telecommunication Services', 'Electric Utilities', 'Electrical Components & Equipment', 'Electrical Equipment', 'Electronic Equipme

In [14]:
import pickle
from pathlib import Path

PKL_PATH = Path("./funds_backup.pkl")
TMP_PATH = PKL_PATH.with_suffix(PKL_PATH.suffix + ".tmp")

with TMP_PATH.open("wb") as f:
    pickle.dump(funds_total, f, protocol=pickle.HIGHEST_PROTOCOL)

TMP_PATH.replace(PKL_PATH)

print(f"Saved {len(funds_total)} funds to pickle file: {PKL_PATH.resolve()}")
print(f"File size: {PKL_PATH.stat().st_size / (1024 * 1024):.2f} MB")

Saved 420 funds to pickle file: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
File size: 74.73 MB
