## Setting up the Neo4j database

In [36]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell executes, so edits to the project's .py files are picked up
# without restarting the kernel.
# Note: `%load_ext` only prints a notice if the extension is already loaded.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
import pickle
from pathlib import Path
from dataclasses import is_dataclass, asdict
import pandas as pd
import sys
from pathlib import Path
# Make the project's `src` directory importable so the `simple_rag` package
# resolves (pickle also needs the fund classes importable to rebuild the
# stored objects).
# The directory is derived from the notebook's working directory
# (<project>/RAG/notebooks) instead of a hard-coded home-directory path, so
# the notebook runs on any checkout. PKL_PATH below relies on the same
# working directory.
RAG_DIR = Path.cwd().parent / "src"
if not RAG_DIR.is_dir():
    # Fail early with a clear message when the notebook is started elsewhere.
    raise FileNotFoundError(
        f"Expected project sources at {RAG_DIR}; run this notebook from RAG/notebooks"
    )
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))


PKL_PATH = Path("./funds_backup.pkl")
print("Current working directory:", Path.cwd())
print("PKL_PATH resolves to:", PKL_PATH.resolve())
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load backups that were produced by this project.
with PKL_PATH.open("rb") as f:
    funds_total = pickle.load(f)

print(f"Loaded {len(funds_total)} funds from pickle file")

Current working directory: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks
PKL_PATH resolves to: /home/alvar/CascadeProjects/windsurf-project/RAG/notebooks/funds_backup.pkl
Loaded 420 funds from pickle file


Set up the Neo4j instance using docker-compose

In [46]:
RAG_DIR = Path("/home/alvar/CascadeProjects/windsurf-project/RAG/")
if str(RAG_DIR) not in sys.path:
    sys.path.insert(0, str(RAG_DIR))
%reload_ext autoreload
from simple_rag.database.neo4j.neo4j import Neo4jDatabase

neo = Neo4jDatabase(auto_start=True)
# Reset the entire database
neo.reset_database()

neo.create_constraints()


✅ Database reset complete
🚧 Setting up constraints...
✅ Constraints and Indexes configured.


### Create the Providers

In [47]:
# Fund providers (asset managers) that must exist in Neo4j before trusts and
# funds are attached to them.
provider = ['The Vanguard Group, Inc', 'BlackRock, Inc']

for provider_name in provider:
    neo.get_or_create_provider(provider_name)

[{'p': {'name': 'The Vanguard Group, Inc'}}]
[{'p': {'name': 'BlackRock, Inc'}}]


In [48]:
# Replace non-breaking spaces in registrant names with regular spaces so the
# same registrant is not counted twice under two spellings.
for fund in funds_total:
    if '\xa0' in fund.registrant:
        fund.registrant = fund.registrant.replace('\xa0', ' ')

# Distinct registrant (trust) names across all loaded funds.
registrants = {fund.registrant for fund in funds_total}
print(registrants)

# Provider name -> list of registrants that belong to it.
trusts = {
    'The Vanguard Group, Inc': [],
    'BlackRock, Inc': []
}

# Any registrant mentioning "Vanguard" is filed under Vanguard; every other
# registrant falls through to BlackRock.
for registrant in registrants:
    owner = 'The Vanguard Group, Inc' if 'Vanguard' in registrant else 'BlackRock, Inc'
    trusts[owner].append(registrant)
print(trusts)

{'Vanguard Whitehall Funds', 'Vanguard World Fund', 'iShares Trust', 'Vanguard Index Funds', 'Vanguard Specialized Funds'}
{'The Vanguard Group, Inc': ['Vanguard Whitehall Funds', 'Vanguard World Fund', 'Vanguard Index Funds', 'Vanguard Specialized Funds'], 'BlackRock, Inc': ['iShares Trust']}


### Create the Trusts

In [49]:

# Create every trust under its provider.
# The loop variables are named for what they hold. The inner variable used to
# be called `provider`, which overwrote the provider list defined earlier and
# mislabelled a trust name.
for provider_name, trust_names in trusts.items():
    for trust_name in trust_names:
        neo.get_or_create_trust(provider_name, [trust_name])
        # Report only after the call returned, so a failure is not logged as "Created".
        print("Created: ", provider_name, " ->", trust_name)

Created:  The Vanguard Group, Inc  -> Vanguard Whitehall Funds
Created:  The Vanguard Group, Inc  -> Vanguard World Fund
Created:  The Vanguard Group, Inc  -> Vanguard Index Funds
Created:  The Vanguard Group, Inc  -> Vanguard Specialized Funds
Created:  BlackRock, Inc  -> iShares Trust


### Create the Share classes

In [50]:
# Static catalogue of the share classes used by the loaded funds.
# Each entry carries:
#   name        - display name of the share class
#   clean_name  - key used to look up the description (identical to `name` here)
#   description - human-readable explanation stored alongside the share class
# Typographic apostrophes (’) and en dashes (–) are written as real Unicode
# characters so the descriptions are stored without garbled byte sequences.
share_classes_data = [
    {
        "name": "Admiral Shares",
        "clean_name": "Admiral Shares",
        "description": (
            "Admiral Shares are Vanguard’s main retail mutual fund share class and typically "
            "have lower expense ratios than legacy Investor Shares. They are intended for "
            "long-term individual investors and usually require a few thousand dollars as a "
            "minimum investment (often around $3,000, though it varies by fund)."
        ),
    },
    {
        "name": "Investor Shares",
        "clean_name": "Investor Shares",
        "description": (
            "Investor Shares are Vanguard’s legacy entry-level mutual fund share class. They "
            "generally have higher expense ratios than Admiral Shares and historically had "
            "lower minimum investments (often around $1,000–$3,000, depending on the fund). "
            "In many cases they are closed to new investors and may be automatically converted "
            "to Admiral Shares once the account balance meets the Admiral minimum."
        ),
    },
    {
        "name": "ETF Shares",
        "clean_name": "ETF Shares",
        "description": (
            "ETF (Exchange-Traded Fund) Shares trade on stock exchanges throughout the day like "
            "a stock. They generally provide the same portfolio exposure as a corresponding "
            "mutual fund share class but offer intraday liquidity. The minimum investment is "
            "the market price of one share (often tens to a few hundred dollars). ETFs are "
            "often more tax-efficient than mutual funds due to the in-kind creation/redemption "
            "mechanism, though investors may face bid–ask spreads."
        ),
    },
    {
        "name": "Institutional Shares",
        "clean_name": "Institutional Shares",
        "description": (
            "Institutional Shares are designed for large investors such as retirement plans, "
            "advisors, endowments, and other institutions. They typically have very low expense "
            "ratios and usually require large minimum investments, commonly in the millions of "
            "dollars (the exact amount varies by fund). Most individuals cannot buy them directly "
            "unless they access them through an employer plan or an institutional platform."
        ),
    },
    {
        "name": "Institutional Plus Shares",
        "clean_name": "Institutional Plus Shares",
        "description": (
            "Institutional Plus Shares are a higher tier of institutional pricing with even lower "
            "expense ratios, generally available only to very large investors. Minimums are typically "
            "in the tens to hundreds of millions of dollars, depending on the fund and access channel."
        ),
    },
    {
        "name": "Institutional Select Shares",
        "clean_name": "Institutional Select Shares",
        "description": (
            "Institutional Select Shares are among the lowest-cost share classes and are typically "
            "available only through very large institutional relationships (often aggregate or negotiated "
            "minimums, potentially in the hundreds of millions to billions). Pricing and availability are "
            "often bespoke and not consistently offered or publicly listed across all funds."
        ),
    },
]


In [51]:
# Count the funds in each share class and register every class (with its
# description) in Neo4j.
from collections import Counter

# Import the enum through the same `simple_rag` root used for Neo4jDatabase.
# Importing it as `src.simple_rag...` would load the module a second time under
# a different name and create a distinct ShareClassType class, so the
# comparison with the funds' values below could silently fail.
# NOTE(review): assumes the pickled funds reference `simple_rag.models.fund` — confirm.
from simple_rag.models.fund import ShareClassType

# Share-class name -> description.
description_map = {sc["clean_name"]: sc["description"] for sc in share_classes_data}

# Count funds by share class; funds tagged OTHER are re-labelled as ETF
# (this updates the fund objects in place).
share_counts = Counter()
for fund in funds_total:
    if fund.share_class:
        if fund.share_class == ShareClassType.OTHER:
            fund.share_class = ShareClassType.ETF
        share_counts[fund.share_class] += 1

for share_type, count in share_counts.items():
    share_class_name = share_type.value

    # Fall back to a generic description for classes missing from the catalogue.
    description = description_map.get(share_class_name, f"Share class for {share_class_name}")

    neo.get_or_create_share_class(share_class_name, description)

    print(f"  {share_class_name}: {count} funds")

  Investor Shares: 27 funds
  ETF Shares: 330 funds
  Admiral Shares: 40 funds
  Institutional Shares: 16 funds
  Institutional Plus Shares: 4 funds
  Institutional Select Shares: 3 funds


In [None]:
# Smoke test: send only the first fund to Neo4j. The unconditional `break`
# ends the loop after one iteration. The fund's summary prospectus is printed
# first when it is present.
# NOTE(review): the recorded output shows `neo.create_fund` failing with
# "'FundData' object has no attribute 'embedding'" — create_fund reads
# `fund.embedding`, which the funds loaded from the pickle do not have.
# Confirm whether embeddings must be computed (or the model updated) first.
for fund in funds_total:
    if fund.summary_prospectus:
        print(fund.summary_prospectus)
    neo.create_fund(fund)
    break


Failed to create fund VEXMX: 'FundData' object has no attribute 'embedding'
Traceback (most recent call last):
  File "/home/alvar/CascadeProjects/windsurf-project/RAG/src/simple_rag/database/neo4j/neo4j.py", line 324, in create_fund
    "embedding": fund.embedding,
  File "/home/alvar/anaconda3/envs/rag-env/lib/python3.10/site-packages/pydantic/main.py", line 1026, in __getattr__
    raise AttributeError(f'{type(self).__name__!r} object has no attribute {item!r}')
AttributeError: 'FundData' object has no attribute 'embedding'


📊 Creating fund: VEXMX - Vanguard Extended Market Index Fund
❌ ERROR creating fund VEXMX: 'FundData' object has no attribute 'embedding'
