In [1]:
# Cell 1: Import the necessary loader and load the document
from langchain_community.document_loaders import TextLoader

# Load the sample text file into LangChain Document objects.
# The encoding is passed explicitly: when it is omitted the file is opened
# with the platform's default text encoding, so the same file can decode
# differently (or fail to decode) on another machine.
loader = TextLoader("sample_doc.txt", encoding="utf-8")

# .load() reads the file and returns a list of Document objects
documents = loader.load()

# Inspect the result: how many documents, then the first one's text and metadata
print(f"Loaded {len(documents)} document.")
print("\n--- Document Content ---")
print(documents[0].page_content)

print("\n--- Document Metadata ---")
print(documents[0].metadata)

Loaded 1 document.

--- Document Content ---
The Clearstream Banking AG (CBF) is a German Central Securities Depository (CSD). It provides post-trade infrastructure for the German securities market.
Key services include settlement, custody, and asset servicing. Settlement is the process of transferring securities and funds between parties. Custody involves the safekeeping and administration of securities on behalf of clients.
The primary system used for these services is CASCADE. All participants must adhere to the rules and regulations set forth by BaFin.
This gives us a simple, multi-paragraph document to work with.

--- Document Metadata ---
{'source': 'sample_doc.txt'}


In [2]:
# Cell 2: Import additional loaders and utilities for PDF processing
from langchain_community.document_loaders import PyPDFLoader
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Cell 3: Define the docs folder path and scan for PDF files
docs_folder = "docs"
pdf_files = []

# Only scan when the path is an actual directory; a regular file named "docs"
# would pass an existence check and then make os.listdir raise.
if os.path.isdir(docs_folder):
    # os.listdir returns entries in arbitrary order. Sort case-insensitively
    # (original spelling as tie-breaker) so the ingestion order, and therefore
    # the order of everything derived from it, is the same on every run.
    for entry_name in sorted(os.listdir(docs_folder), key=lambda name: (name.lower(), name)):
        entry_path = os.path.join(docs_folder, entry_name)
        # Keep regular files with a .pdf extension (any letter case); skip
        # sub-directories whose name merely ends in ".pdf".
        if entry_name.lower().endswith('.pdf') and os.path.isfile(entry_path):
            pdf_files.append(entry_path)

    print(f"Found {len(pdf_files)} PDF files in the docs folder:")
    for pdf_file in pdf_files:
        print(f"  - {pdf_file}")
else:
    print(f"Error: '{docs_folder}' folder not found!")

Found 6 PDF files in the docs folder:
  - docs\aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf
  - docs\Canadian Collateral Management Services (CCMS).pdf
  - docs\cbl-aml-questionnaire-data.pdf
  - docs\Disclosure Requirements – Investment Funds –Denmark.pdf
  - docs\Holding Restrictions – Investment Funds – Ireland.pdf
  - docs\Holding Restrictions – Investment Funds –Denmark.pdf


In [4]:
# Cell 4: Load all PDF documents
all_documents = []  # Document objects from every PDF that loaded successfully
failed_files = []   # (path, error message) for every PDF that could not be loaded

print("Starting PDF ingestion process...")
print("=" * 50)

for pdf_file in pdf_files:
    try:
        print(f"Loading: {pdf_file}")
        # Cell-specific names: reusing `loader` / `documents` here would
        # silently overwrite the objects created in Cell 1.
        pdf_loader = PyPDFLoader(pdf_file)
        pdf_pages = pdf_loader.load()

        # Tag every loaded Document with the file it came from; later cells
        # group and report by 'file_name'.
        base_name = os.path.basename(pdf_file)
        for page in pdf_pages:
            page.metadata['source_file'] = pdf_file
            page.metadata['file_name'] = base_name

        all_documents.extend(pdf_pages)
        print(f"  ✓ Successfully loaded {len(pdf_pages)} pages")

    except Exception as e:
        # Best-effort ingestion: one unreadable PDF must not abort the batch.
        # The failure is recorded and reported in the summary below.
        failed_files.append((pdf_file, str(e)))
        print(f"  ✗ Failed to load {pdf_file}: {e}")

print("=" * 50)
print("Ingestion complete!")
print(f"Total documents loaded: {len(all_documents)}")
print(f"Failed files: {len(failed_files)}")

if failed_files:
    print("\nFailed files:")
    for failed_path, message in failed_files:
        print(f"  - {failed_path}: {message}")

Starting PDF ingestion process...
Loading: docs\aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf
  ✓ Successfully loaded 7 pages
Loading: docs\Canadian Collateral Management Services (CCMS).pdf
  ✓ Successfully loaded 3 pages
Loading: docs\cbl-aml-questionnaire-data.pdf
  ✓ Successfully loaded 13 pages
Loading: docs\Disclosure Requirements – Investment Funds –Denmark.pdf
  ✓ Successfully loaded 3 pages
Loading: docs\Holding Restrictions – Investment Funds – Ireland.pdf
  ✓ Successfully loaded 2 pages
Loading: docs\Holding Restrictions – Investment Funds –Denmark.pdf
  ✓ Successfully loaded 2 pages
Ingestion complete!
Total documents loaded: 30
Failed files: 0


In [5]:
# Cell 5: Analyze and display document summaries
print("Document Analysis Summary")
print("=" * 60)

# Roll the loaded documents up per file: number of pages, total characters,
# and a short text sample.
files_summary = {}
for page in all_documents:
    entry = files_summary.setdefault(
        page.metadata.get('file_name', 'Unknown'),
        {'page_count': 0, 'total_chars': 0, 'sample_content': ''},
    )
    text = page.page_content

    entry['page_count'] += 1
    entry['total_chars'] += len(text)

    # The sample is the stripped first 200 characters of the first page that
    # yields a non-empty result; once set it is never replaced.
    if not entry['sample_content']:
        entry['sample_content'] = text[:200].strip()

# One report block per file, in first-seen order
for name, stats in files_summary.items():
    report_lines = [
        f"\nFile: {name}",
        f"  Pages: {stats['page_count']}",
        f"  Total characters: {stats['total_chars']:,}",
        f"  Sample content: {stats['sample_content']}...",
        "-" * 40,
    ]
    print("\n".join(report_lines))

Document Analysis Summary

File: aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf
  Pages: 7
  Total characters: 15,640
  Sample content: 1 
To the attention  of: The Transfer Agent (“TA”) / the Fund  and applicable to all 
Clearstream Banking SA accounts and designations in funds under the administration 
of the TA 
Clearstream Banking...
----------------------------------------

File: Canadian Collateral Management Services (CCMS).pdf
  Pages: 3
  Total characters: 3,196
  Sample content: Canadian Collateral Management Services(CCMS)
Powering domestic collateral management with a world-leading tripartyinfrastructure
08.07.2025
Continuous regulatory changes require ﬁnancial market infra...
----------------------------------------

File: cbl-aml-questionnaire-data.pdf
  Pages: 13
  Total characters: 32,581
  Sample content: Wolfsberg Group Correspondent Banking Due Diligence Questionnaire (CBDDQ) V1.4
© The Wolfsberg Group 2023 Page 1 CBDDQ V1.4
Financial Institution Name:
Lo

In [6]:
# Cell 6: Display detailed metadata for inspection
print("Detailed Metadata Inspection")
print("=" * 60)

# Print every metadata field of the first three documents as examples
preview_docs = all_documents[:3]
for position, document in enumerate(preview_docs, start=1):
    print(f"\nDocument {position} Metadata:")
    for field, field_value in document.metadata.items():
        print(f"  {field}: {field_value}")
    print(f"  Content length: {len(document.page_content)} characters")

# Anything beyond the preview is only counted, not printed
remaining = len(all_documents) - 3
if remaining > 0:
    print(f"\n... and {remaining} more documents")

# Closing status line; the documents remain available in `all_documents`
print(f"\nAll {len(all_documents)} documents are now loaded and ready for processing!")

Detailed Metadata Inspection

Document 1 Metadata:
  producer: Microsoft® Word for Microsoft 365
  creator: Microsoft® Word for Microsoft 365
  creationdate: 2024-04-03T17:34:32+02:00
  author: Grace O'Connor
  msip_label_2e952e98-911c-4aff-840a-f71bc6baaf7f_actionid: db4963dd-f6a7-4a29-baa3-f43e6f4be2dd
  msip_label_2e952e98-911c-4aff-840a-f71bc6baaf7f_contentbits: 2
  msip_label_2e952e98-911c-4aff-840a-f71bc6baaf7f_enabled: true
  msip_label_2e952e98-911c-4aff-840a-f71bc6baaf7f_method: Privileged
  msip_label_2e952e98-911c-4aff-840a-f71bc6baaf7f_name: 2e952e98-911c-4aff-840a-f71bc6baaf7f
  msip_label_2e952e98-911c-4aff-840a-f71bc6baaf7f_setdate: 2023-04-27T15:43:42Z
  msip_label_2e952e98-911c-4aff-840a-f71bc6baaf7f_siteid: e00ddcdf-1e0f-4be5-a37a-894a4731986a
  moddate: 2024-04-11T12:53:52+02:00
  source: docs\aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf
  total_pages: 7
  page: 0
  page_label: 1
  source_file: docs\aml-ctf-statement-attention-of-cbl-transfer-agent-data

In [7]:
# Cell 7: Split documents into chunks for better RAG performance
from langchain.text_splitter import RecursiveCharacterTextSplitter

print("Splitting documents into chunks...")
print("=" * 50)

# Initialize the text splitter. Sizes are measured with len(), i.e. in
# characters; the separators are listed from coarsest (blank line) to
# finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        # Size of each chunk in characters
    chunk_overlap=200,      # Overlap between chunks to maintain context
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

# Split all documents
chunked_documents = text_splitter.split_documents(all_documents)

print(f"Original documents: {len(all_documents)}")
print(f"After chunking: {len(chunked_documents)}")

# Show chunking statistics. With no chunks (e.g. no PDFs were found or
# loaded) the average would divide by zero and min()/max() would raise on an
# empty list, so that case is reported instead of computed.
chunk_sizes = [len(doc.page_content) for doc in chunked_documents]
if chunk_sizes:
    print(f"Average chunk size: {sum(chunk_sizes) / len(chunk_sizes):.0f} characters")
    print(f"Min chunk size: {min(chunk_sizes)} characters")
    print(f"Max chunk size: {max(chunk_sizes)} characters")
else:
    print("No chunks were produced - check that the PDF loading cell found and loaded documents.")

Splitting documents into chunks...
Original documents: 30
After chunking: 87
Average chunk size: 802 characters
Min chunk size: 183 characters
Max chunk size: 999 characters


In [8]:
# Cell 8: Create embeddings and vector store using HuggingFace
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

print("Creating embeddings and vector store with HuggingFace...")
print("=" * 50)

# Initialize HuggingFace embeddings with all-MiniLM-L6-v2.
# The install hint belongs here: constructing the embeddings object is the
# step that needs the sentence-transformers package. The error is re-raised
# so the notebook stops instead of continuing without `embeddings`.
try:
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},  # Use 'cuda' if you have GPU
        encode_kwargs={'normalize_embeddings': True}  # Normalize for better similarity scores
    )
except ImportError as e:
    print(f"✗ Error creating embeddings: {e}")
    print("Make sure sentence-transformers is installed: pip install sentence-transformers")
    raise

print("Model: sentence-transformers/all-MiniLM-L6-v2")
print("Embedding dimension: 384")
print("Device: CPU")
print()

# Create vector store from documents
print("Generating embeddings for all document chunks...")
print("This may take a few minutes depending on the number of chunks...")

# An empty chunk list cannot be indexed; fail with a clear message rather
# than an obscure error from inside the vector store.
if not chunked_documents:
    raise ValueError("No document chunks to embed - run the loading and chunking cells first.")

try:
    vectorstore = FAISS.from_documents(chunked_documents, embeddings)
except Exception as e:
    # Report and re-raise: swallowing the error would leave `vectorstore`
    # undefined (or stale from an earlier run), and the following cells would
    # then fail confusingly or search/save the wrong index.
    print(f"✗ Error creating embeddings: {e}")
    raise
else:
    print("✓ Vector store created successfully!")
    print(f"✓ Embedded {len(chunked_documents)} document chunks")
    print("✓ Ready for semantic search and retrieval!")

Creating embeddings and vector store with HuggingFace...
Model: sentence-transformers/all-MiniLM-L6-v2
Embedding dimension: 384
Device: CPU

Generating embeddings for all document chunks...
This may take a few minutes depending on the number of chunks...
✓ Vector store created successfully!
✓ Embedded 87 document chunks
✓ Ready for semantic search and retrieval!


In [9]:
# Cell 9: Test the retrieval system
print("Testing document retrieval...")
print("=" * 50)

# Sample financial / regulatory questions used to exercise the vector store
test_queries = [
    "investment fund regulations",
    "trading restrictions",
    "compliance requirements",
    "risk management"
]

for question in test_queries:
    print(f"\nQuery: '{question}'")
    # Fetch the two nearest chunks for this question
    relevant_docs = vectorstore.similarity_search(question, k=2)

    for rank, hit in enumerate(relevant_docs, start=1):
        meta = hit.metadata
        preview = hit.page_content[:150].strip()  # first 150 characters only
        print(f"  Result {rank}:")
        print(f"    Source: {meta.get('file_name', 'Unknown')}")
        print(f"    Page: {meta.get('page', 'Unknown')}")
        print(f"    Content preview: {preview}...")
        print("    " + "-" * 50)

Testing document retrieval...

Query: 'investment fund regulations'
  Result 1:
    Source: Disclosure Requirements – Investment Funds –Denmark.pdf
    Page: 1
    Content preview: Holding Restrictions – Investment Funds –Denmark
Investment Fund Market Guide - Denmark

9/23/25, 4:23 PM Disclosure Requirements – Investment Fund...
    --------------------------------------------------
  Result 2:
    Source: aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf
    Page: 0
    Content preview: Investment Fund Services and the Global Liquidity Hub.  
2. Applicable   Regulations
As a Monetary Financial Institution (MFI) established in Luxembou...
    --------------------------------------------------

Query: 'trading restrictions'
  Result 1:
    Source: Holding Restrictions – Investment Funds –Denmark.pdf
    Page: 0
    Content preview: Holding Restrictions – Investment Funds –Denmark
24.06.2025
Restrictions on clients
No general restrictions on client residency for holdings of 

In [10]:
# Cell 10: Save vector store for later use
import pickle

print("Saving vector store and documents...")
print("=" * 50)

# Save vector store to disk (FAISS format)
vectorstore.save_local("vector_store")
print("✓ Vector store saved to 'vector_store' folder")

# Save processed documents for reference.
# NOTE: pickle files must only ever be loaded from a trusted location;
# unpickling untrusted data can execute arbitrary code.
with open("processed_documents.pkl", "wb") as f:
    pickle.dump(chunked_documents, f)
print("✓ Processed documents saved to 'processed_documents.pkl'")

# Save embedding model info for consistency.
# Model name and vector dimension are read from the live objects rather than
# repeated as literals, so the saved config always describes the index that
# was actually written above.
embedding_dimension = vectorstore.index.d
embedding_info = {
    'model_name': embeddings.model_name,
    'embedding_dimension': embedding_dimension,
    'chunk_size': 1000,     # must mirror chunk_size given to the text splitter
    'chunk_overlap': 200,   # must mirror chunk_overlap given to the text splitter
    'total_chunks': len(chunked_documents)
}

with open("embedding_config.pkl", "wb") as f:
    pickle.dump(embedding_info, f)
print("✓ Embedding configuration saved to 'embedding_config.pkl'")

print("\n" + "=" * 50)
print("🎉 INGESTION PIPELINE COMPLETE!")
print("=" * 50)
print(f"✅ Processed {len(all_documents)} documents")
print(f"✅ Created {len(chunked_documents)} searchable chunks")
print(f"✅ Generated {embedding_dimension}-dimensional embeddings")
print("✅ Vector store ready for RAG implementation")
print("\nNext step: Use the saved vector store in your RAG pipeline!")

Saving vector store and documents...
✓ Vector store saved to 'vector_store' folder
✓ Processed documents saved to 'processed_documents.pkl'
✓ Embedding configuration saved to 'embedding_config.pkl'

🎉 INGESTION PIPELINE COMPLETE!
✅ Processed 30 documents
✅ Created 87 searchable chunks
✅ Generated 384-dimensional embeddings
✅ Vector store ready for RAG implementation

Next step: Use the saved vector store in your RAG pipeline!
