# PDF Embeddings Generation

This notebook reads a PDF file, extracts text, and generates embeddings using the embedding model configured in the environment variables.

**Configuration:**
- If `USE_OLLAMA=true`: Uses Ollama with `nomic-embed-text:latest`
- Otherwise: Uses Google's `models/gemini-embedding-001` through the Vertex AI backend (requires `GOOGLE_CLOUD_PROJECT_ID`; `GOOGLE_CLOUD_LOCATION` is optional and defaults to `asia-south1`)

In [7]:
# Install required packages (run once if needed)
# %pip install langchain-community langchain-ollama langchain-google-genai pypdf python-dotenv

In [8]:
import os
from pathlib import Path

import numpy as np
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_ollama import OllamaEmbeddings

# Read a local .env file (if one is found) into the process environment
# before any setting is looked up. Without this call the imported
# load_dotenv is never used and .env-based configuration is ignored.
# By default load_dotenv() does not override variables that are already set.
load_dotenv()

# USE_OLLAMA selects the embedding backend; only the value "true"
# (case-insensitive) enables Ollama, anything else means Vertex AI.
use_ollama = os.getenv("USE_OLLAMA", "false").lower() == "true"
print(f"üìä Configuration: USE_OLLAMA = {use_ollama}")

üìä Configuration: USE_OLLAMA = False


In [9]:
# Build the embedding client for whichever backend USE_OLLAMA selected
if not use_ollama:
    # Vertex AI path: the project id is mandatory, the region has a default
    project_id = os.getenv("GOOGLE_CLOUD_PROJECT_ID")
    location = os.getenv("GOOGLE_CLOUD_LOCATION", "asia-south1")

    if not project_id:
        raise ValueError("GOOGLE_CLOUD_PROJECT_ID environment variable is required when not using Ollama")

    embeddings = GoogleGenerativeAIEmbeddings(
        # alternative model: "models/text-embedding-004"
        model="models/gemini-embedding-001",
        # project and location are required for Vertex AI; check supported regions
        project=project_id,
        location=location,
        # this flag enables the Vertex AI backend
        vertexai=True,
        output_dimensionality=768,
    )
    print("‚úÖ Initialized Vertex AI embeddings: gemini-embedding-001")
    print(f"   Project: {project_id}")
    print(f"   Location: {location}")
else:
    # Ollama path: the server URL comes from OLLAMA_BASE_URL, with a localhost default
    ollama_base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
    embeddings = OllamaEmbeddings(
        model="nomic-embed-text:latest",
        base_url=ollama_base_url,
    )
    print("‚úÖ Initialized Ollama embeddings: nomic-embed-text:latest")
    print(f"   Base URL: {ollama_base_url}")

‚úÖ Initialized Vertex AI embeddings: gemini-embedding-001
   Project: notebooklm-clone-483513
   Location: asia-south1


In [10]:
# Locate the sample PDF (the path is relative to the current working directory)
pdf_path = Path("backend/src/upload_graph/Sample-Accounting-Income-Statement.pdf")

# Fail early with a clear message if the file is missing
if not pdf_path.exists():
    raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

print(f"üìÑ Loading PDF from: {pdf_path}")

# PyPDFLoader takes the path as a string; load() returns the documents
loader = PyPDFLoader(str(pdf_path))
docs = loader.load()

print(f"‚úÖ Loaded {len(docs)} document(s) from PDF")

# Show metadata and a short text preview of the first document, if any
if docs:
    first_doc = docs[0]
    first_meta = first_doc.metadata
    print("\nüìã First document metadata:")
    print(f"   Page: {first_meta.get('page', 'N/A')}")
    print(f"   Source: {first_meta.get('source', 'N/A')}")
    print("\nüìù First document preview (first 200 chars):")
    print(f"   {first_doc.page_content[:200]}...")

üìÑ Loading PDF from: backend/src/upload_graph/Sample-Accounting-Income-Statement.pdf
‚úÖ Loaded 7 document(s) from PDF

üìã First document metadata:
   Page: 0
   Source: backend/src/upload_graph/Sample-Accounting-Income-Statement.pdf

üìù First document preview (first 200 chars):
   Understanding Basic Financial Statements
During the accounting cycle, the accounting system is used to track, organize and record the financial
transactions of an organization. At the close of each pe...


In [11]:
# Collect the text of every loaded document, one entry per document
all_texts = [page.page_content for page in docs]
total_chars = sum(map(len, all_texts))
print(f"üìù Extracted {len(all_texts)} text chunk(s)")
print(f"   Total characters: {total_chars}")

# Per-chunk size and a 100-character preview for the first three chunks only
for chunk_no, chunk in enumerate(all_texts[:3], start=1):
    print(f"\n   Chunk {chunk_no}: {len(chunk)} characters")
    print(f"   Preview: {chunk[:100]}...")

üìù Extracted 7 text chunk(s)
   Total characters: 10376

   Chunk 1: 3435 characters
   Preview: Understanding Basic Financial Statements
During the accounting cycle, the accounting system is used ...

   Chunk 2: 910 characters
   Preview: XYZ COMPANY LIMITED
BALANCE SHEET
AS AT
JUNE 30, 2002
UNAUDITED - See "Notice to Reader"
2002 2001
A...

   Chunk 3: 1138 characters
   Preview: XYZ COMPANY LIMITED
STATEMENT OF INCOME AND RETAINED EARNINGS
FOR THE YEAR ENDED
JUNE 30, 2002
UNAUD...


In [12]:
# Generate embeddings for all text chunks
print("üîÑ Generating embeddings...")

# Embed in small batches to avoid rate limits
batch_size = 5
all_embeddings = []

# Number of batches (ceiling division); loop-invariant, so computed once up front
total_batches = (len(all_texts) + batch_size - 1) // batch_size

for batch_num, start in enumerate(range(0, len(all_texts), batch_size), start=1):
    batch = all_texts[start:start + batch_size]

    print(f"   Processing batch {batch_num}/{total_batches} ({len(batch)} texts)...")

    # Only the embedding call is guarded: report which batch failed, then
    # re-raise so the notebook stops instead of continuing with partial data.
    try:
        batch_embeddings = embeddings.embed_documents(batch)
    except Exception as e:
        print(f"   ‚ùå Error in batch {batch_num}: {e}")
        raise

    all_embeddings.extend(batch_embeddings)
    print(f"   ‚úÖ Batch {batch_num} completed")

# Later cells pair all_texts[i] with all_embeddings[i], so the counts must agree
assert len(all_embeddings) == len(all_texts), (
    f"expected {len(all_texts)} embeddings, got {len(all_embeddings)}"
)

print(f"\n‚úÖ Generated {len(all_embeddings)} embeddings")
print(f"   Embedding dimension: {len(all_embeddings[0]) if all_embeddings else 'N/A'}")

üîÑ Generating embeddings...
   Processing batch 1/2 (5 texts)...
   ‚úÖ Batch 1 completed
   Processing batch 2/2 (2 texts)...
   ‚úÖ Batch 2 completed

‚úÖ Generated 7 embeddings
   Embedding dimension: 768


In [12]:
# Display embedding statistics
# (numpy is imported as np in the notebook's import cell at the top)
if all_embeddings:
    # Stack the per-text vectors into one 2-D array: one row per text
    embeddings_array = np.array(all_embeddings)

    print("üìä Embedding Statistics:")
    print(f"   Shape: {embeddings_array.shape}")

    # Summary statistics are taken over every value in the array
    summary_stats = (
        ("Mean", embeddings_array.mean()),
        ("Std", embeddings_array.std()),
        ("Min", embeddings_array.min()),
        ("Max", embeddings_array.max()),
    )
    for stat_name, stat_value in summary_stats:
        print(f"   {stat_name}: {stat_value:.6f}")

    # Show first embedding (first 10 dimensions)
    print("\nüî¢ First embedding preview (first 10 dimensions):")
    print(f"   {all_embeddings[0][:10]}")

üìä Embedding Statistics:
   Shape: (7, 3072)
   Mean: -0.000133
   Std: 0.018042
   Min: -0.235695
   Max: 0.252917

üî¢ First embedding preview (first 10 dimensions):
   [-0.00233125570230186, 0.011832311749458313, 0.03495323657989502, -0.0471910797059536, 0.0049485107883811, 0.00774120120331645, 0.014832611195743084, 0.011454415507614613, -0.0286474097520113, 0.004035902675241232]


In [13]:
# Optional smoke test: embed a single query string with the same model
test_query = "What is the total revenue?"

print(f"üîç Testing query embedding: '{test_query}'")

# Best-effort check: a failure is reported on stdout rather than raised
try:
    query_embedding = embeddings.embed_query(test_query)
    print("‚úÖ Query embedding generated")
    print(f"   Dimension: {len(query_embedding)}")
    print(f"   First 10 dimensions: {query_embedding[:10]}")
except Exception as exc:
    print(f"‚ùå Error generating query embedding: {exc}")

üîç Testing query embedding: 'What is the total revenue?'
‚úÖ Query embedding generated
   Dimension: 3072
   First 10 dimensions: [-0.0202526543289423, 0.0029630553908646107, 0.009256268851459026, -0.06388071179389954, 0.016612660139799118, 0.0016189336311072111, -0.0022575261536985636, -0.01491749007254839, -0.03048672527074814, 0.012214137241244316]
