# Open-Source only RAG

- Sentence Transformers as embedding model
- Postgres as vector store
- Llama 2 as LLM

#### Sentence Transformers

In [1]:
#%pip install llama-index-readers-file pymupdf
#%pip install llama-index-vector-stores-postgres
#%pip install llama-index-embeddings-huggingface
#%pip install llama-index-llms-llama-cpp

In [2]:
# Embedding model: a Hugging Face sentence-embedding model loaded through llama-index.
# `embed_model` is used further down to embed both the node texts and the queries.
# NOTE(review): the vector table below is configured with embed_dim = 384; that value
# must equal this model's output dimension — confirm whenever model_name changes.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

  from .autonotebook import tqdm as notebook_tqdm


#### Llama CPP

In [3]:
#!pip install llama-cpp-python

In [4]:
from llama_index.llms.llama_cpp import LlamaCPP

# Previous GGML URL, kept for reference only; the active URL below points at the GGUF file.
# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # URL of the GGUF model file; passing a URL lets the model be downloaded automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    # low temperature keeps answers close to the retrieved context
    temperature=0.1,
    # upper bound on the number of tokens generated per answer
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # verbose=True prints the llama.cpp model-loading log
    verbose=True,
)




llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/busraoguzoglu/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:      

#### Postgres

In [5]:
#!pip install psycopg2-binary pgvector asyncpg "sqlalchemy[asyncio]" greenlet

In [6]:
import psycopg2

# Database connection parameters.
# NOTE: these are placeholders for a local development database. Do not commit
# real credentials in a notebook; load them from the environment or prompt for
# them (e.g. os.environ / getpass) before sharing this file.
db_name = "rag_db"  # Use the existing database name here
host = "localhost"
password = "password"  # Replace with your actual PostgreSQL password
port = "5432"  # Default PostgreSQL port
user = "myuser"  # Replace with your actual PostgreSQL username

# This cell is only a connectivity check: it opens a connection to the existing
# 'rag_db' database, runs a trivial query and closes the connection again.
# Nothing later in the notebook reuses this psycopg2 connection (the vector
# store and SQLAlchemy open their own), so leaving it open would just leak it.
conn = psycopg2.connect(
    dbname=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
)
try:
    conn.autocommit = True
    with conn.cursor() as c:
        # Round-trip to the server so "connected" means the session really works.
        c.execute("SELECT 1")
        c.fetchone()
    print(f"Connected to {db_name} successfully.")
finally:
    # Always release the connection, even if the check above fails.
    conn.close()

Connected to rag_db successfully.


Create the table: check whether it already exists, truncate it if it does, and (re)initialise the vector store on top of it.

In [7]:
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError
from llama_index.vector_stores.postgres import PGVectorStore

# Connection parameters
db_params = {
    "user": "myuser",      # Replace with your actual username
    "password": "password",  # Replace with your actual password
    "host": "localhost",   # Adjust if your database is hosted elsewhere
    "port": "5432",        # Default PostgreSQL port
    "database": "rag_db"   # Replace with your actual database name
}

# Create SQLAlchemy engine (used only for the existence check / cleanup below)
engine = create_engine(f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['database']}")

# PGVectorStore prefixes the table name it is given with 'data_', so the physical
# table name is derived from the logical one instead of being typed twice.
base_table_name = "pmc_table"
table_name = f"data_{base_table_name}"
embed_dim = 384  # Embedding dimension; must match the embedding model's output size

try:
    # engine.begin() opens a transaction that is committed when the block exits
    # without error. A plain engine.connect() block would roll the TRUNCATE back
    # on SQLAlchemy 2.x, silently leaving the old rows in place.
    with engine.begin() as conn:
        # Check if the table exists
        result = conn.execute(text("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables 
                WHERE table_schema = 'public' AND table_name = :table_name
            );
        """), {"table_name": table_name}).scalar()

        if result:
            print(f"Table '{table_name}' exists. Cleaning it...")
            # table_name is a notebook constant, not user input, so interpolating it is safe here.
            conn.execute(text(f"TRUNCATE TABLE {table_name};"))
        else:
            print(f"Table '{table_name}' does not exist. Creating it...")

    # Build the vector store after the cleanup transaction has been committed.
    # It stays inside the try block so connection problems surface the same way as before.
    vector_store = PGVectorStore.from_params(
        database=db_params["database"],
        host=db_params["host"],
        password=db_params["password"],
        port=db_params["port"],
        user=db_params["user"],
        table_name=base_table_name,  # Use unprefixed name; PGVectorStore adds 'data_' automatically
        embed_dim=embed_dim,
    )
    print(f"Table '{table_name}' has been created.")
except OperationalError as e:
    # NOTE: on failure `vector_store` is left undefined, so later cells will not run.
    print(f"Error: {e}")
    print("Make sure the database exists and connection parameters are correct.")

Table 'data_pmc_table' exists. Cleaning it...
Table 'data_pmc_table' has been created.


## Ingestion Pipeline

### Data Loading and Preprocessing:

Document preprocessing performs the following steps:

1- Remove all text from the first stop keyword onward (e.g. 'References'), because that part carries no useful information for retrieval.

2- Remove section numbering, letters, extra spaces, new lines.

3- Remove table content from the text.

In [14]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import SentenceSplitter
import re
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
import fitz  # PyMuPDF for image extraction

### Text Preprocessing

In [15]:
# Stop keywords: preprocess_text cuts the text at the earliest occurrence of any of these.
# NOTE(review): preprocess_text searches with re.IGNORECASE and word boundaries, so the
# three spellings of "References" are equivalent, and a bare "Conflicts" also matches
# that word wherever it appears in running text — confirm this is intended.
stop_keywords = ["References", "REFERENCES", "references", "Conflict of interest", "Conflicts of Interest", "Conflicts"]

# Regex to capture figure captions: a "Figure <n>" / "Fig. <n>" label followed by text up to
# the next blank line or the end of the text (it is applied with IGNORECASE | DOTALL).
# NOTE: the pattern contains two capturing groups, so re.findall() returns group tuples
# rather than the full matched caption.
figure_pattern = r'\b(Figure|Fig\.)\s*\d+[a-zA-Z]*\b.*?(\n.*?)*?(?=\n\n|\Z)'

def preprocess_text(text, stop_keywords):
    """Truncate ``text`` at the first stop keyword.

    Every keyword is searched as a whole word (``\\b`` on both sides),
    case-insensitively. The text is cut at the smallest start offset among
    all keywords that occur; if none occurs, ``text`` is returned unchanged.
    """
    # One search per keyword; keywords that do not occur yield None.
    keyword_hits = (
        re.search(rf"\b{re.escape(keyword)}\b", text, re.IGNORECASE)
        for keyword in stop_keywords
    )
    # Fall back to the full length so the slice keeps everything when nothing matched.
    cut_at = min((hit.start() for hit in keyword_hits if hit), default=len(text))
    return text[:cut_at]

def clean_text(block_text):
    """Normalise one block (line) of text, or reject it.

    Steps:
      1. Collapse all runs of whitespace into single spaces.
      2. Drop a leading section number such as ``"3. "``.
      3. Remove every character that is not a letter, digit, whitespace,
         comma or full stop.

    Returns:
        The cleaned string, or ``None`` when the block should be skipped:
        shorter than 10 characters, starting with ``"Page <n>"``, looking like
        numeric/tabular data, or containing one of the table-related words.
    """
    normalized = " ".join(block_text.split())  # Remove extra spaces and newlines
    normalized = re.sub(r'^\d+\.\s+', '', normalized)  # Remove section numbering

    # The character filter below strips '%', so a percentage can only be
    # detected before it runs. Checking afterwards (as this function used to)
    # made the percentage test unreachable.
    has_percentage = re.search(r'\d+%', normalized) is not None

    cleaned = re.sub(r'[^A-Za-z0-9\s,.]', '', normalized)  # Keep only basic punctuation and letters

    if len(cleaned) < 10 or re.match(r'Page \d+', cleaned):
        return None  # Skip short blocks or page numbers

    # "Numeric" block: a number with 2+ digits together with a decimal or a percentage.
    has_decimal = re.search(r'\d+\.\d+', cleaned) is not None
    if re.search(r'\d{2,}', cleaned) and (has_decimal or has_percentage):
        return None  # Skip blocks with many numeric values

    # Words that typically appear in table headers/captions of the source papers.
    if re.search(r'\b(Table|Effect|Mediation|Summary|IV|Mediator|Cont)\b', cleaned, re.IGNORECASE):
        return None

    return cleaned

### Extract figures and save them individually

In [16]:
def extract_figures(text, pattern=None):
    """Extract figure captions or descriptions from the text.

    Args:
        text: Text to scan.
        pattern: Optional regex to use instead of the notebook-level
            ``figure_pattern``. It is applied with ``re.IGNORECASE | re.DOTALL``.

    Returns:
        A list with the full matched text of every caption, stripped of
        surrounding whitespace. Matches of 5 characters or fewer are dropped.
    """
    if pattern is None:
        pattern = figure_pattern

    captions = []
    # finditer + group(0) yields the complete matched caption. re.findall would
    # return only the pattern's capture groups (the "Figure"/"Fig." label and the
    # last continuation line), which is not the text that appears in the document
    # and therefore cannot be removed from it afterwards with str.replace().
    for match in re.finditer(pattern, text, re.IGNORECASE | re.DOTALL):
        caption = match.group(0).strip()
        if len(caption) > 5:  # Filter out short/no-content matches
            captions.append(caption)
    return captions

def extract_images_from_pdf(pdf_path, output_dir, pdf_name):
    """Extract images from a PDF and save them to output_dir.

    Args:
        pdf_path: Path of the PDF file to open.
        output_dir: Directory the image files are written to; created if missing.
        pdf_name: Prefix used in the generated file names.

    Each image is written as ``<pdf_name>_page<page>_img<index>.<ext>``, where
    the extension is the one reported for the embedded image. The path of every
    saved file is printed.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    document = fitz.open(pdf_path)
    try:
        for page_idx in range(len(document)):
            page = document.load_page(page_idx)
            images = page.get_images(full=True)

            for img_index, img in enumerate(images):
                xref = img[0]  # first entry of the image record is its xref
                base_image = document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Save the image
                image_filename = output_dir / f"{pdf_name}_page{page_idx}_img{img_index}.{image_ext}"
                with image_filename.open("wb") as img_file:
                    img_file.write(image_bytes)

                print(f"Saved image: {image_filename}")
    finally:
        # Release the PDF handle even if extraction or writing fails part-way.
        document.close()

### Using preprocess functions on documents

In [17]:
# Load and preprocess documents
folder_path = Path("./data")
documents = []

output_path = Path("./processed_documents.txt")  # File to save processed documents
figures_output_path = Path("./extracted_figures_summary.txt")  # Summary of extracted figures
figures_dir = Path("./extracted_figures")  # Directory to save each figure individually
figures_dir.mkdir(parents=True, exist_ok=True)

# A single reader instance is reused for every PDF.
loader = PyMuPDFReader()

with output_path.open("w", encoding="utf-8") as output_file, figures_output_path.open("w", encoding="utf-8") as figures_file:
    # glob() yields files in arbitrary, filesystem-dependent order. Sorting makes
    # the order of `documents` (and every index derived from it) reproducible
    # across runs and machines.
    for pdf_file in sorted(folder_path.glob("*.pdf")):
        loaded_docs = loader.load(file_path=str(pdf_file))
        print(f"{pdf_file.name}: {len(loaded_docs)} documents loaded")

        # Extract images directly from the PDF
        extract_images_from_pdf(pdf_file, figures_dir, pdf_file.stem)

        for doc_idx, doc in enumerate(loaded_docs):
            # Cut everything from the first stop keyword onward
            processed_text = preprocess_text(doc.text, stop_keywords)

            # Extract figure captions from the truncated text
            figures = extract_figures(processed_text)

            for i, figure in enumerate(figures):
                figure_file_path = figures_dir / f"{pdf_file.stem}_doc{doc_idx}_fig{i}.txt"
                figure_file_path.write_text(figure, encoding="utf-8")  # Write figure caption to its own file

                # Write to the summary file
                figures_file.write(f"{pdf_file.name} - Document {doc_idx}:\n")
                figures_file.write(figure + "\n")
                figures_file.write("=" * 80 + "\n")

            print(f"Saved {len(figures)} figure captions for Document {doc_idx} from {pdf_file.name}")

            # Remove figures from text after extraction to clean up
            for figure in figures:
                processed_text = processed_text.replace(figure, "")

            # Clean the remaining text line by line; clean_text returns None for lines to drop
            cleaned_blocks = []
            for block in processed_text.split("\n"):
                cleaned_block = clean_text(block)
                if cleaned_block:
                    cleaned_blocks.append(cleaned_block)

            final_text = "\n".join(cleaned_blocks)
            doc.text = final_text  # Replace with cleaned and processed text
            # Replaces the loader's metadata entirely; only the source file name is kept.
            doc.metadata = {"source": pdf_file.name}
            documents.append(doc)

            # Write the processed text into the output file
            output_file.write(f"Document {doc_idx} from {pdf_file.name}:\n")
            output_file.write(final_text + "\n")
            output_file.write("=" * 80 + "\n")  # Separator between documents

print(f"Processed documents written to {output_path}")
print(f"Extracted figure captions written to {figures_output_path}")
print(f"Individual figure files and images saved in '{figures_dir}'")

5.pdf: 10 documents loaded
Saved image: extracted_figures/5_page0_img0.jpeg
Saved image: extracted_figures/5_page1_img0.jpeg
Saved image: extracted_figures/5_page2_img0.jpeg
Saved image: extracted_figures/5_page3_img0.jpeg
Saved image: extracted_figures/5_page4_img0.png
Saved image: extracted_figures/5_page4_img1.jpeg
Saved image: extracted_figures/5_page4_img2.jpeg
Saved image: extracted_figures/5_page5_img0.jpeg
Saved image: extracted_figures/5_page6_img0.jpeg
Saved image: extracted_figures/5_page7_img0.jpeg
Saved image: extracted_figures/5_page8_img0.jpeg
Saved image: extracted_figures/5_page9_img0.jpeg
Saved 0 figure captions for Document 0 from 5.pdf
Saved 0 figure captions for Document 1 from 5.pdf
Saved 0 figure captions for Document 2 from 5.pdf
Saved 0 figure captions for Document 3 from 5.pdf
Saved 0 figure captions for Document 4 from 5.pdf
Saved 0 figure captions for Document 5 from 5.pdf
Saved 0 figure captions for Document 6 from 5.pdf
Saved 0 figure captions for Document

In [None]:
# Spot-check one processed document and report how many were loaded.
# Index 65 is only valid for a large enough corpus, so clamp it to the last
# available document instead of raising IndexError on smaller data sets.
sample_index = min(65, len(documents) - 1)
if documents:
    print(documents[sample_index])
print(len(documents))

### Splitting Documents

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter used to cut each document's text into chunks (chunk_size=1024, default separator).
text_parser = SentenceSplitter(chunk_size=1024)

In [None]:
# Two parallel lists: text_chunks[i] is a chunk of text and doc_idxs[i] is the
# index (into `documents`) of the document that chunk came from, so the
# document's metadata can be attached to the chunk later.
text_chunks = []
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    for chunk in text_parser.split_text(doc.text):
        text_chunks.append(chunk)
        doc_idxs.append(doc_idx)

In [None]:
print(len(text_chunks))

### Construct Nodes from Text Chunks

In [None]:
from llama_index.core.schema import TextNode

# One TextNode per chunk. Each node inherits the metadata of its source
# document (which includes 'source') plus the document's index.
# text_chunks and doc_idxs are built in lockstep, so they can be zipped.
nodes = [
    TextNode(
        text=chunk,
        metadata={**documents[source_idx].metadata, "doc_index": source_idx},
    )
    for chunk, source_idx in zip(text_chunks, doc_idxs)
]

In [None]:
nodes[1]

### Generate Embeddings for each Node

Using sentence_transformers

In [None]:
# Embed every node and store the vector on the node itself.
# NOTE(review): "text" is passed verbatim as the metadata mode — confirm it is a
# mode the installed llama-index version recognises.
for node in nodes:
    node.embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="text")
    )

In [None]:
print(type(nodes[0].embedding))
print(nodes[0].embedding)
print(len(nodes))

### Load Nodes into a Vector Store

We now insert these nodes into our `PGVectorStore`.

In [None]:
vector_store.add(nodes)

## Retrieval Pipeline

In [None]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List
from llama_index.core.schema import NodeWithScore
from typing import Optional
from llama_index.core.vector_stores import VectorStoreQuery

In [None]:
class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store.

    Embeds the query string with the configured embedding model, runs a
    similarity query against the configured vector store and returns the
    matching nodes together with their similarity scores.
    """

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Store the retriever's collaborators and query settings.

        Args:
            vector_store: Vector store to query.
            embed_model: Object exposing ``get_query_embedding(str)``.
            query_mode: Mode forwarded to ``VectorStoreQuery``.
            similarity_top_k: Number of nodes requested per query.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return the nodes most similar to ``query_bundle.query_str``.

        Uses the embedding model and vector store given to the constructor.
        Previously this read the notebook-level ``embed_model`` and
        ``vector_store`` globals, so the constructor arguments were ignored.
        """
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            # The store may return no similarities; the score then stays None.
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [None]:
# Build the retriever over the populated vector store; each query returns the 2 closest nodes.
retriever = VectorDBRetriever(
    vector_store=vector_store,
    embed_model=embed_model,
    query_mode="default",
    similarity_top_k=2,
)

retriever

## Response

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [None]:
query_str = "Which individuals play a central role in promoting healthy eating?"

response = query_engine.query(query_str)

In [None]:
print(str(response))

In [None]:
print(response.source_nodes[0].get_content())

In [None]:
query_str = "What does nutrition literacy significantly predicted in the social-ecological framework?"

response = query_engine.query(query_str)

In [None]:
print(str(response))

In [None]:
response.source_nodes[0]

In [None]:
print(response.source_nodes[0].get_content())