# Open-Source only RAG

- Sentence Transformers as embedding model
- Postgres as vector store
- Llama 2 as LLM
- LLaVa for image summarization

#### Sentence Transformers

In [1]:
#%pip install llama-index-readers-file pymupdf
#%pip install llama-index-vector-stores-postgres
#%pip install llama-index-embeddings-huggingface
#%pip install llama-index-llms-llama-cpp

In [2]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

  from .autonotebook import tqdm as notebook_tqdm


#### Llama CPP

In [3]:
#!pip install llama-cpp-python

In [4]:
from llama_index.llms.llama_cpp import LlamaCPP

# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)




llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/busraoguzoglu/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:      

#### Postgres

In [5]:
#!pip install psycopg2-binary pgvector asyncpg "sqlalchemy[asyncio]" greenlet

In [6]:
import psycopg2

# Define your database connection parameters
db_name = "rag_db"  # Use the existing database name here
host = "localhost"
password = "password"  # Replace with your actual PostgreSQL password
port = "5432"  # Default PostgreSQL port
user = "myuser"  # Replace with your actual PostgreSQL username

# Connect directly to the 'rag_db' database
conn = psycopg2.connect(
    dbname=db_name,  # Connect directly to 'rag_db'
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

# Optionally, perform any operations on 'rag_db' using a cursor
with conn.cursor() as c:
    # Drop and create operations are not needed if the database already exists
    # Here you can perform other database setup actions if necessary
    print(f"Connected to {db_name} successfully.")

# Close the connection when done
#conn.close()

Connected to rag_db successfully.


Create table (checks if exist, truncates if already exist and generates again)

In [46]:
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError
from llama_index.vector_stores.postgres import PGVectorStore

# Connection parameters
db_params = {
    "user": "myuser",      # Replace with your actual username
    "password": "password",  # Replace with your actual password
    "host": "localhost",   # Adjust if your database is hosted elsewhere
    "port": "5432",        # Default PostgreSQL port
    "database": "rag_db"   # Replace with your actual database name
}

# Create SQLAlchemy engine
engine = create_engine(f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['database']}")

# Update table name to include 'data_' prefix
table_name = "data_pmc_table"  # Postgre adds 'data' in front of table_name
embed_dim = 384  # Embedding dimension

try:
    with engine.connect() as conn:
        # Check if the table exists
        result = conn.execute(text(f"""
            SELECT EXISTS (
                SELECT FROM information_schema.tables 
                WHERE table_schema = 'public' AND table_name = :table_name
            );
        """), {"table_name": table_name}).scalar()

        if result:
            print(f"Table '{table_name}' exists. Cleaning it...")
            conn.execute(text(f"DROP TABLE {table_name} CASCADE;"))
        else:
            print(f"Table '{table_name}' does not exist. Creating it...")

        # Create a new table for vector store
        vector_store = PGVectorStore.from_params(
            database=db_params["database"],
            host=db_params["host"],
            password=db_params["password"],
            port=db_params["port"],
            user=db_params["user"],
            table_name="pmc_table",  # Use unprefixed name; PGVectorStore adds 'data_' automatically
            embed_dim=embed_dim,
        )
        print(f"Table '{table_name}' has been created.")
except OperationalError as e:
    print(f"Error: {e}")
    print("Make sure the database exists and connection parameters are correct.")

Table 'data_pmc_table' does not exist. Creating it...
Table 'data_pmc_table' has been created.


## Ingestion Pipeline

### Data Loading and Preprocessing:

Preprocessing of the document does this:

1- Remove all text after the stopwords, like 'References', because they do not have any information for us.

2- Remove section numbering, letters, extra spaces, new lines.

3- Remove 'tables' from the text.

In [8]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import SentenceSplitter
import re
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
import fitz  # PyMuPDF for image extraction

### Text Preprocessing

In [9]:
# Define stop keywords
stop_keywords = ["References", "REFERENCES", "references", "Conflict of interest", "Conflicts of Interest", "Conflicts"]

# Regex to capture figures
figure_pattern = r'\b(Figure|Fig\.)\s*\d+[a-zA-Z]*\b.*?(\n.*?)*?(?=\n\n|\Z)'

def preprocess_text(text, stop_keywords):
    """Preprocess text by stopping at the earliest occurrence of any keyword, ignoring case."""
    earliest_position = len(text)  # Default to the end of the text
    
    for keyword in stop_keywords:
        pattern = rf"\b{re.escape(keyword)}\b"  # Match exact word boundaries
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            earliest_position = min(earliest_position, match.start())
    
    return text[:earliest_position]

def clean_text(block_text):
    """Clean and process individual blocks of text."""
    block_text = " ".join(block_text.split())  # Remove extra spaces and newlines
    block_text = re.sub(r'^\d+\.\s+', '', block_text)  # Remove section numbering
    block_text = re.sub(r'[^A-Za-z0-9\s,.]', '', block_text)  # Keep only basic punctuation and letters

    if len(block_text) < 10 or re.match(r'Page \d+', block_text):
        return None  # Skip short blocks or page numbers

    if re.search(r'\d{2,}', block_text) and re.search(r'(\d+\.\d+|\d+%)', block_text):
        return None  # Skip blocks with many numeric values

    if re.search(r'\b(Table|Effect|Mediation|Summary|IV|Mediator|Cont)\b', block_text, re.IGNORECASE):
        return None

    return block_text

### Extract figures and save them individually

In [10]:
def extract_figures(text):
    """Extract figure captions or descriptions from the text."""
    figures = re.findall(figure_pattern, text, re.IGNORECASE | re.DOTALL)
    return [" ".join(fig).strip() for fig in figures if len(" ".join(fig).strip()) > 5]  # Filter out short/no-content matches

def extract_images_from_pdf(pdf_path, output_dir, pdf_name):
    """Extract images from a PDF and save them to output_dir."""
    document = fitz.open(pdf_path)
    for page_idx in range(len(document)):
        page = document.load_page(page_idx)
        images = page.get_images(full=True)
        
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            
            # Save the image
            image_filename = output_dir / f"{pdf_name}_page{page_idx}_img{img_index}.{image_ext}"
            with image_filename.open("wb") as img_file:
                img_file.write(image_bytes)
            
            print(f"Saved image: {image_filename}")

    document.close()

### Using preprocess functions on documents

In [11]:
# Load and preprocess documents
folder_path = Path("./data")
documents = []

output_path = Path("./processed_documents.txt")  # File to save processed documents
figures_output_path = Path("./extracted_figures_summary.txt")  # Summary of extracted figures
figures_dir = Path("./extracted_figures")  # Directory to save each figure individually
figures_dir.mkdir(exist_ok=True)

with output_path.open("w", encoding="utf-8") as output_file, figures_output_path.open("w", encoding="utf-8") as figures_file:
    for pdf_file in folder_path.glob("*.pdf"):
        loader = PyMuPDFReader()
        loaded_docs = loader.load(file_path=str(pdf_file))
        print(f"{pdf_file.name}: {len(loaded_docs)} documents loaded")

        # Extract images directly from the PDF
        extract_images_from_pdf(pdf_file, figures_dir, pdf_file.stem)

        for doc_idx, doc in enumerate(loaded_docs):
            # Preprocess text
            processed_text = preprocess_text(doc.text, stop_keywords)
            
            # Extract figures from processed text
            figures = extract_figures(processed_text)

            for i, figure in enumerate(figures):
                figure_file_path = figures_dir / f"{pdf_file.stem}_doc{doc_idx}_fig{i}.txt"
                figure_file_path.write_text(figure, encoding="utf-8")  # Write figure caption to its own file

                # Write to the summary file
                figures_file.write(f"{pdf_file.name} - Document {doc_idx}:\n")
                figures_file.write(figure + "\n")
                figures_file.write("=" * 80 + "\n")
            
            print(f"Saved {len(figures)} figure captions for Document {doc_idx} from {pdf_file.name}")

            # Remove figures from text after extraction to clean up
            for figure in figures:
                processed_text = processed_text.replace(figure, "")

            # Clean and process the remaining text blocks
            cleaned_blocks = []
            for block in processed_text.split("\n"):
                cleaned_block = clean_text(block)
                if cleaned_block:
                    cleaned_blocks.append(cleaned_block)
            
            final_text = "\n".join(cleaned_blocks)
            doc.text = final_text  # Replace with cleaned and processed text
            doc.metadata = {"source": pdf_file.name}  # Add metadata
            documents.append(doc)
            
            # Write the processed text into the output file
            output_file.write(f"Document {doc_idx} from {pdf_file.name}:\n")
            output_file.write(final_text + "\n")
            output_file.write("=" * 80 + "\n")  # Separator between documents

print(f"Processed documents written to {output_path}")
print(f"Extracted figure captions written to {figures_output_path}")
print(f"Individual figure files and images saved in '{figures_dir}'")

5.pdf: 10 documents loaded
Saved image: extracted_figures/5_page0_img0.jpeg
Saved image: extracted_figures/5_page1_img0.jpeg
Saved image: extracted_figures/5_page2_img0.jpeg
Saved image: extracted_figures/5_page3_img0.jpeg
Saved image: extracted_figures/5_page4_img0.png
Saved image: extracted_figures/5_page4_img1.jpeg
Saved image: extracted_figures/5_page4_img2.jpeg
Saved image: extracted_figures/5_page5_img0.jpeg
Saved image: extracted_figures/5_page6_img0.jpeg
Saved image: extracted_figures/5_page7_img0.jpeg
Saved image: extracted_figures/5_page8_img0.jpeg
Saved image: extracted_figures/5_page9_img0.jpeg
Saved 0 figure captions for Document 0 from 5.pdf
Saved 0 figure captions for Document 1 from 5.pdf
Saved 0 figure captions for Document 2 from 5.pdf
Saved 0 figure captions for Document 3 from 5.pdf
Saved 0 figure captions for Document 4 from 5.pdf
Saved 0 figure captions for Document 5 from 5.pdf
Saved 0 figure captions for Document 6 from 5.pdf
Saved 0 figure captions for Document

In [12]:
print(documents[65])
print(len(documents))

Doc ID: cb9f37e7-e331-48fb-8a9e-a231116b5402
Text: MENGI ELIK et al. Roininen, K., Tuorila, H., Zandstra, E., de
Graaf, C., Vehkalahti, K., Stubenitsky, K.,  Mela, D. J. 2001.
Differences in health and taste attitudes and reported behaviour among
Finnish, Dutch and British consumers A crossnational validation of the
health and taste atti tude scales HTAS. Appetite, 371, 3345. Steptoe,
A., Wardle...
66


### Process Pictures

- We will delete the pictures that are smaller than 50KB, since they do not carry any info.

In [13]:
from pathlib import Path

# Define the directory containing extracted images
figures_dir = Path("./extracted_figures")
size_threshold = 50 * 1024  # 50 KB in bytes

# Iterate through all files in the directory
for file in figures_dir.iterdir():
    if file.is_file() and file.suffix.lower() in {'.png', '.jpeg', '.jpg'}:  # Check for image files
        file_size = file.stat().st_size
        if file_size < size_threshold:
            print(f"Deleting {file.name} (Size: {file_size / 1024:.2f} KB)")
            file.unlink()  # Delete the file

print("Cleanup complete: Deleted all image files below 50KB.")

Deleting 5_page0_img0.jpeg (Size: 40.48 KB)
Deleting 4_page7_img0.jpeg (Size: 24.49 KB)
Deleting 5_page4_img0.png (Size: 1.33 KB)
Deleting 3_page14_img0.png (Size: 3.06 KB)
Deleting 5_page6_img0.jpeg (Size: 12.19 KB)
Deleting 5_page1_img0.jpeg (Size: 12.19 KB)
Deleting 3_page0_img1.jpeg (Size: 1.99 KB)
Deleting 5_page4_img2.jpeg (Size: 12.19 KB)
Deleting 5_page7_img0.jpeg (Size: 12.19 KB)
Deleting 3_page0_img2.jpeg (Size: 3.38 KB)
Deleting 5_page2_img0.jpeg (Size: 12.19 KB)
Deleting 5_page9_img0.jpeg (Size: 12.19 KB)
Deleting 1_page0_img0.png (Size: 6.30 KB)
Deleting 4_page0_img1.png (Size: 11.99 KB)
Deleting 4_page0_img0.png (Size: 4.41 KB)
Deleting 3_page0_img3.png (Size: 15.12 KB)
Deleting 4_page0_img2.png (Size: 12.10 KB)
Deleting 5_page3_img0.jpeg (Size: 12.19 KB)
Deleting 4_page0_img3.png (Size: 0.76 KB)
Deleting 3_page0_img0.png (Size: 1.33 KB)
Deleting 5_page8_img0.jpeg (Size: 12.19 KB)
Deleting 4_page0_img4.png (Size: 6.30 KB)
Deleting 5_page5_img0.jpeg (Size: 12.19 KB)
Cleanu

### Splitting Documents

In [14]:
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

In [15]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    #print(f"Document {doc_idx} has {len(cur_text_chunks)} chunks")
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))  

In [16]:
print(len(text_chunks))

88


### Construct Nodes from Text Chunks

In [17]:
from llama_index.core.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
        metadata={
            **documents[doc_idxs[idx]].metadata,  # This now includes 'source'
            "doc_index": doc_idxs[idx]  # Optional: document index
        }
    )
    nodes.append(node)

In [18]:
nodes[1]

TextNode(id_='40d38847-4616-4b13-a478-bd9c9bca3cde', embedding=None, metadata={'source': '5.pdf', 'doc_index': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='This change in the lifestyle of the population is partly, due to\nthe need for measures to contain the spread of COVID19, such\nas social isolation. This has proven to be effective8 and has led\nmany people to perform their work remotely.9 This mobility\nrestriction has had direct effects on psychological factors, such\nas an increase in cases of anxiety and depression and a reduction\nin the practice of physical activities.1012 In addition, eating\nhabits were also inuenced both by economic factors, due to the\nreduction in the populations income, as well as by the\nconsumption of foods with higher energy density.13,14\nIn the period before the pandemic, the consumption of fresh\nand minimally processed foods represented approximately 70\nof the total caloric intake by the Brazilian po

### Generate Embeddings for each Node (Text Nodes)

Using sentence_transformers

In [19]:
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="text")
    )
    node.embedding = node_embedding

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
print(type(nodes[0].embedding))
print(nodes[0].embedding)
print(len(nodes))

<class 'list'>
[0.002242590067908168, 0.025284932926297188, 0.03787685185670853, 0.017316807061433792, 0.035318728536367416, 0.045880142599344254, 0.02748068794608116, -0.004848837852478027, 0.00520316930487752, -0.02245437726378441, -0.012843466363847256, -0.07361365854740143, 0.0024977654684334993, 0.016995027661323547, 0.0012620296329259872, -0.028180761262774467, -0.0007740127039141953, -0.02978023886680603, 0.00105327891651541, 0.019632909446954727, -0.015793396160006523, -0.012700777500867844, -0.015133718959987164, -0.005372608546167612, 0.041983943432569504, 0.017965346574783325, -0.01487655844539404, -0.03357880935072899, -0.07126981765031815, -0.20361019670963287, -0.021399321034550667, -0.03841941058635712, 0.03099704347550869, -0.01860302872955799, -0.057519495487213135, -0.005936900153756142, 0.03111499361693859, 0.016658559441566467, -0.00020628668426070362, -0.011791287921369076, 0.0005544592277146876, 0.042607758194208145, 0.04287157580256462, -0.04061812534928322, -0.0

### Generate Embeddings for Images

- We will use LLaVA model to generate summaries from images.

In [21]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import torch
from pathlib import Path

# Load LLaVA processor and model
model_name = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_name)
model = LlavaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████| 3/3 [00:11<00:00,  3.82s/it]


In [22]:
# Check device and use it
device = "mps" if torch.has_mps else "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Directory of extracted figures
figures_dir = Path("./extracted_figures")
image_summaries = {}

# Generate summaries for each image (both .png and .jpeg)
for image_file in figures_dir.glob("*"):
    if image_file.suffix.lower() not in [".png", ".jpeg", ".jpg"]:
        continue  # Skip files that are not images

    image = Image.open(image_file).convert("RGB")
    
    # Define the conversation template for generating a detailed description
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},  # Image placeholder
                {"type": "text", "text": "What is shown in this image?"}
            ],
        }
    ]
    
    # Apply chat template and prepare inputs
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=[text_prompt], images=[image], return_tensors="pt").to(device)  # Ensure inputs are on the same device
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    
    summary = processor.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the assistant's response
    assistant_reply = summary.split("ASSISTANT:")[1].strip() if "ASSISTANT:" in summary else summary
    image_summaries[image_file.name] = assistant_reply
    print(f"Generated summary for {image_file.name}:\n{assistant_reply}\n")

# Save summaries to file
summary_output_path = Path("./image_summaries.txt")
with summary_output_path.open("w", encoding="utf-8") as f:
    for image_name, summary in image_summaries.items():
        f.write(f"{image_name}:\n{summary}\n{'='*80}\n")

print(f"Image summaries saved to {summary_output_path}")

  device = "mps" if torch.has_mps else "cuda" if torch.cuda.is_available() else "cpu"
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Generated summary for 1_page4_img1.jpeg:
The image shows a bar graph or a chart with a red line and a green line, representing two different things. The red line is likely a percentage, while the green line is a different measurement. The chart is divided into sections, with each section showing the corresponding values of the two different measurements.

Generated summary for 1_page4_img0.jpeg:
The image shows a chart or graph displaying various health-related statistics, such as the number of calories burned, the number of calories consumed, and the number of calories needed for a healthy diet. The chart is color-coded, with green representing a positive outcome, red representing a negative outcome, and yellow representing a neutral outcome.

Generated summary for 1_page5_img0.jpeg:
The image shows a flowchart or diagram that illustrates the process of healthy eating.

Generated summary for 1_page5_img1.png:
The image shows a diagram or flowchart that illustrates the process of liste

### Embeddings of Image Summaries

In [23]:
from pathlib import Path

# Path to the image summaries file
summaries_path = Path("./image_summaries.txt")

# Dictionary to store image summaries
image_summaries = {}

# Read the summaries file and populate the dictionary
with summaries_path.open("r", encoding="utf-8") as f:
    lines = f.readlines()
    current_image = None
    current_summary = []
    
    for line in lines:
        line = line.strip()
        if line.endswith(":") and not line.startswith("="):  # Detect image name
            if current_image and current_summary:
                image_summaries[current_image] = " ".join(current_summary).strip()
                current_summary = []  # Reset for the next summary
            current_image = line[:-1]  # Remove trailing colon
        elif not line.startswith("="):  # Skip separator lines
            current_summary.append(line)
    
    # Add the last image and its summary
    if current_image and current_summary:
        image_summaries[current_image] = " ".join(current_summary).strip()

# Print the loaded summaries for verification
for image, summary in image_summaries.items():
    print(f"{image}: {summary}")

1_page4_img1.jpeg: The image shows a bar graph or a chart with a red line and a green line, representing two different things. The red line is likely a percentage, while the green line is a different measurement. The chart is divided into sections, with each section showing the corresponding values of the two different measurements.
1_page4_img0.jpeg: The image shows a chart or graph displaying various health-related statistics, such as the number of calories burned, the number of calories consumed, and the number of calories needed for a healthy diet. The chart is color-coded, with green representing a positive outcome, red representing a negative outcome, and yellow representing a neutral outcome.
1_page5_img0.jpeg: The image shows a flowchart or diagram that illustrates the process of healthy eating.
1_page5_img1.png: The image shows a diagram or flowchart that illustrates the process of listening to the body.
1_page8_img0.jpeg: The image shows a chart or graph displaying various st

In [24]:
from llama_index.core.schema import TextNode
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Initialize nodes for image summaries
image_summary_nodes = []

for image_name, summary in image_summaries.items():
    # Generate embedding for the image summary
    node_embedding = embed_model.get_text_embedding(summary)
    
    # Create a TextNode instance
    node = TextNode(
        text=summary,
        embedding=node_embedding,
        metadata={
            "source": image_name,  # The image file name
            "type": "image_summary",  # To distinguish from text nodes
        }
    )
    
    image_summary_nodes.append(node)

# Display one of the nodes for verification
print(image_summary_nodes[0])

Node ID: 1a6ff602-e56b-407f-82b2-a2bdfb71fae9
Text: The image shows a bar graph or a chart with a red line and a
green line, representing two different things. The red line is likely
a percentage, while the green line is a different measurement. The
chart is divided into sections, with each section showing the
corresponding values of the two different measurements.


### Load Nodes (Text Embeddings) and Image Summary Embeddings into a Vector Store

We now insert these nodes into our `PostgresVectorStore`.

In [47]:
vector_store.add(nodes)

['073ac14d-832b-402f-835c-95be3062a4b4',
 '40d38847-4616-4b13-a478-bd9c9bca3cde',
 '8ce8be3c-8449-4f21-b9ba-0496477a99c3',
 'cdc97696-283a-4cae-be0f-7e7cbf424f83',
 '1680cb77-82cf-4c65-bc6b-ae3c976eecf7',
 'aab88b74-ee51-46ee-9835-70da5cc489b1',
 '6ac60b8d-ff2c-45bd-9f2b-bd5f21357022',
 'c4b3cde7-8ee5-4306-8cf6-d1b47ce67b07',
 '2d2c7fcd-d682-45ea-9163-65102f4d1be0',
 'fd50b855-5507-4a75-838a-03b3d8ed41cc',
 'd4183482-1b33-41a6-9906-b57a27cb6448',
 '90a5c433-610c-4a5d-b86c-f4ffe7995b32',
 'e94973f7-72e8-4bf7-8b9a-9ed623a03fef',
 '6a83e236-8e51-4b31-b3af-53223c5c2b82',
 'cb66a10f-404a-4653-9197-951538109ebd',
 '7c930380-c039-4385-85fc-c53f8d2a1029',
 'fb6eb644-8e66-42cb-a874-94f3729dc51e',
 '459d362a-b4f3-4182-ae1f-4c4898b0d995',
 '7641d8f3-ec2f-4340-912b-9e4a13ff7a88',
 '3aecd6bf-f2f2-429a-a4c0-131eec81684f',
 '4b17c549-448b-41f5-9aaa-f5be0ae82df1',
 '7d00053b-e167-4c3f-8f1c-5a6c651e30b0',
 'ed07c959-39f6-4e5b-a7c6-fbd31cc96c4f',
 '3ff96fca-8923-4efd-9c2b-d4051e01070e',
 'cdc08d89-00de-

In [48]:
vector_store.add(image_summary_nodes)

['1a6ff602-e56b-407f-82b2-a2bdfb71fae9',
 'a0b3ad1d-d2ac-46c0-96c3-969f2445c72e',
 '92fac8e5-a27a-47e5-a7d9-f9d644a945a2',
 '93c32a1a-bf53-4042-a508-3e33313b1a63',
 '7e0b70a6-6ac7-4143-948e-283ec5d51987',
 'a9ac4995-ae92-4154-b1e7-6feb921de866',
 '91a36c85-4f83-406d-bde1-cfa49760da2f',
 '5b541f41-319f-4745-9a8d-0d739cfa2bc1']

In [49]:
# Create SQLAlchemy engine
engine = create_engine(f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['database']}")

# Query to count rows in the vector store table
table_name = "data_pmc_table"  # Adjust the table name as necessary

with engine.connect() as conn:
    result = conn.execute(text(f"SELECT COUNT(*) FROM {table_name};"))
    node_count = result.scalar()
    print(f"Number of nodes in vector store: {node_count}")

Number of nodes in vector store: 96


## Retrieval Pipeline

In [50]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List
from llama_index.core.schema import NodeWithScore
from typing import Optional
from llama_index.core.vector_stores import VectorStoreQuery

In [51]:
class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store."""

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Init params."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve."""
        query_embedding = embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [52]:
retriever = VectorDBRetriever(
    vector_store, embed_model, query_mode="default", similarity_top_k=2
)

retriever

<__main__.VectorDBRetriever at 0x681192050>

## Response

In [53]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [54]:
query_str = "Which individuals play a central role in promoting healthy eating?"

response = query_engine.query(query_str)

Llama.generate: 13 prefix-match hit, remaining 1464 prompt tokens to eval

llama_print_timings:        load time =   18228.46 ms
llama_print_timings:      sample time =       1.33 ms /    40 runs   (    0.03 ms per token, 30097.82 tokens per second)
llama_print_timings: prompt eval time =   34779.31 ms /  1464 tokens (   23.76 ms per token,    42.09 tokens per second)
llama_print_timings:        eval time =    5970.56 ms /    39 runs   (  153.09 ms per token,     6.53 tokens per second)
llama_print_timings:       total time =   40783.47 ms /  1503 tokens


In [55]:
print(str(response))

 Professionals in the fields of health and food, such as dietitians, food scientists, and technologists, play a central role in promoting healthy eating.


In [56]:
print(response.source_nodes[0].get_content())

Nutrients 2024, 16, 1365
reaching 16 of the adult population in 2022 4. At the same time, nowadays, cardiovascular
diseases are the main cause of death globally, representing 32 of all global deaths 5. Parallel
and paradoxically, thinness has become progressively valued and fatphobia, the discrimina
tion and stigmatization against fat individuals, has become a phenomenon widely present
worldwide 68. Finally, food crises with important health, social and economic international
impacts, such as the mad cow, have also increased concerns about food consumption 9.
Attempts to define what constitutes a healthy diet and provide dietary recommenda
tions have been made in different scientific fields and organizations, including the World
Health Organization WHO. In response to the increase in prevalence rates of morbidity
and mortality associated with chronic noncommunicable diseases, in 2004 the WHO ap
proved the Global Strategy on Diet, Physical Activity and Health, which invites member
state

In [57]:
query_str = "What does nutrition literacy significantly predicted in the social-ecological framework?"

response = query_engine.query(query_str)

Llama.generate: 13 prefix-match hit, remaining 2159 prompt tokens to eval

llama_print_timings:        load time =   18228.46 ms
llama_print_timings:      sample time =       0.96 ms /    27 runs   (    0.04 ms per token, 28272.25 tokens per second)
llama_print_timings: prompt eval time =   52003.63 ms /  2159 tokens (   24.09 ms per token,    41.52 tokens per second)
llama_print_timings:        eval time =    4267.45 ms /    26 runs   (  164.13 ms per token,     6.09 tokens per second)
llama_print_timings:       total time =   56297.77 ms /  2185 tokens


In [58]:
print(str(response))

 Based on the context information, nutrition literacy significantly predicted healthy eating behavior in the social-ecological framework.


In [59]:
response.source_nodes[0]

NodeWithScore(node=TextNode(id_='bce95773-ee38-437d-955b-7f5a473b2def', embedding=None, metadata={'source': '4.pdf', 'doc_index': 20}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Lai, I.J. Chang, L.C. Nutrition literacy is associated with healthyeating behaviour among college students in Taiwan.\nHealth Educ. J. 2019, 78, 756769. CrossRef\nYuen, E. Thomson, M. Gardiner, H. Measuring Nutrition and Food Literacy in Adults A Systematic Review and Appraisal of\nExisting Measurement Tools. HLRP Health Lit. Res. Pract. 2018, 2, e134e160. CrossRef PubMed\nSilk, K.J. Sherry, J. Winn, B. Keesecker, N. Horodynski, M.A. Sayir, A. Increasing Nutrition Literacy Testing the Effectiveness\nof Print, Web site, and Game Modalities. J. Nutr. Educ. Behav. 2008, 40, 310. CrossRef\nAihara, Y. Minai, J. Barriers and catalysts of nutrition literacy among elderly Japanese people. Health Promot. Int. 2011, 26,\n421431. CrossRef\nZoellner, J. Connell, C. Bounds, W. Cr

In [60]:
print(response.source_nodes[0].get_content())

Lai, I.J. Chang, L.C. Nutrition literacy is associated with healthyeating behaviour among college students in Taiwan.
Health Educ. J. 2019, 78, 756769. CrossRef
Yuen, E. Thomson, M. Gardiner, H. Measuring Nutrition and Food Literacy in Adults A Systematic Review and Appraisal of
Existing Measurement Tools. HLRP Health Lit. Res. Pract. 2018, 2, e134e160. CrossRef PubMed
Silk, K.J. Sherry, J. Winn, B. Keesecker, N. Horodynski, M.A. Sayir, A. Increasing Nutrition Literacy Testing the Effectiveness
of Print, Web site, and Game Modalities. J. Nutr. Educ. Behav. 2008, 40, 310. CrossRef
Aihara, Y. Minai, J. Barriers and catalysts of nutrition literacy among elderly Japanese people. Health Promot. Int. 2011, 26,
421431. CrossRef
Zoellner, J. Connell, C. Bounds, W. Crook, L. Yadrick, K. Nutrition Literacy Status and Preferred Nutrition Communication
Channels among Adults in the Lower Mississippi Delta. Prev. Chronic Dis. 2009, 6, A128.
between Demographic Variables and HealthPromoting Behaviour

## Evaluation

RAGAS, DeepEval will be checked, however, we do not want to use GPT for the evaluator. 