# Open-Source only RAG

- Sentence Transformers as embedding model
- Postgres as vector store
- Llama 2 as LLM

#### Sentence Transformers

In [1]:
#%pip install llama-index-readers-file pymupdf
#%pip install llama-index-vector-stores-postgres
#%pip install llama-index-embeddings-huggingface
#%pip install llama-index-llms-llama-cpp

In [2]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

  from .autonotebook import tqdm as notebook_tqdm


#### Llama CPP

In [3]:
#!pip install llama-cpp-python

In [4]:
from llama_index.llms.llama_cpp import LlamaCPP

# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)




llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/busraoguzoglu/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:      

#### Postgres

In [5]:
#!pip install psycopg2-binary pgvector asyncpg "sqlalchemy[asyncio]" greenlet

In [6]:
import psycopg2

# Define your database connection parameters
db_name = "rag_db"  # Use the existing database name here
host = "localhost"
password = "password"  # Replace with your actual PostgreSQL password
port = "5432"  # Default PostgreSQL port
user = "myuser"  # Replace with your actual PostgreSQL username

# Connect directly to the 'rag_db' database
conn = psycopg2.connect(
    dbname=db_name,  # Connect directly to 'rag_db'
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

# Optionally, perform any operations on 'rag_db' using a cursor
with conn.cursor() as c:
    # Drop and create operations are not needed if the database already exists
    # Here you can perform other database setup actions if necessary
    print(f"Connected to {db_name} successfully.")

# Close the connection when done
#conn.close()

Connected to rag_db successfully.


Create table (checks if exist, truncates if already exist and generates again)

In [7]:
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError
from llama_index.vector_stores.postgres import PGVectorStore

# Connection parameters
db_params = {
    "user": "myuser",      # Replace with your actual username
    "password": "password",  # Replace with your actual password
    "host": "localhost",   # Adjust if your database is hosted elsewhere
    "port": "5432",        # Default PostgreSQL port
    "database": "rag_db"   # Replace with your actual database name
}

# Create SQLAlchemy engine
engine = create_engine(f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['database']}")

# Update table name to include 'data_' prefix
table_name = "data_pmc_table"  # Ensure this matches the actual table name in your database
embed_dim = 384  # Embedding dimension

try:
    with engine.connect() as conn:
        # Check if the table exists
        result = conn.execute(text(f"""
            SELECT EXISTS (
                SELECT FROM information_schema.tables 
                WHERE table_schema = 'public' AND table_name = :table_name
            );
        """), {"table_name": table_name}).scalar()

        if result:
            print(f"Table '{table_name}' exists. Cleaning it...")
            conn.execute(text(f"TRUNCATE TABLE {table_name};"))
        else:
            print(f"Table '{table_name}' does not exist. Creating it...")

        # Create a new table for vector store
        vector_store = PGVectorStore.from_params(
            database=db_params["database"],
            host=db_params["host"],
            password=db_params["password"],
            port=db_params["port"],
            user=db_params["user"],
            table_name="pmc_table",  # Use unprefixed name; PGVectorStore adds 'data_' automatically
            embed_dim=embed_dim,
        )
        print(f"Table '{table_name}' has been created.")
except OperationalError as e:
    print(f"Error: {e}")
    print("Make sure the database exists and connection parameters are correct.")

Table 'data_pmc_table' exists. Cleaning it...
Table 'data_pmc_table' has been created.


## Ingestion Pipeline

### Data Loading and Preprocessing

In [8]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import SentenceSplitter

In [9]:
import re
# Define stop keywords
stop_keywords = ["References", "REFERENCES", "references", "Conflict of interest", "Conflicts of Interest", "Conflicts"]

def preprocess_text(text, stop_keywords):
    """Preprocess text by stopping at the earliest occurrence of any keyword, ignoring case."""
    earliest_position = len(text)  # Default to the end of the text
    
    for keyword in stop_keywords:
        pattern = rf"\b{re.escape(keyword)}\b"  # Match exact word boundaries
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            print(f"Keyword '{keyword}' found at position {match.start()}")  # Debugging
            earliest_position = min(earliest_position, match.start())
    
    # Truncate the text at the earliest keyword match
    return text[:earliest_position]

def clean_text(block_text):
    """Clean and process individual blocks of text."""
    block_text = " ".join(block_text.split())  # Remove extra spaces and newlines
    block_text = re.sub(r'^\d+\.\s+', '', block_text)  # Remove section numbering
    block_text = re.sub(r'[^A-Za-z0-9\s,.]', '', block_text)  # Keep only basic punctuation and letters
    
    if len(block_text) < 10 or re.match(r'Page \d+', block_text):
        return None  # Skip short blocks or page numbers
    return block_text

In [10]:
# Load and preprocess documents
folder_path = Path("./data")
documents = []

output_path = Path("./processed_documents.txt")  # File to save processed documents
with output_path.open("w", encoding="utf-8") as output_file:
    for pdf_file in folder_path.glob("*.pdf"):
        loader = PyMuPDFReader()
        loaded_docs = loader.load(file_path=str(pdf_file))
        print(f"{pdf_file.name}: {len(loaded_docs)} documents loaded")

        for doc_idx, doc in enumerate(loaded_docs):
            # Preprocess text
            processed_text = preprocess_text(doc.text, stop_keywords)
            
            # Clean and process each block of text
            cleaned_blocks = []
            for block in processed_text.split("\n"):
                cleaned_block = clean_text(block)
                if cleaned_block:
                    cleaned_blocks.append(cleaned_block)
            
            final_text = "\n".join(cleaned_blocks)
            doc.text = final_text  # Replace with cleaned and processed text
            doc.metadata = {"source": pdf_file.name}  # Add metadata
            documents.append(doc)
            
            # Write the processed text into the output file
            output_file.write(f"Document {doc_idx} from {pdf_file.name}:\n")
            output_file.write(final_text + "\n")
            output_file.write("="*80 + "\n")  # Separator between documents

print(f"Processed documents written to {output_path}")

5.pdf: 10 documents loaded
Keyword 'References' found at position 2699
Keyword 'REFERENCES' found at position 2699
Keyword 'references' found at position 2699
4.pdf: 12 documents loaded
Keyword 'References' found at position 1735
Keyword 'REFERENCES' found at position 1735
Keyword 'references' found at position 1735
1.pdf: 18 documents loaded
Keyword 'References' found at position 2673
Keyword 'REFERENCES' found at position 2673
Keyword 'references' found at position 2673
Keyword 'Conflicts of Interest' found at position 2604
Keyword 'Conflicts' found at position 2604
3.pdf: 16 documents loaded
Keyword 'References' found at position 1400
Keyword 'REFERENCES' found at position 1400
Keyword 'references' found at position 1400
2.pdf: 10 documents loaded
Keyword 'References' found at position 206
Keyword 'REFERENCES' found at position 206
Keyword 'references' found at position 206
Processed documents written to processed_documents.txt


In [11]:
print(documents[65])
print(len(documents))

Doc ID: 6572160f-9e7d-4bb4-99e9-1160a2ef91e6
Text: MENGI ELIK et al. Roininen, K., Tuorila, H., Zandstra, E., de
Graaf, C., Vehkalahti, K., Stubenitsky, K.,  Mela, D. J. 2001.
Differences in health and taste attitudes and reported behaviour among
Finnish, Dutch and British consumers A crossnational validation of the
health and taste atti tude scales HTAS. Appetite, 371, 3345. Steptoe,
A., Wardle...
66


### Splitting Documents

In [12]:
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

In [13]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    #print(f"Document {doc_idx} has {len(cur_text_chunks)} chunks")
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))  

In [14]:
print(len(text_chunks))

94


### Construct Nodes from Text Chunks

In [15]:
from llama_index.core.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
        metadata={
            **documents[doc_idxs[idx]].metadata,  # This now includes 'source'
            "doc_index": doc_idxs[idx]  # Optional: document index
        }
    )
    nodes.append(node)

In [16]:
nodes[1]

TextNode(id_='3cd5fa71-ad3a-48a6-9e0f-8d4b23e54701', embedding=None, metadata={'source': '5.pdf', 'doc_index': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='This change in the lifestyle of the population is partly, due to\nthe need for measures to contain the spread of COVID19, such\nas social isolation. This has proven to be effective8 and has led\nmany people to perform their work remotely.9 This mobility\nrestriction has had direct effects on psychological factors, such\nas an increase in cases of anxiety and depression and a reduction\nin the practice of physical activities.1012 In addition, eating\nhabits were also inuenced both by economic factors, due to the\nreduction in the populations income, as well as by the\nconsumption of foods with higher energy density.13,14\nIn the period before the pandemic, the consumption of fresh\nand minimally processed foods represented approximately 70\nof the total caloric intake by the Brazilian po

### Generate Embeddings for each Node

Using sentence_transformers

In [17]:
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="text")
    )
    node.embedding = node_embedding

In [18]:
print(type(nodes[0].embedding))
print(nodes[0].embedding)
print(len(nodes))

<class 'list'>
[0.0003987177333328873, 0.02646005153656006, 0.03513289615511894, 0.007168531883507967, 0.041294097900390625, 0.0480182059109211, 0.023377425968647003, -0.004793648142367601, 0.00036878735409118235, -0.017007548362016678, -0.01259664911776781, -0.07230765372514725, 0.00043807117617689073, 0.019409792497754097, -0.006811037659645081, -0.024709824472665787, 0.0009681543451733887, -0.022223491221666336, 0.003708706935867667, 0.01637529581785202, -0.014613424427807331, -0.010371314361691475, -0.019866863265633583, -0.012201068922877312, 0.04210752621293068, 0.02097344398498535, -0.02411791682243347, -0.041234202682971954, -0.06391127407550812, -0.197813481092453, -0.025465227663517, -0.043725188821554184, 0.0314055010676384, -0.011092374101281166, -0.055188607424497604, -0.011988278478384018, 0.03101562149822712, 0.014969510026276112, -0.010766901075839996, -0.0149585772305727, -0.0023336694575846195, 0.029446586966514587, 0.03295325115323067, -0.040839437395334244, -0.02298

### Load Nodes into a Vector Store

We now insert these nodes into our `PostgresVectorStore`.

In [19]:
vector_store.add(nodes)

['71688eda-d915-4765-ba15-4f4dfb6fe308',
 '3cd5fa71-ad3a-48a6-9e0f-8d4b23e54701',
 '3160bb3d-9877-4d52-a24c-fe0be93f78bf',
 '959d0e1f-da1a-4e06-ba8f-26d41ef8f044',
 '6297f8ca-6860-4ab8-bc7d-60f14f9bc0b5',
 '7c493d21-4b99-4c51-9f4c-839722f403b5',
 'a11fa909-407e-4078-9337-753385b09eb7',
 '7fa9ef7b-8908-41de-8e18-f487d52e881d',
 '900284ba-b621-456c-92a0-a10d4d859f3b',
 '5a3bf6d7-7026-47e7-83c1-3a078b3d3374',
 '6059024d-319d-4e63-b78c-b03a713494e3',
 'e742c6ca-52cb-4fb9-831d-fb130d48aa2f',
 '4a242c37-e5cb-4c74-ad8d-aa419b766d95',
 '39512d26-0772-4ad3-a29b-db6e8fddc99e',
 '74256f4a-46a5-4124-82f5-0ec3b410294c',
 '878a01c4-045e-479f-8e54-56d781c19802',
 '112383c8-d235-40dd-aec5-4930cc4b71f4',
 '722ce630-0cef-4a5c-a242-fd3b4ef4343a',
 '2e271b04-f119-4a91-8a3b-321e7e143e67',
 'fae95693-d22a-4bcb-9eb5-d4aa41359547',
 'bb9f01a6-d25d-47c8-a6ac-fd6a58b91338',
 '033d7a2b-a713-490a-837d-2401cf3bfef4',
 'de3d0b81-c0a4-43ff-bd17-9873a70ef581',
 'ad8d864c-a11a-4761-85f6-885a9824d10b',
 'a49543f1-e810-

## Retrieval Pipeline

In [20]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List
from llama_index.core.schema import NodeWithScore
from typing import Optional
from llama_index.core.vector_stores import VectorStoreQuery

In [21]:
class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store."""

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Init params."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve."""
        query_embedding = embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [22]:
retriever = VectorDBRetriever(
    vector_store, embed_model, query_mode="default", similarity_top_k=2
)

retriever

<__main__.VectorDBRetriever at 0x66088ab90>

## Response

In [23]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [24]:
query_str = "Which individuals play a central role in promoting healthy eating?"

response = query_engine.query(query_str)


llama_print_timings:        load time =   10208.44 ms
llama_print_timings:      sample time =       0.36 ms /    14 runs   (    0.03 ms per token, 39325.84 tokens per second)
llama_print_timings: prompt eval time =   10207.75 ms /   476 tokens (   21.44 ms per token,    46.63 tokens per second)
llama_print_timings:        eval time =    1439.32 ms /    13 runs   (  110.72 ms per token,     9.03 tokens per second)
llama_print_timings:       total time =   11652.69 ms /   489 tokens


In [25]:
print(str(response))



Please provide the answer based on the given context information.


In [26]:
print(response.source_nodes[0].get_content())

An exploratory and descriptive crosssectional study
was conducted using a qualitative and quantitative methodology and convenience sampling. Two
focus groups and a questionnaire were utilized 300 participants from all academic years completed
the survey. Differences in definitions of healthy eating and perceived barriers were found between
genders and students at different stages of training p  0.05. In their understanding of healthy eating,
the students placed importance on balance, variety, moderation, and individual factors. Although
students considered it easy to follow a healthy diet, familys eating habits, time availability, and
emotional states were found to be the main barriers to the implementation of healthy practices. The
obtained data supports the need to critically address perceptions of healthy eating throughout the
training of nutrition and food science professionals. The insights obtained on the perceived barriers
highlight the importance of considering both individual 

In [27]:
query_str = "What does nutrition literacy significantly predicted in the social-ecological framework?"

response = query_engine.query(query_str)

Llama.generate: 13 prefix-match hit, remaining 1253 prompt tokens to eval

llama_print_timings:        load time =   10208.44 ms
llama_print_timings:      sample time =       3.03 ms /   116 runs   (    0.03 ms per token, 38233.36 tokens per second)
llama_print_timings: prompt eval time =   26838.37 ms /  1253 tokens (   21.42 ms per token,    46.69 tokens per second)
llama_print_timings:        eval time =   13079.78 ms /   115 runs   (  113.74 ms per token,     8.79 tokens per second)
llama_print_timings:       total time =   39964.62 ms /  1368 tokens


In [28]:
print(str(response))

 Based on the provided context information, nutrition literacy significantly predicted adherence to healthy-unhealthy diet patterns in adults with a nutrition-related chronic condition, as well as health-promoting behaviors among college students in Taiwan. Additionally, nutrition literacy was found to be associated with eating habits and BMI in adolescents in Lebanon. Overall, nutrition literacy appears to play an important role in the social-ecological framework of health-promoting behaviors.


In [29]:
response.source_nodes[0]

NodeWithScore(node=TextNode(id_='bccff988-f457-4cad-946b-037732a0d87c', embedding=None, metadata={'source': '4.pdf', 'doc_index': 20}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Lai, I.J. Chang, L.C. Nutrition literacy is associated with healthyeating behaviour among college students in Taiwan.\nHealth Educ. J. 2019, 78, 756769. CrossRef\nYuen, E. Thomson, M. Gardiner, H. Measuring Nutrition and Food Literacy in Adults A Systematic Review and Appraisal of\nExisting Measurement Tools. HLRP Health Lit. Res. Pract. 2018, 2, e134e160. CrossRef PubMed\nSilk, K.J. Sherry, J. Winn, B. Keesecker, N. Horodynski, M.A. Sayir, A. Increasing Nutrition Literacy Testing the Effectiveness\nof Print, Web site, and Game Modalities. J. Nutr. Educ. Behav. 2008, 40, 310. CrossRef\nAihara, Y. Minai, J. Barriers and catalysts of nutrition literacy among elderly Japanese people. Health Promot. Int. 2011, 26,\n421431. CrossRef\nZoellner, J. Connell, C. Bounds, W. Cr

In [30]:
print(response.source_nodes[0].get_content())

Lai, I.J. Chang, L.C. Nutrition literacy is associated with healthyeating behaviour among college students in Taiwan.
Health Educ. J. 2019, 78, 756769. CrossRef
Yuen, E. Thomson, M. Gardiner, H. Measuring Nutrition and Food Literacy in Adults A Systematic Review and Appraisal of
Existing Measurement Tools. HLRP Health Lit. Res. Pract. 2018, 2, e134e160. CrossRef PubMed
Silk, K.J. Sherry, J. Winn, B. Keesecker, N. Horodynski, M.A. Sayir, A. Increasing Nutrition Literacy Testing the Effectiveness
of Print, Web site, and Game Modalities. J. Nutr. Educ. Behav. 2008, 40, 310. CrossRef
Aihara, Y. Minai, J. Barriers and catalysts of nutrition literacy among elderly Japanese people. Health Promot. Int. 2011, 26,
421431. CrossRef
Zoellner, J. Connell, C. Bounds, W. Crook, L. Yadrick, K. Nutrition Literacy Status and Preferred Nutrition Communication
Channels among Adults in the Lower Mississippi Delta. Prev. Chronic Dis. 2009, 6, A128.
Noroozi, A. Khademolhosseini, F. Lari, H. Tahmasebi, R. The