In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import lancedb
from sentence_transformers import SentenceTransformer
import pyarrow as pa
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Function to extract data from the PDF document
def load_pdf(data):
    loader = PyPDFLoader(data)
    documents = loader.load()
    return documents

In [3]:
# Apply the function on the pdf document
extracted_data = tqdm(load_pdf('Introduction to Nutrition Science, LibreTexts Project.pdf'))

  0%|          | 0/523 [00:00<?, ?it/s]

In [4]:
# Function to split the extrracted pdf documents into chunks using LangChain's text splitter
def split_pdf(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    return chunks

In [5]:
# length of chunks
chunks = split_pdf(extracted_data)
print("length of chunks:", len(chunks))

length of chunks: 3595


In [6]:
# page content of first page and last page
chunks[0].page_content, chunks[3594].page_content

('INTRODUCTION TO \nNUTRITION SCIENCE',
 '4.0\n16.3: The Food Industry - CC BY-NC-SA 4.0\n16.4: The Politics of Food - CC BY-NC-SA 4.0\n16.5: Food Cost and Inflation - CC BY-NC-SA 4.0\n16.6: The Issue of Food Security - CC BY-NC-SA 4.0\n16.7: Nutrition and Your Health - CC BY-NC-SA 4.0\n16.8: Diets around the World - CC BY-NC-SA 4.0\n16.E: Food Politics and Perspectives (Exercise) - CC\nBY-NC-SA 4.0\nBack Matter - CC BY-NC-SA 4.0\nIndex - CC BY-NC-SA 4.0\nGlossary - CC BY-NC-SA 4.0\nDetailed Licensing - CC BY-NC-SA 4.0')

In [7]:
# Function to generate embeddings using SentenceTransformer
def generate_embeddings(chunks, model_name: str = "all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    texts = [chunk.page_content for chunk in chunks]
    embeddings = model.encode(texts)
    return embeddings

In [8]:
# Generate embeddings for the chunks
embeddings = tqdm(generate_embeddings(chunks))



  0%|          | 0/3595 [00:00<?, ?it/s]

In [9]:
len(embeddings)

3595

#### Create a Lance database and insert the text and embeddings into it

In [10]:
def load_chunks_into_lancedb(chunks, embeddings, db_path: str, table_name: str):
    
    # Connecting to LanceDB
    db = lancedb.connect(db_path)
    
    # create a custom schema using pyarrow
    custom_schema = pa.schema([
        pa.field('chunk_id', pa.int32()),
        pa.field('text', pa.string()),
        pa.field('embedding', pa.list_(pa.float64()))
    ])

    # Create a table for the lancedb
    if table_name not in db.table_names():
        table = db.create_table(table_name, schema=custom_schema)
    
    # Create a DataFrame from chunks and embedding
    data = {
        "chunk_id": [],
        "text": [],
        "embedding": []
    }

    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        data["chunk_id"].append(i)
        data["text"].append(chunk.page_content)
        data["embedding"].append(embedding.tolist()) 

    df = pd.DataFrame(data)

    print(db.table_names())
    print(f"Inserted {len(chunks)} chunks with embeddings into LanceDB table '{table_name}'.")

    return df, table

* Load the chunks and embeddings into LanceDB function

In [11]:
# linux-native path on WSL
db_path = "/home/bluemusk/diet-assistant/lancedb"   
#db_path = '/mnt/c/Users/emman/desktop/diet-assistant/lancedb'
table_name = "diet_table"

df, table = tqdm(load_chunks_into_lancedb(chunks, embeddings, db_path, table_name))

['diet_table']
Inserted 3595 chunks with embeddings into LanceDB table 'diet_table'.


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# get the table columns
table.to_pandas()

Unnamed: 0,chunk_id,text,embedding


In [13]:
# get the dataframe
df.head()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099819228053093, -0.056592684239149094, ..."
1,1,Introduction to Nutrition Science,"[-0.05099819228053093, -0.056592684239149094, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119346514344215, 0.10461532324552536, 0..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113631695508957, 0.010369417257606983, ..."
4,4,"for the construction, customization, and disse...","[-0.017107605934143066, 0.02413615770637989, -..."


In [14]:
# saving the dataframe df
df.to_csv('diet-data.csv', index=False, escapechar='\\')

* load dataframe into table

In [15]:
def add_dataframe_to_table(dataframe):
    # add dataframe to table
    table.add(dataframe)
    
    print(f"Added {len(chunks)} chunks and {len(embeddings)} embeddings to the table.")
    return table

In [16]:
table = add_dataframe_to_table(df)

Added 3595 chunks and 3595 embeddings to the table.


In [17]:
# converting the table to pandas
table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099819228053093, -0.056592684239149094, ..."
1,1,Introduction to Nutrition Science,"[-0.05099819228053093, -0.056592684239149094, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119346514344215, 0.10461532324552536, 0..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113631695508957, 0.010369417257606983, ..."
4,4,"for the construction, customization, and disse...","[-0.017107605934143066, 0.02413615770637989, -..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.02437693625688553, -0.005427069962024689, ..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427795946598053, 0.023010170087218285, ..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159752935171127, 0.005391544196754694, -..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193208500742912, 0.05462077260017395, -..."


#### Creating an index on text column and perform a full-text search query retrieval

In [18]:
table.create_fts_index("text")

In [19]:
# Perform a full-text search
query = 'What are Nutrients?'
text_search = table.search(query, query_type="fts").limit(5).select(["text"]).to_list()
text_search

[{'text': 'What are some of the nutrients found in your favorite foods?\nhttp://www.ars.usda.gov/Services/docs.htm?docid=17032\n2. Have a discussion in class on the “progression of science” and its significance to human health as depicted in the video on\npellagra (Video .\n1.2: What Are Nutrients? is shared under a CC BY-NC-SA license and was authored, remixed, and/or curated by LibreTexts.\nWhat is a calorie? - Emma Bryce What is a calorie? - Emma Bryce\nFood: A Better Source of Nutrients\n1.2.1',
  '_score': 10.188911437988281},
 {'text': 'What are Nutrients?  \nThe foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be\nobtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic\nfunctions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions',
  '_score': 9.51707935333252},
 {'te

* Function of full_text search retrieval

In [20]:
def search(query):
    text_search = table.search(query, query_type="fts").limit(5).select(["text"]).to_list()
    return text_search

In [21]:
query = 'Hypertension'
text_search = search(query)
text_search

[{'text': 'family members who have hypertension.Weight. Roughly 60 percent of people with hypertension are obese.\nSodium consumption. The more salt in a person’s diet, the more\nlikely they are to have high blood pressure.\nAlcohol. Drinking more than two drinks per day for men and one\ndrink for women increases the likelihood of hypertension.\nDiet. In addition to salt and alcohol consumption, other dietary\nfactors increase chances of developing hypertension.',
  '_score': 7.494791507720947},
 {'text': 'avoided. In this study, people on the low-sodium (1500 milligrams per day) DASH diet had mean systolic blood pressures that were\n7.1 mmHg lower than people without hypertension not on the DASH diet. The effect on blood pressure was greatest in participants\nwith hypertension at the beginning of the study who followed the DASH diet. Their systolic blood pressures were, on average, 11.5\nmmHg lower than participants with hypertension on the control diet.',
  '_score': 6.79581165313720

#### Using Ollama locally to perform RAG with full-text search

In [34]:
import ollama

* Function to write a prompt

In [35]:
def build_prompt(query, text_search):
    prompt_template = """
    You're a diet assistant. Use the following pieces of information to answer the user's question. 
    You're performing a full text search, so use the text column only for answers.
    The answer to the QUESTION should be based on the CONTEXT given. 
    If you don't know the answer, just say that you don't know and not make up an answer. 
    Only return the helpful answer below and nothing else.

    QUESTION: {question}
    CONTEXT: {context}
    """.strip()

    context = ""
    
    for item in text_search:
        context += f'{item.get("text", "")}\n\n'

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

* Function for LLM (ollama llama3)

In [36]:
def llm(prompt):
    # using ollama chat
    response = ollama.chat(
        model='llama3',
        messages=[{'role': 'user', 'content': prompt}])
    
    return response

* Function of the full RAG pipeline

In [37]:
def rag_pipeline(query):
    # Perform full text search
    text_search = search(query)
    
    # Build the prompt
    prompt = build_prompt(query, text_search)
    
    # outcome from Llama3
    answer = llm(prompt)
    outcome = answer['message']['content']
    
    return outcome

In [None]:
# Applying the full RAG functions to this query
query = 'What are Nutrients?'
outcome = rag_pipeline(query)
print(outcome)

#### Creating an index on embedding column and perform semantic search query retrieval

In [22]:
import numpy as np

* Step 1: Convert list embedding column to Fixed Size List Array

In [23]:
# Function to convert list of embeddings to FixedSizeListArray
def convert_to_fixed_size_list(embeddings, embedding_size):
    flattened_embeddings = []
    
    # Iterate over each embedding and ensure correct size
    for embedding in embeddings:
        if len(embedding) != embedding_size:
            raise ValueError(f"Embedding size {len(embedding)} does not match the expected size {embedding_size}")
        flattened_embeddings.extend(embedding)
    
    numpy_array = np.array(flattened_embeddings, dtype=np.float32)
    
    # Create FixedSizeListArray
    fixed_size_list = pa.FixedSizeListArray.from_arrays(
        pa.array(numpy_array),
        list_size=embedding_size
    )
    
    return fixed_size_list

* Step 2: Convert LanceDB table to an Arrow table format

In [38]:
# LanceDB table to Arrow table
arrow_table = table.to_arrow()

# check the pandas frame of the converted arrow table
arrow_table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099819228053093, -0.056592684239149094, ..."
1,1,Introduction to Nutrition Science,"[-0.05099819228053093, -0.056592684239149094, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119346514344215, 0.10461532324552536, 0..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113631695508957, 0.010369417257606983, ..."
4,4,"for the construction, customization, and disse...","[-0.017107605934143066, 0.02413615770637989, -..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.02437693625688553, -0.005427069962024689, ..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427795946598053, 0.023010170087218285, ..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159752935171127, 0.005391544196754694, -..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193208500742912, 0.05462077260017395, -..."


* Step 3: Extract embeddings to Fixed-size-list array with the above function

In [25]:
# Embedding size for embedding model - all-MiniLM-L6-v2'
embedding_size = 384

# Calling the convert to FixedSizeListArray function
fixed_size_embeddings = convert_to_fixed_size_list(arrow_table.to_pandas()['embedding'], embedding_size)

* Step 4: Update the arrow table with the fixed-size-list array embedding column

In [26]:
updated_table = arrow_table.set_column(
    arrow_table.schema.get_field_index('embedding'),       # Get embedding column index
    'embedding',                                           # Column name
    fixed_size_embeddings                                  # FixedSizeListArray embeddings
)

* Step 5: Connect to LanceDB and create a new table name with updated data in the database

In [27]:
# Connect to a LanceDB database. db_path is defined above
db = lancedb.connect(db_path)  

# Create table with updated data and a new table name (new_diet_table)
db.create_table('new_diet_table', updated_table)

# confirm the new created table
db.table_names()

['diet_table', 'new_diet_table']

* Step 6: create a vector index on embedding column of new_diet_table and perform a semantic search

In [28]:
# open created table
new_table = db.open_table('new_diet_table')

# index for embedding column on updated_table
new_table.create_index(metric='cosine', vector_column_name='embedding', index_type='IVF_PQ')

In [29]:
# Define a query
query = 'What are Nutrients?'

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert the query into a list format
query_embedding = model.encode(query).tolist()  

# Perform the semantic search
semantic_search = new_table.search(query_embedding, query_type='vector', vector_column_name='embedding').limit(5).select(['text']).to_list()



In [30]:
# Get retrieval for semantic search
semantic_search

[{'text': 'What are Nutrients?  \nThe foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be\nobtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic\nfunctions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions',
  '_distance': 0.4856143891811371},
 {'text': 'proteins, and water, and are referred to as macronutrients.\nTwo of the classes of nutrients are needed in lesser amounts, but are still essential for bodily function. They are vitamins and\nminerals.\nOne measurement of food quality is the amount of essential nutrients a food contains relative to the amount of energy it has\n(nutrient density).\nDiscussion Starters  \n1. Make a list of some of your favorite foods and visit the “What’s In the Foods You Eat?” search tool provided by the USDA.',
  '_distance': 0.50263643264

* Function of Semantic search retrieval

In [31]:
def search_vector(query):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query).tolist()
    semantic_search = new_table.search(query_embedding, query_type='vector', vector_column_name='embedding').limit(5).select(['text']).to_list()
    return semantic_search

In [32]:
# testing the function above
query = 'Hypertension'
semantic_search = search_vector(query)
semantic_search

[{'text': 'family members who have hypertension.Weight. Roughly 60 percent of people with hypertension are obese.\nSodium consumption. The more salt in a person’s diet, the more\nlikely they are to have high blood pressure.\nAlcohol. Drinking more than two drinks per day for men and one\ndrink for women increases the likelihood of hypertension.\nDiet. In addition to salt and alcohol consumption, other dietary\nfactors increase chances of developing hypertension.',
  '_distance': 0.7055325508117676},
 {'text': 'blood pressure of 130/80 mm Hg or greater. Hypertension is a risk factor for cardiovascular disease, and reducing blood pressure\nhas been found to decrease the risk of dying from a heart attack or stroke. The Centers for Disease Control and Prevention (CDC)\nreported that in 2007–2008 approximately 33 percent of Americans were hypertensive.The percentage of people with\nhypertension increases to over 60 percent in people over the age of sixty.[1]\n[2]',
  '_distance': 0.72028160

In [33]:
# trying another query
new_query = 'tell me something about protein'
semantic_search = search_vector(new_query)
semantic_search

[{'text': '6 . 8 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 1 16.8: Proteins, Diet, and Personal Choices\nWe have discussed what proteins are, how they are made, how they are digested and absorbed, the many functions of proteins in the\nbody, and the consequences of having too little or too much protein in the diet. This section will provide you with information on',
  '_distance': 0.45965754985809326},
 {'text': '6 . 2 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 0 56.2: Defining Protein\nProtein makes up approximately 20 percent of the human body and is present in every single cell. The word protein is a Greek\nword, meaning “of utmost importance.” Proteins are called the workhorses of life as they provide the body with structure and\nperform a vast array of functions. You can stand, walk, run, skate, swim, and more because of your protein-rich muscles. Protein is',
  '_distance': 0.5450495481491089},
 {'text':

* for loop to get the text from semantic search

In [65]:
# Print the results
for result in semantic_search:
    print(result['text'])

What are Nutrients?  
The foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be
obtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic
functions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions
proteins, and water, and are referred to as macronutrients.
Two of the classes of nutrients are needed in lesser amounts, but are still essential for bodily function. They are vitamins and
minerals.
One measurement of food quality is the amount of essential nutrients a food contains relative to the amount of energy it has
(nutrient density).
Discussion Starters  
1. Make a list of some of your favorite foods and visit the “What’s In the Foods You Eat?” search tool provided by the USDA.
allow us to detect and respond to environmental surroundings, move, excrete wastes, respire (breath

In [None]:
from sentence_transformers import SentenceTransformer
import ollama

# Ensure FTS index on the 'text' column (done once)
table.create_fts_index("text")

# Ensure vector index on the 'embedding' column
table.create_vector_index("embedding")

# Function to generate embeddings using SentenceTransformer
def generate_embeddings(chunks, model_name: str = "all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    texts = [chunk['text'] for chunk in chunks]  # Assuming chunks is a list of dictionaries with 'text' keys
    embeddings = model.encode(texts)
    return embeddings

def text_to_embedding(text, model_name: str = "all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    embedding = model.encode(text)
    return embedding

def semantic_search(query_embedding):
    # Perform semantic search based on embeddings
    search_results = table.search(
        query_vector=query_embedding,
        query_type="vector",  # Ensure this is set for vector similarity search
        limit=5                # Limit results to top 5
    ).select(["text"]).to_list()
    return search_results

def build_prompt(query, text_search):
    prompt_template = """
    You're a diet assistant. Use the following pieces of information to answer the user's question. 
    You're performing a semantic search, so use the text column only for answers.
    The answer to the QUESTION should be based on the CONTEXT given. 
    If you don't know the answer, just say that you don't know and not make up an answer. 
    Only return the helpful answer below and nothing else.

    QUESTION: {question}
    CONTEXT: {context}
    """.strip()

    context = ""
    
    for item in text_search:
        context += f'{item.get("text", "")}\n\n'

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    # Correct usage of the ollama.chat function
    response = ollama.chat(
        model='llama3',
        messages=[{'role': 'user', 'content': prompt}]
    )

    # Safely access the response to prevent KeyError
    if "text" in response:
        return response["text"]
    else:
        return "No valid response from the model."

def rag_pipeline(query):
    # Step 1: Convert query to embedding
    query_embedding = text_to_embedding(query)
    
    # Step 2: Perform semantic search using the embedding
    text_search = semantic_search(query_embedding)
    
    # Step 3: Build prompt with search results
    prompt = build_prompt(query, text_search)
    
    # Step 4: Query Llama 3 model with the constructed prompt
    answer = llm(prompt)
    
    return answer

# Query example
query = 'What is a nutrient?'
answer = rag_pipeline(query)
print(answer)
