In [2]:
#!pip install langchain-community langchain pyPdf sentence_transformers pyarrow lancedb

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import lancedb
from sentence_transformers import SentenceTransformer
import pyarrow as pa
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [4]:
# Function to extract data from the PDF document
def load_pdf(data):
    loader = PyPDFLoader(data)
    documents = loader.load()
    return documents

In [6]:
# Apply the function on the pdf document
extracted_data = tqdm(load_pdf('Introduction to Nutrition Science, LibreTexts Project.pdf'))

  0%|          | 0/523 [00:00<?, ?it/s]

In [7]:
# Function to split the extrracted pdf documents into chunks using LangChain's text splitter
def split_pdf(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    return chunks

In [8]:
# length of chunks
chunks = split_pdf(extracted_data)
print("length of chunks:", len(chunks))

length of chunks: 3595


In [9]:
# page content of first page and last page
chunks[0].page_content, chunks[3594].page_content

('INTRODUCTION TO \nNUTRITION SCIENCE',
 '4.0\n16.3: The Food Industry - CC BY-NC-SA 4.0\n16.4: The Politics of Food - CC BY-NC-SA 4.0\n16.5: Food Cost and Inflation - CC BY-NC-SA 4.0\n16.6: The Issue of Food Security - CC BY-NC-SA 4.0\n16.7: Nutrition and Your Health - CC BY-NC-SA 4.0\n16.8: Diets around the World - CC BY-NC-SA 4.0\n16.E: Food Politics and Perspectives (Exercise) - CC\nBY-NC-SA 4.0\nBack Matter - CC BY-NC-SA 4.0\nIndex - CC BY-NC-SA 4.0\nGlossary - CC BY-NC-SA 4.0\nDetailed Licensing - CC BY-NC-SA 4.0')

In [10]:
# Function to generate embeddings using SentenceTransformer
def generate_embeddings(chunks, model_name: str = "all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    texts = [chunk.page_content for chunk in chunks]
    embeddings = model.encode(texts)
    return embeddings

In [11]:
# Generate embeddings for the chunks
embeddings = tqdm(generate_embeddings(chunks))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  0%|          | 0/3595 [00:00<?, ?it/s]

In [12]:
len(embeddings)

3595

#### Create a Lance database and insert the text and embeddings into it

In [13]:
def load_chunks_into_lancedb(chunks, embeddings, db_path: str, table_name: str):

    # Connecting to LanceDB
    db = lancedb.connect(db_path)

    # create a custom schema using pyarrow
    custom_schema = pa.schema([
        pa.field('chunk_id', pa.int32()),
        pa.field('text', pa.string()),
        pa.field('embedding', pa.list_(pa.float64()))
    ])

    # Create a table for the lancedb
    if table_name not in db.table_names():
        table = db.create_table(table_name, schema=custom_schema)

    # Create a DataFrame from chunks and embedding
    data = {
        "chunk_id": [],
        "text": [],
        "embedding": []
    }

    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        data["chunk_id"].append(i)
        data["text"].append(chunk.page_content)
        data["embedding"].append(embedding.tolist())

    df = pd.DataFrame(data)

    print(db.table_names())
    print(f"Inserted {len(chunks)} chunks with embeddings into LanceDB table '{table_name}'.")

    return df, table

* Load the chunks and embeddings into LanceDB function

In [14]:
db_path = "lancedb"
table_name = "diet_table"

df, table = tqdm(load_chunks_into_lancedb(chunks, embeddings, db_path, table_name))

['diet_table']
Inserted 3595 chunks with embeddings into LanceDB table 'diet_table'.


  0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# get the table columns
table.to_pandas()

Unnamed: 0,chunk_id,text,embedding


In [16]:
# get the dataframe
df.head()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099822208285332, -0.056592732667922974, ..."
1,1,Introduction to Nutrition Science,"[-0.05099822208285332, -0.056592732667922974, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.01911931298673153, 0.10461530834436417, 0...."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113687574863434, 0.010369468480348587, ..."
4,4,"for the construction, customization, and disse...","[-0.017107656225562096, 0.02413615770637989, -..."


In [17]:
# saving the dataframe df
df.to_csv('diet-data.csv', index=False, escapechar='\\')

* load dataframe into table

In [18]:
def add_dataframe_to_table(dataframe):
    # add dataframe to table
    table.add(dataframe)

    print(f"Added {len(chunks)} chunks and {len(embeddings)} embeddings to the table.")
    return table

In [19]:
table = add_dataframe_to_table(df)

Added 3595 chunks and 3595 embeddings to the table.


In [20]:
# converting the table to pandas
table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099822208285332, -0.056592732667922974, ..."
1,1,Introduction to Nutrition Science,"[-0.05099822208285332, -0.056592732667922974, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.01911931298673153, 0.10461530834436417, 0...."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113687574863434, 0.010369468480348587, ..."
4,4,"for the construction, customization, and disse...","[-0.017107656225562096, 0.02413615770637989, -..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.024376921355724335, -0.005427063442766666,..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427835062146187, 0.023010175675153732, ..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159748464822769, 0.005391540005803108, -..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193221539258957, 0.05462077260017395, -..."


#### Creating an index on text column and perform a full-text search query retrieval

#!pip install tantivy

In [23]:
table.create_fts_index("text")

In [24]:
# Perform a full-text search
query = 'What are Nutrients?'
text_search = table.search(query, query_type="fts").limit(5).select(["text"]).to_list()
text_search

[{'text': 'What are some of the nutrients found in your favorite foods?\nhttp://www.ars.usda.gov/Services/docs.htm?docid=17032\n2. Have a discussion in class on the “progression of science” and its significance to human health as depicted in the video on\npellagra (Video .\n1.2: What Are Nutrients? is shared under a CC BY-NC-SA license and was authored, remixed, and/or curated by LibreTexts.\nWhat is a calorie? - Emma Bryce What is a calorie? - Emma Bryce\nFood: A Better Source of Nutrients\n1.2.1',
  '_score': 10.188911437988281},
 {'text': 'What are Nutrients?  \nThe foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be\nobtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic\nfunctions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions',
  '_score': 9.51707935333252},
 {'te

* Function of full_text search retrieval

In [25]:
def search(query):
    text_search = table.search(query, query_type="fts").limit(5).select(["text"]).to_list()
    return text_search

In [26]:
# applying the full-text search function
query = 'Hypertension'
text_search = search(query)
text_search

[{'text': 'family members who have hypertension.Weight. Roughly 60 percent of people with hypertension are obese.\nSodium consumption. The more salt in a person’s diet, the more\nlikely they are to have high blood pressure.\nAlcohol. Drinking more than two drinks per day for men and one\ndrink for women increases the likelihood of hypertension.\nDiet. In addition to salt and alcohol consumption, other dietary\nfactors increase chances of developing hypertension.',
  '_score': 7.494791507720947},
 {'text': 'avoided. In this study, people on the low-sodium (1500 milligrams per day) DASH diet had mean systolic blood pressures that were\n7.1 mmHg lower than people without hypertension not on the DASH diet. The effect on blood pressure was greatest in participants\nwith hypertension at the beginning of the study who followed the DASH diet. Their systolic blood pressures were, on average, 11.5\nmmHg lower than participants with hypertension on the control diet.',
  '_score': 6.79581165313720

* Install Ollama into the notebook's local runtime

In [27]:
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


* Start Ollama using `ollama serve`. This needs to run in the backgound - so we run it using `nohup` (to see the output log, open nohup.out).

In [28]:
!nohup ollama serve > nohup.out 2>&1 &

* Pull the desired model (llama3)

In [29]:
%%capture
!ollama pull llama3

In [30]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.3.3-py3-none-any.whl.metadata (3.8 kB)
Downloading ollama-0.3.3-py3-none-any.whl (10 kB)
Installing collected packages: ollama
Successfully installed ollama-0.3.3


In [31]:
import ollama

### Using Ollama locally to perform RAG with full-text search

* Function to write a prompt

In [32]:
def build_prompt(query, text_search):
    prompt_template = """
    You're a diet assistant. Use the following pieces of information to answer the user's question.
    You're performing a full text search, so use the text column only for answers.
    The answer to the QUESTION should be based on the CONTEXT given.
    If you don't know the answer, just say that you don't know and not make up an answer.
    Only return the helpful answer below and nothing else.

    QUESTION: {question}
    CONTEXT: {context}
    """.strip()

    context = ""

    for item in text_search:
        context += f'{item.get("text", "")}\n\n'

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

* Function for LLM llama3 using ollama

In [33]:
def llm(prompt):
    # using ollama chat
    response = ollama.chat(
        model='llama3',
        messages=[{'role': 'user', 'content': prompt}])

    return response

* Function of the full-text RAG pipeline

In [34]:
def rag_pipeline(query):
    # Perform full text search using the updated search function
    text_search = search(query)

    # Build the prompt with the search results
    prompt = build_prompt(query, text_search)

    # Get response from Llama 3 model
    answer = llm(prompt)
    outcome = answer['message']['content']

    return outcome

In [35]:
# Applying the full RAG function to this query
query = 'What are Nutrients?'
outcome = rag_pipeline(query)
print(outcome)

According to the text, "Nutrients are substances required by the body to perform its basic functions. Nutrients must be obtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic functions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body."


In [36]:
# Applying the full RAG function to this question
question = 'Tell me something about Hypertension'
outcome = rag_pipeline(question)
print(outcome)

According to the text, nutrition is important to health because it can reveal information about overall health status and disease risk, as indicated by blood tests. Specifically, substances measured in a typical blood test can indicate anemia risk, presence of infection, bleeding disorders, and atherosclerosis risk, among other things. Additionally, dietary factors such as sodium consumption, alcohol intake, and diet quality are associated with the development of hypertension, which is a major health concern that affects nearly 60% of people with hypertension being obese.


In [37]:
# Applying the full RAG function to this question
q = 'What is Diabetes'
outcome = rag_pipeline(q)
print(outcome)

Diabetes is a metabolic disease of insulin deficiency and glucose over-sufficiency.


#### Creating an index on embedding column and perform a semantic search query retrieval

* Step 1: Convert list embedding column to Fixed Size List Array

In [38]:
# Function to convert list of embeddings to FixedSizeListArray
def convert_to_fixed_size_list(embeddings, embedding_size):
    flattened_embeddings = []

    # Iterate over each embedding and ensure correct size
    for embedding in embeddings:
        if len(embedding) != embedding_size:
            raise ValueError(f"Embedding size {len(embedding)} does not match the expected size {embedding_size}")
        flattened_embeddings.extend(embedding)

    numpy_array = np.array(flattened_embeddings, dtype=np.float32)

    # Create FixedSizeListArray
    fixed_size_list = pa.FixedSizeListArray.from_arrays(
        pa.array(numpy_array),
        list_size=embedding_size
    )

    return fixed_size_list

* Step 2: Convert LanceDB table to an Arrow table format

In [39]:
# LanceDB table to Arrow table
arrow_table = table.to_arrow()

# check the pandas frame of the converted arrow table
arrow_table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099822208285332, -0.056592732667922974, ..."
1,1,Introduction to Nutrition Science,"[-0.05099822208285332, -0.056592732667922974, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.01911931298673153, 0.10461530834436417, 0...."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113687574863434, 0.010369468480348587, ..."
4,4,"for the construction, customization, and disse...","[-0.017107656225562096, 0.02413615770637989, -..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.024376921355724335, -0.005427063442766666,..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427835062146187, 0.023010175675153732, ..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159748464822769, 0.005391540005803108, -..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193221539258957, 0.05462077260017395, -..."


* Step 3: Extract embeddings to Fixed-size-list array with the above function

In [40]:
# Embedding size for embedding model - all-MiniLM-L6-v2'
embedding_size = 384

# Calling the convert to FixedSizeListArray function
fixed_size_embeddings = convert_to_fixed_size_list(arrow_table.to_pandas()['embedding'], embedding_size)

* Step 4: Update the arrow table with the fixed-size-list array embedding column

In [41]:
updated_table = arrow_table.set_column(
    arrow_table.schema.get_field_index('embedding'),       # Get embedding column index
    'embedding',                                           # Column name
    fixed_size_embeddings                                  # FixedSizeListArray embeddings
)

* Step 5: Connect to LanceDB and create a new table name with updated data in the database

In [42]:
# Connect to a LanceDB database. db_path is defined above
db = lancedb.connect(db_path)

# Create table with updated data and a new table name (new_diet_table)
db.create_table('new_diet_table', updated_table)

# confirm the new created table
db.table_names()

['diet_table', 'new_diet_table']

* Step 6: create a vector index on embedding column of new_diet_table and perform a semantic search

In [43]:
# open created table
new_table = db.open_table('new_diet_table')

# index on embedding column on updated_table
new_table.create_index(metric='cosine', vector_column_name='embedding', index_type='IVF_PQ')

In [53]:
# Define a query
query = 'What are Nutrients?'

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert the query into a list format
query_embedding = model.encode(query).tolist()

# Perform the semantic search
semantic_search = new_table.search(query_embedding, query_type='vector', vector_column_name='embedding').limit(5).select(['text']).to_list()

# Get retrieval for semantic search
semantic_search

[{'text': 'proteins, and water, and are referred to as macronutrients.\nTwo of the classes of nutrients are needed in lesser amounts, but are still essential for bodily function. They are vitamins and\nminerals.\nOne measurement of food quality is the amount of essential nutrients a food contains relative to the amount of energy it has\n(nutrient density).\nDiscussion Starters  \n1. Make a list of some of your favorite foods and visit the “What’s In the Foods You Eat?” search tool provided by the USDA.',
  '_distance': 0.4920754134654999},
 {'text': 'What are Nutrients?  \nThe foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be\nobtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic\nfunctions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions',
  '_distance': 0.49581721425

* Function of Semantic search retrieval

In [56]:
def search_vector(query):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query).tolist()
    semantic_search = new_table.search(query_embedding, query_type='vector', vector_column_name='embedding').limit(5).select(['text']).to_list()
    return semantic_search

In [57]:
# testing the function above
query = 'Describe Hypertension'
semantic_search = search_vector(query)
semantic_search

[{'text': 'which is the greatest and least pressure on an artery that occurs with each heartbeat. The force of blood against an artery is\nmeasured with a device called a sphygmomanometer. The results are recorded in millimeters of mercury, or mmHg. A desirable\nblood pressure is less than 120/80 mm Hg. Hypertension is the scientific term for high blood pressure and defined as a sustained',
  '_distance': 0.6673755645751953},
 {'text': 'High Blood Pressure  \nBlood pressure is the force of blood pumping through the arteries. When pressure levels become too high, it results in a condition\nknown as hypertension, which is asymptomatic but can lead to a number of other problems, including heart attacks, heart failure,\nkidney failure, and strokes. For people with high blood pressure, it can be beneficial to follow the same recommendations as those',
  '_distance': 0.7412970066070557},
 {'text': 'Resources  \n1. New ACC/AHA High Blood Pressure Guidelines Lower Definition of Hypertension. (

In [58]:
# trying another query
new_query = 'tell me something about protein'
semantic_search = search_vector(new_query)
semantic_search

[{'text': '6 . 8 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 1 16.8: Proteins, Diet, and Personal Choices\nWe have discussed what proteins are, how they are made, how they are digested and absorbed, the many functions of proteins in the\nbody, and the consequences of having too little or too much protein in the diet. This section will provide you with information on',
  '_distance': 0.4554506242275238},
 {'text': '6 . 2 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 0 56.2: Defining Protein\nProtein makes up approximately 20 percent of the human body and is present in every single cell. The word protein is a Greek\nword, meaning “of utmost importance.” Proteins are called the workhorses of life as they provide the body with structure and\nperform a vast array of functions. You can stand, walk, run, skate, swim, and more because of your protein-rich muscles. Protein is',
  '_distance': 0.5469719767570496},
 {'text': 

In [59]:
# Print the results
for result in semantic_search:
    print(result['text'])

6 . 8 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 1 16.8: Proteins, Diet, and Personal Choices
We have discussed what proteins are, how they are made, how they are digested and absorbed, the many functions of proteins in the
body, and the consequences of having too little or too much protein in the diet. This section will provide you with information on
6 . 2 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 0 56.2: Defining Protein
Protein makes up approximately 20 percent of the human body and is present in every single cell. The word protein is a Greek
word, meaning “of utmost importance.” Proteins are called the workhorses of life as they provide the body with structure and
perform a vast array of functions. You can stand, walk, run, skate, swim, and more because of your protein-rich muscles. Protein is
1CHAPTER OVERVIEW
6: Protein
By the end of this chapter, you will be able to:
Describe the role and structure of 

### Using Ollama locally to perform RAG with semantic search

* Function to write a prompt

In [60]:
def build_prompt(query, semantic_search):
    prompt_template = """
    You're a diet assistant. Use the following pieces of information to answer the user's question.
    You're performing a full text search, so use the text column only for answers.
    The answer to the QUESTION should be based on the CONTEXT given.
    If you don't know the answer, just say that you don't know and not make up an answer.
    Only return the helpful answer below and nothing else.

    QUESTION: {question}
    CONTEXT: {context}
    """.strip()

    context = ""

    for item in semantic_search:
        context += f'{item.get("embedding", "")}\n\n'

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

* Function for LLM llama3 using ollama

In [61]:
def llm(prompt):
    # using ollama chat
    response = ollama.chat(
        model='llama3',
        messages=[{'role': 'user', 'content': prompt}])

    return response

* Function of the vector RAG pipeline

In [62]:
def rag_pipeline(query):
    # Perform semantic search
    semantic_search = search_vector(query)

    # Build the prompt with the search results
    prompt = build_prompt(query, semantic_search)

    # Get response from Llama 3 model
    answer = llm(prompt)
    outcome = answer['message']['content']

    return outcome

In [63]:
# Applying the full RAG function to this query
query = 'What are Nutrients?'
outcome = rag_pipeline(query)
print(outcome)

According to the text, Nutrients refer to substances found in food that provide energy and support growth and health. They can be carbohydrates, proteins, or fats, as well as vitamins, minerals, and water.


In [64]:
# Applying the full RAG function to this question
q = 'What is Diabetes'
outcome = rag_pipeline(q)
print(outcome)

Based on the text column, I found the following relevant information:

Diabetes is a group of metabolic disorders characterized by high blood sugar levels. It occurs when the body's insulin production is impaired or when insulin becomes resistant to its own effects. There are several types of diabetes, including type 1, type 2, and gestational diabetes, each with different causes and symptoms.
