In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import lancedb
from sentence_transformers import SentenceTransformer
import pyarrow as pa
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [2]:
# Extract data from the PDF document
def load_pdf(data):
    loader = PyPDFLoader(data)
    documents = loader.load()
    return documents

In [3]:
# Apply the function on the pdf document
extracted_data = tqdm(load_pdf('Introduction to Nutrition Science, LibreTexts Project.pdf'))

  0%|          | 0/523 [00:00<?, ?it/s]

In [4]:
# Split the extracted pdf documents into chunks using LangChain's text splitter
def split_pdf(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    return chunks

In [5]:
# length of chunks
chunks = split_pdf(extracted_data)
print("length of chunks:", len(chunks))

length of chunks: 3595


In [6]:
# page content of first page and last page
chunks[0].page_content, chunks[3594].page_content

('INTRODUCTION TO \nNUTRITION SCIENCE',
 '4.0\n16.3: The Food Industry - CC BY-NC-SA 4.0\n16.4: The Politics of Food - CC BY-NC-SA 4.0\n16.5: Food Cost and Inflation - CC BY-NC-SA 4.0\n16.6: The Issue of Food Security - CC BY-NC-SA 4.0\n16.7: Nutrition and Your Health - CC BY-NC-SA 4.0\n16.8: Diets around the World - CC BY-NC-SA 4.0\n16.E: Food Politics and Perspectives (Exercise) - CC\nBY-NC-SA 4.0\nBack Matter - CC BY-NC-SA 4.0\nIndex - CC BY-NC-SA 4.0\nGlossary - CC BY-NC-SA 4.0\nDetailed Licensing - CC BY-NC-SA 4.0')

In [7]:
# Generate embeddings using SentenceTransformer
def generate_embeddings(chunks, model_name: str = "all-MiniLM-L6-v2"):
    embed_model = SentenceTransformer(model_name)
    texts = [chunk.page_content for chunk in chunks]
    embeddings = embed_model.encode(texts)
    return embeddings

In [8]:
# Generate embeddings for the chunks
embeddings = tqdm(generate_embeddings(chunks))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  0%|          | 0/3595 [00:00<?, ?it/s]

In [9]:
len(embeddings)

3595

#### Create a Lance database and insert the text and embeddings into it

In [10]:
def load_chunks_into_lancedb(chunks, embeddings, db_path: str, table_name: str):
    
    # Connect to LanceDB
    db = lancedb.connect(db_path)
    
    # create a custom schema using pyarrow
    custom_schema = pa.schema([
        pa.field('chunk_id', pa.int32()),
        pa.field('text', pa.string()),
        pa.field('embedding', pa.list_(pa.float64()))
    ])

    # Create a table for the lancedb
    if table_name not in db.table_names():
        table = db.create_table(table_name, schema=custom_schema)
    
    # Create a DataFrame from chunks and embedding
    data = {
        "chunk_id": [],
        "text": [],
        "embedding": []
    }

    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        data["chunk_id"].append(i)
        data["text"].append(chunk.page_content)
        data["embedding"].append(embedding.tolist()) 

    df = pd.DataFrame(data)

    print(db.table_names())
    print(f"Inserted {len(chunks)} chunks with embeddings into LanceDB table '{table_name}'.")

    return df, table

* Load the chunks and embeddings into LanceDB function

In [11]:
# linux-native path on WSL
db_path = "/home/bluemusk/diet-assistant/lancedb"   
#db_path = '/mnt/c/Users/emman/desktop/diet-assistant/lancedb'
table_name = "diet_table"

df, table = tqdm(load_chunks_into_lancedb(chunks, embeddings, db_path, table_name))

['diet_table']
Inserted 3595 chunks with embeddings into LanceDB table 'diet_table'.


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# get the table columns
table.to_pandas()

Unnamed: 0,chunk_id,text,embedding


In [13]:
# get the dataframe
df.head()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099819228053093, -0.056592684239149094, ..."
1,1,Introduction to Nutrition Science,"[-0.05099819228053093, -0.056592684239149094, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119346514344215, 0.10461532324552536, 0..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113631695508957, 0.010369417257606983, ..."
4,4,"for the construction, customization, and disse...","[-0.017107605934143066, 0.02413615770637989, -..."


In [14]:
# save the dataframe df
df.to_csv('diet-data.csv', index=False, escapechar='\\')

* load dataframe into table

In [15]:
def add_dataframe_to_table(dataframe):
    # add dataframe to table
    table.add(dataframe)
    
    print(f"Added {len(chunks)} chunks and {len(embeddings)} embeddings to the table.")
    return table

In [16]:
table = add_dataframe_to_table(df)

Added 3595 chunks and 3595 embeddings to the table.


In [17]:
# convert table to pandas
table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099819228053093, -0.056592684239149094, ..."
1,1,Introduction to Nutrition Science,"[-0.05099819228053093, -0.056592684239149094, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119346514344215, 0.10461532324552536, 0..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113631695508957, 0.010369417257606983, ..."
4,4,"for the construction, customization, and disse...","[-0.017107605934143066, 0.02413615770637989, -..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.02437693625688553, -0.005427069962024689, ..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427795946598053, 0.023010170087218285, ..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159752935171127, 0.005391544196754694, -..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193208500742912, 0.05462077260017395, -..."


#### Creating an index on text column and perform a full-text search query retrieval

In [18]:
table.create_fts_index("text")

In [19]:
# Perform a full-text search
query = 'What are Nutrients?'
text_search = table.search(query, query_type="fts").limit(5).select(["text"]).to_list()
text_search

[{'text': 'What are some of the nutrients found in your favorite foods?\nhttp://www.ars.usda.gov/Services/docs.htm?docid=17032\n2. Have a discussion in class on the “progression of science” and its significance to human health as depicted in the video on\npellagra (Video .\n1.2: What Are Nutrients? is shared under a CC BY-NC-SA license and was authored, remixed, and/or curated by LibreTexts.\nWhat is a calorie? - Emma Bryce What is a calorie? - Emma Bryce\nFood: A Better Source of Nutrients\n1.2.1',
  '_score': 10.188911437988281},
 {'text': 'What are Nutrients?  \nThe foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be\nobtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic\nfunctions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions',
  '_score': 9.51707935333252},
 {'te

* Function of full_text search retrieval

In [20]:
def search(query):
    text_search = table.search(query, query_type="fts").limit(5).select(["text"]).to_list()
    return text_search

In [21]:
query = 'Hypertension'
text_search = search(query)
text_search

[{'text': 'family members who have hypertension.Weight. Roughly 60 percent of people with hypertension are obese.\nSodium consumption. The more salt in a person’s diet, the more\nlikely they are to have high blood pressure.\nAlcohol. Drinking more than two drinks per day for men and one\ndrink for women increases the likelihood of hypertension.\nDiet. In addition to salt and alcohol consumption, other dietary\nfactors increase chances of developing hypertension.',
  '_score': 7.494791507720947},
 {'text': 'avoided. In this study, people on the low-sodium (1500 milligrams per day) DASH diet had mean systolic blood pressures that were\n7.1 mmHg lower than people without hypertension not on the DASH diet. The effect on blood pressure was greatest in participants\nwith hypertension at the beginning of the study who followed the DASH diet. Their systolic blood pressures were, on average, 11.5\nmmHg lower than participants with hypertension on the control diet.',
  '_score': 6.79581165313720

#### Using Hugging-face google flan-t5 model locally to perform RAG with full-text search

In [22]:
# Download model directly
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [23]:
# Full-text search retrieval function
def search(query):
    text_search = table.search(query, query_type="fts").limit(5).select(["text"]).to_list()
    return text_search

In [24]:
# Build a prompt and set max_length
def build_prompt(query, text_search, tokenizer, max_length=512):
    prompt_template = """
    You are a diet assistant. You're performing a full text search, so use the text column only for answers.
    Based on the provided context, answer the following question completely and coherently. 
    Use the information from the CONTEXT to provide a detailed and full response to the QUESTION.
    Ensure your response is comprehensive and complete, avoiding any abrupt or partial endings.

    QUESTION: {question}
    CONTEXT: {context}
    """.strip()

    context = ""
    for item in text_search:
        context += f'{item.get("text", "")}\n\n'

    prompt = prompt_template.format(question=query, context=context).strip()

    # Truncate prompt if it exceeds the model's max length
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
    truncated_prompt = tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
    
    return truncated_prompt

In [25]:
# Open-source LLM and max_token function
def llm(prompt, model, tokenizer, max_tokens=72):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
    outputs = model.generate(inputs, max_new_tokens=max_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [26]:
# Full-Text RAG pipeline
def rag_pipeline(query, model, tokenizer):
    text_search = search(query)
    prompt = build_prompt(query, text_search, tokenizer)
    answer = llm(prompt, model, tokenizer)
    return answer

In [27]:
# Applying the RAG pipeline to this query
query = 'What are nutrients?'
outcome = rag_pipeline(query, model, tokenizer)
print(outcome)

Nutrients are substances required by the body to perform its basic functions. Nutrients must be obtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic functions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body.


* Note: the value of max_token can be adjusted for more outcome of the output token.

#### Creating an index on embedding column and perform semantic search query retrieval

* Step 1: Convert list embedding column to Fixed Size List Array

In [28]:
# Convert embeddings list to FixedSizeListArray
def convert_to_fixed_size_list(embeddings, embedding_size):
    flattened_embeddings = []
    
    # Iterate over each embedding and ensure correct size
    for embedding in embeddings:
        if len(embedding) != embedding_size:
            raise ValueError(f"Embedding size {len(embedding)} does not match the expected size {embedding_size}")
        flattened_embeddings.extend(embedding)
    
    numpy_array = np.array(flattened_embeddings, dtype=np.float32)
    
    # Create FixedSizeListArray
    fixed_size_list = pa.FixedSizeListArray.from_arrays(
        pa.array(numpy_array),
        list_size=embedding_size
    )
    
    return fixed_size_list

* Step 2: Convert LanceDB table to an Arrow table format

In [29]:
arrow_table = table.to_arrow()

# check the pandas frame of the converted arrow table
arrow_table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099819228053093, -0.056592684239149094, ..."
1,1,Introduction to Nutrition Science,"[-0.05099819228053093, -0.056592684239149094, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119346514344215, 0.10461532324552536, 0..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113631695508957, 0.010369417257606983, ..."
4,4,"for the construction, customization, and disse...","[-0.017107605934143066, 0.02413615770637989, -..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.02437693625688553, -0.005427069962024689, ..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427795946598053, 0.023010170087218285, ..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159752935171127, 0.005391544196754694, -..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193208500742912, 0.05462077260017395, -..."


* Step 3: Extract embeddings to Fixed-size-list array with the above function

In [30]:
# Embedding size for embedding model - all-MiniLM-L6-v2'
embedding_size = 384

# Calling the convert to FixedSizeListArray function
fixed_size_embeddings = convert_to_fixed_size_list(arrow_table.to_pandas()['embedding'], embedding_size)

* Step 4: Update the arrow table with the fixed-size-list array embedding column

In [31]:
updated_table = arrow_table.set_column(
    arrow_table.schema.get_field_index('embedding'),       # Get embedding column index
    'embedding',                                           # Column name
    fixed_size_embeddings                                  # FixedSizeListArray embeddings
)

* Step 5: Connect to LanceDB and create a new table name with updated data in the database

In [32]:
# Connect to a LanceDB database. db_path is defined above
db = lancedb.connect(db_path)  

# Create new table with updated data and a new table name (new_diet_table)
db.create_table('new_diet_table', updated_table)

# confirm the new created table
db.table_names()

['diet_table', 'new_diet_table']

* Step 6: create a vector index on embedding column of new_diet_table and perform a semantic search

In [33]:
# open created table
new_table = db.open_table('new_diet_table')

# index for embedding column on updated_table
new_table.create_index(metric='cosine', vector_column_name='embedding', index_type='IVF_PQ')

In [34]:
# Define a query
query = 'What are Nutrients?'

# Load the embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert the query into a list format
query_embedding = embed_model.encode(query).tolist()  

# Perform the semantic search
semantic_search = new_table.search(query_embedding, query_type='vector', vector_column_name='embedding').limit(5).select(['text']).to_list()



In [35]:
# Get retrieval for semantic search
semantic_search

[{'text': 'proteins, and water, and are referred to as macronutrients.\nTwo of the classes of nutrients are needed in lesser amounts, but are still essential for bodily function. They are vitamins and\nminerals.\nOne measurement of food quality is the amount of essential nutrients a food contains relative to the amount of energy it has\n(nutrient density).\nDiscussion Starters  \n1. Make a list of some of your favorite foods and visit the “What’s In the Foods You Eat?” search tool provided by the USDA.',
  '_distance': 0.48930734395980835},
 {'text': 'What are Nutrients?  \nThe foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be\nobtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic\nfunctions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions',
  '_distance': 0.5001618266

* Function of Semantic search retrieval

In [36]:
def search_vector(query):
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = embed_model.encode(query).tolist()
    semantic_search = new_table.search(query_embedding, query_type='vector', vector_column_name='embedding').limit(5).select(['text']).to_list()
    return semantic_search

In [37]:
# testing the function above
query = 'Hypertension'
semantic_search = search_vector(query)
semantic_search

[{'text': 'family members who have hypertension.Weight. Roughly 60 percent of people with hypertension are obese.\nSodium consumption. The more salt in a person’s diet, the more\nlikely they are to have high blood pressure.\nAlcohol. Drinking more than two drinks per day for men and one\ndrink for women increases the likelihood of hypertension.\nDiet. In addition to salt and alcohol consumption, other dietary\nfactors increase chances of developing hypertension.',
  '_distance': 0.6795074939727783},
 {'text': 'can be beneficial, as well as consuming more soy products. It is also important to maintain a healthy weight and avoid smoking or\nchewing tobacco.\nHypertension  \nChronic high blood pressure, also known as hypertension, is a significant health hazard affecting one out of three adults in the\nUnited States. This chronic condition is a major cause of heart attacks and strokes, yet it has no symptoms until blood pressure',
  '_distance': 0.7372344136238098},
 {'text': 'blood press

In [38]:
# trying another query
new_query = 'tell me something about protein'
semantic_search = search_vector(new_query)
semantic_search

[{'text': '6 . 8 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 1 16.8: Proteins, Diet, and Personal Choices\nWe have discussed what proteins are, how they are made, how they are digested and absorbed, the many functions of proteins in the\nbody, and the consequences of having too little or too much protein in the diet. This section will provide you with information on',
  '_distance': 0.45874282717704773},
 {'text': '6 . 2 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 0 56.2: Defining Protein\nProtein makes up approximately 20 percent of the human body and is present in every single cell. The word protein is a Greek\nword, meaning “of utmost importance.” Proteins are called the workhorses of life as they provide the body with structure and\nperform a vast array of functions. You can stand, walk, run, skate, swim, and more because of your protein-rich muscles. Protein is',
  '_distance': 0.5481024980545044},
 {'text':

* for loop to get the text from semantic search

In [39]:
# Print the results
for result in semantic_search:
    print(result['text'])

6 . 8 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 1 16.8: Proteins, Diet, and Personal Choices
We have discussed what proteins are, how they are made, how they are digested and absorbed, the many functions of proteins in the
body, and the consequences of having too little or too much protein in the diet. This section will provide you with information on
6 . 2 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 6 0 56.2: Defining Protein
Protein makes up approximately 20 percent of the human body and is present in every single cell. The word protein is a Greek
word, meaning “of utmost importance.” Proteins are called the workhorses of life as they provide the body with structure and
perform a vast array of functions. You can stand, walk, run, skate, swim, and more because of your protein-rich muscles. Protein is
1CHAPTER OVERVIEW
6: Protein
By the end of this chapter, you will be able to:
Describe the role and structure of 

#### Using Hugging-face google flan-t5 model locally to perform RAG with semantic search

In [40]:
# Initialize the FLAN-T5 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

In [41]:
# Semantic search retrieval function
def search_vector(query, table):
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = embed_model.encode(query).tolist()
    semantic_search = table.search(query_embedding, query_type='vector', vector_column_name='embedding').limit(5).select(['text']).to_list()
    return semantic_search

In [42]:
# Build a prompt
def build_prompt(query, semantic_search, tokenizer, max_length=512):
    prompt_template = """
    You are a diet assistant. You are performing a semantic search, so use the embedding column for your answers.
    Based on the provided context, answer the following question completely and coherently. 
    Use the information from the CONTEXT to provide a detailed and full response to the QUESTION.
    Ensure your response is comprehensive and complete, avoiding any abrupt or partial endings.

    QUESTION: {question}
    CONTEXT: {context}
    """.strip()

    context = ""
    for item in semantic_search:
        context += f'{item.get("embedding", "")}\n\n'

    prompt = prompt_template.format(question=query, context=context).strip()

    # Tokenize and truncate the prompt if it exceeds the max length
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
    truncated_prompt = tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
    
    return truncated_prompt

In [43]:
# Generate response from the language model
def llm(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
    outputs = model.generate(inputs, max_length=512, num_beams=2, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [44]:
# Full RAG pipeline
def rag_pipeline(query, table, model, tokenizer):
    semantic_search = search_vector(query, new_table)
    prompt = build_prompt(query, semantic_search, tokenizer)
    answer = llm(prompt, model, tokenizer)
    return answer

In [45]:
# Applying the RAG pipeline to a query
query = 'Tell me something about hypertension?'
outcome = rag_pipeline(query, new_table, model, tokenizer)
print(outcome)

Hypertension is a medical condition characterized by high blood pressure.


In [46]:
# Applying the RAG pipeline to another query
query = 'What are Lipids?'
outcome = rag_pipeline(query, new_table, model, tokenizer)
print(outcome)

Lipids are fats that are found in the body.


In [47]:
# Applying the RAG pipeline to a query
query = 'Tell me something about Carbohydrates, Proteins and Vitamins?'
outcome = rag_pipeline(query, new_table, model, tokenizer)
print(outcome)

Carbohydrates, Proteins, and Vitamins are the three main nutrients that make up a healthy diet.
