# RAG on local device

1. Document preprocessing and embedding creation
2. Search and Answer

### 1. Document/text processing and embedding creation

In [1]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, downloading...")
    
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    
    filename = pdf_path
    
    response = requests.get(url)
    
    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")
        
else:
    print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


In [2]:
import fitz

from tqdm.auto import tqdm

def text_formatter(text:str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number":page_number-41,
                                "page_char_count":len(text),
                                "page_word_count":len(text.split(" ")),
                                "page_sentence_count_raw" : len(text.split(". ")),
                                "page_token_count":len(text) / 4,
                                "text":text})
        
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 799,
  'page_char_count': 1524,
  'page_word_count': 259,
  'page_sentence_count_raw': 20,
  'page_token_count': 381.0,
  'text': 'over-the-counter painkillers. Some studies suggest that very high  amounts of caffeine have been linked to babies born with low birth  weights. The American Journal of Obstetrics and Gynecology  released a report, which found that women who consume 200  milligrams or more of caffeine a day (which is the amount in 10  ounces of coffee or 25 ounces of tea) increase the risk of  miscarriage7.  Consuming large quantities of caffeine affects the pregnant  mother as well, leading to irritability, anxiety, and insomnia. Most  experts agree that small amounts of caffeine each day are safe  (about one 8-ounce cup of coffee a day or less)8. However, that  amount should not be exceeded.  Foodborne Illness  For both mother and child, foodborne illness can cause major health  problems. For example, the foodborne illness caused by the bacteria  Listeria 

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
df.describe().round(decimals=2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


#### Further text processing (splitting pages into sentences)

In [6]:
from spacy.lang.en import English

nlp = English()

# Add Sentencizer pipeline
nlp.add_pipe("sentencizer")

# Document instance example
doc = nlp("This is a sentence. This is another one. This is the last.")
assert len(list(doc.sents)) == 3

# Sentences split
list(doc.sents)

[This is a sentence., This is another one., This is the last.]

In [7]:
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 863,
 'page_word_count': 138,
 'page_sentence_count_raw': 9,
 'page_token_count': 215.75,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5. Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.   https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitamins  | 

In [8]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # spaCy datatype to str
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1040,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': '',
  'sentences': [],
  'page_sentence_count_spacy': 0}]

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


In [11]:
# Chunking sentences into smaller groups

num_sentence_chunk_size = 10

# split lists of texts recursively into chunk size
def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [12]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    
    item["num_chunks"] = len(item["sentence_chunks"])
    
    

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
random.sample(pages_and_texts, k=1)

[{'page_number': 828,
  'page_char_count': 1772,
  'page_word_count': 290,
  'page_sentence_count_raw': 21,
  'page_token_count': 443.0,
  'text': 'also contains more calories than colostrum. As a new mother begins  to produce transitional milk, she typically notices an increase in the  weight and size of her breasts and a change in the volume and type  of liquid secreted.17  Mature milk is the final milk that a new mother produces. Its  composition varies from morning to night, from the beginning of  the feeding to the end, and from early postpartum to later in infancy  and toddlerhood. Breastmilk that is produced by mothers of  premature infants is higher in protein and calcium to meet the  needs of the preemie. Foremilk (the milk that comes at the  beginning of a feeding) tends to be lower in fat. Hind-milk comes  towards the end of a feeding containing higher levels of fat, which  helps the baby to feel satisfied and full. Combined, these two types  of milk ensure that a baby recei

In [14]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [33]:
#random.sample(pages_and_texts, k=1)
import re

pages_and_chunks = []

# Split each chunk into its own item
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join sentences together into a paragraph-like structure (join the list of sentences into one paragraph)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        # joined_sentence_chunk = 
        
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        
        # Check stats about chunks
        chunk_dict["chunk_char_count"]  = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")]) # Crude word count
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars
        
        pages_and_chunks.append(chunk_dict)
        
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [38]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 387,
  'sentence_chunk': 'PDB 1o9x EBI by Jawahar Swaminatha n and MSD staff at the European Bioinformati cs Institute / Public Domain The butterfly-sha ped protein, albumin, has many functions in the body including maintaining fluid and acid-base balance and transporting molecules. If too much water in the blood suddenly moves into a tissue, the results are swelling and, potentially, cell death. Water always flows from an area of high concentration to one of a low concentration. As a result, water moves toward areas that have higher concentrations of other solutes, such as proteins and glucose. To keep the water evenly distributed between blood and cells, proteins continuously circulate at high concentrations in the blood. The most abundant protein in blood is the butterfly-shaped protein known as albumin. Albumin’s presence in the blood makes the protein concentration in the blood similar to that in cells. Therefore, fluid exchange between the blood and cells is not 

In [40]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


#### Filter chunks of text for short chunks

In [41]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"]<= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 26.5 | Text: It is stored in the rectum until it is expelled through the anus via defecation. The Digestive System | 77
Chunk token count: 7.25 | Text: Human Nutrition: 2020 Edition
Chunk token count: 9.5 | Text: 742 | Building Healthy Eating Patterns
Chunk token count: 17.5 | Text: The Obesity Myth. Gotham Books. Calories In Versus Calories Out | 1069
Chunk token count: 22.0 | Text: Figure 6.10 Enzymes Role in Carbohydrate Digestion Protein’s Functions in the Body | 385


In [42]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [43]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 679,
  'sentence_chunk': 'Food Serving Selenium (mcg) Percent Daily Value Brazil nuts 1 oz. 544 777 Shrimp 3 oz. 34 49 Crab meat 3 oz. 41 59 Ricotta cheese 1 c. 41 59 Salmon 3 oz. 40 57 Pork 3 oz. 35 50 Ground beef 3 oz. 18 26 Round steak 3 oz. 28.5 41 Beef liver 3 oz. 28 40 Chicken 3 oz. 13 19 Whole-wheat bread 2 slices 23 33 Couscous 1 c. 43 61 Barley, cooked 1 c. 13.5 19 Milk, low-fat 1 c. 8 11 Walnuts, black 1 oz.',
  'chunk_char_count': 395,
  'chunk_word_count': 90,
  'chunk_token_count': 98.75}]

#### Embedding text chunks

In [45]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")

# Create list of sentences
sentences = ["The Sentence Transformer library provides an easy way to create embeddings.",
"Sentences can be embedded one by one or in a list.",
"I like horses!"]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# Check embeddings
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print("")

Sentence: The Sentence Transformer library provides an easy way to create embeddings.
Embedding: [-3.44286375e-02  2.95328815e-02 -2.33643427e-02  5.57257496e-02
 -2.19098609e-02 -6.47062203e-03  1.02848457e-02 -6.57803863e-02
  2.29717735e-02 -2.61121057e-02  3.80420350e-02  5.61403222e-02
 -3.68746594e-02  1.52787790e-02  4.37020473e-02 -5.19723371e-02
  4.89479862e-02  3.58104147e-03 -1.29751097e-02  3.54387122e-03
  4.23262641e-02  3.52606587e-02  2.49402281e-02  2.99177002e-02
 -1.99382380e-02 -2.39752773e-02 -3.33367917e-03 -4.30450514e-02
  5.72014526e-02 -1.32517833e-02 -3.54477987e-02 -1.13935936e-02
  5.55561110e-02  3.61099187e-03  8.88527040e-07  1.14027122e-02
 -3.82229425e-02 -2.43548071e-03  1.51314372e-02 -1.32699206e-04
  5.00659943e-02 -5.50876483e-02  1.73444841e-02  5.00959158e-02
 -3.75959277e-02 -1.04463594e-02  5.08322380e-02  1.24861132e-02
  8.67377296e-02  4.64143082e-02 -2.10690107e-02 -3.90251614e-02
  1.99698494e-03 -1.42345531e-02 -1.86794791e-02  2.826691

In [46]:
embeddings[0].shape

(768,)

In [48]:
%%time


embedding_model.to("cuda")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: user 1min 21s, sys: 4.18 s, total: 1min 25s
Wall time: 11 s


In [49]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

len(text_chunks)

CPU times: user 222 µs, sys: 9 µs, total: 231 µs
Wall time: 236 µs


1680

In [50]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                               convert_to_tensor=True)

CPU times: user 25.1 s, sys: 1.37 s, total: 26.5 s
Wall time: 3.26 s


In [52]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [53]:
# Import saved file and view
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242601e-02 9.02281553e-02 -5.09549258e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156232e-02 5.92139289e-02 -1.66167468e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5,[ 2.79801954e-02 3.39814052e-02 -2.06426717e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25,[ 6.82566836e-02 3.81275043e-02 -8.46853014e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264419e-02 -8.49767029e-03 9.57159698e-...


## 2. RAG - Search and Answer

Retrieve relevant passages based on a query and use those passages to augment an input to an LLM so it can generate an output based on those relevant passages


In [70]:
import random
import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (as it got converted to string when saved to CSV)
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x:np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding tf to list of dicts
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")


text_chunks_and_embeddings_df[:1]

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,"[0.0674242601, 0.0902281553, -0.00509549258, -..."


In [71]:
pages_and_chunks[:1]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0,
  'embedding': array([ 6.74242601e-02,  9.02281553e-02, -5.09549258e-03, -3.17545645e-02,
          7.39082173e-02,  3.51976156e-02, -1.97987109e-02,  4.67692763e-02,
          5.35727032e-02,  5.01232594e-03,  3.33929174e-02, -1.62218197e-03,
          1.76080782e-02,  3.62653807e-02, -3.16707330e-04, -1.07118469e-02,
          1.54257929e-02,  2.62176134e-02,  2.77653895e-03,  3.64942439e-02,
         -4.44109589e-02,  1.89361889e-02,  4.90117893e-02,  1.64020080e-02,
         -4.85782959e-02,  3.18291062e-03,  2.72992738e-02, -2.04758975e-03,
         -1.

In [73]:
embeddings = np.stack(text_chunks_and_embeddings_df["embedding"].tolist(), axis=0)
embeddings

array([[ 0.06742426,  0.09022816, -0.00509549, ..., -0.02211551,
        -0.02321365,  0.01256908],
       [ 0.05521562,  0.05921393, -0.01661675, ..., -0.01204065,
        -0.01028472,  0.02273964],
       [ 0.0279802 ,  0.03398141, -0.02064267, ..., -0.00536189,
         0.02125603,  0.0313055 ],
       ...,
       [ 0.07705151,  0.00978558, -0.01218175, ..., -0.04086806,
        -0.07517634, -0.0240526 ],
       [ 0.10304516, -0.01647021,  0.00826846, ..., -0.05742175,
        -0.02828028, -0.02946858],
       [ 0.08637735, -0.0125359 , -0.01127468, ..., -0.05223796,
        -0.03367291, -0.02986607]])