# RAG on local device

1. Document preprocessing and embedding creation
2. Search and Answer

### 1. Document/text processing and embedding creation

In [3]:
import os
import requests

# Local path of the source PDF; reused by the parsing cell further down.
pdf_path = "human-nutrition-text.pdf"

# Only hit the network when the PDF is not already on disk.
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    filename = pdf_path

    # Without a timeout `requests.get` can block forever on a stalled server,
    # which would hang the whole notebook run. The value is in seconds.
    response = requests.get(url, timeout=60)

    if response.status_code == 200:
        # Binary mode: the payload is a PDF, not text.
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")

else:
    print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


In [4]:
import fitz

from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Flatten newlines to single spaces and trim surrounding whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The text with every newline character replaced by one space and
        with leading/trailing whitespace removed.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path: str, page_number_offset: int = 41) -> list[dict]:
    """Read a PDF page by page and collect per-page text and simple statistics.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number_offset: Value subtracted from the zero-based PDF page index
            to produce ``page_number``. Defaults to 41, the value this notebook
            uses (presumably the number of front-matter pages in this
            particular book -- confirm before reusing with another PDF).

    Returns:
        One dict per page, in document order, with the keys ``page_number``,
        ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
        ``page_token_count`` and ``text``.
    """
    pages_and_texts = []
    # The context manager closes the PDF handle even if extraction fails.
    with fitz.open(pdf_path) as doc:
        # Passing `total` lets tqdm draw a real progress bar; `enumerate`
        # alone has no length, so the bar would only show a running count.
        for page_index, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(text=page.get_text())
            pages_and_texts.append({
                "page_number": page_index - page_number_offset,
                "page_char_count": len(text),
                # NOTE(review): splitting on a single space counts the empty
                # strings produced by consecutive spaces and yields 1 for an
                # empty page. Kept as-is so downstream statistics are unchanged.
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                # Rough estimate: character count divided by 4.
                "page_token_count": len(text) / 4,
                "text": text,
            })

    return pages_and_texts

# Parse the PDF once; `pages_and_texts` holds one dict per page.
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
# Preview the first two page records.
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [5]:
import random

# Spot-check three randomly chosen page records (no seed is set in this cell,
# so the selection can differ between runs).
random.sample(pages_and_texts, k=3)

[{'page_number': 1050,
  'page_char_count': 1555,
  'page_word_count': 273,
  'page_sentence_count_raw': 12,
  'page_token_count': 388.75,
  'text': 'The DASH Diet  The Dietary Approaches to Stop Hypertension, or DASH diet,  focuses on reducing sodium intake to either 2,300 milligrams per  day (as recommended by the Dietary Guidelines for Americans) or  1,500 milligrams per day for certain populations. The DASH diet  is an evidence-based eating plan that can help reduce high blood  pressure. This plan may also decrease the risk of heart attack,  stroke, diabetes, osteoporosis, and certain cancers.3  DASH tips to lower sodium include:  • Using spices instead of salt to add flavor  • Reading sodium content on processed or canned food labels,  and choosing low-sodium options  • Removing some sodium from canned foods (such as beans) by  rinsing the product before consumption  • Avoiding salt when cooking  DASH dieters are recommended to consume a variety of whole  grains and high-fiber fru

In [6]:
import pandas as pd

# Load the per-page dicts into a DataFrame (one row per page) for inspection.
df = pd.DataFrame(pages_and_texts)
# Show the first five rows.
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [7]:
# Summary statistics of the numeric per-page columns, rounded to 2 decimals.
df.describe().round(decimals=2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


#### Further text processing (splitting pages into sentences)

In [9]:
from spacy.lang.en import English

# Create the English pipeline object used for sentence splitting below.
nlp = English()

# Add the "sentencizer" component to the pipeline.
nlp.add_pipe("sentencizer")

# Run the pipeline on a small three-sentence example and check the split.
doc = nlp("This is a sentence. This is another one. This is the last.")
assert len(list(doc.sents)) == 3

# Display the sentences found in the example.
list(doc.sents)

[This is a sentence., This is another one., This is the last.]

In [15]:
# Inspect a single page record by list index (index 600, not page_number 600).
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 863,
 'page_word_count': 138,
 'page_sentence_count_raw': 9,
 'page_token_count': 215.75,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5. Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.   https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitamins  | 

In [16]:
# Split every page's text into sentences with the spaCy pipeline.
for item in tqdm(pages_and_texts):
    # Convert each spaCy sentence span to a plain str while collecting.
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Number of sentences spaCy found on this page.
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [19]:
# Spot-check one random page record now that the sentence fields were added.
random.sample(pages_and_texts, k=1)

[{'page_number': 562,
  'page_char_count': 1339,
  'page_word_count': 268,
  'page_sentence_count_raw': 17,
  'page_token_count': 334.75,
  'text': 'Food  Serving Thiamin  (mg)  Percent Daily  Value  Breakfast cereals, fortified  1  serving  1.5  100  White rice, enriched  ½ c.  1.4  73  Pork chop, broiled  3 oz.  0.4  27  Black beans, boiled  ½ c.  0.4  27  Tuna, cooked  3 oz.  0.2  13  Brown rice, cooked, not  enriched  ½ c.  0.1  7  Whole wheat bread  1 slice  0.1  7  2% Milk  8 oz.  0.1  7  Cheddar cheese  1 ½ oz  0  0  Apple, sliced  1 c.  0  0  Health Professional Fact Sheet: Thiamin. National Institutes of  Health, Office of Dietary Supplements.https://ods.od.nih.gov/ factsheets/Thiamin-HealthProfessional/ . Updated February 11,  2016 . Accessed October 5, 2017.  Riboflavin (B2)  Riboflavin is an essential component of flavoproteins, which are  coenzymes involved in many metabolic pathways of carbohydrate,  lipid, and protein metabolism. Flavoproteins aid in the transfer of  ele

In [20]:
# Rebuild the DataFrame so it picks up the keys added to each page dict above.
df = pd.DataFrame(pages_and_texts)
# Summary statistics of the numeric columns, rounded to 2 decimals.
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


In [21]:
# Chunking sentences into smaller groups

# Default number of sentences grouped into one chunk.
num_sentence_chunk_size = 10

def split_list(input_list: list,
               slice_size: int = num_sentence_chunk_size) -> list[list]:
    """Split a list into consecutive sub-lists of at most ``slice_size`` items.

    Order is preserved and the final sub-list holds the remainder, so it may
    be shorter than ``slice_size``. An empty input yields an empty list.

    Args:
        input_list: The list to split (sentence strings in this notebook).
        slice_size: Maximum length of each sub-list. The default is the value
            of ``num_sentence_chunk_size`` at the time this function is defined.

    Returns:
        A list of sub-lists covering ``input_list`` from start to end.

    Raises:
        ValueError: If ``slice_size`` is smaller than 1.
    """
    # A negative step would make `range` empty and silently drop every item,
    # and a zero step fails with an unrelated-looking `range()` error.
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Demo: 25 items with the default chunk size.
test_list = list(range(25))
split_list(test_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [22]:
# Group each page's sentences into chunks of `num_sentence_chunk_size`.
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)

    # How many chunks this page produced.
    item["num_chunks"] = len(item["sentence_chunks"])
    
    

  0%|          | 0/1208 [00:00<?, ?it/s]

In [24]:
# Spot-check one random page record now that the chunk fields were added.
random.sample(pages_and_texts, k=1)

[{'page_number': 5,
  'page_char_count': 1742,
  'page_word_count': 295,
  'page_sentence_count_raw': 17,
  'page_token_count': 435.5,
  'text': 'Macronutrients  Nutrients  that  are  needed  in  large  amounts  are  called  macronutrients. There are three classes of macronutrients:  carbohydrates, lipids, and proteins. These can be metabolically  processed into cellular energy. The energy from macronutrients  comes from their chemical bonds. This chemical energy is  converted into cellular energy that is then utilized to perform work,  allowing our bodies to conduct their basic functions. A unit of  measurement of food energy is the calorie. On nutrition food labels  the amount given for “calories” is actually equivalent to each calorie  multiplied by one thousand. A kilocalorie (one thousand calories,  denoted with a small “c”) is synonymous with the “Calorie” (with a  capital “C”) on nutrition food labels. Water is also a macronutrient in  the sense that you require a large amount o

In [25]:
# Rebuild the DataFrame so it includes the `num_chunks` key added above.
df = pd.DataFrame(pages_and_texts)
# Summary statistics of the numeric columns, rounded to 2 decimals.
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0
