# Requirements and setup

 Google Colab with access to a GPU(I used Colab T4 CPU )

 Environment setup.

 Data source (for example, a PDF).

 Internet connection (to download the models, but once you have them, it'll run offline).

In [1]:
# Perform Google Colab installs (if running in Google Colab)
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

[INFO] Running in Google Colab, installing requirements.
Collecting PyMuPDF
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.2
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0
Collecting flash-attn
  Downloading flash_attn-2.7.3.tar.gz (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# 1. Document/Text Processing and Embedding Creation
**Ingredients:**

PDF document of choice.
Embedding model of choice.

**Steps:**

1.Import PDF document.

2.Process text for embedding (e.g. split into chunks of sentences).

3.Embed text chunks with embedding model.

4.Save embeddings to file for later use (embeddings will store on file for many years or until you lose your hard drive).

In [75]:
# Download PDF file
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as human-nutrition-text.pdf


can import the pages of our PDF to text by first defining the PDF path and then opening and reading it with PyMuPDF (import fitz).


 write a small helper function to preprocess the text as it gets read. Note that not all text will be read in the same so keep this in mind for when you prepare your text.


 save each page to a dictionary and then append that dictionary to a list for ease of use later.

In [4]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [5]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 33,
  'page_char_count': 755,
  'page_word_count': 123,
  'page_sentence_count_raw': 4,
  'page_token_count': 188.75,
  'text': 'Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=57  \xa0 Achieving a Healthy Diet  |  33'},
 {'page_number': 876,
  'page_char_count': 815,
  'page_word_count': 133,
  'page_sent

# Get some stats on the text
Let's perform a rough exploratory data analysis (EDA) to get an idea of the size of the texts (e.g. character counts, word counts etc)


The different sizes of texts will be a good indicator into how we should split our texts.


Many embedding models have limits on the size of texts they can ingest, for example, the sentence-transformers model all-mpnet-base-v2 has an input size of 384 tokens.


This means that the model has been trained in ingest and turn into embeddings texts with 384 tokens (1 token ~= 4 characters ~= 0.75 words).


Texts over 384 tokens which are encoded by this model will be auotmatically reduced to 384 tokens in length, potentially losing some information.




In [6]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [7]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


our average token count per page is 287.

For this particular use case, it means we could embed an average whole page with the all-mpnet-base-v2 model (this model has an input capacity of 384).

# Further text processing (splitting pages into sentences)

In [8]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [9]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [10]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': 1107,
  'page_char_count': 1789,
  'page_word_count': 337,
  'page_sentence_count_raw': 26,
  'page_token_count': 447.25,
  'text': 'Unmodifiable Risk Factors  Modifiable Risk Factors  • Age. Most cancers  occur in people over  the age of sixty-five.  However, people of all  ages, including  children, can get  cancer.  • Family history.  Certain types of  cancer have a genetic  link. However,  environmental factors  may also play a part.  • Tobacco. Smoking or chewing tobacco  greatly increases the risk for certain  cancers, including cancer of the lungs,  bladder, cervix, kidneys, mouth, and  pancreas.  • Alcohol. Drinking alcohol is linked to  cancers of the mouth, throat,  esophagus, and breast, as well as to  cancers of the neck and head.  • Obesity. Linked to cancers of the colon,  uterus, pancreas, esophagus, kidney,  and breast.  • Cooking techniques. Grilling, smoking,  and preparing meat at high  temperatures forms carcinogens.  • Red meat. The risk of colon c

**Now let's turn out list of dictionaries into a DataFrame and get some stats.**

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


# Chunking our sentences together

In [12]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 975,
  'page_char_count': 739,
  'page_word_count': 128,
  'page_sentence_count_raw': 4,
  'page_token_count': 184.75,
  'text': 'Image by  Allison  Calabrese /  CC BY 4.0  consumption of sodium in the days leading up to an event and  consume sodium-containing sports drinks during their race or  game. The early signs of hyponatremia include nausea, muscle  cramps, disorientation, and slurred speech. \xa0To learn more about  the sports drinks that can optimize your performance, refer back to  Chapter 3, Water and Electrolytes.  Figure 16.11 The Effect of Exercise on Sodium Levels  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  Water and Electrolyte Needs  |  975',
  'sentences': ['Image by  Allison  Calabrese /  CC BY 4.0  consumption of sodium in the days leading up 

In [14]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


# Splitting each chunk into its own item

**embed each chunk of sentences into its own numerical representation.**


**So to keep things clean, let's create a new list of dictionaries each containing a single chunk of sentences with relative information such as page number as well statistics about each chunk.**

In [15]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [16]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': -1,
  'sentence_chunk': 'This work is licensed under a Creative Commons Attribution 4.0 International License. Human Nutrition by the\xa0University of Hawai’i at Mānoa Food Science and Human Nutrition Program. Download this book for free at: \xa0http://pressbooks.oer.hawaii.edu/ humannutrition/ — This Open Educational Resource textbook was also inspired by: Kansas State University Human Nutrition | goo.gl/vOAnR // CC BY 3.0 Edited and Reviewed by Carolyn Donohoe-Mather – University of Hawai’i at Mānoa, Chapter reviewer Cecille Farnum — Ryerson University, Copyeditor Changqi Leu — San Diego State University, Chapter reviewer Billy Meinke — University of Hawai’i at Mānoa, Project manager Paula Parslow — Private, Copyeditor Trina Robertson — Saddleback College, Chapter reviewer Allison Tepper — American University, Chapter reviewer Front Cover Photo Noa Kekuewa Lincoln / CC BY 4.0 Acknowledgements | xli',
  'chunk_char_count': 882,
  'chunk_word_count': 126,
  'chunk_toke

**Now we've broken our whole textbook into chunks of 10 sentences or less as well as the page number they came from.**

**This means we could reference a chunk of text and know its source.**

**Let's get some stats about our chunks.**

In [17]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [19]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
  print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 11.25 | Text: Carbohydrates and Personal Diet Choices | 275
Chunk token count: 20.5 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=118   132 | The Immune System
Chunk token count: 10.5 | Text: 442 | Health Consequences of Alcohol Abuse
Chunk token count: 13.0 | Text: US Department of Agriculture, 1136 | Food Insecurity
Chunk token count: 24.25 | Text: biological, chemicals, or physical) and identify preventative 1014 | Protecting the Public Health


Looks like many of these are headers and footers of different pages.

They don't seem to offer too much information.

Let's filter our DataFrame/list of dictionaries to only include chunks with over 30 tokens in length.

In [20]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

Smaller chunks filtered!

Time to embed our chunks of text

# Embedding our text chunks

In [21]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07982697e-02  3.03164832e-02 -2.01217849e-02  6.86484650e-02
 -2.55256221e-02 -8.47686455e-03 -2.07225574e-04 -6.32377118e-02
  2.81606894e-02 -3.33353989e-02  3.02633960e-02  5.30721806e-02
 -5.03527038e-02  2.62288321e-02  3.33313718e-02 -4.51577231e-02
  3.63044813e-02 -1.37122418e-03 -1.20171458e-02  1.14947259e-02
  5.04510924e-02  4.70856987e-02  2.11913940e-02  5.14606535e-02
 -2.03746483e-02 -3.58889401e-02 -6.67763175e-04 -2.94393823e-02
  4.95859198e-02 -1.05639677e-02 -1.52014112e-02 -1.31758570e-03
  4.48197424e-02  1.56023465e-02  8.60379430e-07 -1.21392624e-03
 -2.37978697e-02 -9.09368275e-04  7.34484056e-03 -2.53933994e-03
  5.23370504e-02 -4.68043424e-02  1.66214760e-02  4.71579395e-02
 -4.15599644e-02  9.01976076e-04  3.60277519e-02  3.42214219e-02
  9.68227163e-02  5.94829023e-02 -1.64984372e-02 -3.51249315e-02
  5.92516130e-03 -7.07903586e-04 -2.4103

In [22]:
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: Yo! How cool are embeddings?
Embedding:
[-1.97448116e-02 -4.51077055e-03 -4.98486962e-03  6.55444860e-02
 -9.87674389e-03  2.72836108e-02  3.66426110e-02 -3.30219767e-03
  8.50078650e-03  8.24952498e-03 -2.28497703e-02  4.02430147e-02
 -5.75200692e-02  6.33691847e-02  4.43207137e-02 -4.49506715e-02
  1.25284614e-02 -2.52011847e-02 -3.55293006e-02  1.29559003e-02
  8.67021922e-03 -1.92917790e-02  3.55635840e-03  1.89505480e-02
 -1.47128161e-02 -9.39848833e-03  7.64175924e-03  9.62184742e-03
 -5.98920882e-03 -3.90168726e-02 -5.47824651e-02 -5.67456335e-03
  1.11644426e-02  4.08067517e-02  1.76319088e-06  9.15305596e-03
 -8.77257995e-03  2.39382870e-02 -2.32784245e-02  8.04999843e-02
  3.19176875e-02  5.12598455e-03 -1.47708450e-02 -1.62525177e-02
 -6.03213124e-02 -4.35689688e-02  4.51211594e-02 -1.79053694e-02
  2.63366792e-02 -3.47866528e-02 -8.89172778e-03 -5.47675341e-02
 -1.24372439e-02 -2.38606706e-02  8.33496898e-02  5.71241677e-02
  1.13328267e-02 -1.49595067e-02  9.2037

Our embedding has a shape of (768,) meaning it's a vector of 768 numbers which represent our text in high-dimensional space, too many for a human to comprehend but machines love high-dimensional space.

**Note: No matter the size of the text input to our all-mpnet-base-v2 model, it will be turned into an embedding size of (768,). This value is fixed. So whether a sentence is 1 token long or 1000 tokens long, it will be truncated/padded with zeros to size 384 and then turned into an embedding vector of size (768,). Of course, other embedding models may have different input/output shapes.**

**Let's start by trying to create embeddings on the CPU, we'll time it with the %%time magic to see how long it takes.**

In [23]:
%%time

# Uncomment to see how long it takes to create embeddings on CPU
# # Make sure the model is on the CPU
# embedding_model.to("cpu")

# # Embed each chunk one by one
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


Now let's see how long it takes to create the embeddings with a GPU.

In [25]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [11]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings


CPU times: user 663 ms, sys: 120 ms, total: 782 ms
Wall time: 1.18 s


tensor([[ 8.2110e-02,  9.5740e-02,  1.0975e-02,  3.9905e-02,  3.5827e-03,
          2.3762e-02, -7.5415e-03, -3.9637e-02,  7.5331e-02,  2.6136e-02,
          7.6754e-02, -4.0063e-02, -6.9554e-04, -1.2440e-02,  2.7238e-03,
          1.7045e-02,  4.0656e-02, -4.9786e-02, -4.3975e-02,  1.4289e-02,
          1.3618e-02,  9.1474e-02,  1.4704e-02,  7.7338e-03,  8.0799e-03,
          7.2877e-03, -4.6331e-02,  8.0579e-02,  4.2629e-02, -8.8117e-03,
         -4.5124e-02, -7.6632e-03,  5.4132e-02,  4.9267e-02,  2.9562e-02,
         -1.6208e-02,  4.4306e-03,  4.2938e-02,  3.9993e-04, -1.5698e-02,
          3.2198e-02, -1.8857e-02,  3.3932e-02,  2.1054e-02, -9.6018e-03,
          5.5528e-03, -1.3004e-02, -3.1280e-02, -4.3724e-03, -2.1249e-02,
         -9.5889e-02, -8.2187e-02, -2.5127e-02, -1.5415e-03,  1.0639e-02,
          4.4545e-02,  8.3761e-02, -7.2531e-03,  3.5560e-02,  4.3388e-03,
         -3.7168e-02, -2.0781e-02, -9.2186e-02,  8.6834e-02,  1.4420e-01,
          7.5978e-02, -4.7421e-03, -4.

# Save embeddings to file

Since creating embeddings can be a timely process (not so much for our case but it can be for more larger datasets), let's turn the pages_and_chunks_over_min_token_len list of dictionaries into a DataFrame and save it

In [18]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [19]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,text_chunk,embedding
0,1,This is the first chunk.,"[0.1, 0.2, 0.3]"
1,2,This is the second chunk.,"[0.4, 0.5, 0.6]"


Let's import our embeddings we created earlier (tk -link to embedding file) and prepare them for use by turning them into a tensor.

In [20]:
import random

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

  text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))


torch.Size([2, 1])

In [21]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,text_chunk,embedding
0,1,This is the first chunk.,[0.1]
1,2,This is the second chunk.,[0.4]


In [22]:
embeddings[0]

tensor([0.1000], device='cuda:0')

In [23]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device) # choose the device to load the model to

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model ready!


Time to perform a semantic search.


Let's say you were studying the macronutrients.

And wanted to search your textbook for "macronutrients functions".

 **we can do so with the following steps:**

Define a query string (e.g. "macronutrients functions") - note: this could be anything, specific or not.
Turn the query string in an embedding with same model we used to embed our text chunks.
Perform a dot product or cosine similarity function between the text embeddings and the query embedding (we'll get to what these are shortly) to get similarity scores.
Sort the results from step 3 in descending order (a higher score means more similarity in the eyes of the model) and use these values to inspect the texts.

In [32]:
# Ensure both tensors are on the same device (either CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move both query_embedding and embeddings to the same device
query_embedding = query_embedding.to(device)
embeddings = embeddings.to(device)

# Reshape embeddings to match the shape of query_embedding
# Ensure that embeddings have the same number of dimensions (384) as the query_embedding
embeddings = embeddings.view(-1, query_embedding.shape[-1])

# Now compute the dot product
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(query_embedding, embeddings)[0]  # Dot product calculation
end_time = timer()

print(f"Time taken to compute scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds")




Time taken to compute scores on 6 embeddings: 0.02381 seconds


In [33]:
# 1. Define the query
# Note: This could be anything. But since we're working with a nutrition textbook, we'll stick with nutrition-based queries.
query = "macronutrients functions"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: macronutrients functions
Time take to get scores on 6 embeddings: 0.00016 seconds.


torch.return_types.topk(
values=tensor([0.0241, 0.0241, 0.0161, 0.0161, 0.0080], device='cuda:0'),
indices=tensor([4, 5, 3, 2, 0], device='cuda:0'))

In [40]:
import torch
from sentence_transformers import util
from time import time as timer

# Assuming query_embedding and embeddings are defined earlier
query_embedding = torch.randn(1, 384).to(device)  # Example query embedding
larger_embeddings = torch.randn(100 * embeddings.shape[0], 384).to(device)  # Fix the dimension of larger_embeddings

print(f"Embeddings shape: {larger_embeddings.shape}")

# Perform dot product across 168,000 embeddings
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()

print(f"Time taken to get scores on {len(larger_embeddings)} embeddings: {end_time - start_time:.5f} seconds.")



Embeddings shape: torch.Size([600, 384])
Time taken to get scores on 600 embeddings: 0.00116 seconds.


However, for much larger datasets, we'd likely look at a dedicated vector database/indexing libraries such as Faiss.

Let's check the results of our original similarity search.

torch.topk returns a tuple of values (scores) and indicies for those scores.

The indicies relate to which indicies in the embeddings tensor have what scores in relation to the query embedding (higher is better).

 can use those indicies to map back to our text chunks.

First,  define a small helper function to print out wrapped text (so it doesn't print a whole text chunk as a single line).

In [41]:
# Define helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

Now we can loop through the top_results_dot_product tuple and match up the scores and indicies and then use those indicies to index on our pages_and_chunks variable to get the relevant text chunk.

In [None]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'macronutrients functions'

Results:
Score: 0.6926

Text:
Macronutrients Nutrients that are needed in large amounts are called
macronutrients. There are three classes of macronutrients: carbohydrates,
lipids, and proteins. These can be metabolically processed into cellular energy.
The energy from macronutrients comes from their chemical bonds. This chemical
energy is converted into cellular energy that is then utilized to perform work,
allowing our bodies to conduct their basic functions. A unit of measurement of
food energy is the calorie. On nutrition food labels the amount given for
“calories” is actually equivalent to each calorie multiplied by one thousand. A
kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with
the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a
macronutrient in the sense that you require a large amount of it, but unlike the
other macronutrients, it does not yield calories. Carbohydrates Carbohydrates
are molecules composed of carbon, hydrogen, and oxygen.

Page number: 5


Score: 0.6738

Text:
Water There is one other nutrient that we must have in large quantities: water.
Water does not contain carbon, but is composed of two hydrogens and one oxygen
per molecule of water. More than 60 percent of your total body weight is water.
Without it, nothing could be transported in or out of the body, chemical
reactions would not occur, organs would not be cushioned, and body temperature
would fluctuate widely. On average, an adult consumes just over two liters of
water per day from food and drink combined. Since water is so critical for
life’s basic processes, the amount of water input and output is supremely
important, a topic we will explore in detail in Chapter 4. Micronutrients
Micronutrients are nutrients required by the body in lesser amounts, but are
still essential for carrying out bodily functions. Micronutrients include all
the essential minerals and vitamins. There are sixteen essential minerals and
thirteen vitamins (See Table 1.1 “Minerals and Their Major Functions” and Table
1.2 “Vitamins and Their Major Functions” for a complete list and their major
functions). In contrast to carbohydrates, lipids, and proteins, micronutrients
are not sources of energy (calories), but they assist in the process as
cofactors or components of enzymes (i.e., coenzymes).

Page number: 8


Score: 0.6646

Text:
Learning Objectives By the end of this chapter, you will be able to: • Describe
basic concepts in nutrition • Describe factors that affect your nutritional
needs • Describe the importance of research and scientific methods to
understanding nutrition What are Nutrients? The foods we eat contain nutrients.
Nutrients are substances required by the body to perform its basic functions.
Nutrients must be obtained from our diet, since the human body does not
synthesize or produce them. Nutrients have one or more of three basic functions:
they provide energy, contribute to body structure, and/or regulate chemical
processes in the body. These basic functions allow us to detect and respond to
environmental surroundings, move, excrete wastes, respire (breathe), grow, and
reproduce. There are six classes of nutrients required for the body to function
and maintain overall health. These are carbohydrates, lipids, proteins, water,
vitamins, and minerals. Foods also contain non-nutrients that may be harmful
(such as natural toxins common in plant foods and additives like some dyes and
preservatives) or beneficial (such as antioxidants). 4 | Introduction

Page number: 4


Score: 0.6536

Text:
Vitamins Major Functions Water-soluble Thiamin (B1) Coenzyme, energy metabolism
assistance Riboflavin (B2 ) Coenzyme, energy metabolism assistance Niacin (B3)
Coenzyme, energy metabolism assistance Pantothenic acid (B5) Coenzyme, energy
metabolism assistance Pyridoxine (B6) Coenzyme, amino acid synthesis assistance
Biotin (B7) Coenzyme, amino acid and fatty acid metabolism Folate (B9) Coenzyme,
essential for growth Cobalamin (B12) Coenzyme, red blood cell synthesis C
(ascorbic acid) Collagen synthesis, antioxidant Fat-soluble A Vision,
reproduction, immune system function D Bone and teeth health maintenance, immune
system function E Antioxidant, cell membrane protection K Bone and teeth health
maintenance, blood clotting Vitamin deficiencies can cause severe health
problems and even death. For example, a deficiency in niacin causes a disease
called pellagra, which was common in the early twentieth century in some parts
of America. The common signs and symptoms of pellagra are known as the
“4D’s—diarrhea, dermatitis, dementia, and death.” Until scientists found out
that better diets relieved the signs and symptoms of pellagra, many people with
the disease ended up hospitalized in insane asylums awaiting death. Other
vitamins were also found to prevent certain disorders and diseases such as
scurvy (vitamin C), night blindness vitamin A, and rickets (vitamin D). Table
1.3 Functions of Nutrients Introduction | 11

Page number: 11


Score: 0.6473

Text:
Figure 1.1 The Macronutrie nts: Carbohydrat es, Lipids, Protein, and Water
Proteins Proteins are macromolecules composed of chains of subunits called amino
acids. Amino acids are simple subunits composed of carbon, oxygen, hydrogen, and
nitrogen. Food sources of proteins include meats, dairy products, seafood, and a
variety of different plant- based foods, most notably soy. The word protein
comes from a Greek word meaning “of primary importance,” which is an apt
description of these macronutrients; they are also known colloquially as the
“workhorses” of life. Proteins provide four kilocalories of energy per gram;
however providing energy is not protein’s most important function. Proteins
provide structure to bones, muscles and skin, and play a role in conducting most
of the chemical reactions that take place in the body. Scientists estimate that
greater than one-hundred thousand different proteins exist within the human
body. The genetic codes in DNA are basically protein recipes that determine the
order in which 20 different amino acids are bound together to make thousands of
specific proteins. Figure 1.1 The Macronutrients: Carbohydrates, Lipids,
Protein, and Water Introduction | 7

Page number: 7

In [60]:
pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.2


In [61]:
import fitz


In [63]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

# Example tensors
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# Calculate dot product
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# Calculate cosine similarity
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

Dot product between vector1 and vector2: tensor(14.)
Dot product between vector1 and vector3: tensor(32.)
Dot product between vector1 and vector4: tensor(-14.)
Cosine similarity between vector1 and vector2: tensor(1.0000)
Cosine similarity between vector1 and vector3: tensor(0.9746)
Cosine similarity between vector1 and vector4: tensor(-1.0000)


# Functionizing our semantic search pipeline

In [64]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [65]:
query = "symptoms of pellagra"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 6 embeddings: 0.00012 seconds.


(tensor([0.0310, 0.0310, 0.0207, 0.0207, 0.0103], device='cuda:0'),
 tensor([4, 5, 3, 2, 0], device='cuda:0'))

In [None]:
# Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 1680 embeddings: 0.00002 seconds.
Query: symptoms of pellagra

Results:
Score: 0.5000

Niacin deficiency is commonly known as pellagra and the symptoms include
fatigue, decreased appetite, and indigestion. These symptoms are then commonly
followed by the four D’s: diarrhea, dermatitis, dementia, and sometimes death.
Figure 9.12 Conversion of Tryptophan to Niacin Water-Soluble Vitamins | 565

Page number: 565


Score: 0.3741

car. Does it drive faster with a half-tank of gas or a full one?It does not
matter; the car drives just as fast as long as it has gas. Similarly, depletion
of B vitamins will cause problems in energy metabolism, but having more than is
required to run metabolism does not speed it up. Buyers of B-vitamin supplements
beware; B vitamins are not stored in the body and all excess will be flushed
down the toilet along with the extra money spent. B vitamins are naturally
present in numerous foods, and many other foods are enriched with them. In the
United States, B-vitamin deficiencies are rare; however in the nineteenth
century some vitamin-B deficiencies plagued many people in North America. Niacin
deficiency, also known as pellagra, was prominent in poorer Americans whose main
dietary staple was refined cornmeal. Its symptoms were severe and included
diarrhea, dermatitis, dementia, and even death. Some of the health consequences
of pellagra are the result of niacin being in insufficient supply to support the
body’s metabolic functions.

Page number: 591


Score: 0.2959

The carbon dioxide gas bubbles infiltrate the stretchy gluten, giving bread its
porosity and tenderness. For those who are sensitive to gluten, it is good to
know that corn, millet, buckwheat, and oats do not contain the proteins that
make gluten. However, some people who have celiac disease also may have a
response to products containing oats. This is most likely the result of cross-
contamination of grains during harvest, storage, packaging, and processing.
Celiac disease is most common in people of European descent and is rare in
people of African American, Japanese, and Chinese descent. It is much more
prevalent in women and in people with Type 1 diabetes, autoimmune thyroid
disease, and Down and Turner syndromes. Symptoms can range from mild to severe
and can include pale, fatty, loose stools, gastrointestinal upset, abdominal
pain, weight loss and, in children, a failure to grow and thrive. The symptoms
can appear in infancy or much later in life, even Nutrition, Health and Disease
| 1079

Page number: 1079


Score: 0.2793

Image by BruceBlaus/ CC BY 4.0 When the vertebral bone tissue is weakened, it
can cause the spine to curve. The increase in spine curvature not only causes
pain, but also decreases a person’s height. Curvature of the upper spine
produces what is called Dowager’s hump, also known as kyphosis. Severe upper-
spine deformity can compress the chest cavity and cause difficulty breathing. It
may also cause abdominal pain and loss of appetite because of the increased
pressure on the abdomen. 1090 | Nutrition, Health and Disease

Page number: 1090


Score: 0.2721

esophagus and cause irritation. It is estimated that GERD affects 25 to 35
percent of the US population. An analysis of several studies published in the
August 2005 issue of Annals of Internal Medicine concludes that GERD is much
more prevalent in people who are obese.1 The most common GERD symptom is
heartburn, but people with GERD may also experience regurgitation (flow of the
stomach’s acidic contents into the mouth), frequent coughing, and trouble
swallowing. There are other causative factors of GERD that may be separate from
or intertwined with obesity. The sphincter that separates the stomach’s internal
contents from the esophagus often does not function properly and acidic gastric
contents seep upward. Sometimes the peristaltic contractions of the esophagus
are also sluggish and compromise the clearance of acidic contents. In addition
to having an unbalanced, high-fat diet, some people with GERD are sensitive to
particular foods—chocolate, garlic, spicy foods, fried foods, and tomato-based
foods—which worsen symptoms. Drinks containing alcohol or caffeine may also
worsen GERD symptoms. GERD is diagnosed most often by a history of the frequency
of recurring symptoms. A more proper diagnosis can be made when a doctor inserts
a small device into the lower esophagus that measures the acidity of the
contents during one’s daily activities.

Page number: 1077

# Checking local GPU memory availability

In [67]:
# Get GPU available memory
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 15 GB


In [68]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it
