### Run a local rag pipeline from scratch

Rundown of the steps:

1. Extract the text from the PDF, page by page
2. Use sentencizer to find all sentences in a page and put in array
3. Split each page's sentences into chunks of at most 10 sentences each.
4. Now split every chunk into its own object
5. Remove chunks with low token count
6. Embed the chunks in an array
7. Convert to np.array
8. Convert to torch.tensor and set device to cuda
9. Load an LLM
10. RAG creation - convert query to embedding
11. RAG creation - get top scores through dot product topk
12. RAG creation - Get the context text from chunks using the indices
13. RAG creation - fuse the context items and query to create an accurate prompt
14. Generate the output through tokenizer, and then decode

In [1]:
import os
import requests

#Get pdf document path
pdf_path = "human-nutrition-text.pdf"

In [2]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Flatten newlines into spaces and trim leading/trailing whitespace."""
    # Splitting on "\n" and re-joining with a single space is equivalent to
    # replacing every newline character with a space.
    flattened = " ".join(text.split("\n"))

    # Further text clean-up steps could be added here.
    return flattened.strip()

def open_and_read_pdf(pdf_path: str, page_number_offset: int = 41) -> list[dict]:
    """Read every page of a PDF and collect its text plus simple statistics.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        page_number_offset: Value subtracted from the zero-based page index to
            produce ``page_number``. Defaults to 41, the value previously
            hard-coded (presumably the length of this book's front matter —
            confirm for other documents).

    Returns:
        One dict per page with the keys ``page_number``, ``page_char_count``,
        ``page_word_count``, ``page_sentence_count_raw``, ``page_token_count``
        and ``text``.
    """
    pages_and_texts = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # Passing total lets tqdm render a real progress bar instead of "0it".
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(text=page.get_text())
            pages_and_texts.append({"page_number": page_number - page_number_offset,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    # naive count: splits on ". " only
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    # rough estimate: 1 token ~= 4 characters
                                    "page_token_count": len(text) / 4,
                                    "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [4]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


In [5]:
from spacy.lang.en import English

nlp = English()

#add a sentencizer pipeline

nlp.add_pipe("sentencizer")

#create a document instance
doc = nlp("here i am. where you go? hello baby.")
assert len(list(doc.sents)) == 3

list(doc.sents)

[here i am., where you go?, hello baby.]

In [6]:
for item in tqdm(pages_and_texts):
    # Run the sentencizer on the page text and convert every detected
    # sentence object to a plain string in one pass.
    page_sentences = [str(sentence) for sentence in nlp(item["text"]).sents]
    item["sentences"] = page_sentences

    # Record how many sentences the sentencizer found on this page.
    item["page_sentence_count_spacy"] = len(page_sentences)

  0%|          | 0/1208 [00:00<?, ?it/s]

In [7]:
import random

In [8]:
random.sample(pages_and_texts, k=1)

[{'page_number': 240,
  'page_char_count': 1454,
  'page_word_count': 244,
  'page_sentence_count_raw': 11,
  'page_token_count': 363.5,
  'text': 'Digestion and Absorption of  Carbohydrates  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  From the Mouth to the Stomach  The mechanical and chemical digestion of carbohydrates begins  in the mouth. Chewing, also known as mastication, crumbles the  carbohydrate foods into smaller and smaller pieces. The salivary  glands in the oral cavity secrete saliva that coats the food particles.  Saliva contains the enzyme, salivary amylase. This enzyme breaks  the bonds between the monomeric sugar units of disaccharides,  oligosaccharides, and starches. The salivary amylase breaks down  amylose and amylopectin into smaller chains of glucose, called  dextrins and maltose. The increased concentration of maltose in the  mouth that results from the mechanical and chemical breakdown  of starches in who

In [9]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


In [10]:
#chunking our sentences together

# Default number of sentences grouped into a single chunk.
num_sentence_chunk_size = 10

def split_list(input_list: list[str],
             slice_size: int=num_sentence_chunk_size) ->list[list[str]]:
    """Cut input_list into consecutive slices of at most slice_size items.

    The final slice holds whatever is left over, so it may be shorter than
    slice_size. An empty input yields an empty list.
    """
    slices = []
    for start in range(0, len(input_list), slice_size):
        slices.append(input_list[start:start + slice_size])
    return slices

test_list = list(range(25))

split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [11]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                          slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [12]:
random.sample(pages_and_texts, k=1)

[{'page_number': 133,
  'page_char_count': 1849,
  'page_word_count': 318,
  'page_sentence_count_raw': 12,
  'page_token_count': 462.25,
  'text': 'Indicators of Health: Body  Mass Index, Body Fat  Content, and Fat Distribution  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Although the terms overweight and obese are often used  interchangeably and considered as gradations of the same thing,  they denote different things. The major physical factors  contributing to body weight are water weight, muscle tissue mass,  bone tissue mass, and fat tissue mass. Overweight refers to having  more weight than normal for a particular height and may be the  result of water weight, muscle weight, or fat mass. Obese refers  specifically to having excess body fat. In most cases people who are  overweight also have excessive body fat and therefore body weight  is an indicator of obesity in much of the population.  The “ideal” healthy body weight 

In [13]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [14]:
#split each chunk into its own item
import re

pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Glue the chunk's sentences into one string, collapse double spaces
        # and trim the ends.
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        # Joining without a separator produces ".A"; put a space back after a
        # full stop that is directly followed by a capital letter.
        joined_sentence_chunk = re.sub(r'\.([A-Z])',r'. \1', joined_sentence_chunk)

        chunk_length = len(joined_sentence_chunk)
        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": chunk_length,
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            # rough estimate: 1 token ~= 4 characters
            "chunk_token_count": chunk_length / 4,
        }
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [15]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 71,
  'sentence_chunk': 'Journal of Nutrition, 138(6), 1250S–4S. http://jn.nutrition.org/content/138/6/ 1250S.long The Digestive System | 71',
  'chunk_char_count': 115,
  'chunk_word_count': 12,
  'chunk_token_count': 28.75}]

In [16]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


In [17]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')


Chunk token count: 25.25 | Text: The Polynesian Family System in Ka-‘u. Rutland, Vermont: Charles E. Tuttle Company 780 | Introduction
Chunk token count: 19.5 | Text: 2009). Dietary Glycemic Index: Digestion and Absorption of Carbohydrates | 247
Chunk token count: 23.0 | Text: view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=455  Infancy | 851
Chunk token count: 4.5 | Text: 708 | Introduction
Chunk token count: 24.75 | Text: Table 9.33 Some Phytochemical’s Obtained from Diet and Their Related Functions 600 | Phytochemicals


In [18]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [19]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 557,
  'sentence_chunk': 'http://ods.od.nih.gov/factsheets/VitaminC-QuickFacts/. Updated June 24, 2011. Accessed October 5, 2017. Water-Soluble Vitamins | 557',
  'chunk_char_count': 132,
  'chunk_word_count': 13,
  'chunk_token_count': 33.0}]

#embedding our text chunks

In [20]:
from sentence_transformers import SentenceTransformer

In [21]:
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                       device="cpu")

#demo

# Three separate demo sentences. Every element needs its own comma: without
# one, Python silently concatenates adjacent string literals into a single item.
sentences = ["bingo is a great way to teach myself what emebbding is",
             "multiple embeddings are also possible",
             "lets try this again"]

#sentences are encoded/embedded by calling model.encode()

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

#see the embeddings
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print("")



Sentence: bingo is a great way to teach myself what emebbding is
Embedding: [-2.19521690e-02 -3.71259265e-02  7.61350337e-03  4.20373958e-03
 -2.22289357e-02  5.23242429e-02 -1.06049985e-01  1.06263366e-02
 -3.44943069e-03  5.04260547e-02 -2.58151200e-02 -2.56067310e-02
 -1.05888564e-02 -4.48169708e-02  3.47519852e-02  1.23472633e-02
 -1.03588933e-02  2.88288146e-02 -2.80886032e-02  6.66685170e-03
 -2.11952981e-02  4.64505190e-03 -5.05887195e-02  5.20383678e-02
  5.20579219e-02  2.92763244e-02 -3.66372652e-02 -2.91706435e-02
 -1.83297098e-02 -3.89698185e-02 -1.37568926e-02 -3.70569318e-03
 -4.90505248e-02  2.58246474e-02  1.68330621e-06 -3.61195905e-03
 -2.91094817e-02 -1.31178517e-02  2.04303977e-03  1.96796358e-02
  3.53965834e-02  1.16316536e-02  3.45935784e-02  3.52282189e-02
 -7.05943396e-03 -5.84399700e-02  7.76792541e-02 -2.56688315e-02
  5.38303480e-02 -1.68943089e-02  9.24782734e-03  5.11653982e-02
 -4.62335199e-02 -5.67695498e-02  1.14756171e-02  3.73718403e-02
  2.44857278e-

In [22]:
embeddings[0].shape

(768,)

In [23]:
%%time

embedding_model.to("cuda")



CPU times: total: 328 ms
Wall time: 499 ms


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [24]:
import torch
torch.cuda.is_available()

True

In [25]:
%%time

embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: total: 1min 5s
Wall time: 19.1 s


In [26]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunks[419]

CPU times: total: 0 ns
Wall time: 0 ns


'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture to their fascinating food creations. Add

In [27]:
len(text_chunks)

1680

In [28]:
%%time

# Embed all texts in batches

text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                                convert_to_tensor=True)

text_chunk_embeddings

CPU times: total: 27.1 s
Wall time: 6.97 s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

### save embeddings to file

In [29]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path,index=False)

In [30]:
#import saved_file and view

text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242526e-02 9.02281255e-02 -5.09548467e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156232e-02 5.92139289e-02 -1.66167468e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5,[ 2.79801860e-02 3.39813940e-02 -2.06426699e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25,[ 6.82566687e-02 3.81275043e-02 -8.46853387e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264419e-02 -8.49766657e-03 9.57159232e-...


#RAG - Search and Answer

#Similarity Search

In [31]:
import random
import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

#import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

#convert embedding column back to np.array
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

#convert embeddings into torch.tensor
embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device)

pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

text_chunks_and_embedding_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.0674242526, 0.0902281255, -0.00509548467, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.0552156232, 0.0592139289, -0.0166167468, -0..."
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.50,"[0.027980186, 0.033981394, -0.0206426699, 0.00..."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25,"[0.0682566687, 0.0381275043, -0.00846853387, -..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.50,"[0.0330264419, -0.00849766657, 0.00957159232, ..."
...,...,...,...,...,...,...
1675,1164,Flashcard Images Note: Most images in the flas...,1304,186,326.00,"[0.0185622461, -0.0164277442, -0.0127045633, -..."
1676,1164,Hazard Analysis Critical Control Points reused...,374,51,93.50,"[0.03347205, -0.0570440665, 0.0151489489, -0.0..."
1677,1165,ShareAlike 11. Organs reused “Pancreas Organ A...,1285,175,321.25,"[0.0770515352, 0.00978559069, -0.0121817421, 0..."
1678,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,410,63,102.50,"[0.103045158, -0.0164702032, 0.00826845504, 0...."


In [32]:
embeddings.shape

torch.Size([1680, 768])

In [33]:
#Create model

from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)



# embedding model ready

# let's create a small semantic search pipeline

# In essence, we want to get back relevant passages from textbook

In [34]:
#1. define the query

query = "macronutrient functions"

#2. embed the query
# Place the query on the same device as `embeddings` instead of hard-coding
# "cuda", so the similarity search also runs on machines without a GPU.
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to(embeddings.device)

#3. Get similarity scores with dot product (use cosine similarity if outputs of model are not normalized)
from time import perf_counter as timer

start_time = timer()
# dot_score returns a matrix of scores; row 0 holds the scores for our single query
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"[INFO] timer score: {end_time-start_time:.5f} seconds.")

#4. get the top k results (we'll keep this to top 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

[INFO] timer score: 0.00017 seconds.


torch.return_types.topk(
values=tensor([0.6843, 0.6717, 0.6517, 0.6493, 0.6478], device='cuda:0'),
indices=tensor([42, 47, 46, 51, 41], device='cuda:0'))

In [35]:
import torch

def dot_product(vector1, vector2):
    """Return the dot product of two 1-D tensors, computed with ``torch.dot``."""
    result = torch.dot(vector1, vector2)
    return result

def cosine_product(vector1,vector2):
    """Return the cosine similarity of two 1-D tensors.

    The result is a 0-dim tensor in [-1, 1]: 1 for vectors pointing the same
    way, 0 for orthogonal vectors and -1 for opposite vectors.
    """
    # torch has no `torch.cosine`; the cosine-similarity routine lives in
    # torch.nn.functional. dim=0 reduces over the only axis of a 1-D vector.
    return torch.nn.functional.cosine_similarity(vector1, vector2, dim=0)

In [36]:
#Functionizing our semantic search pipeline

def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5):
    """
    Embed a query with model and return top k scores and indices from embeddings.

    Args:
        query: Text to search for.
        embeddings: Tensor of stored embeddings to score against (one row per
            text chunk in this notebook).
        model: Model used to embed the query. The default is whichever
            ``embedding_model`` existed when this function was defined.
        n_resources_to_return: Maximum number of results to return; capped at
            the number of available embeddings.

    Returns:
        A ``(scores, indices)`` tuple of tensors as produced by ``torch.topk``,
        highest score first.
    """
    # Embed the query and move it next to the stored embeddings so the dot
    # product does not fail on a CPU/GPU device mismatch.
    query_embedding = model.encode(query, convert_to_tensor=True).to(embeddings.device)

    # dot_score returns a matrix of scores; row 0 belongs to our single query
    dot_scores = util.dot_score(query_embedding, embeddings)[0]

    # torch.topk raises if k exceeds the number of candidates, so clamp it
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores,
                                 k=k)

    return scores, indices

In [37]:
retrieve_relevant_resources(query="foods high in fiber", embeddings=embeddings)

(tensor([0.6964, 0.6810, 0.5566, 0.5344, 0.5187], device='cuda:0'),
 tensor([ 418,  360,  358, 1047,  412], device='cuda:0'))

In [38]:
import torch

gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available memory: {gpu_memory_gb} GB")

Available memory: 10 GB


In [39]:
!nvidia-smi

Tue Feb 25 00:07:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080      WDDM  |   00000000:03:00.0 Off |                  N/A |
| 30%   48C    P2             26W /  320W |    9885MiB /  10240MiB |     71%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [40]:
# Pick a Gemma variant and decide on quantization from the GPU memory size.
# Every branch assigns both variables, and the chain ends in `else`, so the
# prints below can never hit a NameError (previously the first branch set
# nothing and a value of exactly 19 matched no branch at all).
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    # Best effort: fall back to the smallest model with quantization enabled
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 10 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


In [41]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

#1. Create a quantization config
from transformers import BitsAndBytesConfig
# The keyword is `bnb_4bit_compute_dtype` (bnb = bitsandbytes); the previous
# spelling `nbn_...` did not match it, so float16 compute was never requested.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)
attn_implementation = "sdpa" #scaled dot product attention

#2. pick a model
# NOTE(review): this overrides the model_id chosen by the GPU-memory check in
# the earlier cell — confirm that is intended.
model_id = "google/gemma-2b-it"

#3. Instantiate tokenizer (turns text into tokens)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

#4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                  torch_dtype=torch.float16,
                                                  quantization_config=quantization_config if use_quantization_config else None,
                                                  low_cpu_mem_usage=False, #use as much memory as we can
                                                  attn_implementation=attn_implementation)

# Only the non-quantized model is moved to the GPU explicitly here
if not use_quantization_config:
    llm_model.to("cuda")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [42]:
!nvidia-smi

Tue Feb 25 00:07:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080      WDDM  |   00000000:03:00.0 Off |                  N/A |
| 30%   49C    P2            103W /  320W |    9926MiB /  10240MiB |     33%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [43]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRM

In [44]:
def get_model_num_params(model: torch.nn.Module):
    """Count the elements across every parameter tensor of ``model``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

get_model_num_params(llm_model)

2506172416

In [45]:
def get_model_mem_size(model: torch.nn.Module):
    """Report the memory taken by a model's parameters and buffers.

    Returns a dict with ``model_mem_bytes`` (exact byte count) and
    ``model_mem_gb`` (the same figure in GiB, rounded to two decimals).
    """
    def _total_bytes(tensors):
        # bytes per tensor = number of elements * bytes per element
        return sum(t.nelement() * t.element_size() for t in tensors)

    param_bytes = _total_bytes(model.parameters())
    buffer_bytes = _total_bytes(model.buffers())

    # Combine both and convert to GiB (1024**3 bytes)
    total_bytes = param_bytes + buffer_bytes
    return {"model_mem_bytes": total_bytes,
            "model_mem_gb": round(total_bytes / (1024**3), 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 5079453696, 'model_mem_gb': 4.73}

#we got our model size, loading gemma 2b-it in float16

In [46]:
input_text = "what are the micronutrients, and what roles do they play in the human body?"
print(f"Input Text:\n{input_text}")

dialogue_template = [
    {"role": "user",
     "content": input_text}
]

#apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted): \n{prompt}")

Input Text:
what are the micronutrients, and what roles do they play in the human body?

Prompt (formatted): 
<bos><start_of_turn>user
what are the micronutrients, and what roles do they play in the human body?<end_of_turn>
<start_of_turn>model



In [47]:
tokenizer

GemmaTokenizerFast(name_or_path='google/gemma-2b-it', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_w

In [48]:
%%time

#tokenize the input text (turn it into numbers, and send it to gpu)
input_ids = tokenizer(prompt,
                      return_tensors="pt").to("cuda")
input_ids

#Generate outputs from local LLM
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256)
print(f"Model output (tokens):\n{outputs[0]}\n")

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   5049,    708,    573,  92800,
        184592, 235269,    578,   1212,  16065,    749,    984,   1554,    575,
           573,   3515,   2971, 235336,    107,    108,    106,   2516,    108,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,  

In [49]:
#decode the output tokens to text

outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output decoded: \n {outputs_decoded}\n")

Model output decoded: 
 <bos><bos><start_of_turn>user
what are the micronutrients, and what roles do they play in the human body?<end_of_turn>
<start_of_turn>model
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [50]:
# Nutrition-style questions generated with GPT4
gpt4_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management."
]

# Manually created question list
manual_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins"
]
query_list = gpt4_questions + manual_questions

In [51]:
import random

# Draw one query at random from the combined question pool and echo it.
query = random.choice(query_list)
print(f"Query: {query}")

# Fetch just the scores and indices for this query; the chunk text itself
# is not looked up here.
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

# Bare expression: as the last line of the cell, the notebook displays it.
scores, indices

Query: How often should infants be breastfed?


(tensor([0.6205, 0.6067, 0.5696, 0.5624, 0.5307], device='cuda:0'),
 tensor([1151, 1160, 1144, 1138, 1155], device='cuda:0'))

## Augmenting our prompt with context items

In [52]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """Build the chat-formatted RAG prompt for a query and its context.

    The "sentence_chunk" text of every context item is rendered as a
    "- "-prefixed bullet list, inserted together with the query into a
    few-shot instruction template, and the result is wrapped as a single
    user turn through the module-level ``tokenizer``'s chat template.

    Args:
        query: The user question to answer.
        context_items: Retrieved chunk dicts; each must have a
            "sentence_chunk" key.

    Returns:
        The untokenized prompt string (``tokenize=False``) with the
        generation prompt appended (``add_generation_prompt=True``).
    """
    # One bullet per retrieved chunk.
    bullet_body = "\n- ".join(item["sentence_chunk"] for item in context_items)
    context = "- " + bullet_body

    # Few-shot instruction template; {context} and {query} are filled below.
    prompt_template = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""
    user_message = prompt_template.format(context=context, query=query)

    # Wrap the filled template as a single user turn for the
    # instruction-tuned model and let the tokenizer apply its chat template.
    return tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": user_message}],
        tokenize=False,
        add_generation_prompt=True,
    )

# Pick a random query from the question pool and echo it.
query = random.choice(query_list)
print(f"Query: {query}")

# Get the scores and indices of the relevant resources for the query.
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

# Look up the chunk dict behind every retrieved index.
context_items = [pages_and_chunks[chunk_idx] for chunk_idx in indices]

# Format the prompt from the query plus its context items and show it.
prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)

Query: Describe the process of digestion and absorption of nutrients in the human body.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.

Example 

In [53]:
%%time

# Tokenize the formatted prompt into PyTorch tensors and move them to the GPU.
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens.
# NOTE(review): the recorded run of this cell ended in
# "RuntimeError: CUDA error: device-side assert triggered"; as that message
# suggests, rerun with CUDA_LAUNCH_BLOCKING=1 to locate the failing op.
outputs = llm_model.generate(
    **input_ids,
    max_new_tokens=256,
    do_sample=True,   # whether or not to use sampling
    temperature=0.7,  # from 0 to 1: lower is more deterministic, higher is more creative
)

# Turn the output tokens into text and print the answer with the prompt
# text stripped out of the decoded string.
output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"RAG answer: \n {output_text.replace(prompt, '')}")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Functionize our LLM answering feature

In [54]:
def ask(query: str,
        temperature: float=0.7,
        max_new_tokens: int=256,
        format_answer_text: bool=True,
        return_answer_only: bool=True):
    """Answer a query with retrieval-augmented generation.

    Retrieves the relevant chunks for ``query``, builds a prompt from them
    with ``prompt_formatter``, and samples an answer from ``llm_model``.
    Relies on the module-level ``embeddings``, ``pages_and_chunks``,
    ``tokenizer`` and ``llm_model``.

    Args:
        query: The question to answer.
        temperature: Sampling temperature passed to ``llm_model.generate``.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: If True, strip the prompt text and the
            ``<bos>``/``<eos>`` markers from the decoded output.
        return_answer_only: If True, return just the answer text.

    Returns:
        The answer text, or ``(answer_text, context_items)`` when
        ``return_answer_only`` is False. Each context item is a copy of the
        retrieved chunk dict with an added "score" entry (a CPU tensor).
    """

    # RETRIEVAL
    # Get just the scores and indices of the top related results.
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Create the list of context items with their scores attached.
    # Each chunk dict is shallow-copied first so that adding "score" does not
    # mutate the shared entries of `pages_and_chunks` on every call.
    context_items = []
    for score, index in zip(scores, indices):
        item = dict(pages_and_chunks[index])
        item["score"] = score.cpu()  # return score back to cpu
        context_items.append(item)

    # AUGMENTATION
    # Create the prompt and format it with the context items.
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # GENERATION
    # Tokenize the prompt and place the tensors on the model's own device
    # instead of assuming the model lives on "cuda".
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # Generate an output of tokens.
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Decode the tokens into text.
    output_text = tokenizer.decode(outputs[0])

    # Format the answer.
    if format_answer_text:
        # Replace prompt and special tokens.
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "")

    # Only return the answer, without the context items.
    if return_answer_only:
        return output_text

    return output_text, context_items

In [55]:
# Pick a random query from the question pool and echo it.
query = random.choice(query_list)
print(f"Query: {query}")

# Run the full retrieve / augment / generate pipeline with default settings.
# As the last expression of the cell, the returned value is shown by the notebook.
ask(query=query)

Query: How often should infants be breastfed?


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
