In [1]:
import os
import requests
import fitz
from tqdm.auto import tqdm
import random
import pandas as pd
from spacy.lang.en import English
import re
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np

# Local cache path for the textbook, the URL it is downloaded from, and the
# device string handed to the embedding model and generation pipeline below.
pdf_path = "human-nutrition-text.pdf"
url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
device = "cpu"

# Download the PDF only when it is not already on disk.
if not os.path.exists(pdf_path):
    # A timeout stops the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail here with a clear message instead of silently skipping the write and
    # letting the later PDF open fail on a missing file.
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download {url}: HTTP status {response.status_code}"
        )
    with open(pdf_path, "wb") as file:
        file.write(response.content)

# Blank English pipeline with only the rule-based sentence splitter attached.
nlp = English()
nlp.add_pipe("sentencizer")

def text_formatter(text: str) -> str:
    """Flatten ``text`` onto one line: newlines become spaces, then both ends are stripped."""
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
    """Read every page of a PDF and collect its text with simple size statistics.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Amount subtracted from the zero-based page index to form
            ``page_number``. Defaults to 41, the value that was previously
            hard-coded (presumably the count of front-matter pages in this
            book — confirm against the PDF).

    Returns:
        One dict per page with the keys ``page_number``, ``page_char_count``,
        ``page_word_count`` (split on single spaces), ``page_sentence_count_raw``
        (split on ``". "``), ``page_token_count`` (characters / 4) and ``text``.
    """
    pages_and_texts = []
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count
        # explicitly to get a real progress bar instead of a bare counter.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())
            pages_and_texts.append({
                "page_number": page_number - page_offset,
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,
                "text": text
            })
    return pages_and_texts

# Extract the text and per-page statistics for every page of the book.
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

# Tabular snapshot of the per-page records as they stand before sentences are added.
df = pd.DataFrame(pages_and_texts)

# Split each page into sentences with the spaCy pipeline and record how many were found.
for item in tqdm(pages_and_texts):
    page_sentences = list(map(str, nlp(item["text"]).sents))
    item["sentences"] = page_sentences
    item["page_sentence_count_spacy"] = len(page_sentences)

def split_list(input_list: list, slice_size: int) -> list[list[str]]:
    """Break ``input_list`` into consecutive slices of at most ``slice_size`` items.

    The final slice holds the remainder and may be shorter; an empty input
    yields an empty list.
    """
    slices = []
    for start in range(0, len(input_list), slice_size):
        slices.append(input_list[start:start + slice_size])
    return slices

# Number of sentences grouped into a single chunk.
num_sentence_chunk_size = 10

# Group each page's sentences into fixed-size chunks and record the chunk count.
for item in tqdm(pages_and_texts):
    chunks_for_page = split_list(item["sentences"], num_sentence_chunk_size)
    item["sentence_chunks"] = chunks_for_page
    item["num_chunks"] = len(chunks_for_page)

# Flatten to one record per chunk, carrying the page number and size statistics.
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for chunk_sentences in item["sentence_chunks"]:
        # Concatenate the sentences, collapse double spaces, then re-insert the
        # space after a full stop that is directly followed by a capital letter.
        chunk_text = "".join(chunk_sentences).replace("  ", " ").strip()
        chunk_text = re.sub(r'\.([A-Z])', r'. \1', chunk_text)
        chunk_record = dict(
            page_number=item["page_number"],
            sentence_chunk=chunk_text,
            chunk_char_count=len(chunk_text),
            chunk_word_count=len(chunk_text.split(" ")),
            chunk_token_count=len(chunk_text) / 4,
        )
        pages_and_chunks.append(chunk_record)

# Keep only chunks whose estimated token count (characters / 4) exceeds the
# minimum, then convert back to a list of per-chunk dicts.
df = pd.DataFrame(pages_and_chunks)
min_token_length = 30
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

# Encode every chunk in one batched call instead of one model call per chunk,
# then attach each resulting vector to its chunk record.
chunk_embeddings = embedding_model.encode(text_chunks, show_progress_bar=True)
for item, chunk_embedding in zip(pages_and_chunks_over_min_token_len, chunk_embeddings):
    item["embedding"] = chunk_embedding
import torch
from transformers import pipeline, AutoTokenizer
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np


# Embed every chunk as a normalised vector so that the inner product computed
# by the index below corresponds to cosine similarity.
embeddings = embedding_model.encode(text_chunks, normalize_embeddings=True)
embedding_dim = embeddings.shape[1]

# Flat inner-product FAISS index holding all chunk embeddings.
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

# Persist the populated index to disk.
faiss.write_index(index, "rag_index.bin")

# Chat model identifier and its tokenizer (used to apply the chat template).
model_id = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def retrieve_relevant_resources(query: str, top_k: int = 5):
    """
    Retrieves the top-k most relevant resources from FAISS index using cosine similarity.

    The query is embedded with the module-level ``embedding_model`` (normalised),
    searched against the module-level ``index``, and the hits are mapped back to
    their text through ``text_chunks``.

    Args:
        query: Natural-language question to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order the index
        returned them.
    """
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    _, indices = index.search(query_embedding, top_k)
    # When the index holds fewer than top_k vectors FAISS fills the remaining
    # slots with -1; without this filter text_chunks[-1] would silently return
    # the last chunk as if it were a match.
    return [text_chunks[i] for i in indices[0] if i >= 0]

# 2. Build the text-generation pipeline for the chat model, requesting
# torch.bfloat16 through model_kwargs.
pipe = pipeline(
    task="text-generation",
    model=model_id,
    device=device,  # "cuda" for GPU or "cpu" for CPU
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Build the chat-formatted prompt that asks the model to answer ``query``.

    Each entry of ``context_items`` must carry a "sentence_chunk" key; the
    chunks are rendered as a dashed list, substituted into a few-shot
    instruction template together with the query, and the result is wrapped as
    a single user turn through the module-level tokenizer's chat template.
    Returns the prompt as a string (not tokenized).
    """
    # Few-shot instruction template; {context} and {query} are filled in below.
    base_prompt = """Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    Include only the answer in the final output.
    \nExample 1:
    Query: What are the fat-soluble vitamins?
    Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
    \nExample 2:
    Query: What are the causes of type 2 diabetes?
    Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
    \nExample 3:
    Query: What is the importance of hydration for physical performance?
    Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
    \nNow use the following context items to answer the user query:
    {context}
    \nRelevant passages: <extract relevant passages from the context here>
    User query: {query}
    Answer:"""

    # Render the context chunks as a dashed list, one chunk per line.
    chunk_texts = (entry["sentence_chunk"] for entry in context_items)
    context = "- " + "\n- ".join(chunk_texts)

    # Substitute the context list and the query into the template.
    filled_prompt = base_prompt.format(context=context, query=query)

    # Wrap as a single user turn and let the tokenizer add the generation prompt.
    return tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": filled_prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

def rag_generate_response(user_query):
    """Answer ``user_query`` with retrieval-augmented generation.

    Retrieves the most relevant chunks, formats them with the query into a
    prompt, runs the text-generation pipeline and returns the stripped
    ``generated_text`` of the first output.

    Args:
        user_query: The question to answer.

    Returns:
        The generated text as a string.
    """
    relevant_context = retrieve_relevant_resources(user_query)

    # One context item per retrieved chunk. Iterating a newline-joined string
    # here would instead yield one item per character.
    context_items = [{"sentence_chunk": text} for text in relevant_context]
    prompt = prompt_formatter(user_query, context_items)

    # NOTE(review): text-generation pipelines may echo the prompt in
    # "generated_text" unless return_full_text=False is passed — confirm
    # whether the echo is wanted before changing it.
    outputs = pipe(prompt, max_new_tokens=256)
    return outputs[0]["generated_text"].strip()

# Example query: run the full retrieve-and-generate flow and show the answer.
query = "What is the Muscular system?"
response = rag_generate_response(query)
print("\n[INFO] Assistant Response:", response, sep="\n")

0it [00:00, ?it/s]

  0%|          | 0/1208 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
import os
import requests
import fitz
import pandas as pd
import re
import faiss
import numpy as np
import torch
from tqdm.auto import tqdm
from spacy.lang.en import English
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer

# Constants
PDF_PATH = "human-nutrition-text.pdf"
PDF_URL = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
DEVICE = "cpu"
EMBEDDING_MODEL = "all-mpnet-base-v2"
MODEL_ID = "google/gemma-2b-it"
TOP_K = 3

# Download PDF if not present
if not os.path.exists(PDF_PATH):
    # A timeout stops the cell from hanging indefinitely on a stalled connection.
    response = requests.get(PDF_URL, timeout=60)
    # Fail here with a clear message instead of silently skipping the write and
    # letting the later PDF open fail on a missing file.
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download {PDF_URL}: HTTP status {response.status_code}"
        )
    with open(PDF_PATH, "wb") as file:
        file.write(response.content)

# Initialize NLP: blank English pipeline with only the sentence splitter attached
nlp = English()
nlp.add_pipe("sentencizer")

# Function to clean text
def text_formatter(text: str) -> str:
    """Replace every newline with a space and trim surrounding whitespace."""
    flattened = text.replace("\n", " ")
    return flattened.strip()

# Read and preprocess PDF
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Read every page of a PDF and return its cleaned text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per page with ``page_number`` (zero-based index) and ``text``
        (the page text passed through ``text_formatter``).
    """
    pages_and_texts = []
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count
        # explicitly to get a real progress bar instead of a bare counter.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())
            pages_and_texts.append({
                "page_number": page_number,
                "text": text
            })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(PDF_PATH)

# Split text into chunks
SENTENCES_PER_CHUNK = 10  # number of sentences grouped into one chunk
for item in pages_and_texts:
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]
    # Join with a single space: concatenating with "" glues the end of one
    # sentence to the start of the next ("...end.Next..."), assuming the
    # sentence strings carry no trailing whitespace — TODO confirm.
    item["sentence_chunks"] = [
        " ".join(item["sentences"][start:start + SENTENCES_PER_CHUNK]).strip()
        for start in range(0, len(item["sentences"]), SENTENCES_PER_CHUNK)
    ]

# Flatten chunks into a list
pages_and_chunks = [
    {"page_number": item["page_number"], "sentence_chunk": chunk}
    for item in pages_and_texts
    for chunk in item["sentence_chunks"]
]

# Initialize embedding model
embedding_model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)

# Compute embeddings (normalised, so inner product corresponds to cosine similarity)
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]
embeddings = embedding_model.encode(text_chunks, normalize_embeddings=True)

# Store embeddings in FAISS
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)
faiss.write_index(index, "faiss_index.bin")


0it [00:00, ?it/s]



In [None]:
# Load the chat model's tokenizer and build its text-generation pipeline,
# requesting torch.bfloat16 through model_kwargs.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
pipe = pipeline(
    task="text-generation",
    model=MODEL_ID,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device=DEVICE,
)

# Retrieve relevant context from FAISS
def retrieve_relevant_resources(query: str, top_k: int = TOP_K):
    """Return the text chunks whose embeddings best match ``query``.

    The query is embedded with the module-level ``embedding_model`` (normalised),
    searched against the module-level ``index``, and the hits are mapped back to
    their text through ``text_chunks``.

    Args:
        query: Natural-language question to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order the index
        returned them.
    """
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    _, indices = index.search(query_embedding, top_k)
    # When the index holds fewer than top_k vectors FAISS fills the remaining
    # slots with -1; without this filter text_chunks[-1] would silently return
    # the last chunk as if it were a match.
    return [text_chunks[i] for i in indices[0] if i >= 0]

# Generate response
def rag_generate_response(user_query):
    """Print the chunks retrieved for ``user_query``.

    Generation is not wired up yet, so the function returns nothing.
    """
    retrieved_chunks = retrieve_relevant_resources(user_query)
    print(retrieved_chunks)
    # TODO: build a prompt from the retrieved chunks and generate an answer.
    # Unfinished draft kept for reference:
    #   prompt = ...
    #   inputs = tokenizer(prompt, return_tensors="pt")
    #   outputs = pipe(inputs, max_new_tokens=256)
    #   return outputs[0]["generated_text"].strip()

# Example Query
#query = "What is the Muscular system?"
#response = rag_generate_response(query)
#print("\n[INFO] Assistant Response:")
#print(response)

RuntimeError: Failed to import transformers.models.gemma.modeling_gemma because of the following error (look up to see its traceback):
cannot import name 'is_flash_attn_greater_or_equal' from 'transformers.utils' (/Users/aadi/Desktop/LLM/RAG/rag-app/.venv/lib/python3.12/site-packages/transformers/utils/__init__.py)

In [11]:
# Bare reference to the function so the cell displays it; this raises NameError
# unless the cell that defines rag_generate_response has run successfully first.
rag_generate_response

NameError: name 'rag_generate_response' is not defined