### 4. Loading Documents into ChromaDB

In [1]:
# from langchain.document_loaders import PyMuPDFLoader

# # load document from file_path to memory
# def load_file(file_path):
#   pdf_loader = PyMuPDFLoader(file_path)
#   document = pdf_loader.load()
#   return document

import os
from langchain.document_loaders import PyMuPDFLoader

# Load all PDFs from a directory or a single file
def load_file(file_path):
    """Load one PDF, or every PDF directly inside a directory, with PyMuPDFLoader.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) to a single PDF file or to
            a directory that contains PDF files. Sub-directories are not
            searched.

    Returns:
        list: The concatenated results of ``PyMuPDFLoader(path).load()`` for
        every PDF found, in file-name order for a directory. Empty when the
        path is neither a directory nor a ``.pdf`` file, or when the directory
        holds no PDF files.
    """
    # Accept pathlib.Path as well as plain strings.
    path = os.fspath(file_path)

    if os.path.isdir(path):
        # os.listdir order is arbitrary; sort so the result is reproducible.
        candidates = (os.path.join(path, name) for name in sorted(os.listdir(path)))
        # Match the extension case-insensitively ("report.PDF") and skip
        # anything that is not a regular file (e.g. a folder named "x.pdf").
        pdf_paths = [
            candidate
            for candidate in candidates
            if candidate.lower().endswith(".pdf") and os.path.isfile(candidate)
        ]
    elif path.lower().endswith(".pdf"):
        # A single PDF file: let the loader report a missing/unreadable file.
        pdf_paths = [path]
    else:
        pdf_paths = []

    documents = []
    for pdf_path in pdf_paths:
        documents.extend(PyMuPDFLoader(pdf_path).load())
    return documents


In [2]:
# Load a single PDF, then print the loaded Document objects and how many there are
documents = load_file("document/HealthyEatingPlate.pdf")
print(documents)
print(f"Total documents loaded: {len(documents)}")

[Document(metadata={'source': 'document/HealthyEatingPlate.pdf', 'file_path': 'document/HealthyEatingPlate.pdf', 'page': 0, 'total_pages': 3, 'format': 'PDF 1.4', 'title': 'Healthy Eating Plate – The Nutrition Source', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36', 'producer': 'Skia/PDF m129', 'creationDate': "D:20240923084939+00'00'", 'modDate': "D:20240923084939+00'00'", 'trapped': ''}, page_content='HEALTHY \nOILS\nWATER\nVEGETABLES\nFRUITS\nHEALTHY \nPROTEIN\nWHOLE \nGRAINS\nLooking for a printable copy? Download one here, and hang it on your\nrefrigerator to serve as a daily reminder when planning and preparing your\nmeals! Translations of the Healthy Eating Plate are also available in over 25\nlanguages.\nBuilding a Healthy and Balanced Diet\nMake most of your meal vegetables and fruits – ½ of your plate.\nAim for color and variety, and remember that p

In [4]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# #split into small chunks
# # \n\n, \n
# def chunking_document(document):
#   text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
#   texts = text_splitter.split_documents(document)
#   return texts

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into small chunks
# The document parameter now expects a list of documents
def chunking_document(documents, chunk_size=1000, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        documents: Iterable of document objects, e.g. the list returned by
            ``load_file``.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 1000.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents`` for all input documents,
        in input order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # split_documents already accepts a whole list, so a per-document loop is
    # unnecessary; list() lets callers pass any iterable.
    return text_splitter.split_documents(list(documents))


In [5]:
# Chunk the loaded documents, then show the chunk count and the final chunk
texts = chunking_document(documents)
print(len(texts))
print(texts[-1])

4
page_content='The Healthy Eating Plate encourages consumers to use healthy oils, and it
does not set a maximum on the percentage of calories people should get each
day from healthy sources of fat. In this way, the Healthy Eating Plate
recommends the opposite of the low-fat message promoted for decades by the
USDA.
Your Questions Answered
Your Plate and the Planet
Just as different foods can have differing impacts on human health, they also have
differing impacts on the environment. Food production is a major contributor to
greenhouse gas emissions, and it places an enormous demand upon our earth’s
natural resources.
LEARN ABOUT THE IMPACTS OF THE FOODS ON YOUR PLATE
9/23/24, 3:49 PM
Healthy Eating Plate – The Nutrition Source
https://nutritionsource.hsph.harvard.edu/healthy-eating-plate/
4/8' metadata={'source': 'document/HealthyEatingPlate.pdf', 'file_path': 'document/HealthyEatingPlate.pdf', 'page': 2, 'total_pages': 3, 'format': 'PDF 1.4', 'title': 'Healthy Eating Plate – The Nutr

In [13]:
from langchain import HuggingFaceHub
def get_llm(model_name="google/flan-t5-large", temperature=0.9, max_length=100):
    """Create the Hugging Face Hub LLM used by the QA chain.

    Args:
        model_name: Hub repository id passed as ``repo_id``.
        temperature: Value sent to the model as ``temperature`` in
            ``model_kwargs``.
        max_length: Value sent to the model as ``max_length`` in
            ``model_kwargs``.

    Returns:
        A ``HuggingFaceHub`` instance configured with the given settings.
    """
    # NOTE(review): HuggingFaceHub presumably needs a Hub API token supplied
    # outside this function (e.g. via the environment) - confirm before running.
    model_kwargs = {"temperature": temperature, "max_length": max_length}
    return HuggingFaceHub(repo_id=model_name, model_kwargs=model_kwargs)

# from langchain_community.llms import CTransformers

# def get_llm():
#     llm = CTransformers(model='models/llama-2-7b-chat.ggmlv3.q8_0.bin',
#                         # https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main
#                         model_type='llama',
#                         config={'temperature': 0.9})
#     return llm

In [7]:
### Persist ChromaDB to disk
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

# Instantiate the Chroma object from LangChain, using Hugging Face embeddings
def persist_db(texts, persist_directory="./db/nutrition_db", collection_name="nutrion"):
    """Embed document chunks and store them in a Chroma collection on disk.

    Args:
        texts: Document chunks to index, e.g. the output of
            ``chunking_document``.
        persist_directory: Directory passed to Chroma as ``persist_directory``.
        collection_name: Name of the Chroma collection to write to.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # NOTE(review): the default collection name "nutrion" looks like a typo for
    # "nutrition". It is kept as-is because renaming it would point at a
    # different collection than the one already stored on disk.
    # NOTE(review): calling this again with the same directory presumably adds
    # the chunks a second time rather than replacing them - confirm.
    embeddings = HuggingFaceEmbeddings()

    # Build the collection from the chunks and save it under persist_directory.
    return Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )

# Index the chunks created earlier and keep the resulting vector store for querying
vectordb = persist_db(texts)



In [10]:
# import os
# # Disable tokenizers parallelism
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Build a RetrievalQA chain: the LLM from get_llm() is paired with a retriever
# created from the Chroma vector store.
chain = RetrievalQA.from_chain_type(get_llm(), retriever=vectordb.as_retriever())

In [17]:
def format_llm_prompt(question):
    """Wrap a question in the fixed instruction text sent to the QA chain.

    Args:
        question: The user's question; it is interpolated after "Question: ".

    Returns:
        str: The multi-line prompt. It starts with a newline, every text line
        is indented by four spaces, and it ends with a four-space line.
    """
    indent = "    "
    prompt_lines = [
        "",
        indent + "You are a helpful, concise assistant specializing in providing clear, human-like responses. ",
        indent + "When answering the following question, ensure your response is natural, well-structured, and brief:",
        "",
        indent + f"Question: {question}",
        "",
        indent + "Please answer in a polite and clear manner, and include relevant context where needed, but keep the response to the point.",
        indent,
    ]
    return "\n".join(prompt_lines)


In [18]:
def ask_llm_question(question, chain):
    """Send a question through the chain and return only the answer.

    Args:
        question: The question to ask; it is wrapped by ``format_llm_prompt``
            before being sent.
        chain: Object with an ``invoke`` method whose return value can be
            indexed with ``'result'``.

    Returns:
        The value stored under ``'result'`` in the chain's response.
    """
    # The whole formatted prompt, not just the bare question, is what the chain receives.
    answer = chain.invoke(format_llm_prompt(question))
    return answer['result']

In [21]:
# Example usage: ask about the protein portion of the plate.
# The chain is rebuilt with the same arguments used when `chain` was first created.
chain = RetrievalQA.from_chain_type(get_llm(), retriever=vectordb.as_retriever())
question = "how many protein should have in our plate?"
formatted_response = ask_llm_question(question, chain)

# Print the answer text returned by the chain
print(formatted_response)

Protein power – 14 of your plate. Fish, poultry, beans, and nuts are all healthy, versatile protein sources


In [22]:
# Example usage: ask about oils.
# The chain is rebuilt with the same arguments used when `chain` was first created.
chain = RetrievalQA.from_chain_type(get_llm(), retriever=vectordb.as_retriever())
question = "tell me about the oils?"
formatted_response = ask_llm_question(question, chain)

# Print the answer text returned by the chain
print(formatted_response)

Choose healthy vegetable oils like olive, canola, soy, corn, sunflower, peanut, and others, and avoid partially hydrogenated oils, which contain unhealthy trans fats.
