### 4. Loading Documents and RAG

In [None]:
%pip install chromadb tiktoken PyMuPDF langchain langchain-huggingface

In [1]:
from langchain.document_loaders import PyMuPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [21]:
# load document
def load_file(file_path):
  pdf_loader = PyMuPDFLoader(file_path)
  document = pdf_loader.load()
  return document

document = load_file("_data/Troubleshooting _ Chroma Docs.pdf")
print(document)

[Document(page_content='Chroma\nchat\nchat 817 online\n817 online\nStar\nStar\n14k\n14k\nFollow @trychroma\nFollow @trychroma\n🔍 Troubleshooting\nThis page is a list of common gotchas or issues and how to fix them.\nIf you don\'t see your problem listed here, please also search the Github Issues.\nUsing .get or .query, embeddings say None\nThis is actually not an error. Embeddings are quite large and heavy to send back. Most\napplication don\'t use the underlying embeddings and so, by default, chroma does not\nsend them back.\nTo send them back: add include=["embeddings", "documents", "metadatas", "distances"]\nto your query to return all information.\nFor example:\nBuild error when running pip install chromadb\nIf you encounter an error like this during setup\nMenu\npython\n1\nresults = collection.query(\n2\n    query_texts="hello",\n3\n    n_results=1,\n4\n    include=["embeddings", "documents", "metadatas", "distances"],\n5\n)\nCopy Code\nWe may change None  to something else to mor

In [22]:
#split into small chunks
# \n\n, \n
def chunking_document(document):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
  texts = text_splitter.split_documents(document)
  return texts

texts = chunking_document(document)
print(len(texts))
print(texts[len(texts)-1])

5
page_content='Edit this page on GitHub\n7/16/24, 3:16 PM\n🔍 Troubleshooting | Chroma Docs\nhttps://docs.trychroma.com/troubleshooting\n3/3' metadata={'source': '_data/Troubleshooting _ Chroma Docs.pdf', 'file_path': '_data/Troubleshooting _ Chroma Docs.pdf', 'page': 2, 'total_pages': 3, 'format': 'PDF 1.4', 'title': '🔍 Troubleshooting | Chroma Docs', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 'producer': 'Skia/PDF m126', 'creationDate': "D:20240716081655+00'00'", 'modDate': "D:20240716081655+00'00'", 'trapped': ''}


In [24]:
from langchain import HuggingFaceHub

model_name = "google/flan-t5-large"
model_kwargs=({"temperature":0,
              "max_length": 100})
llm = HuggingFaceHub(repo_id=model_name, model_kwargs=model_kwargs)

In [45]:
### Persisted ChromaDB to disk

from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

#instantiate the Chroma object from langchain, using Hugging embedding
#DB name = chromadb_langchain101
persists_directory = "./db/chromadb_langchain101" 
embeddings = HuggingFaceEmbeddings()

# Save to disk
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    collection_name="langchain101",
    persist_directory=persists_directory
)

#query from Chroma object
chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())
# retriever = vectordb.as_retriever()

### Check file_name existed in processed.csv

In [114]:
import os
import pandas as pd
from datetime import datetime

# Define paths and load processed file
data_path = "./_data/"
processed_file = "processed.csv"

# Load the processed.csv file, creating it if it doesn't exist
if os.path.exists(processed_file):
    processed_df = pd.read_csv(processed_file)
else:
    processed_df = pd.DataFrame(columns=["file_name", "processed_date"])

# Get the list of all PDF files in the data directory
all_files = [f for f in os.listdir(data_path) if f.endswith('.pdf')]

# Find the new files by comparing with the processed_df
processed_files = set(processed_df["file_name"].tolist())
new_files = [f for f in all_files if f not in processed_files]


In [109]:
# Process each new file

for new_file in new_files:
    # Here, you would add your PDF processing code using PyMuPDFLoader or any other processing
    # For example:
    # pdf_loader = PyMuPDFLoader(os.path.join(data_path, new_file))
    # pdf_loader.process()
 
    row = {
        "file_name": [new_file],
        "processed_date": [datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
           }
    df1 = pd.DataFrame(row)
    processed_df = pd.concat([processed_df, df1], ignore_index=True)
    # Log the processed file
    # processed_df = processed_df.concat({
    #     "file_name": new_file,
    #     "processed_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # }, ignore_index=True)
# print(processed_df)
# Save the updated processed_df back to processed.csv
processed_df.to_csv(processed_file, index=True)
# print(processed_df)
# print(f"Processed and logged {len(new_files)} new files.")

Troubleshooting _ Chroma Docs.pdf
pandas.DataFrame.to_csv — pandas 2.2.2 documentation.pdf
instructor.pdf
Usage Guide _ Chroma Docs.pdf


In [113]:
df = pd.read_csv(processed_file,index_col=0)
df

Unnamed: 0,file_name,processed_date
0,Troubleshooting _ Chroma Docs.pdf,2024-07-16 15:25:37
1,pandas.DataFrame.to_csv — pandas 2.2.2 documen...,2024-07-16 15:25:37
2,instructor.pdf,2024-07-16 15:25:37
3,Usage Guide _ Chroma Docs.pdf,2024-07-16 15:25:37
