### 4. Loading Documents into ChromaDB

In [None]:
%pip install chromadb tiktoken PyMuPDF langchain langchain-huggingface pandas

In [2]:
from langchain.document_loaders import PyMuPDFLoader

# load document from file_path to memory
def load_file(file_path):
  pdf_loader = PyMuPDFLoader(file_path)
  document = pdf_loader.load()
  return document

# document = load_file("_data/Troubleshooting _ Chroma Docs.pdf")
# print(document)

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
#split into small chunks
# \n\n, \n
def chunking_document(document):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
  texts = text_splitter.split_documents(document)
  return texts

# texts = chunking_document(document)
# print(len(texts))
# print(texts[len(texts)-1])

In [10]:
from langchain import HuggingFaceHub

model_name = "google/flan-t5-large"
model_kwargs=({"temperature":0,
              "max_length": 100})
llm = HuggingFaceHub(repo_id=model_name, model_kwargs=model_kwargs)

In [9]:
### Persisted ChromaDB to disk
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

#instantiate the Chroma object from langchain, using Hugging embedding
def persist_db(texts):
    #DB name = chromadb_langchain101
    persists_directory = "./db/chromadb_langchain101" 
    embeddings = HuggingFaceEmbeddings()

    # Save to disk
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        collection_name="langchain101",
        persist_directory=persists_directory
    )
    return vectordb

### Check file_name existed in processed.csv

In [7]:
import os
import pandas as pd
from datetime import datetime

# Define paths and load processed file
data_path = "./_data/"
processed_file = "processed.csv"

# Load the processed.csv file, creating it if it doesn't exist
if os.path.exists(processed_file):
    processed_df = pd.read_csv(processed_file)
else:
    processed_df = pd.DataFrame(columns=["file_name", "processed_date"])

# Get the list of all PDF files in the data directory
all_files = [f for f in os.listdir(data_path) if f.endswith('.pdf')]

# Find the new files by comparing with the processed_df
processed_files = set(processed_df["file_name"].tolist())
new_files = [f for f in all_files if f not in processed_files]


In [8]:
from langchain.document_loaders import PyMuPDFLoader

# Process each new file
for new_file in new_files:
    try:
        # PDF processing
        print(f"--> {os.path.join(data_path,new_file)}")
        document = load_file(file_path=os.path.join(data_path,new_file))
        texts = chunking_document(document)
        persist_db(texts=texts)
    except:
        raise(f"Cannot process the file: [{new_file}]")
    finally:
        row = {
            "file_name": [new_file],
            "processed_date": [datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
            }
        df1 = pd.DataFrame(row)
        processed_df = pd.concat([processed_df, df1], ignore_index=True)
    
# Save the updated processed_df back to processed.csv
processed_df.to_csv(processed_file, index=True)
print(processed_df)


   Unnamed: 0                       file_name       processed_date
0           0                  instructor.pdf  2024-07-16 15:25:37
1           1       UsageGuide_ChromaDocs.pdf  2024-07-19 11:29:06
2           2       UsageGuide_ChromaDocs.pdf  2024-07-19 11:31:18
3           3  Troubleshooting_ChromaDocs.pdf  2024-07-19 11:31:23
4           4     pandas.DataFrame.to_csv.pdf  2024-07-19 11:31:26
