In [1]:

from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
import os
import pdfplumber

# Initialize the data array
data_array = []

# Set the protocol buffers environment variable (if needed)
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Directory containing the PDF files
directory_path = "data/"

# Check if the directory exists
if not os.path.exists(directory_path):
    print(f"Directory {directory_path} does not exist.")
else:
    # Loop through all directories in the 'data' directory
    for dirname in os.listdir(directory_path):
        print(f"-> Data directory: {dirname}")

        # Full path for the subdirectory
        full_path = os.path.join(directory_path, dirname)
        print(f"--> Data subdirectory: {full_path}")

        # Check if it's a directory
        if not os.path.isdir(full_path):
            continue

        # Loop through all files in the subdirectory
        for filename in os.listdir(full_path):
            if filename.endswith(".pdf"):
                # Construct the full file path for the PDF
                local_path = os.path.join(full_path, filename)
                print(f"---> Found PDF file: {local_path}")

                try:
  

                    with pdfplumber.open(local_path) as pdf:
                        pdf_data = ""
                        for page in pdf.pages:
                            pdf_data += page.extract_text()

                    # Check if pdf_data is empty or None
                    if pdf_data:
                        # Append the individual documents from the pdf_data
                        data_array.extend(pdf_data)
                        print(f"PDF loaded successfully: {local_path}")
                    else:
                        print(f"Warning: No data extracted from PDF {local_path}")
                except Exception as e:
                    # Print any errors that occur during PDF loading
                    print(f"Error loading PDF {local_path}: {str(e)}")
            else:
                print(f"Skipping non-PDF file: {filename}")

    # Print the total number of documents loaded (not just PDFs)
    print(f"Total documents loaded: {len(data_array)}")

# Create a Document object from the text content in 'data'
# If 'data' is a string, wrap it in a Document object
documents = [Document(page_content=d) for d in data_array]  # 'data' here should be a string or a list of strings

# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)
print(f"Text split into {len(chunks)} chunks")


-> Data directory: .DS_Store
--> Data subdirectory: data/.DS_Store
-> Data directory: pdf
--> Data subdirectory: data/pdf
---> Found PDF file: data/pdf/Canadian-Guideline-on-Concussion-in-Sport-2nd-edition-2024.pdf
PDF loaded successfully: data/pdf/Canadian-Guideline-on-Concussion-in-Sport-2nd-edition-2024.pdf
---> Found PDF file: data/pdf/Policy-on-Management-of-Sports-Related-Concussion-2024-2025.pdf
PDF loaded successfully: data/pdf/Policy-on-Management-of-Sports-Related-Concussion-2024-2025.pdf
---> Found PDF file: data/pdf/18377.pdf
PDF loaded successfully: data/pdf/18377.pdf
-> Data directory: json
--> Data subdirectory: data/json
Skipping non-PDF file: book.json
-> Data directory: csv
--> Data subdirectory: data/csv
Skipping non-PDF file: NFL Head Injured Players.csv
-> Data directory: text
--> Data subdirectory: data/text
Total documents loaded: 1118140


NameError: name 'Document' is not defined