### This pipeline processes both the text content of the PDF and any images it contains, allowing for a more comprehensive understanding of the document. The OCR functionality enables the system to extract text from images, which is then included in the knowledge base used for answering queries.

In [38]:
%pip install -Uq langchain-community
%pip install -Uq pytesseract frontend
%pip install -Uq fitz PyMuPDF

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [46]:
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from PIL import Image
import pytesseract
import io
import fitz  # PyMuPDF

# Chat model and embedding model, both backed by the Ollama "llama3.1" model.
# temperature=0 is passed to the chat model; further options such as
# num_predict can be added to the ChatOllama call if needed.
llm = ChatOllama(model="llama3.1", temperature=0)
embeddings = OllamaEmbeddings(model="llama3.1")

### Extract text and images from a PDF

In [47]:
# Absolute path of the PDF processed by the cells below. It is specific to
# one machine, so point it at your own file before running the notebook.
pdf_path = "/Users/Shatten/Edu/Python/MLOps/eBook.pdf"

In [48]:
def process_pdf(pdf_path):
    """Extract the page text and the embedded images of a PDF.

    Args:
        pdf_path: Location of the PDF file; passed straight to ``fitz.open``.

    Returns:
        A ``(text_content, images)`` tuple. ``text_content`` is the text of
        every page concatenated in page order. ``images`` is a list with one
        ``PIL.Image`` per entry reported by ``page.get_images(full=True)``,
        in page order.
    """
    page_texts = []
    images = []

    # The context manager closes the document even when extraction fails
    # part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_texts.append(page.get_text())

            for img in page.get_images(full=True):
                xref = img[0]  # first field of an image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                # The BytesIO buffer holds the image data independently of
                # the document, so the image stays usable after it is closed.
                images.append(Image.open(io.BytesIO(image_bytes)))

    # Join once instead of growing a string with += on every page.
    return "".join(page_texts), images

# Example usage:
# pdf_path = "path/to/your/pdf/file.pdf"
# text, images = process_pdf(pdf_path)

In [49]:
# Pull the page text and the embedded images out of the PDF at `pdf_path`.
text, images = process_pdf(pdf_path)

### Create a function to extract text from images using optical character recognition (OCR)

In [50]:
def extract_text_from_images(images):
    """Run OCR over each image and return the results as one string.

    Args:
        images: Iterable of images, each handed to
            ``pytesseract.image_to_string``.

    Returns:
        The OCR output of every image, in input order, joined with newlines.
        An empty iterable yields an empty string.
    """
    recognised = [pytesseract.image_to_string(picture) for picture in images]
    return "\n".join(recognised)

# Example usage:
# image_text = extract_text_from_images(images)

In [51]:
# OCR the extracted images into a single newline-joined string.
image_text = extract_text_from_images(images)

### Create the RAG pipeline

In [52]:
def create_rag_pipeline(pdf_path):
    """Build a conversational retrieval chain over one PDF.

    The PDF's page text and the OCR text of its images are combined, split
    into overlapping chunks, embedded into a Chroma vector store, and wired
    to the module-level ``llm`` together with a conversation memory.

    Args:
        pdf_path: Location of the PDF, forwarded to ``process_pdf``.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm``.
    """
    # Gather everything readable in the document: page text plus OCR output.
    page_text, page_images = process_pdf(pdf_path)
    ocr_text = extract_text_from_images(page_images)
    corpus = page_text + "\n" + ocr_text

    # 1000-character chunks with 200 characters of overlap between neighbours.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pieces = splitter.split_text(corpus)

    # Embed the chunks with the module-level `embeddings` object.
    store = Chroma.from_texts(pieces, embeddings)

    # The memory keeps the running chat history under the "chat_history" key.
    history = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )

    return ConversationalRetrievalChain.from_llm(
        llm,
        retriever=store.as_retriever(),
        memory=history,
    )

# Example usage:
# pdf_path = "path/to/your/pdf/file.pdf"
# qa_chain = create_rag_pipeline(pdf_path)

In [53]:
# pdf_path = "/Users/Shatten/Edu/Python/MLOps/eBook.pdf"
# qa_chain = create_rag_pipeline(pdf_path)

In [54]:
def query_rag(qa_chain, query):
    """Ask the retrieval chain a single question.

    Args:
        qa_chain: Chain object exposing ``invoke``, e.g. the value returned
            by ``create_rag_pipeline``.
        query: The question text, sent under the ``"question"`` key.

    Returns:
        An ``(answer, qa_chain)`` tuple: the ``"answer"`` entry of the chain
        result, and the same chain object that was passed in so the caller
        can keep using it for follow-up questions.
    """
    # `invoke` is the supported entry point; calling the chain object
    # directly (`qa_chain({...})`) is deprecated in LangChain.
    result = qa_chain.invoke({"question": query})
    return result["answer"], qa_chain

# Example usage:
# pdf_path = "path/to/your/pdf/file.pdf"
# qa_chain = create_rag_pipeline(pdf_path)
# query = "What is the main topic of this PDF?"
# answer, qa_chain = query_rag(qa_chain, query)
# print(answer)

In [55]:
# Build the chain for the PDF, ask one question, and print the exchange.
pdf_path = "/Users/Shatten/Edu/Python/MLOps/eBook.pdf"  # same path as assigned earlier in the notebook
qa_chain = create_rag_pipeline(pdf_path)
query = "What is the Foundational machine learning skills in this PDF?"
# query_rag returns (answer, chain); the chain is rebound so that follow-up
# questions can reuse it.
answer, qa_chain = query_rag(qa_chain, query)
print("Q1:", query)
print("A1:", answer)

Q1: What is the Foundational machine learning skills in this PDF?
A1: The text doesn't explicitly mention "Foundational machine learning skills". However, it does mention that understanding the math behind algorithms can be helpful for debugging them, specifically mentioning linear algebra libraries for solving linear systems of equations (for linear regression) and gradient descent, momentum, and the Adam optimization algorithm in deep learning.
