In [9]:
pip install google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [11]:
!pip install pytesseract pdf2image Pillow
import pytesseract
from pdf2image import convert_from_path
from PIL import Image



In [13]:
import os

# Machine-specific locations. The original hard-coded values are kept as the
# defaults, but each can be overridden through an environment variable so the
# notebook can run on another machine without editing this cell.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\path-to\Program Files\Tesseract-OCR\tesseract.exe"
)
pdf_path = os.environ.get("OCR_PDF_PATH", r"C:\path-to\image-based-pdf-sample.pdf")

def pdf_to_images(pdf_path, dpi=300, poppler_path=None):
    """Convert a PDF into page images via ``pdf2image.convert_from_path``.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        poppler_path: Directory containing the Poppler binaries. When omitted,
            the notebook's original hard-coded Windows location is used, so
            existing calls behave exactly as before.

    Returns:
        Whatever ``convert_from_path`` returns; the rest of the notebook treats
        it as a sequence with one image per page.
    """
    if poppler_path is None:
        # Original default, previously buried inside the function body.
        poppler_path = r"C:\path-to\Release-24.08.0-0\poppler-24.08.0\Library\bin"
    return convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    
# Convert the sample PDF once at notebook level; `images` is consumed by the
# OCR call further down.
images = pdf_to_images(pdf_path)
print(f"Total pages converted: {len(images)}")

def extract_text_with_tesseract(images):
    """Run Tesseract OCR over each image and return the combined text.

    Args:
        images: Iterable of page images accepted by
            ``pytesseract.image_to_string``.

    Returns:
        A single string in which the text of every page is followed by a
        blank-line separator (``"\\n\\n"``). Empty when ``images`` is empty.
    """
    pages = []
    for page_number, page_image in enumerate(images, start=1):
        # Progress line, numbered from 1 for human readers.
        print(f"OCR on page {page_number}")
        pages.append(pytesseract.image_to_string(page_image) + "\n\n")
    return "".join(pages)
    
# OCR the converted page images; `raw_text` still contains blank lines and
# leading/trailing whitespace at this point.
raw_text = extract_text_with_tesseract(images)

def clean_text(text):
    """Normalise OCR output by trimming lines and dropping empty ones.

    Args:
        text: Raw multi-line string.

    Returns:
        The non-blank lines of ``text``, each stripped of surrounding
        whitespace, joined with single newlines.
    """
    stripped = (raw_line.strip() for raw_line in text.splitlines())
    # filter(None, ...) discards the lines that became empty after stripping.
    return "\n".join(filter(None, stripped))
    
# `cleaned_text` is the notebook-level document text that the chunking cell
# later splits for retrieval.
cleaned_text = clean_text(raw_text)
print(cleaned_text[:2000])  # preview at most the first 2000 characters

Total pages converted: 1
OCR on page 1
This is an example of an “Image-based PDF” (also known as image-only PDFs).
Image-based PDFs are typically created through scanning paper in a copier, taking photographs
or taking screenshots. To a computer, they are images. Though we humans can see text in the
image, the file only consists of the image layer but not the searchable text layer that True PDFs
contain. As a result, we cannot use a computer to search the text we see in the image as that text
layer is missing. There are times when discovery is produced, it will be in an image-based PDF
format. When you come across image-based PDFs, ask the U.S. Attorney’s Office in what
format was that file originally. Second, ask if they have it in a searchable format and specifically
if they have it in a digitally created, True, Text-based PDF format. They may not, as they often
receive PDFs from other sources before they provide them to you, but you will want to know
what is the format in which they

In [15]:
!pip install google-generativeai
import google.generativeai as genai



In [17]:
import os
from getpass import getpass

# Never store the API key in the notebook: read it from the environment and
# fall back to an interactive prompt. The prompted value is kept in the process
# environment so later cells can reuse it without asking again.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Gemini API key: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [19]:
def ask_gemini(question, context):
    """Ask Gemini a question about a document supplied in full.

    Args:
        question: The question to answer.
        context: Document text inserted into the prompt under ``Document:``.

    Returns:
        The ``text`` attribute of the response from ``generate_content``.
    """
    gemini = genai.GenerativeModel(model_name="gemini-1.5-flash")
    # Prompt layout: instruction, document, question, then an answer cue.
    prompt_parts = [
        "You are an assistant helping answer questions from scanned documents.\n\n",
        f"Document:\n{context}\n\n",
        f"Question:\n{question}\n\n",
        "Answer:",
    ]
    reply = gemini.generate_content("".join(prompt_parts))
    return reply.text

In [21]:
def ocr_gemini_pipeline(pdf_path, question):
    """Answer a question about a scanned PDF using OCR followed by Gemini.

    The stages run in order: PDF -> page images -> OCR text -> cleaned text ->
    Gemini answer. A progress message is printed before each stage.

    Args:
        pdf_path: Path of the PDF to process.
        question: Question to ask about the document.

    Returns:
        The answer produced by ``ask_gemini``.
    """
    print("Converting PDF to images...")
    page_images = pdf_to_images(pdf_path)

    print("Extracting text with OCR...")
    ocr_output = extract_text_with_tesseract(page_images)

    print("Cleaning text...")
    document_text = clean_text(ocr_output)

    print("Asking Gemini...")
    return ask_gemini(question, document_text)

In [23]:
# End-to-end run of the non-RAG pipeline: the whole cleaned document is sent to
# Gemini as context for the question.
question = "What is an image file?"
answer = ocr_gemini_pipeline(pdf_path, question)
print("\nFinal Answer:\n", answer)

Converting PDF to images...
Extracting text with OCR...
OCR on page 1
Cleaning text...
Asking Gemini...

Final Answer:
 Based on the provided document, an image file, in the context of PDFs, is a file created by scanning paper, taking photographs, or screenshots.  Unlike a "True PDF," it contains only an image layer and lacks a searchable text layer.  The computer sees only the image; the text within the image is not searchable by the computer.



In [25]:
!pip install faiss-cpu sentence-transformers



In [27]:
def chunk_text(text, chunk_size=300, overlap=50):
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window may be shorter than ``chunk_size``.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order (empty for empty ``text``).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check the window would never advance and the loop would not
            terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    stride = chunk_size - overlap
    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), stride)]

# Split the cleaned OCR text with the default window settings; `chunks` is the
# notebook-level list that the embedding and retrieval cells index into.
chunks = chunk_text(cleaned_text)
print(f"Total chunks created: {len(chunks)}")

Total chunks created: 5


In [31]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [33]:
import os
from getpass import getpass

# Never store the API key in the notebook: read it from the environment and
# fall back to an interactive prompt. The prompted value is kept in the process
# environment so it is only requested once per kernel session.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Gemini API key: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [35]:
# Load the sentence-embedding model and encode every chunk in one call; the
# result is stored as a NumPy array with one row per chunk.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = np.array(embedder.encode(chunks))
print(f"Embedding shape: {chunk_embeddings.shape}")

Embedding shape: (5, 384)


In [39]:
import faiss
# Build the vector index over the chunk embeddings. Row i of the index
# corresponds to chunks[i], which is what the retrieval function relies on.
embedding_dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)  # flat index using L2 distance
index.add(chunk_embeddings)  # vectors are added in chunk order
print(f"Total vectors in index: {index.ntotal}")

Total vectors in index: 5


In [41]:
def retrieve_relevant_chunks(query, top_k=5):
    """Return the stored chunks nearest to ``query`` in embedding space.

    Uses the notebook-level ``embedder``, ``index`` and ``chunks`` objects.

    Args:
        query: Question or search string to embed.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings in the order returned by the index
        search. The list is shorter when the index holds fewer vectors, and
        empty when the index is empty or ``top_k`` is not positive.
    """
    # FAISS pads its result with -1 when asked for more neighbours than it
    # holds; indexing chunks[-1] would then silently return the last chunk
    # again. Clamp k and drop any negative ids.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []
    # The cast is a no-op if the encoder already returns float32.
    query_embedding = np.asarray(embedder.encode([query]), dtype="float32")
    _distances, neighbour_ids = index.search(query_embedding, k)
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]

In [47]:
def ask_gemini_rag(question, retrieved_chunks):
    """Ask Gemini a question using only the retrieved chunks as context.

    Args:
        question: The question to answer.
        retrieved_chunks: Iterable of chunk strings; they are joined with
            newlines and placed under ``Relevant Document Chunks:``.

    Returns:
        The ``text`` attribute of the response from ``generate_content``.
    """
    joined_chunks = "\n".join(retrieved_chunks)
    gemini = genai.GenerativeModel("gemini-1.5-flash")
    # Prompt layout: instruction, retrieved chunks, question, answer cue.
    prompt_parts = [
        "You are an assistant helping answer questions from scanned documents.\n\n",
        f"Relevant Document Chunks:\n{joined_chunks}\n\n",
        f"Question:\n{question}\n\n",
        "Answer:",
    ]
    reply = gemini.generate_content("".join(prompt_parts))
    return reply.text

In [51]:
def rag_pipeline(question):
    """Answer ``question`` with retrieval-augmented generation.

    Retrieves the most relevant chunks and passes them to Gemini together with
    the question, printing a progress message before each stage.

    Args:
        question: Question to ask about the indexed document.

    Returns:
        The answer produced by ``ask_gemini_rag``.
    """
    print("Retrieving relevant chunks...")
    context_chunks = retrieve_relevant_chunks(question)

    print("Asking Gemini with relevant context...")
    return ask_gemini_rag(question, context_chunks)

In [53]:
# Same question as the non-RAG run above, this time answered from the
# retrieved chunks only.
question = "What is an image file?"
answer = rag_pipeline(question)
print("\nFinal Answer:\n", answer)

Retrieving relevant chunks...
Asking Gemini with relevant context...

Final Answer:
 An image file, in the context of PDFs, is an "Image-based PDF" (also known as an image-only PDF).  These are created by scanning paper, taking photos, or screenshots.  To a computer, they are simply images; while humans can see the text within the image, the file itself only contains the image layer and lacks a searchable text layer.  Therefore, computers cannot search the text within these files.

