In [1]:
import pytesseract
from PIL import Image
# Tell pytesseract where the Tesseract OCR executable lives. This is a
# machine-specific Windows install path and must be adjusted on other systems.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [2]:
!pip install pymupdf pytesseract pillow ollama langchain sentence-transformers faiss-cpu numpy



In [3]:
import fitz
import pytesseract
from PIL import Image
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import ollama  

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, falling back to OCR.

    Pages that carry an embedded text layer are read directly. Pages whose
    text layer is empty (or only whitespace) are rendered to an image and
    passed through Tesseract OCR with the English language model.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the text of all pages in page order, concatenated
        without any additional separator.
    """
    page_texts = []
    # The context manager closes the document even if extraction or OCR fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            embedded_text = page.get_text("text")
            if embedded_text.strip():
                page_texts.append(embedded_text)
                continue
            # No usable text layer: only now pay for rendering the page and
            # decoding it into a PIL image for OCR.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            page_texts.append(pytesseract.image_to_string(img, lang='eng'))
    return "".join(page_texts)

def clean_text(text):
    """Normalise extracted text for chunking and embedding.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed, then each run of whitespace is
    collapsed to a single space and the ends are trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing but punctuation and
        whitespace was supplied.
    """
    # Strip punctuation first: removing a free-standing symbol ("a - b")
    # leaves two adjacent spaces, which the whitespace pass below then merges.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split text into overlapping chunks with a recursive character splitter.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

class EmbeddingModel:
    """Thin wrapper around a SentenceTransformer used to embed text."""

    def __init__(self, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
        """Load the sentence-transformer identified by ``model_name``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def generate_embeddings(self, texts):
        """Return whatever the underlying model's ``encode`` yields for ``texts``."""
        vectors = self.model.encode(texts)
        return vectors

class FAISSVectorDB:
    """In-memory vector store backed by a FAISS flat L2 index.

    Texts are embedded with the supplied embedding model and stored in the
    index in insertion order, so the integer ids returned by ``search`` are
    positions in the sequence of chunks that were added.
    """

    def __init__(self, embedding_model, dimension=384):
        """Create an empty index.

        Args:
            embedding_model: Object exposing ``generate_embeddings(texts)``.
            dimension: Size of the embedding vectors; it has to match what
                ``embedding_model`` produces or adding/searching will fail.
        """
        self.embedding_model = embedding_model
        self.dimension = dimension
        self.index = faiss.IndexFlatL2(dimension)

    def add_documents(self, chunks):
        """Embed ``chunks`` and append the vectors to the index.

        An empty ``chunks`` is a no-op: embedding nothing yields a 1-D empty
        array, which is not the 2-D matrix the index expects.
        """
        if len(chunks) == 0:
            return
        embeddings = self.embedding_model.generate_embeddings(chunks)
        self.index.add(np.asarray(embeddings, dtype='float32'))

    def search(self, query, top_k=5):
        """Find the ``top_k`` stored vectors nearest to ``query``.

        Args:
            query: Query text; it is embedded as a one-element batch.
            top_k: Number of neighbours requested from the index.

        Returns:
            ``(indices, distances)`` -- note the order is swapped relative to
            what the FAISS index itself returns. Both have one row for the
            single query. NOTE(review): per the FAISS documentation, slots
            that cannot be filled (fewer than ``top_k`` vectors stored) come
            back with index -1, so callers should skip negative ids.
        """
        query_embedding = self.embedding_model.generate_embeddings([query])
        query_matrix = np.asarray(query_embedding, dtype='float32')
        distances, indices = self.index.search(query_matrix, top_k)
        return indices, distances

def generate_text_with_ollama(query, context, model='llama3.2'):
    """
    Generate a response using Ollama's language model.

    Args:
        query: The question or instruction to answer.
        context: Text the model is told to rely on exclusively; it is
            interpolated verbatim into the prompt.
        model: Name of the Ollama model to run. Defaults to 'llama3.2',
            the model this function previously hard-coded.

    Returns:
        The value stored under the 'response' key of Ollama's reply.
    """
    prompt = f"""You are a highly skilled document analyst. Your task is to generate a precise, well-structured response 
    to the given query using ONLY the provided context. Ensure that your answer is clear, comprehensive, and logically 
    organized. Where relevant, include key themes, supporting evidence, and well-defined sections with bullet points 
    for better readability.

    Query: {query}
    Context: {context}

    Response:"""

    # Sampling settings passed through to Ollama unchanged.
    generation_options = {
        'temperature': 0.3,
        'num_predict': 1500,
        'top_k': 50,
        'top_p': 0.9
    }
    response = ollama.generate(
        model=model,
        prompt=prompt,
        options=generation_options
    )
    return response['response']

if __name__ == "__main__":
    # End-to-end demo: extract -> clean -> chunk -> embed/index -> retrieve -> generate.
    # The input path is machine-specific; change it to point at a local PDF.
    pdf_path = r"C:\Users\ARYAN PUND\Desktop\Multilingual\sample_pdfs\en\Reboot_Leadership_and_the_Art_of.pdf"
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Text extracted from PDF.")

    cleaned_text = clean_text(extracted_text)
    chunks = chunk_text(cleaned_text)
    print(f"Number of chunks: {len(chunks)}")

    embedding_model = EmbeddingModel()
    vector_db = FAISSVectorDB(embedding_model)
    vector_db.add_documents(chunks)
    print("Embeddings generated and stored in FAISS.")

    queries = [
        "Summarize the document's key themes, major ideas, and underlying messages in detail.",
        "Examine the protagonist’s journey, highlighting their struggles, transformations, and key turning points.",
        "Discuss the document’s core message and its significance in today's social and cultural context.",
        "Outline the major events in the narrative and analyze their role in shaping the story’s progression.",
        "Explore the protagonist’s character growth, including emotional, intellectual, and moral developments.",
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        indices, distances = vector_db.search(query, top_k=3)
        # FAISS reports unfilled result slots as -1 when the index holds fewer
        # than top_k vectors; chunks[-1] would silently pull in the last chunk,
        # so negative ids are skipped.
        context = "\n\n".join(chunks[idx] for idx in indices[0] if idx >= 0)
        response = generate_text_with_ollama(query, context)
        print(f"Generated Response:\n{response}")
        print("―" * 60)

Text extracted from PDF.
Number of chunks: 459
Embeddings generated and stored in FAISS.

Query: Summarize the document's key themes, major ideas, and underlying messages in detail.
Generated Response:
**Document Analysis: Key Themes, Major Ideas, and Underlying Messages**

The provided document appears to be an email or introduction to a book written by the author. The content revolves around the theme of finding direction and meaning in one's personal, professional, and practical goals. The underlying messages are centered around the importance of self-awareness, growth, and resilience.

**Key Themes:**

• **The Search for Meaning**: The document highlights the desire to define a meaningful next chapter that aligns with personal, professional, and practical goals.
• **Self-Awareness and Growth**: The author emphasizes the need to understand one's relationship with money, work, and life to achieve growth and resilience.
• **Resilience and Equanimity**: The book explores heartbreak, re