In [None]:
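# Dependencies -- uncomment and run once in a fresh environment.
# (%pip installs into the environment of the running kernel.)
# %pip install sentence-transformers
# %pip install "elasticsearch>=8.0.0,<9.0.0"
# %pip install PyPDF2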

In [None]:
import os
from typing import List, Optional
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader

class HybridPDFSearch:
    """Index PDF pages in Elasticsearch and search them by keywords plus embeddings.

    Every non-empty page of a PDF is stored as one document with two fields:
    ``content`` (the page text, mapped as ``text``) and ``content_vector``
    (the sentence embedding of that text, mapped as ``dense_vector``).

    When no client is passed to the constructor, connection settings are read
    from environment variables so that no credential lives in the notebook:

    * ``ES_URL``          -- cluster URL (default ``https://localhost:9200``)
    * ``ES_USERNAME``     -- basic-auth user (default ``elastic``)
    * ``ES_PASSWORD``     -- basic-auth password (required)
    * ``ES_VERIFY_CERTS`` -- ``1`` / ``true`` / ``yes`` enables TLS certificate
      verification (default: disabled)
    """

    def __init__(self,
                 index_name: str = "hybrid_pdf",
                 embedding_model: str = "all-MiniLM-L6-v2",
                 es_client: Optional["Elasticsearch"] = None):
        """Connect to Elasticsearch, load the embedding model and ensure the index exists.

        Args:
            index_name: Name of the Elasticsearch index to read from and write to.
            embedding_model: Model name handed to ``SentenceTransformer``.
            es_client: Optional ready-made client. When omitted, one is built
                from the ``ES_*`` environment variables.

        Raises:
            ValueError: If no client is given and ``ES_PASSWORD`` is not set.
        """
        self.es = es_client if es_client is not None else self._client_from_env()
        self.index_name = index_name
        self.model = SentenceTransformer(embedding_model)
        # The mapping's "dims" must equal the length of the vectors the model emits.
        self.vector_dim = self.model.get_sentence_embedding_dimension()
        self._ensure_index()

    @staticmethod
    def _client_from_env():
        """Build an Elasticsearch client from the ``ES_*`` environment variables."""
        password = os.environ.get("ES_PASSWORD")
        if not password:
            raise ValueError(
                "Elasticsearch password is not configured: set the ES_PASSWORD "
                "environment variable or pass es_client=... to HybridPDFSearch."
            )
        verify_flag = os.environ.get("ES_VERIFY_CERTS", "false").strip().lower()
        return Elasticsearch(
            os.environ.get("ES_URL", "https://localhost:9200"),
            basic_auth=(os.environ.get("ES_USERNAME", "elastic"), password),
            # NOTE(security): verification stays off unless ES_VERIFY_CERTS is set;
            # turn it on for anything that is not a local, self-signed dev cluster.
            verify_certs=verify_flag in {"1", "true", "yes"},
        )

    def _ensure_index(self):
        """Create the index with the text + vector mapping unless it already exists."""
        if self.es.indices.exists(index=self.index_name):
            return  # Keep the existing index (and its mapping) untouched.
        self.es.indices.create(
            index=self.index_name,
            mappings={
                "properties": {
                    "content": {"type": "text"},
                    "content_vector": {"type": "dense_vector", "dims": self.vector_dim},
                }
            },
        )

    def _extract_pages(self, pdf_path: str) -> List[tuple]:
        """Return ``(page_number, text)`` for every page that yields text.

        ``page_number`` is the 0-based position of the page in the PDF, so it
        stays correct even when earlier pages are skipped for being empty.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"{pdf_path} not found")
        reader = PdfReader(pdf_path)
        pages = []
        for page_number, page in enumerate(reader.pages):
            # extract_text() may give None/empty; such pages are dropped.
            text = (page.extract_text() or "").strip()
            if text:
                pages.append((page_number, text))
        return pages

    def _extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """Extract the stripped text of each non-empty page, in page order.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        return [text for _, text in self._extract_pages(pdf_path)]

    def index_pdf(self, pdf_path: str):
        """Extract the PDF text and index each non-empty page as its own document.

        Document ids have the form ``<file name>_page_<n>`` where ``n`` is the
        page's 0-based position in the PDF, so re-indexing the same file
        overwrites its documents instead of duplicating them.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        pages = self._extract_pages(pdf_path)
        if not pages:
            return  # Nothing extractable; avoid encoding an empty batch.
        source = os.path.basename(pdf_path)
        # Encode all pages in one call so the model can batch the work.
        vectors = self.model.encode([text for _, text in pages])
        for (page_number, text), vector in zip(pages, vectors):
            self.es.index(
                index=self.index_name,
                id=f"{source}_page_{page_number}",
                document={"content": text, "content_vector": vector.tolist()},
            )

    def search(self, query: str, top_k: int = 5, keyword_weight: float = 1.0) -> List[dict]:
        """Hybrid search: BM25 (keyword) + cosine similarity.

        Only documents matching the ``match`` query on ``content`` are scored.
        Each one receives
        ``(cosine similarity + 1.0) + keyword_weight * <match-query score>``.
        The ``+ 1.0`` keeps the vector part non-negative.

        Args:
            query: Free-text query.
            top_k: Maximum number of hits to return.
            keyword_weight: Multiplier for the keyword score. ``0`` ranks the
                keyword-matched documents by vector similarity alone.

        Returns:
            One dict per hit with the keys ``id``, ``score`` and ``content``.
            Empty when the query is blank or ``top_k`` is not positive.

        Raises:
            ValueError: If ``keyword_weight`` is negative (script scores must
                not be negative).
        """
        if keyword_weight < 0:
            raise ValueError("keyword_weight must be >= 0")
        if not query or not query.strip() or top_k <= 0:
            return []
        query_vector = self.model.encode(query).tolist()
        results = self.es.search(
            index=self.index_name,
            size=top_k,
            query={
                "script_score": {
                    "query": {
                        "match": {"content": {"query": query}}
                    },
                    "script": {
                        # `_score` is the inner match query's relevance; without it
                        # the keyword half of the hybrid score would be thrown away.
                        "source": (
                            "cosineSimilarity(params.query_vector, 'content_vector') + 1.0"
                            " + params.keyword_weight * _score"
                        ),
                        "params": {
                            "query_vector": query_vector,
                            "keyword_weight": float(keyword_weight),
                        },
                    },
                }
            },
        )
        return [
            {
                "id": hit.get("_id"),
                "score": hit["_score"],
                "content": hit["_source"]["content"],
            }
            for hit in results["hits"]["hits"]
        ]

In [None]:
# === INPUT FILE and INITIALIZATION ===
# PDF to index; a relative path, so it is resolved against the current working directory.
pdf_path = "InputData/the-state-of-ai-in-2023-generative-ais-breakout-year_vf.pdf"
# Default index name and embedding model. Construction connects to Elasticsearch,
# loads the embedding model and creates the index if it does not exist yet.
searcher = HybridPDFSearch()
# Extract the PDF's text and store each non-empty page as one document.
searcher.index_pdf(pdf_path)

In [None]:
# Now search for any query
# `searcher` comes from the indexing cell above; top_k=3 asks for at most three hits.
query = "Tell me potential risks of generative AI adoption in enterprise."
results = searcher.search(query, top_k=3)
# Print each hit with a 1-based rank, its score (two decimals) and the page text.
for idx, res in enumerate(results, 1):
    print(f"{idx}. [Score: {res['score']:.2f}]\n{res['content']}\n")