### 1) Load PDFs and split them into indexed chunks

Import libraries

In [1]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

PDF paths

In [12]:
# PDFs to ingest, given relative to the notebook's working directory.
pdf_paths = [
    "../data/Automotive_SPICE_PAM_31_EN.pdf",
    "../data/AUTOSAR_SWS_ECUStateManager.pdf",
]

# Print one status line per path so a missing input is visible before loading starts.
for path in pdf_paths:
    if Path(path).exists():
        print(f"File exists: {path}")
    else:
        print(f"Warning: File not found - {path}")


File exists: ../data/Automotive_SPICE_PAM_31_EN.pdf
File exists: ../data/AUTOSAR_SWS_ECUStateManager.pdf


Load PDFs 

In [14]:
# Run a PyPDFLoader over each configured path and gather everything it
# returns into a single flat list.
docs = []
for path in pdf_paths:
    loader = PyPDFLoader(path)
    loaded = loader.load()
    docs.extend(loaded)

Split PDFs into chunks

In [16]:
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# Separator candidates handed to the splitter: blank line, newline, space, empty string.
SEPARATORS = ["\n\n", "\n", " ", ""]

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=SEPARATORS,
)

# Split every loaded document; the result is the list of chunks used downstream.
chunks = splitter.split_documents(docs)

Add IDs

In [17]:
# Tag each chunk with its position in `chunks` ("chunk_id") and with the file
# stem of its "source" metadata entry ("doc_id"); chunks that carry no
# "source" entry get the placeholder "unknown".
for i, chunk in enumerate(chunks):
    metadata = chunk.metadata
    metadata["chunk_id"] = i
    metadata["doc_id"] = (
        Path(metadata["source"]).stem if "source" in metadata else "unknown"
    )

# Last expression of the cell: shows how many chunks were produced.
len(chunks)

1492

### 2) Build the dense index (FAISS + Gemini embeddings)

In [21]:
# Exploratory check: print the names exposed by the langchain_google_genai
# package (the output lists, among others, the classes imported in the next cell).
import langchain_google_genai
print(dir(langchain_google_genai))

['AqaInput', 'AqaOutput', 'ChatGoogleGenerativeAI', 'DoesNotExistsException', 'GenAIAqa', 'GoogleGenerativeAI', 'GoogleGenerativeAIEmbeddings', 'GoogleVectorStore', 'HarmBlockThreshold', 'HarmCategory', 'Modality', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_common', '_enums', '_function_utils', '_genai_extension', '_image_utils', 'chat_models', 'embeddings', 'genai_aqa', 'google_vector_store', 'llms']


In [25]:
# import libraries 
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
import os 
from dotenv import load_dotenv

In [30]:
# load_dotenv() runs before the lookup so the key can come from a local .env
# file rather than being hardcoded in the notebook.
load_dotenv()

# None when the variable is not set in the environment.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")


In [33]:
# Embedding model identifier, named so it is easy to swap.
EMBEDDING_MODEL = "models/gemini-embedding-001"

# Embedding client, authenticated with the key read from the environment.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=GEMINI_API_KEY,
    model=EMBEDDING_MODEL,
)


In [34]:
# Chat model settings, named so they are easy to find and tune.
LLM_MODEL = "gemini-2.5-pro"
LLM_TEMPERATURE = 0.2
LLM_MAX_RETRIES = 2

# Chat model client, authenticated with the key read from the environment.
LLM = ChatGoogleGenerativeAI(
    model=LLM_MODEL,
    api_key=GEMINI_API_KEY,
    temperature=LLM_TEMPERATURE,
    max_retries=LLM_MAX_RETRIES,
)

### 3) BM25 (rank_bm25)

In [37]:
import re, pickle
from rank_bm25 import BM25Okapi

In [39]:
def tokeize(text: str) -> list[str]:
    """Lowercase ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); punctuation and whitespace only act as delimiters.

    NOTE(review): the name is a misspelling of "tokenize"; it is kept because
    other cells call the function by this name.

    Args:
        text: Raw text to tokenize.

    Returns:
        The lowercased tokens; an empty list when ``text`` has no word characters.
    """
    lowered = text.lower()
    return re.findall(r"\b\w+\b", lowered)

# Tokenize every chunk once; the BM25 index is built over these token lists.
corpus_tokens = [tokeize(chunk.page_content) for chunk in chunks]
bm25 = BM25Okapi(corpus_tokens)

# Create the output directory first: on a fresh checkout "../artifacts" may
# not exist yet, and open(..., "wb") would then raise FileNotFoundError.
bm25_corpus_path = Path("../artifacts/bm25_corpus.pkl")
bm25_corpus_path.parent.mkdir(parents=True, exist_ok=True)

# Only the token lists are persisted, not the BM25Okapi object itself.
with bm25_corpus_path.open("wb") as f:
    pickle.dump({"corpus_tokens": corpus_tokens}, f)


In [41]:
import json, os 
# Create the output directory first: on a fresh checkout "../artifacts" may
# not exist yet, and opening the file for writing would then raise
# FileNotFoundError.
chunks_path = Path("../artifacts/chunks.jsonl")
chunks_path.parent.mkdir(parents=True, exist_ok=True)

# Write one JSON object per line (JSONL), one line per chunk.
with chunks_path.open("w", encoding="utf-8") as f:
    for chunk in chunks:
        record = {
            "chunk_id": chunk.metadata["chunk_id"],
            "doc_id": chunk.metadata["doc_id"],
            # "page" is optional in the metadata; written as null when absent.
            "page": chunk.metadata.get("page"),
            "text": chunk.page_content,
        }
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")