In [1]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from google import genai

# ---- 0. Setup ----
# Load variables from a local .env file into the process environment before the
# Gemini client is constructed.
load_dotenv()
client = genai.Client()  # Reads GOOGLE_API_KEY from .env

# Default document. If it is not present, ask the user for a path once.
PDF_PATH = "ML_Resume.pdf"
if not os.path.isfile(PDF_PATH):
    # Paths pasted or dragged into a prompt are often wrapped in quotes; strip
    # them so an otherwise valid path is not rejected.
    PDF_PATH = input("Enter your PDF file name/path: ").strip().strip("\"'")
    # Fail here with a clear message instead of deeper inside the extraction
    # cell, where the cause is harder to spot.
    if not os.path.isfile(PDF_PATH):
        raise FileNotFoundError(f"PDF not found: {PDF_PATH!r}")

In [2]:
# ---- 1. PDF Extraction (OCR fallback) ----
# Read the text layer page by page. Pages that yield no text are recorded by
# their 1-based page number so they can be rasterised and OCR'd afterwards;
# previously OCR only ran when the *whole* document had no text, so scanned
# pages inside an otherwise digital PDF were silently lost.
page_texts = []  # one entry per page, "" where no text layer was found
ocr_pages = []   # 1-based numbers of the pages that need OCR
with pdfplumber.open(PDF_PATH) as pdf:
    for page_no, page in enumerate(pdf.pages, start=1):
        pg_text = page.extract_text()
        if pg_text and pg_text.strip():
            page_texts.append(pg_text)
        else:
            page_texts.append("")
            ocr_pages.append(page_no)

if page_texts and not ocr_pages:
    print("[INFO] Text extraction successful without OCR.")
elif len(ocr_pages) == len(page_texts):
    print("[INFO] No selectable text—using OCR fallback.")
    # Nothing was extracted at all: rasterise the whole document in one call.
    images = convert_from_path(PDF_PATH, dpi=200)
    page_texts = [pytesseract.image_to_string(image) for image in images]
else:
    print(
        f"[INFO] {len(ocr_pages)} of {len(page_texts)} pages have no selectable "
        "text—using OCR fallback for those pages."
    )
    for page_no in ocr_pages:
        # Rasterise only the page that needs it, keeping document order intact.
        images = convert_from_path(
            PDF_PATH, dpi=200, first_page=page_no, last_page=page_no
        )
        page_texts[page_no - 1] = "\n".join(
            pytesseract.image_to_string(image) for image in images
        )

# Join once at the end (one trailing newline per non-empty page, as before).
text = "".join(page_text + "\n" for page_text in page_texts if page_text)

[INFO] Text extraction successful without OCR.


In [3]:
# ---- 2. Chunk text for RAG ----
def chunk_words(word_list, chunk_size, overlap):
    """Split a list of words into overlapping, space-joined chunks.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of ``word_list``, which avoids
    emitting a trailing chunk that only contains already-covered words.

    Args:
        word_list: Words to group, in document order.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``word_list`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the stride zero or negative.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    stride = chunk_size - overlap
    pieces = []
    for start in range(0, len(word_list), stride):
        pieces.append(" ".join(word_list[start:start + chunk_size]))
        if start + chunk_size >= len(word_list):
            break  # this chunk already reaches the last word
    return pieces


chunk_size = 500
overlap = 50
words = text.split()
chunks = chunk_words(words, chunk_size, overlap)
if not chunks:
    # With no chunks there is nothing to embed or search; stop with a clear
    # message instead of failing later inside the similarity computation.
    raise ValueError("No text was extracted from the PDF; nothing to chunk.")
print(f"[INFO] PDF split into {len(chunks)} chunks.")

[INFO] PDF split into 1 chunks.


In [4]:
# ---- 3. Compute Embeddings ----
# Embed every chunk once up front; the query loop compares each question
# against these vectors.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
chunk_embeddings = model.encode(
    chunks,
    show_progress_bar=True,
    convert_to_tensor=False,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# ---- 4. Query loop: semantic retrieval + Gemini answer ----
TOP_K = 3            # number of passages passed to the LLM as context
PREVIEW_CHARS = 600  # characters of each passage echoed back to the user
GEMINI_MODEL = "models/gemini-1.5-flash-latest"


def retrieve_top_chunks(question, k=TOP_K):
    """Return the ``k`` chunks most similar to ``question``, best first.

    The question is embedded with the same model as the chunks and ranked by
    cosine similarity against ``chunk_embeddings``.

    Args:
        question: The user's question text.
        k: Maximum number of chunks to return; fewer are returned when the
            document has fewer chunks.

    Returns:
        A list of ``(chunk_text, similarity)`` tuples in descending similarity.
    """
    query_embed = model.encode(question, convert_to_tensor=False)
    sims = cosine_similarity([query_embed], chunk_embeddings)[0]
    best_first = sims.argsort()[::-1][:k]
    return [(chunks[i], sims[i]) for i in best_first]


def build_prompt(question, passages, allow_outside_knowledge):
    """Assemble the full prompt text sent to Gemini.

    Args:
        question: The user's question text.
        passages: ``(chunk_text, similarity)`` tuples to include as context.
        allow_outside_knowledge: When true the model may add knowledge beyond
            the PDF; otherwise it is told to answer from the context only.

    Returns:
        The prompt string: instructions, numbered context passages, the
        question, and a trailing ``Answer:`` cue.
    """
    context = "\n\n".join(
        f"Context {n}:\n{passage}" for n, (passage, _) in enumerate(passages, 1)
    )
    if allow_outside_knowledge:
        sys_prompt = (
            "You are a helpful assistant. Answer the user's question using the provided PDF context. "
            "You MAY also include helpful outside knowledge, but cite the PDF if it has the answer."
        )
    else:
        sys_prompt = (
            "You are a helpful assistant. Answer using ONLY the PDF context below. "
            "If the answer isn't present, say so (do NOT add outside knowledge)."
        )
    return f"{sys_prompt}\n\nContext:\n{context}\nQuestion: {question}\nAnswer:"


while True:
    user_question = input("\nAsk a question about your PDF (or 'exit'): ").strip()
    if user_question.lower() == "exit":
        break
    if not user_question:
        # A blank line has nothing to search for; re-prompt instead of
        # spending an embedding pass and an API call on it.
        continue

    # Semantic search (cosine similarity)
    top_chunks = retrieve_top_chunks(user_question)

    print("\n--- Top relevant PDF passages ---")
    for idx, (chunk, sim) in enumerate(top_chunks, 1):
        print(f"\nChunk #{idx} (sim={sim:.2f}):\n{chunk[:PREVIEW_CHARS]}")

    use_extra = input("\nAllow outside knowledge in answer? (y/n): ").strip().lower() == "y"
    prompt = build_prompt(user_question, top_chunks, use_extra)

    try:
        response = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=prompt
        )
    except Exception as exc:
        # Report the failure (network, quota, invalid model, ...) and keep the
        # session alive so the user can try again. KeyboardInterrupt is not an
        # Exception subclass, so Ctrl-C / kernel interrupt still stops the loop.
        print(f"\n[ERROR] Gemini request failed: {exc}")
        continue

    # response.text can be None when no text part comes back (for example a
    # blocked or empty response), so guard before calling .strip().
    answer = (response.text or "").strip()
    if answer:
        print("\nFinal Answer:\n", answer)
    else:
        print("\n[WARN] Gemini returned no text for this question.")

print("\n[Done]")


--- Top relevant PDF passages ---

Chunk #1 (sim=0.33):
Yuvaraj S Machine Learning Engineer — Software Engineer (cid:131) +91 9384137766 # ai.yuvaraj21@gmail.com (cid:239) LinkedIn § GitHub (cid:128) Portfolio Aspiring Machine Learning Engineer with a passion for building scalable AI solutions and real-time applications. Technical Skills Languages: Python, JavaScript, HTML, CSS ML/AI: TensorFlow, PyTorch, Scikit-learn, OpenCV, BERT, Neural Networks, Computer Vision, NLP Web/Backend: Django, REST API, MongoDB Tools/Cloud: Git, AWS, Jupyter, Postman, CI/CD Experience Machine Learning Engineer July 2024 – Present Techso IT LLC, USA – Developed real-ti

Final Answer:
 Based on the provided resume, Yuvaraj S is an aspiring Machine Learning Engineer with experience in developing real-time AI applications and scalable solutions.  Their skills encompass various programming languages (Python, JavaScript, HTML, CSS), machine learning frameworks (TensorFlow, PyTorch, Scikit-learn, OpenCV, BERT),