In [1]:
!pip install openai





In [4]:
!pip install google-generativeai python-dotenv


Collecting protobuf (from google-generativeai)
  Downloading protobuf-4.25.8-cp39-cp39-win_amd64.whl.metadata (541 bytes)
Downloading protobuf-4.25.8-cp39-cp39-win_amd64.whl (413 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.6
    Uninstalling protobuf-3.19.6:
      Successfully uninstalled protobuf-3.19.6
Successfully installed protobuf-4.25.8


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mysql-connector-python 8.2.0 requires protobuf<=4.21.12,>=4.21.1, but you have protobuf 4.25.8 which is incompatible.
paddlepaddle 2.6.2 requires protobuf<=3.20.2,>=3.1.0; platform_system == "Windows", but you have protobuf 4.25.8 which is incompatible.
streamlit 1.22.0 requires protobuf<4,>=3.12, but you have protobuf 4.25.8 which is incompatible.
tensorboard 2.11.2 requires protobuf<4,>=3.9.2, but you have protobuf 4.25.8 which is incompatible.
tensorflow-intel 2.11.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 4.25.8 which is incompatible.
tf2onnx 1.16.1 requires protobuf~=3.20, but you have protobuf 4.25.8 which is incompatible.


In [None]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from google import genai

# ---- 1. PDF Extraction (OCR fallback) ----
def extract_text_pdf_with_ocr(pdf_path, dpi=200):
    """Return the text of a PDF, falling back to OCR when none is selectable.

    Every page is first read with pdfplumber; each non-empty page text is
    followed by a newline. If the combined result is blank (empty or only
    whitespace), every page is rendered to an image and run through
    pytesseract, and the OCR output is appended to whatever was collected.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution passed to ``convert_from_path`` for the OCR fallback.

    Returns:
        The extracted text as a single string.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]

    # Pages without selectable text yield a falsy value and are skipped.
    text = "".join(page_text + "\n" for page_text in page_texts if page_text)

    if text.strip():
        print("[INFO] Text extraction successful without OCR.")
        return text

    print("[INFO] No selectable text—using OCR fallback.")
    ocr_parts = [
        pytesseract.image_to_string(page_image) + "\n"
        for page_image in convert_from_path(pdf_path, dpi=dpi)
    ]
    return text + "".join(ocr_parts)

# ---- 2. Chunk text for RAG ----
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping, whitespace-delimited word chunks.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces); an
        empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A non-positive stride would make range() fail (or yield nothing), and a
    # negative overlap would silently skip words between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This chunk already reaches the last word; a further chunk would
        # only repeat its tail.
        if start + chunk_size >= len(words):
            break
    return chunks

# ---- 3. Embedding ----
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    """Instantiate a SentenceTransformer and encode all chunks with it.

    Args:
        chunks: The chunk strings to embed.
        model_name: Model identifier handed to ``SentenceTransformer``.

    Returns:
        A ``(model, embeddings)`` pair: the loaded model (so callers can
        reuse it to encode queries) and the result of encoding ``chunks``.
    """
    encoder = SentenceTransformer(model_name)
    encode_options = {"convert_to_tensor": False, "show_progress_bar": True}
    vectors = encoder.encode(chunks, **encode_options)
    return encoder, vectors

# ---- 4. Query & Retrieval ----
def retrieve_top_k(query, model, chunk_embeddings, chunks, k=3):
    """Return the ``k`` chunks most similar to ``query``.

    The query is encoded with ``model`` and compared against
    ``chunk_embeddings`` using cosine similarity.

    Args:
        query: The question text.
        model: Object exposing ``encode(text, convert_to_tensor=False)``.
        chunk_embeddings: Embeddings of ``chunks``, in the same order.
        chunks: The chunk strings that were embedded.
        k: Maximum number of results.

    Returns:
        A list of ``(chunk, similarity)`` tuples, highest similarity first.
    """
    query_vector = model.encode(query, convert_to_tensor=False)
    scores = cosine_similarity([query_vector], chunk_embeddings)[0]
    # argsort is ascending, so reverse it to rank best matches first.
    ranked = scores.argsort()[::-1]
    results = []
    for position in ranked[:k]:
        results.append((chunks[position], scores[position]))
    return results

# ---- 5. Gemini LLM Integration (Latest SDK) ----
def generate_answer_gemini(
    client,  # pass the client object!
    question,
    retrieved_chunks,
    use_outside_knowledge=False,
    model_name="models/gemini-1.5-flash-latest"  # Use what is in client.models.list()
):
    """Ask a Gemini model to answer ``question`` from the retrieved passages.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``
            (in this notebook, a ``genai.Client``).
        question: The user's question.
        retrieved_chunks: Passages to use as context. Each item is either a
            plain string or a tuple whose first element is the passage text
            (e.g. the ``(chunk, similarity)`` pairs from ``retrieve_top_k``).
        use_outside_knowledge: When True the instructions allow the model to
            add outside knowledge; otherwise it is told to answer only from
            the supplied context.
        model_name: Model identifier passed to ``generate_content``.

    Returns:
        The response text with surrounding whitespace removed, or an empty
        string when the response carries no text.
    """
    context = "\n\n".join(
        f"Context {i+1}:\n{chunk[0] if isinstance(chunk, tuple) else chunk}"
        for i, chunk in enumerate(retrieved_chunks)
    )
    if use_outside_knowledge:
        sys_prompt = (
            "You are a helpful assistant. Answer the user's question using the provided PDF context. "
            "You MAY also include helpful outside knowledge, but cite the PDF if it has the answer."
        )
    else:
        sys_prompt = (
            "You are a helpful assistant. Answer using ONLY the PDF context below. "
            "If the answer isn't present, say so (do NOT add outside knowledge)."
        )
    prompt = (
        f"{sys_prompt}\n\n"
        f"Context:\n{context}\n"
        f"Question: {question}\n"
        "Answer:"
    )
    response = client.models.generate_content(
        model=model_name,
        contents=prompt
    )
    # response.text can be None (e.g. a blocked or empty response); calling
    # .strip() on it directly would raise AttributeError.
    return (response.text or "").strip()

# ---- 6. Main Routine ----
if __name__ == "__main__":
    # Interactive driver: index one PDF, then answer questions about it in a loop.
    load_dotenv()
    # IMPORTANT: Set GOOGLE_API_KEY (or GEMINI_API_KEY) in your .env!
    client = genai.Client()  # Reads the key from environment automatically

    # --------- User Input Section ------------
    PDF_PATH = "ML_Resume.pdf"
    # Keep asking until the path points at an existing file, instead of
    # letting a typo surface later as a traceback from the PDF reader.
    while not os.path.isfile(PDF_PATH):
        PDF_PATH = input("Enter your PDF file name/path: ").strip()
        if not os.path.isfile(PDF_PATH):
            print(f"[WARN] File not found: {PDF_PATH}")

    full_text = extract_text_pdf_with_ocr(PDF_PATH)
    chunks = chunk_text(full_text, chunk_size=500, overlap=50)
    print(f"[INFO] PDF split into {len(chunks)} chunks.")

    if not chunks:
        # Nothing was extracted (even via OCR): there is nothing to embed or
        # retrieve, so skip the question loop rather than fail inside it.
        print("[WARN] No text could be extracted from the PDF; nothing to query.")
    else:
        embed_model, chunk_embeddings = embed_chunks(chunks)

        while True:
            user_question = input("\nAsk a question about your PDF (or 'exit'): ").strip()
            if user_question.lower() == "exit":
                break
            if not user_question:
                # Don't spend a retrieval + LLM call on a blank line.
                continue

            top_chunks = retrieve_top_k(user_question, embed_model, chunk_embeddings, chunks, k=3)
            print("\n--- Top relevant PDF passages ---")
            for idx, (chunk, sim) in enumerate(top_chunks, 1):
                # Only the first 600 characters of each passage are previewed.
                print(f"\nChunk #{idx} (sim={sim:.2f}):\n{chunk[:600]}")

            use_extra = input("\nAllow outside knowledge in answer? (y/n): ").strip().lower() == "y"
            answer = generate_answer_gemini(client, user_question, top_chunks, use_outside_knowledge=use_extra)
            print("\nFinal Answer:\n", answer)

    print("\n[Done]")


[INFO] Text extraction successful without OCR.
[INFO] PDF split into 1 chunks.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


--- Top relevant PDF passages ---

Chunk #1 (sim=0.29):
Yuvaraj S Machine Learning Engineer — Software Engineer (cid:131) +91 9384137766 # ai.yuvaraj21@gmail.com (cid:239) LinkedIn § GitHub (cid:128) Portfolio Aspiring Machine Learning Engineer with a passion for building scalable AI solutions and real-time applications. Technical Skills Languages: Python, JavaScript, HTML, CSS ML/AI: TensorFlow, PyTorch, Scikit-learn, OpenCV, BERT, Neural Networks, Computer Vision, NLP Web/Backend: Django, REST API, MongoDB Tools/Cloud: Git, AWS, Jupyter, Postman, CI/CD Experience Machine Learning Engineer July 2024 – Present Techso IT LLC, USA – Developed real-ti

Final Answer:
 The provided text describes several projects undertaken by Yuvaraj S.  There's not a single, overarching "project," but rather multiple projects detailed under "Key Projects" and within his work experience.  Here's a summary:

**Key Projects:**

* **Library Access Control System:** This project involved creating a Django RES

In [3]:
import faiss
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load index and chunks
index = faiss.read_index("ML_Resume.pdf.faiss")
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickle files that this project produced itself.
with open("ML_Resume.pdf.pkl", "rb") as f:
    chunks = pickle.load(f)

print(f"FAISS has {index.ntotal} segments/chunks.")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}: {chunk[:100]}...")  # Show first 100 chars

# Show nearest neighbor for first chunk
if index.ntotal == 0:
    # reconstruct(0) would fail on an empty index, so there is nothing to look up.
    print("FAISS index is empty; nothing to search.")
else:
    vec = index.reconstruct(0)
    D, I = index.search(np.array([vec]), k=3)
    # Drop the -1 placeholders returned when fewer than k neighbours exist, and
    # any id that has no matching entry in `chunks` (index/pickle out of sync).
    valid_indices = [int(j) for j in I[0] if 0 <= j < len(chunks)]
    print("Top matches for first chunk:", [chunks[j] for j in valid_indices])


FAISS has 1 segments/chunks.
Chunk 0: Yuvaraj S Machine Learning Engineer — Software Engineer (cid:131) +91 9384137766 # ai.yuvaraj21@gmai...
Top matches for first chunk: ['Yuvaraj S Machine Learning Engineer — Software Engineer (cid:131) +91 9384137766 # ai.yuvaraj21@gmail.com (cid:239) LinkedIn § GitHub (cid:128) Portfolio Aspiring Machine Learning Engineer with a passion for building scalable AI solutions and real-time applications. Technical Skills Languages: Python, JavaScript, HTML, CSS ML/AI: TensorFlow, PyTorch, Scikit-learn, OpenCV, BERT, Neural Networks, Computer Vision, NLP Web/Backend: Django, REST API, MongoDB Tools/Cloud: Git, AWS, Jupyter, Postman, CI/CD Experience Machine Learning Engineer July 2024 – Present Techso IT LLC, USA – Developed real-time audio transcription models with 80% accuracy using TensorFlow and Python – Built NLP summarization systems reducing content processing time by 40% with BERT implementation – Integrated ML pipelines with Django REST APIs for 