In [1]:
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Passed straight to ``PyPDF2.PdfReader``.

    Returns:
        str: The page texts joined with no separator. A page whose
        ``extract_text()`` result is ``None`` or empty contributes nothing.
    """
    reader = PyPDF2.PdfReader(file_path)
    # `or ""` keeps a page with no extractable text from raising
    # TypeError (str + None); join avoids repeated string re-allocation.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Load the source document once; the chunking cell reads the global `text`.
pdf_path = "data.pdf"  # Make sure your PDF is named 'data.pdf'
text = extract_text_from_pdf(pdf_path)
# Character count is a quick sanity check that extraction produced something.
print("✅ Extracted text length:", len(text))


✅ Extracted text length: 642


In [5]:
def chunk_text(text, chunk_size=500, overlap=0):
    """Split `text` into consecutive character windows.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of trailing characters of each chunk repeated at the
            start of the next one. The default of 0 gives disjoint chunks,
            exactly as before this parameter existed.

    Returns:
        list[str]: Chunks in document order; ``[]`` for empty `text`. The last
        chunk may be shorter than `chunk_size`.

    Raises:
        ValueError: If `chunk_size` is not positive, or `overlap` is outside
            ``0 <= overlap < chunk_size``.
    """
    # A negative chunk_size used to yield an empty list silently, dropping
    # the whole document; reject it up front instead.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    pieces = []
    for start in range(0, len(text), step):
        pieces.append(text[start:start + chunk_size])
        # Once a window reaches the end of the text, any further window
        # would only repeat its tail, so stop here.
        if start + chunk_size >= len(text):
            break
    return pieces

# Split the extracted text using chunk_text's default chunk size; `chunks` is
# the list that is embedded and later indexed into by the search results.
chunks = chunk_text(text)
print("✅ Number of chunks:", len(chunks))


✅ Number of chunks: 2


In [7]:
# Sentence-embedding model; the same instance is reused to embed queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# One vector per chunk. search_chunks maps index position i back to chunks[i],
# so `vectors` must stay in the same order as `chunks`.
vectors = embedding_model.encode(chunks)
# Flat L2 index sized from the second axis of `vectors` (the embedding width).
# NOTE(review): an empty `chunks` list presumably leaves no second axis to
# read here — confirm and guard if empty PDFs are possible.
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

print("✅ Vector index created!")




✅ Vector index created!


In [9]:
def search_chunks(query, chunks, model, index, top_k=1):
    """Return the chunks whose vectors the index reports as nearest to `query`.

    Args:
        query: Text to embed and look up.
        chunks: Sequence of text chunks; ids returned by `index` are used as
            positions into this sequence.
        model: Object exposing ``encode(list_of_str)`` to embed the query.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, ids)`` pair.
        top_k: Maximum number of chunks to return.

    Returns:
        list: Up to `top_k` chunks, in the order the index returned them.
        Fewer are returned when the index holds fewer than `top_k` vectors.
    """
    q_vector = model.encode([query])
    _, neighbor_ids = index.search(q_vector, top_k)
    # FAISS pads missing neighbours with -1; indexing chunks[-1] would
    # silently return the last chunk as a bogus hit, so drop those ids.
    return [chunks[i] for i in neighbor_ids[0] if i >= 0]


In [17]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# One checkpoint name feeds both loaders so tokenizer and model always match.
model_name = "deepset/roberta-base-squad2"  # PyTorch-only model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Question-answering pipeline used by ask_question, which calls it with
# `question=` and `context=` and reads the "answer" key of the result.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [19]:
def ask_question(question):
    """Answer `question` from the single best-matching chunk of the document.

    Uses the notebook globals `chunks`, `embedding_model`, `index` and
    `qa_pipeline`.

    Args:
        question: Question text; used both for retrieval and for the QA call.

    Returns:
        The ``"answer"`` entry of the QA pipeline's result.
    """
    # Retrieval step: search_chunks is called with its default top_k.
    retrieved = search_chunks(question, chunks, embedding_model, index)
    best_chunk = retrieved[0]
    # Reading step: the QA pipeline answers from that one chunk only.
    prediction = qa_pipeline(question=question, context=best_chunk)
    return prediction["answer"]


In [21]:
# ask_question retrieves one chunk and returns the QA pipeline's "answer" for it.
# NOTE(review): the SQuAD-style checkpoint name suggests answers are spans
# lifted from the context rather than summaries — confirm for broad questions.
question = "What is the main idea of the document?"  # Change this as needed
answer = ask_question(question)
print("🧠 Answer:", answer)


🧠 Answer: Earth is the only planet known to support life


In [23]:
# Factual question; rebinds the globals `question` and `answer` from the previous cell.
question = "How many planets are there in Solar System?"  # Change this as needed
answer = ask_question(question)
print("🧠 Answer:", answer)


🧠 Answer: eight


In [25]:
# Terse, keyword-style query; the same string drives both retrieval and the QA call.
question = "Largest planet?"  # Change this as needed
answer = ask_question(question)
print("🧠 Answer:", answer)


🧠 Answer: Jupiter
