# PDF Document Processing and Question Answering System

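This notebook demonstrates how to use the PDF processing and question-answering system interactively: it processes a directory of PDF documents, stores them in a vector database, and answers questions about their content.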

In [None]:
!pip install apache-flink==2.0.0 apache-flink-libraries==2.0.0 PyPDF2==3.0.1 sentence-transformers==2.2.2 huggingface-hub==0.16.4 faiss-cpu==1.7.4 numpy==1.24.3 pandas==2.0.3 transformers==4.30.2 torch==2.1.2 

## Setup and Imports

In [None]:
from flink_processor import FlinkProcessor
from vector_store import VectorStore
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

## Initialize Components

In [None]:
# Document pipeline objects: the Flink processor first, then the vector store
flink_processor = FlinkProcessor()
vector_store = VectorStore()

# Seq2seq model and matching tokenizer used to generate the answers
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("Components initialized successfully!")

## Process PDF Documents

In [None]:
# Folder that holds the PDF files to process
pdf_directory = "docs"

# Hand the folder to the Flink processor and report how many documents came back
processed_docs = flink_processor.process_directory(pdf_directory)
print(f"Processed {len(processed_docs)} documents")

## Store Documents in Vector Database

In [None]:
# Add every processed document (file name, text, embedding) to the vector store
for doc in processed_docs:
    vector_store.add_document(filename=doc['filename'], content=doc['content'], embedding=doc['embedding'])

print("Documents stored in vector database")

## Question Answering Function

In [None]:
def answer_question(question: str, top_k: int = 3) -> str:
    """Answer a question using the stored documents and the seq2seq model.

    Looks up the best-matching documents in ``vector_store``, joins their text
    into a prompt, generates an answer with beam search, and prints a short
    summary of the documents that were used.

    Args:
        question: The question to answer.
        top_k: Number of results to request from ``vector_store.search``.

    Returns:
        The decoded model output for the prompt.

    Note:
        Relies on the notebook-level ``vector_store``, ``tokenizer`` and
        ``model`` objects. Each search result is read as a mapping with
        ``content``, ``filename``, ``relevance`` and ``score`` keys.
    """
    preview_chars = 500  # how much of each document is echoed for reference

    # Get relevant documents
    relevant_docs = vector_store.search(question, top_k=top_k)

    # Prepare context from relevant documents (numbered from 1)
    context = "\n\n".join(
        f"Document {i}:\n{doc['content']}"
        for i, doc in enumerate(relevant_docs, start=1)
    )

    # Prepare prompt for the model
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"

    # Prompts longer than 1024 tokens are truncated by the tokenizer.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)

    # Forward the attention mask together with the ids. Decoding is plain beam
    # search: without do_sample=True a temperature has no effect, so none is
    # passed.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=200,
        num_beams=4,
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Print relevant documents for reference
    print("\nBased on the following documents:")
    for i, doc in enumerate(relevant_docs, start=1):
        print(f"\nDocument {i} (from {doc['filename']}):")
        print(f"Relevance: {doc['relevance']} matching chunks, average score: {doc['score']}")
        content = doc['content']
        # Only add an ellipsis when the preview really is shorter than the text.
        suffix = "..." if len(content) > preview_chars else ""
        print(content[:preview_chars] + suffix)

    return answer

## Interactive Question Answering

In [None]:
# A few sample questions to run against the stored documents
questions = [
    "What are the key requirements for the system?",
    "What are the main features of the architecture?",
    "How does the system handle document processing?",
]

# Ask each one in turn: echo the question, then show the generated answer
for question in questions:
    print(f"\nQuestion: {question}")
    answer = answer_question(question)
    print(f"\nAnswer: {answer}")

## Custom Questions

You can ask your own questions by calling the `answer_question` function:

In [None]:
# Try your own question
your_question = "What are the similarities in requirements?"

# Echo the question before calling answer_question, which prints the supporting
# documents itself; the output then reads question -> sources -> answer, the
# same order the example-questions loop produces.
print(f"\nQuestion: {your_question}")
answer = answer_question(your_question)
print(f"\nAnswer: {answer}")