In [1]:
import os
import PyPDF2
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [2]:
# Set your OpenAI API key
api_key = "OpenAi api key"
os.environ["OPENAI_API_KEY"] = api_key

In [8]:
def extract_pdf_text(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text
# def extract_pdf_text(pdf_path):
#     text = ""
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             text += page.extract_text()  # Extract normal text
#             tables = page.extract_tables()  # Extract tables
#             for table in tables:
#                 for row in table:
#                     text += " | ".join(str(cell) if cell else "" for cell in row) + "\n"
#     return text

In [9]:

def extract_text(file_path):
    text = ""
    if file_path.lower().endswith('.pdf'):
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() or ""
    elif file_path.lower().endswith('.docx'):
        doc = Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    else:
        raise ValueError("Unsupported file format. Use PDF or DOCX.")
    
    return text

In [10]:
def prepare_qa_system(pdf_path):
    print("Extracting text from PDF...")
    text = extract_pdf_text(pdf_path)

    print("Splitting text into chunks...")
    # Split text into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)

    print("Creating embeddings and vector store...")
    # Create embeddings and vector store
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    vector_store = Chroma.from_texts(chunks, embeddings)

    # Create Retrieval-based QA system
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-4", temperature= 0),
        retriever=retriever,
        return_source_documents=True
    )

    return qa_chain

In [11]:
def pdf_qna():
    pdf_path = input("Enter the path to the PDF file: ")
    qa_chain = prepare_qa_system(pdf_path)

    print("\nPDF QnA system is ready! Type your questions below.\n")
    while True:
        query = input("Q: ")
        if query.lower() in ["exit", "quit"]:
            print("Exiting the QnA system. Goodbye!")
            break

        result = qa_chain.invoke({"query": query})
        print(f"A: {result['result']}\n")
        for doc in result["source_documents"]:
            print(f"[Source]: {doc.metadata.get('source', 'No source')}...\n")

In [None]:
pdf_qna()

Enter the path to the PDF file:  sample.pdf


Extracting text from PDF...
Splitting text into chunks...
Creating embeddings and vector store...


  embeddings = OpenAIEmbeddings(openai_api_key=api_key)
  llm=ChatOpenAI(model="gpt-4", temperature= 0),



PDF QnA system is ready! Type your questions below.



Q:  How much was Infosys' goodwill as of March 31, 2024?


A: Infosys' goodwill as of March 31, 2024 was ₹7,303 crore.

[Source]: No source...

[Source]: No source...

[Source]: No source...

