In [8]:
from spellchecker import SpellChecker
import re
import pdfplumber

In [9]:
# Initialize the spell checker
spell = SpellChecker()
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using pdfplumber.
    
    Args:
    - pdf_path (str): Path to the PDF file.
    
    Returns:
    - dict: Dictionary with page numbers as keys and their corresponding text as values.
    """
    page_text_dict = {}
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text_dict[page_num] = page.extract_text()
    return page_text_dict

def correct_splits_and_typos(text):
    """
    Corrects split words and common typos using a spell checker.
    
    Args:
    - text (str): The text to be corrected.
    
    Returns:
    - str: Corrected text.
    """
    # Fix split words
    def merge_words(match):
        word1, word2 = match.groups()
        combined = word1 + word2
        if combined.lower() in spell:  # Check if the combined word is valid
            return combined
        return f"{word1} {word2}"  # Keep as is if not valid

    text = re.sub(r'\b(\w{2,})\s+(\w{1,4})\b', merge_words, text)

    # Spell-check and correct typos
    corrected_words = []
    for word in text.split():
        corrected_words.append(spell.correction(word) or word)
    return ' '.join(corrected_words)

def clean_and_correct_text_with_spellcheck(page_text_dict):
    """
    Cleans and corrects the extracted text by removing extra spaces, newlines, 
    and dynamically fixing split words and typos.
    
    Args:
    - page_text_dict (dict): Dictionary with page numbers as keys and text as values.
    
    Returns:
    - dict: Cleaned and corrected text for each page.
    """
    cleaned_text_dict = {}
    for page_num, text in page_text_dict.items():
        if text:  # Check if the page has text
            # Remove extra spaces and newlines
            text = re.sub(r'\s+', ' ', text.strip())
            # Dynamically correct split words and typos
            text = correct_splits_and_typos(text)
            cleaned_text_dict[page_num] = text
    return cleaned_text_dict


In [10]:
def chunk_text_by_phrase(clean_text_dict, chunk_size=300):
    """
    Chunks cleaned text by phrases, keeping track of the page numbers.
    
    Args:
    - clean_text_dict (dict): Dictionary with page numbers as keys and cleaned text as values.
    - chunk_size (int): Approximate size of each chunk in characters.
    
    Returns:
    - list of dict: List of dictionaries, where each dictionary represents a chunk with text and page number.
    """
    chunks = []
    for page_num, text in clean_text_dict.items():
        # Split into phrases by punctuation
        phrases = re.split(r'([.!?])', text)  # Keep punctuation as part of the phrase
        
        chunk = ""
        for phrase in phrases:
            if len(chunk) + len(phrase) <= chunk_size:
                chunk += phrase
            else:
                if chunk.strip():
                    chunks.append({"page": page_num, "text": chunk.strip()})
                chunk = phrase
        if chunk.strip():
            chunks.append({"page": page_num, "text": chunk.strip()})
    return chunks

In [11]:
# Example usage
pdf_path = "practice-standard-project-risk-management.pdf"  # Replace with your PDF path

In [12]:
import gradio as gr
from PyPDF2 import PdfReader

def extract_text_from_pdf(uploaded_file):
    """
    Extract text from a PDF file using PyPDF2.
    """
    try:
        reader = PdfReader(uploaded_file.name)
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text()
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

# Global variable to store the extracted text
extracted_text_var = None

def process_pdf(file):
    global extracted_text_var
    if file is None:
        extracted_text_var = None
        return "No file uploaded!"
    extracted_text = extract_text_from_pdf(file)
    if isinstance(extracted_text, dict):
        # Format extracted text nicely for display
        formatted_text = "\n\n".join([f"{key}:\n{value}" for key, value in extracted_text.items()])
        extracted_text_var = extracted_text  # Save the raw extracted text to the global variable
        return formatted_text or "No text found in the PDF."
    extracted_text_var = extracted_text  # Save the raw error or result
    return extracted_text


# Create a Gradio interface
interface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extracted Text", lines=20)
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7862

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [13]:
# After running the Gradio interface and uploading the file
print(extracted_text_var)  # This will contain the raw extracted text as a dictionary


None


In [14]:
page_text_dict

{'Page 1': 'PFE\nEDITION 2025\nSTART YOUR CAREER WITH VISION AND PASSION \nBOOK\n',
 'Page 2': 'S O M M A I R E\n03\n04\n06\n07-21\n22\n23\n24\nA propos de nous\nNotre Mission\nNos chiffres clés\nNos Sujets PFE\nComment postuler ?\nNos anciens stagiaires temoignent\nContactez-nous\n',
 'Page 3': '3\nA PROPOS\nDE NOUS\nEXCELLIA Solutions est une entreprise tunisienne qui\ninvestie dans l’innovation grâce à un capital social solide de\n8,5 millions de dinars. \nGrâce à cette expertise et ces alliances stratégiques,  \nEXCELLIA Solutions conçoit et intègre des solutions de\npointe, adaptées aux besoins des entreprise modernes.\nEn tant que filiale d’un groupe diversifié opérant dans\nplusieurs \nsecteurs \nfinanciers \n(banque, \nassurance,\nmicrocrédits, paiements, gestion d’actifs, etc.) et partenaire\nstratégique de Microsoft et d’Oracle, EXCELLIA Solutions\nbénéficie d’un écosystème robuste et intégré.\n',
 'Page 4': "Accompagner nos clients dans la concrétisation de\nleurs ambitions 

In [15]:
import gradio as gr

# Define a global variable for the file path
pdf_path = None

def upload_pdf(file):
    global pdf_path
    if file is None:
        pdf_path = None
        return "No file uploaded!"
    # Set the global variable to the file's path
    pdf_path = file.name
    return f"File {file.name} uploaded successfully. You can now proceed with the notebook."

# Create the Gradio interface for uploading the file
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7863

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [16]:
import gradio as gr

# Define a global variable for the file path
pdf_path = None

def upload_pdf(file):
    global pdf_path
    if file is None:
        pdf_path = None
        return "No file uploaded!"
    # Set the global variable to the file's path
    pdf_path = file.name
    return f"File {file.name} uploaded successfully. You can now proceed with the notebook."

# Create the Gradio interface for uploading the file
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7864

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [17]:
import gradio as gr
from PyPDF2 import PdfReader

# Global variable to store the PDF file path
pdf_path = None

def extract_text_from_pdf(pdf_file_path):
    """
    Extract text from a PDF file using its file path.
    """
    try:
        # Use PdfReader with the file path
        reader = PdfReader(pdf_file_path)
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text()
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_pdf(file):
    """
    Gradio function to update the global `pdf_path`.
    """
    global pdf_path
    if file is None:
        pdf_path = None
        return "No file uploaded!"
    pdf_path = file.name  # Update the global variable with the file path
    return f"File {file.name} uploaded successfully. You can now proceed with the notebook."

# Create the Gradio interface for uploading the file
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the Gradio interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7865

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [18]:
pdf_path

In [19]:
# Step 1: Extract text
page_text_dict = extract_text_from_pdf(f"pdf_path")


In [20]:
import gradio as gr
from PyPDF2 import PdfReader

# Global variables
pdf_path = None
page_text_dict = None  # This will store the extracted text

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file using its path.
    """
    try:
        reader = PdfReader(pdf_path)
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text()
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_pdf(file):
    """
    Gradio upload function to update the global `pdf_path` and extract text.
    """
    global pdf_path, page_text_dict
    if file is None:
        pdf_path = None
        page_text_dict = None
        return "No file uploaded!"
    
    # Update the global variable for the file path
    pdf_path = file.name

    # Extract text immediately and store it in `page_text_dict`
    page_text_dict = extract_text_from_pdf(pdf_path)

    return f"File {file.name} uploaded and text extracted successfully!"

# Create the Gradio interface
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the Gradio interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7866

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [21]:
print(page_text_dict)


None


In [22]:
import gradio as gr
from PyPDF2 import PdfReader

# Global variables
pdf_path = None
page_text_dict = None

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file, including handling AES-encrypted PDFs.
    """
    try:
        reader = PdfReader(pdf_path)

        # Check if the PDF is encrypted
        if reader.is_encrypted:
            print("The PDF is encrypted. Attempting to decrypt...")
            try:
                # Attempt to decrypt with an empty password
                reader.decrypt("")
                print("Decryption successful!")
            except Exception as e:
                return f"Failed to decrypt PDF. Reason: {e}"

        # Extract text from each page
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text() or "No text found on this page."
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_pdf(file):
    """
    Gradio function to handle file upload and extract text.
    """
    global pdf_path, page_text_dict
    if file is None:
        pdf_path = None
        page_text_dict = None
        return "No file uploaded!"
    
    # Save the file path
    pdf_path = file.name
    
    # Extract text from the file
    page_text_dict = extract_text_from_pdf(pdf_path)
    
    return f"File {file.name} uploaded and text extracted successfully!"

# Create the Gradio interface
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the Gradio interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7867

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [23]:
import fitz  # PyMuPDF

def extract_text_with_pymupdf(pdf_path):
    """
    Extract text from a PDF using PyMuPDF.
    """
    try:
        doc = fitz.open(pdf_path)
        page_text_dict = {}
        for i, page in enumerate(doc):
            page_text_dict[f"Page {i+1}"] = page.get_text() or "No text found on this page."
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"


In [24]:
page_text_dict = extract_text_with_pymupdf(pdf_path)

In [25]:
page_text_dict

{}

In [26]:
# Step 2: Clean and correct text with spell check
cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)

In [27]:
# Step 3: Chunk text
chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

In [28]:

# Output the chunks
for i, chunk in enumerate(chunks):
    print(f"Chunk {i + 1} (Page {chunk['page']}):")
    print(chunk["text"])
    print("-" * 80)

# Save chunks to a file (optional)
with open("chunked_output.txt", "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(f"Page {chunk['page']}:\n{chunk['text']}\n")
        f.write("-" * 80 + "\n")


In [29]:
from langchain_community.chat_models.ollama import ChatOllama
from langchain.schema import HumanMessage
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
import re

In [30]:
# Step 1: Initialize the ChatOllama Model
local_model = "llama3:8b"
llm = ChatOllama(model=local_model)

  llm = ChatOllama(model=local_model)


In [31]:
# Step 2: Initialize the Vector Database with Chroma
persist_directory = "./chromadb_store"
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Use SentenceTransformers for embeddings
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Use SentenceTransformers for embeddings
  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)


In [32]:
# Step 3: Add Chunks to ChromaDB
def add_chunks_to_chromadb(chunks):
    """
    Adds text chunks with metadata (page numbers) to ChromaDB.

    Args:
    - chunks (list of dict): List of chunks with 'page' and 'text' keys.
    """
    documents = [
        Document(
            page_content=chunk["text"],
            metadata={"page": chunk["page"]}
        ) for chunk in chunks
    ]
    vectorstore.add_documents(documents)

In [33]:
# Step 4: Query ChromaDB
def query_chromadb(query, top_k=3):
    """
    Queries ChromaDB for the most relevant chunks based on a query.

    Args:
    - query (str): The user's query.
    - top_k (int): Number of most relevant results to retrieve.

    Returns:
    - list of dict: Relevant chunks with metadata.
    """
    results = vectorstore.similarity_search_with_score(query, k=top_k)
    return [{"text": result[0].page_content, "page": result[0].metadata["page"], "score": result[1]} for result in results]

In [34]:
def ask_book_question_with_references(question):
    """
    Queries the book content and uses the LLaMA model to answer based on the retrieved chunks.
    Includes the most relevant chunk and its page for better understanding.

    Args:
    - question (str): The user's question.

    Returns:
    - str: The model's response along with the relevant chunks and pages for further reading.
    """
    # Query ChromaDB for relevant chunks
    relevant_chunks = query_chromadb(question, top_k=3)
    
    # Combine relevant chunks into a context string
    context = "\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in relevant_chunks])
    
    # Create the prompt for the model
    prompt = f"Answer the following question based on the context provided.\n\nContext:\n{context}\n\nQuestion: {question}"
    
    # Get the response from the LLaMA model
    response = llm([HumanMessage(content=prompt)])
    
    # Generate the references for the most relevant chunks
    references = "\n\nFor better understanding, you can refer to the following sections:\n"
    for chunk in relevant_chunks:
        references += f"- Page {chunk['page']}:\n  \"{chunk['text']}\"\n"
    
    # Combine the model's response with the references
    full_response = f"{response.content}\n\n{references}"
    return full_response



In [35]:
import gradio as gr

def chat_with_book(question):
    """
    Gradio interface function to answer questions about the book content.
    
    Args:
    - question (str): The user's input question.
    
    Returns:
    - str: The answer from the LLaMA model along with references.
    """
    return ask_book_question_with_references(question)

# Create Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="What is the project risk management process?"),
    outputs=gr.Textbox(label="Answer with References"),
    title="Book Question Answering",
    description="Ask any question about the book, and the system will retrieve relevant content and answer using the LLaMA model."
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7868

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [36]:
def ask_book_question_with_references(question):
    """
    Queries the book content and uses the LLaMA model to answer based on the retrieved chunks.
    Includes the most relevant chunk and its page for better understanding.

    Args:
    - question (str): The user's question.

    Returns:
    - str: A structured response with the answer and references.
    """
    # Query ChromaDB for relevant chunks
    relevant_chunks = query_chromadb(question, top_k=3)
    
    # Combine relevant chunks into a context string
    context = "\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in relevant_chunks])
    
    # Create the prompt for the model
    prompt = f"Answer the following question based on the context provided.\n\nContext:\n{context}\n\nQuestion: {question}"
    
    # Get the response from the LLaMA model
    response = llm([HumanMessage(content=prompt)])
    
    # Generate the references for the most relevant chunks
    references = "\n".join([f"- Page {chunk['page']}:\n  \"{chunk['text']}\"" for chunk in relevant_chunks])
    
    # Structure the final response
    structured_response = f"### Answer:\n{response.content}\n\n### For Better Understanding, Refer to:\n{references}"
    return structured_response


In [37]:
import gradio as gr

def chat_with_book(question):
    """
    Gradio interface function to answer questions about the book content.
    """
    return ask_book_question_with_references(question)

# Create Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="What is the project risk management process?"),
    outputs=gr.Textbox(label="Answer with References", lines=15),
    title="Book Question Answering",
    description="Ask any question about the book, and the system will retrieve relevant content and answer using the LLaMA model."
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7869

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [38]:
import gradio as gr

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    
    Args:
    - question (str): The user's query.
    
    Returns:
    - tuple: (Answer, References)
    """
    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create the Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="Type your question here and press Enter...", lines=1),
    outputs=[
        gr.Textbox(label="Answer", lines=10),
        gr.Textbox(label="References", lines=10)
    ],
    title="Book Question Answering",
    description="Ask a question about the book, and the system will retrieve relevant content and provide an answer along with references.",
    live=True  # Makes it behave like pressing Enter submits the query
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7870

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [39]:
import gradio as gr

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    
    Args:
    - question (str): The user's query.
    
    Returns:
    - tuple: (Answer, References)
    """
    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create the Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="Type your question here and press Enter...", lines=1),
    outputs=[
        gr.Textbox(label="Answer", lines=10),
        gr.Textbox(label="References", lines=10)
    ],
    title="Book Question Answering",
    description="Ask a question about the book, and the system will retrieve relevant content and provide an answer along with references.",
    live=True  # Makes it behave like pressing Enter submits the query
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7871

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [40]:
import gradio as gr

# Global variables
page_text_dict = None
cleaned_text_dict = None
chunks = None

def upload_book(file):
    """
    Uploads the book (PDF) and processes it for chunking and querying.
    """
    global page_text_dict, cleaned_text_dict, chunks
    if file is None:
        return "No file uploaded!"
    
    # Save file path and process the PDF
    pdf_path = file.name
    page_text_dict = extract_text_from_pdf(pdf_path)
    if isinstance(page_text_dict, str):  # Check if error occurred
        return page_text_dict

    # Clean and chunk the text
    cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)
    chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

    # Add chunks to ChromaDB
    add_chunks_to_chromadb(chunks)
    return f"Book '{file.name}' uploaded and processed successfully!"

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    """
    if chunks is None:
        return "No book has been uploaded yet. Please upload a book first.", "No references available."

    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create the Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# 📚 Book Question Answering System")
    gr.Markdown("1. **Upload a book (PDF) to process.**\n2. **Ask any question about the book and get relevant answers with references.**")
    
    with gr.Tab("Upload Book"):
        with gr.Row():
            file_input = gr.File(label="Upload Book (PDF)", file_types=[".pdf"])
            upload_status = gr.Textbox(label="Upload Status", interactive=False)
        file_input.change(fn=upload_book, inputs=file_input, outputs=upload_status)
    
    with gr.Tab("Ask Question"):
        with gr.Row():
            question_input = gr.Textbox(label="Ask a Question About the Book", placeholder="Type your question here...")
            answer_output = gr.Textbox(label="Answer", interactive=False)
            references_output = gr.Textbox(label="References", interactive=False)
        question_input.submit(fn=chat_with_book, inputs=question_input, outputs=[answer_output, references_output])

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7872

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [41]:
import gradio as gr

# Global variables
page_text_dict = None
cleaned_text_dict = None
chunks = None

def upload_book(file):
    """
    Uploads the book (PDF) and processes it for chunking and querying.
    """
    global page_text_dict, cleaned_text_dict, chunks
    if file is None:
        return "No file uploaded!"
    
    # Save file path and process the PDF
    pdf_path = file.name
    page_text_dict = extract_text_with_pymupdf(pdf_path)
    if isinstance(page_text_dict, str):  # Check if error occurred
        return page_text_dict

    # Clean and chunk the text
    cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)
    chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

    # Add chunks to ChromaDB
    add_chunks_to_chromadb(chunks)
    return f"Book '{file.name}' uploaded and processed successfully!"

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    """
    if chunks is None:
        return "No book has been uploaded yet. Please upload a book first.", "No references available."

    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# 📚 Book Question Answering System")
    gr.Markdown("1. **Upload a book (PDF) to process.**\n2. **Ask any question about the book and get relevant answers with references.**")
    
    # Upload Section
    with gr.Row():
        file_input = gr.File(label="Upload Book (PDF)", file_types=[".pdf"])
        upload_status = gr.Textbox(label="Upload Status", interactive=False)
    file_input.change(fn=upload_book, inputs=file_input, outputs=upload_status)

    # Question Section
    gr.Markdown("### Ask a Question About the Book")
    with gr.Row():
        question_input = gr.Textbox(label="Type your question here...", placeholder="What is the project risk management process?")
    with gr.Row():
        answer_output = gr.Textbox(label="Answer", interactive=False)
        references_output = gr.Textbox(label="References", interactive=False)
    question_input.submit(fn=chat_with_book, inputs=question_input, outputs=[answer_output, references_output])

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7873

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [42]:
import gradio as gr
import fitz  # PyMuPDF

# Global variables
page_text_dict = None
cleaned_text_dict = None
chunks = None

def extract_text_with_pymupdf(pdf_path):
    """
    Extract text from a PDF using PyMuPDF.
    
    Args:
    - pdf_path (str): The file path to the PDF.

    Returns:
    - dict: A dictionary with page numbers as keys and text content as values.
    """
    try:
        doc = fitz.open(pdf_path)
        page_text_dict = {}
        for i, page in enumerate(doc):
            page_text_dict[f"Page {i+1}"] = page.get_text("text") or "No text found on this page."
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_book(file):
    """
    Uploads the book (PDF) and processes it for chunking and querying.
    """
    global page_text_dict, cleaned_text_dict, chunks
    if file is None:
        return "No file uploaded!"
    
    # Save file path and process the PDF
    pdf_path = file.name
    page_text_dict = extract_text_with_pymupdf(pdf_path)
    if isinstance(page_text_dict, str):  # Check if error occurred
        return page_text_dict

    # Clean and chunk the text
    cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)
    chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

    # Add chunks to ChromaDB
    add_chunks_to_chromadb(chunks)
    return f"Book '{file.name}' uploaded and processed successfully!"

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    """
    if chunks is None:
        return "No book has been uploaded yet. Please upload a book first.", "No references available."

    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# 📚 Book Question Answering System")
    gr.Markdown("1. **Upload a book (PDF) to process.**\n2. **Ask any question about the book and get relevant answers with references.**")
    
    # Upload Section
    with gr.Row():
        file_input = gr.File(label="Upload Book (PDF)", file_types=[".pdf"])
        upload_status = gr.Textbox(label="Upload Status", interactive=False)
    file_input.change(fn=upload_book, inputs=file_input, outputs=upload_status)

    # Question Section
    gr.Markdown("### Ask a Question About the Book")
    with gr.Row():
        question_input = gr.Textbox(label="Type your question here...", placeholder="What is the project risk management process?")
    with gr.Row():
        answer_output = gr.Textbox(label="Answer", interactive=False)
        references_output = gr.Textbox(label="References", interactive=False)
    question_input.submit(fn=chat_with_book, inputs=question_input, outputs=[answer_output, references_output])

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7874

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [43]:
# Example Usage
question = "What is the project risk management process?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)


  response = llm([HumanMessage(content=prompt)])


MuPDF error: format error: object out of range (840 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: format error: object out of range (842 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: format error: object out of range (842 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: format error: object out of range (842 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: format error: object out of range (842 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: format error: object out of range (844 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: format error: object out of range (846 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: format error: object out of range (846 0 R); xref size 767

MuPDF error: syntax error: unknown cid font type

MuPDF error: for

In [44]:
# Example Usage
question = "What is risk management ?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)

Answer from the Book:
### Answer:
Based on the context, I would answer:

Risk Management is a systematic process that identifies, assesses, and prioritizes potential risks to an organization's goals and objectives. It involves taking proactive steps to mitigate or eliminate these risks before they have a negative impact on the organization. The goal of risk management is to maximize opportunities and minimize threats to ensure the success of the organization.

### For Better Understanding, Refer to:



In [45]:
# Example Usage
question = "What is risk management porcess ?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)

Answer from the Book:
### Answer:
Based on the context, I would answer:

The Risk Management Process is a systematic approach to identify, assess, prioritize, and mitigate or manage potential risks that could impact an organization's objectives. It involves several steps, including:

1. **Risk Identification**: Identifying potential risks that could affect the organization.
2. **Risk Assessment**: Evaluating the likelihood and potential impact of each identified risk.
3. **Risk Prioritization**: Prioritizing risks based on their likelihood and potential impact.
4. **Risk Mitigation**: Developing and implementing strategies to reduce or eliminate the identified risks.
5. **Risk Monitoring**: Continuously monitoring and reviewing the effectiveness of the risk management process.

The goal of risk management is to minimize the likelihood and impact of adverse events, while also identifying opportunities for growth and improvement.

### For Better Understanding, Refer to:



In [46]:
# Example Usage
question = "Who is superman ?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)

Answer from the Book:
### Answer:
Based on the context, Superman is a fictional character known for his incredible strength, speed, and ability to fly. He is a superhero from the DC Comics universe and has been featured in various comic books, movies, TV shows, and other media.

### For Better Understanding, Refer to:

