In [4]:
from spellchecker import SpellChecker
import re
import pdfplumber

In [5]:
# Initialize the spell checker
spell = SpellChecker()
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using pdfplumber.
    
    Args:
    - pdf_path (str): Path to the PDF file.
    
    Returns:
    - dict: Dictionary with page numbers as keys and their corresponding text as values.
    """
    page_text_dict = {}
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text_dict[page_num] = page.extract_text()
    return page_text_dict

def correct_splits_and_typos(text):
    """
    Corrects split words and common typos using a spell checker.
    
    Args:
    - text (str): The text to be corrected.
    
    Returns:
    - str: Corrected text.
    """
    # Fix split words
    def merge_words(match):
        word1, word2 = match.groups()
        combined = word1 + word2
        if combined.lower() in spell:  # Check if the combined word is valid
            return combined
        return f"{word1} {word2}"  # Keep as is if not valid

    text = re.sub(r'\b(\w{2,})\s+(\w{1,4})\b', merge_words, text)

    # Spell-check and correct typos
    corrected_words = []
    for word in text.split():
        corrected_words.append(spell.correction(word) or word)
    return ' '.join(corrected_words)

def clean_and_correct_text_with_spellcheck(page_text_dict):
    """
    Cleans and corrects the extracted text by removing extra spaces, newlines, 
    and dynamically fixing split words and typos.
    
    Args:
    - page_text_dict (dict): Dictionary with page numbers as keys and text as values.
    
    Returns:
    - dict: Cleaned and corrected text for each page.
    """
    cleaned_text_dict = {}
    for page_num, text in page_text_dict.items():
        if text:  # Check if the page has text
            # Remove extra spaces and newlines
            text = re.sub(r'\s+', ' ', text.strip())
            # Dynamically correct split words and typos
            text = correct_splits_and_typos(text)
            cleaned_text_dict[page_num] = text
    return cleaned_text_dict


In [6]:
def chunk_text_by_phrase(clean_text_dict, chunk_size=300):
    """
    Chunks cleaned text by phrases, keeping track of the page numbers.
    
    Args:
    - clean_text_dict (dict): Dictionary with page numbers as keys and cleaned text as values.
    - chunk_size (int): Approximate size of each chunk in characters.
    
    Returns:
    - list of dict: List of dictionaries, where each dictionary represents a chunk with text and page number.
    """
    chunks = []
    for page_num, text in clean_text_dict.items():
        # Split into phrases by punctuation
        phrases = re.split(r'([.!?])', text)  # Keep punctuation as part of the phrase
        
        chunk = ""
        for phrase in phrases:
            if len(chunk) + len(phrase) <= chunk_size:
                chunk += phrase
            else:
                if chunk.strip():
                    chunks.append({"page": page_num, "text": chunk.strip()})
                chunk = phrase
        if chunk.strip():
            chunks.append({"page": page_num, "text": chunk.strip()})
    return chunks

In [7]:
# Example usage
pdf_path = "practice-standard-project-risk-management.pdf"  # Replace with your PDF path

In [7]:
import gradio as gr
from PyPDF2 import PdfReader

def extract_text_from_pdf(uploaded_file):
    """
    Extract text from a PDF file using PyPDF2.
    """
    try:
        reader = PdfReader(uploaded_file.name)
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text()
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

# Global variable to store the extracted text
extracted_text_var = None

def process_pdf(file):
    global extracted_text_var
    if file is None:
        extracted_text_var = None
        return "No file uploaded!"
    extracted_text = extract_text_from_pdf(file)
    if isinstance(extracted_text, dict):
        # Format extracted text nicely for display
        formatted_text = "\n\n".join([f"{key}:\n{value}" for key, value in extracted_text.items()])
        extracted_text_var = extracted_text  # Save the raw extracted text to the global variable
        return formatted_text or "No text found in the PDF."
    extracted_text_var = extracted_text  # Save the raw error or result
    return extracted_text


# Create a Gradio interface
interface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extracted Text", lines=20)
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7863

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [44]:
# After running the Gradio interface and uploading the file
print(extracted_text_var)  # This will contain the raw extracted text as a dictionary


{'Page 1': 'PFE\nEDITION 2025\nSTART YOUR CAREER WITH VISION AND PASSION BOOK', 'Page 2': 'S O M M A I R E\n03\n04\n06\n07-21\n22\n23\n24A propos de nous\nNotre Mission\nNos chiffres clés\nNos Sujets PFE\nComment postuler ?\nNos anciens stagiaires temoignent\nContactez-nous', 'Page 3': '3\nA PROPOS\nDE NOUS\nEXCELLIA Solutions est une entreprise tunisienne qui\ninvestie dans l’innovation grâce à un capital social solide de\n8,5 millions de dinars. \nGrâce à cette expertise et ces alliances stratégiques,  \nEXCELLIA Solutions conçoit et intègre des solutions de\npointe, adaptées aux besoins des entreprise modernes.En tant que filiale d’un groupe diversifié opérant dans\nplusieurs secteurs financiers (banque, assurance,\nmicrocrédits, paiements, gestion d’actifs, etc.) et partenaire\nstratégique de Microsoft et d’Oracle, EXCELLIA Solutions\nbénéficie d’un écosystème robuste et intégré.', 'Page 4': "Accompagner nos clients dans la concrétisation de\nleurs ambitions digitales. Grâce à l'ex

In [40]:
page_text_dict

{1: 'Project Management Institute\nPRACTICE STANDARD\nFOR PROJECT RISK MANAGEMENT',
 2: 'ISBN: 978-1-933890-38-8\nPublished by:\nProject Management Institute, Inc.\n14 Campus Boulevard\nNewtown Square, Pennsylvania 19073-3299 USA.\nPhone: +610-356-4600\nFax: +610-356-4647\nE-mail: customercare@pmi.org\nInternet: www.pmi.org\n©2009 Project Management Institute, Inc. All rights reserved.\n“PMI”, the PMI logo, “PMP”, the PMP logo, “PMBOK”, “PgMP”, “Project Management Journal”, “PM Network”, and the PMI\nToday logo are registered marks of Project Management Institute, Inc. The Quarter Globe Design is a trademark of the Project\nManagement Institute, Inc. For a comprehensive list of PMI marks, contact the PMI Legal Department.\nPMI Publications welcomes corrections and comments on its books. Please feel free to send comments on typographical,\nformatting, or other errors. Simply make a copy of the relevant page of the book, mark the error, and send it to: Book Editor,\nPMI Publications, 14 

In [46]:
import gradio as gr

# Define a global variable for the file path
pdf_path = None

def upload_pdf(file):
    global pdf_path
    if file is None:
        pdf_path = None
        return "No file uploaded!"
    # Set the global variable to the file's path
    pdf_path = file.name
    return f"File {file.name} uploaded successfully. You can now proceed with the notebook."

# Create the Gradio interface for uploading the file
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7880

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [49]:
import gradio as gr

# Define a global variable for the file path
pdf_path = None

def upload_pdf(file):
    global pdf_path
    if file is None:
        pdf_path = None
        return "No file uploaded!"
    # Set the global variable to the file's path
    pdf_path = file.name
    return f"File {file.name} uploaded successfully. You can now proceed with the notebook."

# Create the Gradio interface for uploading the file
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7881

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [52]:
import gradio as gr
from PyPDF2 import PdfReader

# Global variable to store the PDF file path
pdf_path = None

def extract_text_from_pdf(pdf_file_path):
    """
    Extract text from a PDF file using its file path.
    """
    try:
        # Use PdfReader with the file path
        reader = PdfReader(pdf_file_path)
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text()
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_pdf(file):
    """
    Gradio function to update the global `pdf_path`.
    """
    global pdf_path
    if file is None:
        pdf_path = None
        return "No file uploaded!"
    pdf_path = file.name  # Update the global variable with the file path
    return f"File {file.name} uploaded successfully. You can now proceed with the notebook."

# Create the Gradio interface for uploading the file
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the Gradio interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7882

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [57]:
pdf_path

'C:\\Users\\msalm\\AppData\\Local\\Temp\\gradio\\3dd7cd7c5467cff8fa844ff31d2cb8199c20697fef5ebceaefd025dcebc5c45b\\practice-standard-project-risk-management.pdf'

In [58]:
# Step 1: Extract text
page_text_dict = extract_text_from_pdf(f"pdf_path")


In [60]:
import gradio as gr
from PyPDF2 import PdfReader

# Global variables
pdf_path = None
page_text_dict = None  # This will store the extracted text

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file using its path.
    """
    try:
        reader = PdfReader(pdf_path)
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text()
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_pdf(file):
    """
    Gradio upload function to update the global `pdf_path` and extract text.
    """
    global pdf_path, page_text_dict
    if file is None:
        pdf_path = None
        page_text_dict = None
        return "No file uploaded!"
    
    # Update the global variable for the file path
    pdf_path = file.name

    # Extract text immediately and store it in `page_text_dict`
    page_text_dict = extract_text_from_pdf(pdf_path)

    return f"File {file.name} uploaded and text extracted successfully!"

# Create the Gradio interface
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the Gradio interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7883

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [61]:
print(page_text_dict)


An error occurred while extracting text: PyCryptodome is required for AES algorithm


In [8]:
import gradio as gr
from PyPDF2 import PdfReader

# Global variables
pdf_path = None
page_text_dict = None

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file, including handling AES-encrypted PDFs.
    """
    try:
        reader = PdfReader(pdf_path)

        # Check if the PDF is encrypted
        if reader.is_encrypted:
            print("The PDF is encrypted. Attempting to decrypt...")
            try:
                # Attempt to decrypt with an empty password
                reader.decrypt("")
                print("Decryption successful!")
            except Exception as e:
                return f"Failed to decrypt PDF. Reason: {e}"

        # Extract text from each page
        page_text_dict = {}
        for i, page in enumerate(reader.pages):
            page_text_dict[f"Page {i+1}"] = page.extract_text() or "No text found on this page."
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_pdf(file):
    """
    Gradio function to handle file upload and extract text.
    """
    global pdf_path, page_text_dict
    if file is None:
        pdf_path = None
        page_text_dict = None
        return "No file uploaded!"
    
    # Save the file path
    pdf_path = file.name
    
    # Extract text from the file
    page_text_dict = extract_text_from_pdf(pdf_path)
    
    return f"File {file.name} uploaded and text extracted successfully!"

# Create the Gradio interface
interface = gr.Interface(
    fn=upload_pdf,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Status")
)

# Launch the Gradio interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7864

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [9]:
import fitz  # PyMuPDF

def extract_text_with_pymupdf(pdf_path):
    """
    Extract text from a PDF using PyMuPDF.
    """
    try:
        doc = fitz.open(pdf_path)
        page_text_dict = {}
        for i, page in enumerate(doc):
            page_text_dict[f"Page {i+1}"] = page.get_text() or "No text found on this page."
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"


In [10]:
page_text_dict = extract_text_with_pymupdf(pdf_path)

In [11]:
page_text_dict

{}

In [82]:
# Step 2: Clean and correct text with spell check
cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)

In [83]:
# Step 3: Chunk text
chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

In [12]:

# Output the chunks
for i, chunk in enumerate(chunks):
    print(f"Chunk {i + 1} (Page {chunk['page']}):")
    print(chunk["text"])
    print("-" * 80)

# Save chunks to a file (optional)
with open("chunked_output.txt", "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(f"Page {chunk['page']}:\n{chunk['text']}\n")
        f.write("-" * 80 + "\n")


TypeError: 'NoneType' object is not iterable

In [13]:
from langchain_community.chat_models.ollama import ChatOllama
from langchain.schema import HumanMessage
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
import re

In [14]:
# Step 1: Initialize the ChatOllama Model
local_model = "llama3:8b"
llm = ChatOllama(model=local_model)

  llm = ChatOllama(model=local_model)


In [15]:
# Step 2: Initialize the Vector Database with Chroma
persist_directory = "./chromadb_store"
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Use SentenceTransformers for embeddings
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Use SentenceTransformers for embeddings
  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)


In [16]:
# Step 3: Add Chunks to ChromaDB
def add_chunks_to_chromadb(chunks):
    """
    Adds text chunks with metadata (page numbers) to ChromaDB.

    Args:
    - chunks (list of dict): List of chunks with 'page' and 'text' keys.
    """
    documents = [
        Document(
            page_content=chunk["text"],
            metadata={"page": chunk["page"]}
        ) for chunk in chunks
    ]
    vectorstore.add_documents(documents)

In [None]:
# Step 4: Query ChromaDB
def query_chromadb(query, top_k=3):
    """
    Queries ChromaDB for the most relevant chunks based on a query.

    Args:
    - query (str): The user's query.
    - top_k (int): Number of most relevant results to retrieve.

    Returns:
    - list of dict: Relevant chunks with metadata.
    """
    results = vectorstore.similarity_search_with_score(query, k=top_k)
    return [{"text": result[0].page_content, "page": result[0].metadata["page"], "score": result[1]} for result in results]

  llm = ChatOllama(model=local_model)
  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Use SentenceTransformers for embeddings
  from tqdm.autonotebook import tqdm, trange
  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)


In [17]:
def ask_book_question_with_references(question):
    """
    Queries the book content and uses the LLaMA model to answer based on the retrieved chunks.
    Includes the most relevant chunk and its page for better understanding.

    Args:
    - question (str): The user's question.

    Returns:
    - str: The model's response along with the relevant chunks and pages for further reading.
    """
    # Query ChromaDB for relevant chunks
    relevant_chunks = query_chromadb(question, top_k=3)
    
    # Combine relevant chunks into a context string
    context = "\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in relevant_chunks])
    
    # Create the prompt for the model
    prompt = f"Answer the following question based on the context provided.\n\nContext:\n{context}\n\nQuestion: {question}"
    
    # Get the response from the LLaMA model
    response = llm([HumanMessage(content=prompt)])
    
    # Generate the references for the most relevant chunks
    references = "\n\nFor better understanding, you can refer to the following sections:\n"
    for chunk in relevant_chunks:
        references += f"- Page {chunk['page']}:\n  \"{chunk['text']}\"\n"
    
    # Combine the model's response with the references
    full_response = f"{response.content}\n\n{references}"
    return full_response



In [88]:
import gradio as gr

def chat_with_book(question):
    """
    Gradio interface function to answer questions about the book content.
    
    Args:
    - question (str): The user's input question.
    
    Returns:
    - str: The answer from the LLaMA model along with references.
    """
    return ask_book_question_with_references(question)

# Create Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="What is the project risk management process?"),
    outputs=gr.Textbox(label="Answer with References"),
    title="Book Question Answering",
    description="Ask any question about the book, and the system will retrieve relevant content and answer using the LLaMA model."
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7887

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [89]:
def ask_book_question_with_references(question):
    """
    Queries the book content and uses the LLaMA model to answer based on the retrieved chunks.
    Includes the most relevant chunk and its page for better understanding.

    Args:
    - question (str): The user's question.

    Returns:
    - str: A structured response with the answer and references.
    """
    # Query ChromaDB for relevant chunks
    relevant_chunks = query_chromadb(question, top_k=3)
    
    # Combine relevant chunks into a context string
    context = "\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in relevant_chunks])
    
    # Create the prompt for the model
    prompt = f"Answer the following question based on the context provided.\n\nContext:\n{context}\n\nQuestion: {question}"
    
    # Get the response from the LLaMA model
    response = llm([HumanMessage(content=prompt)])
    
    # Generate the references for the most relevant chunks
    references = "\n".join([f"- Page {chunk['page']}:\n  \"{chunk['text']}\"" for chunk in relevant_chunks])
    
    # Structure the final response
    structured_response = f"### Answer:\n{response.content}\n\n### For Better Understanding, Refer to:\n{references}"
    return structured_response


In [90]:
import gradio as gr

def chat_with_book(question):
    """
    Gradio interface function to answer questions about the book content.
    """
    return ask_book_question_with_references(question)

# Create Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="What is the project risk management process?"),
    outputs=gr.Textbox(label="Answer with References", lines=15),
    title="Book Question Answering",
    description="Ask any question about the book, and the system will retrieve relevant content and answer using the LLaMA model."
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7888

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [91]:
import gradio as gr

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    
    Args:
    - question (str): The user's query.
    
    Returns:
    - tuple: (Answer, References)
    """
    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create the Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="Type your question here and press Enter...", lines=1),
    outputs=[
        gr.Textbox(label="Answer", lines=10),
        gr.Textbox(label="References", lines=10)
    ],
    title="Book Question Answering",
    description="Ask a question about the book, and the system will retrieve relevant content and provide an answer along with references.",
    live=True  # Makes it behave like pressing Enter submits the query
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7889

Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [92]:
import gradio as gr

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    
    Args:
    - question (str): The user's query.
    
    Returns:
    - tuple: (Answer, References)
    """
    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create the Gradio interface
interface = gr.Interface(
    fn=chat_with_book,
    inputs=gr.Textbox(label="Ask a Question About the Book", placeholder="Type your question here and press Enter...", lines=1),
    outputs=[
        gr.Textbox(label="Answer", lines=10),
        gr.Textbox(label="References", lines=10)
    ],
    title="Book Question Answering",
    description="Ask a question about the book, and the system will retrieve relevant content and provide an answer along with references.",
    live=True  # Makes it behave like pressing Enter submits the query
)

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7890

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [95]:
import gradio as gr

# Global variables
page_text_dict = None
cleaned_text_dict = None
chunks = None

def upload_book(file):
    """
    Uploads the book (PDF) and processes it for chunking and querying.
    """
    global page_text_dict, cleaned_text_dict, chunks
    if file is None:
        return "No file uploaded!"
    
    # Save file path and process the PDF
    pdf_path = file.name
    page_text_dict = extract_text_from_pdf(pdf_path)
    if isinstance(page_text_dict, str):  # Check if error occurred
        return page_text_dict

    # Clean and chunk the text
    cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)
    chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

    # Add chunks to ChromaDB
    add_chunks_to_chromadb(chunks)
    return f"Book '{file.name}' uploaded and processed successfully!"

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    """
    if chunks is None:
        return "No book has been uploaded yet. Please upload a book first.", "No references available."

    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create the Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# 📚 Book Question Answering System")
    gr.Markdown("1. **Upload a book (PDF) to process.**\n2. **Ask any question about the book and get relevant answers with references.**")
    
    with gr.Tab("Upload Book"):
        with gr.Row():
            file_input = gr.File(label="Upload Book (PDF)", file_types=[".pdf"])
            upload_status = gr.Textbox(label="Upload Status", interactive=False)
        file_input.change(fn=upload_book, inputs=file_input, outputs=upload_status)
    
    with gr.Tab("Ask Question"):
        with gr.Row():
            question_input = gr.Textbox(label="Ask a Question About the Book", placeholder="Type your question here...")
            answer_output = gr.Textbox(label="Answer", interactive=False)
            references_output = gr.Textbox(label="References", interactive=False)
        question_input.submit(fn=chat_with_book, inputs=question_input, outputs=[answer_output, references_output])

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7891

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [2]:
import gradio as gr

# Global variables
page_text_dict = None
cleaned_text_dict = None
chunks = None

def upload_book(file):
    """
    Uploads the book (PDF) and processes it for chunking and querying.
    """
    global page_text_dict, cleaned_text_dict, chunks
    if file is None:
        return "No file uploaded!"
    
    # Save file path and process the PDF
    pdf_path = file.name
    page_text_dict = extract_text_with_pymupdf(pdf_path)
    if isinstance(page_text_dict, str):  # Check if error occurred
        return page_text_dict

    # Clean and chunk the text
    cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)
    chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

    # Add chunks to ChromaDB
    add_chunks_to_chromadb(chunks)
    return f"Book '{file.name}' uploaded and processed successfully!"

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    """
    if chunks is None:
        return "No book has been uploaded yet. Please upload a book first.", "No references available."

    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# 📚 Book Question Answering System")
    gr.Markdown("1. **Upload a book (PDF) to process.**\n2. **Ask any question about the book and get relevant answers with references.**")
    
    # Upload Section
    with gr.Row():
        file_input = gr.File(label="Upload Book (PDF)", file_types=[".pdf"])
        upload_status = gr.Textbox(label="Upload Status", interactive=False)
    file_input.change(fn=upload_book, inputs=file_input, outputs=upload_status)

    # Question Section
    gr.Markdown("### Ask a Question About the Book")
    with gr.Row():
        question_input = gr.Textbox(label="Type your question here...", placeholder="What is the project risk management process?")
    with gr.Row():
        answer_output = gr.Textbox(label="Answer", interactive=False)
        references_output = gr.Textbox(label="References", interactive=False)
    question_input.submit(fn=chat_with_book, inputs=question_input, outputs=[answer_output, references_output])

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




Traceback (most recent call last):
  File "c:\Users\msalm\anaconda3\envs\ollama_local_2\lib\site-packages\gradio\queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "c:\Users\msalm\anaconda3\envs\ollama_local_2\lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "c:\Users\msalm\anaconda3\envs\ollama_local_2\lib\site-packages\gradio\blocks.py", line 1935, in process_api
    result = await self.call_function(
  File "c:\Users\msalm\anaconda3\envs\ollama_local_2\lib\site-packages\gradio\blocks.py", line 1520, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "c:\Users\msalm\anaconda3\envs\ollama_local_2\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "c:\Users\msalm\anaconda3\envs\ollama_local_2\lib\site-packages\anyio\_backends\_asyncio.py", line

In [18]:
import gradio as gr
import fitz  # PyMuPDF

# Global variables
page_text_dict = None
cleaned_text_dict = None
chunks = None

def extract_text_with_pymupdf(pdf_path):
    """
    Extract text from a PDF using PyMuPDF.
    
    Args:
    - pdf_path (str): The file path to the PDF.

    Returns:
    - dict: A dictionary with page numbers as keys and text content as values.
    """
    try:
        doc = fitz.open(pdf_path)
        page_text_dict = {}
        for i, page in enumerate(doc):
            page_text_dict[f"Page {i+1}"] = page.get_text("text") or "No text found on this page."
        return page_text_dict
    except Exception as e:
        return f"An error occurred while extracting text: {e}"

def upload_book(file):
    """
    Uploads the book (PDF) and processes it for chunking and querying.
    """
    global page_text_dict, cleaned_text_dict, chunks
    if file is None:
        return "No file uploaded!"
    
    # Save file path and process the PDF
    pdf_path = file.name
    page_text_dict = extract_text_with_pymupdf(pdf_path)
    if isinstance(page_text_dict, str):  # Check if error occurred
        return page_text_dict

    # Clean and chunk the text
    cleaned_text_dict = clean_and_correct_text_with_spellcheck(page_text_dict)
    chunks = chunk_text_by_phrase(cleaned_text_dict, chunk_size=300)

    # Add chunks to ChromaDB
    add_chunks_to_chromadb(chunks)
    return f"Book '{file.name}' uploaded and processed successfully!"

def chat_with_book(question):
    """
    Handles user input and returns the answer and references as separate outputs.
    """
    if chunks is None:
        return "No book has been uploaded yet. Please upload a book first.", "No references available."

    response = ask_book_question_with_references(question)
    
    # Split the response into Answer and References
    sections = response.split("### For Better Understanding, Refer to:")
    answer = sections[0].strip("### Answer:\n").strip()
    references = sections[1].strip() if len(sections) > 1 else "No references found."
    
    return answer, references

# Create Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# 📚 Book Question Answering System")
    gr.Markdown("1. **Upload a book (PDF) to process.**\n2. **Ask any question about the book and get relevant answers with references.**")
    
    # Upload Section
    with gr.Row():
        file_input = gr.File(label="Upload Book (PDF)", file_types=[".pdf"])
        upload_status = gr.Textbox(label="Upload Status", interactive=False)
    file_input.change(fn=upload_book, inputs=file_input, outputs=upload_status)

    # Question Section
    gr.Markdown("### Ask a Question About the Book")
    with gr.Row():
        question_input = gr.Textbox(label="Type your question here...", placeholder="What is the project risk management process?")
    with gr.Row():
        answer_output = gr.Textbox(label="Answer", interactive=False)
        references_output = gr.Textbox(label="References", interactive=False)
    question_input.submit(fn=chat_with_book, inputs=question_input, outputs=[answer_output, references_output])

# Launch the interface
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7865

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [None]:
# Example Usage
question = "What is the project risk management process?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)


In [19]:
# Example Usage
question = "What is risk management ?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)

Answer from the Book:
According to the context, risk management is a discipline that contains a series of processes to apply to both large and small projects. It is also described as being more effective if its practice is tailored to the project and congruent with the organizational culture, processes, and assets.



For better understanding, you can refer to the following sections:
- Page 15:
  "Different projects organizations and situations will require different approaches to Project Risk management In particular risk management is a discipline that contains a series of processes to apply to both large and small projects Risk management will be more effective if its practice is tailored to the project and congruent with the organizational culture processes and assets There are many different ways of conducting risk management that may comply with the principles of Project Risk Management as presented in this practice standard 3"
- Page 15:
  "Different projects organizations and s

In [20]:
# Example Usage
question = "What is risk management porcess ?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)

Answer from the Book:
Based on the context provided, it appears that the question is asking about "project risk management processes". However, I assume you meant to ask what "risk management process" is.

According to general knowledge and project management principles, Risk Management Process refers to a systematic approach to identify, assess, prioritize, mitigate, and monitor risks associated with a project or organization. This process helps to minimize the impact of potential threats on project goals, objectives, and outcomes.



For better understanding, you can refer to the following sections:
- Page 25:
  "3 CHAPTER 3 INTRODUCTION TO PROJECT RISK MANAGEMENT PROCESSES 3."
- Page 25:
  "3 CHAPTER 3 INTRODUCTION TO PROJECT RISK MANAGEMENT PROCESSES 3."
- Page 26:
  "3 CHAPTER 3 i INTRODUCTION TO PROJECT RISK MANAGEMENT PROCESSES 3."



In [21]:
# Example Usage
question = "Who is superman ?"
answer_with_references = ask_book_question_with_references(question)
print("Answer from the Book:")
print(answer_with_references)

Answer from the Book:
I apologize, but there is no information about Superman in the provided context. The pages seem to be blank or contain nothing relevant to the question. Therefore, I cannot provide an answer to this question based on the given context.



For better understanding, you can refer to the following sections:
- Page 124:
  ".g."
- Page 115:
  "."
- Page 122:
  "."

