ChatPDF is an interactive application built with LangChain and Google's FLAN-T5 XXL model. Users can upload PDF documents, which are processed and used as a knowledge base. The system uses PyMuPDF for document loading, HuggingFace embeddings, LangChain's RecursiveCharacterTextSplitter for text chunking, Chroma for vector storage, and HuggingFaceHub for large language model integration. Users can query the document for summaries, explanations, and more, enabling advanced document interaction and understanding.

Technologies used: LangChain, Google's FLAN-T5 XXL, PyMuPDF, HuggingFaceHub, Chroma, Gradio.

In [None]:
!pip install gradio

In [None]:
!pip install langchain

In [None]:
!pip install -U langchain-community

In [None]:
!pip install pymupdf

In [None]:
!pip install sentence-transformers

In [None]:
!pip install chromadb

In [None]:
!pip install unstructured
!pip install unstructured[local-inference]

In [5]:
import os
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyMuPDFLoader

# Publish the Hugging Face API token through the process environment
# (presumably where the HuggingFaceHub LLM wrapper looks it up — confirm).
# NOTE(review): `my_token` is not defined in this cell; it has to be assigned
# earlier in the notebook, otherwise this line raises NameError.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = my_token

# Module-level slot for the RetrievalQA chain: load_doc() assigns it and
# answer_query() reads it; None means no document has been loaded yet.
chain = None

# Function to load and process the PDF document
def load_doc(pdf_doc):
    """Index an uploaded PDF and build the question-answering chain for it.

    The PDF is loaded with PyMuPDFLoader, split into overlapping chunks,
    embedded into a Chroma vector store, and wired to the FLAN-T5 XXL model
    through a RetrievalQA chain. On success the chain is stored in the
    module-level ``chain`` variable.

    Args:
        pdf_doc: The uploaded file, either a filesystem path string or a
            file-like object exposing the path as ``.name``. ``None`` means
            nothing was uploaded.

    Returns:
        A human-readable status string. Failures are reported in the returned
        string rather than raised, so the message can be shown in the UI.
    """
    import uuid  # local import: only needed to name the per-upload collection

    global chain   # Rebound below once the new chain has been built

    if pdf_doc is None:  # Checks if a file is uploaded
        return "No file uploaded."

    try:
        # Accept both a plain path string and an object carrying the path in
        # its .name attribute, so either form of upload value works.
        pdf_path = getattr(pdf_doc, "name", pdf_doc)

        # Loads the PDF document using PyMuPDFLoader
        documents = PyMuPDFLoader(pdf_path).load()

        # Splits the text into chunks for processing
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_documents(documents)

        # Nothing to index (e.g. a PDF without extractable text): report it
        # clearly instead of letting the vector store fail on an empty batch.
        if not chunks:
            return "Error loading document: no extractable text found in the PDF."

        # Creates the HuggingFaceEmbeddings object
        embedding = HuggingFaceEmbeddings()

        # Creates a Chroma vector database to store the text chunks' embeddings.
        # A fresh collection name per upload keeps chunks from a previously
        # loaded PDF out of the retrieval results for this one.
        db = Chroma.from_documents(
            chunks,
            embedding,
            collection_name=f"chatpdf-{uuid.uuid4().hex}",
        )

        # Initializing the Hugging Face Hub LLM with specific parameters (Google's FLAN-T5)
        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 1.0, "max_length": 256})

        # Creating a RetrievalQA chain using the LLM and Chroma retriever;
        # assigned last so a failed load leaves any previous chain untouched.
        chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

        return 'Document has successfully been loaded'
    except Exception as e:
        # UI boundary: surface the failure as a status message
        return f"Error loading document: {str(e)}"


# Function to answer a query using the loaded document
def answer_query(query):
    """Answer a question against the currently loaded document.

    Args:
        query: The user's question as text.

    Returns:
        The chain's answer, or a status/error message when no document has
        been loaded, the question is blank, or the chain raises.
    """
    # `chain` is only read here, so no `global` declaration is required.
    try:
        # Checks if the document has been loaded
        if chain is None:
            return "Document not loaded yet."

        # Skip the model round-trip for empty or whitespace-only questions
        if not query or not query.strip():
            return "Please enter a question."

        # Run the query through the chain and return the result
        return chain.run(query)
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising
        return f"Error processing query: {str(e)}"

# HTML content for the Gradio interface
html = """
<div style="text-align:center; max-width: 700px;">
    <h1>ChatPDF</h1>
    <p> Upload a PDF File, then click on Load PDF File. <br>
    Once the document has been loaded you can begin chatting with the PDF.</p>
</div>"""

# CSS styling for the Gradio interface. The rule targets the element id given
# to the main column below; declarations are separated by semicolons.
css = """#container{max-width:700px; margin-left:auto; margin-right:auto; padding:20px}"""

# Creating the Gradio Blocks interface
with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
    gr.HTML(html)

    # elem_id ties this column to the #container CSS rule above
    with gr.Column(elem_id="container"):
        gr.Markdown('ChatPDF')
        # type='filepath' hands the upload to the callback as a path on disk
        pdf_doc = gr.File(label="Load a PDF", file_types=['.pdf'], type='filepath')

        with gr.Row():
            load_pdf = gr.Button('Load PDF File')
            status = gr.Textbox(label="Status", placeholder='', interactive=False)

        with gr.Row():
            input_query = gr.Textbox(label="Type in your question")
            output_response = gr.Textbox(label="Output")
        submit_query = gr.Button("Submit")

        # Defining the interactions between components
        load_pdf.click(load_doc, inputs=pdf_doc, outputs=status)
        submit_query.click(answer_query, inputs=input_query, outputs=output_response)

demo.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a3197ac7259dd57245.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


