# PDF CHATBOT
CHAINLIT+PDF+CHROMADB+OPENAI

## 初始環境設定

In [None]:
# Initial environment setup: append the user's ~/.local/bin and
# /usr/ubuntu_bin to the PATH of this process.
import os
from pathlib import Path

HOME = str(Path.home())
Add_Binarry_Path = f"{HOME}/.local/bin:/usr/ubuntu_bin"
os.environ['PATH'] = ':'.join((os.environ['PATH'], Add_Binarry_Path))

## 確認CUDA版本, 以及是否能使用GPU
若無 GPU，請點選右側->已連線->變更執行階段類型->T4 GPU

In [None]:
# Print the NVIDIA driver/CUDA information for the GPU attached to this runtime.
!nvidia-smi
import torch
# True when PyTorch can reach a CUDA device; as the last expression of the
# cell, the value is displayed as the cell output.
torch.cuda.is_available()

## 安裝套件

In [None]:
pip install accelerate chainlit==0.7.700 chromadb cohere gdown kaleido langchain openai pyngrok pdfplumber pypdf python-dotenv sentence-transformers tiktoken -q

## Step1: create app.py

In [None]:
%%bash
cat << \EOF >  app.py
import sys
from typing import List
from tempfile import NamedTemporaryFile
import chainlit as cl
from chainlit.types import AskFileResponse
import chromadb
from chromadb.config import Settings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PDFPlumberLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.schema.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores.base import VectorStore
from prompts import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE
import os
# OpenAI API key: replace the placeholder below with your own key.
# NOTE(review): the key is hard-coded in source and overwrites any
# OPENAI_API_KEY already present in the environment — do not commit or share
# a real key in this file.
import os
OPENAI_API_KEY='sk-xxxxxxxxxxx'
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

def process_file(*, file: AskFileResponse) -> List[Document]:
    """Load an uploaded PDF and split it into chunked Langchain Documents.

    Takes a Chainlit AskFileResponse, writes its bytes to a temporary file,
    parses that file with PDFPlumberLoader and splits the result with a
    RecursiveCharacterTextSplitter (chunk_size=2000, chunk_overlap=100).
    Each returned Document has page_content and metadata fields; the
    ``source`` metadata entry of chunk ``i`` is set to ``source_<i>``.
    Supports PDF files only.

    Args:
        file (AskFileResponse): User's file input

    Raises:
        TypeError: when the file type is not pdf
        ValueError: when parsing and splitting the PDF yields no chunks

    Returns:
        List[Document]: chunked documents
    """
    if file.type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    with NamedTemporaryFile() as tmp:
        tmp.write(file.content)
        # The loader reopens the file by name, so buffered bytes must reach
        # the disk first; without the flush a small upload can be read back
        # as an empty or truncated PDF.
        tmp.flush()

        documents = PDFPlumberLoader(tmp.name).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=100
    )
    docs = text_splitter.split_documents(documents)

    if not docs:
        raise ValueError("PDF file parsing failed.")

    # Give every chunk a short, unique label that the answer can cite.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"

    return docs


def create_search_engine(
    *,
    docs: List[Document],
    embeddings: Embeddings,
    collection_name: str = "my_papers",
) -> VectorStore:
    """Index Langchain Documents into a Chroma vector store.

    Takes a list of Langchain Documents and a Langchain embeddings wrapper
    over encoder models, and indexes the data into ChromaDB as a search
    engine.

    Args:
        docs (List[Document]): list of documents to be ingested
        embeddings (Embeddings): encoder model
        collection_name (str): name of the Chroma collection to index into.
            Defaults to "my_papers".

    Returns:
        VectorStore: vector store for RAG
    """
    # NOTE(review): with a fixed collection name, repeated calls in the same
    # process may end up sharing one collection — confirm against the Chroma
    # client in use, and pass a distinct collection_name if isolation matters.
    return Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=collection_name,
    )


@cl.on_chat_start
async def on_chat_start():
    """Ask the user for a PDF, index it, and build the RAG chain.

    Runs at the start of every chat session. The chunked documents are stored
    in the user session under "docs" and the question-answering chain under
    "chain".

    Raises:
        Exception: whatever process_file raises (e.g. TypeError for a non-PDF
            upload, ValueError when no text chunks are produced); the error
            is shown to the user before it is re-raised.
        SystemError: when indexing the documents fails; the underlying error
            is attached as the cause.
    """
    # Keep asking until the upload prompt returns something other than None.
    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content=WELCOME_MESSAGE,
            accept=["application/pdf"],
            max_size_mb=20,
        ).send()
    file = files[0]

    # Process and save data in the user session
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()
    try:
        # PDF parsing is synchronous; run it via make_async (as the indexing
        # step below does) so it does not block the event loop.
        docs = await cl.make_async(process_file)(file=file)
    except Exception as e:
        # Tell the user why the upload failed instead of failing silently.
        await cl.Message(content=f"Error: {e}").send()
        raise
    cl.user_session.set("docs", docs)
    msg.content = f"`{file.name}` processed. Loading ..."
    await msg.update()

    # Index documents into search engine
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    try:
        search_engine = await cl.make_async(create_search_engine)(
            docs=docs, embeddings=embeddings
        )
    except Exception as e:
        await cl.Message(content=f"Error: {e}").send()
        # Chain the original error so the traceback shows the real cause.
        raise SystemError(f"Failed to index `{file.name}`") from e
    msg.content = f"`{file.name}` loaded. You can now ask questions!"
    await msg.update()

    # RAG Chain
    llm = ChatOpenAI(
        model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True
    )
    # NOTE(review): max_tokens_limit is passed to as_retriever() here; it looks
    # like an option of the QA chain rather than of the retriever, so it may
    # have no effect — confirm against the installed LangChain version.
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=search_engine.as_retriever(max_tokens_limit=4097),
        chain_type_kwargs={"prompt": PROMPT, "document_prompt": EXAMPLE_PROMPT},
    )

    cl.user_session.set("chain", chain)


@cl.on_message
async def main(message: cl.Message):
    """Answer a user message with the session's RAG chain and cite sources.

    Invoked whenever we receive a Chainlit message. The chain stored in the
    user session under "chain" is called with the message text; every source
    label it reports that matches a document stored under "docs" is attached
    to the reply as a text element and listed after the answer.

    Args:
        message (cl.Message): user input
    """
    chain = cl.user_session.get("chain")  # type: RetrievalQAWithSourcesChain
    response = await chain.acall(
        message.content,
        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
    )

    answer = response["answer"]
    sources = response["sources"].strip()
    source_elements = []

    # Adding sources to the answer
    if sources:
        # Build the label -> document table once instead of scanning the list
        # for every cited source; setdefault keeps the first document for a
        # label, matching a first-match list search.
        docs_by_source = {}
        for doc in cl.user_session.get("docs") or []:
            docs_by_source.setdefault(doc.metadata.get("source"), doc)

        found_sources = []

        # Add the sources to the message
        for source in sources.split(","):
            # The model may append a full stop to a label; strip it.
            source_name = source.strip().replace(".", "")
            doc = docs_by_source.get(source_name)
            if doc is None:
                # The label does not match any indexed chunk; skip it.
                continue

            # Fall back to placeholders when the loader did not record these
            # fields, rather than failing the whole reply with a KeyError.
            # NOTE(review): file_path is whatever the loader recorded; in this
            # file the loader is given a temporary file, so the name shown may
            # not be the uploaded file's name — confirm.
            basename = os.path.basename(doc.metadata.get("file_path", "")) or "unknown"
            page = doc.metadata.get("page", "?")
            text = f"[Document: {basename}, PAGE:{page}], {doc.page_content}"
            found_sources.append(source_name)
            # Create the text element referenced in the message
            source_elements.append(cl.Text(content=text, name=source_name))

        if found_sources:
            answer += f"\nSources: {', '.join(found_sources)}"
        else:
            answer += "\nNo sources found"

    await cl.Message(content=answer, elements=source_elements).send()


EOF

## Step2: create prompts.py

In [None]:
%%bash
cat << \EOF >  prompts.py
from langchain.prompts import PromptTemplate

# Instruction text for the answer-generation step: the model is given the
# question and the extracted document parts ({summaries}) and is told to
# finish its answer with a "SOURCES: ..." list.
template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

# Prompt object built from the template above; app.py passes it to the chain
# as the "prompt" argument.
PROMPT = PromptTemplate(
    template=template, input_variables=["summaries", "question"]
)

# Format applied to each individual document (its content plus its source
# label); app.py passes it to the chain as the "document_prompt" argument.
EXAMPLE_PROMPT = PromptTemplate(
    template="Content: {page_content}\nSource: {source}",
    input_variables=["page_content", "source"],
)

# Greeting shown with the file-upload prompt. It mentions PDF only, because
# the app accepts and processes "application/pdf" uploads exclusively.
WELCOME_MESSAGE = """\
Welcome to Introduction to LLM App Development Sample PDF QA Application!
To get started:
1. Upload a PDF file
2. Ask any question about the file!
"""


EOF

## RUN JOB

In [None]:
# Start the Chainlit app from app.py in the background (trailing `&`) with the
# -w flag (watch mode, per the Chainlit CLI); its output is redirected to
# /content/logs.txt (`&>` is the bash form for stdout and stderr together).
!chainlit run app.py -w &> /content/logs.txt &

## ngrok connection, 請將以下的 xxxxxxxxxxxxx 修改為您自己的 ngrok authtoken

In [None]:
# Register the ngrok authtoken; replace the xxxxxxxxxxxxx placeholder with
# your own token before running.
!ngrok config add-authtoken xxxxxxxxxxxxx

from pyngrok import ngrok
# Open a tunnel to local port 8000 and print the public URL it is served on.
# NOTE(review): 8000 is presumably the port the Chainlit app listens on — confirm.
ngrok_tunnel = ngrok.connect(8000)
print('Public URL:', ngrok_tunnel.public_url)

## DELETE JOB, 結束前再執行

In [None]:
# Stop ngrok through pyngrok; run this only when you are finished.
ngrok.kill()

In [None]:
# Force-kill (kill -9) every process whose `ps -ef` line contains "chainlit",
# then every process whose line contains "ngrok"; awk extracts the PID column.
# NOTE(review): the grep process itself matches its own pattern, so kill may
# complain about a PID that has already exited — presumably harmless; confirm.
!ps -ef |grep chainlit | awk '{print $2}' | xargs kill -9
!ps -ef |grep ngrok | awk '{print $2}' | xargs kill -9