<a href="https://colab.research.google.com/github/blackhawkee/llm-play/blob/main/LaMini_T5_738M_pdf_chat_streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q langchain streamlit streamlit_chat transformers==4.30.2 requests torch einops accelerate bitsandbytes pdfminer.six bs4 sentence_transformers chromadb==0.3.29 pyngrok fake_useragent

  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Building wheel for validators (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.27.1, but you have requests 2.31.0 which is incompatible.[0m[31m
[0m

In [2]:
import requests
from fake_useragent import UserAgent


import os

# Folder that ingest.py later walks for PDF files.
directory = "/content/docs/"

# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs(directory, exist_ok=True)

# Send a browser-like User-Agent with the request.
ua = UserAgent()
header = {'User-Agent': str(ua.chrome)}

file_url = 'https://www.un.org/sites/un2.un.org/files/fastfacts-what-is-climate-change.pdf'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(file_url, headers=header, timeout=60)
print(response)
# Fail loudly on 4xx/5xx instead of silently saving an error page as a ".pdf".
response.raise_for_status()

# Derive the target path from the URL so the file name is written in one place only.
pdf_path = os.path.join(directory, os.path.basename(file_url))
with open(pdf_path, "wb") as file:
    file.write(response.content)

<Response [200]>


In [2]:
%%writefile constants.py
import os
from chromadb.config import Settings

# Chroma client configuration shared by ingest.py and app.py:
# DuckDB+Parquet backend, data persisted under "db", anonymized telemetry off.
CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,
    persist_directory="db",
    chroma_db_impl="duckdb+parquet",
)

Writing constants.py


In [3]:
%%writefile ingest.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import os
import tempfile
from constants import CHROMA_SETTINGS


# Directory passed to Chroma.from_documents below; same value ("db") as
# persist_directory in constants.CHROMA_SETTINGS.
persist_directory = "db"


def main():
    """Ingest every PDF found under /content/docs into the persisted Chroma store.

    Walks the directory tree, extracts the text of each ``.pdf`` file with
    PDFMinerLoader, splits it into 500-character chunks (50 overlap), embeds
    the chunks with the ``all-MiniLM-L6-v2`` sentence-transformer and persists
    them to ``persist_directory``.

    Documents from *all* PDFs are accumulated before splitting; if no PDF is
    found the function reports that and returns without touching the store.
    """
    documents = []
    for root, dirs, files in os.walk("/content/docs"):
        for file in files:
            if file.endswith(".pdf"):
                print(file)
                loader = PDFMinerLoader(os.path.join(root, file))
                # Load inside the loop so earlier files are not discarded
                # when `loader` is rebound for the next file.
                documents.extend(loader.load())

    if not documents:
        # Nothing to embed; avoid building an empty store.
        print("No PDF documents found under /content/docs; nothing to ingest.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    #create embeddings here
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
    db.persist()

def upload_pdf(content):
    """Index the raw bytes of one PDF and return the resulting Chroma store.

    Args:
        content: Bytes of the PDF file (written verbatim to a temporary file,
            because PDFMinerLoader is given a file path).

    Returns:
        The Chroma vector store built from the PDF's chunks, already persisted
        to ``persist_directory``.
    """
    # delete=False so the file survives the `with` block and can be reopened
    # by path; it is removed explicitly once its text has been extracted.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        tmp_file.write(content)
        tmp_file_path = tmp_file.name
    print(tmp_file_path)

    try:
        documents = PDFMinerLoader(tmp_file_path).load()
    finally:
        # Previously the temporary copy was never deleted, leaking one file per upload.
        os.remove(tmp_file_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    #create embeddings here
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
    db.persist()
    return db

# Run the batch ingestion only when this file is executed as a script;
# importing the module (app.py imports upload_pdf from it) must not trigger it.
if __name__ == "__main__":
    main()

Writing ingest.py


In [29]:
%%writefile app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
import base64
import textwrap
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from constants import CHROMA_SETTINGS
#from transformers import logging
import logging
from ingest import upload_pdf
from langchain.chains import ConversationalRetrievalChain
from streamlit_chat import message

logging.basicConfig(filename='app.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s')

# Text2text generation checkpoint fed to the 'text2text-generation' pipeline.
# ("hkunlp/instructor-large" is published as an embedding model, so it is not
# a suitable choice for AutoModelForSeq2SeqLM generation.)
hg_model = "MBZUAI/LaMini-T5-738M"

# Embedding model used to query the persisted store. It must be the same model
# ingest.py used to build the store ("all-MiniLM-L6-v2"); the previous value
# "all-mpnet-base-v22" is not a valid model name.
sentence_transformer_model_name = "all-MiniLM-L6-v2"


@st.cache_resource
def _load_model(model_name):
    """Load the tokenizer and seq2seq model for ``model_name`` once per server process.

    Streamlit re-executes this script on every interaction; caching keeps the
    weights from being reloaded on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto', torch_dtype=torch.float32)
    return tok, model


#model and tokenizer loading
tokenizer, base_model = _load_model(hg_model)

@st.cache_resource
def llm_pipeline():
    """Wrap the module-level model and tokenizer in a LangChain-compatible LLM.

    Builds a 'text2text-generation' pipeline (sampling enabled, max_length 256,
    temperature 0.3, top_p 0.95) and returns it as a HuggingFacePipeline.
    The result is cached by Streamlit.
    """
    generation_options = {
        "max_length": 256,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
    }
    text2text = pipeline(
        'text2text-generation',
        model=base_model,
        tokenizer=tokenizer,
        **generation_options,
    )
    return HuggingFacePipeline(pipeline=text2text)

@st.cache_resource
def qa_llm():
    """Build a RetrievalQA chain over the Chroma store persisted in "db".

    Uses the cached LLM from llm_pipeline(), the sentence-transformer named by
    sentence_transformer_model_name for embeddings, the "stuff" chain type, and
    asks the chain to return its source documents. The result is cached by Streamlit.
    """
    llm = llm_pipeline()
    vector_store = Chroma(
        persist_directory="db",
        embedding_function=SentenceTransformerEmbeddings(model_name=sentence_transformer_model_name),
        client_settings=CHROMA_SETTINGS,
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
    )

def process_answer(instruction):
    """Answer ``instruction`` with the cached RetrievalQA chain.

    Args:
        instruction: The question passed to the chain returned by qa_llm().

    Returns:
        A tuple ``(answer, generated_text)`` where ``generated_text`` is the
        chain's full output and ``answer`` is its ``'result'`` entry.
    """
    generated_text = qa_llm()(instruction)
    return generated_text['result'], generated_text

def main():
    """Render the chat UI and answer questions about the uploaded PDF.

    The uploaded file is indexed with upload_pdf() only once: the resulting
    vector store is kept in ``st.session_state`` together with a fingerprint
    of the file. Streamlit re-runs this function on every interaction, so
    without that guard each chat message would re-embed the PDF and append
    duplicate chunks to the persisted store. Uploading a different file
    re-indexes and starts a fresh conversation.
    """
    import hashlib  # local import: only needed here to fingerprint the upload

    st.title("Converse with Your PDF \U0001F917 🐦📄")
    with st.expander("About the App"):
        st.markdown(
            """
            This is a Generative AI powered Question and Answering app that responds to questions about your PDF File.
            """
        )

    uploaded_file = st.sidebar.file_uploader("Upload your Data", type="pdf")
    if not uploaded_file:
        return

    file_bytes = uploaded_file.getvalue()
    file_key = (uploaded_file.name, hashlib.sha256(file_bytes).hexdigest())

    if st.session_state.get('pdf_key') != file_key:
        # New (or first) file: index it once and reset the conversation so
        # answers about a previous document do not leak into this one.
        st.session_state['pdf_db'] = upload_pdf(file_bytes)
        st.session_state['pdf_key'] = file_key
        st.session_state['history'] = []
        st.session_state['generated'] = ["Hello ! Ask me anything about " + uploaded_file.name + " 🤗"]
        st.session_state['past'] = ["Hey ! 👋"]

    llm = llm_pipeline()
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=st.session_state['pdf_db'].as_retriever(),
    )

    def conversational_chat(query):
        """Ask the chain ``query`` with the stored chat history and record the exchange."""
        result = chain({"question": query, "chat_history": st.session_state['history']})
        st.session_state['history'].append((query, result["answer"]))
        return result["answer"]

    #container for the chat history
    response_container = st.container()
    #container for the user's text input
    container = st.container()

    with container:
        with st.form(key='my_form', clear_on_submit=True):

            user_input = st.text_input("Query:", placeholder="Talk to your pdf data here <:>", key='input')
            submit_button = st.form_submit_button(label='Send')

        if submit_button and user_input:
            output = conversational_chat(user_input)

            st.session_state['past'].append(user_input)
            st.session_state['generated'].append(output)

    if st.session_state['generated']:
        with response_container:
            # 'past' and 'generated' grow in lockstep, so index i pairs a
            # user message with the reply that followed it.
            for i in range(len(st.session_state['generated'])):
                message(st.session_state["past"][i], is_user=True, key=str(i) + '_user', avatar_style="big-smile")
                message(st.session_state["generated"][i], key=str(i), avatar_style="thumbs")

# Entry point: build the UI when this file is executed as a script.
if __name__ == '__main__':
    main()

Overwriting app.py




---


Run the app


---



In [None]:
# Disabled on purpose: the leading '#' makes the shell treat the rest of the line as a
# comment, so ingest.py is not run here. Remove the '#' to batch-ingest /content/docs.
!#python /content/ingest.py

In [30]:
# Start the Streamlit app in the background; stdout and stderr are both discarded
# (&>/dev/null), so start-up errors will not be shown in this cell.
!streamlit run /content/app.py &>/dev/null&

In [6]:
from pyngrok import ngrok

import os
from getpass import getpass

# Stop any tunnel left over from a previous run of this cell.
ngrok.kill()

# Never hard-code the authtoken in the notebook: read it from the environment,
# or prompt for it without echoing. Any token that was ever committed in this
# cell should be treated as compromised and rotated in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Expose the local Streamlit server on port 8501.
public_url = ngrok.connect('8501')
public_url

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml




<NgrokTunnel: "https://6f4d-34-91-5-208.ngrok-free.app" -> "http://localhost:8501">

In [10]:
from pyngrok import ngrok

# Shut down the ngrok tunnel process started earlier.
ngrok.kill()

import torch
# Release cached CUDA memory held by this kernel. No gradient context is
# needed for this call, so the former `torch.no_grad()` wrapper was dropped.
torch.cuda.empty_cache()

In [14]:
# Recursively delete /content/db — presumably the persisted Chroma store ("db"),
# assuming the scripts were run with /content as the working directory.
!rm -rf /content/db