In [1]:
!apt-get install libreoffice-writer

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apparmor dictionaries-common fonts-liberation2 fonts-opensymbol hunspell-en-us libabsl20210324
  libabw-0.1-1 libclucene-contribs1v5 libclucene-core1v5 libe-book-0.1-1 libeot0 libepubgen-0.1-1
  libetonyek-0.1-1 libexttextcat-2.0-0 libexttextcat-data libgpgme11 libgpgmepp6 libharfbuzz-icu0
  libhunspell-1.7-0 libhyphen0 liblangtag-common liblangtag1 libmhash2 libmwaw-0.3-3
  libmythes-1.2-0 libodfgen-0.1-1 liborcus-0.17-0 liborcus-parser-0.17-0 libraptor2-0 librasqal3
  librdf0 libreoffice-base-core libreoffice-common libreoffice-core libreoffice-math
  libreoffice-style-colibre librevenge-0.0-0 libtext-iconv-perl libuno-cppu3
  libuno-cppuhelpergcc3-3 libuno-purpenvhelpergcc3-3 libuno-sal3 libuno-salhelpergcc3-3
  libwpd-0.10-10 libwpg-0.3-3 libwps-0.4-4 libxmlsec1 libxmlsec1-nss libyajl2 poppler-data
  python3-uno uno-libs-private ure

In [8]:
!pip install chromadb langchain langchain_community docx2txt langchain-core langchain-google-vertexai google-cloud-aiplatform langchain-experimental gradio unstructured PyPDF2 python-docx python-pptx

Collecting python-pptx
  Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: XlsxWriter, python-pptx
Successfully installed XlsxWriter-3.2.0 python-pptx-0.6.23


In [9]:
import sys

# Google Colab needs an explicit user login plus Vertex AI initialisation.
# Nothing in this cell runs when the notebook is executed outside Colab.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Sign the notebook user in to Google Cloud.
    auth.authenticate_user()

    # Project settings, editable through the Colab form fields.
    PROJECT_ID = "aakash-test-env"  # @param {type:"string"}
    LOCATION = "us-central1"  # @param {type:"string"}

    import vertexai

    # Point the Vertex AI SDK at the chosen project and region.
    vertexai.init(project=PROJECT_ID, location=LOCATION)

In [10]:
from langchain.embeddings import VertexAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
import os
import pptx
import gradio as gr
from langchain.llms import VertexAI
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from vertexai.language_models import TextGenerationModel
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders.csv_loader import CSVLoader

# Embedding model shared by the vector-index helper defined below.
vertex_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@003")

# Question-answering prompt; `{context}` and `{question}` are filled in with
# `str.format` before the prompt is sent to the model.
prompt_template = "\n".join(
    [
        "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer.",
        "{context}",
        "Question: {question}",
        "Helpful Answer:",
    ]
)

def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Consecutive documents are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

def generate_embeddings_and_vector(texts):
    """Embed ``texts`` into a fresh Chroma collection and return a retriever.

    Embeddings come from the module-level ``vertex_embeddings`` model
    (``textembedding-gecko@003``).

    Args:
        texts: List of text chunks to index.

    Returns:
        A retriever over the newly built Chroma vector store.
    """
    import uuid

    # Give every index its own collection. Without an explicit name Chroma
    # uses one default collection, so chunks from a previously uploaded file
    # could be returned for questions about the current one.
    # NOTE(review): relies on Chroma's shared in-process default collection
    # behaviour — confirm against the installed chromadb version.
    collection_name = f"upload-{uuid.uuid4().hex}"
    vector_store = Chroma.from_texts(
        texts, vertex_embeddings, collection_name=collection_name
    )
    return vector_store.as_retriever()


def get_similar_documents(vector_index, search_query):
    """Return whatever ``vector_index`` retrieves for ``search_query``.

    Args:
        vector_index: Retriever exposing ``get_relevant_documents``.
        search_query: Query string handed to the retriever.

    Returns:
        The retriever's result, unchanged.
    """
    return vector_index.get_relevant_documents(search_query)

def generate_final_response(docs, search_query):
    """Ask the ``text-bison`` model to answer ``search_query`` from ``docs``.

    The prompt is built from the module-level ``prompt_template`` so there is
    a single copy of the instructions in the notebook.

    Args:
        docs: Context for the answer: either one string or an iterable of
            passages. Passages are joined with blank lines.
        search_query: The user's question.

    Returns:
        The text of the model's response. The same text is also printed.
    """
    parameters = {
        "candidate_count": 1,
        "max_output_tokens": 1024,
        # NOTE(review): 0.9 is a fairly high temperature for grounded
        # question answering — consider lowering it.
        "temperature": 0.9,
        "top_p": 1,
    }

    # Formatting a list directly would put its Python repr (brackets, quotes,
    # escaped "\n") into the prompt; join the passages into plain text instead.
    if isinstance(docs, str):
        context = docs
    else:
        context = "\n\n".join(str(passage) for passage in docs)

    prompt = prompt_template.format(context=context, question=search_query)

    model = TextGenerationModel.from_pretrained("text-bison")
    response = model.predict(prompt, **parameters)

    print(response.text)

    return response.text

def _split_documents(documents):
    """Join the documents' text and cut it into overlapping chunks."""
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    context = "\n\n".join(str(p.page_content) for p in documents)
    return text_splitter.split_text(context)


def _load_text_chunks(file_path):
    """Load ``file_path`` with the loader for its extension; ``[]`` if unsupported."""
    # Lower-case the extension so ".PDF" and ".pdf" are treated the same.
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == ".txt":
        return _split_documents(TextLoader(file_path).load())
    if file_extension == ".pdf":
        return _split_documents(PyPDFLoader(file_path).load_and_split())
    if file_extension in (".pptx", ".ppt"):
        return _split_documents(
            UnstructuredPowerPointLoader(file_path).load_and_split()
        )
    if file_extension in (".docx", ".doc"):
        return _split_documents(
            UnstructuredWordDocumentLoader(file_path).load_and_split()
        )
    if file_extension == ".csv":
        # CSV documents are used one per chunk, without further splitting.
        return [str(p.page_content) for p in CSVLoader(file_path).load()]

    # Unknown extension: report "nothing loaded" instead of leaving the
    # caller with an undefined variable.
    return []


def process_file(fileobj, search_query):
    """Answer ``search_query`` using the contents of an uploaded file.

    The file is loaded and chunked, the chunks are embedded into a vector
    index, the retrieved chunks are filtered by an LLM, and the surviving
    chunks are passed as context to ``generate_final_response``.

    Args:
        fileobj: The uploaded file: an object whose ``name`` attribute is the
            file path, a plain path (``str`` / ``os.PathLike``), or ``None``
            when nothing was uploaded.
        search_query: The user's question.

    Returns:
        The generated answer, or ``"Failed to load the document"`` when no
        file was given, the extension is unsupported, or the file yielded no
        text.
    """
    if fileobj is None:
        return "Failed to load the document"

    # Accept plain paths as well as file wrappers exposing ``.name``.
    if isinstance(fileobj, (str, os.PathLike)):
        file_path = os.fspath(fileobj)
    else:
        file_path = fileobj.name

    texts = _load_text_chunks(file_path)
    if not texts:
        return "Failed to load the document"

    vector_index = generate_embeddings_and_vector(texts)

    # Let the LLM drop retrieved chunks that are irrelevant to the question.
    llm = VertexAI(model_name="gemini-pro")
    _filter = LLMChainFilter.from_llm(llm)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=_filter, base_retriever=vector_index
    )

    compressed_docs = compression_retriever.get_relevant_documents(search_query)
    context_text = [doc.page_content for doc in compressed_docs]
    response_text = generate_final_response(context_text, search_query)

    # Print the supporting chunks to the cell output for inspection.
    pretty_print_docs(compressed_docs)
    return response_text

In [None]:
# Build the UI: a single tab holding a file-upload + question form whose
# submissions are handled by `process_file`.
with gr.Blocks() as demo, gr.Tabs(), gr.TabItem("Text Embeddings + ChromaDB + Text Bison"):
    app = gr.Interface(
        title="Question Answering bot",
        description="Input context and question, then get answers!",
        fn=process_file,
        inputs=["file", "text"],
        outputs=["textbox"],
    )

# debug=True keeps this cell running so errors and logs show up in its output.
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://795843318d20fc0e0f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Error: source file could not be loaded
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 501, in call_prediction
    output = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 252, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1664, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1205, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backend

 The three types of apps are: "Time wasters", "Everyday", and "Occasional". Thanks for asking!
Document 1:

MISSION YOU last year

	Last year was a huge success, with more than 30 passionate and eager students participating in a wide range of TCS functions!

>1

	>What makes a Smart Phone, 		“smart”?

Some people think it’s simply the powerful hardware and base a smart phone solely on that

>1

	>What makes a Smart Phone, 		“smart”?

“However, that’s only part of what makes a smartphone “smart”. The other half of the equation comprises of the applications that run the device”

>1

	> What is an App?		

>	Short for Application

>	A piece of software designed to 	fulfill a particular purpose

> 	What are some examples of 	apps that you use?  

>1

	>Why are apps so popular?

>	Apps typically exist to  solve a 	problem

>	In the olden days people 	carried a whole host of things 	around with them…

>1

	>Remember these?

>1

	>Types of Apps 

1. “Time wasters” 

2.	Everyday 

3



 The release of 60 million barrels of oil from reserves around the world, with 30 million barrels coming from the United States' Strategic Petroleum Reserve, is expected to help blunt gas prices in the country. However, the context does not specify additional information about other measures that may be taken.
Thanks for asking!
Document 1:

To all Americans, I will be honest with you, as I’ve always promised. A Russian dictator, invading a foreign country, has costs around the world. 

And I’m taking robust action to make sure the pain of our sanctions  is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. 

Tonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world.  

America will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our