This code allows you to build and test a RAG chatbot connected to a Qdrant vector database.

In [None]:
%pip install sec-downloader langchain sentence-transformers qdrant-client unstructured[all-docs] google-generativeai langchain-google-genai lark langchainhub

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from IPython.display import display
from IPython.display import Markdown
import textwrap

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [5]:
from google.colab import userdata

In [None]:
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.schema import Document
# from langchain_community.embeddings.openai import OpenAIEmbeddings

In [None]:
import os
from pathlib import Path
import pandas as pd
import seaborn as sns
import re
import unicodedata

In [None]:
%cd /content/drive/MyDrive/Anyone_AI/Final_Project

/content/drive/MyDrive/Anyone_AI/Final_Project


In [None]:
def clean_text(text):
  new_text = unicodedata.normalize("NFKC", text).replace('\n', ' ')
  new_text = re.sub(r'\s+', ' ', text).strip()
  return new_text

In [None]:
def html_to_text(html_path):
  loader = UnstructuredFileLoader(html_path, mode="elements")
  docs = loader.load()

  docs_metadata = [{'page_content': doc.page_content,
                    'source': doc.metadata.get('source'),
                    'page_number': doc.metadata.get('page_number'),
                    'category': doc.metadata.get('category'),
                    'parent_id': doc.metadata.get('parent_id')
                    } for doc in docs]

  docs_metadata_df = pd.DataFrame(docs_metadata)
  docs_metadata_df = docs_metadata_df[(docs_metadata_df['category'] == 'NarrativeText') | (docs_metadata_df['category'] == 'ListItem')]

  docs_metadata_df = docs_metadata_df.groupby('page_number').agg({'page_content':lambda col: ' '.join(col)}).reset_index()
  docs_metadata_df['page_content'] = docs_metadata_df['page_content'].map(lambda text: clean_text(text))

  new_docs = []

  for index, row in docs_metadata_df.iterrows():
    new_doc = Document(page_content=row['page_content'],
                       metadata={'page_number':row['page_number']})
    new_docs.append(new_doc)

  # final_text = ' '.join(docs_metadata_df['page_content'].map(lambda text: clean_text(text)))

  return new_docs

In [None]:
df = html_to_text("zm-20230131.html")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
df = df.groupby('page_number').agg({'page_content':lambda col: ' '.join(col)}).reset_index()

In [None]:
df

In [None]:
df['page_content'] = df['page_content'].map(lambda text: clean_text(text))

In [None]:
df['page_content'].iloc[100]

'Net revenues from “Mobile and ancillary” primarily include revenues from mobile devices. Net revenues from “Other” primarily include revenues from our Distribution business, the Overwatch League, and the Call of Duty League. Intersegment revenues reflect licensing and service fees charged between segments. Long-lived assets by geographic region were as follows (amounts in millions): The only long-lived assets that we classify by region are our long-term tangible fixed assets, which consist of property, plant, and equipment assets and lease right-of-use assets. All other long-term assets are not allocated by location. For information regarding significant customers, see “Concentration of Credit Risk” in On June 5, 2014, the Activision Blizzard, Inc. 2014 Incentive Plan (the “2014 Plan”) became effective. Under the 2014 Plan, the Compensation Committee of our Board of Directors is authorized to provide share-based compensation in the form of stock options, share appreciation rights, res

In [None]:
docs = html_to_text("zm-20230131.html")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
docs[0]

Document(page_content="For the fiscal year ended January 31, 2023 (Exact name of registrant as specified in its Charter) (Registrant’s telephone number, including area code) Securities registered pursuant to Section 12(b) of the Act: Securities registered pursuant to section 12(g) of the Act: None Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐ Indicate by check mark if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. Indicate by check mark whether the registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐ Indicate by check mark whether the registrant has submitted electronically every Interac

In [None]:
docs[0].page_content[1000:2000]

"§ 232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files). Yes Indicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company. See the definitions of “large accelerated filer”, “accelerated filer”, “smaller reporting company”, and “emerging growth company” in Rule 12b-2 of the Exchange Act. If an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new or revised financial accounting standards provided pursuant to Section 13(a) of the Exchange Act. Indicate by check mark whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial reporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by

In [None]:
len(docs[0].page_content)

2671

In [None]:
len(docs)

99

## Gemini

In [1]:
%pip install google-generativeai



In [14]:
import google.generativeai as genai
import os

In [10]:
from google.colab import userdata

In [16]:
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

In [17]:
genai.configure(api_key=GOOGLE_API_KEY)

In [18]:
safety_settings=[
    {"category":'HARM_CATEGORY_SEXUALLY_EXPLICIT', "threshold":'block_none'},
     {"category":'HARM_CATEGORY_HATE_SPEECH', "threshold":'block_none'},
      {"category":'HARM_CATEGORY_HARASSMENT', "threshold":'block_none'},
       {"category":'HARM_CATEGORY_DANGEROUS_CONTENT', "threshold":'block_none'}
    ]

In [19]:
model = genai.GenerativeModel(model_name = "gemini-pro", safety_settings=safety_settings)
model

 genai.GenerativeModel(
   model_name='models/gemini-pro',
   generation_config={}.
   safety_settings={<HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: 9>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HATE_SPEECH: 8>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HARASSMENT: 7>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: 10>: <HarmBlockThreshold.BLOCK_NONE: 4>}
)

In [32]:
chat = model.start_chat()

In [21]:
response = chat.send_message("Who is microsoft's CEO?")

In [25]:
response.text

'Satya Nadella'

In [33]:
response = chat.send_message("Where did she study?")

In [34]:
response.text

'I do not have enough information to answer the question. Please provide more information or ask a different question.'

In [29]:
type(chat.history[0])

google.ai.generativelanguage_v1beta.types.content.Content

In [None]:
response = model.generate_content("Hi Gemini. Please tell me how to use your API.")

In [None]:
to_markdown(response.text)

In [None]:
response.__dict__

## Qdrant

In [None]:
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
QDRANT_API_KEY = userdata.get('QDRANT_API_KEY')
url = "https://d3c36c99-73dd-4df5-8340-93131b214e3d.us-east4-0.gcp.cloud.qdrant.io:6333"

In [None]:
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001",
#                                           task_type='retrieval_document',
#                                           google_api_key=GOOGLE_API_KEY)

In [None]:
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
# encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs
)

In [None]:
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [None]:
texts = text_splitter.split_documents(docs)

In [None]:
texts[225].page_content

'law in Hong Kong has created additional U.S.-China tensions and could potentially increase the risks associated with the business and operations of U.S.-based technology companies in China. Any alterations to our business strategy or operations made in order to adapt to or comply with any such changes would be time-consuming and expensive, and certain of our competitors may be better suited to withstand or react to these changes. Further, in recent years, the U.S. Government has expressed concerns with the security of information and communications technology and services (“ICTS”) sourced from providers in China, Russia, and other jurisdictions. In May 2019, former President Trump issued an executive order that invoked national emergency economic powers to implement a framework to regulate the acquisition or transfer of ICTS in transactions that imposed undue national security risks. The executive order is subject to implementation by the Secretary of Commerce and applies to contracts

In [None]:
texts[225].metadata

{'page_number': 35}

In [None]:
len(texts[0].page_content)

1000

In [None]:
len(texts)

503

In [None]:
%%time

qdrant_db = Qdrant.from_documents(
    documents=texts,
    embedding=embeddings,
    host="8.tcp.ngrok.io",
    port="19664",
    prefer_grpc=False,
    # api_key=QDRANT_API_KEY,
    collection_name="10-k_s-1_zm",
)

CPU times: user 22.2 s, sys: 55.1 ms, total: 22.3 s
Wall time: 39.9 s


In [None]:
qdrant_client = QdrantClient(
    # url=url,
    api_key=QDRANT_API_KEY,
    host="d3c36c99-73dd-4df5-8340-93131b214e3d.us-east4-0.gcp.cloud.qdrant.io",
    port=6333
)

In [None]:
doc_store = Qdrant(
    client=qdrant_client,
    collection_name="10-k_s-1_zm",
    embeddings=embeddings,
)

## RAG (qdrant_db)

In [None]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai

In [None]:
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
gemini = ChatGoogleGenerativeAI(model="gemini-pro",
                                google_api_key=GOOGLE_API_KEY,
                                temperature=0.2,
                                convert_system_message_to_human=True)

In [None]:
gemini.invoke('Nice!')

AIMessage(content="Thank you! I'm glad you like it. I tried to make it as informative and engaging as possible. Is there anything else I can help you with today?")

In [None]:
# metadata_field_info = [
#     AttributeInfo(
#         name="genre",
#         description="The genre of the movie",
#         type="string or list[string]",
#     ),
#     AttributeInfo(
#         name="year",
#         description="The year the movie was released",
#         type="integer",
#     ),
#     AttributeInfo(
#         name="director",
#         description="The name of the movie director",
#         type="string",
#     ),
#     AttributeInfo(
#         name="rating", description="A 1-10 rating for the movie", type="float"
#     ),
# ]

In [None]:
metadata_field_info = [
    AttributeInfo(
        name="page_number",
        description="The page number of the html file that contains the information about the fact.",
        type="string",
    )
]

document_content_description = 'A fact about the company.'

In [None]:
retriever = SelfQueryRetriever.from_llm(
    gemini,
    qdrant_db,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
    limit=3
)

In [None]:
retriever.get_relevant_documents("Has Zoom had any problems related to the Russian invasion of Ukraine?")

[Document(page_content="fluctuations in foreign currency exchange rates, weighing on our customers' ability to pay for our service on a timely basis; double taxation of our international earnings and potentially adverse tax consequences due to changes in the income and other tax laws of the United States or the international jurisdictions in which we operate, including the imposition of digital services taxes; and higher costs of doing business internationally, including increased accounting, travel, infrastructure, and legal compliance costs. As described above, following Russia’s military invasion of Ukraine in February 2022, the United States, European Union, and other nations announced various sanctions against Russia and export restrictions against Russia and Belarus. Such restrictions include blocking sanctions on some of the largest state-owned and private Russian financial institutions, and their removal from the Society for Worldwide Interbank Financial Telecommunication, or t

In [None]:
prompt = hub.pull("rlm/rag-prompt")

  warn_beta(


In [None]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel

In [None]:
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | gemini
    | StrOutputParser()
)

In [None]:
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [None]:
rag_chain_with_source.invoke("Has Zoom had any problems related to the Russian invasion of Ukraine?")

{'context': [Document(page_content="fluctuations in foreign currency exchange rates, weighing on our customers' ability to pay for our service on a timely basis; double taxation of our international earnings and potentially adverse tax consequences due to changes in the income and other tax laws of the United States or the international jurisdictions in which we operate, including the imposition of digital services taxes; and higher costs of doing business internationally, including increased accounting, travel, infrastructure, and legal compliance costs. As described above, following Russia’s military invasion of Ukraine in February 2022, the United States, European Union, and other nations announced various sanctions against Russia and export restrictions against Russia and Belarus. Such restrictions include blocking sanctions on some of the largest state-owned and private Russian financial institutions, and their removal from the Society for Worldwide Interbank Financial Telecommuni

## Qdrant (doc_store)

In [None]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai

In [None]:
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
gemini = ChatGoogleGenerativeAI(model="gemini-pro",
                                google_api_key=GOOGLE_API_KEY,
                                temperature=0.2,
                                convert_system_message_to_human=True)

In [None]:
metadata_field_info = [
    AttributeInfo(
        name="page_number",
        description="The page number of the html file that contains the information about the fact.",
        type="string",
    )
]

document_content_description = 'A fact about the company.'

In [None]:
retriever = SelfQueryRetriever.from_llm(
    gemini,
    doc_store,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
)

In [None]:
retriever.get_relevant_documents("Has Zoom had any problems related to the Russian invasion of Ukraine?")

[Document(page_content="fluctuations in foreign currency exchange rates, weighing on our customers' ability to pay for our service on a timely basis; double taxation of our international earnings and potentially adverse tax consequences due to changes in the income and other tax laws of the United States or the international jurisdictions in which we operate, including the imposition of digital services taxes; and higher costs of doing business internationally, including increased accounting, travel, infrastructure, and legal compliance costs. As described above, following Russia’s military invasion of Ukraine in February 2022, the United States, European Union, and other nations announced various sanctions against Russia and export restrictions against Russia and Belarus. Such restrictions include blocking sanctions on some of the largest state-owned and private Russian financial institutions, and their removal from the Society for Worldwide Interbank Financial Telecommunication, or t

In [None]:
prompt = hub.pull("rlm/rag-prompt")

In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel

In [None]:
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | gemini
    | StrOutputParser()
)

In [None]:
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [None]:
rag_chain_with_source.invoke("Has Zoom had any problems related to the Russian invasion of Ukraine?")

{'context': [Document(page_content="fluctuations in foreign currency exchange rates, weighing on our customers' ability to pay for our service on a timely basis; double taxation of our international earnings and potentially adverse tax consequences due to changes in the income and other tax laws of the United States or the international jurisdictions in which we operate, including the imposition of digital services taxes; and higher costs of doing business internationally, including increased accounting, travel, infrastructure, and legal compliance costs. As described above, following Russia’s military invasion of Ukraine in February 2022, the United States, European Union, and other nations announced various sanctions against Russia and export restrictions against Russia and Belarus. Such restrictions include blocking sanctions on some of the largest state-owned and private Russian financial institutions, and their removal from the Society for Worldwide Interbank Financial Telecommuni

## 17 html files test

In [None]:
for path in Path('Test_html_files').glob('*'):
  print(str(path))

Test_html_files/atvi-20221231.html
Test_html_files/aapl-20230930.html
Test_html_files/aapl-20231102.html
Test_html_files/msft-20230630.html
Test_html_files/d636199d8k.html
Test_html_files/goog-20221231.html
Test_html_files/goog-20231018.html
Test_html_files/amzn-20221231.html
Test_html_files/tm2329405d1_8k.htm
Test_html_files/nvda-20230129.html
Test_html_files/nvda-20231121.html
Test_html_files/meta-20221231.html
Test_html_files/Registration Statement on Form S-1.html
Test_html_files/meta-20231025.html
Test_html_files/tsla-20221231.html
Test_html_files/Form S-1 Registration Statement.html
Test_html_files/tsla-20240102.html


In [None]:
for path in Path('Test_html_files').glob('*'):
  docs = html_to_text(str(path))
  texts = text_splitter.split_documents(docs)
  qdrant_db = Qdrant.from_documents(
    documents=texts,
    embedding=embeddings,
    url="https://d3c36c99-73dd-4df5-8340-93131b214e3d.us-east4-0.gcp.cloud.qdrant.io:6333",
    prefer_grpc=False,
    api_key=QDRANT_API_KEY,
    collection_name="10-k_s-1_zm",
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
