<a href="https://colab.research.google.com/github/aicrashcoursewinter24/ai_crashcourselabsLukeA/blob/AGENT_RAG_01-19/NEW_RAG_first_steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install numexpr openai langchain_openai langchain langchain-community duckduckgo-search loguru qdrant-client sentence-transformers tiktoken docx2txt unstructured tqdm

Collecting openai
  Downloading openai-1.9.0-py3-none-any.whl (223 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.4/223.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_openai
  Downloading langchain_openai-0.0.3-py3-none-any.whl (28 kB)
Collecting langchain
  Downloading langchain-0.1.3-py3-none-any.whl (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.6/803.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.0.15-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting duckduckgo-search
  Downloading duckduckgo_search-4.2-py3-none-any.whl (25 kB)
Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[

In [3]:
import json
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings


def parse_xml_to_dict(file_path: str) -> List[Dict[str, Any]]:
    """Parse an XML file of reviews into a list of plain dictionaries.

    Every direct child of the document root is treated as one review. Each
    child element of a review becomes a ``tag -> text`` entry, except the
    ``reviewer`` element, whose own children are collected into a nested
    ``tag -> text`` dictionary.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        One dictionary per child of the root element, in document order.
        Elements that carry no text map to ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    def parse_review(review_element: ET.Element) -> Dict[str, Any]:
        # 'reviewer' is the only tag that is flattened one level deeper;
        # every other tag keeps just its text content.
        return {
            child.tag: (
                {grandchild.tag: grandchild.text for grandchild in child}
                if child.tag == 'reviewer'
                else child.text
            )
            for child in review_element
        }

    root = ET.parse(file_path).getroot()
    return [parse_review(review) for review in root]


class XMLReviewLoader(BaseLoader):
    """Document loader that turns one XML review file into LangChain documents."""

    def __init__(self, file_path: str):
        # Path of the XML file that load() will read.
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Return one Document per parsed review, with the review serialized as JSON text."""
        documents: List[Document] = []
        for review in parse_xml_to_dict(self.file_path):
            serialized = json.dumps(review)
            documents.append(Document(page_content=serialized))
        return documents


class VerboseEmbeddings(Embeddings):
    """Embeddings wrapper that prints a line before delegating each call to the wrapped model."""

    def __init__(self, embeddings: Embeddings):
        # The wrapped embeddings object that does the actual work.
        self.embeddings = embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Print how many texts are being embedded, then embed them with the wrapped model."""
        print(f"Embedding {len(texts)} documents")
        vectors = self.embeddings.embed_documents(texts)
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Print the query text, then embed it with the wrapped model."""
        print(f"Embedding query: {text}")
        vector = self.embeddings.embed_query(text)
        return vector


def build_vectorstore_from_xml_directory(path: str, vectorstore_path: str, loading_glob="**/*.xml",
                                         collection_name="my_documents",
                                         embeddings: Optional[Embeddings] = None,
                                         *,
                                         chunk_size: int = 3000,
                                         chunk_overlap: int = 100,
                                         k: int = 6):
    """Load XML review files, index them in a Qdrant collection and return a retriever.

    Args:
        path: Directory that is searched for review files.
        vectorstore_path: Passed to Qdrant as its ``path`` (where the collection is stored).
        loading_glob: Glob pattern, relative to ``path``, selecting the files to load.
        collection_name: Name of the Qdrant collection to create.
        embeddings: Embedding model used for indexing and querying. When omitted,
            a ``VerboseEmbeddings(HuggingFaceEmbeddings())`` is created on the first
            call that needs it, instead of when this function is defined.
        chunk_size: Maximum number of characters per chunk handed to the splitter.
        chunk_overlap: Number of characters shared between consecutive chunks.
        k: Number of nearest neighbours the returned retriever fetches per query.

    Returns:
        A similarity-search retriever over the newly built vector store.
    """
    # Build the default lazily: a default evaluated in the signature would load the
    # embedding model as soon as the function is defined, even if the caller
    # always supplies their own embeddings.
    if embeddings is None:
        embeddings = VerboseEmbeddings(HuggingFaceEmbeddings())

    # document loader which loads every matching xml review file in the directory
    dir_loader = DirectoryLoader(path=path, glob=loading_glob, loader_cls=XMLReviewLoader,
                                 show_progress=True, use_multithreading=True)
    docs = dir_loader.load()

    # split the loaded documents into chunks of at most `chunk_size` characters,
    # with `chunk_overlap` characters of overlap between neighbouring chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
    )
    all_splits: List[Document] = text_splitter.split_documents(docs)
    print(f"Split {len(docs)} documents into {len(all_splits)} chunks")

    # vectorstore with the Qdrant backend
    vectorstore = Qdrant.from_documents(
        all_splits,
        embeddings,
        path=vectorstore_path,
        collection_name=collection_name
    )
    # `k` controls how many nearest neighbours each search returns (6 by default, chosen arbitrarily).
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": k})
    return retriever


def retriever_tool(retriever):
    """Wrap ``retriever`` in an agent tool named ``search_xml_documents``.

    The description text is what the agent is shown when choosing between tools.
    """
    tool_name = "search_xml_documents"
    tool_description = (
        "Searches inside of our local store old Best Buy reviews.  "
        "Useful for understanding how people talk about products from the early 2010's era"
    )
    return create_retriever_tool(retriever, tool_name, tool_description)



.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!ls drive/MyDrive/CSC480-RAG

reviews_0001_24122_to_98772.xml      reviews_0024_5130266_to_5247642.xml
reviews_0002_98780_to_345240.xml     reviews_0025_5247643_to_5363104.xml
reviews_0003_345248_to_501823.xml    reviews_0026_5363109_to_5806838.xml
reviews_0004_501835_to_691133.xml    reviews_0027_5806868_to_6022633.xml
reviews_0005_691144_to_831905.xml    reviews_0028_6022641_to_6172272.xml
reviews_0006_831928_to_1117787.xml   reviews_0029_6172275_to_6358111.xml
reviews_0007_1117796_to_1382936.xml  reviews_0030_6358125_to_6396149.xml
reviews_0008_1382981_to_1597862.xml  reviews_0031_6396154_to_6423017.xml
reviews_0009_1597878_to_1906168.xml  reviews_0032_6423025_to_6486864.xml
reviews_0010_1906234_to_2063675.xml  reviews_0033_6486865_to_6587831.xml
reviews_0011_2063679_to_2151936.xml  reviews_0034_6587839_to_6701463.xml
reviews_0012_2151940_to_2284992.xml  reviews_0035_6701490_to_6801194.xml
reviews_0013_2285000_to_2440628.xml  reviews_0036_6801200_to_6889357.xml
reviews_0014_2440655_to_2799035.xml  reviews_0037_6

In [6]:

# Directory on the mounted Drive that contains the reviews_*.xml files.
review_dir = "/content/drive/MyDrive/CSC480-RAG/"


In [7]:
# Index a single review file (the glob matches only reviews_0001_24122_to_98772.xml)
# into a collection named "test", stored under /content/reviews.
retriever = build_vectorstore_from_xml_directory(
    path=review_dir,
    vectorstore_path='/content/reviews',
    loading_glob="**/reviews_0001_24122_to_98772.xml",
    collection_name="test",
)


100%|██████████| 1/1 [00:00<00:00,  1.02it/s]


Split 5000 documents into 5024 chunks
Embedding 1 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documents
Embedding 64 documen

In [8]:
# Expose the review retriever to the agent as the "search_xml_documents" tool.
review_tool = retriever_tool(retriever)

In [9]:
import os
from getpass import getpass
from langchain.agents import AgentExecutor, AgentType, load_tools, initialize_agent
from langchain.chat_models import ChatAnyscale
from langchain_openai import OpenAIEmbeddings

In [10]:
from google.colab import userdata

# Chat model client pointed at the Anyscale endpoint, using the Mixtral 8x7B instruct model.
# The API key is read through Colab's `userdata` under the name "ANYSCALE_API_KEY"
# instead of being written into the notebook.
llm = ChatAnyscale(anyscale_api_base="https://api.endpoints.anyscale.com/v1",
                   anyscale_api_key=userdata.get("ANYSCALE_API_KEY"),
                   model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
                   temperature=0.7,
                   verbose=True)

In [11]:
# Tool set for the agent: the built-in "llm-math" and "ddg-search" tools plus the local review search tool.
tools = load_tools(["llm-math", "ddg-search"], llm=llm) + [review_tool]

In [12]:
# Build a zero-shot ReAct agent over the tools, then print the prompt template it uses
# so the Thought/Action/Observation format is visible.
# NOTE(review): this cell's output includes a `warn_deprecated` warning — presumably
# raised by `initialize_agent`; confirm against the installed LangChain version.
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
print(agent.agent.llm_chain.prompt.template)

Answer the following questions as best you can. You have access to the following tools:

Calculator: Useful for when you need to answer questions about math.
duckduckgo_search: A wrapper around DuckDuckGo Search. Useful for when you need to answer questions about current events. Input should be a search query.
search_xml_documents: Searches inside of our local store old Best Buy reviews.  Useful for understanding how people talk about products from the early 2010's era

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [Calculator, duckduckgo_search, search_xml_documents]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}


  warn_deprecated(


In [13]:
# Ask an open-ended question; verbose=True on the agent prints each Thought/Action/Observation step.
agent.invoke("What were the best reviews from the dataset you were given?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to search the local store old Best Buy reviews to find the best reviews. I will use the `search_xml_documents` tool for this.

Action: search_xml_documents
Action Input: best reviews
Observation[0mEmbedding query: best reviews
Observation

Observation: [38;5;200m[1;3m{"id": "81520", "sku": "6844739", "reviewer": {"name": "Rahl"}, "aboutMe": null, "rating": "3.0", "title": "it was ok", "comment": "the film was good for the most part, the way they put it all together and how it all turned out in the end. i liked it but it did have its flaws and listing them would only ruin the movie for someone who intends on seeing it. its a realistic movie.", "submissionTime": "2007-09-25T12:00:38"}

{"id": "46208", "sku": "8199914", "reviewer": {"name": "ACDJ"}, "aboutMe": null, "rating": "5.0", "title": "awesome", "comment": "this is one of my favorite movies. the editorial is very biased, i think this movie deserves way better c

{'input': 'What were the best reviews from the dataset you were given?',
 'output': 'Based on the reviews found, some of the best movies from the dataset include those praised for their engaging plots, humor, realistic approach, and positive reception by the reviewers and their families.'}

In [None]:
# The agent appears to generate intermediate "Thought" steps describing how it will answer the question, as described in the class lecture.
# I found this incredibly similar to how a person would look something up on the internet.

In [14]:
# Follow-up question that refers back to the previous answer.
# NOTE(review): the agent was created without a memory argument and its prompt template only
# contains {input} and {agent_scratchpad}, so the earlier exchange is presumably not visible
# to it here — it has to search again rather than recall what it "found" before.
agent.invoke("So what is the best review of that which you found?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m The question refers to the best review of a product that I previously found using the `search_xml_documents` tool. I need to find that review again and return the relevant information.

Action: search_xml_documents
Action Input: best review product from the early 2010's
Observation[0mEmbedding query: best review product from the early 2010's
Observation

Observation: [38;5;200m[1;3m{"id": "75553", "sku": "8508572", "reviewer": {"name": "PinoOhki"}, "aboutMe": null, "rating": "4.0", "title": "Text you later", "comment": "Another must have accessory for the Xbox 360, makes entering data for texting and redemption codes easier and quicker. Also a great idea to have it attached to the controller, otherwise you would have to put the controller down and hunt for the message pad if it was independent. I don't know if Microsoft reads these review, but I do have a suggestion that I would like to see. Implementing the use of the me

{'input': 'So what is the best review of that which you found?',
 'output': 'The best review is for a product with SKU "8508572". The reviewer "PinoOhki" rated it 4.0 and titled it "Text you later". The review suggests that this product, likely an accessory for the Xbox 360, is a must-have and proposes a useful feature for future games.'}