In [None]:
from langchain.chains import RetrievalQA
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (python-dotenv); the Azure endpoint,
# deployment, API version and key used below are all read from os.environ.
load_dotenv()


In [None]:
# Chat model client for the Azure OpenAI deployment. Every connection detail
# comes from the environment; a missing variable is passed through as None.
llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
    deployment_name=os.getenv("DEPLOYMENT_NAME"),
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    model_name=os.getenv("MODEL_NAME"),
)


In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever

In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [None]:
# One loader per source text file.
loaders = [
    TextLoader("./paul_graham_essay.txt"),
    TextLoader("./state_of_the_union.txt"),
]

# Load every file and cut the result into the large "parent" chunks
# (chunk_size=10000) that the retriever will ultimately return.
# The comprehension replaces the manual accumulate loop and its ambiguous
# single-letter loop variable.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(
    [loaded_doc for loader in loaders for loaded_doc in loader.load()]
)

# Smaller chunks

In [None]:
import uuid

# Vector index for the small child chunks, embedded with the Azure
# "frm-ai-clust-embed" deployment.
vectorstore = Chroma(
    embedding_function=AzureOpenAIEmbeddings(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment="frm-ai-clust-embed",
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    ),
    collection_name="full_documents",
)

# In-memory key/value store that holds the parent documents.
store = InMemoryStore()
# Metadata key under which each child records the id of its parent.
id_key = "doc_id"

# Retriever wiring the two together; nothing has been indexed yet.
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# One random UUID per parent chunk, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]

In [None]:
# The splitter to use to create smaller chunks: chunk_size=400 here, versus
# chunk_size=10000 for the parent chunks created earlier.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [None]:
# Break each parent chunk into child chunks and tag every child with the id
# of the parent it came from (stored under `id_key` in its metadata).
sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    children = child_text_splitter.split_documents([parent_doc])
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs += children

In [None]:
# Embed and index the child chunks, then store each parent chunk in the
# docstore under its generated id (pairs of (doc_id, parent document)).
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# Vectorstore alone retrieves the small chunks; display the first result.
retriever.vectorstore.similarity_search("justice breyer")[0]

In [None]:
# Same query through the retriever: display the text of the first document
# it returns.
retriever.get_relevant_documents("justice breyer")[0].page_content

In [None]:
# Retriever returns larger chunks: character count of the first returned
# document's text.
len(retriever.get_relevant_documents("justice breyer")[0].page_content)

In [None]:
# Display the full text of the first document the retriever returns for the
# query (this issues the retrieval again).
retriever.get_relevant_documents("justice breyer")[0].page_content

# Summaries

In [None]:
import uuid

from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser

In [None]:
# Summarisation pipeline: take a document's text, drop it into the prompt,
# send it to the Azure chat model and return the reply as a plain string.
chain = (
    {"doc": lambda document: document.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | AzureChatOpenAI(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        model_name=os.getenv("MODEL_NAME"),
    )
    | StrOutputParser()
)

In [None]:
# Run the summarisation chain over every parent chunk, with max_concurrency
# set to 5; the result has one summary per entry of `docs`.
summaries = chain.batch(docs, {"max_concurrency": 5})

In [None]:
# Vector index for the generated summaries (not for child chunks), embedded
# with the Azure "frm-ai-clust-embed" deployment.
vectorstore = Chroma(
    embedding_function=AzureOpenAIEmbeddings(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment="frm-ai-clust-embed",
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    ),
    collection_name="summaries",
)

# Fresh in-memory store for the parent documents.
store = InMemoryStore()
# Metadata key linking a summary to its parent document.
id_key = "doc_id"

# New retriever over the summary index; empty until documents are added.
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# Fresh UUID for every parent chunk, aligned with `docs`.
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]

In [None]:
# Wrap each summary in a Document whose metadata points at the id of the
# parent chunk it summarises (summaries and doc_ids share the same order).
summary_docs = [
    Document(page_content=summary, metadata={id_key: parent_id})
    for parent_id, summary in zip(doc_ids, summaries)
]

In [None]:
# Index the summary documents in the vector store, then store the parent
# chunks in the docstore keyed by the same ids the summaries carry.
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# Query the summary index directly; `sub_docs` now holds summary documents
# (the name is reused from the child-chunk section).
sub_docs = vectorstore.similarity_search("justice breyer")

In [None]:
# First summary document returned by the similarity search.
sub_docs[0]

In [None]:
# Same query through the multi-vector retriever, which is backed by the
# docstore of parent chunks.
retrieved_docs = retriever.get_relevant_documents("justice breyer")

In [None]:
# Character count of the first retrieved document's text.
len(retrieved_docs[0].page_content)

In [None]:
# Full text of the first retrieved document.
retrieved_docs[0].page_content

# Hypothetical Queries

In [None]:
# Function-calling schema handed to the chat model: a single function,
# "hypothetical_questions", whose only (required) argument "questions" is an
# array of strings.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=dict(
            type="object",
            properties=dict(
                questions=dict(type="array", items=dict(type="string")),
            ),
            required=["questions"],
        ),
    )
]

In [None]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Question-generation pipeline: document text -> prompt -> chat model forced
# to call the "hypothetical_questions" function -> the "questions" value
# extracted from the function-call arguments.
chain = (
    {"doc": lambda document: document.page_content}
    # Only asking for 3 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        "Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
    )
    | AzureChatOpenAI(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        model_name=os.getenv("MODEL_NAME"),
    ).bind(
        function_call={"name": "hypothetical_questions"},
        functions=functions,
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [None]:
# Smoke-test the question-generation chain on the first parent chunk.
chain.invoke(docs[0])

In [None]:
# Run the question-generation chain over every parent chunk with
# max_concurrency set to 5; one result per entry of `docs`.
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

In [None]:
# Vector index for the hypothetical questions (not for child chunks),
# embedded with the Azure "frm-ai-clust-embed" deployment.
vectorstore = Chroma(
    embedding_function=AzureOpenAIEmbeddings(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment="frm-ai-clust-embed",
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    ),
    collection_name="hypo-questions",
)

# Fresh in-memory store for the parent documents.
store = InMemoryStore()
# Metadata key linking a question to its parent document.
id_key = "doc_id"

# New retriever over the question index; empty until documents are added.
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# Fresh UUID for every parent chunk, aligned with `docs`.
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]

In [None]:
# Flatten the per-chunk question lists into one list of Documents, each
# tagged with the id of the parent chunk the question was generated from.
question_docs = [
    Document(page_content=question, metadata={id_key: parent_id})
    for parent_id, questions in zip(doc_ids, hypothetical_questions)
    for question in questions
]

In [None]:
# Index the question documents in the vector store, then store the parent
# chunks in the docstore keyed by the same ids the questions carry.
retriever.vectorstore.add_documents(question_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# Query the question index directly; `sub_docs` now holds question documents.
sub_docs = vectorstore.similarity_search("justice breyer")

In [None]:
# Show all question documents returned by the similarity search.
sub_docs

In [None]:
# Same query through the multi-vector retriever, which is backed by the
# docstore of parent chunks.
retrieved_docs = retriever.get_relevant_documents("justice breyer")

In [None]:
# Character count of the first retrieved document's text.
len(retrieved_docs[0].page_content)

In [None]:
# Full text of the first retrieved document.
retrieved_docs[0].page_content

# CSV agent

In [None]:
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
import os
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from dotenv import load_dotenv

# Pull settings from a local .env file (python-dotenv) so the os.environ
# lookups in the CSV-agent cells below can find them.
load_dotenv()

In [None]:
import re

def clean_line(line):
    """Return `line` with every comma inside a double-quoted span removed.

    A quoted span is a `"` followed by any run of non-quote characters and a
    closing `"`. Commas outside such spans, and the quotes themselves, are
    left untouched. The function sees a single string at a time, so a quoted
    field that continues onto another line is not recognised as one span.
    """
    def _drop_commas(match):
        # match.group(0) is the whole quoted span, quotes included.
        return match.group(0).replace(',', '')

    return re.sub(r'"([^"]*)"', _drop_commas, line)

# Load the raw CSV as a list of physical lines.
with open('deniro.csv', encoding='utf-8') as file:
    lines = list(file)

# Strip commas from inside quoted spans, one line at a time.
cleaned_lines = list(map(clean_line, lines))

# Persist the cleaned lines to a separate file; the input file is untouched.
with open('deniro_cleaned.csv', mode='w', encoding='utf-8') as file:
    file.write(''.join(cleaned_lines))


In [None]:
# Agent that answers questions about the cleaned CSV using the Azure chat
# model; verbose=True prints the intermediate reasoning steps.
agent = create_csv_agent(
    AzureChatOpenAI(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        model_name=os.getenv("MODEL_NAME"),
    ),
    "./deniro_cleaned.csv",
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

In [None]:
# Ask the CSV agent a simple question as a sanity check.
agent.run("How many rows are there?")

# Putting it all together
The plan is as follows:
- Take the CSV, create different types of vector embeddings of its chunks, and store them in the vector store, each tagged with `id_key`
- Store the parent documents in the docstore once
- Wrap the retriever as a tool
- Pass this tool to the `extra_tools` parameter of `create_csv_agent` (the code below calls `create_pandas_dataframe_agent` with the same `extra_tools` argument)

In [1]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
from langchain.schema.document import Document
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain_experimental.agents.agent_toolkits import create_csv_agent, create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
import pandas as pd

import uuid

import os
from dotenv import load_dotenv

# Pull settings from a local .env file (python-dotenv) so the os.environ
# lookups in the cells below can find them.
load_dotenv()


True

In [2]:
import re

def clean_line(line):
    """Strip commas that occur inside double-quoted spans of `line`.

    Each `"..."` span (no quote characters in between) is rewritten without
    its commas; text outside the spans and the quotes themselves are
    returned unchanged. Only the single string passed in is examined, so a
    quoted field split across several lines is not treated as one span.
    """
    quoted_span = re.compile(r'"([^"]*)"')
    return quoted_span.sub(lambda span: span.group(0).replace(',', ''), line)

# Load the raw RFI CSV as a list of physical lines.
with open('rfi.csv', encoding='utf-8') as file:
    lines = list(file)

# Strip commas from inside quoted spans, one line at a time.
cleaned_lines = list(map(clean_line, lines))

# Persist the cleaned lines to a separate file; the input file is untouched.
with open('rfi_cleaned.csv', mode='w', encoding='utf-8') as file:
    file.write(''.join(cleaned_lines))


In [3]:
# Create the loader for the cleaned RFI CSV; the documents themselves are
# produced by the `load_and_split` call in a later cell.
loader = UnstructuredCSVLoader(file_path='./rfi_cleaned.csv')


In [4]:
# Splitter built with no arguments, i.e. the library's default settings.
text_splitter = RecursiveCharacterTextSplitter()

In [5]:
# Load the CSV through the loader and split it with `text_splitter`; these
# chunks are the parent documents for the rest of this section.
docs = loader.load_and_split(text_splitter)

In [6]:
# Number of chunks produced from the CSV.
len(docs)

3

In [7]:
# Summarisation pipeline for the RFI chunks: document text -> prompt ->
# Azure chat model -> plain-string summary.
chain = (
    {"doc": lambda document: document.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | AzureChatOpenAI(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        model_name=os.getenv("MODEL_NAME"),
    )
    | StrOutputParser()
)

In [8]:
# Summarise every RFI chunk with max_concurrency set to 5; one summary per
# entry of `docs`.
summaries = chain.batch(docs, {"max_concurrency": 5})

In [9]:
# Inspect the generated summaries.
summaries

['The document is a request for information from Uber as part of its 2023 Media Agency Review. The document reveals information about the media agency, MediaMonks Inc, and its relevant experience. The agency, registered in 2001 and incorporated in Delaware, US, describes its extensive experience within the tech sector, including working with technology clients across various subverticals. The agency also emphasizes its agility, ability to drive adoption of delivery apps, and its analytics practice. The agency also develops first-to-market technologies like their Inflection Engine. They believe these experiences will be beneficial for Uber.\n\nTheir three key clients relevant to Uber are PayPal, Shopify, and Netflix. However, the document does not explain why these clients are relevant to Uber. The document does not provide any information on how the agency manages conflict and confidentiality, especially when working with key competitors.',
 "Media.Monks's approach to conflict and conf

In [10]:
# The vectorstore that indexes the RFI summaries.
# It uses its own collection name: the earlier "Summaries" section of this
# notebook already creates a Chroma collection called "summaries", and
# Chroma's default in-process client appears to share collections by name
# within one kernel (TODO confirm for the installed chromadb version).
# Reusing that name on a top-to-bottom run would mix the essay summaries
# into this index.
vectorstore = Chroma(
    collection_name="rfi_summaries",
    embedding_function=AzureOpenAIEmbeddings(
        azure_endpoint=os.environ.get("AZURE_ENDPOINT"),
        deployment="frm-ai-clust-embed",
        openai_api_version=os.environ.get("OPENAI_API_VERSION"),
        openai_api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    ),
)

# The storage layer for the parent documents
store = InMemoryStore()
# Metadata key linking each summary to its parent document
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# One random UUID per parent chunk, in the same order as `docs`
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [11]:
# Wrap each RFI summary in a Document whose metadata carries the id of the
# parent chunk it summarises (summaries and doc_ids share the same order).
summary_docs = [
    Document(page_content=summary, metadata={id_key: parent_id})
    for parent_id, summary in zip(doc_ids, summaries)
]

In [12]:
# Inspect the summary documents and their doc_id metadata.
summary_docs

[Document(page_content='The document is a request for information from Uber as part of its 2023 Media Agency Review. The document reveals information about the media agency, MediaMonks Inc, and its relevant experience. The agency, registered in 2001 and incorporated in Delaware, US, describes its extensive experience within the tech sector, including working with technology clients across various subverticals. The agency also emphasizes its agility, ability to drive adoption of delivery apps, and its analytics practice. The agency also develops first-to-market technologies like their Inflection Engine. They believe these experiences will be beneficial for Uber.\n\nTheir three key clients relevant to Uber are PayPal, Shopify, and Netflix. However, the document does not explain why these clients are relevant to Uber. The document does not provide any information on how the agency manages conflict and confidentiality, especially when working with key competitors.', metadata={'doc_id': '19

In [13]:
# Index the summary documents in the vector store, then store the parent
# chunks in the docstore keyed by the same ids the summaries carry.
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [14]:
# Expose the multi-vector retriever to the agent as a named tool; the
# description is what the model sees when deciding whether to call it.
tool = create_retriever_tool(
    retriever,
    name="search_rfi_docs",
    description="Searches and returns RFI documents that are relevant to generate Request For Proposal (RFP)",
)
# The agent constructor takes its extra tools as a list.
tools = [tool]

In [15]:
# Tabular view of the cleaned RFI CSV for the dataframe agent.
df = pd.read_csv("./rfi_cleaned.csv")

# Dataframe agent on the Azure chat model, extended with the RFI retriever
# tool so it can also search the indexed documents.
agent = create_pandas_dataframe_agent(
    AzureChatOpenAI(
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        model_name=os.getenv("MODEL_NAME"),
    ),
    df,
    extra_tools=tools,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

In [17]:
# Instruction text placed in front of the question. It is a regular (non-raw)
# triple-quoted string, so the trailing "\n" on its last line is an actual
# newline character.
prefix = """
You are an advanced RFP document generation system specifically designed for 'TheoremOne.' 
You should be capable of analyzing and extracting key information from RFI documents using appropriate tools. 
The extracted information should then be used to formulate a comprehensive RFP document. 
The RFP should be structured in a clear and concise Q&A format, ensuring that it addresses all critical aspects relevant to TheoremOne's needs and objectives. 
You should prioritize clarity, relevance, and completeness in the RFP document. 
The output should be easily navigable, well-organized, and formatted for professional presentation.
Use any tools you need to accomplish the task highlighted below and only mention the output and nothing else:-\n
"""
# The full prompt is the instructions followed by one RFP question.
prompt = prefix + """How does your agency define innovation in media and how does this influence your approach to planning? Show us one example of truly innovative work that your agency has led """
# Run the agent on the prompt; the answer text is read from response["output"]
# in the following cell.
response = agent.invoke(prompt)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_rfi_docs` with `{'query': 'How does your agency define innovation in media and how does this influence your approach to planning? Show us one example of truly innovative work that your agency has led'}`


[0m

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


[33;1m[1;3m[Document(page_content="UBER - Media Agency Review 2023 - Request for Information\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nQuestion\nSub Question (if necessary)\nAgency Answer (please submit all answers within the yellow fields below)\n\n\n\nAgency Details\nCompany Name:\n\nMediaMonks Inc\n\n\n\n\nYear Company Registered:\n\n2001\n\n\n\n\nCentral/ Head office address:\n\n1214 Abbott Kinney Blvd\\nVenice, CA 90291\n\n\n\n\nCountry of Incorporation:\n\nUS - Delaware\n\n\n\nCategory/Relevant Experience\nPlease detail your experience within Uber's category (e.g. technology ride share delivery)? What from this experience do you think will be relevant and of benefit for Uber - (max. 300 words)\n\nAs an organization, Media.Monks has developed a key focus on the tech sector, with a majority of our revenue across all practices coming from leading, global technology companies. This experience allows us to bring a unique point of view to Uber's business from a series key vantage points:\\n\\nFir

In [18]:
"""How does your agency define innovation in media and how does this influence your approach to planning? Show us one example of truly innovative work that your agency has led """
print(response["output"])

From the RFI documents, the agency MediaMonks Inc, defines innovation in media through the development of leading, first-to-market technologies that create efficiencies not just for themselves, but for their advertiser clients as well. Their approach to planning is influenced by anticipating changes in consumer sentiment and analyzing competitive behavior to inform and influence media strategies.

An example of their truly innovative work is the Inflection Engine. It is a proprietary tool that uniquely captures signals - both macro and behavioral, that anticipate changes in consumer sentiment and analyze competitive behavior to inform and influence media strategies. 

This approach and example demonstrate how MediaMonks Inc. not only engages with clients but also how they build best-in-class technologies to drive results. They recognize the need to move and adapt at the pace of change, and bring technologies that will help accelerate decisions and drive performance for their clients.
