# RAG Implementation with Azure OpenAI and LangChain

In [1]:
!pip install langchain-community langchainhub langchain-openai chromadb langchain langchain-experimental --quiet

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tavily-python 0.3.1 requires tiktoken==0.5.2, but you have tiktoken 0.7.0 which is incompatible.
steamship 2.17.34 requires tiktoken~=0.4.0, but you have tiktoken 0.7.0 which is incompatible.
llama-index 0.9.48 requires requests>=2.31.0, but you have requests 2.28.2 which is incompatible.
llama-index-legacy 0.9.48 requires requests>=2.31.0, but you have requests 2.28.2 which is incompatible.
llama-index-core 0.10.18.post1 requires requests>=2.31.0, but you have requests 2.28.2 which is incompatible.
langflow 0.6.10 requires pydantic<3.0.0,>=2.6.0, but you have pydantic 1.10.15 which is incompatible.
langflow 0.6.10 requires tiktoken<0.7.0,>=0.6.0, but you have tiktoken 0.7.0 which is incompatible.


In [2]:
!pip install pypdf faiss-cpu --quiet

  error: subprocess-exited-with-error
  
  × Building wheel for jq (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [5 lines of output]
      running bdist_wheel
      running build
      running build_ext
      Executing: ./configure CFLAGS=-fPIC -pthread --disable-maintainer-mode --with-oniguruma=builtin
      error: [WinError 2] The system cannot find the file specified
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for jq
ERROR: Could not build wheels for jq, which is required to install pyproject.toml-based projects


In [1]:
import os

# Connection settings for the Azure OpenAI resource.
# The key and endpoint are read from the environment when present, so real
# credentials never have to be saved inside the notebook. The placeholder
# values are only a fallback and must be overridden before calling the service.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "xxxxxxxxxxxxxxxxxxxxxxxx")
api_version = "2023-07-01-preview"  # alternative noted by the author: "2023-05-15"
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://xxxxxxxxx.openai.azure.com/")

# Azure deployment names for the chat model and the embedding model.
model_name = "gpt-35-turbo"
embedding_model = "text-embedding-ada-002"

In [3]:
import os
# Publish the Azure OpenAI settings as environment variables. The Azure clients
# created later are constructed without explicit credentials, so presumably they
# read these variables — confirm against the langchain_openai documentation.
os.environ.update(
    {
        "OPENAI_API_VERSION": api_version,
        "AZURE_OPENAI_ENDPOINT": azure_endpoint,
        "AZURE_OPENAI_API_KEY": api_key,
    }
)

In [4]:
# Source PDFs (remote URLs) that make up the knowledge base.
doc_paths = [
    "https://www.morningstar.com/content/dam/marketing/shared/research/methodology/771945_Morningstar_Rating_for_Funds_Methodology.pdf",
    "https://www.morningstar.in/docs/methodology/CategoryDefinitionsIndiaV3.pdf",
    "https://s21.q4cdn.com/198919461/files/doc_downloads/press_kits/2016/Morningstar-Sustainable-Investing-Handbook.pdf",
]

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# One PDF loader per source URL, with image extraction switched off.
loaders = [PyPDFLoader(url, extract_images=False) for url in doc_paths]

# Load every PDF in order and flatten the per-file results into a single list.
docs = [page for pdf_loader in loaders for page in pdf_loader.load()]

In [6]:
# Number of documents loaded across all PDFs.
len(docs)

46

In [7]:
# Keep only pages whose stripped text is longer than 100 characters; this removes
# near-empty pages (e.g. header pages, empty separator pages).
docs = list(filter(lambda page: len(page.page_content.strip()) > 100, docs))
len(docs)

45

In [8]:
# Average number of characters per document, over the documents that remain.
page_lengths = [len(page.page_content) for page in docs]
sum(page_lengths) / len(page_lengths)

2665.911111111111

In [9]:
# Each PDF page is one document at this point. Split them further so that a chunk
# holds at most 3500 characters, using an overlap of 500 characters while splitting.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=500,
    chunk_size=3500,
)
splits = text_splitter.split_documents(docs)
len(splits)

53

In [10]:
# Spot-check the text of one chunk (the third one).
print(splits[2].page_content)

©2021 Morningstar, Inc. All rights reserved. The information in this document is the property of Morningstar, Inc. Reproduction or transcription by any means, in whole or in part, without the prior written 
consent of Morningstar, Inc., is prohibited.
 The Morningstar RatingTM for Funds    August 2021 Page 3 of 21
3
3
3bond funds domiciled in Europe against other European high-yield bond funds. For more information 
about available categories, please contact your local Morningstar office.
Style Profiles
A style profile may be considered a summary of a fund’s risk-factor exposures. Fund categories 
define groups of funds whose members are similar enough in their risk-factor exposures that return 
comparisons between them are useful.
The risk factors on which fund categories are based can relate to value-growth orientation; 
capitalization; industry sector, geographic region, and country weights; duration and credit quality; 
historical return volatility; beta; and many other investment 

In [11]:
from langchain_openai import AzureOpenAIEmbeddings
# Embedding client bound to the Azure deployment named by `embedding_model`.
# No endpoint or key is passed here — presumably they come from the environment
# variables set earlier; TODO confirm.
embeddings = AzureOpenAIEmbeddings(azure_deployment=embedding_model)

In [30]:
# Disabled alternative: FAISS vector store with HuggingFace BGE embeddings (nothing below uses ChromaDB)
#from langchain_community.vectorstores import FAISS
#from langchain.embeddings import HuggingFaceBgeEmbeddings
#embedding_model_name = "BAAI/bge-large-en-v1.5"
#embeddings = HuggingFaceBgeEmbeddings(  model_name=embedding_model_name,)



In [12]:
from langchain_community.vectorstores import FAISS
# Embed every chunk with the embedding model and index the vectors in FAISS
# (an in-memory vector DB).
vectorstore = FAISS.from_documents(embedding=embeddings, documents=splits)

# Wrap the index in a retriever that runs a similarity search with k=2.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 2},
    search_type="similarity",
)

In [13]:
# Sanity-check the retriever with a sample question and count the hits.
sample_query = "What is Large Cap equity fund"
retrieved_docs = retriever.invoke(sample_query)
len(retrieved_docs)

2

In [14]:
# Text of the first retrieved document.
print(retrieved_docs[0].page_content)

?  
 
 
 
 
 
 
 
 
 
 
 Categor y Definitions  
India 
Equity  
 
Large-Cap 
Large-Cap funds primarily consist of stocks which a re the Top 100 stocks by full market capitalization  of 
the equity market. These funds invest at least 80% of total assets in Indian equities and the balance can 
be invested in other asset classes such as fixed in come and overseas equities, among others. Funds in 
this category would invest at least 80% of their to tal assets in large-cap stocks. 
Morningstar Category Index: S&P BSE 100 TR 
 
Mid-Cap 
Mid-Cap funds primarily consist of stocks ranked 10 1st to 250th by full market capitalization of the 
equity market. These funds invest at least 65% of t otal assets in Indian equities, and the balance can  be 
invested in other asset classes such as fixed incom e and overseas equities, among others. Funds in thi s 
category would invest at least 65% of their total a ssets in mid-cap stocks. 
Morningstar Category Index: S&P BSE Mid Cap TR 
 
Small-Cap 
Smal

In [15]:
# Text of the second retrieved document.
print(retrieved_docs[1].page_content)

©2019 Morningstar, Inc. All rights reserved. The in formation in this document is the property of Morni ngstar, Inc. Reproduction or transcription by any m eans, in whole or part, without  
the prior written consent of Morningstar, Inc., is prohibited. Category Definitions , India  | 26 February 2021  Page 2 of 12  
Multi- Cap 
Multi-Cap funds invest at least 75% of their total assets in Indian equities, and the balance can be 
invested in other asset classes such as fixed incom e and overseas equities, among others. These funds 
will invest a minimum of 25% each in Large Cap, Mid  Cap and Small Cap stocks. 
Morningstar Category Index: S&P BSE 500 TR 
 
 
Large & Mid- Cap 
Large & Mid-Cap funds primarily consist of stocks w hich are the Top 250 stocks by full market 
capitalization of the equity market. These funds in vest at least 70% of total assets in Indian equitie s and 
the balance can be invested in other asset classes such as fixed income and overseas equities, among 
others. F

### Implementing RAG Chain

In [16]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt text for the RAG chain. `{question}` and `{context}` are template
# placeholders that are filled in when the chain runs.
message = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""

# Wrap the text as a single human-role chat message.
prompt = ChatPromptTemplate.from_messages([("human", message)])



In [17]:
from langchain_openai import AzureChatOpenAI
# Chat model bound to the Azure deployment named by `model_name`.
llm = AzureChatOpenAI(deployment_name=model_name)


def format_docs(retrieved):
    """Join the text of the retrieved documents into one context string.

    Args:
        retrieved: Iterable of documents exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines ("" when nothing was retrieved).
    """
    return "\n\n".join(doc.page_content for doc in retrieved)


# The incoming question goes to the retriever, whose hits are flattened to plain
# text so the prompt receives the page text rather than the repr of Document
# objects (metadata included). The same question is passed through unchanged as
# `question`. The filled prompt is sent to the chat model, and the chain returns
# the model's message object.
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt | llm


In [18]:
# Ask the RAG chain a question and show only the text of the model's reply.
question = "tell me about mid cap market"
response = rag_chain.invoke(question)

print(response.content)

The mid-cap market consists of stocks ranked 101st to 250th by full market capitalization of the equity market. Mid-cap funds invest at least 65% of total assets in Indian equities, and the balance can be invested in other asset classes such as fixed income and overseas equities, among others. Funds in this category would invest at least 65% of their total assets in mid-cap stocks.


### Implementing RAG Agent

### Creating retriever Tool

In [19]:
from langchain.tools.retriever import create_retriever_tool

# Expose the retriever to the agent as a named tool; the description is the text
# the model sees when deciding whether to call it.
tool = create_retriever_tool(
    retriever=retriever,
    name="searchCapitalMarket",
    description="Searches and returns excerpts about trading stocks markets shares capital markets, DO not use it for any other info than that of capital market/ finance questions",
)
tools = [tool]

In [20]:
from langchain import hub

# Fetch the agent prompt from the LangChain hub.
# NOTE: this rebinds `prompt`, which earlier held the RAG chat template; re-running
# the RAG-chain cell after this one would pick up the agent prompt instead.
prompt = hub.pull("hwchase17/openai-tools-agent")
# Show the message templates that make up the pulled prompt.
prompt.messages


[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [21]:
# NOTE(review): initialize_agent, load_tools and AgentType are not referenced in
# the visible cells — confirm whether they are still needed.
from langchain.agents import initialize_agent, load_tools, AgentType
from langchain_openai import AzureChatOpenAI
# Chat model bound to the Azure deployment named by `model_name` (same
# construction as in the earlier RAG-chain cell).
llm = AzureChatOpenAI(deployment_name=model_name)


In [22]:
from langchain.agents import AgentExecutor, create_openai_tools_agent

# Build the tool-calling agent from the chat model, the retriever tool and the hub prompt.
agent = create_openai_tools_agent(llm=llm, tools=tools, prompt=prompt)

# Executor that runs the agent with the same tools; verbose output is enabled and
# parsing errors are handled by the executor instead of being raised.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
)

In [23]:
# Send a plain greeting to the agent and show the full result dictionary.
greeting = {"input": "Hi, I am Anshu"}
result = agent_executor.invoke(greeting)
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mHello Anshu! How can I assist you today?[0m

[1m> Finished chain.[0m
{'input': 'Hi, I am Anshu', 'output': 'Hello Anshu! How can I assist you today?'}


In [24]:
# Ask a capital-market question and show the full result dictionary.
domain_question = {"input": "What is large cap market?"}
result = agent_executor.invoke(domain_question)
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `searchCapitalMarket` with `{'query': 'large cap market'}`


[0m[36;1m[1;3m[Document(page_content='?  \n \n \n \n \n \n \n \n \n \n \n Categor y Definitions  \nIndia \nEquity  \n \nLarge-Cap \nLarge-Cap funds primarily consist of stocks which a re the Top 100 stocks by full market capitalization  of \nthe equity market. These funds invest at least 80% of total assets in Indian equities and the balance can \nbe invested in other asset classes such as fixed in come and overseas equities, among others. Funds in \nthis category would invest at least 80% of their to tal assets in large-cap stocks. \nMorningstar Category Index: S&P BSE 100 TR \n \nMid-Cap \nMid-Cap funds primarily consist of stocks ranked 10 1st to 250th by full market capitalization of the \nequity market. These funds invest at least 65% of t otal assets in Indian equities, and the balance can  be \ninvested in other asset classes such as fixed incom