In [1]:
!pip install langchain | tail -n 1

!pip install langchain-ibm | tail -n 1

!pip install langchain-community | tail -n 1

!pip install ibm-watsonx-ai | tail -n 1

!pip install ibm_watson_machine_learning | tail -n 1

!pip install chromadb | tail -n 1

!pip install tiktoken | tail -n 1

!pip install python-dotenv | tail -n 1

!pip install bs4 | tail -n 1

!pip install pypdf | tail -n 1



In [2]:
import os

from langchain_ibm import WatsonxEmbeddings, WatsonxLLM

from langchain.vectorstores import Chroma

from langchain_community.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from langchain.prompts import PromptTemplate

from langchain.tools import tool

from langchain.tools.render import render_text_description_and_args

from langchain.agents.output_parsers import JSONAgentOutputParser

from langchain.agents.format_scratchpad import format_log_to_str

from langchain.agents import AgentExecutor

from langchain.memory import ConversationBufferMemory

from langchain_core.runnables import RunnablePassthrough

from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams

from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes

In [3]:
llm = WatsonxLLM(

    model_id="ibm/granite-3-8b-instruct",

    url="https://us-south.ml.cloud.ibm.com",

    apikey="oQBrX7C8IauX85QIVPvct9ROUGdy0SArQM5qfUjmsQGV",

    project_id="41c124f4-93e9-4c59-8ce3-659034693834",

    params={

        GenParams.DECODING_METHOD: "greedy",

        GenParams.TEMPERATURE: 0,

        GenParams.MIN_NEW_TOKENS: 5,

        GenParams.MAX_NEW_TOKENS: 500,

        GenParams.STOP_SEQUENCES: ["Human:", "Observation"],

    },

)

In [4]:
template = "Answer the {query} accurately. If you do not know the answer, simply say you do not know."

prompt = PromptTemplate.from_template(template)

In [5]:
agent = prompt | llm

In [6]:
# List all PDF files in the current directory
pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]

# Load each PDF file using PyPDFLoader
docs = [PyPDFLoader(pdf_file).load() for pdf_file in pdf_files]

# Flatten the list (if each load returns a list of documents)
docs_list = [doc for sublist in docs for doc in sublist]

# Print the first document
print(docs_list[0])

page_content='S
a
m
p
l
e
 
 MEMORANDUM OF UNDERSTANDING 
 THIS MEMORANDUM OF UNDERSTANDING ("MOU") is entered into this 31st day of March, 
 2017 (the "Effective Date") by and between Tally  Inc. ("Tally ") and BuyCo Limited  ("BuyCo"). 
 This MOU confirms the mutual understandings of previous discussions between the parties 
 with respect to the distribution of Products by Tally 's channel partners to the customers within  ARC 
 previously served by Channel Partners, namely, ABC, XYZ, and DMXZ ("Designated  Channel 
 Partners"), as defined in the Amended and Restated BuyCo Distribution Agreement,  effective 
 December 21, 2005, between the parties hereto (the "Distribution Agreement"). The parties  have 
 agreed as follows. 
 1.  Binding  . All terms and conditions expressed in  this MOU are the agreement made in the 
 course of negotiation and shall be executed by the parties in good faith. 
 2.  Limited Distribution by Tally  to BuyCo ARC Accounts  .  Notwithstanding the provisions

In [7]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)

In [8]:
doc_splits = text_splitter.split_documents(docs_list)

In [9]:
embeddings = WatsonxEmbeddings(

    model_id=EmbeddingTypes.IBM_SLATE_30M_ENG.value,

    url="https://us-south.ml.cloud.ibm.com",

    apikey="oQBrX7C8IauX85QIVPvct9ROUGdy0SArQM5qfUjmsQGV",

    project_id="41c124f4-93e9-4c59-8ce3-659034693834",

)

In [10]:
vectorstore = Chroma.from_documents(

    documents=doc_splits,

    collection_name="agentic-rag-chroma",

    embedding=embeddings,

)

In [11]:
retriever = vectorstore.as_retriever()

In [12]:
def retrieve_context(query: str, k: int = 5) -> str:
    # Retrieve the top documents relevant to the query
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(query)
    # Select only the top-k documents
    top_k_docs = docs[:k]
    # Concatenate the text from each document (using a double newline as separator)
    context = "\n\n".join(doc.page_content for doc in top_k_docs)
    return context

In [13]:
@tool
def agreement_qa(question: str) -> str:
    """Answers questions about the agreement based on the provided question."""
    context = retrieve_context(question, k=5)
    prompt = (
        f"You are an expert in analyzing legal agreements. Based on the following excerpts from the agreement, "
        f"provide a clear and concise answer to the question: '{question}'. If the context does not contain sufficient "
        f"information to answer the question, say 'The provided agreement excerpts do not contain enough information to answer this question.' "
        f"Do not make assumptions beyond the given context.\n\n"
        f"Context: {context}"
    )
    response = llm.invoke(prompt)
    return response.strip()


@tool
def compliance_obligations_list() -> str:
    """Lists compliance obligations for each party in JSON format."""
    query = "obligations, duties, responsibilities of each party"
    context = retrieve_context(query, k=15)  # Reduced context for focus

    prompt = (
        f"Analyze the agreement excerpts and extract compliance obligations. "
        f"Strictly Return a JSON array with objects containing 'party' and 'obligations'. Example:\n"
        f'[{{"party": "Tally", "obligations": ["obligation 1", "obligation 2"]}}]\n\n'
        f"1. Output ONLY valid JSON array\n"
        f"2. Max 2 obligations per party\n"
        f"3. Use exact party names from context\n"
        f"4. Remove section numbers\n"
        f"5. Exclude definitions\n\n"
        f"Excerpts: {context}"
    )

    response = llm.invoke(prompt)
    return response.strip()


@tool
def key_events() -> str:
    """Extracts key dates and events from the agreement in JSON format."""
    query = "effective date, termination date, deadline, renewal, notice period, timeline, YYYY-MM-DD"
    context = retrieve_context(query, k=30)
    prompt = (
        f"Extract key dates/events with these rules:\n"
        f"1. Only include dates in YYYY-MM-DD format\n"
        f"2. Exclude relative time periods like 'within X days'\n"
        f"3. Output format: [{{'date': 'YYYY-MM-DD', 'event': 'description'}}]\n"
        f"4. If no valid dates, return empty list\n\n"
        f"Excerpts: {context}"
    )
    response = llm.invoke(prompt)
    return response.strip()

@tool
def summarize_agreement() -> str:
    """Provides a concise summary of the agreement."""
    query = "summary of the agreement"
    context = retrieve_context(query, k=20)
    prompt = (
        f"You are an expert in summarizing legal agreements. Provide a concise and accurate summary of the following excerpts from the agreement in 2-3 sentences. "
        f"Focus on the main purpose, key parties, and primary obligations or terms. Avoid including unnecessary details or speculation beyond the given context.\n\n"
        f"Excerpts: {context}"
    )
    response = llm.invoke(prompt)
    return response.strip()

tools = [
    agreement_qa,
    compliance_obligations_list,
    key_events,
    summarize_agreement
]

In [14]:
system_prompt = """Respond to the human as helpfully and accurately as possible. You have access to the following tools: {tools}

Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input). For tools that do not require input, use an empty object ({{}}) as action_input.

Valid "action" values: "Final Answer" or {tool_names}

Provide only ONE action per $JSON_BLOB, as shown:"

```

{{

"action": $TOOL_NAME,

"action_input": $INPUT

}}

```

Follow this format:

Question: input question to answer

Thought: consider previous and subsequent steps

Action:

```

$JSON_BLOB

```

Observation: action result

... (repeat Thought/Action/Observation N times)

Thought: I know what to respond

Action:

```

{{

"action": "Final Answer",

"action_input": "Some text"

}}

Begin! Reminder to ALWAYS respond with a valid json blob of a single action.

Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation"""

In [15]:
# human_prompt = """{input}

# {agent_scratchpad}

# [REMINDERS]
# 1. For JSON outputs: return raw JSON strings, NO formatting or explanations
# 2. For key_events: EXCLUDE relative dates like "within X days"
# 3. Only use YYYY-MM-DD dates from the agreement
# 4. Respond ONLY with valid JSON!"""
human_prompt = """{input}

{agent_scratchpad}

(reminder to always respond in a JSON blob)"""

In [16]:
prompt = ChatPromptTemplate.from_messages(

    [

        ("system", system_prompt),

        MessagesPlaceholder("chat_history", optional=True),

        ("human", human_prompt),

    ]

)

In [17]:
prompt = prompt.partial(

    tools=render_text_description_and_args(list(tools)),

    tool_names=", ".join([t.name for t in tools]),

)

In [18]:
memory = ConversationBufferMemory()

  memory = ConversationBufferMemory()


In [19]:
chain = ( RunnablePassthrough.assign(

    agent_scratchpad=lambda x: format_log_to_str(x["intermediate_steps"]),

    chat_history=lambda x: memory.chat_memory.messages,

    )

    | prompt | llm | JSONAgentOutputParser())



agent_executor = AgentExecutor(agent=chain, tools=tools, handle_parsing_errors=True, verbose=True, memory=memory)

In [20]:
agent_executor.invoke({"input": 'Can you tell me the service fees?'})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m

Thought: The user is asking about service fees, which are typically found in the agreement. I will use the summarize_agreement tool to find this information.

Action:

```

{

"action": "summarize_agreement",

"action_input": {}

}

```

Observation[0m

  docs = retriever.get_relevant_documents(query)


[36;1m[1;3mCustomers 
 for such Product; 
 (d) for Combined Products with equal to or greater than fifty percent  (50%) and less than sixty-five 
 percent (65%) Tally  Content, the Service Fee rate shall be equal to 2.5% of the Tally 's direct selling 
 price to customer if the Product is not sold through a Channel Partner or Tally  Channel Partners' 
 actual distributor cost for the BuyCo ARC  Customers for such Product; 
 (e) for Combined Products with equal to or greater than thirty percent  (30%) and less than fifty 
 percent (50%) Tally  Content, the Service Fee rate shall be equal to 1.5% of the Tally 's direct selling 
 price to customer if the Product is not sold through a Channel Partner or Tally  Channel Partners' 
 actual distributor cost for the BuyCo ARC  Customers for such Product; 
 (f) for Combined Products with equal to or greater than ten percent  (10%) and less than thirty 
 percent (30%) Tally  Content, the Service Fee rate shall be equal to 1.0% of the Tally 's d

{'input': 'Can you tell me the service fees?',
 'history': '',
 'output': "The service fees are as follows:\n\n- For Combined Products with equal to or greater than fifty percent (50%) and less than sixty-five percent (65%) Tally Content, the Service Fee rate is 2.5% of the Tally's direct selling price to customer or Tally Channel Partners' actual distributor cost for the BuyCo ARC Customers for such Product.\n- For Combined Products with equal to or greater than thirty percent (30%) and less than fifty percent (50%) Tally Content, the Service Fee rate is 1.5% of the Tally's direct selling price to customer or Tally Channel Partners' actual distributor cost for the BuyCo ARC Customers for such Product.\n- For Combined Products with equal to or greater than ten percent (10%) and less than thirty percent (30%) Tally Content, the Service Fee rate is 1.0% of the Tally's direct selling price to customer or Tally Channel Partners' actual distributor cost for the BuyCo ARC Customers for such 

In [21]:
agent_executor.invoke({"input": 'Summarize the document'})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m

```

{

"action": "summarize_agreement",

"action_input": {}

}

```

Observation[0m[36;1m[1;3mCustomers 
 for such Product; 
 (d) for Combined Products with equal to or greater than fifty percent  (50%) and less than sixty-five 
 percent (65%) Tally  Content, the Service Fee rate shall be equal to 2.5% of the Tally 's direct selling 
 price to customer if the Product is not sold through a Channel Partner or Tally  Channel Partners' 
 actual distributor cost for the BuyCo ARC  Customers for such Product; 
 (e) for Combined Products with equal to or greater than thirty percent  (30%) and less than fifty 
 percent (50%) Tally  Content, the Service Fee rate shall be equal to 1.5% of the Tally 's direct selling 
 price to customer if the Product is not sold through a Channel Partner or Tally  Channel Partners' 
 actual distributor cost for the BuyCo ARC  Customers for such Product; 
 (f) for Combined Products with equal to o

{'input': 'Summarize the document',
 'history': "Human: Can you tell me the service fees?\nAI: The service fees are as follows:\n\n- For Combined Products with equal to or greater than fifty percent (50%) and less than sixty-five percent (65%) Tally Content, the Service Fee rate is 2.5% of the Tally's direct selling price to customer or Tally Channel Partners' actual distributor cost for the BuyCo ARC Customers for such Product.\n- For Combined Products with equal to or greater than thirty percent (30%) and less than fifty percent (50%) Tally Content, the Service Fee rate is 1.5% of the Tally's direct selling price to customer or Tally Channel Partners' actual distributor cost for the BuyCo ARC Customers for such Product.\n- For Combined Products with equal to or greater than ten percent (10%) and less than thirty percent (30%) Tally Content, the Service Fee rate is 1.0% of the Tally's direct selling price to customer or Tally Channel Partners' actual distributor cost for the BuyCo ARC

In [22]:
full_text = " ".join([doc.page_content for doc in docs_list])

In [23]:
import json

In [24]:
key_events_prompt = """
Given the following text extracted from a legal document, identify key events with their dates and descriptions. Return the result as a JSON list of objects, where each object has a "date" (in YYYY-MM-DD format) and an "event" (a descriptive string). Focus on dates explicitly mentioned and significant actions like signatures or agreement terms.

Text:
{full_text}

Output format:
[
    {{"date": "YYYY-MM-DD", "event": "description"}},
    ...
]
"""
formatted_key_events_prompt = key_events_prompt.format(full_text=full_text)
key_events_response = llm.generate([formatted_key_events_prompt])  # Returns LLMResult
# Extract the text from the first generation
key_events_text = key_events_response.generations[0][0].text
try:
    key_events_json = json.loads(key_events_text)  # Parse the text as JSON
    print(json.dumps(key_events_json, indent=2))
except json.JSONDecodeError as e:
    print(f"Error decoding key events JSON: {e}")
    print("Raw response:", key_events_text)

[
  {
    "date": "2017-03-31",
    "event": "Memorandum of Understanding entered into between Tally Inc. and BuyCo Limited"
  },
  {
    "date": "2006-07-01",
    "event": "Limited distribution of Products by Tally to BuyCo ARC Accounts begins"
  },
  {
    "date": "2017-09-30",
    "event": "Service Fee for Designated BuyCo ARC Customers ends"
  },
  {
    "date": "2017-10-01",
    "event": "Service Fee rate for all Product reduces to 1.0%"
  },
  {
    "date": "2017-12-31",
    "event": "Reduced Service Fee rate of 1.0% continues until the earlier of this date or the date on which BuyCo's Aggregate Ownership Interest falls below 12.5%"
  },
  {
    "date": "2006-07-01",
    "event": "Payment for Non-Designated BuyCo ARC Customers begins"
  },
  {
    "date": "2007-04-06",
    "event": "Payment for Non-Designated BuyCo ARC Customers is due"
  },
  {
    "date": "2017-07-01",
    "event": "Service Fee rate for Combined Products for Designated BuyCo ARC Customers begins"
  },
  {
    "

In [25]:
compliance_prompt = """
Given the following text extracted from a legal document, identify the parties involved and their specific obligations. Return the result as a JSON list of objects, where each object has a "party" (the name of the entity) and "obligations" (a list of obligation strings). Follow these rules:
- For each named party, include a maximum of 2 specific obligations.
- If there are obligations that apply to both parties, include an entry with "party": "Both" and list up to 2 obligations that both parties must fulfill.
- Focus on explicit obligations or responsibilities assigned to parties in the text.

Text:
{full_text}

Output format:
[
    {{"party": "Party Name", "obligations": ["obligation 1", "obligation 2"]}},
    {{"party": "Both", "obligations": ["shared obligation 1", "shared obligation 2"]}},
    ...
]
"""
formatted_compliance_prompt = compliance_prompt.format(full_text=full_text)
compliance_response = llm.generate([formatted_compliance_prompt])  # Returns LLMResult
# Extract the text from the first generation
compliance_text = compliance_response.generations[0][0].text
try:
    compliance_json = json.loads(compliance_text)  # Parse the text as JSON
    print("Compliance Obligatory List:", json.dumps(compliance_json, indent=2))
except json.JSONDecodeError as e:
    print(f"Error decoding compliance JSON: {e}")
    print("Raw response:", compliance_text)


Compliance Obligatory List: [
  {
    "party": "Tally Inc",
    "obligations": [
      "Market, sell, or otherwise distribute Products, or appoint its sales representatives or channel partners to do so, with respect to Designated BuyCo ARC Customers and Non-Designated BuyCo ARC Customers subject to payment of the applicable Service Fee to BuyCo and other amounts due under Section 4.",
      "Pay a Service Fee to BuyCo equal to the product of Tally's direct selling price to customer or Tally Channel Partners' actual distributor cost for the BuyCo ARC Customers, multiplied by a Service Fee rate of 4.3% for SCO Product and a figure specified in Section 5 for Combined Product, for the period beginning from July 1, 2006 through September 30, 2017."
    ]
  },
  {
    "party": "BuyCo Limited",
    "obligations": [
      "Allow Tally to market, sell, or otherwise distribute Products, or appoint its sales representatives or channel partners to do so, with respect to Designated BuyCo ARC Custom

In [26]:
compliance_score_prompt = """
Given the following text extracted from a legal document, calculate the Compliance Obligatory Score, a custom metric to gauge the number of obligations in the agreement and their importance in keeping the agreement compliant and in force, on a scale from 1 to 100. Assess the obligations and their significance based on the text. Return only a JSON object with a "score" field, and do not include any additional text, explanations, or comments outside the JSON.

Text:
{full_text}

Output format:
{{
    "score": <integer between 1 and 100>
}}
"""
formatted_compliance_score_prompt = compliance_score_prompt.format(full_text=full_text)
compliance_score_response = llm.generate([formatted_compliance_score_prompt])
compliance_score_text = compliance_score_response.generations[0][0].text
try:
    compliance_score_json = json.loads(compliance_score_text)
    print("Compliance Obligatory Score:", json.dumps(compliance_score_json, indent=2))
except json.JSONDecodeError as e:
    print(f"Error decoding compliance score JSON: {e}")
    print("Raw response:", compliance_score_text)

Compliance Obligatory Score: {
  "score": 85
}
