<a href="https://colab.research.google.com/github/ashater/creditreviews/blob/main/LLM_document_selector_with_structured_output.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

reference notebook:
https://github.com/anthropics/courses/blob/master/ToolUse/03_structured_outputs.ipynb



## Install and Import

In [None]:
# ! pip install langchain
# ! pip install langchain-anthropic

# ! pip install pymupdf

In [None]:
import anthropic
from langchain_anthropic import ChatAnthropic

from langchain.prompts import ChatPromptTemplate
from IPython.display import display, Markdown

import fitz
from google.colab import userdata

## Set up LLM and Tool to enforce structured output

### Setting up in 1 LLM query - doesn't work for me

In [None]:
# Tool definition used to make Claude answer with a structured
# (session title, page number) pair instead of free text.
#
# NOTE: every "type" must be a valid JSON Schema type name. The page number is
# declared as "integer" (not "int"); an unknown type name makes the API reject
# the request with a 400 "JSON schema is invalid" error.
tool_definition_document_selector = {
    "name": "print_selected_sessions",
    "description": "Output the sessions in a document's table of contents in a structured way",
    "input_schema": {
        "type": "object",
        "properties": {
            "document_session": {
                "type": "string",
                "description": "the title of the selected session."
            },
            "document_page": {
                "type": "integer",
                "description": "the page number of the selected session"
            }
        },
        # Both fields are mandatory so the output can always be used to locate the session.
        "required": ["document_session", "document_page"]
    }
}

In [None]:
# Set up the LLM client.
# The native Anthropic API is used here because LangChain does not seem to
# support multi-variable tools very well.
# The API key is read through the google.colab `userdata` helper.
client = anthropic.Anthropic(api_key = userdata.get('ANTHROPIC_API_KEY'))

def get_response(prompt):
  """Ask Claude to select a session and return the structured tool call.

  Sends `prompt` as a single user message and offers the tool described by
  `tool_definition_document_selector`.

  Args:
    prompt: Text of the user message (the formatted document-selector prompt).

  Returns:
    The `tool_use` content block when the model stopped in order to call the
    tool. Otherwise a string explaining why no tool call was produced,
    including any text the model returned.
  """

  message = client.messages.create(
      model = "claude-3-sonnet-20240229",
      max_tokens = 1000,
      temperature = 0.0,
      tools=[tool_definition_document_selector],
      system = "You are a financial analyst to select relevant sessions \
          in a company's financial statement in order to perform credit risk review.\
          The financial statements can be 10-K, 10-Q, earning call transcripts or others.\
          Use the print_selected_sessions tool to give structured output.",
      messages = [
          {"role": "user", "content": prompt}
      ]
  )

  if message.stop_reason == "tool_use":
    # Scan from the end so the last tool call is returned even if other
    # content blocks are present alongside it.
    for block in reversed(message.content):
      if getattr(block, "type", None) == "tool_use":
        return block
    # Fall back to the last block, as before, if no block is tagged tool_use.
    return message.content[-1]

  # Collect whatever text the model produced; tolerates an empty content list.
  reply_text = "".join(
      block.text for block in message.content
      if getattr(block, "type", None) == "text"
  )

  if message.stop_reason == "end_turn":
    # Previously the model's reply sat in an unreachable second `return`;
    # both pieces of information are now returned together.
    return "Claude didn't want to use a tool. Claude responded with: " + reply_text

  # Any other stop reason used to fall through and return None silently.
  return f"Claude stopped without a tool call (stop_reason={message.stop_reason!r}): {reply_text}"

In [None]:
# Test document: JPMorgan Chase 2023 Form 10-K.
# The file is referenced by bare file name, so it must be in the working directory.
# Source:
# https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/4th-quarter/corp-10k-2023.pdf
pdf_name = 'corp-10k-2023.pdf'

In [None]:
# Read the PDF's table of contents; the `with` block closes the document afterwards.
with fitz.open(pdf_name) as doc:
    toc = doc.get_toc()  # [[lvl, title, page, …], …]

In [None]:
# Prompt template for the single-query approach. Placeholders:
#   {query}     - the topic we want to find in the document
#   {file_name} - name of the PDF file (appears twice in the text)
#   {toc}       - the table of contents list read from the PDF
document_selector_prompt_template_string = """
  We are looking for information on {query} with the file with name: {file_name}.
  Can you tell me what is the one most relevant session to look into?

  You will be given the table of contents of the 10-K filing below that is delimited by triple backticks.
  The table of contents is in the format of a list of lists.
  In the child list, the first number is the level of the bookmark.
  The second number is the title of the session.
  The third number is the page number of the session.

  We are looking for structured output.

  {file_name} Table of Contents: ```{toc}```
"""

# TODO: this can be further modified to take a few files in at a time

In [None]:
# Wrap the raw template string in a LangChain chat prompt template.
document_selector_prompt_template = ChatPromptTemplate.from_template(document_selector_prompt_template_string)

In [None]:
# Fill in the template placeholders; the result is indexed as a list of
# messages further down.
document_selector_query = document_selector_prompt_template.format_messages(
    toc=toc,
    file_name=pdf_name,
    query='financial updates',
)

In [None]:
# Send the text of the first formatted message to Claude via get_response.
response = get_response(document_selector_query[0].content)

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'tools.0.input_schema: JSON schema is invalid - please consult https://json-schema.org or our documentation at https://docs.anthropic.com/en/docs/tool-use'}}

### Set up 2 LLMs, one to select sections, the other to convert to machine readable format - doesn't work for me either.

In [None]:
# Anthropic API client for the two-step approach; the API key is read through
# the google.colab `userdata` helper.
client = anthropic.Anthropic(api_key = userdata.get('ANTHROPIC_API_KEY'))

In [None]:

# Test document: JPMorgan Chase 2023 Form 10-K, referenced by bare file name. Source:
# https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/4th-quarter/corp-10k-2023.pdf
pdf_name = 'corp-10k-2023.pdf'

In [None]:
# Read the PDF's table of contents; the `with` block closes the document afterwards.
with fitz.open(pdf_name) as doc:
    toc = doc.get_toc()  # [[lvl, title, page, …], …]

First LLM to select sessions

In [None]:
# Prompt for the first LLM: asks for the top 5 most relevant sessions, returned
# in the same list-of-lists format as the table of contents.
# Placeholders: {query} (topic), {file_name} (PDF name), {toc} (table of contents).
document_selector_prompt_template_string = """
  We are looking for information on {query} within the file {file_name}.
  Can you tell me what are the most relevant sessions to look into?

  The file is a financial statement.
  You can tell the type of financial statement based on the file name. i.e. 10-K, 10-Q, etc.

  You will be given the table of contents of the file below that is delimited by triple backticks.
  The table of contents is in the format of a list of lists.
  In the child list, the first number is the level of the bookmark.
  The second number is the title of the session.
  The third number is the page number of the session.

  We are looking for output in the same format as the table of contents.
  Please rank the sessions in descending order of relevance, and only return the top 5 sessions.
  Please only return the list of sessions, without any other verbage.

  {file_name} Table of Contents: ```{toc}```
"""

# Wrap the template string in a LangChain chat prompt template.
document_selector_prompt_template = ChatPromptTemplate.from_template(document_selector_prompt_template_string)

# Fill in the placeholders; the first element's `.content` is what gets sent to the model.
document_selector_query = document_selector_prompt_template.format_messages(
                            query='financial updates',
                            file_name=pdf_name,
                            toc=toc)


In [None]:
# First LLM call: no tools are passed, so the selection comes back as text.
# The backslash continuations inside the system string keep each following
# line's leading whitespace as part of the prompt.
response = client.messages.create(
    model = "claude-3-sonnet-20240229",
    max_tokens = 4096,
    temperature = 0.0,
    system = "You are a financial analyst to select relevant sessions \
        in a company's financial statement in order to perform credit risk review.\
        The financial statements can be 10-K, 10-Q, earning call transcripts or others.\
        When asked, please provide structured output as much as you can.",
    messages = [
        {"role": "user", "content": document_selector_query[0].content}
    ]
)

In [None]:
# Show the text of the first content block returned by the model.
print(response.content[0].text)

```
[[2, 'Item 7. Management's Discussion and Analysis of Financial Condition and Results of Operations.', 37],
 [2, 'Item 8. Financial Statements and Supplementary Data.', 38],
 [3, 'Consolidated statements of income', 168],
 [3, 'Consolidated statements of comprehensive income', 169],
 [3, 'Consolidated balance sheets', 170]]
```


Second LLM to convert to JSON

In [None]:
# Tool definition used by the second LLM call to turn the selected sessions
# into a JSON object: {"sessions": [{"session_name": ..., "page_number": ...}, ...]}.
#
# NOTE: "type" values must be valid JSON Schema type names, so the page number
# is "integer" (not "int"); an unknown type name makes the API reject the
# request with a 400 "JSON schema is invalid" error. `required` lists only
# properties that are actually declared for each item.
tool_definition_selection_converter = {
    "name": "print_selected_sessions",
    "description": "Output the sessions in a document's table of contents in a structured way",
    "input_schema": {
        "type": "object",
        "properties": {
            "sessions": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "session_name": {
                            "type": "string",
                            "description": "the title of the session."
                        },
                        "page_number": {
                            "type": "integer",
                            "description": "the page number of session"
                        }
                    },
                    # "context" was listed here before but is not a declared property.
                    "required": ["session_name", "page_number"]
                }
            }
        },
        "required": ["sessions"]
    }
}

In [None]:
# Prompt for the second LLM: convert the first model's list-of-lists answer
# into JSON through the print_selected_sessions tool.
# NOTE(review): the text refers to content "delimited by triple backticks", but
# the template does not wrap {selection} in backticks itself. It presumably
# relies on the first model's reply already being fenced - confirm.
selection_converter_prompt_template_string = """
  Use the print_selected_sessions tool to convert the content that is delimited by triple backticks into a JSON object.

  The content is in the format of a list of lists.
  In the child list, the first number is the level of the bookmark.
  The second number is the title of the session.
  The third number is the page number of the session.

   {selection}
"""

# Wrap the template string in a LangChain chat prompt template.
selection_converter_prompt_template = ChatPromptTemplate.from_template(selection_converter_prompt_template_string)

# Insert the first model's text reply as the content to convert.
selection_converter_query = selection_converter_prompt_template.format_messages(
                                selection=response.content[0].text)

In [None]:
# Second LLM call: `tool_choice` names print_selected_sessions explicitly so
# the model is directed to answer through that tool.
converter_response = client.messages.create(
    model="claude-3-sonnet-20240229",
    system="Use print_selected_sessions tool to convert.",
    messages=[{"role": "user", "content": selection_converter_query[0].content}],
    tools=[tool_definition_selection_converter],
    tool_choice={"type": "tool", "name": "print_selected_sessions"},
    max_tokens=4096,
    temperature=0.0,
)

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'tools.0.input_schema: JSON schema is invalid - please consult https://json-schema.org or our documentation at https://docs.anthropic.com/en/docs/tool-use'}}