# Anthropic Claude LLMs Invocation Guide

This notebook walks you through how to call Anthropic's Claude LLMs through [Amazon Bedrock](https://aws.amazon.com/bedrock/). It includes the following examples:
- Basic invocation
- Invocation in streaming fashion
- Invocation with printing of invocation and latency details
- Invocation with JSON output
- Invocation with batch processing and management

This notebook requires Claude v3 Sonnet to be enabled in Bedrock via Model Access.

In [1]:
import json
import pandas as pd
import re
import copy
from collections import Counter

import boto3
from typing import List
from langchain.llms.bedrock import Bedrock
from botocore.config import Config
import time
import os

In [2]:
# Build the Amazon Bedrock runtime client. The botocore Config pins the
# region to us-east-1, selects the 'v4' signature version, and uses the
# 'standard' retry mode with max_attempts set to 3.
my_config = Config(
    region_name="us-east-1",
    signature_version="v4",
    retries=dict(max_attempts=3, mode="standard"),
)

client = boto3.client("bedrock-runtime", config=my_config)

## Claude invocation functions

In [3]:
def invoke_claude_base(client,
                       messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}],
                       system = "You are an assistant.",
                       model_id="anthropic.claude-3-sonnet-20240229-v1:0",
                       max_tokens=1024,
                       temperature = 1.0,
                       top_k = None,
                       top_p = None,
                       stop_sequences=["Human:"],
                       use_streaming = False,
                       anthropic_version = "bedrock-2023-05-31",
                       print_details = True):
    """
    Invoke an Anthropic Claude model through the Bedrock runtime client,
    print the generated text, and return it together with token counts and
    latency figures.

    :param client: Bedrock runtime client; must provide ``invoke_model`` and
        ``invoke_model_with_response_stream``.
    :param messages: List of message dicts sent as the ``messages`` field of
        the request body.
    :param system: System prompt; omitted from the request when ``None``.
    :param model_id: Passed as ``modelId`` to the client call.
    :param max_tokens: Sent as ``max_tokens`` in the request body.
    :param temperature: Sent as ``temperature`` in the request body.
    :param top_k: Sent as ``top_k``; omitted from the request when ``None``.
    :param top_p: Sent as ``top_p``; omitted from the request when ``None``.
    :param stop_sequences: Sent as ``stop_sequences``; omitted when ``None``.
    :param use_streaming: When true, call
        ``invoke_model_with_response_stream`` and print text as it arrives;
        otherwise call ``invoke_model`` and print the full text at once.
    :param anthropic_version: Sent as ``anthropic_version`` in the body.
    :param print_details: When true, also print latency and token counts.
    :return: Dict with keys ``response_text``, ``input_tokens``,
        ``output_tokens``, ``latency_start`` and ``latency_end``. Token
        counts are ``None`` when the response does not report them (for
        example a stream that ends without a ``message_stop`` event).
        Latencies are in seconds: for streaming they come from the
        ``amazon-bedrock-invocationMetrics`` values divided by 1000
        (presumably milliseconds), falling back to locally measured
        wall-clock times when the metrics are absent; for non-streaming both
        latencies equal the measured duration of the ``invoke_model`` call.
    """
    # NOTE: the list-valued defaults in the signature are only read (they are
    # serialized into the request), never mutated, so sharing them is safe.
    body = {
        "anthropic_version": anthropic_version,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": messages,
    }

    # Optional fields are only sent when the caller supplied a value.
    optional_fields = {
        "system": system,
        "top_k": top_k,
        "top_p": top_p,
        "stop_sequences": stop_sequences,
    }
    body.update({key: value for key, value in optional_fields.items() if value is not None})

    # Defined up front so every path below can build the result dict, even if
    # the response carries no usage/metrics information.
    input_tokens = None
    output_tokens = None
    latency_start = None
    latency_end = None

    time0 = time.time()
    if use_streaming:
        response = client.invoke_model_with_response_stream(
            modelId=model_id,
            body=json.dumps(body),
        )
        text_parts = []
        first_chunk_elapsed = None
        for event in response.get("body") or ():
            chunk = event.get("chunk")
            if not chunk:
                continue
            if first_chunk_elapsed is None:
                # First chunk: remember the locally measured time-to-first-chunk
                # and emit the header exactly once.
                first_chunk_elapsed = time.time() - time0
                print("Response(s):")
            chunk_obj = json.loads(chunk.get("bytes").decode())
            chunk_type = chunk_obj.get("type")
            if chunk_type == "content_block_delta":
                # Deltas without a "text" field contribute nothing.
                text = chunk_obj.get("delta", {}).get("text", "")
                print(text, end="")
                text_parts.append(text)
            elif chunk_type == "message_stop":
                metrics = chunk_obj.get("amazon-bedrock-invocationMetrics", {})
                input_tokens = metrics.get("inputTokenCount")
                output_tokens = metrics.get("outputTokenCount")
                if "firstByteLatency" in metrics:
                    latency_start = metrics["firstByteLatency"] / 1000
                if "invocationLatency" in metrics:
                    latency_end = metrics["invocationLatency"] / 1000
        end_time = time.time() - time0
        # Fall back to wall-clock measurements when no metrics were reported.
        if latency_start is None:
            latency_start = first_chunk_elapsed if first_chunk_elapsed is not None else end_time
        if latency_end is None:
            latency_end = end_time
        output_text = "".join(text_parts)
        print("\n")
    else:
        response = client.invoke_model(
            modelId=model_id,
            body=json.dumps(body),
        )
        end_time = time.time() - time0
        # Without streaming there is no separate first-byte measurement.
        latency_start = end_time
        latency_end = end_time

        result = json.loads(response.get("body").read())
        usage = result.get("usage", {})
        input_tokens = usage.get("input_tokens")
        output_tokens = usage.get("output_tokens")
        # Only content blocks that carry text contribute to the output.
        output_text = "\n".join(
            block["text"] for block in result.get("content", []) if "text" in block
        )
        print("Response(s):")
        print(output_text)

    if print_details:
        print("Latency details:")
        print(f"- The streaming start latency is {latency_start} seconds.")
        print(f"- The full invocation latency is {latency_end} seconds.")

        print("Invocation details:")
        print(f"- The input length is {input_tokens} tokens.")
        print(f"- The output length is {output_tokens} tokens.")

    output_obj = {
        "response_text": output_text,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "latency_start": latency_start,
        "latency_end": latency_end,
    }

    return output_obj


def invoke_claude_with_text(client, prompt,
                       model_id="anthropic.claude-3-sonnet-20240229-v1:0", 
                       max_tokens=1024, 
                       temperature = 1.0, 
                       top_k = None, 
                       top_p = None,
                       stop_sequences=["Human:"],
                       use_streaming = False,
                       anthropic_version = "bedrock-2023-05-31",
                       print_details = True):
    """
    Convenience wrapper around ``invoke_claude_base`` for a plain-text prompt.

    The prompt is wrapped as a single user message with one text content
    block, no system prompt is sent, and every other argument is forwarded
    unchanged.

    :param client: Client object forwarded to ``invoke_claude_base``.
    :param prompt: Text placed in the single user message.
    :return: Whatever ``invoke_claude_base`` returns for this request.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }

    return invoke_claude_base(
        client,
        messages=[user_turn],
        system=None,
        model_id=model_id,
        max_tokens=max_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        stop_sequences=stop_sequences,
        use_streaming=use_streaming,
        anthropic_version=anthropic_version,
        print_details=print_details,
    )

## Invocation Example - Basic

In [4]:
# Alternative model IDs (kept for reference):
#   "anthropic.claude-instant-v1"
#   "anthropic.claude-v2:1"
#   "anthropic.claude-3-haiku-20240307-v1:0"
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

prompt = "What is Large Langue Model?"

# Wrap the prompt as a single user message with one text content block.
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
]

# Basic call: no streaming, no system prompt, no latency/token printout.
output_obj = invoke_claude_base(
    client,
    messages,
    system=None,
    model_id=model_id,
    max_tokens=1024,
    temperature=0.0,
    top_k=None,
    top_p=None,
    stop_sequences=["Human:"],
    use_streaming=False,
    anthropic_version="bedrock-2023-05-31",
    print_details=False,
)

output_obj

Response(s):
A large language model (LLM) is a type of artificial intelligence system that is trained on vast amounts of text data to understand and generate human-like language. These models use deep learning techniques, specifically transformer-based neural networks, to learn patterns and relationships in the training data, allowing them to perform a wide range of natural language processing tasks.

Some key characteristics of large language models include:

1. Massive scale: LLMs are trained on enormous datasets, often comprising billions or even trillions of words from various sources such as websites, books, and articles. This massive scale allows them to capture a broad understanding of language and knowledge.

2. Generative capabilities: LLMs can generate human-like text, including coherent paragraphs, stories, articles, and even code, based on the input or prompt provided.

3. Versatility: LLMs can be adapted to various natural language tasks, such as text summarization, questi

{'response_text': 'A large language model (LLM) is a type of artificial intelligence system that is trained on vast amounts of text data to understand and generate human-like language. These models use deep learning techniques, specifically transformer-based neural networks, to learn patterns and relationships in the training data, allowing them to perform a wide range of natural language processing tasks.\n\nSome key characteristics of large language models include:\n\n1. Massive scale: LLMs are trained on enormous datasets, often comprising billions or even trillions of words from various sources such as websites, books, and articles. This massive scale allows them to capture a broad understanding of language and knowledge.\n\n2. Generative capabilities: LLMs can generate human-like text, including coherent paragraphs, stories, articles, and even code, based on the input or prompt provided.\n\n3. Versatility: LLMs can be adapted to various natural language tasks, such as text summari

## Invocation Example - Streaming

In [5]:
# Alternative model IDs (kept for reference):
#   "anthropic.claude-instant-v1"
#   "anthropic.claude-v2:1"
#   "anthropic.claude-3-haiku-20240307-v1:0"
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

prompt = "What is Large Langue Model?"

# Wrap the prompt as a single user message with one text content block.
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
]

# Streaming call: text is printed as chunks arrive; details are not printed.
output_obj = invoke_claude_base(
    client,
    messages,
    system=None,
    model_id=model_id,
    max_tokens=1024,
    temperature=0.0,
    top_k=None,
    top_p=None,
    stop_sequences=["Human:"],
    use_streaming=True,
    anthropic_version="bedrock-2023-05-31",
    print_details=False,
)

output_obj

Response(s):
A large language model (LLM) is a type of artificial intelligence system that is trained on vast amounts of text data to understand and generate human-like language. These models use deep learning techniques, specifically transformer-based neural networks, to learn patterns and relationships in the training data, allowing them to perform a wide range of natural language processing tasks.

Some key characteristics of large language models include:

1. Massive scale: LLMs are trained on enormous datasets, often comprising billions or even trillions of words from various sources such as websites, books, and articles. This massive scale allows them to capture a broad understanding of language and knowledge.

2. Generative capabilities: LLMs can generate human-like text, including coherent paragraphs, stories, articles, and even code, based on the input or prompt provided.

3. Versatility: LLMs can be adapted to various natural language tasks, such as text summarization, questi

{'response_text': 'A large language model (LLM) is a type of artificial intelligence system that is trained on vast amounts of text data to understand and generate human-like language. These models use deep learning techniques, specifically transformer-based neural networks, to learn patterns and relationships in the training data, allowing them to perform a wide range of natural language processing tasks.\n\nSome key characteristics of large language models include:\n\n1. Massive scale: LLMs are trained on enormous datasets, often comprising billions or even trillions of words from various sources such as websites, books, and articles. This massive scale allows them to capture a broad understanding of language and knowledge.\n\n2. Generative capabilities: LLMs can generate human-like text, including coherent paragraphs, stories, articles, and even code, based on the input or prompt provided.\n\n3. Versatility: LLMs can be adapted to various natural language tasks, such as text summari

## Invocation Example - Streaming with printing of invocation and latency details

In [6]:
# Alternative model IDs (kept for reference):
#   "anthropic.claude-instant-v1"
#   "anthropic.claude-v2:1"
#   "anthropic.claude-3-haiku-20240307-v1:0"
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

prompt = "What is Large Langue Model?"

# Wrap the prompt as a single user message with one text content block.
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
]

# Streaming call that also prints the latency and token-count summary.
output_obj = invoke_claude_base(
    client,
    messages,
    system=None,
    model_id=model_id,
    max_tokens=1024,
    temperature=0.0,
    top_k=None,
    top_p=None,
    stop_sequences=["Human:"],
    use_streaming=True,
    anthropic_version="bedrock-2023-05-31",
    print_details=True,
)

output_obj

Response(s):
A large language model (LLM) is a type of artificial intelligence system that is trained on vast amounts of text data to understand and generate human-like language. These models use deep learning techniques, specifically transformer-based neural networks, to learn patterns and relationships in the training data, allowing them to perform a wide range of natural language processing tasks.

Some key characteristics of large language models include:

1. Massive scale: LLMs are trained on enormous datasets, often comprising billions or even trillions of words from various sources such as websites, books, and articles. This massive scale allows them to capture a broad understanding of language and knowledge.

2. Generative capabilities: LLMs can generate human-like text, including coherent paragraphs, stories, articles, and even code, based on the input or prompt provided.

3. Versatility: LLMs can be adapted to various natural language tasks, such as text generation, translati

{'response_text': 'A large language model (LLM) is a type of artificial intelligence system that is trained on vast amounts of text data to understand and generate human-like language. These models use deep learning techniques, specifically transformer-based neural networks, to learn patterns and relationships in the training data, allowing them to perform a wide range of natural language processing tasks.\n\nSome key characteristics of large language models include:\n\n1. Massive scale: LLMs are trained on enormous datasets, often comprising billions or even trillions of words from various sources such as websites, books, and articles. This massive scale allows them to capture a broad understanding of language and knowledge.\n\n2. Generative capabilities: LLMs can generate human-like text, including coherent paragraphs, stories, articles, and even code, based on the input or prompt provided.\n\n3. Versatility: LLMs can be adapted to various natural language tasks, such as text generat

## Invocation Example - JSON output

In [19]:
# Prompt template for the JSON-output example. It asks the model to rephrase
# a financial question and to return a JSON object with the fields
# 'time_keywords', 'technical_keywords', 'company_keywords' and
# 'rephrased_question'. The only placeholder is {query}, filled in with
# str.format(query=...). The "Human:" / "Assistant:" markers are part of the
# prompt text itself.
PROMPT_QUERY_META_GENERATION = """
        \n\nHuman:
                Financial question related to yearly and Quarterly financial Reports: {query} \n
                Generate the keywords and rephrase the question to make it very clear, think step by step:

                1. Expand any acronyms and abbreviations in the original question by providing the full term. Include both the original abbreviated version and the expanded version in the rephrased question.

                2. If there is no time keywords mentioned in the original question, do not include time keywords in the rephrased question either.

                3. Generate a list of company names that are mentioned in the question.

                4. Generate a comprehensive list of all technical keywords and key phrases that are relevant to answering the question.

                5. Pay close attention to any time spans requested in the original question, such as specific years, quarters, or months.

                6. Generate a list of time_keywords using a 'Quarter Year' format (e.g. Q1'22). Include only the time keywords related to the question. If there is no time-related keywords mentioned in the original question, please leave this time_keywords as an empty list. Do not include the most recent (last) quarter in the time_keywords if it's not needed for answering the question.

                7. Return a JSON object with the following fields:
                   - 'time_keywords': a list of time-related keywords
                   - 'technical_keywords': a list of technical keywords
                   - 'company_keywords': a list of company names
                   - 'rephrased_question': the full rephrased question string


        \n\nAssistant:
"""

In [21]:
# Alternative model IDs (kept for reference):
#   "anthropic.claude-instant-v1"
#   "anthropic.claude-v2:1"
#   "anthropic.claude-3-haiku-20240307-v1:0"
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

# Fill the keyword-extraction template with the question to analyse.
query = "What was the revenue growth for each of Apple's product segments in 2022?"
prompt = PROMPT_QUERY_META_GENERATION.format(query=query)

# Wrap the prompt as a single user message with one text content block.
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
]

# Streaming call that also prints the latency and token-count summary.
output_obj = invoke_claude_base(
    client,
    messages,
    system=None,
    model_id=model_id,
    max_tokens=1024,
    temperature=0.0,
    top_k=None,
    top_p=None,
    stop_sequences=["Human:"],
    use_streaming=True,
    anthropic_version="bedrock-2023-05-31",
    print_details=True,
)

output_obj

Response(s):
{
  "time_keywords": ["2022"],
  "technical_keywords": ["revenue growth", "product segments"],
  "company_keywords": ["Apple"],
  "rephrased_question": "What was the revenue growth for each of Apple's (Apple Inc.) product segments in the year 2022?"
}

Latency details:
- The streaming start latency is 0.657 seconds.
- The full invocation latency is 3.136 seconds.
Invocation details:
- The input length is 377 tokens.
- The output length is 75 tokens.


{'response_text': '{\n  "time_keywords": ["2022"],\n  "technical_keywords": ["revenue growth", "product segments"],\n  "company_keywords": ["Apple"],\n  "rephrased_question": "What was the revenue growth for each of Apple\'s (Apple Inc.) product segments in the year 2022?"\n}',
 'input_tokens': 377,
 'output_tokens': 75,
 'latency_start': 0.657,
 'latency_end': 3.136}

## Invocation Example - Batch processing
The code for this example is for demonstration only, because the customer-specific data cannot be shared in this codebase.

Load the data

In [None]:
# Relative path of the Excel workbook that holds the samples to process.
data_file = "../data/broadcast_segmentation_ds.xlsx"
# Load the workbook into a DataFrame, using column 0 as the row index.
df_data = pd.read_excel(data_file, index_col=0)

Run the invocation of LLM and save the output results (JSON) for each sample in {output_folder_root}/{run1,run2,run3,...}

Note: Please modify this cell for your own use case

In [None]:
%%time
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

# Inference settings shared by every sample in the batch.
max_tokens = 4096
temperature = 0.5
top_k = None
top_p = None
input_content_field = "trimmed_paragraph"
# NOTE(review): PROMPT_QA_v2 is not defined in the cells shown in this
# notebook; define it (with {keywords_list} and {input_content}
# placeholders) before running this cell.
PROMPT_TEMPLATE = PROMPT_QA_v2
use_streaming = True

output_folder_root = \
"../data/results/may07_claudev3s_trimmed_promptv2_tem05_stream_fullseg"

for run in ['run1', 'run2', 'run3']:
    print(f"##########{run}")
    output_folder = os.path.join(output_folder_root, run)
    # Create the run folder up front; otherwise writing the first result
    # file fails on a fresh run.
    os.makedirs(output_folder, exist_ok=True)

    for idx in range(len(df_data)):
        output_file_name = os.path.join(output_folder, f"result_sample{idx}.json")
        # An existing result file marks the sample as done, which makes the
        # cell safe to re-run after an interruption.
        if os.path.exists(output_file_name):
            print(f"###### Already Finished for Sample {idx}")
            continue

        row = df_data.iloc[idx]
        prompt = PROMPT_TEMPLATE.format(keywords_list=[row["keyword"]],
                                        input_content=row[input_content_field])
        print(f"###### Result for Sample {idx}")
        # The Bedrock client must be the first argument; model_id is passed
        # by keyword so it cannot be mistaken for the prompt.
        output_obj = invoke_claude_with_text(client, prompt,
                                             model_id=model_id,
                                             max_tokens=max_tokens,
                                             temperature=temperature,
                                             top_k=top_k,
                                             top_p=top_p,
                                             use_streaming=use_streaming)

        dict_temp = {
            "sample_id": idx,
            "response": output_obj["response_text"],
            "input_tokens": output_obj["input_tokens"],
            "output_tokens": output_obj["output_tokens"],
            "latency_start": output_obj["latency_start"],
            "latency_end": output_obj["latency_end"],
        }

        with open(output_file_name, 'w') as f:
            json.dump(dict_temp, f)

        # Pause between requests.
        time.sleep(1)

Read the output results for each sample in {output_folder_root}/{run1,run2,run3,...}, and aggregate them into a dataframe 

In [None]:
output_folder_root = \
"../data/results/may07_claudev3s_trimmed_promptv2_tem05_stream_fullseg"

# Keep only sub-directories whose name contains 'run'. os.listdir returns
# entries in arbitrary order, so sort them to make the aggregation
# reproducible; stray files would otherwise break the inner listing.
runs = sorted(
    entry for entry in os.listdir(output_folder_root)
    if 'run' in entry and os.path.isdir(os.path.join(output_folder_root, entry))
)

list_df = []
for run in runs:
    output_folder = os.path.join(output_folder_root, run)
    # Only files ending in '.json' are result files.
    list_result_files = sorted(
        name for name in os.listdir(output_folder) if name.endswith('.json')
    )

    list_dict = []
    for result_file in list_result_files:
        with open(os.path.join(output_folder, result_file), 'r') as f:
            list_dict.append(json.load(f))

    # An empty run has no 'sample_id' column to sort on, so skip it.
    if not list_dict:
        print(f"###### No result files found for {run}; skipping")
        continue

    df_results_temp = pd.DataFrame(list_dict)
    # reset_index() keeps the pre-sort position in an 'index' column.
    df_results_temp = df_results_temp.sort_values('sample_id').reset_index()
    df_results_temp['run'] = run
    list_df.append(df_results_temp)

# pd.concat raises on an empty list; fall back to an empty frame instead.
df_results_all = pd.concat(list_df) if list_df else pd.DataFrame()