In [1]:
# !pip install --force-reinstall -U -r requirements.txt --quiet

In [4]:
%%writefile agent2_docclassification_agent.py

from strands import Agent, tool
from strands_tools import calculator # Import the calculator tool
import argparse
import json
from bedrock_agentcore.runtime import BedrockAgentCoreApp
from strands.models import BedrockModel
################
import boto3, json
import time
import os
from logging import exception
from botocore.exceptions import ClientError
# import trp
# from trp import Document
from typing import Dict

# Import date class from datetime module
# from datetime import date
# from datetime import datetime
import datetime
import decimal

from typing import Optional
import traceback
import botocore
from botocore.config import Config
##################################

# textract = boto3.client('textract', region_name='us-east-1')
# print("textract obj = ", textract)
# Startup marker: printed once when this module is imported or run.
print("in agent 2 beginning...")

# AgentCore runtime application object. In the visible part of this file the
# @app.entrypoint handler is commented out, so nothing is registered on it.
app = BedrockAgentCoreApp()

# Shared DynamoDB resource used by get_docs_extract() and
# upsert_dashboard_record(). No explicit region or credentials are passed
# here, so boto3's default configuration applies.
dynamodb_resource = boto3.resource("dynamodb")

def get_docs_extract(docid):
    """Look up one document's extraction record in DynamoDB.

    Args:
        docid: Value of the ``docid`` key to fetch from the
            ``hcltech-doc-extraction`` table.

    Returns:
        The raw ``get_item`` response from the table resource. The caller in
        this file checks for an ``'Item'`` key to detect a missing document.
    """
    print('in get_docs_extract begin', docid)

    extraction_table = dynamodb_resource.Table("hcltech-doc-extraction")
    lookup_key = {'docid': docid}
    return extraction_table.get_item(Key=lookup_key)


def get_prompt_ready(raw_text, tabletext, key_value_pair_data):
    """Assemble the document-classification prompt.

    The three inputs are embedded verbatim inside ``<raw_text>``,
    ``<key_value_pair_data>`` and ``<table_text>`` tags (in that order),
    followed by the classification instructions and a trailing
    ``Assistant:`` marker.

    Args:
        raw_text: Raw text of the document.
        tabletext: Tabular data of the document, already rendered as a string.
        key_value_pair_data: Key/value pairs of the document, already
            rendered as a string.

    Returns:
        The complete prompt string.
    """
    # The literals below are runtime prompt text: wording and whitespace are
    # reproduced exactly, only the way they are stitched together differs.
    opening = """Human: You are an expert in understand and analyzing the Worker Compensation Industry Documents. 
    
    You are provided with all the details within the <raw_text>,  <key_value_pair_data> and <table_text> xml tags.
   
    <raw_text>"""
    raw_to_key_values = """</raw_text> , 
    <key_value_pair_data>"""
    key_values_to_table = """</key_value_pair_data> , 
    <table_text>"""
    instructions = """</table_text> 

    The raw text of all the details is within <raw_text> xml tag.
    The key value pair of all the details are mentioned as a list within <key_value_pair_data> xml tag.
    The tabular data of the all the details are mentioned as a json array with objects within <table_text> xml tag.     
    
    Your job is to analyze the document content and identify the Classification Type of the document from one of these types ['MedicalReport', 'ClaimForm', 'DoctorReportMMI', 'PhysicalTherapy', 'Prescription', 'CMS1500', 'Legal', 'Invoice'].
    Select the classification type only from the above given types and if you are not able to identify the document type then please reply the Classification Type as Unidentified Type.

    {
        classification_type : ""
    }
    
    Generate only a perfect JSON till end.
    Do not provide any supporting or explanation text beyond generating the perfect JSON.
    Skip any preamble text and generate the final JSON ONLY. 
    
    """

    segments = [
        opening,
        raw_text,
        raw_to_key_values,
        key_value_pair_data,
        key_values_to_table,
        tabletext,
        instructions,
        "\n",
        "Assistant:",
    ]
    return "".join(segments)

def execute_model(prompt, bedrock, model_id="anthropic.claude-3-haiku-20240307-v1:0", max_tokens=5000):
    """Send ``prompt`` to a Claude model through Bedrock and return its text reply.

    Args:
        prompt: Full prompt text, sent as a single user message.
        bedrock: Client object exposing
            ``invoke_model(modelId=..., body=...)`` (a boto3
            ``bedrock-runtime`` client in this file).
        model_id: Bedrock model identifier. Defaults to the Claude 3 Haiku
            model this function originally hard-coded.
        max_tokens: Value sent as ``max_tokens`` in the request body.
            NOTE(review): the default of 5000 is kept from the original code;
            confirm it is within the chosen model's output limit.

    Returns:
        The ``text`` of the first content block that carries one. An empty
        string is returned when the response has no text block or when any
        exception occurs, so callers always receive a ``str``.
    """
    try:
        print("inside execute_model() method. Please wait Large Language Model (Haiku) is preparing your JSON....")

        request_body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            }]
        })

        start_time = time.time()
        response = bedrock.invoke_model(modelId=model_id, body=request_body)
        end_time = time.time()
        print(f"Time taken by execute_model() {end_time - start_time} sec")

        result = json.loads(response.get("body").read())
        for output in result.get("content", []):
            if "text" in output:
                return output["text"]

        # No text block in the reply: return the same sentinel as the error
        # path instead of falling through and returning None.
        print("execute_model(): model response contained no text content")
        return ""

    except Exception as error:
        # Best-effort by design: log the failure and let the caller deal with
        # the empty result.
        print(f"Exception in execute_model(): {error}")
        print('Exception Details:', traceback.format_exc())
        return ""

def upsert_dashboard_record(tablename, docid, **kwargs):
    """
    Update existing record or insert new record in specified DynamoDB table

    Every keyword argument becomes one ``SET`` assignment on the item whose
    ``docid`` key equals ``docid``.

    Args:
        tablename (str): Name of the DynamoDB table
        docid (str): Document ID (primary key)
        **kwargs: Additional fields to update/insert (at least one required)

    Returns:
        dict: Response from DynamoDB operation (requested with
        ``ReturnValues='ALL_NEW'``)

    Raises:
        ValueError: If no fields are supplied; the update expression would
            otherwise be the invalid string ``"SET"``.
        ClientError: Re-raised after logging when the update call fails.
    """
    if not kwargs:
        raise ValueError("upsert_dashboard_record requires at least one field to set")

    table = dynamodb_resource.Table(tablename)

    # Attribute names go through '#name' placeholders so that a field whose
    # name is a DynamoDB reserved word does not break the expression.
    assignments = []
    expression_attribute_names = {}
    expression_attribute_values = {}
    for key, value in kwargs.items():
        assignments.append(f"#{key} = :{key}")
        expression_attribute_names[f"#{key}"] = key
        expression_attribute_values[f":{key}"] = value

    update_expression = "SET " + ", ".join(assignments)
    print("update_expression =", update_expression)

    try:
        response = table.update_item(
            Key={'docid': docid},
            UpdateExpression=update_expression,
            ExpressionAttributeNames=expression_attribute_names,
            ExpressionAttributeValues=expression_attribute_values,
            ReturnValues='ALL_NEW'
        )
        print("response =", response)
        return response
    except ClientError as e:
        print(f"Error updating record: {e}")
        raise


# Create a custom tool
@tool
def get_classification(docid):
    """Classify an already-extracted document and record the result.

    Loads the document's extracted text from the ``hcltech-doc-extraction``
    table, translates the raw text to English when the detected dominant
    language is not English, asks the LLM for a classification type, and
    writes that type to the ``hcltech-doc-extraction`` and
    ``hcltech-dashboard`` tables.

    Args:
        docid: Document ID (the ``docid`` key in DynamoDB), e.g. ``DOC417927``.

    Returns:
        The classification type string on success. On failure, a dict with
        ``statusCode`` (404 when no record exists for ``docid``, 500 for any
        other error) and a JSON-encoded ``body`` holding an ``error`` message.
    """
    try:
        print("inside get_classification  docid - ", docid)

        # Initialize Bedrock client
        bedrock = boto3.client('bedrock-runtime')
        print("bedrock - ", bedrock)

        # Get document extract details
        docs_extract_details = get_docs_extract(docid)
        print("docs_extract_details - ", docs_extract_details)

        if 'Item' not in docs_extract_details:
            return {
                'statusCode': 404,
                'body': json.dumps({'error': 'Document not found for docid'})
            }

        # Extract document data
        item = docs_extract_details["Item"]
        rawtext = item["rawtext"]
        tbltxt = item["tbltxt"]
        keyvaluesText = item["keyvaluesText"]

        # Detect the document language.
        # NOTE(review): Comprehend and Translate both cap the size of the text
        # they accept; very long documents may need truncation or chunking
        # here - confirm against the service limits.
        comprehend_client = boto3.client('comprehend')
        language_response = comprehend_client.detect_dominant_language(Text=rawtext)
        detected_language = language_response['Languages'][0]['LanguageCode']
        print("detected language = ", detected_language)

        if detected_language == 'en':
            print("detected lang is english")
            prompt = get_prompt_ready(str(rawtext), str(tbltxt), str(keyvaluesText))
        else:
            # Translate to English and persist the translation with the record.
            translate_client = boto3.client('translate')
            translation_response = translate_client.translate_text(
                Text=rawtext,
                SourceLanguageCode=detected_language,
                TargetLanguageCode='en'
            )
            translated_text = translation_response['TranslatedText']
            resTanslate = upsert_dashboard_record('hcltech-doc-extraction', docid=docid, translated_text=translated_text)
            print ("resTanslate = ", resTanslate)
            print("translated_text = ", translated_text)
            # Only the raw text is translated, so the table and key/value
            # sections of the prompt are left empty for non-English documents.
            prompt = get_prompt_ready(str(translated_text), "", "")

        print("prompt = ", prompt )
        classification_result = execute_model(prompt, bedrock)

        # Parse classification result; a non-JSON reply raises here and is
        # reported through the 500 path below.
        llm_extracted_json = json.loads(classification_result)
        classificationtype = llm_extracted_json['classification_type']
        print("classificationtype = ", classificationtype)

        # Update Document Extraction table with classification type
        upsert_dashboard_record('hcltech-doc-extraction', docid=docid, classification=classificationtype)

        # Update Dashboard table with classification status
        upsert_dashboard_record('hcltech-dashboard', docid=docid, classification_status="Completed", classification=classificationtype)

        return classificationtype

    except Exception as e:
        # Keep the dict-shaped error result, but log the failure so it is not
        # lost (same logging style as execute_model()).
        print(f"Exception in get_classification(): {e}")
        print('Exception Details:', traceback.format_exc())
        return {
            'statusCode': 500,
            'body': json.dumps({'error': str(e)})
        }

########################################

# Model that drives the agent's conversation and tool selection. The
# classification itself happens inside the get_classification tool, which
# calls execute_model() with its own (Haiku) model ID.
model_id = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
model = BedrockModel(
    model_id=model_id,
)
# Agent with get_classification as its only tool.
agent = Agent(
    model=model,
    tools=[get_classification],
    system_prompt="You're a helpful assistant. You can do classification of extracted document content."
)

# # @app.entrypoint
# # def strands_agent_bedrock(payload):
# #     """
# #     Invoke the agent with a payload
# #     """
# #     user_input = payload.get("prompt")
# #     print("User input:", user_input)
# #     response = agent(user_input)
# #     return response.message['content'][0]['text']

# if __name__ == "__main__":
#     print("\@@@@@@@ agent2 doc extract report:")
#     user_input = "Classify the document whose document id is DOC417927"
#     response = agent(user_input)
#     print("response = ", response)
#     # return response.message['content'][0]['text']
#     # app.run()

Overwriting agent2_docclassification_agent.py


In [7]:
!python agent2_docclassification_agent.py 

in agent 2 beginning...
\@@@@@@@ agent2 doc extract report:
I'll help you classify the document with ID DOC417927. Let me retrieve the classification information for you.
Tool #1: get_classification
inside get_classification  docid -  DOC417927
bedrock -  <botocore.client.BedrockRuntime object at 0x7f76d9822c80>
in get_docs_extract begin DOC417927
docs_extract_details -  {'Item': {'classification': 'Legal', 'current_datetime': '2025-11-14T05:56:26.612007+00:00', 'docid': 'DOC417927', 'document_name': 'newmexicomutual/claimforms/IN434221/DOC417927/LegalCaseDocument-WC.pdf', 'indexid': 'IN434221', 'keyvaluesText': '[{\'newmexicomutual/claimforms/IN434221/DOC417927/LegalCaseDocument-WC\': [\'Key: Decided and Entered:, Value: May 26, 2022\', \'Key: Calendar Date:, Value: April 21, 2022\', \'Key: In the Matter of the Claim of, Value: DAVID BONET, Appellant, V\', \'Key: Before:, Value: Garry, P.J., Lynch, Pritzker, Colangelo and McShan, JJ.\', \'Key: State of New York, Value: Supreme Court, 