In [90]:
# Install the AWS SDK packages used below. `%pip` targets the running kernel's
# environment. NOTE(review): versions are unpinned, so results may differ between runs.
%pip install boto3
%pip install botocore

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [91]:
import boto3
from botocore.config import Config
import json
import os

In [92]:
# Credentials are read from the environment; any that are unset come back as
# None and are passed through to boto3 unchanged.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
# Temporary (STS / SSO) credentials are only valid together with their session
# token; leaving it out is a common cause of "The security token included in
# the request is invalid". The value is None when the variable is unset.
AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN")

session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
)

# Retry failed calls up to 3 attempts using botocore's "standard" retry mode.
config = Config(
  retries = {
    'max_attempts': 3,
    'mode': 'standard'
  }
)
bedrock_runtime = session.client("bedrock-runtime", region_name="us-east-1", config=config)
# The previous `bedrock_runtime.get_waiter()` call was removed: `get_waiter`
# requires a waiter name, so calling it with no arguments raised TypeError,
# and nothing else in this notebook used the resulting object.

In [93]:
def get_claude_kwargs(prompt):
    """Build the keyword arguments for a Claude request.

    Args:
        prompt: Text placed in the single user message of the request.

    Returns:
        A dict with the keys ``modelId``, ``contentType``, ``accept`` and
        ``body``; ``body`` is the JSON-encoded request payload.
    """
    user_message = {"role": "user", "content": [{"type": "text", "text": prompt}]}
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 5000,
        "messages": [user_message],
    }
    return {
        "modelId": "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "contentType": "application/json",
        "accept": "application/json",
        "body": json.dumps(request_body),
    }

In [94]:
def get_agent_response(prompt):
    """Send ``prompt`` to the model and return the text of its first content item.

    Args:
        prompt: Text forwarded to ``get_claude_kwargs`` to build the request.

    Returns:
        The ``"text"`` field of the first entry in the response's ``"content"`` list.
    """
    response = bedrock_runtime.invoke_model(**get_claude_kwargs(prompt))
    # The response body is a stream-like object; read it fully before decoding.
    raw_body = response.get("body").read()
    payload = json.loads(raw_body)
    first_item = payload["content"][0]
    return first_item["text"]

In [95]:
# Sample passage containing several section references (e.g. "section 14. 08").
# NOTE(review): not referenced by any other cell visible here — presumably kept
# for manual experiments with ask_bedrock_for_ner; confirm before removing.
sample_text = """
3 despite subsection ( 2 ) of this section, if under section 14. 08 ( a ) the director commences proceedings under section 3 in relation to the subject property, the public body entitled to maintain possession of the subject property under section 14. 05 continues to be entitled to maintain possession of that property until expiry of the 30 day period described in section 14. 05 ( a ). 4 this part does not apply in relation to property if the property is the subject of an order of a court establishing a right of possession in that property with a person other than the public body or authorizing a person other than the public body to have or take possession of that property
"""

In [96]:
import re

# One label triple such as [20, 26, "REF_IN"]; whitespace around tokens is optional.
_LABEL_TRIPLE = r"\[\s*\d+\s*,\s*\d+\s*,\s*[\"'][A-Z_]+[\"']\s*\]"
# A non-empty, comma-separated list of label triples inside an outer pair of brackets.
_LABEL_LIST_RE = re.compile(
  r"\[\s*" + _LABEL_TRIPLE + r"(?:\s*,\s*" + _LABEL_TRIPLE + r")*\s*\]"
)

def extract_labels(result):
  """Extract the first label array found in a model answer.

  The answer may contain arbitrary text around the array. Spacing between
  tokens is flexible, so both ``[[1, 2, "X"], [3, 4, "X"]]`` and
  ``[[1,2,"X"],[3,4,"X"]]`` are accepted. Labels may use single or double
  quotes and must consist of uppercase letters and underscores.

  Args:
    result: The raw text returned by the model.

  Returns:
    A list of ``[start, end, label]`` lists, or ``None`` when no label array
    is present in ``result``.
  """
  match = _LABEL_LIST_RE.search(result)
  if match is None:
    return None
  # JSON requires double quotes. The pattern only admits [A-Z_] inside the
  # quotes, so a blanket quote replacement cannot corrupt a label.
  return json.loads(match.group().replace("'", '"'))
  

In [97]:
# Few-shot examples for the prompt: (text, label) pairs taken from the first
# labelled records of the Doccano export.
examples = []
max_examples = 10
examples_added = 0
with open("./NER Training/doccano_export.jsonl", "r", encoding="utf-8") as export_file:
  for line in export_file:
    # `>=` so that exactly `max_examples` examples are collected; the previous
    # `>` comparison let one extra example through.
    if examples_added >= max_examples:
      break
    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
    if not line.strip():
      continue
    obj = json.loads(line)
    line_text = obj["text"]
    label = obj["label"]
    # Records without any labels are skipped rather than used as examples.
    if len(label) > 0:
      examples.append((line_text, label))
      examples_added += 1

def ask_bedrock_for_ner(text):
  """Ask the model to label in-act section references in ``text``.

  The prompt is made of fixed task instructions, the module-level
  ``examples`` list (read at call time), and the text to label.

  Args:
    text: The passage to annotate.

  Returns:
    Whatever ``extract_labels`` returns for the model's answer: the parsed
    label array, or ``None`` when no label array was found in the answer.
  """
  # NOTE(review): the first example below quotes the phrase with "12.5" but
  # names the id as "12. 5", and the single-reference examples show a flat
  # [start, end, label] while the requested format is a nested list. Left
  # unchanged here; confirm the intended wording.
  instructions = """
  Help me complete this NER task.

  I have this tag: REF_IN

  Every time there is a reference to a section within the same act, I need to receive the starting and ending position of the numerical act id. 

  I would like the results to use the following format:
  [[starting index, ending index, label]]

  For example, in the phrase "Please refer to section 12.5" The act id to label is "12. 5" and the result would be [24, 28, "REF_IN"]
  In the phrase, "According to section 3," the act id to label is "3" and the result would be [21, 22, "REF_IN"]

  When there are multiple instances in a text, they should be in a list based on their order of occurrence.
  For example, in the sentence "Subject to sections 14. 04 to 14. 10" the result should be [[20, 26, "REF_IN"],[30, 36, "REF_IN"]]

  Only return the label array.

  Here is a list of tuples with the original text and the correct label array:
  """
  task = f"{examples}\nYou try this task with the following prompt:\n{text}"
  prompt = instructions + task
  return extract_labels(get_agent_response(prompt))


In [100]:
import threading

# Replace this with your known concurrency limit
CONCURRENCY_LIMIT = 10
semaphore = threading.BoundedSemaphore(CONCURRENCY_LIMIT)

# Annotate every record of the import file and write the labelled records to
# the output file, one JSON object per line. Records for which no label array
# could be extracted are not written.
with open("./NER Training/doccano_import_small.jsonl", "r", encoding="utf-8") as source_file:
  # UTF-8 is set explicitly because records are written with ensure_ascii=False.
  with open("./NER Training/bedrock_annotation_output.jsonl", "w", encoding="utf-8") as output:
    # The line number comes from enumerate so it is always defined and correct
    # in the error handler, even when parsing the line itself fails.
    for line_number, line in enumerate(source_file, start=1):
      # The loop is sequential, so the semaphore is never contended here.
      with semaphore:
        try:
          line_obj = json.loads(line)
          text = line_obj["text"]
          labels = ask_bedrock_for_ner(text)
          if labels is not None:
            print(line_number, text, labels)
            line_obj["label"] = labels
            json.dump(line_obj, output, ensure_ascii=False)
            output.write("\n")
        except Exception as e:
          # Best effort: report the failure and continue with the next record.
          print(f"Failed to send prompt for line {line_number}. Reason: {e}")
      


Failed to send prompt for line 1. Reason: An error occurred (UnrecognizedClientException) when calling the InvokeModel operation: The security token included in the request is invalid.
Failed to send prompt for line 2. Reason: An error occurred (UnrecognizedClientException) when calling the InvokeModel operation: The security token included in the request is invalid.
Failed to send prompt for line 3. Reason: An error occurred (UnrecognizedClientException) when calling the InvokeModel operation: The security token included in the request is invalid.
Failed to send prompt for line 4. Reason: An error occurred (UnrecognizedClientException) when calling the InvokeModel operation: The security token included in the request is invalid.
Failed to send prompt for line 5. Reason: An error occurred (UnrecognizedClientException) when calling the InvokeModel operation: The security token included in the request is invalid.
Failed to send prompt for line 6. Reason: An error occurred (UnrecognizedCl