In [None]:
!pip install boto3
!pip install torch transformers diffgram neo4j anthropic pandas tqdm
!pip install llama_index

In [1]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from diffgram import Project
from typing import List, Dict, Optional
import anthropic
import json
from neo4j import GraphDatabase
from tqdm import tqdm
import logging
import os
import sys
import boto3
import requests
import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# `__file__` is not defined in interactive environments, so anchor on the
# current working directory instead.
current_dir = os.getcwd()

# The package is expected to live one level above the working directory.
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Put the parent directory at the front of the import path, but only once.
if sys.path.count(parent_dir) == 0:
    sys.path.insert(0, parent_dir)

In [3]:
from AgenticWorkflow.bedrock_session import get_boto_session

In [4]:
# Session object returned by the project helper get_boto_session(); the
# Bedrock runtime client is created from it (presumably a boto3 Session — confirm in
# AgenticWorkflow.bedrock_session).
session = get_boto_session()

In [5]:
# Bedrock runtime client, pinned to us-east-1; get_response() sends its
# invoke_model calls through this object.
bedrock_runtime = session.client("bedrock-runtime", region_name="us-east-1")

In [6]:
def get_claudia_kwargs(prompt,
                       model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
                       max_tokens=10000):
    """Build the keyword arguments for a Bedrock ``invoke_model`` call.

    The request carries a single user message whose only content block is the
    text of ``prompt``.

    Args:
        prompt: Text to send as the user message.
        model_id: Bedrock model identifier placed under ``"modelId"``.
            Defaults to the model this notebook originally hard-coded.
        max_tokens: Value placed under ``"max_tokens"`` in the request body.
            Defaults to the originally hard-coded 10000.

    Returns:
        dict with the keys ``modelId``, ``contentType``, ``accept`` and
        ``body``; ``body`` is the JSON-encoded request payload (a string).
    """
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            }
        ],
    }
    return {
        "modelId": model_id,
        "contentType": "application/json",
        "accept": "application/json",
        "body": json.dumps(request_body),
    }

In [7]:
prompt = "Does this work?"

In [8]:
kwargs = get_claudia_kwargs(prompt)

In [9]:
def get_response(prompt):
    """Send ``prompt`` through the Bedrock runtime client and return the reply text.

    The request is built by ``get_claudia_kwargs``; the returned value is the
    ``text`` field of the first entry in the response's ``content`` list.
    """
    raw_response = bedrock_runtime.invoke_model(**get_claudia_kwargs(prompt))
    # The body is a stream-like object: read it fully, then decode the JSON.
    payload = json.loads(raw_response.get("body").read())
    first_block = payload['content'][0]
    return first_block['text']

In [10]:
response = get_response(prompt)

In [11]:
response

'I apologize, but I don\'t have any context about what you\'re referring to when you ask "Does this work?" Without more information, I can\'t determine if something works or not. If you have a specific question, problem, or task in mind, please provide more details so I can better assist you. What exactly are you trying to do or asking about?'

In [12]:
# Configuration
# Diffgram connection settings. The credentials are read from the environment
# instead of being written into this cell: export DIFFGRAM_CLIENT_ID and
# DIFFGRAM_CLIENT_SECRET before starting the kernel. The key pair that used to
# be hard-coded here should be treated as leaked and rotated.
DIFFGRAM_CONFIG = {
    "host": os.environ.get("DIFFGRAM_HOST", "http://dispatcher:8085"),
    "project_string_id": os.environ.get("DIFFGRAM_PROJECT_STRING_ID", "translucenttracker"),
    "client_id": os.environ.get("DIFFGRAM_CLIENT_ID"),
    "client_secret": os.environ.get("DIFFGRAM_CLIENT_SECRET"),
}

# Surface missing credentials here rather than as an opaque failure later on.
for _name in ("client_id", "client_secret"):
    if not DIFFGRAM_CONFIG[_name]:
        print(f"WARNING: DIFFGRAM_{_name.upper()} is not set; Diffgram calls will likely fail to authenticate.")

In [13]:
# Build the Diffgram client from DIFFGRAM_CONFIG so the connection settings
# and credentials live in exactly one place instead of being repeated here.
project = Project(
    host=DIFFGRAM_CONFIG["host"],
    project_string_id=DIFFGRAM_CONFIG["project_string_id"],
    client_id=DIFFGRAM_CONFIG["client_id"],
    client_secret=DIFFGRAM_CONFIG["client_secret"],
)
# Second name for the same client object; the helper functions in this
# notebook refer to it as `project_local`.
project_local = project

In [15]:
# Define constants

# Sizes
BATCH_SIZE = 32          # passed to create_datasets() as the batch size
MAX_LENGTH = 256         # presumably the tokenizer max sequence length — not used in the cells shown here
MAX_NUM_OF_TASK = 250    # task-count limit consulted inside create_datasets()

# Amount of data
NUM_TRAIN_SAMPLES = 5440  # Number of samples to use for training
NUM_TRAINING_DATA = 5440  # passed to create_datasets() as num_training_data

# Diffgram directory (dataset) name prefixes; the batch index is appended
train_dataset_suffix = "NER_train_batch_"
test_dataset_suffix = "NER_test_batch_"

# Diffgram job names / job name prefixes; the batch index is appended to the prefixes
JOB_NAME = "Law_NER_task1"
JOB_TRAIN_SUFFIX = "NER_train_JOB_"
JOB_TEST_SUFFIX = "NER_test_JOB_"

# Name of the label schema looked up (or created) in the Diffgram project
NER_schema_name = 'ENTITY_TRAINING_SCHEMA'

## Import all the files
### Make sure you have the `diffgram_processing_v2` folder, which holds all the data arranged for the NER task (note: the loading cell below reads from `diffgram_processing`)

In [16]:
from llama_index.core import SimpleDirectoryReader, StorageContext

In [17]:
def file_metadata(x):
    """Metadata callback for the directory reader: store the value it is given under "filename"."""
    return {"filename": x}


# Load every file under "diffgram_processing"; each resulting document carries
# the metadata produced by file_metadata().
diffgram_documents = SimpleDirectoryReader(
    "diffgram_processing",
    file_metadata=file_metadata,
).load_data()

In [18]:
print(len(diffgram_documents))

81611


In [19]:
print(diffgram_documents[500].text)

Chunk ID: Provincial Sales Tax Act-chunk-Tax if tangible personal property no longer for temporary use-0000
Act ID: Provincial Sales Tax Act
Regulation ID: None
Section Name: Tax if tangible personal property no longer for temporary use
Section ID: 51.1
Sequence ID: 0
Text:
1 this section applies to a person in relation to tangible personal property if a section 51 applied to the person in relation to the tangible personal property, and b within 3 years after the date on which the tangible personal property is first used in british columbia and during a calculation year in respect of which tax was payable under section 51, the person uses that property, or allows that property to be used, in british columbia for a purpose other than for temporary use. 2 a person to whom this section applies must pay to the government tax in an amount equal to the amount of tax under section 49 that would have otherwise been payable if that section had applied to the person in relation to the tangible p

## Diffgram utilities

In [20]:
def check_if_directory_exist(dir_name):
    """Look up a Diffgram directory by its nickname.

    Fetches the project's directory list (limit 50000) and returns the first
    entry whose ``nickname`` equals ``dir_name``, or ``None`` when no entry
    matches.
    """
    directories = project_local.directory.get_directory_list(limit=50000)
    matching = (
        entry for entry in directories
        if entry.__dict__['nickname'] == dir_name
    )
    return next(matching, None)

In [21]:
## You may need to run this twice to see if the directory is created
def create_diffgram_directory(dataset_name):
    """Return the Diffgram directory named ``dataset_name``, creating it if needed.

    An existing directory is returned as-is. Otherwise a new directory is
    requested and the directory list is queried again.

    Returns:
        The directory object, or ``None`` when the newly requested directory
        does not show up in the listing yet. Callers must handle ``None``.
    """
    directory = check_if_directory_exist(dataset_name)
    if directory is not None:
        return directory

    project_local.directory.new(name=dataset_name)
    directory = check_if_directory_exist(dataset_name)
    if directory is None:
        # Previously this path raised AttributeError on `directory.__dict__`;
        # report the situation and let the caller decide what to do.
        print(f"Directory {dataset_name} was requested but is not listed yet; run again to pick it up.")
    else:
        print(directory.__dict__)
    return directory

In [22]:
## check if file exist in the dir
def check_if_file_exist_in_dir(filename):
    """Ask Diffgram whether ``filename`` exists and return its answer.

    Returns whatever ``project_local.file.file_list_exists`` returns.
    Previously the result was assigned to a local and dropped, so the
    function always returned ``None``.

    NOTE(review): confirm that ``file_list_exists`` accepts a filename here
    (as opposed to a list of file ids).
    """
    return project_local.file.file_list_exists(filename)

In [23]:
def create_dataset_job(data_suffix, job_suffix, index, member_list_ids):
    """Create (or reuse) one batch directory and launch a Diffgram job on it.

    The directory is named ``data_suffix + str(index)`` and the job
    ``job_suffix + str(index)``. The job is created with the module-level
    ``schema_id`` as its label schema and is synced to the directory.

    Returns:
        The directory object, or ``None`` when the directory could not be
        obtained (in which case no job is created).
    """
    batch_label = str(index)
    dataset_batch_name = data_suffix + batch_label

    directory = create_diffgram_directory(dataset_batch_name)
    if directory is None:
        print(f"{dataset_batch_name} Directory does not exist")
        return None

    job_name = job_suffix + batch_label
    job_settings = dict(
        name=job_name,
        instance_type="box",
        share="Project",
        sync_directories=[directory],
        label_schema_id=schema_id,
        tag_list=["Laws", "Acts", "Regulations"],
        members_list_ids=member_list_ids,
        auto_launch=True,
    )
    project_local.job.new(**job_settings)

    print(f"The {job_name} task is created")
    return directory

In [24]:
def upload_files_to_dataset(index,batch_size, offset, directory):
    """Upload one batch of documents to a Diffgram directory.

    The batch covers positions ``(index + offset) * batch_size`` up to (but
    not including) that start plus ``batch_size`` in the module-level
    ``diffgram_documents`` list, clipped to the end of the list. Each
    document's ``metadata['filename']`` is uploaded with
    ``project_local.file.from_local``.

    A failed upload is reported and skipped so the rest of the batch still
    goes through.

    Args:
        index: Batch index.
        batch_size: Number of documents per batch.
        offset: Number of batches to skip before ``index``.
        directory: Diffgram directory object; its ``id`` is the upload target.
    """
    start = (index + offset) * batch_size
    # Clip to the available documents so the last batch cannot raise IndexError.
    stop = min(start + batch_size, len(diffgram_documents))
    if start >= stop:
        print(f"No documents available for batch {index} (start position {start}).")
        return

    directory_id = directory.__dict__['id']
    for position in range(start, stop):
        filename = diffgram_documents[position].metadata['filename']
        try:
            project_local.file.from_local(filename, directory_id=directory_id)
        except Exception as error:
            # Catch Exception, not a bare except, so Ctrl-C still interrupts a
            # long upload, and report the real error instead of assuming the
            # file already exists.
            print(f"Upload of {filename} failed (it may already exist in this directory): {error!r}. Continuing ....")
            continue

## Get schema id

In [25]:
# Fetch the schemas that already exist in the Diffgram project and show them.
schemas = project.schema.list()
print("Existing Schemas in Diffgram:")
print(json.dumps(schemas, indent=2))

# Take the id of the first schema named NER_schema_name, if there is one.
schema_id = next(
    (entry.get('id') for entry in schemas if entry.get('name') == NER_schema_name),
    None,
)

# Reuse the existing schema, otherwise create it and keep the new id.
if schema_id is not None:
    print(f"Schema '{NER_schema_name}' already exists with id: {schema_id}")
else:
    print(f"Schema '{NER_schema_name}' not found. Creating a new one...")
    json_response = project.new_schema(name=NER_schema_name)
    schema_id = json_response.get("id")
    print(f"Created new schema with id: {schema_id}")

Existing Schemas in Diffgram:
[
  {
    "archived": false,
    "id": 8,
    "is_default": true,
    "member_created_id": 1,
    "member_updated_id": null,
    "name": "Default Schema",
    "project_id": 4,
    "time_created": "2025-02-04 22:16:17",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 9,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "NER_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:08:24",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 11,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "ENTITY_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:20:02",
    "time_updated": null
  }
]
Schema 'ENTITY_TRAINING_SCHEMA' already exists with id: 11


## Upload the files

In [27]:
import math
def create_datasets(diffgram_documents, num_training_data, batch_size, train_suffix, test_suffix, job_train_suffix, job_test_suffix, start_index=66):
    """Create the training batch directories and jobs in Diffgram and upload their files.

    For every batch index from ``start_index`` up to the number of training
    batches, a directory and job are created with ``create_dataset_job`` and
    the batch's files are uploaded with ``upload_files_to_dataset``.

    Args:
        diffgram_documents: Documents available for training. Inside this
            function it is only used for the size check;
            ``upload_files_to_dataset`` is called without it and picks the
            documents itself.
        num_training_data: Number of documents to spread over training batches.
        batch_size: Number of documents per batch.
        train_suffix: Name prefix for training directories.
        test_suffix: Name prefix for test directories (only referenced by the
            disabled test loop below).
        job_train_suffix: Name prefix for training jobs.
        job_test_suffix: Name prefix for test jobs (only referenced by the
            disabled test loop below).
        start_index: First batch index to process. Defaults to 66, the value
            that used to be hard-coded, so an interrupted run can be resumed
            from another index without editing the function.

    Returns:
        None.
    """
    # Check that the amount of data is equal to or more than num_training_data.
    if len(diffgram_documents) < num_training_data:
        print(f"Not sufficient data for training {len(diffgram_documents)}")
        return

    train_batch_size = math.floor(num_training_data / batch_size)
    test_batch_size = math.floor((num_training_data * (5 / 100)) / batch_size)

    print(f"The batch size of the training data is : {train_batch_size}")
    print(f"The batch size of the test data is: {test_batch_size}")

    member_list = project.get_member_list()
    member_list_ids = [member['member_id'] for member in member_list]

    # MAX_NUM_OF_TASK is an upper limit, so cap with min(). The previous max()
    # raised the count to at least MAX_NUM_OF_TASK, creating more batches than
    # num_training_data accounts for.
    train_batch_size = min(MAX_NUM_OF_TASK, train_batch_size)
    test_batch_size = min(MAX_NUM_OF_TASK, test_batch_size)

    for index in range(start_index, train_batch_size):
        directory = create_dataset_job(train_suffix, job_train_suffix, index, member_list_ids)
        if directory is None:
            # create_dataset_job returns None when the directory is unavailable;
            # skip the batch instead of failing on `directory.__dict__`.
            print(f"Skipping batch {index}: no directory available.")
            continue

        print(f"Creating / Uploading data to directory {directory.__dict__['nickname']}")
        upload_files_to_dataset(index, batch_size, 0, directory)

    # TODO: test batches are currently disabled.
#    for index in range(0, test_batch_size):
#        directory = create_dataset_job(test_suffix, job_test_suffix, index, member_list_ids)
#
#        print(f"Creating / Uploading data to directory {directory.__dict__['nickname']}")
#        upload_files_to_dataset(index,batch_size, train_batch_size+1, directory)

In [28]:
create_datasets(diffgram_documents[2112:], NUM_TRAINING_DATA, BATCH_SIZE, train_dataset_suffix, test_dataset_suffix,  JOB_TRAIN_SUFFIX, JOB_TEST_SUFFIX)

The batch size of the training data is : 170
The batch size of the test data is: 8
The NER_train_JOB_66 task is created
Creating / Uploading data to directory NER_train_batch_66
{'client': <diffgram.core.core.Project object at 0x7f6f4bedbc50>, 'id': 332, 'file_list_metadata': {'annotation_status': 'All', 'date_from': None, 'date_to': None, 'directory_id': 332, 'end_index': 0, 'file': {}, 'file_count': 0, 'file_view_mode': 'ids_only', 'issues_filter': None, 'job_id': None, 'label': {'start_index': 0}, 'length_current_page': 0, 'limit': 5000, 'machine_made_setting': 'All', 'media_type': 'All', 'next_page': None, 'no_results_match_search': True, 'page': 1, 'prev_page': 0, 'query': None, 'regen_url': True, 'search_term': None, 'start_index': 0, 'total_pages': 0, 'with_children_files': False}, 'nickname': 'NER_train_batch_67', 'file_id_list': [], 'diffgram_file_id_list': [], 'max_size_cache': 1073741824, 'pool': <concurrent.futures.thread.ThreadPoolExecutor object at 0x7f6f08cefed0>, 'custo