# Annotation Validation

The purpose of this notebook is to double-check that all annotations have been applied properly and that nothing was skipped, and, if something was skipped, to show how to re-annotate the document using a more specialized prompt.

In [None]:
!pip install torch transformers diffgram neo4j anthropic pandas tqdm
!pip install llama_index
!pip install boto3

In [None]:
!pip install arize-phoenix-otel
!pip install openinference-instrumentation-bedrock opentelemetry-exporter-otlp

In [1]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from diffgram import Project
from typing import List, Dict, Optional
import anthropic
import json
from neo4j import GraphDatabase
from tqdm import tqdm
import logging
import os
import sys
import boto3
import requests
import pprint

  from .autonotebook import tqdm as notebook_tqdm


## Connect to Diffgram

In [2]:
# Diffgram project configuration.
# Host and project id fall back to this notebook's defaults. The API key pair
# is read from the environment so that no secret is stored in the notebook:
# export DIFFGRAM_CLIENT_ID and DIFFGRAM_CLIENT_SECRET before starting the
# kernel. Any key pair that was previously committed here should be rotated.
DIFFGRAM_CONFIG = {
    "host": os.environ.get("DIFFGRAM_HOST", "http://dispatcher:8085"),
    "project_string_id": os.environ.get("DIFFGRAM_PROJECT_STRING_ID", "translucenttracker"),
    "client_id": os.environ.get("DIFFGRAM_CLIENT_ID"),
    "client_secret": os.environ.get("DIFFGRAM_CLIENT_SECRET"),
}

In [3]:
# Initialize connection to Diffgram project.
# Every connection setting is taken from DIFFGRAM_CONFIG so the host, project
# id and credentials are defined in exactly one place.
_missing_settings = [key for key in ("client_id", "client_secret") if not DIFFGRAM_CONFIG.get(key)]
if _missing_settings:
    # Fail early with a readable message instead of an authentication error later on.
    raise ValueError(f"Missing Diffgram settings: {', '.join(_missing_settings)}")

project = Project(
    host=DIFFGRAM_CONFIG["host"],
    project_string_id=DIFFGRAM_CONFIG["project_string_id"],
    client_id=DIFFGRAM_CONFIG["client_id"],
    client_secret=DIFFGRAM_CONFIG["client_secret"],
)
# Second name for the same Project object; the later cells use `project_local`.
project_local = project

## Fetch Schema

In [4]:
# Name of the Diffgram schema whose labels are looked up in the cells below.
NER_schema_name = "ENTITY_TRAINING_SCHEMA"

In [5]:
def get_schema_list(id, timeout=30):
    """Fetch the labels registered under one schema of the configured project.

    Args:
        id: Identifier of the schema; sent as the ``schema_id`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200, otherwise
        ``None`` (the status code and response text are printed).

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """
    url = f"{DIFFGRAM_CONFIG['host']}/api/project/{DIFFGRAM_CONFIG['project_string_id']}/labels?schema_id={id}"
    # GET request authenticated with the SDK session's credentials. The timeout
    # keeps an unresponsive server from blocking the notebook indefinitely.
    response = requests.get(url, auth=project.session.auth, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    print(f"Error: {response.status_code}")
    print(response.text)  # Print error details for debugging
    return None

In [6]:
schema_id = None

# List the existing schemas in your Diffgram project.
schemas = project.schema.list()
schema_list = schemas
print("Existing Schemas in Diffgram:")
print(json.dumps(schemas, indent=2))

# Check if a schema with the name NER_schema_name already exists (first match wins).
for schema in schemas:
    if schema.get('name') == NER_schema_name:
        schema_id = schema.get('id')
        break

# If the schema does not exist, create a new one.
if schema_id is None:
    print(f"Schema '{NER_schema_name}' not found. Creating a new one...")
    json_response = project.new_schema(name=NER_schema_name)
    schema_id = json_response.get("id")
    print(f"Created new schema with id: {schema_id}")
else:
    print(f"Schema '{NER_schema_name}' already exists with id: {schema_id}")

schema_labels = get_schema_list(schema_id)

# Keep the id and name of every label already present in the schema.
# get_schema_list returns None when the request fails; the list then stays empty.
schema_label_id_value = []
if schema_labels is not None:
    labels = schema_labels['labels_out']
    for label in labels:
        schema_label_id_value.append({'id': label['id'], 'name': label['label']['name']})

# Distinct, non-empty label names.
existing_label_names = {entry['name'] for entry in schema_label_id_value if entry.get('name')}

# Explicit emptiness check instead of probing index 0 inside a bare `except:`,
# which would also have hidden unrelated errors.
if schema_label_id_value:
    print(existing_label_names)
else:
    print("There are no schema labels")

Existing Schemas in Diffgram:
[
  {
    "archived": false,
    "id": 8,
    "is_default": true,
    "member_created_id": 1,
    "member_updated_id": null,
    "name": "Default Schema",
    "project_id": 4,
    "time_created": "2025-02-04 22:16:17",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 9,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "NER_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:08:24",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 11,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "ENTITY_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:20:02",
    "time_updated": null
  }
]
Schema 'ENTITY_TRAINING_SCHEMA' already exists with id: 11
{'B-CHUNK_ID', 'B-REQUIREMENT', 'B-AUTHORITY', 'I-METADATA_VALUE', 'I-DEFINITION', 'I-SEQUENCE_ID', 'I-ACT_ID', 'B-SECTION_ID', 'B-METADATA_FIELD', '

## NER Validation Prompt

## Utilities

In [17]:
def extract_word_data(url, timeout=30):
    """Download and decode the JSON document behind a file's token URL.

    Args:
        url: URL of the token file. A ``http://localhost:8085`` prefix is
            replaced with the host configured in ``DIFFGRAM_CONFIG``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON content when the server answers with HTTP 200,
        otherwise ``None`` (the status code is printed).

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """
    # URLs that point at localhost are redirected to the configured Diffgram host.
    file_url = url.replace("http://localhost:8085", DIFFGRAM_CONFIG['host'])

    # The timeout keeps an unresponsive server from blocking the notebook indefinitely.
    response = requests.get(file_url, timeout=timeout)
    if response.status_code == 200:
        # The file is assumed to be JSON; response.json() raises if it is not.
        return response.json()
    print(f"Failed to retrieve the file. Status code: {response.status_code}")
    return None

In [18]:
# Utility functions for processing Diffgram annotations
def get_file_number(completed_annotations, files_index_in_job):
    """Collect the file keys of a job export into ``files_index_in_job``.

    Every key of ``completed_annotations`` that ``int()`` can convert is
    treated as a file entry and appended, unchanged, to ``files_index_in_job``.
    All other keys (for example ``'label_map'`` or ``'readme'``) are ignored.

    The earlier version of this function also contained a sentence/label
    extraction pass, but it sat behind an unconditional ``continue`` and never
    ran; that unreachable code has been removed.

    Args:
        completed_annotations: Mapping produced by a job export.
        files_index_in_job: List that receives the file keys; mutated in place.

    Returns:
        None. The result is delivered through ``files_index_in_job``.
    """
    for completed_annotation in completed_annotations:
        try:
            int(completed_annotation)
        except (TypeError, ValueError):
            # Not a numeric key, so not a file entry.
            continue
        files_index_in_job.append(completed_annotation)

## Scan all the tasks
In each task, check whether the data has annotations.
If it does not, add the task id to the annotation-pending bucket.

In [8]:
# Bucket for the ids of tasks whose data turns out to have no annotations.
annotation_pending = list()

In [9]:
# Job object of the Diffgram project. The validation loop further down reuses
# it for every job by calling refresh_from_dict() and then generate_export().
results = project_local.job

In [10]:
# Fetch the project's jobs in one page.
# NOTE(review): limit=10000 is presumably meant to exceed the number of jobs — confirm.
get_job = project_local.job.list(limit=10000, page_number=1)

In [11]:
# Number of jobs returned by the listing call above.
print(len(get_job))

250


In [12]:
# Record, for each job whose first attached directory has a nickname, that
# nickname together with the job's position in `get_job`.
jobs_with_data_index = []
for job_key, job_list in enumerate(get_job):
    try:
        nickname = job_list['attached_directories_dict']['attached_directories_list'][0]['nickname']
    except KeyError:
        # One of the expected keys is absent from this job.
        print("Key not found.")
        continue
    except IndexError:
        # The job has no attached directory.
        print("List index out of range.")
        continue
    if not nickname:
        continue
    jobs_with_data_index.append({'nickname': nickname, 'index': job_key})

In [None]:
# Inspect the nickname/index records collected for the jobs above.
jobs_with_data_index

In [14]:
# Keys of a job export that carry export-level metadata rather than a file's annotations.
_EXPORT_METADATA_KEYS = frozenset({
    'attribute_groups_reference',
    'export_info',
    'label_map',
    'readme',
    'label_colour_map',
})


def vallidate_annotation(completed_annotations, files_that_need_annotation, failed_to_annotate,
                         annotation_not_complete, job_index, incorrect_count, max_token_diff=4):
    """Compare the word count of every file in a job export with its annotation count.

    For each file key of ``completed_annotations`` the file is fetched from
    Diffgram, its tokens are downloaded, and the words (ignoring ``'\\n'``
    tokens) are counted. The file is then classified:

    * word count equals the number of exported instances: nothing is recorded;
    * the absolute difference between the word count and the file's instance
      count is below ``max_token_diff``: recorded in ``incorrect_count``;
    * otherwise: recorded in ``failed_to_annotate`` when the export holds no
      instance at all, else in ``annotation_not_complete``; the file key is
      also appended to ``files_that_need_annotation``.

    NOTE(review): the equality check uses the instance count of the export
    while the tolerance check uses the instance count of the fetched file —
    confirm that mixing the two sources is intended.

    Args:
        completed_annotations: Mapping produced by ``generate_export()``.
        files_that_need_annotation: List receiving the keys of files to re-annotate.
        failed_to_annotate: List receiving records of files without any instance.
        annotation_not_complete: List receiving records of partially annotated files.
        job_index: Dict with the ``'nickname'`` and ``'index'`` of the job.
        incorrect_count: List receiving records of files with a small mismatch.
        max_token_diff: Mismatches strictly below this size are only reported
            as "skipping". Defaults to 4, the previously hard-coded value.

    Returns:
        None. Results are delivered through the list arguments.
    """
    for completed_annotation in completed_annotations:
        if completed_annotation in _EXPORT_METADATA_KEYS:
            continue

        file = project_local.file.get_by_id(completed_annotation, with_instances=True)
        file_attrs = vars(file)
        data = extract_word_data(file_attrs['text']['tokens_url_signed'])
        if data is None:
            # extract_word_data returns None when the download fails; report
            # the file instead of crashing on a None subscript.
            print(f"ERROR: could not download the tokens of file id {completed_annotation}; file not validated")
            continue

        # Newline tokens are not annotated, so they are left out of the count.
        word_count = sum(1 for word in data['nltk']['words'] if word['value'] != '\n')
        annotated_count = len(completed_annotations[completed_annotation]['instance_list'])

        ## Count the number of instances
        num_annotated_text_index = len(file_attrs['instance_list'])

        if word_count == annotated_count:
            continue

        job_value = {
            'nickname': job_index['nickname'],
            'index': job_index['index'],
            'file': completed_annotation,
        }

        # Use the absolute difference for the tolerance check. The signed
        # difference let files with many more instances than words through
        # as a "small" mismatch.
        diff = abs(word_count - num_annotated_text_index)
        if diff < max_token_diff:
            incorrect_count.append(job_value)
            print(f"skipping:  file id {completed_annotation} of task {job_index['nickname']} of index {job_index['index']} diff {diff}") 
            continue

        if annotated_count == 0:
            failed_to_annotate.append(job_value)
        else:
            annotation_not_complete.append(job_value)
        print(f"ERROR: The file id is: {completed_annotation} and total annotation is {annotated_count} and word count is {word_count}")
        files_that_need_annotation.append(completed_annotation)

In [15]:
# Number of jobs for which a directory nickname was found.
len(jobs_with_data_index)

250

In [None]:
# Buckets filled in by vallidate_annotation while every job is checked.
files_that_need_annotation, failed_to_annotate = [], []
annotation_not_complete, incorrect_count = [], []

for job_index in jobs_with_data_index:
    print(f"The job nickname is {job_index['nickname']} and the index is {job_index['index']}")
    # Point the shared job object at this job, then export its annotations.
    results.refresh_from_dict(get_job[job_index['index']])
    completed_annotations = results.generate_export()
    vallidate_annotation(
        completed_annotations=completed_annotations,
        files_that_need_annotation=files_that_need_annotation,
        failed_to_annotate=failed_to_annotate,
        annotation_not_complete=annotation_not_complete,
        job_index=job_index,
        incorrect_count=incorrect_count,
    )

# Total number of files flagged for re-annotation across all jobs.
print(len(files_that_need_annotation))

The job nickname is NER_train_batch_249 and the index is 0
The job nickname is NER_train_batch_248 and the index is 1
skipping:  file id 20693 of task NER_train_batch_248 of index 1 diff 2
skipping:  file id 20718 of task NER_train_batch_248 of index 1 diff 1
The job nickname is NER_train_batch_247 and the index is 2
The job nickname is NER_train_batch_246 and the index is 3
