# Annotation Validation

The purpose of this notebook is to double-check that all documents are properly annotated and that nothing was skipped, and, where something was missed, to show how to re-annotate the document using a more specialized prompt.

In [None]:
!pip install torch transformers diffgram neo4j anthropic pandas tqdm
!pip install llama_index
!pip install boto3
!pip install pandas

In [None]:
!pip install arize-phoenix-otel
!pip install openinference-instrumentation-bedrock opentelemetry-exporter-otlp

In [1]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from diffgram import Project
from typing import List, Dict, Optional
import anthropic
import json
from neo4j import GraphDatabase
from tqdm import tqdm
import logging
import os
import sys
import boto3
import requests
import pprint

## Connect to Diffgram

In [2]:
# Diffgram project configuration.
#
# Credentials are read from the environment so that secrets never live in the
# notebook or its version history. Export DIFFGRAM_CLIENT_ID and
# DIFFGRAM_CLIENT_SECRET before starting the kernel. DIFFGRAM_HOST and
# DIFFGRAM_PROJECT_STRING_ID are optional overrides of the defaults below.
#
# SECURITY: a client id/secret pair used to be hard-coded in this cell; treat
# that pair as leaked and rotate it in Diffgram.
DIFFGRAM_CONFIG = {
    "host": os.environ.get("DIFFGRAM_HOST", "http://dispatcher:8085"),
    "project_string_id": os.environ.get("DIFFGRAM_PROJECT_STRING_ID", "translucenttracker"),
    "client_id": os.environ.get("DIFFGRAM_CLIENT_ID", ""),
    "client_secret": os.environ.get("DIFFGRAM_CLIENT_SECRET", ""),
}

# Surface a missing credential here rather than as an opaque auth failure later.
_missing_diffgram_settings = [
    key for key in ("client_id", "client_secret") if not DIFFGRAM_CONFIG[key]
]
if _missing_diffgram_settings:
    print(
        "WARNING: missing Diffgram credentials "
        f"{_missing_diffgram_settings}; set DIFFGRAM_CLIENT_ID / DIFFGRAM_CLIENT_SECRET."
    )

In [3]:
# Initialize the connection to the Diffgram project.
# Every setting comes from DIFFGRAM_CONFIG so host, project id and credentials
# are defined in exactly one place and are not repeated as literals here.
project = Project(
    host=DIFFGRAM_CONFIG["host"],
    project_string_id=DIFFGRAM_CONFIG["project_string_id"],
    client_id=DIFFGRAM_CONFIG["client_id"],
    client_secret=DIFFGRAM_CONFIG["client_secret"],
)
# Alias used by the validation helpers further down in the notebook.
project_local = project

## Fetch Schema

In [4]:
# Name of the Diffgram schema that the following cells look up
# (and create when it does not exist yet) before reading its labels.
NER_schema_name = "ENTITY_TRAINING_SCHEMA"

In [5]:
def get_schema_list(id):
    """Fetch the labels that belong to one Diffgram schema.

    Issues a GET against the project's ``/labels`` endpoint, authenticated with
    the SDK session's credentials.

    Args:
        id: Schema id to query. (The name shadows the ``id`` builtin; it is
            kept so existing keyword callers continue to work.)

    Returns:
        The decoded JSON body when the server answers 200, otherwise ``None``
        (the status code and response text are printed for debugging).
    """
    auth = project.session.auth
    url = f"{DIFFGRAM_CONFIG['host']}/api/project/{DIFFGRAM_CONFIG['project_string_id']}/labels?schema_id={id}"
    # requests has no default timeout; without one a stalled server would hang
    # the notebook indefinitely.
    response = requests.get(url, auth=auth, timeout=30)
    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}")
    print(response.text)  # Print error details for debugging
    return None

In [6]:
schema_id = None

# List the existing schemas in your Diffgram project.
schemas = project.schema.list()
schema_list = schemas
print("Existing Schemas in Diffgram:")
print(json.dumps(schemas, indent=2))

# Check if a schema with the name NER_schema_name already exists.
for schema in schemas:
    if schema.get('name') == NER_schema_name:
        schema_id = schema.get('id')
        break

# If the schema does not exist, create a new one.
if schema_id is None:
    print(f"Schema '{NER_schema_name}' not found. Creating a new one...")
    json_response = project.new_schema(name=NER_schema_name)
    schema_id = json_response.get("id")
    print(f"Created new schema with id: {schema_id}")
else:
    print(f"Schema '{NER_schema_name}' already exists with id: {schema_id}")

schema_labels = get_schema_list(schema_id)

# Flatten the label payload into {id, name} records.
# get_schema_list returns None on a failed request, in which case the list stays empty.
schema_label_id_value = []
if schema_labels is not None:
    labels = schema_labels['labels_out']
    schema_label_id_value = [
        {'id': label['id'], 'name': label['label']['name']}
        for label in labels
    ]

# Names already present in the schema, used to avoid creating duplicates.
# An explicit emptiness check replaces the former bare `except:` around an
# index probe, which would also have hidden unrelated errors.
existing_label_names = {
    entry['name'] for entry in schema_label_id_value if entry.get('name')
}
if schema_label_id_value:
    print(existing_label_names)
else:
    print("There are no schema labels")

Existing Schemas in Diffgram:
[
  {
    "archived": false,
    "id": 8,
    "is_default": true,
    "member_created_id": 1,
    "member_updated_id": null,
    "name": "Default Schema",
    "project_id": 4,
    "time_created": "2025-02-04 22:16:17",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 9,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "NER_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:08:24",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 11,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "ENTITY_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:20:02",
    "time_updated": null
  }
]
Schema 'ENTITY_TRAINING_SCHEMA' already exists with id: 11
{'B-ACT_ID', 'B-AUTHORITY', 'B-METADATA_VALUE', 'B-DEFINITION', 'B-SUBSECTION_REF', 'I-REGULATION_ID', 'I-METADATA_VALUE', 'O', 'I-REQUIREMENT', 'I-S

## NER Validation Prompt

## Utilities

In [7]:
def extract_word_data(url):
    """Download and decode the token JSON behind a signed Diffgram URL.

    Signed URLs that point at ``http://localhost:8085`` are rewritten to the
    configured Diffgram host (``DIFFGRAM_CONFIG['host']``) before the request
    is made.

    Args:
        url: Signed tokens URL as found on a Diffgram file object.

    Returns:
        The decoded JSON document on HTTP 200, otherwise ``None`` (the status
        code is printed). Callers that index the result must handle ``None``.
    """
    file_url = url.replace("http://localhost:8085", DIFFGRAM_CONFIG['host'])

    # requests has no default timeout; bound the call so one stalled download
    # cannot block an entire validation scan.
    response = requests.get(file_url, timeout=30)

    if response.status_code == 200:
        # The token file is expected to be JSON.
        return response.json()

    print(f"Failed to retrieve the file. Status code: {response.status_code}")
    return None

In [8]:
# Utility for processing Diffgram job exports:
# collect the file keys contained in an export.
def get_file_number(completed_annotations, files_index_in_job):
    """Append every file key of a Diffgram export to ``files_index_in_job``.

    A key counts as a file when ``int(key)`` succeeds; any other key (for
    example 'label_map' or 'readme') is skipped.

    The previous body also held sentence/label extraction code, but it sat
    after a ``try``/``except`` whose branches both ``continue``, so it could
    never run. That dead code and its unused locals are removed; observable
    behaviour is unchanged.

    Args:
        completed_annotations: Mapping (or other iterable of keys) produced by
            a job export.
        files_index_in_job: List that receives the file keys, in iteration
            order. Keys are appended unchanged, not converted to ``int``.

    Returns:
        None. The result is delivered through ``files_index_in_job``.
    """
    for completed_annotation in completed_annotations:
        try:
            int(completed_annotation)
        except (TypeError, ValueError):
            # Not an integer-like key, so not a file entry.
            continue
        files_index_in_job.append(completed_annotation)

## Scan all the tasks
For each task, check whether its data has annotations.
If it does not, add the task ID to the annotation-pending bucket.

In [9]:
# Bucket for task ids that still need annotation.
# NOTE(review): nothing visible in this notebook appends to it yet.
annotation_pending = []

In [10]:
# Shared job handle: the scanning cells below re-point it at one job with
# refresh_from_dict() and then call generate_export() on it.
results = project_local.job

In [11]:
# Despite the name, this holds the job list (one request, limit 10000);
# later cells index it by position to reach a single job record.
get_job = project_local.job.list(limit=10000, page_number=1)

In [12]:
# Number of job records returned by the listing above.
print(len(get_job))

250


In [13]:
# Build {nickname, index} records for every job whose first attached directory
# has a non-empty nickname; `index` is the job's position inside get_job.
jobs_with_data_index = []
for job_key, job_list in enumerate(get_job):
    try:
        attached_directories = job_list['attached_directories_dict']['attached_directories_list']
        nickname = attached_directories[0]['nickname']
    except KeyError:
        print("Key not found.")
    except IndexError:
        print("List index out of range.")
    else:
        if nickname:
            job_value = {'nickname': nickname, 'index': job_key}
            jobs_with_data_index.append(job_value)

In [None]:
# Inspect the collected {nickname, index} records (one per job that has data).
jobs_with_data_index

In [15]:
def vallidate_annotation_old(completed_annotations, files_that_need_annotation, failed_to_annotate, annotation_not_complete, job_index, incorrect_count):
    """Legacy validator: sort the files of one job export into result buckets.

    For every file key in the export, the file is fetched from Diffgram, its
    token JSON is downloaded, and the number of non-newline words is compared
    with the number of annotation instances.

    Classification:
      * word count equals the export's instance count: nothing is recorded.
      * absolute difference between word count and the file's instance count
        is below 4: recorded in ``incorrect_count``.
      * otherwise: recorded in ``failed_to_annotate`` when the export has no
        instances, else in ``annotation_not_complete``; the file key is also
        appended to ``files_that_need_annotation``.

    The near-miss rule now uses the absolute difference. The earlier signed
    comparison classed files with many more instances than words as near
    misses, while the printed ``diff`` was already absolute.

    Args:
        completed_annotations: Export mapping of file key to file data, plus
            metadata keys that are skipped.
        files_that_need_annotation: Output list of file keys needing work.
        failed_to_annotate: Output list of records for files with no instances.
        annotation_not_complete: Output list of records for partial files.
        job_index: Dict with the job's 'nickname' and 'index'.
        incorrect_count: Output list of records for near-miss files.

    Returns:
        None. All results are appended to the list arguments.

    Raises:
        TypeError: if the token download fails, because ``extract_word_data``
            then yields ``None`` and it is indexed here.
    """
    metadata_keys = {
        'attribute_groups_reference',
        'export_info',
        'label_map',
        'readme',
        'label_colour_map',
    }

    for completed_annotation in completed_annotations:
        if completed_annotation in metadata_keys:
            continue

        file = project_local.file.get_by_id(completed_annotation, with_instances=True)
        url = file.__dict__['text']['tokens_url_signed']
        data = extract_word_data(url)

        # Newline tokens are layout only and are not counted as words.
        word_count = sum(1 for word in data['nltk']['words'] if word['value'] != '\n')
        annotated_count = len(completed_annotations[completed_annotation]['instance_list'])

        # Instance count as stored on the file object itself.
        num_annotated_text_index = len(file.__dict__['instance_list'])

        job_value = {
            'nickname': job_index['nickname'],
            'index': job_index['index'],
            'file': completed_annotation,
        }

        if word_count == annotated_count:
            continue

        diff = abs(word_count - num_annotated_text_index)
        if diff < 4:
            incorrect_count.append(job_value)
            print(f"skipping:  file id {completed_annotation} of task {job_index['nickname']} of index {job_index['index']} diff {diff}")
            continue

        if annotated_count == 0:
            failed_to_annotate.append(job_value)
        else:
            annotation_not_complete.append(job_value)
        print(f"ERROR: The file id is: {completed_annotation} and total annotation is {annotated_count} and word count is {word_count}")
        files_that_need_annotation.append(completed_annotation)

In [16]:
import pandas as pd

# Empty error log; validate_annotation appends one row per problematic file.
annotation_errors_df = pd.DataFrame(
    columns=[
        "job_nickname",
        "index",
        "file_id",
        "error_type",
        "word_count",
        "annotated_count",
        "diff",
    ]
)

def validate_annotation(completed_annotations, df, job_index):
    """Validate the files of one job export and log problems into ``df``.

    NOTE: a later cell in this notebook defines another ``validate_annotation``
    with a different signature; once that cell runs it shadows this one.

    For each file key the file is fetched from Diffgram, its token JSON is
    downloaded, and the number of non-newline words is compared with the
    number of annotation instances. One row is appended to ``df`` per file
    that is not fully annotated:

      * "Incorrect Count (Minor)": absolute difference between word count and
        the file's instance count is below 4.
      * "Failed to Annotate": the export has no instances for the file.
      * "Annotation Not Complete": every other mismatch.

    The minor rule now tests the same absolute difference that is stored in
    the ``diff`` column. The earlier signed comparison classed files with many
    more instances than words as minor.

    Parameters:
        completed_annotations (dict): Export mapping of file key to file data;
            metadata keys are skipped.
        df (DataFrame): Error log with the columns job_nickname, index,
            file_id, error_type, word_count, annotated_count and diff.
        job_index (dict): Dictionary containing job 'nickname' and 'index'.

    Returns:
        None (modifies the DataFrame in-place).

    Raises:
        TypeError: if the token download fails, because ``extract_word_data``
            then yields ``None`` and it is indexed here.
    """
    metadata_keys = {"attribute_groups_reference", "export_info", "label_map", "readme", "label_colour_map"}

    for completed_annotation in completed_annotations:
        if completed_annotation in metadata_keys:
            continue

        # Retrieve file from Diffgram
        file = project_local.file.get_by_id(completed_annotation, with_instances=True)
        url = file.__dict__['text']['tokens_url_signed']
        data = extract_word_data(url)

        # Newline tokens are layout only and are not counted as words.
        word_count = sum(1 for word in data['nltk']['words'] if word['value'] != '\n')

        # Instance counts: from the export, and from the file object itself.
        annotated_count = len(completed_annotations[completed_annotation]['instance_list'])
        num_annotated_text_index = len(file.__dict__['instance_list'])

        diff = abs(word_count - num_annotated_text_index)
        job_value = {
            "job_nickname": job_index['nickname'],
            "index": job_index['index'],
            "file_id": completed_annotation,
            "word_count": word_count,
            "annotated_count": annotated_count,
            "diff": diff,
        }

        if word_count == annotated_count:
            continue  # Correctly annotated, no issue.

        if diff < 4:
            job_value["error_type"] = "Incorrect Count (Minor)"
            df.loc[len(df)] = job_value
            print(f"⚠ Skipping file {completed_annotation} for task {job_index['nickname']} (diff {job_value['diff']})")
            continue

        if annotated_count == 0:
            job_value["error_type"] = "Failed to Annotate"
        else:
            job_value["error_type"] = "Annotation Not Complete"

        df.loc[len(df)] = job_value  # Append to DataFrame
        print(f"❌ ERROR: File {completed_annotation} - Annotated {annotated_count} / Expected {word_count}")



In [17]:
# How many jobs have an attached directory with a nickname.
len(jobs_with_data_index)

250

## The older validation scanning function

In [None]:
# Result buckets; vallidate_annotation_old appends to them in place.
files_that_need_annotation, failed_to_annotate = [], []
annotation_not_complete, incorrect_count = [], []

for job_index in jobs_with_data_index:
    print(f"The job nickname is {job_index['nickname']} and the index is {job_index['index']}")
    # Point the shared job handle at this job, then export its annotations.
    results.refresh_from_dict(get_job[job_index['index']])
    completed_annotations = results.generate_export()
    vallidate_annotation_old(
        completed_annotations,
        files_that_need_annotation,
        failed_to_annotate,
        annotation_not_complete,
        job_index,
        incorrect_count,
    )

# Total number of files flagged for re-annotation across all jobs.
print(len(files_that_need_annotation))

## Improved validation scanning function 

In [None]:
# `ace_tools` is not importable in a regular Jupyter kernel, so the error
# table is rendered with IPython's own display helper instead.
from IPython.display import display

# Initialize an empty DataFrame for errors
annotation_errors_df = pd.DataFrame(columns=["job_nickname", "index", "file_id", "error_type", "word_count", "annotated_count", "diff"])

for job_index in jobs_with_data_index:
    print(f"The job nickname is {job_index['nickname']} and the index is {job_index['index']}")

    # Point the shared job handle at this job and export its annotations.
    results.refresh_from_dict(get_job[job_index['index']])
    completed_annotations = results.generate_export()

    # Validate and log errors into the DataFrame
    validate_annotation(completed_annotations, annotation_errors_df, job_index)

# Display the DataFrame in Jupyter Notebook
display(annotation_errors_df)


## Latest validation scanning function

In [22]:
import pandas as pd
import requests
# Importing display/HTML from IPython.core.display is deprecated and emits a
# DeprecationWarning; IPython.display is the supported location.
from IPython.display import display, HTML

# Initialize an empty DataFrame for annotation errors
annotation_errors_df = pd.DataFrame(columns=["Task Name", "File number", "Error Type", "URL"])

def get_diffgram_task_url(job_id, file_id, auth):
    """
    Fetches the task URL for a given job ID and file ID in Diffgram.

    Posts to the job's ``task/list`` endpoint and returns a link for the first
    returned task whose file id equals ``file_id``. Both ids are compared as
    integers, so ``"7"`` and ``7`` match.

    NOTE(review): the request asks for page 0 with ``limit_count`` 32, so
    presumably only the first 32 tasks of a job are searched; confirm against
    the Diffgram API whether larger jobs need paging.

    Parameters:
        job_id (int): The job ID.
        file_id (int): The file ID.
        auth (object): Diffgram session authentication.

    Returns:
        str: The URL to the annotation task, or None if the request did not
        return 200 or no task matched.
    """
    url = f"{DIFFGRAM_CONFIG['host']}/api/v1/job/{job_id}/task/list"
    data = {"page_number": 0, "job_id": str(job_id), "mode_data": "direct_route", "status": "all", "limit_count": 32}

    # requests has no default timeout; bound the call so one stalled lookup
    # cannot block the whole scan.
    response = requests.post(url, json=data, auth=auth, timeout=30)
    if response.status_code != 200:
        return None

    for task in response.json().get("task_list", []):
        # Skip tasks that carry no file reference rather than aborting the lookup.
        task_file_id = (task.get("file") or {}).get("id")
        if task_file_id is None:
            continue
        if int(task_file_id) == int(file_id):
            return f"{DIFFGRAM_CONFIG['host']}/task/{task['id']}?file={file_id}&"
    return None

def validate_annotation(completed_annotations, job_index, job_id, auth, df):
    """
    Validates annotations and logs errors in the DataFrame with clickable links.

    NOTE: this redefines the earlier ``validate_annotation(completed_annotations,
    df, job_index)`` from this notebook with a different signature; cells that
    rely on the earlier version must run before this one.

    For each file key the file is fetched from Diffgram, its token JSON is
    downloaded, and the number of non-newline words is compared with the
    number of annotation instances. Files that are not fully annotated get one
    row in ``df`` with one of these error types:

      * "Incorrect Count (Minor)": absolute difference between word count and
        the file's instance count is below 4.
      * "Failed to Annotate": the export has no instances for the file.
      * "Annotation Incomplete": every other mismatch.

    When no task URL can be resolved, both the table cell and the printed line
    say "URL Not Found". Previously the print raised ``AttributeError`` on the
    missing URL and aborted the scan.

    NOTE(review): the link text replaces "dispatcher" with "localhost" but the
    href keeps the original host; confirm that the href is reachable from the
    browser.

    Parameters:
        completed_annotations (dict): Annotation data.
        job_index (dict): Job metadata; only 'nickname' is read here.
        job_id (int): The Diffgram job ID.
        auth (object): Diffgram session authentication.
        df (DataFrame): DataFrame to store annotation errors, with the columns
            "Task Name", "File number", "Error Type" and "URL".

    Returns:
        None (modifies df in-place)

    Raises:
        TypeError: if the token download fails, because ``extract_word_data``
            then yields ``None`` and it is indexed here.
    """
    metadata_keys = {"attribute_groups_reference", "export_info", "label_map", "readme", "label_colour_map"}

    for file_id in completed_annotations:
        if file_id in metadata_keys:
            continue

        # Retrieve file details
        file = project_local.file.get_by_id(file_id, with_instances=True)
        url = file.__dict__["text"]["tokens_url_signed"]
        data = extract_word_data(url)

        # Newline tokens are layout only and are not counted as words.
        word_count = sum(1 for word in data["nltk"]["words"] if word["value"] != "\n")

        # Instance counts: from the export, and from the file object itself.
        annotated_count = len(completed_annotations[file_id]["instance_list"])
        num_annotated_text_index = len(file.__dict__["instance_list"])

        # Determine error type
        if word_count == annotated_count:
            continue  # Skip valid files
        elif abs(word_count - num_annotated_text_index) < 4:
            error_type = "Incorrect Count (Minor)"
        elif annotated_count == 0:
            error_type = "Failed to Annotate"
        else:
            error_type = "Annotation Incomplete"

        task_url = get_diffgram_task_url(job_id, file_id, auth)  # Fetch clickable URL
        if task_url:
            display_url = task_url.replace("dispatcher", "localhost")
            url_cell = f'<a href="{task_url}" target="_blank">{display_url}</a>'
        else:
            # One fallback for both the table cell and the log line.
            display_url = url_cell = "URL Not Found"

        # Append error details to DataFrame
        df.loc[len(df)] = {
            "Task Name": job_index["nickname"],
            "File number": file_id,
            "Error Type": error_type,
            "URL": url_cell,
        }

        error_symbol = "⚠" if error_type == "Incorrect Count (Minor)" else "❌"
        print(f"{error_symbol} ERROR: {error_type} | File ID {file_id} | Task: {job_index['nickname']} | {display_url}")


  from IPython.core.display import display, HTML


In [None]:
# Start from an empty error table so re-running this cell does not duplicate rows.
error_columns = ["Task Name", "File number", "Error Type", "URL"]
annotation_errors_df = pd.DataFrame(columns=error_columns)

for job_index in jobs_with_data_index:
    job_record = get_job[job_index["index"]]
    job_id = job_record["id"]

    print(f"Processing Job: {job_index['nickname']} (ID: {job_id})")

    # Point the shared job handle at this job and export its annotations.
    results.refresh_from_dict(job_record)
    completed_annotations = results.generate_export()

    # Append one row per problematic file to annotation_errors_df.
    validate_annotation(
        completed_annotations,
        job_index,
        job_id,
        project.session.auth,
        annotation_errors_df,
    )

# Render with escape=False so the anchor tags in the URL column stay clickable.
errors_html = annotation_errors_df.to_html(escape=False)
display(HTML(errors_html))

# Keep a CSV copy for review outside the notebook.
annotation_errors_df.to_csv("annotation_errors.csv", index=False)


In [24]:
# Importing display/HTML from IPython.core.display is deprecated and emits a
# DeprecationWarning; IPython.display is the supported location.
from IPython.display import display, HTML

# Define a custom sorting order for error types
error_priority = {
    "Failed to Annotate": 1,  # Most critical
    "Annotation Incomplete": 2,
    "Incorrect Count (Minor)": 3  # Least critical
}

# Sort by the custom priority. The helper column exists only inside this
# chain, so the frame built by the scanning cell is not mutated in place.
annotation_errors_df = (
    annotation_errors_df
    .assign(Priority=annotation_errors_df["Error Type"].map(error_priority))
    .sort_values(by="Priority", ascending=True)
    .drop(columns=["Priority"])
)

# Display the sorted DataFrame (escape=False keeps the links clickable)
display(HTML(annotation_errors_df.to_html(escape=False)))

  from IPython.core.display import display, HTML


Unnamed: 0,Task Name,File number,Error Type,URL
123,NER_train_batch_75,15162,Annotation Incomplete,http://localhost:8085/task/31348?file=15162&
141,NER_train_batch_42,14111,Annotation Incomplete,http://localhost:8085/task/30297?file=14111&
147,NER_train_batch_28,13669,Annotation Incomplete,http://localhost:8085/task/29855?file=13669&
122,NER_train_batch_83,15434,Annotation Incomplete,http://localhost:8085/task/31620?file=15434&
68,NER_train_batch_153,17653,Annotation Incomplete,http://localhost:8085/task/33839?file=17653&
112,NER_train_batch_97,15889,Annotation Incomplete,http://localhost:8085/task/32075?file=15889&
152,NER_train_batch_22,13480,Annotation Incomplete,http://localhost:8085/task/29666?file=13480&
102,NER_train_batch_107,16203,Annotation Incomplete,http://localhost:8085/task/32389?file=16203&
137,NER_train_batch_51,14410,Annotation Incomplete,http://localhost:8085/task/30596?file=14410&
157,NER_train_batch_16,13298,Annotation Incomplete,http://localhost:8085/task/29484?file=13298&
