## Step 6: WhyLogs Validation

This notebook performs validation of the **metadata annotations** using WhyLogs. It helps ensure that fields like `Act ID`, `Section ID`, and other metadata labels are applied consistently and correctly.

In addition, it attempts to validate whether the word **"section"** is annotated correctly. However, this check is inherently brittle and may produce false positives, since the word "section" can appear in both referential and non-referential contexts. Despite this limitation, tracking occurrences of "section" still provides useful insights into how it's being labeled across the dataset.

In [None]:
!pip install boto3
!pip install torch transformers diffgram neo4j anthropic pandas tqdm
!pip install llama_index
!pip install matplotlib
!pip install tabulate
!pip install 'whylogs[viz]'
!pip install pyarrow

In [27]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from diffgram import Project
from typing import List, Dict, Optional
import anthropic
import json
from neo4j import GraphDatabase
from tqdm import tqdm
import logging
import os
import sys
import boto3
import requests
import pprint
import json
from diffgram import Project
from llama_index.core import SimpleDirectoryReader, StorageContext
import pandas as pd
from IPython.display import display, HTML
import matplotlib as mpl
from tabulate import tabulate
import math

In [28]:
# Use os.getcwd() since __file__ is not available in interactive environments
current_dir = os.getcwd()

# If your structure is such that the package is in the parent directory, compute the parent directory:
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))

# Add the parent directory to sys.path if it's not already there
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

In [29]:
import diffgram_utils as du

## Connect  to Diffgram

In [30]:
# Configuration
DIFFGRAM_CONFIG = {
    "host": "http://dispatcher:8085",
    "project_string_id": "translucenttracker",
    "client_id": "LIVE__u3v8q0m7tx1p851dp0ap",
    "client_secret": "1qgd8as7xfcbuem6mw9j1z0xvjfmmvlagbugqr8z1g1ntypugr2ul24cce5k"
}

In [31]:
project = Project(host=DIFFGRAM_CONFIG["host"],
        project_string_id = "translucenttracker",
        client_id = "LIVE__u3v8q0m7tx1p851dp0ap",
        client_secret = "1qgd8as7xfcbuem6mw9j1z0xvjfmmvlagbugqr8z1g1ntypugr2ul24cce5k"
      )
project_local = project
auth = project.session.auth

In [32]:
# Define constants
BATCH_SIZE = 32
MAX_LENGTH = 256
NUM_TRAIN_SAMPLES = 5440  # Number of samples to use for training
NUM_TRAINING_DATA = 5440
train_dataset_suffix = "NER_train_batch_"
test_dataset_suffix = "NER_test_batch_"
JOB_NAME = "Law_NER_task1"
JOB_TRAIN_SUFFIX = "NER_train_JOB_"
JOB_TEST_SUFFIX = "NER_test_JOB_"
MAX_NUM_OF_TASK = 250
NER_schema_name = 'ENTITY_TRAINING_SCHEMA'

## Get  schema ID

In [33]:
schema_id = du.find_schema(NER_schema_name, project)

Existing Schemas in Diffgram:
[
  {
    "archived": false,
    "id": 8,
    "is_default": true,
    "member_created_id": 1,
    "member_updated_id": null,
    "name": "Default Schema",
    "project_id": 4,
    "time_created": "2025-02-04 22:16:17",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 9,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "NER_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:08:24",
    "time_updated": null
  },
  {
    "archived": false,
    "id": 11,
    "is_default": false,
    "member_created_id": 10,
    "member_updated_id": null,
    "name": "ENTITY_TRAINING_SCHEMA",
    "project_id": 4,
    "time_created": "2025-02-05 17:20:02",
    "time_updated": null
  }
]
Schema 'ENTITY_TRAINING_SCHEMA' already exists with id: 11


## Get diffgram tasks

In [34]:
results = project.job
get_job = project.job.list(limit=10000, page_number=1)

In [35]:
jobs_with_data_index = du.get_all_tasks(project, get_job)

In [36]:
#jobs_with_data_index

## NER Schema labesl

In [37]:
schema_list = du.get_schema_list(schema_id, project, DIFFGRAM_CONFIG)
ner_schema_list = []
for labels in schema_list['labels_out']:
    #print(labels['label']['name'])
    ner_schema_list.append(labels['label']['name'])

## Populate df of all the data for whylogs to process

In [41]:
def extract_and_arrange(completed_annotations, pd_idx, pd_word, pd_schema, pd_job_id, dataset_id, pd_job_name, pd_dataset_url, job_index):
    for completed_annotation in completed_annotations:
        #print(f"{completed_annotation} ----")
        if completed_annotation in ["attribute_groups_reference", "export_info", "label_map", "readme", "label_colour_map"]:
            continue  # Skip metadata entries
        sentence_local = []
        labels_local = []
        task_url = None

        job_id = get_job[job_index["index"]]["id"]
        if job_id and auth:
            task_url = du.get_diffgram_task_url(job_id, completed_annotation, auth, DIFFGRAM_CONFIG)
            task_url = task_url.replace("dispatcher", "localhost") if task_url else "URL Not Found"

            
        #print(completed_annotations[completed_annotation]['text']['tokens']['words'])
        for idx, words in enumerate(completed_annotations[completed_annotation]['text']['tokens']['words']):
            #print(idx)
            #print(words)
            #print(f"Idx: {idx} Word: {words['value']}")
            ## find the token value
            for start in completed_annotations[completed_annotation]['instance_list']:
                if 'start_token' in start:
                    if idx == start['start_token']:
                        ## Find the schmea value
                        schema_id = completed_annotations['label_map'][str(start['label_file_id'])]
                        #print(f"Idx: {idx} {words['value']} {start['start_token']} schmea: {schema_id}")
                        pd_idx.append(idx)
                        pd_word.append(words['value'])
                        pd_schema.append(schema_id)
                        pd_job_id.append(job_index['index'])  
                        pd_job_name.append(job_index['nickname'])
                        pd_dataset_id.append(completed_annotation)
                        pd_dataset_url.append(f'<a href="{task_url}" target="_blank">View</a>')
                                # Get task URL if parameters are provided

In [42]:
import os
import pandas as pd

# Define the Parquet file path
parquet_file = "annotation.parquet"

# Check if the Parquet file already exists
if os.path.exists(parquet_file):
    print(f"Loading existing annotations from {parquet_file}")
    df = pd.read_parquet(parquet_file)
else:
    print("Parquet file not found. Processing annotations from Diffgram...")
    # Initialize your lists
    files_index_in_job_total = []
    pd_idx = []
    pd_word = []
    pd_schema = []
    pd_job_id = []
    pd_job_name = []
    pd_dataset_url = []
    pd_dataset_id = []
    
    # Process the data
    start_index = 0
    last_index = 255  # len(jobs_with_data_index)
    for job_index in jobs_with_data_index[start_index:last_index]:
        print(f"The job nickname is {job_index['nickname']} and the index is {job_index['index']}")
        results.refresh_from_dict(get_job[job_index['index']])
        completed_annotations = results.generate_export()
        extract_and_arrange(completed_annotations, pd_idx, pd_word, pd_schema, pd_job_id, 
                           pd_dataset_id, pd_job_name, pd_dataset_url, job_index)
    
    # Create DataFrame
    df = pd.DataFrame({
        'dataset_id': pd_dataset_id,
        'token index': pd_idx,
        'word': pd_word,
        'schema': pd_schema,
        'job_id': pd_job_id,
        'job_name': pd_job_name,
        'task_url': pd_dataset_url
    })
    
    # Save to Parquet
    print(f"Saving annotations to {parquet_file}")
    df.to_parquet(parquet_file, engine="pyarrow", index=False)

# Now df is available for further processing in either case
print(f"Loaded DataFrame with {len(df)} rows and {df['dataset_id'].nunique()} unique datasets")

Loading existing annotations from annotation.parquet
Loaded DataFrame with 1195135 rows and 8000 unique datasets


In [61]:
df.head(5)

Unnamed: 0,dataset_id,token index,word,schema,job_id,job_name,task_url
0,20723,0,Chunk,B-METADATA_FIELD,0,NER_train_batch_249,"<a href=""http://localhost:8085/task/36909?file..."
1,20723,1,ID,I-METADATA_FIELD,0,NER_train_batch_249,"<a href=""http://localhost:8085/task/36909?file..."
2,20723,2,:,O,0,NER_train_batch_249,"<a href=""http://localhost:8085/task/36909?file..."
3,20723,3,Homeowner,B-CHUNK_ID,0,NER_train_batch_249,"<a href=""http://localhost:8085/task/36909?file..."
4,20723,4,Protection,I-CHUNK_ID,0,NER_train_batch_249,"<a href=""http://localhost:8085/task/36909?file..."


In [43]:
#display(HTML(df.to_html(escape=False)))

## Whylogs Constraints

In [44]:
check_b_metadata_field_count = lambda x: {
    item.value: item.est for item in x.to_summary_dict()['frequent_strings']
}.get('B-METADATA_FIELD', 0) == 7

In [45]:
check_b_metadata_value_count = lambda x: {
    item.value: item.est for item in x.to_summary_dict()['frequent_strings']
}.get('B-METADATA_VALUE', 0) == 6

In [46]:
def check_metadata_annotation_overall(metric):
    """Check if B-METADATA_VALUE count is 6 and the rest of the medata data is 0"""
    item_counts = {
        item.value: item.est for item in metric.to_summary_dict()['frequent_strings']
    }
    meta_field = item_counts.get('B-METADATA_FIELD', 0)
    meta_value = item_counts.get('B-METADATA_VALUE', 0)
    if ((meta_value ==  6) and (meta_field ==7)):
        return True
    else:
        return False

In [47]:
def check_section_annotation(profile_view, unique_df, dataset_id):
    #print(f"print the main {dataset_id}")
    try:
        dataset_slice = df[df['dataset_id'] == dataset_id]
        constraint = False
        for word in dataset_slice:
            if word['word'] == 'section':
                if schema['schema'] == 'B-SECTION-REF' or schema['schema'] == 'I-SECTION-REF':
                    constraint = True
                    continue
                else:
                    constraint = False
        return cosntraint
    except:
        return False

In [48]:
def check_section(metric):
    # your logic here
    item_counts = {
        item.value: item.est for item in metric.to_summary_dict()['frequent_strings']
    }
    constraint = True
    for dataset_id in item_counts:
        dataset_slice = df[df['dataset_id'] == dataset_id]
        for idx, word in enumerate(dataset_slice['word']):
            if word in ['section', 'sections']:
                label = dataset_slice['schema'].iloc[idx]
                #print(f" {word} and label is {label}")
                if label not in ['B-SECTION_REF', 'I-SECTION_REF']:
                    constraint = False
                    break
    #condition_result = True  # or False, based on your check
    #metric.__dict__
    # optionally add details to the second item
    return constraint


## Profile dataset using whylogs

In [49]:
import os
import pandas as pd
import whylogs as why
from whylogs.core import DatasetProfileView

# Define all the file paths
unique_file = "annotation_unique.parquet"
profile_summaries_file = "whylogs_profile_summaries.parquet"
schema_counts_file = "schema_counts.parquet"

# Create profiles only if the profile summaries file doesn't exist
if not os.path.exists(profile_summaries_file):
    print(f"WhyLogs profile summaries not found. Generating profiles...")
    
    # Profile the entire dataset
    profile = why.log(df)
    
    # Profile by dataset_id
    profiles_by_dataset = {}
    unique_df = df['dataset_id'].unique()
    for dataset_id in unique_df:
        dataset_slice = df[df['dataset_id'] == dataset_id]
        profiles_by_dataset[dataset_id] = why.log(dataset_slice)
    
    # Create profile summaries
    profile_summaries = []
    for dataset_id, profile_obj in profiles_by_dataset.items():
        # Get the profile view
        view = profile_obj.view()
        
        # Extract schema frequency data
        schema_data = view.get_column("schema").get_metric("frequent_items").to_summary_dict()
        schema_counts = {item.value: item.est for item in schema_data.get('frequent_strings', [])}
        
        # Extract word frequency data
        try:
            word_data = view.get_column("word").get_metric("frequent_items").to_summary_dict()
            word_counts = {item.value: item.est for item in word_data.get('frequent_strings', [])}
        except:
            word_counts = {}
        
        # Create summary dictionary
        summary = {
            "dataset_id": dataset_id,
            "timestamp": pd.Timestamp.now(),
            "schema_counts": str(schema_counts),
            "word_counts": str(word_counts),
            "num_rows": view.get_column("schema").get_metric("counts").to_summary_dict().get('n', 0)
        }
        
        profile_summaries.append(summary)
    
    # Save profile summaries
    profile_summary_df = pd.DataFrame(profile_summaries)
    print(f"Saving profile summaries to {profile_summaries_file}")
    profile_summary_df.to_parquet(profile_summaries_file, engine="pyarrow", index=False)
else:
    print(f"Loading existing profile summaries from {profile_summaries_file}")
    profile_summary_df = pd.read_parquet(profile_summaries_file)

# Save unique dataset IDs if file doesn't exist
if not os.path.exists(unique_file):
    print(f"Saving unique dataset IDs to {unique_file}")
    unique_df = pd.DataFrame({'dataset_id': df['dataset_id'].unique()})
    unique_df.to_parquet(unique_file, engine="pyarrow", index=False)
else:
    print(f"Unique dataset IDs file already exists at {unique_file}")
    unique_df = pd.read_parquet(unique_file)

# Calculate and save schema counts if file doesn't exist
if not os.path.exists(schema_counts_file):
    print(f"Calculating schema counts and saving to {schema_counts_file}")
    schema_counts = df.groupby(['dataset_id', 'schema']).size().reset_index(name='count')
    schema_counts.to_parquet(schema_counts_file, engine="pyarrow", index=False)
else:
    print(f"Loading existing schema counts from {schema_counts_file}")
    schema_counts = pd.read_parquet(schema_counts_file)

print("All WhyLogs processing and file saving completed.")

Loading existing profile summaries from whylogs_profile_summaries.parquet
Unique dataset IDs file already exists at annotation_unique.parquet
Loading existing schema counts from schema_counts.parquet
All WhyLogs processing and file saving completed.


## Process all the whylog profiles and check all the constraints

In [60]:
import whylogs as why
from whylogs.core import DatasetProfileView

# Profile the entire dataset
profile = why.log(df)

# You can also profile by dataset_id and job_id
profiles_by_dataset = {}
for dataset_id in unique_df['dataset_id']:
    dataset_slice = df[df['dataset_id'] == dataset_id]
    profiles_by_dataset[dataset_id] = why.log(dataset_slice)

# Get schema counts
schema_counts = df.groupby(['dataset_id', 'schema']).size().reset_index(name='count')

In [62]:
len(unique_df['dataset_id'])

8000

In [63]:
import pandas as pd
import whylogs as why
from whylogs.core.constraints import ConstraintsBuilder
from whylogs.core.constraints import Constraints, ConstraintsBuilder, MetricsSelector, MetricConstraint, DatasetConstraint
from whylogs.viz import NotebookProfileVisualizer
#from whylogs.core.metrics.metrics_selector import MetricsSelector
#from IPython.display import display, HTML

schema_issues = []
#schema_types = ner_schema_list
# Define all schema types to track
schema_types = [
    'B-METADATA_FIELD', 
    'B-METADATA_VALUE',
    'B-CHUNK_ID',
    'B-ACT_ID',
    'B-SECTION_NAME',
    'B-REGULATION_ID',
    'B-SECTION_ID',
    'B-SEQUENCE_ID'
]

for profile in profiles_by_dataset:
    dataset_id = profile
    #print(profile)
    #print(profiles_by_dataset[profile])
    whylog_data_profile = profiles_by_dataset[profile]
    profile_view = whylog_data_profile.view()
    builder = ConstraintsBuilder(dataset_profile_view=profile_view)

    ## Extract URL
    task_url_summary = profile_view.get_column('task_url').to_summary_dict()
    frequent_strings = task_url_summary.get('frequent_items/frequent_strings', [])
    if frequent_strings and len(frequent_strings) > 0:
        url_item = frequent_strings[0]
        url_value = url_item.value

    ## Extract Job Name
    job_name = profile_view.get_column('job_name').to_summary_dict()
    frequent_strings = job_name.get('frequent_items/frequent_strings', [])
    if frequent_strings and len(frequent_strings) > 0:
        job_item = frequent_strings[0]
        task_name = job_item.value

    ## Extract dataset id
    #dataset_id = profile_view.get_column('dataset_id').to_summary_dict()
    #frequent_strings = dataset_id.get('frequent_items/frequent_strings', [])
    #if frequent_strings and len(frequent_strings) > 0:
    #    dataset_item = frequent_strings[0]
    #    file_id = dataset_item.value
    
    # ✅ Add constraint for metadata fields
    builder.add_constraint(
        MetricConstraint(
            name="B-METADATA_FIELD Count",
            condition=check_b_metadata_field_count,
            metric_selector=MetricsSelector(column_name="schema", metric_name="frequent_items"),
        )
    )
    meta_field_constraint = builder.build().validate(profile_view)

    builder.add_constraint(
        MetricConstraint(
            name="B-METADATA_VALUE Count",
            condition=check_b_metadata_value_count,
            metric_selector=MetricsSelector(column_name="schema", metric_name="frequent_items"),
        )
    )
    meta_value_constraint = builder.build().validate(profile_view)

    builder.add_constraint(
        MetricConstraint(
            name="METADATA OVERALL Count",
            condition=check_metadata_annotation_overall,
            metric_selector=MetricsSelector(column_name="schema", metric_name="frequent_items"),
        )
    )

    meta_overall_constraint = builder.build().validate(profile_view)

    
    freq_values_values = profile_view.get_column("word").get_metric("frequent_items").to_summary_dict()
    word_counts = {item.value: item.est for item in freq_values_values['frequent_strings']}

    section_variants = {"section", "Section", "sections", "Sections"}
    section_found = {word: count for word, count in word_counts.items() if word in section_variants}

    section_constraint = None
    if section_found['Section'] > 2 or len(section_found) > 1:
        #print("Section-like words found:", section_found)
            #Check if sections are annotated right
        condition = check_section_annotation(word_counts, unique_df, dataset_id)
        #print(condition)
        builder.add_constraint(MetricConstraint(
            name="section_annotation_check",
            condition=check_section,
            metric_selector=MetricsSelector(column_name="dataset_id", metric_name="frequent_items"),
        ))
        section_constraint = builder.build().validate(profile_view)
    #else:
    #    print("No 'section' variants found.")

    constraints = builder.build()

    # Create a dictionary for the current file's results
    # Store summary row, profile_view, and visualization function
    file_result = {
        "Task #": task_name,
        "File #": dataset_id, #file_id,
        "URL": f'{url_value}',
        "Metadata Pass": f"✔ ({meta_overall_constraint})" if meta_overall_constraint else f"❌ ({meta_overall_constraint})",
        #"profile_view": profile_view,
        #"constraints": constraints,
    }
    if section_constraint is not None:
        file_result['SECTION-CONSTRAINT'] = f"✔ ({section_constraint})" if section_constraint else f"❌ ({section_constraint})"
    else:
        file_result['SECTION-CONSTRAINT'] = 'N/A'

    schema_freq_values = profile_view.get_column("schema").get_metric("frequent_items").to_summary_dict()
    schema_counts = {item.value: item.est for item in schema_freq_values['frequent_strings']}
    
    # Count occurrences of each schema type
    for schema_type in schema_types:
        if schema_type in schema_counts: 
            count = schema_counts[schema_type]
        else:
            count = 0
        # For now, set a minimum threshold of 1 for all types except the original two
        min_threshold = 5 if schema_type in ['B-METADATA_FIELD', 'B-METADATA_VALUE'] else 1
        status = f"✔ ({count})" if count >= min_threshold else f"❌ ({count})"
        file_result[f"Schema {schema_type}"] = status

    file_result["render_visualization"] =  lambda pv=profile_view, cs=constraints: display_visualization(pv, cs)

    # ✅ Store results
    schema_issues.append(file_result)
    #break

In [64]:
len(schema_issues)

8000

In [65]:
# ✅ Convert to DataFrame for Display
schema_issues_df = pd.DataFrame(schema_issues)

# ✅ Display the validation summary table
display(HTML(schema_issues_df[0:10].to_html(escape=False)))

Unnamed: 0,Task #,File #,URL,Metadata Pass,SECTION-CONSTRAINT,Schema B-METADATA_FIELD,Schema B-METADATA_VALUE,Schema B-CHUNK_ID,Schema B-ACT_ID,Schema B-SECTION_NAME,Schema B-REGULATION_ID,Schema B-SECTION_ID,Schema B-SEQUENCE_ID,render_visualization
0,NER_train_batch_249,20723,View,❌ (False),,✔ (7),❌ (0),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),at 0x7f555893a980>
1,NER_train_batch_249,20724,View,✔ (True),,✔ (7),✔ (6),❌ (0),❌ (0),❌ (0),❌ (0),✔ (1),❌ (0),at 0x7f555899bf60>
2,NER_train_batch_249,20725,View,❌ (False),,✔ (7),❌ (0),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),at 0x7f555899afc0>
3,NER_train_batch_249,20726,View,❌ (False),,✔ (7),❌ (0),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),at 0x7f555894d9e0>
4,NER_train_batch_249,20727,View,❌ (False),,✔ (7),❌ (0),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),at 0x7f555882d1c0>
5,NER_train_batch_249,20728,View,❌ (False),❌ (False),✔ (7),❌ (0),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),✔ (1),at 0x7f55589d3100>
6,NER_train_batch_249,20729,View,❌ (False),,✔ (7),❌ (2),❌ (0),✔ (1),✔ (1),❌ (0),✔ (1),✔ (1),at 0x7f555887c4a0>
7,NER_train_batch_249,20730,View,✔ (True),,✔ (7),✔ (6),❌ (0),❌ (0),❌ (0),❌ (0),✔ (1),❌ (0),at 0x7f555885fb00>
8,NER_train_batch_249,20731,View,❌ (False),❌ (False),✔ (7),❌ (1),✔ (1),✔ (1),✔ (1),❌ (0),✔ (1),✔ (1),at 0x7f55588c2ca0>
9,NER_train_batch_249,20732,View,❌ (False),,✔ (7),❌ (2),❌ (0),✔ (1),✔ (1),❌ (0),✔ (1),✔ (1),at 0x7f55589d1b20>


In [66]:
#visualization.difference_distribution_chart(feature_name=["schema"]) ## Because there is no difference between the ref and target

In [67]:
import ipywidgets as widgets
from IPython.display import display, clear_output

dropdown = widgets.Dropdown(
    options=[(f"{entry['Task #']} - {entry['File #']}", i) for i, entry in enumerate(schema_issues)],
    description='Select File:',
    layout={'width': '60%'}
)

output = widgets.Output()

def on_change(change):
    output.clear_output()
    with output:
        selected_index = change["new"]
        if selected_index is not None:
            schema_issues[selected_index]["render_visualization"]()

dropdown.observe(on_change, names="value")

display(dropdown, output)

Dropdown(description='Select File:', layout=Layout(width='60%'), options=(('NER_train_batch_249 - 20723', 0), …

Output()

## Export Visualization

In [68]:
import os
os.getcwd()
visualization = NotebookProfileVisualizer()
visualization.set_profiles(target_profile_view=profile_view, reference_profile_view=profile_view)
visualization.write(
    visualization.difference_distribution_chart(feature_name="schema"),
    html_file_name=os.getcwd() + "/example",
)

## Collect failed valdiation for pre-annotation

Send passed validation for training