In [1]:
import pandas as pd 
import json 
import os
from tqdm import tqdm
import subprocess
import platform
import threading
import jsonschema
from time import time


In [2]:
# Directory of the reference ("correct") JSON instances; files in it are looked up
# by the schema's own file name when AST sizes are computed further down.
CORRECT_INSTANCES_PATH='./dataset/correct-instances'
# makedirs (unlike mkdir) also creates a missing ./dataset parent, and exist_ok
# makes the cell safely re-runnable without a separate existence check.
os.makedirs(CORRECT_INSTANCES_PATH, exist_ok=True)

# 1- Edit distance: general calculation functions

## 1.1 Define directories

In [3]:
# Input locations (relative to the notebook's working directory).
SCHEMAS_PATH = './dataset/schemas'
MIGRATED_SCHEMAS_PATH = './dataset/schemas-migrated'
CORRECT_INSTANCES_BRACKET_PATH = './dataset/bracket/correct-instances'
# External script invoked to compute the tree edit distance between two bracket files.
EXPERIMENT_SCRIPT = './scripts/expr.sh'

# Identifiers of the witness generators; every list below is derived from this one,
# so all of them share the same order and length.
WITNESSES = ['DG','JE','JSF']
WITNESSES_EDIT_DISTANCES_COLUMNS = [generator + '_ted' for generator in WITNESSES]
WITNESSES_DIRS = ['./dataset/' + generator for generator in WITNESSES]
WITNESSES_BRACKET_DIRS = ['./dataset/bracket/' + generator for generator in WITNESSES]


## 1.2 Initialize the dataframe

In [4]:
def initialise_results(columns,file_name):
    """Create (or overwrite) ``./results/<file_name>.csv`` containing only a header row.

    Args:
        columns: column names written as the CSV header.
        file_name: CSV base name, without the ``.csv`` extension.

    Returns:
        The empty DataFrame that was written.
    """
    empty_frame = pd.DataFrame(columns=columns)
    results_dir = './results'
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    csv_path = f'./results/{file_name}.csv'
    empty_frame.to_csv(csv_path,index=False)
    return empty_frame

In [5]:
def save_results(result_file_name,schema_name:str,results:list,columns:list):
    """Append one row (schema name followed by its results) to ``./results/<result_file_name>.csv``.

    No header is written; the file is expected to have been created beforehand with
    the same ``columns``. ``columns`` must have one entry more than ``results``.
    """
    row = [schema_name] + results
    target_csv = f'./results/{result_file_name}.csv'
    row_frame = pd.DataFrame([row],columns=columns)
    row_frame.to_csv(target_csv,mode='a',index=False,header=False)

In [6]:
def witness_file_name(schema_name,is_json_file=False):
    """Return the witness file name for a schema: ``<schema_name>_witness.json`` or ``.bracket``."""
    if is_json_file:
        extension = 'json'
    else:
        extension = 'bracket'
    return schema_name + '_witness.' + extension

## 1.3 Calculating edit distance

In [6]:
def calculate_edit_distance(schema_name:str,correct_instance_schema_bracket_path:str,w_bracket_dir)->int | None: 
    """Compute the tree edit distance between a correct instance and a generator's witness.

    Runs ``EXPERIMENT_SCRIPT`` (through ``wsl`` on Windows) on the two bracket files
    and parses the number that follows the first ``:`` in the script's stdout.

    Args:
        schema_name: schema base name, used to derive the witness file name.
        correct_instance_schema_bracket_path: bracket file of the correct instance.
        w_bracket_dir: directory holding the generator's witness bracket files.

    Returns:
        The distance as an int, or None when the witness bracket file does not exist
        or when every attempt failed (non-zero exit code, timeout, or output that
        cannot be parsed as a number).
    """
    witness_bracket_path = f'{w_bracket_dir}/{witness_file_name(schema_name)}'
    if not os.path.exists(witness_bracket_path):
        return None

    command = [EXPERIMENT_SCRIPT,correct_instance_schema_bracket_path,witness_bracket_path]
    if platform.system() == 'Windows':
        # The experiment script is a shell script; on Windows it is run through WSL.
        command.insert(0,'wsl')

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            process = subprocess.run(command, capture_output=True, text=True, timeout=30)  # Increase or decrease timeout as needed
        except subprocess.TimeoutExpired:
            print(f"Subprocess timed out on attempt {attempt}. Retrying...")
            continue
        if process.returncode != 0:
            continue
        output_parts = process.stdout.strip().split(':')
        if len(output_parts) < 2:
            # No ':' separator in the output: not the expected "<label>:<number>" shape.
            continue
        try:
            return int(float(output_parts[1]))
        except (ValueError, OverflowError):
            # Text after ':' is not a finite number. Treat it like any other
            # malformed output instead of letting the exception abort the whole run.
            continue

    return None
  
# print(calculate_edit_distance('o9915','./dataset/bracket/correct-instances/o9915.bracket','./dataset/bracket/JE/'))

# 2. Number of Errors and TED 

In [42]:


# One "number of validation errors" column per witness generator.
witnesses_number_errors_columns = [generator + '_number_errors' for generator in WITNESSES]

# AST size columns: the correct instance first, then one column per generator.
# ast_sizes_dirs lists the matching directories in the same order.
witnesses_ast_sizes_columns = [generator + '_ast_size' for generator in WITNESSES]
ast_sizes_columns = ['correct_instance_ast_size', *witnesses_ast_sizes_columns]
ast_sizes_dirs = [CORRECT_INSTANCES_PATH, *WITNESSES_DIRS]

# Full header of the "TED + number of errors" result file.
columns_ted_number_errors = (
    ['schema']
    + WITNESSES_EDIT_DISTANCES_COLUMNS
    + witnesses_number_errors_columns
    + ast_sizes_columns
)

## 2.1 Calculating the number of errors 

In [27]:
def calcualte_number_errors(schema_name,witness_path):
    """Count the Draft 7 validation errors of a witness against its schema.

    Args:
        schema_name: base name of the schema; ``<SCHEMAS_PATH>/<schema_name>.json`` is loaded.
        witness_path: path of the witness JSON file to validate.

    Returns:
        The number of errors reported by ``Draft7Validator.iter_errors``; ``None`` when
        the witness file is missing, cannot be read/parsed, or validation itself fails;
        ``0`` for the specific "nothing to repeat at position 1" failure (kept from the
        original code — presumably a regex in the schema that Python cannot compile;
        NOTE(review): confirm that counting this case as zero errors is intended).

    Raises:
        OSError / ValueError: if the schema file itself cannot be opened or parsed
        (only attempted once the witness file is known to exist).
    """
    # Cheap guard first: no witness means nothing to validate, so skip parsing the schema.
    if not os.path.exists(witness_path):
        return None
    with open(f'{SCHEMAS_PATH}/{schema_name}.json','r',encoding='utf-8') as f:
        schema = json.load(f)
    try:
        with open(witness_path,'r',encoding='utf-8') as f:
            data = json.load(f)
    except Exception:
        # Best-effort: an unreadable witness is reported as "no result". Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt a long run.
        return None
    try:
        validator = jsonschema.Draft7Validator(schema)
        # iter_errors is lazy; exhaust it and count without building a list.
        return sum(1 for _ in validator.iter_errors(data))
    except Exception as e:
        if "error: nothing to repeat at position 1" in str(e):
            return 0
        return None
        
    
# calcualte_number_errors('o33194',f'./dataset/JE/{witness_file_name("o33194",True)}')

## 2.2 Calculate the number of nodes (AST size) of a JSON document

In [28]:

def get_size_json(doc):
    """Return the number of nodes in the tree representation of a parsed JSON value.

    Counting rules:
      * an object contributes 1 node, plus 1 node per key, plus the size of each value;
      * an array contributes 1 node, plus 1 order node per element, plus the size of each element;
      * anything else (string, number, boolean, null) is a single node.
    """
    if isinstance(doc, dict):
        children = doc.values()
    elif isinstance(doc, list):
        children = doc
    else:
        # Scalar value: exactly one node.
        return 1
    # Container node + one key/order node per child + the children's own subtrees.
    return 1 + len(doc) + sum(map(get_size_json, children))
def calculate_ast_size(json_file_path:str):
    """Return the tree size (see ``get_size_json``) of the JSON file at ``json_file_path``.

    Returns ``None`` when the file is missing, unreadable, not valid JSON, or when
    the size computation itself fails.
    """
    try:
        # Explicit UTF-8, as for every other JSON file read in this notebook, so the
        # result does not depend on the platform's default encoding.
        with open(json_file_path,'r',encoding='utf-8') as json_file:
            return get_size_json(json.load(json_file))
    except Exception:
        # Best-effort by design; Exception (not a bare except) keeps Ctrl-C working.
        return None

In [19]:
def process_schema_edit_distance_number_errors_with_generators(result_file_name:str,schema:str,columns):
    """Compute all per-schema metrics and append them as one row of the result CSV.

    The row holds, in this order: one tree edit distance per generator, one
    validation-error count per generator, then the AST sizes of the correct instance
    and of each generator's witness. Nothing is saved when no edit distance could be
    computed for any generator.

    Args:
        result_file_name: base name of the CSV in ``./results`` to append to.
        schema: schema file name (e.g. as returned by ``os.listdir``); the part before
            the first ``.`` is used as the schema name.
        columns: header of the result file (schema column included).
    """
    schema_name = schema.split('.')[0]
    correct_instance_schema_bracket_path = f'{CORRECT_INSTANCES_BRACKET_PATH}/{schema_name}.bracket'
    print(f"Processing {schema_name}")

    # 1) Tree edit distance against each generator's witness.
    print("Calculating edit distance : ")
    results = [
        calculate_edit_distance(schema_name,correct_instance_schema_bracket_path,bracket_dir)
        for bracket_dir in WITNESSES_BRACKET_DIRS
    ]
    if not any(distance is not None for distance in results):
        # No generator produced a usable distance: skip this schema entirely.
        return

    # 2) Number of validation errors of each witness.
    print("Calculating errors")
    witness_json_name = witness_file_name(schema_name,True)
    for witness_dir in WITNESSES_DIRS:
        results.append(calcualte_number_errors(schema_name,f'{witness_dir}/{witness_json_name}'))

    # 3) AST sizes: the first directory holds the correct instance (stored under the
    #    schema's own file name), the remaining ones hold the witnesses.
    print("Caluculating number of ast nodes")
    for position,base_dir in enumerate(ast_sizes_dirs):
        instance_file = witness_json_name if position else schema
        results.append(calculate_ast_size(f'{base_dir}/{instance_file}'))

    print(f"Processed {schema_name}")
    save_results(result_file_name,schema_name,results,columns)
    

In [None]:
# Reset the result file to a header-only CSV, then append one row per schema file.
result_file = "ted_number_errors"
df_ted_number_errors = initialise_results(columns_ted_number_errors, result_file)
schema_files = os.listdir(SCHEMAS_PATH)
for schema in tqdm(schema_files):
    process_schema_edit_distance_number_errors_with_generators(
        result_file, schema, columns_ted_number_errors
    )
    


In [338]:
# Sanity check of the parsing used in calculate_edit_distance: the text after the
# first ':' of the script output is the distance.
sample_output = "Distance TED:128"
sample_output.split(':')[1]

'128'

# 3- Type of errors vs TED

In [7]:
from jschon import create_catalog,JSON, JSONSchema
import json,os

In [19]:
# One "type of errors" column per witness generator, preceded by the schema name.
witnesses_type_of_errors_columns = [generator + '_type_of_errors' for generator in WITNESSES]
columns_ted_type_of_errors = ['schema', *witnesses_type_of_errors_columns]

In [23]:
# Initialise the jschon catalog for JSON Schema draft 2019-09 before any JSONSchema
# object is built below (presumably required for the schemas' metaschema to be
# resolvable — NOTE(review): confirm against the jschon documentation).
create_catalog('2019-09')

    
    # print("there's still error")
   
# Keywords that extract_errors_map records even when their error node carries nested
# sub-errors (nodes with sub-errors whose keyword is not listed here are only
# traversed). 'patternProperties' was previously misspelled and could never match.
keywords_to_avoid = [
    'not','anyOf','oneOf','allOf','properties','patternProperties','items',
    'additionalProperties','contains','unevaluatedProperties','prefixItems', 'unevaluatedItems'
]
def extract_errors_map(error:dict, error_map:dict):
    """Recursively collect error messages into ``error_map``, grouped by failing keyword.

    The keyword is the last segment of the node's ``keywordLocation``. A leaf node
    (no ``'errors'`` entry) is always recorded. A node with nested ``'errors'`` is
    recorded only when its keyword is in ``keywords_to_avoid``; its sub-errors are
    visited in either case. The recorded message is the node's ``'error'`` text, or
    ``''`` when it has none. ``error_map`` is mutated in place.
    """
    keyword = str(error['keywordLocation']).split('/')[-1]
    message = error['error'] if 'error' in error else ''
    has_children = 'errors' in error

    if not has_children or keyword in keywords_to_avoid:
        if error_map.get(keyword) is None:
            error_map[keyword] = []
        error_map[keyword].append(message)

    if has_children:
        # The nested errors are the actual origin of this failure: descend into them.
        for child in error['errors']:
            extract_errors_map(child, error_map)
    
        
def extract_errors(errors): 
    """Return the distinct keywords found in a list of error nodes, in first-seen order."""
    collected = {}
    for node in errors:
        extract_errors_map(node, collected)
    return [keyword for keyword in collected]
def validate_instance(schema_dir:str,instance_dir): 
    """Evaluate a JSON instance file against a JSON schema file with jschon.

    Despite their names, both arguments are file paths: ``schema_dir`` is the schema
    JSON file and ``instance_dir`` the instance JSON file.

    Returns:
        The object returned by ``JSONSchema.evaluate``.
    """
    with open(schema_dir,'r',encoding='utf-8') as schema_file:
        schema_document = json.load(schema_file)
    compiled_schema = JSONSchema(schema_document)

    with open(instance_dir,'r',encoding='utf-8') as instance_file:
        instance_document = json.load(instance_file)

    return compiled_schema.evaluate(JSON(instance_document))
def get_errors_validation(schema_dir:str,instance_dir:str): 
    """Return the keywords that failed when validating an instance file against a schema file.

    Args:
        schema_dir: path of the schema JSON file.
        instance_dir: path of the instance JSON file.

    Returns:
        The list produced by ``extract_errors`` from the ``'errors'`` entry of the
        ``'detailed'`` validation output; ``[]`` when that entry is absent or when
        loading/validation fails for any reason (best-effort, as before).
    """
    try:
        # Build the detailed output once; the original recomputed it up to three times.
        detailed_output = validate_instance(schema_dir,instance_dir).output('detailed')
        # A missing 'errors' entry yields [] explicitly rather than via a swallowed KeyError.
        return extract_errors(detailed_output.get('errors', []))
    except Exception:
        # Exception (not a bare except) so Ctrl-C can still stop a long run.
        return []
    

In [21]:
def process_schema_edit_distance_with_generators(result_file_name:str,schema:str,columns):
    """Collect, per generator, the failing keywords of its witness and append them as one CSV row.

    Each witness is validated against the migrated schema
    (``<MIGRATED_SCHEMAS_PATH>/<schema>``); a generator without a witness file gets
    an empty list.

    Args:
        result_file_name: base name of the CSV in ``./results`` to append to.
        schema: schema file name; the part before the first ``.`` is the schema name.
        columns: header of the result file (schema column included).
    """
    schema_name = schema.split('.')[0]
    migrated_schema_path = f'{MIGRATED_SCHEMAS_PATH}/{schema}'
    witness_json_name = witness_file_name(schema_name,is_json_file=True)

    print(f"Processing {schema_name}")
    print("Calculating errors")
    results = []
    for generator,witness_dir in zip(WITNESSES,WITNESSES_DIRS):
        print(f"\t >{generator}")
        witness_path = f'{witness_dir}/{witness_json_name}'
        if os.path.isfile(witness_path):
            failing_keywords = get_errors_validation(migrated_schema_path,witness_path)
        else:
            failing_keywords = []
        results.append(failing_keywords)

    print(f"Processed {schema_name}")
    save_results(result_file_name,schema_name,results,columns)
    

In [None]:
# Reset the result file to a header-only CSV, then append one row per migrated schema.
result_file_name = "ted_type_of_errors"
df_ted_type_of_errors = initialise_results(columns_ted_type_of_errors, result_file_name)
migrated_schema_files = os.listdir(MIGRATED_SCHEMAS_PATH)
for schema in tqdm(migrated_schema_files):
    process_schema_edit_distance_with_generators(
        result_file_name, schema, columns_ted_type_of_errors
    )
    
