In [1]:
# Import necessary libraries
import os
import json

# requires jsonschema>=4.23.0
from jsonschema import validate, ValidationError

# Use the current working directory as the base directory for all paths
base_directory = os.getcwd()

# Folder holding the JSONL data files to validate
data_directory = os.path.join(base_directory, "data")
# Folder holding the JSON validation schemas
validation_directory = os.path.join(base_directory, "schema")


### Helper Functions

In [2]:
def get_ft_validation_schema(ft_type, schema_directory):
    """
    Function to load the JSON validation schema for the fine-tuning data

    Args:
        - ft_type (str): The fine-tuning type - "supervised" or "dpo"
        - schema_directory (str): The directory where the JSON validation
          schema files are stored

    Returns:
        - schema (dict): The JSON validation schema parsed from the file

    Raises:
        - ValueError: If ft_type is not one of the supported types
    """
    # Check the fine-tuning type and pick the matching schema file
    if ft_type == "supervised":
        schema_file = "supervised_ft_schema.json"
    elif ft_type == "dpo":
        schema_file = "dpo_ft_schema.json"
    else:
        # Fail with a clear message instead of reaching the path join
        # below with no schema file selected
        raise ValueError(
            f"Unsupported fine-tuning type: {ft_type!r}. "
            "Expected 'supervised' or 'dpo'."
        )

    # Set the schema path
    schema_path = os.path.join(schema_directory, schema_file)

    # Load the JSON validation schema; the encoding is set explicitly so
    # the result does not depend on the platform default
    with open(schema_path, "r", encoding="utf-8") as file:
        return json.load(file)


def validate_jsonl(file_path, schema):
    """
    Function to validate a JSONL file against a JSON schema

    Each line is parsed as JSON and validated on its own. Lines that are
    not valid JSON or that fail schema validation are recorded and the
    scan continues with the next line; any other exception propagates.

    Args:
        - file_path (str): The path to the JSONL file
        - schema (dict): The JSON schema to validate against

    Returns:
        - num_lines (int): The number of lines in the file
        - num_errors (int): The number of errors in the file
        - errors (list): A list of errors in the file
    """
    num_lines = 0
    errors = []

    # JSONL files are UTF-8 encoded, so the encoding is set explicitly
    # instead of relying on the platform default
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            num_lines = line_number
            try:
                # Parse the JSON line (a blank line is not valid JSON and
                # is therefore reported as an error)
                json_object = json.loads(line)
                # Validate against the schema
                validate(instance=json_object, schema=schema)
            except json.JSONDecodeError:
                errors.append(f"Line {line_number}: Invalid JSON format")
            except ValidationError as e:
                errors.append(f"Line {line_number}: Schema validation error - {e.message}")

    # Exactly one message is recorded per failing line
    return num_lines, len(errors), errors

### Check that the data files match the schema

In [3]:
# Load one validation schema per fine-tuning type from the schema directory
sup_ft_schema = get_ft_validation_schema("supervised", validation_directory)
dpo_ft_schema = get_ft_validation_schema("dpo", validation_directory)

In [4]:
# Paths of the supervised and DPO sample files inside the data directory
sup_data, dpo_data = (
    os.path.join(data_directory, file_name)
    for file_name in (
        "sup_ft_test_optimistic.jsonl",
        "dpo_ft_test_optimistic.jsonl",
    )
)

In [5]:
# Validate the supervised fine-tuning file against its schema
num_lines, num_errors, errors = validate_jsonl(file_path=sup_data, schema=sup_ft_schema)

# Report the error count followed by the list of error messages
print(f"{num_errors}/{num_lines} errors in the file", errors, sep="\n")

0/50 errors in the file
[]


In [6]:
# Validate the DPO fine-tuning file against its schema
num_lines, num_errors, errors = validate_jsonl(file_path=dpo_data, schema=dpo_ft_schema)

# Report the error count followed by the list of error messages
print(f"{num_errors}/{num_lines} errors in the file", errors, sep="\n")

0/50 errors in the file
[]


### Example of a validation error

In [7]:
# Path of the sample file used to demonstrate a validation failure.
# It gets its own name so that `dpo_data` keeps pointing at the valid file
# and re-running the earlier DPO validation cell gives the same result.
dpo_error_data = os.path.join(data_directory, "dpo_ft_test_optimistic_error.jsonl")

# Run the validation
num_lines, num_errors, errors = validate_jsonl(dpo_error_data, dpo_ft_schema)

# print the output of the validation
print(f"{num_errors}/{num_lines} errors in the file")
print(errors)

1/50 errors in the file
["Line 1: Schema validation error - 'system1' is not one of ['system', 'user', 'assistant']"]
