# Remove Unwanted Fields from Normalized JSONL

This notebook processes normalized JSONL files and keeps only the fields that are present in the first record of the sample normalized file.

In [None]:
import json
from pathlib import Path

In [None]:
# Derive the set of allowed field names from the first record of the sample file
sample_file = Path("/mnt/d/Pobrane/poleval-gender/normalization/sample_proofreading_normalised.jsonl")

with sample_file.open(mode='r', encoding='utf-8') as f:
    # Only the first record is consulted; the rest of the file is not read
    first_line = json.loads(f.readline())

expected_fields = set(first_line.keys())

print(f"Expected fields: {expected_fields}")

In [None]:
def clean_jsonl_file(input_file, output_file, expected_fields):
    """
    Copy a JSONL file, keeping only the expected fields of each record.

    Blank (whitespace-only) lines in the input are skipped, and the parent
    directory of the output file is created if it does not exist yet.

    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file. Must differ from
            input_file: the output is opened for writing (and therefore
            truncated) before any input line is read.
        expected_fields: Set of field names to keep

    Returns:
        Path of the written output file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    input_path = Path(input_file)
    output_path = Path(output_file)

    # Opening a file in 'w' mode does not create missing directories
    output_path.parent.mkdir(parents=True, exist_ok=True)

    cleaned_count = 0

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:

        for line in infile:
            # A stray or trailing blank line is not a record; json.loads
            # would raise on it, so skip it instead of aborting the run
            if not line.strip():
                continue

            # Parse JSON line
            data = json.loads(line)

            # Keep only expected fields
            cleaned_data = {k: v for k, v in data.items() if k in expected_fields}

            # Write cleaned data; ensure_ascii=False keeps non-ASCII text readable
            json.dump(cleaned_data, outfile, ensure_ascii=False)
            outfile.write('\n')

            cleaned_count += 1

    print(f"Processed {cleaned_count} lines")
    print(f"Output saved to: {output_path}")

    return output_path

In [None]:
# Process predictions_340.jsonl: keep only the expected fields of each record
# and write the cleaned copy to the task_proofreading solution directory
input_file = Path("/mnt/d/Pobrane/poleval-gender/normalization/predictions_340.jsonl")
output_file = Path("/mnt/d/Pobrane/poleval-gender/solution/task_proofreading/predictions_340_FINAL_cleaned.jsonl")

# expected_fields is the key set read from the sample file earlier in the notebook
clean_jsonl_file(input_file, output_file, expected_fields)

In [None]:
# Sanity-check the cleaned file by printing a summary of its first two records
with open(output_file, 'r', encoding='utf-8') as f:
    print("First 2 entries in cleaned file:")
    # zip against range(2) stops after two records without an explicit break
    for i, line in zip(range(2), f):
        data = json.loads(line)
        print(f"\nEntry {i+1}:")
        print(f"Fields: {list(data.keys())}")
        print(f"ipis_id: {data.get('ipis_id', 'N/A')}")