In [None]:
import json
import os
from collections import Counter
from pathlib import Path

In [None]:
# Read the test_B split: one JSON object per non-blank line.
test_file = Path("/mnt/d/Pobrane/poleval-gender/data/taskA/test_B.jsonl")

with test_file.open('r', encoding='utf-8') as handle:
    # Strip surrounding whitespace first so blank lines drop out before parsing.
    test_records = [json.loads(raw) for raw in map(str.strip, handle) if raw]

# IDs in file order, plus an ID -> full record lookup (a repeated ID keeps its last record).
test_ids = [record['ipis_id'] for record in test_records]
test_entries = {record['ipis_id']: record for record in test_records}

print(f"Total test entries: {len(test_ids)}")

In [None]:
# Read the model predictions for test_B, keeping only their IDs in file order.
pred_file = Path("/mnt/d/Pobrane/poleval-gender/solution/task_proofreading/02_inference/predictions_test_B.jsonl")

with pred_file.open('r', encoding='utf-8') as handle:
    stripped_lines = (raw.strip() for raw in handle)
    # Blank lines are ignored; every other line is parsed as JSON and must carry an 'ipis_id'.
    pred_ids = [json.loads(text)['ipis_id'] for text in stripped_lines if text]

print(f"Total predictions: {len(pred_ids)}")

In [None]:
# Compare the ID sets: a test ID without a matching prediction counts as missing.
test_ids_set = set(test_ids)
pred_ids_set = set(pred_ids)
missing_ids = sorted(test_ids_set.difference(pred_ids_set))

print(f"\nMissing entries: {len(missing_ids)}")
missing_report = f"\nMissing IDs: {missing_ids}" if missing_ids else "\nNo missing entries!"
print(missing_report)

In [None]:
# For every test entry without a prediction, print its ID, source length,
# and the first 200 characters of its source text.
if missing_ids:
    banner = "=" * 80
    print("\n" + banner)
    print("MISSING ENTRY DETAILS")
    print(banner)

    for ipis_id in missing_ids:
        source_text = test_entries[ipis_id]['source']
        preview = source_text[:200]
        print(f"\nID: {ipis_id}")
        print(f"Source length: {len(source_text)} chars")
        print(f"Source preview: {preview}...")
        print("-" * 80)

In [None]:
# Order check: restrict both ID sequences to the IDs they share, then compare
# the two sequences position by position.
print("\nChecking if IDs are in the same order...")
common_ids = [tid for tid in test_ids if tid in pred_ids_set]
pred_order = [tid for tid in pred_ids if tid in test_ids_set]

if common_ids != pred_order:
    print("[WARN] Predictions are in different order than test file")
    # Only the first position where the two sequences disagree is reported.
    first_diff = next(
        (
            (pos, expected, actual)
            for pos, (expected, actual) in enumerate(zip(common_ids, pred_order))
            if expected != actual
        ),
        None,
    )
    if first_diff is not None:
        pos, expected, actual = first_diff
        print(f"   First mismatch at position {pos}: test={expected}, pred={actual}")
else:
    print("Predictions are in the same order as test file")

In [None]:
# Byte-level sanity check of the test and prediction files: file size, newline
# count, and how the physical line count relates to the entries parsed earlier.


def _byte_level_report(label, path, entry_count):
    """Print size and line statistics for ``path``.

    Args:
        label: Human-readable name printed in the report heading.
        path: File to inspect; it is read in binary mode.
        entry_count: Number of non-blank lines parsed from the file earlier.

    Returns:
        Tuple ``(size, raw, newlines)``: the on-disk size in bytes, the raw
        file content, and the number of ``\\n`` bytes in it.
    """
    size = os.path.getsize(path)
    with open(path, 'rb') as f:
        raw = f.read()
    newlines = raw.count(b'\n')
    # Count physical lines rather than newline bytes: splitlines() adds no
    # trailing empty element, so a final line without a terminating newline
    # no longer produces a bogus "-1 empty lines".
    blank_lines = len(raw.splitlines()) - entry_count
    # Evaluated outside the f-string because backslashes inside replacement
    # fields are a syntax error before Python 3.12.
    ends_with_newline = raw.endswith(b'\n')

    print(f"\n{label}: {path}")
    print(f"  File size: {size} bytes")
    print(f"  Newline count: {newlines}")
    print(f"  Valid JSON entries: {entry_count}")
    print(f"  Empty lines: {blank_lines}")
    print(f"  Ends with newline: {ends_with_newline}")
    return size, raw, newlines


print("\nFile encoding and byte-level analysis:")
print("="*80)

test_size, test_bytes, test_newlines = _byte_level_report(
    "Test file", test_file, len(test_ids)
)
pred_size, pred_bytes, pred_newlines = _byte_level_report(
    "Prediction file", pred_file, len(pred_ids)
)

In [None]:
# Validate the final submission file against the test set: entry count, byte
# statistics, duplicate IDs, and IDs that do not line up with the test file.
# NOTE(review): the file is named .tsv, yet every non-blank line is parsed with
# json.loads — confirm that the submission really is JSON Lines.
submission_file = Path("/mnt/d/Pobrane/poleval-gender/solution/task_proofreading/03_postprocessing/predictions_340_submission.tsv")

if submission_file.exists():
    with open(submission_file, 'r', encoding='utf-8') as f:
        # Blank lines are skipped; every other line must carry an 'ipis_id'.
        submission_ids = [json.loads(text)['ipis_id'] for text in map(str.strip, f) if text]

    print("\n\nSubmission file analysis:")
    print("="*80)
    print(f"Submission file: {submission_file}")
    print(f"  Valid JSON entries: {len(submission_ids)}")

    with open(submission_file, 'rb') as f:
        sub_bytes = f.read()
        sub_newlines = sub_bytes.count(b'\n')

    print(f"  Newline count: {sub_newlines}")
    print(f"  File size: {len(sub_bytes)} bytes")

    # An ID seen more than once is a duplicate; show at most the first ten.
    duplicates = [sub_id for sub_id, count in Counter(submission_ids).items() if count > 1]
    if duplicates:
        print("  [WARN] Duplicate IDs found!")
        print(f"  Duplicate IDs: {duplicates[:10]}")
    else:
        print("  No duplicate IDs")

    # Compare against the test IDs in both directions so that a submission
    # which silently dropped entries is flagged as well as one with extras.
    sub_ids_set = set(submission_ids)
    extra_in_sub = sub_ids_set - test_ids_set
    if extra_in_sub:
        print(f"  [WARN] Extra IDs in submission not in test: {len(extra_in_sub)}")
        print(f"     {sorted(extra_in_sub)[:5]}")
    missing_in_sub = test_ids_set - sub_ids_set
    if missing_in_sub:
        print(f"  [WARN] Test IDs missing from submission: {len(missing_in_sub)}")
        print(f"     {sorted(missing_in_sub)[:5]}")
else:
    print("\nSubmission file not found!")

In [None]:
# Inspect the raw bytes of each file for things that commonly break
# line-oriented parsers: byte-order marks, non-LF line endings, invalid UTF-8
# sequences, and null bytes.
files_to_check = [
    ("Test file", test_file),
    ("Prediction file", pred_file),
    ("Submission file", submission_file)
]

print("Encoding and Format Analysis:")
print("="*80)

for name, filepath in files_to_check:
    print(f"\n{name}: {filepath.name}")

    if not filepath.exists():
        # Say so explicitly instead of silently leaving the file out of the report.
        print("  [WARN] File not found - skipped")
        continue

    raw_data = filepath.read_bytes()
    print(f"  File size: {len(raw_data)} bytes")

    # Byte-order mark detection on the first bytes of the file.
    if raw_data.startswith(b'\xef\xbb\xbf'):
        print("  [WARN] UTF-8 BOM detected (3 bytes)")
    elif raw_data.startswith(b'\xff\xfe'):
        print("  [WARN] UTF-16 LE BOM detected")
    elif raw_data.startswith(b'\xfe\xff'):
        print("  [WARN] UTF-16 BE BOM detected")
    else:
        print("  No BOM")

    # Every CRLF pair contains one '\n' and one '\r', so subtracting the pair
    # count leaves the lone LF and lone CR occurrences.
    crlf_count = raw_data.count(b'\r\n')
    lf_only = raw_data.count(b'\n') - crlf_count
    cr_only = raw_data.count(b'\r') - crlf_count

    print("  Line endings:")
    print(f"    - LF only (\\n): {lf_only}")
    print(f"    - CRLF (\\r\\n): {crlf_count}")
    print(f"    - CR only (\\r): {cr_only}")

    if crlf_count > 0:
        print("  [WARN] Windows line endings (CRLF) detected - may cause issues!")
    if cr_only > 0:
        # Bare CRs are not LF endings either, so do not report the file as clean.
        print("  [WARN] Bare carriage returns (CR) detected - may cause issues!")
    if crlf_count == 0 and cr_only == 0:
        print("  Unix line endings (LF)")

    # Only the success or failure of decoding matters; the decoded text is discarded.
    try:
        raw_data.decode('utf-8')
    except UnicodeDecodeError as e:
        print(f"  [ERROR] UTF-8 decode error: {e}")
    else:
        print("  Valid UTF-8 encoding")

    if b'\x00' in raw_data:
        print("  [WARN] Null bytes found in file")
In [None]:
# Make sure the submission ends with a newline. If it does not, write a
# corrected copy to a separate "_v2" file (the original is left untouched);
# otherwise keep using the original file. Then report stats for the result.
output_with_newline = Path("/mnt/d/Pobrane/poleval-gender/solution/task_proofreading/03_postprocessing/predictions_340_submission_v2.tsv")

with open(submission_file, 'rb') as f:
    content = f.read()

if not content.endswith(b'\n'):
    print("[WARN] File doesn't end with newline, adding one...")
    with open(output_with_newline, 'wb') as f:
        f.write(content)
        f.write(b'\n')
    print(f"Created: {output_with_newline}")
else:
    print("File already ends with newline")
    output_with_newline = submission_file

with open(output_with_newline, 'rb') as f:
    new_content = f.read()
    newline_count = new_content.count(b'\n')

# Evaluate the check outside the f-string. Written inline as b'\\n' it is a
# SyntaxError before Python 3.12, and on 3.12+ it tests for a literal
# backslash followed by "n", which reports False for a correct file.
ends_with_newline = new_content.endswith(b'\n')

print("\nNew file stats:")
print(f"  Size: {len(new_content)} bytes")
print(f"  Newlines: {newline_count}")
print(f"  Ends with newline: {ends_with_newline}")