### ADD TRANSLATION COLUMN FROM ANOTHER FILE

In [None]:
import json

main_file = '/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/gold_labels_first30s.jsonl'
extra_file = '/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/results/llm_based/chain_of_thoughts_prompting/BorgiNonModernToModern-hw2_transl-gemma-fewshot-cot.jsonl'
new_field = 'gemma_translation_fewshot-cot'  # Name of the new column to add
extra_field = 'gemma_translation'        # Field to read from extra_file

# Step 1: Read the first 30 values from the extra file
extra_values = []
with open(extra_file, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        try:
            row = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error in line {i}: {line}")
            raise e

    for i, line in enumerate(f):
        if i < 30:
            data = json.loads(line)
            extra_values.append(data[extra_field])
        else:
            break

if len(extra_values) < 30:
    print (len(extra_values))
    raise ValueError('extra.jsonl contains fewer than 30 lines!')

# Step 2: Read main.jsonl and update first 30 rows with new column
main_rows = []
with open(main_file, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        row = json.loads(line)
        if i < 30:
            row[new_field] = extra_values[i]
        main_rows.append(row)

# Step 3: Overwrite main.jsonl with updated content
with open(main_file, 'w', encoding='utf-8') as f:
    for row in main_rows:
        f.write(json.dumps(row, ensure_ascii=False) + '\n')

print(f"Added column '{new_field}' to the first 30 lines in '{main_file}' using values from '{extra_file}'.")


ValueError: extra.jsonl contains fewer than 30 lines!

### ADD NEW COLUMN BASED ON ANOTHER .JSONL

In [4]:
import json

# Filenames
main_file = '/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/gold_labels_first30s.jsonl'
new_column_file = 'new_column.jsonl'
output_file = main_file

# Read main_file.jsonl lines
with open(main_file, 'r', encoding='utf8') as f_main:
    main_lines = [json.loads(line) for line in f_main]

# Read new_column.jsonl lines (each line is a dict with "gemma_translation_fewshot_gold_label_scores")
with open(new_column_file, 'r', encoding='utf8') as f_new:
    new_col_lines = [json.loads(line) for line in f_new]

# Sanity check
assert len(main_lines) >= len(new_col_lines), "main_file has fewer lines than new_column_file!"

# Merge: Add the new column to each main_file entry
for i, extra in enumerate(new_col_lines):
    # Add the (single) new key-value to the main dict
    main_lines[i].update(extra)

# Write the merged lines to a new file
with open(output_file, 'w', encoding='utf8') as f_out:
    for entry in main_lines:
        f_out.write(json.dumps(entry, ensure_ascii=False) + '\n')

print(f"Done! Output written to: {output_file}")


Done! Output written to: /Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/gold_labels_first30s.jsonl


### ELIMINATE A COLUMN

In [8]:
import json

input_file = '/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/gold_labels_first30s.jsonl'
output_file = '/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/gold_labels_first30s.jsonl'
key_to_remove = "llama_translation_fewshot-cot"

with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    for line in fin:
        data = json.loads(line)
        if key_to_remove in data:
            del data[key_to_remove]
        fout.write(json.dumps(data, ensure_ascii=False) + "\n")

print(f"Done! Saved to {output_file}")


Done! Saved to /Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/gold_labels_first30s.jsonl
