In [None]:
import json
from numbers_parser import Document

# Convert a Numbers spreadsheet with "src", "tgt" and "explanation" columns
# into an instruction/input/output JSON file for Qwen fine-tuning.

# Path to your .numbers file (change the filename if needed)
input_file = "training_data.numbers"
# Output JSON file for Qwen fine-tuning data
output_file = "qwen_training_data.json"

# Load the Numbers document
doc = Document(input_file)

# Assume the data is in the first sheet and the first table of that sheet
sheet = doc.sheets[0]
table = sheet.tables[0]

# Ask for plain cell values: by default rows() yields Cell objects, which
# never compare equal to the header strings and cannot be serialized to JSON.
rows = table.rows(values_only=True)
if not rows:
    raise ValueError(f"No rows found in the first table of {input_file}")

# The first row contains the headers (e.g., "src", "tgt", "explanation")
headers = list(rows[0])
data_rows = rows[1:]

# Fail early with a readable message if an expected column is absent
required_columns = ("src", "tgt", "explanation")
missing_columns = [name for name in required_columns if name not in headers]
if missing_columns:
    raise ValueError(
        f"Missing required column(s) {missing_columns} in header row {headers}"
    )

# Determine column indices from the header row
src_idx = headers.index("src")
tgt_idx = headers.index("tgt")
explanation_idx = headers.index("explanation")

# Define a common instruction for each example
instruction = (
    "Translate the following Mongolian sentence into English, "
    "and then explain your translation choices."
)

# Build the training data list
training_data = []
skipped_rows = 0

for row in data_rows:
    src_text = row[src_idx]
    tgt_text = row[tgt_idx]
    explanation = row[explanation_idx]

    # Empty cells are expected to come back as None (e.g. blank trailing
    # rows); an incomplete row would otherwise put the literal text "None"
    # into the training data, so it is skipped instead.
    if src_text is None or tgt_text is None or explanation is None:
        skipped_rows += 1
        continue

    # Format the output as desired for Qwen
    output_text = (
        f"English Translation: {tgt_text}\n\n"
        f"Explanation: {explanation}"
    )

    training_data.append(
        {
            "instruction": instruction,
            # str() keeps the field textual even if the cell held a number
            # or a date rather than text.
            "input": str(src_text),
            "output": output_text,
        }
    )

# Write the training data to a JSON file; ensure_ascii=False keeps the
# Mongolian text readable instead of \u-escaped.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(training_data, f, ensure_ascii=False, indent=2)

if skipped_rows:
    print(f"Skipped {skipped_rows} incomplete row(s)")
print(f"Saved {len(training_data)} training examples to {output_file}")
