In [25]:
import os
import pandas as pd


# Location of the generated SVG results, two directories above the current
# working directory. The path is built from os.pardir directly: `__name__` is
# a module name ("__main__" in a notebook), not a filesystem path, so it must
# not be passed to os.path.dirname().
DATA_FILE = os.path.join(os.pardir, os.pardir, 'data', "svg_results.jsonl")

# Each line of the file is one JSON record.
df = pd.read_json(DATA_FILE, lines=True)

# One-off cleanup, intentionally left disabled: strips the Markdown code
# fences from the svg_content column and writes the result back over
# DATA_FILE. Re-enable only when the file needs to be re-cleaned.
# # remove the ```xml and ``` in the svg_content column
# df['svg_content'] = df['svg_content'].str.replace('```xml', '', regex=False).str.replace('```', '', regex=False)
# df['svg_content'] = df['svg_content'].str.strip()


# # save it back to a new jsonl file
# df.to_json(DATA_FILE, orient='records', lines=True)
# print(f"Saved cleaned data to {DATA_FILE}")

# Eyeball the first record's SVG payload.
print(df.svg_content[0])

<reasoning>
1.  Visual Elements: Cerulean sky and rolling green hills.
2.  Color Palette: Cerulean (light blue), various shades of green for the hills.
3.  Layout: Sky occupies the top portion, hills the bottom. Overlapping ellipses create the rolling effect.
4.  Base Shapes: `rect` for the sky, multiple `ellipse` shapes for the hills.
5.  Details: Overlap the hills and use varied green tones.
6.  Transformations: None needed.
7.  Review: Simple, clean, and accurately represents the scene.
</reasoning>


<svg viewBox="0 0 256 256" width="256" height="256" xmlns="http://www.w3.org/2000/svg">

  <!-- Sky background -->
  <rect x="0" y="0" width="256" height="150" fill="cerulean" />

  <!-- First rolling hill (dark green) -->
  <ellipse cx="128" cy="180" rx="140" ry="50" fill="forestgreen" />

  <!-- Second rolling hill (medium green) -->
  <ellipse cx="128" cy="200" rx="140" ry="50" fill="seagreen" />

  <!-- Third rolling hill (light green) -->
  <ellipse cx="128" cy="220" rx="140" ry="

# Investigate Training Error

We're encountering the following error when training:

```
ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [text]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.
```

This is happening because our dataset has a column named 'text', but the model's forward method expects inputs like 'input_ids', 'attention_mask', etc. Let's explore the issue and fix it.

In [None]:
# Let's examine the dataset format issue
from datasets import Dataset
from transformers import AutoTokenizer

# 1. Build a two-row toy dataset with a single raw-text column named "text",
#    mirroring the column named in the training error.
sample_data = [
    {"text": sentence}
    for sentence in (
        "This is a sample instruction and response.",
        "This is another sample.",
    )
]
dataset = Dataset.from_list(sample_data)

# Show the schema before any tokenization has happened.
print("Original dataset format:", dataset, sep="\n")

In [None]:
# 2. HF Trainer feeds the model tokenized columns, not raw text, so the
#    "text" column has to be run through the tokenizer first.

# NOTE(review): confirm this checkpoint id exists on the Hub -- the 1.5B size
# looks like a Qwen2 size rather than a Qwen1.5 one.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen1.5-1.5B-Instruct",
    trust_remote_code=True,
)


def tokenize_function(examples):
    """Tokenize the "text" column of a batch.

    No padding is applied here; sequences are truncated to 1024 tokens.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=1024,
        padding=False,
    )
    return encoded


# Tokenize in batches; the tokenizer's output columns are added to the dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(
    "\nTokenized dataset format (this is what Trainer expects):",
    tokenized_dataset,
    sep="\n",
)

## Solution: Fix the SVGDataset Class

The issue is in our `SVGDataset.prepare_dataset()` method. We need to add a tokenization step after creating the dataset with the raw text. Here's how to fix the `train_svg_model.py` file:

In [None]:
# Replace the prepare_dataset method in SVGDataset class with:

def prepare_dataset(self) -> DatasetDict:
    """Build tokenized train/validation/test splits from ``self.data``.

    Every record in ``self.data`` is rendered to one training string (the
    filled-in ``PROMPT_TEMPLATE``, a blank line, then the record's
    ``svg_content``), tokenized with ``self.tokenizer`` (truncated to
    ``self.max_length``, no padding), and split 90/5/5 with a fixed seed.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits.
        The raw ``text`` column is removed during tokenization.
    """

    def _render(item):
        # Prompt first, then the SVG answer, separated by a blank line.
        prompt = PROMPT_TEMPLATE.format(description=item['description'])
        return f"{prompt}\n\n{item['svg_content']}"

    raw_dataset = Dataset.from_list(
        [{"text": _render(item)} for item in self.data]
    )

    def _encode(batch):
        # No "labels" column is produced here.
        # NOTE(review): for causal-LM training the labels are presumably
        # derived from input_ids downstream (e.g. by the data collator) --
        # confirm against the Trainer setup.
        return self.tokenizer(
            batch["text"],
            padding=False,
            truncation=True,
            max_length=self.max_length,
        )

    # Dropping "text" leaves only the tokenizer's output columns.
    encoded = raw_dataset.map(_encode, batched=True, remove_columns=["text"])

    # Hold out 10%, then cut that hold-out in half: 90/5/5 overall.
    holdout = encoded.train_test_split(test_size=0.1, shuffle=True, seed=42)
    halves = holdout['test'].train_test_split(test_size=0.5, shuffle=True, seed=42)

    split_map = {
        'train': holdout['train'],
        'validation': halves['train'],
        'test': halves['test'],
    }
    return DatasetDict(split_map)

## Alternative Fix with TrainingArguments

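Alternatively, we can set `remove_unused_columns=False` in the TrainingArguments. Note that this only stops the Trainer from dropping the `text` column; the raw strings still have to be tokenized somewhere, typically in a custom data collator that runs on every batch: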

In [None]:
# In the train function, modify the TrainingArguments as follows:

training_args = TrainingArguments(
    # Where checkpoints and outputs are written.
    output_dir=str(output_dir),
    report_to="none",  # We'll use custom W&B logging

    # Dataset handling: keep columns that are not in the model's forward
    # signature instead of letting the Trainer drop them.
    remove_unused_columns=False,  # Add this line

    # Batch sizes (effective train batch = batch_size * 2 accumulation steps
    # per device).
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,

    # Optimization schedule.
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=fp16,

    # Logging cadence.
    logging_first_step=True,
    logging_steps=10,

    # Evaluate every 100 steps.
    evaluation_strategy="steps",
    eval_steps=100,

    # Checkpoint every 500 steps, keeping at most 3 on disk.
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,

    # Restore the best checkpoint at the end; higher metric is better.
    # NOTE(review): "valid_svg_ratio" is presumably reported by a custom
    # compute_metrics function -- verify it is actually emitted at eval time.
    load_best_model_at_end=True,
    metric_for_best_model="valid_svg_ratio",
    greater_is_better=True,
)

## Recommendation

The first approach (tokenizing the dataset) is generally better because:

1. It's more efficient - tokenization happens once during dataset preparation rather than every batch
2. It's more explicit about what data the model is actually using
3. It's the standard way to prepare datasets for HuggingFace Trainer

Let's implement this fix in the `train_svg_model.py` file.