# Fine-tuning for Qwen 2.5 Coder 7B

Let's start by importing packages!

In [11]:
import os
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import torch
import json
from datasets import Dataset

# Make modules that live in the sibling "data" directory importable.
sys.path.append(os.path.abspath(os.path.join(os.pardir, "data")))

1. Define the model name (its identifier on the Hugging Face Hub)

In [12]:
# Hugging Face identifier of the checkpoint that the cells below load and fine-tune.
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

2. Loading the Model and Tokenizer
   

In [None]:
# Load the tokenizer and the causal-LM weights published under `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",  # let the library choose the weight dtype
    device_map="auto",  # let the library choose device placement
)

3. Loading and Preprocessing the Dataset

In [None]:
# Read the question/SQL examples. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(
    "../processed/MINIDEV/mini_dev_oracle.json",
    "r",
    encoding="utf-8",
) as f:
    data = json.load(f)

# Turn the list of records into the column layout Dataset.from_dict expects.
# Every record must provide a "question" and an "SQL" key.
data_dict = {
    "question": [example["question"] for example in data],
    "sql_query": [example["SQL"] for example in data],
}

dataset = Dataset.from_dict(data_dict)

## Preprocessing Function

We need to prepare the inputs and labels for training. The model expects input in a conversational format.

- Messages: We format each example as a conversation made of a system prompt, the user’s question, and the assistant’s SQL answer.
- Text Generation: `apply_chat_template` constructs the conversation text.
- Tokenization: We tokenize the full conversation, plus a second pass used only to locate where the assistant’s response begins.
- Labels: We set labels to -100 (the ignore index) for the prompt tokens, so the loss is computed only on the assistant’s response.

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/SQL pairs into chat-formatted features.

    Each pair is rendered as a system/user/assistant conversation with the
    tokenizer's chat template. Label positions that belong to the prompt
    (everything up to and including the assistant header) are set to -100 so
    that only the assistant turn contributes to the loss.

    Args:
        examples: Batched mapping with parallel "question" and "sql_query"
            lists.

    Returns:
        A dict with "input_ids" and "labels", each a list holding one list of
        token ids per example.
    """
    inputs = []
    labels_list = []
    for question, sql_query in zip(examples["question"], examples["sql_query"]):
        prompt_messages = [
            {
                "role": "system",
                "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
            },
            {"role": "user", "content": question},
        ]
        messages = prompt_messages + [{"role": "assistant", "content": sql_query}]

        # Full conversation, including the assistant's answer.
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        input_ids = tokenizer(text)["input_ids"]

        # Prompt only, ending with the assistant header. Its token count is
        # the position where the answer starts. Subtracting the length of the
        # tokenized answer from the full length is wrong whenever the
        # template appends end-of-turn tokens after the answer: the boundary
        # lands inside the answer and its first tokens get masked.
        # Assumes the template renders this prompt as a prefix of `text`.
        prompt_text = tokenizer.apply_chat_template(
            prompt_messages, tokenize=False, add_generation_prompt=True
        )
        prompt_ids = tokenizer(prompt_text)["input_ids"]
        assistant_start = min(len(prompt_ids), len(input_ids))

        # Mask the prompt; keep the answer and any closing template tokens.
        labels = [-100] * assistant_start + input_ids[assistant_start:]
        inputs.append(input_ids)
        labels_list.append(labels)
    return {"input_ids": inputs, "labels": labels_list}

5. Apply the Preprocessing Function

In [None]:
# Tokenize the dataset batch by batch and drop the raw text columns, leaving
# only the features produced by preprocess_function.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["question", "sql_query"],
)

## Training

6. Set Up Training Arguments

In [None]:


# Hyperparameters for the fine-tuning run.
# Evaluation stays disabled: "no" is the library default, so the keyword is
# simply omitted. Its legacy spelling `evaluation_strategy` was renamed to
# `eval_strategy`, and passing the old name fails on newer transformers
# releases.
# NOTE(review): the model is loaded with torch_dtype="auto"; if the checkpoint
# is stored in bfloat16, bf16=True may be a better match than fp16 - confirm
# on the target GPU.
# NOTE(review): a checkpoint is only written every 1000 optimizer steps;
# shorter runs need an explicit save after training - confirm the run length.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Adjust based on your GPU memory
    gradient_accumulation_steps=16,  # Effective batch size per device: 1 * 16
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    learning_rate=5e-5,
    fp16=True,  # Enable if using compatible GPU
)

In [None]:


# Imported in this cell so the cell carries its own dependency.
from transformers import DataCollatorForSeq2Seq

# The examples have different lengths and already carry their own `labels`.
# This collator pads `input_ids` with the tokenizer's pad token and pads
# `labels` with -100, so padded positions are ignored by the loss. Without
# it, batches larger than one example cannot be stacked into tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, padding=True, label_pad_token_id=-100
    ),
)

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

## Testing

In [None]:
def generate_sql(question):
    """Generate the model's answer to a natural-language question.

    The question is wrapped in the same system/user chat format used for
    training, with the generation prompt appended so the model continues as
    the assistant.

    Args:
        question: Natural-language question text.

    Returns:
        The decoded completion, with special tokens removed and surrounding
        whitespace stripped.
    """
    messages = [
        {
            "role": "system",
            "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
        },
        {"role": "user", "content": question},
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # The model may still be in training mode after trainer.train(); switch to
    # eval mode so train-only behaviour such as dropout is off while sampling.
    model.eval()
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)

    # generate() returns prompt + completion; keep only the completion tokens.
    prompt_length = len(model_inputs.input_ids[0])
    completion_ids = [output_ids[prompt_length:] for output_ids in generated_ids]
    response = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]
    return response.strip()


# Quick check: ask the model one question and print the SQL it produces.
question = "List all customers who paid in EUR."
generated_sql = generate_sql(question)
print(generated_sql)