# Fine-tuning

Let's start by importing packages!

In [1]:
import json
import os
import sys

import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

# Add ../data (resolved to an absolute path from the current working directory)
# to the module search path.
sys.path.append(os.path.abspath("../data"))

1. Define the model name (from the Hugging Face Hub)

In [8]:
# Checkpoint identifier passed to both AutoTokenizer and AutoModelForCausalLM below.
model_name = "meta-llama/CodeLlama-7b-hf"

2. Loading the Model and Tokenizer
   

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Batch padding requires a pad token. Llama-family tokenizers commonly ship
# without one, in which case the collator raises when asked to pad. Fall back
# to EOS only when the checkpoint does not define a pad token itself.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model weights in half precision, unsharded, then place them on the GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use float16 for GPUs like V100/A100
    device_map=None,  # No device_map, so the model is not sharded across devices
)
model.to("cuda")  # Move the model to GPU


3. Loading and Preprocessing the Dataset

In [10]:
# Read the raw examples. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's default locale encoding.
with open(
    "../data/processed/MINIDEV/mini_dev_oracle.json",
    "r",
    encoding="utf-8",
) as f:
    data = json.load(f)

# Keep only the two fields used for training, as parallel columns.
data_dict = {
    "question": [example["question"] for example in data],
    "sql_query": [example["SQL"] for example in data],
}

dataset = Dataset.from_dict(data_dict)

## Preprocessing Function

We need to prepare the inputs and labels for training. Each example is turned into a plain-text prompt followed by the target SQL query.

- Prompt: We format each question as `Question: <question>\nAnswer:`.
- Tokenization: We tokenize the prompt and the SQL query separately, then concatenate the two token sequences.
- Labels: We set labels to -100 (ignore index) for the prompt tokens, so the loss is only computed on the SQL query tokens.
- Truncation: Sequences longer than the tokenizer's maximum length are truncated.

In [11]:
def preprocess_function(examples, max_length=None):
    """Tokenize a batch of question/SQL pairs for causal-LM fine-tuning.

    Each example becomes ``<prompt tokens><SQL tokens><EOS>``. The labels mirror
    the input ids, except that every prompt position is set to -100 so the loss
    is computed on the SQL query (and its end-of-sequence token) only.

    Args:
        examples: Batch mapping with parallel lists under the keys
            ``"question"`` and ``"sql_query"``.
        max_length: Maximum number of tokens kept per example. Defaults to
            ``tokenizer.model_max_length``. Some tokenizers report a very large
            placeholder value there, so pass an explicit limit if sequences
            must be bounded.

    Returns:
        Dict with ``"input_ids"`` and ``"labels"``, each a list of equal-length
        token-id lists (one per example).
    """
    if max_length is None:
        max_length = tokenizer.model_max_length
    eos_id = tokenizer.eos_token_id

    inputs = []
    labels_list = []
    for question, sql_query in zip(examples["question"], examples["sql_query"]):
        # Same prompt template that must be used at inference time
        prompt = f"Question: {question}\nAnswer:"
        # The prompt opens the sequence, so it keeps the tokenizer's special tokens
        prompt_input_ids = list(tokenizer(prompt, add_special_tokens=True)["input_ids"])
        # The target continues the sequence: do not let the tokenizer insert
        # sequence-start special tokens in the middle of the example
        target_input_ids = list(
            tokenizer(sql_query, add_special_tokens=False)["input_ids"]
        )
        # Terminate the answer so the model learns where the query ends
        if eos_id is not None:
            target_input_ids.append(eos_id)

        # Prompt positions are masked with the ignore index; truncate both
        # sequences identically so they stay aligned
        input_ids = (prompt_input_ids + target_input_ids)[:max_length]
        labels = ([-100] * len(prompt_input_ids) + target_input_ids)[:max_length]

        inputs.append(input_ids)
        labels_list.append(labels)
    return {"input_ids": inputs, "labels": labels_list}

5. Apply the Preprocessing Function

In [None]:
# Tokenize the whole dataset batch-wise and drop the raw text columns,
# leaving only what preprocess_function returns.
tokenized_dataset = dataset.map(
    preprocess_function,
    num_proc=4,  # Optional: number of worker processes
    remove_columns=["question", "sql_query"],
    batched=True,
)

In [13]:
# Collate with dynamic padding while keeping the labels built during preprocessing.
# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from `input_ids`,
# which discards the -100 mask on the prompt tokens, and it does not pad an
# existing `labels` column. DataCollatorForSeq2Seq pads `input_ids` with the
# tokenizer's pad token and pads `labels` with the ignore index instead.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,  # Padded label positions are ignored by the loss
    pad_to_multiple_of=8,  # Efficient padding for GPU
    return_tensors="pt",
)

## Training

6. Set Up Training Arguments

In [3]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Output and checkpointing
    output_dir="./results",
    save_steps=1000,
    save_total_limit=2,
    # Logging and evaluation
    logging_steps=10,
    eval_strategy="no",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Adjust based on your GPU memory
    gradient_accumulation_steps=32,  # Effective batch size
    # Precision and runtime
    fp16=True,  # Enable FP16 training
    deepspeed="../notebooks/deepspeed.json",
    dataloader_num_workers=4,
    # The device is expected to be selected automatically; it can be set
    # explicitly if needed:
    # device="cuda",
)

In [None]:
# Turn on gradient checkpointing both in the training arguments and on the model
training_args.gradient_checkpointing = True
model.gradient_checkpointing_enable()

In [None]:
# Wire the model, data, collator, and arguments into a single Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [None]:
# Start fine-tuning with the trainer built in the previous cell
trainer.train()

## Testing

In [None]:
def generate_sql(question, max_new_tokens=512):
    """Generate a SQL query for a natural-language question.

    The prompt uses the same ``Question: ...\\nAnswer:`` template as
    ``preprocess_function``, so inference matches the format the model was
    fine-tuned on. The previous chat-template prompt with a Qwen system message
    did not match the training data or the CodeLlama base model loaded here.

    Args:
        question: Natural-language question to translate into SQL.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion with the prompt and special tokens removed and
        surrounding whitespace stripped.
    """
    prompt = f"Question: {question}\nAnswer:"
    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = model_inputs["input_ids"].shape[1]

    # Inference only: switch off training-mode behaviour and gradient tracking
    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; keep only the newly generated part
    completion_ids = generated_ids[0][prompt_length:]
    response = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return response.strip()


# Example usage: generate SQL for a single natural-language question and print it
question = "List all customers who paid in EUR."
print(generate_sql(question))