# Fine-tuning

Let's start by importing packages!

In [None]:
!module load CUDA
!module load cuDNN/8.9.2.26-CUDA-12.1.1

In [None]:
%pip uninstall -y torch

In [None]:
%pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121

In [8]:
import torch

print("torch version:", torch.__version__)
print("CUDA Version:", torch.version.cuda)
print("CUDA Available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Current CUDA Device:", torch.cuda.current_device())
print("Device Name:", torch.cuda.get_device_name(torch.cuda.current_device()))


torch version: 2.4.0
CUDA Version: None
CUDA Available: False
Number of GPUs: 0


AssertionError: Torch not compiled with CUDA enabled

In [None]:
import os
os.environ['CUDA_HOME'] = '/cvmfs/hpc.rug.nl/versions/2023.01/rocky8/x86_64/amd/zen3/software/CUDA/12.1.1'
os.environ['PATH'] = f"{os.environ['CUDA_HOME']}/bin:{os.environ['PATH']}"
os.environ['LD_LIBRARY_PATH'] = f"{os.environ['CUDA_HOME']}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}"

In [None]:
# %pip install --upgrade  pip
# %pip install -U  transformers accelerate datasets deepspeed
# %pip install torch --index-url https://download.pytorch.org/whl/cu121

In [None]:
# %pip install flash-attn

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import json
from datasets import load_dataset

1. Define the model name (from huggingface)

In [2]:
model_name = "stabilityai/stable-code-3b"

2. Loading the Model and Tokenizer
   

In [3]:
import os
os.environ["HF_TOKEN"] = "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"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.config.use_cache = False
model.gradient_checkpointing_enable()

tokenizer_config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.47M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

3. Loading and Preprocessing the Dataset

In [16]:
dataset = load_dataset("json", data_files="../habrok/dataset.json")
split_dataset = dataset["train"].train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(test_dataset)}")


Train dataset size: 1227
Eval dataset size: 307


In [21]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,

        padding="max_length",
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
    return result


def formatting_prompts_func(datapoint):
    question = datapoint["question"]
    query = datapoint["SQL"]
    database_schema = datapoint["database_schema"]
    prompt = f"""Given the following SQL tables, your job is to generate the Sqlite SQL query given the user's question.
Put your answer inside the ⁠```sql and ```⁠ tags.
{database_schema}
###
Question: {question}

⁠```sql
{query} ;
```
<|EOT|>
"""

    return tokenize(prompt)


train_dataset = train_dataset.map(formatting_prompts_func, batched=False)
test_dataset = test_dataset.map(formatting_prompts_func, batched=False)

Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

Map:   0%|          | 0/307 [00:00<?, ? examples/s]

In [18]:
train_dataset

Dataset({
    features: ['question_id', 'db_id', 'question', 'evidence', 'SQL', 'difficulty', 'database_schema'],
    num_rows: 1227
})

## Preprocessing Function

We need to prepare the inputs and labels for training. The model expects input in a conversational format.

- Messages: We format each example as a conversation between the user and the assistant.
- Text Generation: apply_chat_template constructs the conversation text.
- Tokenization: We tokenize the full conversation and the assistant’s response separately.
- Labels: We set labels to -100 (ignore index) for the input tokens and only compute loss on the assistant’s response.

5. Apply the Preprocessing Function

In [None]:
# Define the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt",
    pad_to_multiple_of=8,  # Efficient padding for GPU
)

## Training

6. Set Up Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32, # effective batch size
    learning_rate=5e-5,
    bf16=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate every 100 steps
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    group_by_length=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()