## Full-Parameter Fine-Tuning Stable-Code 3B on Text-to-SQL task on the BIRD train dataset and evaluating it on mini-dev dataset.

In [2]:
!module load CUDA
!module load cuDNN/8.9.2.26-CUDA-12.1.1

In [None]:
%pip uninstall -y torch

In [None]:
%pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121

In [5]:
import torch

print("torch version:", torch.__version__)
print("CUDA Version:", torch.version.cuda)
print("CUDA Available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Current CUDA Device:", torch.cuda.current_device())
print("Device Name:", torch.cuda.get_device_name(torch.cuda.current_device()))


torch version: 2.4.0+cu121
CUDA Version: 12.1
CUDA Available: True
Number of GPUs: 1
Current CUDA Device: 0
Device Name: NVIDIA A100-PCIE-40GB


In [6]:
import os
os.environ['CUDA_HOME'] = '/cvmfs/hpc.rug.nl/versions/2023.01/rocky8/x86_64/amd/zen3/software/CUDA/12.1.1'
os.environ['PATH'] = f"{os.environ['CUDA_HOME']}/bin:{os.environ['PATH']}"
os.environ['LD_LIBRARY_PATH'] = f"{os.environ['CUDA_HOME']}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}"

In [None]:
%pip install --upgrade  pip
%pip install -U  transformers accelerate datasets deepspeed
%pip install torch --index-url https://download.pytorch.org/whl/cu121

In [None]:
%pip install flash-attn

In [8]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import json
from datasets import load_dataset

### Define the model name (from huggingface)

In [9]:
model_name = "stabilityai/stable-code-3b"

### Loading the Model and Tokenizer
   

In [9]:
import os
os.environ["HF_TOKEN"] = "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"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.config.use_cache = False
model.gradient_checkpointing_enable()

### Loading and Preprocessing the Dataset

In [23]:
dataset = load_dataset("json", data_files="../habrok/dataset.json")
split_dataset = dataset["train"].train_test_split(test_size=0.2)
train_eval_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

train_eval_split = train_eval_dataset.train_test_split(test_size=0.25)
train_dataset = train_eval_split["train"]
eval_dataset = train_eval_split["test"]


print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
test_dataset.save_to_disk("test_dataset")


Train dataset size: 920
Eval dataset size: 307
Test dataset size: 307


Saving the dataset (0/1 shards):   0%|          | 0/307 [00:00<?, ? examples/s]

### Formating the prompts for the train and eval datasets

In [None]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,

        padding="max_length",
        max_length=1024
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
    return result


def formatting_prompts_func(datapoint):
    question = datapoint["question"]
    query = datapoint["SQL"]
    database_schema = datapoint["database_schema"]
    prompt = f"""Given the following SQL tables, your job is to generate the Sqlite SQL query given the user's question.
Put your answer inside the ⁠```sql and ```⁠ tags.
{database_schema}
###
Question: {question}

⁠```sql
{query} ;
```
<|EOT|>
"""

    return tokenize(prompt)


train_dataset = train_dataset.map(formatting_prompts_func, batched=False)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=False)

## Preprocessing Function

We need to prepare the inputs and labels for training. The model expects input in a conversational format.

- Messages: We format each example as a conversation between the user and the assistant.
- Text Generation: apply_chat_template constructs the conversation text.
- Tokenization: We tokenize the full conversation and the assistant’s response separately.
- Labels: We set labels to -100 (ignore index) for the input tokens and only compute loss on the assistant’s response.

5. Apply the Preprocessing Function

In [26]:
# Define the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt",
    pad_to_multiple_of=8,  # Efficient padding for GPU
)

## Training

6. Set Up Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32, # effective batch size
    learning_rate=5e-5,
    bf16=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=10,  # Evaluate every 100 steps
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    group_by_length=True,
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
%pip install numpy

In [None]:
%pip install --upgrade pyarrow datasets numpy

In [31]:
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
10,0.2279,0.082383
20,0.0854,0.070294
30,0.0726,0.06538
40,0.0483,0.06451
50,0.0482,0.064372


TrainOutput(global_step=56, training_loss=0.09117009490728378, metrics={'train_runtime': 463.3844, 'train_samples_per_second': 3.971, 'train_steps_per_second': 0.121, 'total_flos': 2.936010920951808e+16, 'train_loss': 0.09117009490728378, 'epoch': 1.9478260869565216})

In [None]:
%pip install datasets

In [19]:
from datasets import load_from_disk

# Load the dataset from the directory where it was saved
test_dataset = load_from_disk("test_dataset")


In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
import json
import re
import torch

model_name = "stabilityai/stable-code-3b"
b_tokenizer = AutoTokenizer.from_pretrained(model_name)
b_model = AutoModelForCausalLM.from_pretrained(model_name)
# List to store results for all entries
all_results = []

for entry in test_dataset:
    question = entry["question"]
    query = entry["SQL"]
    database_schema = entry["database_schema"]
    
    # Generate the prompt
    prompt = f"""Given the following SQL tables, your job is to generate the Sqlite SQL query given the user's question.
Put your answer inside the ⁠```sql and ```⁠ tags.
{database_schema}
###
Question: {question}

⁠```sql
"""
    input_text = prompt
    inputs = ft_tokenizer(input_text, return_tensors="pt")  # Return PyTorch tensors

    # Step 2: Move model and inputs to the same device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    b_model.to(device)  # Move the model to the GPU if available

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Step 3: Generate the output
    output = b_model.generate(**inputs, max_new_tokens=100)

    # Step 4: Decode the generated tokens into readable text
    generated_text = b_tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract all SQL queries using regex and account for the " ;" at the end
    sql_queries = re.findall(r'```sql\s+(.*?)\s+```', generated_text, re.DOTALL)

    # Check if there is a second query
    if len(sql_queries) >= 2:
        second_sql_query = sql_queries[1].rstrip(' ;')  # Get the second query and strip any trailing " ;"
    else:
        second_sql_query = None  # Handle case where there is no second query

    # Create the dictionary to store the result for this entry
    output_data = {
        "question_id": entry["question_id"],
        "db_id": entry["db_id"],
        "Original SQL": entry["SQL"],
        "Generated SQL": second_sql_query
    }
    
    # Append the result to the list of all results
    all_results.append(output_data)

# Save the list of results as a JSON file
with open("generated_sql_nt_model.json", "w") as json_file:
    json.dump(all_results, json_file, indent=4)