## LoRA fine-tuning of Stable-Code 3B for the Text-to-SQL task: training on the BIRD train dataset and evaluating on the mini-dev dataset

In [None]:
# Load the CUDA toolkit and the matching cuDNN build through environment modules.
# NOTE(review): each `!` line runs as a separate shell command, so these loads
# probably do not persist into the kernel process -- confirm. A later cell sets
# CUDA_HOME / PATH / LD_LIBRARY_PATH explicitly via os.environ.
!module load CUDA
!module load cuDNN/8.9.2.26-CUDA-12.1.1

In [None]:
# Install PyTorch 2.4.0 from the PyTorch wheel index for CUDA 12.1 (cu121).
%pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121

In [27]:
import torch
def cuda_summary():
    """Collect (label, value) pairs describing the PyTorch / CUDA setup.

    The per-device entries are only queried when CUDA is available, because
    ``torch.cuda.current_device()`` raises when no GPU is visible; a diagnostic
    cell should report that situation instead of crashing.

    Returns:
        list[tuple[str, object]]: four rows always (torch version, CUDA version,
        availability, GPU count) plus two device rows when CUDA is available.
    """
    rows = [
        ("torch version:", torch.__version__),
        ("CUDA Version:", torch.version.cuda),
        ("CUDA Available:", torch.cuda.is_available()),
        ("Number of GPUs:", torch.cuda.device_count()),
    ]
    if torch.cuda.is_available():
        device_index = torch.cuda.current_device()
        rows.append(("Current CUDA Device:", device_index))
        rows.append(("Device Name:", torch.cuda.get_device_name(device_index)))
    return rows


# Same output format as before: "<label> <value>" per line.
for label, value in cuda_summary():
    print(label, value)

torch version: 2.4.0+cu121
CUDA Version: 12.1
CUDA Available: True
Number of GPUs: 1
Current CUDA Device: 0
Device Name: NVIDIA A100-PCIE-40GB


In [28]:
import os
# Location of the CUDA 12.1.1 toolkit on the cluster's software stack.
CUDA_HOME = '/cvmfs/hpc.rug.nl/versions/2023.01/rocky8/x86_64/amd/zen3/software/CUDA/12.1.1'


def prepend_to_path_var(name, entry):
    """Put ``entry`` first in the ``os.pathsep``-separated variable ``name``.

    Earlier occurrences of ``entry`` and empty components are dropped, so:
    * re-running the cell does not keep stacking duplicates, and
    * an unset variable does not end up with a trailing separator (an empty
      search-path entry is commonly interpreted as the current directory).

    Args:
        name: Environment variable to update (e.g. ``"PATH"``).
        entry: Directory that must come first in the search path.
    """
    kept = [
        part
        for part in os.environ.get(name, '').split(os.pathsep)
        if part and part != entry
    ]
    os.environ[name] = os.pathsep.join([entry] + kept)


os.environ['CUDA_HOME'] = CUDA_HOME
prepend_to_path_var('PATH', f"{CUDA_HOME}/bin")
prepend_to_path_var('LD_LIBRARY_PATH', f"{CUDA_HOME}/lib64")

In [None]:
%pip install --upgrade  pip
%pip install -U  transformers accelerate datasets deepspeed
%pip install torch --index-url https://download.pytorch.org/whl/cu121

In [None]:
# FlashAttention kernels; the model is later loaded with
# attn_implementation="flash_attention_2", which needs this package.
# NOTE(review): the flash-attn project recommends installing with
# `--no-build-isolation` so the build can see the installed torch -- confirm
# whether that flag is needed in this environment.
%pip install flash-attn

In [None]:

# Install the libraries the fine-tuning code imports: transformers (pinned to
# 4.45.0), peft (LoRA) and accelerate.
%pip install transformers==4.45.0 peft accelerate


In [32]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model
import json
from datasets import load_dataset

In [33]:
# Hugging Face Hub id of the base checkpoint; used below to load both the
# tokenizer and the causal-LM weights.
model_name = "stabilityai/stable-code-3b"

In [34]:
import os
# SECURITY: a literal Hugging Face access token used to be hard-coded on this line.
# Anything stored in a notebook must be treated as leaked: revoke that token in the
# Hugging Face account settings and create a new one.
# The token is now read from the environment; if it is missing, it is requested
# through a hidden prompt so it never lands in the notebook source or its outputs.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass  # local import: only needed for the fallback prompt

    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")


In [None]:
# --- Tokenizer -------------------------------------------------------------
# The end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# --- Base model ------------------------------------------------------------
# bfloat16 weights, FlashAttention-2 attention, automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map='auto',
)

# --- Training-time switches --------------------------------------------------
# Gradient checkpointing is turned on and the generation KV cache is turned off.
# NOTE(review): checkpointing is enabled before the LoRA wrapping in the next
# cell; keep that order unless PEFT's handling of it has been verified.
model.gradient_checkpointing_enable()
model.config.use_cache = False

In [None]:
# LoRA adapter hyper-parameters.
lora_config = LoraConfig(
    r=32,                     # LoRA rank
    lora_alpha=64,            # Scaling factor for LoRA
    lora_dropout=0.01,        # Dropout for LoRA layers
    # Attention projections that receive adapters (adjust to the architecture).
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # Declare the task so get_peft_model builds PEFT's causal-LM wrapper (with an
    # explicit `labels` argument in forward) instead of the generic PeftModel.
    task_type="CAUSAL_LM",
)

# Wrap the base model with LoRA; from here on `model` is the PEFT-wrapped model.
# NOTE: re-running this cell without reloading the base model wraps it again.
model = get_peft_model(model, lora_config)

In [None]:

# Load dataset for training and carve out a held-out evaluation split.
TRAIN_JSON = "../habrok/train_dataset.json"  # JSON file with the training records
EVAL_FRACTION = 0.2                           # share of records held out for evaluation
SPLIT_SEED = 42                               # fixed seed -> same split on every run

dataset = load_dataset("json", data_files=TRAIN_JSON)
# Without a seed the held-out set changed on every kernel restart, which made
# eval_loss curves from different runs incomparable.
split_dataset = dataset["train"].train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")



In [None]:
def tokenize(prompt, max_length=1024):
    """Tokenize one training prompt into fixed-length causal-LM features.

    Args:
        prompt: Full training text to encode.
        max_length: Length, in tokens, that every example is truncated and
            padded to. Defaults to 1024, the value that used to be hard-coded.

    Returns:
        The tokenizer's output with an extra ``labels`` entry that is a copy of
        ``input_ids`` (for causal-LM training the targets are the inputs).
    """
    # NOTE(review): truncation presumably cuts the END of over-long prompts, which
    # is where the SQL answer sits -- consider checking how many examples exceed
    # `max_length`.
    # NOTE(review): padding every example to `max_length` makes all sequences the
    # same length, so length-based batching downstream has nothing to group.
    result = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Independent copy, so later edits to one list cannot affect the other.
    result["labels"] = result["input_ids"].copy()
    return result


def formatting_prompts_func(datapoint):
    """Render one dataset record as a training prompt and tokenize it.

    Reads the ``question``, ``SQL`` and ``database_schema`` fields of
    ``datapoint``, fills them into the instruction template below and returns
    the result of ``tokenize`` on that text.
    """
    user_question, gold_sql, schema_ddl = (
        datapoint[field] for field in ("question", "SQL", "database_schema")
    )
    # NOTE(review): the template contains three U+2060 (WORD JOINER) characters next
    # to the code fences, written here as explicit \u2060 escapes. They look like a
    # copy-paste artifact but are kept because they are part of the training text.
    # NOTE(review): `<|EOT|>` is emitted as plain text; whether the tokenizer treats
    # it as a special token is not visible here -- confirm.
    prompt = f"""Given the following SQL tables, your job is to generate the Sqlite SQL query given the user's question.
Put your answer inside the \u2060```sql and ```\u2060 tags.
{schema_ddl}
###
Question: {user_question}

\u2060```sql
{gold_sql} ;
```
<|EOT|>
"""

    return tokenize(prompt)


# Tokenize both splits record by record (non-batched map). The names are rebound
# to the mapped datasets, which the Trainer consumes below.
train_dataset, eval_dataset = (
    split.map(formatting_prompts_func, batched=False)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Batch builder for causal language modelling (mlm=False): returns PyTorch tensors
# and pads batch length up to a multiple of 8.
# NOTE(review): with mlm=False this collator appears to rebuild `labels` from
# `input_ids` and mask pad-token positions; since pad == eos here, confirm that
# this is the intended treatment of EOS tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    return_tensors="pt",
    mlm=False,
)

In [None]:
# Trainer configuration.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    # 1 sequence per step x 32 accumulation steps = 32 sequences per optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    learning_rate=5e-5,
    bf16=True,
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule. The "best" model restored by
    # load_best_model_at_end can only be one of the saved checkpoints, so with the
    # former save_steps=500 four out of five evaluations could never be selected.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,  # keep equal to (or a multiple of) eval_steps
    save_total_limit=2,
    # Restore the checkpoint with the lowest eval_loss when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    group_by_length=True,
)

In [None]:
# TensorBoard, for viewing training logs.
# NOTE(review): presumably picked up by the Trainer's default reporting -- confirm.
%pip install tensorboard

In [None]:
# Assemble the Trainer from the model, arguments, datasets and collator built above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Manually set the Trainer's `can_return_loss` flag.
# NOTE(review): presumably a workaround so that evaluation reports `eval_loss`
# for the PEFT-wrapped model -- confirm it is still required.
trainer.can_return_loss = True

In [None]:
# Make sure NumPy is installed in the kernel's environment (version not pinned).
%pip install numpy

In [None]:
# Upgrade pyarrow, datasets and NumPy together.
# NOTE(review): `datasets` has already been imported earlier in the notebook, so a
# kernel restart is probably needed before the upgraded versions take effect -- confirm.
%pip install --upgrade pyarrow datasets numpy

In [None]:
# Start training using LoRA fine-tuning; the run is driven by `training_args`.
trainer.train()


In [None]:

# Write the LoRA fine-tuned model and then the tokenizer into the same output
# directory, so the folder can be loaded on its own later.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./lora_finetuned_model")
