In [1]:
!pip install datasets
!pip install transformers -U  # -U ensures the latest versions
!pip install accelerate -U  # Speed up training
!pip install trl
!pip install bitsandbytes
!pip install peft



In [2]:
import torch

# Check if GPU is available and use it, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
from datasets import load_dataset

# Load dataset for fine-tuning
dataset_name = "ChrisHayduk/Llama-2-SQL-Dataset"
dataset = load_dataset(dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Extract the training set
full_training_dataset = dataset["train"]

# Shuffle the dataset for randomness
shuffled = full_training_dataset.shuffle()  # Adding a seed for reproducibility

# Select the first 1000 samples for fine-tuning
training_dataset = shuffled.select(range(1000))

In [5]:
print(training_dataset)

Dataset({
    features: ['input', 'output'],
    num_rows: 1000
})


#  Configure BitsAndBytes Quantization (4-bit)

In [6]:
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

# Enable 4-bit quantization for efficient memory usage
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_dtype="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

Unused kwargs: ['bnb_4bit_quant_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


# Load Pretrained LLaMA-2 Model with Quantization

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "NousResearch/Llama-2-7b-hf"

# Load model with quantization settings and auto device mapping
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

# Enable caching for improved inference speed
model.config.use_cache = True

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set padding token and padding side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
def construct_datapoint(x):
    """Combines input and output into a single string and tokenizes."""
    combined = x["input"] + x["output"]
    return tokenizer(combined, padding=True, truncation=True)

# Apply tokenization to the dataset
training_dataset = training_dataset.map(construct_datapoint)

print(training_dataset)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 1000
})


#  Configure LoRA (Low-Rank Adaptation) for Efficient Fine-Tuning

In [9]:
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)

# Define LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

# Prepare model for training with LoRA and 4-bit quantization
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)  # Freezes non-attention layers


# Modify generation parameters
generation_configuration = model.generation_config
generation_configuration.pad_token_id = tokenizer.eos_token_id
generation_configuration.eos_token_id = tokenizer.eos_token_id
generation_configuration.max_new_tokens = 256
generation_configuration.temperature = 0.7
generation_configuration.top_p = 0.9
generation_configuration.do_sample = True

In [10]:
def generate(prompt):
    """Generates text based on a given prompt using the fine-tuned model."""

    generation_configuration.max_new_tokens = 20  # Limit response length

    encoded = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt").to(device)

    with torch.inference_mode():
        out = model.generate(input_ids=encoded, generation_config=generation_configuration, repetition_penalty=2.0)

    string_decoded = tokenizer.decode(out[0], clean_up_tokenization_spaces=True)
    print(string_decoded)

In [11]:
# Example usage
generate("Today I want to")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<s> Today I want to share with you my 5 favorite products that help me stay beautiful and healthy. nobody wants the


In [12]:
import transformers

# Define training arguments
train_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # Simulates a larger batch size
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,  # Use mixed precision training
    optim="paged_adamw_8bit",  # Optimizer optimized for quantized models
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    output_dir="fine_tuning"
)

In [13]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Initialize Trainer
trainer = Trainer(
    model=model,
    train_dataset=training_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    args=train_arguments
)

# Disable cache for training
model.config.use_cache = False

In [14]:
# Start training
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33myanojan96[0m ([33myanojan96-western-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=250, training_loss=0.6354869995117187, metrics={'train_runtime': 1463.1965, 'train_samples_per_second': 0.683, 'train_steps_per_second': 0.171, 'total_flos': 5031556363591680.0, 'train_loss': 0.6354869995117187, 'epoch': 1.0})

In [17]:
# Shuffle the evaluation dataset
evaluation_dataset = dataset["eval"].shuffle()

# Pick a sample SQL question
sample_sql_question = evaluation_dataset[0]["input"]
correct_answer = evaluation_dataset[0]["output"]

# Generate a response
generate(sample_sql_question)

<s> Below is an instruction that describes a SQL generation task, paired with an input that provides further context about the available table schemas. Write SQL code that appropriately answers the request.

### Instruction:
What was the smallest crowd of vfl park?

### Input:
CREATE TABLE table_name_83 (crowd INTEGER, venue VARCHAR)

### Response:  SELECT MIN(venue), COUNT(*) FROM CREATETABLEtable-2951704


In [18]:
# Display the correct answer
print("Correct Answer:", correct_answer)

Correct Answer: SELECT MIN(crowd) FROM table_name_83 WHERE venue = "vfl park"
