# Natural Language to SQL

In this notebook, we benchmark the opensource code generation large language models such as CodeLLAMA, Codestral,etc., by fine tuning the models to generate SQL queries from natural language and evaluate them on metrics such as Accuracy, MMLU, etc. The best model will be integrated to the app [GenoQuery](https://github.com/parthasarathydNU/protein-data-nlq).

In [None]:
# # Install Pytorch & other libraries
# !pip install torch tensorboard

# # Install Hugging Face libraries
# !pip install --upgrade \
#   transformers \
#   datasets \
#   accelerate \
#   evaluate \
#   bitsandbytes \
#   trl \
#   peft

# # install peft & trl from github
# !pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
# !pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade



In [1]:
# Install Pytorch & other libraries
!pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
!pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  # "trl==0.7.10" # \
  # "peft==0.7.1" \

# install peft & trl from github
!pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

Collecting git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
  Cloning https://github.com/huggingface/trl (to revision a3c5b7178ac4f65569975efadc97db2f3749c65e) to /tmp/pip-req-build-5yyzw94k
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl /tmp/pip-req-build-5yyzw94k
  Running command git rev-parse -q --verify 'sha^a3c5b7178ac4f65569975efadc97db2f3749c65e'
  Running command git fetch -q https://github.com/huggingface/trl a3c5b7178ac4f65569975efadc97db2f3749c65e
  Running command git checkout -q a3c5b7178ac4f65569975efadc97db2f3749c65e
  Resolved https://github.com/huggingface/trl to commit a3c5b7178ac4f65569975efadc97db2f3749c65e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f
  Cloning https://github.co

In [2]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32'

# Set the Hugging Face access token as an environment variable
os.environ['HUGGINGFACE_TOKEN'] = 'abc'

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

model_id = "meta-llama/Llama-2-7b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.getenv('HUGGINGFACE_TOKEN'))
model = AutoModelForCausalLM.from_pretrained(
                                              model_id,
                                              quantization_config=bnb_config,
                                              device_map={"":0},
                                              _attn_implementation="eager",
                                              use_auth_token=os.getenv('HUGGINGFACE_TOKEN'))

print("Tokenizer : ", tokenizer)

 # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizer :  LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [6]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 19988480 || all params: 3520417792 || trainable%: 0.5677871542810337


# Dataset Loader

In [None]:
# !conda install -c conda-forge pyarrow -y

Channels:
 - conda-forge
 - rapidsai
 - nvidia
 - defaults
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - pyarrow


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    aws-c-auth-0.7.20          |       h5f1c8d9_0         103 KB  conda-forge
    aws-c-cal-0.6.12           |       h2ba76a8_0          45 KB  conda-forge
    aws-c-common-0.9.17        |       h4ab18f5_0         222 KB  conda-forge
    aws-c-compression-0.2.18   |       h36a0aea_4          19 KB  conda-forge
    aws-c-event-stream-0.4.2   |      h161de36_10          53 KB  conda-forge
    aws-c-http-0.8.1           |      h63f54a0_13         191 KB  conda-forge
    aws-c-io-0.14.8            |       h96d4d28_0         154 KB  conda-forge
    aws-c-mqtt-0.10.4          |       hcc7299c

In [9]:
from datasets import load_dataset

# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
    return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle().select(range(12500))

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500)

print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_29163303_1 (partner VARCHAR, year VARCHAR, surface VARCHAR)', 'role': 'system'}, {'content': "Who was Bob Bryan's partner/s on a hard surface in 2003?", 'role': 'user'}, {'content': 'SELECT partner FROM table_29163303_1 WHERE year = 2003 AND surface = "Hard"', 'role': 'assistant'}]


Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

1188896

In [10]:
from datasets import load_dataset

# Load jsonl data from disk
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

# Model Training

In [None]:
# import transformers

# # needed for gpt-neo-x tokenizer
# tokenizer.pad_token = tokenizer.eos_token

# trainer = transformers.Trainer(
#     model=model,
#     train_dataset=dataset["train"],
#     args=transformers.TrainingArguments(
#         per_device_train_batch_size=1,
#         gradient_accumulation_steps=4,
#         warmup_steps=2,
#         max_steps=10,
#         learning_rate=2e-4,
#         fp16=True,
#         logging_steps=1,
#         output_dir="outputs",
#         optim="paged_adamw_8bit"
#     ),
#     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
# )
# model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
# trainer.train()

In [11]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="llama-7b-text-to-sql", # directory to save and repository id
    num_train_epochs=2,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=5,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=False,                              # use bfloat16 precision
    tf32=False,                              # use tf32 precision
    fp16=True,
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [12]:
from trl import SFTTrainer

max_seq_length = 3072 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
torch.cuda.empty_cache()

In [14]:
import gc
gc.collect()

13300

In [None]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
# trainer.save_model()
trainer.save_state()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.9548
20,0.5709
30,0.514
40,0.4952
50,0.471
60,0.4323
70,0.4198
80,0.4181
90,0.4123
100,0.4143




In [None]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
# trainer.save_model()
trainer.save_state()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
5,0.9119
10,0.7291
15,0.6552
20,0.5918
25,0.558
30,0.5492
35,0.5378
40,0.5233
45,0.5135
50,0.5153




Step,Training Loss
5,0.9119
10,0.7291
15,0.6552
20,0.5918
25,0.558
30,0.5492
35,0.5378
40,0.5233
45,0.5135
50,0.5153


# Continue Training

In [None]:
!pip install --upgrade trl

Collecting trl
  Using cached trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Using cached trl-0.9.6-py3-none-any.whl (245 kB)
Installing collected packages: trl
  Attempting uninstall: trl
    Found existing installation: trl 0.7.11.dev0
    Uninstalling trl-0.7.11.dev0:
      Successfully uninstalled trl-0.7.11.dev0
Successfully installed trl-0.9.6


In [None]:
# from peft import PeftModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

modelpath = "/kaggle/working/codestral-7b-text-to-sql/checkpoint-99"
tokenizer = AutoTokenizer.from_pretrained(modelpath)

In [None]:

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


In [None]:
# peft_model = PeftModelForCausalLM.from_pretrained(model_id=modelpath, model=model,is_trainable=True)

In [None]:
# peft_model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2888


In [None]:
from peft import PeftModel, PeftConfig

config = PeftConfig.from_pretrained("/kaggle/working/codestral-7b-text-to-sql/checkpoint-99", use_auth_token=os.getenv('HUGGINGFACE_TOKEN'))
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             _attn_implementation="eager",
                                             use_auth_token=os.getenv('HUGGINGFACE_TOKEN'))

# Resize token embeddings to match the tokenizer's vocab size
model.resize_token_embeddings(len(tokenizer))
model.enable_input_require_grads()
model = PeftModel.from_pretrained(model,
                                 "/kaggle/working/codestral-7b-text-to-sql/checkpoint-99",
                                   is_trainable=True,
#                                   ignore_mismatched_sizes=True
                                 )


print(model.print_trainable_parameters())
# set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 20,971,520 || all params: 7,262,720,000 || trainable%: 0.28875572788156506
None


In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="codestral-7b-text-to-sql", # directory to save and repository id
    num_train_epochs=2,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=False,                              # use bfloat16 precision
    tf32=False,                              # use tf32 precision
    fp16=True,
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [None]:
from trl import SFTTrainer

max_seq_length = 3072 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]



In [None]:
trainer.train(resume_from_checkpoint=modelpath)

In [None]:
# trainer.save_model()
trainer.save_state()

## Model Inference

Load the fine tuned model.
* Load the base model
* Add the fine tuned adapter layers

In [None]:
model = PeftModel.from_pretrained(model,
                                 "llama-7b-text-to-sql/checkpoint-100",
                                   is_trainable=False,
#                                   ignore_mismatched_sizes=True
                                 )


pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=200,
)

In [None]:
prompt = <INST>

print(pipe(prompt)[0]['generated_text'])