In [6]:
!pip install -q -U accelerate=='0.25.0' peft=='0.7.1' bitsandbytes=='0.41.3.post2' trl=='0.7.4'
!pip install -q git+https://github.com/huggingface/transformers.git@main accelerate 

In [7]:
import os
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType 
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import pandas as pd

In [8]:
class TrainerConfig:
    """Single place for every tunable used by this fine-tuning notebook.

    The attributes are grouped by the component that consumes them: model
    identifiers, LoRA settings, bitsandbytes quantization settings,
    ``TrainingArguments`` values and ``SFTTrainer`` values.
    """

    def __init__(self):
        # Model parameters
        self.tokenizer = "codellama/CodeLlama-7b-Instruct-hf"
        self.base_model = "/kaggle/input/fine-tuned-part-1"
        self.new_model = "ContextSQL-7b"

        # QLoRA parameters
        self.lora_r = 64
        self.lora_alpha = 16
        self.lora_dropout = 0.05
        self.bias = "none"

        # bitsandbytes parameters
        self.use_4bit = True
        # Name of a torch dtype attribute; resolved later with getattr(torch, ...).
        self.bnb_4bit_compute_dtype = "float16"
        self.bnb_4bit_quant_type = "nf4"
        self.use_nested_quant = False

        # TrainingArguments parameters
        self.output_dir = "./results"
        self.num_train_epochs = 1
        self.fp16 = True
        self.bf16 = False
        self.per_device_train_batch_size = 1
        self.per_device_eval_batch_size = 1
        self.gradient_accumulation_steps = 1
        self.gradient_checkpointing = True
        self.max_grad_norm = 0.3
        self.learning_rate = 2e-6
        self.weight_decay = 0.001
        self.optim = "paged_adamw_32bit"
        self.lr_scheduler_type = "cosine"
        self.max_steps = -1
        self.warmup_ratio = 0.03
        self.group_by_length = True
        self.save_steps = 0
        self.logging_steps = 50

        # SFT parameters
        self.max_seq_length = 1024
        self.packing = False
        # Place the whole model on the current CUDA device. Fall back to "cpu"
        # when CUDA is unavailable so that merely building the config (e.g. to
        # inspect or edit settings) does not raise on a machine without a GPU.
        if torch.cuda.is_available():
            self.device_map = {"": torch.cuda.current_device()}
        else:
            self.device_map = {"": "cpu"}

config = TrainerConfig()

In [9]:
def parse_context(context):
    """Convert a string of ``CREATE TABLE`` statements into compact summaries.

    Args:
        context: One or more ``CREATE TABLE <name> (<columns>)`` statements
            separated by ``;``.

    Returns:
        A list with one ``"(TABLE: <name> COLUMNS: <columns>)"`` string per
        statement, in input order. Empty fragments (for example after a
        trailing ``;``) are skipped.

    Raises:
        IndexError: If a non-empty statement has no ``(`` column list or no
            token before it. The exception type is kept from the original
            implementation so existing callers see the same failure mode.
    """
    parsed_statements = []

    # Split on ';' and strip, so both "a; b" and "a;b" separate correctly.
    for fragment in context.split(';'):
        statement = fragment.strip()
        if not statement:
            continue

        head, paren, tail = statement.partition('(')
        if not paren:
            raise IndexError(f"no column list found in statement: {statement!r}")

        # The table name is the last token before the opening parenthesis; this
        # also works when the name is written flush against it ("t(a INT)").
        table_name = head.split()[-1]

        # Cut at the LAST ')' so nested parentheses such as VARCHAR(20) do not
        # truncate the column list. With no ')' at all, keep the whole tail.
        closing = tail.rfind(')')
        columns_part = tail if closing == -1 else tail[:closing]

        parsed_statements.append(f"(TABLE: {table_name} COLUMNS: {columns_part})")

    return parsed_statements

In [10]:
# Load the SQL dataset and reduce each row's schema context to one summary string.
data = pd.read_json('/kaggle/input/training-data/training_data.json')
# NOTE(review): only the FIRST parsed table is kept per row, so rows whose
# context declares several tables lose the others. Kept as-is so the training
# format does not change; confirm this truncation is intentional.
data['context'] = data['context'].apply(lambda raw_context: parse_context(raw_context)[0])
dataset = Dataset.from_pandas(data)

# Resolve the configured dtype name (e.g. "float16") to the torch dtype object.
compute_dtype = getattr(torch, config.bnb_4bit_compute_dtype)

# Quantization settings. The resolved `compute_dtype` is passed (not the raw
# string) so the quantization config and the later GPU check use the same value.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=config.use_4bit,
    bnb_4bit_quant_type=config.bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=config.use_nested_quant,
)

In [11]:
# Fine-tune on one half of the dataset, then on the other half.
# `Dataset.select` slices by row index directly, avoiding a round trip through
# a Python dict and a pandas DataFrame for each half.
halfway = len(dataset) // 2
one = dataset.select(range(halfway))
two = dataset.select(range(halfway, len(dataset)))

In [12]:
# When training with a float16 compute dtype on a 4-bit model, tell the user if
# the GPU's major compute capability (>= 8) means bfloat16 could be used instead.
if compute_dtype == torch.float16 and config.use_4bit:
    capability_major = torch.cuda.get_device_capability()[0]
    if capability_major >= 8:
        banner = "=" * 80
        for message in (
            banner,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            banner,
        ):
            print(message)

In [13]:
# Load the base model with the 4-bit quantization settings built earlier.
# Device placement and dtype come from the shared settings (`config.device_map`,
# `compute_dtype`) instead of separate hard-coded values, so the load dtype
# matches the float16 compute dtype and fp16 training flag configured above.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model,
    quantization_config=bnb_config,
    device_map=config.device_map,
    torch_dtype=compute_dtype,
    trust_remote_code=True
)

# disabled during training - need on for inference
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [14]:
# check model: echo the module tree as the cell output (bare last expression)
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
 

In [15]:
# Tokenizer: loaded from the tokenizer id in the config; the EOS token is
# reused as the padding token and padding is applied on the left.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# LoRA adapter settings. Every tunable, including `bias`, is read from `config`
# so that editing the config cell is enough to change the adapter.
peft_config = LoraConfig(
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    r=config.lora_r,
    bias=config.bias,
    task_type=TaskType.CAUSAL_LM,
    # Adapters are attached to the four attention projection layers.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [16]:
def formatting_prompts_func(example):
    """Render a batch of examples into training prompt strings.

    Args:
        example: Mapping with parallel sequences under the keys ``'input'``,
            ``'context'`` and ``'output'``.

    Returns:
        A list with one prompt per entry of ``example['context']``, each laid
        out as Question / Context / Answer sections.
    """
    prompts = []
    for idx, schema in enumerate(example['context']):
        question = example['input'][idx]
        answer = example['output'][idx]
        # The leading newline, the 8-space indents and the "\n \n" separators
        # are part of the prompt format and are reproduced exactly.
        prompts.append(
            f"\n        ### Question: {question}\n \n"
            f"        ### Context: {schema}\n \n"
            f"        ### Answer: {answer}"
        )
    return prompts

In [17]:
# Training arguments: each name below is both a TrainingArguments keyword and
# an attribute of `config`, so the values are forwarded by name.
# NOTE(review): config.gradient_checkpointing and
# config.per_device_eval_batch_size are defined but not forwarded here —
# confirm that is intentional.
forwarded_fields = (
    "output_dir",
    "num_train_epochs",
    "per_device_train_batch_size",
    "gradient_accumulation_steps",
    "optim",
    "save_steps",
    "logging_steps",
    "learning_rate",
    "weight_decay",
    "fp16",
    "bf16",
    "max_grad_norm",
    "max_steps",
    "warmup_ratio",
    "group_by_length",
    "lr_scheduler_type",
)
training_arguments = TrainingArguments(
    **{field: getattr(config, field) for field in forwarded_fields},
    report_to="wandb",
)

# Supervised fine-tuning trainer: trains on the second half of the dataset
# (`two`) with the LoRA config, prompt formatter and tokenizer defined earlier.
sft_settings = dict(
    model=model,
    train_dataset=two,
    peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    max_seq_length=config.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=config.packing,
)
trainer = SFTTrainer(**sft_settings)

  0%|          | 0/40 [00:00<?, ?ba/s]



In [18]:
# train model: run the fine-tuning loop configured on `trainer` above
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
50,2.3161
100,2.3256
150,2.333
200,2.3889
250,2.2964
300,2.3707
350,2.3716
400,2.328
450,2.2663
500,2.2436


TrainOutput(global_step=39289, training_loss=0.7761403638470816, metrics={'train_runtime': 38925.9689, 'train_samples_per_second': 1.009, 'train_steps_per_second': 1.009, 'total_flos': 1.4026972166499533e+17, 'train_loss': 0.7761403638470816, 'epoch': 1.0})

In [19]:
# Persist the trained model under the directory named by config.new_model.
output_name = config.new_model
trainer.model.save_pretrained(output_name)

Application will allow the following:

- If a SQL query has been run successfully before, the application will try to reuse it first. Reuse is based on how similar the new question is to prior questions, and applies only when that similarity is above some threshold.
- Context generation that uses sentence similarity to parse the context and create a better metadata structure.

In [20]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = os.curdir, file_name = 'directory.zip'):
    """Zip every file under ``directory`` and return a download link.

    The archive is written to ``file_name`` inside ``directory``. Entries are
    stored with paths relative to ``directory``. The archive being written is
    itself excluded. The process working directory is left unchanged.

    Args:
        directory: Root folder to archive. Defaults to the current directory.
        file_name: Name of the zip file to create inside ``directory``.

    Returns:
        An ``IPython.display.FileLink`` pointing at the created archive.
    """
    archive_path = os.path.normpath(os.path.join(directory, file_name))
    archive_abs = os.path.abspath(archive_path)

    # The context manager guarantees the archive is finalized and closed even
    # if adding one of the files raises.
    with zipfile.ZipFile(archive_path, mode='w') as zip_ref:
        for folder, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(folder, file)
                # Skip only the archive itself (exact path match), not every
                # file whose name happens to contain `file_name`.
                if os.path.abspath(file_path) == archive_abs:
                    continue
                zip_ref.write(file_path, arcname=os.path.relpath(file_path, directory))

    return FileLink(archive_path)

In [21]:
# Zip the current working directory (the defaults) and show the returned link as the cell output.
zip_dir()