<a href="https://colab.research.google.com/github/ahsan0444/NLP-TO-SQL/blob/main/text_to_sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installation and Setup**



In [None]:
!pip install openai transformers datasets peft trl huggingface_hub

In [None]:
import os
from getpass import getpass

# Never store an access token in the notebook itself. Reuse HF_TOKEN when the
# environment already provides it; otherwise prompt for it without echoing the
# value, so it ends up neither in the source nor in the cell output.
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass('Hugging Face access token: ')

In [None]:
from huggingface_hub import login, logout

# Log in to the Hugging Face Hub with the token read from the HF_TOKEN
# environment variable. add_to_git_credential=True presumably also hands the
# token to the local git credential helper — confirm that is wanted here.
login(token=os.environ['HF_TOKEN'], add_to_git_credential=True)

# **Load the Base Model and Tokenizer**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_name = 't5-small'

# Fall back to the CPU when no GPU is available so the notebook still runs
# end to end on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base checkpoint in bfloat16 and place it on the selected device.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_model = original_model.to(device)


In [None]:
# Show the loaded base model as this cell's output.
original_model

# **Prepare Datasets**

In [None]:
from datasets import load_dataset, DatasetDict, interleave_datasets

# Every source is cut into the same three slices of its base split: the first
# 80% for training, the next 10% for testing and the final 10% for validation.
TRAIN_SLICE = '[:80%]'
TEST_SLICE = '[-20%:-10%]'
VAL_SLICE = '[-10%:]'


def _load_tts_split(split):
    """Load one slice of Clinton/Text-to-sql-v1 in the shared column layout.

    Drops the 'source' and 'text' columns and renames the remaining ones to
    the 'question' / 'context' / 'answer' names read by the tokenization step.

    Args:
        split: Split expression passed straight to ``load_dataset``.

    Returns:
        The loaded dataset with the columns removed and renamed.
    """
    return (load_dataset("Clinton/Text-to-sql-v1", split=split)
            .remove_columns(['source', 'text'])
            .rename_columns({'instruction': 'question', 'input': 'context', 'response': 'answer'}))


# Load and split the first dataset
dataset_scc_train = load_dataset("b-mc2/sql-create-context", split=f'train{TRAIN_SLICE}')
dataset_scc_test  = load_dataset("b-mc2/sql-create-context", split=f'train{TEST_SLICE}')
dataset_scc_val   = load_dataset("b-mc2/sql-create-context", split=f'train{VAL_SLICE}')

# Load, preprocess, and split the second dataset
dataset_tts_train = _load_tts_split(f'train{TRAIN_SLICE}')
dataset_tts_test  = _load_tts_split(f'train{TEST_SLICE}')
dataset_tts_val   = _load_tts_split(f'train{VAL_SLICE}')

# Load and split the third dataset
# NOTE(review): these slices come from the dataset's 'validation' split —
# confirm that this is the intended base split.
dataset_ks_train = load_dataset("knowrohit07/know_sql", split=f'validation{TRAIN_SLICE}')
dataset_ks_test  = load_dataset("knowrohit07/know_sql", split=f'validation{TEST_SLICE}')
dataset_ks_val   = load_dataset("knowrohit07/know_sql", split=f'validation{VAL_SLICE}')

# Interleave the datasets to create a combined DatasetDict
# NOTE(review): interleave_datasets presumably stops at the shortest source
# with its default stopping strategy, which would discard the tail of the
# larger datasets — confirm that this truncation is acceptable.
dataset = DatasetDict({
    'train': interleave_datasets([dataset_scc_train, dataset_tts_train, dataset_ks_train]),
    'test': interleave_datasets([dataset_scc_test, dataset_tts_test, dataset_ks_test]),
    'validation': interleave_datasets([dataset_scc_val, dataset_tts_val, dataset_ks_val])
})


# **Tokenization Function**

In [None]:
def tokenize_function(example, tok=None):
    """Tokenize a batch of examples into model inputs and loss-ready labels.

    Args:
        example: Batch mapping holding 'context', 'question' and 'answer'
            lists, as supplied by ``Dataset.map(..., batched=True)``.
        tok: Optional tokenizer to use. Defaults to the notebook-level
            ``tokenizer`` so existing ``map(tokenize_function, ...)`` calls
            keep working unchanged.

    Returns:
        The same mapping with 'input_ids', 'attention_mask' and 'labels'
        added. Padding positions in 'labels' are set to -100.
    """
    if tok is None:
        tok = tokenizer

    # Define the prompt structure
    start_prompt = "Tables:\n"
    middle_prompt = "\n\nQuestion:\n"
    end_prompt = "\n\nAnswer:\n"

    # Create prompts by combining context and question
    prompts = [
        start_prompt + context + middle_prompt + question + end_prompt
        for context, question in zip(example['context'], example['question'])
    ]

    # Tokenize the prompts; keep the attention mask so the padded positions
    # are not attended to by the encoder.
    model_inputs = tok(prompts, padding="max_length", truncation=True)
    example['input_ids'] = model_inputs['input_ids']
    example['attention_mask'] = model_inputs['attention_mask']

    # Tokenize the answers and replace padding with -100, the index ignored by
    # the loss, so padding does not contribute to training.
    targets = tok(example['answer'], padding="max_length", truncation=True)
    pad_id = tok.pad_token_id
    example['labels'] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in targets['input_ids']
    ]

    return example

# Tokenize every split in batches, then drop the raw text columns that the
# tokenization step has already consumed.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['question', 'context', 'answer'])
)


# **Zero-Shot Model Generation**

In [None]:
# Initialize the index for selecting an example
index = 0

# Fetch the test example once and unpack its question, context, and answer
test_example = dataset['test'][index]
question = test_example['question']
context = test_example['context']
answer = test_example['answer']

# Construct the input prompt
prompt = f"""Tables:
{context}

Question:
{question}

Answer:
"""

# Tokenize the prompt and move the tensors to the device that holds the model;
# generate() fails when the inputs and the weights live on different devices.
inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to(original_model.device)

# Generate the model's output
generated_ids = original_model.generate(
    inputs["input_ids"],
    max_new_tokens=200,
)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Print the results
dash_line = '-' * 100
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN ANSWER:\n{answer}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

# **Fine-Tuning Setup with PEFT**

In [None]:
# Reload the tokenizer and a separate bfloat16 copy of the base checkpoint
# that will be fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_name)
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [None]:
# Disable `use_cache` on the model config (intended to improve training speed).
finetuned_model.config.use_cache = False

# NOTE(review): `pretraining_tp` looks like the tensor-parallelism setting used
# by Llama-style configs, not a temperature; presumably it has no effect on a
# T5 model — confirm and consider removing this line.
finetuned_model.config.pretraining_tp = 1

In [None]:
from peft import LoraConfig

# Define the PEFT configuration.
peft_config = LoraConfig(
    # Set the rank of the LoRA projection matrix.
    r=8,

    # Set the alpha parameter for the LoRA projection matrix.
    lora_alpha=16,

    # Set the dropout rate for the LoRA projection matrix.
    lora_dropout=0.05,

    # Set the bias term to "none".
    bias="none",

    # The model is loaded with AutoModelForSeq2SeqLM (T5, encoder-decoder), so
    # the adapter uses the sequence-to-sequence task type, not "CAUSAL_LM".
    task_type="SEQ_2_SEQ_LM"
)

# **Training Configuration**

In [None]:
!pip install bitsandbytes

In [None]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
from trl import SFTTrainer
from peft import get_peft_model
import time


# Define the output directory with a timestamp for uniqueness
output_dir = f'./sql-training-{int(time.time())}'

# Set up the training arguments
# NOTE(review): the model is loaded in bfloat16 while fp16 mixed precision is
# requested here — confirm this combination is intended.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=5e-3,
    num_train_epochs=2,
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=16,   # Batch size per device during evaluation
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy='steps',           # Updated argument name for evaluation strategy
    eval_steps=500,                  # Number of steps between evaluations
    fp16=True,                       # Enable fp16 training
    optim="paged_adamw_32bit",       # Set the optimizer to use
    gradient_accumulation_steps=2,   # Set the number of gradient accumulation steps
    lr_scheduler_type="cosine",      # Set the learning rate scheduler type
    save_strategy="epoch"            # Set the save strategy
)

# Attach the LoRA adapters. The adapter layers are injected into
# finetuned_model itself, so code that later calls finetuned_model directly
# runs with the trained adapters.
peft_model = get_peft_model(finetuned_model, peft_config)

# SFTTrainer targets causal language models and its default collator derives
# the labels from input_ids, which would discard the tokenized SQL answers.
# A plain Trainer with a seq2seq collator keeps the prepared `labels` column
# as the training target and pads labels with -100 so padding is ignored.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)

# Initialize the trainer
trainer = Trainer(
    model=peft_model,                               # LoRA-wrapped model to be trained
    args=training_args,                             # Set the training arguments
    train_dataset=tokenized_datasets['train'],      # Set the training dataset
    eval_dataset=tokenized_datasets['validation'],  # Set evaluation dataset
    data_collator=data_collator                     # Batch inputs and labels for seq2seq
)


In [None]:
# Show the configured trainer object as this cell's output.
trainer

In [None]:
# Start the training process; the value returned by train() is shown as this
# cell's output.
trainer.train()

# **Evaluate the Fine-Tuned Model**

In [None]:
# Select the index of the single example to evaluate (uncomment the line below
# to pick the example located 200 entries before the end of the test split)
index = 0
# index = len(dataset['test']) - 200

# Fetch the selected example once and unpack its question, context, and answer
test_example = dataset['test'][index]
question = test_example['question']
context = test_example['context']
answer = test_example['answer']

# Construct the input prompt
prompt = f"""Tables:
{context}

Question:
{question}

Answer:
"""

# Switch to inference mode so dropout layers are disabled during generation;
# the model may still be in training mode after trainer.train().
finetuned_model.eval()

# Tokenize the prompt and move the tensors to the device that holds the model
inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to(finetuned_model.device)

# Generate the model's output
generated_ids = finetuned_model.generate(
    inputs["input_ids"],
    max_new_tokens=200,
)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Print the results with visual separators
dash_line = '-' * 100
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN ANSWER:\n{answer}\n')
print(dash_line)
print(f'FINE-TUNED MODEL - ZERO SHOT:\n{output}')
