In [1]:
!pip install transformers datasets peft bitsandbytes accelerate
!pip uninstall -y transformers accelerate
!pip install -U "transformers>=4.41.0" "accelerate>=0.33.0" "datasets>=2.20.0" evaluate peft bitsandbytes
!pip install -U transformers datasets peft accelerate trl




Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0-

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

model_name = "t5-small"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [3]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"

model = get_peft_model(model, lora_config)


In [4]:
from datasets import load_dataset

dataset = load_dataset(
    "csv",
    data_files={
        "train": "/content/WikiSQL.csv",
        "validation": "/content/Wikisql_val.csv",
        "test": "/content/Wikisql_test.csv"
    }
)

def preprocess(examples):
    inputs = tokenizer(examples["question"], truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(examples["sql"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/56355 [00:00<?, ? examples/s]

Map:   0%|          | 0/8421 [00:00<?, ? examples/s]

Map:   0%|          | 0/15878 [00:00<?, ? examples/s]

In [5]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer


In [6]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    use_legacy_prediction_loop=False,
)

In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")




In [8]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,
    data_collator=data_collator
)





In [11]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [12]:
model.save_pretrained("lora_sql_model")


In [21]:
question = "Retrieve all rolls from the student table."
tables = "ADMIN: USERNAME (PRIMARY KEY) (text); PASS (text); STUDENT: roll (PRIMARY KEY) (int), name (text), dept (text)"

input_text = "Here is schema details: " + tables + " Answer this question as an SQL query: " + question

inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to(device)
outputs = model.generate(**inputs, max_length=512)
sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Predicted SQL:", sql_query)


Predicted SQL: SELECT schema FROM table WHERE Text = ADMIN: USERNAME (PRIMARY KEY) (text); PASS (text); STUDENT: roll (PRIMARY KEY) (int), name (text), dept (text)


In [38]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

base_model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name, device_map="auto")

model = PeftModel.from_pretrained(base_model, "lora_sql_model", device_map="auto")
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=512, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

In [50]:
import pandas as pd
from tqdm import tqdm
import torch

df_test = pd.read_csv("/content/Wikisql_test.csv")

df_test = df_test.iloc[:1000]

inputs = df_test['question'].tolist()
labels = df_test['sql'].tolist()


device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


batch_size = 16
generated_sql_queries = []

for i in tqdm(range(0, len(inputs), batch_size)):
    batch_texts = inputs[i:i+batch_size]

    input_tensor = tokenizer(batch_texts,
                             return_tensors="pt",
                             truncation=True,
                             padding=True,
                             max_length=1024).to(device)

    outputs = model.generate(**input_tensor, max_length=512)


    batch_sql = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    generated_sql_queries.extend(batch_sql)

for i in range(10):
    print(f"Question: {inputs[i]}")
    print(f"Predicted SQL: {generated_sql_queries[i]}")
    print(f"Ground Truth SQL: {labels[i]}")
    print("-"*40)



100%|██████████| 63/63 [00:44<00:00,  1.43it/s]

Question: What is terrence ross' nationality
Predicted SQL: SELECT Nationality FROM table WHERE Nation = terrence ross
Ground Truth SQL: SELECT Nationality FROM table WHERE Player = Terrence Ross
----------------------------------------
Question: What clu was in toronto 1995-96
Predicted SQL: SELECT clu FROM table WHERE Toronto = 1995-96
Ground Truth SQL: SELECT School/Club Team FROM table WHERE Years in Toronto = 1995-96
----------------------------------------
Question: which club was in toronto 2003-06
Predicted SQL: SELECT Club FROM table WHERE Club = toronto 2003-06
Ground Truth SQL: SELECT School/Club Team FROM table WHERE Years in Toronto = 2003-06
----------------------------------------
Question: how many schools or teams had jalen rose
Predicted SQL: SELECT Schools AND Teams FROM table WHERE School = jalen rose
Ground Truth SQL: SELECT COUNT School/Club Team FROM table WHERE Player = Jalen Rose
----------------------------------------
Question: Where was Assen held?
Predicted




In [54]:
import nltk
import evaluate

nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
rouge = evaluate.load("rouge")
decoded_preds = ["\n".join(nltk.sent_tokenize(p.strip())) for p in generated_sql_queries]
decoded_labels = ["\n".join(nltk.sent_tokenize(l.strip())) for l in labels]
rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

print("\nROUGE Score on Test Set:")
print(rouge_score)


ROUGE Score on Test Set:
{'rouge1': np.float64(0.8454442520330798), 'rouge2': np.float64(0.6546006486458233), 'rougeL': np.float64(0.8190328666504755), 'rougeLsum': np.float64(0.8193795895838367)}


In [52]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=991eba8b7790e58e53118f343697f1cb0b08050bf6dcf1df92208ffb7160e39e
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
