In [3]:
#FINE-TUNING PRE-TRAINED LLAMA ON TPCDSA DATA

import torch
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from datasets import load_dataset
from trl import SFTTrainer
from transformers import pipeline
from transformers import DataCollatorForLanguageModeling



uncomment this line to save the data splits to their respective locations. Running this will replace the existing content in these folders

# train_dataset.save_to_disk(train_datset_dir)
# val_dataset.save_to_disk(validation_dataset_dir)
# test_dataset.save_to_disk(test_dataset_dir)

In [4]:

#DO-NOT EXECUTE THIS AGAIN> THIS WILL REPLACE THE TRAIN_SPLIT SAVED
file_path = "./dataset/sqlize-finetuned-dataset.json"
train_dataset_dir ="./dataset/dataset-splits/train-split/"
validation_dataset_dir ="./dataset/dataset-splits/val-split/"
test_dataset_dir ="./dataset/dataset-splits/test-split/"
# Load your JSON dataset into a Dataset object
dataset = load_dataset('json', data_files=file_path)



# train-654, val-218, test-217
# Split each dataset within the DatasetDict
dataset_split = dataset["train"].train_test_split(
    test_size=435,
    shuffle=True,
    seed=42
)

train_dataset=dataset_split["train"]

val_and_test_splits=dataset_split["test"].train_test_split(
    test_size=217,
    shuffle=True,
    seed=42
)

val_dataset = val_and_test_splits["train"]
test_dataset = val_and_test_splits["test"]


# Print the sizes of the splits
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")



# train_dataset.save_to_disk(train_datset_dir)
# val_dataset.save_to_disk(validation_dataset_dir)
# test_dataset.save_to_disk(test_dataset_dir)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train dataset size: 654
Validation dataset size: 218
Test dataset size: 217


In [5]:
from datasets import load_from_disk
train_splits=load_from_disk('./dataset//dataset-splits/train-split')
val_splits=load_from_disk('./dataset/dataset-splits/val-split')
test_splits=load_from_disk('./dataset/dataset-splits/test-split')


In [6]:
test_splits[0]

{'input': '\n[SYSTEM]:"You are an expert Text-to-SQL generator assistant. Your goal is to provide correct SQL queries to the given text description. Your output only contains the SQL code. No explanation or introductory sentences surrounding the SQL response is needed. You are given schema information. Here is the schema information: \n<tableName>item</tableName>\n<columns>i_item_sk,  i_item_id,  i_rec_start_date,  i_rec_end_date,  i_item_desc,  i_current_price,  i_wholesale_cost,  i_brand_id,  i_brand,  i_class_id,  i_class,  i_category_id,  i_category,  i_manufact_id,  i_manufact,  i_size,  i_formulation,  i_color,  i_units,  i_container,  i_manager_id,  i_product_name</columns>\n<tableName>store_sales</tableName>\n<columns>ss_sold_date_sk,  ss_sold_time_sk,  ss_item_sk,  ss_customer_sk,  ss_cdemo_sk,  ss_hdemo_sk,  ss_addr_sk,  ss_store_sk,  ss_promo_sk,  ss_ticket_number,  ss_quantity,  ss_wholesale_cost,  ss_list_price,  ss_sales_price,  ss_ext_discount_amt,  ss_ext_sales_price,  

In [8]:

#combine base and pretrained model - since pre-training using LORA is like adding new weights to the base model, that is why you need to merge the adapter and base model
# Reload model in FP16 and merge it with LoRA weights
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "../models/7B/output"
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    quantization_config=quantization_config,
    device_map='auto'
    )



new_model_id = "../models/llama-2-7b-pretrained"
new_model = AutoModelForCausalLM.from_pretrained(
    new_model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    quantization_config=quantization_config,
    device_map='auto'
   )


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
model = PeftModel.from_pretrained(base_model, new_model_id)

In [10]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,device="cuda")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


In [11]:
# Define a DataCollator for Language Modeling to collate and prepare data
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Set this to True if you are fine-tuning for masked language modeling
)

In [12]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [14]:
# The model that you want to train from the Hugging Face hub


# The instruction dataset to use


# Fine-tuned model name ( fine-tuned on tpcds data)
new_model = "../models/llama-2-7b-finetuned-text2SQL"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./finetuning-results"

# Number of training epochs
num_train_epochs = 5

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [15]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [16]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [17]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_splits,
    eval_dataset= val_splits,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
    data_collator=data_collator, 

)




Map:   0%|          | 0/218 [00:00<?, ? examples/s]

In [18]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


KeyboardInterrupt: 

In [None]:
trainer.state.log_history

#writing training loss to a file
import json

with open('./finetuning-results/training_loss.json',"w") as file:
    file.write(json.dumps(trainer.state.log_history,indent=4))

In [None]:
#plotting graph for loss

log_path = './finetuning-results/training_loss.json'

import json 
import matplotlib.pyplot as plt
with open(log_path,"r") as json_file:
    data= json.load(json_file)

print(data)

steps  = [entry["step"] for entry in data if "loss" in entry]
print(data[0]["loss"])
losses  = [entry["loss"] for entry in data if "loss" in entry]

plt.figure(figsize=(10,8))
plt.plot(steps,losses,label='TrainingLoss')
plt.xlabel('Step', weight="bold")
plt.ylabel('Training Loss',weight="bold")
plt.title('fine-tuning on novel TPCDS-dataset',weight="bold")
plt.legend()
plt.grid(True)
plt.savefig('pre-training_loss_plt.pdf',format="pdf")
plt.show()

In [14]:
#INFERENCE MODE -finetuned model

#combine base and pretrained model - since pre-training using LORA is like adding new weights to the base model, that is why you need to merge the adapter and base model
# Reload model in FP16 and merge it with LoRA weights
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "../models/7B/output"
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    quantization_config=quantization_config,
    device_map='auto'
    )



new_model_id = "llama-2-7b-finetuned-text2SQL"
new_model = AutoModelForCausalLM.from_pretrained(
    new_model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    quantization_config=quantization_config,
    device_map='auto'
   )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
model = PeftModel.from_pretrained(base_model, new_model_id)


In [16]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,device="cuda")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


In [16]:
#sample inferene running

encoding = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config,
      
      
    
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


[SYSTEM]:"You are an expert Text-to-SQL generator assistant. Your goal is to provide correct SQL queries to the given text description. Your output only contains the SQL code. No explanation or introductory sentences surrounding the SQL response is needed. You are given schema information. Here is the schema information: 
<tableName>web_sales</tableName>
<columns>ws_sold_date_sk,  ws_sold_time_sk,  ws_ship_date_sk,  ws_item_sk,  ws_bill_customer_sk,  ws_bill_cdemo_sk,  ws_bill_hdemo_sk,  ws_bill_addr_sk,  ws_ship_customer_sk,  ws_ship_cdemo_sk,  ws_ship_hdemo_sk,  ws_ship_addr_sk,  ws_web_page_sk,  ws_web_site_sk,  ws_ship_mode_sk,  ws_warehouse_sk,  ws_promo_sk,  ws_order_number,  ws_quantity,  ws_wholesale_cost,  ws_list_price,  ws_sales_price,  ws_ext_discount_amt,  ws_ext_sales_price,  ws_ext_wholesale_cost,  ws_ext_list_price,  ws_ext_tax,  ws_coupon_amt,  ws_ext_ship_cost,  ws_net_paid,  ws_net_paid_inc_tax,  ws_net_paid_inc_ship,  ws_net_paid_inc_ship_tax,  ws_net_profit</colum

In [18]:
generation_config = base_model.generation_config
generation_config.max_new_tokens = 2048
#generation_config.temperature = 0.7
#generation_config.top_p = 0.7
#generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [21]:
#generating responses for all test dataset
import os
import time
query_time=""
for queryIdx in range(len(test_splits)):
    start=time.time()
    print(f"started infernece for {queryIdx+1}")
    with torch.inference_mode():
        encoding = tokenizer(test_splits[queryIdx]["text"], return_tensors="pt").to("cuda")
        outputs = model.generate(input_ids = encoding.input_ids, attention_mask = encoding.attention_mask,generation_config = generation_config) 
        query_time+= str(time.time() - start)  +"\n"
        print(f"inference finished for query{queryIdx+1}")
        with open(f"./test-results/llm-results/query_{queryIdx+1}.txt","w") as response_file:
            response_file.write(tokenizer.decode(outputs[0], skip_special_tokens=True))
            response_file.close()
        print("completed writing response ot a file")

with open(f"./test-results/querytime/query-time.txt","w") as time_file:
            time_file.write(query_time)
            time_file.close()



    
    


started infernece for 1
inference finished for query1
completed writing response ot a file
started infernece for 2
inference finished for query2
completed writing response ot a file
started infernece for 3
inference finished for query3
completed writing response ot a file
started infernece for 4
inference finished for query4
completed writing response ot a file
started infernece for 5
inference finished for query5
completed writing response ot a file
started infernece for 6
inference finished for query6
completed writing response ot a file
started infernece for 7
inference finished for query7
completed writing response ot a file
started infernece for 8
inference finished for query8
completed writing response ot a file
started infernece for 9
inference finished for query9
completed writing response ot a file
started infernece for 10
inference finished for query10
completed writing response ot a file
started infernece for 11
inference finished for query11
completed writing response ot a 

In [20]:
#this code randomly selects 99 queries from llama results (random sampling) to execute on db

import os
import shutil
import random

# Paths for source and destination folders
src_folder = './test-results/llm-responses'
dest_folder = './test-results/sampled-queries'

# Ensure the destination folder exists
os.makedirs(dest_folder, exist_ok=True)

# Generate a set of unique random numbers
random_numbers = set(random.sample(range(1, 218), k=99))  # Adjust 'k' for the number of files you want to copy

# Copy files based on generated numbers
for number in random_numbers:
    file_name = f'query_{number}.txt'
    src_path = os.path.join(src_folder, file_name)
    dest_path = os.path.join(dest_folder, file_name)

    # Copy the file from the source to the destination
    shutil.copy(src_path, dest_path)

print(f"Files successfully copied to {dest_folder}")


Files successfully copied to ./test-results/sampled-queries


In [3]:
#execute randomly sampled 99 queries on duckdb


import duckdb
import os
import time

# Define the connection parameters
query_path='./test-results/sampled-queries/'
query_result='./test-results/db-results/llm-resp-db-results/results/'
query_time_file='./test-results/db-results/llm-resp-db-results/querytime/log.txt'

# Connect to the DB 
db_con = duckdb.connect()

# Load the TPCDS database
db_con.execute("IMPORT DATABASE '/workspace/data/duckdb/build/release/tpcdssf100'")

print("Loading of TPCDS DB complete.")

# Create a folder to store query results
os.makedirs(query_result, exist_ok=True)

query_files = [f for f in os.listdir(query_path) if os.path.isfile(os.path.join(query_path, f))]
query_files = sorted(query_files)
query_exec_time=""
# Iterate through the queries and execute them
for query_file in query_files:
    print("Executing ", query_file)
    # extract query number from the query file name.
    query_name = os.path.splitext(query_file)[0]

    # Read the query from the file
    with open(os.path.join(query_path, query_file), 'r') as f:
        query=f.read()

    # Execute the query and measure execution time
    try:
        start_time = time.time()
        result = db_con.execute(query)
        execution_time = time.time()-start_time
    
        # Save the query result to a file
        output_file_path = query_result + query_name + ".csv"
        with open(output_file_path, "w") as out_file:
            # print("Writing result:", out_file)
            for row in result.fetchall():
                out_file.write(str(row) + "\n")
                # print(row)
            out_file.close()    
    except Exception as e: 
        print(f'Query {query_name} executed in error: {e}')
    query_exec_time+= str(execution_time)+"\n"
        
with open(query_time_file, "a") as time_file:
    time_file.write(query_exec_time)

    print(f'Query {query_name} executed in {execution_time:.2f} seconds')

# Close the database connection
db_con.close()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Loading of TPCDS DB complete.


In [5]:
#execute all sampled queries's golden answer on duckdb for comparison


import duckdb
import os
import time
import re
from datasets import load_from_disk

# Define the connection parameters
query_path='./test-results/sampled-queries/'
query_result='./test-results/db-results/golden-query-db-results/results/'
query_time_file='./test-results/db-results/golden-query-db-results/querytime/log.txt'

#load test splits as each of the sample has golden truth 
test_splits=load_from_disk('./dataset-splits/test-split')
# Connect to the DB 
db_con = duckdb.connect()

# Load the TPCDS database
db_con.execute("IMPORT DATABASE '/workspace/data/duckdb/build/release/tpcdssf100'")

print("Loading of TPCDS DB complete.")

# Create a folder to store query results
os.makedirs(query_result, exist_ok=True)

query_files = [f for f in os.listdir(query_path) if os.path.isfile(os.path.join(query_path, f))]
query_files = sorted(query_files)
query_exec_time=""


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Loading of TPCDS DB complete.


In [6]:

for query_file in query_files:
    print("Executing ", query_file)
    # extract query number from the query file name.
    query_name = os.path.splitext(query_file)[0]
    #regex to extract query number from query_10.txt 
    query_number=int(re.search(r'\d+',query_name).group())
    # print(query_name)
    # print("query_number",query_number-1)


    sql_query=test_splits[query_number-1]["output"]
    # print("sql_query",sql_query)
    # Execute the query and measure execution time
    try:
        start_time = time.time()
        result = db_con.execute(sql_query)
        execution_time = time.time()-start_time
    
        # Save the query result to a file
        output_file_path = query_result + query_name + ".csv"
        with open(output_file_path, "w") as out_file:
            # print("Writing result:", out_file)
            for row in result.fetchall():
                out_file.write(str(row) + "\n")
                # print(row)
            out_file.close()    
    except Exception as e: 
        print(f'Query {query_name} executed in error: {e}')
    query_exec_time+= str(execution_time)+"\n"
        
with open(query_time_file, "a") as time_file:
    time_file.write(query_exec_time)

    print(f'Query {query_name} executed in {execution_time:.2f} seconds')
# Close the database connection
db_con.close()


Executing  query_10.txt
Executing  query_100.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_102.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_105.txt
Executing  query_109.txt
Executing  query_11.txt
Executing  query_110.txt
Executing  query_113.txt
Executing  query_115.txt
Executing  query_117.txt
Executing  query_119.txt
Executing  query_12.txt
Executing  query_120.txt
Executing  query_124.txt
Executing  query_127.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_128.txt
Executing  query_130.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_133.txt
Executing  query_134.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_138.txt
Executing  query_139.txt
Executing  query_141.txt
Executing  query_146.txt
Executing  query_147.txt
Executing  query_148.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_151.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_153.txt
Executing  query_158.txt
Executing  query_16.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_160.txt
Executing  query_162.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_163.txt
Executing  query_164.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_167.txt
Executing  query_169.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_17.txt
Executing  query_170.txt
Executing  query_174.txt
Executing  query_175.txt
Executing  query_179.txt
Executing  query_18.txt
Executing  query_182.txt
Executing  query_184.txt
Executing  query_187.txt
Executing  query_188.txt
Executing  query_190.txt
Executing  query_192.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_194.txt
Executing  query_197.txt
Executing  query_199.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_2.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_200.txt
Executing  query_202.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_204.txt
Executing  query_206.txt
Executing  query_209.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_216.txt
Executing  query_22.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_24.txt
Executing  query_31.txt
Executing  query_33.txt
Executing  query_35.txt
Executing  query_36.txt
Executing  query_38.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_4.txt
Executing  query_43.txt
Executing  query_44.txt
Executing  query_47.txt
Executing  query_5.txt
Executing  query_51.txt
Executing  query_52.txt
Executing  query_53.txt
Executing  query_55.txt
Executing  query_58.txt
Executing  query_59.txt
Executing  query_60.txt
Executing  query_64.txt
Executing  query_67.txt
Executing  query_68.txt
Executing  query_72.txt
Executing  query_73.txt
Executing  query_74.txt
Executing  query_77.txt
Executing  query_78.txt
Executing  query_8.txt
Executing  query_80.txt
Executing  query_81.txt
Executing  query_82.txt
Executing  query_83.txt
Executing  query_84.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_87.txt
Executing  query_88.txt


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Executing  query_9.txt
Executing  query_90.txt
Executing  query_92.txt
Executing  query_93.txt
Executing  query_95.txt
Executing  query_97.txt
Executing  query_99.txt
Query query_99 executed in 0.66 seconds


In [21]:
#sample inferene running on final fine-tuned model
test_prompt='''
'\n\n[SYSTEM]:"You are an expert Text-to-SQL generator assistant. Your goal is to provide correct SQL queries to the given text description. Your output only contains the SQL code. No explanation or introductory sentences surrounding the SQL response is needed. You are given schema information. Here is the schema information: \n<tableName>store_returns</tableName>\n<columns>sr_returned_date_sk,  sr_return_time_sk,  sr_item_sk,  sr_customer_sk,  sr_cdemo_sk,  sr_hdemo_sk,  sr_addr_sk,  sr_store_sk,  sr_reason_sk,  sr_ticket_number,  sr_return_quantity,  sr_return_amt,  sr_return_tax,  sr_return_amt_inc_tax,  sr_fee,  sr_return_ship_cost,  sr_refunded_cash,  sr_reversed_charge,  sr_store_credit,  sr_net_loss</columns>\n<tableName>date_dim</tableName>\n<columns>d_date_sk,  d_date_id,  d_date,  d_month_seq,  d_week_seq,  d_quarter_seq,  d_year,  d_dow,  d_moy,  d_dom,  d_qoy,  d_fy_year,  d_fy_quarter_seq,  d_fy_week_seq,  d_day_name,  d_quarter_name,  d_holiday,  d_weekend,  d_following_holiday,  d_first_dom,  d_last_dom,  d_same_day_ly,  d_same_day_lq,  d_current_day,  d_current_week,  d_current_month,  d_current_quarter,  d_current_year</columns>\n<tableName>store</tableName>\n<columns>s_store_sk,  s_store_id,  s_rec_start_date,  s_rec_end_date,  s_closed_date_sk,  s_store_name,  s_number_employees,  s_floor_space,  s_hours,  s_manager,  s_market_id,  s_geography_class,  s_market_desc,  s_market_manager,  s_division_id,  s_division_name,  s_company_id,  s_company_name,  s_street_number,  s_street_name,  s_street_type,  s_suite_number,  s_city,  s_county,  s_state,  s_zip,  s_country,  s_gmt_offset,  s_tax_percentage</columns>\n<tableName>customer</tableName>\n<columns>c_customer_sk,  c_customer_id,  c_current_cdemo_sk,  c_current_hdemo_sk,  c_current_addr_sk,  c_first_shipto_date_sk,  c_first_sales_date_sk,  c_salutation,  c_first_name,  c_last_name,  c_preferred_cust_flag,  c_birth_day,  c_birth_month,  c_birth_year,  c_birth_country,  c_login,  c_email_address,  c_last_review_date_sk</columns>\n. Here are the 5 critical rules for the interactions you must abide: <rules> 1. Do not wrap the generated SQL code within SQL code markdown format. Also, do not include the SQL keyword in the beginning of the response. 2. If I don\'t tell you to find the limited set of results, limit to 100. 3. Only use table and columns from the list provided 4. When performing aliasing, make sure to refer the aliased tables as alias.column_name and not as alias_column_name. 5. For US state names, use abbreviated forms. For example, for South Dakota state, use SD.</rules> \n\n"Here is the user question:"[/SYSTEM]\n[HUMAN]: For the state of South Dakota in the year 2000, identify the first 100 customers, sorted by their IDs, whose returns are notably higher, exceeding the store\'s average by more than 20%, indicating a trend of higher returns.\n[/HUMAN]\n\n
'''
encoding = tokenizer(test_prompt, return_tensors="pt").to("cuda")
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config,
      
      
    
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
with open(f"./test-q1-SD.txt","w") as file:
  file.write(str(tokenizer.decode(outputs[0], skip_special_tokens=True)))



'

[SYSTEM]:"You are an expert Text-to-SQL generator assistant. Your goal is to provide correct SQL queries to the given text description. Your output only contains the SQL code. No explanation or introductory sentences surrounding the SQL response is needed. You are given schema information. Here is the schema information: 
<tableName>store_returns</tableName>
<columns>sr_returned_date_sk,  sr_return_time_sk,  sr_item_sk,  sr_customer_sk,  sr_cdemo_sk,  sr_hdemo_sk,  sr_addr_sk,  sr_store_sk,  sr_reason_sk,  sr_ticket_number,  sr_return_quantity,  sr_return_amt,  sr_return_tax,  sr_return_amt_inc_tax,  sr_fee,  sr_return_ship_cost,  sr_refunded_cash,  sr_reversed_charge,  sr_store_credit,  sr_net_loss</columns>
<tableName>date_dim</tableName>
<columns>d_date_sk,  d_date_id,  d_date,  d_month_seq,  d_week_seq,  d_quarter_seq,  d_year,  d_dow,  d_moy,  d_dom,  d_qoy,  d_fy_year,  d_fy_quarter_seq,  d_fy_week_seq,  d_day_name,  d_quarter_name,  d_holiday,  d_weekend,  d_following_holida

: 