In [17]:
# !pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate datasets

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

In [18]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
from huggingface_hub import login

login(token = "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")

In [19]:
import json

with open('tables.json', 'r') as file:
    tables = json.load(file)

data_name = "spider"

training_data = load_dataset(data_name, split="train")
training_data[0]

In [20]:
def process_data_to_model_inputs(batch):
    # Concatenate the prompt, question, and SQL query
    inputs = [f"<s>[INST] What is the SQL code for this query: {q} [/INST] {a}" for q, a in zip(batch['question'], batch['query'])]
    return {"input_text": inputs}
    

# Apply the function to the dataset
processed_dataset = training_data.map(process_data_to_model_inputs, batched=True)


In [21]:
processed_dataset[0]

In [11]:
# Model and tokenizer names
base_model_name = "meta-llama/Llama-2-7b-hf"
refined_model = "llama-2-7b-spider-finetuned" #You can give it your own name

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True, cache_dir=".")
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0},
    cache_dir = "."
)
base_model.config.use_cache = True
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
tokenizer = llama_tokenizer

In [12]:
tokenizer = llama_tokenizer
def tokenize_and_mask_function(examples):
    # Tokenize the inputs
    tokenized_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)

    # Create a mask to prevent the model from predicting the question part
    attention_masks = []
    for input_text in examples["input_text"]:
        # Split the input text into question and SQL parts
        question_part, sql_query = input_text.split("[/INST]")

        # Tokenize the question part only
        tokenized_question = tokenizer(question_part, add_special_tokens=False, padding=False, truncation=False)
        
        # Calculate the length of the tokenized question
        question_length = len(tokenized_question['input_ids'])

        # Create the attention mask with 0s for the question part and 1s for the rest
        attention_mask = [0] * question_length + [1] * (len(tokenized_inputs['input_ids'][0]) - question_length)
        attention_masks.append(attention_mask)

    # Update the tokenized inputs with the new attention masks
    tokenized_inputs["attention_mask"] = attention_masks
    return tokenized_inputs

tokenized_dataset = processed_dataset.map(tokenize_and_mask_function, batched=True)


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset[0]


In [16]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=30,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=3e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
#     report_to="tensorboard"
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="question",
    tokenizer=llama_tokenizer,
    args=train_params
)

# Training
fine_tuning.train()

# Save Model
fine_tuning.model.save_pretrained(refined_model)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


Step,Training Loss
25,2.9983
50,2.6748
75,2.6543
100,2.569
125,2.4407
150,2.4778
175,2.4674
200,2.4489
225,2.4554
250,2.3706






































































In [None]:
!wandb off

In [None]:
# Generate Text
query = "How do I use the OpenAI API?"
text_gen = pipeline(task="text-generation", model=refined_model, tokenizer=llama_tokenizer, max_length=200)
output = text_gen(f"<s>[INST] {query} [/INST]")
print(output[0]['generated_text'])

In [None]:
# import torch
# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# # Load and process the dataset
# data_name = "spider"
# training_data = load_dataset(data_name, split="train")

# def process_data_to_model_inputs(batch):
#     # This time, we are only processing the questions as inputs.
#     inputs = batch['question']
#     targets = batch['query']
#     return {"input_text": inputs, "labels": targets}

# processed_dataset = training_data.map(process_data_to_model_inputs, batched=True)

# # Load tokenizer and model
# base_model_name = "meta-llama/Llama-2-7b-hf"
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir =".")
# tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    # Tokenizing both the inputs and the targets
    model_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["labels"], padding="max_length", truncation=True, max_length=512)["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results_modified",
#     num_train_epochs=1,
#     per_device_train_batch_size=2,  # Reduced batch size
#     gradient_accumulation_steps=2,  # Increased accumulation steps
#     fp16=True,  # Enable mixed precision training
#     logging_strategy="steps",
#     logging_steps=25,
#     save_strategy="steps",
#     save_steps=500,
#     evaluation_strategy="no",
#     load_best_model_at_end=False,
# )

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset.remove_columns(["input_text", "labels"]),
#     tokenizer=tokenizer
# )

# # Start training
# trainer.train()


In [None]:
torch.cuda.empty_cache()

In [None]:
!nvidia-smi