In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

In [28]:
from unsloth import FastLanguageModel
import torch

# --- Loading options ---------------------------------------------------------
# Maximum sequence length passed to Unsloth; any value can be chosen because
# RoPE scaling is supported internally.
max_seq_length = 2048
# None = auto detection. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = None
# 4bit quantization reduces memory usage. Can be set to False.
load_in_4bit = True

# Pre-quantized 4bit checkpoints supported by Unsloth (4x faster downloading,
# no OOMs). Reference only: the checkpoint loaded below is not taken from this
# list. More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    # Instruct version of Gemma 7b
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    # Instruct version of Gemma 2b
    "unsloth/gemma-2b-it-bnb-4bit",
    # [NEW] 15 Trillion token Llama-3
    "unsloth/llama-3-8b-bnb-4bit",
]

# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3-mini-4k-instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Next, the model needs some preprocessing to prepare it for training. To do this, use the `prepare_model_for_kbit_training` function from PEFT.

In [29]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then run PEFT's k-bit preparation step on the
# loaded model before any adapters are attached.
# NOTE(review): the later FastLanguageModel.get_peft_model(...) call is also given
# use_gradient_checkpointing="unsloth"; Unsloth may already perform this
# preparation itself, which would make this cell redundant — confirm.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [30]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Iterates ``model.named_parameters()`` once, summing ``numel()`` over all
    parameters and, separately, over those whose ``requires_grad`` is true,
    then prints both counts and the trainable percentage on a single line.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs whose parameters provide ``numel()``
            and ``requires_grad``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [31]:
from datasets import load_dataset

# Load the fine-tuning dataset by its repository id.
# The cell output further down shows a DatasetDict with a single "train" split.
data = load_dataset("rajanonymous12/info_about_database_table_relation")

In [32]:
def _tokenize_batch(samples):
    """Run the tokenizer over the "input" column of one batch of examples."""
    return tokenizer(samples["input"])


# batched=True passes a batch of rows to the function on each call.
data = data.map(_tokenize_batch, batched=True)

In [33]:
# Display the dataset summary (splits, feature names, row counts) as the cell output.
data

DatasetDict({
    train: Dataset({
        features: ['input', 'input_ids', 'attention_mask'],
        num_rows: 91
    })
})

In [34]:
# Wrap the loaded model with LoRA adapters through Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r=128,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # True or "unsloth" for very long context.
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

In [36]:
import torch
import transformers

# Use the EOS token for padding.
# NOTE(review): DataCollatorForLanguageModeling(mlm=False) excludes pad-token
# positions from the loss, so with pad == EOS any EOS tokens in the data are
# excluded as well — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token

# Pick the mixed-precision mode the GPU supports instead of hard-coding fp16:
# the model is loaded with automatic dtype detection (dtype = None), and asking
# the Trainer for fp16 conflicts with a model that was loaded in bfloat16.
use_bf16 = torch.cuda.is_bf16_supported()

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # total batch size = 1 * 4
        warmup_steps=2,
        max_steps=10,  # overrides any value given in num_train_epochs
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    ),
    # mlm=False: causal language modeling, labels are built from the input ids.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 91 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 10
 "-____-"     Number of trainable parameters = 239,075,328


Step,Training Loss
1,1.7133
2,1.7248
3,2.6762
4,1.4341
5,1.336
6,1.0052
7,0.8845
8,0.8576
9,0.819
10,1.1227


TrainOutput(global_step=10, training_loss=1.3573333084583283, metrics={'train_runtime': 54.0094, 'train_samples_per_second': 0.741, 'train_steps_per_second': 0.185, 'total_flos': 476705858549760.0, 'train_loss': 1.3573333084583283, 'epoch': 0.43956043956043955})

## Output 3

In [38]:
# Few-shot prompt: an instruction with one worked example answer, followed by the
# actual user question.
# NOTE(review): "Respone" near the end looks like a typo for "Response"; it is part
# of the prompt string sent to the model, so it is left untouched here.
text = """You have to answer the question only in json format.For example if user ask : Give me top 5 customer with most number of orders.
Then you have to give response as :
[ "table_information": [
{
"table_name": "customer_0",
"table_description": "Stores detailed customer information and behavior",
"primary_key_column": [
"customer_id"
],
"columns": [
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "",
"is_nullable": "N"
},
{
"name": "first_name",
"description": "Customer's first name",

"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
},
{
"name": "last_name",
"description": "Customer's last name",
"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
}
]
},
{
"table_name": "order",
"table_description": "Stores detailed records of customer orders",
"primary_key_column": [
"order_id",
"order_item_id"
],
"columns": [
{
"name": "order_id",
"description": "Unique identifier for the order",
"data_type": "STRING",
"format": "",
"is_pii_column": "N",
"enum": [],
"is_nullable": "N"
},
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"is_nullable": "N"
}
]
}

]
}
Relationship
"relationships": [
{
"FromTable": "customer_0",
"FromColumn": [
"customer_id"
],
"ToTable": "order",
"ToColumn": [
"customer_id"
]
}
]
}]

<|end|>
************************************Now************************************
User ask  :How can I identify customers from specific regions who have spent more than $500 in the last month and assess their satisfaction scores?
Respone : " "
 """
# Device that the tokenized prompt is moved to.
device = "cuda:0"

# Tokenize the prompt as PyTorch tensors and move them to `device`.
inputs = tokenizer(text, return_tensors="pt").to(device)
# Generate up to 1000 new tokens after the prompt.
# NOTE(review): the training cell sets model.config.use_cache = False and asks for it
# to be re-enabled for inference; nothing re-enables it before this call — confirm.
outputs = model.generate(**inputs, max_new_tokens=1000)
# Decode the first returned sequence, keeping special tokens in the printout.
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

<s> You have to answer the question only in json format.For example if user ask : Give me top 5 customer with most number of orders.
Then you have to give response as : 
[ "table_information": [
{
"table_name": "customer_0",
"table_description": "Stores detailed customer information and behavior",
"primary_key_column": [
"customer_id"
],
"columns": [
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "",
"is_nullable": "N"
},
{
"name": "first_name",
"description": "Customer's first name",

"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
},
{
"name": "last_name",
"description": "Customer's last name",
"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
}
]
},
{
"table_name": "order",

In [None]:
# Write the model and the tokenizer files into the local "lora_model" directory.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this presumably
# stores only the LoRA adapter rather than the full base model — confirm.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# Optional upload to the Hugging Face Hub (disabled):
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

## Output 1

In [15]:
# Prompt layout with three positional slots, in order: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every formatted example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """
    Render a batch of examples into complete prompt strings.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        dict with a single key, "text", holding one formatted prompt per
        example; every prompt ends with EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must be added, otherwise generation will go on forever.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ],
    }

In [16]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Fill alpaca_prompt with (instruction, input, blank response slot), tokenize the
# single resulting prompt as PyTorch tensors and move the tensors to the GPU.
# NOTE(review): the instruction text contains the literal characters "/n", which look
# like a typo for a newline; they are part of the prompt string and are left as-is.
inputs = tokenizer(
[
    alpaca_prompt.format(
        """Use only the table and relationship between table exist. Don't create any other new table and use logic to give answer and response is in json format.
        For example :
Question : /n
Give me top 5 customer with most number of orders.
then/n
Expected Output :
Columns

{
"table_information": [
{
"table_name": "customer_0",
"table_description": "Stores detailed customer information and behavior",
"primary_key_column": [
"customer_id"
],
"columns": [
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "",
"is_nullable": "N"
},
{
"name": "first_name",
"description": "Customer's first name",

"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
},
{
"name": "last_name",
"description": "Customer's last name",
"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
}
]
},
{
"table_name": "order",
"table_description": "Stores detailed records of customer orders",
"primary_key_column": [
"order_id",
"order_item_id"
],
"columns": [
{
"name": "order_id",
"description": "Unique identifier for the order",
"data_type": "STRING",
"format": "",
"is_pii_column": "N",
"enum": [],
"is_nullable": "N"
},
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"is_nullable": "N"
}
]
}

]
}
Relationship
"relationships": [
{
"FromTable": "customer_0",
"FromColumn": [
"customer_id"
],
"ToTable": "order",
"ToColumn": [
"customer_id"
]
}
]
}
""", # instruction
        """ Question:  What is the average duration from order placement to shipment across different product categories, segmented by customer regions and order volume? """, # input
        "  ", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Non-streaming generation (disabled):
# outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# tokenizer.batch_decode(outputs)

In [18]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

from transformers import TextStreamer

# Print tokens to the cell output as they are generated.
text_streamer = TextStreamer(tokenizer)
# Keep the KV cache enabled: with use_cache=False every new token re-processes
# the whole (long) prompt, which makes generation extremely slow.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=400, use_cache=True)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Use only the table and relationship between table exist. Don't create any other new table and use logic to give answer and response is in json format.
        For example : 
Question : /n
Give me top 5 customer with most number of orders.
then/n
Expected Output :
Columns

{
"table_information": [
{
"table_name": "customer_0",
"table_description": "Stores detailed customer information and behavior",
"primary_key_column": [
"customer_id"
],
"columns": [
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "",
"is_nullable": "N"
},
{
"name": "first_name",
"description": "Customer's first name",

"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "custo

KeyboardInterrupt: 

## Output 2

In [35]:
# Chat-style conversation: one worked example (a user question and the assistant's
# answer) followed by the actual user question to be answered.
messages = [

    {"role": "user", "content": """Give me top 5 customer with most number of orders."""},
    {"role": "assistant", "content": """Sure! Here expected output : [ "table_information": [
{
"table_name": "customer_0",
"table_description": "Stores detailed customer information and behavior",
"primary_key_column": [
"customer_id"
],
"columns": [
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "",
"is_nullable": "N"
},
{
"name": "first_name",
"description": "Customer's first name",

"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
},
{
"name": "last_name",
"description": "Customer's last name",
"data_type": "STRING",
"format": "CamelCase",
"is_pii_column": "Y",
"enum": [],
"dimension_group": "customer_dimension_group",
"is_nullable": "N"
}
]
},
{
"table_name": "order",
"table_description": "Stores detailed records of customer orders",
"primary_key_column": [
"order_id",
"order_item_id"
],
"columns": [
{
"name": "order_id",
"description": "Unique identifier for the order",
"data_type": "STRING",
"format": "",
"is_pii_column": "N",
"enum": [],
"is_nullable": "N"
},
{
"name": "customer_id",
"description": "Unique identifier for the customer",
"data_type": "STRING",
"format": "",
"is_pii_column": "Y",
"enum": [],
"is_nullable": "N"
}
]
}

]
}
Relationship
"relationships": [
{
"FromTable": "customer_0",
"FromColumn": [
"customer_id"
],
"ToTable": "order",
"ToColumn": [
"customer_id"
]
}
]
}]"""},
    {"role": "user", "content": """What is the average duration from order placement to shipment across different product categories, segmented by customer regions and order volume?"""},
]


In [38]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling is disabled; only the newly generated text is returned, not the prompt.
generation_args = dict(
    max_new_tokens=1000,
    return_full_text=False,
    temperature=1,
    do_sample=False,
)

# Run the chat messages through the pipeline and print the first result's text.
output = pipe(messages, **generation_args)
print(output[0]["generated_text"])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyFo

 To answer this question, we need to perform a multi-dimensional analysis on the database. Here is a step-by-step approach:

1. Join the "order" table with the "product" table on the product_id column to get the product category.
2. Join the "order" table with the "customer" table on the customer_id column to get the customer region.
3. Calculate the duration from order placement to shipment for each order.
4. Group the data by product category, customer region, and order volume.
5. Calculate the average duration for each group.

Here is a SQL query that accomplishes this:

```sql
SELECT 
    p.category AS product_category,
    c.region AS customer_region,
    CASE 
        WHEN o.quantity <= 10 THEN 'low'
        WHEN o.quantity <= 100 THEN'medium'
        ELSE 'high'
    END AS order_volume,
    AVG(o.shipment_date - o.order_date) AS avg_duration
FROM 
    order o
    JOIN product p ON o.product_id = p.product_id
    JOIN customer c ON o.customer_id = c.customer_id
GROUP BY _
    pro

# Save model

In [None]:
# Write the model and the tokenizer files into the local "lora_model" directory.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this presumably
# stores only the LoRA adapter rather than the full base model — confirm.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# Optional upload to the Hugging Face Hub (disabled):
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving