## Import Modules

In [None]:
#!pip install --upgrade transformers
!pip install peft
!pip install -U bitsandbytes
!pip install datasets
!pip install trl

In [None]:
import os
import pathlib
import torch
import numpy as np

from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from trl import SFTTrainer

from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  AutoModelForSeq2SeqLM,
  AutoModel,
  AutoModelForSequenceClassification,
  DataCollatorForLanguageModeling,
  Trainer,
  TrainingArguments,
  pipeline,
  TextDataset,
  EvalPrediction,
  DataCollatorWithPadding,
  GenerationConfig,
  BitsAndBytesConfig
)

from peft import (
  LoraConfig,
  PeftModelForSequenceClassification,
  PeftModel,
  TaskType,
  AutoPeftModelForSequenceClassification,
  get_peft_model,
  prepare_model_for_kbit_training
)

# Report whether PyTorch can see a CUDA device.
print("GPU is available!" if torch.cuda.is_available() else "GPU is not available.")

In [None]:
# Select the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

## Import Model

In [None]:
# Disabled alternative: derive the model id from a Hugging Face URL.
#url = 'https://huggingface.co/Qwen/Qwen2.5-0.5B'
#model_name = url.split('.co/')[-1]

# Hub model id used for both the model and the tokenizer.
model_name = 'unsloth/Llama-3.2-1B-Instruct'

In [None]:
# Disabled alternative: load the model 4-bit (NF4) quantized via bitsandbytes.
# The whole cell is wrapped in a string literal, so the code inside it is not executed.
'''bnb_config = BitsAndBytesConfig(
  load_in_4bit = True,
  bnb_4bit_quant_type = 'nf4',
  bnb_4bit_compute_dtype = torch.float16,
  bnb_4bit_use_double_quant = True,
)

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  quantization_config = bnb_config,
  trust_remote_code = True
)#.to(device) #'''

In [None]:
# Load the base causal-LM weights in float16 and move the model to `device`.
# NOTE(review): the weights are float16 here while TrainingArguments later sets
# bf16 = True / fp16 = False — confirm this dtype combination is intended.
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  torch_dtype = torch.float16,
  trust_remote_code = True
).to(device) #'''

In [None]:
model

In [None]:
# Tally the parameter counts of the freshly loaded model: everything, and the
# subset that currently requires gradients.
trainable_params = sum(w.numel() for w in model.parameters() if w.requires_grad)
total_params = sum(w.numel() for w in model.parameters())
trainable_percentage = (trainable_params / total_params) * 100

print(f'Total parameters : {total_params}')
print(f'Trainable parameters : {trainable_params}')
print(f'Trainable percentage: {trainable_percentage:.2f}%')

## Import Tokenizer

In [None]:
# Load the tokenizer that matches the model and reuse the EOS token as the padding token.
# NOTE(review): with pad == eos, a collator that masks padding positions would
# presumably mask genuine EOS tokens as well — confirm this is acceptable.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
tokenizer

## Pre Test

In [None]:
def assistant(prompt):
    """Generate a reply from the base `model` for `prompt` and print it.

    The prompt is wrapped in a "### Human: ... ### Assistant:" template,
    tokenized, and passed to `model.generate`. The first returned sequence is
    decoded (special tokens skipped) and printed.

    Args:
        prompt: User text inserted into the template.

    Returns:
        None. The decoded text is printed rather than returned.
    """
    prompt = f"### Human:{prompt}\n### Assistant:"
    # Place the inputs on the model's own device instead of a hard-coded 'cuda',
    # so generation also works when the model was loaded on the CPU.
    inputs = tokenizer(prompt, return_tensors = 'pt').to(model.device)

    # Sampling is enabled, but top_k = 1 keeps only a single candidate token per step.
    generation_config = GenerationConfig(
        do_sample = True,
        top_k = 1,
        temperature = 0.1,
        max_new_tokens = 512,
        pad_token_id = tokenizer.eos_token_id
    )

    outputs = model.generate(**inputs, generation_config = generation_config)
    print(tokenizer.decode(outputs[0], skip_special_tokens = True))

In [None]:
# Pre-fine-tuning sample 1: laptop order email; assistant() prints the base model's reply.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Request: 500 Units of Type X Laptops

Dear Mr. Smith,

Please confirm our order for 500 units of Type X Laptops as discussed. Kindly ship to:
Delivery Address: 123 Elm Street, New York, NY, 10001

Supplier Contact:
John Smith, Sales Manager
Tech Supplies Inc.
Email: john.smith@techsupplies.com
Phone: +1-123-456-7890

Thank you,
Regards,
Michael Brown
Purchasing Manager, Tech Solutions
Email: michael.brown@techsolutions.com
Phone: +1-987-654-3210

'''
assistant(prompt)

In [None]:
# Pre-fine-tuning sample 2: cement order email; assistant() prints the base model's reply.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Request: 2000 Bags of Cement

Dear Ms. Johnson,

We would like to place an order for 2000 bags of Portland cement to be delivered to the following location:
Delivery Address: 456 Maple Avenue, Los Angeles, CA, 90001

Supplier Contact:
Alice Johnson, Sales Representative
Construction World Supplies
Email: alice.johnson@constructionsupply.com
Phone: +1-234-567-8901

Thank you,
Best regards,
James Taylor
Operations Manager, BuildRight Corp
Email: james.taylor@buildright.com
Phone: +1-654-321-0987
'''
assistant(prompt)

In [None]:
# Pre-fine-tuning sample 3: office chair order email; assistant() prints the base model's reply.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Confirmation for 300 Office Chairs

Dear Mr. Lee,

We would like to confirm our order for 300 ergonomic office chairs for delivery to:
Delivery Address: 789 Oak Street, Houston, TX, 77001

Supplier Contact:
David Lee, Product Specialist
OfficeMax Solutions
Email: david.lee@officemax.com
Phone: +1-345-678-9012

Kind regards,
Sarah Mitchell
Admin Manager, Corporate Solutions Ltd.
Email: sarah.mitchell@corpsolutions.com
Phone: +1-765-432-1098
'''
assistant(prompt)

In [None]:
# Pre-fine-tuning sample 4: printer ink order email; assistant() prints the base model's reply.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Bulk Order of 1500 Units of Printer Ink Cartridges

Dear Ms. Patel,

Please arrange to supply 1500 units of printer ink cartridges (Model ABC123) to the following address:
Delivery Address: 101 Pine Street, Chicago, IL, 60601

Supplier Contact:
Priya Patel, Procurement Lead
Ink Solutions Co.
Email: priya.patel@inksolutions.com
Phone: +1-456-789-0123

Thank you,
Sincerely,
Johnathan Wu
Purchasing Head, PrintHub Services
Email: johnathan.wu@printhub.com
Phone: +1-876-543-2109
'''
assistant(prompt)

In [None]:
# Pre-fine-tuning sample 5: power tools order email; assistant() prints the base model's reply.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Request: 400 Sets of Power Tools

Dear Mr. Nguyen,

We are placing an order for 400 sets of power tools (Drill, Saw, Wrench Set) for delivery to:
Delivery Address: 202 Cedar Road, Miami, FL, 33101

Supplier Contact:
Alex Nguyen, Senior Sales Consultant
Tools and Equip Supplies
Email: alex.nguyen@toolsupply.com
Phone: +1-567-890-1234

Thank you,
Regards,
Laura Sanchez
Project Manager, Builders United
Email: laura.sanchez@buildersunited.com
Phone: +1-234-987-6543
'''
assistant(prompt)

## Import Dataset

In [None]:
# Disabled alternative: derive the dataset id from a Hugging Face URL.
#url = 'https://huggingface.co/datasets/KingNish/reasoning-base-20k'
#dataset_name = url.split('datasets/')[-1]

# Hub dataset id passed to load_dataset().
dataset_name = 'argilla/Synth-APIGen-v0.1'

In [None]:
# Token length used for truncation/padding in tokenize_data and as the trainer's max_seq_length.
max_length = 192

In [None]:
# Load the 'train' split of the dataset.
dataset = load_dataset(dataset_name, split = 'train')
dataset

In [None]:
# Preview the first five rows as a pandas DataFrame.
dataset.select(range(5)).to_pandas().head()

In [None]:
dataset[0]

In [None]:
# Original column names; they are removed once the combined 'text' column is built.
features = list(dataset.features.keys())
print(features)

In [None]:
def format_text(example):
  """Add a 'text' field that joins the function name, description, query and answers.

  The given example is modified in place and also returned.
  """
  segments = (
    f"Function Name: {example['func_name']}",
    f"Function Description: {example['func_desc']}",
    f"Instruction: {example['query']}",
    f"Response: {example['answers']}",
  )
  example["text"] = " ".join(segments)
  return example

In [None]:
# Build the 'text' column for every row and drop all original columns.
formatted_dataset = dataset.map(format_text, remove_columns = features)
formatted_dataset

In [None]:
formatted_dataset[0]

In [None]:
def tokenize_data(example, max_length = max_length):
    """Tokenize example['text'], truncating and padding to exactly `max_length` tokens."""
    tokenizer_options = dict(truncation = True, padding = 'max_length', max_length = max_length)
    return tokenizer(example['text'], **tokenizer_options)

In [None]:
# Tokenize in batches; the 'text' column is kept alongside the token columns.
tokenized_dataset = formatted_dataset.map(tokenize_data, batched = True)#, remove_columns = 'text')
tokenized_dataset

In [None]:
tokenized_dataset[0]

In [None]:
# 90/10 train/test split with a fixed seed.
# Note: this rebinds `dataset`, which until here referred to the raw train split.
dataset = tokenized_dataset.train_test_split(test_size = 0.1, seed = 42)
dataset

In [None]:
train_dataset = dataset['train']
test_dataset = dataset['test']
train_dataset

In [None]:
# Preview the first five tokenized training rows.
train_dataset.select(range(5)).to_pandas().head()

In [None]:
train_dataset[0]

## Training Set Up

In [None]:
#data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
# Collator for causal language modeling (mlm = False).
# NOTE(review): `data_collator` is not passed to the SFTTrainer constructed later in this notebook — confirm whether it is meant to be used.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

In [None]:
def compute_metrics(p: EvalPrediction):
  """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

  Handles sequence-level logits of shape (batch, classes) as well as
  token-level logits of shape (batch, seq, vocab).

  Args:
    p: EvalPrediction whose `predictions` are the logits (or a tuple whose
      first element is the logits) and whose `label_ids` are the targets.

  Returns:
    dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
  """
  logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  logits = np.asarray(logits)
  labels = np.asarray(p.label_ids)

  # Reduce over the class/vocabulary axis (the last one). For token-level
  # logits, axis 1 is the sequence position, so argmax over it is meaningless.
  preds = np.argmax(logits, axis = -1)

  if logits.ndim >= 3:
    # Token-level (causal LM) output: the logits at position t score the token
    # at position t + 1, so align predictions with the next-token labels.
    preds = preds[..., :-1]
    labels = labels[..., 1:]

  preds = preds.reshape(-1)
  labels = labels.reshape(-1)

  # -100 is the label value the loss ignores (e.g. padding); skip those positions.
  keep = labels != -100
  preds = preds[keep]
  labels = labels[keep]

  precision, recall, f1, _ = precision_recall_fscore_support(
    labels,
    preds,
    average = 'weighted',
    zero_division = 0
  )
  metrics = {
    'accuracy': accuracy_score(labels, preds),
    'f1': f1,
    'precision': precision,
    'recall': recall
  }
  return metrics

In [None]:
# Keep the model config's padding id in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Release cached, unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

## Set Up PEFT

In [None]:
# Adapter name = last path component of the model id + '-API'.
# Hub ids use '/', so split on that; backslashes are normalised first so a
# Windows-style local path still resolves to its last component.
peft_name = model_name.replace('\\', '/').split('/')[-1] + '-API'
# Dots are replaced because the name is later used as an adapter name.
peft_name = peft_name.replace('.', '-')
peft_name

In [None]:
# LoRA hyperparameters.
# NOTE(review): with the standard LoRA scaling of lora_alpha / r this gives
# 16 / 256 = 0.0625 — confirm such a small scale (and such a large rank) is intended.
lora_alpha = 16
lora_dropout = 0.1
lora_r = 256

# No target_modules are given, so PEFT presumably falls back to its per-architecture defaults — verify.
peft_config = LoraConfig(
  lora_alpha = lora_alpha,
  lora_dropout = lora_dropout,
  r = lora_r,
  bias = 'none',
  task_type = 'CAUSAL_LM',
)

In [None]:
peft_model = get_peft_model(model, peft_config, adapter_name = peft_name)
peft_model.print_trainable_parameters()

## Training Model

In [None]:
model = get_peft_model(model, peft_config)

In [None]:
model

In [None]:
# Re-count the parameters now that `model` is the LoRA-wrapped model.
trainable_params = sum(w.numel() for w in model.parameters() if w.requires_grad)
total_params = sum(w.numel() for w in model.parameters())
trainable_percentage = (trainable_params / total_params) * 100

print(f'Total parameters : {total_params}')
print(f'Trainable parameters : {trainable_params}')
print(f'Trainable percentage: {trainable_percentage:.2f}%')

In [None]:
# Release cached, unused GPU memory before building the trainer.
torch.cuda.empty_cache()

In [None]:
# Directory used as the trainer's output_dir and for saving the adapter afterwards.
save_path = './model'

batch_size = 8
# Only referenced by the commented-out max_steps argument below.
max_steps = 1000

# With batch_size = 8 and gradient_accumulation_steps = 32, each optimizer step
# accumulates 8 * 32 = 256 sequences per device.
# NOTE(review): save_strategy = 'no' disables checkpointing, so save_total_limit
# presumably has no effect — confirm.
# NOTE(review): `evaluation_strategy` is version-dependent (newer transformers
# releases name it `eval_strategy`) — verify against the installed version.
training_args = TrainingArguments(
    output_dir = save_path,
    gradient_accumulation_steps = 32,
    evaluation_strategy = 'steps',
    do_eval = True,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    log_level = 'debug',
    save_strategy = 'no',
    save_total_limit = 2,
    save_safetensors = False,
    fp16 = False,
    bf16 = True,
    logging_steps = 10,
    learning_rate = 2e-5,
    eval_steps = 10,
    #max_steps = max_steps,
    warmup_steps = 30,
    lr_scheduler_type = 'cosine',
    #optim = 'paged_adamw_32bit',
    optim = "adamw_8bit",
    dataloader_num_workers = 4,
)

# These attributes are assigned after construction, so any validation done while
# building TrainingArguments does not see them.
# NOTE(review): `sharded_ddp` may not be a recognised option in the installed
# transformers version — confirm this assignment has the intended effect.
training_args.push_to_hub = False
training_args.ddp_find_unused_parameters = False
training_args.sharded_ddp = 'simple'

training_args

In [None]:
# Train on at most 10000 examples and evaluate on at most 1000. The subset sizes
# are capped at the split sizes so select() cannot index past the end of a
# smaller split.
# `model` was already wrapped with get_peft_model in an earlier cell, so
# peft_config is not passed again; the trainer receives the PEFT model as-is.
trainer = SFTTrainer(
  model = model,
  train_dataset = train_dataset.select(range(min(10000, len(train_dataset)))),
  eval_dataset = test_dataset.select(range(min(1000, len(test_dataset)))),
  dataset_text_field = 'text',
  max_seq_length = max_length,
  tokenizer = tokenizer,
  args = training_args,
)
trainer

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

## Model Evaluation

In [None]:
# Evaluate on the trainer's eval_dataset and print the resulting metrics.
evaluation_results = trainer.evaluate()
print('Evaluation Results:', evaluation_results)

## Save Model

In [None]:
# Use the inner `.module` when the trainer's model exposes one (presumably a
# DataParallel/DDP wrapper — verify), then write the model files to `save_path`.
save_model = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
save_model.save_pretrained(save_path)

## Load PEFT Model

In [None]:
# Load the saved adapter from `save_path` on top of `model`.
# NOTE(review): `model` here is the PEFT-wrapped model used for training, not a
# freshly loaded base model — confirm that wrapping it again is intended.
peft_model = PeftModel.from_pretrained(model, save_path)

## Post Test

In [None]:
def assistant(prompt):
    """Generate a reply from the fine-tuned `peft_model` for `prompt` and print it.

    This redefinition replaces the earlier `assistant` helper; the only
    difference is that generation runs on `peft_model`. The prompt is wrapped in
    a "### Human: ... ### Assistant:" template, tokenized, and passed to
    `peft_model.generate`. The first returned sequence is decoded (special
    tokens skipped) and printed.

    Args:
        prompt: User text inserted into the template.

    Returns:
        None. The decoded text is printed rather than returned.
    """
    prompt = f"### Human:{prompt}\n### Assistant:"
    # Place the inputs on the model's own device instead of a hard-coded 'cuda',
    # so generation also works when the model lives on the CPU.
    inputs = tokenizer(prompt, return_tensors = 'pt').to(peft_model.device)

    # Sampling is enabled, but top_k = 1 keeps only a single candidate token per step.
    generation_config = GenerationConfig(
        do_sample = True,
        top_k = 1,
        temperature = 0.1,
        max_new_tokens = 512,
        pad_token_id = tokenizer.eos_token_id
    )

    outputs = peft_model.generate(**inputs, generation_config = generation_config)
    print(tokenizer.decode(outputs[0], skip_special_tokens = True))

In [None]:
# Post-fine-tuning sample 1: same laptop order email as in the pre-test, now answered by peft_model.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Request: 500 Units of Type X Laptops

Dear Mr. Smith,

Please confirm our order for 500 units of Type X Laptops as discussed. Kindly ship to:
Delivery Address: 123 Elm Street, New York, NY, 10001

Supplier Contact:
John Smith, Sales Manager
Tech Supplies Inc.
Email: john.smith@techsupplies.com
Phone: +1-123-456-7890

Thank you,
Regards,
Michael Brown
Purchasing Manager, Tech Solutions
Email: michael.brown@techsolutions.com
Phone: +1-987-654-3210

'''
assistant(prompt)

In [None]:
# Post-fine-tuning sample 2: same cement order email as in the pre-test, now answered by peft_model.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Request: 2000 Bags of Cement

Dear Ms. Johnson,

We would like to place an order for 2000 bags of Portland cement to be delivered to the following location:
Delivery Address: 456 Maple Avenue, Los Angeles, CA, 90001

Supplier Contact:
Alice Johnson, Sales Representative
Construction World Supplies
Email: alice.johnson@constructionsupply.com
Phone: +1-234-567-8901

Thank you,
Best regards,
James Taylor
Operations Manager, BuildRight Corp
Email: james.taylor@buildright.com
Phone: +1-654-321-0987
'''
assistant(prompt)

In [None]:
# Post-fine-tuning sample 3: same office chair order email as in the pre-test, now answered by peft_model.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Confirmation for 300 Office Chairs

Dear Mr. Lee,

We would like to confirm our order for 300 ergonomic office chairs for delivery to:
Delivery Address: 789 Oak Street, Houston, TX, 77001

Supplier Contact:
David Lee, Product Specialist
OfficeMax Solutions
Email: david.lee@officemax.com
Phone: +1-345-678-9012

Kind regards,
Sarah Mitchell
Admin Manager, Corporate Solutions Ltd.
Email: sarah.mitchell@corpsolutions.com
Phone: +1-765-432-1098
'''
assistant(prompt)

In [None]:
# Post-fine-tuning sample 4: same printer ink order email as in the pre-test, now answered by peft_model.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Bulk Order of 1500 Units of Printer Ink Cartridges

Dear Ms. Patel,

Please arrange to supply 1500 units of printer ink cartridges (Model ABC123) to the following address:
Delivery Address: 101 Pine Street, Chicago, IL, 60601

Supplier Contact:
Priya Patel, Procurement Lead
Ink Solutions Co.
Email: priya.patel@inksolutions.com
Phone: +1-456-789-0123

Thank you,
Sincerely,
Johnathan Wu
Purchasing Head, PrintHub Services
Email: johnathan.wu@printhub.com
Phone: +1-876-543-2109
'''
assistant(prompt)

In [None]:
# Post-fine-tuning sample 5: same power tools order email as in the pre-test, now answered by peft_model.
prompt = '''
Help me to convert this email to API instruction in json format

Subject: Order Request: 400 Sets of Power Tools

Dear Mr. Nguyen,

We are placing an order for 400 sets of power tools (Drill, Saw, Wrench Set) for delivery to:
Delivery Address: 202 Cedar Road, Miami, FL, 33101

Supplier Contact:
Alex Nguyen, Senior Sales Consultant
Tools and Equip Supplies
Email: alex.nguyen@toolsupply.com
Phone: +1-567-890-1234

Thank you,
Regards,
Laura Sanchez
Project Manager, Builders United
Email: laura.sanchez@buildersunited.com
Phone: +1-234-987-6543
'''
assistant(prompt)