In [1]:
# Databricks notebook source

# https://github.com/avisoori-databricks/Tuning-the-Finetuning

from datasets import load_dataset , Dataset, concatenate_datasets 
import numpy as np
import pandas as pd
import random

# COMMAND ----------

rd_ds = load_dataset("xiyuez/red-dot-design-award-product-description")
rd_df = pd.DataFrame(rd_ds['train'])
display(rd_df)

# COMMAND ----------

rd_df['instruction'] = 'Create a detailed description for the following product: '+ rd_df['product']+', belonging to category: '+ rd_df['category']
rd_df = rd_df[['instruction', 'description']]
display(rd_df)

# COMMAND ----------

rd_df_sample = rd_df.sample(n=5000, random_state=42)
display(rd_df_sample)

# COMMAND ----------

template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:\n"""

# COMMAND ----------

rd_df_sample['prompt'] = rd_df_sample["instruction"].apply(lambda x: template.format(x))

# COMMAND ----------

rd_df_sample.rename(columns={'description': 'response'}, inplace=True)

# COMMAND ----------

rd_df_sample['response'] = rd_df_sample['response'] +  "\n### End"
rd_df_sample = rd_df_sample[['prompt', 'response']]
display(rd_df_sample)


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,product,category,description,text
0,Biamp Rack Products,Digital Audio Processors,"“High recognition value, uniform aesthetics an...",Product Name: Biamp Rack Products;\n\nProduct ...
1,V33,Video Camera,The V33 livestreaming video camera ensures hig...,Product Name: V33;\n\nProduct Category: Video ...
2,HP LaserJet 5000-6000 and E700-E800 Series MFPs,Multi-Function Printers,The HP LaserJet 5000 to 6000 Series and E700 t...,Product Name: HP LaserJet 5000-6000 and E700-E...
3,Meaco Arete One 20L Dehumidifier,Heating and Air Conditioning Technology,The Meaco Arete One Dehumidifier is characteri...,Product Name: Meaco Arete One 20L Dehumidifier...
4,théATRE Glass Container for Loose Leaf Tea,Food Containers,The design and colouring of the théATRE Glass ...,Product Name: théATRE Glass Container for Loos...
...,...,...,...,...
21178,Pico (S100),Mobile Phone,The Pico (S100) is a very light mobile phone w...,Product Name: Pico (S100);\n\nProduct Category...
21179,PPQ 719 B 21 E,Gas Hob,A refined metallic-coloured glass top emphasis...,Product Name: PPQ 719 B 21 E;\n\nProduct Categ...
21180,i1Pro,Spectral Measurement Device,"This versatile colour-measurement device, whic...",Product Name: i1Pro;\n\nProduct Category: Spec...
21181,787 Dreamliner,Aircraft Interior,The new 787 Dreamliner combines groundbreaking...,Product Name: 787 Dreamliner;\n\nProduct Categ...


Unnamed: 0,instruction,description
0,Create a detailed description for the followin...,"“High recognition value, uniform aesthetics an..."
1,Create a detailed description for the followin...,The V33 livestreaming video camera ensures hig...
2,Create a detailed description for the followin...,The HP LaserJet 5000 to 6000 Series and E700 t...
3,Create a detailed description for the followin...,The Meaco Arete One Dehumidifier is characteri...
4,Create a detailed description for the followin...,The design and colouring of the théATRE Glass ...
...,...,...
21178,Create a detailed description for the followin...,The Pico (S100) is a very light mobile phone w...
21179,Create a detailed description for the followin...,A refined metallic-coloured glass top emphasis...
21180,Create a detailed description for the followin...,"This versatile colour-measurement device, whic..."
21181,Create a detailed description for the followin...,The new 787 Dreamliner combines groundbreaking...


Unnamed: 0,instruction,description
18952,Create a detailed description for the followin...,The CG8565 is a gaming PC offering space for h...
12584,Create a detailed description for the followin...,The iSHOXS BullBar ProX mount can be used to a...
5702,Create a detailed description for the followin...,The S81 Pro focuses on two things: outstanding...
20503,Create a detailed description for the followin...,The CenFlex superfinish machine is designed fo...
2480,Create a detailed description for the followin...,The THALION S gas absorption heat pump uses na...
...,...,...
268,Create a detailed description for the followin...,“The MoodPlay can be described as a record pla...
518,Create a detailed description for the followin...,V23 is a switch panel that includes sockets an...
8137,Create a detailed description for the followin...,The Bosch Aqua water purifier collection for u...
5508,Create a detailed description for the followin...,The design concept for these kitchen knives an...


Unnamed: 0,prompt,response
18952,Below is an instruction that describes a task....,The CG8565 is a gaming PC offering space for h...
12584,Below is an instruction that describes a task....,The iSHOXS BullBar ProX mount can be used to a...
5702,Below is an instruction that describes a task....,The S81 Pro focuses on two things: outstanding...
20503,Below is an instruction that describes a task....,The CenFlex superfinish machine is designed fo...
2480,Below is an instruction that describes a task....,The THALION S gas absorption heat pump uses na...
...,...,...
268,Below is an instruction that describes a task....,“The MoodPlay can be described as a record pla...
518,Below is an instruction that describes a task....,V23 is a switch panel that includes sockets an...
8137,Below is an instruction that describes a task....,The Bosch Aqua water purifier collection for u...
5508,Below is an instruction that describes a task....,The design concept for these kitchen knives an...


In [2]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

model_path = 'openlm-research/open_llama_3b_v2'
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
model_path, load_in_8bit=True, device_map='auto',
)

import re
import sys
model_modules = str(model.modules)
print(model_modules)



<bound method Module.modules of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear8bitLt(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear8bitLt(in_features=8640, out_features=3200, bias=False)
          (up_proj): Linear8bitLt(in_features=3200, out_features=8640, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_at

In [3]:
#Pass in a prompt and infer with the model
'''tokenizer = LlamaTokenizer.from_pretrained(model_path)
prompt = 'Q: Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse\nA:'
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

generation_output = model.generate(
input_ids=input_ids, max_new_tokens=128
)

print(tokenizer.decode(generation_output[0]))'''

'tokenizer = LlamaTokenizer.from_pretrained(model_path)\nprompt = \'Q: Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse\nA:\'\ninput_ids = tokenizer(prompt, return_tensors="pt").input_ids\n\ngeneration_output = model.generate(\ninput_ids=input_ids, max_new_tokens=128\n)\n\nprint(tokenizer.decode(generation_output[0]))'

In [4]:
'''prompt= """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse

### Response:"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

generation_output = model.generate(
input_ids=input_ids, max_new_tokens=128
)

print(tokenizer.decode(generation_output[0]))'''

'prompt= """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse\n\n### Response:"""\n\ninput_ids = tokenizer(prompt, return_tensors="pt").input_ids\n\ngeneration_output = model.generate(\ninput_ids=input_ids, max_new_tokens=128\n)\n\nprint(tokenizer.decode(generation_output[0]))'

In [5]:
rd_df_sample

Unnamed: 0,prompt,response
18952,Below is an instruction that describes a task....,The CG8565 is a gaming PC offering space for h...
12584,Below is an instruction that describes a task....,The iSHOXS BullBar ProX mount can be used to a...
5702,Below is an instruction that describes a task....,The S81 Pro focuses on two things: outstanding...
20503,Below is an instruction that describes a task....,The CenFlex superfinish machine is designed fo...
2480,Below is an instruction that describes a task....,The THALION S gas absorption heat pump uses na...
...,...,...
268,Below is an instruction that describes a task....,“The MoodPlay can be described as a record pla...
518,Below is an instruction that describes a task....,V23 is a switch panel that includes sockets an...
8137,Below is an instruction that describes a task....,The Bosch Aqua water purifier collection for u...
5508,Below is an instruction that describes a task....,The design concept for these kitchen knives an...


In [6]:
rd_df_sample['text'] = rd_df_sample["prompt"]+rd_df_sample["response"]
rd_df_sample.drop(columns=['prompt', 'response'], inplace=True)
display(rd_df_sample), rd_df_sample.shape

Unnamed: 0,text
18952,Below is an instruction that describes a task....
12584,Below is an instruction that describes a task....
5702,Below is an instruction that describes a task....
20503,Below is an instruction that describes a task....
2480,Below is an instruction that describes a task....
...,...
268,Below is an instruction that describes a task....
518,Below is an instruction that describes a task....
8137,Below is an instruction that describes a task....
5508,Below is an instruction that describes a task....


(None, (5000, 1))

In [7]:
from datasets import load_dataset
from datasets import Dataset
dataset = Dataset.from_pandas(rd_df_sample).train_test_split(test_size=0.05, seed=42)

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from transformers.trainer_callback import TrainerCallback
import os
import sys
from transformers import BitsAndBytesConfig
from trl import SFTTrainer


#target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#or
target_modules = ['q_proj','v_proj']

lora_config = LoraConfig(
    r=8,#or r=16
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    target_modules = target_modules,
    task_type="CAUSAL_LM",
)

base_dir = "llama"

per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"

# COMMAND ----------

from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir=base_dir,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs = 3.0,
    logging_strategy="epoch",
    logging_steps=100,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)
    

# COMMAND ----------

#model_path = 'openlm-research/open_llama_3b_v2'

# COMMAND ----------

#tokenizer = LlamaTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# COMMAND ----------

#model = LlamaForCausalLM.from_pretrained(
#    model_path, device_map='auto', load_in_8bit=True,
#)

# COMMAND ----------

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset = dataset['test'],
    dataset_text_field="text",
    max_seq_length=256,
    args=training_args,
)

#Upcast layer norms to float 32 for stability
for name, module in trainer.model.named_modules():
  if "norm" in name:
    module = module.to(torch.float32)

trainer.train()

trainable params: 2,662,400 || all params: 3,429,136,000 || trainable%: 0.07764054852300988


Using pad_token, but it is not set yet.
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4750/4750 [00:01<00:00, 3250.42 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 2907.24 examples/s]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
model_path = 'openlm-research/open_llama_3b_v2'

# COMMAND ----------

tokenizer = LlamaTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# COMMAND ----------

model = LlamaForCausalLM.from_pretrained(
    model_path, load_in_8bit=True, device_map='auto',
)

# COMMAND ----------

peft_model_id = '<adapter_final_checkpoint_location>'

# COMMAND ----------

peft_model = PeftModel.from_pretrained(model, peft_model_id)

# COMMAND ----------

test_strings = ["Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse",
"Create a detailed description for the following product: Hoover Lightspeed, belonging to category: Cordless Vacuum Cleaner",
"Create a detailed description for the following product: Flattronic Cinematron, belonging to category: High Definition Flatscreen TV"]

# COMMAND ----------

predictions = []
for test in test_strings:
  prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

  ### Instruction:
  {}

  ### Response:""".format(test)
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

  generation_output = model.generate(
      input_ids=input_ids, max_new_tokens=156
  )
  predictions.append(tokenizer.decode(generation_output[0]))

# COMMAND ----------

def extract_response_text(input_string):
    start_marker = '### Response:'
    end_marker = '###'
    
    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None
    
    start_index += len(start_marker)
    
    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]
    
    return input_string[start_index:end_index].strip()

# COMMAND ----------

# predictions[2]

# COMMAND ----------

for i in range(3): 
  pred = predictions[i]
  text = test_strings[i]
  print(text+'\n')
  print(extract_response_text(pred))
  print('--------')

# COMMAND ----------