### Install required libraries

In [None]:
!pip install peft
!pip install accelerate
!pip install bitsandBytes
!pip install transformers
!pip install datasets


!pip install GPUtil

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [None]:
import torch
import GPUtil
import os

GPUtil.showUtilization()

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available, using CPU instead")

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')


os.environ["WANDB_DISABLED"] = "true"  ### to disable the WANDDB monitoring and doesnot requires API

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
GPU is available


In [None]:
import json

file = json.load(open("json_extraction_dataset_500.json", "r"))
print(file[1])



{'input': "Extract the product information:\n<div class='product'><h2>iPad Air</h2><span class='price'>$1344</span><span class='category'>audio</span><span class='brand'>Dell</span></div>", 'output': {'name': 'iPad Air', 'price': '$1344', 'category': 'audio', 'manufacturer': 'Dell'}}


In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer
from huggingface_hub import notebook_login
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

if "COLAB_GPU" in os.environ:
  from google.colab import output
  output.enable_custom_widget_manager()

In [None]:
base_model_id = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
from datasets import Dataset

def format_prompt(example):
    return f"### Input: {example['input']}\n### Output: {json.dumps(example['output'])}<|endoftext|>"

formatted_data = [format_prompt(item) for item in file]
dataset = Dataset.from_dict({"text": formatted_data})

In [None]:
dataset[1]

{'text': '### Input: Extract the product information:\n<div class=\'product\'><h2>iPad Air</h2><span class=\'price\'>$1344</span><span class=\'category\'>audio</span><span class=\'brand\'>Dell</span></div>\n### Output: {"name": "iPad Air", "price": "$1344", "category": "audio", "manufacturer": "Dell"}<|endoftext|>'}

In [None]:
tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True)

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [None]:
tokenized_train_dataset = []
for item in dataset:
  #print(item["text"])
  tokenized_train_dataset.append(tokenizer(item["text"]))

In [None]:
tokenized_train_dataset[1]

{'input_ids': [1, 835, 10567, 29901, 7338, 1461, 278, 3234, 2472, 29901, 13, 29966, 4563, 770, 2433, 4704, 29915, 5299, 29882, 29906, 29958, 29875, 20369, 5593, 829, 29882, 29906, 5299, 9653, 770, 2433, 9175, 11041, 29938, 29896, 29941, 29946, 29946, 829, 9653, 5299, 9653, 770, 2433, 7320, 11041, 18494, 829, 9653, 5299, 9653, 770, 2433, 16472, 11041, 29928, 514, 829, 9653, 2565, 4563, 29958, 13, 2277, 29937, 10604, 29901, 8853, 978, 1115, 376, 29875, 20369, 5593, 613, 376, 9175, 1115, 3908, 29896, 29941, 29946, 29946, 613, 376, 7320, 1115, 376, 18494, 613, 376, 1171, 9765, 9945, 1115, 376, 29928, 514, 9092, 29966, 29989, 355, 974, 726, 29989, 29958, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
tokenized_train_dataset[5]

{'input_ids': [1, 835, 10567, 29901, 7338, 1461, 278, 3234, 2472, 29901, 13, 29966, 4563, 770, 2433, 4704, 29915, 5299, 29882, 29906, 29958, 29934, 834, 261, 14450, 3253, 672, 478, 29906, 829, 29882, 29906, 5299, 9653, 770, 2433, 9175, 11041, 29938, 29896, 29906, 29955, 29955, 829, 9653, 5299, 9653, 770, 2433, 7320, 11041, 18494, 829, 9653, 5299, 9653, 770, 2433, 16472, 11041, 2052, 280, 829, 9653, 2565, 4563, 29958, 13, 2277, 29937, 10604, 29901, 8853, 978, 1115, 376, 29934, 834, 261, 14450, 3253, 672, 478, 29906, 613, 376, 9175, 1115, 3908, 29896, 29906, 29955, 29955, 613, 376, 7320, 1115, 376, 18494, 613, 376, 1171, 9765, 9945, 1115, 376, 2052, 280, 9092, 29966, 29989, 355, 974, 726, 29989, 29958, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
tokenizer.eos_token

'</s>'

In [None]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        output_dir="./finetunedModel",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        learning_rate=1e-4,
        max_steps=4,
        bf16=False,
        optim="paged_adamw_8bit",
        logging_dir="./log",
        save_strategy="epoch",
        save_steps=50,
        logging_steps=10

),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
model.config.use_cache=False
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=4, training_loss=1.437775731086731, metrics={'train_runtime': 35.2492, 'train_samples_per_second': 0.454, 'train_steps_per_second': 0.113, 'total_flos': 73165746339840.0, 'train_loss': 1.437775731086731, 'epoch': 0.032})

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, LlamaTokenizer
from peft import PeftModel

base_model_id = "meta-llama/Llama-2-7b-chat-hf"

nf4Config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=nf4Config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
  )




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True)

modelFinetuned = PeftModel.from_pretrained(base_model, "finetunedModel/checkpoint-4")

In [None]:
user_question = "### Input: Extract the product information:\n<div class=\'product\'><h2>iPad Air</h2><span class=\'price\'>$1344</span><span class=\'category\'>audio</span><span class=\'brand\'>Dell</span></div>"

eval_prompt = f"Provide the  Output in {{}} for this {user_question}"

promptTokenized = tokenizer(user_question, return_tensors="pt").to("cuda")

### Using without finetuning the model

In [None]:
with torch.no_grad():
  print(tokenizer.decode(base_model.generate(**promptTokenized, max_new_tokens=1024)[0], skip_special_tokens=True))
  torch.cuda.empty_cache()

### Input: Extract the product information:
<div class='product'><h2>iPad Air</h2><span class='price'>$1344</span><span class='category'>audio</span><span class='brand'>Dell</span></div>01. What is the product name?
iPad Air

02. What is the price of the product?
$1344

03. What is the category of the product?
audio

04. What is the brand of the product?
Dell


Please let me know if you need any further assistance.


### After fine tuning the model

In [None]:
modelFinetuned.eval()

with torch.no_grad():
  print(tokenizer.decode(modelFinetuned.generate(**promptTokenized, max_new_tokens=1024)[0], skip_special_tokens=True))
  torch.cuda.empty_cache()

### Input: Extract the product information:
<div class='product'><h2>iPad Air</h2><span class='price'>$1344</span><span class='category'>audio</span><span class='brand'>Dell</span></div>0. Extract the product name: iPad Air
1. Extract the price: $1344
2. Extract the product category: audio
3. Extract the product brand: Dell

```

```

### Output:

Product Name: iPad Air
Price: $1344
Product Category: audio
Product Brand: Dell

```

```

Note: The `div` element contains the product information. We used CSS selectors to extract the product name, price, category, and brand.
