# Getting PEFT and SFT code to work on Phi-3

here I use the train split of the absa-quad dataset (after reformatting) to finetune Phi-3-mini-4k-instruct

**UPDATE: model link after training AND MERGING is:**

[https://huggingface.co/benjaminzwhite/phi-3-mini-4k-instruct-ABSA-QUAD](https://huggingface.co/benjaminzwhite/phi-3-mini-4k-instruct-ABSA-QUAD)

In [None]:
#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
from datasets import load_dataset

absa_quad = load_dataset("NEUDM/absa-quad")

# convert dataset to usable format

In [None]:
import ast

In [None]:
phi3_train_dataset = []


for example in absa_quad["train"].iter(batch_size=1):
    #print(example)
    # get the raw text
    example_text = example["input"][0]

    input_text = ast.literal_eval(example_text)[0]
    
    gold_labels = example["output"][0]
    
    #print(gold_labels)
    golds = ast.literal_eval(gold_labels)
    tmp_list_of_quads = []
    for quad in golds:
        tmp_d = {}
        tmp_d["opinion term"] = quad[0]
        tmp_d["aspect category"] = quad[1]
        tmp_d["sentiment"] = quad[2]
        tmp_d["justification"] = quad[3]
        # convert to str representation for making the prompt
        str_x = str(tmp_d)
        tmp_list_of_quads.append(str_x)
    
    # create the string representation of the gold answer
    gold_answer = '[' + ','.join(tmp_list_of_quads) + ']'
    
    phi3template = f"""<|system|>
You are a computer program who only replies with valid JSON lists.<|end|>
<|user|>
Perform a full aspect-based sentiment analysis of the following restaurant review:

{input_text}
<|end|>
<|assistant|>
{gold_answer}"""
    phi3_train_dataset.append(phi3template)
    

In [None]:
len(phi3_train_dataset)

In [None]:
import pandas as pd

In [None]:
from datasets import Dataset

# convert to HF dataset
df = pd.DataFrame(phi3_train_dataset)
train_data = Dataset.from_pandas(df.rename(columns={0: "text"}), split="train")

In [None]:
train_data

# Model preparation stuff

In [None]:
#!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
!pip install -q -U -i https://pypi.org/simple/ bitsandbytes
#!pip install -q -U transformers=="4.40.0"
!pip install -q -U accelerate
#!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U peft
!pip install -q -U tensorboard
!pip install -q -U einops

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
model_name = "microsoft/Phi-3-mini-4k-instruct"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

max_seq_length = 2048
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                          max_seq_length=max_seq_length,
                                         )
tokenizer.pad_token = tokenizer.eos_token

In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules="all-linear",
    lora_dropout=0.00,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="bzw_train_logs",
    num_train_epochs=1, # ADJUSTED TO 1 FOR TESTING CAN SET TO 4
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    #evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    #eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    args=training_arguments,
    packing=False,
)

# go

**NOTE - FFS for some reason Kaggle bars dont show progress so I cancelled one run since it was stuck on like 32/4000 but then when I interrupted it showed taht there had been 3 epochs or so and only had 15mins left to finish**

basically just leave it and trust that it is working ok - 45 mins or so for 1 epoch

In [None]:
trainer.train()

In [None]:
my_model_name = "phi-3-mini-4k-instruct-ABSA-QUAD"

trainer.model.save_pretrained(my_model_name)

# Use prompt template manually to test a few examples

In [None]:
from transformers import pipeline

In [None]:
generation_template = """<|system|>
You are a computer program who only replies with valid JSON lists.<|end|>
<|user|>
Perform a full aspect-based sentiment analysis of the following restaurant review:

{review_text}
<|end|>
<|assistant|>
"""

In [None]:
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=500)

In [None]:
example_review = "This place's burgers are the absolute best in town, and even though the service is incredibly slow I'd definitely come back - I want to try the tomato sauce that my friend had which looked delicious!"

result = pipe(generation_template.format(review_text=example_review))
print(result[0]['generated_text'])

In [None]:
# toggle Return full text NO
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=500, return_full_text=False) 
result = pipe(generation_template.format(review_text=example_review))
print(result[0]['generated_text'])

# Merging back to model

# DEBUG 

after reading Younes HF gist :

[https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da)

the stuff with `gc` didn't work - to debug and get the model merging steps below to work, I found that I had to do as said in

https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html

where he says to **restart notebook (Run > restart & clear cell outputs)**

then copy **first few cells** (basically in my case I copied below what needs to be copied - it's the stuff about where your trained adapter is saved `my_model_name` in this case

# TODO - check

before merging i'm 99% sure that /kaggle/working had `my_model_name` file being around 100 Mb or so, and seemed to be only the adapter

after running the restarted part of notebook below (i.e doing the model merge) i note that my output kaggle/working dir is now indeed 7 or 8 Gb so seems to have the **full model** ok ; **but i still want to be clear that the FIRST PART of this notebook/training up to `save_pretrained` <-- that this is saving only the adapter??? not clear yet to me**


In [None]:
# DIDNT WORK SEE ABOVE COMMENTS AND BELOW CELL FOR CODE THAT WORKS - AFTER RESTART NOTEBOOK
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()
gc.collect()
gc.collect()
gc.collect()
gc.collect()

# This works (below) after restarting notebook

**NOTE !!!!!!! YOU NEED TO HAVE DONE THE save_pretrained BIT AFTER YOUR TRAINING LOOP!!!! THAT IS THE ADAPTER THAT GETS SAVED (IN KAGGLE OUTPUT IF YOU WORK ON KAGGLE) AND THIS IS WHAT IS MERGED INTO THE FP16 MODEL THAT IS BEING "CLEAN" LOADED IN BELOW CODE**

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from peft import LoraConfig, PeftModel # ADDED
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)

my_model_name = "phi-3-mini-4k-instruct-ABSA-QUAD"
model_name = "microsoft/Phi-3-mini-4k-instruct"

device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, my_model_name)
model = model.merge_and_unload()



The repository for microsoft/Phi-3-mini-4k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-4k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y
The repository for microsoft/Phi-3-mini-4k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-4k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Push to hub - NOTE THIS TAKES A WHILE, LIKE 5 MINS OR SO FOR BIG MODEL O_o

In [4]:
model.push_to_hub(my_model_name, use_temp_dir=False)
tokenizer.push_to_hub(my_model_name, use_temp_dir=False)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/benjaminzwhite/phi-3-mini-4k-instruct-ABSA-QUAD/commit/1b51cedf7d9280c74e58740f5ed730a34650c46e', commit_message='Upload tokenizer', commit_description='', oid='1b51cedf7d9280c74e58740f5ed730a34650c46e', pr_url=None, pr_revision=None, pr_num=None)